### Import Libraries

In [0]:
from pyspark.sql.functions import schema_of_json, current_timestamp, input_file_name, lit, col, to_json, from_json, to_timestamp, to_utc_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DoubleType, LongType, BooleanType
import re

### Bronze Layer

In [0]:

def list_all_files(path):
    """Recursively collect the paths of all files located under ``path``.

    Directories are descended depth-first, in the order that
    ``dbutils.fs.ls`` lists them.

    Args:
        path: Directory to scan (any location ``dbutils.fs.ls`` accepts).

    Returns:
        A list with the ``path`` attribute of every non-directory entry found
        under ``path``; empty when there are none.
    """
    file_paths = []

    for item in dbutils.fs.ls(path):
        if item.isDir():
            # extend() grows the list in place; `a = a + b` would re-copy the
            # whole accumulated list for every sub-directory visited.
            file_paths.extend(list_all_files(item.path))
        else:
            file_paths.append(item.path)

    return file_paths


def list_first_file(path):
    """Return the path of the first file found under ``path``.

    Entries are visited in the order ``dbutils.fs.ls`` returns them and
    directories are searched depth-first.

    Args:
        path: Directory to search (any location ``dbutils.fs.ls`` accepts).

    Returns:
        The ``path`` attribute of the first non-directory entry encountered,
        or ``None`` when no file exists anywhere under ``path``.
    """
    for item in dbutils.fs.ls(path):
        if not item.isDir():
            return item.path

        # A sub-directory without any file yields None. Keep scanning the
        # remaining siblings instead of giving up on the first directory.
        nested = list_first_file(item.path)
        if nested is not None:
            return nested

    return None

In [0]:
# Target table, addressed as <catalog>.<schema>.<table>.
catalog_name = "weather"
schema_name = "bronze"
table_name = "weather_measurements_raw"

# Streaming checkpoints live in a volume next to the table, one sub-directory
# per target table.
volume_name = "checkpoints"
spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog_name}.{schema_name}.{volume_name}")
checkpoint_dir = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/{table_name}"

# Landing location of the raw JSON files.
s3_path = "s3a://databricks-aws-sebastiancuya-bucket/weather-data"
example_file_path = list_first_file(s3_path)

# list_first_file() returns None when the location holds no file at all; fail
# here with a clear message rather than passing None to the JSON reader.
if example_file_path is None:
    raise FileNotFoundError(
        f"No files found under {s3_path}; cannot infer the JSON schema."
    )

# The streaming reader needs an explicit schema, so infer it once from a
# single sample file with a batch read.
df = spark.read.json(example_file_path)
inferred_schema = df.schema

In [0]:
# Stream the JSON files found anywhere below the landing path, using the
# schema inferred from the sample file, and tag each row with two audit
# columns: the processing time and the file the row was read from.
streaming_df = (
    spark.readStream
    .format("json")
    .schema(inferred_schema)
    .option("recursiveFileLookup", "true")
    .load(s3_path)
    .withColumn("timestamp", current_timestamp())
    .withColumn("file_path", col("_metadata.file_path"))
)

# Flatten the nested `location` / `current` structs into top-level columns.
# The expressions are generated from the field names so every group follows
# one naming rule; the resulting column order is: location, current,
# current.condition, remaining current fields, current.air_quality, audit.
streaming_df = streaming_df.selectExpr(
    # location.* keeps its field names, except `name`, exposed as `district`.
    "location.name as district",
    *(
        f"location.{field} as {field}"
        for field in (
            "region", "country", "lat", "lon",
            "tz_id", "localtime_epoch", "localtime",
        )
    ),
    # current.* scalar fields keep their names.
    *(
        f"current.{field} as {field}"
        for field in (
            "last_updated_epoch", "last_updated", "temp_c", "temp_f", "is_day",
        )
    ),
    # current.condition.* is prefixed with `condition_`.
    *(
        f"current.condition.{field} as condition_{field}"
        for field in ("text", "icon", "code")
    ),
    *(
        f"current.{field} as {field}"
        for field in (
            "wind_mph", "wind_kph", "wind_degree", "wind_dir",
            "pressure_mb", "pressure_in", "precip_mm", "precip_in",
            "humidity", "cloud",
            "feelslike_c", "feelslike_f",
            "windchill_c", "windchill_f",
            "heatindex_c", "heatindex_f",
            "dewpoint_c", "dewpoint_f",
            "vis_km", "vis_miles", "uv",
            "gust_mph", "gust_kph",
        )
    ),
    # current.air_quality.* is prefixed with `air_quality_`; source names are
    # back-quoted because some contain hyphens, which become underscores in
    # the output column name.
    *(
        f"current.air_quality.`{field}` as air_quality_{field.replace('-', '_')}"
        for field in (
            "co", "no2", "o3", "so2", "pm2_5", "pm10",
            "us-epa-index", "gb-defra-index",
        )
    ),
    # Audit columns added before the flattening step.
    "timestamp as ingestion_timestamp",
    "file_path as file_path",
)

# Append the flattened stream to the bronze Delta table, tracking progress in
# the checkpoint directory so reruns only pick up files not yet ingested.
# NOTE: despite its name, `writer` holds the running StreamingQuery that
# toTable() starts and returns.
writer = (
    streaming_df.writeStream
    .outputMode("append")
    .format("delta")
    .option("mergeSchema", "true")
    .option("checkpointLocation", checkpoint_dir)
    # availableNow replaces the deprecated once=True trigger: it still
    # processes everything available at start and then stops, but may split
    # the backlog into several micro-batches instead of one large batch.
    .trigger(availableNow=True)
    .toTable(f"{catalog_name}.{schema_name}.{table_name}")
)

# Block until the run has drained the available data and the query stops.
writer.awaitTermination()
