### Import Libraries

In [0]:
from pyspark.sql.functions import schema_of_json, current_timestamp, input_file_name, lit, col, to_json, from_json, to_timestamp, to_utc_timestamp, md5, concat_ws
from pyspark.sql.types import *
import re

### Bronze Layer

In [0]:
# Unity Catalog coordinates of the raw (bronze) landing table.
catalog_name = "weather"
schema_name = "01_bronze"
table_name = "weather_sensor_measurements_raw"

# Create the catalog and schema up front so the volume and table statements
# below (and the streaming write later on) have somewhere to land.
spark.sql(f"CREATE CATALOG IF NOT EXISTS {catalog_name}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema_name}")

# Volume for streaming checkpoints; each table uses its own sub-directory.
spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog_name}.{schema_name}.checkpoints")
checkpoint_dir = f"/Volumes/{catalog_name}/{schema_name}/checkpoints/{table_name}"

# Volume passed to the reader's `badRecordsPath` option, again one
# sub-directory per table.
spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog_name}.{schema_name}.bad_records")
bad_records_dir = f"/Volumes/{catalog_name}/{schema_name}/bad_records/{table_name}"

# Source location of the sensor JSON files.
s3_path = "s3://databricks-aws-sebastiancuya-bucket/weather-sensor-data"

# Explicit schema applied when reading the JSON payloads. The type classes
# come from the wildcard import in the notebook's import cell, so they are
# not re-imported here. All fields are declared nullable.
schema = StructType([
    StructField("device", StringType(), True),
    StructField("location", StructType([
        StructField("latitude", DoubleType(), True),
        StructField("longitude", DoubleType(), True)
    ]), True),
    StructField("request_timestamp", TimestampType(), True),
    StructField("measurement", StructType([
        StructField("humidity", DoubleType(), True),
        StructField("temperature", DoubleType(), True),
        StructField("pressure", DoubleType(), True),
        StructField("timestamp", TimestampType(), True)
    ]), True)
])

In [0]:
# Incremental file source (cloudFiles) reading JSON from `s3_path` with the
# explicit `schema`; `bad_records_dir` is handed to `badRecordsPath`.
raw_stream = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.includeExistingFiles", "true")
    .option("recursiveFileLookup", "true")
    .option("badRecordsPath", bad_records_dir)
    .schema(schema)
    .load(s3_path)
)

# Row key: MD5 of the device id and the measurement timestamp (as string),
# joined with "_".
row_key = md5(
    concat_ws(
        "_",
        col("device"),
        col("measurement.timestamp").cast("string"),
    )
)

# Flatten the nested payload into the bronze column layout and attach the
# ingestion time and the originating file path.
streaming_df = raw_stream.select(
    row_key.alias("uuid"),
    col("device").alias("device"),
    col("location.latitude").alias("latitude"),
    col("location.longitude").alias("longitude"),
    col("request_timestamp").alias("request_timestamp"),
    col("measurement.humidity").alias("humidity"),
    col("measurement.temperature").alias("temperature"),
    col("measurement.pressure").alias("pressure"),
    col("measurement.timestamp").alias("measurement_timestamp"),
    current_timestamp().alias("ingestion_timestamp"),
    col("_metadata.file_path").alias("file_path"),
)

# Append the flattened rows to the bronze Delta table, checkpointing under
# `checkpoint_dir`.
query = (
    streaming_df.writeStream
    .queryName("weather_sensor_ingestion_stream")
    .outputMode("append")
    .format("delta")
    .option("mergeSchema", "true")
    .option("checkpointLocation", checkpoint_dir)
    .toTable(f"{catalog_name}.{schema_name}.{table_name}")
)


In [0]:
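# Manual teardown helper (kept commented out so "Run All" does not stop the
# stream): uncomment and run to stop the "weather_sensor_ingestion_stream" query.
# for q in spark.streams.active:
#     if q.name == "weather_sensor_ingestion_stream":
#         q.stop()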