### Import Libraries

In [0]:
from pyspark.sql.functions import schema_of_json, current_timestamp, input_file_name, lit, col, to_json, from_json, to_timestamp, to_utc_timestamp, window
from pyspark.sql.types import *
import pyspark.sql.functions as F
import re

### Gold Layer

In [0]:
# Coordinates of the silver table this notebook reads from.
silver_catalog_name = "weather"
silver_schema_name = "02_silver"
silver_table_name = "weather_sensor_measurements"

# Coordinates of the gold table this notebook writes to.
gold_catalog_name = "weather"
gold_schema_name = "03_gold"
gold_table_name = "weather_sensor_statistics_per_hour"

# Provision the gold catalog, schema and checkpoint volume if they are missing.
# The statements run in order because each object lives inside the previous one.
for ddl_statement in (
    f"CREATE CATALOG IF NOT EXISTS {gold_catalog_name}",
    f"CREATE SCHEMA IF NOT EXISTS {gold_catalog_name}.{gold_schema_name}",
    f"CREATE VOLUME IF NOT EXISTS {gold_catalog_name}.{gold_schema_name}.checkpoints",
):
    spark.sql(ddl_statement)

# Checkpoint directory inside the volume created above, one folder per gold table.
checkpoint_dir = f"/Volumes/{gold_catalog_name}/{gold_schema_name}/checkpoints/{gold_table_name}"

In [0]:
# Incrementally stream rows from the silver table.
# `readStream.table(...)` resolves the table through the catalog, so no explicit
# source format is needed. Auto Loader ("cloudFiles") settings do not belong here:
# Auto Loader ingests raw files from cloud storage and "delta" is not one of its
# file formats, so those options only obscured what this reader actually does.
source_df = spark.readStream.table(
    f"{silver_catalog_name}.{silver_schema_name}.{silver_table_name}"
)

# Hourly min/max/avg of humidity, temperature and pressure per sensor location.
# Rows are bucketed into 1-hour windows on `measurement_timestamp`; the
# watermark tells Spark how long to wait for late events (1 minute behind the
# latest event time seen) before a window's state can be finalized.
hourly_window = window("measurement_timestamp", "1 hour")

grouped_measurements = (
    source_df
    .withWatermark("measurement_timestamp", "1 minute")
    .groupBy(hourly_window, "latitude", "longitude")
)

aggregation_df = grouped_measurements.agg(
    F.min("humidity").alias("min_humidity"),
    F.max("humidity").alias("max_humidity"),
    F.avg("humidity").alias("avg_humidity"),
    F.min("temperature").alias("min_temperature"),
    F.max("temperature").alias("max_temperature"),
    F.avg("temperature").alias("avg_temperature"),
    F.min("pressure").alias("min_pressure"),
    F.max("pressure").alias("max_pressure"),
    F.avg("pressure").alias("avg_pressure"),
).select(
    # Grouping keys first, then the window struct flattened into two columns.
    "latitude",
    "longitude",
    col("window.start").alias("window_start"),
    col("window.end").alias("window_end"),
    "min_humidity",
    "max_humidity",
    "avg_humidity",
    "min_temperature",
    "max_temperature",
    "avg_temperature",
    "min_pressure",
    "max_pressure",
    "avg_pressure",
)

# Fully qualified name of the gold table that receives the hourly statistics.
gold_table_fqn = f"{gold_catalog_name}.{gold_schema_name}.{gold_table_name}"

# Start the streaming write in append mode, checkpointing to `checkpoint_dir`.
# `toTable` starts the query, so `writer` holds the running query handle
# rather than a writer object.
writer = (
    aggregation_df.writeStream
    .queryName("weather_sensor_summary_stream")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_dir)
    .toTable(gold_table_fqn)
)

In [0]:
# Manual teardown helper: uncomment and run this cell to stop the active
# streaming query named "weather_sensor_summary_stream".
# NOTE(review): presumably left commented out so that running the whole
# notebook does not stop the stream it has just started — confirm.
# for q in spark.streams.active:
#     if q.name == "weather_sensor_summary_stream":
#         q.stop()