### Import Libraries

In [0]:
from pyspark.sql.functions import schema_of_json, current_timestamp, input_file_name, lit, col, to_json, from_json, to_timestamp, to_utc_timestamp, window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DoubleType, LongType, BooleanType
import pyspark.sql.functions as F
import re

### Gold Layer

In [0]:
# --- Source: silver table that feeds the aggregation ---
silver_catalog_name = "weather"
silver_schema_name = "silver"
silver_table_name = "weather_measurements"

# --- Target: gold table that receives the per-day statistics ---
gold_catalog_name = "weather"
gold_schema_name = "gold"
gold_table_name = "weather_statistics_per_day"

# Volume (inside the gold schema) that holds streaming checkpoints.
gold_volume_name = "checkpoints"

# Create the target containers if they are missing. Order matters:
# the catalog must exist before the schema, and the schema before the volume.
for ddl_statement in (
    f"CREATE CATALOG IF NOT EXISTS {gold_catalog_name}",
    f"CREATE SCHEMA IF NOT EXISTS {gold_catalog_name}.{gold_schema_name}",
    f"CREATE VOLUME IF NOT EXISTS {gold_catalog_name}.{gold_schema_name}.{gold_volume_name}",
):
    spark.sql(ddl_statement)

# One checkpoint directory per gold table, located inside the volume above.
checkpoint_dir = "/".join(
    ["/Volumes", gold_catalog_name, gold_schema_name, gold_volume_name, gold_table_name]
)

In [0]:
# Incrementally read the silver measurements table as a stream.
source_df = (
    spark.readStream
    .format("delta")
    .table(f"{silver_catalog_name}.{silver_schema_name}.{silver_table_name}")
)

# Per-location statistics over 1-day tumbling windows of
# `last_updated_timestamp`.
#
# The 1-hour watermark bounds the aggregation state: once the maximum observed
# event time is more than one hour past a window's end, that window is
# finalized and rows arriving for it afterwards may be dropped.
aggregation_df = (
    source_df
    .withWatermark("last_updated_timestamp", "1 hour")
    .groupBy(
        window("last_updated_timestamp", "1 day"),
        "country",
        "region",
        "district",
    )
    .agg(
        F.min("temp_c").alias("min_temperature"),
        F.max("temp_c").alias("max_temperature"),
        F.avg("temp_c").alias("avg_temperature"),
        F.min("pressure_mb").alias("min_pressure"),
        F.max("pressure_mb").alias("max_pressure"),
        F.avg("pressure_mb").alias("avg_pressure"),
    )
    # Flatten the window struct into two plain columns and fix the output
    # column order in a single projection.
    .select(
        "country",
        "region",
        "district",
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        "min_temperature",
        "max_temperature",
        "avg_temperature",
        "min_pressure",
        "max_pressure",
        "avg_pressure",
    )
)

# Append finalized windows to the gold table, partitioned by country.
#
# `availableNow=True` replaces the deprecated `once=True` trigger. It still
# processes everything available at start-up and then stops, but it advances
# the watermark after each micro-batch and runs a trailing no-data batch when
# the last batch moved the watermark. In append mode a window is only emitted
# after the watermark passes its end, so with `once=True` windows closed by
# the current run's data would not be written until a later run.
#
# NOTE: despite its name, `writer` holds the StreamingQuery handle returned by
# `toTable`; the name is kept so later cells that reference it keep working.
writer = (
    aggregation_df.writeStream
    .outputMode("append")
    .option("checkpointLocation", checkpoint_dir)
    .trigger(availableNow=True)
    .partitionBy("country")
    .toTable(f"{gold_catalog_name}.{gold_schema_name}.{gold_table_name}")
)

# Block until the backlog has been processed and the query stops on its own.
writer.awaitTermination()