### Import Libraries

In [0]:
from pyspark.sql.functions import schema_of_json, current_timestamp, input_file_name, lit, col, to_json, from_json, to_timestamp, to_utc_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DoubleType, LongType, BooleanType
import re

### Silver Layer

In [0]:
# Source (bronze) table coordinates: catalog, schema, table.
bronze_catalog_name, bronze_schema_name, bronze_table_name = (
    "weather",
    "bronze",
    "weather_measurements_raw",
)

# Target (silver) table coordinates: catalog, schema, table.
silver_catalog_name, silver_schema_name, silver_table_name = (
    "weather",
    "silver",
    "weather_measurements",
)

# Streaming checkpoints live in a volume inside the silver schema, in a
# sub-directory named after the silver table.
silver_volume_name = "checkpoints"
spark.sql(
    "CREATE VOLUME IF NOT EXISTS "
    f"{silver_catalog_name}.{silver_schema_name}.{silver_volume_name}"
)
checkpoint_dir = (
    f"/Volumes/{silver_catalog_name}/{silver_schema_name}"
    f"/{silver_volume_name}/{silver_table_name}"
)

In [0]:
# DDL for the silver Delta table, partitioned by country.
#
# NOTE: the text of this statement is also parsed line-by-line later in this
# notebook (see `column_names`) to derive the list of columns to select.
# Keep each column on its own line in the form `<name> <TYPE>` (exactly two
# whitespace-separated tokens), otherwise that column is silently skipped.
create_silver_table_sql_statement = f"""
    CREATE TABLE IF NOT EXISTS {silver_catalog_name}.{silver_schema_name}.{silver_table_name} (
        country STRING,
        region STRING,
        district STRING,
        lat DECIMAL(10,6),
        lon DECIMAL(10,6),
        tz_id STRING,
        api_call_timestamp TIMESTAMP,
        last_updated_timestamp TIMESTAMP,
        temp_c DECIMAL(10,2),
        is_day BOOLEAN,
        weather_condition STRING,
        wind_mph DECIMAL(10,2),
        wind_degree DECIMAL(10,2),
        pressure_mb DECIMAL(10,2),
        humidity DECIMAL(10,2),
        cloud DECIMAL(10,2),
        feelslike_c DECIMAL(10,2),
        windchill_c DECIMAL(10,2),
        heatindex_c DECIMAL(10,2),
        dewpoint_c DECIMAL(10,2),
        vis_km DECIMAL(10,2),
        gust_mph DECIMAL(10,2),
        air_quality_co DECIMAL(10,2),
        air_quality_no2 DECIMAL(10,2),
        air_quality_o3 DECIMAL(10,2),
        air_quality_so2 DECIMAL(10,2),
        air_quality_pm2_5 DECIMAL(10,2),
        air_quality_pm10 DECIMAL(10,2),
        air_quality_us_epa_index BIGINT,
        air_quality_gb_defra_index BIGINT,
        ingestion_timestamp TIMESTAMP,
        processing_timestamp TIMESTAMP
    )
    USING DELTA
    PARTITIONED BY (country);
"""

# Create the table on first run; a no-op when it already exists.
spark.sql(create_silver_table_sql_statement)

In [0]:
# Read the raw bronze measurements as a stream.
streaming_df = (
    spark.readStream
    .format("delta")
    .table(f"{bronze_catalog_name}.{bronze_schema_name}.{bronze_table_name}")
)


def _local_to_utc(column_name):
    """Parse a 'yyyy-MM-dd HH:mm' string column and convert it to UTC.

    The parsed value is interpreted as local time in the row's `tz_id`.
    """
    parsed = to_timestamp(column_name, "yyyy-MM-dd HH:mm")
    return to_utc_timestamp(parsed, streaming_df["tz_id"])


# Derive timestamps, cast a few columns, rename the condition column and drop
# repeated readings of the same location / last-updated instant.
processing_df = (
    streaming_df
    .withColumn("processing_timestamp", current_timestamp())
    .withColumn("api_call_timestamp", _local_to_utc("localtime"))
    .withColumn("last_updated_timestamp", _local_to_utc("last_updated"))
    .withColumn("is_day", col("is_day").cast("boolean"))
    .withColumn("humidity", col("humidity").cast("float"))
    .withColumn("cloud", col("cloud").cast("float"))
    .withColumnRenamed("condition_text", "weather_condition")
    .dropDuplicates(["country", "region", "district", "last_updated_timestamp"])
)
    

def _columns_from_ddl(ddl_text):
    """Return the column names declared in a CREATE TABLE statement.

    The statement is scanned line by line (the first line is skipped). A line
    counts as a column declaration when it has exactly two whitespace-separated
    tokens and the leading run of capital letters in the second token is one
    of the recognised type keywords.
    """
    recognised_types = (
        'STRING', 'FLOAT', 'DOUBLE', 'BIGINT', 'BOOLEAN', 'TIMESTAMP', 'DECIMAL',
    )
    names = []
    for ddl_line in ddl_text.splitlines()[1:]:
        tokens = ddl_line.split()
        if len(tokens) != 2:
            continue
        # The pattern always matches; group(1) is the leading A-Z prefix,
        # e.g. 'DECIMAL' for 'DECIMAL(10,2),'.
        type_keyword = re.match(r'([A-Z]*).*', tokens[1].strip()).group(1)
        if type_keyword in recognised_types:
            names.append(tokens[0])
    return names


# Columns of the silver table, in declaration order.
column_names = _columns_from_ddl(create_silver_table_sql_statement)

# Keep only the columns the silver table defines.
filtered_processing_df = processing_df.select(*column_names)

def upsertToDelta(microBatchOutputDF, batchId):
  """Upsert one micro-batch into the silver Delta table with a MERGE.

  Written as a `foreachBatch` callback. The batch is registered as the temp
  view `streaming_updates` and merged into the target table on
  (country, region, district, last_updated_timestamp): matched rows are
  updated with all source columns, unmatched rows are inserted.

  Args:
    microBatchOutputDF: DataFrame holding the rows of the current micro-batch.
    batchId: Micro-batch identifier supplied by foreachBatch; unused here.
  """
  # Run the MERGE through the session attached to the micro-batch DataFrame.
  # In Databricks Runtime 10.5 and below, use
  # microBatchOutputDF._jdf.sparkSession() instead.
  batch_session = microBatchOutputDF.sparkSession
  microBatchOutputDF.createOrReplaceTempView("streaming_updates")

  merge_statement = f"""
    MERGE INTO {silver_catalog_name}.{silver_schema_name}.{silver_table_name} t
    USING streaming_updates s
    ON 
        s.country = t.country
        AND s.region = t.region
        AND s.district = t.district
        AND s.last_updated_timestamp = t.last_updated_timestamp
    WHEN MATCHED THEN UPDATE SET *
    WHEN NOT MATCHED THEN INSERT *
  """
  batch_session.sql(merge_statement)

# Upsert the processed stream into the silver Delta table. Each micro-batch is
# handed to upsertToDelta, which performs the MERGE; progress is tracked in
# checkpoint_dir so a rerun only processes data that arrived since the last run.
writer = (
    filtered_processing_df.writeStream
    .foreachBatch(upsertToDelta)
    .outputMode("update")
    .option("checkpointLocation", checkpoint_dir)
    # availableNow replaces the deprecated once=True trigger: it still
    # processes everything available at start and then stops, but may split
    # the backlog into several micro-batches instead of one oversized batch.
    .trigger(availableNow=True)
    .start()
)

# Block the notebook until the backlog has been fully processed.
writer.awaitTermination()


In [0]:
# Run OPTIMIZE on the silver table, Z-ordering by last_updated_timestamp.
spark.sql(
    f'OPTIMIZE {silver_catalog_name}.{silver_schema_name}.{silver_table_name} ZORDER BY (last_updated_timestamp)'
)