# Deduplicate temperature measurements into the silver layer
- Source table: *bronze temperature_measurements*
- Target table: *silver fct_weather*






### Imports and common variables

In [0]:
from pyspark.sql.functions import col, current_timestamp, concat, sha2, split # type: ignore
from delta.tables import * # type: ignore


# Catalog and schema names are read from notebook widgets; the trailing
# comments show example values.
catalog = dbutils.widgets.get("catalog") # e.g. "air_polution_analytics_dev"
landing_schema = dbutils.widgets.get("landing_schema") # e.g. "00_landing"
silver_schema = dbutils.widgets.get("silver_schema") # e.g. "02_silver"
bronze_schema = dbutils.widgets.get("bronze_schema") # e.g. "01_bronze"
# Table the stream reads from (looked up in the bronze schema).
source_table = "temperature_measurements"
# Table the stream merges into (created in the silver schema).
target_table = "fct_weather"

# Root of the "metadata" volume in the silver schema.
base_path = f"/Volumes/{catalog}/{silver_schema}/metadata"
# Per-target-table directory inside that volume; the streaming checkpoint
# location is placed beneath it.
metadata_path = f"{base_path}/openaq/{target_table}"


### Create deduped table, volume and directory for metadata

In [0]:

# DDL for the silver fact table; "if not exists" makes re-runs a no-op.
# The location/date/time columns reference the matching dimension tables
# in the same schema.
create_table = f"""
create table if not exists {catalog}.{silver_schema}.{target_table} (
  id string NOT NULL,
  location_id string NOT NULL REFERENCES {catalog}.{silver_schema}.dim_weather_locations(location_id),
  date_int INT NOT NULL REFERENCES {catalog}.{silver_schema}.dim_calendar(date_int),
  time_int INT NOT NULL REFERENCES {catalog}.{silver_schema}.dim_time(time_int),
  apparent_temperature	float,
  precipitation	float,
  relative_humidity_2m	int,
  temperature_2m	float,
  wind_direction_100m	int,
  wind_direction_10m	int,
  wind_gusts_10m	float,
  wind_speed_100m	float,
  wind_speed_10m	float,
  surface_pressure	float 
)
"""

# DDL for the volume that stores the streaming metadata.
create_volume = f"create volume if not exists {catalog}.{silver_schema}.metadata;"

# Run the DDL statements in order (table first, then volume), and only then
# create the metadata directory, which lives inside that volume.
for ddl_statement in (create_table, create_volume):
    spark.sql(ddl_statement)
dbutils.fs.mkdirs(metadata_path)


### Read the streaming table, set up a write stream and a deduplication function
Generate a unique id for every record in the stream (SHA-256 hash of `location_id` and `datetime`) and use it as the deduplication key.

In [0]:

def upsertToDelta(microBatchOutputDF, batchId):
    """Insert the rows of one micro-batch whose ``id`` is not yet in the target table.

    Intended as a ``foreachBatch`` sink: the batch is merged into
    ``{catalog}.{silver_schema}.{target_table}`` on ``id`` and only
    unmatched rows are inserted; rows already present are left untouched.

    Args:
        microBatchOutputDF: DataFrame with the rows of the current
            micro-batch. It must contain an ``id`` column and columns
            matching the target table, because all columns are inserted.
        batchId: Micro-batch identifier supplied by ``foreachBatch``; unused.
    """
    # An insert-only merge compares source rows against the target only, so
    # rows sharing an id *inside* the same micro-batch would all be inserted.
    # Collapse them to one row per id before merging.
    batch_unique = microBatchOutputDF.dropDuplicates(["id"])

    tableDeduped = DeltaTable.forName(spark, f"{catalog}.{silver_schema}.{target_table}")
    (tableDeduped.alias("t").merge(
        batch_unique.alias("s"),
        "s.id = t.id")
    .whenNotMatchedInsertAll()
    .execute()
    )
    
# Incremental (streaming) read of the bronze source table.
bronze_stream = spark.readStream.table(f"{catalog}.{bronze_schema}.{source_table}")

# SHA-256 over location_id concatenated with datetime; used as the merge key.
measurement_id = sha2(concat(col("location_id"), col("datetime")), 256)

# Projection onto the fact table layout. date_int is built as yyyymmdd and
# time_int as HHmm, both derived from the datetime column.
fact_columns = [
    "id",
    "location_id",
    "year(datetime) * 10000 + month(datetime) * 100 + day(datetime) as date_int",
    "cast(date_format(datetime, 'HHmm') as int) as time_int",
    "apparent_temperature",
    "precipitation",
    "relative_humidity_2m",
    "temperature_2m",
    "wind_direction_100m",
    "wind_direction_10m",
    "wind_gusts_10m",
    "wind_speed_100m",
    "wind_speed_10m",
    "surface_pressure",
]

df = bronze_stream.withColumn("id", measurement_id).selectExpr(*fact_columns)

# Hand every micro-batch to upsertToDelta. The availableNow trigger processes
# the data available at start and then stops; progress is tracked in the
# checkpoint directory under metadata_path.
stream_writer = (
    df.writeStream
    .foreachBatch(upsertToDelta)
    .outputMode("update")
    .trigger(availableNow=True)
    .option("checkpointLocation", f"{metadata_path}/_schema")
)
stream_writer.start()
