In [0]:
# Job parameters, exposed as Databricks text widgets (all default to "").
#   run_date    - optional; when non-empty, a later cell keeps only rows whose
#                 `last_update` falls on this date.
#   raw_path    - required; location the raw CSV data is read from.
#   bronze_path - required; location of the bronze Delta table.
dbutils.widgets.text("run_date", "")
dbutils.widgets.text("raw_path", "")
dbutils.widgets.text("bronze_path", "")

run_date = dbutils.widgets.get("run_date")
raw_input_path = dbutils.widgets.get("raw_path")
bronze_path = dbutils.widgets.get("bronze_path")

# Both paths are used unconditionally further down (reader load and Delta SQL).
# Fail here with a clear message instead of a less obvious Spark error later in
# the run. The values themselves are left untouched; only blank ones are rejected.
_missing_params = [
    name
    for name, value in (("raw_path", raw_input_path), ("bronze_path", bronze_path))
    if not value.strip()
]
if _missing_params:
    raise ValueError(
        "Missing required notebook parameter(s): " + ", ".join(_missing_params)
    )

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *


# raw_input_path = "abfss://raw@adlsairqualitypoc.dfs.core.windows.net/aqi/"
# bronze_path = "abfss://bronze@adlsairqualitypoc.dfs.core.windows.net/aqi"

In [0]:
# Turn on Delta's schema auto-merge flag for this Spark session.
# NOTE(review): presumably needed so the MERGE further down can pick up new
# source columns — confirm this is intended for the bronze table.
AUTO_MERGE_CONF_KEY = "spark.databricks.delta.schema.autoMerge.enabled"
spark.conf.set(AUTO_MERGE_CONF_KEY, "true")

In [0]:
# raw_df = (
#     spark.read
#          .option("header", "true")
#          .csv(raw_input_path)
# )

# Read the CSV data at `raw_input_path`, using the header row for column names.
# No schema is supplied and schema inference is not enabled here.
# NOTE(review): `includeMetadata` is passed through as-is; verify it is a
# recognised reader option on the target runtime — the `_metadata` column used
# later is normally available without it.
csv_reader = spark.read.format("csv").options(header="true", includeMetadata="true")
raw_df = csv_reader.load(raw_input_path)

In [0]:
# Optional single-day filter: only applied when a run_date was supplied.
if run_date:
    # Parse `last_update` into a throw-away column, keep the rows whose date
    # equals run_date, then remove the helper column again.
    with_tmp_ts = raw_df.withColumn(
        "event_ts_tmp", to_timestamp(col("last_update"), "dd-MM-yyyy HH:mm:ss")
    )
    on_run_date = with_tmp_ts.where(to_date("event_ts_tmp") == run_date)
    raw_df = on_run_date.drop("event_ts_tmp")

In [0]:
# Add the bronze audit columns on top of the raw rows, in this order:
#   event_ts       - `last_update` parsed with the dd-MM-yyyy HH:mm:ss pattern
#   ingestion_ts   - timestamp of this load
#   ingestion_date - date part of ingestion_ts
#   source_file    - file path taken from the reader's `_metadata` column
parsed_df = raw_df.withColumn(
    "event_ts", to_timestamp(col("last_update"), "dd-MM-yyyy HH:mm:ss")
)
stamped_df = parsed_df.withColumn("ingestion_ts", current_timestamp())
dated_df = stamped_df.withColumn("ingestion_date", to_date("ingestion_ts"))
bronze_df = dated_df.withColumn("source_file", col("_metadata.file_path"))

In [0]:
# Expose the prepared rows to Spark SQL under the view name that the
# CREATE TABLE and MERGE statements in the following cells select from.
INCOMING_VIEW = "incoming_bronze"
bronze_df.createOrReplaceTempView(INCOMING_VIEW)

In [0]:
# Bootstrap the bronze Delta table at `bronze_path`, partitioned by ingestion_date.
# The `WHERE 1=0` predicate selects no rows, so the CTAS only takes the column
# layout of incoming_bronze; IF NOT EXISTS skips creation when the table exists.
create_bronze_sql = f"""
CREATE TABLE IF NOT EXISTS delta.`{bronze_path}`
USING DELTA
PARTITIONED BY (ingestion_date)
AS SELECT * FROM incoming_bronze WHERE 1=0
"""
spark.sql(create_bronze_sql)

In [0]:
# Upsert the incoming rows into the bronze Delta table at `bronze_path`.
#
# A row is identified by (country, state, city, station, pollutant_id, event_ts).
# Matched rows are overwritten only when the incoming ingestion_ts is newer;
# unmatched rows are inserted.
#
# Two safeguards around the MERGE:
#   * The source is reduced to one row per key first. Without this, the same
#     reading present in several raw files is inserted multiple times on the
#     first load, and Delta rejects a MERGE in which several source rows match
#     the same target row.
#   * Keys are compared with the null-safe operator `<=>`. With plain `=`, a row
#     whose key contains NULL (e.g. event_ts is NULL because `last_update` did
#     not parse) never matches its earlier copy and is inserted again on every
#     run. For non-NULL values `<=>` behaves exactly like `=`.
merge_keys = ["country", "state", "city", "station", "pollutant_id", "event_ts"]
key_list = ", ".join(merge_keys)

# Rank rows within each key; the copy from the lexicographically last
# source_file wins. Exact ties (same key and same file) are broken arbitrarily.
# The helper rank column is dropped so it never reaches UPDATE SET * / INSERT *.
deduped_incoming = (
    spark.sql(f"""
SELECT *,
       row_number() OVER (PARTITION BY {key_list} ORDER BY source_file DESC) AS _merge_rank
FROM incoming_bronze
""")
    .where("_merge_rank = 1")
    .drop("_merge_rank")
)
deduped_incoming.createOrReplaceTempView("incoming_bronze_dedup")

merge_condition = "\nAND ".join(f"t.{key} <=> s.{key}" for key in merge_keys)

spark.sql(f"""
MERGE INTO delta.`{bronze_path}` t
USING incoming_bronze_dedup s
ON  {merge_condition}

WHEN MATCHED AND s.ingestion_ts > t.ingestion_ts
  THEN UPDATE SET *

WHEN NOT MATCHED
  THEN INSERT *
""")

In [0]:
# spark.read.format("delta").load(bronze_path).count()
# spark.read.format("delta").load(bronze_path).display(5)

In [0]:
# spark.sql(f"DESCRIBE DETAIL delta.`{bronze_path}`").display(truncate=False)