In [0]:
# Notebook parameters: declare the two text widgets (empty default) so the
# notebook can be driven by a job/pipeline or filled in interactively.
dbutils.widgets.text("raw_path", "")
dbutils.widgets.text("bronze_path", "")

# Read the current widget values; strip stray whitespace from pasted paths.
raw_path = dbutils.widgets.get("raw_path").strip()
bronze_path = dbutils.widgets.get("bronze_path").strip()

# Both widgets default to "", and the values are handed straight to the
# JSON reader and the Delta table calls further down. Fail here with a clear
# message instead of letting an empty path reach those calls.
_missing_params = [
    name
    for name, value in (("raw_path", raw_path), ("bronze_path", bronze_path))
    if not value
]
if _missing_params:
    raise ValueError(
        "Missing required notebook parameter(s): " + ", ".join(_missing_params)
    )

In [0]:
from pyspark.sql.functions import *
from delta.tables import DeltaTable

# spark.conf.set("fs.azure.account.key.adlsairqualitypoc.dfs.core.windows.net", dbutils.secrets.get(scope="<secret-scope>", key="<storage-account-key-name>"))  # account key redacted: never commit it; rotate the key that was previously stored on this line

# raw_path = "abfss://raw@adlsairqualitypoc.dfs.core.windows.net/aqi/"
# bronze_path = "abfss://bronze@adlsairqualitypoc.dfs.core.windows.net/aqi"

In [0]:
# Turn on Delta's automatic schema merging for this Spark session.
spark.conf.set(
    "spark.databricks.delta.schema.autoMerge.enabled",
    "true",
)

print("Starting Bronze ingestion...")

In [0]:
# Load the raw JSON found at `raw_path`; no explicit schema is supplied, so
# Spark infers it from the data.
# NOTE(review): the default JSON reader expects one document per line — confirm
# the raw files are not pretty-printed/multi-line.
raw_df = spark.read.format("json").load(raw_path)

In [0]:
# One output row per element of the top-level `records` array.
records_df = raw_df.select(explode("records").alias("record"))

# Fields of `record` that are copied through unchanged, under the same name.
passthrough_fields = ("country", "state", "city", "station", "pollutant_id")

# (field inside `record`, output column) pairs. Each is converted with
# try_cast, so a value that cannot be read as a double becomes NULL instead of
# failing the query.
measure_fields = (
    ("min_value", "pollutant_min"),
    ("max_value", "pollutant_max"),
    ("avg_value", "pollutant_avg"),
)

incoming_df = records_df.select(
    *[col(f"record.{name}").alias(name) for name in passthrough_fields],
    *[
        expr(f"try_cast(record.{source_field} as double)").alias(target_column)
        for source_field, target_column in measure_fields
    ],
    # `last_update` is parsed with a day-first pattern.
    to_timestamp(col("record.last_update"), "dd-MM-yyyy HH:mm:ss").alias("event_ts"),
    # Load-time audit columns; `ingestion_date` is the Delta partition column.
    current_timestamp().alias("ingestion_ts"),
    to_date(current_timestamp()).alias("ingestion_date"),
)

In [0]:
# Keep a single row per (station, pollutant, event time) within this batch.
dedup_keys = ["station", "pollutant_id", "event_ts"]
incoming_df = incoming_df.dropDuplicates(dedup_keys)

# Probe for at most one row; when the batch is empty, log it and end the
# notebook run with the exit value "No data".
batch_is_empty = incoming_df.limit(1).count() == 0
if batch_is_empty:
    print("No records received from API.")
    dbutils.notebook.exit("No data")

In [0]:
# Persist the batch to the Bronze Delta location.
#   * First run (no Delta table at `bronze_path`): create the table,
#     partitioned by `ingestion_date`.
#   * Later runs: insert-only MERGE keyed on (station, pollutant_id, event_ts);
#     rows whose key already exists in the table are left untouched.
if not DeltaTable.isDeltaTable(spark, bronze_path):

    print("Creating Bronze table...")

    (incoming_df.write
        .format("delta")
        .partitionBy("ingestion_date")
        .save(bronze_path)
    )

else:

    print("Merging incremental data...")

    delta_table = DeltaTable.forPath(spark, bronze_path)

    # `<=>` is Spark SQL's null-safe equality. With plain `=`, a NULL key
    # (e.g. an `event_ts` that failed to parse) makes the condition NULL, so the
    # row never matches and is inserted again on every run. Null-safe comparison
    # keeps the MERGE consistent with the batch de-duplication on the same
    # three columns, which already treats NULL keys as equal.
    # Trade-off: a later row whose key is NULL in the same position(s) now
    # matches the stored one and is NOT inserted.
    merge_condition = """
        target.station <=> source.station AND
        target.pollutant_id <=> source.pollutant_id AND
        target.event_ts <=> source.event_ts
    """

    (delta_table.alias("target")
        .merge(
            incoming_df.alias("source"),
            merge_condition
        )
        .whenNotMatchedInsertAll()
        .execute()
    )

print("Bronze ingestion completed successfully.")

In [0]:
# spark.read.format("delta").load(bronze_path).count()
# spark.read.format("delta").load(bronze_path).limit(5).display()

In [0]:
# spark.sql(f"DESCRIBE DETAIL delta.`{bronze_path}`").show(truncate=False)