In [0]:
# Notebook parameters, read from Databricks widgets:
#   raw_path    - location of the raw JSON input
#   bronze_path - location of the Bronze Delta table
# Both widgets are declared with an empty default value.
dbutils.widgets.text("raw_path", "")
dbutils.widgets.text("bronze_path", "")

# Strip stray whitespace that tends to sneak in when values are pasted.
raw_path = dbutils.widgets.get("raw_path").strip()
bronze_path = dbutils.widgets.get("bronze_path").strip()

# Fail fast with a clear message instead of letting an empty path surface
# further down as an obscure Spark path error.
_missing_params = [
    name
    for name, value in (("raw_path", raw_path), ("bronze_path", bronze_path))
    if not value
]
if _missing_params:
    raise ValueError(
        "Missing required notebook parameter(s): " + ", ".join(_missing_params)
    )

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.utils import AnalysisException

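# Never hardcode the storage account key in the notebook (rotate any key that was ever pasted here); read it from a secret scope instead:
# spark.conf.set("fs.azure.account.key.adlsairqualitypoc.dfs.core.windows.net", dbutils.secrets.get(scope="<secret-scope>", key="<secret-name>"))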

# raw_input_path = "abfss://raw@adlsairqualitypoc.dfs.core.windows.net/aqi/"
# bronze_path = "abfss://bronze@adlsairqualitypoc.dfs.core.windows.net/aqi"

In [0]:
# Switch on Delta's schema auto-merge option in the Spark conf before any
# Bronze write is issued.
auto_merge_conf_key = "spark.databricks.delta.schema.autoMerge.enabled"
spark.conf.set(auto_merge_conf_key, "true")

# Progress marker for the run log.
print("Starting Bronze processing...")

In [0]:
# Load the raw JSON found at raw_path; no explicit schema is supplied to the reader.
raw_df = spark.read.format("json").load(raw_path)

In [0]:
# One output row per element of the top-level "records" array.
records_df = raw_df.select(explode("records").alias("record"))

# (field inside each record, Bronze column name) for the values copied as-is.
record_field_aliases = [
    ("country", "country"),
    ("state", "state"),
    ("city", "city"),
    ("station", "station"),
    ("pollutant_id", "pollutant_id"),
    ("min_value", "pollutant_min"),
    ("max_value", "pollutant_max"),
    ("avg_value", "pollutant_avg"),
]
passthrough_cols = [
    col(f"record.{field}").alias(alias) for field, alias in record_field_aliases
]

# Ingestion metadata: the load timestamp and the date derived from it.
ingestion_ts_col = current_timestamp()

incoming_df = records_df.select(
    *passthrough_cols,
    # last_update arrives as text in day-month-year order; parse it into the event time.
    to_timestamp("record.last_update", "dd-MM-yyyy HH:mm:ss").alias("event_ts"),
    ingestion_ts_col.alias("ingestion_ts"),
    to_date(ingestion_ts_col).alias("ingestion_date"),
)

In [0]:
# Watermark logic (event_ts based)
#
# The highest event_ts already stored in Bronze is the watermark: only
# incoming rows with a strictly newer event_ts are kept. When no Bronze table
# can be loaded, nothing is filtered and the run is a first full load.
# NOTE(review): once a watermark exists, rows whose event_ts is NULL (e.g. an
# unparseable last_update) never pass the `>` filter below - confirm that
# dropping them is intended.

# Only the table lookup is guarded. Reading `.columns` forces the table to be
# resolved inside the try (this also covers runtimes where DataFrames are
# resolved lazily), so a missing table is detected here, while errors from the
# aggregation further down propagate instead of being mistaken for "table not
# found" and triggering a full reload into an existing table.
try:
    bronze_existing = spark.read.format("delta").load(bronze_path)
    bronze_columns = bronze_existing.columns
except AnalysisException:
    bronze_existing = None

if bronze_existing is None:
    last_processed_ts = None
    print("Bronze table not found. First full load.")
else:
    if "event_ts" not in bronze_columns:
        raise ValueError(
            f"Bronze table at {bronze_path} has no event_ts column; "
            "cannot compute the watermark."
        )

    # Dict-form aggregation avoids depending on the star import shadowing the
    # builtin `max`. The result is a single row with a single column; the
    # value is None when the table holds no non-null event_ts.
    last_processed_ts = (
        bronze_existing
            .agg({"event_ts": "max"})
            .collect()[0][0]
    )

    print(f"Last processed event_ts: {last_processed_ts}")

# Filter only NEW data from API
if last_processed_ts is not None:
    incoming_df = incoming_df.filter(col("event_ts") > last_processed_ts)

# Exit early if nothing new
if incoming_df.limit(1).count() == 0:
    print("No new records to ingest.")
    dbutils.notebook.exit("No new data")

In [0]:
# Expose the incoming rows to SQL so the CREATE TABLE below can take its
# schema from them.
incoming_df.createOrReplaceTempView("incoming_bronze")

# The path is embedded as a backtick-quoted identifier; double any backtick
# inside it so a path containing one cannot terminate the identifier early.
bronze_identifier = bronze_path.replace("`", "``")

# Create the table only if it does not exist yet. `WHERE 1=0` selects no rows,
# so this yields an empty Delta table with the incoming columns, partitioned
# by ingestion_date; the data itself is written by the append below.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS delta.`{bronze_identifier}`
USING DELTA
PARTITIONED BY (ingestion_date)
AS SELECT * FROM incoming_bronze WHERE 1=0
""")


# Append the new rows to the Bronze table at bronze_path.
(incoming_df.write
    .format("delta")
    .mode("append")
    .save(bronze_path)
)

print("Bronze incremental ingestion complete.")

In [0]:
# spark.read.format("delta").load(bronze_path).count()
# spark.read.format("delta").load(bronze_path).display(5)

In [0]:
# spark.sql(f"DESCRIBE DETAIL delta.`{bronze_path}`").display(truncate=False)