In [0]:
# Notebook parameters, exposed as Databricks text widgets so they can be
# supplied per run. Every widget value is read back as a string.
dbutils.widgets.text("run_date", "")
dbutils.widgets.text("replay_offset_days", "0")
dbutils.widgets.text("raw_path", "")
dbutils.widgets.text("bronze_path", "")


# An empty run_date disables the per-date filter applied to the raw data.
run_date = dbutils.widgets.get("run_date")
raw_input_path = dbutils.widgets.get("raw_path")
bronze_path = dbutils.widgets.get("bronze_path")
# A cleared widget yields "", which int() rejects with an unhelpful error.
# Fall back to the widget's declared default so a blank value means "no offset".
replay_offset_days = int(dbutils.widgets.get("replay_offset_days").strip() or "0")

In [0]:
from pyspark.sql.functions import (
    col,
    current_timestamp,
    input_file_name,
    to_timestamp,
    to_date,
    expr
)

# The replay offset is read from the `replay_offset_days` widget; the line
# below is the manual override (0, 1, 2, ... per run), left disabled.
# replay_offset_days = 2

# Session-level Delta setting that turns on automatic schema merging.
spark.conf.set(
    "spark.databricks.delta.schema.autoMerge.enabled",
    "true",
)

# Sample values (normally supplied through the raw_path / bronze_path widgets):
# raw_input_path = "abfss://raw@adlsairqualitypoc.dfs.core.windows.net/aqi/"
# bronze_path = "abfss://bronze@adlsairqualitypoc.dfs.core.windows.net/aqi"

In [0]:
# Load the CSV data found at raw_input_path, treating the first line of each
# file as the header. No schema is supplied and inferSchema is not set.
# NOTE(review): "includeMetadata" is not an option I can find in the CSV
# reader documentation — confirm whether it is required for the
# `_metadata.file_path` lookup performed later in this notebook.
csv_read_options = {
    "header": "true",
    "includeMetadata": "true",
}
raw_df = spark.read.format("csv").options(**csv_read_options).load(raw_input_path)

In [0]:
# Optional single-day run: when run_date is non-empty, keep only the rows whose
# parsed `last_update` falls on that date. The comparison uses the original
# timestamp, i.e. before any replay offset is added further down.
if run_date:
    last_update_ts = to_timestamp(col("last_update"), "dd-MM-yyyy HH:mm:ss")
    rows_with_ts = raw_df.withColumn("event_ts_tmp", last_update_ts)
    rows_on_run_date = rows_with_ts.where(to_date(col("event_ts_tmp")) == run_date)
    # The helper column exists only for the comparison above.
    raw_df = rows_on_run_date.drop("event_ts_tmp")

In [0]:
# Derive `event_ts`: parse `last_update` and shift it by replay_offset_days so
# the same raw files can be replayed under different dates. The offset is
# already an int (converted when the widget was read), so interpolating it
# into the SQL expression cannot inject arbitrary text.
shifted_ts_sql = (
    "to_timestamp(last_update, 'dd-MM-yyyy HH:mm:ss')"
    f" + INTERVAL {replay_offset_days} DAYS"
)
events_with_ts = raw_df.withColumn("event_ts", expr(shifted_ts_sql))

# Rows without a usable event timestamp are discarded.
events_df = events_with_ts.where(col("event_ts").isNotNull())

In [0]:
from pyspark.sql.functions import window

# Tag every event with the 5-minute window that contains its event_ts.
windowed_events = events_df.withColumn(
    "event_window",
    window("event_ts", "5 minutes"),
)

In [0]:
# Bring the distinct event windows to the driver, earliest window first, so
# they can be replayed one at a time.
distinct_windows = windowed_events.select(col("event_window")).dropDuplicates()
event_windows = distinct_windows.sort(col("event_window.start").asc()).collect()

In [0]:

import time

# Replay settings: cap on the number of windows merged in one run, and the
# pause between windows that paces the simulated stream.
max_windows = 20
pause_seconds = 5

# Replay the collected windows in order, merging each one into the bronze
# Delta table as if it had just arrived.
for i, w in enumerate(event_windows):

    if i >= max_windows:
        print("Max window limit reached, stopping replay.")
        break

    start_ts = w.event_window.start
    end_ts = w.event_window.end

    # Events of this window (start inclusive, end exclusive), stamped with the
    # ingestion time/date and the path of the file each row was read from.
    batch_df = (
        events_df
        .filter((col("event_ts") >= start_ts) & (col("event_ts") < end_ts))
        .withColumn("ingestion_ts", current_timestamp())
        .withColumn("ingestion_date", to_date(col("ingestion_ts")))
        .withColumn("source_file", col("_metadata.file_path"))
    )

    # Emptiness check only: take(1) can stop at the first row, whereas count()
    # had to scan the whole window just to be compared with zero. The windows
    # were derived from events_df, so this should only trigger if the source
    # changed in the meantime.
    if not batch_df.take(1):
        continue

    # The view name is reused for every window; each iteration replaces it.
    batch_df.createOrReplaceTempView("incoming_bronze")

    # Upsert keyed on location, station, pollutant and event timestamp: a
    # matching row is overwritten only by a newer ingestion, anything
    # unmatched is inserted.
    # NOTE(review): Delta rejects a MERGE when several source rows match the
    # same target row — confirm the raw data is unique per key, or dedupe.
    # NOTE(review): `=` never matches NULL keys, so rows with a NULL key column
    # would be inserted again on every run — confirm the keys are populated.
    spark.sql(f"""
        MERGE INTO delta.`{bronze_path}` t
        USING incoming_bronze s
        ON  t.country = s.country
        AND t.state = s.state
        AND t.city = s.city
        AND t.station = s.station
        AND t.pollutant_id = s.pollutant_id
        AND t.event_ts = s.event_ts

        WHEN MATCHED AND s.ingestion_ts > t.ingestion_ts
          THEN UPDATE SET *

        WHEN NOT MATCHED
          THEN INSERT *
    """)

    print(f"Processed window: {start_ts} to {end_ts}")
    time.sleep(pause_seconds)

print("Simulated streaming ingestion completed.")

In [0]:
#spark.read.format("delta").load(bronze_path).count()