In [0]:
# Declare the notebook parameters as text widgets (empty default value), then
# read their current values into the variables used by the following cells.
for widget_name in ("bronze_path", "silver_path"):
    dbutils.widgets.text(widget_name, "")

bronze_path = dbutils.widgets.get("bronze_path")
silver_path = dbutils.widgets.get("silver_path")
#silver_path = "abfs://silver@adlsairqualitypoc.dfs.core.windows.net/aqi"

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# Turn on Delta's schema auto-merge setting for this Spark session.
AUTO_MERGE_CONF_KEY = "spark.databricks.delta.schema.autoMerge.enabled"
spark.conf.set(AUTO_MERGE_CONF_KEY, "true")

print("Starting Silver processing...")

In [0]:
# SECURITY: a storage account key used to be pasted on this line in plain text.
# Treat that key as compromised (rotate it) and never hardcode credentials in the
# notebook. If ad-hoc access is needed, read the key from a secret scope instead:
# spark.conf.set(
#     "fs.azure.account.key.adlsairqualitypoc.dfs.core.windows.net",
#     dbutils.secrets.get(scope="<secret-scope>", key="<secret-name>"),
# )

# Manual path overrides for interactive runs (normally supplied via the widgets):
# bronze_path = "abfs://bronze@adlsairqualitypoc.dfs.core.windows.net/aqi"
# silver_path = "abfs://silver@adlsairqualitypoc.dfs.core.windows.net/aqi"

In [0]:
# Read the bronze Delta table located at `bronze_path`.
delta_reader = spark.read.format("delta")
bronze_df = delta_reader.load(bronze_path)
# Handy while debugging:
# display(bronze_df)
# bronze_df.printSchema()

# bronze_df = spark.read.format("delta").load(bronze_path).select(
#     "country",
#     "state",
#     "city",
#     "station",
#     "pollutant_id",
#     "pollutant_min",
#     "pollutant_max",
#     "pollutant_avg",
#     "event_ts",
#     "ingestion_ts",
#     "ingestion_date"
# )

In [0]:
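# Watermark logic (incremental load) -- currently disabled.
# NOTE: AnalysisException is not imported explicitly in this notebook; add
# `from pyspark.sql.utils import AnalysisException` if this cell is re-enabled.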

# try:
#     silver_existing = spark.read.format("delta").load(silver_path)

#     last_processed_ts = (
#         silver_existing
#             .agg(max(col("ingestion_ts")).alias("max_ts"))
#             .collect()[0]["max_ts"]
#     )

#     print(f"Last processed ingestion_ts: {last_processed_ts}")

# except AnalysisException:
#     # Silver table does not exist yet
#     last_processed_ts = None
#     print("Silver table not found. First full load.")


# # FILTER ONLY NEW BRONZE DATA

# if last_processed_ts is not None:
#     bronze_df = bronze_df.filter(col("ingestion_ts") > last_processed_ts)

# # Exit early if no new data
# if bronze_df.limit(1).count() == 0:
#     print("No new bronze data to process. Exiting.")
#     dbutils.notebook.exit("No new data")

In [0]:
# silver_df = bronze_df.select(
#     "country",
#     "state",
#     "city",
#     "station",
#     "pollutant_id",
#     "pollutant_min",
#     "pollutant_max",
#     "pollutant_avg",
#     "event_ts",
#     "ingestion_ts",
#     "ingestion_date"
# )

In [0]:
# Step 1: convert the three pollutant measurements to double via SQL try_cast
# (same order as before: avg, min, max).
typed_df = bronze_df
for measure in ("pollutant_avg", "pollutant_min", "pollutant_max"):
    typed_df = typed_df.withColumn(measure, expr(f"try_cast({measure} as double)"))

# Step 2: keep only rows with a non-null average inside [0, 500] and a
# non-null event timestamp.
avg_value = col("pollutant_avg")
is_valid_reading = (
    avg_value.isNotNull()
    & (avg_value >= 0)
    & (avg_value <= 500)
    & col("event_ts").isNotNull()
)

# Step 3: derive the calendar date of the event from event_ts.
silver_df = typed_df.filter(is_valid_reading).withColumn(
    "event_date", to_date("event_ts")
)

In [0]:
# Deduplication: keep a single row per (location, pollutant, event time) key,
# choosing the one that sorts first by ingestion_ts descending.
dedup_keys = ["country", "state", "city", "station", "pollutant_id", "event_ts"]
newest_first = col("ingestion_ts").desc()
window_spec = Window.partitionBy(*dedup_keys).orderBy(newest_first)

# Number the rows inside each key group, keep row 1, then drop the helper column.
ranked_df = silver_df.withColumn("rn", row_number().over(window_spec))
silver_df = ranked_df.where(col("rn") == 1).drop("rn")

In [0]:
# Outlier definition: a reading is flagged when its average exceeds the threshold.
OUTLIER_AVG_THRESHOLD = 300
is_outlier_flag = when(
    col("pollutant_avg") > OUTLIER_AVG_THRESHOLD, True
).otherwise(False)

silver_df = silver_df.withColumn("is_outlier", is_outlier_flag)

In [0]:
# (
#     silver_df
#     .write
#     .format("delta")
#     .mode("overwrite")
#     .partitionBy("state")
#     .save(silver_path)
# )

# Merge logic
# Upsert the cleaned batch (`silver_df`) into the Delta table at `silver_path`:
#   * no Delta table at the path yet -> create it with a plain write,
#     partitioned by event_date;
#   * table already exists -> MERGE on the business key, updating matched rows
#     and inserting the rest.
if not DeltaTable.isDeltaTable(spark, silver_path):

    (silver_df.write
        .format("delta")
        .partitionBy("event_date")
        .mode("append")
        .save(silver_path)
    )

else:

    delta_table = DeltaTable.forPath(spark, silver_path)

    # Same key columns the deduplication step partitions by.
    merge_keys = [
        "country", "state", "city", "station",
        "pollutant_id", "event_ts",
    ]

    # Use null-safe equality (<=>) instead of "=". With "=", a NULL in any key
    # column makes the comparison evaluate to NULL, so such a row never matches
    # its earlier copy in the target and is inserted again on every run.
    # Only event_ts is filtered to be non-null upstream; the other key columns
    # are not, and the deduplication window already groups NULL keys together.
    merge_condition = " AND ".join(
        f"t.{key} <=> s.{key}" for key in merge_keys
    )

    (delta_table.alias("t")
        .merge(
            silver_df.alias("s"),
            merge_condition
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

print("Silver processing complete.")

In [0]:
# spark.read.format("delta").load(silver_path).display(10)
# spark.read.format("delta").load(silver_path).printSchema()