In [0]:
# Declare one text widget per storage path (each defaults to an empty string),
# then read the current widget values into same-named module-level variables.
path_widget_names = ["bronze_path", "silver_path", "gold_fact_path", "dq_log_path"]

for widget_name in path_widget_names:
    dbutils.widgets.text(widget_name, "")

bronze_path, silver_path, gold_fact_path, dq_log_path = [
    dbutils.widgets.get(widget_name) for widget_name in path_widget_names
]

In [0]:
from pyspark.sql.functions import col, max as spark_max, expr
from pyspark.sql.types import *
from datetime import datetime, timezone

# Progress marker written to the notebook output before any check runs.
print("Starting Data Quality checks...")

In [0]:

# bronze_path = "abfss://bronze@adlsairqualitypoc.dfs.core.windows.net/aqi"
# silver_path = "abfss://silver@adlsairqualitypoc.dfs.core.windows.net/aqi"
# gold_fact_path = "abfss://gold@adlsairqualitypoc.dfs.core.windows.net/aqi/fact_air_quality"
# dq_log_path = "abfss://data-quality@adlsairqualitypoc.dfs.core.windows.net/aqi"

In [0]:
# Load the Bronze, Silver and Gold-fact Delta tables from their configured paths.
bronze_df, silver_df, fact_df = [
    spark.read.format("delta").load(layer_path)
    for layer_path in (bronze_path, silver_path, gold_fact_path)
]

In [0]:
# Schema of one data-quality log row, given as (column name, type, nullable).
# Only `message` and `record_count` may be null.
dq_log_fields = [
    ("layer", StringType(), False),
    ("check_name", StringType(), False),
    ("status", StringType(), False),
    ("message", StringType(), True),
    ("record_count", LongType(), True),
    ("run_ts", TimestampType(), False),
]

dq_log_schema = StructType(
    [StructField(name, dtype, nullable) for name, dtype, nullable in dq_log_fields]
)

In [0]:
# Shared run state: a single UTC timestamp stamped on every log row of this run,
# and a flag that flips to True as soon as any check reports "FAIL".
run_ts = datetime.now(timezone.utc)
dq_failed = False


def log_dq(layer, check_name, status, message=None, record_count=None):
    """Append one data-quality result row to the Delta log at `dq_log_path`.

    Args:
        layer: Name of the layer the check belongs to (e.g. "bronze").
        check_name: Identifier of the check being reported.
        status: Outcome string; the exact value "FAIL" marks the run as failed.
        message: Optional detail text; a falsy value is stored as "".
        record_count: Optional row count associated with the check.

    Side effects:
        Sets the module-level `dq_failed` flag when `status` is "FAIL" and
        writes a single-row DataFrame (schema `dq_log_schema`) in append mode.
    """
    global dq_failed

    # Sticky flag: once a failure is seen it stays set for the rest of the run.
    dq_failed = dq_failed or status == "FAIL"

    log_row = (layer, check_name, status, message or "", record_count, run_ts)

    log_row_df = spark.createDataFrame([log_row], dq_log_schema)
    log_row_df.write.format("delta").mode("append").save(dq_log_path)

In [0]:
print("Running Bronze checks...")

# Columns the Bronze table must expose for the downstream checks in this notebook.
bronze_expected_cols = {
    "country", "state", "city", "station", "pollutant_id",
    "pollutant_min", "pollutant_max", "pollutant_avg",
    "event_ts", "ingestion_ts", "ingestion_date"
}

# Set difference: expected columns that are absent from the loaded DataFrame.
missing_cols = bronze_expected_cols - set(bronze_df.columns)

if missing_cols:
    # A set has no defined iteration order, so format a sorted list: the logged
    # message is then stable and alphabetical, which keeps log rows comparable
    # across runs. `missing_cols` itself is left as a set.
    log_dq("bronze", "schema_check", "FAIL",
           f"Missing columns: {sorted(missing_cols)}")
else:
    log_dq("bronze", "schema_check", "PASS")

In [0]:
# Add a numeric view of pollutant_avg. try_cast is used so that values which
# cannot be converted to double become NULL instead of raising an error.
pollutant_avg_as_double = expr("try_cast(pollutant_avg as double)")
bronze_df_safe = bronze_df.withColumn("pollutant_avg_num", pollutant_avg_as_double)

In [0]:
# Null check
# Report-only: the number of rows with a null pollutant_avg_num is counted and
# logged, but the status is always "PASS" because nulls are allowed in Bronze.
# (A stricter variant that logged "FAIL" whenever null_count > 0 was disabled
# in favour of this version.)
rows_without_numeric_avg = bronze_df_safe.filter(col("pollutant_avg_num").isNull())
null_count = rows_without_numeric_avg.count()

log_dq(
    layer="bronze",
    check_name="null_check_pollutant_avg",
    status="PASS",
    message=f"Nulls allowed in Bronze: {null_count}",
    record_count=null_count,
)

In [0]:
# Range check
# Counts non-null numeric averages that are negative or greater than 1000;
# any such row fails the check.
avg_num = col("pollutant_avg_num")
below_floor = avg_num < 0
above_ceiling = avg_num > 1000

out_of_range_rows = bronze_df_safe.filter(
    avg_num.isNotNull() & (below_floor | above_ceiling)
)
invalid_count = out_of_range_rows.count()

if invalid_count == 0:
    log_dq("bronze", "range_check_pollutant_avg", "PASS")
else:
    log_dq("bronze", "range_check_pollutant_avg", "FAIL",
           f"Severely out of range rows: {invalid_count}", invalid_count)

In [0]:
# Volume check
# An empty Bronze table fails; otherwise the row count is logged with PASS.
bronze_count = bronze_df.count()

if bronze_count:
    log_dq("bronze", "volume_check", "PASS", record_count=bronze_count)
else:
    log_dq("bronze", "volume_check", "FAIL", "No records ingested")

In [0]:
# Freshness check
# Fails only when the maximum event_ts is None.
# NOTE(review): the age of the newest event is not compared with any threshold,
# so stale data still passes — confirm whether a maximum age should be enforced.
newest_event_rows = bronze_df.select(spark_max("event_ts")).collect()
max_event_ts = newest_event_rows[0][0]

if max_event_ts is not None:
    log_dq("bronze", "freshness_check", "PASS")
else:
    log_dq("bronze", "freshness_check", "FAIL", "event_ts missing")

In [0]:
#silver checks

print("Running Silver checks...")

# Volume check: an empty Silver table fails.
silver_count = silver_df.count()

if silver_count:
    log_dq("silver", "volume_check", "PASS", record_count=silver_count)
else:
    log_dq("silver", "volume_check", "FAIL", "No silver records")

# Deduplication check: count the key combinations that occur more than once.
silver_key_cols = ["country", "state", "city", "station", "pollutant_id", "event_ts"]
rows_per_key = silver_df.groupBy(*silver_key_cols).count()
duplicate_count = rows_per_key.filter(col("count") > 1).count()

if duplicate_count == 0:
    log_dq("silver", "dedup_check", "PASS")
else:
    log_dq("silver", "dedup_check", "FAIL",
           f"Duplicate records found: {duplicate_count}", duplicate_count)

# Range enforcement (0–500 AQI business rule): count rows outside the band.
silver_avg = col("pollutant_avg")
rows_outside_band = silver_df.filter((silver_avg < 0) | (silver_avg > 500))
range_violation = rows_outside_band.count()

if range_violation == 0:
    log_dq("silver", "business_range_check", "PASS")
else:
    log_dq("silver", "business_range_check", "FAIL",
           f"AQI outside 0–500: {range_violation}", range_violation)

In [0]:
#gold checks
print("Running Gold checks...")

# Volume check: an empty fact table fails.
gold_count = fact_df.count()

if gold_count:
    log_dq("gold", "volume_check", "PASS", record_count=gold_count)
else:
    log_dq("gold", "volume_check", "FAIL", "No gold fact records")

# Surrogate key integrity: a fact row fails when either key column is null.
has_null_key = col("location_key").isNull() | col("pollutant_key").isNull()
null_key_count = fact_df.filter(has_null_key).count()

if null_key_count == 0:
    log_dq("gold", "key_integrity_check", "PASS")
else:
    log_dq("gold", "key_integrity_check", "FAIL",
           f"Null surrogate keys: {null_key_count}", null_key_count)

In [0]:
# Report the overall outcome and hand it back as the notebook's exit value.
# NOTE(review): exiting with the string "FAIL" presumably does not by itself
# mark the run as failed — confirm the caller inspects this exit value.
if dq_failed:
    exit_value, banner = "FAIL", "Data Quality FAILED"
else:
    exit_value, banner = "PASS", "Data Quality PASSED"

print(banner)
dbutils.notebook.exit(exit_value)