In [0]:
# ------------------------
# ROW COUNT VALIDATION
# ------------------------
from pyspark.sql.functions import col, sum

# Load the fact table and take its row count up front; both `fact_df` and
# `fact_count` are read again by the checks further down in this script.
fact_table_name = "real_time_projects.ecommerce_historical.fact_sales"
fact_df = spark.table(fact_table_name)
fact_count = fact_df.count()

# A table with zero rows fails the check outright.
if not fact_count:
    raise Exception("‚ùå DQ FAILED: fact_sales table has ZERO records")

print(f"‚úÖ Row count check passed: {fact_count} records found")

# ------------------------------
# NULL CHECKS ON KEY COLUMNS
# ------------------------------
from pyspark.sql.functions import col

from pyspark.sql import functions as F

# Columns that must never contain NULL in fact_sales.
key_columns = [
    "order_id",
    "order_item_id",
    "customer_id",
    "product_id",
]

# Count the NULLs of every key column in ONE aggregation (a single Spark job)
# instead of running a separate filter + count job per column.
# `when(cond, 1)` yields NULL where the condition is false and `count` skips
# NULLs, so each aggregate is the number of rows whose key is NULL.
null_counts = fact_df.agg(
    *[
        F.count(F.when(F.col(key).isNull(), 1)).alias(key)
        for key in key_columns
    ]
).first()

# Report in the declared column order and stop at the first offending column.
for key in key_columns:
    null_count = null_counts[key]

    if null_count > 0:
        raise Exception(
            f"‚ùå DQ FAILED: Column '{key}' has {null_count} NULL values"
        )

    print(f"‚úÖ Null check passed for column: {key}")

# -------------------------------------------------------------------
# REVENUE SANITY CHECK - REVENUE SHOULD BE = price + freight_value
# -------------------------------------------------------------------
from pyspark.sql.functions import col

# Every row's revenue must equal price + freight_value.
#
# A plain `!=` evaluates to NULL whenever revenue, price or freight_value is
# NULL, and filter() drops rows whose predicate is NULL -- so rows with a
# missing value would silently pass the check. The null-safe equality always
# returns true/false: NULL vs. a real value is a mismatch, NULL vs. NULL is a
# match.
# NOTE(review): the comparison is exact. If these columns are floating-point
# rather than decimal, a small tolerance may be needed -- confirm column types.
expected_revenue = col("price") + col("freight_value")

invalid_revenue_count = fact_df.filter(
    ~col("revenue").eqNullSafe(expected_revenue)
).count()

if invalid_revenue_count > 0:
    raise Exception(
        f"‚ùå DQ FAILED: {invalid_revenue_count} records have incorrect revenue calculation"
    )

print("‚úÖ Revenue calculation check passed")

# ------------------------------------------------------------
# NEGATIVE REVENUE CHECK - REVENUE SHOULD NEVER BE NEGATIVE
# ------------------------------------------------------------
from pyspark.sql.functions import col

# Select the rows whose revenue is below zero, then count them.
# Rows with a NULL revenue do not satisfy `< 0` and are therefore not counted.
negative_revenue_rows = fact_df.where(col("revenue") < 0)
negative_revenue_count = negative_revenue_rows.count()

# Any such row fails the check.
if negative_revenue_count:
    raise Exception(
        f"‚ùå DQ FAILED: {negative_revenue_count} records have NEGATIVE revenue"
    )

print("‚úÖ Negative revenue check passed")

# ---------------------------------------------------------------------------
# ROW COUNT RECONCILIATION - fact_sales COUNT MUST MATCH order_items COUNT
# ---------------------------------------------------------------------------
from pyspark.sql.functions import col

# Count the rows of order_items and compare against the fact_sales row count
# (`fact_count`, computed earlier in this script).
order_items_df = spark.table("real_time_projects.ecommerce_historical.order_items")
order_items_count = order_items_df.count()

counts_match = fact_count == order_items_count
if not counts_match:
    raise Exception(
        f"‚ùå DQ FAILED: fact_sales count ({fact_count}) "
        f"does not match order_items count ({order_items_count})"
    )

print("‚úÖ Row count reconciliation passed")

# Execution only reaches this line when no earlier statement raised.
print("üéâ ALL DATA QUALITY CHECKS PASSED SUCCESSFULLY")