In [None]:
import uuid
from datetime import datetime, timezone

from pyspark.sql import functions as F

# =========================================================
# Parameters (later pass from pipeline)
# =========================================================
# Hard-coded for now; intended to be supplied by the calling pipeline.
PIPELINE_NAME = "rp_orchestrator_dev"
RUN_DATE = "2026-01-10"
STAGE = "OPS_MONITORING"

# One identifier and one start timestamp shared by every row this run writes.
run_id = str(uuid.uuid4())
# Timezone-aware UTC. datetime.utcnow() is deprecated and returns a naive
# value, which many consumers interpret as *local* time rather than UTC.
run_ts = datetime.now(timezone.utc)

print(f"RunId: {run_id}")
print(f"Pipeline: {PIPELINE_NAME}")
print(f"RunDate: {RUN_DATE}")
print(f"RunTS(UTC): {run_ts}")

# =========================================================
# Helper: table exists
# =========================================================
def table_exists(tname: str) -> bool:
    """Return True when the Spark catalog reports that ``tname`` exists.

    This is a best-effort probe: any ordinary error raised by the lookup is
    treated as "table not available" and yields False, so callers can skip
    the table instead of aborting.

    Args:
        tname: Table name as accepted by ``spark.catalog.tableExists``.

    Returns:
        The catalog's answer, or False if the lookup itself raised.
    """
    try:
        return spark.catalog.tableExists(tname)
    except Exception:
        # Not a bare ``except:`` so that KeyboardInterrupt / SystemExit
        # (e.g. a cancelled cell) still propagate instead of reading as
        # "table missing".
        return False

# =========================================================
# OPS tables (Lakehouse)
# =========================================================
# Both statements use CREATE TABLE IF NOT EXISTS, so re-running is harmless.

# Run-level events: one row per status change of a run.
_OPS_RUN_LOG_DDL = """
CREATE TABLE IF NOT EXISTS ops_run_log (
  run_id STRING,
  pipeline_name STRING,
  stage STRING,
  run_date STRING,
  run_ts TIMESTAMP,
  status STRING,
  message STRING
) USING DELTA
"""

# Per-table quality metrics captured during a run.
_OPS_TABLE_METRICS_DDL = """
CREATE TABLE IF NOT EXISTS ops_table_metrics (
  run_id STRING,
  pipeline_name STRING,
  run_date STRING,
  run_ts TIMESTAMP,
  table_name STRING,
  row_count LONG,
  distinct_key_count LONG,
  duplicate_key_count LONG,
  null_key_count LONG,
  min_ts TIMESTAMP,
  max_ts TIMESTAMP,
  notes STRING
) USING DELTA
"""

for _ddl in (_OPS_RUN_LOG_DDL, _OPS_TABLE_METRICS_DDL):
    spark.sql(_ddl)

# =========================================================
# Metrics function
# =========================================================
def compute_table_metrics(table_name, key_cols=None, ts_candidates=None, notes=""):
    """Collect row, key-quality and timestamp-range metrics for one table.

    Args:
        table_name: Name of the table to profile.
        key_cols: Column names that together form the key. Columns absent
            from the table are ignored for the computation and listed in the
            returned ``notes``.
        ts_candidates: Timestamp column names in priority order; the first
            one present in the table is used for ``min_ts`` / ``max_ts``.
        notes: Free-text note carried into the result.

    Returns:
        dict with keys ``table_name``, ``row_count``, ``distinct_key_count``,
        ``duplicate_key_count``, ``null_key_count``, ``min_ts``, ``max_ts``
        and ``notes``. All metrics are None (and ``notes`` says so) when the
        table does not exist. The three key metrics are None when none of
        ``key_cols`` exists in the table. ``min_ts`` / ``max_ts`` are None
        when no candidate timestamp column exists.
    """
    key_cols = key_cols or []
    ts_candidates = ts_candidates or []

    if not table_exists(table_name):
        return {
            "table_name": table_name,
            "row_count": None,
            "distinct_key_count": None,
            "duplicate_key_count": None,
            "null_key_count": None,
            "min_ts": None,
            "max_ts": None,
            "notes": "SKIPPED (table not found)"
        }

    df = spark.table(table_name)
    cols = set(df.columns)

    row_count = df.count()

    # First candidate that actually exists in the table wins.
    ts_col = next((c for c in ts_candidates if c in cols), None)

    min_ts = max_ts = None
    if ts_col:
        stats = df.select(
            F.min(ts_col).alias("min_ts"),
            F.max(ts_col).alias("max_ts")
        ).collect()[0]
        min_ts = stats["min_ts"]
        max_ts = stats["max_ts"]

    usable_keys = [k for k in key_cols if k in cols]
    missing_keys = [k for k in key_cols if k not in cols]

    # Notes are assembled from parts so an empty caller note never leaves a
    # dangling "| " separator in the stored value.
    base_note = (notes or "").strip()
    note_parts = [base_note] if base_note else []

    distinct_key_count = duplicate_key_count = null_key_count = None

    if usable_keys:
        # A row counts as "null key" when ANY usable key column is null.
        null_cond = F.col(usable_keys[0]).isNull()
        for k in usable_keys[1:]:
            null_cond = null_cond | F.col(k).isNull()

        null_key_count = df.filter(null_cond).count()
        distinct_key_count = df.select(*usable_keys).distinct().count()
        duplicate_key_count = row_count - distinct_key_count

        if missing_keys:
            # Uniqueness was measured on a subset of the declared key, so the
            # duplicate count may be overstated; make that visible.
            note_parts.append(
                "Partial key, missing: " + ", ".join(missing_keys)
            )
    else:
        note_parts.append("Keys not found")

    return {
        "table_name": table_name,
        "row_count": row_count,
        "distinct_key_count": distinct_key_count,
        "duplicate_key_count": duplicate_key_count,
        "null_key_count": null_key_count,
        "min_ts": min_ts,
        "max_ts": max_ts,
        "notes": " | ".join(note_parts)
    }

# =========================================================
# Tables to monitor (ALIGNED TO YOUR SCHEMA)
# =========================================================
# Each spec is (table name, key columns, timestamp candidates in priority order).

# -------- Silver (Lakehouse)
_SILVER_SPECS = [
    ("silver_products_clean", ["product_id"], ["silver_ingest_ts"]),
    ("silver_orders_clean", ["order_id"], ["order_ts", "silver_ingest_ts"]),
    ("silver_order_items_clean", ["order_item_id"], ["silver_ingest_ts"]),
    ("silver_customers_current", ["customer_id"], ["silver_ingest_ts"]),
    ("silver_customers_scd2", ["customer_id", "effective_start_ts"], ["effective_start_ts"]),
    ("silver_payments_clean", ["payment_id"], ["payment_ts"]),
    ("silver_returns_clean", ["return_id"], ["return_ts"]),
]

# -------- Gold staging (Lakehouse)
_GOLD_STAGING_SPECS = [
    ("gold_fact_sales_staging", ["order_id", "product_id"], ["order_ts"]),
    ("gold_fact_payments_staging", ["payment_id"], []),
    ("gold_fact_returns_staging", ["return_id"], []),
]

# Expand the compact specs into the dict shape the metrics loop consumes.
monitor_list = [
    {"name": _name, "keys": _keys, "ts": _ts}
    for _name, _keys, _ts in _SILVER_SPECS + _GOLD_STAGING_SPECS
]

# =========================================================
# Run START log
# =========================================================
# Append a single STARTED event for this run to ops_run_log.
_run_log_columns = ["run_id","pipeline_name","stage","run_date","run_ts","status","message"]
_start_event = (
    run_id, PIPELINE_NAME, STAGE, RUN_DATE, run_ts, "STARTED", "OPS monitoring started"
)
_start_df = spark.createDataFrame([_start_event], _run_log_columns)
_start_df.write.mode("append").saveAsTable("ops_run_log")

# =========================================================
# Compute metrics
# =========================================================
# Explicit schema mirroring the ops_table_metrics DDL. Relying on type
# inference fails when a column is None in every row (e.g. none of the
# monitored tables exist yet), because no type can be determined for it.
_OPS_TABLE_METRICS_SCHEMA = (
    "run_id STRING, pipeline_name STRING, run_date STRING, run_ts TIMESTAMP, "
    "table_name STRING, row_count BIGINT, distinct_key_count BIGINT, "
    "duplicate_key_count BIGINT, null_key_count BIGINT, "
    "min_ts TIMESTAMP, max_ts TIMESTAMP, notes STRING"
)

# Metric fields in the order they appear in the schema after the run columns.
_METRIC_FIELDS = (
    "table_name", "row_count", "distinct_key_count", "duplicate_key_count",
    "null_key_count", "min_ts", "max_ts", "notes",
)

# One output row per monitored table, all stamped with the same run identity.
rows = []
for item in monitor_list:
    m = compute_table_metrics(item["name"], item["keys"], item["ts"])
    rows.append((
        run_id, PIPELINE_NAME, RUN_DATE, run_ts,
        *(m[field] for field in _METRIC_FIELDS)
    ))

# NOTE(review): min_ts/max_ts are declared TIMESTAMP here, as in the target
# table; this assumes the monitored ts columns are timestamp-typed — confirm.
spark.createDataFrame(rows, schema=_OPS_TABLE_METRICS_SCHEMA) \
    .write.mode("append").saveAsTable("ops_table_metrics")

# =========================================================
# SCD2 current duplicates check (important)
# =========================================================
# Counts customer_ids that have more than one row flagged is_current.
if table_exists("silver_customers_scd2"):
    _scd2_df = spark.table("silver_customers_scd2")

    # Skip instead of crashing when the expected columns are absent, in line
    # with how the per-table metrics tolerate missing columns. Compared
    # case-insensitively so differently-cased column names are not rejected.
    _scd2_cols = {c.lower() for c in _scd2_df.columns}
    _scd2_missing = sorted({"customer_id", "is_current"} - _scd2_cols)

    if _scd2_missing:
        print(f"SCD2 current duplicates check skipped (missing columns: {_scd2_missing})")
    else:
        dup_cnt = (
            _scd2_df
            .filter("is_current = true")
            .groupBy("customer_id").count()
            .filter("count > 1")
            .count()
        )
        # Only raise the warning marker when there is something to warn about.
        if dup_cnt > 0:
            print(f"⚠️ SCD2 current duplicates: {dup_cnt}")
        else:
            print(f"SCD2 current duplicates: {dup_cnt}")

# =========================================================
# Run END log
# =========================================================
# Append the COMPLETED event with its own completion timestamp.
# Timezone-aware UTC is used because datetime.utcnow() is deprecated and
# returns a naive value that consumers may interpret as local time.
spark.createDataFrame([(
    run_id, PIPELINE_NAME, STAGE, RUN_DATE, datetime.now(timezone.utc), "COMPLETED", "OPS monitoring completed"
)], ["run_id","pipeline_name","stage","run_date","run_ts","status","message"]) \
.write.mode("append").saveAsTable("ops_run_log")

print("✅ OPS monitoring completed successfully")
