In [0]:
# Declare the two optional text widgets that let an operator override the
# processing window; both default to the empty string.
for _widget_name, _widget_label in (
    ("start_ts", "Manual Start(Backfill)"),
    ("end_ts", "Manual End"),
):
    dbutils.widgets.text(_widget_name, "", _widget_label)

In [0]:
# MAGIC %load_ext autoreload
# MAGIC %autoreload 2

In [0]:
import sys
import os
import pyspark.sql.functions as f
from datetime import datetime
# Put the directory two levels above the current working directory on the
# import path. NOTE(review): presumably this is the repo root that holds the
# `transformations` package imported in the next cell — confirm against the
# repository layout, since the path is resolved relative to the working dir.
sys.path.append(os.path.abspath('../..'))

In [0]:
from transformations.order_transforms import transform_orders, upsert_order, enrich_order_data
from transformations.transform_utils import normalize_raw_schema, get_watermark, update_watermark, optimize_partitions

In [0]:
# Read both window overrides, trimming surrounding whitespace so that a
# blank or space-only widget is seen as the empty string downstream.
start, end = (
    dbutils.widgets.get(_widget_name).strip()
    for _widget_name in ("start_ts", "end_ts")
)

In [0]:
# Catalog prefix used for every three-part table name in this notebook
# (f"{CATALOG}.<schema>.<table>").
CATALOG = "pei"

In [0]:
# Source table, read from the `bronze` schema; the same name is also the key
# passed to get_watermark / update_watermark.
raw_order_table_name = "raw_orders"
# Upsert target in the `silver` schema.
enriched_order_table_name = "orders_enriched"
# Append-only table in the `silver` schema receiving flagged rows.
quarantine_table_name = "orders_quarantine"

In [0]:
# Orders enrichment: bronze raw orders -> silver enriched orders, with flagged
# rows appended to the silver quarantine table.
#
# Processing window on `ingestion_timestamp`:
#   * lower bound (exclusive): the `start_ts` widget when set (backfill run),
#     otherwise the watermark stored for the raw orders table;
#   * upper bound (inclusive): the `end_ts` widget when set, otherwise "now".
# The watermark is advanced only on non-backfill runs, after the upsert and
# optimize steps have completed.
#
# Any exception is logged and then re-raised so the notebook run is marked as
# failed instead of silently finishing as a success.
try:
    # A non-empty start widget means the operator requested a manual backfill.
    is_backfill = start != ""

    last_ts = start if is_backfill else get_watermark(spark, raw_order_table_name)
    # NOTE(review): datetime.now() is naive driver-local time; this assumes it
    # is comparable with `ingestion_timestamp` — confirm the timezone handling.
    upper_bound = end if end != "" else datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    df_raw_orders = (
        spark.read.table(f"{CATALOG}.bronze.{raw_order_table_name}")
        .filter(f.col("ingestion_timestamp") > last_ts)
        .filter(f.col("ingestion_timestamp") <= upper_bound)
    )

    if not df_raw_orders.isEmpty():
        df_normalized = normalize_raw_schema(df_raw_orders)
        df_transformed = transform_orders(df_normalized)

        # Quarantine must be derived from the full transformed set. Building
        # it from the validated frame (critical rows already removed) would
        # make the CRITICAL branch unreachable and drop those rows silently.
        df_quarantine = (
            df_transformed
            .filter(f.col("is_critical") | f.col("is_warning"))
            .withColumn("severity_level", f.when(f.col("is_critical"), "CRITICAL").otherwise("WARNING"))
            .select("row_id", "quarantine_reason", "severity_level", "file_path", "ingestion_timestamp", "processing_timestamp")
        )

        # Only non-critical rows continue to enrichment; warning rows are
        # both quarantined (for visibility) and processed.
        df_validated = df_transformed.filter(~f.col("is_critical"))

        if not df_quarantine.isEmpty():
            print("Writing records to quarantine.")
            df_quarantine.write.format("delta").mode("append").saveAsTable(f"{CATALOG}.silver.{quarantine_table_name}")

        # Lookup dimensions, trimmed to the columns selected here.
        df_prod = spark.read.table(f"{CATALOG}.silver.products_enriched").select("product_id", "category", "sub_category")
        df_cust = spark.read.table(f"{CATALOG}.silver.customers_enriched").select("customer_id", "customer_name", "country")

        df_enriched = enrich_order_data(df_validated, df_prod, df_cust)
        # One row per order_id in the batch handed to the upsert.
        # NOTE(review): dropDuplicates keeps an arbitrary row per key —
        # confirm that is acceptable when a batch carries several versions.
        df_unique = df_enriched.dropDuplicates(["order_id"])

        upsert_order(
            spark,
            df_unique,
            f"{CATALOG}.silver.{enriched_order_table_name}",
        )

        optimize_partitions(
            spark,
            f"{CATALOG}.silver.{enriched_order_table_name}",
            df_unique,
            "year_month",
            "customer_name, category",
        )

        # Backfills replay a historical window and must not move the
        # incremental watermark.
        if not is_backfill:
            update_watermark(spark, raw_order_table_name, upper_bound)

        print(f"Finished processing batch up to {upper_bound}")
    else:
        print("No new data to process.")
except Exception as e:
    print(f"FAILED: Orders Enrichment. Error: {e}")
    # Re-raise so the job/orchestrator sees the failure; swallowing it here
    # would report a successful run even though nothing was committed.
    raise