In [0]:
# Declare the notebook's two text widgets. Both default to an empty string;
# the values are read back later via dbutils.widgets.get.
for widget_name, widget_label in (
    ("start_version", "Manual Start Version (Backfill)"),
    ("end_version", "Manual End Version"),
):
    dbutils.widgets.text(widget_name, "", widget_label)

In [0]:
# Enable IPython's autoreload extension in mode 2, so edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [0]:
import sys
import os
import pyspark.sql.functions as f
from datetime import datetime
sys.path.append(os.path.abspath('../..'))

In [0]:
from transformations.order_transforms import transform_orders, enrich_order_data
from data_writers.write_data import upsert_delta_table
from utils.transform_utils import normalize_raw_schema
from utils.metadata_manager  import get_last_processed_version, update_last_processed_version, get_latest_table_version, get_pipeline_version_range
from data_writers.maintenance import optimize_partitions

In [0]:
# Read both widget values in one pass, trimming surrounding whitespace so a
# blank or whitespace-only entry becomes the empty string.
start_val, end_val = (
    dbutils.widgets.get(widget_name).strip()
    for widget_name in ("start_version", "end_version")
)

In [0]:
# Schema holding the raw source table; combined with CATALOG and the table
# name to form the three-part name that is read below.
SOURCE_SCHEMA: str = "bronze"
# Catalog prefix shared by the source table and the silver target tables.
CATALOG: str = "pei"

In [0]:
# Unqualified table names; the catalog and schema are prepended where used.
# Quarantine and enriched tables are written under the "silver" schema,
# the raw table is read from SOURCE_SCHEMA.
quarantine_table_name: str = "orders_quarantine"
enriched_order_table_name: str = "orders_enriched"
raw_order_table_name: str = "raw_orders"

In [0]:
# Orders enrichment: read the selected version range of the raw orders table,
# quarantine flagged rows, enrich the remaining rows with product and customer
# attributes, and upsert the result into the silver enriched-orders table.
try:
    # A non-empty "start_version" widget marks a manual backfill run.
    # start_val was already stripped when it was read from the widget.
    is_backfill = start_val != ""

    # get start and end versions for backfill or incremental processing
    start_version, end_version = get_pipeline_version_range(
        spark, CATALOG, SOURCE_SCHEMA, raw_order_table_name, start_val, end_val
    )

    # NOTE(review): if get_pipeline_version_range can return start > end when an
    # incremental run simply has no new commits, that case should be handled as
    # a no-op rather than an error -- confirm against the helper.
    if start_version > end_version:
        raise ValueError(f"Invalid start and end versions. Start version: {start_version} is greater than end version: {end_version}.")

    # NOTE(review): startingVersion/endingVersion are documented for change data
    # feed reads (readChangeFeed enabled) and streaming sources; confirm that this
    # plain batch read is really restricted to the requested version range.
    df_raw_orders = (
        spark.read.format("delta")
        .option("startingVersion", start_version)
        .option("endingVersion", end_version)
        .table(f"{CATALOG}.{SOURCE_SCHEMA}.{raw_order_table_name}")
    )

    # if changes, process them
    if not df_raw_orders.isEmpty():
        df_normalized = normalize_raw_schema(df_raw_orders)
        df_transformed = transform_orders(df_normalized)

        # Critical rows are kept out of the silver load; warning rows still flow through.
        df_validated = df_transformed.filter(~f.col("is_critical"))

        # Construct the quarantine dataframe from the unfiltered transformed data.
        # df_validated has already dropped every critical row, so deriving the
        # quarantine set from it would record warnings only and the CRITICAL
        # severity branch could never fire.
        df_quarantine = (
            df_transformed
            .filter(f.col("is_critical") | f.col("is_warning"))
            .withColumn("severity_level", f.when(f.col("is_critical"), "CRITICAL").otherwise("WARNING"))
            .select("row_id", "quarantine_reason", "severity_level", "file_path", "ingestion_timestamp", "processing_timestamp")
        )

        # write quarantine data for later analysis
        if not df_quarantine.isEmpty():
            print("Writing records to quarantine.")
            df_quarantine.write.format("delta").mode("append").saveAsTable(f"{CATALOG}.silver.{quarantine_table_name}")

        df_prod = spark.read.table(f"{CATALOG}.silver.products_enriched").select("product_id", "category", "sub_category")
        df_cust = spark.read.table(f"{CATALOG}.silver.customers_enriched").select("customer_id", "customer_name", "country")

        # enrich orders data with customer and product details
        df_enriched = enrich_order_data(df_validated, df_prod, df_cust)

        # One row per order_id so the upsert sees a single source row per key.
        # NOTE(review): dropDuplicates keeps an arbitrary row per key; if a batch can
        # carry several versions of the same order, select the latest one explicitly.
        df_unique = df_enriched.dropDuplicates(["order_id"])

        # upsert to silver table
        upsert_delta_table(
            spark_session=spark,
            df=df_unique,
            target_table_name=f"{CATALOG}.silver.{enriched_order_table_name}",
            join_key="order_id",
            partition_col="year_month"
        )

        # Usually this will be part of the maintenance job.
        #optimize_partitions(spark, 
        #                    f"{CATALOG}.silver.{enriched_order_table_name}", 
        #                    df_unique, 
        #                    "year_month", 
        #                    "category,sub_category,customer_name")

        if not is_backfill:
            # update version for incremental run; backfills leave the stored
            # last-processed version untouched
            update_last_processed_version(spark, CATALOG, SOURCE_SCHEMA, raw_order_table_name, end_version)

        print(f"Finished processing batch up to {end_version}")
    else:
        print("No new data to process.")
except Exception as e:
    print(f"FAILED: Orders Enrichment. Error: {str(e)}")
    # Re-raise after logging so the run is reported as failed instead of
    # finishing as if it had succeeded.
    raise