In [0]:
# MAGIC %load_ext autoreload
# MAGIC %autoreload 2

In [0]:
import sys
import os
import pyspark.sql.functions as f
# Make the notebook's parent directory importable (presumably where the
# `transformations` package imported below lives -- confirm against the repo layout).
# Guarded so that re-running this cell does not keep appending duplicate entries.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [0]:
from transformations.customer_transforms import enrich_customers, upsert as upsert_customer
from transformations.product_transforms import enrich_products, upsert as upsert_product
from transformations.transform_utils import get_watermark, update_watermark, normalize_raw_schema, optimize_partitions
from transformations.order_transforms import transform_orders, enrich_order_data, upsert as upsert_order

In [0]:
# Catalog prefix for every three-part table name built in this notebook
# (f"{CATALOG}.bronze.<table>" and f"{CATALOG}.silver.<table>").
CATALOG = "pei"

#### Start Enrichment

##### 1. Customer Enrichment

In [0]:
# Bronze source table for customers; the same name is also the key passed to
# get_watermark / update_watermark.
raw_customer_table_name = "raw_customers"
# Silver table name passed to upsert_customer.
enriched_customer_table_name = "customers_enriched"

In [0]:
# Incremental customer load: bronze rows newer than the stored watermark are
# normalized, enriched and upserted into silver, then the watermark is advanced.
# NOTE(review): any failure is only printed, so the notebook (and any job running
# it) continues with the following cells -- confirm this best-effort behaviour
# is intended.
try:
    # The watermark is keyed by the raw table name.
    last_ts = get_watermark(spark, raw_customer_table_name)

    # Only rows ingested strictly after the last processed timestamp.
    raw_customers_df = (
        spark.read.table(f"{CATALOG}.bronze.{raw_customer_table_name}")
        .filter(f.col("ingestion_timestamp") > last_ts)
    )

    # isEmpty() answers "is there anything to do?" without counting every row
    # of the incremental slice (same check the order cell uses).
    if not raw_customers_df.isEmpty():
        normalized_df = normalize_raw_schema(raw_customers_df)

        enriched_customers_df = enrich_customers(normalized_df)

        upsert_customer(
            spark,
            enriched_customers_df,
            f"{CATALOG}.silver.{enriched_customer_table_name}",
        )

        # Advance the watermark only after the upsert call has returned, so a
        # failed upsert leaves the batch eligible for the next run.
        max_ts = enriched_customers_df.agg(f.max("ingestion_timestamp")).collect()[0][0]
        update_watermark(spark, raw_customer_table_name, max_ts)

        print(f"Finished processing Customers up to {max_ts}")
    else:
        print("No new Customer data to process.")
except Exception as e:
    print(f"FAILED: Customer Enrichment. Error: {str(e)}")

##### 2. Product Enrichment

In [0]:
# Bronze source table for products; the same name is also the key passed to
# get_watermark / update_watermark.
raw_product_table_name = "raw_products"
# Silver table name passed to upsert_product.
enriched_product_table_name = "products_enriched"

In [0]:
# Incremental product load: bronze rows newer than the stored watermark are
# normalized, enriched and upserted into silver, then the watermark is advanced.
# NOTE(review): any failure is only printed, so the notebook (and any job running
# it) continues with the following cells -- confirm this best-effort behaviour
# is intended.
try:
    # The watermark is keyed by the raw table name.
    last_ts = get_watermark(spark, raw_product_table_name)

    # Only rows ingested strictly after the last processed timestamp.
    raw_products_df = (
        spark.read.table(f"{CATALOG}.bronze.{raw_product_table_name}")
        .filter(f.col("ingestion_timestamp") > last_ts)
    )

    # isEmpty() answers "is there anything to do?" without counting every row
    # of the incremental slice (same check the order cell uses).
    if not raw_products_df.isEmpty():
        normalized_df = normalize_raw_schema(raw_products_df)

        enriched_products_df = enrich_products(normalized_df)

        upsert_product(
            spark,
            enriched_products_df,
            f"{CATALOG}.silver.{enriched_product_table_name}",
        )

        # The product watermark must come from the PRODUCT batch. It was
        # previously computed from enriched_customers_df, which either stored
        # the customers' timestamp here or raised NameError when the customer
        # cell had nothing to process. The raw slice is used (as in the order
        # cell) because it is known to carry ingestion_timestamp: the filter
        # above already references that column.
        max_ts = raw_products_df.agg(f.max("ingestion_timestamp")).collect()[0][0]
        update_watermark(spark, raw_product_table_name, max_ts)

        print(f"Finished processing Products up to {max_ts}")
    else:
        print("No new Product data to process.")
except Exception as e:
    print(f"FAILED: Product Enrichment. Error: {str(e)}")

##### 3. Order Enrichment

In [0]:
# Bronze source table for orders; the same name is also the key passed to
# get_watermark / update_watermark.
raw_order_table_name = "raw_orders"
# Silver table name passed to upsert_order and optimize_partitions.
enriched_order_table_name = "orders_enriched"
# Silver table that flagged (critical / warning) order rows are appended to.
quarantine_table_name = "orders_quarantine"

In [0]:
# Incremental order load: bronze rows newer than the stored watermark are
# normalized and transformed; flagged rows are appended to the quarantine table,
# non-critical rows are joined with product/customer attributes, de-duplicated
# on order_id and upserted into silver, then the watermark is advanced.
# NOTE(review): any failure is only printed, so the notebook (and any job running
# it) still finishes "successfully" -- confirm this best-effort behaviour is
# intended.
try:
    # The watermark is keyed by the raw table name.
    last_ts = get_watermark(spark, raw_order_table_name)
    df_orders_raw = (
        spark.read.table(f"{CATALOG}.bronze.{raw_order_table_name}")
        .filter(f.col("ingestion_timestamp") > last_ts)
    )

    if not df_orders_raw.isEmpty():
        df_normalized = normalize_raw_schema(df_orders_raw)
        df_transformed = transform_orders(df_normalized)

        # Only non-critical rows continue to silver; rows flagged is_warning
        # are still loaded (and are also recorded in quarantine below).
        df_validated = df_transformed.filter(~f.col("is_critical"))

        # Quarantine is built from df_transformed, NOT df_validated:
        # df_validated has already dropped every is_critical row, so deriving
        # the quarantine set from it would never record a critical row and the
        # "CRITICAL" severity branch could never fire.
        df_quarantine = (
            df_transformed
            .filter(f.col("is_critical") | f.col("is_warning"))
            .withColumn(
                "severity_level",
                f.when(f.col("is_critical"), "CRITICAL").otherwise("WARNING"),
            )
            .select(
                "row_id",
                "quarantine_reason",
                "severity_level",
                "file_path",
                "ingestion_timestamp",
                "processing_timestamp",
            )
        )

        if not df_quarantine.isEmpty():
            print("Writing records to quarantine.")
            (
                df_quarantine.write.format("delta")
                .mode("append")
                .saveAsTable(f"{CATALOG}.silver.{quarantine_table_name}")
            )

        # Lookup attributes from the silver dimension tables.
        df_prod = (
            spark.read.table(f"{CATALOG}.silver.products_enriched")
            .select("product_id", "category", "sub_category")
        )
        df_cust = (
            spark.read.table(f"{CATALOG}.silver.customers_enriched")
            .select("customer_id", "customer_name", "country")
        )

        df_enriched = enrich_order_data(df_validated, df_prod, df_cust)
        # One row per order_id is kept before the upsert.
        # NOTE(review): dropDuplicates keeps an arbitrary row per key -- confirm
        # that which duplicate survives does not matter.
        df_unique = df_enriched.dropDuplicates(["order_id"])

        upsert_order(
            spark,
            df_unique,
            f"{CATALOG}.silver.{enriched_order_table_name}",
        )

        optimize_partitions(
            spark,
            f"{CATALOG}.silver.{enriched_order_table_name}",
            df_unique,
            "year_month",
            "customer_name, category",
        )

        # The watermark comes from the raw slice, so quarantined rows also
        # count as processed and are not re-read on the next run.
        max_ts = df_orders_raw.agg(f.max("ingestion_timestamp")).collect()[0][0]
        update_watermark(spark, raw_order_table_name, max_ts)

        print(f"Finished processing batch up to {max_ts}")
    else:
        print("No new data to process.")
except Exception as e:
    print(f"FAILED: Orders Enrichment. Error: {str(e)}")