In [0]:
# Declare the notebook's two free-text widgets. Both default to an empty
# string; the values are read back further down to choose the version range.
for _widget_name, _widget_label in (
    ("start_version", "Manual Start Version (Backfill)"),
    ("end_version", "Manual End Version"),
):
    dbutils.widgets.text(_widget_name, "", _widget_label)

In [0]:
%load_ext autoreload
%autoreload 2

In [0]:
import sys
import os
import pyspark.sql.functions as f
from datetime import datetime
# Make the directory two levels above the working directory importable
# (presumably where the transformations / data_writers / utils packages live --
# confirm against the repo layout). The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
_project_root = os.path.abspath('../..')
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [0]:
from transformations.customer_transforms import transform_customers
from data_writers.write_data import upsert_delta_table
from utils.transform_utils import normalize_raw_schema
from utils.metadata_manager  import get_last_processed_version, update_last_processed_version, get_latest_table_version, get_pipeline_version_range

In [0]:
# Read the widget inputs back and trim surrounding whitespace. An empty
# start value is what the processing cell treats as "not a backfill".
_start_widget_text = dbutils.widgets.get("start_version")
start_val = _start_widget_text.strip()

_end_widget_text = dbutils.widgets.get("end_version")
end_val = _end_widget_text.strip()

In [0]:
# First component of every fully qualified table name built in this notebook.
CATALOG: str = "pei"

# Schema of the raw source table that is read and version-tracked.
SOURCE_SCHEMA: str = "bronze"

In [0]:
# Unqualified name of the source table read from CATALOG.SOURCE_SCHEMA.
raw_customer_table_name: str = "raw_customers"

# Unqualified name of the table the enriched rows are upserted into.
enriched_customer_table_name: str = "customers_enriched"

In [0]:
# Columns, in output order, that the enriched DataFrame is projected to
# before it is upserted into the target table.
CUSTOMER_TARGET_COLUMNS = [
    "customer_id",
    "customer_name",
    "first_name",
    "last_name",
    "email",
    "phone",
    "address",
    "segment",
    "country",
    "city",
    "state",
    "postal_code",
    "region",
    "file_path",
    "ingestion_timestamp",
    "processing_timestamp",
]

In [0]:
# Customer enrichment step:
#   1. resolve the version range to process (manual backfill or incremental),
#   2. read the raw customers table for that range,
#   3. normalise + transform the rows and project them to CUSTOMER_TARGET_COLUMNS,
#   4. upsert the result into the silver customers table keyed on customer_id,
#   5. for non-backfill runs, record end_version as the last processed version.
# Any failure is logged and then re-raised so the notebook run is reported as
# failed instead of silently finishing "successfully".
try:
    # A non-empty start_version widget marks this run as a manual backfill.
    is_backfill = start_val.strip() != ""

    # get start and end versions for backfill or incremental processing
    start_version, end_version = get_pipeline_version_range(
        spark, CATALOG, SOURCE_SCHEMA, raw_customer_table_name, start_val, end_val
    )

    if start_version > end_version:
        # ValueError is still an Exception, so the handler below logs it as before.
        raise ValueError(f"Invalid start and end versions. Start version: {start_version} is greater than end version: {end_version}.")

    # NOTE(review): startingVersion / endingVersion are documented for Delta
    # change-data-feed batch reads (together with readChangeFeed=true) and for
    # streaming sources. Confirm that this plain batch read is really limited
    # to the requested version range and is not scanning the whole table.
    df_raw_customers = (
        spark.read.format("delta")
        .option("startingVersion", start_version)
        .option("endingVersion", end_version)
        .table(f"{CATALOG}.{SOURCE_SCHEMA}.{raw_customer_table_name}")
    )

    # if there are changes, process them
    if not df_raw_customers.isEmpty():
        df_normalized = normalize_raw_schema(df_raw_customers)

        df_enriched = transform_customers(df_normalized)
        # Enforce the target column set and order before writing.
        df_enforced = df_enriched.select(*CUSTOMER_TARGET_COLUMNS)

        upsert_delta_table(
            spark_session=spark,
            df=df_enforced,
            target_table_name=f"{CATALOG}.silver.{enriched_customer_table_name}",
            join_key="customer_id",
        )

        # Backfills replay a manually chosen range, so they must not move the
        # stored last-processed version used by incremental runs.
        if not is_backfill:
            update_last_processed_version(spark, CATALOG, SOURCE_SCHEMA, raw_customer_table_name, end_version)

        print(f"Finished processing Customers up to {end_version}")
    else:
        print("No new Customer data to process.")
except Exception as e:
    print(f"FAILED: Customer Enrichment. Error: {str(e)}")
    # Propagate the failure: swallowing it here would let a scheduled job or
    # calling workflow treat a failed enrichment as a success.
    raise