In [0]:
%load_ext autoreload
%autoreload 2

In [0]:
import pyspark.sql.functions as f
import os
import sys
sys.path.append(os.path.abspath('../..'))

In [0]:
# Unity Catalog coordinates of the pipeline's input (silver) and output (gold) tables.
CATALOG = "pei"
SOURCE_SCHEMA, SOURCE_TABLE_NAME = "silver", "orders_enriched"
TARGET_SCHEMA, TARGET_TABLE_NAME = "gold", "agg_sales_performance"

In [0]:
from utils.metadata_manager import get_last_processed_version, update_last_processed_version, get_latest_table_version
from data_writers.write_data import upsert_delta_table
from data_writers.maintenance import optimize_partitions

In [0]:
# Incrementally load the gold sales-performance aggregate from the silver orders table:
# read the source rows that changed since the last processed version, aggregate profit per
# (order_year, category, sub_category, customer_name), upsert into gold, then advance the
# processed-version watermark.

# source table latest version
latest_table_version = get_latest_table_version(spark, CATALOG, SOURCE_SCHEMA, SOURCE_TABLE_NAME)

# source table last processed version (the watermark)
last_processed_version = get_last_processed_version(spark, CATALOG, SOURCE_SCHEMA, SOURCE_TABLE_NAME)

# -1 means "nothing processed yet" and triggers a full refresh below.
# NOTE(review): this is a truthiness check, so a stored watermark of 0 also triggers a full
# refresh. Confirm what get_last_processed_version returns when no watermark exists before
# tightening this to an explicit `is None` comparison.
last_version = last_processed_version if last_processed_version else -1

if last_version >= latest_table_version:
    # The opening parenthesis must follow `exit` on the same line. Previously the message
    # was a separate expression statement, so exit() was never called and execution fell
    # through into the read logic with startingVersion > endingVersion.
    dbutils.notebook.exit(
        f"Table {TARGET_TABLE_NAME} is already up to date at version {last_version}."
    )

source_table = f"{CATALOG}.{SOURCE_SCHEMA}.{SOURCE_TABLE_NAME}"

# Set when the run ends early; the actual exit happens after the try block.
# NOTE(review): dbutils.notebook.exit appears to stop the notebook via an exception-like
# mechanism, which a broad `except Exception` can intercept and misreport as a failure.
# Calling it outside the try block is safe either way.
exit_message = None

try:
    if last_version == -1:
        # full refresh: read the whole silver table as of the latest version
        changes_df = (
            spark.read.format("delta")
            .option("versionAsOf", latest_table_version)
            .table(source_table)
        )
    else:
        # incremental: read inserted rows and the post-update image of updated rows from the
        # change data feed, for versions (last_version, latest_table_version]
        changes_df = (
            spark.read.format("delta")
            .option("readChangeFeed", "true")
            .option("startingVersion", last_version + 1)
            .option("endingVersion", latest_table_version)
            .table(source_table)
            .filter(f.col("_change_type").isin("insert", "update_postimage"))
        )

    if changes_df.isEmpty():
        # Nothing to aggregate; still advance the watermark so these versions are not re-read.
        update_last_processed_version(spark, CATALOG, SOURCE_SCHEMA, SOURCE_TABLE_NAME, latest_table_version)
        exit_message = "No changes found, Updated version to latest version"
    else:
        # aggregate on year, category, sub category and customer
        # NOTE(review): on incremental runs this sums only the changed rows. If
        # upsert_delta_table overwrites matched rows, total_profit for an existing key is
        # replaced by a partial sum (and update pre-images / deletes are ignored) - verify
        # the merge semantics of upsert_delta_table.
        df_batch_agg = (
            changes_df
            .groupBy("order_year", "category", "sub_category", "customer_name")
            .agg(f.sum("profit").alias("total_profit"))
        )

        # upsert to gold
        upsert_delta_table(
            spark_session=spark,
            df=df_batch_agg,
            target_table_name=f"{CATALOG}.{TARGET_SCHEMA}.{TARGET_TABLE_NAME}",
            join_key="order_year,category,sub_category,customer_name",
            partition_col="order_year"
        )

        # maintenance script (currently disabled)
        #optimize_partitions(spark,
        #                    f"{CATALOG}.{TARGET_SCHEMA}.{TARGET_TABLE_NAME}",
        #                    df_batch_agg,
        #                    "order_year",
        #                    "category,sub_category,customer_name")

        # update table version processed, only after the upsert succeeded
        update_last_processed_version(spark, CATALOG, SOURCE_SCHEMA, SOURCE_TABLE_NAME, latest_table_version)

except Exception as e:
    print(f"Pipeline failed. Watermark NOT updated. Error: {e}")
    # Re-raise so the notebook/job run is marked as failed instead of silently succeeding.
    raise

if exit_message is not None:
    dbutils.notebook.exit(exit_message)