In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable

# Silver-layer orders stream, with a 1-minute event-time watermark on
# order_timestamp for late-arrival handling.
orders_df = (
    spark.readStream
    .table("oms_analytics.silver.orders")
    .alias("orders")
    .withWatermark("order_timestamp", "1 minutes")
)

# Silver-layer order-items stream. Its timestamp column is renamed before the
# watermark is applied so it cannot clash with the orders-side
# "order_timestamp" once the two streams are joined.
order_items_df = (
    spark.readStream
    .table("oms_analytics.silver.order_items")
    .alias("order_items")
    .withColumnRenamed("order_timestamp", "order_item_timestamp")
    .withWatermark("order_item_timestamp", "1 minutes")
)

# Equi-join the two streams on their shared order_id column (default join type).
joined_df = orders_df.join(order_items_df, on="order_id")

# Roll the joined stream up per date / customer / product within 1-minute
# windows over order_timestamp.
grouping_cols = ["date_id", "customer_id", "product_id"]
sales_window = F.window("order_timestamp", "1 minutes")
sales_metrics = [
    F.sum("quantity").alias("items_sold"),
    F.sum("line_total").alias("sales_amount"),
]

aggregated_df = (
    joined_df
    .groupBy(*grouping_cols, sales_window)
    .agg(*sales_metrics)
    # The window struct is only needed for grouping; it is not part of the output.
    .drop("window")
)

# Surrogate key: SHA-256 over "date_id_customer_id_product_id", with each part
# cast to string before being joined with "_".
surrogate_key_parts = [
    F.col(part).cast("string")
    for part in ("date_id", "customer_id", "product_id")
]
surrogate_key_expr = F.sha2(F.concat_ws("_", *surrogate_key_parts), 256)

# Column order of the rows handed to the sink.
output_columns = [
    "surrogate_key",
    "customer_id",
    "date_id",
    "product_id",
    "items_sold",
    "sales_amount",
    "process_id",
    "gold_load_ts",
]

# Attach the surrogate key and audit columns, then project the final layout.
final_df = (
    aggregated_df
    .withColumn("surrogate_key", surrogate_key_expr)
    .withColumn("process_id", F.lit("de_nb_102"))
    .withColumn("gold_load_ts", F.current_timestamp())
    .select(*output_columns)
)


# Checkpoint directory for the daily_sales_fact stream, rooted at the
# external storage location.
external_location_name = "abfss://orders@omslanding.dfs.core.windows.net"
checkpoint_location_daily_sales_fact = "/".join(
    [external_location_name, "checkpoints", "gold_loader", "daily_sales_fact"]
)

# Define the function to upsert into Delta table
def upsert_to_delta(final_df, batchId):
    """Upsert one micro-batch of aggregated sales into the gold fact table.

    Intended as a ``foreachBatch`` handler. The batch is collapsed to a single
    row per (customer_id, date_id, product_id) before it is written: the
    upstream aggregation groups by a time window and then drops the window
    column, so one micro-batch can carry several rows for the same key, and a
    Delta MERGE fails when more than one source row updates the same target
    row.

    If the target table does not exist it is created from the batch;
    otherwise matching rows have the batch totals added to the stored totals
    and non-matching rows are inserted.

    Args:
        final_df: Micro-batch DataFrame with the columns surrogate_key,
            customer_id, date_id, product_id, items_sold, sales_amount,
            process_id and gold_load_ts.
        batchId: Identifier of the micro-batch; only printed.
    """
    target_table = "oms_analytics.gold.daily_sales_fact"
    key_cols = ["customer_id", "date_id", "product_id"]
    output_cols = [
        "surrogate_key",
        "customer_id",
        "date_id",
        "product_id",
        "items_sold",
        "sales_amount",
        "process_id",
        "gold_load_ts",
    ]

    # The batch feeds more than one action (count, then write/merge). Cache it
    # so the upstream streaming plan is not re-evaluated for each action.
    final_df.persist()
    try:
        print("Batch ID:", batchId)
        print("Record Count:", final_df.count())

        # One row per business key. surrogate_key is derived upstream from the
        # same three key columns and process_id is a constant, so any row of
        # the group carries the right value for them.
        batch_df = (
            final_df
            .groupBy(*key_cols)
            .agg(
                F.first("surrogate_key").alias("surrogate_key"),
                F.sum("items_sold").alias("items_sold"),
                F.sum("sales_amount").alias("sales_amount"),
                F.first("process_id").alias("process_id"),
                F.max("gold_load_ts").alias("gold_load_ts"),
            )
            .select(*output_cols)
        )

        if not spark.catalog.tableExists(target_table):
            # First run: the batch itself becomes the initial table contents.
            batch_df.write.format("delta").saveAsTable(target_table)
        else:
            merge_condition = " AND ".join(
                f"tgt.{col} = src.{col}" for col in key_cols
            )
            # NOTE(review): the matched branch adds to the stored totals, so a
            # micro-batch that is replayed after a failure would be counted
            # twice. Consider making the write idempotent per batchId.
            (
                DeltaTable.forName(spark, target_table)
                .alias("tgt")
                .merge(batch_df.alias("src"), merge_condition)
                .whenMatchedUpdate(
                    set={
                        "items_sold": "tgt.items_sold + src.items_sold",
                        "sales_amount": "tgt.sales_amount + src.sales_amount",
                        "gold_load_ts": "src.gold_load_ts",
                    }
                )
                .whenNotMatchedInsert(
                    values={col: f"src.{col}" for col in output_cols}
                )
                .execute()
            )
    finally:
        # Release the cached batch even when the write or merge fails.
        final_df.unpersist()

# Configure the streaming sink: append-mode output, with every micro-batch
# handed to upsert_to_delta and progress tracked at the checkpoint location.
stream_writer = (
    final_df.writeStream
    .outputMode("append")
    .option("checkpointLocation", checkpoint_location_daily_sales_fact)
    .foreachBatch(upsert_to_delta)
)
query = stream_writer.start()

# Block the caller until the streaming query terminates.
query.awaitTermination()