# 📓 3. Apply Updates
## Update Benchmarking for TC.03.x–TC.08.x

This notebook simulates and benchmarks update strategies for both Delta and Warehouse tables:
- TC.03.x/TC.04.x: Full Refresh (replace table with latest state)
- TC.05.x/TC.06.x: Full Compare (append event history for insert/update/delete)
- TC.07.x/TC.08.x: Incremental Update (append event history from update slice)

Metrics are logged for each test case. All event log tables use `update_type` ('insert', 'update', 'delete').

In [None]:
import time
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType
from pyspark.sql.functions import col, lit

In [None]:
# Benchmark configuration: target Lakehouse/Warehouse, source files and table names
target_lakehouse = "BenchmarkLakehouse"
target_warehouse = "BenchmarkWarehouse"

row_count = 10000  # Change as needed

# Both source parquet files live under the same OneLake "Files" folder;
# row_count is embedded in each file name.
_source_files_root = "abfss://FabricBenchmarking@onelake.dfs.fabric.microsoft.com/DataSourceLakehouse.Lakehouse/Files"
base_file = f"{_source_files_root}/base/base_{row_count}_parquet.parquet"
updates_file = f"{_source_files_root}/updates/updates_{row_count}_parquet.parquet"

# Target table per update strategy, keyed by strategy name
delta_tables = dict(
    refresh="delta_refresh_load",
    compare="delta_compare_load",
    increment="delta_increment_load",
)
warehouse_tables = dict(
    refresh="wh_table_refresh_load",
    compare="wh_table_compare_load",
    increment="wh_table_increment_load",
)

In [None]:
# Metrics schema (already exists in BenchmarkLakehouse)
# One (column name, Spark type) pair per metrics column, in table order.
_metric_columns = [
    ("test_case_id", StringType()),
    ("timestamp", TimestampType()),
    ("format", StringType()),
    ("location", StringType()),
    ("rows", IntegerType()),
    ("update_strategy", StringType()),
    ("ingest_time_s", FloatType()),
    ("spinup_time_s", FloatType()),
    ("storage_size_mb", FloatType()),
    ("query_type", StringType()),
    ("query_time_s", FloatType()),
    ("cu_used", FloatType()),
    ("notes", StringType()),
]

# Every column is declared nullable.
metrics_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _metric_columns]
)

## TC.03.x: Full Refresh — Delta Table

Simulates a full refresh by applying all updates to base data and replacing the table with the latest state.

In [None]:
# TC.03.x: build the latest state from base + updates, overwrite the Delta
# table with it, then log one metrics row.

# Load base and updates
base_df = spark.read.format("parquet").load(base_file)
updates_df = spark.read.format("parquet").load(updates_file)

# Apply updates to base (simulate latest state)
from pyspark.sql import Window
from pyspark.sql.functions import row_number

# allowMissingColumns=True: a column present on only one side is filled with
# NULL on the other side.
df_all = base_df.unionByName(updates_df, allowMissingColumns=True)

# Latest record per id: number each id's rows by ts_1, newest first.
window = Window.partitionBy("id").orderBy(col("ts_1").desc())
df_all = df_all.withColumn("rn", row_number().over(window))

# Keep the newest row per id unless it is a delete event.
# The NULL check matters: `NULL != 'delete'` evaluates to NULL, and filter()
# drops such rows, so rows without an update_type would otherwise be lost
# from the refreshed table.
not_deleted = col("update_type").isNull() | (col("update_type") != "delete")
df_current = df_all.filter(not_deleted & (col("rn") == 1)).drop("rn")

# Timer start (Spark evaluates lazily, so the timed write also includes
# reading the source files and computing the latest state)
tc03_start = time.time()
df_current.write.mode("overwrite").saveAsTable(f"{target_lakehouse}.{delta_tables['refresh']}")
tc03_end = time.time()
tc03_ingest_time = tc03_end - tc03_start

# Storage size (best effort: NaN is recorded when it cannot be measured)
try:
    try:
        import mssparkutils
    except ImportError:
        # Fall back to the notebookutils package when the bare module is not importable.
        from notebookutils import mssparkutils
    table_path = f"/lakehouse/{target_lakehouse}/Tables/{delta_tables['refresh']}"
    # NOTE(review): only the entries returned by fs.ls for the table folder
    # itself are summed; confirm whether nested folders should be included
    # and whether this path form is valid for the target Lakehouse.
    storage_files = mssparkutils.fs.ls(table_path)
    storage_size_mb = sum(f.size for f in storage_files) / (1024 * 1024)
except Exception as storage_err:
    # Keep the benchmark running, but say why the size is missing.
    print(f"TC.03.x: storage size unavailable ({storage_err!r})")
    storage_size_mb = float('nan')

# Metrics (float('nan') marks values this test case does not measure)
metrics_tc03 = [
    (
        "TC.03.x",
        datetime.now(),
        "Delta",
        "Tables",
        df_current.count(),  # evaluated after the timer stopped, so not part of ingest time
        "Full Refresh",
        tc03_ingest_time,
        float('nan'),
        storage_size_mb,
        "N/A",
        float('nan'),
        float('nan'),
        "Full refresh with applied updates"
    )
]
spark.createDataFrame(metrics_tc03, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
print(f"TC.03.x (Delta Full Refresh) complete | Ingest time: {tc03_ingest_time:.2f}s | Storage: {storage_size_mb:.2f} MB")

## TC.04.x: Full Refresh — Warehouse Table

Simulates a full refresh by replacing the warehouse table with the latest state.

In [None]:
# TC.04.x: overwrite the warehouse table with the latest state and log one metrics row.
# Constants is not referenced in this cell; presumably the import is what makes
# `synapsesql` available on the DataFrame writer (confirm against the Fabric
# Spark connector documentation).
from com.microsoft.spark.fabric import Constants

# Warehouse compatibility: convert every timestamp_ntz column to timestamp
for column_name, column_type in df_current.dtypes:
    if column_type == "timestamp_ntz":
        df_current = df_current.withColumn(column_name, col(column_name).cast("timestamp"))

# Timed section: only the warehouse write
tc04_start = time.time()
tc04_writer = df_current.write.mode("overwrite")
tc04_writer.synapsesql(f"{target_warehouse}.dbo.{warehouse_tables['refresh']}")
tc04_end = time.time()
tc04_ingest_time = tc04_end - tc04_start

# Metrics (NaN marks values this test case does not measure)
tc04_nan = float('nan')
metrics_tc04 = [
    (
        "TC.04.x", datetime.now(), "Warehouse", "Tables",
        df_current.count(), "Full Refresh", tc04_ingest_time,
        tc04_nan, tc04_nan, "N/A", tc04_nan, tc04_nan,
        "Full refresh with applied updates",
    )
]
tc04_metrics_df = spark.createDataFrame(metrics_tc04, schema=metrics_schema)
tc04_metrics_df.write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
print(f"TC.04.x (Warehouse Full Refresh) complete | Ingest time: {tc04_ingest_time:.2f}s")

## TC.05.x: Full Compare — Delta Table

Compares current data to the existing event log and appends insert/update/delete history. For benchmarking, the cell below uses the update batch as the set of change events.

In [None]:
# TC.05.x: append the update batch to the Delta "compare" event log and log
# one metrics row.

# For benchmarking, use updates_df as batch of changes for event log
tc05_start = time.time()
updates_df.write.mode("append").saveAsTable(f"{target_lakehouse}.{delta_tables['compare']}")
tc05_end = time.time()
tc05_ingest_time = tc05_end - tc05_start

# Storage size (best effort: NaN is recorded when it cannot be measured)
try:
    try:
        import mssparkutils
    except ImportError:
        # Fall back to the notebookutils package when the bare module is not importable.
        from notebookutils import mssparkutils
    table_path = f"/lakehouse/{target_lakehouse}/Tables/{delta_tables['compare']}"
    # NOTE(review): only the entries returned by fs.ls for the table folder
    # itself are summed; confirm whether nested folders should be included
    # and whether this path form is valid for the target Lakehouse.
    storage_files = mssparkutils.fs.ls(table_path)
    storage_size_mb = sum(f.size for f in storage_files) / (1024 * 1024)
except Exception as storage_err:
    # Keep the benchmark running, but say why the size is missing.
    print(f"TC.05.x: storage size unavailable ({storage_err!r})")
    storage_size_mb = float('nan')

# Metrics (float('nan') marks values this test case does not measure)
metrics_tc05 = [
    (
        "TC.05.x",
        datetime.now(),
        "Delta",
        "Tables",
        updates_df.count(),  # size of the appended batch, not of the whole event log
        "Full Compare",
        tc05_ingest_time,
        float('nan'),
        storage_size_mb,
        "N/A",
        float('nan'),
        float('nan'),
        "Full compare: append events to event log"
    )
]
spark.createDataFrame(metrics_tc05, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
print(f"TC.05.x (Delta Compare) complete | Ingest time: {tc05_ingest_time:.2f}s | Storage: {storage_size_mb:.2f} MB")

## TC.06.x: Full Compare — Warehouse Table

Appends event history to the warehouse event log table.

In [None]:
# TC.06.x: append the update batch to the warehouse "compare" event log and
# log one metrics row.

# Warehouse compatibility: convert every timestamp_ntz column to timestamp
# (updates_df is rebound to the converted DataFrame)
for column_name, column_type in updates_df.dtypes:
    if column_type == "timestamp_ntz":
        updates_df = updates_df.withColumn(column_name, col(column_name).cast("timestamp"))

# Timed section: only the warehouse append
tc06_start = time.time()
tc06_writer = updates_df.write.mode("append")
tc06_writer.synapsesql(f"{target_warehouse}.dbo.{warehouse_tables['compare']}")
tc06_end = time.time()
tc06_ingest_time = tc06_end - tc06_start

# Metrics (NaN marks values this test case does not measure)
tc06_nan = float('nan')
metrics_tc06 = [
    (
        "TC.06.x", datetime.now(), "Warehouse", "Tables",
        updates_df.count(), "Full Compare", tc06_ingest_time,
        tc06_nan, tc06_nan, "N/A", tc06_nan, tc06_nan,
        "Full compare: append events to event log",
    )
]
tc06_metrics_df = spark.createDataFrame(metrics_tc06, schema=metrics_schema)
tc06_metrics_df.write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
print(f"TC.06.x (Warehouse Compare) complete | Ingest time: {tc06_ingest_time:.2f}s")

## TC.07.x: Incremental Update — Delta Table

Appends all update events to the incremental Delta event log table.

In [None]:
# TC.07.x: append the update batch to the Delta "increment" event log and log
# one metrics row.
tc07_start = time.time()
updates_df.write.mode("append").saveAsTable(f"{target_lakehouse}.{delta_tables['increment']}")
tc07_end = time.time()
tc07_ingest_time = tc07_end - tc07_start

# Storage size (best effort: NaN is recorded when it cannot be measured)
try:
    try:
        import mssparkutils
    except ImportError:
        # Fall back to the notebookutils package when the bare module is not importable.
        from notebookutils import mssparkutils
    table_path = f"/lakehouse/{target_lakehouse}/Tables/{delta_tables['increment']}"
    # NOTE(review): only the entries returned by fs.ls for the table folder
    # itself are summed; confirm whether nested folders should be included
    # and whether this path form is valid for the target Lakehouse.
    storage_files = mssparkutils.fs.ls(table_path)
    storage_size_mb = sum(f.size for f in storage_files) / (1024 * 1024)
except Exception as storage_err:
    # Keep the benchmark running, but say why the size is missing.
    print(f"TC.07.x: storage size unavailable ({storage_err!r})")
    storage_size_mb = float('nan')

# Metrics (float('nan') marks values this test case does not measure)
metrics_tc07 = [
    (
        "TC.07.x",
        datetime.now(),
        "Delta",
        "Tables",
        updates_df.count(),  # size of the appended batch, not of the whole event log
        "Incremental",
        tc07_ingest_time,
        float('nan'),
        storage_size_mb,
        "N/A",
        float('nan'),
        float('nan'),
        "Incremental: append events to event log"
    )
]
spark.createDataFrame(metrics_tc07, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
print(f"TC.07.x (Delta Incremental) complete | Ingest time: {tc07_ingest_time:.2f}s | Storage: {storage_size_mb:.2f} MB")

## TC.08.x: Incremental Update — Warehouse Table

Appends all update events to the incremental warehouse event log table.

In [None]:
# TC.08.x: append the update batch to the warehouse "increment" event log and
# log one metrics row.

# Warehouse compatibility: convert every timestamp_ntz column to timestamp
# (updates_df is rebound to the converted DataFrame)
for column_name, column_type in updates_df.dtypes:
    if column_type == "timestamp_ntz":
        updates_df = updates_df.withColumn(column_name, col(column_name).cast("timestamp"))

# Timed section: only the warehouse append
tc08_start = time.time()
tc08_writer = updates_df.write.mode("append")
tc08_writer.synapsesql(f"{target_warehouse}.dbo.{warehouse_tables['increment']}")
tc08_end = time.time()
tc08_ingest_time = tc08_end - tc08_start

# Metrics (NaN marks values this test case does not measure)
tc08_nan = float('nan')
metrics_tc08 = [
    (
        "TC.08.x", datetime.now(), "Warehouse", "Tables",
        updates_df.count(), "Incremental", tc08_ingest_time,
        tc08_nan, tc08_nan, "N/A", tc08_nan, tc08_nan,
        "Incremental: append events to event log",
    )
]
tc08_metrics_df = spark.createDataFrame(metrics_tc08, schema=metrics_schema)
tc08_metrics_df.write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
print(f"TC.08.x (Warehouse Incremental) complete | Ingest time: {tc08_ingest_time:.2f}s")

In [None]:
# Final status lines for the notebook run
for status_line in (
    "Completion",
    "Updates applied and metrics logged for TC.03.x through TC.08.x.",
):
    print(status_line)