# 📓 3. Apply Updates
## Update Benchmarking for TC.03.x–TC.08.x
### Ensure `DataSourceLakehouse` & `BenchmarkLakehouse` are connected as data sources before running.

This notebook simulates and benchmarks update strategies for both Delta and Warehouse tables:
- TC.03.x/TC.04.x: Full Refresh (replace table with latest state)
- TC.05.x/TC.06.x: Full Compare (append event history for insert/update/delete)
- TC.07.x/TC.08.x: Incremental Update (append event history from update slice)

Metrics are logged for each test case. All event log tables use `update_type` ('insert', 'update', 'delete').

Note: `storage_size_mb` and `cu_used` hold integer row counts as lightweight proxies; they are not actual megabytes or capacity units.

In [None]:
import time
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType
from pyspark.sql.functions import col, lit

In [None]:
# Helper function to make columns nullable in a Spark DataFrame
from pyspark.sql.types import StructType, StructField

def make_columns_nullable(df, columns=None):
    """
    Return `df` with the specified columns (or all columns if None) set to nullable=True in the schema.

    Args:
        df: Spark DataFrame whose schema should be relaxed.
        columns: A single column name, an iterable of column names, or None
            to target every column. Names not present in the schema are ignored.

    Returns:
        A DataFrame with the same rows, column types and field metadata.
        `df` itself is returned when every targeted column is already
        nullable, so no rebuild is performed in that case.
    """
    fields = df.schema.fields
    if columns is None:
        targets = {f.name for f in fields}
    elif isinstance(columns, str):
        # A bare string must be treated as one name; `in` on a str would
        # match substrings (e.g. "type" in "update_type").
        targets = {columns}
    else:
        targets = set(columns)

    # Nothing to relax: skip the RDD round trip below.
    if not any(f.name in targets and not f.nullable for f in fields):
        return df

    new_schema = StructType([
        # Carry the field metadata over so only nullability changes.
        StructField(f.name, f.dataType, True, f.metadata) if f.name in targets else f
        for f in fields
    ])
    return df.sparkSession.createDataFrame(df.rdd, schema=new_schema)

In [None]:
# Destination lakehouse / warehouse names
target_lakehouse = "BenchmarkLakehouse"
target_warehouse = "BenchmarkWarehouse"

# Dataset size; it is embedded in the source parquet file names below
row_count = 10000  # Change as needed

# Source parquet files (base snapshot and update events) in DataSourceLakehouse
base_file = (
    "abfss://FabricBenchmarking@onelake.dfs.fabric.microsoft.com/"
    "DataSourceLakehouse.Lakehouse/Files/base/"
    f"base_{row_count}_parquet.parquet"
)
updates_file = (
    "abfss://FabricBenchmarking@onelake.dfs.fabric.microsoft.com/"
    "DataSourceLakehouse.Lakehouse/Files/updates/"
    f"updates_{row_count}_parquet.parquet"
)

# Target table name per update strategy, for Delta and Warehouse respectively
delta_tables = dict(
    refresh="delta_refresh_load",
    compare="delta_compare_load",
    increment="delta_increment_load",
)
warehouse_tables = dict(
    refresh="wh_table_refresh_load",
    compare="wh_table_compare_load",
    increment="wh_table_increment_load",
)

In [None]:
# Schema of the shared metrics table: one nullable column per (name, Spark type)
# pair, in this order. storage_size_mb and cu_used are integer row-count proxies.
metrics_schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("test_case_id", StringType),
        ("timestamp", TimestampType),
        ("format", StringType),
        ("location", StringType),
        ("rows", IntegerType),
        ("update_strategy", StringType),
        ("ingest_time_s", FloatType),
        ("spinup_time_s", FloatType),
        ("storage_size_mb", IntegerType),
        ("query_type", StringType),
        ("query_time_s", FloatType),
        ("cu_used", IntegerType),
        ("notes", StringType),
    )
])

## TC.03.x: Full Refresh — Delta Table

Simulates a full refresh by applying all updates to base data and replacing the table with the latest state.

In [None]:
# Load base and updates using spark.read.parquet for correct timestamp type
base_df = spark.read.parquet(base_file)
updates_df = spark.read.parquet(updates_file)

# Apply updates to base (simulate latest state)
from pyspark.sql import Window
from pyspark.sql.functions import row_number

# Columns present on only one side are filled with NULL by allowMissingColumns
df_all = base_df.unionByName(updates_df, allowMissingColumns=True)

# Latest record per id (ordered by ts_1, newest first)
window = Window.partitionBy("id").orderBy(col("ts_1").desc())
df_all = df_all.withColumn("rn", row_number().over(window))

# Keep the latest record unless it is a delete. The comparison is null-safe:
# a plain `!= "delete"` evaluates to NULL for rows whose update_type is NULL
# (e.g. base rows without that column) and would silently drop them.
df_current = df_all.filter(
    (col("rn") == 1) & ~col("update_type").eqNullSafe("delete")
).drop("rn")

# For full refresh, set all update_type to 'insert' (best practice for snapshot)
df_current = df_current.withColumn("update_type", lit("insert"))

# Time only the table write
tc03_start = time.time()
df_current.write.mode("overwrite").saveAsTable(f"{target_lakehouse}.{delta_tables['refresh']}")
tc03_end = time.time()
tc03_ingest_time = tc03_end - tc03_start

# Only the final row count feeds the metrics; counting base_df / updates_df
# here would trigger extra scans whose results were never used.
final_count = df_current.count()

# Storage proxy: always the row count of the refreshed snapshot, so the
# storage_size_mb column keeps one unit (rows) regardless of environment.
storage_size_mb = int(final_count)

# cu_used proxy: configured base row_count + rows written
metrics_tc03 = [
    (
        "TC.03.x",
        datetime.now(),
        "Delta",
        "Tables",
        int(final_count),
        "Full Refresh",
        tc03_ingest_time,
        None,
        int(storage_size_mb),
        "N/A",
        None,
        int(row_count + final_count),  # rows processed proxy
        "Full refresh with applied updates"
    )
]
spark.createDataFrame(metrics_tc03, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
print(f"TC.03.x (Delta Full Refresh) complete | Ingest time: {tc03_ingest_time:.2f}s | Storage(rows proxy): {storage_size_mb}")

## TC.04.x: Full Refresh — Warehouse Table

Simulates a full refresh by replacing the warehouse table with the latest state.

In [None]:
# NOTE(review): this import is presumably what makes `.synapsesql` available
# on the DataFrame writer; Constants itself is not referenced here - confirm.
from com.microsoft.spark.fabric import Constants

# For full refresh, set all update_type to 'insert'
df_current = df_current.withColumn("update_type", lit("insert"))

# Ensure update_type is nullable to match warehouse schema
# (lit() yields a non-nullable column)
df_current = make_columns_nullable(df_current, columns=["update_type"])

# Time only the warehouse write
tc04_start = time.time()
df_current.write.mode("overwrite").synapsesql(f"{target_warehouse}.dbo.{warehouse_tables['refresh']}")
tc04_end = time.time()
tc04_ingest_time = tc04_end - tc04_start

# Only the final row count is needed for the metrics; base_df is not counted
# because that result was never used.
final_count = df_current.count()

# storage proxy = final row count
storage_size_mb = int(final_count)

# Metrics (cu_used proxy: configured base row_count + rows written)
metrics_tc04 = [
    (
        "TC.04.x",
        datetime.now(),
        "Warehouse",
        "Tables",
        int(final_count),
        "Full Refresh",
        tc04_ingest_time,
        None,
        int(storage_size_mb),
        "N/A",
        None,
        int(row_count + final_count),  # rows processed proxy
        "Full refresh with applied updates"
    )
]
spark.createDataFrame(metrics_tc04, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
print(f"TC.04.x (Warehouse Full Refresh) complete | Ingest time: {tc04_ingest_time:.2f}s")

## TC.05.x: Full Compare — Delta Table

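Simulates a full compare against the existing event log. For benchmarking, the prepared updates batch is appended directly as the insert/update/delete history rather than being derived by diffing.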

In [None]:
# For benchmarking, use updates_df as batch of changes for event log
tc05_start = time.time()
updates_df.write.mode("append").saveAsTable(f"{target_lakehouse}.{delta_tables['compare']}")
tc05_end = time.time()
tc05_ingest_time = tc05_end - tc05_start

# Number of events appended. base_df is not counted and no file-size lookup
# is done: neither result was used in the metrics below.
updates_count = updates_df.count()

# cu_used proxy: configured base row_count + appended update events
cu_used = int(row_count + updates_count)

# Storage proxy: row count of the event-log table after the append.
# Falls back to an estimate if the table cannot be read.
try:
    final_compare_count = spark.table(f"{target_lakehouse}.{delta_tables['compare']}").count()
except Exception as exc:
    final_compare_count = row_count + updates_count
    print(f"TC.05.x: could not count event-log table ({exc}); using row_count + updates_count")

metrics_tc05 = [
    (
        "TC.05.x",
        datetime.now(),
        "Delta",
        "Tables",
        int(updates_count),
        "Full Compare",
        tc05_ingest_time,
        None,
        int(final_compare_count),
        "N/A",
        None,
        cu_used,
        "Full compare: append events to event log"
    )
]
spark.createDataFrame(metrics_tc05, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
print(f"TC.05.x (Delta Compare) complete | Ingest time: {tc05_ingest_time:.2f}s | Storage(rows proxy): {final_compare_count}")

## TC.06.x: Full Compare — Warehouse Table

Appends event history to warehouse event log table.

In [None]:
# Ensure update_type is nullable to match warehouse schema
updates_df_nullable = make_columns_nullable(updates_df, columns=["update_type"])

# Time only the warehouse append
tc06_start = time.time()
updates_df_nullable.write.mode("append").synapsesql(f"{target_warehouse}.dbo.{warehouse_tables['compare']}")
tc06_end = time.time()
tc06_ingest_time = tc06_end - tc06_start

# Same rows as updates_df_nullable (only the schema differs), so count the
# source DataFrame and avoid re-evaluating the rebuilt copy.
updates_count_nullable = updates_df.count()

# cu_used proxy: configured base row_count + appended update events
cu_used = int(row_count + updates_count_nullable)

# Storage proxy: row count of the warehouse event-log table after the append.
# NOTE(review): spark.table with a three-part warehouse name may not resolve
# from Spark - confirm; if it does not, the estimate below is always used.
try:
    final_compare_wh_count = spark.table(f"{target_warehouse}.dbo.{warehouse_tables['compare']}").count()
except Exception as exc:
    final_compare_wh_count = row_count + updates_count_nullable
    print(f"TC.06.x: could not count warehouse event-log table ({exc}); using row_count + updates count")

metrics_tc06 = [
    (
        "TC.06.x",
        datetime.now(),
        "Warehouse",
        "Tables",
        int(updates_count_nullable),
        "Full Compare",
        tc06_ingest_time,
        None,
        int(final_compare_wh_count),
        "N/A",
        None,
        cu_used,
        "Full compare: append events to event log"
    )
]
spark.createDataFrame(metrics_tc06, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
print(f"TC.06.x (Warehouse Compare) complete | Ingest time: {tc06_ingest_time:.2f}s | Storage(rows proxy): {final_compare_wh_count}")

## TC.07.x: Incremental Update — Delta Table

Appends all update events to the incremental Delta event log table.

In [None]:
# Time only the append of the update events to the incremental Delta table
tc07_start = time.time()
updates_df.write.mode("append").saveAsTable(f"{target_lakehouse}.{delta_tables['increment']}")
tc07_end = time.time()
tc07_ingest_time = tc07_end - tc07_start

# Number of events appended
updates_count = updates_df.count()

# Storage proxy: row count of the event-log table after the append, so the
# storage_size_mb column keeps one unit (rows) regardless of environment.
# Falls back to an estimate if the table cannot be read.
try:
    storage_size_mb = spark.table(f"{target_lakehouse}.{delta_tables['increment']}").count()
except Exception as exc:
    storage_size_mb = int(row_count + updates_count)
    print(f"TC.07.x: could not count event-log table ({exc}); using row_count + updates_count")

metrics_tc07 = [
    (
        "TC.07.x",
        datetime.now(),
        "Delta",
        "Tables",
        int(updates_count),
        "Incremental",
        tc07_ingest_time,
        None,
        int(storage_size_mb),
        "N/A",
        None,
        int(updates_count),  # use rows processed in lieu of cu_used
        "Incremental: append events to event log"
    )
]
spark.createDataFrame(metrics_tc07, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
print(f"TC.07.x (Delta Incremental) complete | Ingest time: {tc07_ingest_time:.2f}s | Storage(rows proxy): {storage_size_mb}")

## TC.08.x: Incremental Update — Warehouse Table

Appends all update events to the incremental warehouse event log table.

In [None]:
# Ensure update_type is nullable to match warehouse schema
updates_df_nullable = make_columns_nullable(updates_df, columns=["update_type"])

# Time only the warehouse append
tc08_start = time.time()
updates_df_nullable.write.mode("append").synapsesql(f"{target_warehouse}.dbo.{warehouse_tables['increment']}")
tc08_end = time.time()
tc08_ingest_time = tc08_end - tc08_start

# Same rows as updates_df_nullable (only the schema differs), so count the
# source DataFrame and avoid re-evaluating the rebuilt copy.
updates_count_nullable = updates_df.count()

# Storage proxy: row count of the warehouse event-log table after the append.
# NOTE(review): spark.table with a three-part warehouse name may not resolve
# from Spark - confirm; if it does not, the estimate below is always used.
try:
    final_inc_wh_count = spark.table(f"{target_warehouse}.dbo.{warehouse_tables['increment']}").count()
except Exception as exc:
    final_inc_wh_count = row_count + updates_count_nullable
    print(f"TC.08.x: could not count warehouse event-log table ({exc}); using row_count + updates count")

metrics_tc08 = [
    (
        "TC.08.x",
        datetime.now(),
        "Warehouse",
        "Tables",
        int(updates_count_nullable),
        "Incremental",
        tc08_ingest_time,
        None,
        int(final_inc_wh_count),
        "N/A",
        None,
        int(updates_count_nullable),  # use rows processed in lieu of cu_used
        "Incremental: append events to event log"
    )
]
spark.createDataFrame(metrics_tc08, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
print(f"TC.08.x (Warehouse Incremental) complete | Ingest time: {tc08_ingest_time:.2f}s")

In [None]:
# Final status banner: two lines, emitted with a single call
print(
    "Completion",
    "Updates applied and metrics logged for TC.03.x through TC.08.x.",
    sep="\n",
)