In [None]:
%%configure -f
{
  "conf": {
    "spark.notebook.parameters": "{}"
  },
  "defaultLakehouse": {
    "name": "MetricsLakehouse"
  }
}


# 📊 5. Visualize Metrics
### 🔗 `MetricsLakehouse` is the data source.

This notebook compares performance and storage across different ingestion, update, and query strategies.

Activities visualized:
- Initial ingestion
- Update
- Query
- Storage cost for each target


In [None]:
# Merge metrics by reading each workspace's BenchmarkLakehouse Files/metrics ABFSS directory
# Revised: use workspace display name verbatim for the ABFSS container (no .replace)
# Uses mssparkutils.fs.ls to probe ABFSS paths and reads delta tables safely.

import json
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType

# Obtain the Spark session via the builder (getOrCreate)
spark = SparkSession.builder.getOrCreate()

# OneLake endpoint plus the source lakehouse / destination table used by the merge
abfss_account = "onelake.dfs.fabric.microsoft.com"
source_lakehouse_name = "BenchmarkLakehouse"
controller_table = "MetricsLakehouse.metrics"

# Canonical metrics schema (used only to create an empty controller table if it is missing).
# Columns are listed as (name, type); every field is declared nullable.
_metric_fields = [
    ("test_case_id", StringType()),
    ("timestamp", TimestampType()),
    ("source", StringType()),
    ("format", StringType()),
    ("rows", IntegerType()),
    ("update_strategy", StringType()),
    ("action", StringType()),
    ("ingest_time_s", FloatType()),
    ("spinup_time_s", FloatType()),
    ("query_type", StringType()),
    ("query_time_s", FloatType()),
    ("notes", StringType()),
]
metrics_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _metric_fields]
)

# Parse the run list from the notebook parameters; an unset conf key falls back to "{}",
# which yields no runs and therefore no workspaces.
conf_key = "spark.notebook.parameters"
notebook_params = json.loads(spark.conf.get(conf_key, "{}"))
runs = notebook_params.get("runs", [])
# Keep only runs that carry a non-empty "name"
workspace_names = [name for name in (r.get("name") for r in runs) if name]
print("Workspaces to inspect (from runs):", workspace_names)

# Resolve mssparkutils for path probing: try the notebookutils package first, then the
# bare module; when neither import works the name is set to None and the merge loop
# falls back to a guarded delta load.
mssparkutils = None
try:
    from notebookutils import mssparkutils
except Exception:
    try:
        import mssparkutils
    except Exception:
        mssparkutils = None

# Make sure the controller table exists so the merge always has a destination:
# on first use it is created empty with the canonical metrics schema.
if not spark.catalog.tableExists(controller_table):
    empty = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema=metrics_schema)
    empty.write.mode("overwrite").saveAsTable(controller_table)

# Snapshot the destination and remember its size before merging
dest_df = spark.table(controller_table)
before_total = dest_df.count()
print(f"Controller metrics table found: {controller_table} rows={before_total}")

# Running count of rows appended across all workspaces
total_inserted = 0

# Merge every workspace's BenchmarkLakehouse metrics delta table into the controller table.
# For each workspace: probe the ABFSS path, read the delta table, keep only rows that are not
# already present in the controller table (compared on the shared columns), and append them.
for ws in workspace_names:
    if not ws:
        continue

    # Use workspace display name verbatim as the ABFSS container (test_case names are already safe)
    container = str(ws)
    # point to the delta table folder (avoid pointing into _delta_log)
    source_table_root = f"abfss://{container}@{abfss_account}/{source_lakehouse_name}.lakehouse/Tables/metrics"
    print(f"\nWorkspace '{ws}': probing {source_table_root}")

    path_exists = False
    if mssparkutils:
        # Preferred probe: a non-empty listing means the table folder is present
        try:
            path_exists = bool(mssparkutils.fs.ls(source_table_root))
        except Exception:
            # treat any exception as "path does not exist or not accessible"
            path_exists = False
    else:
        # Fallback probe: a guarded delta load; any failure counts as "not present"
        try:
            spark.read.format("delta").load(source_table_root)
            path_exists = True
        except Exception:
            path_exists = False

    print(f"  path_exists={path_exists}")

    if not path_exists:
        print(f"  Path not present or inaccessible for workspace '{ws}'; skipping.")
        continue

    # read the delta table (delta reader handles _delta_log correctly)
    src_df = spark.read.format("delta").load(source_table_root)
    src_count = src_df.count()
    print(f"  Read {src_count} rows from delta table at {source_table_root}")

    # Deduplicate exact rows against destination by using common columns
    dest_cols = dest_df.columns
    common_cols = [c for c in dest_cols if c in src_df.columns]
    if not common_cols:
        print(f"  No common columns between source ({source_table_root}) and destination ({controller_table}); skipping")
        continue

    src_sel = src_df.select(*common_cols)
    dest_sel = dest_df.select(*common_cols)

    # Compare with null-safe equality (<=>). A plain equi-join on column names never matches
    # rows that contain NULL in any compared column, so such rows would be treated as "new"
    # and appended again on every run of this cell.
    null_safe_match = [src_sel[c].eqNullSafe(dest_sel[c]) for c in common_cols]
    new_rows = src_sel.join(dest_sel, on=null_safe_match, how="left_anti")
    insert_count = new_rows.count()
    print(f"  New unique rows to insert from '{ws}': {insert_count}")

    if insert_count > 0:
        new_rows.write.mode("append").saveAsTable(controller_table)
        # Re-read the destination so the next workspace is compared against the appended rows
        dest_df = spark.table(controller_table)
        total_inserted += insert_count
        print(f"  Appended {insert_count} rows from '{ws}' into {controller_table}")

print(f"\nFinished: before_total={before_total}, total_inserted={total_inserted}")
after_total = spark.table(controller_table).count()
print(f"After total rows in {controller_table}: {after_total}")

# Pull the merged controller table into pandas for the visualization cells below
metrics_df = spark.table(controller_table).toPandas()
print("Loaded merged metrics into pandas DataFrame with rows:", len(metrics_df))


In [None]:
# Normalize numeric proxy fields (storage_size_mb and cu_used may be stored as floats in the metrics table)
import numpy as np
import pandas as pd
# Coerce both optional columns to numeric. When a column is absent, the one-element NaN
# Series default is aligned on assignment, so the column is created filled with NaN.
for _numeric_col in ('storage_size_mb', 'cu_used'):
    metrics_df[_numeric_col] = pd.to_numeric(
        metrics_df.get(_numeric_col, pd.Series(np.nan)),
        errors='coerce',
    )

# Create integer proxy columns for display and plotting (None for missing)
def to_int_proxy(x):
    """Convert a value to ``int`` for display/plotting, or return ``None``.

    Missing values (as judged by ``pd.isna``) give ``None``. Otherwise ``int(x)`` is
    tried first and ``int(float(x))`` second (so numeric strings such as ``"4.2"``
    convert); if both conversions raise, ``None`` is returned.
    """
    if pd.isna(x):
        return None
    # Try the direct conversion first, then the float round-trip
    for convert in (int, lambda raw: int(float(raw))):
        try:
            return convert(x)
        except Exception:
            continue
    return None

# Derive each integer proxy column from its numeric source column via to_int_proxy
for _proxy_col, _source_col in (
    ('storage_rows_proxy', 'storage_size_mb'),
    ('cu_used_int', 'cu_used'),
):
    metrics_df[_proxy_col] = metrics_df[_source_col].apply(to_int_proxy)

metrics_df


In [None]:
# Calculate storage_size_mb if missing (best effort)
import math

def calculate_storage_for_table(table_path):
    """Best-effort size, in MB, of the entries listed directly under ``table_path``.

    Lists the path with ``mssparkutils.fs.ls`` and sums the ``size`` attribute of the
    returned entries, divided by 1024 * 1024. This code does not descend into any
    sub-folders itself.

    The utility is resolved the same way as in the merge cell: first
    ``from notebookutils import mssparkutils``, then the bare ``mssparkutils`` module.

    Returns:
        float: the summed size in MB, or NaN when the utility cannot be imported or
        the listing fails for any reason (deliberate best-effort behaviour).
    """
    try:
        try:
            from notebookutils import mssparkutils as fs_utils
        except ImportError:
            import mssparkutils as fs_utils
        files = fs_utils.fs.ls(table_path)
        return sum(f.size for f in files) / (1024 * 1024)
    except Exception:
        # Best effort: an unavailable utility or inaccessible path yields NaN, not a crash
        return float('nan')

# Fill missing storage_size_mb values when the row's test case maps to a known table.
# crude extraction: use test_case_id to guess table name (customize if needed)
# NOTE(review): confirm these paths are resolvable by mssparkutils.fs.ls in this workspace.
table_map = {
    'TC.01.x': '/lakehouse/BenchmarkLakehouse/Tables/delta_refresh_load',
    'TC.02.x': '/lakehouse/BenchmarkLakehouse/Tables/wh_table_refresh_load',
    'TC.03.x': '/lakehouse/BenchmarkLakehouse/Tables/delta_refresh_load',
    'TC.04.x': '/lakehouse/BenchmarkLakehouse/Tables/wh_table_refresh_load',
    'TC.05.x': '/lakehouse/BenchmarkLakehouse/Tables/delta_compare_load',
    'TC.06.x': '/lakehouse/BenchmarkLakehouse/Tables/wh_table_compare_load',
    'TC.07.x': '/lakehouse/BenchmarkLakehouse/Tables/delta_increment_load',
    'TC.08.x': '/lakehouse/BenchmarkLakehouse/Tables/wh_table_increment_load',
}

# Each table folder is listed at most once, however many metric rows point at it
size_mb_by_path = {}
needs_size = metrics_df['storage_size_mb'].isna()
for idx, row in metrics_df[needs_size].iterrows():
    # Only rows that carry an update strategy are candidates for a size lookup
    if not row.get('update_strategy'):
        continue
    table_path = table_map.get(row.get('test_case_id'), None)
    if not table_path:
        continue
    if table_path not in size_mb_by_path:
        size_mb_by_path[table_path] = calculate_storage_for_table(table_path)
    metrics_df.at[idx, 'storage_size_mb'] = size_mb_by_path[table_path]

# Update the integer proxy column after any fills (recomputed for the whole column,
# so no per-row proxy write is needed inside the loop)
metrics_df['storage_size_mb'] = pd.to_numeric(metrics_df['storage_size_mb'], errors='coerce')
metrics_df['storage_rows_proxy'] = metrics_df['storage_size_mb'].apply(lambda x: int(x) if not pd.isna(x) else None)
metrics_df


In [None]:
# Initial ingestion performance (stacked: Read_Time = spinup_time_s, Load Time = ingest_time_s)
import matplotlib.pyplot as plt
import numpy as np

# Select the Full Refresh metrics rows for Delta and Warehouse formats
ingest_df = metrics_df[
    (metrics_df['update_strategy'] == 'Full Refresh')
].copy()

if ingest_df.empty:
    print("No Full Refresh metrics found in metrics_df.")
else:
    # If multiple runs exist, take the latest *row* per format.
    # drop_duplicates(keep='last') keeps whole rows; groupby().last() would instead return
    # the last non-null value of each column and could splice values from different runs.
    ordered_ingest = ingest_df.sort_values('timestamp') if 'timestamp' in ingest_df.columns else ingest_df
    ingest_summary = (
        ordered_ingest
        .dropna(subset=['format'])  # rows without a format cannot be attributed to a bar
        .drop_duplicates(subset='format', keep='last')
    )

    # Ensure a stable order of formats for plotting
    formats = ['Delta', 'Warehouse']
    ingest_summary = ingest_summary.set_index('format').reindex(formats).reset_index()

    # Extract read (spinup) and load (ingest) times; coerce missing to 0.0
    read_times = ingest_summary['spinup_time_s'].fillna(0.0).astype(float).values
    load_times = ingest_summary['ingest_time_s'].fillna(0.0).astype(float).values
    labels = ingest_summary['format'].fillna('').values

    # Plot stacked bars: load time is stacked on top of read time
    x = np.arange(len(labels))
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(x, read_times, color='skyblue', label='Read Time (spinup)')
    ax.bar(x, load_times, bottom=read_times, color='orange', label='Load Time (ingest)')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.set_title("Initial Ingestion Time by Format (Read = shared, Load = per-format)")
    ax.set_ylabel("Time (s)")
    ax.legend()

    # Annotate component and total values
    max_total = float(np.max(read_times + load_times)) if len(labels) > 0 else 0.0
    for i in range(len(labels)):
        r = read_times[i]
        l = load_times[i]
        total = r + l
        # Component labels sit in the middle of their segment; zero-height segments get none
        if r > 0:
            ax.text(x[i], r / 2, f"{r:.2f}s", ha='center', va='center', color='white', fontsize=9)
        if l > 0:
            ax.text(x[i], r + l / 2, f"{l:.2f}s", ha='center', va='center', color='black', fontsize=9)
        # The total label is offset above the bar by 3% of the tallest bar
        ax.text(x[i], total + max_total * 0.03, f"Total: {total:.2f}s", ha='center', va='bottom', fontsize=9)

    plt.show()


In [None]:
# Update performance comparison: one bar series per update strategy
import matplotlib.pyplot as plt

is_update_row = metrics_df['update_strategy'].isin(['Full Compare', 'Incremental'])
update_df = metrics_df[is_update_row]

plt.figure(figsize=(8,4))
# Groups are visited in order of first appearance; each strategy gets its own legend entry
for strategy, strat_df in update_df.groupby('update_strategy', sort=False):
    bar_labels = strat_df['format'] + " " + strat_df['update_strategy']
    plt.bar(bar_labels, strat_df['ingest_time_s'], label=strategy)

plt.title("Update Time by Strategy and Format")
plt.xlabel("Strategy")
plt.ylabel("Update Time (s)")
plt.legend()
plt.show()
print('Update performance chart complete')

In [None]:
# Query performance comparison: bars for every metrics row that recorded a query type
has_query_type = metrics_df['query_type'].notna()
query_df = metrics_df[has_query_type]

plt.figure(figsize=(10,5))
query_labels = query_df['query_type']
query_seconds = query_df['query_time_s']
plt.bar(query_labels, query_seconds, color='seagreen')
plt.xticks(rotation=45)
plt.title("Query Performance Comparison")
plt.xlabel("Query Type")
plt.ylabel("Query Time (s)")
plt.tight_layout()
plt.show()
print('Query performance chart complete')

In [None]:
# Storage cost comparison (use integer proxy column created above)
storage_df = metrics_df.dropna(subset=['storage_rows_proxy'])
# The proxy column can be object dtype (it is built with None for missing values);
# coerce it to numeric so the per-group mean is computed on numbers.
storage_df = storage_df.assign(
    storage_rows_proxy=pd.to_numeric(storage_df['storage_rows_proxy'], errors='coerce')
)
# Average the proxy per (format, update_strategy) target
storage_summary = storage_df.groupby(['format', 'update_strategy'])['storage_rows_proxy'].mean().reset_index()
plt.figure(figsize=(10,5))
plt.bar(storage_summary['format'] + " " + storage_summary['update_strategy'], storage_summary['storage_rows_proxy'], color='orchid')
plt.title("Storage Size by Target Table")
plt.xlabel("Target Table")
# The label previously contained a mis-encoded em dash ("â€”")
plt.ylabel("Storage Size (rows) — row-count proxy")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
print('Storage comparison chart complete')

In [None]:
# Show all metrics as interactive table
import seaborn as sns

sns.set(style="whitegrid")
# Expose the two proxy columns for clarity in the table view: they are re-assigned from
# metrics_df itself, so the displayed frame carries the same values as metrics_df.
proxy_columns = {name: metrics_df[name] for name in ('storage_rows_proxy', 'cu_used_int')}
metrics_view = metrics_df.assign(**proxy_columns)
display(metrics_view)
print('Displayed metrics table')

# Visualize Metrics Matrix (Compact 5×6)

This compact matrix provides the small-multiples view we discussed: six columns (strategies) × five rows (metrics). It uses the same metrics table and the integer row-count proxies for storage_s[...]

In [None]:
# Build and display the 5x6 metrics matrix
import math
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib import ticker
import pandas as pd

# Update test case for every (format, strategy) target; insertion order defines the
# left-to-right column order of the matrix.
_update_tc_by_target = {
    "Warehouse": {
        "Full Refresh": "TC.04.x",
        "Full Compare": "TC.06.x",
        "Incremental": "TC.08.x",
    },
    "Delta": {
        "Full Refresh": "TC.03.x",
        "Full Compare": "TC.05.x",
        "Incremental": "TC.07.x",
    },
}
# Flatten to (format, strategy, update test case id) tuples
cols = [
    (fmt_name, strategy_name, tc_id)
    for fmt_name, tc_by_strategy in _update_tc_by_target.items()
    for strategy_name, tc_id in tc_by_strategy.items()
]
# Test case that holds the initial-ingestion metrics for each format
ingest_tc_by_format = dict(Delta="TC.01.x", Warehouse="TC.02.x")

def latest_row_for_testcase(df, tc):
    """Return the most recent metrics row for test case ``tc``, or ``None``.

    Args:
        df: DataFrame with a ``test_case_id`` column and, optionally, a ``timestamp`` column.
        tc: test case id to look up.

    Returns:
        pandas.Series | None: the matching row with the greatest ``timestamp``. Rows whose
        timestamp is missing are ordered first, so they are only returned when no
        timestamped row exists. Without a ``timestamp`` column the last matching row in
        frame order is returned. ``None`` when no row matches.
    """
    sub = df[df['test_case_id'] == tc]
    if sub.empty:
        return None
    if 'timestamp' in sub.columns:
        # The default na_position='last' would make a row with a missing timestamp the
        # "latest"; a stable sort keeps frame order among equal timestamps.
        sub = sub.sort_values('timestamp', kind='stable', na_position='first')
    return sub.iloc[-1]

def sget_int(s, colname, default=0):
    """Read ``s[colname]`` as an ``int``, falling back to ``int(default)``.

    The fallback is used when ``s`` is ``None``, when ``colname`` is not in ``s.index``,
    when the value is missing per ``pd.isna``, or when neither ``int(val)`` nor
    ``int(float(val))`` succeeds.
    """
    has_value = s is not None and colname in s.index
    if has_value:
        val = s[colname]
        if not pd.isna(val):
            # Direct conversion first, then via float (covers strings such as "12.9")
            for caster in (int, lambda raw: int(float(raw))):
                try:
                    return caster(val)
                except Exception:
                    pass
    return int(default)

def sget_float(s, colname, default=0.0):
    """Read ``s[colname]`` as a ``float``, falling back to ``float(default)``.

    The fallback is used when ``s`` is ``None``, when ``colname`` is not in ``s.index``,
    when the value is missing per ``pd.isna``, or when ``float(val)`` raises.
    """
    if s is not None and colname in s.index:
        val = s[colname]
        if not pd.isna(val):
            try:
                return float(val)
            except Exception:
                pass
    return float(default)

def _int_or_none(row, colname):
    """Return ``int(row[colname])``, or ``None`` when the row, column or value is missing
    or the conversion raises."""
    if row is None or colname not in row.index or pd.isna(row[colname]):
        return None
    try:
        return int(row[colname])
    except Exception:
        return None


# One list per matrix row; each gets one entry per (format, strategy) column
ingest_rows, ingest_time_s = [], []
update_rows, update_time_s = [], []
storage_rows = []
col_labels = []

for fmt, strat, update_tc in cols:
    col_labels.append(f"{fmt}\n{strat}")

    # Ingestion metrics come from the format's ingest test case
    ingest_tc = ingest_tc_by_format.get(fmt)
    ingest_row = latest_row_for_testcase(metrics_df, ingest_tc)
    ingest_rows.append(sget_int(ingest_row, 'rows', default=0))
    ingest_time_s.append(sget_float(ingest_row, 'ingest_time_s', default=0.0))

    # Update metrics come from the column's own test case. The "rows" figure prefers
    # cu_used when it is present and convertible, otherwise it uses the rows column.
    update_row = latest_row_for_testcase(metrics_df, update_tc)
    if update_row is None:
        ur = 0
    else:
        ur = _int_or_none(update_row, 'cu_used')
        if ur is None:
            ur = sget_int(update_row, 'rows', default=0)
    update_rows.append(int(ur))
    update_time_s.append(sget_float(update_row, 'ingest_time_s', default=0.0))

    # Storage: update row's storage_size_mb first, then the ingest row's, else 0
    sr = _int_or_none(update_row, 'storage_size_mb')
    if sr is None:
        sr = _int_or_none(ingest_row, 'storage_size_mb')
    if sr is None:
        sr = 0
    storage_rows.append(int(sr))

# Assemble the matrix: rows are metrics (in this order), columns are the strategies
matrix_rows = {
    "Ingestion rows": ingest_rows,
    "Ingestion time_s": ingest_time_s,
    "Update rows": update_rows,
    "Update time_s": update_time_s,
    "Storage rows": storage_rows,
}
matrix_df = pd.DataFrame(
    data=list(matrix_rows.values()),
    index=list(matrix_rows.keys()),
    columns=col_labels
)

print("Metrics matrix (values):")
display(matrix_df)

# Draw the matrix as small multiples: one single-bar axes per (metric, strategy) cell.
# All cells in a metric row share the same y-range so bar heights are comparable.
n_rows, n_cols = matrix_df.shape
fig, axes = plt.subplots(n_rows, n_cols, figsize=(3 * n_cols, 2.2 * n_rows), squeeze=False)
plt.subplots_adjust(hspace=0.6, wspace=0.6, top=0.93)
fig.suptitle("Metrics matrix (rows = metrics, cols = strategies)", fontsize=16)

colors = ["#2a9d8f"] * n_cols

for r_i, metric in enumerate(matrix_df.index):
    vals = matrix_df.loc[metric].astype(float).values
    vmin = 0.0
    vmax = float(np.nanmax(vals)) if len(vals) > 0 else 1.0
    # An all-zero row would give a degenerate y-range; fall back to a unit range
    if math.isclose(vmax, 0.0, abs_tol=1e-12):
        vmax = 1.0
    pad = vmax * 0.06
    y_min, y_max = vmin, vmax + pad

    for c_i, col_label in enumerate(matrix_df.columns):
        ax = axes[r_i][c_i]
        val = matrix_df.at[metric, col_label]
        ax.bar([0], [val], width=0.6, color=colors[c_i])
        ax.set_xlim(-0.8, 0.8)
        ax.set_ylim(y_min, y_max)
        ax.set_xticks([])
        ax.yaxis.set_major_locator(ticker.MaxNLocator(3))

        # Strategy names only on the top row, metric names only on the first column
        if r_i == 0:
            ax.set_title(col_label, fontsize=9)

        if c_i == 0:
            ax.set_ylabel(metric, fontsize=10)

        # Value label under the bar: whole numbers with thousands separators, fractional
        # values (the time metrics) with two decimals instead of being truncated to int.
        try:
            numeric_val = float(val)
            if numeric_val.is_integer():
                label_text = f"{int(numeric_val):,}"
            else:
                label_text = f"{numeric_val:,.2f}"
        except Exception:
            label_text = f"{val}"
        ax.text(0, y_min - (y_max - y_min) * 0.14, label_text, ha='center', va='top', fontsize=10)
        for spine in ['top', 'right', 'left']:
            ax.spines[spine].set_visible(False)

plt.show()

# Transposed view: one row per strategy, one column per metric
summary = matrix_df.T.reset_index().rename(columns={'index': 'strategy'})
print('\nSummary (columns = metrics):')
display(summary)


In [None]:
# Final status lines, printed one per line
for status_line in (
    "Completion",
    "Visualizations produced from MetricsLakehouse.metrics. Controller metrics merge complete.",
):
    print(status_line)