# 📊 Visualize Metrics
### 🔗 Ensure `BenchmarkLakehouse` is connected as a data source before running.

This notebook compares performance and storage across different ingestion, update, and query strategies.

Activities visualized:
- Initial ingestion
- Update
- Query
- Storage cost for each target


In [None]:
# Load metrics table from Lakehouse (or local CSV for dev)
# pandas is imported unconditionally: a later cell calls pd.isna(), and
# importing it only on the CSV fallback path left `pd` undefined whenever the
# Lakehouse read succeeded.
import pandas as pd

try:
    # `spark` is not defined in this notebook; presumably the runtime injects
    # it. Outside that runtime the resulting NameError triggers the fallback.
    metrics_df = spark.read.table("BenchmarkLakehouse.metrics").toPandas()
except Exception as exc:
    # Deliberate best-effort fallback for local development; report the reason
    # so a Lakehouse failure is not silently replaced by local data.
    print(f"Lakehouse read failed ({exc!r}); falling back to metrics.csv")
    metrics_df = pd.read_csv("metrics.csv")
metrics_df.head()

In [None]:
# Calculate storage_size_mb if missing
import math

def calculate_storage_for_table(table_path):
    """Return the summed size, in MiB, of the entries listed at *table_path*.

    Args:
        table_path: Path handed unchanged to ``mssparkutils.fs.ls``.

    Returns:
        The sum of the ``size`` attribute of every listed entry divided by
        1024 * 1024, or ``float('nan')`` when the utilities cannot be imported
        or the listing fails for any reason (best-effort by design).
    """
    try:
        try:
            import mssparkutils
        except ImportError:
            # Fall back to the documented import location of the notebook
            # utilities so the lookup does not silently degrade to NaN.
            from notebookutils import mssparkutils
        files = mssparkutils.fs.ls(table_path)
        # NOTE(review): only the entries returned by this single ls call are
        # summed; files in nested directories are presumably not included.
        return sum(f.size for f in files) / (1024 * 1024)
    except Exception:
        # Best-effort: callers treat NaN as "size unknown".
        return float('nan')

# Identify unique targets and fill missing storage_size_mb
# pandas is imported here as well: the load cell only imports it on its CSV
# fallback path, so `pd` may be undefined when the Lakehouse read succeeds.
import pandas as pd

# Mapping for demo, real code should use actual table names.
# Built once instead of on every row.
# NOTE(review): lookups are exact, so a test_case_id must literally equal one
# of these keys (including the trailing ".x") to be filled - confirm that this
# matches the IDs stored in the metrics table.
table_map = {
    'TC.01.x': '/lakehouse/BenchmarkLakehouse/Tables/delta_refresh_load',
    'TC.02.x': '/lakehouse/BenchmarkLakehouse/Tables/wh_table_refresh_load',
    'TC.03.x': '/lakehouse/BenchmarkLakehouse/Tables/delta_refresh_load',
    'TC.04.x': '/lakehouse/BenchmarkLakehouse/Tables/wh_table_refresh_load',
    'TC.05.x': '/lakehouse/BenchmarkLakehouse/Tables/delta_compare_load',
    'TC.06.x': '/lakehouse/BenchmarkLakehouse/Tables/wh_table_compare_load',
    'TC.07.x': '/lakehouse/BenchmarkLakehouse/Tables/delta_increment_load',
    'TC.08.x': '/lakehouse/BenchmarkLakehouse/Tables/wh_table_increment_load',
}

# Create the column when it is absent so every row counts as missing. Without
# it nothing was filled and the storage chart cell failed on
# dropna(subset=['storage_size_mb']).
if 'storage_size_mb' not in metrics_df.columns:
    metrics_df['storage_size_mb'] = float('nan')

# Several rows can share a target table; measure each table at most once.
measured_size_mb = {}
for idx, row in metrics_df.iterrows():
    # pd.isna alone covers None/NaN; the former extra math.isnan call could
    # only return False or raise TypeError on a non-numeric value.
    if not pd.isna(row['storage_size_mb']) or not row['update_strategy']:
        continue
    # crude extraction: use test_case_id to guess table name (customize if needed)
    table_path = table_map.get(row['test_case_id'])
    if not table_path:
        continue
    if table_path not in measured_size_mb:
        measured_size_mb[table_path] = calculate_storage_for_table(table_path)
    metrics_df.at[idx, 'storage_size_mb'] = measured_size_mb[table_path]

In [None]:
# Initial ingestion performance
import matplotlib.pyplot as plt

# Rows whose update strategy is 'Full Refresh' are the ones charted here.
is_full_refresh = metrics_df['update_strategy'] == 'Full Refresh'
ingest_df = metrics_df[is_full_refresh]

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(
    ingest_df['format'],
    ingest_df['ingest_time_s'],
    color=['skyblue', 'orange'],
)
ax.set_title("Initial Ingestion Time by Format")
ax.set_xlabel("Format")
ax.set_ylabel("Ingestion Time (s)")
plt.show()

In [None]:
# Update performance comparison
is_update_run = metrics_df['update_strategy'].isin(['Full Compare', 'Incremental'])
update_df = metrics_df[is_update_run]

fig, ax = plt.subplots(figsize=(8, 4))
# One bar series per strategy so each strategy gets its own legend entry.
for strategy in update_df['update_strategy'].unique():
    strat_df = update_df[update_df['update_strategy'] == strategy]
    bar_labels = strat_df['format'] + " " + strat_df['update_strategy']
    ax.bar(bar_labels, strat_df['ingest_time_s'], label=strategy)
ax.set_title("Update Time by Strategy and Format")
ax.set_xlabel("Strategy")
ax.set_ylabel("Update Time (s)")
ax.legend()
plt.show()

In [None]:
# Query performance comparison
# Keep only the rows that carry a query_type value.
has_query_type = metrics_df['query_type'].notna()
query_df = metrics_df[has_query_type]

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(query_df['query_type'], query_df['query_time_s'], color='seagreen')
plt.xticks(rotation=45)
ax.set_title("Query Performance Comparison")
ax.set_xlabel("Query Type")
ax.set_ylabel("Query Time (s)")
fig.tight_layout()
plt.show()

In [None]:
# Storage cost comparison
# Rows without a storage size are excluded before averaging.
storage_df = metrics_df.dropna(subset=['storage_size_mb'])

# Mean size per (format, update_strategy) pair, flattened back to columns.
mean_size_by_target = storage_df.groupby(['format', 'update_strategy'])['storage_size_mb'].mean()
storage_summary = mean_size_by_target.reset_index()
target_labels = storage_summary['format'] + " " + storage_summary['update_strategy']

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(target_labels, storage_summary['storage_size_mb'], color='orchid')
ax.set_title("Storage Size by Target Table")
ax.set_xlabel("Target Table")
ax.set_ylabel("Storage Size (MB)")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Show all metrics as interactive table
import seaborn as sns
# Sets seaborn's "whitegrid" style.
# NOTE(review): this cell comes after all chart cells, so when the notebook is
# run top to bottom the style cannot affect the figures above - confirm
# whether it was meant to run before the first plot.
sns.set(style="whitegrid")
# `display` is not defined in this notebook; presumably the runtime provides
# it as its rich-output helper. It is called with the full metrics DataFrame.
display(metrics_df)

In [None]:
# Closing status line printed once all cells above have run.
completion_message = "Visualization complete. Review charts above for performance and storage comparisons across ingestion, update, and query activities."
print(completion_message)