In [None]:
%%configure -f
{
  "conf": {
    "spark.notebook.parameters": "{}"
  },
  "defaultLakehouse": {
    "name": "MetricsLakehouse"
  }
}


# 📊 5. Visualize Metrics (data export only)

This notebook gathers metrics from parameter_set workspaces into the central
`MetricsLakehouse.metrics` table and exports a cleaned metrics file for
downstream visualization/analysis. Charts and plotting have been removed so
the notebook focuses on producing reliable, reproducible data artifacts.


In [None]:
# Merge metrics by reading each workspace's BenchmarkLakehouse Tables/metrics Delta table over ABFSS.
# Uses mssparkutils.fs.ls (when available) to probe each ABFSS path before reading the Delta table.

import json
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType

# Reuse the notebook's active Spark session (or create one if none exists).
spark = SparkSession.builder.getOrCreate()

# ABFSS endpoint, the lakehouse read in every workspace, and the central
# table that all workspace metrics are merged into.
abfss_account = "onelake.dfs.fabric.microsoft.com"
source_lakehouse_name = "BenchmarkLakehouse"
controller_table = "MetricsLakehouse.metrics"

# canonical metrics schema (used only to create empty table if missing)
metrics_schema = StructType([
    StructField("test_case_id", StringType(), True),
    StructField("timestamp", TimestampType(), True),
    StructField("source", StringType(), True),
    StructField("format", StringType(), True),
    StructField("rows", IntegerType(), True),
    StructField("update_strategy", StringType(), True),
    StructField("action", StringType(), True),
    StructField("ingest_time_s", FloatType(), True),
    StructField("spinup_time_s", FloatType(), True),
    StructField("query_type", StringType(), True),
    StructField("query_time_s", FloatType(), True),
    StructField("notes", StringType(), True)
])

# Parse the run list from the notebook parameters.
# Malformed JSON still raises (a broken configuration should be visible), but
# well-formed payloads that simply carry no usable run list -- an empty
# string, a non-object value, `"runs": null`, or non-dict entries -- are
# treated as "no workspaces" instead of crashing the cell.
conf_key = "spark.notebook.parameters"
raw_params = spark.conf.get(conf_key, "{}") or "{}"
params = json.loads(raw_params)
runs = params.get("runs") if isinstance(params, dict) else None
if not isinstance(runs, list):
    runs = []
workspace_names = [r.get("name") for r in runs if isinstance(r, dict) and r.get("name")]
print("Workspaces to inspect (from runs):", workspace_names)

# ensure controller table exists (created empty with the canonical schema)
if not spark.catalog.tableExists(controller_table):
    empty = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema=metrics_schema)
    empty.write.mode("overwrite").saveAsTable(controller_table)

# Row count before merging, reported again in the final summary.
dest_df = spark.table(controller_table)
before_total = dest_df.count()
print(f"Controller metrics table found: {controller_table} rows={before_total}")

# Resolve the path-probing helper: start from "unavailable" and upgrade to the
# first import that succeeds (notebookutils package first, then the bare
# mssparkutils module). Code below checks `if mssparkutils:` before using it.
mssparkutils = None
try:
    from notebookutils import mssparkutils
except Exception:
    try:
        import mssparkutils
    except Exception:
        pass

# Running total of rows appended to the controller table across workspaces.
total_inserted = 0

for ws in workspace_names:
    if not ws:
        continue

    # The workspace name is used as the ABFSS container segment of the path.
    container = str(ws)
    source_table_root = f"abfss://{container}@{abfss_account}/{source_lakehouse_name}.lakehouse/Tables/metrics"
    print(f"\nWorkspace '{ws}': probing {source_table_root}")

    # Probe the path first so a missing or inaccessible workspace is skipped
    # rather than aborting the whole merge. The failure type is printed so a
    # permissions problem is distinguishable from a typo in the run list.
    path_exists = False
    if mssparkutils:
        try:
            path_exists = bool(mssparkutils.fs.ls(source_table_root))
        except Exception as exc:
            print(f"  probe failed ({type(exc).__name__})")
    else:
        # Fallback: try a guarded read
        try:
            spark.read.format("delta").load(source_table_root)
            path_exists = True
        except Exception as exc:
            print(f"  probe failed ({type(exc).__name__})")

    print(f"  path_exists={path_exists}")

    if not path_exists:
        print(f"  Path not present or inaccessible for workspace '{ws}'; skipping.")
        continue

    # read the delta table (delta reader handles _delta_log correctly)
    src_df = spark.read.format("delta").load(source_table_root)
    src_count = src_df.count()
    print(f"  Read {src_count} rows from delta table at {source_table_root}")

    # Deduplicate exact rows against destination by using common columns
    dest_cols = dest_df.columns
    common_cols = [c for c in dest_cols if c in src_df.columns]
    if not common_cols:
        print(f"  No common columns between source ({source_table_root}) and destination ({controller_table}); skipping")
        continue

    src_sel = src_df.select(*common_cols)
    dest_sel = dest_df.select(*common_cols)

    # left_anti keeps rows of src_sel that have no match in dest_sel.
    # The match must be null-safe: with plain equality (joining on column
    # names) NULL never equals NULL, so any row with a NULL in a common column
    # would look "new" on every run and be appended again each time.
    match_cond = [src_sel[c].eqNullSafe(dest_sel[c]) for c in common_cols]
    new_rows = src_sel.join(dest_sel, on=match_cond, how="left_anti")
    insert_count = new_rows.count()
    print(f"  New unique rows to insert from '{ws}': {insert_count}")

    if insert_count > 0:
        new_rows.write.mode("append").saveAsTable(controller_table)
        # Re-read so the next workspace is compared against the rows just added.
        dest_df = spark.table(controller_table)
        total_inserted += insert_count
        print(f"  Appended {insert_count} rows from '{ws}' into {controller_table}")

print(f"\nFinished: before_total={before_total}, total_inserted={total_inserted}")
after_total = spark.table(controller_table).count()
print(f"After total rows in {controller_table}: {after_total}")

# Pull the merged table to the driver as pandas for the cells that follow.
metrics_df = spark.table(controller_table).toPandas()
print("Loaded merged metrics into pandas DataFrame with rows:", len(metrics_df))


In [None]:
# Show all metrics as interactive table
import seaborn as sns
sns.set(style="whitegrid")

# The two proxy columns are not part of the canonical metrics schema, so they
# may be absent from the merged table. Indexing them directly raised KeyError,
# and re-assigning a column to itself changed nothing when they were present.
# Report what is missing and display the frame as-is.
proxy_cols = ["storage_rows_proxy", "cu_used_int"]
missing_proxy_cols = [c for c in proxy_cols if c not in metrics_df.columns]
if missing_proxy_cols:
    print(f"Proxy columns not present in metrics table: {missing_proxy_cols}")

display(metrics_df)
print('Displayed metrics table')


In [None]:
# Final status banner. The visible notebook writes no files; the merged result
# is in the MetricsLakehouse.metrics table, so the message points there.
print("Completion")
print("Metrics merge complete. Charts removed; merged metrics are in the MetricsLakehouse.metrics table.")
