In [None]:
%%configure -f
{
  "conf": {
    "spark.notebook.parameters": "{}"
  },
  "defaultLakehouse": {
    "name": "BenchmarkLakehouse"
  }
}


# 📓 3. Queries
## Query Performance Benchmarking

### 🔗 Ensure `BenchmarkLakehouse`/`BenchmarkWarehouse` are reachable before running.

This notebook benchmarks query performance against the single target defined by the notebook parameters (one parameter_set per workspace/run).
It runs only the queries relevant to the configured format (Delta or Warehouse) and prints a concise progress message at the end of each cell.

In [None]:
import json
import time
import re
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType
print('Imports ready')

In [None]:
# Load parameters from spark.notebook.parameters (fail hard if missing/incorrect)
conf_key = 'spark.notebook.parameters'
# Pass an explicit default: without one, spark.conf.get() raises for an unset key
# and the guard below would never get the chance to report the problem clearly.
conf_str = spark.conf.get(conf_key, None)
if not conf_str:
    # fail hard per instruction
    raise SystemExit('Missing spark.notebook.parameters (expected the parameter set in %%configure)')

# Malformed JSON raises json.JSONDecodeError here, which is also a hard failure.
params = json.loads(conf_str)
if not isinstance(params, dict):
    raise SystemExit(f'{conf_key} must be a JSON object, got {type(params).__name__}')

# Required params (fail hard if not present); report every missing key at once
# instead of stopping at the first KeyError.
required_keys = ('name', 'dataset_name', 'format', 'target_lakehouse', 'target_warehouse')
missing_keys = [key for key in required_keys if key not in params]
if missing_keys:
    raise SystemExit(f"{conf_key} is missing required keys: {', '.join(missing_keys)}")

test_case_name = params['name']
dataset_name = params['dataset_name']
fmt = params['format']  # expected 'delta' or 'warehouse'
target_lakehouse = params['target_lakehouse']
target_warehouse = params['target_warehouse']

# sanitized name (table name) — same logic as other notebooks:
# lowercase, whitespace/hyphen runs become "_", everything outside [a-z0-9_] is dropped
sanitized_name = re.sub(r"[^a-z0-9_]", "", re.sub(r"[\s-]+", "_", test_case_name.strip().lower()))
if not sanitized_name:
    # An empty result would produce an invalid table identifier further down.
    raise SystemExit(f"Parameter 'name' ({test_case_name!r}) contains no characters usable in a table name")
target_table = sanitized_name

print('Loaded params:')
print('  test_case_name:', test_case_name)
print('  dataset_name:', dataset_name)
print('  format:', fmt)
print('  target_table:', target_table)


In [None]:
# Schema of one row in the lakehouse metrics table. Every column is nullable;
# the query notebook only fills the query-related columns with real values.
metrics_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ("test_case_id", StringType),
        ("timestamp", TimestampType),
        ("format", StringType),
        ("location", StringType),
        ("rows", IntegerType),
        ("update_strategy", StringType),
        ("ingest_time_s", FloatType),
        ("spinup_time_s", FloatType),
        ("storage_size_mb", FloatType),
        ("query_type", StringType),
        ("query_time_s", FloatType),
        ("cu_used", FloatType),
        ("notes", StringType),
    )
])
print('Metrics schema ready')

In [None]:
# Read the one benchmark target selected by `fmt` into `df` and remember where it lives.
requested_format = fmt.lower()
if requested_format not in ('delta', 'warehouse'):
    raise SystemExit(f'Unsupported format in parameters: {fmt}')

if requested_format == 'delta':
    # Lakehouse tables are addressed as <lakehouse>.<table>
    lakehouse_table = f"{target_lakehouse}.{target_table}"
    print(f'Loading Delta table: {lakehouse_table}')
    df = spark.read.table(lakehouse_table)
    target_location = 'Lakehouse'
else:
    # Warehouse tables are addressed as <warehouse>.dbo.<table>
    warehouse_table = f"{target_warehouse}.dbo.{target_table}"
    print(f'Loading Warehouse table via synapsesql: {warehouse_table}')
    # synapsesql read requires the Fabric connector; the import is kept for that
    # reason even though `Constants` itself is not referenced here.
    from com.microsoft.spark.fabric import Constants
    df = spark.read.synapsesql(warehouse_table)
    target_location = 'Warehouse'

print('Loaded target table into DataFrame')

In [None]:
# Define simple query functions (assume schema as in ingest/generate flows)
def q_filter(df):
    """Count the rows of ``df`` whose ``cat_1`` column equals ``'A'``."""
    is_category_a = col('cat_1') == 'A'
    category_a_rows = df.filter(is_category_a)
    return category_a_rows.count()

def q_aggregate(df):
    """Group ``df`` by ``cat_1``, aggregate avg(num_1) and max(num_2), and
    return the row count of the aggregated result."""
    aggregations = {'num_1': 'avg', 'num_2': 'max'}
    per_category = df.groupBy('cat_1').agg(aggregations)
    return per_category.count()

def q_batch(df):
    """Count rows flagged as updates whose ``ts_1`` is later than the fixed cutoff."""
    # select updates after a reasonable cutoff (keeps static string as before)
    is_update = col('update_type') == 'update'
    after_cutoff = col('ts_1') > '2025-01-01'
    recent_updates = df.filter(is_update & after_cutoff)
    return recent_updates.count()

def q_topn(df):
    """Order ``df`` by ``num_1`` descending, keep at most 10 rows, and return how many were kept."""
    ranked = df.orderBy(col('num_1').desc())
    top_ten = ranked.limit(10)
    return top_ten.count()

print('Query functions ready')

In [None]:
# Run the four queries exactly once against the single target and record timings.
# Re-running this cell starts from an empty `metrics` list, so results never accumulate.
query_list = [
    ('Filter', q_filter),
    ('Aggregate', q_aggregate),
    ('Batch', q_batch),
    ('TopN', q_topn),
]
metrics = []
for qname, qfunc in query_list:
    # perf_counter() is monotonic and high-resolution; time.time() follows the wall
    # clock and can jump (NTP sync, clock adjustments), which would distort a benchmark.
    t0 = time.perf_counter()
    rows = qfunc(df)  # each query ends in .count(), so the work is executed inside the timed span
    t1 = time.perf_counter()
    elapsed = t1 - t0
    print(f"{qname}: {elapsed:.3f}s (rows={rows})")
    metrics.append({'query': qname, 'rows': int(rows), 'time_s': float(elapsed)})

print('All queries executed')

In [None]:
# Display metrics locally (pandas) — concise view of the timings collected above
import pandas as pd

metrics_df = pd.DataFrame(data=metrics)
display(metrics_df)
print('Displayed metrics dataframe')

In [None]:
# Log metrics to the lakehouse metrics table (use test_case_name as test_case_id)
def log_query_to_metrics(test_case_id, format_name, location, rows, query_type, query_time_s, notes=''):
    """Append one query-timing row to the ``<target_lakehouse>.metrics`` table.

    Args:
        test_case_id: Identifier stored in the ``test_case_id`` column.
        format_name: Value for the ``format`` column.
        location: Value for the ``location`` column.
        rows: Row count returned by the query; converted with ``int()``.
        query_type: Name of the query that was timed.
        query_time_s: Elapsed time in seconds; converted with ``float()``.
        notes: Optional free-text note (defaults to an empty string).

    The row is stamped with ``datetime.now()``. Columns that do not apply to a
    query run are written as an empty string (``update_strategy``) or NaN
    (``ingest_time_s``, ``spinup_time_s``, ``storage_size_mb``, ``cu_used``).
    """
    not_applicable = float('nan')
    record = (
        test_case_id,
        datetime.now(),
        format_name,
        location,
        int(rows),
        '',                  # update_strategy (N/A)
        not_applicable,      # ingest_time_s (N/A)
        not_applicable,      # spinup_time_s (N/A)
        not_applicable,      # storage_size_mb (N/A)
        query_type,
        float(query_time_s),
        not_applicable,      # cu_used (N/A)
        notes,
    )
    row_df = spark.createDataFrame([record], schema=metrics_schema)
    metrics_table = f"{target_lakehouse}.metrics"
    row_df.write.mode('append').saveAsTable(metrics_table)
print('Log-to-metrics helper ready')

In [None]:
# Persist metrics for this run: one appended row per executed query.
# The stored format label is capitalized; anything other than 'delta' is labelled 'Warehouse'.
format_label = 'Warehouse'
if fmt.lower() == 'delta':
    format_label = 'Delta'

for entry in metrics:
    query_name = entry['query']
    log_query_to_metrics(
        test_case_id=test_case_name,
        format_name=format_label,
        location=target_location,
        rows=entry['rows'],
        query_type=query_name,
        query_time_s=entry['time_s'],
        notes=f"Query performance ({query_name})",
    )
print('Logged all metrics to lakehouse.metrics')

In [None]:
# Final status lines: echo the test case, the configured format, and the
# sanitized table name that the queries above were run against.
print('Queries notebook complete for test_case:', test_case_name, 'format:', fmt)
print('Sanitized table name used:', target_table)
