# 📓 4. Run Queries
## Query Performance Benchmarking for TC.11.x–TC.13.x

### 🔗 Ensure `BenchmarkLakehouse` is connected as a data source before running.

Assumes synthetic data, initial load, and updates have been completed.
All queries are run against the target table in BenchmarkLakehouse and BenchmarkWarehouse.
Join queries are excluded (single table per location).


In [None]:
import time
from pyspark.sql.functions import col

In [None]:
# Benchmark targets and table names
target_lakehouse = "BenchmarkLakehouse"
target_warehouse = "BenchmarkWarehouse"
row_count = 10000  # Update as appropriate

# One table per load strategy in each location. Names follow a fixed pattern,
# e.g. "delta_increment_load" (Lakehouse) and "wh_table_increment_load" (Warehouse).
delta_tables = {
    strategy: f"delta_{strategy}_load"
    for strategy in ("refresh", "compare", "increment")
}
warehouse_tables = {
    strategy: f"wh_table_{strategy}_load"
    for strategy in ("refresh", "compare", "increment")
}

# Qualified names of the tables to query (usually 'increment' for event log).
# The Warehouse name carries the "dbo" schema segment; the Lakehouse name does not.
lakehouse_table = ".".join((target_lakehouse, delta_tables["increment"]))
warehouse_table = ".".join((target_warehouse, "dbo", warehouse_tables["increment"]))

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType
from datetime import datetime

# Schema of a metrics row: 13 nullable columns, listed in write order.
# storage_size_mb and cu_used stay FloatType to match the ingest/apply-updates notebooks.
metrics_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ("test_case_id", StringType),
        ("timestamp", TimestampType),
        ("format", StringType),
        ("location", StringType),
        ("rows", IntegerType),
        ("update_strategy", StringType),
        ("ingest_time_s", FloatType),
        ("spinup_time_s", FloatType),
        ("storage_size_mb", FloatType),
        ("query_type", StringType),
        ("query_time_s", FloatType),
        ("cu_used", FloatType),
        ("notes", StringType),
    )
])

In [None]:
# Load the selected Lakehouse Delta table (named by lakehouse_table) for querying
df_lakehouse = spark.table(lakehouse_table)

## Query Types
- **Filter Query**: Select rows with a specific category value.
- **Aggregate Query**: Group by category, aggregate numeric columns.
- **Batch Query**: Select update events after a cutoff timestamp (`update_type == 'update'` and `ts_1 > '2025-01-01'`).
- **Top-N Query**: Retrieve top N rows by a numeric column.

*(Join queries are excluded—only one table in each target location.)*

In [None]:
# Define query functions
def run_filter_query(df):
    """Return the number of rows in ``df`` whose ``cat_1`` column equals ``"A"``."""
    is_category_a = col("cat_1") == "A"
    matching_rows = df.filter(is_category_a)
    return matching_rows.count()

def run_aggregate_query(df):
    """Group ``df`` by ``cat_1``, aggregate avg(num_1) and max(num_2), and return the result's row count."""
    aggregations = {"num_1": "avg", "num_2": "max"}
    per_category = df.groupBy("cat_1").agg(aggregations)
    # count() is what triggers execution; the returned value is the number of groups.
    return per_category.count()

def run_batch_query(df):
    """Return the number of rows with ``update_type == "update"`` and ``ts_1 > "2025-01-01"``."""
    is_update_event = col("update_type") == "update"
    after_cutoff = col("ts_1") > "2025-01-01"
    return df.filter(is_update_event & after_cutoff).count()

def run_topn_query(df, n=10):
    """Order ``df`` by ``num_1`` descending, keep the first ``n`` rows, and return how many rows that is."""
    by_num_1_descending = col("num_1").desc()
    top_rows = df.orderBy(by_num_1_descending).limit(n)
    return top_rows.count()

In [None]:
# Performance logging utility
def log_query_perf(query_func, df, description):
    """Run ``query_func(df)`` once, time it, print a summary line, and return the measurement.

    Args:
        query_func: Callable taking ``df`` and returning the query result
            (the query functions in this notebook return a row count).
        df: Object passed unchanged to ``query_func``.
        description: Label used in the printed line and stored under ``"query"``.

    Returns:
        dict with keys ``"query"`` (the description), ``"rows"`` (the value
        returned by ``query_func``) and ``"time_s"`` (elapsed seconds as float).
    """
    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be skewed (or go negative) if the system wall clock is adjusted mid-run.
    start = time.perf_counter()
    result = query_func(df)
    elapsed = time.perf_counter() - start
    print(f"{description}: {elapsed:.3f}s (Rows: {result})")
    return {"query": description, "rows": result, "time_s": elapsed}

In [None]:
# Time each benchmark query against the Lakehouse Delta table, in a fixed order
lakehouse_metrics = []
for query_func, label in (
    (run_filter_query, "Lakehouse Filter cat_1 == 'A'"),
    (run_aggregate_query, "Lakehouse Aggregate by cat_1"),
    (run_batch_query, "Lakehouse Batch update_type == update, ts_1 > '2025-01-01'"),
    (run_topn_query, "Lakehouse Top 10 num_1"),
):
    # Append as we go so earlier results survive if a later query fails.
    lakehouse_metrics.append(log_query_perf(query_func, df_lakehouse, label))

In [None]:
# Read the Warehouse table via the `synapsesql` reader on `spark.read`.
# The result (df_warehouse) is passed to the same query functions as df_lakehouse
# in the next cell, so it is used as a DataFrame.
# NOTE(review): `Constants` is not referenced anywhere in this notebook; presumably the
# import is what makes `synapsesql` available on `spark.read` — confirm before removing it.
from com.microsoft.spark.fabric import Constants
df_warehouse = spark.read.synapsesql(warehouse_table)

In [None]:
# Time each benchmark query against the Warehouse table, in the same order as the Lakehouse run
warehouse_metrics = []
for query_func, label in (
    (run_filter_query, "Warehouse Filter cat_1 == 'A'"),
    (run_aggregate_query, "Warehouse Aggregate by cat_1"),
    (run_batch_query, "Warehouse Batch update_type == update, ts_1 > '2025-01-01'"),
    (run_topn_query, "Warehouse Top 10 num_1"),
):
    # Append as we go so earlier results survive if a later query fails.
    warehouse_metrics.append(log_query_perf(query_func, df_warehouse, label))

In [None]:
# Combine the Lakehouse and Warehouse measurements and render them as one table
import pandas as pd

# Lakehouse rows first, then Warehouse rows.
all_metrics = [*lakehouse_metrics, *warehouse_metrics]
metrics_df = pd.DataFrame(data=all_metrics)
display(metrics_df)

# Tell the reader this stage has finished
print("Query performance benchmarking complete. Metrics above can be visualized in the next step.")

In [None]:
# Log metrics to metrics table in BenchmarkLakehouse
def log_query_to_metrics(test_case_id, format, location, rows, query_type, query_time_s, notes=""):
    """Append a single query-benchmark record to ``{target_lakehouse}.metrics``.

    The record is built against ``metrics_schema``. Columns that do not apply
    to query runs are filled with an empty string (``update_strategy``) or
    NaN (``ingest_time_s``, ``spinup_time_s``, ``storage_size_mb``, ``cu_used``).

    Args:
        test_case_id: Test case identifier stored in the first column.
        format: Value for the ``format`` column.
        location: Value for the ``location`` column.
        rows: Row count reported by the query.
        query_type: Label describing the query that was run.
        query_time_s: Elapsed query time in seconds; converted with ``float()``.
        notes: Optional free-text note (defaults to an empty string).
    """
    # NaN (not None) marks non-applicable numeric fields so they fit the FloatType columns.
    not_applicable = float('nan')
    record = (
        test_case_id,
        datetime.now(),          # timestamp of the log write
        format,
        location,
        rows,
        "",                      # update_strategy (N/A for queries)
        not_applicable,          # ingest_time_s
        not_applicable,          # spinup_time_s
        not_applicable,          # storage_size_mb
        query_type,
        float(query_time_s),
        not_applicable,          # cu_used
        notes,
    )
    single_row_df = spark.createDataFrame([record], schema=metrics_schema)
    destination = f'{target_lakehouse}.metrics'
    single_row_df.write.mode('append').saveAsTable(destination)

In [None]:
# Write every collected measurement to the metrics table, one row per query
def _log_metric_batch(metrics, test_case_id, table_format, notes):
    """Call log_query_to_metrics once per entry in ``metrics`` with shared labels."""
    for metric in metrics:
        log_query_to_metrics(
            test_case_id=test_case_id,
            format=table_format,
            location="Tables",
            rows=metric['rows'],
            query_type=metric['query'],
            query_time_s=metric['time_s'],
            notes=notes,
        )


# Lakehouse results are logged first
_log_metric_batch(lakehouse_metrics, "TC.11.x", "Delta", "Lakehouse query performance")

# Warehouse results are logged second
_log_metric_batch(warehouse_metrics, "TC.12.x", "Warehouse", "Warehouse query performance")