In [None]:
%%configure -f
{
  "conf": {
    "spark.notebook.parameters": "{\"DATASET_NAME\": \"10k\", \"SOURCE\": \"lakehouse\", \"FORMAT\": \"warehouse\", \"AZURE_SQL_SERVER\": \"benchmarking-bff\", \"AZURE_SQL_DB\": \"benchmarking\", \"AZURE_SQL_SCHEMA\": \"dbo\"}"
  },
  "defaultLakehouse": {
    "name": "BenchmarkLakehouse"
  }
}

# 📓 1. Ingest Data
## Ingestion Module — single parameter-set run

This notebook ingests one parameter set (decided at runtime) and supports four ingestion paths:
- lakehouse -> delta
- lakehouse -> warehouse
- sql -> delta
- sql -> warehouse

Style: strict parameter expectations (no silent defaults for required keys). Required `spark.notebook.parameters` keys for this notebook: `DATASET_NAME`, `SOURCE`, `FORMAT`. If `SOURCE` is `"sql"`, `AZURE_SQL_SERVER` and `AZURE_SQL_DB` are also required; `AZURE_SQL_SCHEMA` is optional and defaults to `dbo`.

In [None]:
import json
import time
from datetime import datetime
import requests
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType
from pyspark.sql.functions import lit

# Controller workspace and lakehouse names are fixed here for simplicity
# NOTE(review): controller_workspace_name is not referenced by any cell visible in
# this notebook — confirm whether it is still needed.
controller_workspace_name = "BFF Controller"
# Lakehouse that holds the source dataset files; the parameter cell uses it to build
# the default parquet path for the lakehouse source.
controller_lakehouse_name = "DataSourceLakehouse"

# Progress marker so the cell output shows that setup finished.
print('Setup imports done')


In [None]:
# Resolve the Spark session and the JSON parameter payload that the %%configure
# cell stores under the spark.notebook.parameters configuration key.
spark = SparkSession.builder.getOrCreate()
conf_key = 'spark.notebook.parameters'
conf_str = None
try:
    conf_str = spark.conf.get(conf_key, None)
except Exception:
    conf_str = None
if not conf_str:
    # Fall back to the SparkContext-level configuration when the runtime conf is empty.
    try:
        conf_str = spark.sparkContext.getConf().get(conf_key, None)
    except Exception:
        conf_str = None

if not conf_str:
    raise SystemExit('Missing required spark.notebook.parameters. Provide DATASET_NAME, SOURCE, and FORMAT in the %%configure cell.')

# Report a malformed payload as a clear configuration error instead of a raw traceback.
try:
    raw_params = json.loads(conf_str)
except ValueError as exc:
    raise SystemExit(f'spark.notebook.parameters is not valid JSON: {exc}') from exc
if not isinstance(raw_params, dict):
    raise SystemExit('spark.notebook.parameters must be a JSON object of key/value pairs')

# Required minimal parameters (no defaults). A key that is present but null or blank
# is treated as missing, because it would otherwise produce table names such as "delta_".
required = ['DATASET_NAME', 'SOURCE', 'FORMAT']
missing = [k for k in required if raw_params.get(k) is None or not str(raw_params[k]).strip()]
if missing:
    raise SystemExit(f"Missing required parameters in spark.notebook.parameters: {', '.join(missing)}")

dataset_name = raw_params['DATASET_NAME']
source = str(raw_params['SOURCE']).strip().lower()
fmt = str(raw_params['FORMAT']).strip().lower()

# Fail fast on unsupported values. Without this check an unknown FORMAT silently
# received a warehouse table name and was only rejected after the full source read.
supported_sources = ('lakehouse', 'sql')
supported_formats = ('delta', 'warehouse')
if source not in supported_sources:
    raise SystemExit(f'Unsupported source: {source}')
if fmt not in supported_formats:
    raise SystemExit(f'Unsupported format: {fmt}')

# Hard-coded target lakehouse/warehouse
target_lakehouse = 'BenchmarkLakehouse'
target_warehouse = 'BenchmarkWarehouse'

# SQL params (server and database are required only when source == 'sql';
# the schema is optional and defaults to 'dbo')
AZURE_SQL_SERVER = raw_params.get('AZURE_SQL_SERVER')
AZURE_SQL_DB = raw_params.get('AZURE_SQL_DB')
AZURE_SQL_SCHEMA = raw_params.get('AZURE_SQL_SCHEMA', 'dbo')
if source == 'sql' and (not AZURE_SQL_SERVER or not AZURE_SQL_DB):
    raise SystemExit('SOURCE=sql requires AZURE_SQL_SERVER and AZURE_SQL_DB in spark.notebook.parameters')

# For initial ingest this notebook writes to the refresh target table name
if fmt == 'delta':
    target_table = f'delta_{dataset_name}'
else:
    target_table = f'wh_table_{dataset_name}'

# Resolve DataSourceLakehouse path using the controller lakehouse name + dataset_name.
# An explicit data_source_lakehouse_path parameter, when provided, takes precedence.
data_source_lakehouse_path = raw_params.get('data_source_lakehouse_path') or f"/lakehouse/{controller_lakehouse_name}/Files/{dataset_name}/base/base.parquet"

print(f"Params: DATASET_NAME={dataset_name} SOURCE={source} FORMAT={fmt}")
print('Resolved data_source_lakehouse_path =', data_source_lakehouse_path)


In [None]:
# Column name -> Spark type for one row of the benchmark metrics table.
# The order here is the column order of the schema and must match the DDL below.
_metric_columns = [
    ("test_case_id", StringType),
    ("timestamp", TimestampType),
    ("source", StringType),
    ("format", StringType),
    ("rows", IntegerType),
    ("update_strategy", StringType),
    ("action", StringType),
    ("ingest_time_s", FloatType),
    ("spinup_time_s", FloatType),
    ("query_type", StringType),
    ("query_time_s", FloatType),
    ("cu_used", FloatType),
    ("notes", StringType),
]

# Every metrics column is nullable.
metrics_schema = StructType(
    [StructField(column_name, column_type(), True) for column_name, column_type in _metric_columns]
)

# Create the metrics table in the target lakehouse on first use; later runs are a no-op.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {target_lakehouse}.metrics (
    test_case_id STRING,
    timestamp TIMESTAMP,
    source STRING,
    format STRING,
    rows INT,
    update_strategy STRING,
    action STRING,
    ingest_time_s FLOAT,
    spinup_time_s FLOAT,
    query_type STRING,
    query_time_s FLOAT,
    cu_used FLOAT,
    notes STRING
)
""")
print('Ensured metrics table exists in', target_lakehouse)


In [None]:
# SQL helpers (only used when source == 'sql')

# Pre-connect attribute key under which the access-token structure is handed to
# pyodbc.connect(attrs_before=...).
SQL_COPT_SS_ACCESS_TOKEN = 1256

# notebookutils may not be importable in every environment; when it is not,
# mssparkutils is left as None and the token helper raises on use.
try:
    from notebookutils import mssparkutils
except Exception:
    mssparkutils = None

def _token_struct():
    if not mssparkutils:
        raise RuntimeError('mssparkutils not available to get token')
    t = mssparkutils.credentials.getToken('https://database.windows.net/')
    exptoken = b''.join(bytes([c]) + b'\x00' for c in t.encode('utf-8'))
    return __import__('struct').pack('=i', len(exptoken)) + exptoken

def _pyodbc_conn_with_retry(server=None, database=None, timeout=120, retries=2, backoff=2):
    import pyodbc
    server = server or AZURE_SQL_SERVER
    database = database or AZURE_SQL_DB
    if not server or not database:
        raise RuntimeError('AZURE_SQL_SERVER and AZURE_SQL_DB must be set (or passed in)')
    if not server.lower().endswith('.database.windows.net'):
        server = server.rstrip('.') + '.database.windows.net'
    conn_str = (
        'Driver={ODBC Driver 18 for SQL Server};'
        f'Server=tcp:{server},1433;'
        f'Database={database};'
        'Encrypt=yes;TrustServerCertificate=no;'
    )
    last_exc = None
    for attempt in range(1, retries + 1):
        try:
            return pyodbc.connect(conn_str, attrs_before={SQL_COPT_SS_ACCESS_TOKEN: _token_struct()}, timeout=timeout)
        except Exception as e:
            last_exc = e
            if attempt < retries:
                time.sleep(backoff * attempt)
            else:
                raise
    raise last_exc

print('SQL helper functions ready')


In [None]:
# Time the source read ("spin-up"). Each .count() below is a Spark action, so the
# read is actually executed inside the timed window.
spinup_start = time.time()
if source not in ('lakehouse', 'sql'):
    raise SystemExit(f'Unsupported source: {source}')

if source == 'lakehouse':
    # Lakehouse source: read the base parquet file directly with Spark.
    base_file = data_source_lakehouse_path
    print('Reading parquet from lakehouse path:', base_file)
    df = spark.read.parquet(base_file)
    lakehouse_row_count = df.count()
    print('Read lakehouse parquet rows:', lakehouse_row_count)
else:
    # SQL source: pull the whole table into pandas over pyodbc, then hand it to Spark.
    table_name_sql = f"{AZURE_SQL_SCHEMA}.base_{dataset_name}"
    print(f'Reading Azure SQL table {table_name_sql} from {AZURE_SQL_SERVER}/{AZURE_SQL_DB} (token-based)')
    conn = _pyodbc_conn_with_retry(server=AZURE_SQL_SERVER, database=AZURE_SQL_DB)
    try:
        select_all = f'SELECT * FROM {table_name_sql}'
        pdf = pd.read_sql(select_all, conn)
        print('Pandas rows read from SQL:', len(pdf))
        # Timezone-naive datetime columns are localized to UTC before conversion.
        datetime_columns = [
            column for column in pdf.columns
            if str(pdf[column].dtype).startswith('datetime')
        ]
        for column in datetime_columns:
            pdf[column] = pd.to_datetime(pdf[column], errors='coerce')
            is_naive = getattr(pdf[column].dt, 'tz', None) is None
            if is_naive:
                pdf[column] = pdf[column].dt.tz_localize('UTC')
        df = spark.createDataFrame(pdf)
        print('Converted pandas -> spark rows:', df.count())
    finally:
        # Always release the ODBC connection, even when the read fails.
        conn.close()

spinup_end = time.time()
spinup_duration = spinup_end - spinup_start
print('Spinup duration (s):', spinup_duration)


In [None]:
# Write the DataFrame to the selected target and time the write.
# storage_size_mb stays NaN unless the Delta table folder can be listed afterwards.
ingest_start = time.time()
storage_size_mb = float('nan')
if fmt == 'delta':
    table_full = f"{target_lakehouse}.{target_table}"
    print('Writing Delta table ->', table_full)
    df.write.mode('overwrite').saveAsTable(table_full)
    ingest_end = time.time()
    ingest_duration = ingest_end - ingest_start
    # Best-effort size measurement, deliberately outside the timed window.
    # Only the entries directly under the table folder are summed.
    try:
        if mssparkutils:
            table_path = f"/lakehouse/{target_lakehouse}/Tables/{target_table}"
            storage_files = mssparkutils.fs.ls(table_path)
            storage_size_mb = sum(f.size for f in storage_files) / (1024 * 1024)
    except Exception:
        storage_size_mb = float('nan')
    print(f'Delta write complete. Ingest time: {ingest_duration:.2f}s | storage_size_mb: {storage_size_mb}')
elif fmt == 'warehouse':
    table_full = f"{target_warehouse}.dbo.{target_table}"
    print('Writing Warehouse table ->', table_full)
    # NOTE(review): the Fabric Spark connector documentation shows
    # `import com.microsoft.spark.fabric` before calling synapsesql from PySpark;
    # this notebook never imports it — confirm, otherwise this path may always fall back.
    try:
        df.write.mode('overwrite').synapsesql(table_full)
        print('synapsesql write succeeded')
    except Exception as _e:
        print('synapsesql not available or failed; falling back to parquet write:', _e)
        fallback_path = f"/lakehouse/{target_lakehouse}/Files/{dataset_name}/{target_table}/"
        print('Falling back to parquet write at:', fallback_path)
        try:
            df.write.mode('overwrite').parquet(fallback_path)
            print('Parquet fallback write complete at', fallback_path)
        except Exception as e2:
            # Both write paths failed, so nothing was ingested. Stop here instead of
            # recording metrics and reporting success for a load that never happened.
            print('Parquet fallback failed:', e2)
            raise
    # When the fallback ran, the duration also includes the failed synapsesql attempt.
    ingest_end = time.time()
    ingest_duration = ingest_end - ingest_start
else:
    raise SystemExit(f'Unsupported format: {fmt}')

print('Ingest duration (s):', ingest_duration)


In [None]:
# Build one metrics row keyed by column name, then order it by metrics_schema.
# The previous positional tuple had drifted from the schema: a float landed in the
# STRING `action` column (createDataFrame rejects that during schema verification),
# the format was stored as `source`, the storage size as `spinup_time_s`, and the
# row count as `cu_used`.

# Count once; every df.count() is a separate Spark action over the source data.
row_count = int(df.count())

metrics_record = {
    "test_case_id": f"TC.{dataset_name}.{fmt}.initial_load",
    "timestamp": datetime.now(),
    # NOTE(review): stored exactly as parsed from the SOURCE parameter; confirm the
    # casing that readers of the metrics table expect.
    "source": source,
    "format": 'Delta' if fmt == 'delta' else 'Warehouse',
    "rows": row_count,
    "update_strategy": 'Initial Load',
    # NOTE(review): the original tuple had no value for `action`; 'Ingest' is an
    # assumed label — confirm against the other notebooks writing this table.
    "action": 'Ingest',
    "ingest_time_s": float(ingest_duration),
    "spinup_time_s": float(spinup_duration),
    "query_type": 'N/A',
    "query_time_s": float('nan'),
    # Capacity-unit usage is not measured in this notebook.
    "cu_used": float('nan'),
    # The schema has no storage-size column, so the measured size is kept in the notes.
    "notes": f'Ingest from {source} into {fmt} target table {target_table} | storage_size_mb={storage_size_mb}',
}

# A KeyError here means a column was added to metrics_schema without a value above.
metrics_row = [tuple(metrics_record[field.name] for field in metrics_schema.fields)]

spark.createDataFrame(metrics_row, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
print('Metrics appended to', target_lakehouse + '.metrics')


In [None]:
# Closing summary of the parameter set that was ingested, one line per entry.
closing_lines = (
    'Ingest step completed successfully for single parameter set.',
    'Summary:',
    f" dataset: {dataset_name} | source: {source} | format: {fmt} | target_table: {target_table}",
)
for closing_line in closing_lines:
    print(closing_line)
