In [None]:
%%configure -f
{
  "conf": {
    "spark.notebook.parameters": "{\"SOURCE\": \"lakehouse\", \"FORMAT\": \"warehouse\", \"AZURE_SQL_SERVER\": \"benchmarking-bff\", \"AZURE_SQL_DB\": \"benchmarking\", \"AZURE_SQL_SCHEMA\": \"dbo\"}"
  },
  "defaultLakehouse": {
    "name": "BenchmarkLakehouse"
  }
}

# 📓 1. Ingest Data
## Ingestion Module — single parameter-set run

This notebook ingests one parameter set (decided at runtime) and supports four ingestion paths:
- lakehouse -> delta
- lakehouse -> warehouse
- sql -> delta
- sql -> warehouse

Behavior and style:
- Linear, happy-path code with debug prints.
- Parameters are read from SparkConf (spark.notebook.parameters). Keep the notebook simple: it runs exactly one path per execution.
- SQL connectivity reuses the token-based pyodbc pattern from generate_data.ipynb to read/write to Azure SQL.
- Metrics for the single ingest are appended to BenchmarkLakehouse.metrics (same schema as before).

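Required params (via spark.notebook.parameters JSON string):
{ "dataset_name": "10k", "row_count": 10000, "source": "lakehouse|sql", "format": "delta|warehouse", "update_strategy": "Full Refresh|Full Compare|Incremental", "target_lakehouse": "BenchmarkLakehouse", "target_warehouse": "BenchmarkWarehouse", "AZURE_SQL_SERVER": "...", "AZURE_SQL_DB": "..." }

Optional params: "AZURE_SQL_SCHEMA" (defaults to "dbo") and "data_source_lakehouse_path" (defaults to "/Files/<dataset_name>base/base.parquet"). "AZURE_SQL_SERVER" and "AZURE_SQL_DB" must be set when source is "sql".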

This notebook is case-insensitive for parameter keys (it accepts SOURCE or source, FORMAT or format, etc.).
If spark.notebook.parameters is not provided, the notebook will fall back to the original hard-coded sample variables for local/debug runs.

In [None]:
import os
import json
import time
from datetime import datetime
from pathlib import Path
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType
from pyspark.sql.functions import lit

print('Setup imports done')


In [None]:
# Resolve the run parameters for this execution.
# The JSON blob in SparkConf (spark.notebook.parameters) is layered on top of the
# embedded defaults, so a caller that supplies only some keys still gets a complete,
# usable parameter set (previously a partial blob left e.g. dataset_name as None).
spark = SparkSession.builder.getOrCreate()
conf_key = 'spark.notebook.parameters'

# Values used for every key the caller does not supply (also the full debug parameter set).
default_params = {
    'dataset_name': '10k',
    'row_count': 10000,
    'source': 'lakehouse',      # lakehouse | sql
    'format': 'delta',         # delta | warehouse
    'update_strategy': 'Full Refresh',
    'target_lakehouse': 'BenchmarkLakehouse',
    'target_warehouse': 'BenchmarkWarehouse',
}

# Look the blob up in the runtime conf first, then in the SparkContext conf.
# Either lookup may raise depending on the runtime, hence the best-effort guards.
conf_str = None
try:
    conf_str = spark.conf.get(conf_key, None)
except Exception:
    conf_str = None
if not conf_str:
    try:
        conf_str = spark.sparkContext.getConf().get(conf_key, None)
    except Exception:
        conf_str = None

if conf_str:
    print('Loaded parameters from spark.notebook.parameters')
    raw_params = json.loads(conf_str)
    if not isinstance(raw_params, dict):
        raise ValueError(f'{conf_key} must be a JSON object, got {type(raw_params).__name__}')
else:
    print('spark.notebook.parameters not found - falling back to embedded defaults (for debug)')
    raw_params = dict(default_params)

# Normalize keys to be case-insensitive (accept SOURCE or source etc.) and overlay
# them on the defaults. Explicit nulls are ignored so they cannot erase a default.
params = dict(default_params)
params.update({str(k).lower(): v for k, v in raw_params.items() if v is not None})

print('Params (normalized keys):')
print(json.dumps(params, indent=2))

# Convenience locals. Keys are always lowercase at this point, so no upper-case lookups are needed.
dataset_name = params['dataset_name']
row_count = int(params['row_count'])
source = str(params['source']).strip().lower()
fmt = str(params['format']).strip().lower()
update_strategy = params['update_strategy']
target_lakehouse = params['target_lakehouse']
target_warehouse = params['target_warehouse']
AZURE_SQL_SERVER = params.get('azure_sql_server')
AZURE_SQL_DB = params.get('azure_sql_db')
AZURE_SQL_SCHEMA = params.get('azure_sql_schema', 'dbo')
# NOTE(review): the default path has no separator between the dataset name and 'base'
# (e.g. /Files/10kbase/base.parquet) - presumably this matches the generator's layout; confirm.
data_source_lakehouse_path = params.get('data_source_lakehouse_path', params.get('data_source_path', f"/Files/{dataset_name}base/base.parquet"))

print(f"Effective run: dataset={dataset_name} rows={row_count} source={source} format={fmt} strategy={update_strategy}")


In [None]:
# Column layout of the metrics table as (name, Spark type) pairs, in table order.
_metrics_columns = [
    ("test_case_id", StringType()),
    ("timestamp", TimestampType()),
    ("format", StringType()),
    ("location", StringType()),
    ("rows", IntegerType()),
    ("update_strategy", StringType()),
    ("ingest_time_s", FloatType()),
    ("spinup_time_s", FloatType()),
    ("storage_size_mb", FloatType()),
    ("query_type", StringType()),
    ("query_time_s", FloatType()),
    ("cu_used", FloatType()),
    ("notes", StringType()),
]

# Every metrics column is nullable.
metrics_schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _metrics_columns]
)

# DDL for the same thirteen columns; a no-op when the table already exists.
create_metrics_sql = f"""
CREATE TABLE IF NOT EXISTS {target_lakehouse}.metrics (
    test_case_id STRING,
    timestamp TIMESTAMP,
    format STRING,
    location STRING,
    rows INT,
    update_strategy STRING,
    ingest_time_s FLOAT,
    spinup_time_s FLOAT,
    storage_size_mb FLOAT,
    query_type STRING,
    query_time_s FLOAT,
    cu_used FLOAT,
    notes STRING
)
"""
spark.sql(create_metrics_sql)
print('Ensured metrics table exists in', target_lakehouse)


## Ingest logic
Pick the proper source read path and the proper destination write path based on parameters. Only a single destination table is written per run (named by strategy + format mapping).

In [None]:
# --- SQL token helpers (copied from generate_data.ipynb - concise happy-path) ---
# mssparkutils is optional here: the token helper checks for None before using it,
# and the Delta storage-size lookup skips itself when it is missing.
try:
    from notebookutils import mssparkutils
except Exception:
    # in environments without notebookutils available, set a stub; SQL paths will fail later with clear message
    mssparkutils = None

# Key of the attrs_before entry under which the packed access token is handed to pyodbc.connect.
SQL_COPT_SS_ACCESS_TOKEN = 1256

def _token_struct():
    """Return the SQL access token packed as a length-prefixed byte string.

    A token for ``https://database.windows.net/`` is requested from
    mssparkutils, each of its UTF-8 bytes is followed by a zero byte, and the
    widened payload is prefixed with its byte length packed as ``'=i'``.

    Raises:
        RuntimeError: if mssparkutils is not available.
    """
    import struct

    if not mssparkutils:
        raise RuntimeError('mssparkutils not available to get token')
    token = mssparkutils.credentials.getToken('https://database.windows.net/')
    utf8_bytes = token.encode('utf-8')
    # Even offsets receive the token bytes; odd offsets keep their zero fill.
    widened = bytearray(2 * len(utf8_bytes))
    widened[0::2] = utf8_bytes
    return struct.pack('=i', len(widened)) + bytes(widened)

def _pyodbc_conn_with_retry(server=None, database=None, timeout=120, retries=2, backoff=2):
    """Open a token-authenticated pyodbc connection to Azure SQL, retrying on failure.

    Args:
        server: Server host name; falls back to AZURE_SQL_SERVER. The suffix
            '.database.windows.net' is appended when it is missing.
        database: Database name; falls back to AZURE_SQL_DB.
        timeout: Forwarded to ``pyodbc.connect`` as ``timeout``.
        retries: Total number of connection attempts. Values below 1 are
            treated as 1 (previously they skipped the loop and ended in
            ``raise None``, i.e. a TypeError).
        backoff: Base delay in seconds; the wait after a failed attempt is
            ``backoff * attempt``.

    Returns:
        An open pyodbc connection.

    Raises:
        RuntimeError: if no server or database is available.
        Exception: whatever the final failed attempt raised.
    """
    import pyodbc
    server = server or AZURE_SQL_SERVER
    database = database or AZURE_SQL_DB
    if not server or not database:
        raise RuntimeError('AZURE_SQL_SERVER and AZURE_SQL_DB must be set (or passed in)')
    # ensure fully-qualified server name
    if not server.lower().endswith('.database.windows.net'):
        server = server.rstrip('.') + '.database.windows.net'
    conn_str = (
        'Driver={ODBC Driver 18 for SQL Server};'
        f'Server=tcp:{server},1433;'
        f'Database={database};'
        'Encrypt=yes;TrustServerCertificate=no;'
    )
    attempts = max(1, int(retries))
    for attempt in range(1, attempts + 1):
        try:
            # The token is requested inside the loop, so each attempt uses a freshly fetched one.
            return pyodbc.connect(conn_str, attrs_before={SQL_COPT_SS_ACCESS_TOKEN: _token_struct()}, timeout=timeout)
        except Exception:
            if attempt == attempts:
                # Out of attempts: surface the last failure with its original traceback.
                raise
            time.sleep(backoff * attempt)

def push_df_concise(df_pd, table_name, schema_name='dbo', server=None, database=None):
    """Recreate ``schema_name.table_name`` and bulk-insert a pandas DataFrame via pyodbc.

    Used for writing to Warehouse via pyodbc when synapsesql isn't available.
    Any existing table of that name is dropped first. Column SQL types are
    derived from the column names only (minimal schema mapping).

    Args:
        df_pd: pandas DataFrame to upload; it is copied, not modified.
        table_name: Target table name. Interpolated into the SQL text as-is,
            so it must come from a trusted source.
        schema_name: Target schema. Interpolated as-is, like ``table_name``.
        server: Forwarded to ``_pyodbc_conn_with_retry``.
        database: Forwarded to ``_pyodbc_conn_with_retry``.

    Returns:
        Number of rows inserted.
    """
    def _col_type_from_name(col):
        # Column-name convention -> SQL type; anything unrecognised becomes NVARCHAR(MAX).
        col = str(col)
        if col == 'id':
            return 'BIGINT'
        if col.startswith('num_'):
            return 'FLOAT'
        if col.startswith('cat_'):
            return 'NVARCHAR(100)'
        if col.startswith('ts_'):
            return 'DATETIMEOFFSET'
        if col == 'update_type':
            return 'NVARCHAR(32)'
        return 'NVARCHAR(MAX)'

    def _bracket(col):
        # T-SQL bracket quoting: a literal ']' inside the name is doubled.
        return '[' + str(col).replace(']', ']]') + ']'

    def _to_param(value):
        # Missing values go in as NULL; pandas timestamps become plain datetimes.
        if pd.isna(value):
            return None
        if isinstance(value, pd.Timestamp):
            return value.to_pydatetime()
        return value

    # sanitize df: empty strings in object columns are stored as NULL
    df2 = df_pd.copy()
    for c in df2.columns:
        if df2[c].dtype == object:
            df2[c] = df2[c].replace('', pd.NA)

    full_table = f'{schema_name}.{table_name}'
    # One quoted form per column, shared by DDL and INSERT so both always name the same columns.
    quoted_cols = [_bracket(c) for c in df2.columns]
    cols_ddl = ', '.join(
        f'{quoted} {_col_type_from_name(c)} NULL' for quoted, c in zip(quoted_cols, df2.columns)
    )
    create_sql = f"IF OBJECT_ID(N'{full_table}', 'U') IS NOT NULL DROP TABLE {full_table}; CREATE TABLE {full_table} ({cols_ddl});"
    # Joined outside the f-string: reusing the enclosing quote character inside an
    # f-string expression is a SyntaxError before Python 3.12.
    col_list = ', '.join(quoted_cols)
    placeholders = ', '.join('?' for _ in quoted_cols)
    insert_sql = f'INSERT INTO {full_table} ({col_list}) VALUES ({placeholders})'
    records = [tuple(_to_param(v) for v in row) for row in df2.itertuples(index=False, name=None)]

    conn = _pyodbc_conn_with_retry(server=server, database=database)
    try:
        cur = conn.cursor()
        try:
            cur.execute(create_sql)
            conn.commit()
            # An empty frame still (re)creates the table; the insert is skipped
            # because executemany rejects an empty parameter list.
            if records:
                cur.fast_executemany = True
                cur.executemany(insert_sql, records)
                conn.commit()
        finally:
            cur.close()
    finally:
        # Always release the connection, including when DDL or the insert fails.
        conn.close()
    return len(records)

print('SQL helper functions ready (pyodbc token approach)')


In [None]:
# Map format + update_strategy -> target table name (consistent with prior naming)
def choose_table(format_lower, strategy):
    """Return the target table name for a format / update-strategy pair.

    Args:
        format_lower: Lower-cased format. ``'delta'`` selects the ``delta_``
            tables; any other value selects the ``wh_table_`` tables.
        strategy: Update strategy text, matched case-insensitively by
            substring. ``'refresh'`` is checked before ``'compare'``; a
            strategy containing neither maps to the increment table.

    Returns:
        The table name, e.g. ``'delta_refresh_load'``.
    """
    lowered = strategy.lower()
    prefix = 'delta' if format_lower == 'delta' else 'wh_table'
    for keyword in ('refresh', 'compare'):
        if keyword in lowered:
            return f'{prefix}_{keyword}_load'
    return f'{prefix}_increment_load'

# Resolve the single destination table for this run from the normalized format and strategy.
target_table = choose_table(fmt, update_strategy)
print('Target table chosen:', target_table)


In [None]:
# Read source data depending on 'source' param
# The timer started here stops only after the branch finishes, so the measured
# "spinup" covers the read plus the df.count() calls made for the debug prints.
spinup_start = time.time()
if source == 'lakehouse':
    # default parquet path pattern; controllers/orchestrator should set explicit path if needed
    base_file = data_source_lakehouse_path
    print('Reading parquet from lakehouse path:', base_file)
    df = spark.read.parquet(base_file)
    print('Read lakehouse parquet rows:', df.count())
elif source == 'sql':
    # Use pyodbc + token to pull base_{dataset_name} into pandas, then to Spark (happy-path)
    if not (AZURE_SQL_SERVER and AZURE_SQL_DB):
        raise SystemExit('AZURE_SQL_SERVER and AZURE_SQL_DB must be set in parameters for source=sql')
    # Schema and dataset name are interpolated into the query text as-is; both come from the run parameters.
    table_name_sql = f"{AZURE_SQL_SCHEMA}.base_{dataset_name}"
    print(f'Reading Azure SQL table {table_name_sql} from {AZURE_SQL_SERVER}/{AZURE_SQL_DB} (token-based)')
    conn = _pyodbc_conn_with_retry(server=AZURE_SQL_SERVER, database=AZURE_SQL_DB)
    try:
        # pandas read_sql works with pyodbc connection
        # The whole table is loaded into driver memory as a pandas frame before it reaches Spark.
        pdf = pd.read_sql(f'SELECT * FROM {table_name_sql}', conn)
        print('Pandas rows read from SQL:', len(pdf))
        # Ensure timestamps are timezone-aware if present: convert naive to UTC
        for c in pdf.columns:
            if str(pdf[c].dtype).startswith('datetime'):
                # errors='coerce' means values that cannot be parsed become NaT instead of raising
                pdf[c] = pd.to_datetime(pdf[c], errors='coerce')
                if getattr(pdf[c].dt, 'tz', None) is None:
                    pdf[c] = pdf[c].dt.tz_localize('UTC')
        # Convert to Spark DataFrame
        df = spark.createDataFrame(pdf)
        print('Converted pandas -> spark rows:', df.count())
    finally:
        # Close the SQL connection whether or not the read/convert succeeded.
        conn.close()
else:
    raise SystemExit(f'Unsupported source: {source}')
spinup_end = time.time()
spinup_duration = spinup_end - spinup_start
print('Spinup duration (s):', spinup_duration)


In [None]:
# Write the dataframe to the chosen target table according to format.
# ingest_duration covers only the write that actually produced the table;
# storage_size_mb stays NaN unless the Delta size lookup succeeds.
ingest_start = time.time()
storage_size_mb = float('nan')
if fmt == 'delta':
    # Write to delta table in the target lakehouse
    table_full = f"{target_lakehouse}.{target_table}"
    print('Writing Delta table ->', table_full)
    df.write.mode('overwrite').saveAsTable(table_full)
    ingest_end = time.time()
    ingest_duration = ingest_end - ingest_start
    # Best-effort storage size via mssparkutils: a failure is reported and recorded as NaN,
    # it never fails the run.
    # NOTE(review): only the entries returned by fs.ls for the table folder are summed -
    # confirm whether that listing covers every data file of the table.
    try:
        if mssparkutils:
            table_path = f"/lakehouse/{target_lakehouse}/Tables/{target_table}"
            storage_files = mssparkutils.fs.ls(table_path)
            storage_size_mb = sum(f.size for f in storage_files) / (1024 * 1024)
    except Exception as size_err:
        storage_size_mb = float('nan')
        print('Could not determine Delta storage size (recorded as NaN):', size_err)
    print(f'Delta write complete. Ingest time: {ingest_duration:.2f}s | storage_size_mb: {storage_size_mb}')
elif fmt == 'warehouse':
    # Try synapsesql write first (Fabric runtime). If not available, fallback to pyodbc push via pandas.
    # Warehouse storage size is not easy to get; storage_size_mb is left as NaN on this path.
    table_full = f"{target_warehouse}.dbo.{target_table}"
    print('Writing Warehouse table ->', table_full)
    try:
        # attempt the synapsesql API (available in Fabric runtimes)
        df.write.mode('overwrite').synapsesql(table_full)
        ingest_end = time.time()
        ingest_duration = ingest_end - ingest_start
        print('synapsesql write succeeded')
    except Exception as _e:
        # fallback: convert to pandas and push via pyodbc token helper
        print('synapsesql not available or failed; falling back to pyodbc push via pandas:', _e)
        # Restart the clock so the time spent in the failed synapsesql attempt
        # does not inflate the ingest time recorded for the fallback write.
        ingest_start = time.time()
        pdf = df.toPandas()
        # The fallback connects with the Azure SQL server/database parameters;
        # target_warehouse is not used on this path.
        pushed = push_df_concise(pdf, target_table, schema_name='dbo', server=AZURE_SQL_SERVER, database=AZURE_SQL_DB)
        ingest_end = time.time()
        ingest_duration = ingest_end - ingest_start
        print(f'pyodbc fallback inserted rows: {pushed}')
else:
    raise SystemExit(f'Unsupported format: {fmt}')

print('Ingest duration (s):', ingest_duration)


In [None]:
# Log a single metrics row for this ingest run.
# The tuple is positional: its order must match the field order of metrics_schema.
metrics_row = [
    (
        f"TC.{dataset_name}.{fmt}.{update_strategy}",   # test_case_id
        datetime.now(),                                  # timestamp
        fmt.upper(),                                     # format
        'Delta' if fmt == 'delta' else 'Warehouse',      # location
        int(row_count),                                  # rows (the row_count parameter, not a measured count)
        update_strategy,                                 # update_strategy
        float(ingest_duration),                          # ingest_time_s
        float(spinup_duration),                          # spinup_time_s
        float(storage_size_mb),                          # storage_size_mb
        'N/A',                                           # query_type (no query is run by this notebook)
        float('nan'),                                    # query_time_s
        # cu_used: nothing in this notebook measures capacity units, so record NaN
        # (the row count was previously written into this column by mistake).
        float('nan'),
        f'Ingest from {source} into {fmt} target table {target_table}'  # notes
    )
]
spark.createDataFrame(metrics_row, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
print('Metrics appended to', target_lakehouse + '.metrics')


In [None]:
# Final confirmation: echo the parameters and the destination table of this run.
print('Ingest step completed successfully for single parameter set.')
print('Summary:')
print(f" dataset: {dataset_name} | rows: {row_count} | source: {source} | format: {fmt} | target_table: {target_table}")
