In [None]:
%%configure
# Parameters cell: every value that an orchestrator may override is declared here.
# Keeping them in one place makes the notebook easy to validate when it is
# invoked programmatically with different values.
# NOTE(review): this cell opens with the `%%configure` magic; confirm the runtime
# still executes the assignments below as Python -- TODO confirm.

# -- Datasets array (interactive defaults differ from the automation defaults).
# The save loop reads name, row_count, change_fraction, new_fraction,
# delete_fraction and seed from each entry; "description" is not read by the
# cells shown in this notebook.
DATASETS_PARAM = [
    {
        "name": "1k",
        "row_count": 1000,
        "change_fraction": 0.01,
        "new_fraction": 0.005,
        "delete_fraction": 0.001,
        "seed": 42,
        "description": "Interactive small dataset (1k rows)"
    },
    {
        "name": "100k",
        "row_count": 100000,
        "change_fraction": 0.01,
        "new_fraction": 0.005,
        "delete_fraction": 0.001,
        "seed": 42,
        "description": "Interactive medium dataset (100k rows)"
    }
]

# Do the test cases (datasets and parameter sets) include Azure SQL as a data source?
# NOTE(review): not referenced by the cells visible here -- presumably consumed elsewhere; verify.
PUSH_TO_AZURE_SQL = True

# Azure SQL / ODBC parameters (interactive defaults).
# These should be passed in from the automation handler.
AZURE_SQL_SERVER = "benchmarking-bff"
AZURE_SQL_DB = "benchmarking"

# These connection details do not need to be passed in but can optionally be changed.
# The f-string is evaluated once, when this cell runs: overriding AZURE_SQL_SERVER or
# AZURE_SQL_DB afterwards does not rebuild the connection string. The doubled braces
# render as literal braces around the driver name.
AZURE_SQL_CONNECTION = f"Driver={{ODBC Driver 18 for SQL Server}};Server=tcp:{AZURE_SQL_SERVER}.database.windows.net,1433;Database={AZURE_SQL_DB};Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30"
AZURE_SQL_SCHEMA = "dbo"

# `distribution` is passed to generate_base_dataframe by the save loop.
distribution = "uniform"  # Options: "uniform", "skewed", "null_injection"
# Notebook-level seed, read as a global by create_synthetic_schema; the per-dataset
# seeds used for data generation come from DATASETS_PARAM.
seed = 42  # For deterministic output

# Lakehouse / local output folders are not declared here: they are created
# dynamically per dataset in the save loop.
# Orchestrators can overwrite any of the values above by injecting them into the notebook
# (e.g. pass 'DATASETS_PARAM' JSON, AZURE_SQL_SERVER, AZURE_SQL_DB into the notebook).
# End of parameters cell.


In [None]:
# Column counts handed to create_synthetic_schema. Declared outside the
# parameters cell so that the first cell remains exact.
schema_config = dict(
    categorical_fields=3,
    numeric_fields=5,
    timestamp_fields=2,
)


# Generate Data — minimal parquet writer (per-dataset paths) + optional SQL copy

This notebook is intentionally minimal and focused. It loops over DATASETS_PARAM and, for each dataset, generates a base parquet file and an updates parquet file. Local copies are written under data/{name}/, controller copies under data/controller/{name}/, and best-effort lakehouse copies under per-dataset folders (e.g. /lakehouse/default/Files/{name}base/ and /lakehouse/default/Files/{name}updates/). Filenames are always base.parquet and updates.parquet.

In [None]:
import os
import numpy as np
import pandas as pd
import random
import pytz
from datetime import datetime, timedelta
from pathlib import Path

# Only reached when every import above succeeded; a missing package fails the cell before this line.
print('Imports OK — proceeding with generation')

## --- Schema Definition
Create a small synthetic schema generator (categorical, numeric, timestamps).

In [None]:
def create_synthetic_schema(config, seed=None):
    """Build a mapping of column name -> generator callable for synthetic data.

    Every generator takes a row count ``n`` and returns ``n`` values:

    * ``cat_<i>``: ``np.random.choice`` over 'A', 'B', 'C', 'D'.
    * ``num_<i>``: ``np.random.uniform(0, 1000)``.
    * ``ts_<i>``: ``pd.date_range`` starting at 2023-01-01 with millisecond steps.

    The generators draw from NumPy's global random state when they are
    *called*, so the values they produce depend on the state at call time.

    Args:
        config: Mapping with integer entries "categorical_fields",
            "numeric_fields" and "timestamp_fields" (number of columns of
            each kind).
        seed: Optional seed applied to NumPy's and ``random``'s global
            generators. When omitted, a module-level ``seed`` variable is used
            if one exists; if there is none, the generators are left untouched.

    Returns:
        dict: column name -> callable, ordered cat_*, num_*, ts_*.

    Raises:
        KeyError: If one of the three count entries is missing from ``config``.
    """
    # Fall back to the notebook-level `seed` so one-argument calls keep seeding
    # as they always did, without failing with NameError when that global is absent.
    if seed is None:
        seed = globals().get("seed")
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)

    start = pd.Timestamp("2023-01-01")  # loop-invariant, so created once

    schema = {}
    # Categorical fields
    for i in range(config["categorical_fields"]):
        schema[f"cat_{i+1}"] = lambda n, _choices=['A', 'B', 'C', 'D']: np.random.choice(_choices, size=n)
    # Numeric fields
    for i in range(config["numeric_fields"]):
        schema[f"num_{i+1}"] = lambda n: np.random.uniform(0, 1000, size=n)
    # Timestamp fields
    for i in range(config["timestamp_fields"]):
        # Bind the start value as a default argument (like `_choices` above) so
        # the generator does not depend on a variable captured by reference.
        schema[f"ts_{i+1}"] = lambda n, _start=start: pd.date_range(_start, periods=n, freq="ms")
    return schema

# Build the column generators once; the save loop passes this mapping to
# generate_base_dataframe for every dataset.
schema = create_synthetic_schema(schema_config)


## --- Synthetic Data Generation
Generate the base dataframe from the schema and the selected distribution.

In [None]:
def generate_base_dataframe(row_count, schema, distribution, seed):
    """Generate the base dataframe. All parameters are required (no defaults).

    Args:
        row_count: Number of rows to generate.
        schema: Mapping of column name -> callable taking a row count and
            returning that many values. Columns named ``num_*`` are treated as
            numeric and columns named ``ts_*`` as timestamps.
        distribution: One of "uniform", "skewed" or "null_injection".
            "uniform" uses each generator as-is; "skewed" replaces numeric
            columns with ``np.random.exponential(500)`` draws; "null_injection"
            uses the generator and then sets about 5% of each numeric column
            to NaN.
        seed: Seed applied to NumPy's global random state before generating.

    Returns:
        pandas.DataFrame: the schema columns plus ``id`` (1..row_count) and
        ``update_type`` (always "insert"). ``ts_*`` columns are timezone-aware
        UTC; no unit conversion is performed here.

    Raises:
        ValueError: If ``distribution`` is not one of the supported names.
    """
    # Print input parameters for visibility
    print(f"generate_base_dataframe called with row_count={row_count}, distribution={distribution}, seed={seed}")

    # Fail loudly on a misspelt distribution instead of silently producing uniform data.
    supported = ("uniform", "skewed", "null_injection")
    if distribution not in supported:
        raise ValueError(f"Unknown distribution {distribution!r}; expected one of {supported}")

    np.random.seed(seed)
    data = {}
    for col, gen in schema.items():
        # Prefix match (same convention as the "ts_" check below): a substring
        # test would also catch unrelated names such as "enum_1".
        is_numeric = col.startswith("num_")
        if is_numeric and distribution == "skewed":
            data[col] = np.random.exponential(500, size=row_count)
        elif is_numeric and distribution == "null_injection":
            # Float copy so NaN can be stored whatever the generator returned,
            # and so the generator's own array is never mutated.
            col_data = np.array(gen(row_count), dtype=float)
            null_mask = np.random.rand(row_count) < 0.05
            col_data[null_mask] = np.nan
            data[col] = col_data
        else:
            data[col] = gen(row_count)
    df = pd.DataFrame(data)
    df["id"] = np.arange(1, row_count + 1)
    # Make timestamp columns timezone-aware UTC: naive values are localized,
    # values that already carry a timezone are converted.
    for col in df.columns:
        if col.startswith("ts_"):
            stamps = pd.to_datetime(df[col])
            if stamps.dt.tz is None:
                df[col] = stamps.dt.tz_localize("UTC")
            else:
                df[col] = stamps.dt.tz_convert("UTC")
    # Add update_type column, always 'insert' for base
    df["update_type"] = "insert"
    return df


## --- Generate Updates (function)
Build an update batch (changed rows, newly inserted rows, and deletes) and shuffle it.

In [None]:
def generate_updates(df, change_pct, new_pct, delete_pct, seed):
    """Generate the updates dataframe. All parameters are required (no defaults).

    The batch contains three kinds of rows, told apart by ``update_type``:

    * "update": existing rows, with ``num_1`` (when present) increased by a
      random amount between 1 and 10.
    * "insert": copies of sampled existing rows with new ids ``n+1 ..``
      (assumes the ids in ``df`` run 1..n, as produced by
      generate_base_dataframe).
    * "delete": only ``id`` and ``update_type`` are set; every other column is
      missing after concatenation.

    Rows chosen for deletion are drawn from the rows *not* chosen for update,
    so an existing id appears at most once per batch and the shuffled order
    cannot make the outcome for a key ambiguous.

    Args:
        df: Base dataframe; must contain an ``id`` column.
        change_pct: Fraction of rows to update.
        new_pct: Fraction of rows (relative to ``len(df)``) to insert.
        delete_pct: Fraction of rows to delete.
        seed: Seed for NumPy's global state (also used by ``df.sample`` when no
            ``random_state`` is given) and for the final shuffle.

    Returns:
        pandas.DataFrame: shuffled concatenation of the update, insert and
        delete rows with a fresh 0..m-1 index.

    Raises:
        ValueError: If updates and deletes together need more rows than ``df``
            has, or if any single count exceeds the number of rows.
    """
    # Print input parameters for visibility
    print(f"generate_updates called with rows={len(df)}, change_pct={change_pct}, new_pct={new_pct}, delete_pct={delete_pct}, seed={seed}")
    np.random.seed(seed)
    random.seed(seed)
    n = len(df)
    changed_count = int(n * change_pct)
    new_count = int(n * new_pct)
    delete_count = int(n * delete_pct)
    if changed_count + delete_count > n:
        raise ValueError(
            f"change_pct + delete_pct select {changed_count + delete_count} rows "
            f"but only {n} are available; updated and deleted rows must be disjoint"
        )

    # Changed rows
    change_idx = np.random.choice(df.index, changed_count, replace=False)
    changed_df = df.loc[change_idx].copy()
    # Example modification of one numeric column if present
    if "num_1" in changed_df.columns:
        changed_df["num_1"] = changed_df["num_1"] + np.random.uniform(1, 10, changed_count)
    changed_df["update_type"] = "update"

    # New rows (df.sample draws from NumPy's global state seeded above)
    new_df = df.sample(new_count).copy()
    new_df["id"] = np.arange(n + 1, n + new_count + 1)
    new_df["update_type"] = "insert"

    # Deleted rows: only candidates that were not picked for an update
    untouched_idx = df.index[~df.index.isin(change_idx)]
    delete_idx = np.random.choice(untouched_idx, delete_count, replace=False)
    deleted_df = df.loc[delete_idx, ["id"]].copy()
    deleted_df["update_type"] = "delete"

    # Combine and shuffle updates
    updates = pd.concat([changed_df, new_df, deleted_df], ignore_index=True)
    updates = updates.sample(frac=1, random_state=seed).reset_index(drop=True)  # Shuffle rows
    # Timestamps are not reprocessed here; they are carried over from df unchanged
    return updates


## --- Save Loop (per-dataset paths and filenames) — strict, no fallbacks or extra validation
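This loop uses DATASETS_PARAM directly and lets errors surface if keys are missing or malformed. The only exception is the lakehouse copy, which is best-effort: if it fails, a message is printed and the loop continues.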

In [None]:
import shutil
from pathlib import Path

data_root = Path('data')


def _copy_parquet(src_file, dest_dir):
    """Copy src_file into dest_dir (created if missing), keeping its file name."""
    dest_dir = Path(dest_dir)
    dest_dir.mkdir(parents=True, exist_ok=True)
    # Streamed copy: the file is not loaded into memory as a whole.
    shutil.copyfile(src_file, dest_dir / Path(src_file).name)


# For every dataset: generate base + updates, write them locally, then copy the
# two files to the lakehouse (best-effort) and to the controller area.
for ds in DATASETS_PARAM:
    # Use the dataset fields directly; let errors surface if keys are missing or invalid
    name = ds['name']
    row_count_ds = int(ds['row_count'])
    change_fraction = float(ds['change_fraction'])
    new_fraction = float(ds['new_fraction'])
    delete_fraction = float(ds['delete_fraction'])
    seed_ds = int(ds['seed'])

    print(f"\n=== Generating dataset {name}: rows={row_count_ds}")

    # Per-dataset lakehouse paths (as requested)
    base_output_folder = f"/lakehouse/default/Files/{name}base/"
    updates_output_folder = f"/lakehouse/default/Files/{name}updates/"

    # Local area (always write a local copy under data/{name}/)
    local_dir = data_root / name
    local_dir.mkdir(parents=True, exist_ok=True)

    # Generate base and updates; timestamps are coerced to milliseconds on write
    base_df = generate_base_dataframe(row_count_ds, schema, distribution, seed_ds)
    base_file_local = local_dir / 'base.parquet'
    base_df.to_parquet(str(base_file_local), engine='pyarrow', index=False, coerce_timestamps='ms')
    print('  wrote local base:', base_file_local)

    updates_df = generate_updates(base_df, change_fraction, new_fraction, delete_fraction, seed_ds)
    updates_file_local = local_dir / 'updates.parquet'
    updates_df.to_parquet(str(updates_file_local), engine='pyarrow', index=False, coerce_timestamps='ms')
    print('  wrote local updates:', updates_file_local)

    # Best-effort: write lakehouse copies (may fail outside Fabric). The loop
    # continues on failure, but the actual error is reported so that a real
    # problem is not mistaken for "running outside Fabric".
    try:
        _copy_parquet(base_file_local, base_output_folder)
        _copy_parquet(updates_file_local, updates_output_folder)
        print('  wrote lakehouse copies ->', base_output_folder, updates_output_folder)
    except Exception as exc:
        print(
            '  could not write to lakehouse path (running outside Fabric or no permission); '
            f'skipped lakehouse writes [{type(exc).__name__}: {exc}]'
        )

    # Controller copies (local controller area under data/controller/{name}/)
    ctrl_dir = data_root / 'controller' / name
    _copy_parquet(base_file_local, ctrl_dir)
    _copy_parquet(updates_file_local, ctrl_dir)
    print('  wrote controller copies ->', ctrl_dir)

print('\nGeneration loop complete.')

## --- Metadata Logging
Log simple metadata about the files created.

In [None]:
def log_metadata(file_path, row_count_val, fmt, distribution_val, update_type):
    """Write a JSON sidecar file describing a generated data file.

    The sidecar is placed next to ``file_path`` with the final extension
    replaced by ``.meta.json`` (``base.parquet`` -> ``base.meta.json``).

    Args:
        file_path: Path of the data file (``str`` or ``pathlib.Path``).
        row_count_val: Row count to record.
        fmt: File format label to record (e.g. 'parquet').
        distribution_val: Distribution name to record.
        update_type: Label for the kind of file (e.g. 'base', 'batch_updates').

    Returns:
        None. The sidecar path is printed.
    """
    import json
    import os

    # Normalise to str: json cannot serialise Path objects.
    file_path = str(file_path)
    meta = {
        "file": file_path,
        "rows": row_count_val,
        "format": fmt,
        "distribution": distribution_val,
        "update_type": update_type,
        # Naive local time, as returned by pd.Timestamp.now() without a tz.
        "timestamp": pd.Timestamp.now().isoformat()
    }
    # Swap only the final extension. A plain str.replace('.parquet', ...) would
    # also rewrite matching directory names, and for a path without '.parquet'
    # it would leave the name unchanged so the JSON overwrote the data file.
    stem, _ext = os.path.splitext(file_path)
    meta_file = stem + '.meta.json'
    with open(meta_file, 'w', encoding='utf-8') as f:
        json.dump(meta, f, indent=2)
    print(f"Metadata logged: {meta_file}")

# Log metadata for each dataset's local files.
# Second pass over DATASETS_PARAM; relies on `data_root` defined in the save-loop
# cell and only logs files that actually exist on disk.
for ds in DATASETS_PARAM:
    name = ds['name']
    row_count_ds = int(ds['row_count'])
    local_dir = data_root / name
    base_file_local = local_dir / 'base.parquet'
    updates_file_local = local_dir / 'updates.parquet'
    if base_file_local.exists():
        # The row count recorded for the base file is the requested row_count
        # from DATASETS_PARAM, not a count read back from the file.
        log_metadata(str(base_file_local), row_count_ds, 'parquet', distribution, 'base')
    if updates_file_local.exists():
        # The updates row count is obtained by reading the parquet file back
        # and taking its length.
        log_metadata(str(updates_file_local), len(pd.read_parquet(str(updates_file_local), engine='pyarrow')), 'parquet', distribution, 'batch_updates')
