In [None]:
%%configure
# Parameters cell: every value that an orchestrator may override is declared here.
# Keeping them in one place makes it easy to validate a run that is invoked
# programmatically with different values.

# -- Datasets array (interactive defaults differ from automation defaults).
# Each entry describes one dataset: a short name, its row count, the fractions of
# changed / new / deleted rows, a seed, and a human-readable description.
DATASETS_PARAM = [
    {
        "name": "1k",
        "row_count": 1000,
        "change_fraction": 0.01,
        "new_fraction": 0.005,
        "delete_fraction": 0.001,
        "seed": 42,
        "description": "Interactive small dataset (1k rows)"
    },
    {
        "name": "100k",
        "row_count": 100000,
        "change_fraction": 0.01,
        "new_fraction": 0.005,
        "delete_fraction": 0.001,
        "seed": 42,
        "description": "Interactive medium dataset (100k rows)"
    }
]

# Do the test cases (datasets and parameter sets) include Azure SQL as a data source?
PUSH_TO_AZURE_SQL = True

# Azure SQL / ODBC parameters (interactive defaults).
# These should be passed in from the automation handler.
AZURE_SQL_SERVER = "benchmarking-bff"
AZURE_SQL_DB = "benchmarking"

# These connection details do not need to be passed in, but can optionally be changed.
# NOTE: the f-string below is evaluated when this cell runs, so it reflects the values
# AZURE_SQL_SERVER / AZURE_SQL_DB hold at that moment; changing them afterwards does
# not update AZURE_SQL_CONNECTION.
AZURE_SQL_CONNECTION = f"Driver={{ODBC Driver 18 for SQL Server}};Server=tcp:{AZURE_SQL_SERVER}.database.windows.net,1433;Database={AZURE_SQL_DB};Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30"
AZURE_SQL_SCHEMA = "dbo"

distribution = "uniform"  # Options: "uniform", "skewed", "null_injection"
seed = 42  # For deterministic output

# Lakehouse / local output folders -- these will be dynamically created for each dataset.
# They are left commented out here, so no output folder is assigned by this cell.
# base_output_folder = "/lakehouse/default/Files/base{name}/"
# updates_output_folder = "/lakehouse/default/Files/updates{name}/"

# Orchestrators can overwrite any of the above by injecting values into the notebook
# (e.g. pass 'DATASETS_PARAM' JSON, AZURE_SQL_SERVER, AZURE_SQL_DB into the notebook).
# End of parameters cell.


# Generate Data — minimal parquet writer + optional SQL copy

This notebook is intentionally minimal: it iterates the datasets declared in the parameters cell, produces a base.parquet and updates.parquet for each dataset, writes controller copies under `data/controller/{dataset}/`, and optionally uploads those tables into Azure SQL if `PUSH_TO_AZURE_SQL=True` and a connection string is provided.

In [None]:
import os
import numpy as np
import pandas as pd
import random
import pytz
from datetime import datetime, timedelta
from pathlib import Path

print('Imports OK — proceeding with generation')

## --- 2. Schema Definition
Create a small synthetic schema generator (categorical, numeric, timestamps).

In [None]:
def create_synthetic_schema(config):
    """Build a mapping of column name -> generator callable for the synthetic table.

    Args:
        config: Mapping with the integer keys "categorical_fields",
            "numeric_fields" and "timestamp_fields" giving how many columns of
            each kind to declare.

    Returns:
        dict: Keys are "cat_<k>", "num_<k>" and "ts_<k>" (k starting at 1, in that
        order). Each value is a callable taking a row count ``n`` and returning
        ``n`` values for that column.

    Note:
        Re-seeds the global ``numpy.random`` and ``random`` generators from the
        module-level ``seed``. The generators draw from the global NumPy RNG at
        call time; they do not hold a private RNG.
    """
    np.random.seed(seed)
    random.seed(seed)

    def _categorical(n, _choices=['A', 'B', 'C', 'D']):
        # Uniform draw over the four category labels.
        return np.random.choice(_choices, size=n)

    def _numeric(n):
        # Uniform floats in [0, 1000).
        return np.random.uniform(0, 1000, size=n)

    origin = pd.Timestamp("2023-01-01")

    def _timestamps(n):
        # One timestamp per millisecond from the origin (millisecond precision
        # is used for Spark compatibility).
        return pd.date_range(origin, periods=n, freq="ms")

    column_kinds = (
        ("cat", "categorical_fields", _categorical),
        ("num", "numeric_fields", _numeric),
        ("ts", "timestamp_fields", _timestamps),
    )

    schema = {}
    for prefix, count_key, generator in column_kinds:
        for position in range(1, config[count_key] + 1):
            schema[f"{prefix}_{position}"] = generator
    return schema


## --- 3. Synthetic Data Generation
Generate the base dataframe from the schema and the selected distribution.

In [None]:
def generate_base_dataframe(row_count, schema, distribution, seed=42):
    """Materialise the base dataset from a schema of generator callables.

    Args:
        row_count: Number of rows to generate.
        schema: Mapping of column name -> callable(n) returning n values.
        distribution: "skewed" replaces columns whose name contains "num" with
            exponential draws (scale 500); "null_injection" sets roughly 5% of
            those columns' values to NaN; any other value uses the generators
            as-is.
        seed: Seed applied to the global NumPy RNG before generation.

    Returns:
        pandas.DataFrame with the schema columns, an ``id`` column running from
        1 to ``row_count``, ``ts_*`` columns localised to UTC, and an
        ``update_type`` column set to "insert".
    """
    np.random.seed(seed)

    def _materialise(name, generator):
        looks_numeric = "num" in name
        if looks_numeric and distribution == "skewed":
            # Skewed numeric columns bypass the schema generator entirely.
            return np.random.exponential(500, size=row_count)
        values = generator(row_count)
        if looks_numeric and distribution == "null_injection":
            # The mask is drawn after the values so the RNG order is stable.
            values[np.random.rand(row_count) < 0.05] = np.nan
        return values

    frame = pd.DataFrame(
        {name: _materialise(name, generator) for name, generator in schema.items()}
    )
    frame["id"] = np.arange(1, row_count + 1)

    # Timestamp columns become timezone-aware (UTC).
    for ts_column in [c for c in frame.columns if c.startswith("ts_")]:
        frame[ts_column] = pd.to_datetime(frame[ts_column]).dt.tz_localize(pytz.UTC)

    # Every row of the base dataset is an insert.
    frame["update_type"] = "insert"
    return frame

# Build the schema and the base dataframe for the current run.
# NOTE(review): `schema_config` and `row_count` are not assigned in any cell visible
# above this point -- presumably they are injected by the orchestrator or defined
# elsewhere; confirm before relying on Restart & Run All.
schema = create_synthetic_schema(schema_config)
base_df = generate_base_dataframe(row_count, schema, distribution, seed)
print(f"Base dataframe created: rows={len(base_df)}")

## --- 4. Save Base Dataset (Parquet/Delta)
Write the base dataset to parquet (or delta if you change format).

In [None]:
# Resolve the output folders. An orchestrator may inject `base_output_folder` /
# `updates_output_folder`; when a name is absent (or empty) fall back to a per-size
# folder under /tmp. Looking the names up through globals() avoids a NameError on
# a fresh kernel where they were never assigned.
os.makedirs('/tmp', exist_ok=True)
base_output_folder = globals().get("base_output_folder") or f"/tmp/base_{row_count}/"
updates_output_folder = globals().get("updates_output_folder") or f"/tmp/updates_{row_count}/"
os.makedirs(base_output_folder, exist_ok=True)
# os.path.join accepts the folder with or without a trailing slash.
# NOTE(review): `format` is not assigned in the cells visible here; if nothing
# injects it, the built-in `format` function's repr ends up in the file name -- confirm.
base_file = os.path.join(base_output_folder, f"base_{row_count}_{format}.parquet")
# All timestamps are timezone-aware UTC, update_type is 'insert'
base_df.to_parquet(base_file, coerce_timestamps='ms', engine='pyarrow')
print(f"Base dataset saved: {base_file}")

## --- 5. Generate Incremental Update Slices (Batch & CDC)
Create the change set (updated, inserted, and deleted rows) and shuffle it.

In [None]:
def generate_updates(df, change_pct=0.01, new_pct=0.005, delete_pct=0.001, seed=42):
    """Derive a shuffled change set (updates, inserts, deletes) from a base frame.

    Args:
        df: Base dataframe; must contain an ``id`` column.
        change_pct: Fraction of rows to emit as "update" rows.
        new_pct: Fraction of rows to emit as new "insert" rows.
        delete_pct: Fraction of rows to emit as "delete" rows.
        seed: Seed for the global NumPy / ``random`` RNGs and for the final shuffle.

    Returns:
        pandas.DataFrame holding all three kinds of rows in shuffled order with a
        fresh 0..n-1 index. Delete rows carry only ``id`` and ``update_type``;
        their remaining columns are filled with missing values by the concat.
    """
    np.random.seed(seed)
    random.seed(seed)
    total_rows = len(df)

    # Updates: pick rows without replacement and nudge num_1 when it exists.
    n_changed = int(total_rows * change_pct)
    labels_to_change = np.random.choice(df.index, n_changed, replace=False)
    modified = df.loc[labels_to_change].copy()
    if "num_1" in modified.columns:
        bump = np.random.uniform(1, 10, n_changed)
        modified["num_1"] = modified["num_1"] + bump
    modified["update_type"] = "update"

    # Inserts: clone sampled rows and give them ids following the existing rows.
    n_new = int(total_rows * new_pct)
    inserted = df.sample(n_new).assign(
        id=np.arange(total_rows + 1, total_rows + n_new + 1),
        update_type="insert",
    )

    # Deletes: only the id is needed to identify the row being removed.
    n_deleted = int(total_rows * delete_pct)
    labels_to_delete = np.random.choice(df.index, n_deleted, replace=False)
    removed = df.loc[labels_to_delete, ["id"]].assign(update_type="delete")

    # Timestamps are left untouched; only the row order is randomised here.
    combined = pd.concat([modified, inserted, removed], ignore_index=True)
    return combined.sample(frac=1, random_state=seed).reset_index(drop=True)

# Take the change / insert / delete fractions from `schema_config` when it is a
# dict; otherwise every lookup falls through to the default fraction.
_fraction_source = schema_config if isinstance(schema_config, dict) else {}
updates_df = generate_updates(
    base_df,
    change_pct=_fraction_source.get('change_fraction', 0.01),
    new_pct=_fraction_source.get('new_fraction', 0.005),
    delete_pct=_fraction_source.get('delete_fraction', 0.001),
    seed=seed,
)
print(f"Updates dataframe created: rows={len(updates_df)}")

## --- 6. Save Updates (All Change Types, Shuffled)
Write the updates parquet file.

In [None]:
# Write the shuffled change set as a single parquet file inside the updates folder.
os.makedirs(updates_output_folder, exist_ok=True)
# os.path.join keeps the file inside the folder even when the folder path was
# supplied without a trailing slash (plain concatenation would glue the names together).
updates_file = os.path.join(updates_output_folder, f"updates_{row_count}_{format}.parquet")
updates_df.to_parquet(updates_file, coerce_timestamps='ms', engine='pyarrow')
print(f"Updates saved: {updates_file}")

## --- 7. Metadata Logging
Log simple metadata about the files created.

In [None]:
def log_metadata(file_path, row_count, format, distribution, update_type):
    """Write a small JSON metadata file describing a generated data file.

    Args:
        file_path: Path of the data file (str or os.PathLike).
        row_count: Number of rows in the file.
        format: Format label to record (the name shadows the built-in; it is
            kept for compatibility with existing callers).
        distribution: Distribution label to record.
        update_type: Kind of data in the file, e.g. "base" or "batch_updates".

    The metadata lands next to the data file: ``x.parquet`` -> ``x.meta.json``.
    Only a trailing ``.parquet`` extension is swapped, so directory names that
    contain ".parquet" are left alone. A path without that extension gets
    ``.meta.json`` appended, so the data file itself is never overwritten.
    """
    import json

    # Accept pathlib.Path as well as str; the value is also stored in the JSON.
    file_path = os.fspath(file_path)
    meta = {
        "file": file_path,
        "rows": row_count,
        "format": format,
        "distribution": distribution,
        "update_type": update_type,
        "timestamp": pd.Timestamp.now().isoformat()
    }
    stem, extension = os.path.splitext(file_path)
    meta_file = (stem if extension == ".parquet" else file_path) + ".meta.json"
    with open(meta_file, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
    print(f"Metadata logged: {meta_file}")

# Record metadata for both files written above: the base dataset (with the
# configured row count) and the updates file (with its actual row count).
log_metadata(base_file, row_count, format, distribution, "base")
log_metadata(updates_file, len(updates_df), format, distribution, "batch_updates")