In [None]:
# Notebook parameters (Papermill / Fabric friendly)
# All values that may be overridden by an orchestrator or by interactive runs
# are declared here in a single top-level parameters cell. This makes it
# easy to validate what the notebook actually used when invoked programmatically.

# -- Datasets array (interactive defaults). Use Option A naming as requested.
# NOTE(review): "Option A" is not defined anywhere in this notebook; confirm the
# naming convention against the original request.
# Each entry describes one dataset to generate. Keys read by the generation cell:
#   name            - dataset identifier; used for the output sub-directory, the
#                     Fabric Files path templates and the SQL table names
#   row_count       - number of base rows to generate
#   change_fraction - fraction of row_count emitted as 'update' records
#   new_fraction    - fraction of row_count emitted as 'insert' records
#   delete_fraction - fraction of row_count emitted as 'delete' records
#   seed            - RNG seed (GENERATOR_DEFAULT_SEED is used when it is absent)
#   description     - free text; not read by the cells visible in this notebook
DATASETS_PARAM = [
    {
        "name": "1k",
        "row_count": 1000,
        "change_fraction": 0.01,
        "new_fraction": 0.005,
        "delete_fraction": 0.001,
        "seed": 42,
        "description": "Interactive small dataset (1k rows)"
    },
    {
        "name": "100k",
        "row_count": 100000,
        "change_fraction": 0.01,
        "new_fraction": 0.005,
        "delete_fraction": 0.001,
        "seed": 42,
        "description": "Interactive medium dataset (100k rows)"
    }
]

# Where generated files will be written locally (or in the notebook filesystem).
# Files land in OUTPUT_DIR/{dataset}/base.csv and OUTPUT_DIR/{dataset}/updates.csv.
OUTPUT_DIR = "data"

# Path to repository parameter_sets config (used when orchestrator doesn't pass datasets).
# The file is parsed as YAML and its top-level 'datasets' key is used.
CONFIG_PATH = "config/parameter_sets.yml"

# When orchestrators call the notebook they should pass the datasets array under this input name
# (mssparkutils.notebook.getContext().getInput('datasets') or papermill 'datasets')
NOTEBOOK_PARAM_INPUT_NAME = "datasets"

# Controller copy behavior: write controller copies under OUTPUT_DIR/controller/{dataset}/
GENERATE_CONTROLLER_COPY = True

# Fabric Files destinations (templates). The notebook will attempt to write to these
# paths when running inside Fabric with mssparkutils available. Use {dataset} as placeholder.
# Note that there is no separator between {dataset} and the suffix, so dataset "1k"
# resolves to /Files/1kbase/ and /Files/1kupdates/.
FILES_BASE_TEMPLATE = "/Files/{dataset}base/"
FILES_UPDATES_TEMPLATE = "/Files/{dataset}updates/"

# Azure SQL / ODBC parameters (interactive defaults). Orchestrators should pass secure
# connection strings via secrets and override SQL_PUSH=True and SQL_ODBC_CONNECTION.
# Never commit a real connection string here: it contains the database password.
SQL_PUSH = False
SQL_ODBC_CONNECTION = ""   # Example: "Driver={ODBC Driver 17 for SQL Server};Server=tcp:<server>.database.windows.net,1433;Database=<db>;UID=<user>;PWD=<password>;Encrypt=yes;"
SQL_SCHEMA = "dbo"          # schema that receives the base_{dataset} / updates_{dataset} tables
SQL_CHUNKSIZE = 10000       # passed as chunksize to DataFrame.to_sql
SQL_IF_EXISTS = "replace"   # passed as if_exists to DataFrame.to_sql; options: 'replace','append','fail'

# CSV / write options
# NOTE(review): write_csv in this notebook calls csv.writer without a delimiter
# argument, so this value does not appear to be applied anywhere visible - confirm.
CSV_DELIMITER = ","

# Generator defaults (can be overridden per-dataset in the datasets array)
GENERATOR_DEFAULT_SEED = 42

# Safety / behavior toggles
SKIP_FILES_COPY_ON_ERROR = True   # don't hard-fail if mssparkutils copy fails; log and continue

# Expose a variable that downstream cells will use (may be replaced at runtime).
# The dataset-selection cell assigns the resolved datasets list to this name.
SELECTED_DATASETS = None

# End of parameters cell. Orchestrators/papermill can overwrite any of the above by injecting
# values into the notebook (e.g. pass 'datasets' JSON into the notebook). At startup the
# notebook will (in order) try: incoming param -> config file -> DATASETS_PARAM defaults.


In [None]:
%%configure -f
{
  "conf": {
    "spark.driver.memory": "4g",
    "spark.executor.memory": "4g",
    "spark.executor.cores": "2",
    "spark.dynamicAllocation.enabled": "false"
  }
}

# Generate Data — embedded generator + Azure SQL push

This notebook generates base and updates datasets for each dataset entry and then (optionally) writes them into an Azure SQL database.
It will also write controller copies and attempt to copy into Fabric Files (/Files/{dataset}base/ and /Files/{dataset}updates/) if running inside Fabric with mssparkutils.

In [None]:
try:
    import yaml
except Exception:
    import sys
    !{sys.executable} -m pip install pyyaml
    import yaml

import os
import csv
import random
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, Any, Iterable, List, Tuple

print('yaml:', getattr(yaml, '__version__', 'unknown'))

In [None]:
# Determine datasets to use (priority: passed-in params -> config file -> embedded defaults)
datasets_cfg = None
incoming = None
try:
    import mssparkutils
except Exception:
    # Not running inside Fabric (or the helper is unavailable): no orchestrator input.
    incoming = None
else:
    # Some runtimes may expose inputs differently, so 'params' is tried as a fallback,
    # but only when the lookup under the primary name raises.
    for _input_name in (NOTEBOOK_PARAM_INPUT_NAME, 'params'):
        try:
            incoming = mssparkutils.notebook.getContext().getInput(_input_name)
        except Exception:
            incoming = None
            continue
        break

if incoming:
    print('Using datasets supplied to the notebook (incoming).')
    # incoming may be JSON text or already a Python object
    if isinstance(incoming, str):
        try:
            datasets_cfg = yaml.safe_load(incoming)
        except Exception:
            import json
            datasets_cfg = json.loads(incoming)
    else:
        datasets_cfg = incoming
elif os.path.exists(CONFIG_PATH):
    # No incoming params; try config file
    print(f'Reading datasets from {CONFIG_PATH}')
    # Use a context manager so the file handle is closed deterministically.
    with open(CONFIG_PATH, 'r', encoding='utf-8') as _cfg_file:
        # An empty YAML document parses to None; treat it as an empty mapping.
        cfg = yaml.safe_load(_cfg_file) or {}
    # 'datasets: null' would otherwise hand None to the loops below.
    datasets_cfg = cfg.get('datasets') or []
else:
    print('No config found; using embedded notebook defaults (DATASETS_PARAM).')
    datasets_cfg = DATASETS_PARAM

# A payload may wrap the array in a mapping (presumably the case for the 'params'
# fallback - TODO confirm); unwrap it so the rest of the notebook sees the list.
if isinstance(datasets_cfg, dict) and NOTEBOOK_PARAM_INPUT_NAME in datasets_cfg:
    datasets_cfg = datasets_cfg[NOTEBOOK_PARAM_INPUT_NAME]

# Fail here with a clear message instead of an opaque TypeError/AttributeError
# in a later cell.
if not isinstance(datasets_cfg, (list, tuple)):
    raise ValueError(
        f'Expected a list of dataset definitions, got {type(datasets_cfg).__name__}: {datasets_cfg!r}'
    )

print('\nDatasets to be generated:')
for d in datasets_cfg:
    print(' -', d.get('name'), 'rows=', d.get('row_count'))

# expose for downstream cells
SELECTED_DATASETS = datasets_cfg

## Embedded generator functions

Functions to generate base rows and an updates file (op: update/insert/delete).

In [None]:
def generate_base_rows(n: int, seed: int = None, start_ts: datetime | None = None) -> Iterable[Dict[str, Any]]:
    rnd = random.Random(seed) if seed is not None else random.Random()
    categories = ['A','B','C']
    start = start_ts or datetime.utcnow()
    for i in range(1, n+1):
        ts = (start + timedelta(seconds=i)).isoformat() + 'Z'
        yield {
            'id': i,
            'value': round(rnd.uniform(0, 100), 4),
            'timestamp': ts,
            'category': rnd.choice(categories)
        }

def generate_updates_for_base(base_count: int, change_fraction: float, new_fraction: float, delete_fraction: float, seed: int | None = None, start_ts: datetime | None = None) -> Iterable[Dict[str, Any]]:
    rnd = random.Random(seed+1 if seed is not None else None)
    n_change = max(1, int(round(base_count * change_fraction))) if change_fraction > 0 else 0
    n_new = max(0, int(round(base_count * new_fraction)))
    n_delete = max(0, int(round(base_count * delete_fraction)))

    ids = list(range(1, base_count+1))
    rnd.shuffle(ids)
    change_ids = ids[:n_change]
    delete_ids = ids[n_change:n_change+n_delete] if n_delete>0 else []

    categories = ['A','B','C']
    start = start_ts or datetime.utcnow()
    # updates: updates for change_ids
    idx = 0
    for idv in change_ids:
        idx += 1
        ts = (start + timedelta(seconds=base_count + idx)).isoformat() + 'Z'
        yield {
            'op': 'update',
            'id': idv,
            'value': round(rnd.uniform(0, 100), 4),
            'timestamp': ts,
            'category': rnd.choice(categories)
        }
    # inserts
    for i in range(1, n_new+1):
        new_id = base_count + i
        ts = (start + timedelta(seconds=base_count + n_change + i)).isoformat() + 'Z'
        yield {'op':'insert','id': new_id, 'value': round(rnd.uniform(0,100),4), 'timestamp': ts, 'category': rnd.choice(categories)}
    # deletes
    for idv in delete_ids:
        yield {'op':'delete', 'id': idv, 'value':'', 'timestamp':'', 'category':''}

def write_csv(path: str, rows: Iterable[Dict[str, Any]], header: List[str]) -> None:
    """Write ``rows`` to ``path`` as a UTF-8 CSV file with a header line.

    Args:
        path: Destination file. Missing parent directories are created. A bare
            filename (no directory component) is written to the current
            working directory.
        rows: Iterable of dict-like records; it is consumed once, so a
            generator may be passed.
        header: Column names, written as the first line and used to pick
            values from each record. Keys missing from a record become empty
            cells; keys not listed in ``header`` are ignored.
    """
    parent_dir = os.path.dirname(path)
    # os.makedirs('') raises FileNotFoundError, so only create a directory
    # when the path actually has one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='', encoding='utf-8') as fh:
        writer = csv.writer(fh)
        writer.writerow(header)
        writer.writerows([record.get(column, '') for column in header] for record in rows)


In [None]:
# Generate datasets and write local and controller copies; then attempt Fabric Files copy
import shutil
from pprint import pprint

# For every selected dataset: generate base + updates CSVs under OUTPUT_DIR/{name}/,
# optionally mirror them under OUTPUT_DIR/controller/{name}/, then try to publish
# them to Fabric Files when mssparkutils is importable.
data_root = Path(OUTPUT_DIR)
for ds in SELECTED_DATASETS:
    name = ds.get('name')
    if name is None or str(name) == '':
        # Without a name there is no output directory; say so clearly instead
        # of failing inside pathlib with an opaque TypeError.
        raise ValueError(f"Dataset entry is missing a 'name': {ds!r}")
    name = str(name)  # YAML may parse an all-digit name as an int
    row_count = int(ds.get('row_count', 10000))
    change_fraction = float(ds.get('change_fraction', 0.01))
    new_fraction = float(ds.get('new_fraction', 0.0))
    delete_fraction = float(ds.get('delete_fraction', 0.0))
    seed = int(ds.get('seed')) if ds.get('seed') is not None else GENERATOR_DEFAULT_SEED

    print(f"\n=== Generating dataset {name}: rows={row_count}")
    # base
    base_rows = list(generate_base_rows(row_count, seed=seed))
    base_path = data_root / name / 'base.csv'
    write_csv(str(base_path), base_rows, ['id','value','timestamp','category'])
    print(f"Wrote local base: {base_path}")
    # updates
    updates_rows = list(generate_updates_for_base(row_count, change_fraction, new_fraction, delete_fraction, seed=seed))
    updates_path = data_root / name / 'updates.csv'
    write_csv(str(updates_path), updates_rows, ['op','id','value','timestamp','category'])
    print(f"Wrote local updates: {updates_path}")
    # controller copies
    if GENERATE_CONTROLLER_COPY:
        ctrl_dir = data_root / 'controller' / name
        os.makedirs(ctrl_dir, exist_ok=True)
        shutil.copyfile(str(base_path), str(ctrl_dir / 'base.csv'))
        shutil.copyfile(str(updates_path), str(ctrl_dir / 'updates.csv'))
        print(f"Wrote controller copies -> {ctrl_dir}")

    # Attempt to write into Fabric Files if available
    try:
        import mssparkutils
    except Exception as e:
        # Expected when running outside Fabric: nothing to copy to.
        print('mssparkutils not available or copy failed (running outside Fabric?). Skipping Files copy. Error:', e)
    else:
        files_base_dir = FILES_BASE_TEMPLATE.format(dataset=name)
        files_updates_dir = FILES_UPDATES_TEMPLATE.format(dataset=name)
        try:
            # fs.put writes string content, so the CSVs are read as text rather
            # than bytes. newline='' keeps the line endings exactly as written.
            with open(str(base_path), 'r', encoding='utf-8', newline='') as fh:
                mssparkutils.fs.put(files_base_dir + 'base.csv', fh.read(), overwrite=True)
            with open(str(updates_path), 'r', encoding='utf-8', newline='') as fh:
                mssparkutils.fs.put(files_updates_dir + 'updates.csv', fh.read(), overwrite=True)
            print(f"Copied to Fabric Files: {files_base_dir} and {files_updates_dir}")
        except Exception as e:
            # Honour the documented toggle: only swallow copy failures when asked to.
            if not SKIP_FILES_COPY_ON_ERROR:
                raise
            print('Copy to Fabric Files failed; continuing because SKIP_FILES_COPY_ON_ERROR is True. Error:', e)

print('\nAll dataset generation done.')

## Push generated CSVs into Azure SQL

Provide an ODBC connection string when prompted. Example:
"Driver={ODBC Driver 17 for SQL Server};Server=tcp:<yourserver>.database.windows.net,1433;Database=<db>;UID=<user>;PWD=<password>;Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;"
The notebook converts this into a SQLAlchemy engine using the pyodbc driver. Tables written (the schema comes from `SQL_SCHEMA`, which defaults to `dbo`):
- dbo.base_{dataset_name}
- dbo.updates_{dataset_name}


In [None]:
push_sql = SQL_PUSH
if not push_sql:
    # If SQL_PUSH default is False, prompt interactively in notebook runs
    user = input('Push generated data to Azure SQL? (yes/no) > ').strip().lower()
    if user in ('y','yes'):
        push_sql = True
        conn_str = input('Provide ODBC connection string (Driver=...;Server=...;Database=...;UID=...;PWD=...) : ').strip()
    else:
        push_sql = False
        conn_str = SQL_ODBC_CONNECTION
else:
    conn_str = SQL_ODBC_CONNECTION

if push_sql:
    if not conn_str:
        print('No connection string provided; skipping SQL upload.')
    else:
        # Install sqlalchemy + pyodbc + pandas if missing
        import sys
        try:
            import sqlalchemy
            import pandas as pd
        except Exception:
            !{sys.executable} -m pip install sqlalchemy pyodbc pandas
            import sqlalchemy
            import pandas as pd
        from sqlalchemy import create_engine
        from urllib.parse import quote_plus

        odbc_conn_str = quote_plus(conn_str)
        engine_url = f'mssql+pyodbc:///?odbc_connect={odbc_conn_str}'
        print('Creating SQLAlchemy engine...')
        engine = create_engine(engine_url)

        for ds in SELECTED_DATASETS:
            name = ds.get('name')
            base_path = data_root / name / 'base.csv'
            updates_path = data_root / name / 'updates.csv'
            if base_path.exists():
                print(f"Uploading base -> {SQL_SCHEMA}.base_{name} (from {base_path})")
                df_base = pd.read_csv(str(base_path))
                # write in chunks
                df_base.to_sql(f'base_{name}', con=engine, schema=SQL_SCHEMA, if_exists=SQL_IF_EXISTS, index=False, chunksize=SQL_CHUNKSIZE)
                print('  Uploaded base')
            else:
                print(f'  Base file missing: {base_path}; skipping')
            if updates_path.exists():
                print(f"Uploading updates -> {SQL_SCHEMA}.updates_{name} (from {updates_path})")
                df_upd = pd.read_csv(str(updates_path))
                df_upd.to_sql(f'updates_{name}', con=engine, schema=SQL_SCHEMA, if_exists=SQL_IF_EXISTS, index=False, chunksize=SQL_CHUNKSIZE)
                print('  Uploaded updates')
            else:
                print(f'  Updates file missing: {updates_path}; skipping')
        print('All SQL uploads attempted.')
else:
    print('Skipping SQL upload.')

In [None]:
# Final summary: list each dataset's generated CSVs with their size in bytes.
# Files that do not exist are silently left out of the listing.
print('\nGeneration & optional SQL push complete. Preview generated files:')
for ds in SELECTED_DATASETS:
    name = ds.get('name')
    dataset_dir = data_root / name
    base_path = dataset_dir / 'base.csv'
    updates_path = dataset_dir / 'updates.csv'
    print(' -', name)
    for label, csv_file in (('    base:', base_path), ('    updates:', updates_path)):
        if not csv_file.exists():
            continue
        print(label, csv_file, 'size=', csv_file.stat().st_size)
