# 📓 Synthetic Data Generator


In [None]:
# Synthetic Data Generation for Fabric Benchmarking
# Purpose: Generate base datasets and incremental update slices for capacity and performance testing

# ---
# 1. Parameter Setup

# Run parameters (override through notebook widgets or workflow inputs)
seed = 42  # Fixed RNG seed so repeated runs produce the same data
row_count = 10000  # Options: 10000, 1000000
distribution = "uniform"  # Options: "uniform", "skewed", "null_injection"
# NOTE: this name shadows the built-in format(); it is kept because the
# cells below read it when building output file names and metadata.
format = "parquet"  # Options: "parquet", "delta"

# How many columns of each kind the synthetic schema should contain
schema_config = {"categorical_fields": 3, "numeric_fields": 5, "timestamp_fields": 2}

# Fabric Lakehouse destinations: base data, batch updates, CDC slices
base_output_folder = "/lakehouse/default/Files/base/"
updates_output_folder = "/lakehouse/default/Files/updates/"
cdc_output_folder = "/lakehouse/default/Files/cdc/"

import os
import numpy as np
import pandas as pd

# ---
# 2. Schema Definition

def create_synthetic_schema(config, seed=None):
    """Build a mapping of column name -> generator callable for synthetic data.

    Every generator takes a row count ``n`` and returns ``n`` values:
    ``cat_*`` columns draw from the labels A-D, ``num_*`` columns draw
    uniformly from [0, 1000), and ``ts_*`` columns are consecutive daily
    timestamps starting at 2023-01-01.

    Args:
        config: Mapping with the integer entries ``categorical_fields``,
            ``numeric_fields`` and ``timestamp_fields``, i.e. how many
            columns of each kind to create.
        seed: Seed applied to NumPy's global RNG and to ``random``. When
            omitted, the module-level ``seed`` variable is used if one is
            defined, otherwise 42.

    Returns:
        dict of column name -> callable ``f(n)``, inserted in the order
        categorical, numeric, timestamp.

    Raises:
        KeyError: If one of the three expected entries is missing from
            ``config``.
    """
    import random

    if seed is None:
        # Earlier versions read the notebook-level `seed` directly; keep that
        # as the fallback so existing single-argument calls behave the same.
        seed = globals().get("seed", 42)
    np.random.seed(seed)
    random.seed(seed)

    # Loop invariants shared by all generators of the same kind.
    categories = ['A', 'B', 'C', 'D']
    start = pd.Timestamp("2023-01-01")

    schema = {}
    # Categorical fields
    for i in range(config["categorical_fields"]):
        schema[f"cat_{i+1}"] = lambda n: np.random.choice(categories, size=n)
    # Numeric fields
    for i in range(config["numeric_fields"]):
        schema[f"num_{i+1}"] = lambda n: np.random.uniform(0, 1000, size=n)
    # Timestamp fields
    for i in range(config["timestamp_fields"]):
        # Bind `start` as a default so the closure cannot pick up a later
        # rebinding of the enclosing variable.
        schema[f"ts_{i+1}"] = lambda n, start=start: pd.date_range(start, periods=n)
    return schema

# ---
# 3. Synthetic Data Generation

def generate_base_dataframe(row_count, schema, distribution, seed=42):
    """Materialise the base dataset from a schema of column generators.

    Args:
        row_count: Number of rows to generate.
        schema: Mapping of column name -> callable ``f(n)`` returning ``n``
            values for that column.
        distribution: ``"skewed"`` replaces the values of columns whose name
            contains ``"num"`` with exponential draws (scale 500);
            ``"null_injection"`` sets roughly 5% of those columns' values
            to NaN; any other value uses the generators unchanged.
        seed: Seed applied to NumPy's global RNG before any column is built.

    Returns:
        DataFrame with one column per schema entry plus a trailing ``id``
        column numbered 1..row_count.
    """
    np.random.seed(seed)

    columns = {}
    for name, make_values in schema.items():
        is_numeric = "num" in name
        if is_numeric and distribution == "skewed":
            # Skewed numeric columns bypass the schema generator entirely.
            values = np.random.exponential(500, size=row_count)
        else:
            values = make_values(row_count)
            if is_numeric and distribution == "null_injection":
                # Draw the mask after the values so the RNG is consumed in
                # the same order on every run.
                values[np.random.rand(row_count) < 0.05] = np.nan
        columns[name] = values

    frame = pd.DataFrame(columns)
    frame["id"] = np.arange(1, row_count + 1)
    return frame

# Build the column generators, then materialise the base frame with the
# notebook-level row count, distribution and seed.
schema = create_synthetic_schema(schema_config)
base_df = generate_base_dataframe(row_count, schema, distribution, seed)

# ---
# 4. Save Base Dataset (Parquet/Delta)

# exist_ok=True keeps re-runs from failing when the folder already exists.
os.makedirs(base_output_folder, exist_ok=True)
# NOTE: `format` only labels the file name here; the file is always written
# with to_parquet, so format == "delta" still produces a parquet file.
base_file = f"{base_output_folder}base_{row_count}_{format}.parquet"
base_df.to_parquet(base_file)

# If using Delta format (with PySpark), add logic here:
# TODO: Save as Delta table if format == "delta"

print(f"Base dataset saved: {base_file}")

# ---


In [None]:
%%pyspark
# Minimal PySpark cell to validate kernel and Lakehouse binding
# NOTE(review): `spark` is not defined anywhere in this notebook; presumably
# it is the session object injected by the PySpark kernel — confirm.
# Builds a 100-row frame and renames its "id" column to "synthetic_id".
df = spark.range(100).withColumnRenamed("id", "synthetic_id")
# mode("overwrite") replaces any output left at this path by an earlier run.
df.write.mode("overwrite").parquet("/lakehouse/default/Files/test_spark_output/")
print("PySpark test write complete.")


In [None]:
# 5. Generate Incremental Update Slices (Batch + CDC)

def generate_updates(df, change_pct=0.01, new_pct=0.005, delete_pct=0.001, seed=42):
    """Derive a batch of update, insert and delete records from ``df``.

    Args:
        df: Source frame; must contain the columns ``id`` and ``num_1``.
        change_pct: Fraction of rows to emit as modified copies.
        new_pct: Fraction of rows to emit as new rows with fresh ids.
        delete_pct: Fraction of rows to emit as delete markers.
        seed: Seed applied to NumPy's global RNG before sampling.

    Returns:
        DataFrame with the update rows, then the insert rows, then the
        delete rows, each tagged in an ``update_type`` column
        (``"update"``, ``"insert"``, ``"delete"``). Delete rows carry only
        ``id`` and ``update_type``; their other columns are NaN after the
        concatenation.

    Note:
        Rows to update and rows to delete are drawn independently, so the
        same id can appear in both groups.
    """
    np.random.seed(seed)
    n = len(df)

    # Changed rows: sampled without replacement, with num_1 nudged upwards.
    changed_count = int(n * change_pct)
    change_idx = np.random.choice(df.index, changed_count, replace=False)
    changed_df = df.loc[change_idx].copy()
    changed_df["num_1"] += np.random.uniform(1, 10, changed_count)  # Example modification
    changed_df["update_type"] = "update"

    # New rows: copies of existing rows that receive fresh ids.
    new_count = int(n * new_pct)
    new_df = df.sample(new_count).copy()
    # Continue after the largest existing id rather than after len(df): the
    # two agree when ids are exactly 1..n, but only the former avoids
    # collisions when ids are offset or sparse. max() of an empty column is
    # NaN, hence the guard for n == 0.
    first_new_id = int(df["id"].max()) + 1 if n else 1
    new_df["id"] = np.arange(first_new_id, first_new_id + new_count)
    new_df["update_type"] = "insert"

    # Deleted rows: only the id is kept as a delete marker.
    delete_count = int(n * delete_pct)
    delete_idx = np.random.choice(df.index, delete_count, replace=False)
    deleted_df = df.loc[delete_idx][["id"]].copy()
    deleted_df["update_type"] = "delete"

    # Combine updates
    updates = pd.concat([changed_df, new_df, deleted_df], ignore_index=True)
    return updates

# Pass the notebook-level seed explicitly so that changing `seed` re-seeds the
# update slices as well as the base data; without it generate_updates always
# falls back to its own default of 42.
updates_df = generate_updates(base_df, seed=seed)

# ---
# 6. Save Updates and CDC Slices

# Create both output folders up front; exist_ok=True tolerates re-runs.
os.makedirs(updates_output_folder, exist_ok=True)
os.makedirs(cdc_output_folder, exist_ok=True)
# NOTE: `format` only labels the file names; both files are written with
# to_parquet below regardless of its value.
updates_file = f"{updates_output_folder}updates_{row_count}_{format}.parquet"
cdc_file = f"{cdc_output_folder}cdc_{row_count}_{format}.parquet"

# Batch slice: every record type (update, insert, delete) in one file.
updates_df.to_parquet(updates_file)
# For CDC, use only changes (updates + deletes)
cdc_df = updates_df[updates_df["update_type"] != "insert"]
cdc_df.to_parquet(cdc_file)

print(f"Updates saved: {updates_file}")
print(f"CDC slice saved: {cdc_file}")

# ---
# 7. Metadata Logging

def log_metadata(file_path, row_count, format, distribution, update_type):
    """Write a JSON sidecar describing a generated data file.

    The sidecar is placed next to ``file_path`` with the final extension
    replaced by ``.meta.json`` (``data.parquet`` -> ``data.meta.json``).

    Args:
        file_path: Path of the data file being described.
        row_count: Number of rows recorded under ``"rows"``.
        format: Format label recorded under ``"format"``.
        distribution: Distribution label recorded under ``"distribution"``.
        update_type: Slice label recorded under ``"update_type"``.

    Returns:
        None. The sidecar path is printed once the file is written.
    """
    import json
    import os

    meta = {
        "file": file_path,
        "rows": row_count,
        "format": format,
        "distribution": distribution,
        "update_type": update_type,
        "timestamp": pd.Timestamp.now().isoformat()
    }
    # Swap only the final extension. A plain str.replace(".parquet", ...)
    # also rewrites directory names containing ".parquet", and for a path
    # without that suffix it returns the data file's own path, which would
    # then be overwritten by the JSON below.
    stem, _ext = os.path.splitext(file_path)
    meta_file = f"{stem}.meta.json"
    with open(meta_file, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
    print(f"Metadata logged: {meta_file}")

# Write one .meta.json sidecar per output file. The base entry records the
# requested row_count; the update and CDC entries record the actual length
# of their frames.
log_metadata(base_file, row_count, format, distribution, "base")
log_metadata(updates_file, len(updates_df), format, distribution, "batch_updates")
log_metadata(cdc_file, len(cdc_df), format, distribution, "cdc_updates")

# ---
# 8. Summary and Next Steps

print("Synthetic data generation complete.")
print("Proceed to ingestion workflows and benchmarking.")

# ---
# 9. Diagnostics: List Lakehouse Files
# Best-effort listing of the lakehouses and of the three output folders.
# NOTE(review): the module imported here is `msparkutils`; this may be a typo
# for Fabric's `mssparkutils` / `notebookutils` — confirm. If the import
# fails, the except branch skips every listing without raising.
try:
    import msparkutils
    print("Lakehouses:", msparkutils.lakehouse.listLakehouses())
    print("Base folder files:", msparkutils.fs.ls(base_output_folder))
    print("Updates folder files:", msparkutils.fs.ls(updates_output_folder))
    print("CDC folder files:", msparkutils.fs.ls(cdc_output_folder))
except ImportError:
    # Only a missing module is tolerated; errors raised by the listing calls
    # themselves still propagate.
    print("msparkutils not available. Skipping Lakehouse diagnostics.")
