# 📓 1. Generate Data
## Synthetic Data Generator
### Ensure `DataSourceLakehouse` is connected as a data source before running.


In [None]:
# Synthetic Data Generation for Fabric Benchmarking
# Produces a base dataset plus an incremental update slice for capacity and
# performance testing.

# ---
# 1. Parameter Setup
#
# Everything below is a plain top-level name so that notebook widgets or
# workflow inputs can override it before the later cells run.

# Size of the base dataset. Options: 10000, 1000000
row_count = 10_000

# Storage format label. Options: "parquet", "delta"
# NOTE(review): in the cells visible here this value is only recorded in the
# metadata sidecar; the writers always emit Parquet - confirm that is intended.
format = "parquet"

# How many columns of each kind the synthetic table gets.
schema_config = dict(
    categorical_fields=3,
    numeric_fields=5,
    timestamp_fields=2,
)

# Shape of the numeric columns. Options: "uniform", "skewed", "null_injection"
distribution = "uniform"

# Seed for deterministic output.
seed = 42

# Lakehouse output folders (Fabric)
# NOTE(review): Fabric notebooks typically mount the attached default lakehouse
# under /lakehouse/default/ - confirm that /lakehouse/<name>/ resolves for both
# the Spark writers and the local open() used for the metadata files.
lakehouse_name = "DataSourceLakehouse"
base_output_folder = f"/lakehouse/{lakehouse_name}/Files/base/"
updates_output_folder = f"/lakehouse/{lakehouse_name}/Files/updates/"

In [None]:
import os
import numpy as np
import pandas as pd
import random
from datetime import datetime

In [None]:
def create_synthetic_schema(config, seed=42):
    """Build the column generators for the synthetic table.

    The result maps a column name to a callable that takes a row count ``n``
    and returns ``n`` values:

    * ``cat_<i>``: draws from ``'A'``, ``'B'``, ``'C'``, ``'D'`` with
      ``np.random.choice``.
    * ``num_<i>``: draws from ``np.random.uniform(0, 1000)``.
    * ``ts_<i>``: a ``pd.date_range`` starting at 2023-01-01 with one step per
      millisecond.

    Args:
        config: Mapping with integer counts under the keys
            ``"categorical_fields"``, ``"numeric_fields"`` and
            ``"timestamp_fields"``.
        seed: Seed applied to NumPy's global generator and to ``random``.
            Taken as a parameter (default 42, matching the other generator
            functions in this notebook) instead of being read from a
            notebook-level global, so the function also works on its own.

    Returns:
        dict: ``{column_name: generator}`` in the order categorical, numeric,
        timestamp.

    Raises:
        KeyError: If one of the three count keys is missing from ``config``.
    """
    # The generators draw from NumPy's *global* state when they are called, so
    # this seed only holds until something reseeds that state.
    np.random.seed(seed)
    random.seed(seed)

    def categorical_values(n):
        return np.random.choice(['A', 'B', 'C', 'D'], size=n)

    def numeric_values(n):
        return np.random.uniform(0, 1000, size=n)

    start = pd.Timestamp("2023-01-01")

    def timestamp_values(n):
        # Millisecond steps keep the values free of sub-millisecond precision.
        return pd.date_range(start, periods=n, freq="ms")

    schema = {}
    for i in range(config["categorical_fields"]):
        schema[f"cat_{i+1}"] = categorical_values
    for i in range(config["numeric_fields"]):
        schema[f"num_{i+1}"] = numeric_values
    for i in range(config["timestamp_fields"]):
        schema[f"ts_{i+1}"] = timestamp_values
    return schema

In [None]:
def generate_base_dataframe(row_count, schema, distribution, seed=42,
                            null_fraction=0.05, skew_scale=500):
    """Materialise the base dataset as a pandas DataFrame.

    Columns are produced in ``schema`` order, followed by an ``id`` column
    numbered ``1..row_count``. Columns whose name contains ``"num"`` are
    shaped by ``distribution``:

    * ``"uniform"``: the schema generator is used as is.
    * ``"skewed"``: values come from ``np.random.exponential(skew_scale)``
      instead of the schema generator.
    * ``"null_injection"``: the schema generator is used, then roughly
      ``null_fraction`` of the values are replaced with NaN.

    All other columns always use their schema generator.

    Args:
        row_count: Number of rows to generate.
        schema: Mapping of column name to a callable ``gen(n)`` returning
            ``n`` values.
        distribution: One of ``"uniform"``, ``"skewed"``, ``"null_injection"``.
            Any other value emits a warning and behaves like ``"uniform"``.
        seed: Seed for NumPy's global generator; applied before any draw.
        null_fraction: Per-value probability of becoming NaN under
            ``"null_injection"``.
        skew_scale: Scale of the exponential distribution under ``"skewed"``.

    Returns:
        pandas.DataFrame: The generated rows, with every ``ts_*`` column cast
        to ``datetime64[ms]``.
    """
    import warnings

    known_distributions = ("uniform", "skewed", "null_injection")
    if distribution not in known_distributions:
        # A misspelt option used to produce uniform data without any signal.
        # Keep that fallback, but make it visible.
        warnings.warn(
            f"Unknown distribution {distribution!r}; expected one of "
            f"{known_distributions}. Using the schema generators unchanged.",
            stacklevel=2,
        )

    np.random.seed(seed)
    data = {}
    for col, gen in schema.items():
        is_numeric = "num" in col
        if is_numeric and distribution == "skewed":
            data[col] = np.random.exponential(skew_scale, size=row_count)
        elif is_numeric and distribution == "null_injection":
            # Work on a float copy: NaN cannot be stored in an integer array,
            # and the generator's own output is left untouched.
            values = np.array(gen(row_count), dtype=float)
            null_mask = np.random.rand(row_count) < null_fraction
            values[null_mask] = np.nan
            data[col] = values
        else:
            data[col] = gen(row_count)

    df = pd.DataFrame(data)
    df["id"] = np.arange(1, row_count + 1)
    # Ensure timestamp columns are pandas datetime64[ms] (no nanosecond precision)
    for col in df.columns:
        if col.startswith("ts_"):
            df[col] = pd.to_datetime(df[col]).astype('datetime64[ms]')
    return df

# Build the per-column generators first, then draw the base dataset from them
# using the notebook parameters.
schema = create_synthetic_schema(schema_config)
base_df = generate_base_dataframe(
    row_count=row_count,
    schema=schema,
    distribution=distribution,
    seed=seed,
)

In [None]:
from pyspark.sql import SparkSession

# Destination of the base dataset inside the base output folder.
base_file = f"{base_output_folder}base_{row_count}_parquet.parquet"

# Reuse the active Spark session when there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Hand the pandas frame to Spark so that Spark's Parquet writer produces the files.
# NOTE(review): how timestamps are encoded in the Parquet output depends on the
# session's Parquet timestamp setting - confirm it if readers need a specific type.
spark_base_df = spark.createDataFrame(base_df)
(
    spark_base_df.write
    .mode("overwrite")
    .parquet(base_file)
)
print(f"Base dataset saved: {base_file}")

In [None]:
def generate_updates(df, change_pct=0.01, new_pct=0.005, delete_pct=0.001, seed=42):
    """Derive a shuffled batch of updates, inserts and deletes from ``df``.

    * **update** rows: ``int(len(df) * change_pct)`` rows sampled without
      replacement; ``num_1`` (when present) is increased by a uniform draw
      from [1, 10).
    * **insert** rows: ``int(len(df) * new_pct)`` sampled rows that receive
      fresh ids continuing after the largest existing ``id``.
    * **delete** rows: up to ``int(len(df) * delete_pct)`` rows carrying only
      ``id``; every other column is missing after concatenation.

    Delete targets are drawn only from rows that were *not* selected for an
    update, so one id never appears as both "update" and "delete" in the same
    batch. If fewer such rows remain than requested, the delete count is
    capped at what is available.

    Args:
        df: Source frame; must contain an ``id`` column. It is not modified.
        change_pct: Fraction of rows to emit as updates.
        new_pct: Fraction of rows to emit as inserts.
        delete_pct: Fraction of rows to emit as deletes.
        seed: Seed for NumPy's global generator, ``random`` and the final
            shuffle.

    Returns:
        pandas.DataFrame: All three groups concatenated, tagged through an
        ``update_type`` column ("update" / "insert" / "delete"), shuffled and
        re-indexed from 0.
    """
    np.random.seed(seed)
    random.seed(seed)
    n = len(df)

    # Changed rows
    changed_count = int(n * change_pct)
    change_idx = np.random.choice(df.index, changed_count, replace=False)
    changed_df = df.loc[change_idx].copy()
    if "num_1" in changed_df.columns:
        # Schemas without a num_1 column still get update rows, just unperturbed.
        changed_df["num_1"] += np.random.uniform(1, 10, changed_count)
    changed_df["update_type"] = "update"

    # New rows: continue after the current maximum id so the new ids cannot
    # collide with existing ones even when ids are not exactly 1..n.
    new_count = int(n * new_pct)
    new_df = df.sample(new_count).copy()
    first_new_id = int(df["id"].max()) + 1 if n else 1
    new_df["id"] = np.arange(first_new_id, first_new_id + new_count)
    new_df["update_type"] = "insert"

    # Deleted rows: only rows untouched by the update step are eligible.
    delete_pool = df.index.difference(change_idx)
    delete_count = min(int(n * delete_pct), len(delete_pool))
    delete_idx = np.random.choice(delete_pool, delete_count, replace=False)
    deleted_df = df.loc[delete_idx, ["id"]].copy()
    deleted_df["update_type"] = "delete"

    # Combine and shuffle updates
    updates = pd.concat([changed_df, new_df, deleted_df], ignore_index=True)
    updates = updates.sample(frac=1, random_state=seed).reset_index(drop=True)
    return updates

# Pass the notebook-level seed explicitly: without it the update slice would
# always be generated with the function's default seed, regardless of the
# `seed` parameter configured at the top of the notebook.
updates_df = generate_updates(base_df, seed=seed)
# Ensure timestamp columns are pandas datetime64[ms]
for col in updates_df.columns:
    if col.startswith("ts_"):
        updates_df[col] = pd.to_datetime(updates_df[col]).astype('datetime64[ms]')

In [None]:
# Persist the update slice next to the base dataset, again through Spark's
# Parquet writer.
updates_file = f"{updates_output_folder}updates_{row_count}_parquet.parquet"

# NOTE(review): "delete" rows carry only id and update_type, so every other
# column (including the string-valued cat_* columns) holds a missing value for
# them - confirm the pandas-to-Spark conversion maps those to nulls in this runtime.
spark_updates_df = spark.createDataFrame(updates_df)
(
    spark_updates_df.write
    .mode("overwrite")
    .parquet(updates_file)
)
print(f"Updates saved: {updates_file}")

In [None]:
# --- Metadata Logging ---
def log_metadata(file_path, row_count, format, distribution, update_type):
    """Write a JSON sidecar file describing a generated dataset.

    The sidecar sits next to ``file_path``: a trailing ``.parquet`` is swapped
    for ``.meta.json``; a path without that suffix gets ``.meta.json``
    appended. Only the suffix is touched, so directory names that happen to
    contain ``.parquet`` are preserved and the sidecar can never be written
    over the data path itself.

    Args:
        file_path: Path of the dataset the metadata describes.
        row_count: Number of rows in that dataset.
        format: Storage format label to record.
        distribution: Distribution label to record.
        update_type: Kind of dataset to record, e.g. "base" or "batch_updates".

    Returns:
        None. The sidecar path is printed.

    Raises:
        OSError: If the parent folder cannot be created or the file cannot be
            written.
    """
    import json

    data_suffix = ".parquet"
    if file_path.endswith(data_suffix):
        stem = file_path[: -len(data_suffix)]
    else:
        stem = file_path
    meta_file = stem + ".meta.json"

    meta = {
        "file": file_path,
        "rows": row_count,
        "format": format,
        "distribution": distribution,
        "update_type": update_type,
        "timestamp": datetime.now().isoformat()
    }

    # The dataset itself may have been written by another engine, so the
    # folder is not guaranteed to exist for a plain local open().
    parent = os.path.dirname(meta_file)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(meta_file, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
    print(f"Metadata logged: {meta_file}")

# One sidecar per artefact written above: the base dataset, then the update slice.
log_metadata(
    file_path=base_file,
    row_count=row_count,
    format=format,
    distribution=distribution,
    update_type="base",
)
log_metadata(
    file_path=updates_file,
    row_count=len(updates_df),
    format=format,
    distribution=distribution,
    update_type="batch_updates",
)

In [None]:
# --- Diagnostics: List Lakehouse Files ---
# Best-effort listing of what was written. The notebook utilities only exist
# inside the Fabric / Synapse runtime, so a missing module is not an error.
try:
    # The utilities module is spelled `mssparkutils`; the previous
    # `import msparkutils` could never succeed, so this cell always skipped.
    from notebookutils import mssparkutils
except ImportError:
    print("mssparkutils not available. Skipping Lakehouse diagnostics.")
else:
    try:
        # NOTE(review): lakehouse.list() is the documented listing call -
        # confirm against the runtime version in use.
        print("Lakehouses:", mssparkutils.lakehouse.list())
        print("Base folder files:", mssparkutils.fs.ls(base_output_folder))
        print("Updates folder files:", mssparkutils.fs.ls(updates_output_folder))
    except Exception as exc:
        # Diagnostics must not fail the run after the data has been written;
        # report the problem instead of raising.
        print(f"Lakehouse diagnostics failed: {exc}")

Synthetic data generation complete. Proceed to ingestion workflows and benchmarking.