In [7]:
# This the shape generator file for Excel spreadsheets
# Test Runs: Create 20 spreadsheets in long format (4 columns (Date, Channel, Metric, Value), 8 rows)
# Noise transformations: random values for int, set value of strings for now (Daily granularity for date, 2 choices for Channel and Metric)


In [8]:
import pandas as pd
import numpy as np
from itertools import product
import random
import os

long_folder = 'long_first'
multi_index_folder = 'multi_index'
multi_table_folder = 'multi_table'
relational_folder = 'relational'
wide_folder = 'wide'


Long format function

In [9]:
def generate_long_df(
    num_rows=16,
    add_empty_cells=True,
    empty_frac=0.1,
    change_types=True,
    wrong_type_cols=['Value']
):
    # Generate date range
    base_date = pd.to_datetime('2025-07-10') + pd.Timedelta(days=np.random.randint(0, 365))
    dates = pd.date_range(base_date, periods=num_rows//4, freq='7D')

    # Generate combinations
    channels = ['TV', 'Radio']
    metrics = ['Spend', 'GRPs']
    all_combinations = list(product(dates, channels, metrics))
    df = pd.DataFrame(all_combinations, columns=['Date', 'Channel', 'Metric'])

    # Assign values
    def random_value(metric):
        if metric == 'Spend':
            return np.random.randint(50, 201)
        return np.random.randint(1, 11)

    df['Value'] = df['Metric'].apply(random_value)

    # Pad or trim to desired num_rows
    if len(df) < num_rows:
        df = pd.concat([df] * (num_rows // len(df) + 1), ignore_index=True)
    df = df.sample(n=num_rows, random_state=42).reset_index(drop=True)

    # Introduce empty cells
    if add_empty_cells:
        total_cells = df.size
        num_empty = int(total_cells * empty_frac)
        for _ in range(num_empty):
            row = np.random.randint(0, df.shape[0])
            col = np.random.choice(df.columns)
            df.at[row, col] = np.nan

    # Introduce wrong data types
    if change_types:
        for col in wrong_type_cols:
            for row in np.random.choice(df.index, size=max(1, len(df)//8), replace=False):
                if col == 'Value':
                    df.at[row, col] = random.choice(['one hundred', 'ten', 'NaN'])
                elif col == 'Date':
                    df.at[row, col] = random.choice(['not a date', 'yesterday', 'soon'])

    return df

Long Output

In [10]:
os.makedirs(long_folder, exist_ok=True)

NUM_SETS = 20

for i in range(NUM_SETS):
    params = {
        'num_rows': random.randint(12, 50),
        'add_empty_cells': random.choice([True, False]),
        'empty_frac': round(random.uniform(0.05, 0.3), 2),
        'change_types': random.choice([True, False]),
        'wrong_type_cols': random.sample(['Value', 'Date'], 
                                         k=random.randint(0, 2))
    }

    df = generate_long_df(**params)
    filepath = os.path.join(long_folder, f'synthetic_dataset_{i+1}.xlsx')
    df.to_excel(filepath, index=False)

    print(f"Saved: {filepath}")

  df.at[row, col] = random.choice(['one hundred', 'ten', 'NaN'])
  df.at[row, col] = random.choice(['not a date', 'yesterday', 'soon'])


Saved: long_first\synthetic_dataset_1.xlsx
Saved: long_first\synthetic_dataset_2.xlsx
Saved: long_first\synthetic_dataset_3.xlsx
Saved: long_first\synthetic_dataset_4.xlsx
Saved: long_first\synthetic_dataset_5.xlsx
Saved: long_first\synthetic_dataset_6.xlsx
Saved: long_first\synthetic_dataset_7.xlsx


  df.at[row, col] = random.choice(['not a date', 'yesterday', 'soon'])
  df.at[row, col] = random.choice(['not a date', 'yesterday', 'soon'])


Saved: long_first\synthetic_dataset_8.xlsx
Saved: long_first\synthetic_dataset_9.xlsx
Saved: long_first\synthetic_dataset_10.xlsx


PermissionError: [Errno 13] Permission denied: 'long_first\\synthetic_dataset_11.xlsx'

Wide Format function

In [11]:
def generate_wide_df(
        num_columns=16,
        add_empty_cells=True,
        empty_frac=0.05,
        change_types=True,
        wrong_type_rows=['TV_Spend']
):
    num_dates = max(1, num_rows // 4)

    base_date = pd.to_datetime('2025-07-10') + pd.Timedelta(days=np.random.randint(0,365))
    dates = pd.date_range(base_date, periods=num_dates, freq='7D')

    channels = ['TV', 'Spend']
    metrics = ['Spend', 'GRPs']

    data = []
    for date in dates:
        for channel in channels:
            for metric in metrics:
                value = np.random.randint(50, 201) if metric == 'Spend' else np.random.randint(1, 11)
                data.append([date, channel, metric, value])

    df_long = pd.DataFrame(data, columns=['Date', 'Channel', 'Metric', 'Value'])

    df_wide = df_long.pivot(index='Date', columns=['Channel','Metric'], values='Value')

    df_wide.columns -[f"{chan}_{metric}" for chan, metric in df_wide.columns]
    df_wide.reset_index(inplace=True)


    # Add empty cells
    if add_empty_cells:
        total_cells = df_wide.size
        num_empty = int(total_cells * empty_frac)
        for _ in range(num_empty):
            row = np.random.randint(0, df_wide.shape[0])
            col = np.random.choice(df_wide.columns)
            df_wide.at[row, col] = np.nan

    # Add wrong data types
    for col in wrong_type_cols:
        if col in df_wide.columns:
            for row in np.random.choice(df_wide.index, size=max(1, len(df_wide)//8), replace=False):
                if 'Spend' in col:
                    df_wide.at[row, col] = random.choice(['one hundred', 'N/A'])
                elif 'GRPs' in col:
                    df_wide.at[row, col] = random.choice(['low', 'unknown'])
                elif col == 'Date':
                    df_wide.at[row, col] = random.choice(['next week', 'soon'])

    return df_wide

Wide Output

In [12]:
os.makedirs(wide_folder, exist_ok=True)

NUM_SETS = 20

for i in range(NUM_SETS):
    params = {
        'num_rows': random.randint(12, 50),
        'add_empty_cells': random.choice([True, False]),
        'empty_frac': round(random.uniform(0.05, 0.3), 2),
        'change_types': random.choice([True, False]),
        'wrong_type_cols': random.sample(['Value', 'Date'], 
                                         k=random.randint(0, 2))
    }

    df = generate_long_df(**params)
    filepath = os.path.join(wide_folder, f'synthetic_dataset_{i+1}.xlsx')
    df.to_excel(filepath, index=False)

    print(f"Saved: {filepath}")

Saved: wide\synthetic_dataset_1.xlsx
Saved: wide\synthetic_dataset_2.xlsx
Saved: wide\synthetic_dataset_3.xlsx
Saved: wide\synthetic_dataset_4.xlsx
Saved: wide\synthetic_dataset_5.xlsx
Saved: wide\synthetic_dataset_6.xlsx
Saved: wide\synthetic_dataset_7.xlsx
Saved: wide\synthetic_dataset_8.xlsx


  df.at[row, col] = random.choice(['not a date', 'yesterday', 'soon'])
  df.at[row, col] = random.choice(['one hundred', 'ten', 'NaN'])
  df.at[row, col] = random.choice(['not a date', 'yesterday', 'soon'])
  df.at[row, col] = random.choice(['one hundred', 'ten', 'NaN'])


Saved: wide\synthetic_dataset_9.xlsx
Saved: wide\synthetic_dataset_10.xlsx
Saved: wide\synthetic_dataset_11.xlsx
Saved: wide\synthetic_dataset_12.xlsx
Saved: wide\synthetic_dataset_13.xlsx
Saved: wide\synthetic_dataset_14.xlsx
Saved: wide\synthetic_dataset_15.xlsx
Saved: wide\synthetic_dataset_16.xlsx


  df.at[row, col] = random.choice(['not a date', 'yesterday', 'soon'])
  df.at[row, col] = random.choice(['one hundred', 'ten', 'NaN'])
  df.at[row, col] = random.choice(['not a date', 'yesterday', 'soon'])
  df.at[row, col] = random.choice(['one hundred', 'ten', 'NaN'])
  df.at[row, col] = random.choice(['not a date', 'yesterday', 'soon'])


Saved: wide\synthetic_dataset_17.xlsx
Saved: wide\synthetic_dataset_18.xlsx
Saved: wide\synthetic_dataset_19.xlsx
Saved: wide\synthetic_dataset_20.xlsx


  df.at[row, col] = random.choice(['not a date', 'yesterday', 'soon'])
  df.at[row, col] = random.choice(['one hundred', 'ten', 'NaN'])
