# Synthetic Data Generation (key metric: conversion rate)

## Three main data streams for A/B testing

`exposure_events.csv`:
- user_id
- experiment_id
- variant
- exposure_time
---
`user_events.csv`
- user_id
- event_time
- event_name
- event_value
---
`user_attributes.csv`
- user_id
- country
- device
---
For each experiment + metric:

- Take exposure events
- Define analysis window (e.g. 7 days after exposure)
- Join user_events within window
- Aggregate per user
- Compare distributions between variants

In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Hard-coded parameters

In [108]:
# Number of simulated experiments; generate_exposure_events() loops over range(N_EXPERIMENTS).
N_EXPERIMENTS = 2
# Size of the global user pool; user ids are the integers 0 .. GLOBAL_N_USERS - 1.
GLOBAL_N_USERS = 8_000

# Variant labels and event-name vocabulary.
# NOTE(review): the generator functions in this notebook hard-code these strings
# instead of reading the lists, and none of them emits a 'view' event — confirm
# whether the lists are consumed elsewhere.
variant = ['A', 'B']
event_name = ['session_start', 'purchase', 'view', 'error', 'scroll']

# Countries assigned to users; generate_user_info() samples them without weights.
country = ['USA', 'Germany', 'China', 'Japan', 'India', 
           'UK', 'France', 'Italy', 'Russia', 'Canada', 
           'Australia', 'Brazil', 'South Korea', 
           'Indonesia', 'Netherlands', 'Argentina']
# Device platforms and their sampling probabilities (same order; they sum to 1.0).
devices = ['ios', 'android', 'web']
devices_probabilities = [0.18, 0.45, 0.37]
# Purchase probability used for variant A in generate_user_events().
BASELINE = 0.1
# Probability that an exposure in variant A produces one 'error' event.
ERROR_RATE_A = 0.01

### Hard-coded simulation parameters

In [109]:
# number of participants PER experiment (so, for both variants in each experiment)
SAMPLE_SIZE = 5_000
# Absolute increase in purchase probability for the non-A variant (p = BASELINE + LIFT).
LIFT = 0.02
# Probability that an exposure in the non-A variant produces one 'error' event.
ERROR_RATE_B = 0.02
# Poisson mean of the number of 'session_start' events generated per exposure.
SESSION_RATE = 2.0

## Constants

Users are split 50/50 between variants A and B.

## Data Generation Functions

In [110]:
def generate_exposure_events(*, n_experiments=None, n_users=None,
                             sample_size=None, seed=None):
    """Simulate one exposure row per (experiment, participating user).

    For every experiment, ``sample_size`` distinct users are drawn from the
    pool ``0 .. n_users - 1``. The first ``sample_size // 2`` sampled users
    are assigned to variant "A" and the rest to "B". Each exposure gets a
    random timestamp within the 14 days starting 2025-01-01 (second
    resolution).

    Args:
        n_experiments: Number of experiments; defaults to ``N_EXPERIMENTS``.
        n_users: Size of the user pool; defaults to ``GLOBAL_N_USERS``.
        sample_size: Participants per experiment (both variants together);
            defaults to ``SAMPLE_SIZE``.
        seed: Seed passed to ``np.random.default_rng``. ``None`` (the
            default) keeps the original non-reproducible behaviour.

    Returns:
        DataFrame with columns ``user_id``, ``experiment_id``, ``variant``
        and ``exposure_time``; it has the same columns (and no rows) when
        ``n_experiments`` is 0.

    Raises:
        ValueError: If ``sample_size`` exceeds ``n_users`` (raised by the
            sampling without replacement).
    """
    # Fall back to the notebook-level constants only when not overridden.
    n_experiments = N_EXPERIMENTS if n_experiments is None else n_experiments
    n_users = GLOBAL_N_USERS if n_users is None else n_users
    sample_size = SAMPLE_SIZE if sample_size is None else sample_size

    columns = ["user_id", "experiment_id", "variant", "exposure_time"]
    rng = np.random.default_rng(seed)
    start_time = pd.Timestamp("2025-01-01")
    half = sample_size // 2
    # Position-based assignment: the sample is already in random order, so
    # "first half -> A, rest -> B" is a random split.
    variant_labels = np.where(np.arange(sample_size) < half, "A", "B")

    frames = []
    for exp_id in range(n_experiments):
        # Sampling without replacement returns the users in random order,
        # so no additional shuffle is needed.
        participants = rng.choice(n_users, size=sample_size, replace=False)
        day_offsets = rng.integers(0, 14, size=sample_size)
        second_offsets = rng.integers(0, 86400, size=sample_size)
        exposure_time = (
            start_time
            + pd.to_timedelta(day_offsets, unit="D")
            + pd.to_timedelta(second_offsets, unit="s")
        )
        frames.append(pd.DataFrame({
            "user_id": participants,
            "experiment_id": exp_id,
            "variant": variant_labels,
            "exposure_time": exposure_time,
        }))

    if not frames:
        # pd.concat refuses an empty list; still return the expected schema.
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)[columns]

In [111]:
# Show the rows around position 5000: with SAMPLE_SIZE = 5_000 rows per
# experiment, this is where experiment 0 ends and experiment 1 begins.
generate_exposure_events()[4995:5001]

Unnamed: 0,user_id,experiment_id,variant,exposure_time
4995,6656,0,B,2025-01-06 10:58:50
4996,1881,0,B,2025-01-08 17:51:47
4997,2838,0,B,2025-01-08 19:27:32
4998,3654,0,B,2025-01-09 20:46:19
4999,7765,0,B,2025-01-09 22:37:41
5000,6310,1,A,2025-01-07 01:00:54


In [112]:
def generate_user_events(exposures: pd.DataFrame, *, session_rate=None,
                         baseline=None, lift=None, error_rate_a=None,
                         error_rate_b=None, seed=None):
    """Simulate user events following each exposure.

    For every row of ``exposures`` the following events are generated, each
    timestamped at the exposure time plus a whole number of days:

    * ``session_start`` - Poisson(``session_rate``) events, 0-1 days later.
    * ``purchase`` - at most one, with probability ``baseline`` for variant
      "A" and ``baseline + lift`` for any other variant, 0-9 days later;
      ``event_value`` is drawn from a log-normal distribution.
    * ``error`` - at most one, with probability ``error_rate_a`` for variant
      "A" and ``error_rate_b`` otherwise, 0-6 days later.
    * ``scroll`` - Poisson(3) events, 0-1 days later.

    Args:
        exposures: Frame with at least ``user_id``, ``variant`` and
            ``exposure_time`` columns (one row per exposure).
        session_rate: Poisson mean for sessions; defaults to ``SESSION_RATE``.
        baseline: Variant-A purchase probability; defaults to ``BASELINE``.
        lift: Absolute purchase-probability lift; defaults to ``LIFT``.
        error_rate_a: Variant-A error probability; defaults to ``ERROR_RATE_A``.
        error_rate_b: Non-A error probability; defaults to ``ERROR_RATE_B``.
        seed: Seed passed to ``np.random.default_rng``. ``None`` (the
            default) keeps the original non-reproducible behaviour.

    Returns:
        DataFrame with columns ``user_id``, ``event_name``, ``event_time``
        and ``event_value`` (missing for everything but purchases). The
        same columns are returned, with no rows, for empty input.
    """
    columns = ["user_id", "event_name", "event_time", "event_value"]
    if exposures.empty:
        # Keep the schema stable so downstream joins/filters do not fail.
        return pd.DataFrame(columns=columns)

    # Fall back to the notebook-level constants only when not overridden.
    session_rate = SESSION_RATE if session_rate is None else session_rate
    baseline = BASELINE if baseline is None else baseline
    lift = LIFT if lift is None else lift
    error_rate_a = ERROR_RATE_A if error_rate_a is None else error_rate_a
    error_rate_b = ERROR_RATE_B if error_rate_b is None else error_rate_b

    rng = np.random.default_rng(seed)
    events = []

    def add_event(user_id, name, exposure_time, max_days, value=None):
        """Append one event, delayed by 0 .. max_days - 1 whole days."""
        delay = pd.to_timedelta(rng.integers(0, max_days), unit="D")
        events.append({
            "user_id": user_id,
            "event_name": name,
            "event_time": exposure_time + delay,
            "event_value": value,
        })

    # zip over the needed columns avoids the per-row Series that iterrows builds.
    rows = zip(exposures["user_id"], exposures["variant"],
               exposures["exposure_time"])
    for user_id, variant_label, exposure_time in rows:
        is_control = variant_label == "A"

        # event: session start
        for _ in range(rng.poisson(session_rate)):
            add_event(user_id, "session_start", exposure_time, 2)

        # event: conversion (KEY metric)
        purchase_p = baseline if is_control else baseline + lift
        if rng.random() < purchase_p:
            # Delay of up to 9 days; presumably intentional so that some
            # purchases fall outside a 7-day analysis window.
            add_event(
                user_id, "purchase", exposure_time, 10,
                value=rng.lognormal(mean=3.227, sigma=0.426),
            )

        # noise: errors (rate depends on variant) and scrolls
        error_p = error_rate_a if is_control else error_rate_b
        if rng.random() < error_p:
            add_event(user_id, "error", exposure_time, 7)

        for _ in range(rng.poisson(3)):
            add_event(user_id, "scroll", exposure_time, 2)

    return pd.DataFrame(events, columns=columns)

In [113]:
def generate_user_info(n_users=None, seed=None):
    """Build the user attribute table (one row per user in the pool).

    Devices are sampled from ``devices`` with weights
    ``devices_probabilities``; countries are sampled from ``country``
    without weights.

    Args:
        n_users: Number of users (ids ``0 .. n_users - 1``); defaults to
            ``GLOBAL_N_USERS``.
        seed: Seed passed to ``np.random.default_rng``. ``None`` (the
            default) keeps the original non-reproducible behaviour.

    Returns:
        DataFrame with columns ``user_id``, ``device`` and ``country``.
    """
    n_users = GLOBAL_N_USERS if n_users is None else n_users
    # One Generator for both draws, instead of mixing the legacy global
    # np.random state with a separate throwaway Generator.
    rng = np.random.default_rng(seed)
    user_devices = rng.choice(devices, p=devices_probabilities, size=n_users)
    user_countries = rng.choice(country, size=n_users)
    return pd.DataFrame({
        'user_id': np.arange(n_users),
        'device': user_devices,
        'country': user_countries,
    })

## Get DataFrames and save as `csv` files

In [114]:
# Materialise the exposure table used by the cells below and preview its first rows.
exposure_events = generate_exposure_events()
exposure_events.head()

Unnamed: 0,user_id,experiment_id,variant,exposure_time
0,2939,0,A,2025-01-14 09:36:38
1,5826,0,A,2025-01-14 10:33:40
2,3805,0,A,2025-01-02 07:33:36
3,3651,0,A,2025-01-13 05:48:00
4,2361,0,A,2025-01-10 06:16:07


In [115]:
# Simulate the event stream for every exposure row and preview the first 20 events.
user_events = generate_user_events(exposure_events)
user_events.head(20)

Unnamed: 0,user_id,event_name,event_time,event_value
0,2939,session_start,2025-01-14 09:36:38,
1,2939,session_start,2025-01-14 09:36:38,
2,2939,scroll,2025-01-15 09:36:38,
3,2939,scroll,2025-01-15 09:36:38,
4,3805,session_start,2025-01-02 07:33:36,
5,3805,session_start,2025-01-02 07:33:36,
6,3805,session_start,2025-01-03 07:33:36,
7,3805,scroll,2025-01-03 07:33:36,
8,3805,scroll,2025-01-02 07:33:36,
9,3805,scroll,2025-01-02 07:33:36,


In [116]:
# Build the per-user attribute table (device, country) and preview its first rows.
user_info = generate_user_info()
user_info.head()

Unnamed: 0,user_id,device,country
0,0,android,France
1,1,ios,UK
2,2,web,Japan
3,3,ios,Indonesia
4,4,web,Italy


## Sanity checks

In [117]:
# Distinct users per experiment; each count is expected to equal SAMPLE_SIZE
# because participants are sampled without replacement.
exposure_events.groupby("experiment_id")["user_id"].nunique()

experiment_id
0    5000
1    5000
Name: user_id, dtype: int64

In [120]:
# Number of users that take part in exactly one experiment vs. in both.
exposure_events.groupby("user_id")["experiment_id"].nunique().value_counts()

experiment_id
1    3780
2    3110
Name: count, dtype: int64