# Synthetic Data Generation (key metric: conversion rate)

## Three main data streams for A/B testing

`exposure_events.csv`:
- user_id
- experiment_id
- variant
- exposure_time
---
`user_events.csv` (exported below as `user_events_conversion.csv`):
- user_id
- timestamp (named `event_time` in the generated data)
- event_name
- event_value
---
`user_attributes.csv` (generated and exported below as `user_info.csv`):
- user_id
- country
- device
---
For each experiment + metric:

- Take exposure events
- Define analysis window (e.g. 7 days after exposure)
- Join user_events within window
- Aggregate per user
- Compare distributions between variants

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Hard-coded parameters

In [17]:
# Scale of the simulation: how many experiments run, and how many users
# exist in the global pool that every experiment samples from.
N_EXPERIMENTS = 2
GLOBAL_N_USERS = 8_000

# Label vocabularies. The generator functions in this notebook spell the
# variant labels and event names out inline rather than reading these lists.
variant = ["A", "B"]
event_name = ["session_start", "purchase", "view", "error", "scroll"]

# User attributes: countries are sampled uniformly, devices according to
# `devices_probabilities` (same order as `devices`).
country = [
    "USA",
    "Germany",
    "China",
    "Japan",
    "India",
    "UK",
    "France",
    "Italy",
    "Russia",
    "Canada",
    "Australia",
    "Brazil",
    "South Korea",
    "Indonesia",
    "Netherlands",
    "Argentina",
]
devices = ["ios", "android", "web"]
devices_probabilities = [0.18, 0.45, 0.37]

# Variant A (control) behaviour: purchase probability and error probability
# per exposure.
BASELINE = 0.1
ERROR_RATE_A = 0.01

### Hard-coded simulation parameters

In [18]:
# Number of participants PER experiment, i.e. the total across both variants
# (each variant receives half of them).
SAMPLE_SIZE = 5_000
# Absolute increase in purchase probability for variant B over BASELINE.
LIFT = 0.02
# Probability that a variant-B exposure produces an error event.
ERROR_RATE_B = 0.02
# Poisson mean of the number of session_start events per exposure.
SESSION_RATE = 2.0

## constants

Users are allocated 50/50 between variants A and B.

## Data Generation Functions

In [19]:
def generate_exposure_events(*, seed=None, n_experiments=None, n_users=None,
                             sample_size=None):
    """Simulate the exposure log: which user saw which variant, and when.

    For every experiment, ``sample_size`` distinct users are drawn from the
    global pool ``0 .. n_users - 1``. The first half of the (randomly
    ordered) draw is assigned variant "A", the rest variant "B". Each
    experiment samples independently, so a user may appear in more than one
    experiment. Exposure times fall within 14 days of 2025-01-01, at
    one-second resolution.

    Args:
        seed: Seed (or anything accepted by ``np.random.default_rng``) for a
            reproducible dataset. ``None`` draws fresh entropy.
        n_experiments: Number of experiments; defaults to ``N_EXPERIMENTS``.
        n_users: Size of the global user pool; defaults to ``GLOBAL_N_USERS``.
        sample_size: Participants per experiment (both variants combined);
            defaults to ``SAMPLE_SIZE``.

    Returns:
        DataFrame with columns ``user_id``, ``experiment_id``, ``variant``
        and ``exposure_time``; one row per (experiment, participant). The
        columns are present even when no rows are generated.

    Raises:
        ValueError: From NumPy, if ``sample_size`` exceeds ``n_users``
            (sampling is without replacement).
    """
    n_experiments = N_EXPERIMENTS if n_experiments is None else n_experiments
    n_users = GLOBAL_N_USERS if n_users is None else n_users
    sample_size = SAMPLE_SIZE if sample_size is None else sample_size

    rng = np.random.default_rng(seed)
    users = np.arange(n_users)
    start_time = pd.to_datetime('2025-01-01')

    # 50/50 split; with an odd sample size the extra participant goes to "B".
    half = sample_size // 2
    variant_labels = ["A"] * half + ["B"] * (sample_size - half)

    columns = {
        'user_id': [],
        'experiment_id': [],
        'variant': [],
        'exposure_time': [],
    }

    for exp_id in range(n_experiments):
        # choice(replace=False) already returns the sample in random order,
        # so positional assignment to variants is random without a shuffle.
        participants = rng.choice(users, size=sample_size, replace=False)

        # One vectorised draw per experiment instead of one per row.
        offsets = (
            pd.to_timedelta(rng.integers(0, 14, size=sample_size), unit="D")
            + pd.to_timedelta(rng.integers(0, 86400, size=sample_size), unit="s")
        )

        columns['user_id'].extend(participants.tolist())
        columns['experiment_id'].extend([exp_id] * sample_size)
        columns['variant'].extend(variant_labels)
        columns['exposure_time'].extend(start_time + offsets)

    return pd.DataFrame(columns)

In [20]:
# Preview the rows straddling the boundary between experiment 0 and
# experiment 1. Each experiment contributes SAMPLE_SIZE consecutive rows, so
# the boundary sits at position SAMPLE_SIZE. This is a throwaway draw,
# separate from the dataset generated further down.
generate_exposure_events()[SAMPLE_SIZE - 5:SAMPLE_SIZE + 1]

Unnamed: 0,user_id,experiment_id,variant,exposure_time
4995,5867,0,B,2025-01-12 16:25:35
4996,6216,0,B,2025-01-08 19:25:10
4997,5581,0,B,2025-01-01 05:33:00
4998,2163,0,B,2025-01-10 10:27:20
4999,3142,0,B,2025-01-09 21:59:27
5000,7246,1,A,2025-01-08 02:21:53


In [21]:
def generate_user_events(exposures: pd.DataFrame, *, seed=None, baseline=None,
                         lift=None, error_rate_a=None, error_rate_b=None,
                         session_rate=None):
    """Simulate behavioural events following each exposure.

    For every row of ``exposures`` the following events are generated:

    * ``session_start``: a Poisson(``session_rate``) number of events,
      0-1 days after exposure.
    * ``purchase`` (the key conversion metric): at most one, with
      probability ``baseline`` for variant "A" and ``baseline + lift`` for
      any other variant, 0-9 days after exposure, with a log-normally
      distributed ``event_value``.
    * ``error``: at most one, with probability ``error_rate_a`` for variant
      "A" and ``error_rate_b`` otherwise, 0-6 days after exposure.
    * ``scroll``: a Poisson(3) number of events, 0-1 days after exposure.

    Event times are the exposure time shifted by a whole number of days.
    Events carry no experiment id; a user exposed in several experiments
    gets an independent set of events per exposure row.

    Args:
        exposures: Frame with at least the columns ``user_id``, ``variant``
            and ``exposure_time`` (datetime-like).
        seed: Seed (or anything accepted by ``np.random.default_rng``) for a
            reproducible dataset. ``None`` draws fresh entropy.
        baseline: Control purchase probability; defaults to ``BASELINE``.
        lift: Absolute uplift for non-"A" variants; defaults to ``LIFT``.
        error_rate_a: Error probability for variant "A"; defaults to
            ``ERROR_RATE_A``.
        error_rate_b: Error probability for other variants; defaults to
            ``ERROR_RATE_B``.
        session_rate: Poisson mean of sessions per exposure; defaults to
            ``SESSION_RATE``.

    Returns:
        DataFrame with columns ``user_id``, ``event_name``, ``event_time``
        and ``event_value`` (set only for purchases). The columns are
        present even when no events are generated.
    """
    baseline = BASELINE if baseline is None else baseline
    lift = LIFT if lift is None else lift
    error_rate_a = ERROR_RATE_A if error_rate_a is None else error_rate_a
    error_rate_b = ERROR_RATE_B if error_rate_b is None else error_rate_b
    session_rate = SESSION_RATE if session_rate is None else session_rate

    rng = np.random.default_rng(seed)
    columns = ["user_id", "event_name", "event_time", "event_value"]
    events = []

    def add_event(user_id, name, exposed_at, max_days, value=None):
        # Event time = exposure time + a whole-day offset in [0, max_days).
        events.append({
            "user_id": user_id,
            "event_name": name,
            "event_time": exposed_at
            + pd.to_timedelta(rng.integers(0, max_days), unit="D"),
            "event_value": value,
        })

    # Iterate only the columns that are needed; avoids building a Series per
    # row as iterrows() does.
    rows = zip(
        exposures["user_id"], exposures["variant"], exposures["exposure_time"]
    )
    for user_id, variant_label, exposed_at in rows:
        is_control = variant_label == "A"

        # event: session start
        for _ in range(rng.poisson(session_rate)):
            add_event(user_id, "session_start", exposed_at, 2)

        # event: conversion (KEY metric)
        purchase_probability = baseline if is_control else baseline + lift
        if rng.random() < purchase_probability:
            add_event(
                user_id, "purchase", exposed_at, 10,
                value=rng.lognormal(mean=3.227, sigma=0.426),
            )

        # noise: errors (rate depends on variant) and scrolls
        error_probability = error_rate_a if is_control else error_rate_b
        if rng.random() < error_probability:
            add_event(user_id, "error", exposed_at, 7)

        for _ in range(rng.poisson(3)):
            add_event(user_id, "scroll", exposed_at, 2)

    # Explicit columns keep the schema stable when no events were generated.
    return pd.DataFrame(events, columns=columns)

In [22]:
def generate_user_info(*, seed=None, n_users=None):
    """Simulate static attributes (device, country) for every user in the pool.

    Devices are sampled from ``devices`` with weights
    ``devices_probabilities``; countries are sampled uniformly from
    ``country``. Both draws use the same NumPy ``Generator`` so that a single
    seed reproduces the whole table.

    Args:
        seed: Seed (or anything accepted by ``np.random.default_rng``) for a
            reproducible table. ``None`` draws fresh entropy.
        n_users: Number of users; defaults to ``GLOBAL_N_USERS``. User ids
            are ``0 .. n_users - 1``.

    Returns:
        DataFrame with columns ``user_id``, ``device`` and ``country``; one
        row per user.
    """
    n_users = GLOBAL_N_USERS if n_users is None else n_users
    rng = np.random.default_rng(seed)

    user_devices = rng.choice(devices, p=devices_probabilities, size=n_users)
    user_countries = rng.choice(country, size=n_users)

    return pd.DataFrame({
        'user_id': np.arange(n_users),
        'device': user_devices,
        'country': user_countries,
    })

## Get DataFrames and save as `csv` files

In [23]:
# Draw the exposure table (user -> experiment / variant / exposure time) that
# the event generation and the sanity checks below operate on, then preview
# the first rows.
exposure_events = generate_exposure_events()
exposure_events.head()

Unnamed: 0,user_id,experiment_id,variant,exposure_time
0,4772,0,A,2025-01-02 01:12:51
1,6597,0,A,2025-01-10 21:19:04
2,5833,0,A,2025-01-09 05:36:59
3,1117,0,A,2025-01-07 21:22:17
4,1307,0,A,2025-01-03 18:56:33


In [24]:
# Simulate post-exposure behaviour (sessions, purchases, errors, scrolls) for
# every exposure row, then preview the first 20 events.
user_events = generate_user_events(exposure_events)
user_events.head(20)

Unnamed: 0,user_id,event_name,event_time,event_value
0,4772,session_start,2025-01-03 01:12:51,
1,4772,scroll,2025-01-03 01:12:51,
2,4772,scroll,2025-01-02 01:12:51,
3,4772,scroll,2025-01-02 01:12:51,
4,4772,scroll,2025-01-03 01:12:51,
5,4772,scroll,2025-01-02 01:12:51,
6,4772,scroll,2025-01-02 01:12:51,
7,6597,session_start,2025-01-11 21:19:04,
8,6597,scroll,2025-01-11 21:19:04,
9,6597,scroll,2025-01-10 21:19:04,


In [25]:
# Build the per-user attribute table (device, country) for the whole user
# pool, then preview the first rows.
user_info = generate_user_info()
user_info.head()

Unnamed: 0,user_id,device,country
0,0,android,Germany
1,1,ios,Netherlands
2,2,web,China
3,3,android,Italy
4,4,ios,Australia


## sanity checks

In [26]:
# Sanity check: number of distinct users per experiment. Users are sampled
# without replacement within an experiment, so each count should equal
# SAMPLE_SIZE.
exposure_events.groupby("experiment_id").user_id.nunique()

experiment_id
0    5000
1    5000
Name: user_id, dtype: int64

In [27]:
# Sanity check: how many experiments each exposed user takes part in.
# Experiments sample from the same pool independently, so some users appear
# in more than one experiment; users never exposed are absent from this count.
exposure_events.groupby("user_id").experiment_id.nunique().value_counts()

experiment_id
1    3720
2    3140
Name: count, dtype: int64

## export as `csv`

In [28]:
# Persist the three generated tables. The output directory is created first
# because to_csv() does not create missing parent directories. The DataFrame
# index is written as the first (unnamed) column, which is the pandas default.
from pathlib import Path

Path("../data").mkdir(parents=True, exist_ok=True)
exposure_events.to_csv("../data/exposure_events.csv")
user_events.to_csv("../data/user_events_conversion.csv")
user_info.to_csv("../data/user_info.csv")