In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Manual Input

In [2]:
e_id = 1

In [3]:
n = 5000
N = 2 * n

In [4]:
start_date = pd.to_datetime("2025-03-01")
end_date = pd.to_datetime("2025-03-15")

In [5]:
baseline_conversion = 0.10
relative_lift = 0.05

In [6]:
allocation = {"A": 0.50, "B": 0.50}
target_metric = "conversion_rate"
guardrails = ["crash_rate", "latency_ms"]

# Auto generated

In [7]:
duration_days = (end_date - start_date).days

p1 = baseline_conversion
p2 = baseline_conversion * (1 + relative_lift)

user_ids = np.arange(N)

variant = np.array(["A"]*n + ["B"]*n)

segment = np.random.choice(
    ["ios", "android", "web"],
    size=N,
    p=[0.4, 0.4, 0.2]
)

# acquisition channel
cohort = np.random.choice(
    ["organic", "ads", "referral"],
    size=N,
    p=[0.6, 0.3, 0.1]
)

converted = np.where(
    variant == "A",
    np.random.binomial(1, p1, N),
    np.random.binomial(1, p2, N),
)

## Daily traffic simulation

In [8]:
daily_users = np.random.poisson(lam=N / duration_days, size=int(duration_days))
diff = N - daily_users.sum()
daily_users[-1] += diff
daily_user_arrival = np.repeat(
    np.arange(duration_days),
    daily_users
)
exposure_time = [
    start_date + pd.Timedelta(days=int(d)) + pd.Timedelta(hours=np.random.uniform(0, 24))
    for d in daily_user_arrival
]

## Exposure log

## Event generation

In [10]:
ev_user_ids = []
ev_types = []
ev_times = []
ev_values = []

for i in range(N):
    uid = user_ids[i]
    var = variant[i]
    e_time = arrival_time[i]
    conv = converted[i]
    
    base_sessions = 3 if var == "A" else 3.3
    sessions = max(1, np.random.poisson(base_sessions))
    base_crash_p = 0.03 if var == "A" else 0.04
    latency_mean = 300 if var == "A" else 330
    
    for s in range(sessions):        
        ts = e_time + pd.Timedelta(minutes=np.random.uniform(0, 180))

        ev_user_ids.append(uid)
        ev_types.append("latency")
        ev_times.append(ts)
        ev_values.append(np.random.normal(latency_mean, 20))

        # crash event
        if np.random.rand() < base_crash_p:
            ev_user_ids.append(uid)
            ev_types.append("crash")
            ev_times.append(ts)
            ev_values.append(None)

        # view events
        n_views = np.random.randint(3, 7)
        for _ in range(n_views):
            view_ts = ts + pd.Timedelta(seconds=np.random.uniform(0, 120))
            ev_user_ids.append(uid)
            ev_types.append("view")
            ev_times.append(view_ts)
            ev_values.append(None)

            # click events
            click_prob = 0.25 if var == "A" else 0.28
            if np.random.rand() < click_prob:
                click_ts = view_ts + pd.Timedelta(seconds=np.random.uniform(1, 20))
                ev_user_ids.append(uid)
                ev_types.append("click")
                ev_times.append(click_ts)
                ev_values.append(None)

        # purchase (only when converted)
        if conv == 1:
            purchase_ts = ts + pd.Timedelta(minutes=np.random.uniform(2, 40))
            revenue = np.random.lognormal(mean=3, sigma=0.4)
            ev_user_ids.append(uid)
            ev_variants.append(var)
            ev_types.append("purchase")
            ev_times.append(purchase_ts)
            ev_values.append(revenue)

## Final table

In [11]:
events = pd.DataFrame({
    "experiment_id": [e_id] * len(ev_user_ids),
    "user_id": ev_user_ids,
    "variant": ev_variants,
    "segment": ev_segments,
    "cohort": ev_cohorts,
    "event_type": ev_types,
    "event_time": ev_times,
    "value": ev_values
})

users = pd.DataFrame({
    "user_id": user_ids,
    "variant": variant,
    "segment": segment,
    "cohort": cohort,
    "converted": converted,
    "arrival_time": arrival_time
})

In [12]:
events.head()

Unnamed: 0,experiment_id,user_id,variant,segment,cohort,event_type,event_time,value
0,1,0,A,web,organic,latency,2025-03-01 17:19:35.320779332,303.926369
1,1,0,A,web,organic,view,2025-03-01 17:21:02.381841800,
2,1,0,A,web,organic,view,2025-03-01 17:21:13.531690945,
3,1,0,A,web,organic,click,2025-03-01 17:21:19.537923865,
4,1,0,A,web,organic,view,2025-03-01 17:21:10.121239254,


In [13]:
users.head()

Unnamed: 0,user_id,variant,segment,cohort,converted,arrival_time
0,0,A,web,organic,0,2025-03-01 14:26:20.699565961
1,1,A,web,referral,0,2025-03-01 12:35:21.812982009
2,2,A,android,referral,0,2025-03-01 10:26:53.636773894
3,3,A,ios,organic,0,2025-03-01 14:17:12.548072804
4,4,A,ios,organic,0,2025-03-01 05:45:24.494666714


In [14]:
events.to_csv('../data/events.csv')
users.to_csv('../data/users.csv')

experiments
-----------
experiment_id
name
start_time
end_time
status            (running / stopped / archived)
primary_metric    (conversion / revenue / latency)
secondary_metrics (array/json)
alpha
power_target
min_runtime
max_runtime
created_at

In [18]:
n_experiments = 10
exp_id = []
start_time = []
status = []
primary_metric = []
secondary_metric = []
alpha = []
power_target = []
status_choices = np.array(['running', 'stopped', 'archived'])
primary_metric_choices = np.array(['conversion', 'revenue', 'latency'])
secondary_metric_choices = np.array(['array', 'json'])

In [17]:
for i in range(n_experiments):
    exp_id.append(i + 1)
    start_time.append(pd.to_datetime('2025-01-01') + pd.to_timedelta(np.random.randint(0,366), unit='D'))
    status.append(np.random.choice(status_choices))
    primary_metric.append(np.random.choice(primary_metric_choices))
    secondary_metric.append(np.random.choice(secondary_metric_choices))
    alpha.append(0.05)
    power_target.append(0.8)
    min_runtime_wks = 2
    max_runtime_wks = 6

Timestamp('2025-10-27 00:00:00')