In [5]:
import pandas as pd
import numpy as np
import datetime
import random

### Configurable parameters

In [6]:
# --- Dataset size -----------------------------------------------------------
NUM_CUSTOMERS = 1000      # rows in the customer dimension table
NUM_TEST_ENTRIES = 2000   # rows in the test-entry fact table

# --- Funnel rates (each defined exactly once) -------------------------------
REGISTRATION_RATE = 0.60  # probability that a test entry leads to a registration
PURCHASE_RATE = 0.20      # overall probability that a test entry leads to a purchase

# --- Experiment catalogue ---------------------------------------------------
TEST_NAMES = ["Landing Page Test", "Email Subject Test", "Checkout Flow Test"]
TEST_GROUPS = ["A", "B"]

# --- Time window from which random timestamps are drawn ---------------------
START_DATE = datetime.datetime(2025, 1, 1)
END_DATE = datetime.datetime(2025, 3, 1)

# --- Reproducibility --------------------------------------------------------
# The notebook draws from both the stdlib `random` module and NumPy's legacy
# global generator, so both are seeded here. Set SEED = None to get a
# different dataset on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

### Utility functions

In [10]:
def random_date(start, end):
  """Return a random datetime in the half-open interval [start, end).

  The result is `start` plus a whole number of seconds, so it keeps the
  sub-second component of `start` and never reaches `end`.

  Args:
    start: datetime.datetime lower bound (inclusive).
    end: datetime.datetime upper bound (exclusive).

  Returns:
    A datetime.datetime between `start` and `end`. If the span is shorter
    than one second (including `start == end`), `start` is returned.

  Raises:
    ValueError: if `end` is earlier than `start`.
  """
  if end < start:
    raise ValueError(f"end ({end}) must not be earlier than start ({start})")

  total_seconds = int((end - start).total_seconds())
  # randrange(0) raises "empty range"; a zero-length span has exactly one
  # possible answer, so return it directly.
  if total_seconds == 0:
    return start

  random_second = random.randrange(total_seconds)
  return start + datetime.timedelta(seconds=random_second)

### 1) Generate d_customer

In [8]:
# Customer dimension: sequential ids starting at 1, plus randomly drawn
# demographic attributes (one value per customer).
customer_ids = [*range(1, NUM_CUSTOMERS + 1)]

# randint's upper bound is exclusive, so ages fall in 18..69.
ages = np.random.randint(low=18, high=70, size=NUM_CUSTOMERS)
genders = np.random.choice(["M", "F"], size=NUM_CUSTOMERS)

customer_columns = {"customer_id": customer_ids, "age": ages, "gender": genders}
d_customer = pd.DataFrame(customer_columns)

### 2) Generate f_test_entries (fact table)

In [None]:
# Build the fact table by rejection sampling: repeatedly draw a random
# (customer, test) pair, discard it if that pair was already used, and
# otherwise attach a random group plus entry / registration / purchase dates.
datetime_format = "%Y-%m-%d %H:%M:%S"

# Each (customer_id, test_name) pair may appear only once, so the number of
# distinct pairs caps the table size. Without this check the loop below would
# spin forever once every pair has been consumed.
max_unique_entries = len(set(customer_ids)) * len(set(TEST_NAMES))
if NUM_TEST_ENTRIES > max_unique_entries:
    raise ValueError(
        f"NUM_TEST_ENTRIES={NUM_TEST_ENTRIES} exceeds the {max_unique_entries} "
        "unique (customer_id, test_name) pairs available"
    )

fact_rows = []
seen_pairs = set()  # enforces (customer_id, test_name) uniqueness

# Purchases only happen after a registration, so the overall PURCHASE_RATE is
# converted into a probability conditional on having registered (≈ 0.3333 for
# 0.20 / 0.60). With a registration rate of 0 nobody registers and the value
# is never consulted; use 0.0 instead of dividing by zero.
target_p_if_registered = (
    PURCHASE_RATE / REGISTRATION_RATE if REGISTRATION_RATE > 0 else 0.0
)

while len(fact_rows) < NUM_TEST_ENTRIES:
    cid = random.choice(customer_ids)
    t_name = random.choice(TEST_NAMES)

    # Skip if this customer already entered this test once
    if (cid, t_name) in seen_pairs:
        continue
    seen_pairs.add((cid, t_name))

    t_group = random.choice(TEST_GROUPS)
    entry_dt = random_date(START_DATE, END_DATE)

    # Registration happens at or after the entry; purchase (conditional on
    # registration) happens at or after the registration.
    reg_dt = None
    purch_dt = None
    if random.random() < REGISTRATION_RATE:
        reg_dt = random_date(entry_dt, END_DATE)
        if random.random() < target_p_if_registered:
            purch_dt = random_date(reg_dt, END_DATE)

    # Missing events are written as empty strings so they appear as blank
    # cells in the exported CSV.
    fact_rows.append({
        "customer_id": cid,
        "test_entry_datetime": entry_dt.strftime(datetime_format),
        "test_name": t_name,
        "test_group": t_group,
        "registration_datetime": (
            reg_dt.strftime(datetime_format) if reg_dt is not None else ""
        ),
        "first_purchase_datetime": (
            purch_dt.strftime(datetime_format) if purch_dt is not None else ""
        ),
    })

f_test_entries = pd.DataFrame(fact_rows)

### 3) Save to CSV

In [12]:
# Write both tables to CSV in the current working directory, without the
# DataFrame index column.
d_customer.to_csv("d_customer.csv", index=False)
f_test_entries.to_csv("f_test_entries.csv", index=False)

# `google.colab` is only importable inside a Colab runtime. Guard the import so
# the notebook still runs to completion elsewhere (local Jupyter, CI); in that
# case the files simply stay on disk and no browser download is triggered.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download("d_customer.csv")
    files.download("f_test_entries.csv")

print("Data generation complete! Files saved as d_customer.csv and f_test_entries.csv.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Data generation complete! Files saved as d_customer.csv and f_test_entries.csv.
