In [2]:
from pathlib import Path
import duckdb, pandas as pd
import numpy as np
from scipy import stats

DB_PATH = Path("ds_week1.duckdb")  # same folder as this notebook

# duckdb.connect() creates a brand-new, empty database when the file is
# missing. That would only surface further down as confusing
# "table does not exist" errors, so fail fast with a clear message instead.
if not DB_PATH.exists():
    raise FileNotFoundError(
        f"DuckDB file not found: {DB_PATH.resolve()} "
        "(start the notebook from the folder that contains it)"
    )

# Read-write connection: a later cell creates the experiment_assignments table.
con = duckdb.connect(str(DB_PATH))

# List the tables in the database; the resulting DataFrame is the cell output.
con.execute("SHOW TABLES").df()


Unnamed: 0,name
0,events
1,orders
2,users


In [12]:
# Seed NumPy's global RNG so the random split below is reproducible.
np.random.seed(7)

# Load every user, then draw one uniform number per user:
# draws below 0.5 are assigned to variant "A", all others to "B".
users = con.execute("SELECT user_id, signup_date FROM users").df()
goes_to_a = np.random.rand(len(users)) < 0.5
users["variant"] = np.where(goes_to_a, "A", "B")

# Persist the assignments. The table is dropped first so that re-running this
# cell rebuilds it instead of failing on an existing table.
con.execute("DROP TABLE IF EXISTS experiment_assignments")
con.register("assign_df", users[["user_id", "variant"]])
con.execute("CREATE TABLE experiment_assignments AS SELECT * FROM assign_df")

<_duckdb.DuckDBPyConnection at 0x21c648916f0>

In [13]:
# Sanity check: number of users assigned to each variant, ordered by variant.
assignment_counts_sql = """
SELECT variant, COUNT(*) AS n
FROM experiment_assignments
GROUP BY 1
ORDER BY 1
"""
counts = con.execute(assignment_counts_sql).df()
counts

Unnamed: 0,variant,n
0,A,1014
1,B,986


In [14]:
# Sample-ratio check: chi-squared goodness-of-fit test of the observed
# per-variant counts against an equal split across variants.
obs = counts["n"].to_numpy()

# Every arm is expected to hold total / number_of_variants users. The length is
# derived from `obs` rather than hard-coded to two entries, so the test stays
# valid if the number of variants in `counts` ever differs from two.
expected_per_arm = obs.sum() / obs.size
exp = np.array([expected_per_arm] * obs.size)

# A large p-value means no evidence that the split deviates from uniform.
chi2, p = stats.chisquare(obs, f_exp=exp)
chi2, p

(np.float64(0.392), np.float64(0.5312499859948483))

In [15]:
# Define the conversion metric: at least one purchase within 7 days of signup.
#
# Only orders placed at or after the user's signup are considered. Without that
# lower bound, any order dated before signup would satisfy
# "first purchase <= signup + 7 days" and be counted as a conversion.
# Rows are ordered by user_id so the preview below is deterministic.
metric = con.execute("""
WITH first_purchase AS (
  -- earliest order at or after signup, per user
  SELECT o.user_id, MIN(o.order_time) AS first_purchase_time
  FROM orders o
  JOIN users u ON u.user_id = o.user_id
  WHERE o.order_time >= u.signup_date
  GROUP BY 1
)
SELECT
  a.variant,
  u.user_id,
  CASE
    WHEN fp.first_purchase_time IS NOT NULL
     AND fp.first_purchase_time <= u.signup_date + INTERVAL 7 DAY
    THEN 1 ELSE 0
  END AS converted_7d
FROM users u
JOIN experiment_assignments a USING (user_id)
LEFT JOIN first_purchase fp USING (user_id)
ORDER BY u.user_id;
""").df()

metric.head()

Unnamed: 0,variant,user_id,converted_7d
0,A,1,0
1,B,4,0
2,B,11,1
3,A,17,1
4,B,19,1


In [16]:
# Per-variant sample size ("count") and 7-day conversion rate ("mean").
converted_by_variant = metric.groupby("variant")["converted_7d"]
summary = converted_by_variant.agg(count="count", mean="mean")
summary

Unnamed: 0_level_0,count,mean
variant,Unnamed: 1_level_1,Unnamed: 2_level_1
A,1014,0.14497
B,986,0.119675


In [17]:
# Two-sided, pooled two-proportion z-test for the difference in conversion
# rate between the variants (B minus A).
nA, pA = summary.loc["A", "count"], summary.loc["A", "mean"]
nB, pB = summary.loc["B", "count"], summary.loc["B", "mean"]

# Absolute lift of B over A (negative means B converted at a lower rate).
lift = pB - pA

# Under H0 both arms share one conversion rate, so the standard error is built
# from the pooled proportion.
p_pool = (pA*nA + pB*nB) / (nA + nB)
se = np.sqrt(p_pool*(1 - p_pool)*(1/nA + 1/nB))
z = lift / se

# Use the survival function rather than 1 - cdf: 1 - cdf(|z|) rounds to exactly
# 0.0 for large |z|, whereas sf keeps the tail probability accurate.
p_value = 2*stats.norm.sf(abs(z))

lift, z, p_value

(np.float64(-0.025294957811731092),
 np.float64(-1.6681446834872344),
 np.float64(0.09528700877175877))

In [18]:
# 95% confidence interval for the lift (B - A), normal approximation.
# The unpooled standard error is used here: each arm contributes its own
# binomial variance instead of sharing a pooled rate.
var_A = pA*(1-pA)/nA
var_B = pB*(1-pB)/nB
se_unpooled = np.sqrt(var_A + var_B)

# 1.96 is the two-sided 95% critical value of the standard normal.
margin = 1.96*se_unpooled
ci_low, ci_high = lift - margin, lift + margin
(ci_low, ci_high)

(np.float64(-0.05496107948832689), np.float64(0.0043711638648647015))

In [19]:
# Power simulation function
def power_sim(n_per_group=1000, p_control=0.05, abs_lift=0.005, alpha=0.05, sims=2000, seed=0):
    """Estimate the power of a two-sided pooled two-proportion z-test by simulation.

    Each simulated experiment draws ``n_per_group`` Bernoulli outcomes per arm
    (control rate ``p_control``, treatment rate ``p_control + abs_lift``), runs
    the pooled z-test on the two samples and records whether the p-value falls
    below ``alpha``.

    Parameters
    ----------
    n_per_group : int
        Number of simulated users in each arm; must be at least 1.
    p_control : float
        Conversion probability of the control arm.
    abs_lift : float
        Absolute difference added to ``p_control`` for the treatment arm.
    alpha : float
        Significance level of the two-sided test.
    sims : int
        Number of simulated experiments; must be at least 1.
    seed : int
        Seed passed to ``np.random.default_rng`` for reproducibility.

    Returns
    -------
    numpy.float64
        Fraction of simulated experiments in which the test rejected.

    Raises
    ------
    ValueError
        If ``n_per_group`` or ``sims`` is smaller than 1.
    """
    if n_per_group < 1:
        raise ValueError("n_per_group must be >= 1")
    if sims < 1:
        raise ValueError("sims must be >= 1")

    rng = np.random.default_rng(seed)
    rejections = 0
    for _ in range(sims):
        A = rng.binomial(1, p_control, n_per_group)
        B = rng.binomial(1, p_control + abs_lift, n_per_group)

        # Pooled conversion rate across both (equal-sized) arms.
        p_pool = (A.sum() + B.sum()) / (2 * n_per_group)
        se = np.sqrt(p_pool * (1 - p_pool) * (2 / n_per_group))

        if se == 0:
            # Both arms are all-0 or all-1: there is no variance and z would be
            # 0/0. Such a sample carries no evidence of a difference, so it is
            # counted as "not rejected" without triggering a division warning.
            continue

        z = (B.mean() - A.mean()) / se
        # sf(|z|) is the upper tail; it stays accurate where 1 - cdf(|z|)
        # would round to exactly 0.
        pval = 2 * stats.norm.sf(abs(z))
        rejections += (pval < alpha)

    # Wrap so the return type is numpy.float64 on every path, including the
    # case where every simulation was degenerate.
    return np.float64(rejections / sims)

# Power to detect a +0.5 percentage-point absolute lift on a 5% baseline with
# 2,000 users per arm (alpha, sims and seed are left at their defaults).
power_sim(
    n_per_group=2000,
    p_control=0.05,
    abs_lift=0.005,
)


np.float64(0.109)