In [3]:
# Imports + Load

import math
import os

import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
from statsmodels.stats.proportion import proportions_ztest

# Location of the raw experiment export. Set the AB_TEST_DATA_PATH environment
# variable to run this notebook on another machine; when it is unset (or empty)
# the original local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get("AB_TEST_DATA_PATH") or (
    r"C:\Users\User\OneDrive\Documents\OR PROJECTS\data\raw\ab_test_data.csv"
)

df = pd.read_csv(DATA_PATH)

# Fail fast with a readable message instead of a KeyError several cells later.
assert {"group", "converted", "revenue"} <= set(df.columns), "missing expected columns"

# One frame per experiment arm; .copy() keeps later per-arm work from writing
# back into (or warning about) the parent frame.
control = df[df["group"] == "control"].copy()
treatment = df[df["group"] == "treatment"].copy()

# Every rate and standard error below divides by the arm size.
assert len(control) > 0 and len(treatment) > 0, "both experiment arms must be non-empty"

# Cell output: sample size of each arm.
len(control), len(treatment)


(5000, 5000)

In [4]:
# Conversion rate: two-proportion z-test, reported two-sided and one-sided.

# Both arrays are ordered [control, treatment]; that ordering fixes the meaning
# of `alternative` and the sign of the z statistic below.
arms = (control, treatment)
successes = np.array([arm["converted"].sum() for arm in arms])
obs = np.array([len(arm) for arm in arms])
ztest_inputs = dict(count=successes, nobs=obs)

# H1: the two conversion rates differ.
z_2s, p_2s = proportions_ztest(**ztest_inputs, alternative="two-sided")

# H1: treatment converts better than control. statsmodels phrases the
# alternative in terms of the FIRST sample, so with control listed first
# "smaller" reads as "control rate < treatment rate".
z_1s, p_1s = proportions_ztest(**ztest_inputs, alternative="smaller")

# Sign convention: the statistic is built from (first sample - second sample),
# i.e. control - treatment here, so a negative z means treatment is ahead.

# Observed rates and effect sizes.
cr_control = control["converted"].mean()
cr_treat = treatment["converted"].mean()
lift = cr_treat - cr_control    # absolute difference in conversion rate
rel_lift = lift / cr_control    # same difference as a fraction of the control rate

report_lines = [
    f"Control conversion:   {cr_control:.4f}",
    f"Treatment conversion: {cr_treat:.4f}",
    f"Absolute lift:        {lift:.4f}",
    f"Relative lift:        {rel_lift:.2%}",
    "",
    f"Z (two-sided): {z_2s:.3f}, p-value: {p_2s:.6f}",
    f"Z (one-sided, treat>control): {z_1s:.3f}, p-value: {p_1s:.6f}",
]
print("\n".join(report_lines))


Control conversion:   0.0958
Treatment conversion: 0.1272
Absolute lift:        0.0314
Relative lift:        32.78%

Z (two-sided): -4.988, p-value: 0.000001
Z (one-sided, treat>control): -4.988, p-value: 0.000000


In [5]:
# 95% CI for conversion lift (normal approx)
#
# Wald interval for the difference of two independent proportions:
#   (p2 - p1) +/- z * sqrt(p1(1-p1)/n1 + p2(1-p2)/n2)
# The standard error is unpooled (each arm keeps its own variance term).
# `math` is imported in the notebook's import cell.

Z_95 = 1.96  # approximate two-sided 95% critical value of the standard normal

p1, p2 = cr_control, cr_treat
n1, n2 = len(control), len(treatment)

se = math.sqrt(p1*(1-p1)/n1 + p2*(1-p2)/n2)
ci_low = (p2 - p1) - Z_95*se
ci_high = (p2 - p1) + Z_95*se

print(f"95% CI for (treat - control) conversion lift: [{ci_low:.4f}, {ci_high:.4f}]")


95% CI for (treat - control) conversion lift: [0.0191, 0.0437]


In [7]:
# Revenue: compare mean revenue between arms with Welch's t-test.

r_control = control["revenue"]
r_treat = treatment["revenue"]

# equal_var=False selects Welch's variant, which does not assume the two arms
# share a variance. Treatment is passed first, so a positive statistic means
# treatment has the higher mean.
welch = ttest_ind(r_treat, r_control, equal_var=False)
t_stat, p_rev = welch.statistic, welch.pvalue

summary_lines = [
    f"Avg revenue (control):   {r_control.mean():.2f}",
    f"Avg revenue (treatment): {r_treat.mean():.2f}",
    f"Difference (treat - control): {(r_treat.mean()-r_control.mean()):.2f}",
    "",
    f"Welch t-stat: {t_stat:.3f}, p-value: {p_rev:.6f}",
]
print("\n".join(summary_lines))


Avg revenue (control):   39.71
Avg revenue (treatment): 48.78
Difference (treat - control): 9.07

Welch t-stat: 14.893, p-value: 0.000000


In [8]:
# Practical significance: revenue per user (RPU), absolute and relative lift.

# RPU here is the plain mean of the per-row revenue column in each arm.
rpu_control, rpu_treat = r_control.mean(), r_treat.mean()

rpu_abs_lift = rpu_treat - rpu_control
rpu_rel_lift = rpu_treat/rpu_control - 1   # fraction of control RPU

for line in (
    f"RPU control:   {rpu_control:.2f}",
    f"RPU treatment: {rpu_treat:.2f}",
    f"RPU lift:      {rpu_abs_lift:.2f} ({rpu_rel_lift:.2%})",
):
    print(line)


RPU control:   39.71
RPU treatment: 48.78
RPU lift:      9.07 (22.84%)


### Interpretation (Statistical):

We test conversion using a z-test for proportions and compute a 95% confidence interval for the conversion lift.

We test revenue using Welch's t-test (does not assume equal variance).

### Interpretation (Business):

Even if statistically significant, we care whether the lift is large enough to justify rollout. We report both absolute lift and relative lift, and we evaluate impact on RPU.