In [49]:
import pandas as pd
import numpy as np

# Simulated A/B test data with a deliberately unbalanced group split.
# All random columns come from one seeded Generator so that a fresh
# "Restart Kernel -> Run All" rebuilds exactly the same frame.
SEED = 42
N_GROUP_A = 2000
N_GROUP_B = 3000  # Unbalanced on purpose: 40/60 instead of 50/50
N_USERS = N_GROUP_A + N_GROUP_B

rng = np.random.default_rng(SEED)

ab_test = pd.DataFrame({
    'user_id': range(1, N_USERS + 1),
    # Assignment is positional (first 2000 users -> A), not randomized.
    'group': ['A'] * N_GROUP_A + ['B'] * N_GROUP_B,
    # Unbounded normal draw: a few ages can come out implausibly low.
    'age': rng.normal(35, 12, N_USERS),
    'gender': rng.choice(['M', 'F'], N_USERS),
    'country': rng.choice(['US', 'UK', 'CA'], N_USERS, p=[0.7, 0.2, 0.1]),
    'device': rng.choice(['mobile', 'desktop'], N_USERS, p=[0.6, 0.4]),
    'previous_purchases': rng.poisson(2, N_USERS),
    # Drawn independently of every other column, with P(purchased=1) = 0.15.
    'purchased': rng.choice([0, 1], N_USERS, p=[0.85, 0.15]),
})
# Randomization / balance checks.
# A notebook cell renders only its final expression, so each intermediate
# table is passed to display() (provided by the IPython kernel) to make
# sure it actually shows up in the output instead of being discarded.

# Check if previous_purchases affects conversion
display(ab_test.groupby('previous_purchases')['purchased'].mean())

# Check if the age demographic is similar across groups
# (describe() already includes the per-group mean)
display(ab_test.groupby('group')['age'].describe())

# Check if each categorical covariate is similarly distributed across groups
# (row-normalized shares; 'device' is included so no covariate is left unchecked)
for covariate in ['gender', 'country', 'device']:
    display(pd.crosstab(ab_test['group'], ab_test[covariate], normalize='index'))

# Conversion analysis: purchase rate within each group
display(pd.crosstab(ab_test['group'], ab_test['purchased'], normalize='index'))

# Has the group imbalance created differences in purchase history?
display(pd.crosstab(ab_test['group'], ab_test['previous_purchases'], normalize='index'))

# Check how the users are split between the groups
ab_test['group'].value_counts()

group
B    3000
A    2000
Name: count, dtype: int64

In [46]:
import math

def two_proportion_z_test(success_a, n_a, success_b, n_b):
    """Pooled two-proportion z-test for the difference in conversion rates.

    Parameters
    ----------
    success_a, success_b : int
        Number of conversions observed in group A / group B.
    n_a, n_b : int
        Number of users in group A / group B (must be positive).

    Returns
    -------
    tuple
        ``(p_a, p_b, p_pooled, se, z_stat, p_value)`` where ``z_stat`` is
        ``(p_a - p_b) / se`` and ``p_value`` is the two-sided p-value under
        the standard normal distribution.

    Raises
    ------
    ValueError
        If a group size is not positive, a success count lies outside
        ``[0, n]``, or the pooled rate is 0 or 1 (the standard error is then
        zero and the z-statistic is undefined).
    """
    for label, successes, size in (('A', success_a, n_a), ('B', success_b, n_b)):
        if size <= 0:
            raise ValueError(f"group {label}: sample size must be positive, got {size}")
        if not 0 <= successes <= size:
            raise ValueError(
                f"group {label}: successes must be between 0 and {size}, got {successes}"
            )

    # Step 1: per-group conversion rates
    p_a = success_a / n_a
    p_b = success_b / n_b

    # Step 2: pooled rate under H0 (both groups share one conversion rate)
    p_pooled = (success_a + success_b) / (n_a + n_b)

    # Step 3: standard error of the difference under H0
    se = math.sqrt(p_pooled * (1 - p_pooled) * (1 / n_a + 1 / n_b))
    if se == 0:
        raise ValueError("pooled rate is 0 or 1; the z-statistic is undefined")

    # Step 4: z-statistic and its two-sided p-value.
    # For a standard normal Z, P(|Z| >= |z|) = erfc(|z| / sqrt(2)).
    z_stat = (p_a - p_b) / se
    p_value = math.erfc(abs(z_stat) / math.sqrt(2))

    return p_a, p_b, p_pooled, se, z_stat, p_value


# Observed counts per group (users, conversions).
# NOTE(review): these are hard-coded -- presumably copied from an earlier run
# of the conversion table; confirm they still match the current data.
n_a, success_a = 2000, 295
n_b, success_b = 3000, 484

p_a, p_b, p_pooled, se, z_stat, p_value = two_proportion_z_test(
    success_a, n_a, success_b, n_b
)

print(f"Group A rate: {p_a:.4f}")
print(f"Group B rate: {p_b:.4f}")
print(f"Pooled rate: {p_pooled:.4f}")
print(f"Standard error: {se:.4f}")
print(f"Z-statistic: {z_stat:.3f}")
print(f"P-value (two-sided): {p_value:.4f}")

Group A rate: 0.1475
Group B rate: 0.1613
Pooled rate: 0.1558
Standard error: 0.0105
Z-statistic: -1.321
