In [202]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize
from statsmodels.stats.proportion import proportions_ztest, proportion_confint
from statsmodels.stats.power import NormalIndPower


In [204]:
# Load the A/B test data into a pandas DataFrame and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to a configurable data directory so the notebook runs elsewhere.
DATA_PATH = "/Users/poojitha/Downloads/ab_data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [206]:
# Dimensions of the raw dataset as a (rows, columns) tuple.
df.shape

(294478, 5)

In [208]:
# Count distinct user IDs; a value below the row count means some users appear more than once.
df["user_id"].nunique()

290584

In [210]:
# Tally converted (True) vs. non-converted (False) rows; these are counts, not proportions.
df["converted"].eq(1).value_counts()

converted
False    259241
True      35237
Name: count, dtype: int64

In [212]:
# Count rows where group and landing page do not line up:
# 'treatment' paired with 'old_page', or 'control' paired with 'new_page'.
treatment_on_old = (df["group"] == "treatment") & (df["landing_page"] == "old_page")
control_on_new = (df["group"] == "control") & (df["landing_page"] == "new_page")
treatment_on_old.sum() + control_on_new.sum()

3893

In [214]:
# Summarize column dtypes, non-null counts, and memory usage of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [216]:
# Share of rows, in percent, whose group and landing page do not match.
n_treatment_old = ((df["group"] == "treatment") & (df["landing_page"] == "old_page")).sum()
n_control_new = ((df["group"] == "control") & (df["landing_page"] == "new_page")).sum()
((n_treatment_old + n_control_new) / df.shape[0]) * 100

1.322000285250511

In [218]:
# Flag rows whose group/landing-page pairing is inconsistent so they can be dropped.
# Re-labelling them to match might alter the analysis, and they are only ~1.3% of rows.
treatment_saw_old = df["group"].eq("treatment") & df["landing_page"].eq("old_page")
control_saw_new = df["group"].eq("control") & df["landing_page"].eq("new_page")
df["misaligned"] = treatment_saw_old | control_saw_new

In [220]:
# Keep only the rows that were not flagged as misaligned.
df2 = df[~df["misaligned"]]

In [222]:
# Shape after filtering out misaligned rows (includes the added 'misaligned' column).
df2.shape

(290585, 6)

In [224]:
# Sanity check that all mismatched rows were removed: count the rows where
# "is in treatment" and "saw the new page" still disagree (expected: 0).
in_treatment = df2["group"] == "treatment"
on_new_page = df2["landing_page"] == "new_page"
len(df2[in_treatment != on_new_page])

0

In [226]:
# Number of distinct users after filtering; compare with the row count to spot repeated users.
df2['user_id'].nunique()

290584

In [228]:
# Rank user IDs by how many rows they have (descending) to find any user that appears more than once.
df2['user_id'].value_counts().sort_values(ascending=False).head()

user_id
773192    2
834487    1
936923    1
679687    1
719014    1
Name: count, dtype: int64

In [230]:
# Show every row recorded for the repeated user ID.
df2[df2["user_id"] == 773192]

Unnamed: 0,user_id,timestamp,group,landing_page,converted,misaligned
1899,773192,2017-01-09 05:37:58.781806,treatment,new_page,0,False
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0,False


In [232]:
# Keep a single row per user. Dropping by the hard-coded index label 1899 raises
# KeyError if this cell is re-run; drop_duplicates is idempotent. keep="last"
# retains the last occurrence, so for user 773192 row 2893 stays and row 1899 goes.
df2 = df2.drop_duplicates(subset="user_id", keep="last")


In [234]:
# Confirm that only one row remains for the previously repeated user ID.
df2.query('user_id==773192')


Unnamed: 0,user_id,timestamp,group,landing_page,converted,misaligned
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0,False


In [236]:
# Counts of converted (True) vs. non-converted (False) rows in the cleaned data.
(df2.converted==1).value_counts()

converted
False    255831
True      34753
Name: count, dtype: int64

In [238]:
# Conversion rate of the control group, as a fraction between 0 and 1.
control_rows = df2["group"] == "control"
p_control = df2.loc[control_rows, "converted"].eq(1).mean()
p_control

0.1203863045004612

In [240]:
# Conversion rate of the treatment group, as a fraction between 0 and 1.
treatment_rows = df2["group"] == "treatment"
p_test = df2.loc[treatment_rows, "converted"].eq(1).mean()
p_test

0.11880806551510564

The conversion rates for the control and treatment groups are approximately 12.04% and 11.88%, respectively.
The treatment group is performing slightly worse than the control group.
However, we cannot conclude that the test is complete based solely on these percentages, as statistical significance and confidence intervals must be considered before making a decision.

Our primary metric is the conversion rate, which is binary (converted or not). Since we are comparing two proportions, we can use a two-proportion z-test to evaluate whether the difference between the control and treatment groups is statistically significant.

From our observations, the difference in conversion rates (treatment minus control) is approximately −0.158 percentage points, which is very small. We will use a statistical test to determine whether this difference is significant and whether our sample size is sufficient for reliable conclusions.

Additionally, the experiment was designed to maintain a 50/50 split between control and treatment groups. It is important to verify that this split was achieved. If there is a significant imbalance in group sizes, the experiment may be biased, and we cannot fully trust the results.

In [244]:
# Group sizes, measured by the landing page each row records, and each group's share of all rows.
N_test = (df2["landing_page"] == "new_page").sum()
N_control = (df2["landing_page"] == "old_page").sum()
n_rows = df2.shape[0]
proportion = (N_test / n_rows, N_control / n_rows)
proportion

(0.5000619442226688, 0.4999380557773312)

We use a chi-squared goodness-of-fit test, since we are comparing observed group counts to the counts expected under a 50/50 split.

In [247]:
# Observed vs. expected group sizes for the 50/50 split check.
# Use the integer group counts directly: rebuilding them as proportion * total
# only round-trips the same numbers through floating point.
Total_sample = df2.shape[0]
observed_counts = [N_test, N_control]                   # order: [test, control]
expected_counts = [Total_sample / 2, Total_sample / 2]  # an even split

In [249]:
from scipy.stats import chisquare

# Goodness-of-fit test of the observed group sizes against the expected even split.
gof_result = chisquare(observed_counts, f_exp=expected_counts)
chi2, pval = gof_result
print(f"Chi-square statistic: {chi2}, p-value: {pval}")

Chi-square statistic: 0.004459984032155934, p-value: 0.9467543681597944


From the above test, we can conclude:
1. We fail to reject the null hypothesis of an even split.
2. The observed split (50.0062% treatment vs 49.9938% control) is not significantly different from 50/50.
3. There is therefore no evidence of a sample ratio mismatch: the group sizes are consistent with users having been assigned evenly between the control and test groups.

Before interpreting the results, it is important to ensure that our experiment has the appropriate sample size.
1. With the right sample size, the test will have sufficient statistical power (typically 80% or 90%) to detect meaningful differences between control and treatment groups.
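2. If the sample size is too small, we risk false negatives (missing a real effect), and any difference that does reach significance is likely to be exaggerated by random noise. (The false-positive rate itself is set by α, not by the sample size.)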
3. If the sample size is too large, we waste traffic, time, and opportunity.
   
For this experiment, we consider:
1. Significance level (α) = 0.05
2. Power (1 − β) = 0.8–0.9
3. Minimum Detectable Effect (MDE) = 0.5 percentage points, absolute (the smallest difference in conversion rate we want to detect)
4. Baseline conversion rate = 12%

With these parameters, we can calculate the required sample size per group to ensure a well-powered and efficient A/B test.

In [253]:
# Design inputs for the sample-size calculation.
alpha = 0.05               # significance level
power = 0.8                # target power
baseline_rate = 0.12       # control conversion
mde = 0.005                # absolute difference we want to detect (0.5 percentage points)

# Turn the baseline and target rates into an effect size, then solve for the
# per-group sample size with equal group sizes (ratio=1).
target_rate = baseline_rate + mde
effect_size = proportion_effectsize(baseline_rate, target_rate)
analysis = NormalIndPower()
sample_size = analysis.solve_power(
    effect_size=effect_size,
    alpha=alpha,
    power=power,
    ratio=1,
)
print(f"Required sample size per group: {round(sample_size)}")

Required sample size per group: 67490


From the calculations above:

1. The actual sample size per group (~145k) is much larger than the required sample size (~67k) for detecting a 0.5% difference.
2. The experiment is well-powered to detect a 0.5% absolute lift in conversion.
3. Differences much smaller than the MDE, such as the observed difference of about 0.158 percentage points, are not reliably detectable at this sample size (see the power analysis below), and a difference that small may not be practically meaningful anyway.

Overall, we have more than enough sample size to run this A/B test reliably.

We will perform a two-proportion z-test to compare conversion rates between the control and treatment groups.

Null Hypothesis (H₀): The conversion rates in the two groups are the same.
$H_0: p_{\text{control}} = p_{\text{test}}$, or equivalently $p_{\text{control}} - p_{\text{test}} = 0$

Alternative Hypothesis (H₁): The conversion rates are different.
Two-tailed test: Detects any difference (increase or decrease)
$H_1: p_{\text{control}} \neq p_{\text{test}}$
One-tailed test: Detects only improvement (if we only care whether the test is better than control)
$H_1: p_{\text{test}} > p_{\text{control}}$

For this experiment, we will use the two-tailed test, as we are interested in detecting any difference in conversion rates between the two groups.

In [257]:
# Recap the quantities that feed the two-proportion z-test.
summary_rows = [
    ("Total sample size", Total_sample),
    ("Conversion rate of Control group", p_control),
    ("Conversion rate of Test group", p_test),
    ("Size of Control group", N_control),
    ("Size of Test group", N_test),
]
for label, value in summary_rows:
    print(f"{label}: {value}")

Total sample size: 290584
Conversion rate of Control group: 0.1203863045004612
Conversion rate of Test group: 0.11880806551510564
Size of Control group: 145274
Size of Test group: 145310


In [259]:
# Number of converted users in each group (the success counts for the z-test).
converted_rows = df2["converted"] == 1
Conv_control = (converted_rows & (df2["group"] == "control")).sum()
Conv_test = (converted_rows & (df2["group"] == "treatment")).sum()

print(f"No of conversions from Control group: {Conv_control}")
print(f"No of conversions from Test group: {Conv_test}")

No of conversions from Control group: 17489
No of conversions from Test group: 17264


In [261]:
# 2-proportion z-test; successes and sample sizes are both ordered [control, test].
count = np.array([Conv_control, Conv_test])
nobs = np.array([N_control, N_test])

z_stat, p_value = proportions_ztest(count=count, nobs=nobs, alternative='two-sided')
for report_line in (
    "\n--- Two-proportion Z-test ---",
    f"Z-statistic: {z_stat:.6f}",
    f"P-value:     {p_value:.6f}",
):
    print(report_line)


--- Two-proportion Z-test ---
Z-statistic: 1.310924
P-value:     0.189883


The p-value from the two-proportion z-test is 0.1899, which is greater than the significance level of 0.05.
Therefore, we fail to reject the null hypothesis.

This indicates that there is no statistically significant difference in conversion rate between the new page and the old page.

In [264]:
# 2) Confidence Interval for difference in proportions
# observed proportions
p1, p2 = Conv_control / N_control, Conv_test / N_test
diff = p2 - p1  # test - control

# Standard error of the difference. A confidence interval does not assume H0
# (equal rates), so use the unpooled (Wald) standard error built from each
# group's own rate; the pooled form is appropriate for the z-test only.
se = math.sqrt(p1 * (1 - p1) / N_control + p2 * (1 - p2) / N_test)

z_crit = 1.96  # two-sided standard-normal critical value for 95% confidence
ci_low = diff - z_crit * se
ci_high = diff + z_crit * se

print("\n--- 95% Confidence Interval ---")
print(f"Observed difference (test - control): {diff:.6f} ({diff*100:.3f} %)")
print(f"95% CI: [{ci_low:.6f}, {ci_high:.6f}]")


--- 95% Confidence Interval ---
Observed difference (test - control): -0.001578 (-0.158 %)
95% CI: [-0.003938, 0.000781]


Confidence Interval Interpretation

The difference in conversion rates is uncertain, but with 95% confidence, the true difference lies somewhere between a 0.39 percentage point decrease and a 0.08 percentage point increase for the test page.
1. These are very small effects compared to the baseline conversion rate of ~12% (control group).
2. Since the confidence interval crosses zero (ranges from negative to positive), the result is not statistically significant at 𝛼 = 0.05.
Conclusion: We fail to reject the null hypothesis.
1. The new page is unlikely to provide a meaningful improvement.
2. At best, it is neutral or slightly positive; at worst, it could cause a small decrease in conversions.

In [267]:
# 3) Power analysis for the observed effect
# Power of the current group sizes, evaluated at the effect size implied by the
# observed conversion rates p1 (control) and p2 (test).
analysis = NormalIndPower()
effect_size = proportion_effectsize(p1, p2)
group_ratio = N_test / N_control  # size of the test group relative to control
power = analysis.power(
    effect_size=effect_size,
    nobs1=N_control,
    alpha=0.05,
    ratio=group_ratio,
)

print("\n--- Power Analysis ---")
print(f"Observed effect size (Cohen h): {effect_size:.6f}")
print(f"Power with current sample size: {power:.4f}")


--- Power Analysis ---
Observed effect size (Cohen h): 0.004864
Power with current sample size: 0.2587


Power Analysis
We performed a power analysis to determine the probability of detecting the observed effect at a significance level of 𝛼 = 0.05
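1. The observed difference between the test and control groups is very small, approximately 0.158 percentage points.
2. Given our current sample size, the power to detect an effect of this magnitude is only 25.87%, which is well below the commonly accepted threshold of 80%.
3. This indicates that the test is underpowered for detecting such a tiny effect. Note that power computed from the observed effect is essentially a function of the p-value, so it restates the non-significant result rather than adding independent evidence.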

Additionally, the observed effect size, calculated as Cohen's h, is approximately 0.00486, which is considered extremely small. This aligns with the negligible difference observed between the test and control groups.

In [270]:
# 4) Required sample size per group for 80% power to detect observed diff
req_n_per_group = analysis.solve_power(
    effect_size=effect_size,
    nobs1=None,  # the quantity being solved for
    alpha=0.05,
    power=0.8,
    ratio=1,
)
print(f"Required per-group sample size (to detect observed diff): {req_n_per_group:.0f}")

Required per-group sample size (to detect observed diff): 663574


To have an 80% chance of detecting the tiny observed effect (~−0.1578 percentage points) if it is real, we would need approximately:
663,574 users in the control group; 663,574 users in the test group. This totals to roughly 1,327,148 users.

Tiny effects require very large sample sizes to detect reliably, which explains why our current experiment is underpowered for this specific difference.

Final Results, Interpretation, and Recommendations

Experiment Power and Practical Significance
1. The current experiment (~145k users per group) is well-powered to detect absolute differences of about 0.35 percentage points or more.
2. The business MDE for this experiment was 0.5 percentage points; the observed difference of −0.16 percentage points is negative and not statistically significant, so there is no evidence that the new page improves conversions.
Conclusion: The new webpage does not generate meaningful increases in conversions compared to the old page.

Confidence Interval Analysis
1. The observed 95% confidence interval for the difference in conversion rates is [−0.003938, 0.000781].
2. A lift equal to our chosen MDE (+0.5 percentage points, i.e. +0.005) lies well above the upper bound of this interval (+0.000781), so an improvement of that size is implausible given the data.
Interpretation: The new page is not better than the old page in a business-relevant sense.

Detecting Very Small Effects
1. The experiment is underpowered to reliably detect the very small observed effect (~0.16 percentage points).
2. Detecting such a tiny difference would require an impractically large experiment (~1.3M users total).
Recommendation: If detecting effects this small is critical, consider increasing the sample size or exploring lower-variance (more sensitive) metrics or targeted segments.