In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.stats.api as sms
from statsmodels.stats.proportion import proportions_ztest, proportion_confint
from statsmodels.stats.power import TTestIndPower

In [2]:
# Read the experiment data from a CSV file; the relative path is resolved
# against the kernel's current working directory.
DATA_PATH = 'ab_data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows (five is also `head`'s default).
df.head(n=5)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [4]:
# Contingency table: one row per experiment group, one column per value of
# the `converted` flag.
pd.crosstab(index=df["group"], columns=df["converted"])

converted,0,1
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,129479,17723
treatment,129762,17514


In [6]:
# Conversion rate per group, computed from the data rather than re-typed from
# the crosstab output, so the rates stay correct if the CSV changes.
# `converted` only takes the values 0 and 1, so its mean is the fraction of
# converted rows in each group.
conversion_rate = df.groupby("group")["converted"].mean()
c = float(conversion_rate["control"])    # control-group conversion rate
t = float(conversion_rate["treatment"])  # treatment-group conversion rate

In [7]:
# Report both conversion rates as percentages.
control_pct = c * 100
treatment_pct = t * 100
print("Conversion in control group : ", control_pct)
print("Conversion in treatment group : ", treatment_pct)

Conversion in control group :  12.039917935897611
Conversion in treatment group :  11.891957956489856


In [8]:
# Cross-check group assignment against the page actually served; the
# off-diagonal cells are rows where the two do not line up.
pd.crosstab(index=df["group"], columns=df["landing_page"])

landing_page,new_page,old_page
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,1928,145274
treatment,145311,1965


In [9]:
# Number of rows per user_id, most frequent first (value_counts sorts in
# descending order by default).
session_counts = df["user_id"].value_counts()
# Count of user_ids that appear in more than one row.
multi_users = session_counts.gt(1).sum()
session_counts

user_id
635984    2
905609    2
895459    2
770459    2
692897    2
         ..
751197    1
945152    1
734608    1
697314    1
936923    1
Name: count, Length: 290584, dtype: int64

In [10]:
# Report how many user_ids occur in more than one row.
label = "user visited more then once"
print(label, multi_users)

user visited more then once 3894


In [11]:
# user_ids that appear in more than one row; these are removed from the analysis.
has_multiple_rows = session_counts.gt(1)
users_to_drop = session_counts.loc[has_multiple_rows].index

In [12]:
# Keep only rows whose user_id is not in `users_to_drop`, i.e. drop every row
# belonging to a user that appeared more than once.
is_repeat_user = df["user_id"].isin(users_to_drop)
df = df.loc[~is_repeat_user]

In [13]:
# Display the filtered frame; pandas abbreviates the rendering to the first
# and last rows.
df

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1
...,...,...,...,...,...
294473,751197,2017-01-03 22:28:38.630509,control,old_page,0
294474,945152,2017-01-12 00:51:57.078372,control,old_page,0
294475,734608,2017-01-22 11:45:03.439544,control,old_page,0
294476,697314,2017-01-15 01:20:28.957438,control,old_page,0


In [14]:
# Randomly select 1000 observations from each of the control and treatment groups.
# A fixed seed makes the draw, and every statistic computed from these samples,
# reproducible across kernel restarts. Outputs recorded from earlier unseeded
# runs will not match the seeded draw.
SAMPLE_SIZE = 1000
RANDOM_STATE = 42
control_sample = df[df["group"] == "control"].sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE)
treatment_sample = df[df["group"] == "treatment"].sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE)

In [15]:
# Stack the two samples row-wise (control rows first) into a single frame.
samples = [control_sample, treatment_sample]
ab_test = pd.concat(samples)

In [16]:
# Sanity check: (rows, columns) of the combined sample.
ab_test.shape

(2000, 5)

In [17]:
# Converted / not-converted counts per group within the sampled rows.
pd.crosstab(index=ab_test["group"], columns=ab_test["converted"])

converted,0,1
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,887,113
treatment,877,123


In [18]:
# Two-proportion z-test on the sampled conversions, plus a 95% confidence
# interval for each group's conversion rate.
is_control = ab_test["group"] == "control"
is_treatment = ab_test["group"] == "treatment"
control_results = ab_test.loc[is_control, "converted"]
treatment_results = ab_test.loc[is_treatment, "converted"]

# Non-null observation counts and conversion totals, control first.
n_con, n_treat = control_results.count(), treatment_results.count()
nobs = [n_con, n_treat]
successes = [control_results.sum(), treatment_results.sum()]

z_stats, pval = proportions_ztest(successes, nobs=nobs)

# proportion_confint returns (lower bounds, upper bounds); each is ordered
# like `successes`, i.e. control first, then treatment.
ci_lower, ci_upper = proportion_confint(successes, nobs=nobs, alpha=0.05)
lower_con, lower_treat = ci_lower
upper_con, upper_treat = ci_upper

print(f"z statistics: {z_stats:.2f}")
print(f"p-value: {pval:.3f}")
print(f"ci 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]")
print(f"ci 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]")

z statistics: -0.69
p-value: 0.488
ci 95% for control group: [0.093, 0.133]
ci 95% for treatment group: [0.103, 0.143]


In [19]:
# Per-group sample size needed to tell two proportions apart with a z-test
# at alpha = 0.05 and 80% power, with equally sized groups (ratio = 1).
# NOTE(review): 0.12 presumably reflects the ~12% conversion observed above
# and 0.17 the target rate; confirm the intended uplift.
PROP_1 = 0.12
PROP_2 = 0.17
effect_size = sms.proportion_effectsize(PROP_1, PROP_2)
power_analysis = sms.NormalIndPower()
required_n = power_analysis.solve_power(
    effect_size=effect_size,
    alpha=0.05,
    power=0.8,
    ratio=1,
)
# Show the exact solution before rounding up to a whole number of observations.
print(required_n)
required_n = np.ceil(required_n)

773.1100014338027


In [20]:
# Display the required per-group sample size after rounding up.
required_n

np.float64(774.0)

In [None]:
# Sample size for an independent two-sample t-test: passing nobs1=None asks
# solve_power to solve for the number of observations in the first group.
effect = 0.8
alpha = 0.05
power = 0.8
analysis = TTestIndPower()
result = analysis.solve_power(
    effect_size=effect,
    nobs1=None,
    alpha=alpha,
    power=power,
    ratio=1.0,
)
print("sample size %.3f" % (result,))

sample size 25.525


In [22]:
# Power of an independent two-sample t-test with 24 observations in the first
# group: passing power=None asks solve_power to solve for the power.
# NOTE: `power` is rebound here from the float set in the sample-size cell to
# a TTestIndPower instance, and `n_test` holds the computed power, not a
# sample size.
power = TTestIndPower()
n_test = power.solve_power(effect_size=0.8, nobs1=24, alpha=0.05, power=None)
report = "Power of the test: %.3f" % n_test
print(report)

Power of the test: 0.774
