In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.cm import tab10
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases dropped the old names, so prefer the new name when it exists and
# fall back to the legacy one on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')


In [None]:
# Load the A/B-test CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../input/ab-testing/ab_data.csv')
data.head(n=5)

# Data wrangling

The data contain the IDs of users who visited the site, the timestamp of each visit, the group the user belonged to, the landing page shown, and a binary variable describing whether the user converted.

First we check whether the data are balanced and whether they contain any missing values (NAs).

In [None]:
# Count the missing values in every column (column-wise sum of the null flags).
data.isnull().sum()

In [None]:
# Number of non-null `converted` entries per group, i.e. the raw group sizes
# (computed on the data as loaded, before any rows are removed).
sizes = pd.pivot_table(data, index='group', values='converted', aggfunc='count')
sizes

In [None]:
# Non-null counts of every column for each (group, landing_page) combination;
# used to spot users who were shown the page of the other group.
group_page_keys = ['group', 'landing_page']
s_check = data.groupby(by=group_page_keys).count()
s_check

The data are balanced, although there are some records where users in the control group were shown the new page and users in the treatment group were shown the old page.

In [None]:
# Share of each group that was shown the wrong landing page, rounded to two
# decimals and displayed as a (control, treatment) tuple.
wrong_page = {'control': 'new_page', 'treatment': 'old_page'}
tuple(
    round((s_check.loc[(grp, page), 'converted'] / sizes.loc[grp]).values[0], 2)
    for grp, page in wrong_page.items()
)

The mismatched records make up about 1% of each group, so we can remove them without a significant loss of data.

In [None]:
# Keep only the rows whose landing page matches the group assignment:
# treatment -> new_page, control -> old_page. Any other combination (or an
# unknown group, which maps to NaN) compares as False and is dropped.
expected_page = data['group'].map({'treatment': 'new_page', 'control': 'old_page'})
mask = data['landing_page'] == expected_page
data = data.loc[mask].copy()

In [None]:
# Re-count the rows per (group, landing_page) combination after filtering.
data.groupby(by=['group', 'landing_page']).count()

Let us estimate the sample means for our two groups.

In [None]:
# Mean of the binary `converted` flag per group, i.e. the sample conversion rates.
means = pd.pivot_table(data, index='group', values='converted', aggfunc='mean')
means

In [None]:
# Number of non-null user ids in every (group, converted) cell.
data.groupby(['group', 'converted'])['user_id'].count()

In [None]:
import numpy as np

# Row counts per (group, converted) cell. groupby sorts its keys by default, so
# the order is control/0, control/1, treatment/0, treatment/1.
outcome_counts = data.groupby(['group', 'converted']).count().loc[:, 'timestamp']
# Two adjacent bars per group, with a half-unit gap between the groups.
bar_centres = [0, 0.5, 1.5, 2]

fig, ax = plt.subplots(figsize=(12, 9))
barplot = ax.bar(bar_centres, outcome_counts, width=0.5)
for colour_idx, patch in enumerate(barplot):
    patch.set_color(tab10(colour_idx))

# Ticks sit on the left edge of each bar; the middle tick (0.75) falls in the gap
# between the groups and therefore gets an empty label.
ax.set_xticks(np.arange(-0.25, 2, 0.5))
control_labels, treatment_labels = (
    [f'{gr} group \n {val}' for val in ['not converted', 'converted']]
    for gr in ['control', 'treatment']
)
xaxis_tolabel = [*control_labels, '', *treatment_labels]
ax.set_xticklabels(
    xaxis_tolabel,
    fontdict={'rotation': 45, 'fontsize': 12, 'horizontalalignment': 'left'},
)
ax.set_title('Conversion between two groups', fontsize=18)
plt.show()

# Power analysis


Power analysis is the stage of experiment design where you decide how large your sample should be in order to detect an effect of a given size with a given probability. Different tests require different power analysis methods, thus at this stage we need to determine our statistical hypotheses. As the observed means are relatively close to each other and we have no prior expectation about the direction of the change, we are going to use a two-sided test to figure out whether they are different.

Let us denote by $X$ the variable describing the control group and by $Y$ the variable describing the treatment group. We have that $X \sim Bin(n,p_1)$ and $Y \sim Bin(m,p_2)$. Our hypotheses will be

\\[ \begin{array}{ll} H_0 :& p_1 = p_2 \\
H_1 :& p_1 \neq p_2\\ \end{array} \\]

For the purposes of this test we are going to compare two proportions (conversion rates). As our variables are binomial, our test statistic is of the form
\\[ Z = \frac{ \hat{Y} - \hat{X} }{\sqrt{P(1-P)( \frac{1}{n} + \frac{1}{m})}} \\]
\\[ \begin{array}{ll} \hat{X} & \mbox{empirical mean of the first population}\\
\hat{Y} & \mbox{ empirical mean of the second population}\\
n & \mbox{ the size of the first sample} \\
m & \mbox{ the size of the second sample} \\
P = \frac{n\hat{X} + m\hat{Y}}{n+m} & \mbox{the pooled empirical mean of both samples}.
\end{array} \\]
Under $H_0$ and for large samples, $Z$ is approximately standard normal: $Z \sim \mathcal{N}(0,1)$.

In [None]:
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize

`NormalIndPower` is a class whose `solve_power` method calculates the minimum sample size needed to achieve a given power for the two-sample Z-test.
`proportion_effectsize` calculates the effect size for a difference between two proportions.

We have measured that the control group has a mean of 0.12, so we assume that this was the mean conversion rate before any changes. To calculate the effect size, we choose 0.01 as the smallest change we want to be able to detect, i.e. a conversion rate of 0.13.

In [None]:
# Effect size for moving the conversion rate from the 0.12 baseline to 0.13.
e_size = proportion_effectsize(prop1=0.12, prop2=0.13)

# Solve for the per-group sample size (nobs1=None marks it as the unknown) of a
# two-sided test with equal group sizes, 80% power and a 5% significance level.
power_solver = NormalIndPower()
min_size = power_solver.solve_power(
    effect_size=e_size,
    nobs1=None,
    alpha=0.05,
    power=0.8,
    ratio=1,
    alternative='two-sided',
)

# Truncate and add one to get the next whole number of observations.
int(min_size) + 1

To detect this difference with 80% power at significance level $\alpha = 0.05$, we need at least 17164 observations per group.

# Testing the hypothesis

First let us calculate the value of the test statistic "by hand".

In [None]:
# Group sizes must come from the *filtered* data: `sizes` was computed before the
# mismatched group/landing_page rows were dropped, so it over-counts both groups
# and is inconsistent with `means` and `phat`, which use the filtered rows.
group_sizes = data.groupby('group')['converted'].count()
n = group_sizes.loc['control']            # size of the control sample
m = group_sizes.loc['treatment']          # size of the treatment sample
Xhat = means.loc['control'].values[0]     # control conversion rate
Yhat = means.loc['treatment'].values[0]   # treatment conversion rate
# Pooled conversion rate P = (n*Xhat + m*Yhat) / (n + m), i.e. the overall mean.
phat = data['converted'].mean()

In [None]:
# Two-proportion Z statistic: difference of the sample rates divided by the
# standard error computed from the pooled rate.
pooled_se = np.sqrt(phat * (1 - phat) * (1/n + 1/m))
Z_val = (Yhat - Xhat) / pooled_se
Z_val

Let's import the two-proportion Z-test, `proportions_ztest`, from `statsmodels.stats.proportion`.

In [None]:
from statsmodels.stats.proportion import proportions_ztest

In [None]:
# Number of converted users in each group. Summing the boolean `converted == 1`
# flag yields 0 for a group without any conversions, whereas
# `value_counts().loc[1]` raises a KeyError in that case.
mask_treatment = data['group'] == 'treatment'
treatment_ones = data.loc[mask_treatment, 'converted'].eq(1).sum()
mask_control = data['group'] == 'control'
control_ones = data.loc[mask_control, 'converted'].eq(1).sum()

In [None]:
# The number of observations must be counted on the same (filtered) rows as the
# success counts. `sizes` was computed before the mismatched rows were removed,
# so using it here would overstate both sample sizes.
nobs_by_group = data.groupby('group')['converted'].count()

# Two-sided test of H0: p_treatment - p_control = 0. With prop_var=False the
# variance is estimated from the pooled sample proportion.
z_test = proportions_ztest(
    count = [treatment_ones, control_ones],
    nobs = [nobs_by_group.loc['treatment'], nobs_by_group.loc['control']],
    value = 0,
    alternative = 'two-sided',
    prop_var = False
)
print('The value of the Z statistic is %0.3f and the p-value is %0.3f' % (z_test[0],z_test[1]))

The $p$-value is greater than 0.05, thus we cannot reject the null hypothesis that $p_1 = p_2$.