**Import the necessary libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2, chi2_contingency, norm
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

**Load the data into a pandas dataframe**

In [2]:
# Read the experiment log from a CSV file (path is relative to the working directory) into `df`.
df=pd.read_csv('AB_test_AD.csv')

**Check the data for any missing values, incorrect datatypes, duplicates or outliers**

In [3]:
# Preview the first 10 rows of the raw data.
df.head(10)

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
5,0027ce48-d3c6-4935-bb12-dfb5d5627857,control,2020-07-03,15,Samsung SM-G960F,6,Facebook,0,0
6,002e308b-1a07-49d6-8560-0fbcdcd71e4b,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
7,00393fb9-ca32-40c0-bfcb-1bd83f319820,control,2020-07-09,5,Samsung SM-G973F,6,Facebook,0,0
8,004940f5-c642-417a-8fd2-c8e5d989f358,exposed,2020-07-04,0,Generic Smartphone,6,Chrome Mobile WebView,0,0
9,004c4cc9-f2ca-4df7-adc9-3d0c3c4f0342,control,2020-07-05,14,Generic Smartphone,6,Chrome Mobile,0,0


In [4]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8077 entries, 0 to 8076
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   auction_id   8077 non-null   object
 1   experiment   8077 non-null   object
 2   date         8077 non-null   object
 3   hour         8077 non-null   int64 
 4   device_make  8077 non-null   object
 5   platform_os  8077 non-null   int64 
 6   browser      8077 non-null   object
 7   yes          8077 non-null   int64 
 8   no           8077 non-null   int64 
dtypes: int64(4), object(5)
memory usage: 568.0+ KB


In [5]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,hour,platform_os,yes,no
count,8077.0,8077.0,8077.0,8077.0
mean,11.61508,5.947134,0.070818,0.083075
std,5.734879,0.224333,0.256537,0.276013
min,0.0,5.0,0.0,0.0
25%,7.0,6.0,0.0,0.0
50%,13.0,6.0,0.0,0.0
75%,15.0,6.0,0.0,0.0
max,23.0,7.0,1.0,1.0


In [6]:
# Summary (count / unique / top / freq) for the object-dtype columns.
df.describe(include='O')  

Unnamed: 0,auction_id,experiment,date,device_make,browser
count,8077,8077,8077,8077,8077
unique,8077,2,8,269,15
top,0008ef63-77a7-448b-bd1e-075f42c55e39,control,2020-07-03,Generic Smartphone,Chrome Mobile
freq,1,4071,2015,4743,4554


In [7]:
# Number of columns that contain at least one missing value (not the number of missing cells).
df.isnull().any().sum()

0

In [8]:
df.dtypes

auction_id     object
experiment     object
date           object
hour            int64
device_make    object
platform_os     int64
browser        object
yes             int64
no              int64
dtype: object

In [9]:
# Number of fully duplicated rows. `duplicated()` flags each row that repeats an earlier one,
# so summing the flags gives an actual count rather than a yes/no indicator.
df.duplicated().sum()

0

**Drop unnecessary columns**

In [10]:
# Number of distinct auction_id values (compare with the row count reported by df.info()).
df['auction_id'].nunique()

8077

In [11]:
# auction_id is distinct on every row (see the nunique() check), so it is dropped here.
# Reassign instead of mutating in place, and tolerate an already-missing column,
# so that re-running this cell is a harmless no-op rather than a KeyError.
df = df.drop(columns='auction_id', errors='ignore')

In [12]:
# Confirm that auction_id is no longer among the columns.
df.head()

Unnamed: 0,experiment,date,hour,device_make,platform_os,browser,yes,no
0,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0


**Choose an appropriate index if necessary**

In [13]:
# Use the date as the row index (dates repeat, so this index is not unique).
# Guarded so the cell can be re-run: once 'date' has moved into the index it is
# no longer a column and set_index('date') would raise KeyError.
if 'date' in df.columns:
    df = df.set_index('date')

In [14]:
# Confirm that 'date' is now the index.
df.head()

Unnamed: 0_level_0,experiment,hour,device_make,platform_os,browser,yes,no
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-07-10,exposed,8,Generic Smartphone,6,Chrome Mobile,0,0
2020-07-07,exposed,10,Generic Smartphone,6,Chrome Mobile,0,0
2020-07-05,exposed,2,E5823,6,Chrome Mobile WebView,0,1
2020-07-03,control,15,Samsung SM-A705FN,6,Facebook,0,0
2020-07-03,control,15,Generic Smartphone,6,Chrome Mobile,0,0


In [15]:
# Collapse the indicator columns into one label: 'yes' when yes == 1, otherwise 'no'.
# NOTE(review): rows with yes == 0 and no == 0 (no answer recorded) are also labelled 'no',
# so the conversion rate computed later is yes / all rows in the group — confirm this is intended.
answered_yes = df['yes'] == 1
df['response'] = np.where(answered_yes, 'yes', 'no')

In [16]:
# Check the new 'response' column against the 'yes' / 'no' indicator columns.
df.head()

Unnamed: 0_level_0,experiment,hour,device_make,platform_os,browser,yes,no,response
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-07-10,exposed,8,Generic Smartphone,6,Chrome Mobile,0,0,no
2020-07-07,exposed,10,Generic Smartphone,6,Chrome Mobile,0,0,no
2020-07-05,exposed,2,E5823,6,Chrome Mobile WebView,0,1,no
2020-07-03,control,15,Samsung SM-A705FN,6,Facebook,0,0,no
2020-07-03,control,15,Generic Smartphone,6,Chrome Mobile,0,0,no


**A/B test modeling**

**Check the number of samples in control and treatment group**

In [33]:
# Sample size of each experiment group.
df['experiment'].value_counts()

control    4071
exposed    4006
Name: experiment, dtype: int64

In [34]:
# Response counts within each experiment group.
df.groupby('experiment')['response'].value_counts()

experiment  response
control     no          3807
            yes          264
exposed     no          3698
            yes          308
Name: response, dtype: int64

  **Create contingency table** 

In [36]:
# Contingency table: one row per experiment group, one column per response label.
CT_values = pd.crosstab(index=df['experiment'], columns=df['response'])

In [37]:
# Display the contingency table.
CT_values

response,no,yes
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1
control,3807,264
exposed,3698,308


In [21]:
# Unused alternative: the same contingency table as a bare NumPy array (row/column labels dropped).
# CT_values=pd.crosstab(df['experiment'],df['response']).values
# CT_values

**Formulating a hypothesis**

Given we don’t know if the new design for the BIO questionnaire will perform better than, worse than, or the same as our current design, we’ll choose a two-tailed test:

Null Hypothesis ⟹ $H_0$: Conversion rate of control group (CG) = Conversion rate of treatment or exposed group (EG)

Alternative Hypothesis ⟹ $H_1$: Conversion rate of control group (CG) ≠ Conversion rate of treatment or exposed group (EG)

We’ll also set a confidence level of 95%:

α = 0.05

The α value is a threshold we set, by which we say “if the probability of observing a result at least as extreme as ours (the p-value) is lower than α, then we reject the Null hypothesis”. Since our α = 0.05 (indicating 5% probability), our confidence (1 − α) is 95%.

Whatever conversion rate we observe for our new design in our test, we want to be 95% confident it is statistically different from the conversion rate of our old design, before we decide to reject the Null hypothesis Hₒ.

In [23]:
# Null hypothesis -> Ho : CR_CG = CR_EG
# Alternative hypothesis -> H1 : CR_CG ≠ CR_EG

After formulating the hypothesis and performing the experiment we collected the following data in the contingency table.
**Calculate the conversion rate of control group and exposed group**

In [38]:
# The conversion rate of CG and EG
# Rates are derived from the contingency table rather than re-typed counts, so they
# cannot drift from the data: 'yes' responses divided by the group's row total.
group_totals = CT_values.sum(axis=1)
CG = float(CT_values.loc['control', 'yes'] / group_totals['control'])
EG = float(CT_values.loc['exposed', 'yes'] / group_totals['exposed'])
print(CG,EG)

0.06484893146647015 0.07688467299051423


The conversion rate of the treatment (exposed) group is higher than that of the control group. Is this difference *statistically significant*?

**Testing the hypothesis**

Using statistical significance tests we can measure if the collected data shows a result more extreme than the chance might produce. If the result is beyond the chance variation, then it is statistically significant. In this example, we have categorical variables in the contingency data format, which follow a Bernoulli distribution. Bernoulli Distribution has a probability of being 1 and a probability of being 0. In our example, it is conversion=1 and no conversion=0. Considering we are using the conversions as the metric, which is a categorical variable following Bernoulli distribution, we will be using the Chi-Squared test to interpret the results.

The null hypothesis of the Chi-Squared test is that the observed frequencies for a categorical variable match the expected frequencies. The test calculates a statistic (Chi) that follows a chi-squared distribution under the null hypothesis, and this statistic is used to decide whether to reject or fail to reject the null hypothesis that the expected and observed frequencies are the same. In this article, we will be using the scipy.stats package for the statistical functions.
The probability density function of the Chi-Squared distribution varies with the degrees of freedom (dof), which depend on the size of the contingency table and are calculated as dof = (#rows − 1) × (#columns − 1). In this example, dof = 1.

Key terms we need to know to interpret the test result using Python are p-value and alpha. The p-value is the probability of obtaining test results at least as extreme as the results actually observed, under the assumption that the null hypothesis is correct. The p-value is one of the outcomes of the test. Alpha, also known as the level of statistical significance, is the probability of making a type I error (rejecting the null hypothesis when it is actually true). The probability of making a type II error (failing to reject the null hypothesis when it is actually false) is called beta, but it is out of scope for this article. In general, alpha is taken as 0.05, indicating a 5% risk of concluding that a difference exists between the groups when there is no actual difference.

In terms of a p-value and a chosen significance level (alpha), the test can be interpreted as follows:

If p-value <= alpha: significant result, reject null hypothesis
If p-value > alpha: not significant result, do not reject null hypothesis
We can also interpret the test result by using the test statistic and the critical value:

If test statistic >= critical value: significant result, reject null hypothesis
If test statistic < critical value: not significant result, do not reject null hypothesis

In [24]:
### chi2 test on contingency table
# Pearson chi-squared test of independence between experiment group and response.
# correction=False disables Yates' continuity correction, which scipy applies by
# default when the table has one degree of freedom (as this 2x2 table does).
# chi2_contingency is imported in the notebook's import cell.
chi2_statistic, p_value, dof, expected_values = chi2_contingency(CT_values, correction=False)
print(chi2_statistic, p_value)

4.444890940566488 0.03500582596832453


In [25]:
# Decision rule on the chi-squared p-value: reject H0 when p <= alpha.
prob = 0.95
alpha = 1.0 - prob
print(f'significance={alpha:.3f}, p={p_value:.3f}')
verdict = 'Reject null hypothesis' if p_value <= alpha else 'Do not reject null hypothesis'
print(verdict)

significance=0.050, p=0.035
Reject null hypothesis


In [26]:
# Same decision rule as the previous cell, printed as "alpha,p-value".
prob = 0.95
alpha = 1.0 - prob
print(f"{alpha:.3f},{p_value:.3f}")
outcomes = {True: 'Reject null hypothesis', False: 'Do not reject null hypothesis'}
print(outcomes[bool(p_value <= alpha)])

0.050,0.035
Reject null hypothesis


In [27]:
### interpret test-statistic
# Upper-tail critical value of the chi-squared distribution with `dof` degrees of freedom.
# The statistic is a sum of squared deviations and therefore never negative, so it is
# compared with the critical value directly (no abs() needed).
# chi2 is imported in the notebook's import cell.
critical = chi2.ppf(prob, dof)
print(f'probability={prob:.3f}, critical={critical:.3f}, chi2_statistic={chi2_statistic:.3f}')
if chi2_statistic >= critical:
    print('Reject null hypothesis')
else:
    print('Do not reject null hypothesis')

probability=0.950, critical=3.841, chi2_statistic=4.445
Reject null hypothesis


**As can be seen from the result we reject the null hypothesis, in other words, the positive relative difference between the conversion rates is significant.**

Since the number of samples is large enough (>30), we can rely on the central limit theorem and use the normal approximation to calculate our p-value (i.e. a z-test).

Again, Python makes all the calculations very easy. We can use the statsmodels.stats.proportion module to get the 
p-value and confidence intervals.

In [39]:
# Two-proportion z-test plus a 95% confidence interval for each group's conversion rate.
# Counts are read from the contingency table (order: control, exposed) instead of being
# re-typed, so they always match the data. proportions_ztest / proportion_confint are
# imported in the notebook's import cell.
groups = ['control', 'exposed']
successes = CT_values.loc[groups, 'yes'].tolist()      # 'yes' responses per group
nobs = CT_values.loc[groups].sum(axis=1).tolist()      # total rows per group
n_con, n_exp = nobs

z_stat, pval = proportions_ztest(successes, nobs=nobs)
# proportion_confint returns (lower bounds, upper bounds), each holding one value per group.
(lower_con, lower_exp), (upper_con, upper_exp) = proportion_confint(successes, nobs=nobs, alpha=0.05)

print(f'z statistic: {z_stat:.2f}')
print(f'p-value: {pval:.3f}')
print(f'ci 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]')
print(f'ci 95% for treatment group: [{lower_exp:.3f}, {upper_exp:.3f}]')

z statistic: -2.11
p-value: 0.035
ci 95% for control group: [0.057, 0.072]
ci 95% for treatment group: [0.069, 0.085]


In [29]:
# Lower-tail critical value of the standard normal distribution for a two-sided test:
# the 5% significance level is split equally between the two tails.
# `norm` is already imported in the notebook's import cell, so it is not re-imported here.
prop = 0.95
prop_inv = (1.0 - prop) / 2.0
gauss_critical = norm.ppf(prop_inv)

In [30]:
# Display the lower-tail critical z value computed in the previous cell.
gauss_critical

-1.959963984540054