# Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
import statsmodels.stats.proportion as sp

# Matplolib default parameters
from matplotlib import rcParams
plt.style.use('default')
plt.rcParams['axes.prop_cycle'] = plt.rcParamsDefault['axes.prop_cycle']

# import warning
import warnings
warnings.filterwarnings('ignore')

# Dataset Overview & Exploration

In [2]:
df = pd.read_csv('ab_data.csv')
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [3]:
print(f'Dataset contains of {df.shape[0]} Rows and {df.shape[1]} Columns')

Dataset contains of 294478 Rows and 5 Columns


In [4]:
# Check missing values
df.isnull().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [5]:
# Check duplicated data 
df.duplicated().sum()

0

In [3]:
# change timestamp to datetime datatype
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['dt'] = df['timestamp'].dt.date

In [9]:
print('First Session Datetime :', df['timestamp'].min())
print('Last Session Datetime :', df['timestamp'].max())
print('This experimental design run for' ,df['timestamp'].dt.day.max() - df['timestamp'].dt.day.min(), 'Days')

First Session Datetime : 2017-01-02 13:42:05.378582
Last Session Datetime : 2017-01-24 13:41:54.460509
This experimental design run for 22 Days


In [10]:
df['landing_page'].value_counts()

old_page    147239
new_page    147239
Name: landing_page, dtype: int64

In [11]:
df['group'].value_counts()

treatment    147276
control      147202
Name: group, dtype: int64

In [12]:
df['converted'].value_counts()

0    259241
1     35237
Name: converted, dtype: int64

In [24]:
df.groupby('dt').agg(user_id=('user_id','count')).reset_index()

Unnamed: 0,dt,user_id
0,2017-01-02,5783
1,2017-01-03,13394
2,2017-01-04,13284
3,2017-01-05,13124
4,2017-01-06,13528
5,2017-01-07,13381
6,2017-01-08,13564
7,2017-01-09,13439
8,2017-01-10,13523
9,2017-01-11,13553


In [53]:
df.groupby(['dt','group']).agg(user_id=('user_id','count')).reset_index()

Unnamed: 0,dt,group,user_id
0,2017-01-02,control,2894
1,2017-01-02,treatment,2889
2,2017-01-03,control,6684
3,2017-01-03,treatment,6710
4,2017-01-04,control,6655
5,2017-01-04,treatment,6629
6,2017-01-05,control,6538
7,2017-01-05,treatment,6586
8,2017-01-06,control,6684
9,2017-01-06,treatment,6844


In [25]:
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted,dt
0,851104,2017-01-21 22:11:48.556739,control,old_page,0,2017-01-21
1,804228,2017-01-12 08:01:45.159739,control,old_page,0,2017-01-12
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0,2017-01-11
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0,2017-01-08
4,864975,2017-01-21 01:52:26.210827,control,old_page,1,2017-01-21


In [28]:
print('Control Group Shape ', df[df['group'] == 'control'].shape)
print('Treatment Group Shape ', df[df['group'] == 'treatment'].shape)

Control Group Shape  (147202, 6)
Treatment Group Shape  (147276, 6)


In [29]:
print('Old Page Shape ', df[df['landing_page'] == 'old_page'].shape)
print('New Page Shape ', df[df['landing_page'] == 'new_page'].shape)

Old Page Shape  (147239, 6)
New Page Shape  (147239, 6)


In [33]:
df.groupby(['group', 'landing_page']).agg(num_user=('user_id','count'),
                                          convert=('converted','sum')).reset_index()

Unnamed: 0,group,landing_page,num_user,convert
0,control,new_page,1928,234
1,control,old_page,145274,17489
2,treatment,new_page,145311,17264
3,treatment,old_page,1965,250


## Daily Active User

In [5]:
data_group=df.groupby(['group','dt']).count().reset_index()
data_group['dt'] = pd.to_datetime(data_group['dt'])
data_group

Unnamed: 0,group,dt,user_id,timestamp,landing_page,converted
0,control,2017-01-02,2894,2894,2894,2894
1,control,2017-01-03,6684,6684,6684,6684
2,control,2017-01-04,6655,6655,6655,6655
3,control,2017-01-05,6538,6538,6538,6538
4,control,2017-01-06,6684,6684,6684,6684
5,control,2017-01-07,6678,6678,6678,6678
6,control,2017-01-08,6769,6769,6769,6769
7,control,2017-01-09,6725,6725,6725,6725
8,control,2017-01-10,6745,6745,6745,6745
9,control,2017-01-11,6787,6787,6787,6787


In [6]:
import altair as alt

alt.Chart(data_group).mark_line().encode(x='dt:T', y='user_id:T', color='group:N')

## Num of converted

In [7]:
data_group2=df.groupby(['group','dt']).agg(num_convert=('converted','sum')).reset_index()
data_group2['dt'] = pd.to_datetime(data_group2['dt'])
data_group2

Unnamed: 0,group,dt,num_convert
0,control,2017-01-02,362
1,control,2017-01-03,760
2,control,2017-01-04,810
3,control,2017-01-05,801
4,control,2017-01-06,778
5,control,2017-01-07,805
6,control,2017-01-08,805
7,control,2017-01-09,806
8,control,2017-01-10,763
9,control,2017-01-11,808


In [8]:
alt.Chart(data_group2).mark_line().encode(x='dt:T', y='num_convert:T', color='group:N')

## Data Preparation

In [11]:
control = df[df['group']=='control']
treatment = df[df['group']=='treatment']

print('Control Shape', control.shape)
print('Tratment Shape', treatment.shape)

Control Shape (147202, 6)
Tratment Shape (147276, 6)


## Calculate Number of Success and Number of Observation

In [16]:
n_success_control = control['converted'].sum()
n_success_treatment = treatment['converted'].sum()

n_obs_control = control['converted'].count()
n_obs_treatment = treatment['converted'].count()

print('n_success_control :', n_success_control)
print('n_success_treatment :', n_success_treatment)
print('n_obs_control :', n_obs_control)
print('n_obs_treatment :', n_obs_treatment)

n_success_control : 17723
n_success_treatment : 17514
n_obs_control : 147202
n_obs_treatment : 147276


In [17]:
control_conversion_rate = n_success_control/n_obs_control
treatment_conversion_rate = n_success_treatment/n_obs_treatment

print('Conversion Rate from Control : {0:0.2f}%'.format(control_conversion_rate*100))
print('Conversion Rate from Treatment : {0:0.2f}%'.format(treatment_conversion_rate*100))

Conversion Rate from Control : 12.04%
Conversion Rate from Treatment : 11.89%


# Experiment

## Define Experiment

**1) What is the name of the experiment?** <br>
AB Test New Design for Landing Page Design <br>
**2) Define Hypothesis** <br>
- H0 : existing design and new design **have same** conversion rate <br>
- H1 : existing design and new design **do not have same** conversion rate

**3) Who is the participant?** <br>
The user that visit landing page <br>
**4) What variables will be tested?** <br>
 New Page & Old Page

## Define Metric

**Metric** : Conversion Rate

## Define Sample Size

In [18]:
def binomial_sample_size(metric, mde, alpha, beta):
    # standard normal distribution to determine z-values
    snd = stats.norm(0, 1)

    Z_beta = snd.ppf(1-beta)
    print('Z_beta :', Z_beta)

    Z_alpha = snd.ppf(1-alpha/2)
    print('Z_alpha :', Z_alpha)

    # average of probabilities from both groups
    p = (metric + metric+mde) / 2
    print('p :', p)
    print('\n')

    N = (2 * p * 
             (1 - p) * 
             (0.84 + 1.96)**2
             / mde**2)

    return print(f'Num of samples (at least) needed : {round(N)}')

In [45]:
# we want to set MDE = 0.02 (about 14% increment from metric)
binomial_sample_size(0.1189, 0.02, 0.05, 0.2)

Z_beta : 0.8416212335729143
Z_alpha : 1.959963984540054
p : 0.12890000000000001


Num of samples (at least) needed : 4402


**We need at least 4402 user samples** obvservation for each group. **This is just for the sake of the exercise** and we keep use all the users given (because the available datasets are intended for a/b testing purposes, so when we use all of these datasets, it will not interfere with other users who are not tested)

## Define Duration

In [46]:
print('First Session Datetime :', df['timestamp'].min())
print('Last Session Datetime :', df['timestamp'].max())
print('This experimental design run for' ,df['timestamp'].dt.day.max() - df['timestamp'].dt.day.min(), 'Days')

First Session Datetime : 2017-01-02 13:42:05.378582
Last Session Datetime : 2017-01-24 13:41:54.460509
This experimental design run for 22 Days


## Test Experiment

In [47]:
## making array
success = np.array([n_success_control,n_success_treatment])
obs = np.array([n_obs_control,n_obs_treatment])

In [48]:
stats,pvalue = sp.proportions_ztest(success,obs)

In [49]:
print('P-Value : {0:0.4f}'.format(pvalue))
if pvalue >= 0.05:
    print('Insufficient to reject H0')
else :
    print('Sufficient to reject H0')

P-Value : 0.2161
Insufficient to reject H0


There's no difference between new page and old page in terms of num of converted customer. So, new page design failed to gain more converted customer. We can use existing page or new page in for implementation (they didn't bring much difference on number of converted customer)