In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib as plt
import datetime

## Load Data

more than half of the ads_channel column from test_table are null

In [10]:
test = pd.read_csv('data/test_table.csv')
user = pd.read_csv('data/user_table.csv')
print(f'user table dimension: {user.shape}')
print(f'test table dimension: {test.shape}')
test.info()

user table dimension: (452867, 4)
test table dimension: (453321, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453321 entries, 0 to 453320
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_id           453321 non-null  int64 
 1   date              453321 non-null  object
 2   source            453321 non-null  object
 3   device            453321 non-null  object
 4   browser_language  453321 non-null  object
 5   ads_channel       181877 non-null  object
 6   browser           453321 non-null  object
 7   conversion        453321 non-null  int64 
 8   test              453321 non-null  int64 
dtypes: int64(3), object(6)
memory usage: 31.1+ MB


user_table is a complete table

In [11]:
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452867 entries, 0 to 452866
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user_id  452867 non-null  int64 
 1   sex      452867 non-null  object
 2   age      452867 non-null  int64 
 3   country  452867 non-null  object
dtypes: int64(2), object(2)
memory usage: 13.8+ MB


In [29]:
print(f'test table looks like \n{test.head()}')
print(f'user table looks like \n{user.head()}')
print(f'\nlooks like user table contains demographics for users one row per user')
user['user_id'].groupby(user.user_id).value_counts().sort_values(ascending=False)

test table looks like 
   user_id        date  source  device browser_language ads_channel  \
0   315281  2015-12-03  Direct     Web               ES         NaN   
1   497851  2015-12-04     Ads     Web               ES      Google   
2   848402  2015-12-04     Ads     Web               ES    Facebook   
3   290051  2015-12-03     Ads  Mobile            Other    Facebook   
4   548435  2015-11-30     Ads     Web               ES      Google   

       browser  conversion  test  
0           IE           1     0  
1           IE           0     1  
2       Chrome           0     0  
3  Android_App           0     1  
4      FireFox           0     1  
user table looks like 
   user_id sex  age    country
0   765821   M   20     Mexico
1   343561   F   27  Nicaragua
2   118744   M   23   Colombia
3   987753   F   27  Venezuela
4   554597   F   20      Spain

looks like user table contains demographics for users one row per user


user_id  user_id
1        1          1
666601   666601     1
666628   666628     1
666626   666626     1
666625   666625     1
                   ..
333083   333083     1
333082   333082     1
333081   333081     1
333078   333078     1
1000000  1000000    1
Name: user_id, Length: 452867, dtype: int64

## Data Quality

In [28]:
print(f'unique user count in test table: {test.user_id.nunique()}')
print(f'unique user count in user table: {user.user_id.nunique()}')
print(f'we are missing {test.user_id.nunique() - user.user_id.nunique()} users demographic information in the user table')

unique user count in test table: 453321
unique user count in user table: 452867
we are missing 454 users demographic information in the user table


since we don't have these 454 people's demographics, we need to remove their test results from the test table

In [36]:
data = test.merge(user, on = ['user_id'])
data['date'] = pd.to_datetime(data['date'])
print(data.info())
print(data.describe())
print(data.describe(include = object))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 452867 entries, 0 to 452866
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   user_id           452867 non-null  int64         
 1   date              452867 non-null  datetime64[ns]
 2   source            452867 non-null  object        
 3   device            452867 non-null  object        
 4   browser_language  452867 non-null  object        
 5   ads_channel       181693 non-null  object        
 6   browser           452867 non-null  object        
 7   conversion        452867 non-null  int64         
 8   test              452867 non-null  int64         
 9   sex               452867 non-null  object        
 10  age               452867 non-null  int64         
 11  country           452867 non-null  object        
dtypes: datetime64[ns](1), int64(4), object(7)
memory usage: 44.9+ MB
None
              user_id     conversion        

## Q1: is test result truely negative?

In [41]:
# control group
control_result = data.query('test == 0').groupby('country')['conversion'].mean().sort_values(ascending = False)
print(f' control group\n {control_result}')
# test group
test_result = data.query('test == 1').groupby('country')['conversion'].mean().sort_values(ascending = False)
print(f' test group\n {test_result}')

 control group
 country
Spain          0.079719
El Salvador    0.053554
Nicaragua      0.052647
Costa Rica     0.052256
Colombia       0.052089
Honduras       0.050906
Guatemala      0.050643
Venezuela      0.050344
Peru           0.049914
Mexico         0.049495
Bolivia        0.049369
Ecuador        0.049154
Paraguay       0.048493
Chile          0.048107
Panama         0.046796
Argentina      0.015071
Uruguay        0.012048
Name: conversion, dtype: float64
 test group
 country
Costa Rica     0.054738
Nicaragua      0.054177
Chile          0.051295
Mexico         0.051186
Peru           0.050604
Colombia       0.050571
Panama         0.049370
Paraguay       0.049229
Ecuador        0.048988
Venezuela      0.048978
Guatemala      0.048647
El Salvador    0.047947
Bolivia        0.047901
Honduras       0.047540
Argentina      0.013725
Uruguay        0.012907
Name: conversion, dtype: float64


## t-test 

The indepentent T-test is a parametric test used to test for a statistically significant difference in the means between 2 groups. As with all parametric tests, there are certain conditions that need to be met in order for the test results to be considered reliable.
1. Population distributions are normal
2. Samples have equal variances
3. The two samples are independent

nothing changed for Spanish users, so we will remove Spain from data

In [70]:
# pair or independent?
import statistics
data = test.merge(user, on = ['user_id'])
data['date'] = pd.to_datetime(data['date'])
# remove Spain from data 
data = data.query('country != "Spain"')
group_test = data[data['test'] == 1]
group_control = data[data['test'] == 0]
pair = group_test.merge(group_control, on = ['user_id'])
print(f'there is {pair.shape[0]} pair user in both test and control group.')
var_test = statistics.variance(group_test['conversion'])
var_control = statistics.variance(group_control['conversion'])
print(f'test group variance: {var_test}')
print(f'control group variance: {var_control}')
print(f'test group size: {group_test.shape[0]}')
print(f'control group size: {group_control.shape[0]}')

there is 0 pair user in both test and control group.
test group variance: 0.04152682517533804
control group variance: 0.045959941537429036
test group size: 215774
control group size: 185311


In [76]:
import researchpy as rp
import scipy.stats as stats

# test t-test assumption: samples have equal variance
stats, pvalue = stats.levene(data['conversion'][data['test'] == 0],
             data['conversion'][data['test'] == 1])
print(f'levene test statistics: {stats}, p-value: {pvalue}\n')
print(f'it seems like the two groups do not have equal variance\n')
print(f'however, Two sample t-test is relatively robust to the assumption of normality and homogeneity of variances when sample size is large (n ≥ 30) and there are equal number of samples (n1 = n2) in both groups ')

levene test statistics: 54.497646998915, p-value: 1.5593292774404536e-13

it seems like the two groups do not have equal variance

however, Two sample t-test is relatively robust to the assumption of normality and homogeneity of variances when sample size is large (n ≥ 30) and there are equal number of samples (n1 = n2) in both groups 


In [108]:
summary, results = rp.ttest(group1 = data['conversion'][data['test']==0], group1_name = 'control',
         group2 = data['conversion'][data['test']==1], group2_name = 'test')
print(summary)
print(results) 
print('\n')
p_value = results['results'][5]
print(f'p value: {p_value}')
print(f'control group is converting {summary.Mean[0]*100:.1f}% on average, while test group {summary.Mean[1]*100:.1f}%.')
print(f'the difference (control - test) < 0 p value obtained from the t-test is significant, and therefore, we conclude that the conversion rate of test group is significantly different than control group\n')

   Variable         N      Mean        SD        SE  95% Conf.  Interval
0   control  185311.0  0.048292  0.214383  0.000498   0.047316  0.049268
1      test  215774.0  0.043411  0.203781  0.000439   0.042551  0.044271
2  combined  401085.0  0.045666  0.208760  0.000330   0.045020  0.046312
               Independent t-test      results
0  Difference (control - test) =        0.0049
1           Degrees of freedom =   401083.0000
2                            t =        7.3823
3        Two side test p value =        0.0000
4       Difference < 0 p value =        1.0000
5       Difference > 0 p value =        0.0000
6                    Cohen's d =        0.0234
7                    Hedge's g =        0.0234
8                Glass's delta =        0.0228
9                  Pearson's r =        0.0117


p value: 0.0
control group is converting 4.8% on average, while test group 4.3%.
the difference (control - test) < 0 p value obtained from the t-test is significant, and therefore, we concl

## Q2. What can go wrong in a A/B test

maybe there has been bias in the experiment so that the test/control group are not really random.
We can check every single variable to see if it has same distribution across test and control group

In [123]:
data.shape

(401085, 12)

In [140]:
Ads_test = data.loc[ (data['source'] == 'Ads') & (data['test'] == 1)].shape[0]
Direct_test = data.loc[ (data['source'] == 'Direct') & (data['test'] == 1)].shape[0]
SEO_test = data.loc[ (data['source'] == 'SEO') & (data['test'] == 1)].shape[0]
print(f'Ads in test group: {Ads_test}')
print(f'Direct in test group: {Direct_test}')
print(f'SEO in test group: {SEO_test}')
print('\n')
freq_source = data.groupby(['source', 'test'])['test'].agg(
    freq = 'count'
) 
print(freq_source / freq_source.groupby('test').sum())

Ads in test group: 86448
Direct in test group: 43047
SEO in test group: 86279


                 freq
source test          
Ads    0     0.401228
       1     0.400641
Direct 0     0.200949
       1     0.199500
SEO    0     0.397823
       1     0.399858


In [141]:
data.describe(include = object)

Unnamed: 0,source,device,browser_language,ads_channel,browser,sex,country
count,401085,401085,401085,160800,401085,401085,401085
unique,3,2,3,5,7,2,16
top,Ads,Web,ES,Facebook,Android_App,M,Mexico
freq,160800,222479,334014,60453,137355,234080,128484


In [142]:
freq_device = data.groupby(['device', 'test'])['test'].agg(
    freq = 'count'
) 
print(freq_device / freq_device.groupby('test').sum())

freq_browser_language = data.groupby(['browser_language', 'test'])['test'].agg(
    freq = 'count'
) 
print(freq_browser_language / freq_browser_language.groupby('test').sum())

freq_ads_channel = data.groupby(['ads_channel', 'test'])['test'].agg(
    freq = 'count'
) 
print(freq_ads_channel / freq_ads_channel.groupby('test').sum())

freq_browser = data.groupby(['browser', 'test'])['test'].agg(
    freq = 'count'
) 
print(freq_browser / freq_browser.groupby('test').sum())

freq_sex = data.groupby(['sex', 'test'])['test'].agg(
    freq = 'count'
) 
print(freq_sex / freq_sex.groupby('test').sum())

freq_country = data.groupby(['country', 'test'])['test'].agg(
    freq = 'count'
) 
print(freq_country / freq_country.groupby('test').sum())


                 freq
device test          
Mobile 0     0.444388
       1     0.446096
Web    0     0.555612
       1     0.553904
                           freq
browser_language test          
EN               0     0.139101
                 1     0.139526
ES               0     0.833280
                 1     0.832343
Other            0     0.027618
                 1     0.028131
                      freq
ads_channel test          
Bing        0     0.074847
            1     0.075282
Facebook    0     0.374516
            1     0.377186
Google      0     0.375551
            1     0.374537
Other       0     0.023079
            1     0.022326
Yahoo       0     0.152007
            1     0.150669
                      freq
browser     test          
Android_App 0     0.341307
            1     0.343447
Chrome      0     0.223273
            1     0.225954
FireFox     0     0.089719
            1     0.089719
IE          0     0.136964
            1     0.135410
Iphone_App  0     

as you can see from above frequency comparison, Argentina and Uruguay are distributed unevenly across test and control group

## Conclusion

In [144]:
no_Argentina_Uruguay_summary, no_Argentina_Uruguay_results = rp.ttest(
         group1 = data['conversion'][(data['test']==0) & (data['country'] != 'Argentina') & (data['country'] != 'Uruguay')], 
         group1_name = 'control',
         group2 = data['conversion'][(data['test']==1) & (data['country'] != 'Argentina') & (data['country'] != 'Uruguay')], 
         group2_name = 'test')
print(no_Argentina_Uruguay_summary)
print(no_Argentina_Uruguay_results) 
print('\n')
new_p_value = no_Argentina_Uruguay_results['results'][5]
print(f'p value: {new_p_value}')
#print(f'control group is converting {no_Argentina_Uruguay_summary.Mean[0]*100:.1f}% on average, while test group {no_Argentina_Uruguay_summary.Mean[1]*100:.1f}%.')
#print(f'the difference (control - test) < 0 p value obtained from the t-test is significant, and therefore, w

   Variable         N      Mean        SD        SE  95% Conf.  Interval
0   control  175540.0  0.050148  0.218251  0.000521   0.049127  0.051169
1      test  174678.0  0.050413  0.218796  0.000524   0.049387  0.051439
2  combined  350218.0  0.050280  0.218523  0.000369   0.049556  0.051004
               Independent t-test      results
0  Difference (control - test) =       -0.0003
1           Degrees of freedom =   350216.0000
2                            t =       -0.3583
3        Two side test p value =        0.7201
4       Difference < 0 p value =        0.3600
5       Difference > 0 p value =        0.6400
6                    Cohen's d =       -0.0012
7                    Hedge's g =       -0.0012
8                Glass's delta =       -0.0012
9                  Pearson's r =        0.0006


p value: 0.64
