In [77]:
import pandas as pd
import numpy as np

In [78]:
df_base = pd.read_csv("data/baseline_values.csv", index_col=False, header=None,
                      names=['name', 'baseline_value'])
# set metric names to lower case
df_base['name']=df_base['name'].map(lambda x: x.lower())


In [79]:
# Rename for easier read
df_base['metric'] = ['cookies', 'clicks', 'user-ids', 'ctp', 'gross conversion', 'retention', 'net conversion']
df_base['dmin'] = [3000, 240, -50, 0.01, -0.01, 0.01, 0.0075]
df_base

Unnamed: 0,name,baseline_value,metric,dmin
0,unique cookies to view course overview page pe...,40000.0,cookies,3000.0
1,"unique cookies to click ""start free trial"" per...",3200.0,clicks,240.0
2,enrollments per day:,660.0,user-ids,-50.0
3,"click-through-probability on ""start free trial"":",0.08,ctp,0.01
4,"probability of enrolling, given click:",0.20625,gross conversion,-0.01
5,"probability of payment, given enroll:",0.53,retention,0.01
6,"probability of payment, given click",0.109313,net conversion,0.0075


### 3. Measure of variability in metrics
3.1 Scaling

In [80]:
n_sample = 5000
n_cookie = df_base.loc[df_base['metric']=='cookies', 'baseline_value'].iloc[0]
scaling_factor = n_sample/n_cookie

# create a new column to store scaled baseline values
df_base['scaled_n']= np.nan
df_base['scaled_n'] = (df_base['baseline_value'] * scaling_factor).where(df_base['metric'].isin(['cookies', 'clicks', 'user-ids']), np.nan) 
df_base 

Unnamed: 0,name,baseline_value,metric,dmin,scaled_n
0,unique cookies to view course overview page pe...,40000.0,cookies,3000.0,5000.0
1,"unique cookies to click ""start free trial"" per...",3200.0,clicks,240.0,400.0
2,enrollments per day:,660.0,user-ids,-50.0,82.5
3,"click-through-probability on ""start free trial"":",0.08,ctp,0.01,
4,"probability of enrolling, given click:",0.20625,gross conversion,-0.01,
5,"probability of payment, given enroll:",0.53,retention,0.01,
6,"probability of payment, given click",0.109313,net conversion,0.0075,


3.2 Standard error

$SE=\sqrt{\frac{\hat p(1-\hat p)}{n}}$

In [81]:
def stderr(m_n, m_p):
    #calculate standard error of the population
    #Start with df_base. m_n = metric of unit of diversion, m_p = metric of probability 
    f1 = df_base[df_base['metric'] == m_n]
    if f1.empty:
        raise ValueError(f'Metric {m_n} not found.')

    f2 = df_base[df_base['metric'] == m_p]
    if f2.empty:
        raise ValueError(f'Metric {m_p} not found.')
    
    n = f1['scaled_n'].iloc[0]
    p = f2['baseline_value'].iloc[0]
    return (p * (1 - p) / n) ** 0.5


# Gross Conversion (clicks)
se_gross_conv = stderr('clicks', 'gross conversion')

# Retention (enrolls)
se_retention = stderr('user-ids', 'retention')

# Net conversion (clicks)
se_net_conv = stderr('clicks', 'net conversion')

print(f'Gross Conversion (clicks): {se_gross_conv: .4f}, Retenstion (enrolls): {se_retention:.4f}, Net conversion (clicks): {se_net_conv:.4f}')

Gross Conversion (clicks):  0.0202, Retenstion (enrolls): 0.0549, Net conversion (clicks): 0.0156


In [82]:
# create a new column to store standard error for evaluation metrics
df_base['se'] = [np.nan, np.nan, np.nan, np.nan, round(se_gross_conv, 4), round(se_retention, 4), round(se_net_conv, 4)]
df_base

Unnamed: 0,name,baseline_value,metric,dmin,scaled_n,se
0,unique cookies to view course overview page pe...,40000.0,cookies,3000.0,5000.0,
1,"unique cookies to click ""start free trial"" per...",3200.0,clicks,240.0,400.0,
2,enrollments per day:,660.0,user-ids,-50.0,82.5,
3,"click-through-probability on ""start free trial"":",0.08,ctp,0.01,,
4,"probability of enrolling, given click:",0.20625,gross conversion,-0.01,,0.0202
5,"probability of payment, given enroll:",0.53,retention,0.01,,0.0549
6,"probability of payment, given click",0.109313,net conversion,0.0075,,0.0156


3.3 Determine sample size

Calculate the sample size for the selected evaluation metrics, use alpha = 0.05 and beta = 0.2.

In [93]:
from statsmodels.stats.power import zt_ind_solve_power
import math 
alpha=0.05
beta =0.2

In [84]:
def spl_size(m):
    # Calculate sample size.
    # Start with df_base. m = metric for prob and mde 
    f3 = df_base[df_base['metric'] == m]
    if f3.empty:
        raise ValueError(f'Metric {m} not found.')
    p = f3['baseline_value'].iloc[0]
    mde = f3['dmin'].iloc[0]

    # pooled probability of p1 and p2
    pooled_prob = (p+(p+abs(mde)))/2
    # calculate effective size using minimum detectable effect and pooled proability
    effect_size = abs(mde)/((pooled_prob*(1-pooled_prob))**0.5)
    return 2*zt_ind_solve_power(effect_size=effect_size, alpha=alpha, power=1-beta, alternative='two-sided')

In [None]:
# clicks for clicks for gross conversion
n_sample_gross_conv = spl_size('gross conversion')
# Enrollment for clicks for retention
n_sample_retention = spl_size('retention')
# clicks for clicks for net conversion
n_sample_net_conv = spl_size('net conversion')

print(f'Clicks needed for Gross Conversion: {n_sample_gross_conv: .0f}, Retenstion: {n_sample_retention:.0f}, Net conversion: {n_sample_net_conv:.0f}')


Clicks needed for Gross Conversion:  52312, Retenstion: 78104, Net conversion: 55970


Use the sample size calculator (https://www.evanmiller.org/ab-testing/sample-size.html)

Eg Gross Coversion:

Given Baseline Value =  20.625%, and Minimum Detectable Effect = 0.01, using the sample size calculator, the sample size for one group for the number of clicks on "start free trial" is 25,835. Since our A/B testing experiment have one control group and one experiment group, the sample size for the clicks n = 25835*2=51,670.

- Clicks for Gross Conversion = 25835*2=51670
- Enrollment for Retention = 39115*2=78230
- Clicks for Net Conversion = 27413*2=54826

Set the pageviews based on number of clicks. Pageviews = 40000, clicks = 3200, so the ratio of clicks/pageviews = 3200/40000=0.08.
Set the pageviews based on number of enrollments. Pageviews = 40000, enrollments = 660, so the ratio of enrollments/pageviews = 660/40000=0.0165.

- Pageviews for Gross Conversion = 51670/0.08 = 645875
- Pageviews for Retention = 78230/0.0165 = 4741212
- Pageviews for Net Conversion = 54826/0.08 = 685325

In order to test for all three metrics, use the maximum number of pageviews, 4741212.

3.4 Experiment duration and exposure

What fraction of Udacity's traffic would you divert to this experiment?

If we divert 100% of the traffic, given 40,000 pageviews per day, the experiment will take 119 days. This is too long for an experiment. If we eliminate retention and only care about gross conversion and net conversion, and divert 100% of the traffic, we only need 18 days. If we divert 80% of traffic to the experiment, we need 22 days.

In [94]:
n_pageviews = 4741212
duration = n_pageviews/n_cookie
print(math.ceil(duration))

119


In [95]:
n_pageviews = 685325
duration = n_pageviews/n_cookie
print(math.ceil(duration))

18


In [96]:
n_pageviews = 685325
duration = n_pageviews/(n_cookie*0.8)
print(math.ceil(duration))

22


### 4. Sanity Check
For each metric that you chose as an invariant metric, compute a 95% confidence interval for the value you expect to observe. Enter the upper and lower bounds, and the observed value, all to 4 decimal places.

In [97]:
df_cont = pd.read_csv('data/control.csv')
df_cont

Unnamed: 0,Date,Pageviews,Clicks,Enrollments,Payments
0,"Sat, Oct 11",7723,687,134.0,70.0
1,"Sun, Oct 12",9102,779,147.0,70.0
2,"Mon, Oct 13",10511,909,167.0,95.0
3,"Tue, Oct 14",9871,836,156.0,105.0
4,"Wed, Oct 15",10014,837,163.0,64.0
5,"Thu, Oct 16",9670,823,138.0,82.0
6,"Fri, Oct 17",9008,748,146.0,76.0
7,"Sat, Oct 18",7434,632,110.0,70.0
8,"Sun, Oct 19",8459,691,131.0,60.0
9,"Mon, Oct 20",10667,861,165.0,97.0


In [98]:
df_exp = pd.read_csv('data/experiment.csv')
df_exp

Unnamed: 0,Date,Pageviews,Clicks,Enrollments,Payments
0,"Sat, Oct 11",7716,686,105.0,34.0
1,"Sun, Oct 12",9288,785,116.0,91.0
2,"Mon, Oct 13",10480,884,145.0,79.0
3,"Tue, Oct 14",9867,827,138.0,92.0
4,"Wed, Oct 15",9793,832,140.0,94.0
5,"Thu, Oct 16",9500,788,129.0,61.0
6,"Fri, Oct 17",9088,780,127.0,44.0
7,"Sat, Oct 18",7664,652,94.0,62.0
8,"Sun, Oct 19",8434,697,120.0,77.0
9,"Mon, Oct 20",10496,860,153.0,98.0


In [99]:
# compute the total of cookies, clicks, enrollments, and payments
sums = {'control': pd.Series([df_cont['Pageviews'].sum(), df_cont['Clicks'].sum(), df_cont['Enrollments'].sum(), df_cont['Payments'].sum()],
                             index = ['cookies','clicks','enrollments','payments']),
        'experiment': pd.Series([df_exp['Pageviews'].sum(), df_exp['Clicks'].sum(), df_exp['Enrollments'].sum(), df_exp['Payments'].sum()],
                             index = ['cookies','clicks','enrollments','payments']),
        }

df_sums = pd.DataFrame(sums)
df_sums

Unnamed: 0,control,experiment
cookies,345543.0,344660.0
clicks,28378.0,28325.0
enrollments,3785.0,3423.0
payments,2033.0,1945.0


Count metrics (invariant metrics): expect the total number of cookies and clicks in the control group and the experiment group to be 50% each. SE = sqrt(p(1-p)/n)

In [103]:
df_sums['total']=df_sums['control']+df_sums['experiment']
df_sums['prob']=0.5
# stderr = sqrt(p(1-p)/(n_cont+n_exp))
df_sums['stderr']=np.sqrt((df_sums['prob']*(1-df_sums['prob']))/df_sums['total'])
df_sums['CI_lower']=df_sums['prob']-1.96*df_sums['stderr']
df_sums['CI_upper']=df_sums['prob']+1.96*df_sums['stderr']
df_sums['observed_p']=round(df_sums['experiment']/df_sums['total'],4)
df_sums['Pass Sanity']=df_sums.apply(lambda x: (x.observed_p > x.CI_lower) and (x.observed_p<x.CI_upper), axis=1)
df_sums['d'] = abs((df_sums['control']-df_sums['experiment'])/df_sums['total'])
df_sums

Unnamed: 0,control,experiment,total,prob,stderr,CI_lower,CI_upper,observed_p,Pass Sanity,d
cookies,345543.0,344660.0,690203.0,0.5,0.000602,0.49882,0.50118,0.4994,True,0.001279
clicks,28378.0,28325.0,56703.0,0.5,0.0021,0.495884,0.504116,0.4995,True,0.000935
enrollments,3785.0,3423.0,7208.0,0.5,0.005889,0.488457,0.511543,0.4749,False,0.050222
payments,2033.0,1945.0,3978.0,0.5,0.007928,0.484462,0.515538,0.4889,True,0.022122


To perform sanity check for the Click Through Probability, we would expect that the difference between the two groups be zero.
![standard error for two proportions](images/stderr_calc.png)

In [125]:
# CTP = # of clicks/# of pageviews
df_ctp = pd.DataFrame(
    {
        'ctp_cont': [df_sums['control'].loc['clicks']/df_sums['control'].loc['cookies']],
        'ctp_exp': [df_sums['experiment'].loc['clicks']/df_sums['experiment'].loc['cookies']],
        'n_cont':[df_sums['control'].loc['clicks']]
    }, index=['ctp']
)
df_ctp['se']=np.sqrt(df_ctp['ctp_cont']*(1-df_ctp['ctp_cont'])/df_ctp['n_cont'])
df_ctp['CI_lower'] = df_ctp['ctp_cont'] - 1.96*df_ctp['se']
df_ctp['CI_upper'] = df_ctp['ctp_cont'] + 1.96*df_ctp['se']
df_ctp['Pass Sanity'] = df_ctp.apply(lambda x: (x['ctp_exp']> x['CI_lower']) and (x['ctp_exp']< x['CI_upper']), axis=1)
df_ctp

Unnamed: 0,ctp_cont,ctp_exp,n_cont,se,CI_lower,CI_upper,Pass Sanity
ctp,0.082126,0.082182,28378.0,0.00163,0.078931,0.08532,True


In [123]:
# CTP = # of clicks/# of pageviews
df_ctp = pd.DataFrame(
    {
        'ctp_cont': [df_sums['control'].loc['clicks']/df_sums['control'].loc['cookies']],
        'ctp_exp': [df_sums['experiment'].loc['clicks']/df_sums['experiment'].loc['cookies']],
        'n_cont':[df_sums['control'].loc['clicks']],
        'n_exp':[df_sums['experiment'].loc['clicks']]
    }, index=['ctp']
)
df_ctp['se']=np.sqrt(df_ctp['ctp_cont']*(1-df_ctp['ctp_cont'])/df_ctp['n_cont'] + df_ctp['ctp_exp']*(1-df_ctp['ctp_exp'])/df_ctp['n_exp'])
df_ctp['CI_lower'] = 0 - 1.96*df_ctp['se']
df_ctp['CI_upper'] = 0 + 1.96*df_ctp['se']
df_ctp['observed_p'] = df_ctp['ctp_exp'] - df_ctp['ctp_cont']
df_ctp['Pass Sanity'] = df_ctp.apply(lambda x: (x['observed_p']> x['CI_lower']) and (x['observed_p']< x['CI_upper']), axis=1)
df_ctp

Unnamed: 0,ctp_cont,ctp_exp,n_cont,n_exp,se,CI_lower,CI_upper,observed_p,Pass Sanity
ctp,0.082126,0.082182,28378.0,28325.0,0.002306,-0.00452,0.00452,5.7e-05,True


### 5. Effect Size Tests
Evaluation metrics: As we use gross conversion and net conversion as the evaluation metrics, we know that d_min=0.01 for gross conversion, and d_min=0.075 for net conversion.

In [None]:
# Compute the observed difference of GC and NC after excluding nan values
df_cont_e = df_cont.dropna(subset=['Enrollments'])
df_exp_e = df_exp.dropna(subset=['Enrollments'])
sums_e = {'control': pd.Series([df_cont_e['Pageviews'].sum(), df_cont_e['Clicks'].sum(), df_cont_e['Enrollments'].sum(), df_cont_e['Payments'].sum()],
                             index = ['cookies','clicks','enrollments','payments']),
        'experiment': pd.Series([df_exp_e['Pageviews'].sum(), df_exp_e['Clicks'].sum(), df_exp_e['Enrollments'].sum(), df_exp_e['Payments'].sum()],
                             index = ['cookies','clicks','enrollments','payments']),
        }

df_sums_e = pd.DataFrame(sums_e)
df_sums_e['total']=df_sums_e['control']+df_sums_e['experiment']
df_sums_e


Unnamed: 0,control,experiment,total
cookies,212163.0,211362.0,423525.0
clicks,17293.0,17260.0,34553.0
enrollments,3785.0,3423.0,7208.0
payments,2033.0,1945.0,3978.0


In [135]:
# control values
click_cont = df_sums_e.loc['clicks'].control
enroll_cont = df_sums_e.loc['enrollments'].control
payment_cont = df_sums_e.loc['payments'].control

# experiment values
click_exp = df_sums_e.loc['clicks'].experiment
enroll_exp = df_sums_e.loc['enrollments'].experiment
payment_exp = df_sums_e.loc['payments'].experiment

# evaluation metrics
gross_conv_cont = enroll_cont/click_cont
gross_conv_exp = enroll_exp/click_exp

net_conv_cont = payment_cont/click_cont
net_conv_exp = payment_exp/click_exp

gross_conv_pool = (enroll_cont+enroll_exp)/(click_cont+click_exp)
net_conv_pool = (payment_cont+payment_exp)/(click_cont+click_exp)

print('Pooled Gross Conversion: {} \n Pooled Net Conversion:{}'.format(gross_conv_pool,net_conv_pool))


Pooled Gross Conversion: 0.20860706740369866 
 Pooled Net Conversion:0.1151274853124186


In [152]:
def stats(p, n_cont, n_exp, zscore, diff):
    # output standard error, margin, and confidence interval
    # p: pooled probabiliry
    # n_cont, n_exp: number of units in control and experiment
    # diff: evaluation metric different between experiment and control
    stderr = np.sqrt(p*(1-p)*(1/n_cont+1/n_exp))
    ci_lower = diff - zscore*stderr
    ci_upper = diff + zscore*stderr
    return stderr, ci_lower, ci_upper

In [186]:
dmin_gc = -0.01
dmin_nc = 0.0075

zscore = 1.96

In [172]:
d_obs_gc = gross_conv_exp - gross_conv_cont
se_gross, cil_gross, ciu_gross = stats(gross_conv_pool, click_cont, click_exp, zscore, d_obs_gc)

print(click_cont, click_exp, gross_conv_pool, d_obs_gc, se_gross, cil_gross, ciu_gross)

17293.0 17260.0 0.20860706740369866 -0.020554874580361565 0.004371675385225936 -0.0291233583354044 -0.01198639082531873


In [173]:
d_obs_nc = net_conv_exp - net_conv_cont
se_net, cil_net, ciu_net = stats(net_conv_pool, click_cont, click_exp, zscore, d_obs_nc)

print(click_cont, click_exp, net_conv_pool, d_obs_nc, se_net, cil_net, ciu_net)

17293.0 17260.0 0.1151274853124186 -0.0048737226745441675 0.0034341335129324238 -0.011604624359891718 0.001857179010803383


In [187]:
# compute a dataframe to contain the evaluation metrics
effect_size = pd.DataFrame(columns=['dmin_practical','observed diff','CI_lower', 'CI_upper', 'Practically Significant','Statistically Significant'], 
                           index=['Gross Conversion', 'Net Conversion'])

effect_size['dmin_practical'] = [dmin_gc, dmin_nc]
effect_size['observed diff'] = [d_obs_gc, d_obs_nc]
effect_size['CI_lower'] = [cil_gross, cil_net]
effect_size['CI_upper'] = [ciu_gross, ciu_net]
effect_size

Unnamed: 0,dmin_practical,observed diff,CI_lower,CI_upper,Practically Significant,Statistically Significant
Gross Conversion,-0.01,-0.020555,-0.029123,-0.011986,,
Net Conversion,0.0075,-0.004874,-0.011605,0.001857,,


- An evaluation metric is statistically significant if the CI does not include 0 (that is, you can be confident there was a change)
- An evaluation metric is practically significant if the CI does not include the practical significance boundary (that is, you can be confident there is a change that matters to the business.)

In [190]:
effect_size['Practically Significant'] = np.where((effect_size['dmin_practical']>effect_size['CI_lower']) & 
                                                  (effect_size['dmin_practical']<effect_size['CI_upper']), 
                                                  'No', 'Yes')
effect_size['Statistically Significant']= np.where((effect_size['CI_lower']<0 ) &
                                                   (effect_size['CI_upper']>0),
                                                   'No', 'Yes')
effect_size

Unnamed: 0,dmin_practical,observed diff,CI_lower,CI_upper,Practically Significant,Statistically Significant
Gross Conversion,-0.01,-0.020555,-0.029123,-0.011986,Yes,Yes
Net Conversion,0.0075,-0.004874,-0.011605,0.001857,Yes,No


Gross Conversion is both Statistically and Practically significant while Net Conversion is only Practically Significant.


### 6. Sign Test
Run a sign test on each of your evaluation metrics using the day-by-data data. Enter each p-value, and indicate whether each result is statistically significant.

In [None]:
# If a unit has higher gross conversion in Experiment than in Control , it is a correct sign.
df_sign = pd.merge(df_cont_e[['Date']], df_exp_e[['Date']], on='Date') 
df_sign['GC_cont'] = df_cont_e['Enrollments']/df_cont_e['Clicks']
df_sign['GC_exp'] = df_exp_e['Enrollments']/df_exp_e['Clicks']
df_sign['GC_diff'] = df_sign['GC_exp'] - df_sign['GC_cont']

df_sign['NC_cont'] = df_cont_e['Payments']/df_cont_e['Clicks']
df_sign['NC_exp'] = df_exp_e['Payments']/df_exp_e['Clicks']
df_sign['NC_diff'] = df_sign['NC_exp'] - df_sign['NC_cont']

df_sign

Unnamed: 0,Date,GC_cont,GC_exp,GC_diff,NC_cont,NC_exp,NC_diff
0,"Sat, Oct 11",0.195051,0.153061,-0.04199,0.101892,0.049563,-0.05233
1,"Sun, Oct 12",0.188703,0.147771,-0.040933,0.089859,0.115924,0.026065
2,"Mon, Oct 13",0.183718,0.164027,-0.019691,0.10451,0.089367,-0.015144
3,"Tue, Oct 14",0.186603,0.166868,-0.019735,0.125598,0.111245,-0.014353
4,"Wed, Oct 15",0.194743,0.168269,-0.026474,0.076464,0.112981,0.036517
5,"Thu, Oct 16",0.167679,0.163706,-0.003974,0.099635,0.077411,-0.022224
6,"Fri, Oct 17",0.195187,0.162821,-0.032367,0.101604,0.05641,-0.045194
7,"Sat, Oct 18",0.174051,0.144172,-0.029879,0.110759,0.095092,-0.015667
8,"Sun, Oct 19",0.18958,0.172166,-0.017414,0.086831,0.110473,0.023643
9,"Mon, Oct 20",0.191638,0.177907,-0.013731,0.11266,0.113953,0.001294


Link to p-value for binomial test: https://www.graphpad.com/quickcalcs/binomial1/

In [200]:
print('For Gross conversion, experiment group is higher than the control group for', len(df_sign[df_sign.GC_diff > 0]),
      'out of', len(df_sign))
print('p-value is 0.0026, which is significant.')

For Gross conversion, experiment group is higher than the control group for 4 out of 23
p-value for 0.17391304347826086 is 


In [203]:
print('For Net conversion, experiment group is higher than the control group for', len(df_sign[df_sign.NC_diff > 0]),
      'out of', len(df_sign))
print('p-value is 0.6776, which is not significant.')

For Net conversion, experiment group is higher than the control group for 10 out of 23
p-value is 0.6776, which is not significant.
