In [1]:
import numpy as np
import statsmodels.stats.api as sms
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as smf
import random

In [2]:
# Location of the experiment export, then load it into the working frame `df`.
infile = "exp_data_3.csv"
df = pd.read_csv(filepath_or_buffer=infile)

In [3]:
# Preview the first 100 rows to sanity-check the columns that were loaded.
df.head(100)

Unnamed: 0,user,click,like,age,gender,friend_cnt,sns_like_cnt,sns_comment_cnt,pre_click,treat
0,1,1,0,34,2,230,209,320,1,0
1,4,0,0,26,2,168,46,113,0,0
2,5,0,0,24,2,105,7,30,0,1
3,6,0,0,39,1,167,23,35,0,0
4,7,0,0,39,2,443,24,155,0,1
...,...,...,...,...,...,...,...,...,...,...
95,137,0,0,27,2,485,61,53,0,1
96,138,0,0,34,2,387,1764,1983,0,1
97,139,0,0,30,2,555,30,318,0,0
98,140,0,0,28,2,202,105,328,0,0


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(7999, 10)

### **1. CUPED**
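CUPED (Controlled-experiment Using Pre-Experiment Data) was introduced in "Improving the Sensitivity of Online Controlled Experiments by Utilizing Pre-Experiment Data" (Deng, Xu, Kohavi, & Walker, 2013).  
It removes the variance in a metric that can be accounted for by pre-experiment information.  
Control variate: a pre-experiment covariate $X$ (here `pre_click`). The adjusted metric is $Y_{\text{cuped}} = Y - \theta X$ with $\theta = \mathrm{Cov}(X, Y) / \mathrm{Var}(X)$.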


#### 1.1 Define CUPED

In [5]:
# X is the pre-experiment covariate `pre_click`; ddof=1 gives the sample variance.
var_x = np.var(df['pre_click'], ddof=1)
print(var_x)

0.05154226423839803


In [7]:
# np.cov returns the variance-covariance matrix of (pre_click, click);
# the off-diagonal entry [0, 1] is Cov(X, Y).
cov_xy = np.cov(df['pre_click'], df['click'], ddof=1)[0, 1]
print(cov_xy)

0.03190474453217632


In [8]:
# CUPED coefficient: theta = Cov(X, Y) / Var(X).
theta = cov_xy / var_x
# Store the coefficient on every row so it travels with the frame.
df['theta'] = theta
print(theta)

0.6190016097198905


In [15]:
# Different X: repeat the theta calculation with `age` as the covariate, for
# comparison only. The results go into their own *_age names so that `var_x`,
# `cov_xy` and `theta` from the pre_click cells are not overwritten -- the
# CUPED adjustment below uses `theta`, and rebinding it here would silently
# switch that adjustment to the age coefficient on a top-to-bottom run.
var_x_age = np.var(df['age'], ddof=1)
cov_xy_age = np.cov(df['age'], df['click'], ddof=1)[0, 1]
theta_age = cov_xy_age / var_x_age
print(theta_age)

0.0010019387709386388


In [9]:
# CUPED-adjusted outcome: click - theta * pre_click.
# theta is read from the df['theta'] column (stored when the pre_click
# coefficient was computed) rather than from the global `theta`, which other
# cells may rebind to a coefficient for a different covariate. This keeps the
# adjustment correct regardless of cell execution order.
df['click_cuped'] = df['click'] - df['theta'] * df['pre_click']
df.head(100)

Unnamed: 0,user,click,like,age,gender,friend_cnt,sns_like_cnt,sns_comment_cnt,pre_click,treat,theta,click_cuped
0,1,1,0,34,2,230,209,320,1,0,0.619002,0.380998
1,4,0,0,26,2,168,46,113,0,0,0.619002,0.000000
2,5,0,0,24,2,105,7,30,0,1,0.619002,0.000000
3,6,0,0,39,1,167,23,35,0,0,0.619002,0.000000
4,7,0,0,39,2,443,24,155,0,1,0.619002,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
95,137,0,0,27,2,485,61,53,0,1,0.619002,0.000000
96,138,0,0,34,2,387,1764,1983,0,1,0.619002,0.000000
97,139,0,0,30,2,555,30,318,0,0,0.619002,0.000000
98,140,0,0,28,2,202,105,328,0,0,0.619002,0.000000


#### 1.2 T-Test Without CUPED

In [12]:
# Raw click outcome split by the `treat` flag
# (presumably 0 = control, 1 = treatment -- confirm against the data source).
d_0 = df.loc[df['treat'] == 0, 'click']
d_1 = df.loc[df['treat'] == 1, 'click']
# Point estimate: mean(treat == 1) - mean(treat == 0).
diff = np.mean(d_1) - np.mean(d_0)
print(diff)

-0.003601024491467819


In [13]:
# Two-sided, unequal-variance t-test on the raw metric (treat == 1 vs treat == 0).
cm = sms.CompareMeans(
    sms.DescrStatsW(d_1),
    sms.DescrStatsW(d_0),
)
# Standard error of the difference in means, using separate group variances.
se = cm.std_meandiff_separatevar
ttest = cm.ttest_ind(usevar='unequal', alternative='two-sided')
print(se, ttest)

0.004052064006037295 (-0.8886889462018719, 0.3741970443208038, 7984.625655071232)


#### 1.3 T-Test With CUPED

In [11]:
# CUPED-adjusted outcome split by the `treat` flag.
d_0_cuped = df.loc[df['treat'] == 0, 'click_cuped']
d_1_cuped = df.loc[df['treat'] == 1, 'click_cuped']
# Point estimate on the adjusted metric: mean(treat == 1) - mean(treat == 0).
diff = np.mean(d_1_cuped) - np.mean(d_0_cuped)
print(diff)

-0.003997530453238325


In [13]:
# Same two-sided, unequal-variance t-test, now on the CUPED-adjusted metric.
cm = sms.CompareMeans(
    sms.DescrStatsW(d_1_cuped),
    sms.DescrStatsW(d_0_cuped),
)
# Standard error of the difference in means, using separate group variances.
se = cm.std_meandiff_separatevar
ttest = cm.ttest_ind(usevar='unequal', alternative='two-sided')
print(se, ttest)

0.0025601980289050017 (-1.5614145500096597, 0.1184656562164157, 7973.716532346689)


### **2. Regression**

In [16]:
# Baseline: regress click on the treatment indicator alone.
mod = smf.ols('click ~ treat', data=df)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  click   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.7893
Date:                Thu, 16 Feb 2023   Prob (F-statistic):              0.374
Time:                        20:05:01   Log-Likelihood:                 2312.1
No. Observations:                7999   AIC:                            -4620.
Df Residuals:                    7997   BIC:                            -4606.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0358      0.003     12.526      0.0

#### 2.1 Regression + pre_click (control variable)

In [17]:
# Add the pre-experiment covariate pre_click as a control variable.
mod = smf.ols('click ~ treat + pre_click', data=df)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  click   R-squared:                       0.601
Model:                            OLS   Adj. R-squared:                  0.601
Method:                 Least Squares   F-statistic:                     6029.
Date:                Thu, 16 Feb 2023   Prob (F-statistic):               0.00
Time:                        20:06:30   Log-Likelihood:                 5989.1
No. Observations:                7999   AIC:                        -1.197e+04
Df Residuals:                    7996   BIC:                        -1.195e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0023      0.002      1.230      0.2

#### 2.2 Regression + gender (control variable)

In [18]:
# Control for gender instead of pre_click.
# NOTE(review): gender appears as the values 1/2 in the preview and enters the
# formula as a plain numeric term -- confirm whether C(gender) is intended.
mod = smf.ols('click ~ treat + gender', data=df)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  click   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     12.22
Date:                Thu, 16 Feb 2023   Prob (F-statistic):           5.01e-06
Time:                        20:07:41   Log-Likelihood:                 2323.9
No. Observations:                7999   AIC:                            -4642.
Df Residuals:                    7996   BIC:                            -4621.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0667      0.007      9.580      0.0

#### 2.3 Regression + gender + pre_click (control variables)

In [19]:
# Control for both gender and the pre-experiment covariate pre_click.
mod = smf.ols('click ~ treat + gender + pre_click', data=df)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  click   R-squared:                       0.602
Model:                            OLS   Adj. R-squared:                  0.602
Method:                 Least Squares   F-statistic:                     4032.
Date:                Thu, 16 Feb 2023   Prob (F-statistic):               0.00
Time:                        20:09:31   Log-Likelihood:                 5996.7
No. Observations:                7999   AIC:                        -1.199e+04
Df Residuals:                    7995   BIC:                        -1.196e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0180      0.004      4.063      0.0