## Regression with Clustered Standard Errors

In [16]:
import numpy as np, statsmodels.stats.api as sms
import itertools
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker
import random
from sklearn import preprocessing

In [17]:
# File name of the experiment export; with no directory component it is
# resolved against the kernel's current working directory.
infile = "exp_data_cluster.csv"
# Parse the CSV with pandas' default options.
data = pd.read_csv(filepath_or_buffer=infile)

In [18]:
# Sanity check: (row count, column count) of the frame that was just loaded.
data.shape

(16000, 4)

In [19]:
# Preview the first five rows (the default row count, spelled out here).
data.head(n=5)

Unnamed: 0,user,adid,expid,if_click
0,92092,1e4fb0d22340fdf9ee02b6ae4a7d9a83,1,0
1,878501,97fc187415fe1785d5ee02bfd348968e,0,0
2,266367,0b0fa14b56d3741178196daaa92e6a1e,1,0
3,329319,1f1fe825014d9e9a0881233d9950bd43,1,0
4,64672,5b2ae85128137e449eb015f6de78add5,0,0


In [20]:
import statsmodels.formula.api as smf

#### OLS with Clustered SE

In [24]:
# Regress the click indicator on the experiment id by OLS and report
# cluster-robust standard errors, with clusters defined by the `user` column.
# NOTE(review): the previous comment here said behaviour on the same ads is
# correlated, which would point to clustering on `adid` instead -- confirm
# which grouping variable is intended.
model = smf.ols('if_click ~ expid', data=data)
result = model.fit(cov_kwds={'groups': data['user']}, cov_type='cluster')
print('Result with cluster', result.summary2(), sep='\n')

Result with cluster
                 Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.000    
Dependent Variable: if_click         AIC:                1467.5819
Date:               2023-02-06 20:39 BIC:                1482.9425
No. Observations:   16000            Log-Likelihood:     -731.79  
Df Model:           1                F-statistic:        6.710    
Df Residuals:       15998            Prob (F-statistic): 0.00960  
R-squared:          0.000            Scale:              0.064166 
--------------------------------------------------------------------
              Coef.    Std.Err.      z      P>|z|    [0.025   0.975]
--------------------------------------------------------------------
Intercept     0.0638     0.0027   23.3379   0.0000   0.0584   0.0691
expid         0.0104     0.0040    2.5904   0.0096   0.0025   0.0182
------------------------------------------------------------------
Omnibus:             10828.253     Durbin-Watson: 

#### OLS without Clustered SE

In [23]:
# Refit the specification held in `model` (built in the clustered-SE cell)
# without any covariance options, for comparison against the clustered fit.
# This rebinds `result`, so the earlier fit is no longer reachable by that name.
result = model.fit()
print('Result without cluster', result.summary2(), sep='\n')

Result without cluster
                 Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.000    
Dependent Variable: if_click         AIC:                1467.5819
Date:               2023-02-06 20:39 BIC:                1482.9425
No. Observations:   16000            Log-Likelihood:     -731.79  
Df Model:           1                F-statistic:        6.710    
Df Residuals:       15998            Prob (F-statistic): 0.00960  
R-squared:          0.000            Scale:              0.064166 
--------------------------------------------------------------------
              Coef.    Std.Err.      t      P>|t|    [0.025   0.975]
--------------------------------------------------------------------
Intercept     0.0638     0.0028   22.5098   0.0000   0.0582   0.0693
expid         0.0104     0.0040    2.5904   0.0096   0.0025   0.0182
------------------------------------------------------------------
Omnibus:             10828.253     Durbin-Watso

#### Compare Means with t tests

In [7]:
# Click outcomes for the rows whose experiment id is 0.
y0 = data.loc[data['expid'] == 0, 'if_click']

In [8]:
# Click outcomes for the rows whose experiment id is 1.
y1 = data.loc[data['expid'] == 1, 'if_click']
# Difference in mean click rate, taken as (expid == 1) minus (expid == 0);
# `y0` comes from the preceding cell.
mean_d = np.mean(y1) - np.mean(y0)
print(mean_d)

0.010374999999999995


In [9]:
# Two-sample comparison of the expid == 0 and expid == 1 click outcomes.
# `y0` is passed as the first sample, so the statistic is for mean(y0) - mean(y1),
# i.e. the opposite sign of `mean_d` computed earlier.
cm = sms.CompareMeans(d1=sms.DescrStatsW(y0), d2=sms.DescrStatsW(y1))
# usevar='unequal' does not pool the two variances; the printed tuple is
# (t statistic, p-value, degrees of freedom).
print(cm.ttest_ind(usevar='unequal', alternative='two-sided'))

(-2.590388082735368, 0.009595515482880781, 15920.640661495236)
