# Chapter 12: Mediation and Instrumental Variables

## Mediation

In [1]:
#Common packages
import pandas as pd
from statsmodels.formula.api import ols
import numpy as np

In [2]:
#Loading the data from the chapter on moderation
hist_data_df = pd.read_csv('chap11-historical_data.csv')

In [3]:
#Regressions used in the text
ols("duration~play_area", data=hist_data_df).fit().summary()

0,1,2,3
Dep. Variable:,duration,R-squared:,0.21
Model:,OLS,Adj. R-squared:,0.21
Method:,Least Squares,F-statistic:,166000.0
Date:,"Wed, 02 Jun 2021",Prob (F-statistic):,0.0
Time:,10:13:25,Log-Likelihood:,-2417800.0
No. Observations:,623610,AIC:,4836000.0
Df Residuals:,623608,BIC:,4836000.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,23.8039,0.018,1287.327,0.000,23.768,23.840
play_area,12.5570,0.031,407.397,0.000,12.497,12.617

0,1,2,3
Omnibus:,65895.576,Durbin-Watson:,1.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,95301.029
Skew:,0.817,Prob(JB):,0.0
Kurtosis:,3.999,Cond. No.,2.42


In [4]:
ols("grocery_purchases~play_area", data=hist_data_df).fit().summary()

0,1,2,3
Dep. Variable:,grocery_purchases,R-squared:,0.164
Model:,OLS,Adj. R-squared:,0.164
Method:,Least Squares,F-statistic:,122100.0
Date:,"Wed, 02 Jun 2021",Prob (F-statistic):,0.0
Time:,10:13:30,Log-Likelihood:,-3004900.0
No. Observations:,623610,AIC:,6010000.0
Df Residuals:,623608,BIC:,6010000.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,49.1421,0.047,1036.494,0.000,49.049,49.235
play_area,27.6200,0.079,349.485,0.000,27.465,27.775

0,1,2,3
Omnibus:,133724.275,Durbin-Watson:,1.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,331024.147
Skew:,1.187,Prob(JB):,0.0
Kurtosis:,5.666,Cond. No.,2.42


In [5]:
ols("grocery_purchases~duration", data=hist_data_df).fit().summary()

0,1,2,3
Dep. Variable:,grocery_purchases,R-squared:,0.772
Model:,OLS,Adj. R-squared:,0.772
Method:,Least Squares,F-statistic:,2113000.0
Date:,"Wed, 02 Jun 2021",Prob (F-statistic):,0.0
Time:,10:13:34,Log-Likelihood:,-2599500.0
No. Observations:,623610,AIC:,5199000.0
Df Residuals:,623608,BIC:,5199000.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2.9357,0.047,-62.420,0.000,-3.028,-2.844
duration,2.1897,0.002,1453.749,0.000,2.187,2.193

0,1,2,3
Omnibus:,49180.757,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,297013.648
Skew:,0.031,Prob(JB):,0.0
Kurtosis:,6.38,Cond. No.,74.2


In [6]:
ols("grocery_purchases~duration+play_area", data=hist_data_df).fit().summary()

0,1,2,3
Dep. Variable:,grocery_purchases,R-squared:,0.772
Model:,OLS,Adj. R-squared:,0.772
Method:,Least Squares,F-statistic:,1057000.0
Date:,"Wed, 02 Jun 2021",Prob (F-statistic):,0.0
Time:,10:13:38,Log-Likelihood:,-2599500.0
No. Observations:,623610,AIC:,5199000.0
Df Residuals:,623607,BIC:,5199000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2.9177,0.047,-61.647,0.000,-3.010,-2.825
duration,2.1870,0.002,1290.410,0.000,2.184,2.190
play_area,0.1575,0.046,3.393,0.001,0.066,0.248

0,1,2,3
Omnibus:,49182.534,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,297057.749
Skew:,0.031,Prob(JB):,0.0
Kurtosis:,6.381,Cond. No.,78.1


In [7]:
def percentage_mediated_fun(dat_df):
    total_effect = ols("grocery_purchases~play_area", data=dat_df).fit(disp=0).params['play_area']
    coeff_med1 = ols("duration~play_area", data=dat_df).fit(disp=0).params['play_area']
    coeff_med2 = ols("grocery_purchases~duration", data=dat_df).fit(disp=0).params['duration']
    mediated_effect = coeff_med1 * coeff_med2
    percentage_mediated = mediated_effect / total_effect
    return percentage_mediated 
percentage_mediated_fun(hist_data_df)

0.9954969198988962

In [8]:
def boot_CI_fun(dat_df, metric_fun, B = 100):
  #Setting sample size
  N = len(dat_df)
  conf_level = 0.9
  coeffs = []
  
  for i in range(B):
      sim_data_df = dat_df.sample(n=N, replace = True)
      coeff = metric_fun(sim_data_df)
      coeffs.append(coeff)
  
  coeffs.sort()
  start_idx = round(B * (1 - conf_level) / 2)
  end_idx = - round(B * (1 - conf_level) / 2)
  
  confint = [coeffs[start_idx], coeffs[end_idx]]  
  
  return confint
boot_CI_fun(hist_data_df, percentage_mediated_fun)

[0.9940682183687283, 0.9973655927218781]

## Instrumental Variables

### Data

In [9]:
#Loading the experimental data from the chapter 9
exp_data_df = pd.read_csv('chap10-experimental_data.csv')

#Reformat group variable to binary in experimental data
exp_data_df.group = np.where(exp_data_df.group == 'treat', 1, 0)

### Packages

In [10]:
#Common packages
import pandas as pd
from statsmodels.formula.api import ols
import numpy as np

from linearmodels.iv import IV2SLS

### Understanding and Applying IVs

In [11]:
#First stage regression, coeff = 0.5
ols("call_CSAT~group+age+reason", data=exp_data_df).fit(disp=0).summary()

0,1,2,3
Dep. Variable:,call_CSAT,R-squared:,0.061
Model:,OLS,Adj. R-squared:,0.061
Method:,Least Squares,F-statistic:,5001.0
Date:,"Fri, 14 May 2021",Prob (F-statistic):,0.0
Time:,08:42:28,Log-Likelihood:,-424640.0
No. Observations:,231659,AIC:,849300.0
Df Residuals:,231655,BIC:,849300.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.1038,0.012,348.066,0.000,4.081,4.127
reason[T.property],0.2006,0.007,30.394,0.000,0.188,0.214
group,0.5406,0.006,85.935,0.000,0.528,0.553
age,0.0202,0.000,72.142,0.000,0.020,0.021

0,1,2,3
Omnibus:,4301.339,Durbin-Watson:,0.34
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4540.773
Skew:,0.342,Prob(JB):,0.0
Kurtosis:,2.945,Cond. No.,159.0


In [12]:
#Reduced regression, coeff = 1.6
ols("M6Spend~group+age+reason", data=exp_data_df).fit(disp=0).summary()

0,1,2,3
Dep. Variable:,M6Spend,R-squared:,0.082
Model:,OLS,Adj. R-squared:,0.082
Method:,Least Squares,F-statistic:,6942.0
Date:,"Fri, 14 May 2021",Prob (F-statistic):,0.0
Time:,08:42:29,Log-Likelihood:,-1263000.0
No. Observations:,231659,AIC:,2526000.0
Df Residuals:,231655,BIC:,2526000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,99.9319,0.440,227.242,0.000,99.070,100.794
reason[T.property],0.4446,0.246,1.806,0.071,-0.038,0.927
group,1.6169,0.235,6.891,0.000,1.157,2.077
age,-1.4678,0.010,-140.536,0.000,-1.488,-1.447

0,1,2,3
Omnibus:,108112.619,Durbin-Watson:,1.986
Prob(Omnibus):,0.0,Jarque-Bera (JB):,722364.562
Skew:,2.159,Prob(JB):,0.0
Kurtosis:,10.496,Cond. No.,159.0


In [13]:
#Baseline (biased) regression, coeff = 4.00
ols("M6Spend~call_CSAT+age+reason", data=exp_data_df).fit(disp=0).summary()

0,1,2,3
Dep. Variable:,M6Spend,R-squared:,0.093
Model:,OLS,Adj. R-squared:,0.093
Method:,Least Squares,F-statistic:,7936.0
Date:,"Fri, 14 May 2021",Prob (F-statistic):,0.0
Time:,08:42:30,Log-Likelihood:,-1261600.0
No. Observations:,231659,AIC:,2523000.0
Df Residuals:,231655,BIC:,2523000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,83.2283,0.536,155.302,0.000,82.178,84.279
reason[T.property],-0.3582,0.245,-1.461,0.144,-0.839,0.122
call_CSAT,4.0019,0.076,52.767,0.000,3.853,4.151
age,-1.5488,0.010,-147.549,0.000,-1.569,-1.528

0,1,2,3
Omnibus:,108721.44,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,740331.063
Skew:,2.166,Prob(JB):,0.0
Kurtosis:,10.611,Cond. No.,194.0


In [14]:
#IV regression, coeff = 2.99
iv_mod = IV2SLS.from_formula('M6Spend ~ 1 + age + reason + [call_CSAT ~ group]', 
                             exp_data_df).fit()
print(iv_mod.params)

  if is_categorical(s):


Intercept             87.658610
reason[T.property]    -0.155326
age                   -1.528264
call_CSAT              2.990706
Name: parameter, dtype: float64
