# Packages

In [16]:
import numpy as np
import pandas as pd
from doubleml.datasets import fetch_bonus
from doubleml import DoubleMLData 
import statsmodels.api as sm
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.linear_model import LassoCV
from doubleml import DoubleMLPLIV
from doubleml import DoubleMLPLR
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV, LogisticRegressionCV
from sklearn.neural_network import MLPRegressor, MLPClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')
np.random.seed(3293423)

# Load the Data

In [18]:
df = pd.read_csv('/Users/pranjal/Desktop/Causal-Inference/data/wage2.csv')
cat = df.select_dtypes('object').columns
df = pd.get_dummies(df, columns = cat, drop_first = True)
#df = df[df.wage>0]
print(df.shape)
df.head()

(935, 17)


Unnamed: 0,wage,hours,IQ,KWW,educ,exper,tenure,age,married,black,south,urban,sibs,brthord,meduc,feduc,lwage
0,769,40,93,35,12,11,2,31,1,0,0,1,1,2.0,8.0,8.0,6.645091
1,808,50,119,41,18,11,16,37,1,0,0,1,1,,14.0,14.0,6.694562
2,825,40,108,46,14,11,9,33,1,0,0,1,1,2.0,14.0,14.0,6.715383
3,650,40,96,32,12,13,7,32,1,0,0,1,4,3.0,12.0,12.0,6.476973
4,562,40,74,27,11,14,5,34,1,0,0,1,10,6.0,6.0,11.0,6.331502


In [19]:
print(df.isnull().sum())

wage         0
hours        0
IQ           0
KWW          0
educ         0
exper        0
tenure       0
age          0
married      0
black        0
south        0
urban        0
sibs         0
brthord     83
meduc       78
feduc      194
lwage        0
dtype: int64


In [20]:
outcome = 'lwage'
treatment = 'educ'
rest = ['IQ', 'KWW', 'exper', 'tenure', 'age','married', 'black', 'south', 'urban']
instruments = ['sibs', 'brthord', 'meduc','feduc']
df = df.dropna()
df = df[[outcome] + [treatment] + rest + instruments]
y = df[outcome]
d = df[treatment]
x = df[rest].astype('float')
z = df[instruments]
print(y.shape, d.shape, x.shape)

(663,) (663,) (663, 9)


# First Stage

In [70]:
np.random.seed(42)
model1=LinearRegression()
model2=LassoCV()
model3=RandomForestRegressor(max_features='sqrt', max_depth=6, n_estimators=100)
model4=GradientBoostingRegressor(max_features='sqrt',n_estimators=100)
model5=MLPRegressor((5,3),max_iter=5000,activation='tanh')

table = PrettyTable()
table.field_names = ['Estimator', 'Test R2: E[Y|X]=l(X)', 'Test R2: E[D|X]=r(X)']
a = ['Linear/Logistic',np.mean(cross_val_score(model1, x, y, cv=5)), 
    np.mean(cross_val_score(model1, x, d, cv=5))]
table.add_row(a)
a = ['Linear/Logistic (Reg)',np.mean(cross_val_score(model2, x, y, cv=5)), 
     np.mean(cross_val_score(model2, x, d, cv=5))]
table.add_row(a)
a = ['Random Forest',np.mean(cross_val_score(model3, x, y, cv=5)),
     np.mean(cross_val_score(model3, x, d, cv=5))]
table.add_row(a)
a = ['Boosting',np.mean(cross_val_score(model4, x, y, cv=5)),
     np.mean(cross_val_score(model4, x, d, cv=5))]
table.add_row(a)
a = ['Neural Network',np.mean(cross_val_score(model5, x, y, cv=5)),
     np.mean(cross_val_score(model5, x, d, cv=5))]
table.add_row(a)
table.float_format = '0.3'
print(table)

+-----------------------+----------------------+----------------------+
|       Estimator       | Test R2: E[Y|X]=l(X) | Test R2: E[D|X]=r(X) |
+-----------------------+----------------------+----------------------+
|    Linear/Logistic    |        0.167         |        0.469         |
| Linear/Logistic (Reg) |        0.169         |        0.469         |
|     Random Forest     |        0.151         |        0.486         |
|        Boosting       |        0.100         |        0.494         |
|     Neural Network    |        -0.003        |        0.313         |
+-----------------------+----------------------+----------------------+


In [71]:
np.random.seed(42)
model1=LinearRegression()
model2=LassoCV()
model3=RandomForestRegressor(max_features='sqrt', max_depth=6, n_estimators=500)
model4=GradientBoostingRegressor(max_features='sqrt',max_depth=6,n_estimators=500)
model5=MLPRegressor((20,10,), max_iter=1000,)

table = PrettyTable()
table.field_names = ['Estimator', 'Test R2: E[D|X,Z]']
a = ['Linear/Logistic',np.mean(cross_val_score(model1, np.c_[x,z], d, cv=5))]
table.add_row(a)
a = ['Linear/Logistic (Reg)',np.mean(cross_val_score(model2, np.c_[x,z], d, cv=5))]
table.add_row(a)
a = ['Random Forest',np.mean(cross_val_score(model3, np.c_[x,z], d, cv=5))]
table.add_row(a)
a = ['Boosting',np.mean(cross_val_score(model4, np.c_[x,z], d, cv=5))]
table.add_row(a)
a = ['Neural Network',np.mean(cross_val_score(model5, np.c_[x,z], d, cv=5))]
table.add_row(a)
table.float_format = '0.3'
print(table)

+-----------------------+---------------------------+
|       Estimator       | Test Rquared for E[D|X,Z] |
+-----------------------+---------------------------+
|    Linear/Logistic    |           0.501           |
| Linear/Logistic (Reg) |           0.502           |
|     Random Forest     |           0.502           |
|        Boosting       |           0.454           |
|     Neural Network    |           0.400           |
+-----------------------+---------------------------+


In [72]:
z_bar = model1.fit(np.c_[x,z], d).predict(np.c_[x,z])
np.random.seed(42)
model1=LinearRegression()
model2=LassoCV()
model3=RandomForestRegressor(max_features='sqrt', max_depth=6, n_estimators=500)
model4=GradientBoostingRegressor(max_features='sqrt',max_depth=6,n_estimators=500)
model5=MLPRegressor((20,10,), max_iter=1000,)

table = PrettyTable()
table.field_names = ['Estimator', 'Test R2: E[Z_opt|X]=m(X)']
a = ['Linear/Logistic',np.mean(cross_val_score(model1, x, z_bar, cv=5))]
table.add_row(a)
a = ['Linear/Logistic (Reg)',np.mean(cross_val_score(model2, x, z_bar, cv=5))]
table.add_row(a)
a = ['Random Forest',np.mean(cross_val_score(model3, x, z_bar, cv=5))]
table.add_row(a)
a = ['Boosting',np.mean(cross_val_score(model4, x, z_bar, cv=5))]
table.add_row(a)
a = ['Neural Network',np.mean(cross_val_score(model5, x, z_bar, cv=5))]
table.add_row(a)
table.float_format = '0.3'
print(table)

+-----------------------+--------------------------+
|       Estimator       | Test R2: E[Z_opt|X]=m(X) |
+-----------------------+--------------------------+
|    Linear/Logistic    |          0.932           |
| Linear/Logistic (Reg) |          0.928           |
|     Random Forest     |          0.834           |
|        Boosting       |          0.889           |
|     Neural Network    |          0.905           |
+-----------------------+--------------------------+


# OLS

In [53]:
OLS = sm.OLS(endog=y, exog=sm.add_constant(np.c_[d,x]), prepend=True).fit()
print(OLS.summary(xname=['constant', treatment]+rest))

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.270
Model:                            OLS   Adj. R-squared:                  0.258
Method:                 Least Squares   F-statistic:                     24.08
Date:                Thu, 15 Dec 2022   Prob (F-statistic):           8.51e-39
Time:                        23:10:39   Log-Likelihood:                -248.51
No. Observations:                 663   AIC:                             519.0
Df Residuals:                     652   BIC:                             568.5
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
constant       4.9392      0.202     24.398      0.0

In [54]:
from statsmodels.sandbox.regression.gmm import IV2SLS
endog = df['lwage']
exog =  df[[treatment] + rest]
instr = df[rest + instruments]
exog_constant = sm.add_constant(exog)
instr_constant = sm.add_constant(instr)
IV2SLS = IV2SLS(endog, exog_constant, instrument = instr_constant).fit()
print(IV2SLS.summary())

                          IV2SLS Regression Results                           
Dep. Variable:                  lwage   R-squared:                       0.181
Model:                         IV2SLS   Adj. R-squared:                  0.169
Method:                     Two Stage   F-statistic:                     20.15
                        Least Squares   Prob (F-statistic):           1.19e-32
Date:                Thu, 15 Dec 2022                                         
Time:                        23:10:39                                         
No. Observations:                 663                                         
Df Residuals:                     652                                         
Df Model:                          10                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.6647      0.246     18.939      0.0

# ML Estimation

In [30]:
l = LassoCV() # Model for E[Y|X] = E[θD+g(X)]
g = LassoCV() # Model for E[Y - θD|X]=g(X)
r = 
m = LassoCV() # Model for E[D|X]

def score(y, d, l_hat, m_hat, g_hat, smpls):
    "Score function for Single ML"
    u_hat = y - g_hat
    psi_a = -np.multiply(d, d)
    psi_b = np.multiply(d, u_hat)
    return psi_a, psi_b

# Orthogonal + Crossfitting (DML)

In [31]:
data = DoubleMLData(df, y_col=outcome,d_cols=treatment,x_cols=rest,z_cols=instruments)
DML = DoubleMLPLIV(data, l,m,g, n_folds=5,apply_cross_fitting=True)
DML.fit();
print(DML.summary)

          coef   std err         t     P>|t|     2.5 %    97.5 %
educ  0.131423  0.036221  3.628383  0.000285  0.060431  0.202414


# Summary

In [32]:
table = PrettyTable()
table.field_names = ['Estimator', 'θ_hat', 'Std Error','t','p','2.5%','97.25%']
idx = 1
a = ['OLS']+ np.c_[OLS.params[idx], OLS.bse[idx], OLS.tvalues[idx], OLS.pvalues[idx], np.nan, np.nan].reshape(-1).tolist()
table.add_row(a)
a = ['IV-2SLS']+ np.c_[IV2SLS.params[1], IV2SLS.bse[1], IV2SLS.tvalues[1], IV2SLS.pvalues[1], np.nan, np.nan].reshape(-1).tolist()
table.add_row(a)
a = ['Double ML (DML)']+ np.array(DML.summary).reshape(-1).tolist()
table.add_row(a)
table.float_format = '0.3'
print(table)

+-----------------+-------+-----------+-------+-------+-------+--------+
|    Estimator    | θ_hat | Std Error |   t   |   p   |  2.5% | 97.25% |
+-----------------+-------+-----------+-------+-------+-------+--------+
|       OLS       | 0.047 |   0.009   | 5.348 | 0.000 |  nan  |  nan   |
|     IV-2SLS     | 0.124 |   0.036   | 3.500 | 0.000 |  nan  |  nan   |
| Double ML (DML) | 0.131 |   0.036   | 3.628 | 0.000 | 0.060 | 0.202  |
+-----------------+-------+-----------+-------+-------+-------+--------+
