# Packages

In [33]:
import numpy as np
import pandas as pd
from doubleml.datasets import fetch_bonus
from doubleml import DoubleMLData
import statsmodels.api as sm
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LassoCV
from doubleml import DoubleMLPLR
from doubleml import DoubleMLPLR
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV, LogisticRegressionCV
from sklearn.neural_network import MLPRegressor, MLPClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score
import warnings
# Suppress all warnings for the rest of the session. This keeps the cell output
# short, but it also hides any warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Seed NumPy's global random number generator.
np.random.seed(3293423)

# Load the Data

In [34]:
# Re-seed NumPy's global generator, then fetch the bonus data as a pandas
# DataFrame and preview its first five rows.
np.random.seed(seed=1)
df = fetch_bonus(return_type='DataFrame')
df.head(n=5)

Unnamed: 0,index,abdt,tg,inuidur1,inuidur2,female,black,hispanic,othrace,dep,...,recall,agelt35,agegt54,durable,nondurable,lusd,husd,muld,dep1,dep2
0,0,10824,0,2.890372,18,0,0,0,0,2,...,0,0,0,0,0,0,1,0,0.0,1.0
1,3,10824,0,0.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0.0,0.0
2,4,10747,0,3.295837,27,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0.0,0.0
3,11,10607,1,2.197225,9,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0.0,0.0
4,12,10831,0,3.295837,27,0,0,0,0,1,...,0,0,1,1,0,1,0,0,1.0,0.0


In [35]:
# Number of missing values in each column.
print(df.isna().sum())

index         0
abdt          0
tg            0
inuidur1      0
inuidur2      0
female        0
black         0
hispanic      0
othrace       0
dep           0
q1            0
q2            0
q3            0
q4            0
q5            0
q6            0
recall        0
agelt35       0
agegt54       0
durable       0
nondurable    0
lusd          0
husd          0
muld          0
dep1          0
dep2          0
dtype: int64


In [36]:
# Column roles: outcome Y, treatment D and the control covariates X.
outcome = 'inuidur1'
treatment = 'tg'
rest = [
    'female', 'black', 'othrace', 'dep1', 'dep2',
    'q2', 'q3', 'q4', 'q5', 'q6',
    'agelt35', 'agegt54', 'durable', 'lusd', 'husd',
]

# Keep only the columns that are used in the analysis.
df = df[[outcome, treatment] + rest]

# Select by the variables defined above (not by hard-coded attribute names) so
# the arrays stay in sync with `outcome` / `treatment` if those are edited.
y = np.array(df[outcome]).reshape(-1, 1)                 # (n, 1) outcome
d = np.array(df[treatment]).astype(int).reshape(-1, 1)   # (n, 1) integer treatment
x = np.array(df[rest])                                   # (n, len(rest)) controls
print(y.shape, d.shape, x.shape)

(5099, 1) (5099, 1) (5099, 15)


# First Stage

In [None]:
np.random.seed(42)

# cross_val_score expects 1-D targets. Passing the (n, 1) column vectors only
# ran quietly because warnings are suppressed globally, so flatten explicitly.
y_flat = y.ravel()
d_flat = d.ravel()

# Features for the neural networks, computed once and reused for both models.
# NOTE(review): sklearn's `normalize` rescales each row (sample) to unit norm;
# confirm this, rather than per-feature standardisation, is what is intended.
x_unit = normalize(x)

# One entry per table row: (label, outcome regressor, treatment classifier, features).
# NOTE(review): the forest classifier here keeps the default n_estimators while
# the regressor uses 500 — confirm the asymmetry is intended.
first_stage = [
    ('Linear/Logistic', LinearRegression(), LogisticRegression(), x),
    ('Linear/Logistic (Reg)', LassoCV(), LogisticRegressionCV(), x),
    ('Random Forests',
     RandomForestRegressor(max_depth=5, n_estimators=500, verbose=0),
     RandomForestClassifier(max_depth=5),
     x),
    ('Boosting',
     XGBRegressor(max_depth=3, verbosity=0),
     XGBClassifier(verbosity=0, max_depth=5),
     x),
    ('Neural Networks',
     MLPRegressor((5, 2,), activation='tanh', max_iter=500, learning_rate_init=0.01),
     MLPClassifier((5, 2,), activation='tanh', max_iter=500, learning_rate_init=0.01),
     x_unit),
]

table = PrettyTable()
table.field_names = ['Estimator', 'g(X):Rsquared', 'm(X):Accuracy']
for label, regressor, classifier, features in first_stage:
    # Regressor first, then classifier, so the shared global NumPy random
    # stream is consumed in the same order as before.
    # With no `scoring` argument each estimator's own `score` is used
    # (R^2 for regressors, accuracy for classifiers), averaged over 5 folds.
    r_squared = np.mean(cross_val_score(regressor, features, y_flat, cv=5))
    accuracy = np.mean(cross_val_score(classifier, features, d_flat, cv=5))
    table.add_row([label, r_squared, accuracy])
table.float_format = '0.3'
print(table)

# OLS

In [None]:
# Regress y on [constant, d, x]. `add_constant` prepends the intercept column,
# so the treatment coefficient sits at position 1 of the fitted parameters.
OLS = sm.OLS(
    endog=y,
    exog=sm.add_constant(np.column_stack((d, x))),
).fit()
OLS.summary()

# ML Estimation

In [None]:
# Nuisance learners, all random forests with identical settings.
# l: model for E[Y | X] = E[θD + g(X) | X]
l = RandomForestRegressor(
    n_estimators=500,
    max_depth=5,
    verbose=0,
)
# g: model for E[Y - θD | X] = g(X)
g = RandomForestRegressor(
    n_estimators=500,
    max_depth=5,
    verbose=0,
)
# m: model for E[D | X]
m = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    verbose=0,
)

def score(y, d, l_hat, m_hat, g_hat, smpls):
    """Score function for Single ML (no orthogonalisation of the treatment).

    Parameters
    ----------
    y, d : array-like
        Outcome and treatment values.
    g_hat : array-like
        Predictions of g(X).
    l_hat, m_hat, smpls
        Part of the signature but not used by this score.

    Returns
    -------
    tuple
        ``(psi_a, psi_b)`` with ``psi_a = -d * d`` and
        ``psi_b = d * (y - g_hat)``, both computed element-wise.
    """
    residual = y - g_hat
    neg_d_squared = np.negative(np.multiply(d, d))
    d_times_residual = np.multiply(d, residual)
    return neg_d_squared, d_times_residual

# Single-ML

In [None]:
# Single ML: one fold, cross-fitting disabled, custom `score` callable.
data = DoubleMLData(df, y_col=outcome, d_cols=treatment, x_cols=rest)
SML = DoubleMLPLR(
    data,
    ml_l=l,
    ml_m=m,
    ml_g=g,
    score=score,
    n_folds=1,
    apply_cross_fitting=False,
)
SML.fit()
print(SML.summary)

# Orthogonal-ML

In [None]:
# Orthogonal ML: built-in 'IV-type' score, one fold, cross-fitting disabled.
data = DoubleMLData(df, y_col=outcome, d_cols=treatment, x_cols=rest)
OML = DoubleMLPLR(
    data,
    ml_l=l,
    ml_m=m,
    ml_g=g,
    score='IV-type',
    n_folds=1,
    apply_cross_fitting=False,
)
OML.fit()
print(OML.summary)

# Orthogonal + Crossfitting (DML)

In [None]:
# Double ML: built-in 'IV-type' score with 5-fold cross-fitting.
data = DoubleMLData(df, y_col=outcome, d_cols=treatment, x_cols=rest)
DML = DoubleMLPLR(
    data,
    ml_l=l,
    ml_m=m,
    ml_g=g,
    score='IV-type',
    n_folds=5,
    apply_cross_fitting=True,
)
DML.fit()
print(DML.summary)

# Summary

In [None]:
# Collect the treatment-effect estimates from all four approaches in one table.
table = PrettyTable()
# The last two columns are the bounds of a 95% interval: 2.5% and 97.5%
# (the header previously read '97.25%').
table.field_names = ['Estimator', 'θ_hat', 'Std Error', 't', 'p', '2.5%', '97.5%']

# Position of the treatment coefficient in the OLS fit (position 0 is the constant).
idx = 1
# Use the interval statsmodels already provides instead of leaving the bounds
# as NaN; the default alpha=0.05 matches the 2.5% / 97.5% columns.
ols_lower, ols_upper = np.asarray(OLS.conf_int())[idx]
ols_stats = [OLS.params[idx], OLS.bse[idx], OLS.tvalues[idx], OLS.pvalues[idx],
             ols_lower, ols_upper]
a = ['OLS'] + [float(value) for value in ols_stats]
table.add_row(a)

# Each fitted model's `summary` is flattened into one table row.
for label, fitted in (
    ('Single ML (SML)', SML),
    ('Orthogonal ML (OML)', OML),
    ('Double ML (DML)', DML),
):
    a = [label] + np.array(fitted.summary).reshape(-1).tolist()
    table.add_row(a)

table.float_format = '0.3'
print(table)