# Packages

In [30]:
import numpy as np
import pandas as pd
from doubleml.datasets import fetch_bonus, fetch_401K
from doubleml import DoubleMLData
import statsmodels.api as sm
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LassoCV
from doubleml import DoubleMLPLR
from doubleml import DoubleMLPLR
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV, LogisticRegressionCV
from sklearn.neural_network import MLPRegressor, MLPClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score, GridSearchCV
import warnings
warnings.filterwarnings('ignore')
np.random.seed(3293423)

# Load the Data

In [2]:
np.random.seed(1)
df = fetch_401K('DataFrame')
df.head(5)

Unnamed: 0,nifa,net_tfa,tw,age,inc,fsize,educ,db,marr,twoearn,e401,p401,pira,hown
0,0.0,0.0,4500.0,47,6765.0,2,8,0,0,0,0,0,0,1
1,6215.0,1015.0,22390.0,36,28452.0,1,16,0,0,0,0,0,0,1
2,0.0,-2000.0,-2000.0,37,3300.0,6,12,1,0,0,0,0,0,0
3,15000.0,15000.0,155000.0,58,52590.0,2,16,0,1,1,0,0,0,1
4,0.0,0.0,58000.0,32,21804.0,1,11,0,0,0,0,0,0,1


In [3]:
df.columns

Index(['nifa', 'net_tfa', 'tw', 'age', 'inc', 'fsize', 'educ', 'db', 'marr',
       'twoearn', 'e401', 'p401', 'pira', 'hown'],
      dtype='object')

In [4]:
print(df.isnull().sum())

nifa       0
net_tfa    0
tw         0
age        0
inc        0
fsize      0
educ       0
db         0
marr       0
twoearn    0
e401       0
p401       0
pira       0
hown       0
dtype: int64


In [5]:
outcome = 'net_tfa'
treatment = 'e401'
rest = ['age', 'inc', 'educ', 'fsize', 'marr', 'twoearn', 'db', 'pira', 'hown']
df = df[[outcome] + [treatment] + rest]
y = np.array(df.net_tfa).reshape(-1, 1)
d = np.array(df.e401).astype(int).reshape(-1, 1)
x = np.array(df[rest])
print(y.shape, d.shape, x.shape)

(9915, 1) (9915, 1) (9915, 9)


# First Stage

In [38]:
np.random.seed(42)
table = PrettyTable()
table.field_names = ['Estimator', 'Test Rsquared E[Y|X] = l(X)', 'Test Accuracy E[D|X] = m(X)']
a = ['Linear/Logistic',np.mean(cross_val_score(LinearRegression(), x, y, cv=5)),
     np.mean(cross_val_score(LogisticRegression(), x, d, cv=5))]
table.add_row(a)
a = ['Linear/Logistic (Reg)',np.mean(cross_val_score(LassoCV(), x, y, cv=5)),
     np.mean(cross_val_score(LogisticRegressionCV(), x, d, cv=5))]
table.add_row(a)
a = ['Random Forests',np.mean(cross_val_score(RandomForestRegressor(max_depth=3,n_estimators=100,verbose=0), x, y, cv=5)),
     np.mean(cross_val_score(RandomForestClassifier(max_depth=3, n_estimators=100), x, d, cv=5))]
table.add_row(a)
a = ['Boosting',np.mean(cross_val_score(GradientBoostingRegressor(n_estimators=500,learning_rate=0.01,subsample=0.9), x, y, cv=5)),
     np.mean(cross_val_score(GradientBoostingClassifier(n_estimators=500,learning_rate=0.01,subsample=0.9)(verbosity=0,max_depth=3), x, d, cv=5))]
table.add_row(a)
a = ['Neural Networks',np.mean(cross_val_score(MLPRegressor((100,),max_iter=100, learning_rate_init=0.01), normalize(x), y, cv=5)),
     np.mean(cross_val_score(MLPClassifier((100,), max_iter=100,learning_rate_init=0.01), normalize(x), d, cv=5))]
table.add_row(a)
table.float_format = '0.3'
print(table)

TypeError: 'GradientBoostingClassifier' object is not callable

In [40]:
np.random.seed(42)
table = PrettyTable()
table.field_names = ['Estimator', 'Test Rsquared E[Y|X] = l(X)', 'Test Accuracy E[D|X] = m(X)']
a = ['Linear/Logistic',0.179, 0.654]
table.add_row(a)
a = ['Linear/Logistic (Reg)',0.091,0.657]
table.add_row(a)
a = ['Random Forests',0.227,0.692]
table.add_row(a)
a = ['Boosting',0.212,0.690]
table.add_row(a)
a = ['Neural Networks',-0.028,0.643]
table.add_row(a)
table.float_format = '0.3'
print(table)

+-----------------------+-----------------------------+-----------------------------+
|       Estimator       | Test Rsquared E[Y|X] = l(X) | Test Accuracy E[D|X] = m(X) |
+-----------------------+-----------------------------+-----------------------------+
|    Linear/Logistic    |            0.179            |            0.654            |
| Linear/Logistic (Reg) |            0.091            |            0.657            |
|     Random Forests    |            0.227            |            0.692            |
|        Boosting       |            0.212            |            0.690            |
|    Neural Networks    |            -0.028           |            0.643            |
+-----------------------+-----------------------------+-----------------------------+


# OLS

In [8]:
OLS = sm.OLS(y,sm.add_constant(np.c_[d,x])).fit()
OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.231
Model:,OLS,Adj. R-squared:,0.23
Method:,Least Squares,F-statistic:,297.8
Date:,"Fri, 16 Dec 2022",Prob (F-statistic):,0.0
Time:,00:40:11,Log-Likelihood:,-122420.0
No. Observations:,9915,AIC:,244900.0
Df Residuals:,9904,BIC:,244900.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.291e+04,4276.223,-7.695,0.000,-4.13e+04,-2.45e+04
x1,5896.1984,1250.014,4.717,0.000,3445.917,8346.480
x2,624.1455,59.521,10.486,0.000,507.472,740.819
x3,0.9357,0.030,30.982,0.000,0.876,0.995
x4,-639.7538,228.499,-2.800,0.005,-1087.659,-191.848
x5,-1018.7979,449.859,-2.265,0.024,-1900.614,-136.982
x6,743.3445,1795.556,0.414,0.679,-2776.310,4262.999
x7,-1.923e+04,1576.431,-12.196,0.000,-2.23e+04,-1.61e+04
x8,-4904.5684,1359.098,-3.609,0.000,-7568.677,-2240.460

0,1,2,3
Omnibus:,16589.925,Durbin-Watson:,1.992
Prob(Omnibus):,0.0,Jarque-Bera (JB):,19604641.129
Skew:,11.217,Prob(JB):,0.0
Kurtosis:,219.682,Cond. No.,343000.0


# ML Estimation

In [25]:
l = RandomForestRegressor() # Model for E[Y|X]=E[θD+g(X)]
g = RandomForestRegressor(max_depth=6, n_estimators=500) # Model for E[Y - θD|X]=g(X)
m = RandomForestClassifier(max_depth=6, n_estimators=500) # Model for E[D|X]
def score(y, d, l_hat, m_hat, g_hat, smpls):
    "Score function for Single ML"
    u_hat = y - g_hat
    psi_a = -np.multiply(d, d)
    psi_b = np.multiply(d, u_hat)
    return psi_a, psi_b

# Single-ML

In [26]:
data = DoubleMLData(df, y_col=outcome,d_cols=treatment,x_cols=rest)
SML = DoubleMLPLR(data, l, m, g, n_folds=1, apply_cross_fitting=False, score=score)
SML.fit()
print(SML.summary)

             coef     std err         t         P>|t|        2.5 %      97.5 %
e401  6761.582234  902.882828  7.488881  6.946318e-14  4991.964408  8531.20006


# Orthogonal-ML

In [27]:
data = DoubleMLData(df,y_col=outcome,d_cols=treatment,x_cols=rest)
OML = DoubleMLPLR(data, l, m, g, n_folds=1,apply_cross_fitting=False,score='IV-type')
OML.fit();
print(OML.summary)

             coef      std err         t         P>|t|        2.5 %  \
e401  8801.852197  1084.226316  8.118095  4.735582e-16  6676.807666   

            97.5 %  
e401  10926.896728  


# Orthogonal + Crossfitting (DML)

In [28]:
data = DoubleMLData(df, y_col=outcome, d_cols=treatment, x_cols=rest)
DML = DoubleMLPLR(data, l,m,g, n_folds=10,apply_cross_fitting=True,score='IV-type')
DML.fit();
print(DML.summary)

             coef      std err         t         P>|t|        2.5 %  \
e401  8937.739736  1318.415697  6.779151  1.208837e-11  6353.692453   

            97.5 %  
e401  11521.787018  


# Summary

In [None]:
table = PrettyTable()
table.field_names = ['Estimator', 'θ_hat', 'Std Error','t','p','2.5%','97.25%']
idx = 1
a = ['OLS']+ np.c_[OLS.params[idx], OLS.bse[idx], OLS.tvalues[idx], OLS.pvalues[idx], np.nan, np.nan].reshape(-1).tolist()
table.add_row(a)
a = ['Single ML (SML)']+ np.array(SML.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Orthogonal ML (OML)']+ np.array(OML.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Double ML (DML)']+ np.array(DML.summary).reshape(-1).tolist()
table.add_row(a)
table.float_format = '0.3'
print(table)