# Packages

In [46]:
import numpy as np
import pandas as pd
from doubleml.datasets import fetch_bonus
from doubleml import DoubleMLData
import statsmodels.api as sm
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LassoCV
from doubleml import DoubleMLPLR
from doubleml import DoubleMLPLR
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV, LogisticRegressionCV
from sklearn.neural_network import MLPRegressor, MLPClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score, GridSearchCV
import warnings
warnings.filterwarnings('ignore')
np.random.seed(3293423)

# Load the Data

In [47]:
import wooldridge
df = wooldridge.data('jtrain2')
#df = df[df.re78>0]
print(df.shape)
df.head()

(445, 19)


Unnamed: 0,train,age,educ,black,hisp,married,nodegree,mosinex,re74,re75,re78,unem74,unem75,unem78,lre74,lre75,lre78,agesq,mostrn
0,1,37,11,1,0,1,1,13,0.0,0.0,9.93005,1,1,0,0.0,0.0,2.295566,1369,13
1,1,22,9,0,1,0,1,13,0.0,0.0,3.59589,1,1,0,0.0,0.0,1.279792,484,13
2,1,30,12,1,0,0,0,13,0.0,0.0,24.9095,1,1,0,0.0,0.0,3.215249,900,13
3,1,27,11,1,0,0,1,13,0.0,0.0,7.50615,1,1,0,0.0,0.0,2.015723,729,13
4,1,33,8,1,0,0,1,13,0.0,0.0,0.28979,1,1,0,0.0,0.0,-1.238599,1089,13


In [48]:
print(df.isnull().sum())

train       0
age         0
educ        0
black       0
hisp        0
married     0
nodegree    0
mosinex     0
re74        0
re75        0
re78        0
unem74      0
unem75      0
unem78      0
lre74       0
lre75       0
lre78       0
agesq       0
mostrn      0
dtype: int64


In [49]:
outcome = 're78'
treatment = 'train'
rest = ['age', 'educ', 'black', 'hisp', 'married', 're74', 're75']
df = df[[outcome] + [treatment] + rest]
y = np.array(df.re78).reshape(-1, 1)
d = np.array(df.train).astype(int).reshape(-1, 1)
x = np.array(df[rest])
print(y.shape, d.shape, x.shape)

(445, 1) (445, 1) (445, 7)


# First Stage

In [58]:
np.random.seed(42)
table = PrettyTable()
table.field_names = ['Estimator', 'Test Rsquared E[Y|X] = l(X)', 'Test Accuracy E[D|X] = m(X)']
a = ['Linear/Logistic',np.mean(cross_val_score(LinearRegression(), x, y, cv=5)),
     np.mean(cross_val_score(LogisticRegression(), x, d, cv=5))]
table.add_row(a)
a = ['Linear/Logistic (Reg)',np.mean(cross_val_score(LassoCV(), x, y, cv=5)),
     np.mean(cross_val_score(LogisticRegressionCV(), x, d, cv=5))]
table.add_row(a)
a = ['Random Forests',np.mean(cross_val_score(RandomForestRegressor(max_depth=2,n_estimators=500,verbose=0), x, y, cv=5)),
     np.mean(cross_val_score(RandomForestClassifier(max_depth=2, n_estimators=500), x, d, cv=5))]
table.add_row(a)
a = ['Boosting',np.mean(cross_val_score(XGBRegressor(max_depth=2,verbosity=0), x, y, cv=5)),
     np.mean(cross_val_score(XGBClassifier(verbosity=0,max_depth=2), x, d, cv=5))]
table.add_row(a)
a = ['Neural Networks',np.mean(cross_val_score(MLPRegressor((100,),max_iter=1000, learning_rate_init=0.01), normalize(x), y, cv=5)),
     np.mean(cross_val_score(MLPClassifier((100,), max_iter=1000,learning_rate_init=0.01), normalize(x), d, cv=5))]
table.add_row(a)
table.float_format = '0.3'
print(table)

+-----------------------+-----------------------------+-----------------------------+
|       Estimator       | Test Rsquared E[Y|X] = l(X) | Test Accuracy E[D|X] = m(X) |
+-----------------------+-----------------------------+-----------------------------+
|    Linear/Logistic    |            -0.076           |            0.548            |
| Linear/Logistic (Reg) |            -0.059           |            0.551            |
|     Random Forests    |            -0.100           |            0.553            |
|        Boosting       |            -0.436           |            0.542            |
|    Neural Networks    |            -0.073           |            0.557            |
+-----------------------+-----------------------------+-----------------------------+


# OLS

In [289]:
OLS = sm.OLS(y,sm.add_constant(np.c_[d, x])).fit()
OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.055
Model:,OLS,Adj. R-squared:,0.037
Method:,Least Squares,F-statistic:,3.161
Date:,"Thu, 15 Dec 2022",Prob (F-statistic):,0.00171
Time:,15:07:35,Log-Likelihood:,-1460.2
No. Observations:,445,AIC:,2938.0
Df Residuals:,436,BIC:,2975.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6217,2.447,0.254,0.800,-4.189,5.432
x1,1.6826,0.632,2.663,0.008,0.441,2.924
x2,0.0558,0.045,1.246,0.214,-0.032,0.144
x3,0.4059,0.176,2.311,0.021,0.061,0.751
x4,-2.1698,1.159,-1.873,0.062,-4.447,0.107
x5,0.1579,1.545,0.102,0.919,-2.879,3.195
x6,-0.1403,0.878,-0.160,0.873,-1.867,1.586
x7,0.0829,0.077,1.081,0.280,-0.068,0.234
x8,0.0515,0.134,0.384,0.701,-0.212,0.315

0,1,2,3
Omnibus:,284.573,Durbin-Watson:,2.059
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3759.179
Skew:,2.543,Prob(JB):,0.0
Kurtosis:,16.299,Cond. No.,247.0


# ML Estimation

In [294]:
l = LassoCV() # Model for E[Y|X]=E[θD+g(X)]
g = LassoCV() # Model for E[Y - θD|X]=g(X)
m = RandomForestClassifier(verbose=0,max_depth=3) # Model for E[D|X]

def score(y, d, l_hat, m_hat, g_hat, smpls):
    "Score function for Single ML"
    u_hat = y - g_hat
    psi_a = -np.multiply(d, d)
    psi_b = np.multiply(d, u_hat)
    return psi_a, psi_b

# Single-ML

In [295]:
data = DoubleMLData(df, y_col=outcome,d_cols=treatment,x_cols=rest)
SML = DoubleMLPLR(data, l, m, g, n_folds=1, apply_cross_fitting=False, score=score)
SML.fit()
print(SML.summary)

           coef  std err         t     P>|t|     2.5 %    97.5 %
train  1.678206  0.56537  2.968333  0.002994  0.570102  2.786311


# Orthogonal-ML

In [296]:
data = DoubleMLData(df,y_col=outcome,d_cols=treatment,x_cols=rest)
OML = DoubleMLPLR(data,l, m, g, n_folds=1,apply_cross_fitting=False,score='IV-type')
OML.fit();
print(OML.summary)

           coef   std err         t     P>|t|     2.5 %    97.5 %
train  1.665686  0.656571  2.536947  0.011182  0.378831  2.952542


# Orthogonal + Crossfitting (DML)

In [297]:
data = DoubleMLData(df,y_col=outcome,d_cols=treatment,x_cols=rest)
DML = DoubleMLPLR(data, l,m,g, n_folds=10,apply_cross_fitting=True,score='IV-type')
DML.fit();
print(DML.summary)

           coef  std err         t     P>|t|     2.5 %    97.5 %
train  1.675692  0.66064  2.536468  0.011198  0.380862  2.970522


# Summary

In [298]:
table = PrettyTable()
table.field_names = ['Estimator', 'θ_hat', 'Std Error','t','p','2.5%','97.25%']
idx = 1
a = ['OLS']+ np.c_[OLS.params[idx], OLS.bse[idx], OLS.tvalues[idx], OLS.pvalues[idx], np.nan, np.nan].reshape(-1).tolist()
table.add_row(a)
a = ['Single ML (SML)']+ np.array(SML.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Orthogonal ML (OML)']+ np.array(OML.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Double ML (DML)']+ np.array(DML.summary).reshape(-1).tolist()
table.add_row(a)
table.float_format = '0.3'
print(table)

+---------------------+-------+-----------+-------+-------+-------+--------+
|      Estimator      | θ_hat | Std Error |   t   |   p   |  2.5% | 97.25% |
+---------------------+-------+-----------+-------+-------+-------+--------+
|         OLS         | 1.683 |   0.632   | 2.663 | 0.008 |  nan  |  nan   |
|   Single ML (SML)   | 1.678 |   0.565   | 2.968 | 0.003 | 0.570 | 2.786  |
| Orthogonal ML (OML) | 1.666 |   0.657   | 2.537 | 0.011 | 0.379 | 2.953  |
|   Double ML (DML)   | 1.676 |   0.661   | 2.536 | 0.011 | 0.381 | 2.971  |
+---------------------+-------+-----------+-------+-------+-------+--------+
