# Packages

In [1]:
import numpy as np
import pandas as pd
from doubleml.datasets import fetch_bonus
from doubleml import DoubleMLData
import statsmodels.api as sm
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LassoCV
from doubleml import DoubleMLPLR
from doubleml import DoubleMLPLR
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV, LogisticRegressionCV
from sklearn.neural_network import MLPRegressor, MLPClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')
np.random.seed(3293423)

# Load the Data

In [2]:
np.random.seed(1)
df = fetch_bonus('DataFrame')
df.head(5)

Unnamed: 0,index,abdt,tg,inuidur1,inuidur2,female,black,hispanic,othrace,dep,...,recall,agelt35,agegt54,durable,nondurable,lusd,husd,muld,dep1,dep2
0,0,10824,0,2.890372,18,0,0,0,0,2,...,0,0,0,0,0,0,1,0,0.0,1.0
1,3,10824,0,0.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0.0,0.0
2,4,10747,0,3.295837,27,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0.0,0.0
3,11,10607,1,2.197225,9,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0.0,0.0
4,12,10831,0,3.295837,27,0,0,0,0,1,...,0,0,1,1,0,1,0,0,1.0,0.0


In [5]:
outcome = 'inuidur1'
treatment = 'tg'
rest = ['female', 'black', 'othrace', 'dep1', 'dep2','q2', 'q3', 'q4', 'q5', 'q6', 'agelt35', 'agegt54','durable', 'lusd', 'husd']
df = df[[outcome] + [treatment] + rest]
y = np.array(df.inuidur1).reshape(-1, 1)
d = np.array(df.tg).astype(int).reshape(-1, 1)
x = np.array(df[rest])
print(y.shape, d.shape, x.shape)

(5099, 1) (5099, 1) (5099, 15)


# First Stage

In [18]:
np.random.seed(42)
table = PrettyTable()
table.field_names = ['Estimator', 'Test Rsquared E[Y|X] = l(X)', 'Test Accuracy E[D|X] = m(X)']
a = ['Linear/Logistic',np.mean(cross_val_score(LinearRegression(), x, y, cv=5)),
     np.mean(cross_val_score(LogisticRegression(), x, d, cv=5))]
table.add_row(a)
a = ['Linear/Logistic (Reg)',np.mean(cross_val_score(LassoCV(), x, y, cv=5)),
     np.mean(cross_val_score(LogisticRegressionCV(), x, d, cv=5))]
table.add_row(a)
a = ['Random Forests',np.mean(cross_val_score(RandomForestRegressor(max_depth=5,n_estimators=500,verbose=0), x, y, cv=5)),
     np.mean(cross_val_score(RandomForestClassifier(max_depth=5, n_estimators=500), x, d, cv=5))]
table.add_row(a)
a = ['Boosting',np.mean(cross_val_score(XGBRegressor(max_depth=2,verbosity=0), x, y, cv=5)),
     np.mean(cross_val_score(XGBClassifier(verbosity=0,max_depth=2), x, d, cv=5))]
table.add_row(a)
a = ['Neural Networks',np.mean(cross_val_score(MLPRegressor((100,),max_iter=1000, learning_rate_init=0.01), normalize(x), y, cv=5)),
     np.mean(cross_val_score(MLPClassifier((100,), max_iter=1000,learning_rate_init=0.01), normalize(x), d, cv=5))]
table.add_row(a)
table.float_format = '0.3'
print(table)

+-----------------------+-----------------------------+-----------------------------+
|       Estimator       | Test Rsquared E[Y|X] = l(X) | Test Accuracy E[D|X] = m(X) |
+-----------------------+-----------------------------+-----------------------------+
|    Linear/Logistic    |            0.020            |            0.658            |
| Linear/Logistic (Reg) |            0.019            |            0.658            |
|     Random Forests    |            0.023            |            0.658            |
|        Boosting       |            0.027            |            0.657            |
|    Neural Networks    |            -0.034           |            0.621            |
+-----------------------+-----------------------------+-----------------------------+


# OLS

In [19]:
OLS = sm.OLS(y,sm.add_constant(np.c_[d,x])).fit()
OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.029
Model:,OLS,Adj. R-squared:,0.026
Method:,Least Squares,F-statistic:,9.534
Date:,"Thu, 15 Dec 2022",Prob (F-statistic):,6.2299999999999996e-24
Time:,22:53:17,Log-Likelihood:,-8151.2
No. Observations:,5099,AIC:,16340.0
Df Residuals:,5082,BIC:,16450.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.1785,0.159,13.700,0.000,1.867,2.490
x1,-0.0717,0.035,-2.022,0.043,-0.141,-0.002
x2,0.1264,0.035,3.629,0.000,0.058,0.195
x3,-0.2938,0.053,-5.545,0.000,-0.398,-0.190
x4,-0.4724,0.198,-2.381,0.017,-0.861,-0.084
x5,0.0299,0.054,0.552,0.581,-0.076,0.136
x6,0.0962,0.047,2.053,0.040,0.004,0.188
x7,0.0737,0.157,0.470,0.639,-0.234,0.381
x8,-0.0385,0.156,-0.246,0.806,-0.345,0.268

0,1,2,3
Omnibus:,1975.952,Durbin-Watson:,1.989
Prob(Omnibus):,0.0,Jarque-Bera (JB):,471.819
Skew:,-0.516,Prob(JB):,3.51e-103
Kurtosis:,1.924,Cond. No.,32.4


# ML Estimation

In [10]:
l = RandomForestRegressor(max_depth=5,n_estimators=500,verbose=0) # Model for E[Y|X]=E[θD+g(X)]
g = RandomForestRegressor(max_depth=5,n_estimators=500,verbose=0) # Model for E[Y - θD|X]=g(X)
m = RandomForestClassifier(max_depth=5,n_estimators=500,verbose=0) # Model for E[D|X]

def score(y, d, l_hat, m_hat, g_hat, smpls):
    "Score function for Single ML"
    u_hat = y - g_hat
    psi_a = -np.multiply(d, d)
    psi_b = np.multiply(d, u_hat)
    return psi_a, psi_b

# Single-ML

In [11]:
data = DoubleMLData(df, y_col=outcome,d_cols=treatment,x_cols=rest)
SML = DoubleMLPLR(data, l, m, g, n_folds=1, apply_cross_fitting=False, score=score)
SML.fit()
print(SML.summary)

       coef   std err         t     P>|t|     2.5 %   97.5 %
tg -0.07236  0.028204 -2.565557  0.010301 -0.127639 -0.01708


# Orthogonal-ML

In [12]:
data = DoubleMLData(df,y_col=outcome,d_cols=treatment,x_cols=rest)
OML = DoubleMLPLR(data,l, m, g, n_folds=1,apply_cross_fitting=False,score='IV-type')
OML.fit();
print(OML.summary)

        coef   std err         t     P>|t|     2.5 %    97.5 %
tg -0.074555  0.034835 -2.140201  0.032339 -0.142831 -0.006279


# Orthogonal + Crossfitting (DML)

In [15]:
data = DoubleMLData(df,y_col=outcome,d_cols=treatment,x_cols=rest)
DML = DoubleMLPLR(data, l,m,g, n_folds=10,apply_cross_fitting=True,score='IV-type')
DML.fit();
print(DML.summary)

        coef   std err         t     P>|t|     2.5 %    97.5 %
tg -0.070918  0.035426 -2.001898  0.045296 -0.140351 -0.001486


# Summary

In [16]:
table = PrettyTable()
table.field_names = ['Estimator', 'θ_hat', 'Std Error','t','p','2.5%','97.25%']
idx = 1
a = ['OLS']+ np.c_[OLS.params[idx], OLS.bse[idx], OLS.tvalues[idx], OLS.pvalues[idx], np.nan, np.nan].reshape(-1).tolist()
table.add_row(a)
a = ['Single ML (SML)']+ np.array(SML.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Orthogonal ML (OML)']+ np.array(OML.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Double ML (DML)']+ np.array(DML.summary).reshape(-1).tolist()
table.add_row(a)
table.float_format = '0.3'
print(table)

+---------------------+--------+-----------+--------+-------+--------+--------+
|      Estimator      | θ_hat  | Std Error |   t    |   p   |  2.5%  | 97.25% |
+---------------------+--------+-----------+--------+-------+--------+--------+
|         OLS         | -0.072 |   0.035   | -2.022 | 0.043 |  nan   |  nan   |
|   Single ML (SML)   | -0.072 |   0.028   | -2.566 | 0.010 | -0.128 | -0.017 |
| Orthogonal ML (OML) | -0.075 |   0.035   | -2.140 | 0.032 | -0.143 | -0.006 |
|   Double ML (DML)   | -0.071 |   0.035   | -2.002 | 0.045 | -0.140 | -0.001 |
+---------------------+--------+-----------+--------+-------+--------+--------+
