# Packages

In [2]:
import numpy as np
import pandas as pd
from doubleml.datasets import fetch_bonus
from doubleml import DoubleMLData
import statsmodels.api as sm
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LassoCV
from doubleml import DoubleMLPLR
from doubleml import DoubleMLPLR
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV, LogisticRegressionCV
from sklearn.neural_network import MLPRegressor, MLPClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')
np.random.seed(3293423)

# Load the Data

In [3]:
import pandas as pd
import numpy as np
df = pd.read_csv('/Users/pranjal/Desktop/Causal-Inference/data/nyvoucher.csv')
df.head()

Unnamed: 0,s_id,voucher,pre_ach,post_ach
0,42,0,74.0,83.0
1,194,0,7.5,4.0
2,218,1,2.5,3.5
3,261,1,0.0,26.5
4,304,1,11.0,2.0


In [4]:
print(df.isnull().sum())

s_id        0
voucher     0
pre_ach     0
post_ach    0
dtype: int64


In [7]:
outcome = 'post_ach'
treatment = 'voucher'
rest = list(df.drop([outcome, treatment], axis = 1).columns)
df = df[[outcome] + [treatment] + rest]
y = np.array(df.post_ach).reshape(-1, 1)
d = np.array(df.voucher).astype(int).reshape(-1, 1)
x = np.array(df[rest])
print(y.shape, d.shape, x.shape)

(521, 1) (521, 1) (521, 1)


# First Stage

In [11]:
np.random.seed(42)
table = PrettyTable()
table.field_names = ['Estimator', 'g(X):Rsquared', 'm(X):Accuracy']
a = ['Linear/Logistic',np.mean(cross_val_score(LinearRegression(), x, y, cv=5)),np.mean(cross_val_score(LogisticRegression(), x, d, cv=5))]
table.add_row(a)
a = ['Linear/Logistic (Reg)',np.mean(cross_val_score(LassoCV(), x, y, cv=5)),np.mean(cross_val_score(LogisticRegressionCV(), x, d, cv=5))]
table.add_row(a)
a = ['Random Forests',np.mean(cross_val_score(RandomForestRegressor(max_depth=5,n_estimators=500,verbose=0), x, y, cv=5)),np.mean(cross_val_score(RandomForestClassifier(max_depth=5, n_estimators=500), x, d, cv=5))]
table.add_row(a)
a = ['Boosting',np.mean(cross_val_score(XGBRegressor(max_depth=3,verbosity=0), x, y, cv=5)),np.mean(cross_val_score(XGBClassifier(verbosity=0,max_depth=5), x, d, cv=5))]
table.add_row(a)
a = ['Neural Networks',np.mean(cross_val_score(MLPRegressor((20,10,), activation = 'tanh', max_iter=500, learning_rate_init=0.01), normalize(x), y, cv=5)),np.mean(cross_val_score(MLPClassifier((20,10,), activation = 'tanh', max_iter=500,learning_rate_init=0.01), normalize(x), d, cv=5))]
table.add_row(a)
table.float_format = '0.3'
print(table)

+-----------------------+---------------+---------------+
|       Estimator       | g(X):Rsquared | m(X):Accuracy |
+-----------------------+---------------+---------------+
|    Linear/Logistic    |     -0.039    |     0.559     |
| Linear/Logistic (Reg) |     -0.035    |     0.559     |
|     Random Forests    |     -0.203    |     0.449     |
|        Boosting       |     -0.680    |     0.443     |
|    Neural Networks    |     -0.016    |     0.559     |
+-----------------------+---------------+---------------+


# OLS

In [12]:
OLS = sm.OLS(y,sm.add_constant(np.c_[d,x])).fit()
OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.016
Model:,OLS,Adj. R-squared:,0.012
Method:,Least Squares,F-statistic:,4.278
Date:,"Thu, 15 Dec 2022",Prob (F-statistic):,0.0144
Time:,10:16:03,Log-Likelihood:,-2274.2
No. Observations:,521,AIC:,4554.0
Df Residuals:,518,BIC:,4567.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,21.5205,1.780,12.091,0.000,18.024,25.017
x1,4.9337,1.688,2.923,0.004,1.618,8.250
x2,-3.925e-05,0.000,-0.310,0.757,-0.000,0.000

0,1,2,3
Omnibus:,72.843,Durbin-Watson:,1.981
Prob(Omnibus):,0.0,Jarque-Bera (JB):,100.341
Skew:,1.038,Prob(JB):,1.63e-22
Kurtosis:,3.562,Cond. No.,31200.0


# ML Estimation

In [20]:
l = LassoCV() # Model for E[Y|X]=E[θD+g(X)]
g = LassoCV() # Model for E[Y - θD|X]=g(X)
m = LogisticRegressionCV() # Model for E[D|X]

def score(y, d, l_hat, m_hat, g_hat, smpls):
    "Score function for Single ML"
    u_hat = y - g_hat
    psi_a = -np.multiply(d, d)
    psi_b = np.multiply(d, u_hat)
    return psi_a, psi_b

# Single-ML

In [21]:
data = DoubleMLData(df, y_col=outcome,d_cols=treatment,x_cols=rest)
SML = DoubleMLPLR(data, l, m, g, n_folds=1, apply_cross_fitting=False, score=score)
SML.fit()
print(SML.summary)

             coef   std err         t     P>|t|     2.5 %    97.5 %
voucher  4.917762  1.156009  4.254087  0.000021  2.652026  7.183498


# Orthogonal-ML

In [22]:
data = DoubleMLData(df,y_col=outcome,d_cols=treatment,x_cols=rest)
OML = DoubleMLPLR(data,l, m, g, n_folds=1,apply_cross_fitting=False,score='IV-type')
OML.fit();
print(OML.summary)

             coef   std err         t     P>|t|     2.5 %    97.5 %
voucher  4.934872  1.660423  2.972057  0.002958  1.680503  8.189241


# Orthogonal + Crossfitting (DML)

In [23]:
data = DoubleMLData(df,y_col=outcome,d_cols=treatment,x_cols=rest)
DML = DoubleMLPLR(data, l,m,g, n_folds=5,apply_cross_fitting=True,score='IV-type')
DML.fit();
print(DML.summary)

             coef   std err         t     P>|t|     2.5 %    97.5 %
voucher  5.056453  1.666632  3.033935  0.002414  1.789915  8.322991


# Summary

In [24]:
table = PrettyTable()
table.field_names = ['Estimator', 'θ_hat', 'Std Error','t','p','2.5%','97.25%']
idx = 1
a = ['OLS']+ np.c_[OLS.params[idx], OLS.bse[idx], OLS.tvalues[idx], OLS.pvalues[idx], np.nan, np.nan].reshape(-1).tolist()
table.add_row(a)
a = ['Single ML (SML)']+ np.array(SML.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Orthogonal ML (OML)']+ np.array(OML.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Double ML (DML)']+ np.array(DML.summary).reshape(-1).tolist()
table.add_row(a)
table.float_format = '0.3'
print(table)

+---------------------+-------+-----------+-------+-------+-------+--------+
|      Estimator      | θ_hat | Std Error |   t   |   p   |  2.5% | 97.25% |
+---------------------+-------+-----------+-------+-------+-------+--------+
|         OLS         | 4.934 |   1.688   | 2.923 | 0.004 |  nan  |  nan   |
|   Single ML (SML)   | 4.918 |   1.156   | 4.254 | 0.000 | 2.652 | 7.183  |
| Orthogonal ML (OML) | 4.935 |   1.660   | 2.972 | 0.003 | 1.681 | 8.189  |
|   Double ML (DML)   | 5.056 |   1.667   | 3.034 | 0.002 | 1.790 | 8.323  |
+---------------------+-------+-----------+-------+-------+-------+--------+
