In [34]:
import statsmodels.api as sm
import numpy as np
from econml.dml import LinearDML
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from catboost import CatBoostRegressor, CatBoostClassifier
from scipy.stats import logistic

MC_N = 1
MC_θ = np.zeros((MC_N,4))
MC_y = np.zeros((MC_N,4))
MC_t = np.zeros((MC_N,4))

for j in range(MC_N):
    import pandas as pd
    import numpy as np
    df = pd.read_csv('/Users/pranjal/Desktop/Causal-Inference/data/wage.csv')
    cat = df.select_dtypes('object').columns
    df = pd.get_dummies(df, columns = cat, drop_first = True)
    outcome = 'lwage'
    treatment = 'educ'
    #rest = list(df.drop([outcome, treatment], axis = 1).columns)
    rest = ['exper','age', 'kidslt6', 'kidsge6']
    df = df[[outcome] + [treatment] + rest]
    df = df.dropna()
    y = df[outcome]
    t = df[treatment]
    x = df[rest].astype('float')
    print(x.shape, t.shape, y.shape)
    
    # OLS - Full Estimation
    model_OLS = sm.OLS(y, np.c_[t,x])
    res = model_OLS.fit()
    θ_OLS = res.params[0]
    # OLS First Stage: Y
    model_OLS = sm.OLS(y, np.c_[x])
    res_y = model_OLS.fit()
    θ_OLS_y = res_y.params[0]    
    # Logistic First Stage
    clf = LinearRegression().fit(x, t)
    
    # DML Lasso
    model_Lasso = LinearDML(discrete_treatment=False, random_state=1, cv=1)
    model_Lasso.fit(y.ravel(), t.ravel(), X=None,W=x)
    θ_DMLL = model_Lasso.intercept_

    # DML RF
    model_XGB = LinearDML(discrete_treatment=False, cv=1,
                          model_y = CatBoostRegressor(learning_rate = 1, max_depth = 8,verbose = False), 
                          model_t = CatBoostRegressor(learning_rate = 1, max_depth = 8,verbose = False))
    model_XGB.fit(y.ravel(), t.ravel(), X=None,W=x)
    θ_DMLRF = model_XGB.intercept_
    
    # DML NN - First Stage
    model_NN = LinearDML(discrete_treatment=False, cv =1,
                         model_y = MLPRegressor(random_state=1,
                                                 hidden_layer_sizes=(500,100,50), 
                                                 batch_size = x.shape[0],
                                                 momentum = 0.95, 
                                                 max_iter=50000, 
                                                 learning_rate_init=0.01, 
                                                 verbose=True), 
                         model_t = MLPRegressor(random_state=1,
                                                 hidden_layer_sizes=(500,100,50), 
                                                 batch_size = x.shape[0],
                                                 momentum = 0.95, 
                                                 max_iter=50000, 
                                                 learning_rate_init=0.01, 
                                                 verbose=True))
    model_NN.fit(y.ravel(), t.ravel(), X=None,W=x)
    θ_DMLRF = model_NN.intercept_


    MC_θ[j] = [θ_OLS, model_Lasso.intercept_, model_XGB.intercept_, model_NN.intercept_]
    MC_y[j] = [res_y.rsquared, np.mean(model_Lasso.nuisance_scores_y), np.mean(model_XGB.nuisance_scores_y),np.mean(model_NN.nuisance_scores_y)]
    MC_t[j] = [clf.score(x,t), np.mean(model_Lasso.nuisance_scores_t), np.mean(model_XGB.nuisance_scores_t),np.mean(model_NN.nuisance_scores_t)]
    
    

(428, 4) (428,) (428,)
Iteration 1, loss = 132.06658334
Iteration 2, loss = 12.55072934
Iteration 3, loss = 12.37243333
Iteration 4, loss = 6.29844335
Iteration 5, loss = 10.37424082
Iteration 6, loss = 5.68161515
Iteration 7, loss = 8.85088928
Iteration 8, loss = 6.71120623
Iteration 9, loss = 5.85937852
Iteration 10, loss = 7.58362144
Iteration 11, loss = 5.74863546
Iteration 12, loss = 5.83543903
Iteration 13, loss = 6.79158685
Iteration 14, loss = 5.35873528
Iteration 15, loss = 5.64911159
Iteration 16, loss = 6.21231545
Iteration 17, loss = 5.20640355
Iteration 18, loss = 5.38094762
Iteration 19, loss = 5.82307933
Iteration 20, loss = 5.06674082
Iteration 21, loss = 5.19690536
Iteration 22, loss = 5.52087117
Iteration 23, loss = 4.97353376
Iteration 24, loss = 5.04118388
Iteration 25, loss = 5.29031963
Iteration 26, loss = 4.88921507
Iteration 27, loss = 4.99689521
Iteration 28, loss = 5.00078524
Iteration 29, loss = 4.77064233
Iteration 30, loss = 4.80403559
Iteration 31, loss = 

Iteration 123, loss = 0.25355896
Iteration 124, loss = 0.25017309
Iteration 125, loss = 0.24932516
Iteration 126, loss = 0.25113342
Iteration 127, loss = 0.25199818
Iteration 128, loss = 0.24988296
Iteration 129, loss = 0.24770040
Iteration 130, loss = 0.24686307
Iteration 131, loss = 0.24769704
Iteration 132, loss = 0.24970435
Iteration 133, loss = 0.25024931
Iteration 134, loss = 0.25110292
Iteration 135, loss = 0.24754103
Iteration 136, loss = 0.24540497
Iteration 137, loss = 0.24429052
Iteration 138, loss = 0.24444638
Iteration 139, loss = 0.24595634
Iteration 140, loss = 0.24866737
Iteration 141, loss = 0.25740238
Iteration 142, loss = 0.25375752
Iteration 143, loss = 0.24862723
Iteration 144, loss = 0.24283447
Iteration 145, loss = 0.24852844
Iteration 146, loss = 0.25382283
Iteration 147, loss = 0.24285375
Iteration 148, loss = 0.24796800
Iteration 149, loss = 0.25526744
Iteration 150, loss = 0.24185450
Iteration 151, loss = 0.25172232
Iteration 152, loss = 0.25571722
Iteration 

In [35]:
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
table = PrettyTable()
table.field_names = ['Var', 'OLS','DML-L','DML-Boost','DML-NN']
a = ['θ_hat']+ np.mean(MC_θ, axis = 0).tolist()
table.add_row(a)
a = ['First Stage Y R2']+ np.mean(MC_y, axis = 0).tolist()
table.add_row(a)
a = ['First Stage D R2']+ np.mean(MC_t, axis = 0).tolist()
table.add_row(a)
table.float_format = '0.3'
print(table)

+------------------+-------+-------+-----------+--------+
|       Var        |  OLS  | DML-L | DML-Boost | DML-NN |
+------------------+-------+-------+-----------+--------+
|      θ_hat       | 0.103 | 0.110 |   0.124   | 0.101  |
| First Stage Y R2 | 0.723 | 0.027 |   0.924   | 0.032  |
| First Stage D R2 | 0.031 | 0.031 |   0.866   | -0.024 |
+------------------+-------+-------+-----------+--------+


In [37]:
res.summary()

0,1,2,3
Dep. Variable:,lwage,R-squared (uncentered):,0.771
Model:,OLS,Adj. R-squared (uncentered):,0.768
Method:,Least Squares,F-statistic:,285.0
Date:,"Fri, 09 Dec 2022",Prob (F-statistic):,5.9300000000000005e-133
Time:,17:56:25,Log-Likelihood:,-433.42
No. Observations:,428,AIC:,876.8
Df Residuals:,423,BIC:,897.1
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.1027,0.011,9.427,0.000,0.081,0.124
x2,0.0155,0.005,3.274,0.001,0.006,0.025
x3,-0.0062,0.004,-1.745,0.082,-0.013,0.001
x4,-0.0881,0.087,-1.013,0.312,-0.259,0.083
x5,-0.0267,0.025,-1.070,0.285,-0.076,0.022

0,1,2,3
Omnibus:,77.868,Durbin-Watson:,1.987
Prob(Omnibus):,0.0,Jarque-Bera (JB):,298.33
Skew:,-0.757,Prob(JB):,1.65e-65
Kurtosis:,6.8,Cond. No.,125.0


In [36]:
model_XGB.summary()

Coefficient Results:  X is None, please call intercept_inference to learn the constant!


0,1,2,3,4,5,6
,point_estimate,stderr,zstat,pvalue,ci_lower,ci_upper
cate_intercept,0.124,0.023,5.492,0.0,0.08,0.168


In [19]:
dir(model_XGB)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_cached_values',
 '_check_fitted_dims',
 '_check_fitted_dims_w_z',
 '_check_input_dims',
 '_d_t',
 '_d_t_in',
 '_d_w',
 '_d_x',
 '_d_y',
 '_d_z',
 '_defer_to_inference',
 '_expand_treatments',
 '_fit_final',
 '_fit_nuisances',
 '_gen_featurizer',
 '_gen_model_final',
 '_gen_model_t',
 '_gen_model_y',
 '_gen_ortho_learner_model_final',
 '_gen_ortho_learner_model_nuisance',
 '_gen_rlearner_model_final',
 '_get_inference',
 '_get_inference_options',
 '_illegal_refit_inference_methods',
 '_inference',
 '_input_names',
 '_models_nuisance',
 '_original_treatment_featurizer',
 '_ortho_learn