In [45]:
import statsmodels.api as sm
import numpy as np
from econml.dml import LinearDML
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from catboost import CatBoostRegressor, CatBoostClassifier
from scipy.stats import logistic

import pandas as pd

# Monte Carlo bookkeeping: one row per replication, one column per estimator
# (OLS, default LinearDML, LinearDML + CatBoost, LinearDML + MLP).
MC_N = 1
MC_θ = np.zeros((MC_N, 4))  # treatment-effect estimates
MC_y = np.zeros((MC_N, 4))  # first-stage fit scores for the outcome
MC_t = np.zeros((MC_N, 4))  # first-stage fit scores for the treatment

# The data set is identical in every replication, so it is loaded and cleaned
# once, outside the loop.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
DATA_PATH = '/Users/pranjal/Desktop/Causal-Inference/data/wage.csv'
df = pd.read_csv(DATA_PATH)
cat = df.select_dtypes('object').columns
df = pd.get_dummies(df, columns=cat, drop_first=True)
outcome = 'lwage'
treatment = 'educ'
rest = ['exper', 'age', 'kidslt6', 'kidsge6']
df = df[[outcome, treatment] + rest].dropna()
y = df[outcome]
t = df[treatment]
x = df[rest].astype('float')
print(x.shape, t.shape, y.shape)

# Plain 1-D arrays for econml (Series.ravel() is deprecated in recent pandas).
y_arr = y.to_numpy()
t_arr = t.to_numpy()

# Settings shared by both CatBoost nuisance models; only depth and logging
# frequency differ between the outcome and the treatment model.
catboost_common = dict(iterations=10000,
                       learning_rate=0.01,
                       loss_function='RMSE',
                       eval_metric="R2",
                       eval_fraction=0.2)

# Settings shared by both MLP nuisance models (full-batch training).
# NOTE(review): per the scikit-learn docs `momentum` is only used with
# solver='sgd'; with the default solver it has no effect -- confirm intent.
mlp_common = dict(random_state=1,
                  hidden_layer_sizes=(500, 100, 50),
                  batch_size=x.shape[0],
                  momentum=0.95,
                  max_iter=50000,
                  learning_rate_init=0.01,
                  verbose=False)

# NOTE(review): every LinearDML below uses cv=1, which appears to disable
# cross-fitting in econml, so the nuisance scores would be in-sample -- confirm
# this is intended.
for j in range(MC_N):
    # OLS of the outcome on treatment + controls; params[1] is the treatment
    # coefficient because the constant is prepended and t is the first column.
    model_OLS = sm.OLS(y, sm.add_constant(np.c_[t, x]))
    res = model_OLS.fit()
    θ_OLS = res.params[1]

    # OLS first stage for the outcome: y on controls only (R^2 is reported).
    model_OLS = sm.OLS(y, sm.add_constant(np.c_[x]))
    res_y = model_OLS.fit()

    # Linear first stage for the treatment: t on controls (R^2 is reported).
    clf = LinearRegression().fit(x, t)

    # DML with econml's default nuisance models.
    model_Lasso = LinearDML(discrete_treatment=False, random_state=1, cv=1)
    model_Lasso.fit(y_arr, t_arr, X=None, W=x)
    θ_DMLL = model_Lasso.intercept_

    # DML with gradient-boosted (CatBoost) nuisance models. The variable keeps
    # its historical name `model_XGB` because later cells refer to it.
    model_XGB = LinearDML(discrete_treatment=False, cv=1,
                          model_y=CatBoostRegressor(depth=12, verbose=100,
                                                    **catboost_common),
                          model_t=CatBoostRegressor(depth=8, verbose=1000,
                                                    **catboost_common))
    model_XGB.fit(y_arr, t_arr, X=None, W=x)
    θ_DMLRF = model_XGB.intercept_

    # DML with neural-network nuisance models. Both stages use MLPRegressor;
    # the treatment model was previously built with CatBoostRegressor while
    # being given MLPRegressor keyword arguments.
    model_NN = LinearDML(discrete_treatment=False, cv=1,
                         model_y=MLPRegressor(**mlp_common),
                         model_t=MLPRegressor(**mlp_common))
    model_NN.fit(y_arr, t_arr, X=None, W=x)
    θ_DMLNN = model_NN.intercept_  # no longer overwrites the boosting estimate

    # Record this replication: point estimates and first-stage scores.
    MC_θ[j] = [θ_OLS, θ_DMLL, θ_DMLRF, θ_DMLNN]
    MC_y[j] = [res_y.rsquared,
               np.mean(model_Lasso.nuisance_scores_y),
               np.mean(model_XGB.nuisance_scores_y),
               np.mean(model_NN.nuisance_scores_y)]
    MC_t[j] = [clf.score(x, t),
               np.mean(model_Lasso.nuisance_scores_t),
               np.mean(model_XGB.nuisance_scores_t),
               np.mean(model_NN.nuisance_scores_t)]
    
    

(428, 4) (428,) (428,)


CatBoostError: Invalid loss_function='Logloss': for regressor use RMSE, MultiRMSE, SurvivalAft, MAE, Quantile, LogLinQuantile, Poisson, MAPE, Lq or custom objective object

In [40]:
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
# Summary table: one row per statistic, each averaged over the Monte Carlo
# replications (axis 0), one column per estimator.
table = PrettyTable()
table.field_names = ['Var', 'OLS','DML-L','DML-Boost','DML-NN']
for label, draws in (('θ_hat', MC_θ),
                     ('First Stage Y R2', MC_y),
                     ('First Stage D R2', MC_t)):
    a = [label] + draws.mean(axis=0).tolist()
    table.add_row(a)
# Render floats with three decimals.
table.float_format = '0.3'
print(table)

+------------------+-------+-------+-----------+--------+
|       Var        |  OLS  | DML-L | DML-Boost | DML-NN |
+------------------+-------+-------+-----------+--------+
|      θ_hat       | 0.110 | 0.110 |   0.124   | 0.101  |
| First Stage Y R2 | 0.034 | 0.027 |   0.924   | 0.032  |
| First Stage D R2 | 0.031 | 0.031 |   0.866   | -0.024 |
+------------------+-------+-------+-----------+--------+


In [43]:
# Regression table for the outcome first stage: OLS of `lwage` on a constant
# plus the control columns only (no treatment), as fitted in the estimation cell.
res_y.summary()

0,1,2,3
Dep. Variable:,lwage,R-squared:,0.034
Model:,OLS,Adj. R-squared:,0.025
Method:,Least Squares,F-statistic:,3.772
Date:,"Fri, 09 Dec 2022",Prob (F-statistic):,0.005
Time:,18:20:51,Log-Likelihood:,-460.6
No. Observations:,428,AIC:,931.2
Df Residuals:,423,BIC:,951.5
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.2752,0.242,5.273,0.000,0.800,1.751
x1,0.0149,0.005,2.957,0.003,0.005,0.025
x2,-0.0053,0.006,-0.955,0.340,-0.016,0.006
x3,0.0006,0.094,0.007,0.995,-0.184,0.185
x4,-0.0428,0.030,-1.450,0.148,-0.101,0.015

0,1,2,3
Omnibus:,50.962,Durbin-Watson:,1.961
Prob(Omnibus):,0.0,Jarque-Bera (JB):,145.252
Skew:,-0.553,Prob(JB):,2.88e-32
Kurtosis:,5.631,Cond. No.,318.0


In [36]:
# Inference summary for the LinearDML model whose nuisance models are CatBoost
# regressors. It was fitted with X=None, so only the intercept is reported.
model_XGB.summary()

Coefficient Results:  X is None, please call intercept_inference to learn the constant!


0,1,2,3,4,5,6
,point_estimate,stderr,zstat,pvalue,ci_lower,ci_upper
cate_intercept,0.124,0.023,5.492,0.0,0.08,0.168


In [19]:
# Exploratory: list every attribute of the fitted estimator (used to locate
# fields such as the nuisance scores). Debugging aid -- consider removing it
# from the finished notebook.
dir(model_XGB)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_cached_values',
 '_check_fitted_dims',
 '_check_fitted_dims_w_z',
 '_check_input_dims',
 '_d_t',
 '_d_t_in',
 '_d_w',
 '_d_x',
 '_d_y',
 '_d_z',
 '_defer_to_inference',
 '_expand_treatments',
 '_fit_final',
 '_fit_nuisances',
 '_gen_featurizer',
 '_gen_model_final',
 '_gen_model_t',
 '_gen_model_y',
 '_gen_ortho_learner_model_final',
 '_gen_ortho_learner_model_nuisance',
 '_gen_rlearner_model_final',
 '_get_inference',
 '_get_inference_options',
 '_illegal_refit_inference_methods',
 '_inference',
 '_input_names',
 '_models_nuisance',
 '_original_treatment_featurizer',
 '_ortho_learn