In [5]:
import statsmodels.api as sm
import numpy as np
from econml.dml import LinearDML
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from catboost import CatBoostRegressor, CatBoostClassifier
from scipy.stats import logistic

MC_N = 1
MC_θ = np.zeros((MC_N,4))
MC_y = np.zeros((MC_N,4))
MC_t = np.zeros((MC_N,4))

for j in range(MC_N):
    import pandas as pd
    import numpy as np
    df = pd.read_csv('/Users/pranjal/Desktop/Causal-Inference/data/wage.csv')
    cat = df.select_dtypes('object').columns
    df = pd.get_dummies(df, columns = cat, drop_first = True)
    outcome = 'lwage'
    treatment = 'educ'
    #rest = list(df.drop([outcome, treatment], axis = 1).columns)
    rest = ['exper','age', 'kidslt6', 'kidsge6']
    df = df[[outcome] + [treatment] + rest]
    df = df.dropna()
    y = df[outcome]
    t = df[treatment]
    x = df[rest].astype('float')
    print(x.shape, t.shape, y.shape)
    
    # OLS - Full Estimation
    model_OLS = sm.OLS(y, sm.add_constant(np.c_[t,x]))
    res = model_OLS.fit()
    θ_OLS = res.params[1]
    # OLS First Stage: Y
    model_OLS = sm.OLS(y, sm.add_constant(np.c_[x]))
    res_y = model_OLS.fit()
    θ_OLS_y = res_y.params[1]    
    # Logistic First Stage
    clf = LinearRegression().fit(x, t)
    
    # DML Lasso
    model_Lasso = LinearDML(discrete_treatment=False, random_state=1, cv=1)
    model_Lasso.fit(y.ravel(), t.ravel(), X=None,W=x)
    θ_DMLL = model_Lasso.intercept_

    # DML RF
    model_XGB = LinearDML(discrete_treatment=False, cv=1,
                          model_y = CatBoostRegressor(iterations=10000,
                                                       depth=12,
                                                       learning_rate=0.01,
                                                       loss_function='RMSE',
                                                       verbose=100,
                                                       eval_metric="R2",
                                                       eval_fraction=0.2),
                          model_t = CatBoostRegressor(iterations=10000,
                                                       depth=8,
                                                       learning_rate=0.01,
                                                       loss_function='RMSE',
                                                       verbose=1000, 
                                                       eval_metric="R2",
                                                       eval_fraction=0.2))
    model_XGB.fit(y.ravel(), t.ravel(), X=None,W=x)
    θ_DMLRF = model_XGB.intercept_
    
    # DML NN - First Stage
    model_NN = LinearDML(discrete_treatment=False, cv =1,
                         model_y = MLPRegressor(random_state=1,
                                                 hidden_layer_sizes=(500,100,50), 
                                                 batch_size = x.shape[0],
                                                 momentum = 0.95, 
                                                 max_iter=50000, 
                                                 learning_rate_init=0.01, 
                                                 verbose=False), 
                         model_t = MLPRegressor(random_state=1,
                                                 hidden_layer_sizes=(500,100,50), 
                                                 batch_size = x.shape[0],
                                                 momentum = 0.95, 
                                                 max_iter=50000, 
                                                 learning_rate_init=0.01, 
                                                 verbose=False))
    model_NN.fit(y.ravel(), t.ravel(), X=None,W=x)
    θ_DMLRF = model_NN.intercept_
    MC_θ[j] = [θ_OLS, model_Lasso.intercept_, model_XGB.intercept_, model_NN.intercept_]
    MC_y[j] = [res_y.rsquared, np.mean(model_Lasso.nuisance_scores_y), np.mean(model_XGB.nuisance_scores_y),np.mean(model_NN.nuisance_scores_y)]
    MC_t[j] = [clf.score(x,t), np.mean(model_Lasso.nuisance_scores_t), np.mean(model_XGB.nuisance_scores_t),np.mean(model_NN.nuisance_scores_t)]

(428, 4) (428,) (428,)
0:	learn: 0.0007258	test: -0.0011942	best: -0.0011942 (0)	total: 55.6ms	remaining: 9m 16s
1000:	learn: 0.5074440	test: -0.0257967	best: 0.0395152 (212)	total: 455ms	remaining: 4.09s
2000:	learn: 0.7046222	test: -0.1371632	best: 0.0395152 (212)	total: 897ms	remaining: 3.58s
3000:	learn: 0.7841271	test: -0.2016425	best: 0.0395152 (212)	total: 1.56s	remaining: 3.63s
4000:	learn: 0.8214698	test: -0.2439995	best: 0.0395152 (212)	total: 2.1s	remaining: 3.15s
5000:	learn: 0.8432334	test: -0.2796843	best: 0.0395152 (212)	total: 2.58s	remaining: 2.58s
6000:	learn: 0.8576474	test: -0.3064900	best: 0.0395152 (212)	total: 3.1s	remaining: 2.06s
7000:	learn: 0.8676718	test: -0.3303604	best: 0.0395152 (212)	total: 3.52s	remaining: 1.51s
8000:	learn: 0.8747371	test: -0.3509029	best: 0.0395152 (212)	total: 3.94s	remaining: 984ms
9000:	learn: 0.8795011	test: -0.3656598	best: 0.0395152 (212)	total: 4.36s	remaining: 484ms
9999:	learn: 0.8826916	test: -0.3776074	best: 0.0395152 (212)

7800:	learn: 0.9151276	test: -0.2638501	best: 0.0046983 (112)	total: 23.7s	remaining: 6.68s
7900:	learn: 0.9152386	test: -0.2640022	best: 0.0046983 (112)	total: 24s	remaining: 6.38s
8000:	learn: 0.9153417	test: -0.2644598	best: 0.0046983 (112)	total: 24.3s	remaining: 6.07s
8100:	learn: 0.9154321	test: -0.2647270	best: 0.0046983 (112)	total: 24.6s	remaining: 5.76s
8200:	learn: 0.9155183	test: -0.2649903	best: 0.0046983 (112)	total: 24.9s	remaining: 5.46s
8300:	learn: 0.9155987	test: -0.2654581	best: 0.0046983 (112)	total: 25.3s	remaining: 5.17s
8400:	learn: 0.9156766	test: -0.2655730	best: 0.0046983 (112)	total: 25.6s	remaining: 4.88s
8500:	learn: 0.9157456	test: -0.2657731	best: 0.0046983 (112)	total: 26s	remaining: 4.58s
8600:	learn: 0.9158130	test: -0.2660802	best: 0.0046983 (112)	total: 26.3s	remaining: 4.27s
8700:	learn: 0.9158812	test: -0.2664012	best: 0.0046983 (112)	total: 26.6s	remaining: 3.97s
8800:	learn: 0.9159427	test: -0.2664898	best: 0.0046983 (112)	total: 26.9s	remaining

In [6]:
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
table = PrettyTable()
table.field_names = ['Var', 'OLS','DML-L','DML-Boost','DML-NN']
a = ['θ_hat']+ np.mean(MC_θ, axis = 0).tolist()
table.add_row(a)
a = ['First Stage Y R2']+ np.mean(MC_y, axis = 0).tolist()
table.add_row(a)
a = ['First Stage D Accuracy']+ np.mean(MC_t, axis = 0).tolist()
table.add_row(a)
table.float_format = '0.3'
print(table)

+------------------------+-------+-------+-----------+--------+
|          Var           |  OLS  | DML-L | DML-Boost | DML-NN |
+------------------------+-------+-------+-----------+--------+
|         θ_hat          | 0.110 | 0.110 |   0.111   | 0.101  |
|    First Stage Y R2    | 0.034 | 0.027 |   0.144   | 0.032  |
| First Stage D Accuracy | 0.031 | 0.031 |   0.154   | -0.024 |
+------------------------+-------+-------+-----------+--------+


# First Stage

In [141]:
# Checking Logistic regression

import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,t, test_size=0.3, random_state = 42)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape) 
model = LogisticRegression()
model.fit(x_train, y_train)
preds_class = model.predict(x_test)
preds_proba = model.predict_proba(x_test)
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, f1_score
log_loss(y_test, preds_proba[:,0]), roc_auc_score(y_test, preds_proba[:,0]), accuracy_score(y_test, preds_class), f1_score(y_test, preds_class)

(364, 1) (364,) (157, 1) (157,)


(0.714211406639855, 0.4621732026143791, 0.5414012738853503, 0.7024793388429752)

In [156]:
# Checking Catboost

import numpy as np
from catboost import CatBoostClassifier, Pool, metrics
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,t, test_size=0.3)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape) 

# initialize data
train_data = x_train
train_labels = y_train
test_data = catboost_pool = Pool(x_test,y_test)
model = CatBoostClassifier(iterations=10000,
                           depth=8,
                           learning_rate=0.01,
                           loss_function='Logloss',
                           verbose=1000, 
                           eval_metric="Accuracy",
                           eval_fraction=0.2)
model.fit(x_train, y_train)

(364, 1) (364,) (157, 1) (157,)
0:	learn: 0.5993151	test: 0.5342466	best: 0.5342466 (0)	total: 1.26ms	remaining: 12.6s
1000:	learn: 0.7191781	test: 0.5479452	best: 0.6027397 (7)	total: 907ms	remaining: 8.15s
2000:	learn: 0.7363014	test: 0.5753425	best: 0.6027397 (7)	total: 1.99s	remaining: 7.96s
3000:	learn: 0.7397260	test: 0.5616438	best: 0.6027397 (7)	total: 3.04s	remaining: 7.09s
4000:	learn: 0.7397260	test: 0.5616438	best: 0.6027397 (7)	total: 4.09s	remaining: 6.13s
5000:	learn: 0.7397260	test: 0.5616438	best: 0.6027397 (7)	total: 5.06s	remaining: 5.06s
6000:	learn: 0.7397260	test: 0.5616438	best: 0.6027397 (7)	total: 6.17s	remaining: 4.11s
7000:	learn: 0.7397260	test: 0.5616438	best: 0.6027397 (7)	total: 7.22s	remaining: 3.09s
8000:	learn: 0.7397260	test: 0.5616438	best: 0.6027397 (7)	total: 8.26s	remaining: 2.06s
9000:	learn: 0.7397260	test: 0.5616438	best: 0.6027397 (7)	total: 9.26s	remaining: 1.03s
9999:	learn: 0.7397260	test: 0.5616438	best: 0.6027397 (7)	total: 10.3s	remainin

<catboost.core.CatBoostClassifier at 0x133913910>

In [171]:
# Checking Catboost

import numpy as np
from catboost import CatBoostClassifier, Pool, metrics
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape) 

# initialize data
train_data = x_train
train_labels = y_train
test_data = catboost_pool = Pool(x_test,y_test)
model = CatBoostRegressor(iterations=10000,
                           depth=12,
                           learning_rate=0.01,
                           loss_function='RMSE',
                           verbose=100,
                           eval_metric="R2",
                           eval_fraction=0.2)
model.fit(x_train, y_train)

(364, 1) (364,) (157, 1) (157,)
0:	learn: 0.0075058	test: 0.0024301	best: 0.0024301 (0)	total: 630us	remaining: 6.3s
100:	learn: 0.4075502	test: 0.3125693	best: 0.3125693 (100)	total: 394ms	remaining: 38.6s
200:	learn: 0.4970583	test: 0.3757928	best: 0.3757928 (200)	total: 753ms	remaining: 36.7s
300:	learn: 0.5270238	test: 0.3940882	best: 0.3940910 (299)	total: 1.27s	remaining: 41.1s
400:	learn: 0.5442462	test: 0.4024884	best: 0.4024884 (400)	total: 1.67s	remaining: 40s
500:	learn: 0.5570542	test: 0.4042267	best: 0.4042267 (500)	total: 2.06s	remaining: 39s
600:	learn: 0.5685458	test: 0.4076423	best: 0.4076423 (600)	total: 2.47s	remaining: 38.6s
700:	learn: 0.5807441	test: 0.4108128	best: 0.4108128 (700)	total: 2.89s	remaining: 38.4s
800:	learn: 0.5919856	test: 0.4128935	best: 0.4128935 (800)	total: 3.32s	remaining: 38.1s
900:	learn: 0.6019592	test: 0.4156672	best: 0.4159072 (892)	total: 3.79s	remaining: 38.3s
1000:	learn: 0.6099588	test: 0.4165554	best: 0.4167026 (980)	total: 4.26s	rem

KeyboardInterrupt: 