# Packages

In [5]:
import numpy as np
import pandas as pd
from doubleml.datasets import fetch_bonus
from doubleml import DoubleMLData
import statsmodels.api as sm
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LassoCV
from doubleml import DoubleMLPLR
from doubleml import DoubleMLPLR
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV
from sklearn.neural_network import MLPRegressor, MLPClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')
np.random.seed(3293423)

# Load the Data

In [6]:
df = pd.read_csv('/Users/pranjal/Desktop/Causal-Inference/data/wage.csv')
cat = df.select_dtypes('object').columns
df = pd.get_dummies(df, columns = cat, drop_first = True)
df = df[df.wage>0]
print(df.shape)
df.head()

(428, 22)


Unnamed: 0,inlf,hours,kidslt6,kidsge6,age,educ,wage,repwage,hushrs,husage,...,faminc,mtr,motheduc,fatheduc,unem,city,exper,nwifeinc,lwage,expersq
0,1,1610,1,0,32,12,3.354,2.65,2708,34,...,16310,0.7215,12,7,5.0,0,14,10.91006,1.210154,196
1,1,1656,0,2,30,12,1.3889,2.65,2310,30,...,21800,0.6615,7,7,11.0,1,5,19.499981,0.328512,25
2,1,1980,1,3,35,12,4.5455,4.04,3072,40,...,21040,0.6915,12,7,5.0,0,15,12.03991,1.514138,225
3,1,456,0,3,34,12,1.0965,3.25,1920,53,...,7300,0.7815,7,7,5.0,0,6,6.799996,0.092123,36
4,1,1568,1,2,31,14,4.5918,3.6,2000,32,...,27300,0.6215,12,14,9.5,1,7,20.100058,1.524272,49


In [7]:
print(df.isnull().sum())

inlf        0
hours       0
kidslt6     0
kidsge6     0
age         0
educ        0
wage        0
repwage     0
hushrs      0
husage      0
huseduc     0
huswage     0
faminc      0
mtr         0
motheduc    0
fatheduc    0
unem        0
city        0
exper       0
nwifeinc    0
lwage       0
expersq     0
dtype: int64


In [8]:
outcome = 'lwage'
treatment = 'educ'
#rest = ['exper','age', 'kidslt6', 'kidsge6']
rest = ['inlf', 'hours', 'kidslt6', 'kidsge6', 'age',
       'hushrs', 'husage', 'huseduc', 'huswage', 'faminc', 'mtr', 'motheduc',
       'fatheduc', 'unem', 'city', 'exper', 'nwifeinc']
df = df[[outcome] + [treatment] + rest]
y = df[outcome]
d = df[treatment]
x = df[rest].astype('float')
print(y.shape, d.shape, x.shape)

(428,) (428,) (428, 17)


# First Stage

In [20]:
k=int(df.shape[0]*0.8)
df = df.sample(df.shape[0])
y = df[outcome]
x = df[rest]

# 10-Fold Cross validation
print(np.mean(cross_val_score(LinearRegression(), x, y, cv=5)))
print(np.mean(cross_val_score(LassoCV(), x, y, cv=10)))
print(np.mean(cross_val_score(RandomForestRegressor(), x, y, cv=5)))
print(np.mean(cross_val_score(CatBoostRegressor(verbose=0), x, y, cv=5)))
print(np.mean(cross_val_score(XGBRegressor(verbosity=0), x, y, cv=5)))
print(np.mean(cross_val_score(MLPRegressor(learning_rate_init=0.01), normalize(x), y, cv=5)))

0.5713841904532764
0.10292380764276343
0.26051816021577745
0.3279689400973198
0.23484764049253437
0.1445923421779551


In [17]:
k=int(df.shape[0]*0.8)
df = df.sample(df.shape[0])
d = df[treatment]
x = df[rest]

# 10-Fold Cross validation
print(np.mean(cross_val_score(LinearRegression(), x, d, cv=5)))
print(np.mean(cross_val_score(LassoCV(), x, d, cv=5)))
print(np.mean(cross_val_score(RandomForestRegressor(max_depth=4,), x, d, cv=5)))
print(np.mean(cross_val_score(CatBoostRegressor(max_depth=4,verbose=0), x, d, cv=5)))
print(np.mean(cross_val_score(XGBRegressor(max_depth=4,verbosity=0), x, d, cv=5)))
print(np.mean(cross_val_score(MLPRegressor(learning_rate_init=0.01), normalize(x), d, cv=5)))

0.4332739818310484
0.10186676640388266
0.40709531290917267
0.41184566914186416
0.26617606542247996
0.05690641072621818


# OLS

In [219]:
OLS = sm.OLS(y,sm.add_constant(np.c_[d,x])).fit()
OLS.summary()

0,1,2,3
Dep. Variable:,educ,R-squared:,0.518
Model:,OLS,Adj. R-squared:,0.498
Method:,Least Squares,F-statistic:,25.95
Date:,"Wed, 14 Dec 2022",Prob (F-statistic):,1.2499999999999999e-54
Time:,22:57:32,Log-Likelihood:,-804.24
No. Observations:,428,AIC:,1644.0
Df Residuals:,410,BIC:,1718.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0360,0.035,1.032,0.303,-0.033,0.105
const,6.3380,2.856,2.219,0.027,0.724,11.952
x2,-0.0006,0.000,-4.700,0.000,-0.001,-0.000
x3,0.5511,0.221,2.489,0.013,0.116,0.986
x4,-0.0810,0.071,-1.146,0.253,-0.220,0.058
x5,-0.0023,0.025,-0.093,0.926,-0.051,0.046
x6,0.0001,0.000,0.686,0.493,-0.000,0.001
x7,0.0088,0.023,0.388,0.698,-0.036,0.053
x8,0.3128,0.032,9.757,0.000,0.250,0.376

0,1,2,3
Omnibus:,11.743,Durbin-Watson:,1.93
Prob(Omnibus):,0.003,Jarque-Bera (JB):,14.683
Skew:,-0.275,Prob(JB):,0.000648
Kurtosis:,3.722,Cond. No.,1370000.0


# ML Estimation

In [25]:
l = LinearRegression() # Model for E[Y|X]=E[θD+g(X)]
g = LinearRegression() # Model for E[Y - θD|X]=g(X)
m = RandomForestRegressor(max_depth=4) # Model for E[D|X]

def score(y, d, l_hat, m_hat, g_hat, smpls):
    "Score function for Single ML"
    u_hat = y - g_hat
    psi_a = -np.multiply(d, d)
    psi_b = np.multiply(d, u_hat)
    return psi_a, psi_b

# Single-ML

In [26]:
data = DoubleMLData(df, y_col='lwage',d_cols='educ',x_cols=rest)
SML = DoubleMLPLR(data, l, m, g, n_folds=1, apply_cross_fitting=False, score=score)
SML.fit()
print(SML.summary)

          coef   std err         t         P>|t|     2.5 %    97.5 %
educ  0.015509  0.001737  8.927275  4.366311e-19  0.012104  0.018915


# Orthogonal-ML

In [27]:
data = DoubleMLData(df,y_col='lwage',d_cols='educ',x_cols=rest)
OML = DoubleMLPLR(data,l, m, g, n_folds=1,apply_cross_fitting=False,score='IV-type')
OML.fit();
print(OML.summary)

          coef   std err         t    P>|t|    2.5 %    97.5 %
educ  0.016252  0.012394  1.311289  0.18976 -0.00804  0.040544


# Orthogonal + Crossfitting (DML)

In [28]:
data = DoubleMLData(df,y_col='lwage',d_cols='educ',x_cols=rest)
DML = DoubleMLPLR(data, l,m,g, n_folds=5,apply_cross_fitting=True,score='IV-type')
DML.fit();
print(DML.summary)

          coef   std err        t     P>|t|     2.5 %    97.5 %
educ  0.013372  0.012744  1.04926  0.294058 -0.011606  0.038349


# Summary

In [171]:
table = PrettyTable()
table.field_names = ['Estimator', 'θ_hat', 'Std Error','t','p','2.5%','97.25%']
idx = 0
a = ['OLS']+ np.c_[OLS.params[idx], OLS.bse[idx], OLS.tvalues[idx], OLS.pvalues[idx], np.nan, np.nan].reshape(-1).tolist()
table.add_row(a)
a = ['Single ML (SML)']+ np.array(SML.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Orthogonal ML (OML)']+ np.array(OML.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Double ML (DML)']+ np.array(DML.summary).reshape(-1).tolist()
table.add_row(a)
table.float_format = '0.3'
print(table)

+---------------------+-------+-----------+--------+-------+-------+--------+
|      Estimator      | θ_hat | Std Error |   t    |   p   |  2.5% | 97.25% |
+---------------------+-------+-----------+--------+-------+-------+--------+
|         OLS         | 0.021 |   0.014   | 1.522  | 0.129 |  nan  |  nan   |
|   Single ML (SML)   | 0.078 |   0.002   | 31.239 | 0.000 | 0.073 | 0.083  |
| Orthogonal ML (OML) | 0.078 |   0.014   | 5.580  | 0.000 | 0.051 | 0.105  |
|   Double ML (DML)   | 0.076 |   0.014   | 5.430  | 0.000 | 0.049 | 0.104  |
+---------------------+-------+-----------+--------+-------+-------+--------+


# First Stage Model - Y

In [180]:
# Shuffle
df = df.sample(df.shape[0])
y = df[outcome]
d = df[treatment]
x = df[rest]
k=int(df.shape[0]*0.8)
print(LinearRegression().fit(x[:k],y[:k]).score(x[:k],y[:k]))
print(LinearRegression().fit(x[:k],y[:k]).score(x[k:],y[k:]))
print(LassoCV().fit(x[:k],y[:k]).score(x[:k],y[:k]))
print(LassoCV().fit(x[:k],y[:k]).score(x[k:],y[k:]))
print(RandomForestRegressor().fit(x[:k],y[:k]).score(x[:k],y[:k]))
print(RandomForestRegressor().fit(x[:k],y[:k]).score(x[k:],y[k:]))
print(CatBoostRegressor(verbose=0).fit(x[:k],y[:k]).score(x[:k],y[:k]))
print(CatBoostRegressor(verbose=0).fit(x[:k],y[:k]).score(x[k:],y[k:]))
print(MLPRegressor(max_iter = 500).fit(x[:k],y[:k]).score(x[:k],y[:k]))
print(MLPRegressor( max_iter = 100).fit(x[:k],y[:k]).score(x[k:],y[k:]))

0.6176292839921211
0.5840313625194415
0.1375017141971282
0.07003693616460771
0.8854270643841722
0.23188493071284777
0.9953759822769018
0.4201842615205914
-82450.05418522444
-163.2113504022591


In [188]:
help(LinearRegression)

Help on class LinearRegression in module sklearn.linear_model._base:

class LinearRegression(sklearn.base.MultiOutputMixin, sklearn.base.RegressorMixin, LinearModel)
 |  LinearRegression(*, fit_intercept=True, normalize='deprecated', copy_X=True, n_jobs=None, positive=False)
 |  
 |  Ordinary least squares Linear Regression.
 |  
 |  LinearRegression fits a linear model with coefficients w = (w1, ..., wp)
 |  to minimize the residual sum of squares between the observed targets in
 |  the dataset, and the targets predicted by the linear approximation.
 |  
 |  Parameters
 |  ----------
 |  fit_intercept : bool, default=True
 |      Whether to calculate the intercept for this model. If set
 |      to False, no intercept will be used in calculations
 |      (i.e. data is expected to be centered).
 |  
 |  normalize : bool, default=False
 |      This parameter is ignored when ``fit_intercept`` is set to False.
 |      If True, the regressors X will be normalized before regression by
 |    

In [150]:

print(MLPRegressor((5), learning_rate_init = 0.01, max_iter = 10000).fit(x[:k],y[:k]).score(x[:k],y[:k]))
print(MLPRegressor((), max_iter = 1000).fit(x[:k],y[:k]).score(x[k:],y[k:]))

-9.120081167306205
-18789.931784285214


In [136]:
print(CatBoostRegressor(n_estimators = 1000, l2_leaf_reg=3, max_depth= 6, verbose=0).fit(x[:k],y[:k]).score(x[k:],y[k:]))


0.2897012176739773


In [132]:
help(CatBoostRegressor)

Help on class CatBoostRegressor in module catboost.core:

class CatBoostRegressor(CatBoost)
 |  CatBoostRegressor(iterations=None, learning_rate=None, depth=None, l2_leaf_reg=None, model_size_reg=None, rsm=None, loss_function='RMSE', border_count=None, feature_border_type=None, per_float_feature_quantization=None, input_borders=None, output_borders=None, fold_permutation_block=None, od_pval=None, od_wait=None, od_type=None, nan_mode=None, counter_calc_method=None, leaf_estimation_iterations=None, leaf_estimation_method=None, thread_count=None, random_seed=None, use_best_model=None, best_model_min_trees=None, verbose=None, silent=None, logging_level=None, metric_period=None, ctr_leaf_count_limit=None, store_all_simple_ctr=None, max_ctr_complexity=None, has_time=None, allow_const_label=None, target_border=None, one_hot_max_size=None, random_strength=None, name=None, ignored_features=None, train_dir=None, custom_metric=None, eval_metric=None, bagging_temperature=None, save_snapshot=None, 