In [2]:
import numpy as np
from doubleml.datasets import fetch_bonus
from doubleml import DoubleMLData
import statsmodels.api as sm
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LassoCV
from doubleml import DoubleMLPLR
from doubleml import DoubleMLPLR
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
from sklearn.linear_model import LinearRegression, LogisticRegression

In [7]:
np.random.seed(1)
df = fetch_bonus('DataFrame')
df.head(5)

import pandas as pd
import numpy as np
df = pd.read_csv('/Users/pranjal/Desktop/Causal-Inference/data/wage.csv')
cat = df.select_dtypes('object').columns
df = pd.get_dummies(df, columns = cat, drop_first = True)
outcome = 'lwage'
treatment = 'educ'
rest = ['exper','age', 'kidslt6', 'kidsge6']
df = df[[outcome] + [treatment] + rest]
df = df.dropna()
y = df[outcome]
d = df[treatment]
x = df[rest].astype('float')

l = RandomForestRegressor(n_estimators = 500, max_features = 'sqrt', max_depth= 5)
g = clone(l)
m = RandomForestRegressor(n_estimators = 500, max_features = 'sqrt', max_depth= 5)

def score(y, d, l_hat, m_hat, g_hat, smpls):
    u_hat = y - g_hat
    psi_a = -np.multiply(d, d)
    psi_b = np.multiply(d, u_hat)
    return psi_a, psi_b

print(y.shape, x.shape, d.shape)

(428,) (428, 4) (428,)


In [9]:
OLS = sm.OLS(y,sm.add_constant(np.c_[d,x])).fit()

In [10]:
data = DoubleMLData(df.sample(int(df.shape[0]*0.8)),
                  y_col='lwage',
                  d_cols='educ',
                  x_cols=['exper','age', 'kidslt6', 'kidsge6'])

SML = DoubleMLPLR(data, l,m,g, n_folds=1,apply_cross_fitting=False,score=score)
SML.fit();

In [11]:
data = DoubleMLData(df.sample(int(df.shape[0]*0.8)),
                  y_col='lwage',
                  d_cols='educ',
                  x_cols=['exper','age', 'kidslt6', 'kidsge6'])

OML = DoubleMLPLR(data, l,m,g, n_folds=1,apply_cross_fitting=False,score='IV-type')
OML.fit();

In [12]:
data = DoubleMLData(df.sample(int(df.shape[0]*1)),
                  y_col='lwage',
                  d_cols='educ',
                  x_cols=['exper','age', 'kidslt6', 'kidsge6'])

DML = DoubleMLPLR(data, l,m,g, n_folds=5,apply_cross_fitting=True,score='IV-type')
DML.fit();

In [13]:
table = PrettyTable()
table.field_names = ['Estimator', 'θ_hat', 'Std Error','t','p','2.5%','97.25%']
a = ['OLS']+ np.c_[OLS.params[1], OLS.bse[1], OLS.tvalues[1], OLS.pvalues[1], np.nan, np.nan].reshape(-1).tolist()
table.add_row(a)
a = ['Naive/Single ML (SML)']+ np.array(SML.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Orthogonal ML (OML)']+ np.array(OML.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Orthogonal+Crossfitting ML (DML)']+ np.array(DML.summary).reshape(-1).tolist()
table.add_row(a)
table.float_format = '0.3'
print(table)

+----------------------------------+-------+-----------+--------+-------+-------+--------+
|            Estimator             | θ_hat | Std Error |   t    |   p   |  2.5% | 97.25% |
+----------------------------------+-------+-----------+--------+-------+-------+--------+
|               OLS                | 0.110 |   0.014   | 7.600  | 0.000 |  nan  |  nan   |
|      Naive/Single ML (SML)       | 0.110 |   0.002   | 47.333 | 0.000 | 0.105 | 0.114  |
|       Orthogonal ML (OML)        | 0.092 |   0.014   | 6.793  | 0.000 | 0.066 | 0.119  |
| Orthogonal+Crossfitting ML (DML) | 0.109 |   0.014   | 7.650  | 0.000 | 0.081 | 0.137  |
+----------------------------------+-------+-----------+--------+-------+-------+--------+
