# Load Data

In [66]:
import numpy as np
import math
from doubleml.datasets import make_plr_CCDDHNR2018, make_plr_turrell2018
# Notebook-wide settings.
np.random.seed(1234)  # seed NumPy's global random number generator

n_rep = 1      # number of (x, y, d) datasets stored in `data` and looped over below
n_obs = 10000  # not referenced by the other cells shown here
n_vars = 10    # used for the weight vector `b` and the matrix `sigma` below
alpha = 0.5
data = []      # filled with one (x, y, d) tuple per replication
from sklearn.datasets import make_spd_matrix

def g(x):
    """Return ``exp(x)``, evaluated element-wise by NumPy."""
    exp_of_x = np.exp(x)
    return exp_of_x

def m(x):
    """Return ``sin(x)``, evaluated element-wise by NumPy."""
    sin_of_x = np.sin(x)
    return sin_of_x

# Simulation-style parameters; `theta`, `b` and `sigma` are not referenced by
# the other cells shown in this notebook.
alpha = 0.5
theta = alpha
b = [1 / k for k in range(1, n_vars + 1)]  # x weights: 1, 1/2, ..., 1/n_vars
sigma = make_spd_matrix(n_vars, random_state=42)

# The wage data are read from disk rather than simulated, so every replication
# sees the same sample. Import, read and prepare the data once, then store one
# (x, y, d) tuple per replication instead of repeating all of that work inside
# the loop.
import pandas as pd

# NOTE(review): machine-specific absolute path; adjust to wherever wage.csv lives.
df = pd.read_csv('/Users/pranjal/Desktop/Causal-Inference/data/wage.csv')

# Dummy-encode every object-typed column (dropping the first level of each).
cat = df.select_dtypes('object').columns
df = pd.get_dummies(df, columns=cat, drop_first=True)

outcome = 'lwage'
treatment = 'educ'
# Alternative: use every remaining column as a control:
#rest = list(df.drop([outcome, treatment], axis = 1).columns)
rest = ['exper', 'age', 'kidslt6', 'kidsge6']

# Keep only the modelling columns and drop rows with any missing value in them.
df = df[[outcome] + [treatment] + rest]
df = df.dropna()

y = df[outcome]
d = df[treatment]
x = df[rest].astype('float')

# All replications share the same prepared objects.
for i_rep in range(n_rep):
    data.append((x, y, d))

In [68]:
# Re-read the complete CSV (all columns) into `df` and show its first rows.
df = pd.read_csv(
    filepath_or_buffer='/Users/pranjal/Desktop/Causal-Inference/data/wage.csv',
)
df.head()

Unnamed: 0,inlf,hours,kidslt6,kidsge6,age,educ,wage,repwage,hushrs,husage,...,faminc,mtr,motheduc,fatheduc,unem,city,exper,nwifeinc,lwage,expersq
0,1,1610,1,0,32,12,3.354,2.65,2708,34,...,16310,0.7215,12,7,5.0,0,14,10.91006,1.210154,196
1,1,1656,0,2,30,12,1.3889,2.65,2310,30,...,21800,0.6615,7,7,11.0,1,5,19.499981,0.328512,25
2,1,1980,1,3,35,12,4.5455,4.04,3072,40,...,21040,0.6915,12,7,5.0,0,15,12.03991,1.514138,225
3,1,456,0,3,34,12,1.0965,3.25,1920,53,...,7300,0.7815,7,7,5.0,0,6,6.799996,0.092123,36
4,1,1568,1,2,31,14,4.5918,3.6,2000,32,...,27300,0.6215,12,14,9.5,1,7,20.100058,1.524272,49


# Naive ML
- no orthogonalisation, no crossfitting

In [57]:
def non_orth_score(y, d, l_hat, m_hat, g_hat, smpls):
    """Score components for the non-orthogonal ("naive") estimator.

    This function is passed as ``score=`` to ``DoubleMLPLR`` below. Only
    ``y``, ``d`` and ``g_hat`` are used; ``l_hat``, ``m_hat`` and ``smpls``
    are accepted but ignored.

    Returns
    -------
    tuple
        ``(psi_a, psi_b)`` with ``psi_a = -d * d`` and
        ``psi_b = d * (y - g_hat)``, both element-wise.
    """
    outcome_residual = y - g_hat
    score_b = np.multiply(d, outcome_residual)
    score_a = -np.multiply(d, d)
    return score_a, score_b

In [58]:
from doubleml import DoubleMLData
from doubleml import DoubleMLPLR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
face_colors = sns.color_palette('pastel')
edge_colors = sns.color_palette('dark')
np.random.seed(1111)

# Nuisance learners; the same three objects are reused by the estimator cells below.
ml_l = RandomForestRegressor()
ml_m = RandomForestRegressor()
#ml_m = LogisticRegression()
ml_g = clone(ml_l)

# One slot per replication. Only the theta and se arrays are filled in this
# cell; t_nonorth and p_nonorth are allocated but not written here.
theta_nonorth = np.empty(n_rep)
se_nonorth = np.empty(n_rep)
t_nonorth = np.empty(n_rep)
p_nonorth = np.empty(n_rep)

# Fit the non-orthogonal score on a single fold without cross-fitting, once per dataset.
for i_rep in range(n_rep):
    print(i_rep)
    (x, y, d) = data[i_rep]
    # np.asarray(...).ravel() yields the same 1-D arrays as Series.ravel()
    # without calling that method, which pandas 2.2 deprecates.
    obj_dml_data = DoubleMLData.from_arrays(x,
                                            np.asarray(y).ravel(),
                                            np.asarray(d).ravel())
    print(obj_dml_data)
    obj_dml_plr_nonorth = DoubleMLPLR(obj_dml_data,
                                      ml_l, ml_m, ml_g,
                                      n_folds=1,
                                      apply_cross_fitting=False,
                                      score=non_orth_score)
    obj_dml_plr_nonorth.fit()
    theta_nonorth[i_rep] = obj_dml_plr_nonorth.coef[0]
    se_nonorth[i_rep] = obj_dml_plr_nonorth.se[0]

0

------------------ Data summary      ------------------
Outcome variable: y
Treatment variable(s): ['d']
Covariates: ['X1', 'X2', 'X3', 'X4']
Instrument variable(s): None
No. Observations: 428

------------------ DataFrame info    ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428 entries, 0 to 427
Columns: 6 entries, X1 to d
dtypes: float64(6)
memory usage: 20.2 KB



# Orthogonal Machine Learning
- resolves regularization bias, but not overfitting

In [59]:
import numpy as np
np.random.seed(2222)

# Per-replication storage for the point estimate and its standard error.
theta_orth_nosplit, se_orth_nosplit = np.empty(n_rep), np.empty(n_rep)

# 'IV-type' score fitted on a single fold with cross-fitting switched off.
for i_rep in range(n_rep):
    print(i_rep)
    x, y, d = data[i_rep]
    obj_dml_data = DoubleMLData.from_arrays(x, y, d)
    obj_dml_plr_orth_nosplit = DoubleMLPLR(
        obj_dml_data,
        ml_l, ml_m, ml_g,
        score='IV-type',
        n_folds=1,
        apply_cross_fitting=False,
    )
    obj_dml_plr_orth_nosplit.fit()
    theta_orth_nosplit[i_rep] = obj_dml_plr_orth_nosplit.coef[0]
    se_orth_nosplit[i_rep] = obj_dml_plr_orth_nosplit.se[0]

0


# Orthogonal ML + Cross fitting (DML)

In [60]:
np.random.seed(42)

# Per-replication storage for the point estimate and its standard error.
theta_dml, se_dml = np.empty(n_rep), np.empty(n_rep)

# 'IV-type' score with two folds; cross-fitting is left at the DoubleMLPLR default.
for i_rep in range(n_rep):
    print(i_rep)
    x, y, d = data[i_rep]
    obj_dml_data = DoubleMLData.from_arrays(x, y, d)
    obj_dml_plr = DoubleMLPLR(
        obj_dml_data,
        ml_l, ml_m, ml_g,
        score='IV-type',
        n_folds=2,
    )
    obj_dml_plr.fit()
    theta_dml[i_rep] = obj_dml_plr.coef[0]
    se_dml[i_rep] = obj_dml_plr.se[0]

0


# Regular OLS

In [61]:
np.random.seed(42)
import statsmodels.api as sm # for OLS

# One slot per replication for the coefficient on d and its standard error.
theta_ols = np.empty(n_rep)
se_ols = np.empty(n_rep)

for i_rep in range(n_rep):
    (x, y, d) = data[i_rep]
    # Design-matrix columns: constant, treatment d, then the controls x.
    OLS = sm.OLS(y, sm.add_constant(np.c_[d, x]))
    results = OLS.fit()
    # Position 1 is the coefficient on d. Converting to an array first makes
    # the lookup explicitly positional; integer keys on a label-indexed pandas
    # Series rely on a deprecated positional fallback.
    theta_ols[i_rep] = np.asarray(results.params)[1]
    se_ols[i_rep] = np.asarray(results.bse)[1]

In [62]:
# Show the full regression table for `results`, the OLS fit left bound by the loop above.
results.summary()

0,1,2,3
Dep. Variable:,lwage,R-squared:,0.151
Model:,OLS,Adj. R-squared:,0.141
Method:,Least Squares,F-statistic:,14.98
Date:,"Mon, 12 Dec 2022",Prob (F-statistic):,1.52e-13
Time:,10:20:06,Log-Likelihood:,-433.15
No. Observations:,428,AIC:,878.3
Df Residuals:,422,BIC:,902.6
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.2190,0.300,-0.729,0.466,-0.809,0.371
x1,0.1096,0.014,7.600,0.000,0.081,0.138
x2,0.0155,0.005,3.261,0.001,0.006,0.025
x3,-0.0035,0.005,-0.666,0.505,-0.014,0.007
x4,-0.0755,0.089,-0.851,0.395,-0.250,0.099
x5,-0.0177,0.028,-0.632,0.528,-0.073,0.037

0,1,2,3
Omnibus:,79.021,Durbin-Watson:,1.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,299.257
Skew:,-0.773,Prob(JB):,1.04e-65
Kurtosis:,6.793,Cond. No.,433.0


# Distribution of Theta

In [63]:
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
# Estimates and standard errors stacked column-wise: OLS, naive ML, orthogonal ML, DML.
MC_θ = np.c_[theta_ols, theta_nonorth, theta_orth_nosplit, theta_dml]
MC_se = np.c_[se_ols, se_nonorth, se_orth_nosplit, se_dml]

table = PrettyTable()
# The last column is the upper bound of the 95% interval, i.e. the 97.5% quantile.
table.field_names = ['Estimator', 'θ_hat', 's.e(θ_hat)', 't', 'p', '2.5%', '97.5%']

# OLS row: position 1 is the coefficient on d. Converting to arrays makes the
# lookups explicitly positional instead of relying on integer keys against
# label-indexed pandas objects.
ols_stats = [np.asarray(stat)[1]
             for stat in (results.params, results.bse, results.tvalues, results.pvalues)]
ols_ci_low, ols_ci_high = np.asarray(results.conf_int(alpha=0.05))[1]
a = ['OLS'] + [float(value) for value in ols_stats + [ols_ci_low, ols_ci_high]]
table.add_row(a)

# DoubleML rows: flatten each model's one-row summary into a list of values.
for estimator_label, fitted_model in [
    ('Naive-ML', obj_dml_plr_nonorth),
    ('Ortho-ML', obj_dml_plr_orth_nosplit),
    ('OrthoML+Crossfitting (DML)', obj_dml_plr),
]:
    a = [estimator_label] + np.array(fitted_model.summary).reshape(-1).tolist()
    table.add_row(a)

table.float_format = '0.3'
print(table)

+----------------------------+-------+------------+--------+-------+-------+--------+
|         Estimator          | θ_hat | s.e(θ_hat) |   t    |   p   |  2.5% | 97.25% |
+----------------------------+-------+------------+--------+-------+-------+--------+
|            OLS             | 0.110 |   0.014    | 7.600  | 0.000 | 0.081 | 0.138  |
|          Naive-ML          | 0.106 |   0.001    | 91.638 | 0.000 | 0.104 | 0.109  |
|          Ortho-ML          | 0.113 |   0.009    | 12.717 | 0.000 | 0.096 | 0.130  |
| OrthoML+Crossfitting (DML) | 0.131 |   0.023    | 5.788  | 0.000 | 0.087 | 0.176  |
+----------------------------+-------+------------+--------+-------+-------+--------+


# First Stage Checks

In [64]:
from sklearn.linear_model import LinearRegression, LogisticRegression
# Linear learners for the outcome (Y) and treatment (D) regressions.
OLS_Y, OLS_D = LinearRegression(), LinearRegression()

# Both random forests share the same hyperparameters.
# NOTE(review): max_features=12 exceeds the 4 covariates listed in `rest`; confirm this is intended.
rf_settings = dict(n_estimators=132, max_features=12, max_depth=5, min_samples_leaf=1)
RF_Y = RandomForestRegressor(**rf_settings)
RF_D = RandomForestRegressor(**rf_settings)

In [65]:
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
MC_θ = np.c_[theta_ols, theta_nonorth, theta_orth_nosplit, theta_dml]
MC_se = np.c_[se_ols, se_nonorth, se_orth_nosplit, se_dml]

table = PrettyTable()
table.field_names = ['Model', 'OLS', 'RF']

# x, y and d are whatever the most recently executed loop left bound.
# Each score is computed on the same data the model was fitted on.
r2_y_ols = OLS_Y.fit(x, y).score(x, y)
r2_y_rf = RF_Y.fit(x, y).score(x, y)
a = ['Y on X (R2)', r2_y_ols, r2_y_rf]
table.add_row(a)

r2_d_ols = OLS_D.fit(x, d).score(x, d)
r2_d_rf = RF_D.fit(x, d).score(x, d)
a = ['D on X (R2)', r2_d_ols, r2_d_rf]
table.add_row(a)

table.float_format = '0.3'
print(table)

+-------------+-------+-------+
|    Model    |  OLS  |   RF  |
+-------------+-------+-------+
| Y on X (R2) | 0.034 | 0.324 |
| D on X (R2) | 0.031 | 0.242 |
+-------------+-------+-------+


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
# The table helper lives in pandas.plotting; `pandas.table.plotting` does not exist.
# Note that this import rebinds the name `table` used for the PrettyTable objects earlier.
from pandas.plotting import table

# Draw on an axes with no frame and hidden x/y axes so only the table is visible.
ax = plt.subplot(111, frame_on=False) # no visible frame
ax.xaxis.set_visible(False)  # hide the x axis
ax.yaxis.set_visible(False)  # hide the y axis

# NOTE(review): `df` is rendered in full; consider passing df.head() if the frame is large.
table(ax, df)  # where df is your data frame

plt.savefig('mytable.png')
