# Generate Data

In [71]:
import numpy as np
import pandas as pd
import math
from doubleml.datasets import make_plr_CCDDHNR2018, make_plr_turrell2018
np.random.seed(1234)
n_rep = 1
n_obs = 10000
n_vars = 10
alpha = 0.5
data = list()
from sklearn.datasets import make_spd_matrix

def g(x):
    return np.exp(x)

def m(x):
    return np.sin(x)

theta = alpha = 0.5 
b = [1/k for k in range(1,n_vars+1)] # x weights 
sigma = make_spd_matrix(n_vars, random_state=42)

for i_rep in range(n_rep):
    df = pd.read_csv("/Users/pranjal/Desktop/Causal-Inference/data/penn_jae.dat", sep = ' ')
    outcome = 'inuidur1'
    df.inuidur1 = np.log(df.inuidur1)
    treatment = 'tg'
    df['d'] = 0
    df.loc[df.tg == 4, 'd'] = 1
    treatment = 'd'
    rest = list(df.drop([outcome, treatment, 'inuidur2','muld', 'Unnamed: 24', 'Unnamed: 25'], axis = 1).columns)
    df = pd.get_dummies(df, prefix=['dep'],  drop_first=True)
    rest = ['female', 'black', 'othrace',
       'dep', 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 
       'agelt35', 'agegt54', 'durable', 'lusd', 'husd']
    df = df[[outcome] + [treatment] + rest]
    #df = df.dropna()
    y = df[outcome]
    d = df[treatment]
    x = df[rest].astype('float')
    print(y.shape, x.shape, d.shape)
    data.append((x, y, d))

ValueError: Length of 'prefix' (1) did not match the length of the columns being encoded (0).

In [None]:
pd.get_dummies(df, prefix=['col1', 'col2'])

In [69]:
df.dep.value_counts()

0    10010
2     2275
1     1628
Name: dep, dtype: int64

In [None]:
df.dep

# Naive ML
- no orthogonalisation, no crossfitting

In [58]:
def non_orth_score(y, d, l_hat, m_hat, g_hat, smpls):
    psi_a = -np.multiply(d, d)
    psi_b = np.multiply(d, y - g_hat)
    return psi_a, psi_b

In [59]:
from doubleml import DoubleMLData
from doubleml import DoubleMLPLR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
face_colors = sns.color_palette('pastel')
edge_colors = sns.color_palette('dark')
np.random.seed(1111)
ml_l = RandomForestRegressor()
ml_m = RandomForestClassifier()
#ml_m = LogisticRegression()
ml_g = clone(ml_l)

# to speed up the illustration we hard-code the simulation results
theta_nonorth = np.empty(n_rep)
se_nonorth = np.empty(n_rep)
t_nonorth = np.empty(n_rep)
p_nonorth = np.empty(n_rep)
# to run the full simulation uncomment the following line to fit the model for every dataset and not just for the first dataset
for i_rep in range(n_rep):
    print(i_rep)
#for i_rep in range(1):
    (x, y, d) = data[i_rep]
    obj_dml_data = DoubleMLData.from_arrays(x, y.ravel(), d.ravel())
    print(obj_dml_data)
    obj_dml_plr_nonorth = DoubleMLPLR(obj_dml_data,
                                      ml_l, ml_m, ml_g,
                                      n_folds=1,
                                      apply_cross_fitting=False,
                                      score=non_orth_score)
    obj_dml_plr_nonorth.fit()
    theta_nonorth[i_rep] = obj_dml_plr_nonorth.coef[0]
    se_nonorth[i_rep] = obj_dml_plr_nonorth.se[0]

0

------------------ Data summary      ------------------
Outcome variable: y
Treatment variable(s): ['d']
Covariates: ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15']
Instrument variable(s): None
No. Observations: 13913

------------------ DataFrame info    ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13913 entries, 0 to 13912
Columns: 17 entries, X1 to d
dtypes: float64(17)
memory usage: 1.8 MB



# Orthogonal Machine Learning
- resolves regularization bias, but not overfitting

In [60]:
import numpy as np
np.random.seed(2222)
# to speed up the illustration we hard-code the simulation results
theta_orth_nosplit = np.empty(n_rep)
se_orth_nosplit = np.empty(n_rep)
# to run the full simulation uncomment the following line to fit the model for every dataset and not just for the first dataset
for i_rep in range(n_rep):
    print(i_rep)
#for i_rep in range(1):
    (x, y, d) = data[i_rep]
    obj_dml_data = DoubleMLData.from_arrays(x, y, d)
    obj_dml_plr_orth_nosplit = DoubleMLPLR(obj_dml_data,
                                           ml_l, ml_m, ml_g,
                                           n_folds=1,
                                           score='IV-type',
                                           apply_cross_fitting=False)
    obj_dml_plr_orth_nosplit.fit()
    theta_orth_nosplit[i_rep] = obj_dml_plr_orth_nosplit.coef[0]
    se_orth_nosplit[i_rep] = obj_dml_plr_orth_nosplit.se[0]

0


# Orthogonal ML + Cross fitting (DML)

In [61]:
np.random.seed(42)
# to speed up the illustration we hard-code the simulation results
theta_dml = np.empty(n_rep)
se_dml = np.empty(n_rep)

# to run the full simulation uncomment the following line to fit the model for every dataset and not just for the first dataset
for i_rep in range(n_rep):
    print(i_rep)
    (x, y, d) = data[i_rep]
    obj_dml_data = DoubleMLData.from_arrays(x, y, d)
    obj_dml_plr = DoubleMLPLR(obj_dml_data,
                              ml_l, ml_m, ml_g,
                              n_folds=2,
                              score='IV-type')
    obj_dml_plr.fit()
    theta_dml[i_rep] = obj_dml_plr.coef[0]
    se_dml[i_rep] = obj_dml_plr.se[0]

0


# Regular OLS

In [62]:
np.random.seed(42)
import statsmodels.api as sm # for OLS 
# to speed up the illustration we hard-code the simulation results
theta_ols = np.empty(n_rep)
se_ols = np.empty(n_rep)
# to run the full simulation uncomment the following line to fit the model for every dataset and not just for the first dataset
for i_rep in range(n_rep):
    (x, y, d) = data[i_rep]
    OLS = sm.OLS(y,sm.add_constant(np.c_[d,x]))
    results = OLS.fit()
    theta_ols[i_rep] = results.params[1]
    se_ols[i_rep] = results.bse[1]   

In [63]:
results.summary()

0,1,2,3
Dep. Variable:,inuidur1,R-squared:,0.04
Model:,OLS,Adj. R-squared:,0.038
Method:,Least Squares,F-statistic:,35.75
Date:,"Mon, 12 Dec 2022",Prob (F-statistic):,4.44e-109
Time:,17:58:17,Log-Likelihood:,-22215.0
No. Observations:,13913,AIC:,44460.0
Df Residuals:,13896,BIC:,44590.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.7714,0.029,60.250,0.000,1.714,1.829
x1,-0.0327,0.031,-1.066,0.287,-0.093,0.027
x2,0.1257,0.021,5.979,0.000,0.084,0.167
x3,-0.3751,0.033,-11.356,0.000,-0.440,-0.310
x4,-0.2503,0.134,-1.862,0.063,-0.514,0.013
x5,0.0541,0.013,4.009,0.000,0.028,0.081
x6,0.0708,0.092,0.774,0.439,-0.109,0.250
x7,0.1289,0.031,4.151,0.000,0.068,0.190
x8,0.0533,0.030,1.795,0.073,-0.005,0.111

0,1,2,3
Omnibus:,6732.957,Durbin-Watson:,1.97
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1157.715
Skew:,-0.434,Prob(JB):,4.0300000000000004e-252
Kurtosis:,1.884,Cond. No.,18.6


# Distribution of Theta

In [64]:
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
MC_θ = np.c_[theta_ols, theta_nonorth, theta_orth_nosplit, theta_dml]
MC_se = np.c_[se_ols, se_nonorth, se_orth_nosplit, se_dml]

table = PrettyTable()
table.field_names = ['Estimator', 'θ_hat', 's.e(θ_hat)','t','p','2.5%','97.25%']
a = ['OLS']+ np.c_[results.params[1], results.bse[1], results.tvalues[1], results.pvalues[1], results.conf_int(alpha=0.05, cols=None)[0][1], results.conf_int(alpha=0.05, cols=None)[1][1]].reshape(-1).tolist()
table.add_row(a)
a = ['Naive-ML']+ np.array(obj_dml_plr_nonorth.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Ortho-ML']+ np.array(obj_dml_plr_orth_nosplit.summary).reshape(-1).tolist()
table.add_row(a)
a = ['OrthoML+Crossfitting (DML)']+ np.array(obj_dml_plr.summary).reshape(-1).tolist()
table.add_row(a)
table.float_format = '0.3'
print(table)

+----------------------------+--------+------------+--------+-------+--------+--------+
|         Estimator          | θ_hat  | s.e(θ_hat) |   t    |   p   |  2.5%  | 97.25% |
+----------------------------+--------+------------+--------+-------+--------+--------+
|            OLS             | -0.033 |   0.031    | -1.066 | 0.287 | -0.093 | 0.027  |
|          Naive-ML          | -0.028 |   0.027    | -1.040 | 0.298 | -0.082 | 0.025  |
|          Ortho-ML          | -0.031 |   0.030    | -1.019 | 0.308 | -0.091 | 0.029  |
| OrthoML+Crossfitting (DML) | -0.047 |   0.033    | -1.409 | 0.159 | -0.112 | 0.018  |
+----------------------------+--------+------------+--------+-------+--------+--------+


# First Stage Checks

In [65]:
from sklearn.linear_model import LinearRegression, LogisticRegression
OLS_Y = LinearRegression()
OLS_D = LogisticRegression()
RF_Y = RandomForestRegressor()
RF_D = RandomForestClassifier()

In [66]:
from sklearn.linear_model import LinearRegression, LogisticRegression
OLS_Y = LinearRegression()
OLS_D = LogisticRegression()
RF_Y = RandomForestRegressor()
RF_D = RandomForestClassifier()
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
MC_θ = np.c_[theta_ols, theta_nonorth, theta_orth_nosplit, theta_dml]
MC_se = np.c_[se_ols, se_nonorth, se_orth_nosplit, se_dml]
table = PrettyTable()
table.field_names = ['Model', 'OLS', 'RF']
a = ['Y on X (R2)'] + [OLS_Y.fit(x,y).score(x,y), RF_Y.fit(x,y).score(x,y)]
table.add_row(a)
a = ['D on X (Accuracy)'] + [OLS_D.fit(x,d).score(x,d), RF_D.fit(x,d).score(x,d)]
table.add_row(a)
table.float_format = '0.3'
print(table)

+-------------------+-------+-------+
|       Model       |  OLS  |   RF  |
+-------------------+-------+-------+
|    Y on X (R2)    | 0.039 | 0.104 |
| D on X (Accuracy) | 0.875 | 0.878 |
+-------------------+-------+-------+
