# Packages

In [31]:
import numpy as np
import pandas as pd
from doubleml.datasets import fetch_bonus, fetch_401K
from doubleml import DoubleMLData
import statsmodels.api as sm
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LassoCV
from doubleml import DoubleMLPLR
from doubleml import DoubleMLPLR
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV, LogisticRegressionCV
from sklearn.neural_network import MLPRegressor, MLPClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')
np.random.seed(3293423)

# Load the Data

Between 1965 and 1982, the Social Security Survivor Benefits (SSSB) Program in the United States offered $6,700 (expressed in year 2000 dol- lars) in college financial aid to the 18- to 22-year-old children of deceased, disabled, or retired Social Security recipients. In 1981, the U.S. Congress eliminated the SSSB program, mandating that otherwise eligible children who were not enrolled in college as of May 1982 would not receive the SSSB college-aid offer. Using the National Longitudinal Survey of Youth, Dynarski identified students in cohorts of high-school seniors, just before and just after the policy change, who would have been eligible for the aid offer because their fathers were Social Security recipients who had died. She argued that, other than differing in receipt of the offer of college aid, these two groups of students were equal in expectation initially. However, the 137 high-school seniors who satisfied SSSB eligibility requirements immediately before the policy change (in the years 1979 through 1981) received the college financial-aid offer and therefore constituted the treat- ment group. The 54 high-school seniors who satisfied SSSB eligibility requirements immediately after the policy change (1982 and 1983) received no SSSB-related financial-aid offer and made up the control group.

In [32]:
import pandas as pd
import numpy as np
df = pd.read_csv('/Users/pranjal/Desktop/Causal-Inference/data/collegeoffer.csv')
df['fd'] = 0
df.loc[df.fatherdec=='Father not deceased', 'fd'] = 1
df.head()

Unnamed: 0,id,hhid,wt88,coll,hgc23,yearsr,fatherdec,offer,fd
0,9,9,691916,1,13,81,Father not deceased,1,1
1,14,13,784204,1,16,81,Father not deceased,1,1
2,15,15,811032,1,16,82,Father not deceased,0,1
3,21,20,644853,1,16,79,Father not deceased,1,1
4,22,22,728189,1,16,80,Father not deceased,1,1


In [33]:
df.corr()

Unnamed: 0,id,hhid,wt88,coll,hgc23,yearsr,offer,fd
id,1.0,1.0,-0.579677,-0.076675,-0.126027,0.040928,-0.034088,-0.018138
hhid,1.0,1.0,-0.579669,-0.076661,-0.126012,0.040878,-0.034049,-0.018136
wt88,-0.579677,-0.579669,1.0,0.110379,0.20517,0.017405,-0.048547,0.058347
coll,-0.076675,-0.076661,0.110379,1.0,0.775674,-0.045338,0.044412,-0.003655
hgc23,-0.126027,-0.126012,0.20517,0.775674,1.0,-0.074521,0.058952,0.014941
yearsr,0.040928,0.040878,0.017405,-0.045338,-0.074521,1.0,-0.811417,-0.014989
offer,-0.034088,-0.034049,-0.048547,0.044412,0.058952,-0.811417,1.0,0.002884
fd,-0.018138,-0.018136,0.058347,-0.003655,0.014941,-0.014989,0.002884,1.0


In [34]:
df.columns

Index(['id', 'hhid', 'wt88', 'coll', 'hgc23', 'yearsr', 'fatherdec', 'offer',
       'fd'],
      dtype='object')

In [35]:
print(df.isnull().sum())

id           0
hhid         0
wt88         0
coll         0
hgc23        0
yearsr       0
fatherdec    0
offer        0
fd           0
dtype: int64


In [36]:
outcome = 'coll'
treatment = 'offer'
rest = list(df.drop([outcome, treatment, 'hhid', 'id', 'fatherdec', 'yearsr'], axis = 1).columns)
df = df[[outcome] + [treatment] + rest]
y = np.array(df.coll).reshape(-1, 1)
d = np.array(df.offer).astype(int).reshape(-1, 1)
x = np.array(df[rest])
print(y.shape, d.shape, x.shape)

(3986, 1) (3986, 1) (3986, 3)


# First Stage

In [37]:
np.random.seed(42)
table = PrettyTable()
table.field_names = ['Estimator', 'g(X):Rsquared', 'm(X):Accuracy']
a = ['Linear/Logistic',np.mean(cross_val_score(LinearRegression(), x, y, cv=5)),np.mean(cross_val_score(LogisticRegression(), x, d, cv=5))]
table.add_row(a)
a = ['Linear/Logistic (Reg)',np.mean(cross_val_score(LassoCV(), x, y, cv=5)),np.mean(cross_val_score(LogisticRegressionCV(), x, d, cv=5))]
table.add_row(a)
a = ['Random Forests',np.mean(cross_val_score(RandomForestRegressor(max_depth=5,n_estimators=500,verbose=0), x, y, cv=5)),np.mean(cross_val_score(RandomForestClassifier(max_depth=5), x, d, cv=5))]
table.add_row(a)
a = ['Boosting',np.mean(cross_val_score(XGBRegressor(max_depth=3,verbosity=0), x, y, cv=5)),np.mean(cross_val_score(XGBClassifier(verbosity=0,max_depth=5), x, d, cv=5))]
table.add_row(a)
a = ['Neural Networks',np.mean(cross_val_score(MLPRegressor(activation = 'tanh', max_iter=500, learning_rate_init=0.01), normalize(x), y, cv=5)),np.mean(cross_val_score(MLPClassifier((5,2,), activation = 'tanh', max_iter=500,learning_rate_init=0.01), normalize(x), d, cv=5))]
table.add_row(a)
table.float_format = '0.3'
print(table)

+-----------------------+---------------+---------------+
|       Estimator       | g(X):Rsquared | m(X):Accuracy |
+-----------------------+---------------+---------------+
|    Linear/Logistic    |     0.598     |     0.723     |
| Linear/Logistic (Reg) |     0.000     |     0.723     |
|     Random Forests    |     0.726     |     0.724     |
|        Boosting       |     0.705     |     0.697     |
|    Neural Networks    |     -0.031    |     0.723     |
+-----------------------+---------------+---------------+


# OLS

In [11]:
OLS = sm.OLS(y,sm.add_constant(np.c_[d,x])).fit()
OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.605
Model:,OLS,Adj. R-squared:,0.604
Method:,Least Squares,F-statistic:,1217.0
Date:,"Thu, 15 Dec 2022",Prob (F-statistic):,0.0
Time:,10:38:40,Log-Likelihood:,-1029.3
No. Observations:,3986,AIC:,2071.0
Df Residuals:,3980,BIC:,2108.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.6529,0.576,-6.341,0.000,-4.782,-2.524
x1,0.0228,0.019,1.197,0.231,-0.015,0.060
x2,-1.147e-07,2.35e-08,-4.887,0.000,-1.61e-07,-6.87e-08
x3,0.2400,0.003,77.050,0.000,0.234,0.246
x4,0.0125,0.007,1.792,0.073,-0.001,0.026
x5,-0.0282,0.023,-1.212,0.226,-0.074,0.017

0,1,2,3
Omnibus:,648.469,Durbin-Watson:,1.902
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1003.043
Skew:,1.196,Prob(JB):,1.56e-218
Kurtosis:,3.565,Cond. No.,45500000.0


# ML Estimation

In [12]:
l = RandomForestRegressor(max_depth=5,n_estimators=500,verbose=0) # Model for E[Y|X]=E[θD+g(X)]
g = RandomForestRegressor(max_depth=5,n_estimators=500,verbose=0) # Model for E[Y - θD|X]=g(X)
m = RandomForestClassifier(max_depth=5,n_estimators=500,verbose=0) # Model for E[D|X]

def score(y, d, l_hat, m_hat, g_hat, smpls):
    "Score function for Single ML"
    u_hat = y - g_hat
    psi_a = -np.multiply(d, d)
    psi_b = np.multiply(d, u_hat)
    return psi_a, psi_b

# Single-ML

In [13]:
data = DoubleMLData(df, y_col=outcome,d_cols=treatment,x_cols=rest)
SML = DoubleMLPLR(data, l, m, g, n_folds=1, apply_cross_fitting=False, score=score)
SML.fit()
print(SML.summary)

           coef   std err         t     P>|t|     2.5 %    97.5 %
offer -0.014615  0.004665 -3.132811  0.001731 -0.023759 -0.005472


# Orthogonal-ML

In [14]:
data = DoubleMLData(df,y_col=outcome,d_cols=treatment,x_cols=rest)
OML = DoubleMLPLR(data,l, m, g, n_folds=1,apply_cross_fitting=False,score='IV-type')
OML.fit();
print(OML.summary)

           coef   std err         t         P>|t|     2.5 %    97.5 %
offer  0.049297  0.009132  5.398167  6.732507e-08  0.031398  0.067196


# Orthogonal + Crossfitting (DML)

In [15]:
data = DoubleMLData(df,y_col=outcome,d_cols=treatment,x_cols=rest)
DML = DoubleMLPLR(data, l,m,g, n_folds=5,apply_cross_fitting=True,score='IV-type')
DML.fit();
print(DML.summary)

           coef   std err         t         P>|t|   2.5 %    97.5 %
offer  0.089288  0.009382  9.516846  1.785147e-21  0.0709  0.107677


# Summary

In [16]:
table = PrettyTable()
table.field_names = ['Estimator', 'θ_hat', 'Std Error','t','p','2.5%','97.25%']
idx = 1
a = ['OLS']+ np.c_[OLS.params[idx], OLS.bse[idx], OLS.tvalues[idx], OLS.pvalues[idx], np.nan, np.nan].reshape(-1).tolist()
table.add_row(a)
a = ['Single ML (SML)']+ np.array(SML.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Orthogonal ML (OML)']+ np.array(OML.summary).reshape(-1).tolist()
table.add_row(a)
a = ['Double ML (DML)']+ np.array(DML.summary).reshape(-1).tolist()
table.add_row(a)
table.float_format = '0.3'
print(table)

+---------------------+--------+-----------+--------+-------+--------+--------+
|      Estimator      | θ_hat  | Std Error |   t    |   p   |  2.5%  | 97.25% |
+---------------------+--------+-----------+--------+-------+--------+--------+
|         OLS         | 0.023  |   0.019   | 1.197  | 0.231 |  nan   |  nan   |
|   Single ML (SML)   | -0.015 |   0.005   | -3.133 | 0.002 | -0.024 | -0.005 |
| Orthogonal ML (OML) | 0.049  |   0.009   | 5.398  | 0.000 | 0.031  | 0.067  |
|   Double ML (DML)   | 0.089  |   0.009   | 9.517  | 0.000 | 0.071  | 0.108  |
+---------------------+--------+-----------+--------+-------+--------+--------+
