Between 1965 and 1982, the Social Security Survivor Benefits (SSSB) Program in the United States offered $6,700 (expressed in year 2000 dollars) in college financial aid to the 18- to 22-year-old children of deceased, disabled, or retired Social Security recipients. In 1981, the U.S. Congress eliminated the SSSB program, mandating that otherwise eligible children who were not enrolled in college as of May 1982 would not receive the SSSB college-aid offer. Using the National Longitudinal Survey of Youth, Dynarski identified students in cohorts of high-school seniors, just before and just after the policy change, who would have been eligible for the aid offer because their fathers were Social Security recipients who had died. She argued that, other than differing in receipt of the offer of college aid, these two groups of students were equal in expectation initially. However, the 137 high-school seniors who satisfied SSSB eligibility requirements immediately before the policy change (in the years 1979 through 1981) received the college financial-aid offer and therefore constituted the treatment group. The 54 high-school seniors who satisfied SSSB eligibility requirements immediately after the policy change (1982 and 1983) received no SSSB-related financial-aid offer and made up the control group.

In [17]:
import pandas as pd
import numpy as np
# Read the college-offer data set from disk.
df = pd.read_csv('/Users/pranjal/Desktop/Causal-Inference/data/collegeoffer.csv')

# `fd` is 1 on rows whose label is exactly 'Father not deceased', 0 elsewhere.
# NOTE(review): the name reads like "father deceased" while the flag marks the
# opposite group - confirm the intended coding.
df['fd'] = (df['fatherdec'] == 'Father not deceased').astype('int64')

# Column roles: one outcome, one treatment, everything left over is a control
# (the identifier columns and the raw text label are excluded).
outcome, treatment = 'coll', 'offer'
rest = df.drop(columns=[outcome, treatment, 'hhid', 'id', 'fatherdec']).columns.tolist()

# Reorder to outcome, treatment, controls and discard the excluded columns.
df = df[[outcome, treatment, *rest]]

In [18]:
# Display the first rows of the frame as a quick sanity check.
df.head()

Unnamed: 0,id,hhid,wt88,coll,hgc23,yearsr,fatherdec,offer
0,9,9,691916,1,13,81,Father not deceased,1
1,14,13,784204,1,16,81,Father not deceased,1
2,15,15,811032,1,16,82,Father not deceased,0
3,21,20,644853,1,16,79,Father not deceased,1
4,22,22,728189,1,16,80,Father not deceased,1


In [19]:
# Derive the 0/1 indicator `fd` from the raw text label.
#
# The loading cell (In [17]) already builds `fd` and drops `fatherdec`, so an
# unconditional `df.fatherdec` lookup fails when the notebook is run from top
# to bottom or when this cell is re-run. Only rebuild the flag while the raw
# label is still present; otherwise require that `fd` already exists.
#
# NOTE(review): `fd` is 1 for 'Father not deceased', which reads as the
# opposite of what the name suggests - confirm the intended coding.
if 'fatherdec' in df.columns:
    df['fd'] = (df['fatherdec'] == 'Father not deceased').astype('int64')
elif 'fd' not in df.columns:
    raise KeyError("neither 'fatherdec' nor 'fd' is present in df")


In [20]:
# Column roles used by every model below.
outcome = 'coll'
treatment = 'offer'

# Controls are all columns other than the outcome, the treatment, the
# identifiers and the raw text label. errors='ignore' keeps this cell
# re-runnable: the loading cell (In [17]) has already removed 'hhid', 'id'
# and 'fatherdec', and a strict drop would raise KeyError on those names.
rest = list(
    df.drop(
        columns=[outcome, treatment, 'hhid', 'id', 'fatherdec'],
        errors='ignore',
    ).columns
)

# Keep only outcome, treatment and controls, in that order. A missing outcome
# or treatment column still raises KeyError here.
df = df[[outcome] + [treatment] + rest]

In [23]:
# Replace any missing entries with 0 before building the model inputs.
df = df.fillna(value=0)

# Split the frame into outcome, treatment and controls (controls as floats).
y, d = df[outcome], df[treatment]
x = df[rest].astype(float)

# Report the working sample size and show the first rows.
print(df.shape)
df.head()

(3986, 6)


Unnamed: 0,coll,offer,wt88,hgc23,yearsr,fd
0,1,1,691916,13,81,1
1,1,1,784204,16,81,1
2,1,0,811032,16,82,1
3,1,1,644853,16,79,1
4,1,1,728189,16,80,1


In [24]:
# Simple comparison of means: OLS of the outcome on the treatment alone.
import statsmodels.api as sm

# prepend=False appends the constant as the LAST column, so the treatment
# coefficient sits in position 0 of the fitted parameter vector.
mod = sm.OLS(y, sm.add_constant(np.c_[d], prepend=False))
res = mod.fit()
print(res.summary())

# Extract the treatment coefficient and its standard error by position.
# `res.params` / `res.bse` come back label-indexed ('x1', 'const') when `y` is
# a pandas Series; indexing such a Series with a bare integer relies on a
# positional fallback that pandas has deprecated. Converting to an ndarray
# first makes the positional access explicit and works for both return types.
print(np.asarray(res.params)[0])
print(np.asarray(res.bse)[0])

                            OLS Regression Results                            
Dep. Variable:                   coll   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     7.874
Date:                Mon, 05 Dec 2022   Prob (F-statistic):            0.00504
Time:                        01:25:25   Log-Likelihood:                -2874.9
No. Observations:                3986   AIC:                             5754.
Df Residuals:                    3984   BIC:                             5766.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.0494      0.018      2.806      0.0

In [25]:
# Pooled regression adjustment: OLS of the outcome on the treatment plus all
# controls, with a single set of coefficients for the whole sample.
import statsmodels.api as sm

# Treatment is the first column of the design matrix and the constant is
# appended last (prepend=False), so position 0 is the treatment coefficient.
mod = sm.OLS(y, sm.add_constant(np.c_[d, x], prepend=False))
res = mod.fit()
print(res.summary())

# Extract the treatment coefficient and its standard error by position.
# `res.params` / `res.bse` come back label-indexed ('x1', ..., 'const') when
# `y` is a pandas Series; indexing such a Series with a bare integer relies on
# a positional fallback that pandas has deprecated. Converting to an ndarray
# first makes the positional access explicit and works for both return types.
print(np.asarray(res.params)[0])
print(np.asarray(res.bse)[0])

                            OLS Regression Results                            
Dep. Variable:                   coll   R-squared:                       0.605
Model:                            OLS   Adj. R-squared:                  0.604
Method:                 Least Squares   F-statistic:                     1217.
Date:                Mon, 05 Dec 2022   Prob (F-statistic):               0.00
Time:                        01:25:39   Log-Likelihood:                -1029.3
No. Observations:                3986   AIC:                             2071.
Df Residuals:                    3980   BIC:                             2108.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.0228      0.019      1.197      0.2

In [26]:
import numpy as np
from doubleml.datasets import make_plr_CCDDHNR2018
from doubleml import DoubleMLData

# Seed NumPy's global generator before anything below is constructed.
np.random.seed(1234)

from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV

# Wrap the frame in DoubleML's data container with explicit column roles.
dml_data_bonus = DoubleMLData(
    df,
    y_col=outcome,
    d_cols=treatment,
    x_cols=list(rest),
)
print(dml_data_bonus)

# Random-forest template (500 trees, depth 6); one independent clone per
# nuisance learner.
learner = RandomForestRegressor(n_estimators=500, max_features='sqrt', max_depth=6)
ml_l_bonus, ml_m_bonus = clone(learner), clone(learner)

# Cross-validated Lasso clones. They are not referenced by the cells visible
# here, but are kept so any later use still finds them.
learner = LassoCV()
ml_l_sim, ml_m_sim = clone(learner), clone(learner)
def non_orth_score(y, d, l_hat, m_hat, g_hat, smpls):
    """Return the two score components ``(psi_a, psi_b)``.

    ``psi_a`` is the negated element-wise square of ``d`` and ``psi_b`` is
    ``d`` times the residual ``y - g_hat``. The ``l_hat``, ``m_hat`` and
    ``smpls`` arguments are accepted but not used by this function.
    """
    residual = y - g_hat
    score_a = np.negative(np.multiply(d, d))
    score_b = np.multiply(d, residual)
    return score_a, score_b

from doubleml import DoubleMLPLR
# Re-seed NumPy's global generator immediately before the model is built and
# fitted; the statement order here is kept exactly as-is on purpose.
np.random.seed(3141)
# Partially linear regression model with the two random-forest learners.
# No custom score is passed, so `non_orth_score` is not used by this fit (the
# printed summary reports the "partialling out" score).
obj_dml_plr_bonus = DoubleMLPLR(dml_data_bonus, ml_l_bonus, ml_m_bonus)
# Trailing semicolon suppresses the notebook's echo of the return value.
obj_dml_plr_bonus.fit();
print(obj_dml_plr_bonus)


------------------ Data summary      ------------------
Outcome variable: coll
Treatment variable(s): ['offer']
Covariates: ['wt88', 'hgc23', 'yearsr', 'fd']
Instrument variable(s): None
No. Observations: 3986

------------------ DataFrame info    ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3986 entries, 0 to 3985
Columns: 6 entries, coll to fd
dtypes: int64(6)
memory usage: 187.0 KB


------------------ Data summary      ------------------
Outcome variable: coll
Treatment variable(s): ['offer']
Covariates: ['wt88', 'hgc23', 'yearsr', 'fd']
Instrument variable(s): None
No. Observations: 3986

------------------ Score & algorithm ------------------
Score function: partialling out
DML algorithm: dml2

------------------ Machine learner   ------------------
Learner ml_l: RandomForestRegressor(max_depth=6, max_features='sqrt', n_estimators=500)
Learner ml_m: RandomForestRegressor(max_depth=6, max_features='sqrt', n_estimators=500)

------------------ Resampling   

In [27]:
import numpy as np
from doubleml.datasets import make_plr_CCDDHNR2018
from doubleml import DoubleMLData

# Seed NumPy's global generator before anything below is constructed.
np.random.seed(1234)

from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV

# Wrap the frame in DoubleML's data container with explicit column roles.
dml_data_bonus = DoubleMLData(
    df,
    y_col=outcome,
    d_cols=treatment,
    x_cols=list(rest),
)
print(dml_data_bonus)

# Same random-forest template as the previous cell but with 100 trees; one
# independent clone per nuisance learner.
learner = RandomForestRegressor(n_estimators=100, max_features='sqrt', max_depth=6)
ml_l_bonus, ml_m_bonus = clone(learner), clone(learner)

# Cross-validated Lasso clones. They are not referenced by the cells visible
# here, but are kept so any later use still finds them.
learner = LassoCV()
ml_l_sim, ml_m_sim = clone(learner), clone(learner)
def non_orth_score(y, d, l_hat, m_hat, g_hat, smpls):
    """Return the two score components ``(psi_a, psi_b)``.

    ``psi_a`` is the negated element-wise square of ``d`` and ``psi_b`` is
    ``d`` times the residual ``y - g_hat``. The ``l_hat``, ``m_hat`` and
    ``smpls`` arguments are accepted but not used by this function.
    """
    residual = y - g_hat
    score_a = np.negative(np.multiply(d, d))
    score_b = np.multiply(d, residual)
    return score_a, score_b

from doubleml import DoubleMLPLR
# Re-seed NumPy's global generator immediately before the model is built and
# fitted; the statement order here is kept exactly as-is on purpose.
np.random.seed(3141)
# Partially linear regression model with the two random-forest learners.
# No custom score is passed, so `non_orth_score` is not used by this fit (the
# printed summary reports the "partialling out" score).
obj_dml_plr_bonus = DoubleMLPLR(dml_data_bonus, ml_l_bonus, ml_m_bonus)
# Trailing semicolon suppresses the notebook's echo of the return value.
obj_dml_plr_bonus.fit();
print(obj_dml_plr_bonus)


------------------ Data summary      ------------------
Outcome variable: coll
Treatment variable(s): ['offer']
Covariates: ['wt88', 'hgc23', 'yearsr', 'fd']
Instrument variable(s): None
No. Observations: 3986

------------------ DataFrame info    ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3986 entries, 0 to 3985
Columns: 6 entries, coll to fd
dtypes: int64(6)
memory usage: 187.0 KB


------------------ Data summary      ------------------
Outcome variable: coll
Treatment variable(s): ['offer']
Covariates: ['wt88', 'hgc23', 'yearsr', 'fd']
Instrument variable(s): None
No. Observations: 3986

------------------ Score & algorithm ------------------
Score function: partialling out
DML algorithm: dml2

------------------ Machine learner   ------------------
Learner ml_l: RandomForestRegressor(max_depth=6, max_features='sqrt')
Learner ml_m: RandomForestRegressor(max_depth=6, max_features='sqrt')

------------------ Resampling        ------------------
No. folds: 5

In [28]:
# DML regression - still yields unbiased estimate of ATE
from econml.dml import LinearDML
# Fixed random_state so the estimator's internal randomness is repeatable.
est = LinearDML(random_state=45)
# X=None: no effect-modifier features are supplied; all covariates are passed
# as controls through W.
est.fit(y, d, X=None,W=x)
est.summary()

  from .autonotebook import tqdm as notebook_tqdm


Coefficient Results:  X is None, please call intercept_inference to learn the constant!


0,1,2,3,4,5,6
,point_estimate,stderr,zstat,pvalue,ci_lower,ci_upper
cate_intercept,0.056,0.017,3.193,0.001,0.021,0.09
