# Packages

In [4]:
import numpy as np
import pandas as pd
from doubleml.datasets import fetch_bonus
from doubleml import DoubleMLData 
import statsmodels.api as sm
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LassoCV
from doubleml import DoubleMLPLIV
from doubleml import DoubleMLPLR
from prettytable import PrettyTable
from PIL import Image, ImageDraw, ImageFont
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV, LogisticRegressionCV
from sklearn.neural_network import MLPRegressor, MLPClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score
import wooldridge
import warnings
warnings.filterwarnings('ignore')
np.random.seed(3293423)

# Load the Data

Wooldridge Source: D. Card (1995), Using Geographic Variation in College Proximity to Estimate the Return to Schooling, in Aspects of Labour Market Behavior: Essays in Honour of John Van- derkamp. Ed. L.N. Christophides, E.K. Grant, and R. Swidinsky, 201-222. Toronto: University of Toronto Press. Professor Card kindly provided these data. Data loads lazily.
            
A data.frame with 3010 observations on 34 variables:
- id: person identifier
- nearc2: =1 if near 2 yr college, 1966
- nearc4: =1 if near 4 yr college, 1966
- educ: years of schooling, 1976
- age: in years
- fatheduc: father’s schooling
- motheduc: mother’s schooling
- weight: NLS sampling weight, 1976
- momdad14: =1 if live with mom, dad at 14 - sinmom14: =1 if with single mom at 14
- step14: =1 if with step parent at 14
- reg661: =1 for region 1, 1966
- reg662: =1 for region 2, 1966
- reg663: =1 for region 3, 1966
- reg664: =1 for region 4, 1966
- reg665: =1 for region 5, 1966
- reg666: =1 for region 6, 1966
- reg667: =1 for region 7, 1966
- reg668: =1 for region 8, 1966
- reg669: =1 for region 9, 1966
- south66: =1 if in south in 1966
- black: =1 if black
- smsa: =1 in in SMSA, 1976
- south: =1 if in south, 1976
- smsa66: =1 if in SMSA, 1966
- wage: hourly wage in cents, 1976
- enroll: =1 if enrolled in school, 1976
- KWW: knowledge world of work score - IQ: IQ score
- married: =1 if married, 1976
- libcrd14: =1 if lib. card in home at 14 - exper: age - educ - 6
- lwage: log(wage)
- expersq: exper^2


In [6]:
df = wooldridge.data('card')
df = df[['lwage','nearc2', 'nearc4' ,'age', 'married', 'smsa', 'educ' ]].dropna()
df.head()

Unnamed: 0,lwage,nearc2,nearc4,age,married,smsa,educ
0,6.306275,0,0,29,1.0,1,7
1,6.175867,0,0,27,1.0,1,12
2,6.580639,0,0,34,1.0,1,12
3,5.521461,1,1,27,1.0,1,11
4,6.591674,1,1,34,1.0,1,12


In [7]:
print(df.isnull().sum())

lwage      0
nearc2     0
nearc4     0
age        0
married    0
smsa       0
educ       0
dtype: int64


In [10]:
np.random.seed(1)
outcome = 'lwage'
treatment = 'educ'
rest = ['age', 'married', 'smsa']
instruments = ['nearc2', 'nearc4']
df = df.dropna()
df = df[[outcome] + [treatment] + rest + instruments]
y = df[outcome]
d = df[treatment]
x = df[rest].astype('float')
z = df[instruments]
print(y.shape, d.shape, x.shape, z.shape)

(3003,) (3003,) (3003, 3) (3003, 2)


# First Stage

In [19]:
np.random.seed(42)
table = PrettyTable()
table.field_names = ['Estimator', 'g(X):Rsquared', 'm(X):Rsquared']
a = ['Linear/Logistic',np.mean(cross_val_score(LinearRegression(), x, y, cv=5)),np.mean(cross_val_score(LinearRegression(), x, d, cv=5))]
table.add_row(a)
a = ['Linear/Logistic (Reg)',np.mean(cross_val_score(LassoCV(), x, y, cv=5)),np.mean(cross_val_score(LassoCV(), x, d, cv=5))]
table.add_row(a)
a = ['Random Forests',np.mean(cross_val_score(RandomForestRegressor(max_depth=5,n_estimators=500,verbose=0), x, y, cv=5)),np.mean(cross_val_score(RandomForestRegressor(max_depth=3, n_estimators=500), x, d, cv=5))]
table.add_row(a)
a = ['Boosting',np.mean(cross_val_score(XGBRegressor(max_depth=3,verbosity=0), x, y, cv=5)),np.mean(cross_val_score(XGBRegressor(verbosity=0,max_depth=3), x, d, cv=5))]
table.add_row(a)
a = ['Neural Networks',np.mean(cross_val_score(MLPRegressor((100,), activation = 'tanh', max_iter=500, learning_rate_init=0.01), normalize(x), y, cv=5)),np.mean(cross_val_score(MLPRegressor((100,), activation = 'tanh', max_iter=500,learning_rate_init=0.01), normalize(x), d, cv=5))]
table.add_row(a)
table.float_format = '0.3'
print(table)

+-----------------------+---------------+---------------+
|       Estimator       | g(X):Rsquared | m(X):Rsquared |
+-----------------------+---------------+---------------+
|    Linear/Logistic    |     0.070     |     -0.025    |
| Linear/Logistic (Reg) |     0.068     |     -0.026    |
|     Random Forests    |     0.061     |     -0.018    |
|        Boosting       |     0.052     |     -0.023    |
|    Neural Networks    |     -0.034    |     -0.027    |
+-----------------------+---------------+---------------+


# OLS

In [13]:
OLS = sm.OLS(endog=y, exog=sm.add_constant(np.c_[d,x]), prepend=True).fit()
print(OLS.summary(xname=['constant', treatment]+rest))

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.249
Model:                            OLS   Adj. R-squared:                  0.248
Method:                 Least Squares   F-statistic:                     248.8
Date:                Thu, 15 Dec 2022   Prob (F-statistic):          9.36e-185
Time:                        10:29:04   Log-Likelihood:                -1389.8
No. Observations:                3003   AIC:                             2790.
Df Residuals:                    2998   BIC:                             2820.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
constant       4.5824      0.075     60.721      0.0

In [14]:
from statsmodels.sandbox.regression.gmm import IV2SLS
endog = df['lwage']
exog =  df[[treatment] + rest]
instr = df[rest + instruments]
exog_constant = sm.add_constant(exog)
instr_constant = sm.add_constant(instr)
IV2SLS = IV2SLS(endog, exog_constant, instrument = instr_constant).fit()
print(IV2SLS.summary())

                          IV2SLS Regression Results                           
Dep. Variable:                  lwage   R-squared:                      -0.093
Model:                         IV2SLS   Adj. R-squared:                 -0.094
Method:                     Two Stage   F-statistic:                     118.4
                        Least Squares   Prob (F-statistic):           7.21e-94
Date:                Thu, 15 Dec 2022                                         
Time:                        10:29:04                                         
No. Observations:                3003                                         
Df Residuals:                    2998                                         
Df Model:                           4                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.3827      0.465      7.275      0.0

# ML Estimation

In [15]:
l = RandomForestRegressor(max_depth=4) # Model for E[Y|X] = E[θD+g(X)]
g = RandomForestRegressor(max_depth=4) # Model for E[Y - θD|X]=g(X)
r = m = RandomForestRegressor(max_depth=4) # Model for E[D|X]

def score(y, d, l_hat, m_hat, g_hat, smpls):
    "Score function for Single ML"
    u_hat = y - g_hat
    psi_a = -np.multiply(d, d)
    psi_b = np.multiply(d, u_hat)
    return psi_a, psi_b

# Orthogonal + Crossfitting (DML)

In [16]:
data = DoubleMLData(df, y_col=outcome,d_cols=treatment,x_cols=rest,z_cols=instruments)
DML = DoubleMLPLIV(data, l,m,g, n_folds=5,apply_cross_fitting=True)
DML.fit();
print(DML.summary)

          coef   std err         t     P>|t|     2.5 %    97.5 %
educ  0.149897  0.038051  3.939374  0.000082  0.075319  0.224476


# Summary

In [17]:
table = PrettyTable()
table.field_names = ['Estimator', 'θ_hat', 'Std Error','t','p','2.5%','97.25%']
idx = 1
a = ['OLS']+ np.c_[OLS.params[idx], OLS.bse[idx], OLS.tvalues[idx], OLS.pvalues[idx], np.nan, np.nan].reshape(-1).tolist()
table.add_row(a)
a = ['IV-2SLS']+ np.c_[IV2SLS.params[1], IV2SLS.bse[1], IV2SLS.tvalues[1], IV2SLS.pvalues[1], np.nan, np.nan].reshape(-1).tolist()
table.add_row(a)
#a = ['Naive/Single ML (SML)']+ np.array(SML.summary).reshape(-1).tolist()
#table.add_row(a)
#a = ['Orthogonal ML (OML)']+ np.array(OML.summary).reshape(-1).tolist()
#table.add_row(a)
a = ['Double ML (DML)']+ np.array(DML.summary).reshape(-1).tolist()
table.add_row(a)
table.float_format = '0.3'
print(table)

+-----------------+-------+-----------+--------+-------+-------+--------+
|    Estimator    | θ_hat | Std Error |   t    |   p   |  2.5% | 97.25% |
+-----------------+-------+-----------+--------+-------+-------+--------+
|       OLS       | 0.048 |   0.003   | 18.122 | 0.000 |  nan  |  nan   |
|     IV-2SLS     | 0.147 |   0.038   | 3.908  | 0.000 |  nan  |  nan   |
| Double ML (DML) | 0.150 |   0.038   | 3.939  | 0.000 | 0.075 | 0.224  |
+-----------------+-------+-----------+--------+-------+-------+--------+
