### Functions

In [1]:
import numpy as np
import pandas as pd
import scipy.linalg as la

# HFUL function
def b_hful(y, X, Z, C=1.0):
    """Heteroskedasticity-robust Fuller (HFUL) instrumental-variables estimator.

    With P = Z (Z'Z)^+ Z' and D = diag(P_11, ..., P_nn), the estimator is

        alpha_tilde = smallest eigenvalue of (Xbar'Xbar)^{-1} Xbar'(P - D)Xbar,
                      where Xbar = [y, X]
        alpha_hat   = (alpha_tilde - (1 - alpha_tilde) C / n)
                      / (1 - (1 - alpha_tilde) C / n)
        b           = (X'(P - D)X - alpha_hat X'X)^{-1}
                      (X'(P - D)y - alpha_hat X'y)

    The n x n matrix P is never formed: only its diagonal and the small
    cross-products Z'Xbar are needed, so memory use is O(n * l) rather
    than O(n^2).

    Parameters
    ----------
    y : array-like, shape (n,) or (n, 1)
        Dependent variable.
    X : array-like, shape (n, k)
        Regressors (endogenous and included exogenous).
    Z : array-like, shape (n, l)
        Instruments (including the exogenous regressors).
    C : float, default 1.0
        Fuller constant used in the alpha_hat adjustment.

    Returns
    -------
    numpy.ndarray, shape (k,)
        HFUL coefficient estimates, ordered like the columns of X.

    Raises
    ------
    ValueError
        If y, X and Z do not have the same number of rows.
    numpy.linalg.LinAlgError
        If Xbar'Xbar is not positive definite or the final system is singular.
    """
    y = np.asarray(y, dtype=float).reshape(-1)
    X = np.asarray(X, dtype=float)
    Z = np.asarray(Z, dtype=float)

    # Dimensions
    n = y.shape[0]
    if X.shape[0] != n or Z.shape[0] != n:
        raise ValueError("y, X and Z must have the same number of rows")

    # Stack the outcome and the regressors: Xbar = [y, X]
    Xbar = np.column_stack((y, X))

    # Pieces of the projection P = Z (Z'Z)^+ Z'; pinv tolerates collinear instruments
    ZZ_inv = np.linalg.pinv(Z.T @ Z)
    ZXbar = Z.T @ Xbar
    # Diagonal of P, computed row by row: P_ii = z_i' (Z'Z)^+ z_i
    p_diag = np.einsum('ij,ij->i', Z @ ZZ_inv, Z)

    # Xbar'(P - D)Xbar = Xbar'P Xbar - sum_i P_ii Xbar_i Xbar_i'
    XbarXbar = Xbar.T @ Xbar
    A = ZXbar.T @ ZZ_inv @ ZXbar - (Xbar * p_diag[:, None]).T @ Xbar
    A = (A + A.T) / 2  # remove floating-point asymmetry before the symmetric solver

    # Smallest eigenvalue of (Xbar'Xbar)^{-1} A via the generalized symmetric
    # problem A v = lambda (Xbar'Xbar) v; eigh returns eigenvalues in ascending order
    alpha = la.eigh(A, XbarXbar, eigvals_only=True)[0]

    # Fuller-type adjustment of alpha
    shrink = (1.0 - alpha) * C / n
    AlphaHat = (alpha - shrink) / (1.0 - shrink)

    # Xbar = [y, X], so the X-rows of M hold X'(P-D)y - a X'y (column 0)
    # and X'(P-D)X - a X'X (remaining columns)
    M = A - AlphaHat * XbarXbar
    bHFUL = la.solve(M[1:, 1:], M[1:, 0])
    return bHFUL

# TSLS function
def b_iv(y,X,Z):
    """Least-squares solution of the instrument moment equations Z'X b = Z'y.

    Returns the complete tuple produced by ``np.linalg.lstsq``
    (solution, residuals, rank, singular values); the coefficient
    vector is element 0 of that tuple.
    """
    moments_lhs = Z.T @ X
    moments_rhs = Z.T @ y
    return np.linalg.lstsq(moments_lhs, moments_rhs, rcond=None)

# TSLS-GMM function (thank you Shreya!)
def b_iv_gmm(y,X,Z):
    """Two-stage least squares written as GMM with weight matrix (Z'Z)^+.

    Computes (X'Z W Z'X)^+ (X'Z W Z'y) with W = pinv(Z'Z); pseudo-inverses
    are used for both inversions.
    """
    weight = np.linalg.pinv(Z.T @ Z)
    cross_xz = X.T @ Z
    cross_zx = Z.T @ X
    cross_zy = Z.T @ y
    # X'Z W is shared by the "bread" and the right-hand side
    xz_weighted = cross_xz @ weight
    bread = np.linalg.pinv(xz_weighted @ cross_zx)
    return bread @ (xz_weighted @ cross_zy)

# OLS function
def b_ols(y,X):
    """Ordinary least squares: solve the normal equations (X'X) b = X'y."""
    gram = X.T @ X
    moment = X.T @ y
    return np.linalg.solve(gram, moment)

### Replicate Table V in AK1991

In [2]:
# Import data
ak_df = pd.read_stata('angrist-krueger91.dta')

# Create variables needed for regressions
# Vectorised square of the whole column (replaces a slow row-by-row apply)
ak_df['agesquared'] = ak_df['ageq'].astype(float) ** 2 # age squared

# dtype=float is explicit because the default get_dummies dtype depends on the
# pandas version (bool in pandas >= 2.0); bool dummies mixed with float columns
# would turn the design matrices built from them into object arrays.
region_dummy = pd.get_dummies(ak_df['region'], drop_first = True, dtype = float) # region dummy, drop region 0
yob_dummy = pd.get_dummies(ak_df['yob'], drop_first = True, dtype = float) # yob dummy, drop 1930
qob_dummy = pd.get_dummies(ak_df['qob'], drop_first = True, dtype = float) # qob dummy, drop 1
ak_df['qob_yob'] = ak_df.qob.map(str) + "_" + ak_df.yob.map(str) # concat qob and yob to create dummies
qob_yob_dummy = pd.get_dummies(ak_df['qob_yob'], dtype = float) # qob-yob interaction dummy

In [3]:
# Create y, X, Z matrices - will do for columns 5&6 and 7&8 (difference is the age vars)
y = ak_df['logwage']

# drop the year-1930 interaction dummies, then the qob-1 interaction dummies
qob_yob_dummy = qob_yob_dummy.drop(columns=list(qob_yob_dummy.filter(regex='_1930')))
qob_yob_dummy = qob_yob_dummy.drop(columns=list(qob_yob_dummy.filter(regex='1_')))

constant = pd.DataFrame(np.ones(ak_df.shape[0]))

# Controls shared by every specification, and the extra age terms of columns 7 & 8
base_controls = ['married', 'black', 'smsa']
age_controls = ['ageq', 'agesquared']


def _design_matrix(parts):
    """Concatenate the given frames column-wise and return a float64 ndarray.

    The dtype is forced because dummy columns can be bool or uint8 depending on
    the pandas version; a mixed bool/float frame would otherwise convert to an
    object array that the linear-algebra routines cannot use.
    """
    return pd.concat(parts, axis=1).to_numpy(dtype=float)


# columns 5 & 6: intercept, education (instrumented with qob-yob dummies),
#     married, black, smsa, yob dummy, region dummy
X_col56 = _design_matrix([constant, ak_df[['edu'] + base_controls],
                          yob_dummy, region_dummy])
Z_col56 = _design_matrix([constant, qob_yob_dummy, qob_dummy, ak_df[base_controls],
                          yob_dummy, region_dummy])

# columns 7 & 8: intercept, education (instrumented with qob-yob dummies), married,
#     black, smsa, age, age squared, yob dummy, region dummy
X_col78 = _design_matrix([constant, ak_df[['edu'] + base_controls + age_controls],
                          yob_dummy, region_dummy])
Z_col78 = _design_matrix([constant, qob_yob_dummy, qob_dummy, ak_df[base_controls + age_controls],
                          yob_dummy, region_dummy])

In [4]:
# 2SLS (GMM form, which replicated the estimates from the paper) and OLS
# columns 5/6
b_tsls56 = b_iv_gmm(y, X_col56, Z_col56)
b_ols56 = b_ols(y, X_col56)

# columns 7/8
b_tsls78 = b_iv_gmm(y, X_col78, Z_col78)
b_ols78 = b_ols(y, X_col78)

# Element 1 is the education coefficient (element 0 is the intercept);
# printed in the order TSLS 5/6, OLS 5/6, TSLS 7/8, OLS 7/8
for coefs in (b_tsls56, b_ols56, b_tsls78, b_ols78):
    print(coefs[1])

0.08055179488422937
0.06324573304217924
0.05995357591746142
0.06323780159470752


In [None]:
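# HFUL estimates for columns 5/6 and 7/8 -- the calls below are commented out and this cell was never executed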
#b_hful56 = b_hful(y, X_col56, Z_col56)
#print(b_hful56[1])
#b_hful78 = b_hful(y, X_col78, Z_col78)
#print(b_hful78[1])