In [2]:
import pandas as pd
from model import LinearModel, MedianHeuristic, PvalueLog, ConfIntHSIC
from hsic import hsic_perm_test
import patsy
import numpy as np
from functools import partial
from statsmodels.sandbox.regression.gmm import LinearIVGMM
from statsmodels.formula.api import ols
import itertools
from joblib import Parallel, delayed
from kernel import RBFKernel, CategoryKernel, ProductKernel3, ProductKernel2
from torch.utils.data import TensorDataset
from linearmodels.iv.model import IV2SLS
import torch
from utils import *
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" may also hide deprecation or numerical
# warnings raised by the model fits below — confirm this is intended.
warnings.simplefilter("ignore")

# Seed NumPy's global RNG. No torch seed is set in this cell.
np.random.seed(0)

In [2]:
# Read the Card data (first CSV column becomes the index) and add `exp_bin`,
# the `exper` column cut into 4 bins by pd.cut.
df = (
    pd.read_csv("card.csv", index_col=0)
      .assign(exp_bin=lambda frame: pd.cut(frame['exper'], bins=4))
)

In [3]:
# Baseline OLS of lwage on educ plus categorical controls; "-1" drops the
# explicit intercept term from the formula.
OLS = ols("lwage ~ -1 + educ + C(black) + C(smsa66) + C(exp_bin) + C(south66)", df).fit()
# Same controls, but educ is placed inside the bracketed "[educ ~ nearc4]" term,
# i.e. it is treated as endogenous and instrumented by nearc4.
formula = 'lwage ~ -1 + C(black) + C(smsa66) + C(exp_bin) + C(south66) + [educ ~ nearc4]'
IV = IV2SLS.from_formula(formula, df).fit()

In [4]:
# Coefficient on educ: first from the OLS fit, then from the IV fit.
print(OLS.params['educ'])
print(IV.params['educ'])

0.07184591915928744
0.1420452833699528


In [5]:
# Confidence intervals for the OLS coefficients (conf_int() with its default arguments).
OLS.conf_int()

Unnamed: 0,0,1
C(black)[0],4.943625,5.181382
C(black)[1],4.764392,5.001649
C(smsa66)[T.1],0.087739,0.14778
"C(exp_bin)[T.Interval(5.75, 11.5, closed='right')]",0.220436,0.300349
"C(exp_bin)[T.Interval(11.5, 17.25, closed='right')]",0.3706,0.474731
"C(exp_bin)[T.Interval(17.25, 23.0, closed='right')]",0.422192,0.627235
C(south66)[T.1],-0.12552,-0.061489
educ,0.064823,0.078869


In [6]:
# Confidence intervals for the IV coefficients (conf_int() with its default arguments).
IV.conf_int()

Unnamed: 0,lower,upper
C(black)[T.0],2.418748,5.46843
C(black)[T.1],2.382113,5.269245
"C(exp_bin)[T.(5.75, 11.5]]",0.193146,0.677448
"C(exp_bin)[T.(11.5, 17.25]]",0.314496,1.126283
"C(exp_bin)[T.(17.25, 23.0]]",0.324811,1.804797
C(smsa66)[T.1],0.047774,0.138769
C(south66)[T.1],-0.116857,-0.009571
educ,0.046398,0.237693


In [7]:
def invert_test(p, X, W, Y, Z, kernel_z, method='gamma'):
    """Test a single candidate coefficient ``p`` on ``X`` with an HSIC test.

    The candidate contribution ``p * X`` is subtracted from ``Y``; a
    ``LinearModel`` (created with ``bias=False``) on the covariates ``W`` is
    fitted to the remainder through ``fit_restart``; the residual of that fit
    is then tested against ``Z``.

    Parameters
    ----------
    p : float
        Candidate coefficient for ``X``.
    X, Y : array-like
        Combined as ``Y - p * X`` to form the regression target.
    W : 2-D array
        Covariates fed to the model; ``W.shape[1]`` sets its input dimension.
    Z : array-like
        Variables the residual is tested against.
    kernel_z : kernel object
        Kernel on ``Z``. It is handed to ``LinearModel`` and, for
        ``method='permu'``, to ``hsic_perm_test``. For ``method='gamma'`` only
        its class is inspected to pick the kernel names passed to
        ``dhsic_test``.
    method : {'gamma', 'permu'}
        ``'gamma'`` calls ``dhsic_test(..., method='gamma')``; ``'permu'``
        calls ``hsic_perm_test`` with ``B=100``.

    Returns
    -------
    tuple
        ``(p, pval, hsic)``: the candidate that was tested, followed by the
        two values returned by the selected test.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'gamma'`` nor ``'permu'``.
    """
    # Fail fast: previously an unknown method ran the whole training loop and
    # only then crashed with UnboundLocalError on `pval` / `hsic`.
    if method not in ('gamma', 'permu'):
        raise ValueError(
            "method must be 'gamma' or 'permu', got {!r}".format(method))

    Y_hat = Y - p * X
    se_callback = MedianHeuristic()
    pval_callback = PvalueLog()

    batch_size = 256

    # Residual kernel used while training (fixed sigma=1).
    kernel_e = RBFKernel(sigma=1)
    hsic_net = LinearModel(input_dim=W.shape[1],
                           lr=1e-2,
                           lmd=0.0,
                           kernel_e=kernel_e,
                           kernel_z=kernel_z,
                           bias=False)

    trainloader = torch.utils.data.DataLoader(TensorDataset(to_torch(W), to_torch(Y_hat), to_torch(Z)),
                                              batch_size=batch_size,
                                              shuffle=True, num_workers=0)

    max_epoch = 50
    hsic_net = fit_restart(trainloader, hsic_net, pval_callback, max_epoch,
                           se_callback, num_restart=1, verbose=False)

    # Residual of the fitted model on the full data set.
    res = Y_hat - hsic_net(to_torch(W)).detach().numpy()

    if method == 'gamma':
        # Only an exact CategoryKernel selects the 'discrete' kernel for Z.
        if kernel_z.__class__ == CategoryKernel:
            kernels = ['gaussian', 'discrete']
        else:
            kernels = ['gaussian', 'gaussian']

        pval, hsic = dhsic_test(res, Z, kernels,
                                statistics=True, method='gamma')
    else:  # method == 'permu'
        # The test-time residual kernel takes its sigma from med_sigma(res);
        # it is only needed by the permutation test, so it is built here.
        sigma_e = med_sigma(res)
        pval, hsic = hsic_perm_test(res, Z, RBFKernel(sigma=sigma_e), kernel_z, B=100)

    return p, pval, hsic

In [8]:
# W: design matrix of the categorical controls (no intercept), as a NumPy array.
W = np.asarray(patsy.dmatrix("~ -1 + C(black) + C(smsa66) + C(south66) + C(exp_bin)", data=df, return_type='matrix'))
# Z: design matrix of nearc4 (no intercept) ...
Z = np.asarray(patsy.dmatrix("~ -1 + C(nearc4)", data=df, return_type='matrix'))
# ... with the columns of W appended to its right.
Z = np.hstack([Z, W])
# X (educ) and Y (lwage) are left as pandas Series.
X = df['educ']
Y = df['lwage']

In [9]:
# Product kernel on Z: a CategoryKernel paired with the index list [0, 1] and
# an RBFKernel(1) paired with the index list [2, ..., 8].
# NOTE(review): presumably these are column indices into Z (the nearc4 columns
# first, then the columns of W) — confirm against kernel.ProductKernel2.
kernel_z = ProductKernel2(CategoryKernel(one_hot=False),
                          RBFKernel(1),
                          [0, 1],
                          [2, 3, 4, 5, 6, 7, 8])

In [10]:
# Grid of 64 candidate educ coefficients, evenly spaced over [0.03, 0.23].
param_range = np.linspace(0.03, 0.23, 64)

# Run invert_test with method='permu' once per candidate, dispatched through joblib
# with n_jobs=-1; each call returns a (p, pval, hsic) tuple.
# NOTE(review): the np.random.seed(0) set at the top of the notebook may not carry
# over to joblib worker processes, so results might vary between runs — confirm.
pval_ret = Parallel(n_jobs=-1)(delayed(invert_test)(p, X, W, Y, Z, kernel_z, method='permu') for p in param_range)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
G

In [11]:
# One row per candidate; the columns follow invert_test's (p, pval, hsic) return order.
pval_df = pd.DataFrame(pval_ret, columns=['param', 'pval', 'stat'])
# Write the grid results to card_pval_df.csv (row index omitted).
pval_df.to_csv("card_pval_df.csv", index=False)

In [6]:
# Point estimate: the candidate whose test statistic is smallest (first row on ties).
# NOTE(review): the execution counts restart at this cell (In [6] follows In [11]),
# so pval_df may come from a different kernel session — confirm that
# Restart & Run All reproduces it.
point_estimate = pval_df.loc[pval_df['stat'].eq(pval_df['stat'].min()), 'param'].iloc[0]

In [7]:
# Candidates whose p-value is at least 0.05, i.e. not rejected at that threshold.
accept_df = pval_df[pval_df['pval'] >= 0.05]
# Interval end points: the smallest and the largest accepted candidate.
conf_int = (accept_df['param'].min(), accept_df['param'].max())

In [8]:
# Display the grid-search point estimate.
point_estimate

0.1601587301587301

In [9]:
# Display the (lower, upper) pair built from the accepted candidates.
conf_int

(0.0966666666666666, 0.2077777777777777)