# Simulation of **hypothesis testing** and **confidence intervals** (CIs) with **invalid IVs** for the 2SLS, PT-2SLS, and proposed 2SIR methods.

In [6]:
from nl_causal.ts_models import _2SLS, _2SIR
from nl_causal.linear_reg import L0_IC
import numpy as np
from sklearn.preprocessing import normalize
from sim_data import sim
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import power_transform, quantile_transform

In [7]:
## Simulate a dataset
# Fix the global NumPy seed so the draw of theta0 below is reproducible.
np.random.seed(0)
n, p = 5000, 50
beta0 = 0.0

# Random direction for theta0, rescaled to unit Euclidean length.
theta0 = np.random.randn(p)
theta0 = theta0 / np.sqrt((theta0 ** 2).sum())

# Only the first five entries of alpha0 are nonzero.
# NOTE(review): presumably these mark the invalid IVs -- confirm against `sim`.
alpha0 = np.zeros(p)
alpha0[:5] = 1.

Z, X, y, phi = sim(
    n, p, theta0, beta0,
    alpha0=alpha0, case='inverse', feat='AP-normal',
)

## Normalize the dataset
# Mean-center only (with_std=False): Z through the scaler, X and y by hand.
center = StandardScaler(with_std=False)
mean_X = X.mean()
mean_y = y.mean()
Z = center.fit_transform(Z)
X = X - mean_X
y = y - mean_y
# Rescale the centered outcome to unit standard deviation; keep the factor.
y_scale = y.std()
y = y / y_scale

## Generate the two-stage dataset
# A 50/50 split with a fixed random_state: half 1 feeds stage 1, half 2 feeds stage 2.
Z1, Z2, X1, X2, y1, y2 = train_test_split(
    Z, X, y, test_size=0.5, random_state=42
)
n1 = len(Z1)
n2 = len(Z2)

# Cross-product summaries: Z'Z and Z'X on half 1, Z'Z and Z'y on half 2.
LD_Z1 = np.dot(Z1.T, Z1)
cov_ZX1 = np.dot(Z1.T, X1)
LD_Z2 = np.dot(Z2.T, Z2)
cov_ZY2 = np.dot(Z2.T, y2)

In [8]:
## 2SLS
# NOTE(review): the recorded output of this cell is
# `ValueError: ndarray is not Fortran contiguous`, so no 2SLS p-value was
# produced in the saved run. The traceback is not part of the saved output;
# presumably the error is raised inside the sparse-regression path
# (this is the only cell that passes a non-None `sparse_reg`) -- confirm and
# fix before relying on this cell under Restart & Run All.
# specify a sparse regression model to detect invalid IVs
# Ks enumerates 0..p-1; presumably the candidate sparsity levels searched by
# L0_IC -- TODO confirm against the nl_causal documentation.
Ks = range(p)
# alphas is a log-spaced grid, 10**(-1), 10**(-0.7), ..., stopping below 10**3.
reg_model = L0_IC(fit_intercept=False, alphas=10**np.arange(-1,3,.3),
				Ks=Ks, max_iter=10000, refit=False, find_best=False)
LS = _2SLS(sparse_reg=reg_model)
## Stage-1 fit theta 
# Uses only the first-half cross-products (LD_Z1, cov_ZX1).
LS.fit_theta(LD_Z1, cov_ZX1)
## Stage-2 fit beta
# Uses the second-half cross-products and the second-half sample size n2.
LS.fit_beta(LD_Z2, cov_ZY2, n2)
## produce p_value for beta
# test_effect is called for its side effect; the p-value is read from LS.p_value.
LS.test_effect(n2, LD_Z2, cov_ZY2)
print('p-value for 2SLS: %.5f' %LS.p_value)

ValueError: ndarray is not Fortran contiguous

In [23]:
## PT-2SLS: 2SLS run on a Yeo-Johnson power-transformed exposure
# power_transform is given a single column, so X1 is reshaped on the way in
# and flattened back to 1-D on the way out.
PT_X1 = power_transform(
    X1.reshape(-1, 1),
    method='yeo-johnson',
).flatten()
# Cross-product of the stage-1 instruments with the transformed exposure.
PT_cor_ZX1 = np.dot(Z1.T, PT_X1)

# No sparse regression model is supplied to this estimator.
PT_LS = _2SLS(sparse_reg=None)

## Stage 1: fit theta from the first-half summaries
PT_LS.fit_theta(LD_Z1, PT_cor_ZX1)

## Stage 2: fit beta from the second-half summaries
PT_LS.fit_beta(LD_Z2, cov_ZY2, n2)

## Test the effect; the p-value is read back from PT_LS.p_value
PT_LS.test_effect(n2, LD_Z2, cov_ZY2)
print('p-value for PT-2SLS: %.5f' % PT_LS.p_value)

p-value for PT-2SLS: 0.84561


In [24]:
## the proposed 2SIR
# No sparse regression model is supplied to this estimator.
SIR = _2SIR(sparse_reg=None)
## Stage-1 fit theta
# Unlike the 2SLS cells, stage 1 here is given the raw first-half samples
# (Z1, X1) rather than their cross-product summaries.
SIR.fit_theta(Z1, X1)
## Stage-2 fit beta
SIR.fit_beta(LD_Z2, cov_ZY2, n2)
## produce p-value for beta
# test_effect is called for its side effect; the p-value is read from SIR.p_value.
SIR.test_effect(n2, LD_Z2, cov_ZY2)
# Label corrected: this is the 2SIR result (it was mislabelled "PT-2SLS").
print('p-value for 2SIR: %.5f' %SIR.p_value)

p-value for PT-2SLS: 0.59967
