# Simulation of **hypothesis testing** and **confidence intervals** (CIs) for the 2SLS, PT-2SLS, and the proposed 2SIR methods.

In [49]:
from nl_causal.ts_models import _2SLS, _2SIR
import numpy as np
from sklearn.preprocessing import normalize
from sim_data import sim
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import power_transform, quantile_transform

## A toy example with beta0 = 0.10: both `2SLS` and `PT-2SLS` provide wrong p-values and CIs, whereas the proposed 2SIR test remains valid (see the more detailed simulations in `./sim`).

In [56]:
## simulate a dataset
np.random.seed(1)
n, p = 5000, 50
beta0 = 0.10
theta0 = np.random.randn(p)
theta0 = theta0 / np.sqrt(np.sum(theta0**2))
Z, X, y, phi = sim(n, p, theta0, beta0, case='inverse', feat='normal')
## normalize the dataset
center = StandardScaler(with_std=False)
mean_X, mean_y = X.mean(), y.mean()
Z, X, y = center.fit_transform(Z), X - mean_X, y - mean_y
y_scale = y.std()
y = y / y_scale
## generate two-stage dataset
Z1, Z2, X1, X2, y1, y2 = train_test_split(Z, X, y, test_size=0.5, random_state=42)
n1, n2 = len(Z1), len(Z2)
LD_Z1, cov_ZX1 = np.dot(Z1.T, Z1), np.dot(Z1.T, X1)
LD_Z2, cov_ZY2 = np.dot(Z2.T, Z2), np.dot(Z2.T, y2)

In [57]:
## 2SLS
LS = _2SLS(sparse_reg=None)
## Stage-1 fit theta
LS.fit_theta(LD_Z1, cov_ZX1)
## Stage-2 fit beta
LS.fit_beta(LD_Z2, cov_ZY2, n2)
## produce p_value and CI for beta
LS.test_effect(n2, LD_Z2, cov_ZY2)
LS.CI_beta(n1, n2, Z1, X1, LD_Z2, cov_ZY2)
LS.CI[0] = max(LS.CI[0], 0.)
print('p-value based on 2SLS: %.5f' %LS.p_value)
print('CI based on 2SLS: %s' %(LS.CI*y_scale))

p-value based on 2SLS: 0.27777
CI based on 2SLS: [0.         0.08686183]


In [58]:
## PT-2SLS
PT_X1 = power_transform(X1.reshape(-1,1), method='yeo-johnson').flatten()
PT_cor_ZX1 = np.dot(Z1.T, PT_X1)
PT_LS = _2SLS(sparse_reg=None)
## Stage-1 fit theta
PT_LS.fit_theta(LD_Z1, PT_cor_ZX1)
## Stage-2 fit beta
PT_LS.fit_beta(LD_Z2, cov_ZY2, n2)
## produce p-value and CI for beta
PT_LS.test_effect(n2, LD_Z2, cov_ZY2)
PT_LS.CI_beta(n1, n2, Z1, X1, LD_Z2, cov_ZY2)
PT_LS.CI[0] = max(PT_LS.CI[0], 0.)
print('p-value based on PT-2SLS: %.5f' %PT_LS.p_value)
print('CI based on 2SLS: %s' %(PT_LS.CI*y_scale))

p-value based on PT-2SLS: 0.32749
CI based on 2SLS: [0.         0.08361912]


In [59]:
## the proposed 2SIR
SIR = _2SIR(sparse_reg=None)
## Stage-1 fit theta
SIR.fit_theta(Z1, X1)
## Stage-2 fit beta
SIR.fit_beta(LD_Z2, cov_ZY2, n2)
## generate CI for beta
SIR.test_effect(n2, LD_Z2, cov_ZY2)
SIR.CI_beta(n1, n2, Z1, X1, LD_Z2, cov_ZY2)
print('p-value based on 2SIR: %.5f' %SIR.p_value)
print('CI based on 2SIR: %s' %(SIR.CI*y_scale))

p-value based on 2SIR: 0.00003
CI based on 2SIR: [0.05994451 0.17434128]
