## Example: (nonlinear) IV causal inference by `nl_causal`
> Below is an example that demonstrates the usage of `ts_models` in `nl_causal`.

In [2]:
## import global libraries
import numpy as np

## Simulate Data

- **library:** `nl_causal.base.sim`
- **Two-Stage Datasets:** two independent datasets are used, one per stage; **2SLS** and **2SIR** require different types of data:
  * For 2SLS:
    + Stage 1. LD matrix (`np.dot(Z1.T, Z1)`) + XZ_sum (`np.dot(Z1.T, X1)`)
    + Stage 2. ZY_sum (GWAS summary) (`np.dot(Z2.T, y2)`)
  * For 2SIR:
    + Stage 1. individual-level data `Z1` and `X1`
    + Stage 2. ZY_sum (GWAS summary) (`np.dot(Z2.T, y2)`)
- **Remarks:** In terms of data, the advantage of 2SLS is that it requires only summary statistics (XZ and ZY) in both Stages 1 and 2.

In [20]:
## import libraries
from nl_causal.base import sim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## simulate a dataset (the seed is fixed so the draw is reproducible)
np.random.seed(1)
n = 2000
p = 50
beta0 = 0.10
theta0 = np.ones(p) / np.sqrt(p)
Z, X, y, phi = sim(n, p, theta0, beta0, case='inverse', feat='normal')

## normalize the dataset: center Z, X and y, then rescale y to unit std
center = StandardScaler(with_std=False)
mean_X = X.mean()
mean_y = y.mean()
Z = center.fit_transform(Z)
X = X - mean_X
y = y - mean_y
## keep the scale factor: results on the rescaled y can be mapped back with it
y_scale = y.std()
y = y / y_scale

## generate two-stage dataset: a 50/50 split into stage-1 and stage-2 samples
Z1, Z2, X1, X2, y1, y2 = train_test_split(Z, X, y, test_size=0.5, random_state=42)
n1 = Z1.shape[0]
n2 = Z2.shape[0]
## per-stage cross-products (Z'Z, Z'X and Z'y) passed to the models as summaries
LD_Z1 = Z1.T.dot(Z1)
cov_ZX1 = Z1.T.dot(X1)
LD_Z2 = Z2.T.dot(Z2)
cov_ZY2 = Z2.T.dot(y2)

## Models
- **library:** `nl_causal.ts_models._2SLS` and `nl_causal.ts_models._2SIR`
- **Methods:** [2SLS](https://doi.org/10.1080/01621459.2014.994705) and [2SIR](https://openreview.net/pdf?id=cylRvJYxYI)

In [23]:
from nl_causal.ts_models import _2SLS, _2SIR

In [72]:
## 2SLS (two-stage least squares)
## sparse_reg=None: presumably no sparse regularization is applied -- confirm
LS = _2SLS(sparse_reg=None)
## Stage-1 fit theta from the stage-1 cross-products Z1'Z1 and Z1'X1
LS.fit_theta(LD_Z1, cov_ZX1)
## Stage-2 fit beta from the stage-2 cross-products Z2'Z2 and Z2'y2 and the stage-2 sample size
LS.fit_beta(LD_Z2, cov_ZY2, n2)
## produce p_value and CI for beta
## NOTE(review): CI_beta is passed the individual-level stage-1 data (Z1, X1),
## not only summary statistics -- confirm whether that is required.
LS.test_effect(n2, LD_Z2, cov_ZY2)
LS.CI_beta(n1, n2, Z1, X1, LD_Z2, cov_ZY2)
print('p-value based on 2SLS: %.5f' %LS.p_value)
## y was divided by y_scale during normalization, so the CI is multiplied by
## y_scale before it is printed
print('CI based on 2SLS: %s' %(LS.CI*y_scale))

p-value based on 2SLS: 0.60930
CI based on 2SLS: [-0.13427608  0.08645929]


In [27]:
import pandas as pd

def print_msg_box(msg, indent=1, width=None, title=None):
    """Print ``msg`` inside a Unicode box, with an optional underlined title.

    Parameters
    ----------
    msg : str
        Text to display; it is split on newlines, one box row per line.
    indent : int, default 1
        Number of spaces of padding between the side borders and the text.
    width : int, optional
        Width of the text area. When omitted (or 0) it is the length of the
        longest row, the title included, so a long title cannot overflow
        the right-hand border.
    title : str, optional
        Heading shown at the top of the box, underlined with dashes.

    Returns
    -------
    None
        The box is written to standard output.
    """
    lines = msg.split('\n')
    space = " " * indent
    if not width:
        # Size the box to the longest row; the title counts as a row too,
        # otherwise a title wider than the message breaks the border.
        width = max(map(len, lines))
        if title:
            width = max(width, len(title))
    border = "═" * (width + indent * 2)
    rows = [f'╔{border}╗']  # upper border
    if title:
        rows.append(f'║{space}{title:<{width}}{space}║')  # title
        rows.append(f'║{space}{"-" * len(title):<{width}}{space}║')  # underline
    rows.extend(f'║{space}{line:<{width}}{space}║' for line in lines)
    rows.append(f'╚{border}╝')  # lower border
    print('\n'.join(rows))

In [103]:
## show arrays with 4 decimals (this changes NumPy's global print options)
np.set_printoptions(precision=4)

## summary record of the 2SLS fit
LS_result = {
    'model': "2SLS: x = z^T theta + omega; y = beta x + z^T alpha + epsilon",
    'est beta': LS.beta,
    'p-value': LS.p_value,
    'CI': str(LS.CI),
}

## NOTE(review): beta and CI are shown here without the y_scale factor that the
## earlier 2SLS cell applies to the CI -- confirm which scale is intended.
msg_template = "\n".join([
    "x = z^T theta + omega; ",
    "y = beta x + z^T alpha + epsilon. ",
    "--- ",
    "beta: causal effect from x to y. ",
    "--- ",
    "Est beta (CI): %.3f (CI: %s) ",
    "p-value: %.4f, -log10(p): %.4f",
])
msg = msg_template % (LS.beta, LS.CI, LS.p_value, -np.log10(LS.p_value))

print_msg_box(msg, indent=1, title="2SLS")

╔════════════════════════════════════════════════╗
║ 2SLS                                           ║
║ ----                                           ║
║ x = z^T theta + omega;                         ║
║ y = beta x + z^T alpha + epsilon.              ║
║ ---                                            ║
║ beta: causal effect from x to y.               ║
║ ---                                            ║
║ Est beta (CI): -0.016 (CI: [-0.0879  0.0566])  ║
║ p-value: 0.6093, -log10(p): 0.2152             ║
╚════════════════════════════════════════════════╝


x = z^T theta + omega; 
y = beta x + z^T alpha + epsilon. 

