## 【ポアソン回帰】冠動脈心疾患と喫煙習慣

線形項のみのポアソン回帰のモデルに、更に交互作用項と非線形項を加えて、モデルの比較を行ってみる。

In [None]:
import pymc as pm
import arviz as az

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Notebook-wide matplotlib defaults: font size 12 and an 8 x 6 default figure.
plt.rcParams.update({
    'font.size': 12,
    'figure.figsize': [8, 6],
})

## Load Data

In [None]:
# Read the dataset from 'smoke.csv' (relative path, so resolved against the
# current working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='smoke.csv')

In [None]:
# Show the loaded DataFrame as the cell output for a quick visual check.
data

## Preprocess & Scale Data

In [None]:
# Pull the four columns used below out of the DataFrame as plain arrays.
agecat, deaths, smoke, population = (
    data[column].values
    for column in ('agecat', 'deaths', 'smoke', 'population')
)

In [None]:
# Standardize the age category: subtract the mean and divide by the
# (population, ddof=0) standard deviation.
agecat_mu = agecat.mean()
agecat_sd = agecat.std()

agecat_scaled = (agecat - agecat_mu) / agecat_sd

## Check Data

In [None]:
# Observed death rate (deaths / population) per age category, with the
# points coloured by the `smoke` column.
ax = sns.scatterplot(x=agecat, y=deaths / population, hue=smoke, s=150)

# One tick per age category value present in the data.
ax.set_xticks(agecat)
ax.set_xlabel('Age Category')
ax.set_ylabel('Death Rate');

In [None]:
# Same plot on the log scale, which is the scale of the linear predictor
# in the Poisson regression below.
ax = sns.scatterplot(x=agecat, y=np.log(deaths / population), hue=smoke, s=150)

# One tick per age category value present in the data.
ax.set_xticks(agecat)
ax.set_xlabel('Age Category')
ax.set_ylabel('log( Death Rate )');

## Poisson Regression

In [None]:
# Extra regressors added on top of the linear terms:
#   agesq   - squared standardized age category (non-linear term)
#   smokage - smoke x standardized age category (interaction term)
agesq = np.square(agecat_scaled)
smokage = agecat_scaled * smoke

In [None]:
# Poisson regression for the death counts. The linear predictor `theta` is on
# the log scale; exp(theta) is multiplied by `population`, so exp(theta) acts
# as a death rate per unit of population.
with pm.Model() as model:

    # Normal(mu=0, sigma=10) priors on the intercept and on each coefficient.
    b1 = pm.Normal('b1_intercept', mu=0, sigma=10)
    b2 = pm.Normal('b2_smoke', mu=0, sigma=10)
    b3 = pm.Normal('b3_agecat', mu=0, sigma=10)
    b4 = pm.Normal('b4_agesq', mu=0, sigma=10)
    b5 = pm.Normal('b5_smokage', mu=0, sigma=10)

    # Linear terms (smoke, standardized age) plus the squared-age term and
    # the smoke x age interaction term.
    theta = b1 + b2 * smoke + b3 * agecat_scaled + b4 * agesq + b5 * smokage

    # Expected number of deaths = rate * population.
    mu = pm.math.exp(theta) * population

    # Likelihood: observed death counts are Poisson with mean `mu`.
    obs = pm.Poisson('obs', mu=mu, observed=deaths)

In [None]:
# Draw posterior samples: 3000 draws after 6000 tuning steps.
with model:

    # Sample into a raw trace first, then convert it explicitly with
    # log_likelihood=True; the WAIC / LOO comparison at the end of the
    # notebook works from the stored pointwise log-likelihood.
    trace = pm.sample(3000, tune=6000, return_inferencedata=False)
    idata = pm.to_inference_data(trace, log_likelihood=True)

## Check MCMC-samples

In [None]:
# Trace plots for the sampled parameters; widen the gaps between panels so
# titles and tick labels do not overlap.
az.plot_trace(idata)
plt.subplots_adjust(wspace=0.5, hspace=0.5)

In [None]:
# Tabular posterior summary, shown as the cell output.
az.summary(idata)

In [None]:
# Posterior of each non-intercept coefficient on a 2 x 2 grid, with a
# reference value at 0.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

az.plot_posterior(
    idata,
    var_names=['b2_smoke', 'b3_agecat', 'b4_agesq', 'b5_smokage'],
    ref_val=0,
    ax=axes,
)

# Use the full figure height and add vertical room between the two rows.
fig.subplots_adjust(top=1.0, hspace=0.3)

## Posterior Predictive Check

In [None]:
# Simulate replicated death counts from the fitted model for the posterior
# predictive check.
with model:

    # return_inferencedata=False: the cells below index the result directly
    # by variable name (ppc['obs']).
    ppc = pm.sample_posterior_predictive(idata, return_inferencedata=False)

In [None]:
# Inspect the shape of the simulated 'obs' array before slicing it below.
ppc['obs'].shape

In [None]:
# Posterior predictive check: one panel per observed row, showing the
# distribution of simulated death counts with the observed count marked by a
# red dashed line.
#
# The number of panels and the grid height are derived from the data instead
# of being hard-coded, so the cell keeps working if the dataset has a
# different number of rows.
n_obs = len(deaths)
n_cols = 2
n_rows = -(-n_obs // n_cols)  # ceiling division: enough rows for every panel

fig = plt.figure(figsize=(12, 12))

for k in range(n_obs):

    ax = fig.add_subplot(n_rows, n_cols, k + 1)

    # Draw on this panel explicitly rather than relying on pyplot's implicit
    # "current axes" state.
    # NOTE(review): the slice assumes ppc['obs'] is laid out as
    # (chain, draw, observation) - confirm against the shape printed above.
    az.plot_dist(ppc['obs'][:, :, k], ax=ax)
    ax.axvline(deaths[k], color='r', linestyle='dashed')
    ax.set_title('Age Category = {}, Smoke = {}'.format(agecat[k], smoke[k]))

plt.tight_layout()

## Compare Models (WAIC & LOO)

In [None]:
# Load the inference data of the baseline model from disk.
# NOTE(review): 'idata_base.nc' is presumably written by the companion
# linear-terms-only notebook; it must contain log-likelihood values for the
# comparison below - confirm.
idata_base = az.from_netcdf('idata_base.nc')

In [None]:
# Models to compare, keyed by the label used in the comparison output.
dict_idata = {
    'Linear Terms Only': idata_base,
    'Interaction and Non-linear Term Added': idata,
}

In [None]:
# Compare the two models by WAIC. scale='deviance' reports the criterion on
# the deviance scale, where smaller values indicate the better model.
df_waic = az.compare(dict_idata, ic='waic', scale='deviance')
# Show the comparison table as the cell output.
df_waic

In [None]:
# Plot the WAIC comparison; the trailing semicolon suppresses the returned
# object's repr in the notebook output.
az.plot_compare(df_waic, figsize=(8, 3));

In [None]:
# Same comparison using LOO instead of WAIC, again on the deviance scale
# (smaller is better).
df_loo = az.compare(dict_idata, ic='loo', scale='deviance')
# Show the comparison table as the cell output.
df_loo

In [None]:
# Plot the LOO comparison; the trailing semicolon suppresses the returned
# object's repr in the notebook output.
az.plot_compare(df_loo, figsize=(8, 3));