# Medical Checkup Problem

In [None]:
# Enable the commands below when running this program on Google Colab.
# !pip install arviz==0.7
# !pip install pymc3==3.8
# !pip install Theano==1.0.4

import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

import pymc3 as pm

# Plot style: Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8-*' and later releases dropped the old names, so pick whichever
# dark-grid style the installed Matplotlib actually provides.
_darkgrid_styles = [
    style_name
    for style_name in ('seaborn-darkgrid', 'seaborn-v0_8-darkgrid')
    if style_name in plt.style.available
]
if _darkgrid_styles:
    plt.style.use(_darkgrid_styles[0])

# Show three decimals in NumPy and pandas output.
np.set_printoptions(precision=3)
pd.set_option('display.precision', 3)

In [None]:
# Raw measurements for the two groups that are compared below (50 values each).
# The box plot further down labels these values as 'Biomarker'.
EXPERIMENT_GROUP = [56, 55, 55, 62, 54, 63, 47, 58, 56, 56, 57, 52, 53, 50, 50, 57, 57, 55, 60, 65, 53, 43, 60, 51, 52, 60, 54, 49, 56, 54, 55, 57, 53, 58, 54, 57, 60, 57, 53, 61, 60, 58, 56, 52, 62, 52, 66, 63, 54, 50]
CONTROL_GROUP = [33, 37, 59, 41, 42, 61, 46, 25, 32, 35, 55, 44, 45, 41, 33, 61, 46, 16, 48, 34, 27, 37, 28, 31, 32, 20, 50, 42, 26, 55, 45, 36, 51, 51, 50, 48, 47, 39, 36, 35, 32, 38, 25, 66, 54, 27, 35, 34, 49, 39]

In [None]:
# Data visualization: one box per group, drawn side by side on a shared axis.
group_samples = [EXPERIMENT_GROUP, CONTROL_GROUP]
group_labels = ['EXPERIMENT GROUP', 'CONTROL GROUP']

fig, ax = plt.subplots()
ax.boxplot(group_samples, labels=group_labels)
ax.set_ylabel('Biomarker')
plt.show()

In [None]:
# Summary statistics.
# `data` holds one column per group ('Experiment' first, 'Control' second) and
# one row per observation; the model cell reads it through `data.values`.
data = pd.DataFrame({
    'Experiment': EXPERIMENT_GROUP,
    'Control': CONTROL_GROUP,
})
data.describe()

## Bayesian analysis

In [None]:
# Fixed seed so that Restart & Run All reproduces the same posterior draws
# (the value itself is arbitrary).
RANDOM_SEED = 1234

with pm.Model() as model:
    # Prior distribution: an independent flat prior on each group mean
    # (mu[0], mu[1]) and a single flat prior on the standard deviation.
    mu = pm.Uniform('mu', 0, 100, shape=2)
    sigma = pm.Uniform('sigma', 0, 50)

    # Likelihood: `data.values` has one column per group, so the length-2 `mu`
    # broadcasts column-wise (mu[0] -> 'Experiment', mu[1] -> 'Control').
    # `sigma` is a scalar, i.e. both groups share the same standard deviation.
    y_pred = pm.Normal('y_pred', mu=mu, sd=sigma, observed=data.values)

    # Difference of mean, stored in the trace under the key 'mu1 - mu2'.
    delta_mu = pm.Deterministic('mu1 - mu2', mu[0] - mu[1])

    # 21000 draws per chain; the next cell discards the first 1000 as burn-in.
    trace = pm.sample(21000, chains=5, random_seed=RANDOM_SEED)

In [None]:
# Discard the leading draws as burn-in and keep the rest for all later cells.
N_BURN_IN = 1000
chain = trace[N_BURN_IN:]

# Visual convergence check on the retained draws.
pm.traceplot(chain)
plt.show()

In [None]:
# Tabular posterior summary of the retained (post burn-in) draws; left as the
# last expression so the notebook renders the returned table.
pm.summary(chain)

### RQ1: 第1群の平均値が第2群の平均値より高い確率

In [None]:
# Share of posterior draws in which the group-1 mean exceeds the group-2 mean.
mu_samples = chain['mu']
prob_mu1_above_mu2 = np.mean(mu_samples[:, 0] - mu_samples[:, 1] > 0)
print('p(mu1 - mu2 > 0) = {:.3f}'.format(prob_mu1_above_mu2))
# Original author's note: the probability that the research hypothesis "the mean
# of the diseased group is larger than the mean of the healthy group" is correct
# is 100%.

### RQ2: 第1群と第2群の平均値の差の点推定、平均値の差の区間推定

In [None]:
mean_diff_samples = chain['mu1 - mu2']

# EAP (posterior mean) estimate of the difference of means.
print('Point estimation (difference of mean): {:.3f}'.format(np.mean(mean_diff_samples)))

# Two-sided 95% interval taken from the 2.5% and 97.5% posterior quantiles.
# NOTE: despite the `hpd_` prefix these are equal-tailed percentiles, not
# highest-posterior-density bounds.
hpd_0025, hpd_0975 = np.quantile(mean_diff_samples, [0.025, 0.975])
print('Credible Interval (95%): ({:.3f}, {:.3f})'.format(hpd_0025, hpd_0975))
# The difference of means lies in the interval above with 95% probability.

### RQ3: 平均値の差の片側区間推定の下限・上限

In [None]:
# One-sided bounds: the 5% and 95% posterior quantiles of the mean difference.
one_sided_samples = chain['mu1 - mu2']
hpd_005, hpd_0950 = np.quantile(one_sided_samples, [0.05, 0.95])

# With 95% certainty the difference is at most this large.
print('At most (95%): {:.3f}'.format(hpd_0950))
# With 95% certainty the difference is at least this large.
print('At least (95%): {:.3f}'.format(hpd_005))

### RQ4: 平均値の差が基準点cより大きい確率

In [None]:
# Posterior probability that the difference of means exceeds each reference
# point c. The threshold appears once per iteration, so the printed label and
# the comparison can never disagree.
REFERENCE_POINTS = (10, 12, 14)

mean_diff_draws = chain['mu'][:, 0] - chain['mu'][:, 1]
for c in REFERENCE_POINTS:
    print('p(mu1 - mu2 > {}) = {:.3f}'.format(c, (mean_diff_draws > c).mean()))