# Two Paired Samples
A Bayesian alternative to the paired-samples t-test.

In [None]:
# Enable the commands below when running this program on Google Colab.
# !pip install arviz==0.7
# !pip install pymc3==3.8
# !pip install Theano==1.0.4

import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

import pymc3 as pm
import theano.tensor as tt

import statistics
import math

# Notebook-wide display settings: print numbers with three digits of
# precision in both NumPy and pandas output, and use a dark-grid plot style.
np.set_printoptions(precision=3)
pd.options.display.precision = 3
plt.style.use('seaborn-darkgrid')

In [None]:
# Body weight of the same 20 women measured before and after the diet program.
# The two lists are paired: element i of each list belongs to the same person.
WEIGHT_BEFORE = [
    53.1, 51.5, 45.5, 55.5, 49.6,
    50.1, 59.2, 54.7, 53.0, 48.6,
    55.3, 52.6, 51.7, 48.6, 56.4,
    42.9, 50.3, 42.4, 51.2, 39.1,
]
WEIGHT_AFTER = [
    48.3, 45.2, 46.6, 56.6, 41.2,
    44.6, 51.9, 55.5, 45.4, 47.6,
    50.6, 54.5, 49.0, 43.9, 53.8,
    40.1, 52.8, 35.3, 55.6, 38.0,
]

In [None]:
def print_summary(label, values):
    """Print basic descriptive statistics for one sample.

    Parameters
    ----------
    label : str
        Name of the sample; printed in square brackets as a header line.
    values : sequence of float
        The observations to summarise.

    Prints the mean, the population standard deviation, the population
    variance (both via the ``statistics`` module) and the 25th/50th/75th
    percentiles (via ``np.percentile``).
    """
    print('[{}]'.format(label))
    print('mean: {:.3f}'.format(statistics.mean(values)))
    print('standard deviation: {:.3f}'.format(statistics.pstdev(values)))
    print('variance: {:.3f}'.format(statistics.pvariance(values)))
    print('25, 50, 75%: {}'.format(np.percentile(values, [25, 50, 75])))


# Same report for both samples, separated by a blank line.
print_summary('before', WEIGHT_BEFORE)
print()
print_summary('after', WEIGHT_AFTER)

In [None]:
# Side-by-side boxplots of the two samples.
fig, ax = plt.subplots()
ax.boxplot([WEIGHT_BEFORE, WEIGHT_AFTER], labels=['Before', 'After'])
ax.set_ylabel('Weight')
plt.show()

In [None]:
# Scatter plot of the paired measurements (before on x, after on y).
fig, ax = plt.subplots()
ax.scatter(WEIGHT_BEFORE, WEIGHT_AFTER)
ax.set_xlabel('before (kg)')
ax.set_ylabel('after (kg)')

# Identity line y = x spanning the combined range of both axes, drawn behind
# the points so it is easy to see which side of the line each pair falls on.
axis_ranges = np.array([ax.get_xlim(), ax.get_ylim()])
lims = [axis_ranges.min(), axis_ranges.max()]
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)

# Equal scaling on both axes so the identity line sits at 45 degrees.
ax.set_aspect('equal')

plt.show()

In [None]:
# Summary statistics: one column per sample, one row per participant.
data = pd.DataFrame({'Before': WEIGHT_BEFORE, 'After': WEIGHT_AFTER})
data.describe()

In [None]:
# Mean-deviation data: each observation minus its sample mean.
before_mean = sum(WEIGHT_BEFORE) / len(WEIGHT_BEFORE)
mdd_before = [x - before_mean for x in WEIGHT_BEFORE]
after_mean = sum(WEIGHT_AFTER) / len(WEIGHT_AFTER)
mdd_after = [x - after_mean for x in WEIGHT_AFTER]

# Population covariance: mean of the products of paired deviations.
s = sum(b * a for b, a in zip(mdd_before, mdd_after)) / len(mdd_after)
print('Covariance: {:.3f}'.format(s))

# Correlation coefficient: mean of the products of standardised deviations.
# The population standard deviations are computed once up front rather than
# once per element inside the mapping.
sd_before = statistics.pstdev(WEIGHT_BEFORE)
sd_after = statistics.pstdev(WEIGHT_AFTER)
weight_before_std = [x / sd_before for x in mdd_before]
weight_after_std = [x / sd_after for x in mdd_after]
r = sum(b * a for b, a in zip(weight_before_std, weight_after_std)) / len(weight_before_std)
print('Correlation coefficient: {:.3f}'.format(r))

# 2x2 population covariance matrix: variances on the diagonal, covariance off it.
v_before = statistics.pvariance(WEIGHT_BEFORE)
v_after = statistics.pvariance(WEIGHT_AFTER)
cov = np.array([[v_before, s], [s, v_after]])
print('Covariance matrix:\n', cov)

## Bayesian analysis

In [None]:
# Fixed seed so that Restart & Run All reproduces the same posterior draws.
RANDOM_SEED = 42

with pm.Model() as mv_model:
    # Prior distributions: one mean and one standard deviation per sample
    # (index 0 = before, index 1 = after, matching the column order below).
    mu = pm.Normal('mu', 0, 100, shape=2)
    sigma = pm.Uniform('sigma', 0, 100, shape=2)

    # Prior on the correlation between the two samples.
    # `eta` is the LKJ shape parameter and `n` the matrix dimension; these
    # replace the deprecated (n, p) parameter names.
    # https://stackoverflow.com/questions/45534752/model-multivariate-normal-with-separate-means-dimension-mismatch-error
    C_triu = pm.LKJCorr('omega', eta=2, n=2)
    # Indexing with a 2x2 all-zero index matrix copies element 0 of C_triu
    # into every cell; the diagonal is then overwritten with 1 to obtain the
    # 2x2 correlation matrix.
    C = tt.fill_diagonal(C_triu[np.zeros((2, 2), dtype=np.int64)], 1)
    # Covariance = diag(sigma) . C . diag(sigma)
    sigma_diag = tt.nlinalg.diag(sigma)
    cov = tt.nlinalg.matrix_dot(sigma_diag, C, sigma_diag)

    # Likelihood: each row of the observed array is one (before, after) pair.
    y_pred = pm.MvNormal('y_pred', mu=mu, cov=cov, observed=np.stack((WEIGHT_BEFORE, WEIGHT_AFTER)).T)

    # Difference of the two means (before - after), recorded in the trace
    # under the name 'mu1 - mu2'.
    delta_mu = pm.Deterministic('mu1 - mu2', mu[0] - mu[1])

    trace = pm.sample(21000, chains=5, random_seed=RANDOM_SEED)

In [None]:
# Drop the first N_BURN_IN draws of the trace before any further analysis,
# then inspect the remaining draws visually.
N_BURN_IN = 1000
chain = trace[N_BURN_IN:]

pm.traceplot(chain)
plt.show()

In [None]:
# Posterior summary table for the sliced trace `chain`.
pm.summary(chain)

In [None]:
# Posterior of the difference of means, annotated with its mode and a 95% interval.
delta_samples = chain['mu1 - mu2']
pm.plot_posterior(delta_samples, point_estimate='mode', credible_interval=0.95)
plt.xlabel(r'$\mu$1 - $\mu$2')
plt.show()

### RQ1: 「参加後体重」の母平均が「参加前体重」の母平均より軽い確率


In [None]:
# Fraction of posterior draws in which the 'before' mean exceeds the 'after'
# mean. 'mu1 - mu2' is the Deterministic defined in the model as
# mu[0] - mu[1], the same quantity the other research-question cells use.
print('p(mu1 - mu2 > 0) = {:.3f}'.format((chain['mu1 - mu2'] > 0).mean()))

### RQ2: ダイエットプログラムに参加した人と参加前の人では、平均値に何kgの差があるか。また、その減量はどの程度の幅で確信できるか。95%の確信で答えよ。

In [None]:
# Posterior mean and standard deviation of the difference of means.
diff_draws = chain['mu1 - mu2']
print('Point estimation (difference of mean values): {:.3f}kg'.format(diff_draws.mean()))
print('Point estimation (standard deviation): {:.3f}kg'.format(diff_draws.std()))

# Two-sided 95% interval taken from the 2.5% and 97.5% quantiles of the draws.
# NOTE: despite the `hpd_` prefix these are equal-tailed quantiles.
hpd_0025, hpd_0975 = np.quantile(diff_draws, [0.025, 0.975])
print('Credible Interval (95%): ({:.3f}, {:.3f})'.format(hpd_0025, hpd_0975))

### RQ3: ダイエットプログラムに参加した人と参加前の人では、少なくともどれだけ体重差があるか。あるいは、どの程度の体重差しか高々見込めないか。95%の確信で答えよ。

In [None]:
# One-sided 95% bounds on the difference of means: 95% of the draws lie below
# the 0.95 quantile ("at most") and 95% lie above the 0.05 quantile ("at least").
# NOTE: despite the `hpd_` prefix these are plain quantiles of the draws.
hpd_005, hpd_095 = np.quantile(chain['mu1 - mu2'], [0.05, 0.95])
print('At most (95%): {:.3f}kg'.format(hpd_095))
print('At least (95%): {:.3f}kg'.format(hpd_005))

### RQ4: ダイエットプログラムに参加した人と参加前の人の平均値差で、2kgより減量できる確率が70%より大きいならば参加したい。参加すべきか、あるいは見送るべきか。

In [None]:
# Fraction of posterior draws in which the difference of means exceeds the
# 2 kg threshold named in the research question.
MIN_LOSS_KG = 2
prob_exceeds = (chain['mu1 - mu2'] > MIN_LOSS_KG).mean()
print('p(mu1 - mu2 > 2kg) = {:.3f}'.format(prob_exceeds))