# Independent One Factorial Design
A Bayesian alternative to one-way ANOVA.

In [None]:
# Enable the commands below when running this program on Google Colab.
# !pip install arviz==0.7
# !pip install pymc3==3.8
# !pip install Theano==1.0.4

import numpy as np 
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

import pymc3 as pm

import statistics
import math

# Apply the 'seaborn-darkgrid' matplotlib style to the figures drawn below.
plt.style.use('seaborn-darkgrid')
# Limit floating-point output of NumPy arrays and pandas objects to a
# precision of 3 so the printed tables stay compact.
np.set_printoptions(precision=3)
pd.set_option('display.precision', 3)

In [None]:
# Sulfurous-gas concentration: 6 measurements (different days) per season.
# One factor (season) with 4 levels (Spring, Summer, Autumn, Winter).
# Each column of `data` holds the 6 measurements of one level.
levels = ['Spring', 'Summer', 'Autumn', 'Winter']
data = pd.DataFrame(
    {
        'Spring': [10, 10, 9, 11, 12, 11],
        'Summer': [8, 10, 8, 10, 12, 9],
        'Autumn': [8, 8, 11, 11, 14, 15],
        'Winter': [14, 12, 11, 16, 13, 12],
    },
    columns=levels,
)
display(data)

## Bayesian analysis

In [None]:
with pm.Model() as ow_anova_model:
    # Prior distributions: one mean per level and a single residual SD shared
    # by all levels, each Uniform(0, 50).
    mu = pm.Uniform('mu', 0, 50, shape=len(levels))
    sigma = pm.Uniform('sigma', 0, 50)

    # Likelihood: `mu` has one entry per column of `data`, so every column
    # (level) gets its own mean while all observations share `sigma`.
    y_pred = pm.Normal('y_pred', mu=mu, sigma=sigma, observed=data.values)

    # Grand mean: the average of the level means.
    total_mean = pm.Deterministic('total_mean', (pm.math.sum(mu) / len(levels)))

    # Effect of each level: deviation of its mean from the grand mean.
    a = pm.Deterministic('a', mu - total_mean)

    # Standard deviation of the level effects (factor: season).
    # The square root is applied, so this is an SD rather than a variance.
    sigma_factor = pm.Deterministic('sigma_factor',
                        pm.math.sqrt(pm.math.sum(a**2) / len(levels)))
    
    # Coefficient of determination: factor variance divided by the sum of
    # factor variance and residual variance.
    eta_square = pm.Deterministic('eta_square', sigma_factor**2 / (sigma_factor**2 + sigma**2))  

    # Effect size: factor SD relative to the residual SD.
    delta = pm.Deterministic('delta', sigma_factor / sigma)

    # Post-hoc contrasts: level index 3 (Winter) minus index 0 (Spring) and
    # index 1 (Summer). The names contain spaces and are used verbatim as
    # trace keys later on.
    mu4_mu1 = pm.Deterministic('mu4 - mu1', mu[3] - mu[0])
    mu4_mu2 = pm.Deterministic('mu4 - mu2', mu[3] - mu[1])

    # Contrasts standardized by the residual SD.
    delta_41 = pm.Deterministic('delta_41', mu4_mu1 / sigma)  # effect size
    delta_42 = pm.Deterministic('delta_42', mu4_mu2 / sigma)  # effect size

    # Run MCMC: 21000 draws requested, 5 chains.
    trace = pm.sample(21000, chains=5)

In [None]:
# Drop the first 1000 draws of the trace (presumably as burn-in) and use the
# remainder, `chain`, for all of the analysis below.
chain = trace[1000:]
# Visual convergence check: trace plots of the sampled variables.
pm.traceplot(chain)
plt.show()

In [None]:
# Posterior summary table for the variables stored in `chain`; as the last
# expression of the cell, its value is what the notebook displays.
pm.summary(chain)

In [None]:
# Box plot of the posterior draws of each level effect a_j, one box per season.
level_effects = chain['a']
boxes = [level_effects[:, column] for column, _ in enumerate(levels)]
plt.boxplot(boxes, labels=levels)
plt.ylim(-5, 6)
plt.xlabel('Effect of level')
plt.show()

### 水準の効果の有無（どの水準（春夏秋冬）が大きいのか小さいのか）
基準(0)より大きいか小さいか

In [None]:
# Posterior probability (in percent) that each level effect a_j lies above,
# respectively below, the reference value 0.
level_effects = chain['a']

print('-- a_j > 0 --')
for season, share in zip(levels, (level_effects > 0).mean(axis=0)):
    print('{}: {:.3f} %'.format(season, share * 100))

print()

print('-- a_j < 0 --')
for season, share in zip(levels, (level_effects < 0).mean(axis=0)):
    print('{}: {:.3f} %'.format(season, share * 100))

### 要因の効果の大きさ（一つ一つの水準（季節）の効果ではなく、水準をまとめた「季節」という要因の効果の大きさ）

In [None]:
# Posterior of the coefficient of determination with a 95% credible interval;
# the mode is used as the point estimate.
pm.plot_posterior(chain['eta_square'], credible_interval=0.95, point_estimate='mode')
plt.xlabel('Coefficient of determination (CoD)')

# Same plot for the effect size (factor SD relative to the residual SD).
pm.plot_posterior(chain['delta'], credible_interval=0.95, point_estimate='mode')
plt.xlabel('Effect size')

plt.show()

In [None]:
def _posterior_summary(samples):
    """Return (mean, SD, 2.5% quantile, 97.5% quantile) of posterior draws."""
    return (
        samples.mean(),
        samples.std(),
        np.quantile(samples, 0.025),
        np.quantile(samples, 0.975),
    )


# Size of the factor effect, expressed as the SD of the level effects.
factor_sd = chain['sigma_factor']
print('Effect (SD) of Factor A (season): {:.3f} ({:.3f}) [{:.3f}, {:.3f}] = {:.1f} x 10^-3 ppm'.format(
    *_posterior_summary(factor_sd), factor_sd.mean()))

# CoD = 0 (0%)   -> the factor does not explain the observed data at all.
# CoD = 1 (100%) -> the factor explains the observed data well.
cod = chain['eta_square']
print('CoD: {:.3f} ({:.3f}) [{:.3f}, {:.3f}] = {:.1f} %'.format(
    *_posterior_summary(cod), cod.mean() * 100))

# Effect size: factor SD relative to the residual SD, also shown in percent.
effect_size = chain['delta']
print('Effect size: {:.3f} ({:.3f}) [{:.3f}, {:.3f}] = {:.1f} %'.format(
    *_posterior_summary(effect_size), effect_size.mean() * 100))

### 水準間の比較

In [None]:
def compare(a, b):
    """Return the share of posterior draws in which mean ``a`` exceeds mean ``b``.

    ``a`` and ``b`` are column indices into ``chain['mu']``. The result is
    the fraction of draws for which ``mu[a] - mu[b]`` is strictly positive.
    """
    mu_draws = chain['mu']
    difference = mu_draws[:, a] - mu_draws[:, b]
    is_larger = difference > 0
    return is_larger.mean()

In [None]:
# Entry (i, j): probability that the level in row i is greater than the level
# in column j. The diagonal is fixed at 0.
result = pd.DataFrame(
    [[0 if row == col else compare(row, col) for col in range(len(levels))]
     for row in range(len(levels))],
    columns=levels,
    index=levels,
)
display(result)
# The statements that can each be asserted "separately" with a probability of
# 95% or more are:
# mu_4 > mu_1 (Winter > Spring)
# mu_4 > mu_2 (Winter > Summer)
# "Separately": because the probability that they hold simultaneously differs.

### RQ1: 冬 > 秋 > 春 > 夏 の順にガス濃度が高い

In [None]:
# RQ1: probability that Winter > Autumn > Spring > Summer holds as a whole.
# All three inequalities must be true within the same posterior draw, so the
# indicator arrays are combined element-wise and averaged afterwards.
# Multiplying the three marginal probabilities would only be correct if the
# comparisons were independent, which they are not (they share means).
spring, summer, autumn, winter = chain['mu'].T  # column order follows `levels`
val_1 = ((winter > autumn) & (autumn > spring) & (spring > summer)).mean()
print('Winter > Autumn > Spring > Summer: {:.3f} % '.format(val_1 * 100))

### RQ2: 冬 > (秋、春) > 夏 の順にガス濃度が高い

In [None]:
# RQ2: probability that Winter exceeds both Autumn and Spring, and both of
# those exceed Summer (no ordering imposed between Autumn and Spring).
# The four inequalities are evaluated jointly per posterior draw; a product of
# marginal probabilities would wrongly treat the comparisons as independent.
spring, summer, autumn, winter = chain['mu'].T  # column order follows `levels`
val_2 = (
    (winter > autumn)
    & (winter > spring)
    & (autumn > summer)
    & (spring > summer)
).mean()
print('Winter > (Autumn, Spring) > Summer: {:.3f} % '.format(val_2 * 100))

### RQ3: 冬 > 冬以外

In [None]:
# RQ3: probability that Winter exceeds every other season at the same time.
# The three inequalities are evaluated jointly per posterior draw; a product of
# marginal probabilities would wrongly treat the comparisons as independent
# even though all of them involve the Winter mean.
spring, summer, autumn, winter = chain['mu'].T  # column order follows `levels`
val_3 = ((winter > spring) & (winter > summer) & (winter > autumn)).mean()
print('Winter > Spring, Summer, Autumn: {:.3f} % '.format(val_3 * 100))

### RQ4: 冬 > 春、夏

In [None]:
# RQ4: probability that Winter exceeds both Spring and Summer simultaneously.
# Both inequalities are evaluated jointly per posterior draw; a product of the
# two marginal probabilities would wrongly treat them as independent.
spring, summer, autumn, winter = chain['mu'].T  # column order follows `levels`
val_4 = ((winter > spring) & (winter > summer)).mean()
# The label names both seasons covered by the hypothesis.
print('Winter > Spring, Summer: {:.3f} % '.format(val_4 * 100))

### 特に興味のある2水準間の比較（冬と春、冬と夏）

In [None]:
# Posterior mean, SD and 95% interval of the two contrasts of interest.
# NOTE(review): the unit printed here is "x 10^-3 %", whereas the factor-SD
# summary earlier in the notebook prints "x 10^-3 ppm" — confirm which is right.
for template, contrast in (
    ('The gas density in Winter is average {:.2f} x 10^-3 % ({:.3f}) [{:.3f}, {:.3f}] higher than Spring.', 'mu4 - mu1'),
    ('The gas density in Winter is average {:.2f} x 10^-3 % ({:.3f}) [{:.3f}, {:.3f}] higher than Summer.', 'mu4 - mu2'),
):
    difference = chain[contrast]
    print(template.format(
        difference.mean(),
        difference.std(),
        np.quantile(difference, 0.025),
        np.quantile(difference, 0.975),
    ))