In [None]:
# Enable the commands below when running this program on Google Colab.
# !pip install arviz==0.7
# !pip install pymc3==3.8
# !pip install Theano==1.0.4

import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

import pymc3 as pm
import theano.tensor as tt

import statistics
import math

# Notebook-wide display configuration: plot theme, and three significant
# decimals for both pandas DataFrame rendering and NumPy array printing.
plt.style.use('seaborn-darkgrid')
pd.options.display.precision = 3
np.set_printoptions(precision=3)

In [None]:
# Hotel prices in San Francisco and Los Angeles (one row per observation).
# Two-factor layout with two levels per factor:
#   - City: San Francisco, Los Angeles
#   - Area: Downtown, Suburbs
# Each of the four City x Area cells holds 9 observations (36 rows in total).
# Currency is presumably USD, as the result print-outs later label it so.
data = pd.DataFrame(
    [[79, 'San Francisco', 'Downtown'],
     [107, 'San Francisco', 'Downtown'],
     [103, 'San Francisco', 'Downtown'],
     [92, 'San Francisco', 'Downtown'],
     [180, 'San Francisco', 'Downtown'],
     [165, 'San Francisco', 'Downtown'],
     [240, 'San Francisco', 'Downtown'],
     [265, 'San Francisco', 'Downtown'],
     [300, 'San Francisco', 'Downtown'],
     [75, 'San Francisco', 'Suburbs'],
     [60, 'San Francisco', 'Suburbs'],
     [60, 'San Francisco', 'Suburbs'],
     [94, 'San Francisco', 'Suburbs'],
     [119, 'San Francisco', 'Suburbs'],
     [100, 'San Francisco', 'Suburbs'],
     [102, 'San Francisco', 'Suburbs'],
     [125, 'San Francisco', 'Suburbs'],
     [165, 'San Francisco', 'Suburbs'],
     [95, 'Los Angeles', 'Downtown'],
     [99, 'Los Angeles', 'Downtown'],
     [70, 'Los Angeles', 'Downtown'],
     [116, 'Los Angeles', 'Downtown'],
     [170, 'Los Angeles', 'Downtown'],
     [145, 'Los Angeles', 'Downtown'],
     [205, 'Los Angeles', 'Downtown'],
     [200, 'Los Angeles', 'Downtown'],
     [210, 'Los Angeles', 'Downtown'],
     [153, 'Los Angeles', 'Suburbs'],
     [78, 'Los Angeles', 'Suburbs'],
     [75, 'Los Angeles', 'Suburbs'],
     [92, 'Los Angeles', 'Suburbs'],
     [115, 'Los Angeles', 'Suburbs'],
     [155, 'Los Angeles', 'Suburbs'],
     [250, 'Los Angeles', 'Suburbs'],
     [340, 'Los Angeles', 'Suburbs'],
     [380, 'Los Angeles', 'Suburbs']],
    columns=['Price', 'City', 'Area'])
# Render the full table for a visual sanity check.
display(data)

In [None]:
# Split the observations into the four City x Area cells. The four frames
# defined here are reused by the plotting and modelling cells further down.
is_sf = data['City'] == 'San Francisco'
is_la = data['City'] == 'Los Angeles'
is_downtown = data['Area'] == 'Downtown'
is_suburbs = data['Area'] == 'Suburbs'

sf_downtown = data[is_sf & is_downtown]
sf_suburbs = data[is_sf & is_suburbs]
la_downtown = data[is_la & is_downtown]
la_suburbs = data[is_la & is_suburbs]

# Average the 'Price' column directly. Calling DataFrame.mean() on the whole
# frame would also try to average the string columns 'City' and 'Area',
# which raises a TypeError on recent pandas versions.
print('Mean price in San Francisco (Downtown): {:.3f}'.format(sf_downtown['Price'].mean()))
print('Mean price in San Francisco (Suburbs): {:.3f}'.format(sf_suburbs['Price'].mean()))

print('Mean price in Los Angeles (Downtown): {:.3f}'.format(la_downtown['Price'].mean()))
print('Mean price in Los Angeles (Suburbs): {:.3f}'.format(la_suburbs['Price'].mean()))

In [None]:
# Data Visualization
# Box plot of the price distribution in each City x Area cell.
# NOTE(review): the fixed y-range tops out at 350 while the data contains a
# 380 observation (L.A. Suburbs), so values above 350 are not visible.
plt.boxplot(
    [sf_downtown['Price'],
     la_downtown['Price'],
     sf_suburbs['Price'],
     la_suburbs['Price']],
    labels=['S.F. (Downtown)', 'L.A. (Downtown)', 'S.F. (Suburbs)', 'L.A. (Suburbs)'])
plt.ylim(50, 350)
plt.show()

# Interaction plot: one line per Area, cell means across the two cities.
# Non-parallel (crossing) lines hint at a City x Area interaction.
# The means are taken on the 'Price' Series; DataFrame.mean() would also try
# to average the string columns and fails on recent pandas versions.
cities = np.array(['S.F.', 'L.A.'])
downtown_means = np.array([sf_downtown['Price'].mean(), la_downtown['Price'].mean()])
suburbs_means = np.array([sf_suburbs['Price'].mean(), la_suburbs['Price'].mean()])
plt.plot(cities, downtown_means, marker='o')
plt.plot(cities, suburbs_means, marker='o')
plt.ylim(90, 200)
plt.legend(['Downtown', 'Suburbs'])
plt.show()

## Bayesian Analysis

In [None]:
# Stack the prices of the four cells into one observation vector and build a
# parallel list giving the cell code of every observation.
# Cell codes: 0 = S.F. Downtown, 1 = S.F. Suburbs, 2 = L.A. Downtown,
# 3 = L.A. Suburbs. This order must match the order of the cell means
# (mu_jk) defined in the model cell.
cell_frames = [sf_downtown, sf_suburbs, la_downtown, la_suburbs]

observed = np.concatenate([frame['Price'] for frame in cell_frames])
print(len(observed))
print(observed)

# Derive the codes from the actual group sizes instead of hard-coding
# "9 per cell", so the index cannot drift out of sync with `observed`.
id_jk = [code for code, frame in enumerate(cell_frames) for _ in range(len(frame))]
assert len(id_jk) == len(observed), 'cell index and observations must align'
print(len(id_jk))
print(id_jk)

In [None]:
with pm.Model() as tw_anova_model:
    # Prior distribution
    mu = pm.Uniform('mu', 0, 1000)  # total mean
    sigma_e = pm.Uniform('sigma', 0, 500)
    a_1 = pm.Uniform('a_1', -100, 100)  # main effect of A (City)
    b_1 = pm.Uniform('b_1', -100, 100)  # main effect of B (Area)
    ab_11 = pm.Uniform('ab_11', -100, 100)  # interaction

    mu_jk = tt.as_tensor_variable(
        [mu + a_1 + b_1 + ab_11, mu + a_1 - b_1 - ab_11,
         mu - a_1 + b_1 - ab_11, mu - a_1 - b_1 + ab_11]
    )

    # Likelihood
    y_pred = pm.Normal('y_pred', mu=mu_jk[id_jk], sigma=sigma_e, observed=observed)

    # Variance of factors
    sigma_a = pm.Deterministic('sigma_a', pm.math.sqrt((a_1**2 + (-a_1)**2) / 2))
    sigma_b = pm.Deterministic('sigma_b', pm.math.sqrt((b_1**2 + (-b_1)**2) / 2))
    sigma_ab = pm.Deterministic('sigma_ab', pm.math.sqrt((ab_11**2 + (-ab_11)**2 + (-ab_11)**2 + ab_11**2) / 4))

    # Coefficient of determination
    sigma_y_square = sigma_a**2 + sigma_b**2 + sigma_ab**2 + sigma_e**2
    eta_a_square = pm.Deterministic('eta_a_square', sigma_a**2 / sigma_y_square)
    eta_b_square = pm.Deterministic('eta_b_square', sigma_b**2 / sigma_y_square)
    eta_ab_square = pm.Deterministic('eta_ab_square', sigma_ab**2 / sigma_y_square)
    eta_t_square = pm.Deterministic('eta_t_square', (sigma_a**2 + sigma_b**2 + sigma_ab**2) / sigma_y_square)

    # Effect size
    delta_a = pm.Deterministic('delta_a', sigma_a / sigma_e)
    delta_b = pm.Deterministic('delta_b', sigma_b / sigma_e)
    delta_ab = pm.Deterministic('delta_ab', sigma_ab / sigma_e)

    # Average of interaction
    mu_11 = pm.Deterministic('mu_11', mu + a_1 + b_1 + ab_11)
    mu_12 = pm.Deterministic('mu_12', mu + a_1 - b_1 - ab_11)
    mu_21 = pm.Deterministic('mu_21', mu - a_1 + b_1 - ab_11)
    mu_22 = pm.Deterministic('mu_22', mu - a_1 - b_1 + ab_11)

    # Post analysis
    mu_11_mu_12 = pm.Deterministic('mu_11 - mu_12', mu_11 - mu_12)
    mu_21_mu_22 = pm.Deterministic('mu_21 - mu_22', mu_21 - mu_22)

    delta_11_12 = pm.Deterministic('delta_11_12', mu_11_mu_12 / sigma_e)  # effect size
    delta_21_22 = pm.Deterministic('delta_21_22', mu_21_mu_22 / sigma_e)  # effect size

    trace = pm.sample(21000, chains=5)

In [None]:
# Discard the first 1000 draws of the trace as additional burn-in; all
# later cells work on the remaining samples in `chain`.
chain = trace[1000:]
# Draw the trace plot of the retained samples for a visual check.
pm.traceplot(chain)
plt.show()

In [None]:
# Tabulate summary statistics of the retained posterior samples.
pm.summary(chain)

### 水準とセルの効果の有無（どの水準が、あるいはどの交互作用項が、ある基準より大きい、または小さいかという確信が持てるか）

In [None]:
# Posterior probability that each effect is positive / negative, estimated
# as the fraction of retained draws on each side of zero.
effect_names = ['a_1', 'b_1', 'ab_11']
result_df = pd.DataFrame(
    [[(chain[name] > 0).mean() for name in effect_names],
     [(chain[name] < 0).mean() for name in effect_names]],
    # Row labels state the condition applied to the effect: the first row
    # is P(effect > 0), the second row is P(effect < 0).
    index=['> 0', '< 0'], columns=effect_names)
display(result_df)
# Neither the main effect of City nor the main effect of Area can be
# asserted with confidence (the probabilities are low).
# The probability that the interaction is greater than 0 is around 97.5%,
# so there appears to be an interaction between City and Area.

### 要因の効果の大きさ（個々の水準の項や交互作用項の効果の有無ではなく、効果の全体的な大きさはどれほどか？）

In [None]:
def _posterior_stats(draws):
    """Return (mean, SD, 2.5% quantile, 97.5% quantile) of posterior draws."""
    return (draws.mean(), draws.std(),
            np.quantile(draws, 0.025), np.quantile(draws, 0.975))


# Size of the interaction effect expressed as a standard deviation.
sigma_ab_draws = chain['sigma_ab']
print('Effect (SD) of Interaction: {:.3f} ({:.3f}) [{:.3f}, {:.3f}] = {:.1f} USD'.format(
    *_posterior_stats(sigma_ab_draws), sigma_ab_draws.mean()))

# Coefficient of determination (CoD) of the interaction:
#   CoD = 0 (0%)   -> the factor does not explain the observed data at all.
#   CoD = 1 (100%) -> the factor explains the observed data well.
eta_ab_draws = chain['eta_ab_square']
print('CoD: {:.3f} ({:.3f}) [{:.3f}, {:.3f}] = {:.1f} %'.format(
    *_posterior_stats(eta_ab_draws), eta_ab_draws.mean() * 100))

# Effect size of the interaction, also shown as a percentage.
delta_ab_draws = chain['delta_ab']
print('Effect size: {:.3f} ({:.3f}) [{:.3f}, {:.3f}] = {:.1f} %'.format(
    *_posterior_stats(delta_ab_draws), delta_ab_draws.mean() * 100))

### セル平均の事後分布
2要因の分析で交互作用効果の存在が確信されたら、一方の要因の水準ごとに、他方の要因の水準間の推測をするとデータに対する理解が深まる

### 特に興味のある2セル間の推測

In [None]:
# Report the posterior Downtown-minus-Suburbs price difference within each
# city: mean, (SD) and the [2.5%, 97.5%] quantile interval.
difference_reports = [
    ('mu_11 - mu_12',
     'The hotel price in San Francisco downtown is average {:.2f} usd ({:.3f}) [{:.3f}, {:.3f}] higher than in San Francisco suburbs.'),
    ('mu_21 - mu_22',
     'The hotel price in Los Angeles downtown is average {:.2f} usd ({:.3f}) [{:.3f}, {:.3f}] higher than in Los Angeles suburbs.'),
]
for trace_key, template in difference_reports:
    gap = chain[trace_key]
    lower = np.quantile(gap, 0.025)
    upper = np.quantile(gap, 0.975)
    print(template.format(gap.mean(), gap.std(), lower, upper))