In [None]:
# !pip install arviz==0.6.1
# !pip install pymc3==3.8
# !pip install Theano==1.0.4

import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

import pymc3 as pm
import theano.tensor as tt

import statistics
import math

# Notebook-wide display settings: show three digits of precision in both
# pandas and NumPy output, and draw every figure with the seaborn dark-grid
# matplotlib style sheet.
pd.options.display.precision = 3
np.set_printoptions(precision=3)
plt.style.use('seaborn-darkgrid')

In [None]:
# Hotel prices in San Francisco and Los Angeles.
# Two-factor design: City (San Francisco / Los Angeles) x Area (Downtown /
# Suburbs), i.e. two levels per factor and nine observed prices per cell.
price_cells = [
    ('San Francisco', 'Downtown', [79, 107, 103, 92, 180, 165, 240, 265, 300]),
    ('Los Angeles', 'Downtown', [95, 99, 70, 116, 170, 145, 205, 200, 210]),
    ('San Francisco', 'Suburbs', [75, 60, 60, 94, 119, 100, 102, 125, 165]),
    ('Los Angeles', 'Suburbs', [153, 78, 75, 92, 115, 155, 250, 340, 380]),
]

# Flatten to one row per hotel, keeping the cell-by-cell order listed above.
data = pd.DataFrame(
    [[price, city, area]
     for city, area, prices in price_cells
     for price in prices],
    columns=['Price', 'City', 'Area'])
display(data)

In [None]:
# Split the observations into the four City x Area cells of the design.
# (An equivalent selection is data.query('City == "Los Angeles" and Area == "Downtown"').)
sf_downtown = data[(data['City'] == 'San Francisco') & (data['Area'] == 'Downtown')]
sf_suburbs = data[(data['City'] == 'San Francisco') & (data['Area'] == 'Suburbs')]
la_downtown = data[(data['City'] == 'Los Angeles') & (data['Area'] == 'Downtown')]
la_suburbs = data[(data['City'] == 'Los Angeles') & (data['Area'] == 'Suburbs')]

# Select the numeric column *before* averaging: DataFrame.mean() on a frame
# that still contains the string columns 'City' and 'Area' raises TypeError
# on pandas >= 2.0 (and only worked on older versions by silently dropping them).
print('Mean price in San Francisco (Downtown): {:.3f}'.format(sf_downtown['Price'].mean()))
print('Mean price in San Francisco (Suburbs): {:.3f}'.format(sf_suburbs['Price'].mean()))

print('Mean price in Los Angeles (Downtown): {:.3f}'.format(la_downtown['Price'].mean()))
print('Mean price in Los Angeles (Suburbs): {:.3f}'.format(la_suburbs['Price'].mean()))

In [None]:
# Data visualization

# Box plot of the raw prices in each City x Area cell.
plt.boxplot(
    [sf_downtown['Price'],
     la_downtown['Price'],
     sf_suburbs['Price'],
     la_suburbs['Price']],
    labels=['S.F. (Downtown)', 'L.A. (Downtown)', 'S.F. (Suburbs)', 'L.A. (Suburbs)'])
# Upper limit is 400 so the most expensive L.A. suburbs hotel (380) is not
# cut off by the axis.
plt.ylim(50, 400)
plt.ylabel('Price')
plt.show()

# Interaction plot: mean price per city, one line per area. Non-parallel
# lines hint at a City x Area interaction.
# The 'Price' column is selected before averaging because DataFrame.mean()
# on a frame with string columns raises TypeError on pandas >= 2.0.
cities = np.array(['S.F.', 'L.A.'])
downtown_means = np.array([sf_downtown['Price'].mean(), la_downtown['Price'].mean()])
suburbs_means = np.array([sf_suburbs['Price'].mean(), la_suburbs['Price'].mean()])
plt.plot(cities, downtown_means, marker='o')
plt.plot(cities, suburbs_means, marker='o')
plt.ylim(90, 200)
plt.ylabel('Mean price')
plt.legend(['Downtown', 'Suburbs'])
plt.show()

## Bayesian Analysis

In [None]:
# Stack the prices cell by cell in the order
# S.F. downtown, S.F. suburbs, L.A. downtown, L.A. suburbs (9 prices each).
observed = np.concatenate(
    [cell['Price'] for cell in (sf_downtown, sf_suburbs, la_downtown, la_suburbs)])
print(len(observed), observed, sep='\n')

# City index per observation: 0 for the first 18 rows (S.F.), 1 for the rest (L.A.).
id_j = [city for city in (0, 1) for _ in range(18)]
print(len(id_j), id_j, sep='\n')

# Area index per observation: 0 = downtown, 1 = suburbs, repeated for each city.
id_k = ([0] * 9 + [1] * 9) * 2
print(len(id_k), id_k, sep='\n')

# Flat cell index per observation: 0..3, one value per block of 9 rows.
id_jk = [cell_index for cell_index in range(4) for _ in range(9)]
print(len(id_jk), id_jk, sep='\n')

In [None]:
# Toy check of paired integer-array indexing on a 2x2 array whose entries
# equal their flat cell index.
mu_jk = np.arange(4).reshape(2, 2)
print(mu_jk)
# Passing both index lists at once picks element (id_j[i], id_k[i]) for each
# observation i. Chained indexing, mu_jk[id_j][id_k], is NOT equivalent: it
# selects whole rows twice.
print(mu_jk[id_j, id_k])

In [None]:
with pm.Model() as tw_anova_model:
    # Two-way ANOVA with interaction on the hotel prices.
    #
    # Prior distributions
    mu = pm.Uniform('mu', 0, 1000)  # total (grand) mean
    sigma_e = pm.Uniform('sigma', 0, 500)  # residual standard deviation
    a_1 = pm.Uniform('a_1', -1000, 1000)  # main effect of A (City)
    b_1 = pm.Uniform('b_1', -1000, 1000)  # main effect of B (Area)
    ab_11 = pm.Uniform('ab_11', -1000, 1000)  # interaction

    # Cell means. With two levels per factor the second-level effects are the
    # negatives of the first-level ones (a_2 = -a_1, b_2 = -b_1,
    # ab_12 = ab_21 = -ab_11, ab_22 = ab_11). The entries are ordered to match
    # id_jk, i.e. the order in which `observed` was concatenated:
    #   0: S.F. downtown, 1: S.F. suburbs, 2: L.A. downtown, 3: L.A. suburbs.
    # (Previously all four entries were the same expression, so a_1, b_1 and
    # ab_11 could not be told apart by the data.)
    mu_jk = tt.stack([
        mu + a_1 + b_1 + ab_11,  # city 1, area 1
        mu + a_1 - b_1 - ab_11,  # city 1, area 2
        mu - a_1 + b_1 - ab_11,  # city 2, area 1
        mu - a_1 - b_1 + ab_11,  # city 2, area 2
    ])

    # Likelihood: each price is Normal around its cell mean with the shared
    # residual SD. The scale must be sigma_e itself; feeding a Normal(0, sigma_e)
    # draw in as `sigma` (as before) allows negative standard deviations.
    y_pred = pm.Normal('y_pred', mu=mu_jk[id_jk], sigma=sigma_e, observed=observed)

    trace = pm.sample(21000, chains=5)

In [None]:
# Discard the first 1000 retained draws of the trace before inspecting it,
# then plot the marginal posteriors and sampling paths of what is left.
n_discard = 1000
chain = trace[n_discard:]
pm.traceplot(chain)
plt.show()

In [None]:
# Tabulate summary statistics of the sliced trace (`chain`, i.e. the draws
# from index 1000 onward); as the last expression it is rendered as the cell output.
pm.summary(chain)