# Pitching Problem

In [None]:
# Enable the commands below when running this program on Google Colab.
# !pip install arviz==0.7
# !pip install pymc3==3.8
# !pip install Theano==1.0.4

import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

import pymc3 as pm
import theano.tensor as tt

import statistics
import math

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names. Use the legacy name when it is
# installed and fall back to the renamed sheet otherwise.
plot_style = 'seaborn-darkgrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-v0_8-darkgrid'
plt.style.use(plot_style)

# Show three decimals in NumPy and pandas output.
np.set_printoptions(precision=3)
pd.set_option('display.precision', 3)

In [None]:
# Factor A is the runner situation: the `_w` lists below are plotted as
# "w/ runner" and the `_wo` lists as "w/o runner".
# Factor B is the pitch type; this order is reused for every table below.
factor_B = ['straight', 'cut', 'fork', 'change-up', 'slider', 'curve']

# Measured speeds per pitch type, with a runner
# (presumably km/h, as in the result messages further down).
straight_w = [140, 146, 149, 136, 147, 147, 143, 143, 143, 141]
cut_w = [139, 136, 136, 140, 135, 132, 140, 134]
fork_w = [123, 127, 131, 130, 138, 128, 129]
change_up_w = [115, 120, 118, 118, 121, 124, 129, 119, 128]
slider_w = [128, 124, 123, 121, 122, 126, 131, 122]
curve_w = [121, 121, 120, 116, 117, 113, 118]

# Measured speeds per pitch type, without a runner.
straight_wo = [143, 141, 142, 145, 149, 145, 143, 141, 142, 155]
cut_wo = [138, 134, 142, 136, 135, 136, 131, 133]
fork_wo = [131, 128, 128, 128, 127, 130, 130]
change_up_wo = [117, 125, 132, 122, 119, 122, 129, 117, 127]
slider_wo = [117, 120, 124, 122, 122, 122, 118, 122]
curve_wo = [119, 125, 122, 116, 119, 113, 122]

In [None]:
# Data visualization: one box per (pitch type, runner situation) cell.
# Label and data are kept side by side so they cannot drift out of order.
boxplot_cells = [
    ('Straight (w/ runner)', straight_w),
    ('Cut (w/ runner)', cut_w),
    ('Fork (w/ runner)', fork_w),
    ('Change-up (w/ runner)', change_up_w),
    ('Slider (w/ runner)', slider_w),
    ('Curve (w/ runner)', curve_w),
    ('Straight (w/o runner)', straight_wo),
    ('Cut (w/o runner)', cut_wo),
    ('Fork (w/o runner)', fork_wo),
    ('Change-up (w/o runner)', change_up_wo),
    ('Slider (w/o runner)', slider_wo),
    ('Curve (w/o runner)', curve_wo),
]

fig, ax = plt.subplots(figsize=(10, 5))
ax.boxplot([speeds for _, speeds in boxplot_cells])
# Label the ticks on the axes rather than through boxplot(labels=...): that
# keyword was renamed in newer Matplotlib releases, while set_xticklabels
# works on old and new versions alike. Right-aligning keeps each rotated
# label anchored under its own box.
ax.set_xticklabels([label for label, _ in boxplot_cells], rotation=45, ha='right')
ax.set_title('Pitch speed by pitch type and runner situation')
ax.set_ylabel('Speed (km/h)')
plt.show()

## Bayesian analysis

In [None]:
# The twelve cells in a fixed order: the six pitch types with a runner,
# followed by the same six pitch types without a runner.
cells = [
    straight_w, cut_w, fork_w, change_up_w, slider_w, curve_w,
    straight_wo, cut_wo, fork_wo, change_up_wo, slider_wo, curve_wo,
]

# Every measurement flattened into a single vector, cell after cell.
observed = np.concatenate(cells)
print(len(observed))
print(observed)

# For each measurement, the 0-based index (0..11) of the cell it belongs to;
# used to pick the matching cell mean in the likelihood.
id_jk = [cell_index for cell_index, speeds in enumerate(cells) for _ in speeds]
print(len(id_jk))
print(id_jk)

In [None]:
# Fixed sampler seed so that a fresh "Restart & Run All" reproduces the same
# posterior draws (and therefore the same tables and probabilities below).
RANDOM_SEED = 42

with pm.Model() as Model:
    # Two-factor model with interaction. Each cell mean is
    #     mu_jk = mu + a_j + b_k + ab_jk
    # where j indexes factor A (2 levels) and k indexes factor B (6 levels).
    # The effects of each factor, and every row/column of the interaction,
    # are constrained to sum to zero; the constrained terms are therefore
    # derived from the free ones instead of being given their own prior.

    # Prior distribution
    mu = pm.Uniform('mu', 0, 200)  # total mean
    sigma_e = pm.Uniform('sigma', 0, 50)  # error SD; stored in the trace as 'sigma'
    a1 = pm.Uniform('a1', -100, 100)
    a2 = -a1  # a1+a2=0
    b1 = pm.Uniform('b1', -100, 100)
    b2 = pm.Uniform('b2', -100, 100)
    b3 = pm.Uniform('b3', -100, 100)
    b4 = pm.Uniform('b4', -100, 100)
    b5 = pm.Uniform('b5', -100, 100)
    b6 = pm.Deterministic('b6', -1 * (b1 + b2 + b3 + b4 + b5))  # b1+b2+b3+b4+b5+b6=0
    ab11 = pm.Uniform('ab11', -100, 100)
    ab21 = -ab11  # ab11+ab21=0
    ab12 = pm.Uniform('ab12', -100, 100)
    ab22 = -ab12  # ab12+ab22=0
    ab13 = pm.Uniform('ab13', -100, 100)
    ab23 = -ab13  # ab13+ab23=0
    ab14 = pm.Uniform('ab14', -100, 100)
    ab24 = -ab14  # ab14+ab24=0
    ab15 = pm.Uniform('ab15', -100, 100)
    ab25 = -ab15  # ab15+ab25=0
    ab16 = pm.Deterministic('ab16', -1 * (ab11 + ab12 + ab13 + ab14 + ab15))  # ab11+ab12+ab13+ab14+ab15+ab16=0
    ab26 = -ab16  # ab16+ab26=0

    # Cell means stacked in the same order as the cell indices in `id_jk`
    # (0..5 -> cells 11..16, 6..11 -> cells 21..26).
    mu_jk = tt.as_tensor_variable([
        mu + a1 + b1 + ab11,  # 11
        mu + a1 + b2 + ab12,  # 12
        mu + a1 + b3 + ab13,  # 13
        mu + a1 + b4 + ab14,  # 14
        mu + a1 + b5 + ab15,  # 15
        mu + a1 + b6 + ab16,  # 16
        mu + a2 + b1 + ab21,  # 21
        mu + a2 + b2 + ab22,  # 22
        mu + a2 + b3 + ab23,  # 23
        mu + a2 + b4 + ab24,  # 24
        mu + a2 + b5 + ab25,  # 25
        mu + a2 + b6 + ab26,  # 26
    ])

    # Likelihood: each observation is normal around the mean of its own cell,
    # with a common standard deviation sigma_e.
    y_pred = pm.Normal('y_pred', mu=mu_jk[id_jk], sigma=sigma_e, observed=observed)

    # Variance of factors: root mean square of the effects of each term.
    sigma_a = pm.Deterministic('sigma_a', pm.math.sqrt((a1**2 + a2**2) / 2))
    sigma_b = pm.Deterministic('sigma_b', pm.math.sqrt((b1**2 + b2**2 + b3**2 + b4**2 + b5**2 + b6**2) / 6))
    sigma_ab = pm.Deterministic('sigma_ab', pm.math.sqrt((ab11**2 + ab12**2 + ab13**2 + ab14**2 + ab15**2 + ab16**2 + ab21**2 + ab22**2 + ab23**2 + ab24**2 + ab25**2 + ab26**2) / (2 * 6)))

    # Coefficient of determination: share of the total variance
    # (A + B + interaction + error) attributed to each term.
    sigma_y_square = sigma_a**2 + sigma_b**2 + sigma_ab**2 + sigma_e**2
    eta_a_square = pm.Deterministic('eta_a_square', sigma_a**2 / sigma_y_square)
    eta_b_square = pm.Deterministic('eta_b_square', sigma_b**2 / sigma_y_square)
    eta_ab_square = pm.Deterministic('eta_ab_square', sigma_ab**2 / sigma_y_square)
    eta_t_square = pm.Deterministic('eta_t_square', (sigma_a**2 + sigma_b**2 + sigma_ab**2) / sigma_y_square)

    # Effect size: SD of each term relative to the error SD.
    delta_a = pm.Deterministic('delta_a', sigma_a / sigma_e)
    delta_b = pm.Deterministic('delta_b', sigma_b / sigma_e)
    delta_ab = pm.Deterministic('delta_ab', sigma_ab / sigma_e)

    # Average of interaction: the twelve cell means, recorded in the trace
    # under the names 'mu<j><k>' so they can be compared after sampling.
    mu11 = pm.Deterministic('mu11', mu + a1 + b1 + ab11)
    mu12 = pm.Deterministic('mu12', mu + a1 + b2 + ab12)
    mu13 = pm.Deterministic('mu13', mu + a1 + b3 + ab13)
    mu14 = pm.Deterministic('mu14', mu + a1 + b4 + ab14)
    mu15 = pm.Deterministic('mu15', mu + a1 + b5 + ab15)
    mu16 = pm.Deterministic('mu16', mu + a1 + b6 + ab16)

    mu21 = pm.Deterministic('mu21', mu + a2 + b1 + ab21)
    mu22 = pm.Deterministic('mu22', mu + a2 + b2 + ab22)
    mu23 = pm.Deterministic('mu23', mu + a2 + b3 + ab23)
    mu24 = pm.Deterministic('mu24', mu + a2 + b4 + ab24)
    mu25 = pm.Deterministic('mu25', mu + a2 + b5 + ab25)
    mu26 = pm.Deterministic('mu26', mu + a2 + b6 + ab26)

    # 21000 draws in each of 5 chains; seeded for reproducibility.
    trace = pm.sample(21000, chains=5, random_seed=RANDOM_SEED)

In [None]:
# Keep the trace from index 1000 onwards (presumably discarded as additional
# burn-in); every analysis below reads from `chain`.
chain = trace[1000:]
# Trace plots of the retained draws for a visual check of the sampler.
pm.traceplot(chain)
plt.show()

In [None]:
# Posterior summary table of the retained draws (shown as the cell output).
pm.summary(chain)

### 水準とセルの効果の有無（どの水準が、あるいはどの交互作用項が、ある基準より大きい、または小さいかという確信が持てるか）

In [None]:
# Effects whose sign is of interest: the free level of factor A, all six
# levels of factor B, and the six interaction terms of the first A level.
effect_names = ['a1', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'ab11', 'ab12', 'ab13', 'ab14', 'ab15', 'ab16']

# Share of posterior draws in which each effect is positive / negative.
# The row labels state the condition explicitly: the previous labels
# '0 >' and '0 <' read literally as the opposite of what the rows contain.
result_df = pd.DataFrame(
    [
        [(chain[name] > 0).mean() for name in effect_names],
        [(chain[name] < 0).mean() for name in effect_names],
    ],
    index=['P(effect > 0)', 'P(effect < 0)'],
    columns=effect_names,
)
display(result_df)
# The credible interval of a1, the effect of factor A (runner), is roughly
# centred on 0, so no effect of factor A can be confirmed.
# The effect of factor B (pitch type), on the other hand, can be confirmed.

### 要因の効果の大きさ（個々の水準の項や交互作用項の効果の有無ではなく、効果の全体的な大きさはどれほどか？）

In [None]:
# Each report line shows: posterior mean, posterior SD, the 2.5% and 97.5%
# quantiles, and the mean again rescaled for display.
#
# Reading the coefficient of determination (CoD):
#   CoD = 0 (0%)   -> the factor does not explain the observed data at all.
#   CoD = 1 (100%) -> the factor explains the observed data well.
factor_b_reports = [
    ('Effect (SD) of Factor B: {:.3f} ({:.3f}) [{:.3f}, {:.3f}] = {:.1f} km/h', 'sigma_b', 1),
    ('CoD: {:.3f} ({:.3f}) [{:.3f}, {:.3f}] = {:.1f} %', 'eta_b_square', 100),
    ('Effect size: {:.3f} ({:.3f}) [{:.3f}, {:.3f}] = {:.1f} %', 'delta_b', 100),
]

for template, variable, display_scale in factor_b_reports:
    draws = chain[variable]
    lower = np.quantile(draws, 0.025)
    upper = np.quantile(draws, 0.975)
    print(template.format(draws.mean(), draws.std(), lower, upper, draws.mean() * display_scale))

### 行の水準が列の水準より大きい確率（要因Bに関して）

In [None]:
def mu(i, samples=None):
    """Return the posterior draws of the marginal mean of factor-B level ``i``.

    The marginal mean averages the two cells that share level ``i`` of
    factor B, e.g. ``mu(1) = (mu11 + mu21) / 2``.

    Parameters
    ----------
    i : int
        1-based level of factor B; it is appended to ``'mu1'`` and ``'mu2'``
        to build the variable names that are looked up.
    samples : optional
        Any object indexable by variable name (``samples['mu11']`` ...).
        Defaults to the notebook-level ``chain``, so existing ``mu(i)`` calls
        behave as before; passing it explicitly lets the helper be used on
        another trace without relying on global state.

    Returns
    -------
    The element-wise average of ``samples['mu1<i>']`` and ``samples['mu2<i>']``.
    """
    if samples is None:
        samples = chain
    level = str(i)
    return (samples['mu1' + level] + samples['mu2' + level]) * 0.5

In [None]:
def compare(i, j):
    """Share of posterior draws in which level ``i`` exceeds level ``j``.

    ``i`` and ``j`` are 0-based positions; they are shifted by one before
    being passed to ``mu``, which expects 1-based levels.
    """
    row_level = mu(i + 1)
    column_level = mu(j + 1)
    row_is_larger = row_level > column_level
    return row_is_larger.mean()

In [None]:
# Pairwise table: entry (row, column) is the share of posterior draws in
# which the row pitch type has a larger marginal mean than the column one.
comparison_rows = []
for row_index in range(6):
    comparison_rows.append([compare(row_index, column_index) for column_index in range(6)])

result = pd.DataFrame(comparison_rows, index=factor_B, columns=factor_B)
display(result)

### 特に興味のある2セル間の推測

In [None]:
# Change-up (level 4) vs curve (level 6): posterior mean of the difference
# between their marginal means.
changeup_minus_curve = mu(4) - mu(6)
speed_gap = changeup_minus_curve.mean()
print('The speed of change-up is {:.3f} km/h faster than that of curve.'.format(speed_gap))

### RQ1: ストレートは他のどの球種より速い

In [None]:
# Posterior probability that straight (level 1) is faster than every other
# pitch type. All five comparisons must hold within the SAME posterior draw,
# so the conditions are AND-ed draw by draw before averaging. Multiplying the
# five marginal probabilities would only be valid if the comparisons were
# independent, which they are not (they all share mu(1)).
mu_straight = mu(1)
val_1 = (
    (mu_straight > mu(2))
    & (mu_straight > mu(3))
    & (mu_straight > mu(4))
    & (mu_straight > mu(5))
    & (mu_straight > mu(6))
).mean()
print('Prob(Straight > other pitch) = {:.3f}'.format(val_1))

### RQ2: カットはストレート以外のどの球種よりも速い

In [None]:
# Posterior probability of the ordering straight > cut > {fork, change-up,
# slider, curve}. The joint probability is the share of posterior draws in
# which ALL conditions hold at once; the conditions share mu(2), so a product
# of their marginal probabilities would not be the joint probability.
mu_cut = mu(2)
val_2 = (
    (mu(1) > mu_cut)
    & (mu_cut > mu(3))
    & (mu_cut > mu(4))
    & (mu_cut > mu(5))
    & (mu_cut > mu(6))
).mean()
print('Prob(Straight > Cut > other pitch) = {:.3f}'.format(val_2))

### RQ3: フォーク・カット・ストレートはチェンジアップ・スライダー・カーブよりも速い

In [None]:
# Posterior probability of the ordering straight > cut > fork > {change-up,
# slider, curve}. As a compound hypothesis it is evaluated per posterior
# draw (logical AND of all conditions) and then averaged; multiplying the
# marginal probabilities would wrongly treat the comparisons as independent.
mu_cut = mu(2)
mu_fork = mu(3)
val_3 = (
    (mu(1) > mu_cut)
    & (mu_cut > mu_fork)
    & (mu_fork > mu(4))
    & (mu_fork > mu(5))
    & (mu_fork > mu(6))
).mean()
print('Prob(Straight > Cut > fork > other pitch) = {:.3f}'.format(val_3))