<a href="https://colab.research.google.com/github/smitbajaj/bayes-learning/blob/main/BayesToy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install arviz==0.11.0

In [None]:
## Bayesian Baseball Monkeys -- https://www.pymc-labs.com/blog-posts/bayesian-marcel/

!pip install pybaseball
!pip install pymc
# !pip install arviz

import pandas as pd
import numpy as np
import seaborn as sns
import pymc as pm
import arviz as az
from pybaseball import batting_stats
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Columns kept for the hard-hit analysis: who/when/where, age, and the two counts
# (hard-hit balls and the events they are measured against).
hard_hit_columns = ["Name", "Season", "Team", "Age", "HardHit", "Events"]

# Download batting stats for the 2020 through 2024 seasons via pybaseball.
# qual=1 is passed straight through -- presumably the qualification threshold; confirm
# against the pybaseball documentation.
data = batting_stats(2020, 2024, qual=1)

# Independent copy of just the needed columns, renumbered 0..n-1.
hard_hit_subset = data.loc[:, hard_hit_columns].copy().reset_index(drop=True)

In [None]:
# Show the first three rows to sanity-check the column selection.
hard_hit_subset.head(n=3)

Unnamed: 0,Name,Season,Team,Age,HardHit,Events
0,Aaron Judge,2024,NYY,32,238,391
1,Aaron Judge,2022,NYY,30,246,404
2,Bobby Witt Jr.,2024,KCR,24,259,538


In [None]:
# Seasons that must have a HardHit value for a batter to be kept: 2020 through 2023.
# 2024 is intentionally not required.
required_hard_hit = [("HardHit", season) for season in (2020, 2021, 2022, 2023)]

# Wide layout: one row per Name, columns keyed by (statistic, Season).
# NOTE(review): pivot_table aggregates rows that share a Name and Season (pandas' default
# aggfunc is the mean), so two different players with the same name would be blended
# into one row -- confirm Name is unique per player in this data.
hard_hit_by_season = hard_hit_subset.pivot_table(
  values=["Events", "HardHit", "Age"],
  index=["Name"],
  columns=["Season"],
).dropna(subset=required_hard_hit)

# Set every batter's 2024 age to their 2023 age plus one; this also fills in an age for
# batters that have no 2024 row.
hard_hit_by_season[("Age", 2024)] = hard_hit_by_season[("Age", 2023)] + 1

hard_hit_by_season.head(3)

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Events,Events,Events,Events,Events,HardHit,HardHit,HardHit,HardHit,HardHit
Season,2020,2021,2022,2023,2024,2020,2021,2022,2023,2024,2020,2021,2022,2023,2024
Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
A.J. Pollock,32.0,33.0,34.0,35.0,36.0,153.0,306.0,395.0,103.0,,66.0,144.0,166.0,42.0,
Aaron Hicks,30.0,31.0,32.0,33.0,34.0,131.0,80.0,278.0,201.0,34.0,50.0,31.0,92.0,59.0,12.0
Aaron Judge,28.0,29.0,30.0,31.0,32.0,70.0,397.0,404.0,240.0,391.0,29.0,230.0,246.0,154.0,238.0


In [None]:
# Labels for the model's named dimensions.
# "season" covers only the seasons the model is fitted on; "batter" reuses the row index
# of hard_hit_by_season so positions in the arrays and labels stay aligned.
fit_seasons = [2020, 2021, 2022]
batters = hard_hit_by_season.index.to_numpy()

coords = dict(batter=batters, season=fit_seasons)

In [None]:
def _season_values(stat, seasons, dtype):
  """Return hard_hit_by_season[stat][seasons] as a NumPy array cast to dtype.

  `seasons` may be a list of seasons (2-D result, one column per season) or a single
  season (1-D result). Rows follow the row order of hard_hit_by_season.
  """
  return hard_hit_by_season[stat][seasons].to_numpy().astype(dtype)


# Fit-season inputs: one row per batter, one column per season in fit_seasons.
events_x = _season_values("Events", fit_seasons, np.int32)      # events per season
hard_hit_x = _season_values("HardHit", fit_seasons, np.int32)   # hard hits per season

# Age in 2023 -- named *_y because 2023 is the season being modelled, not a fit season.
age_y = _season_values("Age", 2023, np.float32)

# What actually happened in 2023.
events_y = _season_values("Events", 2023, np.int32)
hard_hit_y = _season_values("HardHit", 2023, np.int32)

In [None]:
# Sanity check: row 0 of events_x should match A.J. Pollock's first three "Events" values
# in the pivoted frame (the frame also carries 2023 and 2024).
first_row_events = events_x[0]
pollock_events = hard_hit_by_season.loc["A.J. Pollock", "Events"].tolist()

print(first_row_events)
print(pollock_events)

[153 306 395]
[153.0, 306.0, 395.0, 103.0, nan]


In [None]:
with pm.Model(coords=coords) as marcel:

  ## Loading in data -- each array is registered on the model under a string name.
  # Only `p` below declares dims; these pm.Data containers are registered without dims,
  # so nothing here ties them to the "batter"/"season" coords by name.
  # NOTE(review): each assignment rebinds the notebook-level name to whatever pm.Data
  # returns, so re-running this cell on its own feeds those objects (not the original
  # arrays) back into pm.Data -- re-run the array-building cell first.

  events_x = pm.Data("events_x", events_x)
  hard_hit_x = pm.Data("hard_hit_x", hard_hit_x)

  # age_y, events_y and hard_hit_y are registered but not referenced by the likelihood below.
  age_y = pm.Data("age_y", age_y)

  events_y = pm.Data("events_y", events_y)
  hard_hit_y = pm.Data("hard_hit_y", hard_hit_y)

  ## Setting priors --

  # Shared mean of the per-batter/season rates: Beta prior given as mean 0.35, sd 0.2.
  mu_p = pm.Beta("mu_p", mu=0.35, sigma=0.2) # -- https://www.desmos.com/calculator/kx83qio7yl
  # Shared spread of the rates: flat prior on [0, 0.5], so the data determine it.
  # NOTE(review): PyMC's mu/sigma form of the Beta needs sigma**2 < mu * (1 - mu); values of
  # sigma_p near 0.5 can violate that for most mu_p -- confirm the sampler copes.
  sigma_p = pm.Uniform("sigma_p", 0, 0.5) # -- uniform distribution - saying we have no clue what the sigma is here.. will let the data tell us (effectively)

  # One rate per (batter, season), all drawn from the shared Beta(mu_p, sigma_p).
  p = pm.Beta("p", mu=mu_p, sigma=sigma_p, dims=("batter", "season"))

  # Likelihood: observed hard hits are Binomial with events_x trials and success rate p.
  pm.Binomial("rate_like", n=events_x, p=p, observed=hard_hit_x)

