In [11]:
import pandas as pd
import pymc3 as pm
import numpy as np
import theano.tensor as tt

data = pd.read_csv('statcast_data/2023.csv')

In [12]:
pitcher_counts = data['pitcher'].value_counts()
data = data[data['pitcher'].isin(pitcher_counts[pitcher_counts > 100].index)]
data = data.loc[data['pitch_type'].isin(['FF', 'SI', 'FC'])]

In [13]:
data = data.sample(10000)

In [14]:
data['stand'] = (data['stand'] == 'L').astype(int)

In [15]:
data['pitch_type'] = data['pitch_type'].astype('category').cat.codes.values

In [16]:
data = data[['pitcher', 'plate_x', 'plate_z', 'stand', 'pitch_type', 'balls', 'strikes']]
data.head()


Unnamed: 0,pitcher,plate_x,plate_z,stand,pitch_type,balls,strikes
85929,621112,-0.09,0.73,1,2,1,0
366773,657612,-0.12,3.73,0,1,0,2
190975,502043,0.07,3.48,1,2,2,2
590329,680686,-0.28,2.13,0,2,0,0
156790,592791,-0.36,1.88,0,2,0,1


In [17]:
data = data.dropna()

In [18]:
# from sklearn.preprocessing import OneHotEncoder

# pitch_type = data['pitch_type'].values.reshape(-1, 1)
# encoder = OneHotEncoder(sparse=False)
# pitch_type_2d = encoder.fit_transform(pitch_type)




# with pm.Model() as model:
#     # Hyperpriors
#     mu_x = pm.Normal('mu_x', mu=0, sd=1)
#     mu_z = pm.Normal('mu_z', mu=0, sd=1)
#     sigma_x = pm.HalfNormal('sigma_x', sd=1)
#     sigma_z = pm.HalfNormal('sigma_z', sd=1)
    
#     # Priors for each pitcher
#     pitcher_mu_x = pm.Normal('pitcher_mu_x', mu=mu_x, sd=sigma_x, shape=num_pitchers)
#     pitcher_mu_z = pm.Normal('pitcher_mu_z', mu=mu_z, sd=sigma_z, shape=num_pitchers)

#     # Covariate coefficients
#     beta_stand_x = pm.Normal('beta_stand_x', 0, sd=1)
#     beta_stand_z = pm.Normal('beta_stand_z', 0, sd=1)
#     beta_pitch_type_x = pm.Normal('beta_pitch_type_x', 0, sd=1, shape=num_pitch_types)
#     beta_pitch_type_z = pm.Normal('beta_pitch_type_z', 0, sd=1, shape=num_pitch_types)
#     beta_balls_x = pm.Normal('beta_balls_x', 0, sd=1)
#     beta_balls_z = pm.Normal('beta_balls_z', 0, sd=1)
#     beta_strikes_x = pm.Normal('beta_strikes_x', 0, sd=1)
#     beta_strikes_z = pm.Normal('beta_strikes_z', 0, sd=1)

#     # Linear model for means
#     mu_observed_x = (pitcher_mu_x[pitcher_idx] + 
#                      beta_stand_x * stand + 
#                      tt.dot(beta_pitch_type_x, pitch_type_2d.T) + 
#                      beta_balls_x * balls + 
#                      beta_strikes_x * strikes)

#     mu_observed_z = (pitcher_mu_z[pitcher_idx] + 
#                      beta_stand_z * stand + 
#                      tt.dot(beta_pitch_type_z, pitch_type_2d.T) + 
#                      beta_balls_z * balls + 
#                      beta_strikes_z * strikes)

#     # Likelihood
#     likelihood_x = pm.Normal('likelihood_x', mu=mu_observed_x, sd=obs_sd_x, observed=observed_x)
#     likelihood_z = pm.Normal('likelihood_z', mu=mu_observed_z, sd=obs_sd_z, observed=observed_z)

#     # Sample
#     trace = pm.sample(2000)


stan code

In [24]:
import cmdstanpy

num_pitchers = data['pitcher'].nunique()
pitcher_idx = data['pitcher'].astype('category').cat.codes.values
num_pitch_types = data['pitch_type'].nunique()
stand = data['stand'].values
balls = data['balls'].values
strikes = data['strikes'].values
observed_x = data['plate_x'].values
observed_z = data['plate_z'].values
obs_sd_x = np.std(observed_x)
obs_sd_z = np.std(observed_z)

stan_model_code ="""
data {
  int<lower=0> N;  // number of observations
  int<lower=0> K;  // number of pitch types
  int<lower=0> P;  // number of pitchers
  int<lower=0> pitch_type[N];  // pitch type for each observation
  int<lower=0> pitcher[N];  // pitcher for each observation
  int<lower=0> stand[N];  // stand for each observation
  real balls[N];  // balls for each observation
  real strikes[N];  // strikes for each observation
  real observed_x[N];  // observed x for each observation
  real observed_z[N];  // observed z for each observation
}
parameters {
  real mu_x;
  real mu_z;
  real<lower=0> sigma_x;
  real<lower=0> sigma_z;
  real pitcher_mu_x[P];
  real pitcher_mu_z[P];
  real beta_stand_x;
  real beta_stand_z;
  real beta_pitch_type_x[K];
  real beta_pitch_type_z[K];
  real beta_balls_x;
  real beta_balls_z;
  real beta_strikes_x;
  real beta_strikes_z;
}
model {
  mu_x ~ normal(0, 1);
  mu_z ~ normal(0, 1);
  sigma_x ~ normal(0, 1);
  sigma_z ~ normal(0, 1);
  pitcher_mu_x ~ normal(mu_x, sigma_x);
  pitcher_mu_z ~ normal(mu_z, sigma_z);
  beta_stand_x ~ normal(0, 1);
  beta_stand_z ~ normal(0, 1);
  beta_pitch_type_x ~ normal(0, 1);
  beta_pitch_type_z ~ normal(0, 1);
  beta_balls_x ~ normal(0, 1);
  beta_balls_z ~ normal(0, 1);
  beta_strikes_x ~ normal(0, 1);
  beta_strikes_z ~ normal(0, 1);
  for (i in 1:N) {
    observed_x[i] ~ normal(pitcher_mu_x[pitcher[i]] + beta_stand_x * stand[i] + beta_pitch_type_x[pitch_type[i]] + beta_balls_x * balls[i] + beta_strikes_x * strikes[i], sigma_x);
    observed_z[i] ~ normal(pitcher_mu_z[pitcher[i]] + beta_stand_z * stand[i] + beta_pitch_type_z[pitch_type[i]] + beta_balls_z * balls[i] + beta_strikes_z * strikes[i], sigma_z);
  }
}
"""

# Write Stan model to a file
with open('model.stan', 'w') as f:
    f.write(stan_model_code)

# Convert data to dictionary format
data_dict = {
    'N': len(data),
    'K': num_pitch_types,
    'P': num_pitchers,
    'pitch_type': data['pitch_type'].values + 1,
    'pitcher': pitcher_idx + 1,
    'stand': stand,
    'balls': balls,
    'strikes': strikes,
    'observed_x': observed_x,
    'observed_z': observed_z
}

In [28]:
# Compile and run Stan model
model = cmdstanpy.CmdStanModel(stan_file='model.stan')
fit = model.sample(data=data_dict, iter_sampling=2000)

# Print summary of the fit
print(fit.summary())

15:27:07 - cmdstanpy - INFO - compiling stan file C:\Users\wampl\pitching_model\model.stan to exe file C:\Users\wampl\pitching_model\model.exe


ValueError: Failed to compile Stan model 'C:\Users\wampl\pitching_model\model.stan'. Console:

--- Translating Stan model to C++ code ---
bin/stanc.exe  --o=C:/Users/wampl/pitching_model/model.hpp C:/Users/wampl/pitching_model/model.stan
Error in 'C:/Users/wampl/pitching_model/model.stan', line 24, column 2: Declaration
    of arrays by placing brackets after a variable name was removed in Stan
    2.33.0. Instead use the array keyword before the type. This can be
    changed automatically using the auto-format flag to stanc
Error in 'C:/Users/wampl/pitching_model/model.stan', line 23, column 2: Declaration
    of arrays by placing brackets after a variable name was removed in Stan
    2.33.0. Instead use the array keyword before the type. This can be
    changed automatically using the auto-format flag to stanc
Error in 'C:/Users/wampl/pitching_model/model.stan', line 20, column 2: Declaration
    of arrays by placing brackets after a variable name was removed in Stan
    2.33.0. Instead use the array keyword before the type. This can be
    changed automatically using the auto-format flag to stanc
Error in 'C:/Users/wampl/pitching_model/model.stan', line 19, column 2: Declaration
    of arrays by placing brackets after a variable name was removed in Stan
    2.33.0. Instead use the array keyword before the type. This can be
    changed automatically using the auto-format flag to stanc
Error in 'C:/Users/wampl/pitching_model/model.stan', line 12, column 2: Declaration
    of arrays by placing brackets after a variable name was removed in Stan
    2.33.0. Instead use the array keyword before the type. This can be
    changed automatically using the auto-format flag to stanc
Error in 'C:/Users/wampl/pitching_model/model.stan', line 11, column 2: Declaration
    of arrays by placing brackets after a variable name was removed in Stan
    2.33.0. Instead use the array keyword before the type. This can be
    changed automatically using the auto-format flag to stanc
Error in 'C:/Users/wampl/pitching_model/model.stan', line 10, column 2: Declaration
    of arrays by placing brackets after a variable name was removed in Stan
    2.33.0. Instead use the array keyword before the type. This can be
    changed automatically using the auto-format flag to stanc
Error in 'C:/Users/wampl/pitching_model/model.stan', line 9, column 2: Declaration
    of arrays by placing brackets after a variable name was removed in Stan
    2.33.0. Instead use the array keyword before the type. This can be
    changed automatically using the auto-format flag to stanc
Error in 'C:/Users/wampl/pitching_model/model.stan', line 8, column 2: Declaration
    of arrays by placing brackets after a variable name was removed in Stan
    2.33.0. Instead use the array keyword before the type. This can be
    changed automatically using the auto-format flag to stanc
Error in 'C:/Users/wampl/pitching_model/model.stan', line 7, column 2: Declaration
    of arrays by placing brackets after a variable name was removed in Stan
    2.33.0. Instead use the array keyword before the type. This can be
    changed automatically using the auto-format flag to stanc
Error in 'C:/Users/wampl/pitching_model/model.stan', line 6, column 2: Declaration
    of arrays by placing brackets after a variable name was removed in Stan
    2.33.0. Instead use the array keyword before the type. This can be
    changed automatically using the auto-format flag to stanc
make/program:48: recipe for target 'C:/Users/wampl/pitching_model/model.hpp' failed
mingw32-make: *** [C:/Users/wampl/pitching_model/model.hpp] Error 65

Command ['mingw32-make', 'C:/Users/wampl/pitching_model/model.exe']
	error during processing No such file or directory


In [27]:
from cmdstanpy import install_cxx_toolchain
config = install_cxx_toolchain.get_config('C:\\RTools', True)
install_cxx_toolchain.get_toolchain_name()

'RTools'