In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
import xgboost as xgb
import joblib

from feature_engineering import compute_fastball_relative_features, compute_approach_angles, compute_adjusted_axis_deviation

In [3]:
df = pd.read_csv('statcast_data/df_all_spin.csv')

In [4]:
## Calculate vertical approach angle and horizontal approach angle (degrees) for each pitch
df = compute_approach_angles(df)

## Calculate adjusted axis deviation (degrees normalized for pitcher handedness) for each pitch
df = compute_adjusted_axis_deviation(df)

df = compute_fastball_relative_features(df)

In [5]:
df.description.value_counts()

description
ball                       792435
foul                       418067
hit_into_play              414439
called_strike              390569
swinging_strike            255047
blocked_ball                54861
foul_tip                    23020
swinging_strike_blocked     15260
hit_by_pitch                 7133
foul_bunt                    4029
missed_bunt                   817
pitchout                      127
bunt_foul_tip                  87
foul_pitchout                   1
unknown_strike                  1
Name: count, dtype: int64

In [6]:
csw = ['called_strike', 'swinging_strike', 'swinging_strike_blocked']

In [7]:
df['csw'] = df['description'].isin(csw).astype(int)


In [8]:
fastballs = ['FF', 'SI']
non_fastballs = ['FC', 'CH', 'FS', 'FO', 'KC', 'CU', 'SL', 'ST', 'SV', 'CS', 'SC']

In [10]:
df.loc[df.pitch_type.isin(fastballs), 'csw'].mean().round(3)

0.268

In [11]:
df.loc[df.pitch_type.isin(non_fastballs), 'csw'].mean().round(3)

0.287

In [45]:
df_fastballs = df.loc[(df['pitch_type'].isin(fastballs)) & (df['game_year'].isin([2020, 2021, 2022]))]
df_fastballs_holdout = df.loc[(df['pitch_type'].isin(fastballs)) & (df['game_year']==2023)]

In [46]:
fastball_features = [
    'release_speed', 'ax', 'ay', 'az', 'vaa', 'haa', 'axis_deviation_adj', 'plate_x', 'plate_z'
]
target = 'csw'

In [47]:
df_fastballs[fastball_features].info()

<class 'pandas.core.frame.DataFrame'>
Index: 843425 entries, 0 to 1717441
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   release_speed       843405 non-null  float64
 1   ax                  843416 non-null  float64
 2   ay                  843416 non-null  float64
 3   az                  843416 non-null  float64
 4   vaa                 843416 non-null  float64
 5   haa                 843310 non-null  float64
 6   axis_deviation_adj  825727 non-null  float64
 7   plate_x             843416 non-null  float64
 8   plate_z             843416 non-null  float64
dtypes: float64(9)
memory usage: 64.3 MB


In [48]:
df_fastballs = df_fastballs.dropna(subset=fastball_features)
df_fastballs = df_fastballs.dropna(subset=[target])

df_fastballs_holdout = df_fastballs_holdout.dropna(subset=fastball_features)
df_fastballs_holdout = df_fastballs_holdout.dropna(subset=[target])

In [49]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train, val = train_test_split(df_fastballs, test_size=0.3, random_state=10)

In [50]:
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# Define the objective function for Optuna
def objective(trial: Trial) -> float:
    params = {
        'device': 'cuda',  # Use GPU acceleration
        'n_estimators': trial.suggest_int('n_estimators', 100, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'gamma': trial.suggest_float('gamma', 0.1, 0.3),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 0.1),
        'scale_pos_weight': 2.7  # Class weight for positive class
    }
    model = xgb.XGBClassifier(**params)
    model.fit(train[fastball_features], train[target])

    val_preds = model.predict(val[fastball_features])
    val_error = roc_auc_score(val[target], val_preds)  # Use sklearn's roc auc score

    return val_error

# Run the Optuna optimization
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=25)

# Print the best parameters
print(study.best_params)


[I 2023-09-24 14:24:50,063] A new study created in memory with name: no-name-dd2c94ab-1862-4c30-9d4d-ec5e6159aa38
[I 2023-09-24 14:24:52,986] Trial 0 finished with value: 0.6855525820386597 and parameters: {'n_estimators': 137, 'max_depth': 10, 'learning_rate': 0.07346740023932911, 'gamma': 0.2197316968394073, 'reg_alpha': 0.02404167763981929}. Best is trial 0 with value: 0.6855525820386597.
[I 2023-09-24 14:24:54,051] Trial 1 finished with value: 0.6797600429991083 and parameters: {'n_estimators': 115, 'max_depth': 3, 'learning_rate': 0.08675143843171859, 'gamma': 0.22022300234864176, 'reg_alpha': 0.0737265320016441}. Best is trial 0 with value: 0.6855525820386597.
[I 2023-09-24 14:24:56,431] Trial 2 finished with value: 0.6858345384811926 and parameters: {'n_estimators': 102, 'max_depth': 10, 'learning_rate': 0.08341182143924175, 'gamma': 0.14246782213565523, 'reg_alpha': 0.02636424704863906}. Best is trial 2 with value: 0.6858345384811926.
[I 2023-09-24 14:24:57,623] Trial 3 finishe

{'n_estimators': 102, 'max_depth': 10, 'learning_rate': 0.08341182143924175, 'gamma': 0.14246782213565523, 'reg_alpha': 0.02636424704863906}


In [51]:
params = study.best_params
params['device'] = 'cuda'
xgb_fastball_csw = xgb.XGBRegressor(**params)
xgb_fastball_csw.fit(df_fastballs[fastball_features], df_fastballs[target])

In [52]:
xgb_fastball_csw.save_model('models/xgb_fastball_csw.json')

In [53]:
xgb_fastball_csw = xgb.XGBRegressor()
xgb_fastball_csw.load_model('models/xgb_fastball_csw.json')

In [54]:
df_fastballs_holdout['xgb_csw'] = xgb_fastball_csw.predict(df_fastballs_holdout[fastball_features])

In [55]:
non_fastball_features = [
    'release_speed', 'release_spin_rate', 'ax', 'ay', 'az', 'velo_delta', 'spin_axis_delta',
    'vert_delta', 'horz_delta', 'vaa', 'haa', 'axis_deviation_adj', 'plate_x', 'plate_z'
]
target = 'csw'

In [56]:
df_non_fastballs = df.loc[df['pitch_type'].isin(non_fastballs) & (df['game_year'].isin([2020, 2021, 2022]))]
df_non_fastballs_holdout = df.loc[df['pitch_type'].isin(non_fastballs) & (df['game_year']==2023)]

In [57]:
df_non_fastballs[non_fastball_features].info()

<class 'pandas.core.frame.DataFrame'>
Index: 845674 entries, 1 to 1717442
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   release_speed       845633 non-null  float64
 1   release_spin_rate   841683 non-null  float64
 2   ax                  845643 non-null  float64
 3   ay                  845643 non-null  float64
 4   az                  845643 non-null  float64
 5   velo_delta          830506 non-null  float64
 6   spin_axis_delta     821952 non-null  float64
 7   vert_delta          830515 non-null  float64
 8   horz_delta          830515 non-null  float64
 9   vaa                 845643 non-null  float64
 10  haa                 845539 non-null  float64
 11  axis_deviation_adj  821952 non-null  float64
 12  plate_x             845643 non-null  float64
 13  plate_z             845643 non-null  float64
dtypes: float64(14)
memory usage: 96.8 MB


In [58]:
df_non_fastballs = df_non_fastballs.dropna(subset=non_fastball_features)
df_non_fastballs = df_non_fastballs.dropna(subset=[target])

df_non_fastballs_holdout = df_non_fastballs_holdout.dropna(subset=non_fastball_features)
df_non_fastballs_holdout = df_non_fastballs_holdout.dropna(subset=[target])

In [59]:
# Split the data into training and testing sets
train, val = train_test_split(df_non_fastballs, test_size=0.3, random_state=10)

In [60]:
# Define the objective function for Optuna
def objective(trial: Trial) -> float:
    params = {
        'device': 'cuda',  # Use GPU acceleration
        'n_estimators': trial.suggest_int('n_estimators', 100, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'gamma': trial.suggest_float('gamma', 0.1, 0.3),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 0.1),
        'scale_pos_weight': 2.7  # Class weight for positive class
    }
    model = xgb.XGBClassifier(**params)
    model.fit(train[fastball_features], train[target])

    val_preds = model.predict(val[fastball_features])
    val_error = roc_auc_score(val[target], val_preds)  # Use sklearn's roc auc score

    return val_error

# Run the Optuna optimization
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=25)

params = study.best_params
params['device'] = 'cuda'

[I 2023-09-24 14:25:50,668] A new study created in memory with name: no-name-9ec14421-40c0-4661-924d-564c0ba635d4


[I 2023-09-24 14:25:53,814] Trial 0 finished with value: 0.6572921540684148 and parameters: {'n_estimators': 137, 'max_depth': 10, 'learning_rate': 0.07346740023932911, 'gamma': 0.2197316968394073, 'reg_alpha': 0.02404167763981929}. Best is trial 0 with value: 0.6572921540684148.
[I 2023-09-24 14:25:54,912] Trial 1 finished with value: 0.6444209405238094 and parameters: {'n_estimators': 115, 'max_depth': 3, 'learning_rate': 0.08675143843171859, 'gamma': 0.22022300234864176, 'reg_alpha': 0.0737265320016441}. Best is trial 0 with value: 0.6572921540684148.
[I 2023-09-24 14:25:57,178] Trial 2 finished with value: 0.6564577152233649 and parameters: {'n_estimators': 102, 'max_depth': 10, 'learning_rate': 0.08341182143924175, 'gamma': 0.14246782213565523, 'reg_alpha': 0.02636424704863906}. Best is trial 0 with value: 0.6572921540684148.
[I 2023-09-24 14:25:58,335] Trial 3 finished with value: 0.6456564713610884 and parameters: {'n_estimators': 118, 'max_depth': 5, 'learning_rate': 0.05295088

In [61]:
xgb_non_fastball_csw = xgb.XGBRegressor(**params)
xgb_non_fastball_csw.fit(df_non_fastballs[non_fastball_features], df_non_fastballs[target])

In [62]:
xgb_non_fastball_csw.save_model('models/xgb_non_fastball_csw.json')

In [63]:
xgb_non_fastball_csw = xgb.XGBRegressor()
xgb_non_fastball_csw.load_model('models/xgb_non_fastball_csw.json')

In [64]:
df_non_fastballs_holdout.loc[:, 'xgb_csw'] = xgb_non_fastball_csw.predict(df_non_fastballs_holdout.loc[:, non_fastball_features])

In [65]:
df_holdout = pd.concat([df_fastballs_holdout, df_non_fastballs_holdout])

In [66]:
df2 = df_holdout.groupby(['player_name', 'pitcher', 'pitch_type'], as_index=False)[['csw', 'xgb_csw', 'release_speed']] \
    .agg({'xgb_csw':'mean', 'release_speed':'count', 'csw':'mean'}) \
    .reset_index(drop=True) \
    .rename(columns={'release_speed':'pitches_thrown'})

# Flatten the multi-index in columns
df2.columns = ['_'.join(col).rstrip('_') if isinstance(col, tuple) else col for col in df2.columns.values]

# Rename the columns
df2 = df2.rename(columns={
    'pitches_thrown_count': 'pitches_thrown'})
df2 = df2.round(2)

In [67]:

df2.query('pitches_thrown > 100').sort_values('xgb_csw', ascending=False).head(50)

Unnamed: 0,player_name,pitcher,pitch_type,xgb_csw,pitches_thrown,csw
442,"Chapman, Aroldis",547973,SL,0.46,267,0.41
233,"Bird, Jake",656234,CU,0.43,253,0.35
2264,"Ryu, Hyun Jin",547943,CU,0.42,108,0.39
2498,"Stephenson, Robert",596112,SL,0.41,184,0.45
797,"Floro, Dylan",571670,SL,0.41,207,0.29
2529,"Stripling, Ross",548389,KC,0.41,128,0.43
1762,"Miller, Bobby",676272,CU,0.4,317,0.36
1961,"Ortiz, Luis F.",656814,ST,0.4,113,0.33
1745,"Merryweather, Julian",657240,SL,0.4,559,0.39
1585,"López, Jorge",605347,KC,0.4,183,0.39


In [68]:
def top_pitches(df, pitch_types, min_pitches=100, top_n=50, ascending=False):
    """
    Returns the top n pitches in pitch_score_percentile for a selected pitch type(s) 
    with an option to filter number of pitches_thrown.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the pitch data.
    pitch_types (list): List of pitch types to consider.
    min_pitches (int): Minimum number of pitches thrown to consider.
    top_n (int): Number of top pitches to return.

    Returns:
    pandas.DataFrame: DataFrame containing the top n pitches.
    """
    if isinstance(pitch_types, str):
        pitch_types = [pitch_types]

    return (df.loc[df['pitch_type'].isin(pitch_types)]
              .query('pitches_thrown > @min_pitches')
              .sort_values('xgb_csw', ascending=ascending)
              .head(top_n))

In [69]:
top_pitches(df2, 'SI', min_pitches=100, top_n=50)

Unnamed: 0,player_name,pitcher,pitch_type,xgb_csw,pitches_thrown,csw
1147,"Hjelle, Sean",663546,SI,0.36,205,0.35
2585,"Sánchez, Cristopher",650911,SI,0.35,606,0.33
85,"Anderson, Grant",681982,SI,0.34,167,0.31
1507,"Llovera, Mauricio",661440,SI,0.34,327,0.31
455,"Cimber, Adam",643256,SI,0.33,110,0.34
552,"Criswell, Cooper",681867,SI,0.33,167,0.29
2649,"Topa, Justin",623437,SI,0.33,422,0.31
1177,"Houck, Tanner",656557,SI,0.33,439,0.3
393,"Cano, Yennier",666974,SI,0.33,526,0.29
2803,"Webb, Logan",657277,SI,0.32,1030,0.35


In [70]:
top_pitches(df2, 'SL', min_pitches=100, top_n=50)

Unnamed: 0,player_name,pitcher,pitch_type,xgb_csw,pitches_thrown,csw
442,"Chapman, Aroldis",547973,SL,0.46,267,0.41
797,"Floro, Dylan",571670,SL,0.41,207,0.29
2498,"Stephenson, Robert",596112,SL,0.41,184,0.45
1745,"Merryweather, Julian",657240,SL,0.4,559,0.39
1081,"Helsley, Ryan",664854,SL,0.4,175,0.43
2977,"deGrom, Jacob",594798,SL,0.39,167,0.41
1644,"Marte, Yunior",628708,SL,0.39,291,0.36
2457,"Soto, Gregory",642397,SL,0.38,352,0.39
2348,"Sears, JP",676664,SL,0.37,330,0.3
117,"Bachman, Sam",696147,SL,0.37,179,0.33


In [71]:
df2['xgb_csw'].mean().round(2)

0.28

In [72]:
top_pitches(df2, 'FS', min_pitches=100, top_n=50)

Unnamed: 0,player_name,pitcher,pitch_type,xgb_csw,pitches_thrown,csw
177,"Bautista, Félix",642585,FS,0.35,242,0.37
1055,"Harvey, Hunter",640451,FS,0.33,163,0.27
1038,"Harris, Hobie",665048,FS,0.32,175,0.27
775,"Finnegan, Kyle",640448,FS,0.31,259,0.3
690,"Duran, Jhoan",661395,FS,0.3,258,0.24
2904,"Winn, Keaton",676775,FS,0.3,286,0.31
1494,"Littell, Zack",641793,FS,0.29,219,0.21
1904,"Neris, Hector",593576,FS,0.28,265,0.28
2573,"Swanson, Erik",657024,FS,0.28,448,0.32
1372,"Kirby, George",669923,FS,0.28,135,0.24


In [73]:
top_pitches(df2, 'FF', min_pitches=100, top_n=50)

Unnamed: 0,player_name,pitcher,pitch_type,xgb_csw,pitches_thrown,csw
2150,"Rasmussen, Drew",656876,FF,0.37,199,0.29
1085,"Hendricks, Kyle",543294,FF,0.34,398,0.35
1511,"Lodolo, Nick",666157,FF,0.33,300,0.28
2336,"Scott, Tanner",656945,FF,0.33,526,0.31
1781,"Miller, Tyson",668338,FF,0.33,131,0.24
1187,"Hudson, Bryan",663542,FF,0.33,142,0.38
1667,"Maton, Phil",664208,FF,0.32,306,0.25
1259,"Jiménez, Joe",641729,FF,0.32,535,0.3
278,"Bradford, Cody",674003,FF,0.32,440,0.32
1322,"Kelly, Merrill",518876,FF,0.32,742,0.32


In [74]:
top_pitches(df2, 'CH', min_pitches=100, top_n=50)

Unnamed: 0,player_name,pitcher,pitch_type,xgb_csw,pitches_thrown,csw
2195,"Rodriguez, Grayson",680570,CH,0.37,419,0.25
2858,"Williams, Devin",642207,CH,0.36,507,0.33
2399,"Skubal, Tarik",669373,CH,0.33,245,0.39
1817,"Moore, Matt",519043,CH,0.32,237,0.3
1039,"Harris, Hogan",663687,CH,0.32,225,0.36
2174,"Richards, Trevor",670950,CH,0.32,675,0.38
2023,"Pepiot, Ryan",686752,CH,0.32,139,0.29
1699,"McClanahan, Shane",663556,CH,0.32,469,0.37
2134,"Ramirez, Nick",598286,CH,0.31,171,0.26
644,"Devenski, Chris",606965,CH,0.31,307,0.3


In [81]:
df2['xgb_csw_percentile'] = df2['xgb_csw'].rank(pct=True).round(2).mul(100).astype(int)

In [82]:
df2.sample(10)

Unnamed: 0,player_name,pitcher,pitch_type,xgb_csw,pitches_thrown,csw,xgb_csw_percentile
437,"Chafin, Andrew",605177,SI,0.24,364,0.29,22
2450,"Soriano, José",667755,SL,0.3,33,0.33,70
55,"Alexander, Tyler",641302,SI,0.31,76,0.3,77
325,"Brito, Jhony",666745,SL,0.2,20,0.1,6
2357,"Senga, Kodai",673540,ST,0.34,156,0.33,92
2233,"Romero, JoJo",668941,FF,0.19,85,0.21,4
1148,"Hoeing, Bryan",663773,CH,0.23,62,0.15,16
300,"Brasier, Ryan",518489,SL,0.34,298,0.28,92
1954,"Ortega, Oliver",661383,SI,0.24,41,0.27,22
2326,"Schmidt, Clarke",657376,ST,0.34,674,0.36,92


In [83]:
df2.to_csv('csw_pitcher_predictions.csv', index=False)

In [85]:
df_holdout.to_csv('csw_model_predictions.csv')