In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [108]:
# load in data with predicted run values
df = pd.read_csv('statcast_data/df_all_spin.csv')

In [109]:
## Calculate vertical approach angle and horizontal approach angle (degrees) for each pitch
df['vaa'] = np.arctan((df['plate_z'] - df['release_pos_z']) / (60.5 - df['release_extension'])) * (180 / np.pi)
df['haa'] = np.arctan((df['plate_x'] - df['release_pos_x']) / (60.5 - df['release_pos_y'])) * (180 / np.pi)

df['axis_deviation_adj'] = np.where(df['p_throws']=='L', df['diff_measured_inferred'].mul(-1), df['diff_measured_inferred'])

In [110]:
df.loc[df['p_throws']=='R'].groupby('pitch_type')[['vaa', 'haa']].mean()

Unnamed: 0_level_0,vaa,haa
pitch_type,Unnamed: 1_level_1,Unnamed: 2_level_1
CH,-4.156466,13.366556
CS,-4.249456,13.491816
CU,-4.345398,15.983744
EP,-3.543989,16.279951
FA,-3.574865,18.930291
FC,-3.762179,18.710288
FF,-3.258297,15.434165
FO,-5.080786,15.217236
FS,-4.43363,13.067402
KC,-4.374953,17.073745


In [111]:
df.pitch_type.unique()

array(['FF', 'SL', 'SI', 'FC', 'CU', 'CH', 'KC', 'CS', 'FS', 'ST', 'SV',
       'EP', 'FA', nan, 'KN', 'PO', 'SC', 'FO'], dtype=object)

In [112]:
df.loc[df['pitch_type'].isin(['CS']), 'pitch_name'].unique()

array(['Slow Curve'], dtype=object)

In [113]:
fastballs = ['FF', 'SI', 'FC']
offspeed = ['CH', 'FS', 'FO']
breaking_balls = ['KC', 'CU', 'SL', 'ST', 'SV', 'CS', 'SC']

In [123]:
df.game_year

0          2020
1          2020
2          2020
3          2020
4          2020
           ... 
2330676    2023
2330677    2023
2330678    2023
2330679    2023
2330680    2023
Name: game_year, Length: 2330681, dtype: int64

In [124]:
df_fastballs = df.loc[(df['pitch_type'].isin(fastballs)) & (df['game_year'].isin([2020, 2021, 2022]))]
df_fastballs_holdout = df.loc[(df['pitch_type'].isin(fastballs)) & (df['game_year']==2023)]

In [125]:
df_fastballs.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,...,predicted_run_value,vaa,player_id,api_pitch_type,active_spin_formatted,hawkeye_measured,movement_inferred,diff_measured_inferred,haa,axis_deviation_adj
0,0,2875.0,FF,2020-09-27,91.6,2.31,6.19,"Hand, Brad",605137,543272,...,0.034984,-3.002928,543272.0,FF,89.0,129.2,139.2,-10.0,-24.988881,10.0
6,6,3604.0,FF,2020-09-27,92.8,2.39,6.27,"Hand, Brad",663647,543272,...,-0.249587,-3.475149,543272.0,FF,89.0,129.2,139.2,-10.0,-27.267778,10.0
10,10,2843.0,SI,2020-09-27,96.7,-2.2,5.92,"Cederlind, Blake",596019,664977,...,-0.197027,-4.352926,,,,,,,18.484902,
11,11,2951.0,SI,2020-09-27,97.1,-2.2,5.99,"Cederlind, Blake",596019,664977,...,,-3.501793,,,,,,,22.089253,
12,12,3160.0,SI,2020-09-27,97.3,-2.34,5.94,"Cederlind, Blake",596019,664977,...,,-3.58032,,,,,,,26.260469,


In [126]:
fastball_features = [
    'release_speed', 'az', 'ax', 'active_spin_formatted',
    'plate_x', 'plate_z', 'axis_deviation_adj', 'vaa', 'haa'
]
target = 'predicted_run_value'

In [127]:
df_fastballs[fastball_features].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 961810 entries, 0 to 1717492
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   release_speed          961782 non-null  float64
 1   az                     961796 non-null  float64
 2   ax                     961796 non-null  float64
 3   active_spin_formatted  941051 non-null  float64
 4   plate_x                961796 non-null  float64
 5   plate_z                961796 non-null  float64
 6   axis_deviation_adj     941051 non-null  float64
 7   vaa                    959050 non-null  float64
 8   haa                    961652 non-null  float64
dtypes: float64(9)
memory usage: 73.4 MB


In [128]:
df_fastballs.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,...,predicted_run_value,vaa,player_id,api_pitch_type,active_spin_formatted,hawkeye_measured,movement_inferred,diff_measured_inferred,haa,axis_deviation_adj
0,0,2875.0,FF,2020-09-27,91.6,2.31,6.19,"Hand, Brad",605137,543272,...,0.034984,-3.002928,543272.0,FF,89.0,129.2,139.2,-10.0,-24.988881,10.0
6,6,3604.0,FF,2020-09-27,92.8,2.39,6.27,"Hand, Brad",663647,543272,...,-0.249587,-3.475149,543272.0,FF,89.0,129.2,139.2,-10.0,-27.267778,10.0
10,10,2843.0,SI,2020-09-27,96.7,-2.2,5.92,"Cederlind, Blake",596019,664977,...,-0.197027,-4.352926,,,,,,,18.484902,
11,11,2951.0,SI,2020-09-27,97.1,-2.2,5.99,"Cederlind, Blake",596019,664977,...,,-3.501793,,,,,,,22.089253,
12,12,3160.0,SI,2020-09-27,97.3,-2.34,5.94,"Cederlind, Blake",596019,664977,...,,-3.58032,,,,,,,26.260469,


In [129]:
df_fastballs = df_fastballs.dropna(subset=fastball_features)
df_fastballs = df_fastballs.dropna(subset=[target])

df_fastballs_holdout = df_fastballs_holdout.dropna(subset=fastball_features)
df_fastballs_holdout = df_fastballs_holdout.dropna(subset=[target])

In [130]:
df_fastballs.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,...,predicted_run_value,vaa,player_id,api_pitch_type,active_spin_formatted,hawkeye_measured,movement_inferred,diff_measured_inferred,haa,axis_deviation_adj
0,0,2875.0,FF,2020-09-27,91.6,2.31,6.19,"Hand, Brad",605137,543272,...,0.034984,-3.002928,543272.0,FF,89.0,129.2,139.2,-10.0,-24.988881,10.0
6,6,3604.0,FF,2020-09-27,92.8,2.39,6.27,"Hand, Brad",663647,543272,...,-0.249587,-3.475149,543272.0,FF,89.0,129.2,139.2,-10.0,-27.267778,10.0
23,23,4283.0,FF,2020-09-27,91.3,2.59,5.99,"Hand, Brad",624428,543272,...,0.316117,-4.011443,543272.0,FF,89.0,129.2,139.2,-10.0,-30.262832,10.0
26,26,3247.0,SI,2020-09-27,88.4,-2.68,2.35,"Cimber, Adam",572180,643256,...,0.23,-1.302434,643256.0,SI,73.0,275.3,309.3,-33.9,17.620372,-33.9
32,32,4018.0,SI,2020-09-27,87.4,-2.68,2.49,"Cimber, Adam",591741,643256,...,-0.15412,-0.792788,643256.0,SI,73.0,275.3,309.3,-33.9,21.73886,-33.9


In [131]:
from sklearn.model_selection import train_test_split, GroupShuffleSplit
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
import xgboost as xgb

# Create a GroupShuffleSplit object
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)

# Get the indices for the training and validation sets
train_idx, val_idx = next(gss.split(df_fastballs, groups=df_fastballs['pitcher']))

# Create the training and validation sets
train = df_fastballs.iloc[train_idx]
val = df_fastballs.iloc[val_idx]

# Define the objective function for Optuna
def objective(trial: Trial) -> float:
    params = {
        'tree_method': 'gpu_hist',  # Use GPU acceleration
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    model = xgb.XGBRegressor(**params)
    model.fit(train[fastball_features], train[target])

    val_preds = model.predict(val[fastball_features])
    val_error = np.sqrt(((val_preds - val[target]) ** 2).mean())

    return val_error

# Run the Optuna optimization
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=50)

# Print the best parameters
print(study.best_params)


[I 2023-09-09 19:54:17,505] A new study created in memory with name: no-name-ba4980ee-d160-4f30-9d34-0cbc980227b2
[I 2023-09-09 19:54:31,347] Trial 0 finished with value: 0.33098924931863977 and parameters: {'n_estimators': 812, 'max_depth': 10, 'learning_rate': 0.07346740023932911, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182}. Best is trial 0 with value: 0.33098924931863977.
[I 2023-09-09 19:54:32,408] Trial 1 finished with value: 0.3214208055107788 and parameters: {'n_estimators': 396, 'max_depth': 3, 'learning_rate': 0.08675143843171859, 'subsample': 0.8005575058716043, 'colsample_bytree': 0.8540362888980227}. Best is trial 1 with value: 0.3214208055107788.
[I 2023-09-09 19:54:34,637] Trial 2 finished with value: 0.32382849544661485 and parameters: {'n_estimators': 139, 'max_depth': 10, 'learning_rate': 0.08341182143924175, 'subsample': 0.6061695553391381, 'colsample_bytree': 0.5909124836035503}. Best is trial 1 with value: 0.3214208055107788.
[I 2023-09-

{'n_estimators': 1636, 'max_depth': 5, 'learning_rate': 0.010669539286632005, 'subsample': 0.8421165132560784, 'colsample_bytree': 0.7200762468698007}


In [132]:
params = study.best_params
params['tree_method'] = 'gpu_hist'
xgb_fastball = xgb.XGBRegressor(**params)
xgb_fastball.fit(df_fastballs[fastball_features], df_fastballs[target])

In [133]:
df_fastballs_holdout['xgb_preds'] = xgb_fastball.predict(df_fastballs_holdout[fastball_features])

In [134]:
df_fastballs_holdout.groupby(['player_name', 'pitch_type', 'game_year'])['delta_run_exp', 'xgb_preds'].count()

Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,delta_run_exp,xgb_preds
player_name,pitch_type,game_year,Unnamed: 3_level_1,Unnamed: 4_level_1
"Abad, Fernando",FF,2023,6,6
"Abad, Fernando",SI,2023,16,16
"Abbott, Andrew",FF,2023,207,207
"Abbott, Cory",FF,2023,65,65
"Abreu, Albert",FF,2023,4,4
...,...,...,...,...
"Zerpa, Angel",SI,2023,26,26
"Zimmermann, Bruce",FF,2023,6,6
"Zimmermann, Bruce",SI,2023,6,6
"Zuñiga, Guillermo",FF,2023,4,4


In [135]:
test_preds = df_fastballs_holdout.groupby(['player_name', 'pitch_type'], as_index=False)['predicted_run_value', 'xgb_preds', 'release_speed'] \
    .agg({'predicted_run_value':'mean', 'xgb_preds':'mean', 'release_speed':'count'}) \
    .reset_index() \
    .rename(columns={'release_speed':'count'})

Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.


In [137]:
test_preds['predicted_run_value_percentile'] = (1 - test_preds['predicted_run_value'].rank(pct=True).round(3)).mul(100)
test_preds['stuff_percentile'] = (1 - test_preds['xgb_preds'].rank(pct=True).round(3)).mul(100)
test_preds.query('count > 100').sort_values('xgb_preds', ascending=True).head(50)

Unnamed: 0,index,player_name,pitch_type,predicted_run_value,xgb_preds,count,predicted_run_value_percentile,stuff_percentile
82,82,"Bautista, Félix",FF,-0.040511,-0.058531,155,90.0,99.8
452,452,"Hader, Josh",SI,-0.02456,-0.036711,137,85.0,99.2
1038,1038,"Scott, Tanner",FF,0.011991,-0.024805,105,66.1,98.5
433,433,"Graterol, Brusdar",SI,-0.033805,-0.023449,111,88.2,98.4
498,498,"Hicks, Jordan",SI,-0.013598,-0.022594,145,80.9,98.2
86,86,"Bednar, David",FF,-0.059417,-0.019648,140,93.3,97.7
1263,1263,"Wheeler, Zack",FF,-0.038202,-0.017799,272,89.4,97.6
492,492,"Hernández, Carlos",FF,0.02441,-0.015503,136,57.0,97.2
552,552,"Jansen, Kenley",FC,-0.014479,-0.014392,145,81.2,96.7
515,515,"Holmes, Clay",SI,0.000955,-0.013955,168,72.7,96.5


In [155]:
test_preds.loc[test_preds['pitch_type']=='FF', ['player_name', 'pitch_type', 'stuff_percentile', 'count']].query('count > 25').sort_values('stuff_percentile', ascending=False).head(50)

Unnamed: 0,player_name,pitch_type,stuff_percentile,count
82,"Bautista, Félix",FF,99.8,155
1326,"deGrom, Jacob",FF,99.5,54
497,"Hicks, Jordan",FF,99.2,34
480,"Helsley, Ryan",FF,98.9,56
1105,"Stewart, Brock",FF,98.9,32
330,"Fairbanks, Pete",FF,98.7,78
781,"Miller, Mason",FF,98.6,50
1038,"Scott, Tanner",FF,98.5,105
261,"Cuas, Jose",FF,98.1,39
208,"Cimber, Adam",FF,98.0,39


In [156]:
test_preds.loc[test_preds['pitch_type']=='FC', ['player_name', 'pitch_type', 'stuff_percentile', 'count']].query('count > 25').sort_values('stuff_percentile', ascending=False).head(20)

Unnamed: 0,player_name,pitch_type,stuff_percentile,count
297,"Doval, Camilo",FC,99.4,74
29,"Alvarado, José",FC,99.1,58
1191,"Valdez, Framber",FC,98.8,86
789,"Minter, A.J.",FC,97.3,95
552,"Jansen, Kenley",FC,96.7,145
1189,"Urías, Julio",FC,95.7,55
1238,"Ward, Thaddeus",FC,95.6,27
217,"Clase, Emmanuel",FC,94.7,182
239,"Coulombe, Danny",FC,94.6,76
1048,"Severino, Luis",FC,94.3,50


In [157]:
test_preds.loc[test_preds['pitch_type']=='SI', ['player_name', 'pitch_type', 'stuff_percentile', 'count']].query('count > 25').sort_values('stuff_percentile', ascending=False).head(50)

Unnamed: 0,player_name,pitch_type,stuff_percentile,count
201,"Chapman, Aroldis",SI,99.6,65
452,"Hader, Josh",SI,99.2,137
433,"Graterol, Brusdar",SI,98.4,111
498,"Hicks, Jordan",SI,98.2,145
696,"Luzardo, Jesús",SI,96.8,32
515,"Holmes, Clay",SI,96.5,168
358,"Foley, Jason",SI,95.9,177
918,"Pop, Zach",SI,95.8,32
729,"Marte, Yunior",SI,95.2,86
18,"Alcantara, Sandy",SI,95.1,244


In [158]:
test_preds.loc[test_preds['player_name']== 'Harrison, Kyle']

Unnamed: 0,index,player_name,pitch_type,predicted_run_value,xgb_preds,count,predicted_run_value_percentile,stuff_percentile
464,464,"Harrison, Kyle",FF,-0.018284,0.009546,42,83.1,78.1


#### Breaking ball / off-speed feature engineering
Calculating some fastball-relative features 

In [142]:
fastball_metrics = df.groupby(['player_name', 'pitcher', 'game_pk', 'pitch_type'])[['release_speed', 'hawkeye_measured', 'active_spin_formatted', 'az', 'ax']].mean().round().reset_index()
fastball_metrics = fastball_metrics.rename(columns={
    'release_speed': 'avg_top_velocity',
    'hawkeye_measured':'fb_spin_axis',
    'active_spin_formatted':'fb_active_spin',
    'az':'fastball_vert',
    'ax':'fastball_horz'})


In [143]:
fastball_metrics = fastball_metrics.dropna()
idx = fastball_metrics.groupby(['pitcher', 'game_pk'])['avg_top_velocity'].idxmax()
fastball_metrics_max_velocity = fastball_metrics.loc[idx].rename(columns={'pitch_type':'top_velo_pitch_type'})
fastball_metrics_max_velocity.head()


Unnamed: 0,player_name,pitcher,game_pk,top_velo_pitch_type,avg_top_velocity,fb_spin_axis,fb_active_spin,fastball_vert,fastball_horz
163659,"Pérez, Oliver",424144,630981,FF,91.0,134.0,98.0,-18.0,12.0
163662,"Pérez, Oliver",424144,631114,SI,89.0,126.0,96.0,-24.0,19.0
163665,"Pérez, Oliver",424144,631118,SI,91.0,126.0,96.0,-25.0,20.0
163667,"Pérez, Oliver",424144,631272,FF,90.0,134.0,98.0,-18.0,13.0
163670,"Pérez, Oliver",424144,631479,FF,90.0,134.0,98.0,-18.0,12.0


In [144]:
df = df.merge(fastball_metrics_max_velocity, on=['player_name', 'pitcher', 'game_pk'], how='left')

In [145]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,...,movement_inferred,diff_measured_inferred,haa,axis_deviation_adj,top_velo_pitch_type,avg_top_velocity,fb_spin_axis,fb_active_spin,fastball_vert,fastball_horz
0,0,2875.0,FF,2020-09-27,91.6,2.31,6.19,"Hand, Brad",605137,543272,...,139.2,-10.0,-24.988881,10.0,FF,92.0,129.0,89.0,-15.0,14.0
1,1,3004.0,SL,2020-09-27,80.4,2.45,6.12,"Hand, Brad",605137,543272,...,,,-31.459543,,FF,92.0,129.0,89.0,-15.0,14.0
2,2,3161.0,SL,2020-09-27,80.8,2.27,6.09,"Hand, Brad",592567,543272,...,,,-36.820491,,FF,92.0,129.0,89.0,-15.0,14.0
3,3,3209.0,SL,2020-09-27,80.5,2.41,6.12,"Hand, Brad",592567,543272,...,,,-31.792712,,FF,92.0,129.0,89.0,-15.0,14.0
4,4,3421.0,SL,2020-09-27,81.4,2.35,6.14,"Hand, Brad",592567,543272,...,,,-31.472149,,FF,92.0,129.0,89.0,-15.0,14.0


In [146]:
df.groupby('p_throws')['fb_spin_axis'].mean()

p_throws
L    144.015257
R    213.247251
Name: fb_spin_axis, dtype: float64

In [147]:
df['velo_delta'] = df['avg_top_velocity'] - df['release_speed']
df['spin_axis_delta'] = df['hawkeye_measured'] - df['fb_spin_axis']

df['vert_delta'] = df['fastball_vert'] - df['az']
df['horz_delta'] = df['fastball_horz'] - df['ax']

In [165]:
non_fastball_features = [
    'release_speed', 'az', 'ax', 'plate_x', 'plate_z',
    'axis_deviation_adj', 'vaa', 'haa', 'velo_delta',
    'spin_axis_delta', 'vert_delta', 'horz_delta'
]
target = 'predicted_run_value'

In [166]:
df_non_fastballs = df.loc[~df['pitch_type'].isin(fastballs) & (df['game_year'].isin([2020, 2021, 2022]))]
df_non_fastballs_holdout = df.loc[~df['pitch_type'].isin(fastballs) & (df['game_year']==2023)]

In [167]:
df_non_fastballs[non_fastball_features].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 755712 entries, 1 to 1717521
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   release_speed       730541 non-null  float64
 1   az                  730548 non-null  float64
 2   ax                  730548 non-null  float64
 3   plate_x             730548 non-null  float64
 4   plate_z             730548 non-null  float64
 5   axis_deviation_adj  706676 non-null  float64
 6   vaa                 728743 non-null  float64
 7   haa                 730482 non-null  float64
 8   velo_delta          714150 non-null  float64
 9   spin_axis_delta     706676 non-null  float64
 10  vert_delta          714156 non-null  float64
 11  horz_delta          714156 non-null  float64
dtypes: float64(12)
memory usage: 75.0 MB


In [168]:
df_non_fastballs = df_non_fastballs.dropna(subset=non_fastball_features)
df_non_fastballs = df_non_fastballs.dropna(subset=[target])

df_non_fastballs_holdout = df_non_fastballs_holdout.dropna(subset=non_fastball_features)
df_non_fastballs_holdout = df_non_fastballs_holdout.dropna(subset=[target])

In [169]:
from sklearn.model_selection import train_test_split, GroupShuffleSplit
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
import xgboost as xgb

# Create a GroupShuffleSplit object
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)

# Get the indices for the training and validation sets
train_idx, val_idx = next(gss.split(df_non_fastballs, groups=df_non_fastballs['pitcher']))

# Create the training and validation sets
train = df_non_fastballs.iloc[train_idx]
val = df_non_fastballs.iloc[val_idx]

# Define the objective function for Optuna
def objective(trial: Trial) -> float:
    params = {
        'tree_method': 'gpu_hist',  # Use GPU acceleration
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    model = xgb.XGBRegressor(**params)
    model.fit(train[non_fastball_features], train[target])

    val_preds = model.predict(val[non_fastball_features])
    val_error = np.sqrt(((val_preds - val[target]) ** 2).mean())

    return val_error

# Run the Optuna optimization
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=50)

# Print the best parameters
params = study.best_params
params['tree_method'] = 'gpu_hist'

xgb_non_fastball = xgb.XGBRegressor(**params)
xgb_non_fastball.fit(df_non_fastballs[non_fastball_features], df_non_fastballs[target])

[I 2023-09-09 20:04:57,823] A new study created in memory with name: no-name-9cb9f77b-ac44-47dc-998f-9f4fcb2363d6
[I 2023-09-09 20:05:10,218] Trial 0 finished with value: 0.313261118294867 and parameters: {'n_estimators': 812, 'max_depth': 10, 'learning_rate': 0.07346740023932911, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182}. Best is trial 0 with value: 0.313261118294867.
[I 2023-09-09 20:05:11,214] Trial 1 finished with value: 0.3030296454699902 and parameters: {'n_estimators': 396, 'max_depth': 3, 'learning_rate': 0.08675143843171859, 'subsample': 0.8005575058716043, 'colsample_bytree': 0.8540362888980227}. Best is trial 1 with value: 0.3030296454699902.
[I 2023-09-09 20:05:13,354] Trial 2 finished with value: 0.30739249418740183 and parameters: {'n_estimators': 139, 'max_depth': 10, 'learning_rate': 0.08341182143924175, 'subsample': 0.6061695553391381, 'colsample_bytree': 0.5909124836035503}. Best is trial 1 with value: 0.3030296454699902.
[I 2023-09-09 2

{'n_estimators': 1636, 'max_depth': 5, 'learning_rate': 0.010669539286632005, 'subsample': 0.8421165132560784, 'colsample_bytree': 0.7200762468698007}


In [170]:
df_non_fastballs_holdout['xgb_preds'] = xgb_non_fastball.predict(df_non_fastballs_holdout[non_fastball_features])

In [171]:
test_preds_non_fb = df_non_fastballs_holdout.groupby(['player_name', 'pitch_type'], as_index=False)['predicted_run_value', 'xgb_preds', 'release_speed'] \
    .agg({'predicted_run_value':'mean', 'xgb_preds':'mean', 'release_speed':'count'}) \
    .reset_index() \
    .rename(columns={'release_speed':'count'})

Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.


In [180]:
test_preds_non_fb['predicted_run_value_percentile'] = (1 - test_preds_non_fb['predicted_run_value'].rank(pct=True).round(3)).mul(100)
test_preds_non_fb['stuff_percentile'] = (1 - test_preds_non_fb['xgb_preds'].rank(pct=True).round(3)).mul(100)
test_preds_non_fb.query('count > 100').sort_values('xgb_preds', ascending=True).head(50)

Unnamed: 0,index,player_name,pitch_type,predicted_run_value,xgb_preds,count,predicted_run_value_percentile,stuff_percentile
1296,1296,"Snell, Blake",CU,-0.126633,-0.088114,134,94.3,98.6
660,660,"Jax, Griffin",ST,-0.051874,-0.077419,143,75.4,97.8
182,182,"Bummer, Aaron",ST,-0.059095,-0.074495,106,78.8,97.6
1509,1509,"Williams, Devin",CH,-0.038484,-0.071325,134,67.9,97.0
1428,1428,"Valdez, Framber",CU,-0.047213,-0.069747,150,73.1,96.9
1179,1179,"Rogers, Tyler",SL,-0.061967,-0.064683,138,79.2,95.5
157,157,"Brash, Matt",SL,-0.089088,-0.064134,131,86.7,95.4
688,688,"Keller, Mitch",ST,-0.031041,-0.06366,130,63.8,95.2
527,527,"Gray, Sonny",ST,-0.116756,-0.062804,182,92.6,94.9
896,896,"McClanahan, Shane",CH,-0.095925,-0.062291,130,88.5,94.8


In [182]:
test_preds_non_fb.loc[test_preds_non_fb['pitch_type']=='CH', ['player_name', 'pitch_type', 'stuff_percentile', 'count']].query('count > 25').sort_values('stuff_percentile', ascending=False).head(50)

Unnamed: 0,player_name,pitch_type,stuff_percentile,count
1509,"Williams, Devin",CH,97.0,134
896,"McClanahan, Shane",CH,94.8,130
1154,"Richards, Trevor",CH,94.7,157
159,"Brazoban, Huascar",CH,94.6,64
695,"Kelly, Merrill",CH,93.8,152
1177,"Rogers, Trevor",CH,93.7,26
163,"Brieske, Beau",CH,92.9,26
1078,"Peralta, Wandy",CH,92.7,106
958,"Montero, Rafael",CH,91.9,75
205,"Cano, Yennier",CH,90.2,79


In [181]:
test_preds_non_fb.loc[test_preds_non_fb['pitch_type']=='CU', ['player_name', 'pitch_type', 'stuff_percentile', 'count']].query('count > 50').sort_values('stuff_percentile', ascending=False).head(50)

Unnamed: 0,player_name,pitch_type,stuff_percentile,count
1296,"Snell, Blake",CU,98.6,134
494,"Glasnow, Tyler",CU,97.8,90
1428,"Valdez, Framber",CU,96.9,150
357,"Duran, Jhoan",CU,96.4,63
188,"Burnes, Corbin",CU,94.7,86
151,"Bradish, Kyle",CU,94.3,108
578,"Hentges, Sam",CU,94.0,64
1401,"Thompson, Zack",CU,92.4,54
760,"Lange, Alex",CU,91.0,152
1102,"Pressly, Ryan",CU,90.1,61


In [178]:
test_preds_non_fb.loc[test_preds_non_fb['player_name']== 'Harrison, Kyle']

Unnamed: 0,index,player_name,pitch_type,predicted_run_value,xgb_preds,count,predicted_run_value_percentile,stuff_percentile
549,549,"Harrison, Kyle",CH,0.188881,-0.02689,6,5.1,68.4
550,550,"Harrison, Kyle",SV,0.076831,-0.021587,15,17.4,62.7


In [179]:
test_preds_non_fb.loc[test_preds_non_fb['pitch_type']=='FS', ['player_name', 'pitch_type', 'stuff_percentile', 'count']].query('count > 10').sort_values('stuff_percentile', ascending=False).head(50)

Unnamed: 0,player_name,pitch_type,stuff_percentile,count
92,"Bautista, Félix",FS,98.5,75
740,"Kriske, Brooks",FS,95.5,11
867,"Martin, Chris",FS,94.8,28
1317,"Stanek, Ryne",FS,93.6,42
229,"Chapman, Aroldis",FS,91.3,17
403,"Finnegan, Kyle",FS,88.2,55
555,"Harvey, Hunter",FS,86.5,38
1009,"Neris, Hector",FS,86.2,69
455,"García, Luis",FS,83.0,20
546,"Harris, Hobie",FS,82.1,43
