## Test pitch quality metric against the PitchingBot and Pitching+ metrics on FanGraphs using 2023 split halves
Note: no 2023 data was used to train my metric.

In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [86]:
# Load the per-pitcher metric tables for the two halves of the season; both
# frames are joined to the model predictions on their MLBAMID column further down.
first_half, second_half = (
    pd.read_csv(csv_path)
    for csv_path in ('first_half_metrics.csv', 'second_half_metrics.csv')
)

In [87]:
# Raw Statcast export (one row per pitch, judging by the columns used below).
# It is narrowed to a single season and a fixed column list in a later cell.
statcast_data = pd.read_csv('statcast_data/df_all_spin.csv')

In [88]:
# Columns kept from the raw Statcast frame; everything else is discarded when
# the frame is filtered with `.loc[..., cols]`.
cols = [
    'player_name',
    'pitcher',
    'pitch_type',
    'release_speed',
    'p_throws',
    'plate_x',
    'plate_z',
    'ax',
    'az',
    'release_pos_x',
    'active_spin_formatted',
    'release_pos_y',
    'release_pos_z',
    'description',
    'hawkeye_measured',
    'movement_inferred',
    'delta_run_exp',
    'diff_measured_inferred',
    'predicted_run_value',
    'game_pk',
    'game_date',
]

In [89]:
# Keep only rows with game_year 2023 and game_type 'R' (presumably Statcast's
# regular-season code — confirm), restricted to the columns listed in `cols`.
season_mask = statcast_data['game_year'].eq(2023) & statcast_data['game_type'].eq('R')
statcast_data = statcast_data.loc[season_mask, cols]

# Parse game_date to datetimes so the frame can be split on a calendar cutoff later.
statcast_data['game_date'] = pd.to_datetime(statcast_data['game_date'])


In [90]:
from feature_engineering import (
    compute_adjusted_axis_deviation,
    compute_approach_angles,
    compute_fastball_relative_features,
)

# Run the project's feature builders in sequence. Each helper receives the
# frame and returns the frame that the next step consumes.
statcast_data = (
    statcast_data
    .pipe(compute_fastball_relative_features)
    .pipe(compute_approach_angles)
    .pipe(compute_adjusted_axis_deviation)
)

In [91]:
# Preview what the frame looks like once rows with any missing value are removed.
# The result is not assigned, so statcast_data itself is unchanged by this cell.
statcast_data.dropna().info()

<class 'pandas.core.frame.DataFrame'>
Index: 608535 entries, 0 to 613158
Data columns (total 34 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   player_name             608535 non-null  object        
 1   pitcher                 608535 non-null  int64         
 2   pitch_type              608535 non-null  object        
 3   release_speed           608535 non-null  float64       
 4   p_throws                608535 non-null  object        
 5   plate_x                 608535 non-null  float64       
 6   plate_z                 608535 non-null  float64       
 7   ax                      608535 non-null  float64       
 8   az                      608535 non-null  float64       
 9   release_pos_x           608535 non-null  float64       
 10  active_spin_formatted   608535 non-null  float64       
 11  release_pos_y           608535 non-null  float64       
 12  release_pos_z           608535 non-

In [92]:
# Load the two previously saved models (XGBoost, judging by the file names):
# one is applied to fastball pitch types, the other to the non-fastball types.
# NOTE(review): joblib.load unpickles arbitrary Python objects — only load
# model files that come from a trusted source.
xgb_fastball = joblib.load('models/xgb_fastball_model.joblib')
xgb_non_fastball = joblib.load('models/xgb_non_fastball_model.joblib')

In [93]:
# Drop every row that has a missing value in any column.
# NOTE(review): no `subset` is given, so rows are also discarded when the only
# gap is in a column the models never read — confirm that is intended.
statcast_data = statcast_data.dropna()

In [94]:
# Pitch-type codes routed to each model. Any code that appears in neither list
# is not scored at all.
fastball_types = ['FF', 'SI', 'FC']
non_fastball_types = [
    'CH', 'FS', 'FO', 'KC', 'CU',
    'SL', 'ST', 'SV', 'CS', 'SC',
]

# Input columns for the fastball model, in the order they are passed to predict().
fastball_features = [
    'release_speed',
    'az',
    'ax',
    'active_spin_formatted',
    'plate_x',
    'plate_z',
    'axis_deviation_adj',
    'vaa',
    'haa',
]

# Input columns for the non-fastball model. Compared with the fastball list it
# omits 'active_spin_formatted' and adds the four *_delta columns.
non_fastball_features = [
    'release_speed',
    'az',
    'ax',
    'plate_x',
    'plate_z',
    'axis_deviation_adj',
    'vaa',
    'haa',
    'velo_delta',
    'spin_axis_delta',
    'vert_delta',
    'horz_delta',
]

# Sanity check: the fastball inputs should all be present and non-null.
statcast_data[fastball_features].info()

<class 'pandas.core.frame.DataFrame'>
Index: 608535 entries, 0 to 613158
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   release_speed          608535 non-null  float64
 1   az                     608535 non-null  float64
 2   ax                     608535 non-null  float64
 3   active_spin_formatted  608535 non-null  float64
 4   plate_x                608535 non-null  float64
 5   plate_z                608535 non-null  float64
 6   axis_deviation_adj     608535 non-null  float64
 7   vaa                    608535 non-null  float64
 8   haa                    608535 non-null  float64
dtypes: float64(9)
memory usage: 46.4 MB


In [95]:
# Same sanity check for the non-fastball model's input columns.
statcast_data[non_fastball_features].info()

<class 'pandas.core.frame.DataFrame'>
Index: 608535 entries, 0 to 613158
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   release_speed       608535 non-null  float64
 1   az                  608535 non-null  float64
 2   ax                  608535 non-null  float64
 3   plate_x             608535 non-null  float64
 4   plate_z             608535 non-null  float64
 5   axis_deviation_adj  608535 non-null  float64
 6   vaa                 608535 non-null  float64
 7   haa                 608535 non-null  float64
 8   velo_delta          608535 non-null  float64
 9   spin_axis_delta     608535 non-null  float64
 10  vert_delta          608535 non-null  float64
 11  horz_delta          608535 non-null  float64
dtypes: float64(12)
memory usage: 60.4 MB


In [96]:
# Score every pitch with the model that matches its pitch-type family.
# Rows whose pitch_type is in neither list receive no xgb_pred value.
fastball_rows = statcast_data['pitch_type'].isin(fastball_types)
non_fastball_rows = statcast_data['pitch_type'].isin(non_fastball_types)

statcast_data.loc[fastball_rows, 'xgb_pred'] = xgb_fastball.predict(
    statcast_data.loc[fastball_rows, fastball_features]
)
statcast_data.loc[non_fastball_rows, 'xgb_pred'] = xgb_non_fastball.predict(
    statcast_data.loc[non_fastball_rows, non_fastball_features]
)


In [97]:
# Spot-check a handful of scored pitches. The sample is seeded so that a
# Restart & Run All shows the same ten rows every time.
statcast_data.sample(10, random_state=42)

Unnamed: 0,player_name,pitcher,pitch_type,release_speed,p_throws,plate_x,plate_z,ax,az,release_pos_x,...,fastball_vert,fastball_horz,velo_delta,spin_axis_delta,vert_delta,horz_delta,vaa,haa,axis_deviation_adj,xgb_pred
305247,"Neris, Hector",593576,FF,93.7,R,0.3,1.91,-9.908635,-12.702145,-1.74,...,-12.0,-9.0,-1.7,-0.2,0.702145,0.908635,-4.238335,2.148375,8.3,-0.006147
576101,"Foley, Jason",671345,SI,96.7,R,0.05,1.48,-23.48144,-23.463217,-1.89,...,-24.0,-22.0,0.3,-0.1,-0.536783,1.48144,-4.327648,2.07093,-20.7,-0.01139
403169,"Cease, Dylan",656302,SL,86.4,R,0.58,0.78,2.296518,-30.051368,-1.81,...,-11.0,-3.0,9.6,-94.3,19.051368,-5.296518,-5.157891,2.546933,-11.0,-0.015696
122357,"Walker, Taijuan",592836,FS,87.1,R,-0.66,1.37,-14.956677,-25.581742,-2.2,...,-16.0,-11.0,5.9,12.3,9.581742,3.956677,-5.22371,1.612949,-35.7,-0.031837
477227,"Manaea, Sean",640455,SL,85.8,L,0.06,0.8,4.024315,-30.646386,3.31,...,-20.0,16.0,6.2,97.4,10.646386,11.975685,-5.157513,-3.497179,13.5,-0.000872
132360,"Paxton, James",572020,FF,92.6,L,0.1,3.23,12.087247,-16.636086,1.98,...,-16.0,19.0,3.4,2.9,0.636086,6.912753,-2.695912,-2.003948,4.0,-0.026128
608948,"Sandoval, Patrick",663776,SL,87.0,L,-0.24,2.47,-6.180329,-33.436746,1.22,...,-17.0,14.0,6.0,80.9,16.436746,20.180329,-3.795852,-1.541313,17.0,0.006537
61612,"Gray, Josiah",680686,SI,93.5,R,1.88,2.75,-20.088371,-18.357882,-1.37,...,-16.0,-12.0,-1.5,3.0,2.357882,8.088371,-2.534959,3.430892,-12.2,0.067574
603764,"Garrett, Amir",607237,SI,94.8,L,-0.21,2.29,16.523814,-12.140706,1.76,...,-12.0,12.0,0.2,-1.7,0.140706,-4.523814,-4.19313,-2.083526,-13.7,-0.012696
40319,"Blach, Ty",621389,SI,89.3,L,1.26,2.05,18.65173,-22.809362,3.15,...,-23.0,19.0,-0.3,-0.3,-0.190638,0.34827,-3.700645,-1.944798,0.3,0.015537


In [98]:
# Calendar cutoff between the two halves: games before it count as first half,
# games on or after it as second half.
# NOTE(review): presumably the 2023 All-Star break — confirm it matches how the
# split-half metric CSVs were produced.
SPLIT_DATE = pd.Timestamp('2023-07-11')


def _mean_pred_by_pitcher(pitches):
    """Aggregate pitch-level predictions to one row per pitcher.

    Parameters
    ----------
    pitches : pandas.DataFrame
        Pitch rows with 'player_name', 'pitcher' and 'xgb_pred' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'player_name', 'pitcher', 'mean', 'count', sorted by ascending
        'mean'. 'count' is the number of non-null xgb_pred values, so pitches
        that were never scored do not contribute to it.
    """
    return (
        pitches
        .groupby(['player_name', 'pitcher'])['xgb_pred']
        .agg(['mean', 'count'])
        # Turn the group keys back into ordinary columns explicitly instead of
        # relying on as_index=False being honoured for a list aggregation.
        .reset_index()
        .sort_values('mean', ascending=True)
    )


first_half_pred = _mean_pred_by_pitcher(statcast_data.loc[statcast_data['game_date'] < SPLIT_DATE])
second_half_pred = _mean_pred_by_pitcher(statcast_data.loc[statcast_data['game_date'] >= SPLIT_DATE])

In [99]:
# Give the second-half aggregates their own column names so they cannot be
# confused with the first-half 'mean'/'count' columns once the halves are joined.
second_half_names = dict(mean='second_half_pred', count='second_half_count')
second_half_pred = second_half_pred.rename(columns=second_half_names)

In [100]:
# Attach the aggregated predictions to each half's metric table. A left join
# keeps every row of the metric table; pitchers with no prediction get NaN.
join_on_pitcher_id = dict(left_on='MLBAMID', right_on='pitcher', how='left')
first_half = pd.merge(first_half, first_half_pred, **join_on_pitcher_id)
second_half = pd.merge(second_half, second_half_pred, **join_on_pitcher_id)

In [101]:
# NOTE(review): this appears to be a no-op. second_half_pred was already renamed
# before the merge, so the merged frame has no 'mean'/'count' columns left to
# rename (rename silently ignores missing keys). Confirm and consider removing.
second_half = second_half.rename(columns={'mean': 'second_half_pred', 'count': 'second_half_count'})

In [102]:
# Correlate the first-half metrics with each other within the same half.
# 'mean' is the per-pitcher average xgb_pred merged in from first_half_pred.
first_half_corr_cols = ['ERA', 'FIP', 'xFIP', 'SIERA', 'botERA', 'Pitching+', 'mean']
first_half.loc[:, first_half_corr_cols].corr()

Unnamed: 0,ERA,FIP,xFIP,SIERA,botERA,Pitching+,mean
ERA,1.0,0.692828,0.526362,0.553805,0.329677,-0.370031,0.344815
FIP,0.692828,1.0,0.788027,0.771558,0.505739,-0.505073,0.489296
xFIP,0.526362,0.788027,1.0,0.93307,0.516102,-0.533723,0.521812
SIERA,0.553805,0.771558,0.93307,1.0,0.539595,-0.573944,0.541369
botERA,0.329677,0.505739,0.516102,0.539595,1.0,-0.824954,0.805787
Pitching+,-0.370031,-0.505073,-0.533723,-0.573944,-0.824954,1.0,-0.815469
mean,0.344815,0.489296,0.521812,0.541369,0.805787,-0.815469,1.0


In [103]:
# Put both halves side by side, one row per MLBAMID. The default inner join keeps
# only pitchers present in both tables; clashing column names get _1H / _2H suffixes.
full = first_half.merge(second_half, on='MLBAMID', suffixes=('_1H', '_2H'))

In [104]:
# List the merged column names to see which ones received the _1H / _2H suffixes.
full.columns

Index(['Name_1H', 'Team_1H', 'IP_1H', 'ERA_1H', 'xERA_1H', 'FIP_1H', 'xFIP_1H',
       'SIERA_1H', 'Pitching+_1H', 'botERA_1H', 'NameASCII_1H', 'PlayerId_1H',
       'MLBAMID', 'player_name_1H', 'pitcher_1H', 'mean', 'count', 'Name_2H',
       'Team_2H', 'IP_2H', 'ERA_2H', 'xERA_2H', 'FIP_2H', 'xFIP_2H',
       'SIERA_2H', 'Pitching+_2H', 'botERA_2H', 'NameASCII_2H', 'PlayerId_2H',
       'player_name_2H', 'pitcher_2H', 'second_half_pred',
       'second_half_count'],
      dtype='object')

In [108]:
# Cross-half correlation matrix: first-half columns followed by second-half columns.
# 'mean' is the first-half average xgb_pred; 'second_half_pred' is its second-half counterpart.
split_half_corr_cols = [
    'ERA_1H', 'botERA_1H', 'Pitching+_1H', 'mean',
    'ERA_2H', 'FIP_2H', 'botERA_2H', 'Pitching+_2H', 'second_half_pred',
]
full.loc[:, split_half_corr_cols].corr()

Unnamed: 0,ERA_1H,botERA_1H,Pitching+_1H,mean,ERA_2H,FIP_2H,botERA_2H,Pitching+_2H,second_half_pred
ERA_1H,1.0,0.229811,-0.266938,0.261404,0.209014,0.31613,0.063614,-0.095137,0.177779
botERA_1H,0.229811,1.0,-0.838773,0.809671,0.367389,0.381582,0.715541,-0.623306,0.652657
Pitching+_1H,-0.266938,-0.838773,1.0,-0.836425,-0.309551,-0.342017,-0.61908,0.737687,-0.692048
mean,0.261404,0.809671,-0.836425,1.0,0.385989,0.403046,0.593342,-0.651678,0.811912
ERA_2H,0.209014,0.367389,-0.309551,0.385989,1.0,0.795017,0.466702,-0.460661,0.450594
FIP_2H,0.31613,0.381582,-0.342017,0.403046,0.795017,1.0,0.504193,-0.505034,0.480625
botERA_2H,0.063614,0.715541,-0.61908,0.593342,0.466702,0.504193,1.0,-0.836706,0.805446
Pitching+_2H,-0.095137,-0.623306,0.737687,-0.651678,-0.460661,-0.505034,-0.836706,1.0,-0.827576
second_half_pred,0.177779,0.652657,-0.692048,0.811912,0.450594,0.480625,0.805446,-0.827576,1.0
