In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from scipy.special import logit
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [4]:
pitchers = pd.read_csv('pitchers17.csv')
pitchers.head()

Unnamed: 0,pitcher,y,bf,bb,k,hr,h,lo,po,fo,go
0,Aardsma_David_430911,2012,5,1,1,1,1,0,0,0,2
1,Aardsma_David_430911,2013,172,13,36,7,39,12,8,28,28
2,Aardsma_David_430911,2015,126,11,35,6,25,9,5,21,17
3,Abad_Fernando_472551,2012,207,18,38,6,56,4,16,28,38
4,Abad_Fernando_472551,2013,166,10,32,3,42,11,17,22,29


In [5]:
# 2a.i.
pitchers.bb.sum()/pitchers.bf.sum()

0.07491447652363538

In [6]:
# 2a.ii.
pitchers.k.sum()/pitchers.bf.sum()

0.2063503168798128

In [7]:
# 2a.iii.
pitchers.k.sum()/(pitchers.bf.sum() - pitchers.bb.sum())

0.22306079994028244

In [8]:
# 2a.iv.
pitchers.hr.sum()/pitchers.bf.sum()

0.027626467982645436

In [9]:
# 2a.v.
pitchers.hr.sum()/(pitchers.bf.sum() - pitchers.bb.sum() - pitchers.k.sum())

0.03843761614721209

In [10]:
# 2a.vi.
(pitchers.h.sum() - pitchers.hr.sum())/(pitchers.bf.sum() - pitchers.bb.sum() - pitchers.k.sum() - pitchers.hr.sum())

0.2914712660905554

In [46]:
# 2b
pitchers500 = pitchers[pitchers.bf >= 500]
pitchers500['krate'] = pitchers500.k/(pitchers500.bf - pitchers500.bb)

In [50]:
pitchers500[pitchers500.y == 2016][['pitcher','bf','krate']].nlargest(10, 'krate')

Unnamed: 0,pitcher,bf,krate
1190,Fernandez_Jose_605228,731,0.370968
3524,Scherzer_Max_453286,900,0.335697
3721,Strasburg_Stephen_544931,597,0.330325
2021,Kershaw_Clayton_477132,543,0.322702
3775,Syndergaard_Noah_592789,742,0.310984
3183,Ray_Robbie_592662,772,0.30922
3445,Salazar_Danny_517593,581,0.309021
3942,Velasquez_Vincent_592826,550,0.300395
3964,Verlander_Justin_434378,902,0.300236
120,Archer_Chris_502042,850,0.297573


In [51]:
pitchers500[pitchers500.y == 2017][['pitcher','bf','krate']].nlargest(10, 'krate')

Unnamed: 0,pitcher,bf,krate
3452,Sale_Chris_519242,851,0.381188
3525,Scherzer_Max_453286,778,0.369655
3184,Ray_Robbie_592662,662,0.367003
2058,Kluber_Corey_446372,775,0.357625
1754,Hill_Rich_448179,551,0.33002
2948,Peacock_Brad_502748,546,0.329243
121,Archer_Chris_502042,852,0.314394
3556,Severino_Luis_622663,783,0.314208
3722,Strasburg_Stephen_544931,696,0.311927
2022,Kershaw_Clayton_477132,679,0.311248


In [52]:
# 2c
pitchers500['hrate'] = (pitchers500.h - pitchers500.hr)/(pitchers500.bf - pitchers500.bb - pitchers500.k - pitchers500.hr)

In [53]:
pitchers500[pitchers500.y == 2016][['pitcher','bf','hrate']].nlargest(10, 'hrate')

Unnamed: 0,pitcher,bf,hrate
3183,Ray_Robbie_592662,772,0.347732
2939,Paxton_James_572020,508,0.34626
2958,Pelfrey_Mike_460059,541,0.341981
759,Cole_Gerrit_543037,503,0.339726
2980,Perdomo_Luis_606131,655,0.336066
2511,McHugh_Collin_543521,795,0.335185
412,Bradley_Archie_605151,630,0.334951
1047,Duffey_Tyler_608648,593,0.334118
4009,Wacha_Michael_608379,600,0.333333
3775,Syndergaard_Noah_592789,742,0.332627


In [54]:
pitchers500[pitchers500.y == 2017][['pitcher','bf','hrate']].nlargest(10, 'hrate')

Unnamed: 0,pitcher,bf,hrate
2619,Montero_Rafael_606160,545,0.361345
3778,Taillon_Jameson_592791,584,0.348148
3223,Richard_Clayton_453385,852,0.346154
248,Bauer_Trevor_545333,749,0.333333
2722,Nelson_Jimmy_519076,727,0.333333
1361,Gausman_Kevin_592332,816,0.331471
2564,Miley_Wade_489119,727,0.32906
3055,Pivetta_Nick_601713,584,0.328729
788,Colon_Bartolo_112526,648,0.328629
396,Boyd_Matt_571510,602,0.32783


In [165]:
# 3a.i.
pitchers200 = pitchers[pitchers.bf >= 200].reset_index(drop=True)
pitcher_list = pitchers200.pitcher.value_counts()
pitcher_list = pitcher_list[pitcher_list > 1].index.unique().to_list()
pitchers200 = pitchers200[pitchers200.pitcher.isin(pitcher_list)]

In [221]:
pitchers200['bbrate'] = pitchers200.bb/pitchers200.bf
pitchers200['krate'] = pitchers200.k/(pitchers200.bf - pitchers200.bb)
pitchers200['hrrate'] = pitchers200.hr/(pitchers200.bf - pitchers200.bb - pitchers200.k)
pitchers200['hrate'] = (pitchers200.h - pitchers200.hr)/(pitchers200.bf - pitchers200.bb - pitchers200.k - pitchers200.hr)
# three true outcomes rates
pitchers200['ttorate'] = (pitchers200.bb + pitchers200.hr + pitchers200.k)/(pitchers200.bb + pitchers200.h + pitchers200.k)
# soft contact rate
pitchers200['scrate'] = (pitchers200.po + pitchers200.go)/(pitchers200.bf - pitchers200.bb - pitchers200.k)
# hard contact rate
pitchers200['hcrate'] = (pitchers200.lo + pitchers200.fo + pitchers200.hr)/(pitchers200.bf - pitchers200.bb - pitchers200.k)
# ground out rate - correlated with Ks?
pitchers200['gorate'] = pitchers200.go/(pitchers200.go + pitchers200.fo + pitchers200.lo + pitchers200.po)  
# fly out rate - correlated with HRs?
pitchers200['forate'] = pitchers200.fo/(pitchers200.go + pitchers200.fo + pitchers200.lo + pitchers200.po) 
# pop out rate 
pitchers200['porate'] = pitchers200.po/(pitchers200.go + pitchers200.fo + pitchers200.lo + pitchers200.po) 
# line out rate 
pitchers200['lorate'] = pitchers200.lo/(pitchers200.go + pitchers200.fo + pitchers200.lo + pitchers200.po) 

pitchers200['ydiff'] = pitchers200.groupby('pitcher')['y'].diff(1).fillna(0)
pitchers200['bbrate_prev'] = pitchers200.bbrate.shift()
pitchers200['krate_prev'] = pitchers200.krate.shift()
pitchers200['hrrate_prev'] = pitchers200.hrrate.shift()
pitchers200['hrate_prev'] = pitchers200.hrate.shift()
pitchers200['ttorate_prev'] = pitchers200.ttorate.shift()
pitchers200['scrate_prev'] = pitchers200.scrate.shift()
pitchers200['hcrate_prev'] = pitchers200.hcrate.shift()
pitchers200['gorate_prev'] = pitchers200.gorate.shift()
pitchers200['forate_prev'] = pitchers200.forate.shift()
pitchers200['lorate_prev'] = pitchers200.lorate.shift()
pitchers200['porate_prev'] = pitchers200.porate.shift()

pitchersprev = pitchers200[pitchers200.ydiff == 1]
pitchersprev['yprev'] = pitchersbbprev.y - 1

In [206]:
smf.ols('bbrate ~ bbrate_prev', pitchersprev).fit().summary()

0,1,2,3
Dep. Variable:,bbrate,R-squared:,0.286
Model:,OLS,Adj. R-squared:,0.285
Method:,Least Squares,F-statistic:,475.1
Date:,"Thu, 01 Apr 2021",Prob (F-statistic):,7.219999999999999e-89
Time:,12:21:45,Log-Likelihood:,3034.6
No. Observations:,1189,AIC:,-6065.0
Df Residuals:,1187,BIC:,-6055.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0335,0.002,17.934,0.000,0.030,0.037
bbrate_prev,0.5474,0.025,21.797,0.000,0.498,0.597

0,1,2,3
Omnibus:,39.471,Durbin-Watson:,2.069
Prob(Omnibus):,0.0,Jarque-Bera (JB):,47.781
Skew:,0.379,Prob(JB):,4.21e-11
Kurtosis:,3.624,Cond. No.,46.1


In [174]:
# 3b.i.
smf.ols('krate ~ krate_prev', pitchersprev).fit().summary()

0,1,2,3
Dep. Variable:,krate,R-squared:,0.558
Model:,OLS,Adj. R-squared:,0.558
Method:,Least Squares,F-statistic:,1498.0
Date:,"Tue, 30 Mar 2021",Prob (F-statistic):,1.19e-212
Time:,16:47:18,Log-Likelihood:,2072.4
No. Observations:,1189,AIC:,-4141.0
Df Residuals:,1187,BIC:,-4131.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0583,0.005,12.505,0.000,0.049,0.067
krate_prev,0.7474,0.019,38.707,0.000,0.710,0.785

0,1,2,3
Omnibus:,87.997,Durbin-Watson:,2.15
Prob(Omnibus):,0.0,Jarque-Bera (JB):,127.11
Skew:,0.591,Prob(JB):,2.5e-28
Kurtosis:,4.081,Cond. No.,16.6


In [175]:
# 3c.i.
smf.ols('hrrate ~ hrrate_prev', pitchersprev).fit().summary()

0,1,2,3
Dep. Variable:,hrrate,R-squared:,0.069
Model:,OLS,Adj. R-squared:,0.068
Method:,Least Squares,F-statistic:,87.85
Date:,"Tue, 30 Mar 2021",Prob (F-statistic):,3.44e-20
Time:,16:47:32,Log-Likelihood:,3367.6
No. Observations:,1189,AIC:,-6731.0
Df Residuals:,1187,BIC:,-6721.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0273,0.001,23.285,0.000,0.025,0.030
hrrate_prev,0.2954,0.032,9.373,0.000,0.234,0.357

0,1,2,3
Omnibus:,34.41,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38.262
Skew:,0.381,Prob(JB):,4.91e-09
Kurtosis:,3.439,Cond. No.,76.3


In [176]:
# 3d.i.
smf.ols('hrate ~ hrate_prev', pitchersprev).fit().summary()

0,1,2,3
Dep. Variable:,hrate,R-squared:,0.029
Model:,OLS,Adj. R-squared:,0.028
Method:,Least Squares,F-statistic:,35.55
Date:,"Tue, 30 Mar 2021",Prob (F-statistic):,3.27e-09
Time:,16:47:42,Log-Likelihood:,2449.4
No. Observations:,1189,AIC:,-4895.0
Df Residuals:,1187,BIC:,-4885.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.2404,0.008,28.946,0.000,0.224,0.257
hrate_prev,0.1720,0.029,5.963,0.000,0.115,0.229

0,1,2,3
Omnibus:,12.209,Durbin-Watson:,1.967
Prob(Omnibus):,0.002,Jarque-Bera (JB):,18.016
Skew:,-0.061,Prob(JB):,0.000122
Kurtosis:,3.591,Cond. No.,34.9


In [278]:
# 3a.ii.
smf.ols('bbrate ~ bbrate_prev + krate_prev + forate_prev + hrrate_prev', pitchersprev).fit().summary()

0,1,2,3
Dep. Variable:,bbrate,R-squared:,0.298
Model:,OLS,Adj. R-squared:,0.295
Method:,Least Squares,F-statistic:,125.6
Date:,"Thu, 01 Apr 2021",Prob (F-statistic):,2.16e-89
Time:,13:49:11,Log-Likelihood:,3044.7
No. Observations:,1189,AIC:,-6079.0
Df Residuals:,1184,BIC:,-6054.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0313,0.003,9.456,0.000,0.025,0.038
bbrate_prev,0.5285,0.025,20.766,0.000,0.479,0.578
krate_prev,0.0332,0.009,3.762,0.000,0.016,0.050
forate_prev,-0.0257,0.009,-2.845,0.005,-0.043,-0.008
hrrate_prev,0.0795,0.044,1.817,0.069,-0.006,0.165

0,1,2,3
Omnibus:,25.523,Durbin-Watson:,2.05
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28.878
Skew:,0.303,Prob(JB):,5.36e-07
Kurtosis:,3.465,Cond. No.,86.0


In [250]:
# 3b.ii.
smf.ols('krate ~ krate_prev + bbrate_prev + forate_prev + gorate_prev', pitchersprev).fit().summary()

0,1,2,3
Dep. Variable:,krate,R-squared:,0.562
Model:,OLS,Adj. R-squared:,0.561
Method:,Least Squares,F-statistic:,380.1
Date:,"Thu, 01 Apr 2021",Prob (F-statistic):,1.41e-210
Time:,13:05:24,Log-Likelihood:,2078.2
No. Observations:,1189,AIC:,-4146.0
Df Residuals:,1184,BIC:,-4121.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1234,0.023,5.316,0.000,0.078,0.169
krate_prev,0.7319,0.020,36.157,0.000,0.692,0.772
bbrate_prev,0.0804,0.057,1.402,0.161,-0.032,0.193
forate_prev,-0.1053,0.036,-2.895,0.004,-0.177,-0.034
gorate_prev,-0.0756,0.025,-3.025,0.003,-0.125,-0.027

0,1,2,3
Omnibus:,82.005,Durbin-Watson:,2.146
Prob(Omnibus):,0.0,Jarque-Bera (JB):,114.871
Skew:,0.572,Prob(JB):,1.14e-25
Kurtosis:,4.005,Cond. No.,55.6


In [248]:
# 3c.ii.
smf.ols('hrrate ~ hrrate_prev + hcrate_prev + krate_prev + forate_prev + gorate_prev', pitchersprev).fit().summary()

0,1,2,3
Dep. Variable:,hrrate,R-squared:,0.147
Model:,OLS,Adj. R-squared:,0.143
Method:,Least Squares,F-statistic:,40.74
Date:,"Thu, 01 Apr 2021",Prob (F-statistic):,9.53e-39
Time:,13:04:18,Log-Likelihood:,3419.6
No. Observations:,1189,AIC:,-6827.0
Df Residuals:,1183,BIC:,-6797.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0639,0.011,5.583,0.000,0.041,0.086
hrrate_prev,0.1418,0.037,3.871,0.000,0.070,0.214
hcrate_prev,0.0523,0.022,2.404,0.016,0.010,0.095
krate_prev,-0.0171,0.006,-2.651,0.008,-0.030,-0.004
forate_prev,-0.0571,0.012,-4.582,0.000,-0.082,-0.033
gorate_prev,-0.0523,0.011,-4.659,0.000,-0.074,-0.030

0,1,2,3
Omnibus:,29.271,Durbin-Watson:,1.977
Prob(Omnibus):,0.0,Jarque-Bera (JB):,32.096
Skew:,0.348,Prob(JB):,1.07e-07
Kurtosis:,3.406,Cond. No.,119.0


In [259]:
# 3d.ii.
smf.ols('hrate ~ hrate_prev + bbrate_prev  + ttorate_prev + gorate_prev + forate_prev + lorate_prev + porate_prev', pitchersprev).fit().summary()

0,1,2,3
Dep. Variable:,hrate,R-squared:,0.073
Model:,OLS,Adj. R-squared:,0.069
Method:,Least Squares,F-statistic:,15.62
Date:,"Thu, 01 Apr 2021",Prob (F-statistic):,2.57e-17
Time:,13:15:08,Log-Likelihood:,2477.2
No. Observations:,1189,AIC:,-4940.0
Df Residuals:,1182,BIC:,-4905.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.2370,0.013,17.698,0.000,0.211,0.263
hrate_prev,0.0659,0.033,2.007,0.045,0.001,0.130
bbrate_prev,0.1185,0.045,2.616,0.009,0.030,0.207
ttorate_prev,-0.0756,0.016,-4.834,0.000,-0.106,-0.045
gorate_prev,0.0706,0.008,8.497,0.000,0.054,0.087
forate_prev,0.0996,0.016,6.050,0.000,0.067,0.132
lorate_prev,0.1111,0.021,5.181,0.000,0.069,0.153
porate_prev,-0.0443,0.025,-1.781,0.075,-0.093,0.004

0,1,2,3
Omnibus:,12.363,Durbin-Watson:,1.987
Prob(Omnibus):,0.002,Jarque-Bera (JB):,18.97
Skew:,0.022,Prob(JB):,7.6e-05
Kurtosis:,3.617,Cond. No.,6140000000000000.0
