## How to measure run support

* Model the run environment (RE), based on:
  * year
  * park
  * DH presence
  * home/away?
* Predict RE for each game
* Bucket games by run environment
* Compute the runs-scored (RS) distribution by run environment bucket
  * Convert these to winning percentages
* Merge in the winning percentage based on RS and RE

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pyretro.boxball_loader as bbl

In [3]:
# Team-level game logs (one row per team per game), regular season only,
# restricted to the seasons selected by bbl.Eras.ThirtyTeams.
glt = bbl.load_gamelog_teams(
    game_types=bbl.GameType.RS,
    seasons=bbl.Eras.ThirtyTeams,
).reset_index(drop=True)

# 'runs_scored_9': runs scored through the first nine innings only
# (drops extra-inning scoring, e.g. zombie-runner runs).
# Each parsed linescore is expanded into one column per inning; missing
# innings (NaN) count as zero runs.
by_inning = glt['linescore_parsed'].apply(pd.Series).fillna(0)
glt['runs_scored_9'] = by_inning.iloc[:, :9].sum(axis=1).astype(int)

# Attach the DH flag from the game-level table, keyed by game_id.
g = bbl.load_games().set_index('game_id')
dh_fl = g['dh_fl'].eq('T')
glt = glt.merge(dh_fl, left_on='game_id', right_index=True).sort_index()

glt


Unnamed: 0,game_id,date,double_header,yr,game_type,park_id,team,team_league,team_game_number,runs_scored,...,batting_9_name,batting_9_position,HA,opp,runs_allowed,W,L,linescore_parsed,runs_scored_9,dh_fl
0,TBA199803310,1998-03-31,0,1998,RS,STP01,DET,AL,1,11,...,Billy Ripken,6.0,A,TBA,6,True,False,"[0.0, 4.0, 2.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0]",11,True
1,SEA199803310,1998-03-31,0,1998,RS,SEA02,CLE,AL,1,10,...,Enrique Wilson,4.0,A,SEA,9,True,False,"[0.0, 0.0, 2.0, 1.0, 0.0, 3.0, 0.0, 4.0, 0.0]",10,True
2,TEX199803310,1998-03-31,0,1998,RS,ARL02,CHA,AL,1,9,...,Mike Caruso,6.0,A,TEX,2,True,False,"[0.0, 0.0, 0.0, 0.0, 7.0, 0.0, 1.0, 0.0, 1.0]",9,True
3,ARI199803310,1998-03-31,0,1998,RS,PHO01,COL,NL,1,9,...,Darryl Kile,1.0,A,ARI,2,True,False,"[0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 5.0, 1.0, 0.0]",9,False
4,ATL199803310,1998-03-31,0,1998,RS,ATL02,MIL,NL,1,1,...,Cal Eldred,1.0,A,ATL,2,False,True,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118405,CIN202210050,2022-10-05,0,2022,RS,CIN09,CIN,NL,162,2,...,Austin Romine,2.0,H,CHN,15,False,True,"[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",2,True
118406,MIA202210050,2022-10-05,0,2022,RS,MIA02,MIA,NL,162,12,...,Lewin Diaz,3.0,H,ATL,9,True,False,"[0.0, 3.0, 0.0, 0.0, 2.0, 5.0, 2.0, 0.0, nan]",12,True
118407,OAK202210050,2022-10-05,0,2022,RS,OAK01,ANA,AL,162,2,...,Max Stassi,2.0,A,OAK,3,False,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0]",2,True
118408,MIA202210050,2022-10-05,0,2022,RS,MIA02,ATL,NL,162,9,...,Ehire Adrianza,5.0,A,MIA,12,False,True,"[0.0, 0.0, 2.0, 1.0, 0.0, 3.0, 2.0, 0.0, 1.0]",9,True


In [4]:
# Sanity check: full-game runs vs. runs through nine innings.
# The two means should be close and the correlation should be ~1 if the
# per-inning rows line up with the game rows.
tmp = glt.loc[:, ['runs_scored', 'runs_scored_9']]
(tmp.mean(), tmp.corr())

(runs_scored      4.590491
 runs_scored_9    4.507010
 dtype: float64,
                runs_scored  runs_scored_9
 runs_scored       1.000000       0.990301
 runs_scored_9     0.990301       1.000000)

In [5]:
# Model the run environment: linear regression of 9-inning runs scored on
# the DH flag plus one-hot indicators for season, park, batting team and
# pitching (opposing) team. drop_first=True removes one reference level per
# factor, so every coefficient is relative to that dropped level.

yr_dummies = pd.get_dummies(glt['yr'], prefix='yr', drop_first=True)
park_dummies = pd.get_dummies(glt['park_id'], drop_first=True)
# get_dummies joins prefix and level with '_' by default, so these columns
# come out as 'tm_bat__XXX' / 'tm_pit__XXX' (double underscore).
team_bat_dummies = pd.get_dummies(glt['team'], drop_first=True, prefix='tm_bat_')
team_pit_dummies = pd.get_dummies(glt['opp'], drop_first=True, prefix='tm_pit_')

X = pd.concat([glt['dh_fl'], yr_dummies, park_dummies, team_bat_dummies, team_pit_dummies], axis=1)
y = glt['runs_scored_9']
model = LinearRegression()
model.fit(X, y)

# Display the intercept and the coefficient fitted for each feature column.
model.intercept_, dict(zip(X.columns, model.coef_))

(4.409795256493723,
 {'dh_fl': 0.2027621538571567,
  'yr_1999': 0.33137423899415414,
  'yr_2000': 0.3963458022841933,
  'yr_2001': 0.022800629030101494,
  'yr_2002': -0.13559943383897016,
  'yr_2003': -0.032059313464395,
  'yr_2004': 0.04078087626480584,
  'yr_2005': -0.17135506229966999,
  'yr_2006': 0.0979170139633877,
  'yr_2007': 0.02402599971383644,
  'yr_2008': -0.13638628948935203,
  'yr_2009': -0.17694495344194128,
  'yr_2010': -0.40982940914505617,
  'yr_2011': -0.5300228100001713,
  'yr_2012': -0.46446110156011766,
  'yr_2013': -0.6461847545904832,
  'yr_2014': -0.7419731387732392,
  'yr_2015': -0.5464718492891066,
  'yr_2016': -0.30639132791255846,
  'yr_2017': -0.159891419941133,
  'yr_2018': -0.364677619556024,
  'yr_2019': 0.005776503355530482,
  'yr_2020': -0.2857398535251919,
  'yr_2021': -0.31669246472475876,
  'yr_2022': -0.6493392955257623,
  'ARL02': 0.7953700182797323,
  'ARL03': 0.08007298685333697,
  'ATL02': 0.03134550131963437,
  'ATL03': 0.46837363144942706,
 

In [6]:
# Average fitted coefficient over every feature whose name contains neither
# 'yr' nor 'dh' (i.e. the park and team indicator columns).
np.mean([
    coef
    for name, coef in dict(zip(X.columns, model.coef_)).items()
    if not ('yr' in name or 'dh' in name)
])

0.1597253566175219

In [7]:
# Zero out the batting-team and pitching-team indicator columns of X in place.
# Iterate the column labels directly; adding the two dummy frames together
# just to obtain their combined column names is wasted work.
for col in [*team_bat_dummies.columns, *team_pit_dummies.columns]:
    X[col] = 0

In [8]:
# Predict RE for each game (ignoring the identity of the batting/pitching teams).
# The team indicator columns are zeroed in place so that only the DH flag,
# season and park terms contribute to the prediction.
# NOTE(review): the team dummies were built with drop_first=True, so all-zero
# team indicators correspond to the dropped reference team in both roles, not
# to a league-average matchup -- re_pred is therefore shifted by a constant.
# Iterate the column labels directly instead of adding the two dummy frames.
for col in [*team_bat_dummies.columns, *team_pit_dummies.columns]:
    X[col] = 0
re_pred = model.predict(X)
glt['re_pred'] = re_pred
# Fixed seed so the displayed sample is the same on every run.
glt.sample(20, random_state=0)

Unnamed: 0,game_id,date,double_header,yr,game_type,park_id,team,team_league,team_game_number,runs_scored,...,batting_9_position,HA,opp,runs_allowed,W,L,linescore_parsed,runs_scored_9,dh_fl,re_pred
37436,FLO200508120,2005-08-12,0,2005,RS,MIA01,SFN,NL,114,1,...,1.0,A,FLO,0,True,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",1,False,4.132308
2975,CHN199807220,1998-07-22,0,1998,RS,CHI11,MON,NL,100,5,...,1.0,A,CHN,9,False,True,"[0.0, 0.0, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 2.0]",5,False,4.704706
101164,BAL201808290,2018-08-29,0,2018,RS,BAL12,TOR,AL,133,5,...,4.0,A,BAL,10,False,True,"[1.0, 3.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",5,True,4.490681
93154,SEA201705040,2017-05-04,0,2017,RS,SEA03,SEA,AL,29,11,...,8.0,H,ANA,3,True,False,"[1.0, 0.0, 3.0, 2.0, 0.0, 1.0, 1.0, 3.0, nan]",11,True,4.101644
8834,BOS199909010,1999-09-01,0,1999,RS,BOS07,BOS,AL,133,4,...,5.0,H,KCA,3,True,False,"[1.0, 0.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0, nan]",4,True,5.535466
27580,ANA200308030,2003-08-03,0,2003,RS,ANA01,TOR,AL,111,4,...,4.0,A,ANA,0,True,False,"[0.0, 0.0, 0.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0]",4,True,4.580498
80847,TBA201407260,2014-07-26,0,2014,RS,STP01,BOS,AL,104,0,...,2.0,A,TBA,3,False,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0,True,3.831788
49931,OAK200805200,2008-05-20,0,2008,RS,OAK01,TBA,AL,46,3,...,4.0,A,OAK,2,True,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0]",3,True,4.346354
85012,CIN201507040,2015-07-04,0,2015,RS,CIN09,MIL,NL,83,7,...,1.0,A,CIN,3,True,False,"[0.0, 0.0, 0.0, 1.0, 6.0, 0.0, 0.0, 0.0, 0.0]",7,False,4.351645
72387,HOU201209160,2012-09-16,0,2012,RS,HOU03,HOU,NL,147,7,...,1.0,H,PHI,6,True,False,"[0.0, 0.0, 2.0, 0.0, 0.0, 1.0, 4.0, 0.0, nan]",7,False,4.093479


In [9]:
# Compare actual 9-inning runs with the predicted run environment:
# column means, then their correlation matrix.
re_check = glt.loc[:, ['runs_scored_9', 're_pred']]
(re_check.mean(), re_check.corr())

(runs_scored_9    4.507010
 re_pred          4.518767
 dtype: float64,
                runs_scored_9   re_pred
 runs_scored_9       1.000000  0.149627
 re_pred             0.149627  1.000000)

In [10]:
# RSI: 9-inning runs scored divided by the predicted run environment for the
# game (presumably a "run support index" -- 1.0 means exactly the expected runs).
glt['rsi'] = glt['runs_scored_9'].div(glt['re_pred'])

In [11]:
# Split games into 20 equal-frequency buckets of predicted run environment,
# then count, per bucket, how many games ended with each 9-inning run total.
# Run totals never observed in a bucket are filled with 0.
bin_ids, bins = pd.qcut(glt['re_pred'], q=20, retbins=True)
score_dists = (
    glt['runs_scored_9']
    .groupby(bin_ids)
    .value_counts()
    .unstack()
    .fillna(0)
)
score_dists

runs_scored_9,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,28,29,30
re_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(3.2640000000000002, 3.774]",545.0,810.0,1008.0,917.0,750.0,578.0,500.0,297.0,222.0,148.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(3.774, 3.911]",500.0,845.0,958.0,885.0,735.0,635.0,447.0,324.0,231.0,152.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(3.911, 4.023]",433.0,733.0,828.0,898.0,786.0,621.0,454.0,349.0,236.0,167.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(4.023, 4.115]",453.0,726.0,885.0,942.0,828.0,654.0,475.0,383.0,260.0,153.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(4.115, 4.173]",444.0,654.0,825.0,866.0,794.0,594.0,517.0,372.0,264.0,197.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(4.173, 4.258]",416.0,712.0,780.0,919.0,830.0,634.0,492.0,387.0,266.0,170.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"(4.258, 4.311]",366.0,667.0,802.0,838.0,798.0,686.0,527.0,374.0,274.0,212.0,...,2.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
"(4.311, 4.375]",367.0,624.0,794.0,847.0,811.0,653.0,533.0,392.0,288.0,200.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(4.375, 4.436]",324.0,634.0,802.0,884.0,767.0,647.0,543.0,384.0,284.0,189.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(4.436, 4.496]",358.0,629.0,838.0,849.0,806.0,668.0,531.0,402.0,267.0,179.0,...,3.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Running total of games across the runs-scored columns within each RE bucket:
# entry (bucket, r) is the number of games in that bucket with <= r runs.
# A row-wise cumulative sum gives this directly, with no need to reshape to
# long form and back.
score_dists_cum = score_dists.cumsum(axis=1)
score_dists_cum


runs_scored_9,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,28,29,30
re_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(3.2640000000000002, 3.774]",545.0,1355.0,2363.0,3280.0,4030.0,4608.0,5108.0,5405.0,5627.0,5775.0,...,6002.0,6002.0,6002.0,6002.0,6002.0,6002.0,6002.0,6002.0,6002.0,6002.0
"(3.774, 3.911]",500.0,1345.0,2303.0,3188.0,3923.0,4558.0,5005.0,5329.0,5560.0,5712.0,...,5998.0,5998.0,5998.0,5998.0,5998.0,5998.0,5998.0,5998.0,5998.0,5998.0
"(3.911, 4.023]",433.0,1166.0,1994.0,2892.0,3678.0,4299.0,4753.0,5102.0,5338.0,5505.0,...,5782.0,5783.0,5784.0,5784.0,5784.0,5784.0,5784.0,5784.0,5784.0,5784.0
"(4.023, 4.115]",453.0,1179.0,2064.0,3006.0,3834.0,4488.0,4963.0,5346.0,5606.0,5759.0,...,6056.0,6056.0,6056.0,6056.0,6056.0,6056.0,6056.0,6056.0,6056.0,6056.0
"(4.115, 4.173]",444.0,1098.0,1923.0,2789.0,3583.0,4177.0,4694.0,5066.0,5330.0,5527.0,...,5831.0,5832.0,5832.0,5832.0,5832.0,5832.0,5832.0,5832.0,5832.0,5832.0
"(4.173, 4.258]",416.0,1128.0,1908.0,2827.0,3657.0,4291.0,4783.0,5170.0,5436.0,5606.0,...,5956.0,5957.0,5957.0,5957.0,5957.0,5958.0,5958.0,5958.0,5958.0,5958.0
"(4.258, 4.311]",366.0,1033.0,1835.0,2673.0,3471.0,4157.0,4684.0,5058.0,5332.0,5544.0,...,5937.0,5937.0,5938.0,5938.0,5940.0,5940.0,5940.0,5940.0,5940.0,5940.0
"(4.311, 4.375]",367.0,991.0,1785.0,2632.0,3443.0,4096.0,4629.0,5021.0,5309.0,5509.0,...,5927.0,5928.0,5928.0,5928.0,5928.0,5928.0,5928.0,5928.0,5928.0,5928.0
"(4.375, 4.436]",324.0,958.0,1760.0,2644.0,3411.0,4058.0,4601.0,4985.0,5269.0,5458.0,...,5882.0,5882.0,5882.0,5882.0,5882.0,5882.0,5882.0,5882.0,5882.0,5882.0
"(4.436, 4.496]",358.0,987.0,1825.0,2674.0,3480.0,4148.0,4679.0,5081.0,5348.0,5527.0,...,5938.0,5939.0,5939.0,5940.0,5940.0,5940.0,5940.0,5940.0,5940.0,5940.0


In [13]:
# Convert counts to a mid-rank percentile per (bucket, runs scored):
# (games with fewer runs + half of the games with the same runs) / games in
# the bucket. The bucket total is the last (largest) cumulative count.
bucket_totals = score_dists_cum.max(axis=1)
wps = score_dists_cum.sub(score_dists.div(2)).div(bucket_totals, axis=0)
wps


runs_scored_9,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,28,29,30
re_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(3.2640000000000002, 3.774]",0.045402,0.158281,0.30973,0.470093,0.608964,0.719593,0.809397,0.875791,0.919027,0.94985,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"(3.774, 3.911]",0.041681,0.153801,0.304101,0.457736,0.592781,0.706986,0.797182,0.861454,0.907719,0.939647,...,0.999917,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"(3.911, 4.023]",0.037431,0.138226,0.273167,0.422372,0.567946,0.689575,0.782503,0.851919,0.90249,0.937327,...,0.999568,0.999741,0.999914,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"(4.023, 4.115]",0.037401,0.134742,0.267751,0.418593,0.564729,0.687087,0.780301,0.851139,0.904227,0.938326,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"(4.115, 4.173]",0.038066,0.132202,0.259002,0.403978,0.546296,0.665295,0.760545,0.836763,0.891289,0.930813,...,0.999743,0.999914,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"(4.173, 4.258]",0.034911,0.129574,0.254783,0.397365,0.544142,0.667002,0.761497,0.835264,0.890064,0.926653,...,0.99958,0.999748,0.999832,0.999832,0.999832,0.999916,1.0,1.0,1.0,1.0
"(4.258, 4.311]",0.030808,0.117761,0.241414,0.379461,0.517172,0.642088,0.744192,0.820034,0.874579,0.915488,...,0.999327,0.999495,0.999579,0.999663,0.999832,1.0,1.0,1.0,1.0,1.0
"(4.311, 4.375]",0.030955,0.114541,0.234143,0.372554,0.512399,0.635881,0.735914,0.813934,0.871289,0.912449,...,0.999747,0.999916,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"(4.375, 4.436]",0.027542,0.108977,0.231044,0.374362,0.514706,0.634903,0.736059,0.814859,0.871642,0.91185,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"(4.436, 4.496]",0.030135,0.113215,0.2367,0.378704,0.518013,0.642088,0.743013,0.821549,0.877862,0.915404,...,0.999411,0.999747,0.999832,0.999916,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# Preview the percentile table in long form: one row per
# (RE bucket, runs scored) with the value in an 'rs_wp' column.
wps.stack().to_frame('rs_wp').reset_index()

Unnamed: 0,re_pred,runs_scored_9,rs_wp
0,"(3.2640000000000002, 3.774]",0,0.045402
1,"(3.2640000000000002, 3.774]",1,0.158281
2,"(3.2640000000000002, 3.774]",2,0.309730
3,"(3.2640000000000002, 3.774]",3,0.470093
4,"(3.2640000000000002, 3.774]",4,0.608964
...,...,...,...
595,"(5.418, 12.41]",25,1.000000
596,"(5.418, 12.41]",26,1.000000
597,"(5.418, 12.41]",28,1.000000
598,"(5.418, 12.41]",29,1.000000


In [15]:
# Left side of the lookup: each game's RE bucket next to its 9-inning runs,
# sharing glt's row index.
left = pd.DataFrame({
    bin_ids.name: bin_ids,
    'runs_scored_9': glt['runs_scored_9'],
})
left

Unnamed: 0,re_pred,runs_scored_9
0,"(4.542, 4.599]",11
1,"(5.418, 12.41]",10
2,"(5.103, 5.418]",9
3,"(4.992, 5.103]",9
4,"(4.436, 4.496]",1
...,...,...
118405,"(4.436, 4.496]",2
118406,"(3.774, 3.911]",12
118407,"(3.774, 3.911]",2
118408,"(3.774, 3.911]",9


In [16]:
# Right side of the lookup: long-form percentile table keyed by
# (re_pred bucket, runs_scored_9).
right = wps.stack().to_frame('rs_wp').reset_index()
# Fallback for an unnamed column level; a no-op when reset_index already
# produced a 'runs_scored_9' column.
right = right.rename(columns={'level_1': 'runs_scored_9'})
right

Unnamed: 0,re_pred,runs_scored_9,rs_wp
0,"(3.2640000000000002, 3.774]",0,0.045402
1,"(3.2640000000000002, 3.774]",1,0.158281
2,"(3.2640000000000002, 3.774]",2,0.309730
3,"(3.2640000000000002, 3.774]",3,0.470093
4,"(3.2640000000000002, 3.774]",4,0.608964
...,...,...,...
595,"(5.418, 12.41]",25,1.000000
596,"(5.418, 12.41]",26,1.000000
597,"(5.418, 12.41]",28,1.000000
598,"(5.418, 12.41]",29,1.000000


In [17]:
# Preview: look up each game's rs_wp from its (RE bucket, runs scored) pair.
left.merge(right, how='left', on=['re_pred', 'runs_scored_9'])

Unnamed: 0,re_pred,runs_scored_9,rs_wp
0,"(4.542, 4.599]",11,0.958843
1,"(5.418, 12.41]",10,0.880028
2,"(5.103, 5.418]",9,0.853797
3,"(4.992, 5.103]",9,0.881899
4,"(4.436, 4.496]",1,0.113215
...,...,...,...
118405,"(4.436, 4.496]",2,0.236700
118406,"(3.774, 3.911]",12,0.985829
118407,"(3.774, 3.911]",2,0.304101
118408,"(3.774, 3.911]",9,0.939647


In [18]:
# Per-game rs_wp, looked up by (RE bucket, runs scored).
# The merge result gets a fresh 0..n-1 index, so the original row labels are
# restored positionally below. That is only valid if the left merge returns
# exactly one row per game in the original order; validate='many_to_one'
# makes pandas raise if the lookup table ever contains duplicate keys.
rs_wp = pd.merge(
    left,
    right,
    on=['re_pred', 'runs_scored_9'],
    how='left',
    validate='many_to_one',
)['rs_wp']
rs_wp.index = left.index
rs_wp

0         0.958843
1         0.880028
2         0.853797
3         0.881899
4         0.113215
            ...   
118405    0.236700
118406    0.985829
118407    0.304101
118408    0.939647
118409    0.254783
Name: rs_wp, Length: 118410, dtype: float64

In [19]:
# Attach the per-game rs_wp values to the game log (aligned on row index).
glt = glt.assign(rs_wp=rs_wp)
glt

Unnamed: 0,game_id,date,double_header,yr,game_type,park_id,team,team_league,team_game_number,runs_scored,...,opp,runs_allowed,W,L,linescore_parsed,runs_scored_9,dh_fl,re_pred,rsi,rs_wp
0,TBA199803310,1998-03-31,0,1998,RS,STP01,DET,AL,1,11,...,TBA,6,True,False,"[0.0, 4.0, 2.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0]",11,True,4.573761,2.405023,0.958843
1,SEA199803310,1998-03-31,0,1998,RS,SEA02,CLE,AL,1,10,...,SEA,9,True,False,"[0.0, 0.0, 2.0, 1.0, 0.0, 3.0, 0.0, 4.0, 0.0]",10,True,5.645387,1.771358,0.880028
2,TEX199803310,1998-03-31,0,1998,RS,ARL02,CHA,AL,1,9,...,TEX,2,True,False,"[0.0, 0.0, 0.0, 0.0, 7.0, 0.0, 1.0, 0.0, 1.0]",9,True,5.407927,1.664224,0.853797
3,ARI199803310,1998-03-31,0,1998,RS,PHO01,COL,NL,1,9,...,ARI,2,True,False,"[0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 5.0, 1.0, 0.0]",9,False,5.034758,1.787574,0.881899
4,ATL199803310,1998-03-31,0,1998,RS,ATL02,MIL,NL,1,1,...,ATL,2,False,True,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,False,4.441141,0.225167,0.113215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118405,CIN202210050,2022-10-05,0,2022,RS,CIN09,CIN,NL,162,2,...,CHN,15,False,True,"[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",2,True,4.451540,0.449283,0.236700
118406,MIA202210050,2022-10-05,0,2022,RS,MIA02,MIA,NL,162,12,...,ATL,9,True,False,"[0.0, 3.0, 0.0, 0.0, 2.0, 5.0, 2.0, 0.0, nan]",12,True,3.893438,3.082109,0.985829
118407,OAK202210050,2022-10-05,0,2022,RS,OAK01,ANA,AL,162,2,...,OAK,3,False,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0]",2,True,3.833401,0.521730,0.304101
118408,MIA202210050,2022-10-05,0,2022,RS,MIA02,ATL,NL,162,9,...,MIA,12,False,True,"[0.0, 0.0, 2.0, 1.0, 0.0, 3.0, 2.0, 0.0, 1.0]",9,True,3.893438,2.311582,0.939647


In [20]:
# Check rs_wp against raw 9-inning runs: column means, then correlation matrix.
wp_check = glt.loc[:, ['runs_scored_9', 'rs_wp']]
(wp_check.mean(), wp_check.corr())

(runs_scored_9    4.50701
 rs_wp            0.50000
 dtype: float64,
                runs_scored_9     rs_wp
 runs_scored_9       1.000000  0.937966
 rs_wp               0.937966  1.000000)

In [21]:
# Aggregate run support per starting pitcher, over whole careers and per season.
# String aggregation names are used instead of np.mean / len callables, which
# recent pandas versions deprecate inside .agg(); 'size' counts rows like len.
agg = {
    'W': 'mean',
    'rs_wp': 'mean',
    'rsi': 'mean',
    'game_id': 'size',  # number of starts in the group
    'runs_scored_9': 'mean',
    're_pred': 'mean',
}
# Exponent for converting a run ratio into a winning percentage.
# NOTE(review): 1.83 looks like the usual Pythagorean-expectation exponent -- confirm.
RSI_WP_EXPONENT = 1.83

pit_careers = glt.groupby(['starting_pitcher_id']).agg(agg)
pit_seasons = glt.groupby(['starting_pitcher_id', 'yr']).agg(agg)
for df in [pit_careers, pit_seasons]:
    # Ratio of the averages, as an alternative to 'rsi' (the average of per-game ratios).
    df['rsi_agg2'] = df['runs_scored_9']/df['re_pred']
    # Winning percentage implied by the mean RSI: rsi^k / (rsi^k + 1).
    rsi_pow = df['rsi']**RSI_WP_EXPONENT
    df['rsi_wp'] = rsi_pow/(rsi_pow + 1)
pit_careers.query('game_id>=100').sort_values('rs_wp')

Unnamed: 0_level_0,W,rs_wp,rsi,game_id,runs_scored_9,re_pred,rsi_agg2,rsi_wp
starting_pitcher_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alcas001,0.463636,0.401308,0.748374,110,3.145455,4.229584,0.743679,0.370417
kellb003,0.419048,0.413334,0.817653,105,3.809524,4.655621,0.818263,0.408925
urenj001,0.406015,0.429107,0.850689,133,3.676692,4.363844,0.842535,0.426554
ruscg001,0.404145,0.435610,0.829644,193,3.834197,4.628504,0.828388,0.415380
wellk001,0.410959,0.446021,0.872531,219,4.095890,4.641631,0.882425,0.437938
...,...,...,...,...,...,...,...,...
friem001,0.694444,0.569883,1.187413,108,5.351852,4.487338,1.192656,0.577947
ramih002,0.495238,0.572547,1.204295,105,5.342857,4.459820,1.197998,0.584236
selea001,0.581967,0.575218,1.171817,244,5.659836,4.821059,1.173982,0.572035
mainj001,0.561905,0.588510,1.205890,105,5.104762,4.294381,1.188707,0.584824


In [22]:
# Persist the game-level and pitcher-level tables as parquet files.
# Frames are listed explicitly rather than looked up by name via locals().
# Assumes the 'output/' directory already exists.
for df_name, df in {'glt': glt, 'pit_careers': pit_careers, 'pit_seasons': pit_seasons}.items():
    df.to_parquet(f'output/{df_name}.parquet')

In [23]:
# Career level (pitchers with at least 100 starts): RSI-derived winning
# percentage vs. distribution-based rs_wp, with an OLS trendline.
px.scatter(
    pit_careers.loc[lambda d: d['game_id'] >= 100].reset_index(),
    x='rsi_wp',
    y='rs_wp',
    hover_data=['starting_pitcher_id', 'rsi', 'runs_scored_9'],
    trendline='ols',
)

In [24]:
# Correlation matrix of the pitcher-season aggregates (seasons with >= 25 starts).
pit_seasons.loc[pit_seasons['game_id'] >= 25].corr()

Unnamed: 0,W,rs_wp,rsi,game_id,runs_scored_9,re_pred,rsi_agg2,rsi_wp
W,1.0,0.594205,0.570295,0.179712,0.498534,-0.025717,0.570643,0.572532
rs_wp,0.594205,1.0,0.956676,0.027668,0.85524,-0.002201,0.956951,0.958765
rsi,0.570295,0.956676,1.0,0.02004,0.869139,-0.043902,0.997843,0.995148
game_id,0.179712,0.027668,0.02004,1.0,0.048716,0.059035,0.021071,0.025836
runs_scored_9,0.498534,0.85524,0.869139,0.048716,1.0,0.447749,0.873498,0.86569
re_pred,-0.025717,-0.002201,-0.043902,0.059035,0.447749,1.0,-0.038837,-0.042933
rsi_agg2,0.570643,0.956951,0.997843,0.021071,0.873498,-0.038837,1.0,0.99307
rsi_wp,0.572532,0.958765,0.995148,0.025836,0.86569,-0.042933,0.99307,1.0


In [25]:
# Correlation matrix of the career aggregates (pitchers with >= 100 starts).
pit_careers.loc[pit_careers['game_id'] >= 100].corr()

Unnamed: 0,W,rs_wp,rsi,game_id,runs_scored_9,re_pred,rsi_agg2,rsi_wp
W,1.0,0.58872,0.593697,0.264743,0.4392,-0.08493,0.593079,0.591391
rs_wp,0.58872,1.0,0.96159,0.093673,0.773145,-0.048762,0.962353,0.961705
rsi,0.593697,0.96159,1.0,0.096988,0.741276,-0.140038,0.997144,0.998497
game_id,0.264743,0.093673,0.096988,1.0,0.040549,-0.058564,0.090398,0.107381
runs_scored_9,0.4392,0.773145,0.741276,0.040549,1.0,0.556791,0.755159,0.739835
re_pred,-0.08493,-0.048762,-0.140038,-0.058564,0.556791,1.0,-0.122209,-0.141049
rsi_agg2,0.593079,0.962353,0.997144,0.090398,0.755159,-0.122209,1.0,0.995785
rsi_wp,0.591391,0.961705,0.998497,0.107381,0.739835,-0.141049,0.995785,1.0


In [26]:
# Pitcher-seasons with >= 25 starts, ordered from lowest to highest rs_wp.
pit_seasons.loc[pit_seasons['game_id'] >= 25].sort_values(by='rs_wp')

Unnamed: 0_level_0,Unnamed: 1_level_0,W,rs_wp,rsi,game_id,runs_scored_9,re_pred,rsi_agg2,rsi_wp
starting_pitcher_id,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
turns003,2019,0.233333,0.285554,0.521977,30,2.533333,4.854781,0.521822,0.233305
mussm001,2000,0.470588,0.310974,0.603928,34,3.147059,5.239333,0.600660,0.284374
kellb003,2019,0.321429,0.312754,0.595848,28,2.928571,4.956748,0.590825,0.279384
blaiw001,1998,0.240000,0.318848,0.603856,25,2.920000,4.750303,0.614698,0.284329
hendm001,2006,0.360000,0.328761,0.615373,25,2.840000,4.674284,0.607580,0.291417
...,...,...,...,...,...,...,...,...,...
estes001,2000,0.666667,0.679095,1.491648,30,7.300000,4.867416,1.499769,0.675196
saunj001,2009,0.645161,0.685158,1.390514,31,6.290323,4.522383,1.390931,0.646411
paxtj001,2019,0.689655,0.688179,1.414483,29,6.827586,4.805286,1.420849,0.653527
abbop001,2001,0.814815,0.693427,1.518040,27,7.037037,4.668666,1.507291,0.682195


In [27]:
# Season level: RSI-derived winning percentage vs. rs_wp, with an OLS trendline.
# NOTE(review): this uses a >= 35 starts cutoff while the season tables use >= 25 -- confirm intended.
px.scatter(
    pit_seasons.loc[lambda d: d['game_id'] >= 35].reset_index(),
    x='rsi_wp',
    y='rs_wp',
    hover_data=['starting_pitcher_id', 'yr', 'rsi'],
    trendline='ols',
)