## How to measure run support

* Model the run environment (RE), based on:
  * year
  * park
  * DH presence
  * home/away?
* Predict RE for each game
* Bucket games by run environment
* Compute the runs-scored (RS) distribution for each run-environment bucket
  * Convert these to winning percentages
* Merge in the winning percentage based on RS and RE

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pyretro.boxball_loader as bbl

In [2]:
# Load the team-level game log (one row per team per game, as the displayed output shows)
# for regular-season games, and give it a clean 0..N-1 index.
glt = bbl.load_gamelog_teams(game_types=bbl.GameType.RS, seasons=bbl.Eras.ThirtyTeams).reset_index(drop=True)

# Add 'runs_scored_9' (runs scored through 9 innings; eliminate zombie runners).
# The per-inning lists are expanded with a single DataFrame constructor call instead of
# building one pd.Series per row via apply, which is very slow on a table this size.
# Shorter linescores are padded with NaN, which fillna(0) then turns into zero runs.
# NOTE(review): assumes every 'linescore_parsed' entry is a list-like of inning totals -- confirm.
by_inning = pd.DataFrame(glt['linescore_parsed'].tolist(), index=glt.index).fillna(0)
glt['runs_scored_9'] = by_inning.iloc[:, 0:9].sum(axis=1).astype(int)

# Merge in the DH flag
g = bbl.load_games().set_index('game_id')
dh_fl = g['dh_fl']=='T'
# validate='many_to_one' makes pandas raise if a game_id occurs more than once in the
# games table, instead of silently duplicating team-game rows.
# sort_index() restores the original row order after the merge.
glt = pd.merge(
    left=glt,
    right=dh_fl,
    left_on='game_id',
    right_index=True,
    validate='many_to_one',
).sort_index()

glt


Unnamed: 0,game_id,date,double_header,yr,game_type,park_id,team,team_league,team_game_number,runs_scored,...,batting_9_name,batting_9_position,HA,opp,runs_allowed,W,L,linescore_parsed,runs_scored_9,dh_fl
0,TBA199803310,1998-03-31,0,1998,RS,STP01,DET,AL,1,11,...,Billy Ripken,6.0,A,TBA,6,True,False,"[0.0, 4.0, 2.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0]",11,True
1,SEA199803310,1998-03-31,0,1998,RS,SEA02,CLE,AL,1,10,...,Enrique Wilson,4.0,A,SEA,9,True,False,"[0.0, 0.0, 2.0, 1.0, 0.0, 3.0, 0.0, 4.0, 0.0]",10,True
2,TEX199803310,1998-03-31,0,1998,RS,ARL02,CHA,AL,1,9,...,Mike Caruso,6.0,A,TEX,2,True,False,"[0.0, 0.0, 0.0, 0.0, 7.0, 0.0, 1.0, 0.0, 1.0]",9,True
3,ARI199803310,1998-03-31,0,1998,RS,PHO01,COL,NL,1,9,...,Darryl Kile,1.0,A,ARI,2,True,False,"[0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 5.0, 1.0, 0.0]",9,False
4,ATL199803310,1998-03-31,0,1998,RS,ATL02,MIL,NL,1,1,...,Cal Eldred,1.0,A,ATL,2,False,True,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118405,CIN202210050,2022-10-05,0,2022,RS,CIN09,CIN,NL,162,2,...,Austin Romine,2.0,H,CHN,15,False,True,"[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",2,True
118406,MIA202210050,2022-10-05,0,2022,RS,MIA02,MIA,NL,162,12,...,Lewin Diaz,3.0,H,ATL,9,True,False,"[0.0, 3.0, 0.0, 0.0, 2.0, 5.0, 2.0, 0.0, nan]",12,True
118407,OAK202210050,2022-10-05,0,2022,RS,OAK01,ANA,AL,162,2,...,Max Stassi,2.0,A,OAK,3,False,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0]",2,True
118408,MIA202210050,2022-10-05,0,2022,RS,MIA02,ATL,NL,162,9,...,Ehire Adrianza,5.0,A,MIA,12,False,True,"[0.0, 0.0, 2.0, 1.0, 0.0, 3.0, 2.0, 0.0, 1.0]",9,True


In [3]:
# Sanity-check the derived column: runs through nine innings should track total runs
# closely (slightly lower mean, correlation near 1).
runs_check = glt.loc[:, ['runs_scored', 'runs_scored_9']]
(runs_check.mean(), runs_check.corr())

(runs_scored      4.590491
 runs_scored_9    4.507010
 dtype: float64,
                runs_scored  runs_scored_9
 runs_scored       1.000000       0.990301
 runs_scored_9     0.990301       1.000000)

In [4]:
# Model the run environment: regress runs scored through nine innings on the DH flag
# plus indicator variables for season and park.

# drop_first=True omits the first season and the first park, so their effect is
# absorbed by the intercept and every coefficient is relative to that baseline.
yr_dummies = pd.get_dummies(glt['yr'], prefix='yr', drop_first=True)
park_dummies = pd.get_dummies(glt['park_id'], drop_first=True)

X = pd.concat([glt['dh_fl'], yr_dummies, park_dummies], axis=1)
y = glt['runs_scored_9']
model = LinearRegression()
model.fit(X, y)

# Display the fitted intercept and the coefficient attached to each feature column.
model.intercept_, dict(zip(X.columns, model.coef_))

(4.405648212264609,
 {'dh_fl': 0.2005542852342971,
  'yr_1999': 0.3313916965964737,
  'yr_2000': 0.39676392083272816,
  'yr_2001': 0.02286979808816403,
  'yr_2002': -0.13554987618148834,
  'yr_2003': -0.03256272625148032,
  'yr_2004': 0.04037218634747494,
  'yr_2005': -0.16332032489394033,
  'yr_2006': 0.10599111922938939,
  'yr_2007': 0.03200953555530661,
  'yr_2008': -0.12809941594141452,
  'yr_2009': -0.16852629064258562,
  'yr_2010': -0.4010599372462168,
  'yr_2011': -0.5212234532685863,
  'yr_2012': -0.4625352821917044,
  'yr_2013': -0.6441875466086423,
  'yr_2014': -0.7398545171343808,
  'yr_2015': -0.5443032173167621,
  'yr_2016': -0.3043200850596357,
  'yr_2017': -0.15809538955246794,
  'yr_2018': -0.3625838217437959,
  'yr_2019': 0.007565394185666485,
  'yr_2020': -0.2844929426390211,
  'yr_2021': -0.3153785627124977,
  'yr_2022': -0.6463209727472418,
  'ARL02': 0.8919988672080499,
  'ARL03': 0.17990401631033806,
  'ATL02': -0.007206484294364546,
  'ATL03': 0.4305678375361627,

In [5]:
# Average park coefficient, relative to the baseline park left out by drop_first=True.
# Park columns are selected by name from park_dummies rather than by excluding names
# that contain 'yr' or 'dh', which would silently drop any park id containing those letters.
coef_by_feature = pd.Series(model.coef_, index=X.columns)
np.mean(coef_by_feature[park_dummies.columns].to_numpy())

0.3306365783061987

In [6]:
# Predict RE for each game (in-sample: the model is applied to the rows it was fit on)
re_pred = model.predict(X)
glt['re_pred'] = re_pred
# A fixed random_state keeps the spot-check rows identical on every run.
glt.sample(20, random_state=0)

Unnamed: 0,game_id,date,double_header,yr,game_type,park_id,team,team_league,team_game_number,runs_scored,...,batting_9_position,HA,opp,runs_allowed,W,L,linescore_parsed,runs_scored_9,dh_fl,re_pred
54559,ATL200905180,2009-05-18,0,2009,RS,ATL02,ATL,NL,37,1,...,1.0,H,COL,5,False,True,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,False,4.229915
52734,CIN200809030,2008-09-03,0,2008,RS,CIN09,CIN,NL,139,5,...,1.0,H,PIT,6,False,True,"[0.0, 0.0, 0.0, 2.0, 1.0, 1.0, 0.0, 1.0, 0.0]",5,False,4.751261
25849,BAL200305290,2003-05-29,0,2003,RS,BAL12,TEX,AL,52,8,...,4.0,A,BAL,4,True,False,"[1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 0.0, 0.0, 2.0]",8,True,4.866381
10625,SEA200005070,2000-05-07,0,2000,RS,SEA03,ANA,AL,32,2,...,6.0,A,SEA,8,False,True,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]",2,True,4.703247
55526,CHA200906230,2009-06-23,0,2009,RS,CHI12,LAN,NL,71,5,...,10.0,A,CHA,2,True,False,"[1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0]",5,True,4.713682
107017,SFN202007280,2020-07-28,0,2020,RS,SFO03,SDN,NL,5,5,...,10.0,A,SFN,3,True,False,"[0.0, 0.0, 3.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0]",5,True,4.192913
61523,COL201008040,2010-08-04,0,2010,RS,DEN02,COL,NL,107,6,...,1.0,H,SFN,1,True,False,"[0.0, 2.0, 0.0, 0.0, 2.0, 2.0, 0.0, 0.0, nan]",6,False,5.577071
89795,WAS201606290,2016-06-29,0,2016,RS,WAS11,WAS,NL,79,4,...,1.0,H,NYN,2,True,False,"[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 2.0, nan]",4,False,4.366391
106234,PIT201909050,2019-09-05,0,2019,RS,PIT08,PIT,NL,140,7,...,1.0,H,MIA,10,False,True,"[5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0]",7,False,4.537878
43599,NYA200609270,2006-09-27,0,2006,RS,NYC16,BAL,AL,158,5,...,7.0,A,NYA,16,False,True,"[0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0]",5,True,4.971146


In [7]:
# Compare actual runs through nine innings with the predicted run environment:
# column means side by side, plus their correlation.
fit_check = glt.loc[:, ['runs_scored_9', 're_pred']]
(fit_check.mean(), fit_check.corr())

(runs_scored_9    4.50701
 re_pred          4.50701
 dtype: float64,
                runs_scored_9   re_pred
 runs_scored_9       1.000000  0.151539
 re_pred             0.151539  1.000000)

In [8]:
# RSI: actual runs through nine innings divided by the predicted run environment,
# so 1.0 means the team scored exactly what the model expected.
glt['rsi'] = glt['runs_scored_9'].div(glt['re_pred'])

In [9]:
# Bucket games by RE: split the predicted run environment into 20 quantile bins.
bin_ids, bins = pd.qcut(glt['re_pred'], 20, retbins=True)

# For every bucket, count how often each runs-through-nine total occurred; totals that
# never occurred in a bucket become 0.
# observed=False states the categorical-groupby behaviour explicitly, because relying on
# the implicit default is deprecated in recent pandas releases.
score_dists = (
    glt.groupby(bin_ids, observed=False)['runs_scored_9']
    .value_counts()
    .unstack()
    .fillna(0)
)
score_dists

runs_scored_9,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,28,29,30
re_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(3.346, 3.785]",544.0,825.0,1010.0,929.0,727.0,584.0,480.0,311.0,222.0,152.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(3.785, 3.936]",485.0,825.0,922.0,879.0,745.0,616.0,429.0,299.0,228.0,137.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(3.936, 4.028]",462.0,758.0,879.0,903.0,813.0,622.0,487.0,351.0,231.0,163.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(4.028, 4.128]",455.0,720.0,842.0,892.0,779.0,634.0,471.0,375.0,252.0,175.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(4.128, 4.179]",423.0,688.0,823.0,910.0,814.0,647.0,496.0,372.0,269.0,170.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(4.179, 4.255]",392.0,714.0,831.0,884.0,815.0,610.0,526.0,371.0,257.0,203.0,...,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(4.255, 4.321]",370.0,628.0,812.0,853.0,837.0,662.0,519.0,397.0,310.0,204.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"(4.321, 4.367]",388.0,599.0,811.0,890.0,812.0,638.0,527.0,398.0,274.0,185.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"(4.367, 4.427]",346.0,607.0,780.0,858.0,790.0,667.0,564.0,378.0,250.0,195.0,...,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"(4.427, 4.478]",354.0,679.0,811.0,843.0,771.0,672.0,552.0,385.0,289.0,174.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Running total of games across run totals within each RE bucket: the value in column k
# is the number of games in that bucket with k or fewer runs.
# A row-wise cumsum gives the same table as stacking, grouping by bucket, and unstacking,
# without the reshape round-trip.
score_dists_cum = score_dists.cumsum(axis=1)
score_dists_cum


runs_scored_9,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,28,29,30
re_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(3.346, 3.785]",544.0,1369.0,2379.0,3308.0,4035.0,4619.0,5099.0,5410.0,5632.0,5784.0,...,6012.0,6012.0,6012.0,6012.0,6012.0,6012.0,6012.0,6012.0,6012.0,6012.0
"(3.785, 3.936]",485.0,1310.0,2232.0,3111.0,3856.0,4472.0,4901.0,5200.0,5428.0,5565.0,...,5838.0,5839.0,5840.0,5840.0,5840.0,5840.0,5840.0,5840.0,5840.0,5840.0
"(3.936, 4.028]",462.0,1220.0,2099.0,3002.0,3815.0,4437.0,4924.0,5275.0,5506.0,5669.0,...,5950.0,5950.0,5950.0,5950.0,5950.0,5950.0,5950.0,5950.0,5950.0,5950.0
"(4.028, 4.128]",455.0,1175.0,2017.0,2909.0,3688.0,4322.0,4793.0,5168.0,5420.0,5595.0,...,5887.0,5888.0,5888.0,5888.0,5888.0,5888.0,5888.0,5888.0,5888.0,5888.0
"(4.128, 4.179]",423.0,1111.0,1934.0,2844.0,3658.0,4305.0,4801.0,5173.0,5442.0,5612.0,...,5920.0,5920.0,5920.0,5920.0,5920.0,5920.0,5920.0,5920.0,5920.0,5920.0
"(4.179, 4.255]",392.0,1106.0,1937.0,2821.0,3636.0,4246.0,4772.0,5143.0,5400.0,5603.0,...,5950.0,5952.0,5952.0,5952.0,5952.0,5952.0,5952.0,5952.0,5952.0,5952.0
"(4.255, 4.321]",370.0,998.0,1810.0,2663.0,3500.0,4162.0,4681.0,5078.0,5388.0,5592.0,...,6003.0,6003.0,6003.0,6003.0,6003.0,6004.0,6004.0,6004.0,6004.0,6004.0
"(4.321, 4.367]",388.0,987.0,1798.0,2688.0,3500.0,4138.0,4665.0,5063.0,5337.0,5522.0,...,5932.0,5933.0,5933.0,5933.0,5934.0,5934.0,5934.0,5934.0,5934.0,5934.0
"(4.367, 4.427]",346.0,953.0,1733.0,2591.0,3381.0,4048.0,4612.0,4990.0,5240.0,5435.0,...,5824.0,5824.0,5825.0,5825.0,5826.0,5826.0,5826.0,5826.0,5826.0,5826.0
"(4.427, 4.478]",354.0,1033.0,1844.0,2687.0,3458.0,4130.0,4682.0,5067.0,5356.0,5530.0,...,5945.0,5945.0,5945.0,5946.0,5946.0,5946.0,5946.0,5946.0,5946.0,5946.0


In [11]:
# Convert counts to a mid-rank percentile per bucket: games with fewer runs count in
# full, games with the same number of runs count as half, divided by the bucket's
# total number of games (the last cumulative value in each row).
games_per_bucket = score_dists_cum.max(axis=1)
mid_rank_counts = score_dists_cum.sub(score_dists.div(2))
wps = mid_rank_counts.div(games_per_bucket, axis=0)
wps


runs_scored_9,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,28,29,30
re_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(3.346, 3.785]",0.045243,0.159098,0.31171,0.472971,0.610695,0.719727,0.808217,0.874002,0.91833,0.949434,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"(3.785, 3.936]",0.041524,0.153682,0.303253,0.457449,0.59649,0.713014,0.802483,0.864812,0.909932,0.941182,...,0.999572,0.999743,0.999914,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"(3.936, 4.028]",0.038824,0.141345,0.278908,0.428655,0.572857,0.693445,0.786639,0.857059,0.905966,0.939076,...,0.999916,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"(4.028, 4.128]",0.038638,0.138417,0.27106,0.418308,0.560207,0.680197,0.774032,0.845873,0.899117,0.935377,...,0.99983,0.999915,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"(4.128, 4.179]",0.035726,0.129561,0.257179,0.403547,0.549155,0.672551,0.769088,0.842399,0.896537,0.933615,...,0.999916,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"(4.179, 4.255]",0.03293,0.12584,0.255628,0.399698,0.542423,0.66213,0.75756,0.832913,0.885669,0.924311,...,0.99958,0.999832,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"(4.255, 4.321]",0.030813,0.113924,0.233844,0.372502,0.513241,0.638075,0.736426,0.812708,0.871586,0.91439,...,0.99975,0.999833,0.999833,0.999833,0.999833,0.999917,1.0,1.0,1.0,1.0
"(4.321, 4.367]",0.032693,0.115858,0.234665,0.377991,0.521402,0.643579,0.741743,0.819683,0.876306,0.914981,...,0.999579,0.999747,0.999831,0.999831,0.999916,1.0,1.0,1.0,1.0,1.0
"(4.367, 4.427]",0.029694,0.111483,0.230518,0.371095,0.51253,0.637573,0.74322,0.824065,0.877961,0.916152,...,0.999399,0.999657,0.999743,0.999828,0.999914,1.0,1.0,1.0,1.0,1.0
"(4.427, 4.478]",0.029768,0.116633,0.241927,0.381012,0.516734,0.638076,0.741002,0.819795,0.876472,0.915405,...,0.999748,0.999832,0.999832,0.999916,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Preview the long form of the table: one row per (RE bucket, runs scored) pair.
wps_long_preview = wps.stack().rename('rs_wp')
wps_long_preview.reset_index()

Unnamed: 0,re_pred,runs_scored_9,rs_wp
0,"(3.346, 3.785]",0,0.045243
1,"(3.346, 3.785]",1,0.159098
2,"(3.346, 3.785]",2,0.311710
3,"(3.346, 3.785]",3,0.472971
4,"(3.346, 3.785]",4,0.610695
...,...,...,...
595,"(5.37, 12.5]",25,1.000000
596,"(5.37, 12.5]",26,1.000000
597,"(5.37, 12.5]",28,1.000000
598,"(5.37, 12.5]",29,1.000000


In [13]:
# Lookup keys for every team-game: its RE bucket and its runs through nine innings.
# Both pieces share glt's index, so they line up row for row.
left = bin_ids.to_frame().assign(runs_scored_9=glt['runs_scored_9'])
left

Unnamed: 0,re_pred,runs_scored_9
0,"(4.478, 4.518]",11
1,"(5.37, 12.5]",10
2,"(5.37, 12.5]",9
3,"(4.854, 4.923]",9
4,"(4.367, 4.427]",1
...,...,...
118405,"(4.427, 4.478]",2
118406,"(3.785, 3.936]",12
118407,"(3.785, 3.936]",2
118408,"(3.785, 3.936]",9


In [14]:
# Long-form lookup table: (RE bucket, runs scored) -> winning percentage.
wps_long = wps.stack().rename('rs_wp')
# The rename only has an effect if the run-total level comes back as 'level_1'.
right = wps_long.reset_index().rename(columns={'level_1': 'runs_scored_9'})
right

Unnamed: 0,re_pred,runs_scored_9,rs_wp
0,"(3.346, 3.785]",0,0.045243
1,"(3.346, 3.785]",1,0.159098
2,"(3.346, 3.785]",2,0.311710
3,"(3.346, 3.785]",3,0.472971
4,"(3.346, 3.785]",4,0.610695
...,...,...,...
595,"(5.37, 12.5]",25,1.000000
596,"(5.37, 12.5]",26,1.000000
597,"(5.37, 12.5]",28,1.000000
598,"(5.37, 12.5]",29,1.000000


In [15]:
# Preview the lookup: a left join keeps one output row per team-game, in input order.
merge_keys = ['re_pred', 'runs_scored_9']
left.merge(right, how='left', on=merge_keys)

Unnamed: 0,re_pred,runs_scored_9,rs_wp
0,"(4.478, 4.518]",11,0.960428
1,"(5.37, 12.5]",10,0.875859
2,"(5.37, 12.5]",9,0.826460
3,"(4.854, 4.923]",9,0.881448
4,"(4.367, 4.427]",1,0.111483
...,...,...,...
118405,"(4.427, 4.478]",2,0.241927
118406,"(3.785, 3.936]",12,0.984418
118407,"(3.785, 3.936]",2,0.303253
118408,"(3.785, 3.936]",9,0.941182


In [16]:
# Look up each team-game's winning percentage from its RE bucket and run total.
# validate='many_to_one' makes pandas raise if the lookup table ever holds a duplicate
# (bucket, runs) pair; without it the merge could emit extra rows and the positional
# index assignment below would no longer line up with the games.
rs_wp = pd.merge(
    left,
    right,
    on=['re_pred', 'runs_scored_9'],
    how='left',
    validate='many_to_one',
)['rs_wp']
# The merge returns a fresh 0..N-1 index in left's row order; restore left's own index
# so the values can be assigned back onto the game log by label.
rs_wp.index = left.index
rs_wp

0         0.960428
1         0.875859
2         0.826460
3         0.881448
4         0.111483
            ...   
118405    0.241927
118406    0.984418
118407    0.303253
118408    0.941182
118409    0.255628
Name: rs_wp, Length: 118410, dtype: float64

In [17]:
# Attach the per-game winning percentage to the game log (aligned on the shared index).
glt = glt.assign(rs_wp=rs_wp)
glt

Unnamed: 0,game_id,date,double_header,yr,game_type,park_id,team,team_league,team_game_number,runs_scored,...,opp,runs_allowed,W,L,linescore_parsed,runs_scored_9,dh_fl,re_pred,rsi,rs_wp
0,TBA199803310,1998-03-31,0,1998,RS,STP01,DET,AL,1,11,...,TBA,6,True,False,"[0.0, 4.0, 2.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0]",11,True,4.490427,2.449655,0.960428
1,SEA199803310,1998-03-31,0,1998,RS,SEA02,CLE,AL,1,10,...,SEA,9,True,False,"[0.0, 0.0, 2.0, 1.0, 0.0, 3.0, 0.0, 4.0, 0.0]",10,True,5.679798,1.760626,0.875859
2,TEX199803310,1998-03-31,0,1998,RS,ARL02,CHA,AL,1,9,...,TEX,2,True,False,"[0.0, 0.0, 0.0, 0.0, 7.0, 0.0, 1.0, 0.0, 1.0]",9,True,5.498201,1.636899,0.826460
3,ARI199803310,1998-03-31,0,1998,RS,PHO01,COL,NL,1,9,...,ARI,2,True,False,"[0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 5.0, 1.0, 0.0]",9,False,4.878733,1.844741,0.881448
4,ATL199803310,1998-03-31,0,1998,RS,ATL02,MIL,NL,1,1,...,ATL,2,False,True,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,False,4.398442,0.227353,0.111483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118405,CIN202210050,2022-10-05,0,2022,RS,CIN09,CIN,NL,162,2,...,CHN,15,False,True,"[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",2,True,4.433594,0.451101,0.241927
118406,MIA202210050,2022-10-05,0,2022,RS,MIA02,MIA,NL,162,12,...,ATL,9,True,False,"[0.0, 3.0, 0.0, 0.0, 2.0, 5.0, 2.0, 0.0, nan]",12,True,3.885279,3.088581,0.984418
118407,OAK202210050,2022-10-05,0,2022,RS,OAK01,ANA,AL,162,2,...,OAK,3,False,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0]",2,True,3.832277,0.521883,0.303253
118408,MIA202210050,2022-10-05,0,2022,RS,MIA02,ATL,NL,162,9,...,MIA,12,False,True,"[0.0, 0.0, 2.0, 1.0, 0.0, 3.0, 2.0, 0.0, 1.0]",9,True,3.885279,2.316436,0.941182


In [18]:
# Summary check of the new column: means of runs and rs_wp, plus their correlation.
wp_check = glt.loc[:, ['runs_scored_9', 'rs_wp']]
(wp_check.mean(), wp_check.corr())

(runs_scored_9    4.50701
 rs_wp            0.50000
 dtype: float64,
                runs_scored_9     rs_wp
 runs_scored_9       1.000000  0.937629
 rs_wp               0.937629  1.000000)

In [19]:
# Aggregate run support per starting pitcher, over a whole career and per season.
# String aggregator names replace np.mean / len: recent pandas versions warn when NumPy
# callables are passed to agg, and the strings select the same built-in reductions.
# 'game_id' ends up holding the number of rows (games) in each group.
agg = {
    'W': 'mean',
    'rs_wp': 'mean',
    'rsi': 'mean',
    'game_id': 'size',
    'runs_scored_9': 'mean',
    're_pred': 'mean',
}
pit_careers = glt.groupby(['starting_pitcher_id']).agg(agg)
pit_seasons = glt.groupby(['starting_pitcher_id', 'yr']).agg(agg)

# Exponent for turning the run-support ratio into a winning percentage,
# rsi**e / (rsi**e + 1) -- presumably the usual baseball Pythagorean exponent.
PYTHAG_EXPONENT = 1.83
for df in [pit_careers, pit_seasons]:
    # Ratio of average runs to average expected runs (rsi is the average of per-game ratios).
    df['rsi_agg2'] = df['runs_scored_9'] / df['re_pred']
    rsi_pow = df['rsi'] ** PYTHAG_EXPONENT
    df['rsi_wp'] = rsi_pow / (rsi_pow + 1)

# Careers with at least 100 games, ordered from least to most run support.
pit_careers.query('game_id>=100').sort_values('rs_wp')

Unnamed: 0_level_0,W,rs_wp,rsi,game_id,runs_scored_9,re_pred,rsi_agg2,rsi_wp
starting_pitcher_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alcas001,0.463636,0.400878,0.747853,110,3.145455,4.236257,0.742508,0.370119
kellb003,0.419048,0.417177,0.832893,105,3.809524,4.575118,0.832661,0.417118
urenj001,0.406015,0.430389,0.849962,133,3.676692,4.355228,0.844202,0.426171
ruscg001,0.404145,0.434360,0.829092,193,3.834197,4.633508,0.827493,0.415084
franr001,0.424528,0.445888,0.854519,106,3.886792,4.521792,0.859569,0.428566
...,...,...,...,...,...,...,...,...
selea001,0.581967,0.571602,1.170362,244,5.659836,4.829218,1.171999,0.571478
ramih002,0.495238,0.573558,1.203719,105,5.342857,4.459465,1.198094,0.584023
buehw001,0.660377,0.576130,1.229213,106,5.179245,4.226689,1.225367,0.593310
mainj001,0.561905,0.579728,1.182782,105,5.104762,4.365545,1.169330,0.576202


In [20]:
# Persist the game-level table and both pitcher aggregations as parquet files.
# The frames are listed explicitly rather than looked up by name through locals().
output_dir = Path('output')
# to_parquet does not create missing directories, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)
frames_to_save = {'glt': glt, 'pit_careers': pit_careers, 'pit_seasons': pit_seasons}
for df_name, frame in frames_to_save.items():
    frame.to_parquet(output_dir / f'{df_name}.parquet')

In [21]:
# Career level, pitchers with at least 300 games: Pythagorean-style rsi_wp against the
# distribution-based rs_wp, with an OLS trendline.
career_points = pit_careers.query('game_id>=300').reset_index()
px.scatter(
    career_points,
    x='rsi_wp',
    y='rs_wp',
    hover_data=['starting_pitcher_id', 'rsi', 'runs_scored_9'],
    trendline='ols',
)

In [22]:
# Correlation matrix of the aggregated metrics for pitcher seasons with at least 25 games.
seasons_min25 = pit_seasons.query('game_id>=25')
seasons_min25.corr()

Unnamed: 0,W,rs_wp,rsi,game_id,runs_scored_9,re_pred,rsi_agg2,rsi_wp
W,1.0,0.596455,0.572677,0.179712,0.498534,-0.020209,0.572397,0.574613
rs_wp,0.596455,1.0,0.957473,0.024982,0.84969,-0.000263,0.957308,0.959523
rsi,0.572677,0.957473,1.0,0.02028,0.876746,-0.014391,0.998024,0.995123
game_id,0.179712,0.024982,0.02028,1.0,0.048716,0.060685,0.021237,0.026158
runs_scored_9,0.498534,0.84969,0.876746,0.048716,1.0,0.460509,0.880353,0.873375
re_pred,-0.020209,-0.000263,-0.014391,0.060685,0.460509,1.0,-0.010452,-0.012875
rsi_agg2,0.572397,0.957308,0.998024,0.021237,0.880353,-0.010452,1.0,0.99329
rsi_wp,0.574613,0.959523,0.995123,0.026158,0.873375,-0.012875,0.99329,1.0


In [23]:
# Correlation matrix of the aggregated metrics for careers with at least 100 games.
careers_min100 = pit_careers.query('game_id>=100')
careers_min100.corr()

Unnamed: 0,W,rs_wp,rsi,game_id,runs_scored_9,re_pred,rsi_agg2,rsi_wp
W,1.0,0.598321,0.602061,0.264743,0.4392,-0.081031,0.601197,0.599708
rs_wp,0.598321,1.0,0.964456,0.092249,0.763671,-0.040862,0.964536,0.964632
rsi,0.602061,0.964456,1.0,0.096327,0.757889,-0.090466,0.997229,0.998441
game_id,0.264743,0.092249,0.096327,1.0,0.040549,-0.056895,0.090321,0.106916
runs_scored_9,0.4392,0.763671,0.757889,0.040549,1.0,0.577298,0.771244,0.75734
re_pred,-0.081031,-0.040862,-0.090466,-0.056895,0.577298,1.0,-0.072725,-0.090104
rsi_agg2,0.601197,0.964536,0.997229,0.090321,0.771244,-0.072725,1.0,0.995896
rsi_wp,0.599708,0.964632,0.998441,0.106916,0.75734,-0.090104,0.995896,1.0


In [24]:
# Pitcher seasons with at least 25 games, ordered from least to most run support.
qualified_seasons = pit_seasons.query('game_id>=25')
qualified_seasons.sort_values('rs_wp')

Unnamed: 0_level_0,Unnamed: 1_level_0,W,rs_wp,rsi,game_id,runs_scored_9,re_pred,rsi_agg2,rsi_wp
starting_pitcher_id,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
turns003,2019,0.233333,0.282665,0.525557,30,2.533333,4.819774,0.525612,0.235550
mussm001,2000,0.470588,0.313655,0.603478,34,3.147059,5.248084,0.599659,0.284096
kellb003,2019,0.321429,0.320193,0.603685,28,2.928571,4.887700,0.599172,0.284224
blaiw001,1998,0.240000,0.320870,0.616678,25,2.920000,4.665434,0.625880,0.292218
hendm001,2006,0.360000,0.330154,0.622332,25,2.840000,4.634892,0.612743,0.295684
...,...,...,...,...,...,...,...,...,...
paxtj001,2019,0.689655,0.673393,1.404809,29,6.827586,4.847870,1.408368,0.650678
estes001,2000,0.666667,0.673885,1.496978,30,7.300000,4.850675,1.504945,0.676626
saunj001,2009,0.645161,0.687042,1.391127,31,6.290323,4.523405,1.390617,0.646596
abbop001,2001,0.814815,0.692002,1.518275,27,7.037037,4.667764,1.507582,0.682257


In [25]:
# Season level: Pythagorean-style rsi_wp against the distribution-based rs_wp, with an
# OLS trendline.
# NOTE(review): this plot keeps seasons with at least 35 games, while the season tables
# in this notebook filter at 25 -- confirm the threshold is intended.
season_points = pit_seasons.query('game_id>=35').reset_index()
px.scatter(
    season_points,
    x='rsi_wp',
    y='rs_wp',
    hover_data=['starting_pitcher_id', 'yr', 'rsi'],
    trendline='ols',
)