## Evaluate catcher defense by looking at baserunner events
SB, CS, WP, PB, PO, other outs/advancements

Also look at pitchers, so that we can come up with adjustments for pitcher quality (e.g., knuckleballers are tough)

In [1]:
import pandas as pd
import boxball_loader as bbl 
import baseball_stats_utils as bsu

In [2]:
ev = bbl.load_event_data(1970, 2020, ['pos2_fld_id', 'inn_ct', 'pit_id', 'outs_ct'], pa_only=False)

In [3]:
# Add re24 to events

# TODO move this to boxball_loader
def add_re24_to_events(df_events: pd.DataFrame):
    event_res = pd.read_csv('~/temp/event_res.csv')
    if 'event_re' not in df_events.columns:
        df_merged = pd.merge(left=df_events, right=event_res, on='event_cd') 
        return df_merged
    else:
        return df_events

In [4]:
ev = add_re24_to_events(ev)

In [5]:
ev.columns

Index(['game_id', 'date', 'game_type', 'event_cd', 'pos2_fld_id', 'outs_ct',
       'pit_id', 'bat_event_fl', 'tb_ct', 'inn_ct', 'ab_fl', 'h_fl', 'ob_fl',
       'yr', 'event_re'],
      dtype='object')

In [6]:

ev.sample(10)

Unnamed: 0,game_id,date,game_type,event_cd,pos2_fld_id,outs_ct,pit_id,bat_event_fl,tb_ct,inn_ct,ab_fl,h_fl,ob_fl,yr,event_re
3401156,NYA201907200,2019-07-20,RS,20,woltt001,1,senza001,True,1,2,True,1,1,2019,0.456976
1206072,CIN201507180,2015-07-18,RS,3,penab002,1,diazj005,True,0,8,True,0,0,2015,-0.277322
2673035,SEA199308130,1993-08-13,RS,20,valld001,2,hanse001,True,1,7,True,1,1,1993,0.456976
7193137,TOR201007260,2010-07-26,RS,2,wietm001,2,albem001,True,0,7,True,0,0,2010,-0.275843
7130892,MIN200909200,2009-09-20,RS,2,mauej001,0,bakes002,True,0,4,True,0,0,2009,-0.275843
1484673,ATL197705170,1977-05-17,RS,14,footb101,0,wartd101,True,0,5,False,0,1,1977,0.311613
8801976,SDN198707200,1987-07-20,RS,19,sundj001,0,dipif001,True,0,7,False,0,0,1987,-0.194241
8013637,TOR202008280,2020-08-28,RS,2,sevep001,1,meanj001,True,0,4,True,0,0,2020,-0.275843
6346888,SEA200106040,2001-06-04,RS,2,lampt001,1,franr001,True,0,3,True,0,0,2001,-0.275843
6185163,OAK199907190,1999-07-19,RS,2,hernr002,2,joned001,True,0,8,True,0,0,1999,-0.275843


In [7]:
ev['bat_event_fl'].value_counts()

True     8524001
False     308449
Name: bat_event_fl, dtype: int64

In [8]:
ev[~ev['bat_event_fl']]['event_cd'].value_counts()

4     129713
9      64969
6      49191
8      26646
10     15223
11     10395
5       6492
12      2991
13      2829
Name: event_cd, dtype: int64

In [9]:
# Ignore defensive interference, foul error. 
# Other advancement/out (12) seems to jump around a lot in frequency; need to understand this better, so leaving it out for now
# Eventually would probably be good to split these into different sets of categories, based on 
# how much the catcher influences each one
bsr_ev = ev[(~ev['bat_event_fl']) & (~ev['event_cd'].isin([5, 12, 13]))]
bsr_ev['event_cd'].value_counts()

4     129713
9      64969
6      49191
8      26646
10     15223
11     10395
Name: event_cd, dtype: int64

In [10]:
# Compute league totals for baselines

lg_outs = ev.groupby(['yr'])['outs_ct'].sum().apply(int)

# get counts by event type
evt_counts = pd.get_dummies(bsr_ev['event_cd'], prefix='ev')
evt_counts = pd.concat([bsr_ev[['yr', 'event_re']], evt_counts], axis=1)

lg_bsr_runs = evt_counts.groupby(['yr']).sum()
lg_totals = pd.concat([lg_outs, lg_bsr_runs], axis=1)
lg_totals

lg_rates = lg_totals.divide(lg_totals['outs_ct'], axis=0) * 3000 # per 1000 innings

lg_rates


Unnamed: 0_level_0,outs_ct,event_re,ev_4,ev_6,ev_8,ev_9,ev_10,ev_11
yr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1970,3000.0,6.182845,36.181724,16.390182,8.553129,24.247126,7.18065,2.565939
1971,3000.0,5.079951,34.196388,16.602741,8.756379,21.961725,7.745249,1.961591
1972,3000.0,4.042084,37.120527,18.719261,9.836622,22.280796,6.08429,1.950365
1973,3000.0,4.601479,39.126804,18.569327,11.332459,23.678874,6.620542,1.888743
1974,3000.0,4.245629,47.416474,21.931856,11.412504,20.959312,6.192524,3.711545
1975,3000.0,4.933062,48.524196,21.616945,12.126096,22.449128,6.38007,4.022218
1976,3000.0,4.490613,58.800861,25.453796,10.962872,20.390943,5.581099,3.547984
1977,3000.0,2.75816,52.74809,26.165375,12.756074,20.703671,4.935494,4.282267
1978,3000.0,3.985347,53.403817,23.290408,12.465808,18.919999,5.08959,5.034269
1979,3000.0,4.475314,53.032023,22.659639,11.541506,20.321886,5.871994,3.055646


In [11]:
# single-se totals in raw runs
cat_ev_totals = bsr_ev.groupby(['pos2_fld_id', 'yr'])['event_re'].sum()
cat_ev_totals.sort_values()

pos2_fld_id  yr  
boonb001     1982   -10.508019
fiskc001     1977    -9.201512
cartg001     1979    -8.294928
karkr001     1993    -7.352262
dempr001     1977    -7.306428
                       ...    
hatts001     2001    22.516942
kendj001     2007    23.504638
bardj001     2007    23.585429
piazm001     1996    25.448091
dietd101     1970    28.977813
Name: event_re, Length: 4852, dtype: float64

In [12]:
# Let's compute vs league average

# Start by counting innings
cat_inn_totals = ev[['game_id', 'pos2_fld_id', 'inn_ct', 'yr']].drop_duplicates()[['pos2_fld_id', 'yr']].value_counts().rename('innings')
cat_inn_totals.sort_values()

pos2_fld_id  yr  
lecrm001     2005       1
jonet002     1989       1
sakal001     1983       1
kearb001     1979       1
cabrf001     1992       1
                     ... 
sangm101     1974    1323
kendj001     2008    1332
simmt001     1973    1356
cartg001     1982    1358
fiskc001     1978    1364
Name: innings, Length: 5140, dtype: int64

In [13]:
cat_totals = pd.concat([cat_inn_totals, cat_ev_totals], axis=1).fillna(0)
cat_totals

Unnamed: 0_level_0,Unnamed: 1_level_0,innings,event_re
pos2_fld_id,yr,Unnamed: 2_level_1,Unnamed: 3_level_1
adamb105,1977,9,0.179519
afent001,1987,52,1.653861
afent001,1990,33,0.029843
afent001,1991,28,0.440909
afent001,1992,93,1.270700
...,...,...,...
zunim001,2016,446,4.122584
zunim001,2017,995,16.502276
zunim001,2018,923,9.815331
zunim001,2019,690,8.040438


In [14]:
# Compute league averages
yr_totals = cat_totals.groupby('yr').sum()

re_rate = yr_totals['event_re'] / yr_totals['innings']
re_rate * 8.5 * 162 # full season equivalent


yr
1970    12.224998
1971     9.905074
1972     7.815137
1973     9.101294
1974     8.396596
1975     9.833118
1976     8.849940
1977     5.522422
1978     7.903213
1979     8.895504
1980     9.757630
1981     7.758738
1982     9.588078
1983    10.453410
1984     9.824646
1985    11.326727
1986    11.373819
1987    16.133993
1988    19.674542
1989    13.233400
1990    13.794973
1991    10.848940
1992     8.823304
1993    10.663716
1994    14.666297
1995    16.016264
1996    15.300300
1997    12.705751
1998    12.992032
1999    14.213004
2000    12.765520
2001    12.055058
2002    12.923462
2003    14.259218
2004    14.078696
2005    13.080232
2006    14.600492
2007    16.709955
2008    16.282624
2009    15.422036
2010    15.807767
2011    14.902978
2012    16.838005
2013    16.571451
2014    16.001877
2015    15.044494
2016    16.515680
2017    17.820097
2018    17.557442
2019    17.893886
2020    19.216461
dtype: float64

In [15]:
# Now compare vs league average
cat_totals['baseline_re'] = cat_totals['innings'] * re_rate
cat_totals['re_diff'] = cat_totals['baseline_re'] - cat_totals['event_re']
cat_totals.sort_values('re_diff').tail(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,innings,event_re,baseline_re,re_diff
pos2_fld_id,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
munst101,1975,1135,-3.893797,8.105003,11.9988
schnb001,2004,1120,-0.649347,11.451082,12.100429
realj001,2019,1144,2.618198,14.866089,12.247892
wilsd001,1996,1133,0.166187,12.589136,12.422949
moliy001,2010,1143,0.694835,13.121479,12.426644
mathm001,2000,1035,-3.07789,9.594999,12.672888
sundj001,1978,1300,-5.370902,7.461276,12.832177
hanir001,2012,882,-2.563169,10.785127,13.348296
santb001,1988,1173,3.243923,16.759795,13.515872
johnc002,1997,1080,-3.647639,9.965295,13.612934


In [16]:
# Career totals

careers = cat_totals.groupby('pos2_fld_id').sum()
careers.sort_values('re_diff', ascending=False)

Unnamed: 0_level_0,innings,event_re,baseline_re,re_diff
pos2_fld_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rodri001,20427,80.659127,204.496775,123.837648
moliy001,16726,72.213328,195.406241,123.192913
sundj001,15955,28.713221,112.081339,83.368118
dempr001,12367,14.160834,91.589004,77.428171
boonb001,18521,67.962532,141.099137,73.136605
...,...,...,...,...
hatts001,2906,68.834611,26.989770,-41.844840
piazm001,13596,181.192843,132.492521,-48.700322
barrm003,7343,124.249894,74.033076,-50.216818
piera001,16402,230.370464,178.510659,-51.859805


In [17]:
# career rates
careers['re_diff_rate'] = careers['re_diff']/careers['innings'] * 1000 # represent as full-season eqv
careers[careers['innings']>=5000].sort_values(by='re_diff_rate')

Unnamed: 0_level_0,innings,event_re,baseline_re,re_diff,re_diff_rate
pos2_fld_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
taube001,6597,116.225439,61.607230,-54.618209,-8.279249
barrm003,7343,124.249894,74.033076,-50.216818,-6.838733
stanm002,5645,94.272414,57.261360,-37.011054,-6.556431
fitzm001,5756,88.800884,52.352862,-36.448022,-6.332179
flowt001,6229,108.692629,75.368177,-33.324453,-5.349888
...,...,...,...,...,...
pagnt001,6704,16.279843,61.492674,45.212831,6.744157
poseb001,8431,40.921805,101.494046,60.572242,7.184467
moliy001,16726,72.213328,195.406241,123.192913,7.365354
karkr001,6999,9.740195,67.041010,57.300815,8.187000
