## Evaluate catcher defense by looking at baserunner events
SB, CS, WP, PB, PO, other outs/advancements

Also look at pitchers, so that we can come up with adjustments for pitcher quality (e.g., knuckleballers are tough)

In [1]:
import pandas as pd
import boxball_loader as bbl 
import baseball_stats_utils as bsu

In [2]:
ev = bbl.load_event_data(bbl.Eras.Expansion, ['pos2_fld_id', 'inn_ct', 'pit_id', 'outs_ct'], pa_only=False)
ev['bat_event_fl'].value_counts()

True     9638817
False     343339
Name: bat_event_fl, dtype: int64

In [3]:
# Add re24 to events

# TODO move this to boxball_loader
def add_re24_to_events(df_events: pd.DataFrame):
    event_res = pd.read_csv('~/temp/event_res.csv')
    if 'event_re' not in df_events.columns:
        df_merged = pd.merge(left=df_events, right=event_res, on='event_cd') 
        return df_merged
    else:
        return df_events

In [4]:
ev = add_re24_to_events(ev)

In [5]:
ev.columns

Index(['game_id', 'date', 'game_type', 'ab_fl', 'pit_id', 'bat_event_fl',
       'tb_ct', 'pos2_fld_id', 'outs_ct', 'event_cd', 'inn_ct', 'h_fl',
       'ob_fl', 'yr', 'event_re'],
      dtype='object')

In [6]:

ev.sample(10)

Unnamed: 0,game_id,date,game_type,ab_fl,pit_id,bat_event_fl,tb_ct,pos2_fld_id,outs_ct,event_cd,inn_ct,h_fl,ob_fl,yr,event_re
187559,SFN197904120,1979-04-12,RS,False,knepb001,True,0,hillm001,0,14,8,0,1,1979,0.31116
4021641,BOS200208200,2002-08-20,RS,False,howrb001,True,0,varij001,0,2,8,0,0,2002,-0.275642
9816382,SLN198407160,1984-07-16,RS,False,willf001,False,0,brenb001,0,9,8,0,0,1984,0.261552
897568,MIN196306220,1963-06-22,RS,True,mccom103,True,0,browd102,0,2,1,0,0,1963,-0.275642
6911186,NYN201306080,2013-06-08,RS,True,marcs001,True,1,buckj001,1,20,20,1,1,2013,0.456638
7339421,ATL197305190,1973-05-19,RS,True,houst101,True,0,oatej101,2,3,7,0,0,1973,-0.277176
5871425,PIT197504170,1975-04-17,RS,True,reusj001,True,1,sangm101,2,20,6,1,1,1975,0.456638
8551429,WAS201706250,2017-06-25,RS,True,felds001,True,0,barnt001,2,3,1,0,0,2017,-0.277176
6139768,ATL198507130,1985-07-13,RS,True,mahlr001,True,1,ceror001,0,20,6,1,1,1985,0.456638
6821657,CIN201005140,2010-05-14,RS,True,rhoda001,True,1,hanir001,0,20,9,1,1,2010,0.456638


In [7]:
ev['bat_event_fl'].value_counts()

True     9638817
False     343339
Name: bat_event_fl, dtype: int64

In [8]:
ev[~ev['bat_event_fl']]['event_cd'].value_counts()

4     141492
9      73670
6      55342
8      29730
10     18130
11     11292
5       6548
12      3621
13      3514
Name: event_cd, dtype: int64

In [9]:
# Ignore defensive interference, foul error. 
# Other advancement/out (12) seems to jump around a lot in frequency; need to understand this better, so leaving it out for now
# Eventually would probably be good to split these into different sets of categories, based on 
# how much the catcher influences each one
bsr_ev = ev[(~ev['bat_event_fl']) & (~ev['event_cd'].isin([5, 12, 13]))]
bsr_ev['event_cd'].value_counts()

4     141492
9      73670
6      55342
8      29730
10     18130
11     11292
Name: event_cd, dtype: int64

In [10]:
# Compute league totals for baselines

lg_outs = ev.groupby(['yr'])['outs_ct'].sum().apply(int)

# get counts by event type
evt_counts = pd.get_dummies(bsr_ev['event_cd'], prefix='ev')
evt_counts = pd.concat([bsr_ev[['yr', 'event_re']], evt_counts], axis=1)

lg_bsr_runs = evt_counts.groupby(['yr']).sum()
lg_totals = pd.concat([lg_outs, lg_bsr_runs], axis=1)
lg_totals

lg_rates = lg_totals.divide(lg_totals['outs_ct'], axis=0) * 3000 # per 1000 innings

lg_rates


Unnamed: 0_level_0,outs_ct,event_re,ev_4,ev_6,ev_8,ev_9,ev_10,ev_11
yr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1961,3000.0,4.782382,26.271179,13.407548,7.941184,20.097724,7.995576,1.686142
1962,3000.0,5.374346,30.451155,14.87927,8.550207,21.853182,8.144191,2.19726
1963,3000.0,5.341377,28.250186,15.301174,7.250477,21.411943,6.983737,4.704323
1964,3000.0,5.035593,26.499807,14.829282,7.499035,23.051698,7.957176,1.567323
1965,3000.0,6.458477,33.291013,15.331705,8.172088,24.395732,8.654217,1.735665
1966,3000.0,4.863143,33.579633,18.070086,9.169166,24.629941,7.096349,2.341064
1967,3000.0,4.470036,31.760773,18.581627,7.14678,23.208864,7.14678,2.446863
1968,3000.0,4.918292,35.210061,18.253444,8.588417,23.832244,7.071374,2.030879
1969,3000.0,5.726121,35.203827,17.920861,9.229543,24.997508,8.372371,2.631317
1970,3000.0,6.201313,36.181724,16.390182,8.553129,24.247126,7.18065,2.565939


In [11]:
# single-se totals in raw runs
cat_ev_totals = bsr_ev.groupby(['pos2_fld_id', 'yr'])['event_re'].sum()
cat_ev_totals.sort_values()

pos2_fld_id  yr  
boonb001     1982   -10.484600
fiskc001     1977    -9.177675
cartg001     1979    -8.262217
karkr001     1993    -7.324891
dempr001     1977    -7.293480
                       ...    
cartg001     1988    22.565861
kendj001     2007    23.543287
bardj001     2007    23.623712
piazm001     1996    25.508504
dietd101     1970    29.021643
Name: event_re, Length: 5508, dtype: float64

In [12]:
# Let's compute vs league average

# Start by counting innings
cat_inn_totals = ev[['game_id', 'pos2_fld_id', 'inn_ct', 'yr']].drop_duplicates()[['pos2_fld_id', 'yr']].value_counts().rename('innings')
cat_inn_totals.sort_values()

pos2_fld_id  yr  
pagaj101     1967       1
sudab101     1974       1
beanb001     1989       1
boscj001     2010       1
peren001     1998       1
                     ... 
kendj001     2008    1332
simmt001     1973    1356
cartg001     1982    1358
fiskc001     1978    1364
hundr101     1968    1390
Name: innings, Length: 5834, dtype: int64

In [13]:
cat_totals = pd.concat([cat_inn_totals, cat_ev_totals], axis=1).fillna(0)
cat_totals

Unnamed: 0_level_0,Unnamed: 1_level_0,innings,event_re
pos2_fld_id,yr,Unnamed: 2_level_1,Unnamed: 3_level_1
adamb105,1977,9,0.179787
adamd101,1969,26,0.271778
adled101,1963,16,0.533330
adled101,1964,21,0.201047
adled101,1965,82,0.610510
...,...,...,...
zunim001,2017,995,16.528981
zunim001,2018,923,9.833490
zunim001,2019,690,8.055827
zunim001,2020,216,5.666893


In [14]:
# Compute league averages
yr_totals = cat_totals.groupby('yr').sum()

re_rate = yr_totals['event_re'] / yr_totals['innings']
re_rate * 8.5 * 162 # full season equivalent


yr
1961     9.465080
1962    10.635330
1963    10.373240
1964     9.835059
1965    12.580778
1966     9.415901
1967     8.660164
1968     9.427918
1969    11.288368
1970    12.261514
1971     9.938586
1972     7.849247
1973     9.137688
1974     8.439899
1975     9.878360
1976     8.898827
1977     5.569782
1978     7.950938
1979     8.940566
1980     9.807245
1981     7.806538
1982     9.636876
1983    10.504079
1984     9.873586
1985    11.373230
1986    11.425995
1987    16.190685
1988    19.744726
1989    13.284350
1990    13.844773
1991    10.897829
1992     8.871315
1993    10.711013
1994    14.711798
1995    16.061039
1996    15.343174
1997    12.748726
1998    13.034290
1999    14.254758
2000    12.802091
2001    12.092394
2002    12.958981
2003    14.293413
2004    14.112407
2005    13.113400
2006    14.635352
2007    16.745998
2008    16.318189
2009    15.458395
2010    15.845530
2011    14.942190
2012    16.877218
2013    16.606239
2014    16.037024
2015    15.078659
2016   

In [15]:
# Now compare vs league average
cat_totals['baseline_re'] = cat_totals['innings'] * re_rate
cat_totals['re_diff'] = cat_totals['baseline_re'] - cat_totals['event_re']
cat_totals.sort_values('re_diff').tail(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,innings,event_re,baseline_re,re_diff
pos2_fld_id,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
schnb001,2004,1120,-0.635226,11.478501,12.113727
realj001,2019,1144,2.642317,14.893047,12.25073
bertd101,1963,848,-6.021382,6.388168,12.40955
wilsd001,1996,1133,0.188229,12.624412,12.436183
moliy001,2010,1143,0.711882,13.152826,12.440944
mathm001,2000,1035,-3.052962,9.622487,12.675449
sundj001,1978,1300,-5.340183,7.506332,12.846516
hanir001,2012,882,-2.54569,10.810244,13.355934
santb001,1988,1173,3.278229,16.819582,13.541353
johnc002,1997,1080,-3.625099,9.999001,13.624099


In [16]:
# Career totals

careers = cat_totals.groupby('pos2_fld_id').sum()
careers.sort_values('re_diff', ascending=False)

Unnamed: 0_level_0,innings,event_re,baseline_re,re_diff
pos2_fld_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rodri001,20427,81.092558,205.086848,123.994290
moliy001,16726,72.462504,195.838542,123.376038
sundj001,15955,29.158141,112.644741,83.486601
dempr001,12378,14.676635,92.122164,77.445530
boonb001,18521,68.507622,141.756806,73.249184
...,...,...,...,...
dietd101,4318,75.131023,32.980533,-42.150489
piazm001,13596,181.696331,132.891594,-48.804737
barrm003,7343,124.484998,74.222478,-50.262521
piera001,16402,230.800036,178.937256,-51.862780


In [17]:
# career rates
careers['re_diff_rate'] = careers['re_diff']/careers['innings'] * 1000 # represent as full-season eqv
careers[careers['innings']>=5000].sort_values(by='re_diff_rate')

Unnamed: 0_level_0,innings,event_re,baseline_re,re_diff,re_diff_rate
pos2_fld_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
taube001,6597,116.477192,61.815419,-54.661773,-8.285853
barrm003,7343,124.484998,74.222478,-50.262521,-6.844957
stanm002,5645,94.496066,57.462325,-37.033741,-6.560450
fitzm001,5756,89.079142,52.567940,-36.511202,-6.343155
flowt001,6229,108.877654,75.526542,-33.351111,-5.354168
...,...,...,...,...,...
pagnt001,6704,16.477766,61.723692,45.245926,6.749094
poseb001,8431,41.098494,101.711322,60.612828,7.189281
moliy001,16726,72.462504,195.838542,123.376038,7.376303
karkr001,6999,9.919564,67.286705,57.367140,8.196477
