In [1]:
import pandas as pd
import boxball_loader as bbl 
import baseball_stats_utils as bsu

In [2]:
# Load event rows via the project loader; the two integers are presumably the first and
# last season (the `yr` values shown further down span 1990-2020) -- confirm against
# boxball_loader. The listed columns are requested explicitly.
requested_cols = ['pos2_fld_id', 'inn_ct', 'event_runs_ct']
ev = bbl.load_event_data(1990, 2020, requested_cols)

In [3]:
# Lookup table read from a local CSV; it is later joined to the events on `event_cd`
# and supplies the `event_re` column.
event_res = pd.read_csv(filepath_or_buffer='~/temp/event_res.csv')

In [4]:
# Check which columns the loader returned (the join key `event_cd` must be among them)
ev.columns

Index(['game_id', 'date', 'ab_fl', 'event_runs_ct', 'inn_ct', 'pos2_fld_id',
       'bat_event_fl', 'event_cd', 'tb_ct', 'h_fl', 'ob_fl', 'yr'],
      dtype='object')

In [5]:
# Attach the per-event-code value (`event_re`) from the lookup table to every event row.
# Any `event_re` column left over from a previous run of this cell is dropped first, so
# re-running the cell does not create event_re_x / event_re_y suffix columns.
# This is an inner join: events whose event_cd is missing from the lookup are dropped.
ev = pd.merge(
    left=ev.drop(columns='event_re', errors='ignore'),
    right=event_res,
    on='event_cd',
    # Each event_cd must appear only once in the lookup; a duplicate would silently
    # duplicate events, so pandas is asked to raise instead.
    validate='many_to_one',
)
ev.sample(10)

Unnamed: 0,game_id,date,ab_fl,event_runs_ct,inn_ct,pos2_fld_id,bat_event_fl,event_cd,tb_ct,h_fl,ob_fl,yr,event_re
2154079,DET201406050,2014-06-05,True,0,2,avila001,True,2,0,0,0,2014,-0.275843
2575538,TEX201905190,2019-05-19,True,0,7,moliy001,True,2,0,0,0,2019,-0.275843
325209,MIN199309040,1993-09-04,True,0,8,rodri001,True,2,0,0,0,1993,-0.275843
5512539,FLO199706270,1997-06-27,False,0,3,johnc002,True,16,0,0,1,1997,0.333941
5190134,DET200905010,2009-05-01,True,0,9,lairg001,True,21,2,1,1,2009,0.762388
2817989,NYA199605150,1996-05-15,True,0,7,giraj001,True,3,0,0,0,1996,-0.277322
2256355,PIT201507090,2015-07-09,True,0,8,moliy001,True,2,0,0,0,2015,-0.275843
4472027,SEA201807240,2018-07-24,True,0,8,zunim001,True,20,1,1,1,2018,0.456976
530760,MIL199607210,1996-07-21,True,0,9,mathm001,True,2,0,0,0,1996,-0.275843
1471235,TEX200609010,2006-09-01,True,0,8,martv001,True,2,0,0,0,2006,-0.275843


In [6]:
# Tally the event codes among rows that are not batting events (bat_event_fl is False)
non_batting = ~ev['bat_event_fl']
ev.loc[non_batting, 'event_cd'].value_counts()

4     77704
9     43834
6     26654
8     15824
10     9298
5      6409
11     5328
12     2174
13     1509
Name: event_cd, dtype: int64

In [7]:
# Keep only non-batting events, leaving out event codes 5 and 13.
# NOTE(review): in the Retrosheet/Chadwick event-type coding, 5 is defensive indifference
# (not interference) and 13 is foul error -- confirm against the data dictionary.
# Code 12 (other advance/out) is kept, since those are likely on pickoff attempts.
is_non_batting = ~ev['bat_event_fl']
is_ignored_code = ev['event_cd'].isin([5, 13])
cat_ev = ev[is_non_batting & ~is_ignored_code]
cat_ev['event_cd'].value_counts()

4     77704
9     43834
6     26654
8     15824
10     9298
11     5328
12     2174
Name: event_cd, dtype: int64

In [8]:
# Show the filtered non-batting events (pandas truncates the display of long frames)
cat_ev

Unnamed: 0,game_id,date,ab_fl,event_runs_ct,inn_ct,pos2_fld_id,bat_event_fl,event_cd,tb_ct,h_fl,ob_fl,yr,event_re
4518228,OAK199004090,1990-04-09,False,0,1,steit001,False,4,0,0,0,1990,0.179519
4518229,TEX199004090,1990-04-09,False,0,2,myerg001,False,4,0,0,0,1990,0.179519
4518230,TEX199004090,1990-04-09,False,0,3,myerg001,False,4,0,0,0,1990,0.179519
4518231,TEX199004090,1990-04-09,False,0,4,myerg001,False,4,0,0,0,1990,0.179519
4518232,TEX199004090,1990-04-09,False,0,5,myerg001,False,4,0,0,0,1990,0.179519
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5724675,NYN202009220,2020-09-22,False,0,5,perem005,False,12,0,0,0,2020,-0.425766
5724676,OAK202009250,2020-09-25,False,0,3,odomj001,False,12,0,0,0,2020,-0.425766
5724677,TEX202009260,2020-09-26,False,0,3,garnd001,False,12,0,0,0,2020,-0.425766
5724678,MIN202009270,2020-09-27,False,0,10,garvm001,False,12,0,0,0,2020,-0.425766


In [9]:
# Season totals in raw runs: event_re summed per catcher (pos2_fld_id) and year.
# Career-level sums are built further down from these season rows.
season_groups = cat_ev.groupby(['pos2_fld_id', 'yr'])
cat_ev_totals = season_groups['event_re'].sum()
cat_ev_totals.sort_values()

pos2_fld_id  yr  
reyeg001     1991    -7.923770
karkr001     1993    -7.711308
pagnt001     1991    -6.573707
obric001     1992    -6.264789
             1997    -5.288557
                       ...    
posaj001     2007    21.036546
kendj001     2007    22.340131
hatts001     2001    22.516942
bardj001     2007    23.159663
piazm001     1996    25.417798
Name: event_re, Length: 3089, dtype: float64

In [10]:
# Step towards a league-average comparison: count innings per catcher-season.

# One row per distinct (game, catcher, inning, year) combination seen in the events ...
inning_keys = ev[['game_id', 'pos2_fld_id', 'inn_ct', 'yr']].drop_duplicates()
# ... then the number of such rows per (catcher, year) is the innings count.
# A catcher gets a whole inning for any inning in which they appear in at least one event.
cat_inn_totals = inning_keys[['pos2_fld_id', 'yr']].value_counts().rename('innings')
cat_inn_totals.sort_values()

pos2_fld_id  yr  
melha001     2000       1
nevip001     2006       1
burkj003     2010       1
tremc001     2004       1
freid001     2020       1
                     ... 
posaj001     2000    1333
peres002     2015    1343
kendj001     2008    1366
dauld001     1993    1395
peres002     2014    1395
Name: innings, Length: 3267, dtype: int64

In [11]:
# Put innings and event_re totals side by side, aligned on the (pos2_fld_id, yr) index.
# Catcher-seasons that have innings but no qualifying events get event_re = 0.
cat_totals = pd.concat(objs=[cat_inn_totals, cat_ev_totals], axis='columns')
cat_totals = cat_totals.fillna(value=0)
cat_totals

Unnamed: 0_level_0,Unnamed: 1_level_0,innings,event_re
pos2_fld_id,yr,Unnamed: 2_level_1,Unnamed: 3_level_1
afent001,1990,33,0.029843
afent001,1991,28,0.440909
afent001,1992,93,1.270700
alfaj002,2016,31,1.601143
alfaj002,2017,244,3.559480
...,...,...,...
zunim001,2016,446,3.696817
zunim001,2017,995,16.076510
zunim001,2018,923,8.538032
zunim001,2019,690,6.763139


In [12]:
# League averages: sum every catcher's totals within each year ...
yr_totals = cat_totals.groupby(level='yr').sum()

# ... and express the league's event_re per catcher-inning.
re_rate = yr_totals['event_re'].div(yr_totals['innings'])
# Full-season equivalent: scaled by 8.5 and by 162
# (presumably innings per game and games per season -- confirm).
re_rate.mul(8.5).mul(162)


yr
1990    13.512450
1991    10.312291
1992     8.488459
1993    10.152321
1994    13.947909
1995    15.099027
1996    14.529618
1997    11.583447
1998    12.315619
1999    13.583531
2000    12.159173
2001    10.986518
2002    12.293668
2003    13.364814
2004    13.363204
2005    12.247726
2006    13.879826
2007    15.969157
2008    15.424641
2009    14.197934
2010    15.063745
2011    13.856534
2012    15.416876
2013    15.731077
2014    14.019860
2015    13.769478
2016    13.640933
2017    16.670494
2018    16.243581
2019    16.627422
2020    18.063394
dtype: float64

In [13]:
# Compare each catcher-season with the league rate of the same year.
# baseline_re: event_re a league-average rate would produce over the catcher's innings
#              (re_rate is indexed by yr and aligns on the yr index level).
# re_diff:     baseline minus actual, so positive means less event_re than league average.
cat_totals['baseline_re'] = cat_totals['innings'].mul(re_rate)
cat_totals['re_diff'] = cat_totals['baseline_re'].sub(cat_totals['event_re'])
cat_totals.sort_values(by='re_diff').tail(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,innings,event_re,baseline_re,re_diff
pos2_fld_id,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
moliy001,2007,864,-0.534239,10.019863,10.554102
moliy001,2012,1281,3.406437,14.342061,10.935624
rodri001,1998,1231,-0.088917,11.009824,11.09874
poseb001,2016,1125,-0.028742,11.144553,11.173295
ramow001,2015,1080,-0.445908,10.79959,11.245499
moliy001,2010,1147,0.694835,12.547651,11.852816
schnb001,2004,1120,-1.075113,10.869128,11.944241
realj001,2019,1144,1.766665,13.813922,12.047257
reyeg001,1991,574,-7.92377,4.29866,12.222431
hanir001,2012,919,-2.179506,10.289113,12.46862


In [20]:
# Career totals: add up all season rows of each catcher, best re_diff first
careers = cat_totals.groupby(level='pos2_fld_id').sum()
careers.sort_values(by='re_diff', ascending=False)

Unnamed: 0_level_0,innings,event_re,baseline_re,re_diff
pos2_fld_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rodri001,20860,70.388823,197.281504,126.892681
moliy001,17616,69.079686,190.008841,120.929155
ausmb001,16178,85.108200,154.901998,69.793798
poseb001,8921,35.723343,97.786384,62.063041
karkr001,5641,1.884928,49.599537,47.714609
...,...,...,...,...
hatts001,2936,68.753669,25.416199,-43.337470
barrm003,7343,122.121063,69.813466,-52.307596
piera001,16681,221.471714,168.873289,-52.598424
piazm001,13906,181.665767,127.905264,-53.760503


In [25]:
# Career rates: re_diff per inning, scaled by 162 and 8.5 to a full-season equivalent.
# NOTE(review): the saved output under this cell shows unscaled values (they equal
# re_diff / innings), so it appears to predate the scaling -- re-run to refresh it.
per_inning_diff = careers['re_diff'].div(careers['innings'])
careers['re_diff_rate'] = per_inning_diff.mul(162).mul(8.5)
has_enough_innings = careers['innings'].ge(3000)  # restrict the ranking to 3000+ innings
careers.loc[has_enough_innings].sort_values(by='re_diff_rate')

Unnamed: 0_level_0,innings,event_re,baseline_re,re_diff,re_diff_rate
pos2_fld_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
taube001,6599,114.368114,58.407666,-55.960448,-0.008480
barrm003,7343,122.121063,69.813466,-52.307596,-0.007123
widgc001,4458,66.466320,40.973295,-25.493024,-0.005718
berrd002,3202,43.107538,25.639176,-17.468363,-0.005455
darnt001,4387,70.902213,48.707899,-22.194315,-0.005059
...,...,...,...,...,...
moliy001,17616,69.079686,190.008841,120.929155,0.006865
poseb001,8921,35.723343,97.786384,62.063041,0.006957
perer003,3711,15.325667,42.156634,26.830967,0.007230
karkr001,5641,1.884928,49.599537,47.714609,0.008459
