In [1]:
import pandas as pd
import boxball_loader as bbl
import baseball_stats_utils as bsu

In [2]:
# Columns holding the id of the runner on first, second and third base
runner_cols = ['base1_run_id', 'base2_run_id', 'base3_run_id']
cols = ['bat_id', 'pit_id', 'bat_hand_cd', 'pit_hand_cd', 'ob_fl', 'tb_ct'] + runner_cols

# Load the event data for bbl.Seasons(1979, 2003); pa_only=True presumably keeps
# only events that are plate appearances -- TODO confirm against boxball_loader
ev = bbl.load_event_data(bbl.Seasons(1979, 2003), cols, pa_only=True)

# Flag indicating whether any runner is on base (any truthy runner-id column)
ev['baserunner_fl'] = ev[runner_cols].any(axis=1)

# Seed the preview so the displayed rows are identical on every re-run
ev.sample(10, random_state=0)

Unnamed: 0,h_fl,yr,ob_fl,event_cd,game_type,date,pit_hand_cd,pit_id,base1_run_id,game_id,tb_ct,base3_run_id,ab_fl,base2_run_id,bat_event_fl,bat_id,bat_hand_cd,baserunner_fl
12915605,1,1992,1,20,RS,1992-05-25,R,sampb002,,MON199205250,1,,True,,True,henrb001,L,False
9220699,0,1991,0,2,RS,1991-05-22,L,kilgp001,,DET199105220,0,,True,,True,allaa001,R,False
4502479,0,1997,0,2,RS,1997-09-15,L,eyres001,,MIL199709150,0,,True,,True,stink001,R,False
442104,0,1989,0,2,RS,1989-04-24,R,stewd001,white002,OAK198904240,0,,True,,True,bellg001,R,True
12238046,0,1995,0,2,RS,1995-08-06,L,howes001,flahj001,DET199508060,0,,True,,True,gibsk001,L,True
5252768,0,2002,0,3,RS,2002-07-02,R,tuckt001,,ATL200207020,0,,True,,True,castv001,R,False
5379846,0,2002,1,16,RS,2002-06-08,R,bensk001,jenkg001,PIT200206080,0,,False,youne001,True,sexsr001,R,True
13345565,0,1981,1,14,RS,1981-05-29,L,matlj101,,TEX198105290,0,,False,,True,burrj001,R,False
8665050,0,1979,0,2,RS,1979-04-19,R,hartp001,,MIN197904190,0,,True,,True,bayld001,R,False
9280362,0,1991,0,2,RS,1991-07-19,R,moorm001,,OAK199107190,0,,True,,True,nokem001,L,False


In [3]:
# Id(s) of the baserunner under study (presumably Rickey Henderson, going by
# the rickey_* variable names -- verify the id)
runner_ids = ['hendr001']

# Per-base match table, then collapse to one flag per PA:
# True when any of the three bases is occupied by one of runner_ids
runner_matches = ev[runner_cols].isin(runner_ids)
rickey_fl = runner_matches.any(axis=1)
rickey_fl.value_counts()

False    4121871
True        9839
dtype: int64

In [5]:
# For now, the baseline is each batter's per-PA line over his PAs with runners on base
def generate_batter_baselines(ev):
    """Return per-PA rates for each batter, using only PAs with a runner on base.

    Parameters
    ----------
    ev : DataFrame
        Event rows. Must have a boolean ``baserunner_fl`` column (used to
        filter) plus whatever ``bsu.summarize_events`` requires.

    Returns
    -------
    DataFrame
        Indexed like the ``bsu.summarize_events(..., ['bat_id'])`` result, with
        one column per counting stat (ob, ab, h, tb, k, bb, ibb, hr), each
        divided by that batter's ``pa`` total.
    """
    stat_names = ['ob', 'ab', 'h', 'tb', 'k', 'bb', 'ibb', 'hr']
    runners_on = ev.query('baserunner_fl')
    totals_by_batter = bsu.summarize_events(runners_on, ['bat_id'])
    # Row-wise division: every stat column is divided by the same batter's PA count
    pa_by_batter = totals_by_batter['pa']
    return totals_by_batter[stat_names].div(pa_by_batter, axis=0)

# Build the per-batter baseline table once; get_baseline_for_pas reads it by default
batter_rates = generate_batter_baselines(ev)
# Seed the preview so the displayed rows are identical on every re-run
batter_rates.sample(10, random_state=0)

Unnamed: 0_level_0,ob,ab,h,tb,k,bb,ibb,hr
bat_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
smitb004,0.157895,0.631579,0.052632,0.052632,0.105263,0.105263,0.0,0.0
howad001,0.277228,0.826733,0.194307,0.264851,0.157178,0.076733,0.008663,0.008663
valdp001,0.342105,0.894737,0.236842,0.447368,0.184211,0.105263,0.0,0.026316
olerj001,0.381595,0.797398,0.239219,0.375331,0.10648,0.131535,0.036377,0.026259
meyeb001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
barnb001,0.166667,0.787879,0.121212,0.121212,0.378788,0.045455,0.0,0.0
pattc001,0.29249,0.922925,0.254941,0.403162,0.197628,0.031621,0.005929,0.031621
jonet004,0.243363,0.862832,0.199115,0.256637,0.123894,0.044248,0.00885,0.0
escoa002,0.305556,0.916667,0.25,0.305556,0.333333,0.055556,0.013889,0.013889
ruanw001,0.411765,1.0,0.411765,0.588235,0.176471,0.0,0.0,0.0


In [6]:
# Number of flagged PAs (rickey_fl is True) per batter, most frequent first
batters_behind_runner = ev.loc[rickey_fl, 'bat_id']
rickey_ct = batters_behind_runner.value_counts()
rickey_ct

murpd002    898
mattd001    636
lansc001    479
randw001    411
cansj001    328
           ... 
bergs001      1
willg002      1
youne002      1
noboj001      1
rossd001      1
Name: bat_id, Length: 247, dtype: int64

In [10]:
# This function encapsulates how we find a baseline for a set of PAs.
# For now, use the same thing we have been: a weighted average of each
# batter's career line with runners on base.
def get_baseline_for_pas(pas, rates=None):
    """Return the expected per-PA batting line for the batters appearing in ``pas``.

    Each batter's row of ``rates`` is weighted by the number of rows that
    batter has in ``pas``; the weighted rows are summed and divided by the
    total weight of the batters that were found in ``rates``.

    Parameters
    ----------
    pas : DataFrame
        Plate appearances; only the ``bat_id`` column is read.
    rates : DataFrame, optional
        Per-PA rates indexed by batter id; must contain at least the
        ``ob``, ``ab``, ``h`` and ``tb`` columns. When omitted, the
        notebook-level ``batter_rates`` table is used, as before.

    Returns
    -------
    Series
        One entry per column of ``rates``, plus ``pa`` (always 1), ``ba``,
        ``obp``, ``slg`` and ``woba`` (the latter from ``bsu.get_woba``).
    """
    if rates is None:
        # Backward-compatible fallback to the notebook-level table
        rates = batter_rates

    batter_pa_counts = pas['bat_id'].value_counts()

    # Normalise by the PAs of batters that actually have a rates row. Dividing
    # by len(pas) would count batters missing from `rates` in the denominator
    # while they contribute nothing to the numerator, dragging every rate down.
    has_rates = batter_pa_counts.index.isin(rates.index)
    matched_pa = batter_pa_counts[has_rates].sum()

    # Row-wise multiply aligns on batter id; unmatched ids become NaN rows,
    # which sum() skips.
    baseline = rates.mul(batter_pa_counts, axis=0).sum() / matched_pa

    baseline['pa'] = 1
    baseline['ba'] = baseline['h'] / baseline['ab']
    baseline['obp'] = baseline['ob']
    baseline['slg'] = baseline['tb'] / baseline['ab']
    baseline['woba'] = bsu.get_woba(baseline)
    return baseline

# Expected line for the batters who hit in the flagged PAs (rickey_fl is True)
pas_with_runner = ev.loc[rickey_fl]
teammate_baseline = get_baseline_for_pas(pas_with_runner)
teammate_baseline

ob      0.334725
ab      0.851979
h       0.238456
tb      0.368962
k       0.135296
bb      0.090608
ibb     0.017913
hr      0.026353
pa      1.000000
ba      0.279884
obp     0.334725
slg     0.433064
woba    0.314475
dtype: float64

In [11]:
# Summarize every PA keyed by the rickey_fl flag, then keep the row for flag == True
summary_by_flag = bsu.summarize_events(ev, rickey_fl)
with_rickey = summary_by_flag.loc[True]
with_rickey

pa      9839.000000
ob      3327.000000
ab      8190.000000
h       2395.000000
tb      3688.000000
k       1260.000000
bb       881.000000
ibb      181.000000
hr       248.000000
ba         0.292430
obp        0.338144
slg        0.450305
woba       0.317882
k%         0.128062
bb%        0.089542
hr%        0.025206
Name: True, dtype: float64

In [12]:
# Put the baseline and the observed slash lines side by side, one row each
rate_cols = ['ba', 'obp', 'slg']
baseline_row = teammate_baseline[rate_cols].rename('baseline')
observed_row = with_rickey[rate_cols+['pa']].rename('with_runner')
comp = pd.concat([baseline_row, observed_row], axis=1).T

# r27 = 31 * OBP * SLG; presumably a runs-per-27-outs estimate -- TODO confirm the constant
comp['r27'] = 31 * comp['obp'] * comp['slg']
comp

Unnamed: 0,ba,obp,slg,pa,r27
baseline,0.279884,0.334725,0.433064,,4.49368
with_runner,0.29243,0.338144,0.450305,9839.0,4.72031


In [13]:
# Estimate the size of the difference in runs:
# (gap in r27) * (PAs with the runner on) / 36
# NOTE(review): 36 is presumably an approximate number of PAs per 27 outs -- confirm
r27_gap = comp.loc['with_runner', 'r27'] - comp.loc['baseline', 'r27']
pa_with_runner = comp.loc['with_runner', 'pa']
r27_gap * pa_with_runner / 36

61.939209676952565