## How Likely are Players to reach career milestones (e.g., 3000 H)

### And how did the shortened season impact their chances?

Players like Starlin Castro and Nick Markakis had outside chances of 3000 hits, that have been seriously threatened by a shortened season.  Can we estimate their probabilities before and after, by finding comparable players?

In [1]:
import pandas as pd
import boxball_loader as bbl

In [14]:
bat = bbl.load_batting(bbl.Eras.Integration, bbl.PlayerType.POSITION, bbl.CoalesceMode.PLAYER_SEASON)



In [3]:
date_cols = ['year', 'month', 'day']
col_mapper = { f'birth_{col}': col for col in date_cols}
ppl = bbl.load_people().set_index('player_id')[col_mapper.keys()].dropna().rename(columns=col_mapper)
birthdates = pd.to_datetime(ppl).rename('birth_date')
birthdates

player_id
aardsda01   1981-12-27
aaronha01   1934-02-05
aaronto01   1939-08-05
aasedo01    1954-09-08
abadan01    1972-08-25
               ...    
zupofr01    1939-08-29
zuvelpa01   1958-10-31
zuverge01   1924-08-20
zwilldu01   1888-11-02
zychto01    1990-08-07
Name: birth_date, Length: 19670, dtype: datetime64[ns]

In [4]:
# Only consider players born between 1930 (e.g., played post WW2/integration) and before 1980 (career is complete)
bat_age = pd.merge(left=bat, right=birthdates, on=['player_id']).query('"1930-01-01" <= birth_date < "1980-01-01"')
bat_age

Unnamed: 0,player_id,yr,stint,team_id,lg_id,g,ab,r,h,_2b,...,cs,bb,so,ibb,hbp,sh,sf,gidp,franch_id,birth_date
35530,antonjo02,1948,1,BSN,NL,4,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,ATL,1930-04-12
35531,antonjo02,1949,1,BSN,NL,22,25,0,3,0,...,,1,9.0,,0.0,5.0,,0.0,ATL,1930-04-12
35532,antonjo02,1950,1,BSN,NL,20,16,1,2,0,...,,0,3.0,,0.0,0.0,,1.0,ATL,1930-04-12
35533,antonjo02,1953,1,ML1,NL,31,62,7,11,2,...,0.0,3,14.0,,0.0,4.0,,0.0,ATL,1930-04-12
35534,antonjo02,1954,1,NY1,NL,39,98,6,16,0,...,0.0,4,25.0,,0.0,4.0,0.0,2.0,SFG,1930-04-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98635,boscajc01,2013,1,CHN,NL,6,9,1,2,1,...,0.0,0,2.0,0.0,1.0,0.0,0.0,0.0,CHC,1979-12-26
99512,tateyyo01,2011,1,TEX,AL,39,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,TEX,1975-12-26
99513,tateyyo01,2012,1,TEX,AL,14,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,TEX,1975-12-26
100217,sanitam01,2011,1,NYA,AL,4,0,0,0,0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,NYY,1979-07-04


In [5]:
import arrow

def compute_baseball_age(b):
    return (arrow.Arrow(b['yr'],7,1).date()-b['birth_date'].date()).days/365.24



In [6]:
bat_age['age'] = bat_age.apply(compute_baseball_age, axis=1)
bat_age

Unnamed: 0,player_id,yr,stint,team_id,lg_id,g,ab,r,h,_2b,...,bb,so,ibb,hbp,sh,sf,gidp,franch_id,birth_date,age
35530,antonjo02,1948,1,BSN,NL,4,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,ATL,1930-04-12,18.220896
35531,antonjo02,1949,1,BSN,NL,22,25,0,3,0,...,1,9.0,,0.0,5.0,,0.0,ATL,1930-04-12,19.220239
35532,antonjo02,1950,1,BSN,NL,20,16,1,2,0,...,0,3.0,,0.0,0.0,,1.0,ATL,1930-04-12,20.219582
35533,antonjo02,1953,1,ML1,NL,31,62,7,11,2,...,3,14.0,,0.0,4.0,,0.0,ATL,1930-04-12,23.220348
35534,antonjo02,1954,1,NY1,NL,39,98,6,16,0,...,4,25.0,,0.0,4.0,0.0,2.0,SFG,1930-04-12,24.219691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98635,boscajc01,2013,1,CHN,NL,6,9,1,2,1,...,0,2.0,0.0,1.0,0.0,0.0,0.0,CHC,1979-12-26,33.514949
99512,tateyyo01,2011,1,TEX,AL,39,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,TEX,1975-12-26,35.513635
99513,tateyyo01,2012,1,TEX,AL,14,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,TEX,1975-12-26,36.515716
100217,sanitam01,2011,1,NYA,AL,4,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,NYY,1979-07-04,31.992662


In [7]:
# Define comparable players as those with a similar total of hits (or whatever) over a given age span
min_age=20
max_age=29
comp_total = 1617
def find_comps(min_age, max_age, comp_total):
    comp_range=100
    comp_hits = bat_age[(min_age <= bat_age['age']) & (bat_age['age'] <= max_age)].groupby(['player_id'])['h'].sum()
    comps = comp_hits[abs(comp_hits-comp_total)<comp_range]
    return comps.sort_values()
    
comps = find_comps(min_age, max_age, comp_total)
comps

player_id
alomaro01    1522
kalinal01    1537
rodrial01    1663
aaronha01    1697
Name: h, dtype: int16

In [8]:
# Then look at remaining careers of the comps
# With and without a missing season
comps_after_29 = bat_age[((bat_age['player_id'].isin(comps.index)& (bat_age['age']>=30)))].groupby(['player_id'])['h'].sum()
comps_after_30 = bat_age[((bat_age['player_id'].isin(comps.index)& (bat_age['age']>=31)))].groupby(['player_id'])['h'].sum()
comps_after_30

player_id
aaronha01    1686
alomaro01     899
kalinal01    1058
rodrial01    1048
Name: h, dtype: int16

In [9]:
comps_after_29.mean()

1330.5

In [10]:
comps_after_30.mean()

1172.75

In [11]:
# Castro has 1633 hits now, 1617 coming into 2020
(comps_after_29>=(3000-1617)).value_counts()

False    3
True     1
Name: h, dtype: int64

In [12]:
comps_after_29.sort_values()

player_id
alomaro01    1065
kalinal01    1170
rodrial01    1214
aaronha01    1873
Name: h, dtype: int16

In [13]:
# Castro has 1633 hits now, 1617 coming into 2020
(comps_after_30>=(3000-1633)).value_counts()

False    3
True     1
Name: h, dtype: int64