# Age Distributions by Position

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

import boxball_loader as bbl


In [2]:
apps = bbl.load_appearances()
apps.columns

Index(['year_id', 'team_id', 'lg_id', 'player_id', 'g_all', 'gs', 'g_batting',
       'g_defense', 'g_p', 'g_c', 'g_1b', 'g_2b', 'g_3b', 'g_ss', 'g_lf',
       'g_cf', 'g_rf', 'g_of', 'g_dh', 'g_ph', 'g_pr'],
      dtype='object')

In [3]:
def compute_baseball_age(yr, bd):
    age = yr - bd.year - np.where(bd.month>=7, 1, 0)
    return age
    

In [4]:
birthdates = bbl.load_birthdates()
birthdates

player_id
aardsda01   1981-12-27
aaronha01   1934-02-05
aaronto01   1939-08-05
aasedo01    1954-09-08
abadan01    1972-08-25
               ...    
zupofr01    1939-08-29
zuvelpa01   1958-10-31
zuverge01   1924-08-20
zwilldu01   1888-11-02
zychto01    1990-08-07
Name: birthdate, Length: 19670, dtype: datetime64[ns]

In [5]:
df = pd.merge(left=apps.query('year_id >= 1980').set_index('player_id'), right=birthdates, on='player_id').reset_index()
df['baseball_age'] = compute_baseball_age(df['year_id'], df['birthdate'].dt)

yrly_games = df
yrly_games.sample(10)

Unnamed: 0,player_id,year_id,team_id,lg_id,g_all,gs,g_batting,g_defense,g_p,g_c,...,g_ss,g_lf,g_cf,g_rf,g_of,g_dh,g_ph,g_pr,birthdate,baseball_age
8202,dawlebi01,1988,PHI,NL,8,0.0,8,8.0,8,0,...,0,0,0,0,0,0.0,0.0,0.0,1958-02-06,30
48782,kuhlch01,2017,PIT,NL,33,31.0,30,31.0,31,0,...,0,0,0,0,0,0.0,2.0,0.0,1992-09-10,24
48929,musgrjo01,2017,HOU,AL,38,15.0,4,38.0,38,0,...,0,0,0,0,0,0.0,0.0,0.0,1992-12-04,24
20250,mccarda01,2003,OAK,AL,8,7.0,8,8.0,0,0,...,0,5,0,0,5,0.0,0.0,0.0,1969-11-23,33
45918,puigya01,2013,LAN,NL,104,96.0,104,100.0,0,0,...,0,2,10,93,100,0.0,5.0,0.0,1990-12-07,22
14391,schilcu01,1996,PHI,NL,28,26.0,28,26.0,26,0,...,0,0,0,0,0,0.0,2.0,0.0,1966-11-14,29
31483,figgich01,2005,LAA,AL,158,157.0,158,151.0,0,0,...,4,15,50,8,72,7.0,1.0,0.0,1978-01-22,27
15727,aldresc01,1991,DET,AL,11,11.0,0,11.0,11,0,...,0,0,0,0,0,0.0,0.0,0.0,1968-06-12,23
40186,avilaal01,2011,DET,AL,141,135.0,141,134.0,0,133,...,0,0,0,0,0,4.0,4.0,0.0,1987-01-29,24
28638,willisc01,2000,CIN,NL,48,10.0,47,48.0,48,0,...,0,0,0,0,0,0.0,0.0,0.0,1976-02-17,24


In [6]:
posns = ['c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf', 'dh']
pos_cols = [f'g_{pos}' for pos in posns]
pos_all_cols =  [f'g_{pos}' for pos in (posns+['defense', 'all'])]
position_colors = {'c': 'black', '1b': 'darkkhaki', '2b': 'khaki', '3b': 'brown', 'ss': 'tan', 'lf': 'green', 'cf': 'green', 'rf': 'green', 'dh': 'grey'}
for pos in posns:
    position_colors[f'g_{pos}'] = position_colors[pos]

yrly_games['primary'] = yrly_games[pos_cols].idxmax(axis=1).str[2:]
yrly_games.sample(10)


Unnamed: 0,player_id,year_id,team_id,lg_id,g_all,gs,g_batting,g_defense,g_p,g_c,...,g_lf,g_cf,g_rf,g_of,g_dh,g_ph,g_pr,birthdate,baseball_age,primary
14122,jordari02,1990,PHI,NL,92,81.0,92,84.0,0,0,...,0,0,0,0,0.0,8.0,0.0,1965-05-26,25,1b
10398,greenmi01,1987,BOS,AL,125,103.0,125,92.0,0,1,...,64,0,28,91,15.0,20.0,1.0,1963-07-18,23,lf
8004,bilarda01,1991,SDN,NL,15,6.0,15,13.0,0,13,...,0,0,0,0,0.0,2.0,0.0,1959-05-26,32,c
29117,harpetr01,2002,TBA,AL,37,7.0,3,37.0,37,0,...,0,0,0,0,0.0,0.0,0.0,1976-05-21,26,c
13839,dejesjo01,1989,KCA,AL,3,1.0,0,3.0,3,0,...,0,0,0,0,0.0,0.0,0.0,1965-01-06,24,c
23784,erstada01,2005,LAA,AL,153,149.0,153,147.0,0,0,...,0,0,0,0,5.0,0.0,1.0,1974-06-04,31,1b
35122,doumiry01,2013,MIN,AL,135,119.0,135,76.0,0,43,...,1,0,32,33,49.0,16.0,0.0,1981-04-03,32,dh
3283,molinbo01,1980,CHA,AL,119,91.0,119,49.0,0,0,...,49,0,0,49,48.0,26.0,2.0,1950-05-21,30,lf
5306,walkech01,1987,CHN,NL,47,23.0,47,35.0,0,0,...,25,3,5,33,0.0,7.0,10.0,1957-11-25,29,lf
22709,myersmi01,2007,NYA,AL,55,0.0,6,55.0,55,0,...,0,0,0,0,0.0,0.0,0.0,1969-06-26,38,c


In [7]:
# Let's look at playing time by age

pt_by_age = yrly_games.query('2010<=year_id<2020').groupby('baseball_age')[pos_cols].sum()/10/162
px.line(pt_by_age,color_discrete_map=position_colors)

In [8]:
# Let's look at 28yo second basemen.  What were they doing 5 years later?

def get_future_pt(gms: pd.DataFrame, pos: str, start_age: int = 28, comp_age: int = 33, gm_threshold: int = 100) -> pd.Series:
    max_year = 2019-(comp_age-start_age)

    prime_regulars = gms.query(f'baseball_age==@start_age and g_all>=@gm_threshold and primary==@pos and year_id<=@max_year')['player_id']
    avg_gms = gms.query('player_id in @prime_regulars and baseball_age==@comp_age')[pos_all_cols].sum()/len(prime_regulars)
    avg_gms['n'] = len(prime_regulars)
    return avg_gms.rename(pos)

get_future_pt(yrly_games, '2b')

g_c           0.000000
g_1b          2.655556
g_2b         49.255556
g_3b          4.033333
g_ss          4.944444
g_lf          5.444444
g_cf          0.433333
g_rf          2.233333
g_dh          3.177778
g_defense    66.644444
g_all        75.122222
n            90.000000
Name: 2b, dtype: float64

In [9]:
future = pd.concat([get_future_pt(yrly_games, pos) for pos in posns], axis=1).T
future

Unnamed: 0,g_c,g_1b,g_2b,g_3b,g_ss,g_lf,g_cf,g_rf,g_dh,g_defense,g_all,n
c,63.611111,3.125,0.027778,0.069444,0.0,0.041667,0.0,0.25,1.208333,66.611111,70.875,72.0
1b,0.0,61.847059,0.094118,3.188235,0.0,3.035294,0.0,1.447059,6.894118,68.576471,79.505882,85.0
2b,0.0,2.655556,49.255556,4.033333,4.944444,5.444444,0.433333,2.233333,3.177778,66.644444,75.122222,90.0
3b,0.031579,11.178947,3.031579,50.063158,1.926316,2.736842,0.136842,0.715789,4.263158,68.905263,79.294737,95.0
ss,0.0,1.313953,10.755814,8.290698,40.5,2.05814,2.534884,1.813953,1.546512,66.174419,70.55814,86.0
lf,0.0,3.040816,0.438776,2.704082,0.214286,30.520408,5.193878,7.602041,7.418367,48.632653,61.142857,98.0
cf,0.0,0.825243,0.786408,0.834951,0.592233,16.271845,35.116505,19.0,3.009709,70.398058,79.572816,103.0
rf,0.0,4.297297,1.099099,0.324324,0.0,9.864865,7.054054,37.126126,6.927928,58.162162,70.477477,111.0
dh,0.0,8.461538,0.0,0.0,0.0,1.769231,0.0,4.923077,35.0,14.384615,54.230769,13.0


In [10]:
# try it again for 2000-current
def get_pt_matrix(start_age=24, comp_age=27):
    future = pd.concat([get_future_pt(yrly_games.query('year_id>=2000'), pos, start_age=start_age, comp_age=comp_age) for pos in posns], axis=1).T
    return future.apply(lambda x: round(x, 1))

get_pt_matrix(24, 27)

Unnamed: 0,g_c,g_1b,g_2b,g_3b,g_ss,g_lf,g_cf,g_rf,g_dh,g_defense,g_all,n
c,105.0,1.3,0.0,0.0,0.0,0.0,0.0,0.0,2.6,105.4,110.8,20.0
1b,0.0,101.1,5.9,0.7,0.0,1.5,0.0,0.1,7.7,108.2,120.4,25.0
2b,0.0,4.0,72.8,11.8,6.7,5.1,4.5,0.6,1.2,103.0,109.9,34.0
3b,0.0,12.0,2.9,70.9,0.6,5.4,1.4,5.5,5.2,96.2,105.2,36.0
ss,0.0,0.0,12.0,4.9,100.5,0.5,0.2,0.2,1.2,117.5,120.6,34.0
lf,0.0,0.1,3.4,0.0,0.4,52.0,9.8,26.9,6.2,90.6,100.4,25.0
cf,0.0,0.5,0.7,0.1,0.3,12.4,85.0,15.2,1.2,112.8,118.9,42.0
rf,0.0,4.3,0.0,0.0,0.0,22.6,30.0,50.5,0.9,103.3,107.8,26.0
dh,0.0,22.5,0.0,0.0,0.0,4.5,0.0,10.5,56.0,37.5,102.5,2.0


In [11]:
get_pt_matrix(27, 30)

Unnamed: 0,g_c,g_1b,g_2b,g_3b,g_ss,g_lf,g_cf,g_rf,g_dh,g_defense,g_all,n
c,84.2,8.0,0.0,0.1,0.0,0.4,0.0,0.2,7.6,91.5,104.2,37.0
1b,0.0,76.1,0.1,5.8,0.0,2.8,0.0,8.7,7.0,92.0,103.2,45.0
2b,0.0,2.8,62.7,9.7,6.6,4.5,1.2,0.8,0.8,86.0,92.4,59.0
3b,0.0,12.2,7.4,62.4,4.4,0.5,0.0,3.2,4.1,88.1,96.0,42.0
ss,0.0,0.4,12.5,10.6,89.0,0.8,2.8,0.2,0.8,114.2,118.7,47.0
lf,0.0,9.4,0.4,2.3,0.0,54.7,6.5,13.2,8.9,84.6,99.8,48.0
cf,0.0,2.6,0.9,0.1,0.1,18.8,58.8,11.5,1.7,90.1,96.2,58.0
rf,0.0,4.4,1.4,0.1,0.2,13.7,21.0,62.2,4.6,98.8,107.4,42.0
dh,1.3,22.2,0.0,5.2,0.0,12.7,0.0,1.1,34.0,40.2,80.0,10.0


In [12]:
# as an aside
# Wait, how do we get 1b->2b move of 5.9 games at age 24->27?  Did someone move from 1B to 2B?  Let's find out.

def temp_get_future_pt(gms: pd.DataFrame, pos: str, start_age: int = 28, comp_age: int = 33, gm_threshold: int = 100) -> pd.Series:
    max_year = 2019-(comp_age-start_age)

    prime_regulars = gms.query(f'baseball_age==@start_age and g_all>=@gm_threshold and primary==@pos and year_id<=@max_year')['player_id']
    print(gms.query('player_id in @prime_regulars and baseball_age==@comp_age')[[f'g_{pos}' for pos in posns]].sort_values('g_2b'))

temp_get_future_pt(yrly_games.query('year_id>=2000'), '1b', start_age=24, comp_age=27)

       g_c  g_1b  g_2b  g_3b  g_ss  g_lf  g_cf  g_rf   g_dh
25306    0   119     0     0     0     0     0     0   14.0
43268    0   157     0     0     0     0     0     0    5.0
43163    0   157     0     0     0     0     0     0    2.0
42704    0   120     0     0     0    14     0     0    0.0
42415    0    79     0     0     0     0     0     0    0.0
41792    0   105     0    16     0     0     0     0    0.0
38941    0     1     0     0     0     0     0     1    0.0
38940    0     1     0     0     0     0     0     1    0.0
38778    0   160     0     0     0     0     0     0    0.0
37779    0     7     0     0     0     0     0     0  150.0
36814    0   150     0     0     0     0     0     0    1.0
37666    0    36     0     0     0     0     0     0    0.0
35207    0   159     0     0     0     0     0     0    3.0
34228    0   151     0     0     0     0     0     0    0.0
34206    0   116     0     0     0     0     0     0    3.0
33929    0   156     0     0     0     0

In [13]:
yrly_games.loc[39642]

player_id                 murphda08
year_id                        2012
team_id                         NYN
lg_id                            NL
g_all                           156
gs                            136.0
g_batting                       156
g_defense                     144.0
g_p                               0
g_c                               0
g_1b                             12
g_2b                            138
g_3b                              0
g_ss                              0
g_lf                              0
g_cf                              0
g_rf                              0
g_of                              0
g_dh                            0.0
g_ph                           18.0
g_pr                            0.0
birthdate       1985-04-01 00:00:00
baseball_age                     27
primary                          2b
Name: 39642, dtype: object

In [14]:
# OK, let's do these 3-year diffs for each start_age, and for now look at g_all as the output variable

trends = pd.concat((get_pt_matrix(start_age, start_age+3)['g_all'].rename(f'{start_age}->{start_age+3}') for start_age in range(24, 36)), axis=1)
trends


Unnamed: 0,24->27,25->28,26->29,27->30,28->31,29->32,30->33,31->34,32->35,33->36,34->37,35->38
c,110.8,98.6,97.9,104.2,94.4,96.9,91.0,80.9,81.8,68.2,47.6,43.2
1b,120.4,113.3,100.8,103.2,111.2,99.0,90.8,88.5,77.8,56.9,40.3,43.7
2b,109.9,111.4,105.7,92.4,102.5,98.7,88.2,77.8,72.9,70.4,54.1,50.6
3b,105.2,109.1,96.7,96.0,92.5,87.6,85.0,59.3,70.5,58.4,54.5,34.3
ss,120.6,123.0,106.3,118.7,100.1,100.3,94.5,81.5,65.2,91.7,64.7,40.5
lf,100.4,91.2,94.1,99.8,83.0,81.8,95.0,77.0,67.0,50.8,50.3,51.5
cf,118.9,101.7,97.6,96.2,103.1,90.7,85.8,71.4,60.9,93.2,69.9,69.0
rf,107.8,83.0,105.3,107.4,90.2,92.5,90.4,94.1,68.0,61.4,74.0,64.0
dh,102.5,97.7,98.8,80.0,82.1,68.5,101.4,72.7,82.7,93.0,47.7,57.1


In [15]:
px.line(trends.T.rolling(5, center=True).mean().dropna(), color_discrete_map=position_colors)