# Age Distributions by Position

In [1]:
import pandas as pd
import plotly.express as px

import boxball_loader as bbl


In [2]:
apps = bbl.load_appearances()
apps.columns

Index(['year_id', 'team_id', 'lg_id', 'player_id', 'g_all', 'gs', 'g_batting',
       'g_defense', 'g_p', 'g_c', 'g_1b', 'g_2b', 'g_3b', 'g_ss', 'g_lf',
       'g_cf', 'g_rf', 'g_of', 'g_dh', 'g_ph', 'g_pr'],
      dtype='object')

In [3]:
birthdates = bbl.load_birthdates()
birthdates.sample(10)


player_id
richbla01    1897-12-18
fletcge01    1845-04-21
gibauia01    1993-11-19
hendrky01    1989-12-07
rohrle01     1946-03-05
lyonru01     1913-06-26
larocan01    1983-09-13
sheajo01     1904-12-27
whitesa01    1893-08-23
bartosh01    1963-05-14
Name: birthdate, dtype: object

In [4]:
def compute_baseball_age(yr: int, bd):
    age = yr - bd.year - (1 if bd.month >= 7 else 0)
    return age

df = pd.merge(left=apps.query('year_id >= 1980').set_index('player_id'), right=birthdates, on='player_id').reset_index()
df['baseball_age'] = df.apply(lambda row: compute_baseball_age(row['year_id'], row['birthdate']), axis=1)

yrly_games = df
yrly_games.sample(10)


Unnamed: 0,player_id,year_id,team_id,lg_id,g_all,gs,g_batting,g_defense,g_p,g_c,...,g_ss,g_lf,g_cf,g_rf,g_of,g_dh,g_ph,g_pr,birthdate,baseball_age
7360,johnsro03,1983,KCA,AL,9,7.0,9,9.0,0,2,...,0,0,0,0,0,0.0,0.0,0.0,1956-03-23,27
42043,lecursa01,2011,CIN,NL,43,4.0,40,43.0,43,0,...,0,0,0,0,0,0.0,0.0,0.0,1984-05-04,27
30544,mateohe01,2004,MON,NL,40,1.0,40,10.0,0,0,...,0,1,0,0,1,0.0,28.0,4.0,1976-10-14,27
13892,gerenbo01,1991,NYA,AL,64,42.0,64,63.0,0,63,...,0,0,0,0,0,0.0,1.0,0.0,1961-09-22,29
33417,welleto01,2006,FLO,NL,18,0.0,18,18.0,18,0,...,0,0,0,0,0,0.0,0.0,0.0,1978-08-30,27
37430,stokebr01,2009,NYN,NL,69,0.0,66,69.0,69,0,...,0,0,0,0,0,0.0,0.0,0.0,1979-09-07,29
36036,tejedro01,2009,KCA,AL,35,6.0,0,35.0,35,0,...,0,0,0,0,0,0.0,0.0,0.0,1982-03-24,27
36529,hammeja01,2007,TBA,AL,24,14.0,1,24.0,24,0,...,0,0,0,0,0,0.0,0.0,0.0,1982-09-02,24
48699,hellebe01,2017,NYA,AL,9,0.0,0,9.0,9,0,...,0,0,0,0,0,0.0,0.0,0.0,1991-08-05,25
19048,pughti01,1994,CIN,NL,10,9.0,10,10.0,10,0,...,0,0,0,0,0,0.0,0.0,0.0,1967-01-26,27


In [5]:
posns = ['c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf', 'dh']
pos_cols = [f'g_{pos}' for pos in posns]
pos_all_cols =  [f'g_{pos}' for pos in (posns+['defense', 'all'])]

yrly_games['primary'] = yrly_games[pos_cols].idxmax(axis=1).str[2:]
yrly_games.sample(10)


Unnamed: 0,player_id,year_id,team_id,lg_id,g_all,gs,g_batting,g_defense,g_p,g_c,...,g_lf,g_cf,g_rf,g_of,g_dh,g_ph,g_pr,birthdate,baseball_age,primary
30576,milleco01,2007,ATL,NL,12,6.0,12,11.0,0,11,...,0,0,0,0,0.0,1.0,0.0,1976-03-18,31,c
50548,shafeju01,2019,TOR,AL,34,0.0,2,34.0,34,0,...,0,0,0,0,0.0,0.0,0.0,1992-09-18,26,c
25978,bakopa01,2008,CIN,NL,99,88.0,99,96.0,0,96,...,0,0,0,0,0.0,5.0,1.0,1972-06-20,36,c
6518,powerte01,1992,CLE,AL,64,0.0,0,64.0,64,0,...,0,0,0,0,0.0,0.0,0.0,1955-01-31,37,c
13712,brilegr01,1992,SEA,AL,86,47.0,86,49.0,0,0,...,27,13,4,42,12.0,31.0,2.0,1965-05-24,27,lf
24627,wrighja01,2002,MIL,NL,19,19.0,19,19.0,19,0,...,0,0,0,0,0.0,0.0,0.0,1974-12-24,27,c
44899,rodried04,2012,SDN,NL,2,2.0,2,2.0,0,2,...,0,0,0,0,0.0,0.0,0.0,1985-12-01,26,c
6619,ruckeda01,1985,PHI,NL,41,3.0,41,39.0,39,0,...,0,0,0,0,0.0,0.0,2.0,1957-09-01,27,c
30339,ingebr01,2007,DET,AL,151,146.0,151,150.0,0,0,...,0,0,0,0,0.0,0.0,1.0,1977-05-19,30,3b
19216,snowjt01,2001,SFN,NL,101,80.0,101,92.0,0,0,...,0,0,0,0,0.0,15.0,0.0,1968-02-26,33,1b


In [6]:
# Let's look at playing time by age

pt_by_age = yrly_games.query('2010<=year_id<2020').groupby('baseball_age')[pos_cols].sum()/10/162
position_colors = {'g_c': 'black', 'g_1b': 'darkkhaki', 'g_2b': 'khaki', 'g_3b': 'brown', 'g_ss': 'tan', 'g_lf': 'green', 'g_cf': 'green', 'g_rf': 'green', 'g_dh': 'grey'}

px.line(pt_by_age,color_discrete_map=position_colors)

In [7]:
# Let's look at 28yo second basemen.  What were they doing 5 years later?

def get_future_pt(gms: pd.DataFrame, pos: str, start_age: int = 28, comp_age: int = 33, gm_threshold: int = 100) -> pd.Series:
    max_year = 2019-(comp_age-start_age)

    prime_regulars = gms.query(f'baseball_age==@start_age and g_all>=@gm_threshold and primary==@pos and year_id<=@max_year')['player_id']
    avg_gms = gms.query('player_id in @prime_regulars and baseball_age==@comp_age')[pos_all_cols].sum()/len(prime_regulars)
    avg_gms['n'] = len(prime_regulars)
    return avg_gms.rename(pos)

get_future_pt(yrly_games, '2b')

g_c           0.000000
g_1b          2.655556
g_2b         49.255556
g_3b          4.033333
g_ss          4.944444
g_lf          5.444444
g_cf          0.433333
g_rf          2.233333
g_dh          3.177778
g_defense    66.644444
g_all        75.122222
n            90.000000
Name: 2b, dtype: float64

In [8]:
future = pd.concat([get_future_pt(yrly_games, pos) for pos in posns], axis=1).T
future

Unnamed: 0,g_c,g_1b,g_2b,g_3b,g_ss,g_lf,g_cf,g_rf,g_dh,g_defense,g_all,n
c,63.611111,3.125,0.027778,0.069444,0.0,0.041667,0.0,0.25,1.208333,66.611111,70.875,72.0
1b,0.0,61.847059,0.094118,3.188235,0.0,3.035294,0.0,1.447059,6.894118,68.576471,79.505882,85.0
2b,0.0,2.655556,49.255556,4.033333,4.944444,5.444444,0.433333,2.233333,3.177778,66.644444,75.122222,90.0
3b,0.031579,11.178947,3.031579,50.063158,1.926316,2.736842,0.136842,0.715789,4.263158,68.905263,79.294737,95.0
ss,0.0,1.313953,10.755814,8.290698,40.5,2.05814,2.534884,1.813953,1.546512,66.174419,70.55814,86.0
lf,0.0,3.040816,0.438776,2.704082,0.214286,30.520408,5.193878,7.602041,7.418367,48.632653,61.142857,98.0
cf,0.0,0.825243,0.786408,0.834951,0.592233,16.271845,35.116505,19.0,3.009709,70.398058,79.572816,103.0
rf,0.0,4.297297,1.099099,0.324324,0.0,9.864865,7.054054,37.126126,6.927928,58.162162,70.477477,111.0
dh,0.0,8.461538,0.0,0.0,0.0,1.769231,0.0,4.923077,35.0,14.384615,54.230769,13.0


In [9]:
# try it again for 2000-current
def get_pt_matrix(start_age=24, comp_age=27):
    future = pd.concat([get_future_pt(yrly_games.query('year_id>=2000'), pos, start_age=start_age, comp_age=comp_age) for pos in posns], axis=1).T
    return future.apply(lambda x: round(x, 1))

get_pt_matrix(24, 27)

Unnamed: 0,g_c,g_1b,g_2b,g_3b,g_ss,g_lf,g_cf,g_rf,g_dh,g_defense,g_all,n
c,105.0,1.3,0.0,0.0,0.0,0.0,0.0,0.0,2.6,105.4,110.8,20.0
1b,0.0,101.1,5.9,0.7,0.0,1.5,0.0,0.1,7.7,108.2,120.4,25.0
2b,0.0,4.0,72.8,11.8,6.7,5.1,4.5,0.6,1.2,103.0,109.9,34.0
3b,0.0,12.0,2.9,70.9,0.6,5.4,1.4,5.5,5.2,96.2,105.2,36.0
ss,0.0,0.0,12.0,4.9,100.5,0.5,0.2,0.2,1.2,117.5,120.6,34.0
lf,0.0,0.1,3.4,0.0,0.4,52.0,9.8,26.9,6.2,90.6,100.4,25.0
cf,0.0,0.5,0.7,0.1,0.3,12.4,85.0,15.2,1.2,112.8,118.9,42.0
rf,0.0,4.3,0.0,0.0,0.0,22.6,30.0,50.5,0.9,103.3,107.8,26.0
dh,0.0,22.5,0.0,0.0,0.0,4.5,0.0,10.5,56.0,37.5,102.5,2.0


In [10]:
get_pt_matrix(27, 30)

Unnamed: 0,g_c,g_1b,g_2b,g_3b,g_ss,g_lf,g_cf,g_rf,g_dh,g_defense,g_all,n
c,84.2,8.0,0.0,0.1,0.0,0.4,0.0,0.2,7.6,91.5,104.2,37.0
1b,0.0,76.1,0.1,5.8,0.0,2.8,0.0,8.7,7.0,92.0,103.2,45.0
2b,0.0,2.8,62.7,9.7,6.6,4.5,1.2,0.8,0.8,86.0,92.4,59.0
3b,0.0,12.2,7.4,62.4,4.4,0.5,0.0,3.2,4.1,88.1,96.0,42.0
ss,0.0,0.4,12.5,10.6,89.0,0.8,2.8,0.2,0.8,114.2,118.7,47.0
lf,0.0,9.4,0.4,2.3,0.0,54.7,6.5,13.2,8.9,84.6,99.8,48.0
cf,0.0,2.6,0.9,0.1,0.1,18.8,58.8,11.5,1.7,90.1,96.2,58.0
rf,0.0,4.4,1.4,0.1,0.2,13.7,21.0,62.2,4.6,98.8,107.4,42.0
dh,1.3,22.2,0.0,5.2,0.0,12.7,0.0,1.1,34.0,40.2,80.0,10.0


In [11]:
# as an aside
# Wait, how do we get 1b->2b move of 5.9 games at age 24->27?  Did someone move from 1B to 2B?  Let's find out.

def temp_get_future_pt(gms: pd.DataFrame, pos: str, start_age: int = 28, comp_age: int = 33, gm_threshold: int = 100) -> pd.Series:
    max_year = 2019-(comp_age-start_age)

    prime_regulars = gms.query(f'baseball_age==@start_age and g_all>=@gm_threshold and primary==@pos and year_id<=@max_year')['player_id']
    print(gms.query('player_id in @prime_regulars and baseball_age==@comp_age')[[f'g_{pos}' for pos in posns]].sort_values('g_2b'))

temp_get_future_pt(yrly_games.query('year_id>=2000'), '1b', start_age=24, comp_age=27)

       g_c  g_1b  g_2b  g_3b  g_ss  g_lf  g_cf  g_rf   g_dh
25306    0   119     0     0     0     0     0     0   14.0
43268    0   157     0     0     0     0     0     0    5.0
43163    0   157     0     0     0     0     0     0    2.0
42704    0   120     0     0     0    14     0     0    0.0
42415    0    79     0     0     0     0     0     0    0.0
41792    0   105     0    16     0     0     0     0    0.0
38941    0     1     0     0     0     0     0     1    0.0
38940    0     1     0     0     0     0     0     1    0.0
38778    0   160     0     0     0     0     0     0    0.0
37779    0     7     0     0     0     0     0     0  150.0
36814    0   150     0     0     0     0     0     0    1.0
37666    0    36     0     0     0     0     0     0    0.0
35207    0   159     0     0     0     0     0     0    3.0
34228    0   151     0     0     0     0     0     0    0.0
34206    0   116     0     0     0     0     0     0    3.0
33929    0   156     0     0     0     0

In [12]:
yrly_games.loc[39642]

player_id        murphda08
year_id               2012
team_id                NYN
lg_id                   NL
g_all                  156
gs                   136.0
g_batting              156
g_defense            144.0
g_p                      0
g_c                      0
g_1b                    12
g_2b                   138
g_3b                     0
g_ss                     0
g_lf                     0
g_cf                     0
g_rf                     0
g_of                     0
g_dh                   0.0
g_ph                  18.0
g_pr                   0.0
birthdate       1985-04-01
baseball_age            27
primary                 2b
Name: 39642, dtype: object