# Age Distributions by Position

In [1]:
import pandas as pd
import plotly.express as px
import datetime

import boxball_loader as bbl


In [2]:
apps = bbl.load_appearances()
apps.columns

Index(['year_id', 'team_id', 'lg_id', 'player_id', 'g_all', 'gs', 'g_batting',
       'g_defense', 'g_p', 'g_c', 'g_1b', 'g_2b', 'g_3b', 'g_ss', 'g_lf',
       'g_cf', 'g_rf', 'g_of', 'g_dh', 'g_ph', 'g_pr'],
      dtype='object')

In [3]:
birthdates = bbl.load_birthdates()
birthdates.sample(10)


player_id
burdabo01    1938-07-16
reyesfr01    1995-07-07
thomaer01    1948-08-13
whitele01    1957-09-10
deshaji01    1960-06-23
carloci01    1940-09-17
burgobi01    1919-11-15
graboal01    1901-09-04
jimence01    1984-11-12
brenzbi01    1910-03-03
Name: birthdate, dtype: object

In [4]:

def compute_age(b):
    return (datetime.datetime(b['year_id'],7,1).date()-b['birthdate']).days/365.24
    
df = pd.merge(left=apps.query('year_id >= 1980').set_index('player_id'), right=birthdates, on='player_id').reset_index()
df['age'] = df.apply(compute_age, axis=1)
df['age_int'] = df['age'].astype(int)

yrly_games = df
yrly_games.sample(10)


Unnamed: 0,player_id,year_id,team_id,lg_id,g_all,gs,g_batting,g_defense,g_p,g_c,...,g_lf,g_cf,g_rf,g_of,g_dh,g_ph,g_pr,birthdate,age,age_int
5222,valenfe01,1997,SLN,NL,5,5.0,4,5.0,5,0,...,0,0,0,0,0.0,0.0,0.0,1960-11-01,36.663564,36
14976,halech01,1994,MIN,AL,67,25.0,67,32.0,0,0,...,0,0,1,1,10.0,34.0,1.0,1964-12-02,29.577812,29
49457,deverra01,2019,BOS,AL,156,152.0,156,152.0,0,0,...,0,0,0,0,1.0,4.0,0.0,1996-10-24,22.683715,22
22251,gilesbr02,2008,SDN,NL,147,142.0,147,144.0,0,0,...,0,2,144,144,0.0,5.0,0.0,1971-01-20,37.44661,37
49967,tauchmi01,2018,COL,NL,21,6.0,21,9.0,0,0,...,3,5,1,9,0.0,12.0,0.0,1990-12-03,27.576388,27
33004,madsory01,2005,PHI,NL,78,0.0,75,78.0,78,0,...,0,0,0,0,0.0,0.0,0.0,1980-08-28,24.8412,24
30408,kennejo04,2004,COL,NL,27,27.0,25,27.0,27,0,...,0,0,0,0,0.0,0.0,0.0,1979-05-24,25.106779,25
230,bairdo01,1989,PIT,NL,44,0.0,44,44.0,44,0,...,0,0,0,0,0.0,0.0,0.0,1949-08-22,39.858723,39
16435,martiti02,1996,NYA,AL,155,152.0,155,151.0,0,0,...,0,0,0,0,3.0,3.0,0.0,1967-12-07,28.567517,28
48181,solissa01,2018,WAS,NL,56,0.0,54,56.0,56,0,...,0,0,0,0,0.0,0.0,0.0,1988-08-10,29.889935,29


In [5]:
posns = ['c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf', 'dh']
pos_cols = [f'g_{pos}' for pos in posns]
pos_all_cols =  [f'g_{pos}' for pos in (posns+['defense', 'all'])]

yrly_games['primary'] = yrly_games[pos_cols].idxmax(axis=1).str[2:]
yrly_games.sample(10)


Unnamed: 0,player_id,year_id,team_id,lg_id,g_all,gs,g_batting,g_defense,g_p,g_c,...,g_cf,g_rf,g_of,g_dh,g_ph,g_pr,birthdate,age,age_int,primary
29420,parrijo01,2007,BAL,AL,45,0.0,5,45.0,45,0,...,0,0,0,0.0,0.0,0.0,1977-11-26,29.594239,29,c
9945,tartada01,1984,SEA,AL,10,5.0,10,9.0,0,0,...,0,0,0,0.0,1.0,2.0,1962-10-30,21.670682,21,ss
41483,bogusbr01,2010,HOU,NL,19,5.0,19,11.0,0,0,...,3,2,11,0.0,11.0,0.0,1984-02-18,26.366225,26,lf
43099,forsylo01,2012,SDN,NL,91,76.0,91,86.0,0,0,...,0,0,0,2.0,8.0,2.0,1987-01-14,25.462709,25,2b
42894,collmjo01,2016,ATL,NL,3,3.0,3,3.0,3,0,...,0,0,0,0.0,0.0,0.0,1986-02-07,30.396452,30,c
13739,carpecr01,1993,FLO,NL,29,0.0,29,29.0,29,0,...,0,0,0,0.0,0.0,0.0,1965-04-05,28.238966,28,c
30758,puntoni01,2013,LAN,NL,116,71.0,116,105.0,0,0,...,0,0,0,0.0,20.0,3.0,1977-11-08,35.645055,35,ss
36761,lestejo01,2016,CHN,NL,33,32.0,30,32.0,32,0,...,0,0,0,0.0,1.0,0.0,1984-01-07,32.482751,32,c
28181,molinjo01,2012,TBA,AL,102,80.0,102,102.0,0,102,...,0,0,0,0.0,1.0,0.0,1975-06-03,37.079728,37,c
13436,velarra01,1994,NYA,AL,77,69.0,77,77.0,0,0,...,0,1,7,0.0,0.0,2.0,1962-11-24,31.601139,31,ss


In [6]:
# Let's look at playing time by age

pt_by_age = yrly_games.query('2010<=year_id<2020').groupby('age_int')[pos_cols].sum()/10/162
position_colors = {'g_c': 'black', 'g_1b': 'darkkhaki', 'g_2b': 'khaki', 'g_3b': 'brown', 'g_ss': 'tan', 'g_lf': 'green', 'g_cf': 'green', 'g_rf': 'green', 'g_dh': 'grey'}
#{'c': 'black', 'cf': 'green', 'P': 'white', '1B': 'darkkhaki', '2B': 'khaki', '3B': 'brown', 'SS': 'tan'}

px.line(pt_by_age,color_discrete_map=position_colors)

In [7]:
# Let's look at 28yo second basemen.  What were they doing 5 years later?

def get_future_pt(gms: pd.DataFrame, pos: str, start_age: int = 28, comp_age: int = 33, gm_threshold: int = 100) -> pd.Series:
    max_year = 2019-(comp_age-start_age)

    prime_regulars = gms.query(f'age_int==@start_age and g_all>=@gm_threshold and primary==@pos and year_id<=@max_year')['player_id']
    avg_gms = gms.query('player_id in @prime_regulars and age_int==@comp_age')[pos_all_cols].sum()/len(prime_regulars)
    avg_gms['n'] = len(prime_regulars)
    return avg_gms.rename(pos)

get_future_pt(yrly_games, '2b')

g_c           0.000000
g_1b          2.655556
g_2b         49.255556
g_3b          4.033333
g_ss          4.944444
g_lf          5.444444
g_cf          0.433333
g_rf          2.233333
g_dh          3.177778
g_defense    66.644444
g_all        75.122222
n            90.000000
Name: 2b, dtype: float64

In [8]:
future = pd.concat([get_future_pt(yrly_games, pos) for pos in posns], axis=1).T
future

Unnamed: 0,g_c,g_1b,g_2b,g_3b,g_ss,g_lf,g_cf,g_rf,g_dh,g_defense,g_all,n
c,63.611111,3.125,0.027778,0.069444,0.0,0.041667,0.0,0.25,1.208333,66.611111,70.875,72.0
1b,0.0,61.847059,0.094118,3.188235,0.0,3.035294,0.0,1.447059,6.894118,68.576471,79.505882,85.0
2b,0.0,2.655556,49.255556,4.033333,4.944444,5.444444,0.433333,2.233333,3.177778,66.644444,75.122222,90.0
3b,0.031579,11.178947,3.031579,50.063158,1.926316,2.736842,0.136842,0.715789,4.263158,68.905263,79.294737,95.0
ss,0.0,1.313953,10.755814,8.290698,40.5,2.05814,2.534884,1.813953,1.546512,66.174419,70.55814,86.0
lf,0.0,3.040816,0.438776,2.704082,0.214286,30.520408,5.193878,7.602041,7.418367,48.632653,61.142857,98.0
cf,0.0,0.825243,0.786408,0.834951,0.592233,16.271845,35.116505,19.0,3.009709,70.398058,79.572816,103.0
rf,0.0,4.297297,1.099099,0.324324,0.0,9.324324,7.054054,38.243243,6.144144,58.747748,70.306306,111.0
dh,0.0,8.461538,0.0,0.0,0.0,1.769231,0.0,4.923077,35.0,14.384615,54.230769,13.0


In [9]:
# try it again for 2000-current
def get_pt_matrix(start_age=24, comp_age=27):
    future = pd.concat([get_future_pt(yrly_games.query('year_id>=2000'), pos, start_age=start_age, comp_age=comp_age) for pos in posns], axis=1).T
    return future.apply(lambda x: round(x, 1))

get_pt_matrix(24, 27)

Unnamed: 0,g_c,g_1b,g_2b,g_3b,g_ss,g_lf,g_cf,g_rf,g_dh,g_defense,g_all,n
c,105.0,1.3,0.0,0.0,0.0,0.0,0.0,0.0,2.6,105.4,110.8,20.0
1b,0.0,101.1,5.9,0.7,0.0,1.5,0.0,0.1,7.7,108.2,120.4,25.0
2b,0.0,4.0,72.8,11.8,6.7,5.1,4.5,0.6,1.2,103.0,109.9,34.0
3b,0.0,12.0,2.9,70.9,0.6,5.4,1.4,5.5,5.2,96.2,105.2,36.0
ss,0.0,0.0,12.0,4.9,100.5,0.5,0.2,0.2,1.2,117.5,120.6,34.0
lf,0.0,0.1,3.4,0.0,0.4,52.0,9.8,26.9,6.2,90.6,100.4,25.0
cf,0.0,0.5,0.7,0.1,0.3,12.4,85.0,15.2,1.2,112.8,118.9,42.0
rf,0.0,4.3,0.0,0.0,0.0,22.6,30.0,50.5,0.9,103.3,107.8,26.0
dh,0.0,22.5,0.0,0.0,0.0,4.5,0.0,10.5,56.0,37.5,102.5,2.0


In [10]:
get_pt_matrix(27, 30)

Unnamed: 0,g_c,g_1b,g_2b,g_3b,g_ss,g_lf,g_cf,g_rf,g_dh,g_defense,g_all,n
c,84.2,8.0,0.0,0.1,0.0,0.4,0.0,0.2,7.6,91.5,104.2,37.0
1b,0.0,76.1,0.1,5.8,0.0,2.8,0.0,8.7,7.0,92.0,103.2,45.0
2b,0.0,2.8,62.7,9.7,6.6,4.5,1.2,0.8,0.8,86.0,92.4,59.0
3b,0.0,12.2,7.4,62.4,4.4,0.5,0.0,3.2,4.1,88.1,96.0,42.0
ss,0.0,0.4,12.5,10.6,89.0,0.8,2.8,0.2,0.8,114.2,118.7,47.0
lf,0.0,9.4,0.4,2.3,0.0,54.7,6.5,13.2,8.9,84.6,99.8,48.0
cf,0.0,2.6,0.9,0.1,0.1,18.8,58.8,11.5,1.7,90.1,96.2,58.0
rf,0.0,4.5,1.5,0.1,0.2,14.0,17.7,63.7,4.7,97.4,106.1,41.0
dh,1.3,22.2,0.0,5.2,0.0,12.7,0.0,1.1,34.0,40.2,80.0,10.0


In [11]:
# Wait, how do we get 1b->2b move of 5.9 games?  Did someone move from 1B to 2B?  Let's find out.

def temp_get_future_pt(gms: pd.DataFrame, pos: str, start_age: int = 28, comp_age: int = 33, gm_threshold: int = 100) -> pd.Series:
    max_year = 2019-(comp_age-start_age)

    prime_regulars = gms.query(f'age_int==@start_age and g_all>=@gm_threshold and primary==@pos and year_id<=@max_year')['player_id']
    print(gms.query('player_id in @prime_regulars and age_int==@comp_age')[[f'g_{pos}' for pos in posns]].sort_values('g_2b'))

temp_get_future_pt(yrly_games.query('year_id>=2000'), '1b', start_age=24, comp_age=27)

       g_c  g_1b  g_2b  g_3b  g_ss  g_lf  g_cf  g_rf   g_dh
25306    0   119     0     0     0     0     0     0   14.0
43268    0   157     0     0     0     0     0     0    5.0
43163    0   157     0     0     0     0     0     0    2.0
42704    0   120     0     0     0    14     0     0    0.0
42415    0    79     0     0     0     0     0     0    0.0
41792    0   105     0    16     0     0     0     0    0.0
38941    0     1     0     0     0     0     0     1    0.0
38940    0     1     0     0     0     0     0     1    0.0
38778    0   160     0     0     0     0     0     0    0.0
37779    0     7     0     0     0     0     0     0  150.0
36814    0   150     0     0     0     0     0     0    1.0
37666    0    36     0     0     0     0     0     0    0.0
35207    0   159     0     0     0     0     0     0    3.0
34228    0   151     0     0     0     0     0     0    0.0
34206    0   116     0     0     0     0     0     0    3.0
33929    0   156     0     0     0     0

In [12]:
yrly_games.loc[39642]

player_id     murphda08
year_id            2012
team_id             NYN
lg_id                NL
g_all               156
gs                136.0
g_batting           156
g_defense         144.0
g_p                   0
g_c                   0
g_1b                 12
g_2b                138
g_3b                  0
g_ss                  0
g_lf                  0
g_cf                  0
g_rf                  0
g_of                  0
g_dh                0.0
g_ph               18.0
g_pr                0.0
birthdate    1985-04-01
age           27.250575
age_int              27
primary              2b
Name: 39642, dtype: object