In [1]:
import pandas as pd
import datetime as dt

In [2]:
# Load the complete game log, then keep just three columns: game_id and the ID of each starting pitcher.
# sps is short for 'starting pitchers'
gl = pd.read_parquet('../data/mine/gamelog_enhanced.parquet')
sps = gl.loc[:, ['game_id', 'visitor_starting_pitcher_id', 'home_starting_pitcher_id']]
sps

Unnamed: 0,game_id,visitor_starting_pitcher_id,home_starting_pitcher_id
73283,FW1187105040,prata101,mathb101
73284,WS3187105050,spala101,braia102
73285,RC1187105060,prata101,fishc102
73286,CH1187105080,prata101,zettg101
73287,TRO187105090,spala101,mcmuj101
...,...,...,...
20330,WAS201910250,greiz001,sanca004
20331,WAS201910260,urquj001,corbp001
20332,WAS201910270,coleg001,rossj002
20333,HOU201910290,stras001,verlj001


In [3]:
# Load the people table, keep only the Retrosheet ID and the three birthdate parts,
# and discard any person for whom one of those four values is missing.
ppl = (
    pd.read_parquet('../data/bd/people.parquet')
    .loc[:, ['retro_id', 'birth_year', 'birth_month', 'birth_day']]
    .dropna()
)

In [4]:
# Combine the birth year/month/day fields into one birth_date field
def mk_birthdate(row):
    """Build a single datetime from a row's separate birth-date parts.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'birth_year', 'birth_month' and 'birth_day'
        (for example a DataFrame row). Each value is passed through int(),
        so float-encoded values such as 1981.0 are accepted.

    Returns
    -------
    datetime.datetime
        Midnight on the given calendar day.

    Raises
    ------
    ValueError
        If the three parts do not form a valid calendar date.
    """
    year, month, day = (int(row[part]) for part in ('birth_year', 'birth_month', 'birth_day'))
    return dt.datetime(year, month, day)

# Run mk_birthdate once per row (axis='columns' hands each row to the function)
# and store the result as a new birth_date column, then show the table.
ppl['birth_date'] = ppl.apply(mk_birthdate, axis='columns')
ppl

Unnamed: 0,retro_id,birth_year,birth_month,birth_day,birth_date
0,aardd001,1981.0,12.0,27.0,1981-12-27
1,aaroh101,1934.0,2.0,5.0,1934-02-05
2,aarot101,1939.0,8.0,5.0,1939-08-05
3,aased001,1954.0,9.0,8.0,1954-09-08
4,abada001,1972.0,8.0,25.0,1972-08-25
...,...,...,...,...,...
19873,zupof101,1939.0,8.0,29.0,1939-08-29
19874,zuvep001,1958.0,10.0,31.0,1958-10-31
19875,zuveg101,1924.0,8.0,20.0,1924-08-20
19876,zwild101,1888.0,11.0,2.0,1888-11-02


In [5]:
# Attach each starter's birthdate to the starting-pitchers table.
# ppl is merged in twice: first keyed on the visiting starter's ID, then on the home starter's.
# After each merge the redundant retro_id key is dropped and birth_date is renamed for that side.
# No `how` is given, so these are inner joins: a game is kept only if both starters have a row in ppl.
# The resulting sps_bd table has both starters' IDs and birthdates
sps2 = (
    sps.merge(
        ppl[['retro_id', 'birth_date']],
        left_on='visitor_starting_pitcher_id',
        right_on='retro_id',
    )
    .drop(columns='retro_id')
    .rename(columns={'birth_date': 'visitor_starting_pitcher_birthdate'})
)
sps3 = (
    sps2.merge(
        ppl[['retro_id', 'birth_date']],
        left_on='home_starting_pitcher_id',
        right_on='retro_id',
    )
    .drop(columns='retro_id')
    .rename(columns={'birth_date': 'home_starting_pitcher_birthdate'})
)
sps_bd = sps3
sps_bd

Unnamed: 0,game_id,visitor_starting_pitcher_id,home_starting_pitcher_id,visitor_starting_pitcher_birthdate,home_starting_pitcher_birthdate
0,FW1187105040,prata101,mathb101,1847-11-19,1851-11-21
1,FW1187108110,prata101,mathb101,1847-11-19,1851-11-21
2,BL1187207090,prata101,mathb101,1847-11-19,1851-11-21
3,FW1187107120,spala101,mathb101,1850-09-02,1851-11-21
4,BL1187205020,spala101,mathb101,1850-09-02,1851-11-21
...,...,...,...,...,...
220208,SEA201906030,martc009,gearc001,1995-12-28,1986-04-14
220209,PHI201906230,yamaj001,deloe001,1996-05-11,1995-12-25
220210,LAN201909180,mckab001,sadlc002,1995-12-18,1990-07-13
220211,CHA201909260,civaa001,ruizj001,1995-06-12,1994-10-21


In [6]:
# This function computes the difference in ages
def sp_age_diff(row):
    """Return the absolute gap between the two starters' birthdates in one row.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'visitor_starting_pitcher_birthdate' and
        'home_starting_pitcher_birthdate' whose values can be subtracted.

    Returns
    -------
    The absolute value of (visitor birthdate - home birthdate), so the
    result does not depend on which starter is older.
    """
    visitor_born = row['visitor_starting_pitcher_birthdate']
    home_born = row['home_starting_pitcher_birthdate']
    return abs(visitor_born - home_born)

In [None]:
# Now compute the age difference for every game in the table.
# The two birthdate columns are subtracted directly and the absolute value is taken.
# This yields the same value per game as applying sp_age_diff row by row, but pandas
# does the arithmetic column-wise instead of calling a Python function once per game,
# which is much faster on a table covering every game in MLB history.
sps_bd['age_diff'] = (
    sps_bd['visitor_starting_pitcher_birthdate'] - sps_bd['home_starting_pitcher_birthdate']
).abs()

In [None]:
# And display the largest differences: sort ascending by age_diff and keep the
# final 30 rows, so the very largest gap appears at the bottom.
sps_bd.sort_values('age_diff').iloc[-30:]