In [1]:
# Create a gamelog_team table (e.g., one row per team-game, two per game)

import pandas as pd

import pyretro.boxball_loader as bbl
import pyretro.retrosheetgl as retrosheetgl # for parsing linescores

In [2]:
gl = pd.read_parquet(f'{bbl.BASE_DATA_DIR}/mine/gamelog_enhanced.parquet')
gl.shape

(228193, 164)

In [3]:
# Select the per-team columns
vis_cols  = list(filter(lambda c: c.find('vis',  0)==0, gl.columns.values))
home_cols = list(filter(lambda c: c.find('home', 0)==0, gl.columns.values))

vis_col_mapper = dict([(c, c.replace('visitor_', '').replace('visiting_', '')) for c in vis_cols ])
hm_col_mapper  = dict([(c, c.replace('home_', ''))                             for c in home_cols])
    
vis_col_mapper

{'visiting_team': 'team',
 'visiting_team_league': 'team_league',
 'visiting_team_game_number': 'team_game_number',
 'visitor_runs_scored': 'runs_scored',
 'visitor_line_score': 'line_score',
 'visitor_ab': 'ab',
 'visitor_h': 'h',
 'visitor_d': 'd',
 'visitor_t': 't',
 'visitor_hr': 'hr',
 'visitor_rbi': 'rbi',
 'visitor_sh': 'sh',
 'visitor_sf': 'sf',
 'visitor_hbp': 'hbp',
 'visitor_bb': 'bb',
 'visitor_ibb': 'ibb',
 'visitor_k': 'k',
 'visitor_sb': 'sb',
 'visitor_cs': 'cs',
 'visitor_gdp': 'gdp',
 'visitor_ci': 'ci',
 'visitor_lob': 'lob',
 'visitor_pitchers': 'pitchers',
 'visitor_er': 'er',
 'visitor_ter': 'ter',
 'visitor_wp': 'wp',
 'visitor_balks': 'balks',
 'visitor_po': 'po',
 'visitor_a': 'a',
 'visitor_e': 'e',
 'visitor_passed': 'passed',
 'visitor_db': 'db',
 'visitor_tp': 'tp',
 'visitor_manager_id': 'manager_id',
 'visitor_manager_name': 'manager_name',
 'visitor_starting_pitcher_id': 'starting_pitcher_id',
 'visitor_starting_pitcher_name': 'starting_pitcher_name',
 '

In [4]:
neutral_cols = gl[['game_id', 'date', 'double_header', 'yr', 'game_type', 'park_id']]

gl_hometeam = pd.concat([neutral_cols, gl[home_cols].rename(columns= hm_col_mapper)], axis=1)
gl_visteam  = pd.concat([neutral_cols, gl[vis_cols].rename(columns=vis_col_mapper)], axis=1)

#gl_hometeam = gl[home_cols].rename(columns= hm_col_mapper)
#gl_visteam  = gl[vis_cols].rename(columns=vis_col_mapper)

gl_hometeam['HA'] = 'H'
gl_visteam ['HA'] = 'A'

gl_hometeam['opp'] = gl_visteam ['team']
gl_visteam ['opp'] = gl_hometeam['team']

gl_hometeam['runs_allowed'] = gl_visteam ['runs_scored']
gl_visteam ['runs_allowed'] = gl_hometeam['runs_scored']

gl_hometeam['W'] = gl_hometeam['runs_scored'] > gl_visteam ['runs_scored']
gl_hometeam['L'] = gl_hometeam['runs_scored'] < gl_visteam ['runs_scored']
gl_visteam['L'] = gl_hometeam['runs_scored'] > gl_visteam ['runs_scored']
gl_visteam['W'] = gl_hometeam['runs_scored'] < gl_visteam ['runs_scored']

gl_teams = pd.concat([gl_hometeam, gl_visteam], axis=0).sort_values('date')
gl_teams

Unnamed: 0,game_id,date,double_header,yr,game_type,park_id,team,team_league,team_game_number,runs_scored,...,batting_8_name,batting_8_position,batting_9_player_id,batting_9_name,batting_9_position,HA,opp,runs_allowed,W,L
2119,FW1187105040,1871-05-04,0,1871,RS,FOR01,FW1,,1,2,...,James McDermott,8.0,kellb105,Bill Kelly,9.0,H,CL1,0,True,False
2119,FW1187105040,1871-05-04,0,1871,RS,FOR01,CL1,,1,0,...,Jim Carleton,3.0,bassj101,John Bass,6.0,A,FW1,2,False,True
2120,WS3187105050,1871-05-05,0,1871,RS,WAS01,BS1,,1,20,...,Fred Cone,7.0,spala101,Al Spalding,1.0,A,WS3,18,True,False
2120,WS3187105050,1871-05-05,0,1871,RS,WAS01,WS3,,1,18,...,Henry Burroughs,9.0,berth101,Henry Berthrong,8.0,H,BS1,20,False,True
2121,RC1187105060,1871-05-06,0,1871,RS,RCK01,RC1,,1,4,...,George Bird,7.0,stirg101,Gat Stires,9.0,H,CL1,12,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184592,PHI202211020,2022-11-02,0,2022,PS,PHI13,PHI,NL,4,0,...,Jean Segura,4.0,marsb002,Brandon Marsh,8.0,H,HOU,5,False,True
184593,PHI202211030,2022-11-03,0,2022,PS,PHI13,HOU,AL,5,3,...,Chas McCormick,8.0,maldm001,Martin Maldonado,2.0,A,PHI,2,True,False
184593,PHI202211030,2022-11-03,0,2022,PS,PHI13,PHI,NL,5,2,...,Jean Segura,4.0,marsb002,Brandon Marsh,8.0,H,HOU,3,False,True
184594,HOU202211050,2022-11-05,0,2022,PS,HOU03,HOU,AL,6,4,...,Chas McCormick,8.0,maldm001,Martin Maldonado,2.0,H,PHI,1,True,False


In [5]:
# Parse linescores
def parse_linescore_str(obj):
    if obj:
        return retrosheetgl.parse_linescore_str(obj)
    else:
        return None

In [6]:
linescores = gl_teams.line_score.apply(parse_linescore_str)
gl_teams['linescore_parsed'] = linescores
gl_teams

Unnamed: 0,game_id,date,double_header,yr,game_type,park_id,team,team_league,team_game_number,runs_scored,...,batting_8_position,batting_9_player_id,batting_9_name,batting_9_position,HA,opp,runs_allowed,W,L,linescore_parsed
2119,FW1187105040,1871-05-04,0,1871,RS,FOR01,FW1,,1,2,...,8.0,kellb105,Bill Kelly,9.0,H,CL1,0,True,False,"[0, 1, 0, 0, 1, 0, 0, 0, 0]"
2119,FW1187105040,1871-05-04,0,1871,RS,FOR01,CL1,,1,0,...,3.0,bassj101,John Bass,6.0,A,FW1,2,False,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0]"
2120,WS3187105050,1871-05-05,0,1871,RS,WAS01,BS1,,1,20,...,7.0,spala101,Al Spalding,1.0,A,WS3,18,True,False,"[1, 0, 7, 0, 0, 0, 4, 3, 5]"
2120,WS3187105050,1871-05-05,0,1871,RS,WAS01,WS3,,1,18,...,9.0,berth101,Henry Berthrong,8.0,H,BS1,20,False,True,"[6, 4, 0, 1, 1, 3, 0, 3, 0]"
2121,RC1187105060,1871-05-06,0,1871,RS,RCK01,RC1,,1,4,...,7.0,stirg101,Gat Stires,9.0,H,CL1,12,False,True,"[0, 1, 0, 0, 2, 0, 1, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184592,PHI202211020,2022-11-02,0,2022,PS,PHI13,PHI,NL,4,0,...,4.0,marsb002,Brandon Marsh,8.0,H,HOU,5,False,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0]"
184593,PHI202211030,2022-11-03,0,2022,PS,PHI13,HOU,AL,5,3,...,8.0,maldm001,Martin Maldonado,2.0,A,PHI,2,True,False,"[1, 0, 0, 1, 0, 0, 0, 1, 0]"
184593,PHI202211030,2022-11-03,0,2022,PS,PHI13,PHI,NL,5,2,...,4.0,marsb002,Brandon Marsh,8.0,H,HOU,3,False,True,"[1, 0, 0, 0, 0, 0, 0, 1, 0]"
184594,HOU202211050,2022-11-05,0,2022,PS,HOU03,HOU,AL,6,4,...,8.0,maldm001,Martin Maldonado,2.0,H,PHI,1,True,False,"[0, 0, 0, 0, 0, 4, 0, 0, None]"


In [7]:
gl_teams.to_parquet(f'{bbl.BASE_DATA_DIR}/mine/gl_teams.parquet')

In [8]:
gl_teams.columns

Index(['game_id', 'date', 'double_header', 'yr', 'game_type', 'park_id',
       'team', 'team_league', 'team_game_number', 'runs_scored', 'line_score',
       'ab', 'h', 'd', 't', 'hr', 'rbi', 'sh', 'sf', 'hbp', 'bb', 'ibb', 'k',
       'sb', 'cs', 'gdp', 'ci', 'lob', 'pitchers', 'er', 'ter', 'wp', 'balks',
       'po', 'a', 'e', 'passed', 'db', 'tp', 'manager_id', 'manager_name',
       'starting_pitcher_id', 'starting_pitcher_name', 'batting_1_player_id',
       'batting_1_name', 'batting_1_position', 'batting_2_player_id',
       'batting_2_name', 'batting_2_position', 'batting_3_player_id',
       'batting_3_name', 'batting_3_position', 'batting_4_player_id',
       'batting_4_name', 'batting_4_position', 'batting_5_player_id',
       'batting_5_name', 'batting_5_position', 'batting_6_player_id',
       'batting_6_name', 'batting_6_position', 'batting_7_player_id',
       'batting_7_name', 'batting_7_position', 'batting_8_player_id',
       'batting_8_name', 'batting_8_position', '

In [9]:
def get_starters(i, tmgms):
    fields=[f'batting_{i}_player_id', 'game_id', 'date', 'yr', 'game_type', 'team', 'HA', f'batting_{i}_position']
    starters = tmgms[fields].dropna(axis=0, how='any')
    starters = starters.rename(columns={f'batting_{i}_player_id': 'player_id', f'batting_{i}_position': 'pos'})
    starters['batting_pos']=i
    return starters

gl_starters = pd.concat([get_starters(i+1, gl_teams) for i in range(9)])
gl_starters


Unnamed: 0,player_id,game_id,date,yr,game_type,team,HA,pos,batting_pos
2119,selmf101,FW1187105040,1871-05-04,1871,RS,FW1,H,5.0,1
2119,whitd102,FW1187105040,1871-05-04,1871,RS,CL1,A,2.0,1
2120,wrigg101,WS3187105050,1871-05-05,1871,RS,BS1,A,6.0,1
2120,watef102,WS3187105050,1871-05-05,1871,RS,WS3,H,5.0,1
2121,mackd101,RC1187105060,1871-05-06,1871,RS,RC1,H,3.0,1
...,...,...,...,...,...,...,...,...,...
184592,marsb002,PHI202211020,2022-11-02,2022,PS,PHI,H,8.0,9
184593,maldm001,PHI202211030,2022-11-03,2022,PS,HOU,A,2.0,9
184593,marsb002,PHI202211030,2022-11-03,2022,PS,PHI,H,8.0,9
184594,maldm001,HOU202211050,2022-11-05,2022,PS,HOU,H,2.0,9


In [10]:
gl_starters.batting_pos.value_counts()

1    415782
2    415782
3    415782
4    415782
5    415782
6    415782
7    415782
8    415782
9    415779
Name: batting_pos, dtype: int64

In [11]:
gl_starters.to_parquet(f'{bbl.BASE_DATA_DIR}/mine/gl_starters.parquet')

In [12]:
gl.head()

Unnamed: 0,date,double_header,day_of_week,visiting_team,visiting_team_league,visiting_team_game_number,home_team,home_team_league,home_team_game_number,visitor_runs_scored,...,home_batting_8_name,home_batting_8_position,home_batting_9_player_id,home_batting_9_name,home_batting_9_position,additional_info,acquisition_info,game_id,yr,game_type
2119,1871-05-04,0,Thu,CL1,,1,FW1,,1,0,...,James McDermott,8.0,kellb105,Bill Kelly,9.0,,D,FW1187105040,1871,RS
2120,1871-05-05,0,Fri,BS1,,1,WS3,,1,20,...,Henry Burroughs,9.0,berth101,Henry Berthrong,8.0,HTBF,D,WS3187105050,1871,RS
2121,1871-05-06,0,Sat,CL1,,2,RC1,,1,12,...,George Bird,7.0,stirg101,Gat Stires,9.0,,D,RC1187105060,1871,RS
2122,1871-05-08,0,Mon,CL1,,3,CH1,,1,12,...,Ed Pinkham,5.0,zettg101,George Zettlein,1.0,,D,CH1187105080,1871,RS
2123,1871-05-09,0,Tue,BS1,,2,TRO,,1,9,...,Lip Pike,3.0,cravb101,Bill Craver,6.0,HTBF,D,TRO187105090,1871,RS
