In [2]:
# Split up the monolithic event.parquet into individual seasons
import sys
from pathlib import Path
from typing import Sequence

sys.path.append('../')

import pandas as pd
import pyretro.boxball_loader as bbl

In [3]:
# Load the monolithic event table that this notebook splits up.
event_path = f'{bbl.RETROSHEET_DIR}/event.parquet'
df = pd.read_parquet(event_path)
df.shape

(14929271, 160)

In [4]:
# Game-level attributes (date, game type) restricted to the games that
# actually appear in the event table.
event_gms = df['game_id'].unique()
gm_cols = ['game_id', 'date', 'game_type']
gms = pd.read_parquet(
    f'{bbl.BASE_DATA_DIR}/mine/gamelog_enhanced.parquet',
    columns=gm_cols,  # read only the columns we keep instead of the whole gamelog
)[gm_cols].query('game_id in @event_gms')
gms.shape


(188294, 3)

In [5]:
# Attach each game's date and game type to its events.
# validate='many_to_one' makes pandas raise if `gms` ever holds more than one
# row per game_id, which would otherwise silently duplicate event rows.
ev = pd.merge(left=df, right=gms, on='game_id', how='inner', validate='many_to_one')
ev.shape


(14929271, 162)

In [6]:
# Run the project's event-data fixups (see bbl.fixup_event_data), then tag
# every event with the calendar year of its game date; the year is what the
# table is split on further down.
# NOTE(review): this cell rebinds `ev` to its own output, so re-running it
# feeds already-fixed data back into fixup_event_data — confirm that is safe.
ev = bbl.fixup_event_data(ev)
ev['yr'] = ev['date'].dt.year

In [7]:
# Distinct years present in the event data, in ascending order.
yrs = pd.unique(ev['yr'].sort_values())
yrs

array([1903, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1914,
       1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925,
       1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936,
       1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947,
       1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958,
       1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969,
       1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980,
       1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
       2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022])

In [8]:
# Write one parquet file per year.
# A single groupby pass partitions the frame once, rather than re-scanning
# every event row for each year. Groups are yielded in ascending year order.
out_dir = Path(bbl.RETROSHEET_DIR) / 'event_yearly'
# Create the target directory up front so the first write cannot fail on a
# fresh checkout.
out_dir.mkdir(parents=True, exist_ok=True)
for yr, ev_yr in ev.groupby('yr', sort=True):
    print(yr)
    ev_yr.to_parquet(out_dir / f'event_{yr}.parquet')

1903
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022


In [9]:
def load_event_data(seasons: bbl.Seasons, requested_columns: Sequence[str], pa_only=True, game_types=bbl.GameType.RS) -> pd.DataFrame:
    """Load event rows for several seasons from the per-year parquet files.

    Args:
        seasons: Iterated to obtain the season years; each year is passed to
            ``load_event_data_season``.
        requested_columns: Columns wanted in addition to the always-loaded
            required ones. Any sequence of names (list, tuple, ...) works and
            duplicates are ignored.
        pa_only: When true, keep only rows whose ``bat_event_fl`` is truthy.
        game_types: Forwarded unchanged to ``bbl.filter_on_game_types``.

    Returns:
        The concatenated per-season frames after the optional ``pa_only``
        filter, as returned by ``bbl.filter_on_game_types``.

    Raises:
        ValueError: From ``pd.concat`` if ``seasons`` yields no years.
    """
    required_cols = ['game_id', 'bat_event_fl', 'h_fl', 'event_cd', 'ab_fl', 'game_type']
    # dict.fromkeys de-duplicates while keeping first-seen order, so the loaded
    # column order is deterministic (required columns first). Unpacking, rather
    # than `list + sequence`, lets callers pass tuples or other sequences.
    columns = list(dict.fromkeys([*required_cols, *requested_columns]))

    # TODO: no caching yet; an earlier draft sketched a parquet cache keyed on
    # (seasons, sorted columns) but it was never enabled.
    ev = pd.concat([load_event_data_season(yr, columns) for yr in seasons])

    if pa_only:
        ev = ev[ev['bat_event_fl']]
    return bbl.filter_on_game_types(ev, game_types)

In [10]:
def load_event_data_season(season: int, columns: Sequence[str]) -> pd.DataFrame:
    """Read the requested columns of one season's event parquet file.

    Prints the season and the shape of the loaded frame as progress output.

    Args:
        season: Year used to build the file name
            ``event_yearly/event_{season}.parquet`` under ``bbl.RETROSHEET_DIR``.
        columns: Names of the columns to return, in the order wanted.

    Returns:
        A frame holding exactly ``columns`` from that season's file.
    """
    print(season)
    # Normalise to a list: indexing a frame with a tuple would be treated as a
    # single (multi-level) key rather than a selection of columns.
    wanted = list(columns)
    path = f'{bbl.RETROSHEET_DIR}/event_yearly/event_{season}.parquet'
    # Let the parquet reader fetch only the needed columns instead of loading
    # the whole file and slicing afterwards. An empty request falls back to
    # columns=None (read everything) so the trailing selection still yields a
    # zero-column frame with the file's rows.
    to_read = list(dict.fromkeys(wanted)) or None
    ev_yr = pd.read_parquet(path, columns=to_read)[wanted]
    print(ev_yr.shape)
    return ev_yr

In [11]:
# Smoke-test the per-season loader on one year with a minimal column set.
required_cols = [
    'game_id',
    'bat_event_fl',
    'h_fl',
    'event_cd',
    'ab_fl',
]
load_event_data_season(season=2019, columns=required_cols)

2019
(194946, 5)


Unnamed: 0,game_id,bat_event_fl,h_fl,event_cd,ab_fl
5395,ALS201907090,True,0,2,True
5396,ALS201907090,True,0,3,True
5397,ALS201907090,True,0,3,True
5398,ALS201907090,True,1,20,True
5399,ALS201907090,True,0,2,True
...,...,...,...,...,...
11422399,WAS201909290,True,0,3,True
11422400,WAS201909290,True,0,2,True
11422401,WAS201909290,True,0,14,False
11422402,WAS201909290,True,0,2,True


In [12]:
# Smoke-test the multi-season loader with no extra columns requested.
load_event_data(
    seasons=bbl.Seasons(2018, 2022),
    requested_columns=[],
)

2018
(193809, 6)
2019
(194946, 6)
2020
(72711, 6)
2021
(190130, 6)
2022
(190496, 6)


Unnamed: 0,h_fl,game_id,game_type,event_cd,bat_event_fl,ab_fl
7804312,0,ANA201804020,RS,2,True,True
7804313,0,ANA201804020,RS,2,True,True
7804314,0,ANA201804020,RS,2,True,True
7804315,0,ANA201804020,RS,2,True,True
7804316,0,ANA201804020,RS,3,True,True
...,...,...,...,...,...,...
8765068,0,WAS202210020,RS,3,True,True
8765069,0,WAS202210020,RS,14,True,False
8765070,1,WAS202210020,RS,21,True,True
8765071,0,WAS202210020,RS,2,True,True
