In [1]:

import pandas as pd

In [2]:
sched = pd.read_parquet('../data/retrosheet/schedule.parquet')
sched.columns

Index(['date', 'double_header', 'day_of_week', 'visiting_team',
       'visiting_team_league', 'visiting_team_game_number', 'home_team',
       'home_team_league', 'home_team_game_number', 'day_night',
       'postponement_indicator', 'makeup_dates'],
      dtype='object')

In [3]:
sched['yr'] = sched['date'].dt.year

sched.sample(10)

Unnamed: 0,date,double_header,day_of_week,visiting_team,visiting_team_league,visiting_team_game_number,home_team,home_team_league,home_team_game_number,day_night,postponement_indicator,makeup_dates,yr
70528,1929-04-20,0,Sat,BOS,AL,5,WS1,AL,5,D,,,1929
102422,2019-09-21,0,Sat,ARI,NL,155,SDN,NL,155,n,,,2019
188422,1907-07-15,0,Mon,BOS,AL,82,SLA,AL,85,D,,,1907
9564,1979-05-13,0,Sun,MIL,AL,35,DET,AL,32,D,,,1979
177930,2001-04-12,0,Thu,ATL,NL,10,NYN,NL,9,N,,,2001
166779,2011-06-18,0,Sat,PHI,NL,72,SEA,AL,73,N,,,2011
27394,1967-04-29,0,Sat,PIT,NL,16,SLN,NL,17,N,Rain,19670712.0,1967
100097,1921-10-01,2,Sat,WS1,AL,153,BOS,AL,153,D,,,1921
93780,1955-07-17,0,Sun,WS1,AL,91,KC1,AL,90,D,,,1955
119441,1926-06-04,0,Fri,SLN,NL,51,PHI,NL,48,D,Rain,19260605.0,1926


In [4]:
sched = sched.sort_values(['yr', 'home_team', 'home_team_game_number'])[['yr', 'home_team', 'home_team_game_number', 'visiting_team']].reset_index(drop=True)
sched

Unnamed: 0,yr,home_team,home_team_game_number,visiting_team
0,1877,BSN,11,HAR
1,1877,BSN,12,HAR
2,1877,BSN,15,CHN
3,1877,BSN,16,CHN
4,1877,BSN,17,SL3
...,...,...,...,...
220901,2019,WAS,158,PHI
220902,2019,WAS,159,PHI
220903,2019,WAS,160,CLE
220904,2019,WAS,161,CLE


In [5]:
# first look for consecutive games against same visitor
poss_series_id = (sched['visiting_team'] != sched['visiting_team'].shift()).cumsum()
poss_series_id

0             1
1             1
2             2
3             2
4             3
          ...  
220901    72098
220902    72098
220903    72099
220904    72099
220905    72099
Name: visiting_team, Length: 220906, dtype: int64

In [6]:
# 
offset = sched.reset_index().index - sched['home_team_game_number']
offset

0            -11
1            -11
2            -13
3            -13
4            -13
           ...  
220901    220743
220902    220743
220903    220743
220904    220743
220905    220743
Length: 220906, dtype: int64

In [7]:
sched['series_id'] = sched.groupby([poss_series_id, offset]).ngroup()
sched

Unnamed: 0,yr,home_team,home_team_game_number,visiting_team,series_id
0,1877,BSN,11,HAR,0
1,1877,BSN,12,HAR,0
2,1877,BSN,15,CHN,1
3,1877,BSN,16,CHN,1
4,1877,BSN,17,SL3,2
...,...,...,...,...,...
220901,2019,WAS,158,PHI,72984
220902,2019,WAS,159,PHI,72984
220903,2019,WAS,160,CLE,72985
220904,2019,WAS,161,CLE,72985


In [8]:
series_lengths = sched.groupby(['yr', 'series_id', 'home_team', 'visiting_team'])['home_team_game_number'].count().rename('length').reset_index().set_index('series_id')
series_lengths

Unnamed: 0_level_0,yr,home_team,visiting_team,length
series_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1877,BSN,HAR,2
1,1877,BSN,CHN,2
2,1877,BSN,SL3,2
3,1877,BSN,CN1,2
4,1877,BSN,LS1,2
...,...,...,...,...
72981,2019,WAS,MIA,3
72982,2019,WAS,NYN,3
72983,2019,WAS,ATL,3
72984,2019,WAS,PHI,4


In [9]:
series_lengths.query('yr==2019')['length'].value_counts()

3    518
4    175
2     88
Name: length, dtype: int64

In [10]:
series_lengths.query('yr>=2000')['length'].value_counts()

3    11741
4     2824
2     1037
1        7
Name: length, dtype: int64

In [11]:
series_lengths.query('1980 <= yr <= 2000 and length==5')

Unnamed: 0_level_0,yr,home_team,visiting_team,length
series_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
43576,1980,NYN,MON,5
43656,1980,PIT,CHN,5
43699,1980,SEA,MIN,5
43763,1980,SLN,PHI,5
43806,1980,TOR,CLE,5
43973,1981,CHN,SLN,5
44033,1981,CLE,MIL,5
44417,1981,SFN,ATL,5
44453,1981,SLN,PHI,5
44748,1982,DET,TOR,5


In [12]:
series_lengths.query('yr >= 1970 and length==5')['yr'].value_counts()

1971    8
1983    7
1977    7
1975    6
1982    6
1976    5
1980    5
1981    4
1973    4
1972    4
1979    3
1978    3
1974    3
1985    2
1995    2
1986    1
1996    1
Name: yr, dtype: int64