## Which rotations have gotten the most starts from their top-5 starters?

In [1]:
import pandas as pd

# Find all player-seasons with game starts, ranked among their team.
# Only the four columns needed for counting starts are kept from the pitching table.
gs = pd.read_parquet("file:../data/pitching.parquet")[['year_id', 'team_id', 'player_id', 'gs']]

# Keep rows with at least one start from 1947 onward. The explicit .copy() makes `gs`
# an independent frame, so adding a column below does not assign onto a filtered slice
# (which raises SettingWithCopyWarning).
gs = gs[(gs['gs'] > 0) & (gs['year_id'] >= 1947)].copy()

# Rank within each (year_id, team_id) group by descending starts: 1 = most starts.
# A stable sort keeps tied rows in their original order, so the rank given to pitchers
# with equal `gs` follows a defined rule. The assignment aligns on the row index, so
# sorting here does not reorder `gs` itself.
gs['rank_on_team'] = (
    gs.sort_values('gs', ascending=False, kind='stable')
      .groupby(['year_id', 'team_id'])
      .cumcount() + 1
)
gs

Unnamed: 0,year_id,team_id,player_id,gs,rank_on_team
12617,1947,NY1,ayersbi01,4,11
12618,1947,PIT,bagbyji02,6,9
12619,1947,PIT,bahred01,11,7
12621,1947,BRO,bantaja01,1,10
12622,1947,BRO,barnere02,9,7
...,...,...,...,...,...
47617,2019,TBA,yarbrry01,14,5
47620,2019,BAL,ynoaga01,13,6
47622,2019,ARI,youngal01,15,5
47624,2019,TOR,zeuchtj01,3,13


In [2]:
# Aggregate the GS for each team-season, in total and by their top 5.
# The 'gs' column is selected *before* summing so that the other columns
# (player_id strings, rank_on_team) are not aggregated only to be thrown away.
total_gs = gs.groupby(['year_id', 'team_id'])['gs'].sum()
top5_gs = gs[gs['rank_on_team'] <= 5].groupby(['year_id', 'team_id'])['gs'].sum()

# Both Series are named 'gs', so the overlapping column gets a suffix on each side;
# naming the suffixes directly yields the gs_total / gs_top5 columns.
# The merge keys (and their order) are left exactly as before.
teams = pd.merge(total_gs, top5_gs, on=['team_id', 'year_id'], suffixes=('_total', '_top5'))

# Compute the pct of games by top5, filter to 98%+, and sort
teams['top5pct'] = teams['gs_top5'] / teams['gs_total']
# Strictly greater than 0.98; highest share first.
teams = teams[teams['top5pct'] > .98].sort_values(by='top5pct', ascending=False)
teams

Unnamed: 0_level_0,Unnamed: 1_level_0,gs_total,gs_top5,top5pct
team_id,year_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LAN,1966,162,162,1.0
SEA,2003,162,162,1.0
CIN,2012,162,161,0.993827
LAN,1994,114,113,0.991228
SLN,2005,162,160,0.987654
LAN,1993,162,160,0.987654
SFN,2012,162,160,0.987654
ATL,1980,161,159,0.987578
CHA,1972,154,152,0.987013
BAL,1972,154,152,0.987013


In [3]:
# Look up the names of the top-5 pitchers for each team

# Only the id and last-name columns are needed for the lookup.
people = pd.read_parquet("file:../data/people.parquet")[['player_id', 'name_last']]

def lookup_player_name(player_id):
    """Return the `name_last` of the first row in `people` matching `player_id`.

    Raises IndexError when no row in `people` has that player_id.
    """
    matching_last_names = people.loc[people['player_id'] == player_id, 'name_last']
    return matching_last_names.values[0]

lookup_player_name('wainwad01')

'Wainwright'

In [4]:
def top5_names(year_id, team_id):
    """Return the last names of a team-season's top starters, best rank first.

    Selects the rows of `gs` for the given year and team whose `rank_on_team` is
    5 or lower, orders them by that rank, and maps each player_id through
    `lookup_player_name`. The list has at most five entries.
    """
    in_team_season = (gs['year_id'] == year_id) & (gs['team_id'] == team_id)
    in_top5 = gs['rank_on_team'] <= 5
    ranked = gs[in_team_season & in_top5].sort_values('rank_on_team')

    names = []
    for pitcher_id in ranked['player_id'].values:
        names.append(lookup_player_name(pitcher_id))
    return names

top5_names(2019, 'SLN')

['Flaherty', 'Mikolas', 'Hudson', 'Wainwright', 'Wacha']

In [5]:
# Add the names
def top5_names_from_row(row):
    """Adapt one entry of the `teams` index to a `top5_names` call.

    `row` is read positionally: element 0 is passed as the team and element 1 as
    the year, matching the (team_id, year_id) level order shown for `teams`.
    """
    team_id, year_id = row[0], row[1]
    return top5_names(year_id, team_id)

# One list of names per team-season, computed from each index entry.
teams['top5_names'] = teams.index.map(top5_names_from_row)
teams

Unnamed: 0_level_0,Unnamed: 1_level_0,gs_total,gs_top5,top5pct,top5_names
team_id,year_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LAN,1966,162,162,1.0,"[Koufax, Drysdale, Osteen, Sutton, Moeller]"
SEA,2003,162,162,1.0,"[Garcia, Moyer, Meche, Franklin, Pineiro]"
CIN,2012,162,161,0.993827,"[Bailey, Cueto, Latos, Arroyo, Leake]"
LAN,1994,114,113,0.991228,"[Martinez, Gross, Astacio, Candiotti, Hershiser]"
SLN,2005,162,160,0.987654,"[Carpenter, Marquis, Suppan, Mulder, Morris]"
LAN,1993,162,160,0.987654,"[Hershiser, Candiotti, Gross, Martinez, Astacio]"
SFN,2012,162,160,0.987654,"[Lincecum, Bumgarner, Cain, Zito, Vogelsong]"
ATL,1980,161,159,0.987578,"[Niekro, Alexander, Matula, McWilliams, Boggs]"
CHA,1972,154,152,0.987013,"[Wood, Bahnsen, Bradley, Lemonds, Fisher]"
BAL,1972,154,152,0.987013,"[McNally, Palmer, Dobson, Cuellar, Alexander]"
