# Best Starting Lineups of All Time

Inspired by https://www.reddit.com/r/baseball/comments/jzo3gi/was_the_springfield_nuclear_power_plant_company/

> Not including Homer (since his WAR is yet to be calculated by my knowledge), the average player on the softball team for the Springfield Nuclear Power Plant had a career bWAR of 63.3

    SS - Ozzie Smith (76.9)
    CF - Ken Griffey Jr. (83.8)
    3B - Wade Boggs (91.4)
    LF - Jose Canseco (42.4)
    RF - Darryl Strawberry (42.2)
    1B - Don Mattingly (42.4)
    C - Mike Scioscia (26.1)
    2B - Steve Sax (25.7)
    P - Roger Clemens (139.2)

> So I ask, was the Springfield Nuclear Power Plant company softball team the greatest team of all time?

So let's compute the total WAR of every lineup in MLB history.

In [21]:
import pandas as pd
import numpy as np

In [2]:
wars = pd.read_parquet('../data/mine/WAR_tot_seasonal.parquet')
career_wars = wars.groupby('bbref_id')['WAR_total'].sum()
career_wars.sort_values()

bbref_id
leveyji01     -7.5
flynndo01     -6.9
meyerda01     -6.5
harrivi01     -6.3
staintu01     -6.2
             ...  
cobbty01     151.2
mayswi01     156.3
bondsba01    162.8
johnswa01    164.5
ruthba01     182.4
Name: WAR_total, Length: 17750, dtype: float64

In [3]:
# Now merge in retro_id

ppl = pd.read_parquet('../data/bd/people.parquet')
ppl.columns

Index(['player_id', 'birth_year', 'birth_month', 'birth_day', 'birth_country',
       'birth_state', 'birth_city', 'death_year', 'death_month', 'death_day',
       'death_country', 'death_state', 'death_city', 'name_first', 'name_last',
       'name_given', 'weight', 'height', 'bats', 'throws', 'debut',
       'final_game', 'retro_id', 'bbref_id'],
      dtype='object')

In [4]:
war_lookup = pd.merge(left=ppl[['retro_id', 'bbref_id']], right=career_wars, on='bbref_id')
war_lookup

Unnamed: 0,retro_id,bbref_id,WAR_total
0,aardd001,aardsda01,1.9
1,aaroh101,aaronha01,143.2
2,aarot101,aaronto01,-2.8
3,aased001,aasedo01,15.1
4,abada001,abadan01,-0.3
...,...,...,...
17536,zupof101,zupofr01,-0.1
17537,zuvep001,zuvelpa01,-2.4
17538,zuveg101,zuverge01,6.9
17539,zwild101,zwilldu01,8.5


In [5]:
# Now get starting lineups
# This will get the batting orders. E.g., it won't include starting pitchers in DH games

gls = pd.read_parquet('../data/mine/gl_starters.parquet')
gls.columns

Index(['player_id', 'game_id', 'date', 'yr', 'game_type', 'team', 'HA', 'pos',
       'batting_pos'],
      dtype='object')

In [31]:
# Merge the lineups with the career WARs
gls_war = pd.merge(left=gls, right=war_lookup, left_on='player_id', right_on='retro_id')

In [35]:
# Then total up the lineups
#aggreg = {'WAR_total': sum, 'date': min, 'yr': min, 'game_type': min}
lineup_totals = gls_war.groupby(['game_id', 'team'])['WAR_total'].sum()
lineup_totals

game_id       team
ALS193307060  ALS     658.8
              NLS     333.2
ALS193507080  ALS     540.1
              NLS     444.4
ALS193707070  ALS     547.9
                      ...  
WS2197109280  WS2     145.8
WS2197109290  NYA     210.1
              WS2     122.9
WS2197109300  NYA     162.3
              WS2     152.4
Name: WAR_total, Length: 404920, dtype: float64

In [90]:
# Now merge the WAR_total with other data from the game_log_teams table (date, opponent, etc)

glt_cols = ['game_id', 'team', 'date', 'yr', 'game_type', 'HA', 'opp']
idx_flds = ['game_id', 'team']

glt = pd.read_parquet('../data/mine/gl_teams.parquet')[glt_cols]
lineups = pd.merge(left=glt, right=lineup_totals, left_on=idx_flds, right_on=idx_flds).sort_values('WAR_total')
lineups

Unnamed: 0,game_id,team,date,yr,game_type,HA,opp,WAR_total
38775,PTF191509182,BLF,1915-09-18,1915,RS,A,PTF,-11.5
118657,PHA194806131,SLA,1948-06-13,1948,RS,A,PHA,-10.4
125435,DET195104230,SLA,1951-04-23,1951,RS,A,DET,-10.3
230884,CHA198209272,SEA,1982-09-27,1982,RS,A,CHA,-10.0
39104,NEW191510032,BLF,1915-10-03,1915,RS,A,NEW,-9.6
...,...,...,...,...,...,...,...,...
246072,NLS198607150,ALS,1986-07-15,1986,ASG,A,NLS,703.6
189777,NLS197207250,NLS,1972-07-25,1972,ASG,H,ALS,716.0
164566,ALS196507130,NLS,1965-07-13,1965,ASG,A,ALS,723.7
326763,NLS200407130,NLS,2004-07-13,2004,ASG,H,ALS,731.5


In [91]:
# top regular season lineups ever

lineups[lineups['game_type']=='RS'].sort_values(by='WAR_total', ascending=False).head(20)

Unnamed: 0,game_id,team,date,yr,game_type,HA,opp,WAR_total
69019,SLA192806070,PHA,1928-06-07,1928,RS,A,SLA,641.5
80845,PHA193304270,NYA,1933-04-27,1933,RS,A,PHA,636.2
81744,DET193306280,NYA,1933-06-28,1933,RS,A,DET,634.1
80914,DET193305020,NYA,1933-05-02,1933,RS,A,DET,634.1
82092,NYA193307220,NYA,1933-07-22,1933,RS,H,CLE,634.1
80051,NYA193208191,NYA,1932-08-19,1932,RS,H,DET,633.7
79898,SLA193208071,NYA,1932-08-07,1932,RS,A,SLA,633.7
81301,WS1193305311,NYA,1933-05-31,1933,RS,A,WS1,633.7
80590,PHA193209210,NYA,1932-09-21,1932,RS,A,PHA,633.7
82178,WS1193307270,NYA,1933-07-27,1933,RS,A,WS1,633.7


In [98]:
# whoa, the Yankees ran out that 633.7 lineup a lot!  It's blowing up the list
# Can we aggregate by the set of starting players?

# Let's start by just aggregating by WAR total (e.g., assume that there aren't
#  distinct lineups with the same rounded WAR total.  Of course there are, but
#  maybe not among the top ~100??)

lineups.groupby(['team', 'WAR_total'])['game_id'].count().reset_index().sort_values(by='WAR_total').tail(50)

Unnamed: 0,team,WAR_total,game_id
75,ALS,601.0,1
91545,NLS,601.9,1
91546,NLS,602.6,1
101781,NYA,603.6,21
101782,NYA,604.0,5
91547,NLS,604.7,1
76,ALS,605.1,1
113751,PHA,605.1,1
77,ALS,605.7,1
101783,NYA,606.1,1


In [30]:
# Now compute matchup totals (e.g., sum of two lineups)

matchups = lineups.groupby(['game_id']).agg(aggreg).sort_values(by='WAR_total', ascending=False)
matchups

Unnamed: 0_level_0,WAR_total,date,yr,game_type
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NLS199807070,1368.6,1998-07-07,1998,ASG
NLS197207250,1321.7,1972-07-25,1972,ASG
NLS198607150,1297.9,1986-07-15,1986,ASG
NLS200407130,1277.7,2004-07-13,2004,ASG
NLS195707090,1255.3,1957-07-09,1957,ASG
...,...,...,...,...
BS1187406240,-0.1,1874-06-24,1874,RS
BS1187406260,-0.1,1874-06-26,1874,RS
BS1187406270,-0.1,1874-06-27,1874,RS
HR1187406250,-0.1,1874-06-25,1874,RS


In [25]:
# Top regular-season lineups
lineups[lineups['game_type']=='RS'].head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,WAR_total,date,game_type
game_id,team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SLA192806070,PHA,641.5,1928-06-07,RS
PHA193304270,NYA,636.2,1933-04-27,RS
DET193306280,NYA,634.1,1933-06-28,RS
NYA193307220,NYA,634.1,1933-07-22,RS
DET193305020,NYA,634.1,1933-05-02,RS
NYA193308072,NYA,633.7,1933-08-07,RS
WS1193307270,NYA,633.7,1933-07-27,RS
CHA193306181,NYA,633.7,1933-06-18,RS
DET193206070,NYA,633.7,1932-06-07,RS
CHA193308190,NYA,633.7,1933-08-19,RS


In [26]:
# Top lineup for each team
lineups.reset_index().sort_values(by='WAR_total', ascending=False).groupby('team').first().sort_values(by='WAR_total', ascending=False)

Unnamed: 0_level_0,game_id,WAR_total,date,game_type
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ALS,NLS193407100,750.0,1934-07-10,ASG
NLS,NLS200407130,731.5,2004-07-13,ASG
PHA,SLA192806070,641.5,1928-06-07,RS
NYA,PHA193304270,636.2,1933-04-27,RS
WS1,WS1192705301,545.6,1927-05-30,RS
CIN,LAN197709150,534.0,1977-09-15,RS
MLN,CIN196405190,532.6,1964-05-19,RS
PHI,CIN198305100,521.7,1983-05-10,RS
BOS,BOS191509200,514.6,1915-09-20,RS
HOU,HOU200408030,505.5,2004-08-03,RS


In [9]:
lineups[lineups['game_type']=='RS'].groupby('game_id')['sum'].sum().sort_values(ascending=False).head(20)

game_id
PHA193308121    1045.9
PHA192805280    1029.2
PHA192805260    1006.4
NYA192704120    1000.1
PHA192804110     997.9
PHA192704200     997.4
PHA192806270     986.1
PHA192705301     982.6
NYA193308030     977.6
NYA193206260     977.5
PHA192705311     974.0
PHA193209210     967.6
PHA192805251     966.4
WS1192604200     966.2
SFN196404140     964.2
NYA193308072     963.7
WS1193307270     963.7
NYA193204200     962.1
NYA192806210     961.8
SFN196408161     959.8
Name: sum, dtype: float64