In [1]:
import sys
import os

sys.path.append(os.path.dirname(os.getcwd())) 

import pandas as pd
from datetime import datetime, timedelta

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
from features.load_team_stats_from_db import load_team_stats_from_db
from features.load_games_from_db import load_games_from_db
from features.load_player_stats_from_db import load_player_stats_from_db

In [3]:
df_g = load_games_from_db()
df_ts = load_team_stats_from_db()
df_ps = load_player_stats_from_db()

2025-07-03 21:55:04,916 [INFO] Successully loaded 10641 games from database


There are 10641 games for  modeling...


2025-07-03 21:55:05,150 [INFO] Successully loaded 25560 rows from database


There are 25560 team-games for  modeling...


2025-07-03 21:55:07,872 [INFO] Successully loaded 933365 rows from database


There are 933365 players for  modeling...


In [4]:
df_ps.head(3)

Unnamed: 0,game_pk,team_id,team_side,player_id,player_name,at_bats,runs_scored,hits,home_runs,rbis,walks_batting,strikeouts_batting,left_on_base,stolen_bases,innings_pitched,hits_allowed,runs_allowed,earned_runs,strikeouts_pitching,walks_pitching,pitches_thrown,putouts,assists,errors
0,634642,147,home,476595,Lucas Luetge,,,,,,,,,,,,,,,,,,,
1,634642,147,home,518934,DJ LeMahieu,4.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,,,,,,,,4.0,3.0,0.0
2,634642,147,home,650633,Michael King,,,,,,,,,,,,,,,,,,,


In [5]:
df_ps.isna().sum()

game_pk                     0
team_id                     0
team_side                   0
player_id                   0
player_name                 0
at_bats                633560
runs_scored            633560
hits                   633560
home_runs              633560
rbis                   633560
walks_batting          633560
strikeouts_batting     633560
left_on_base           633560
stolen_bases           633560
innings_pitched        817144
hits_allowed           817144
runs_allowed           817144
earned_runs            817144
strikeouts_pitching    817144
walks_pitching         817144
pitches_thrown         817144
putouts                529765
assists                529765
errors                 529765
dtype: int64

In [6]:
df_g.query("game_id==634642")

Unnamed: 0,game_id,game_date,game_date_time,home_team_id,away_team_id,home_team,away_team,home_score,away_score,state,venue,game_type
0,634642,2021-04-01,2021-04-01 17:05:00+00:00,147,141,New York Yankees,Toronto Blue Jays,2,3,Final,Yankee Stadium,R


In [7]:
df_ps.query("player_id==476595").head()

Unnamed: 0,game_pk,team_id,team_side,player_id,player_name,at_bats,runs_scored,hits,home_runs,rbis,walks_batting,strikeouts_batting,left_on_base,stolen_bases,innings_pitched,hits_allowed,runs_allowed,earned_runs,strikeouts_pitching,walks_pitching,pitches_thrown,putouts,assists,errors
0,634642,147,home,476595,Lucas Luetge,,,,,,,,,,,,,,,,,,,
1088,634644,147,home,476595,Lucas Luetge,,,,,,,,,,1.0,1.0,1.0,1.0,1.0,0.0,15.0,0.0,0.0,0.0
1917,634607,147,home,476595,Lucas Luetge,,,,,,,,,,,,,,,,,,,
2698,634587,147,home,476595,Lucas Luetge,,,,,,,,,,,,,,,,,,,
3424,634599,147,home,476595,Lucas Luetge,,,,,,,,,,1.0,3.0,2.0,2.0,1.0,0.0,25.0,0.0,0.0,0.0


In [8]:
# Left-join each player row to its game's id and date columns; df_ps uses
# 'game_pk' as the game key while df_g uses 'game_id'.
df_ps1 = pd.merge(
    df_ps,
    df_g[['game_id', 'game_date_time', 'game_date']],
    how='left',
    left_on='game_pk',
    right_on='game_id',
)

# Retain only rows whose game_date_time is populated after the join, i.e. rows
# whose game was found in the games dataset.
df_ps2 = df_ps1.loc[df_ps1['game_date_time'].notna()].copy()

In [9]:
df_ps2['season'] = df_ps2['game_date_time'].dt.year

In [10]:
d2 = df_ps2[['season','game_pk','team_id','player_id','player_name','innings_pitched']].copy()

In [11]:
d2.head()

Unnamed: 0,season,game_pk,team_id,player_id,player_name,innings_pitched
0,2021,634642,147,476595,Lucas Luetge,
1,2021,634642,147,518934,DJ LeMahieu,
2,2021,634642,147,650633,Michael King,
3,2021,634642,147,593334,Domingo Germán,
4,2021,634642,147,643565,Mike Tauchman,


In [12]:
d2.groupby('season')['player_name'].size()

season
2021    123307
2022    126278
2023    124935
2024    125803
2025     57914
Name: player_name, dtype: int64

In [13]:
d2.query("player_id==476595")['season'].value_counts(dropna=False)

season
2021    157
2022    155
2023     34
Name: count, dtype: int64

In [14]:
d2[(d2['player_id']==476595) & (d2['season']==2021)]['innings_pitched'].value_counts(dropna=False)

innings_pitched
None    101
1.0      21
1.1       9
2.0       8
0.2       6
0.1       5
1.2       4
3.0       2
4.0       1
Name: count, dtype: int64

- The above suggests that available players will be listed even if they don't play in a particular game.
- Cross-referenced Lucas Luetge on baseball-reference and found that he played 57 games in the 2021 season
- The previous cell shows that out of 157 game rows in which this player appears, 101 had `None` for `innings_pitched`. This leaves 157-101 = 56 games, which is just one off from baseball-reference.

In [15]:
d3 = df_ps2[['season','game_date','game_pk','team_id','player_id','player_name','at_bats']].copy()
d3[d3['player_name'].str.contains('Bryce Harper')].head()

Unnamed: 0,season,game_date,game_pk,team_id,player_id,player_name,at_bats
211,2021,2021-04-01,634622,143,547180,Bryce Harper,4.0
1351,2021,2021-04-03,634580,143,547180,Bryce Harper,3.0
1868,2021,2021-04-04,634617,143,547180,Bryce Harper,3.0
2858,2021,2021-04-05,634606,143,547180,Bryce Harper,1.0
3582,2021,2021-04-06,634608,143,547180,Bryce Harper,4.0


In [16]:
d_bh = d3[d3['player_name'].str.contains('Bryce Harper')].copy()

In [17]:
d_bh['season'].value_counts(dropna=False)

season
2024    149
2021    146
2023    128
2022    108
2025     60
Name: count, dtype: int64

In [18]:
# According to Baseball-Reference
# Bryce Harper missed 5 games due to injury May 28th to June 2nd
d_bh.query('season==2021')['at_bats'].value_counts(dropna=False)

at_bats
4.0    57
3.0    46
5.0    14
2.0    13
NaN    10
1.0     5
6.0     1
Name: count, dtype: int64

In [19]:
d_bh[(d_bh['season']==2021) & (d_bh['at_bats'].isna())]

Unnamed: 0,season,game_date,game_pk,team_id,player_id,player_name,at_bats
10990,2021,2021-04-17,634470,143,547180,Bryce Harper,
19250,2021,2021-04-29,634352,143,547180,Bryce Harper,
19640,2021,2021-04-30,634336,143,547180,Bryce Harper,
20724,2021,2021-05-01,634276,143,547180,Bryce Harper,
21909,2021,2021-05-03,634244,143,547180,Bryce Harper,
22741,2021,2021-05-04,634324,143,547180,Bryce Harper,
23575,2021,2021-05-05,634294,143,547180,Bryce Harper,
24035,2021,2021-05-06,634283,143,547180,Bryce Harper,
52991,2021,2021-06-16,633677,143,547180,Bryce Harper,
59952,2021,2021-06-27,633504,143,547180,Bryce Harper,


In [20]:
d_bh[(d_bh['season']==2021) & (d_bh['game_date'] <= pd.Timestamp('2021-07-01').date())]

Unnamed: 0,season,game_date,game_pk,team_id,player_id,player_name,at_bats
211,2021,2021-04-01,634622,143,547180,Bryce Harper,4.0
1351,2021,2021-04-03,634580,143,547180,Bryce Harper,3.0
1868,2021,2021-04-04,634617,143,547180,Bryce Harper,3.0
2858,2021,2021-04-05,634606,143,547180,Bryce Harper,1.0
3582,2021,2021-04-06,634608,143,547180,Bryce Harper,4.0
4410,2021,2021-04-07,634626,143,547180,Bryce Harper,4.0
5473,2021,2021-04-09,634534,143,547180,Bryce Harper,2.0
6250,2021,2021-04-10,634524,143,547180,Bryce Harper,4.0
7101,2021,2021-04-11,632207,143,547180,Bryce Harper,4.0
8096,2021,2021-04-13,632189,143,547180,Bryce Harper,3.0


In [21]:
type(pd.to_datetime(d_bh['game_date'].iloc[0]))

pandas._libs.tslibs.timestamps.Timestamp

In [22]:
df_ts.head()

Unnamed: 0,game_pk,team_side,team_id,runs_batting,hits_batting,strikeouts_batting,baseonballs_batting,avg,obp,slg,pitchesthrown,balls_pitching,strikes_pitching,strikeouts_pitching,baseonballs_pitching,hits_pitching,earnedruns,homeruns_pitching,runs_pitching,era,whip,groundouts_pitching,airouts_pitching,total,putouts,assists,errors,doubleplays,tripleplays,rangefactor,caughtstealing,passedball,innings
0,634642,home,147,2,6,13,6,0.171,0.293,0.257,159,55,104,13,2,8,2,1,3,1.8,1.0,4,11,0,30,7,0,0,0,0.0,1,0,0.0
1,634642,away,141,3,8,13,2,0.222,0.263,0.333,181,78,103,13,6,6,2,1,2,1.8,1.2,10,6,0,30,10,0,0,0,0.0,0,0,0.0
2,634645,home,116,3,5,14,5,0.167,0.286,0.3,154,60,94,5,6,6,2,1,2,2.0,1.33,11,10,0,27,12,0,0,0,0.0,0,0,0.0
3,634645,away,114,2,6,5,6,0.188,0.316,0.313,125,48,77,14,5,5,3,1,3,3.38,1.25,5,6,0,24,6,1,0,0,0.0,0,0,0.0
4,634638,home,158,6,11,9,2,0.275,0.341,0.3,189,74,115,17,7,10,5,1,5,4.5,1.7,7,6,0,30,6,0,0,0,0.0,0,0,0.0


In [23]:
from typing import Literal
# Note: the lists below were generated by Google Gemini; verify them against official rosters before relying on them.
# 2020
batters_2020 = {
    "Baltimore Orioles": ["Renato Núñez", "Rio Ruiz", "José Iglesias"],
    "Boston Red Sox": ["Rafael Devers", "Xander Bogaerts", "Alex Verdugo"],
    "New York Yankees": ["DJ LeMahieu", "Luke Voit", "Aaron Judge"],
    "Tampa Bay Rays": ["Brandon Lowe", "Joey Wendle"],
    "Toronto Blue Jays": ["Teoscar Hernández", "Lourdes Gurriel Jr.", "Cavan Biggio"],

    "Chicago White Sox": ["José Abreu", "Tim Anderson", "Eloy Jiménez"],
    "Cleveland Guardians": ["José Ramírez", "Cesar Hernandez", "Franmil Reyes"],
    "Detroit Tigers": ["Jeimer Candelario", "Jonathan Schoop"],
    "Kansas City Royals": ["Salvador Perez", "Whit Merrifield"],
    "Minnesota Twins": ["Nelson Cruz", "Josh Donaldson", "Max Kepler"],

    "Houston Astros": ["George Springer", "Jose Altuve", "Michael Brantley"],
    "Los Angeles Angels": ["Mike Trout", "Anthony Rendon", "David Fletcher"],
    "Oakland Athletics": ["Matt Olson", "Mark Canha", "Ramón Laureano"],
    "Seattle Mariners": ["Kyle Lewis", "Kyle Seager"],
    "Texas Rangers": ["Joey Gallo", "Isiah Kiner-Falefa"],

    "Atlanta Braves": ["Freddie Freeman", "Marcell Ozuna", "Ronald Acuña Jr."],
    "Miami Marlins": ["Brian Anderson", "Garrett Cooper"],
    "New York Mets": ["Michael Conforto", "Dominic Smith", "Brandon Nimmo"],
    "Philadelphia Phillies": ["Bryce Harper", "J.T. Realmuto", "Didi Gregorius"],
    "Washington Nationals": ["Juan Soto", "Trea Turner", "Josh Bell"],

    "Chicago Cubs": ["Ian Happ", "Willson Contreras", "Anthony Rizzo"],
    "Cincinnati Reds": ["Nick Castellanos", "Jesse Winker", "Mike Moustakas"],
    "Milwaukee Brewers": ["Christian Yelich", "Jed Lowrie"], # Limited impactful bats in 2020
    "Pittsburgh Pirates": ["Bryan Reynolds", "Colin Moran"],
    "St. Louis Cardinals": ["Paul Goldschmidt", "Brad Miller", "Yadier Molina"],

    "Arizona Diamondbacks": ["Ketel Marte", "Eduardo Escobar", "Christian Walker"],
    "Colorado Rockies": ["Trevor Story", "Charlie Blackmon", "C.J. Cron"],
    "Los Angeles Dodgers": ["Mookie Betts", "Corey Seager", "Justin Turner"],
    "San Diego Padres": ["Fernando Tatis Jr.", "Manny Machado", "Wil Myers"],
    "San Francisco Giants": ["Mike Yastrzemski", "Brandon Belt", "Donovan Solano"]
}

pitchers_2020 = {
    "Baltimore Orioles": ["John Means", "Tanner Scott"],
    "Boston Red Sox": ["Nathan Eovaldi", "Martin Perez"],
    "New York Yankees": ["Gerrit Cole", "Chad Green", "Zack Britton"],
    "Tampa Bay Rays": ["Tyler Glasnow", "Blake Snell", "Nick Anderson"],
    "Toronto Blue Jays": ["Hyun Jin Ryu", "Taijuan Walker", "Anthony Bass"],

    "Chicago White Sox": ["Lucas Giolito", "Dallas Keuchel", "Liam Hendriks"],
    "Cleveland Guardians": ["Shane Bieber", "Carlos Carrasco", "James Karinchak"],
    "Detroit Tigers": ["Spencer Turnbull", "Matthew Boyd"],
    "Kansas City Royals": ["Brad Keller", "Danny Duffy", "Greg Holland"],
    "Minnesota Twins": ["Kenta Maeda", "Tyler Duffey", "Taylor Rogers"],

    "Houston Astros": ["Zack Greinke", "Framber Valdez", "Lance McCullers Jr."],
    "Los Angeles Angels": ["Dylan Bundy", "Griffin Canning", "Raisel Iglesias"], # Iglesias was acquired mid-season
    "Oakland Athletics": ["Chris Bassitt", "Jesús Luzardo", "Liam Hendriks"], # Hendriks was traded mid-season
    "Seattle Mariners": ["Marco Gonzales", "Justus Sheffield"],
    "Texas Rangers": ["Lance Lynn", "Kyle Cody"],

    "Atlanta Braves": ["Max Fried", "Ian Anderson", "Darrell Ceciliani"], # Ceciliani had very limited appearances, but was on the roster.
    "Miami Marlins": ["Sandy Alcántara", "Pablo López", "Sixto Sánchez"],
    "New York Mets": ["Jacob deGrom", "Seth Lugo", "Edwin Díaz"],
    "Philadelphia Phillies": ["Zack Wheeler", "Aaron Nola"],
    "Washington Nationals": ["Max Scherzer", "Patrick Corbin", "Daniel Hudson"],

    "Chicago Cubs": ["Yu Darvish", "Kyle Hendricks", "Craig Kimbrel"],
    "Cincinnati Reds": ["Trevor Bauer", "Sonny Gray", "Raisel Iglesias"],
    "Milwaukee Brewers": ["Corbin Burnes", "Brandon Woodruff", "Devin Williams"],
    "Pittsburgh Pirates": ["Joe Musgrove", "Richard Rodriguez"],
    "St. Louis Cardinals": ["Kwang Hyun Kim", "Jack Flaherty", "Giovanny Gallegos"],

    "Arizona Diamondbacks": ["Zac Gallen", "Luke Weaver", "Stefan Crichton"],
    "Colorado Rockies": ["Germán Márquez", "Antonio Senzatela", "Daniel Bard"],
    "Los Angeles Dodgers": ["Clayton Kershaw", "Walker Buehler", "Dustin May"],
    "San Diego Padres": ["Dinelson Lamet", "Zach Davies", "Drew Pomeranz"],
    "San Francisco Giants": ["Kevin Gausman", "Johnny Cueto", "Sammy Long"] # Long had very limited appearances, but was on the roster
}
# 2021
batters_2021 = {
    "Baltimore Orioles": ["Cedric Mullins", "Ryan Mountcastle"],
    "Boston Red Sox": ["Rafael Devers", "Xander Bogaerts"],
    "New York Yankees": ["Aaron Judge", "Giancarlo Stanton"],
    "Tampa Bay Rays": ["Brandon Lowe"],
    "Toronto Blue Jays": ["Vladimir Guerrero Jr.", "Marcus Semien"],

    "Chicago White Sox": ["José Abreu", "Tim Anderson"],
    "Cleveland Guardians": ["José Ramírez"],
    "Detroit Tigers": ["Robbie Grossman", "Jeimer Candelario"],
    "Kansas City Royals": ["Salvador Perez", "Whit Merrifield"],
    "Minnesota Twins": ["Byron Buxton", "Josh Donaldson", "Jorge Polanco"],

    "Houston Astros": ["Carlos Correa", "Jose Altuve", "Yordan Alvarez"],
    "Los Angeles Angels": ["Shohei Ohtani", "Mike Trout", "Jared Walsh"], # Ohtani is both, listed here for his bat impact
    "Oakland Athletics": ["Matt Olson"],
    "Seattle Mariners": ["Ty France", "Mitch Haniger"],
    "Texas Rangers": ["Adolis García", "Joey Gallo"],

    "Atlanta Braves": ["Freddie Freeman", "Ronald Acuña Jr.", "Austin Riley"],
    "Miami Marlins": ["Jazz Chisholm Jr."],
    "New York Mets": ["Pete Alonso", "Francisco Lindor"],
    "Philadelphia Phillies": ["Bryce Harper", "J.T. Realmuto"],
    "Washington Nationals": ["Juan Soto", "Josh Bell"],

    "Chicago Cubs": ["Willson Contreras", "Javier Báez"], # Javier Báez (before trade)
    "Cincinnati Reds": ["Joey Votto", "Nick Castellanos", "Jesse Winker"],
    "Milwaukee Brewers": [], # Top players were primarily pitchers
    "Pittsburgh Pirates": ["Bryan Reynolds", "Ke'Bryan Hayes", "Adam Frazier"], # Adam Frazier (before trade)
    "St. Louis Cardinals": ["Paul Goldschmidt", "Nolan Arenado"],

    "Arizona Diamondbacks": ["Ketel Marte", "Eduardo Escobar", "David Peralta"], # Eduardo Escobar (before trade)
    "Colorado Rockies": ["C.J. Cron", "Trevor Story"],
    "Los Angeles Dodgers": ["Mookie Betts", "Trea Turner"], # Trea Turner (after trade)
    "San Diego Padres": ["Fernando Tatis Jr.", "Manny Machado"],
    "San Francisco Giants": ["Buster Posey", "Brandon Crawford"]
}

pitchers_2021 = {
    "Baltimore Orioles": ["John Means"],
    "Boston Red Sox": ["Nathan Eovaldi"],
    "New York Yankees": ["Gerrit Cole"],
    "Tampa Bay Rays": ["Shane McClanahan", "Drew Rasmussen", "Tyler Glasnow"], # Glasnow was impactful when healthy
    "Toronto Blue Jays": ["Robbie Ray"],

    "Chicago White Sox": ["Lance Lynn", "Carlos Rodón", "Liam Hendriks"],
    "Cleveland Guardians": ["Shane Bieber", "Emmanuel Clase", "Cal Quantrill"],
    "Detroit Tigers": ["Casey Mize", "Tarik Skubal", "Gregory Soto"],
    "Kansas City Royals": [],
    "Minnesota Twins": [],

    "Houston Astros": ["Lance McCullers Jr.", "Luis Garcia", "Zack Greinke"],
    "Los Angeles Angels": ["Shohei Ohtani", "Raisel Iglesias"], # Ohtani is both, listed here for his pitching impact
    "Oakland Athletics": ["Sean Manaea", "Chris Bassitt", "Frankie Montas"],
    "Seattle Mariners": ["Yusei Kikuchi", "Logan Gilbert", "Paul Sewald"],
    "Texas Rangers": ["Kyle Gibson"],

    "Atlanta Braves": ["Max Fried", "Charlie Morton", "Ian Anderson"],
    "Miami Marlins": ["Sandy Alcántara", "Trevor Rogers"],
    "New York Mets": ["Jacob deGrom", "Taijuan Walker", "Edwin Díaz"],
    "Philadelphia Phillies": ["Zack Wheeler", "Aaron Nola"],
    "Washington Nationals": ["Max Scherzer"], # Max Scherzer (before trade)

    "Chicago Cubs": ["Kyle Hendricks"],
    "Cincinnati Reds": ["Wade Miley", "Luis Castillo", "Tyler Mahle"],
    "Milwaukee Brewers": ["Corbin Burnes", "Brandon Woodruff", "Josh Hader"],
    "Pittsburgh Pirates": [],
    "St. Louis Cardinals": ["Adam Wainwright", "Alex Reyes", "Jack Flaherty"],

    "Arizona Diamondbacks": ["Merrill Kelly", "Zac Gallen"],
    "Colorado Rockies": ["Germán Márquez"],
    "Los Angeles Dodgers": ["Walker Buehler", "Julio Urías", "Max Scherzer"], # Max Scherzer (after trade)
    "San Diego Padres": ["Joe Musgrove", "Blake Snell", "Mark Melancon"],
    "San Francisco Giants": ["Kevin Gausman", "Logan Webb", "Anthony DeSclafani"]
}

# 2022
# Corrected for 2022 season (approximately 3 players total per team)
batters_2022 = {
    "Baltimore Orioles": ["Adley Rutschman", "Anthony Santander", "Cedric Mullins"],
    "Boston Red Sox": ["Rafael Devers", "Xander Bogaerts"],
    "New York Yankees": ["Aaron Judge", "Anthony Rizzo"],
    "Tampa Bay Rays": ["Yandy Díaz", "Randy Arozarena"],
    "Toronto Blue Jays": ["Vladimir Guerrero Jr.", "Alejandro Kirk", "Bo Bichette"],

    "Chicago White Sox": ["José Abreu", "Andrew Vaughn", "Luis Robert Jr."],
    "Cleveland Guardians": ["José Ramírez", "Steven Kwan", "Andrés Giménez"],
    "Detroit Tigers": ["Javier Báez", "Miguel Cabrera"],
    "Kansas City Royals": ["Salvador Perez", "Bobby Witt Jr."],
    "Minnesota Twins": ["Luis Arraez", "Carlos Correa", "Byron Buxton"],

    "Houston Astros": ["Yordan Alvarez", "Jose Altuve", "Kyle Tucker"],
    "Los Angeles Angels": ["Shohei Ohtani", "Mike Trout", "Taylor Ward"], # Ohtani listed here for his bat
    "Oakland Athletics": ["Sean Murphy", "Seth Brown"],
    "Seattle Mariners": ["Julio Rodríguez", "Ty France", "Eugenio Suárez"],
    "Texas Rangers": ["Corey Seager", "Nathaniel Lowe", "Marcus Semien"],

    "Atlanta Braves": ["Austin Riley", "Dansby Swanson", "Matt Olson"],
    "Miami Marlins": ["Miguel Rojas", "Garrett Cooper"],
    "New York Mets": ["Pete Alonso", "Jeff McNeil", "Francisco Lindor"],
    "Philadelphia Phillies": ["Bryce Harper", "Kyle Schwarber", "J.T. Realmuto"],
    "Washington Nationals": ["Juan Soto", "Josh Bell"], # Both traded mid-season but were the key bats early

    "Chicago Cubs": ["Ian Happ", "Willson Contreras"],
    "Cincinnati Reds": ["Kyle Farmer", "Jonathan India", "Brandon Drury"],
    "Milwaukee Brewers": ["Rowdy Tellez", "Willy Adames", "Christian Yelich"],
    "Pittsburgh Pirates": ["Bryan Reynolds", "Ke'Bryan Hayes"],
    "St. Louis Cardinals": ["Paul Goldschmidt", "Nolan Arenado", "Tommy Edman"],

    "Arizona Diamondbacks": ["Christian Walker", "Ketel Marte", "Daulton Varsho"],
    "Colorado Rockies": ["C.J. Cron", "Brendan Rodgers", "Ryan McMahon"],
    "Los Angeles Dodgers": ["Freddie Freeman", "Mookie Betts", "Trea Turner"],
    "San Diego Padres": ["Manny Machado", "Jake Cronenworth", "Juan Soto"], # Soto (after trade)
    "San Francisco Giants": ["Mike Yastrzemski", "J.D. Davis"]
}

pitchers_2022 = {
    "Baltimore Orioles": ["Dean Kremer", "Félix Bautista"],
    "Boston Red Sox": ["Nathan Eovaldi", "Michael Wacha", "Garrett Whitlock"],
    "New York Yankees": ["Gerrit Cole", "Nestor Cortes", "Clay Holmes"],
    "Tampa Bay Rays": ["Shane McClanahan", "Drew Rasmussen", "Jeffrey Springs"],
    "Toronto Blue Jays": ["Alek Manoah", "Kevin Gausman", "Jordan Romano"],

    "Chicago White Sox": ["Dylan Cease", "Lance Lynn", "Liam Hendriks"],
    "Cleveland Guardians": ["Shane Bieber", "Triston McKenzie", "Emmanuel Clase"],
    "Detroit Tigers": ["Tarik Skubal", "Gregory Soto"],
    "Kansas City Royals": ["Brady Singer", "Scott Barlow"],
    "Minnesota Twins": ["Joe Ryan", "Sonny Gray", "Jhoan Duran"],

    "Houston Astros": ["Justin Verlander", "Framber Valdez", "Cristian Javier"],
    "Los Angeles Angels": ["Shohei Ohtani", "Patrick Sandoval"], # Ohtani listed here for his arm
    "Oakland Athletics": ["Paul Blackburn", "Cole Irvin", "Domingo Acevedo"],
    "Seattle Mariners": ["Luis Castillo", "Logan Gilbert", "George Kirby"],
    "Texas Rangers": ["Martín Pérez", "Jon Gray", "Brock Burke"],

    "Atlanta Braves": ["Kyle Wright", "Max Fried", "Spencer Strider"],
    "Miami Marlins": ["Sandy Alcántara", "Pablo López", "Trevor Rogers"],
    "New York Mets": ["Max Scherzer", "Jacob deGrom", "Edwin Díaz"],
    "Philadelphia Phillies": ["Zack Wheeler", "Aaron Nola", "Seranthony Domínguez"],
    "Washington Nationals": ["Josiah Gray", "Patrick Corbin"],

    "Chicago Cubs": ["Justin Steele", "Marcus Stroman"],
    "Cincinnati Reds": ["Hunter Greene", "Nick Lodolo", "Alexis Díaz"],
    "Milwaukee Brewers": ["Corbin Burnes", "Brandon Woodruff", "Devin Williams"],
    "Pittsburgh Pirates": ["Mitch Keller", "David Bednar"],
    "St. Louis Cardinals": ["Miles Mikolas", "Adam Wainwright", "Ryan Helsley"],

    "Arizona Diamondbacks": ["Zac Gallen", "Merrill Kelly"],
    "Colorado Rockies": ["Germán Márquez", "Kyle Freeland", "Daniel Bard"],
    "Los Angeles Dodgers": ["Julio Urías", "Tony Gonsolin", "Evan Phillips"],
    "San Diego Padres": ["Joe Musgrove", "Yu Darvish", "Josh Hader"],
    "San Francisco Giants": ["Logan Webb", "Carlos Rodón", "Alex Wood"]
}

# 2023
# Corrected for 2023 season (approximately 3 players total per team)
batters_2023 = {
    "Baltimore Orioles": ["Adley Rutschman", "Gunnar Henderson"],
    "Boston Red Sox": ["Rafael Devers", "Masataka Yoshida"],
    "New York Yankees": ["Aaron Judge", "Gleyber Torres"],
    "Tampa Bay Rays": ["Yandy Díaz", "Randy Arozarena"],
    "Toronto Blue Jays": ["Vladimir Guerrero Jr.", "Bo Bichette"],

    "Chicago White Sox": ["Luis Robert Jr.", "Andrew Vaughn"],
    "Cleveland Guardians": ["José Ramírez", "Steven Kwan"],
    "Detroit Tigers": ["Spencer Torkelson", "Riley Greene"],
    "Kansas City Royals": ["Bobby Witt Jr.", "Salvador Perez"],
    "Minnesota Twins": ["Carlos Correa", "Byron Buxton"], # Luis Arraez was traded mid-season but impactful before

    "Houston Astros": ["Yordan Alvarez", "Jose Altuve", "Kyle Tucker"],
    "Los Angeles Angels": ["Shohei Ohtani", "Mike Trout"], # Ohtani for his bat
    "Oakland Athletics": ["Brent Rooker", "Zack Gelof"],
    "Seattle Mariners": ["Julio Rodríguez", "Cal Raleigh"],
    "Texas Rangers": ["Corey Seager", "Marcus Semien", "Adolis García"],

    "Atlanta Braves": ["Ronald Acuña Jr.", "Matt Olson", "Austin Riley"],
    "Miami Marlins": ["Luis Arraez", "Jorge Soler"], # Arraez (after trade)
    "New York Mets": ["Pete Alonso", "Francisco Lindor"],
    "Philadelphia Phillies": ["Kyle Schwarber", "Bryce Harper", "Trea Turner"],
    "Washington Nationals": ["Joey Meneses", "CJ Abrams"],

    "Chicago Cubs": ["Cody Bellinger", "Dansby Swanson"],
    "Cincinnati Reds": ["Spencer Steer", "Elly De La Cruz", "Matt McLain"],
    "Milwaukee Brewers": ["Christian Yelich", "William Contreras"],
    "Pittsburgh Pirates": ["Bryan Reynolds", "Ke'Bryan Hayes"],
    "St. Louis Cardinals": ["Paul Goldschmidt", "Nolan Arenado"],

    "Arizona Diamondbacks": ["Corbin Carroll", "Ketel Marte", "Christian Walker"],
    "Colorado Rockies": ["Elias Díaz", "Ryan McMahon"],
    "Los Angeles Dodgers": ["Freddie Freeman", "Mookie Betts", "Will Smith"],
    "San Diego Padres": ["Juan Soto", "Fernando Tatis Jr.", "Manny Machado"],
    "San Francisco Giants": ["LaMonte Wade Jr.", "Patrick Bailey"]
}

pitchers_2023 = {
    "Baltimore Orioles": ["Kyle Bradish"],
    "Boston Red Sox": ["Brayan Bello", "Kenley Jansen"],
    "New York Yankees": ["Gerrit Cole"],
    "Tampa Bay Rays": ["Shane McClanahan", "Zach Eflin"],
    "Toronto Blue Jays": ["Kevin Gausman", "Jordan Romano"],

    "Chicago White Sox": ["Dylan Cease"],
    "Cleveland Guardians": ["Emmanuel Clase"],
    "Detroit Tigers": ["Eduardo Rodriguez", "Tarik Skubal"],
    "Kansas City Royals": ["Cole Ragans"],
    "Minnesota Twins": ["Pablo López", "Sonny Gray", "Jhoan Duran"],

    "Houston Astros": ["Framber Valdez", "Justin Verlander"],
    "Los Angeles Angels": ["Shohei Ohtani"], # Ohtani for his arm
    "Oakland Athletics": ["JP Sears"],
    "Seattle Mariners": ["Luis Castillo", "George Kirby"],
    "Texas Rangers": ["Nathan Eovaldi", "Jordan Montgomery"],

    "Atlanta Braves": ["Spencer Strider"],
    "Miami Marlins": ["Jesús Luzardo", "Eury Pérez"],
    "New York Mets": ["Kodai Senga"],
    "Philadelphia Phillies": ["Zack Wheeler", "Aaron Nola"],
    "Washington Nationals": ["Josiah Gray"],

    "Chicago Cubs": ["Justin Steele", "Marcus Stroman"],
    "Cincinnati Reds": ["Alexis Díaz"],
    "Milwaukee Brewers": ["Corbin Burnes", "Freddy Peralta", "Devin Williams"],
    "Pittsburgh Pirates": ["Mitch Keller", "David Bednar"],
    "St. Louis Cardinals": ["Ryan Helsley"],

    "Arizona Diamondbacks": ["Zac Gallen", "Merrill Kelly"],
    "Colorado Rockies": ["Kyle Freeland"],
    "Los Angeles Dodgers": ["Clayton Kershaw"],
    "San Diego Padres": ["Blake Snell"],
    "San Francisco Giants": ["Logan Webb", "Alex Cobb"]
}

# 2024
# Star batters for the 2024 season: team name -> list of player names.
# Corrections to the generated list: Shohei Ohtani played 2024 for the Dodgers
# (not the Angels) and Juan Soto played 2024 for the Yankees (not the Padres),
# so each now appears only under the team he actually played for.
batters_2024 = {
    "Baltimore Orioles": ["Gunnar Henderson", "Adley Rutschman"],
    "Boston Red Sox": ["Rafael Devers", "Triston Casas"],
    "New York Yankees": ["Aaron Judge", "Juan Soto"],
    "Tampa Bay Rays": ["Yandy Díaz", "Isaac Paredes"],
    "Toronto Blue Jays": ["Vladimir Guerrero Jr.", "Bo Bichette"],

    "Chicago White Sox": ["Luis Robert Jr."],
    "Cleveland Guardians": ["José Ramírez", "Steven Kwan"],
    "Detroit Tigers": ["Riley Greene", "Spencer Torkelson"],
    "Kansas City Royals": ["Bobby Witt Jr.", "Salvador Perez"],
    "Minnesota Twins": ["Carlos Correa", "Royce Lewis"],

    "Houston Astros": ["Yordan Alvarez", "Jose Altuve", "Kyle Tucker"],
    "Los Angeles Angels": ["Mike Trout"], # Ohtani joined the Dodgers before the 2024 season
    "Oakland Athletics": ["Brent Rooker", "Zack Gelof"],
    "Seattle Mariners": ["Julio Rodríguez", "Cal Raleigh"],
    "Texas Rangers": ["Corey Seager", "Marcus Semien", "Adolis García"],

    "Atlanta Braves": ["Ronald Acuña Jr.", "Matt Olson", "Austin Riley"],
    "Miami Marlins": ["Luis Arraez", "Bryan De La Cruz"],
    "New York Mets": ["Francisco Lindor", "Pete Alonso"],
    "Philadelphia Phillies": ["Bryce Harper", "Kyle Schwarber", "Alec Bohm"],
    "Washington Nationals": ["CJ Abrams", "Lane Thomas"],

    "Chicago Cubs": ["Cody Bellinger", "Dansby Swanson"],
    "Cincinnati Reds": ["Elly De La Cruz", "Spencer Steer"],
    "Milwaukee Brewers": ["William Contreras", "Christian Yelich"],
    "Pittsburgh Pirates": ["Bryan Reynolds", "Oneil Cruz"],
    "St. Louis Cardinals": ["Paul Goldschmidt", "Nolan Arenado"],

    "Arizona Diamondbacks": ["Corbin Carroll", "Ketel Marte", "Christian Walker"],
    "Colorado Rockies": ["Elias Díaz", "Nolan Jones"],
    "Los Angeles Dodgers": ["Mookie Betts", "Freddie Freeman", "Shohei Ohtani"], # Ohtani as batter for 2024
    "San Diego Padres": ["Fernando Tatis Jr.", "Manny Machado"], # Soto was traded to the Yankees before 2024
    "San Francisco Giants": ["LaMonte Wade Jr.", "Patrick Bailey"]
}

# Star pitchers for the 2024 season: team name -> list of player names.
# Correction to the generated list: Garrett Crochet pitched the whole 2024
# season for the White Sox (his trade to Boston came after the season), so he
# is listed only there and the Red Sox entry names a 2024 Red Sox starter.
pitchers_2024 = {
    "Baltimore Orioles": ["Corbin Burnes"],
    "Boston Red Sox": ["Tanner Houck"], # NOTE(review): replaces Crochet, who was not a 2024 Red Sox pitcher — confirm choice
    "New York Yankees": ["Gerrit Cole"],
    "Tampa Bay Rays": ["Zach Eflin"],
    "Toronto Blue Jays": ["Kevin Gausman"],

    "Chicago White Sox": ["Garrett Crochet", "Erick Fedde"], # Crochet spent all of 2024 with the White Sox
    "Cleveland Guardians": ["Emmanuel Clase"],
    "Detroit Tigers": ["Tarik Skubal"],
    "Kansas City Royals": ["Seth Lugo", "Cole Ragans"],
    "Minnesota Twins": ["Pablo López", "Jhoan Duran"],

    "Houston Astros": ["Framber Valdez"],
    "Los Angeles Angels": [], # Ohtani didn't pitch in 2024
    "Oakland Athletics": ["Mason Miller"],
    "Seattle Mariners": ["Luis Castillo", "Logan Gilbert", "George Kirby"],
    "Texas Rangers": ["Nathan Eovaldi"],

    "Atlanta Braves": ["Spencer Strider", "Chris Sale"],
    "Miami Marlins": ["Jesús Luzardo", "Eury Pérez"],
    "New York Mets": ["Kodai Senga", "Edwin Díaz"],
    "Philadelphia Phillies": ["Zack Wheeler", "Aaron Nola"],
    "Washington Nationals": ["Josiah Gray"],

    "Chicago Cubs": ["Shota Imanaga", "Justin Steele"],
    "Cincinnati Reds": ["Nick Lodolo", "Alexis Díaz"],
    "Milwaukee Brewers": ["Freddy Peralta", "Devin Williams"],
    "Pittsburgh Pirates": ["Paul Skenes", "Mitch Keller"],
    "St. Louis Cardinals": ["Sonny Gray", "Ryan Helsley"],

    "Arizona Diamondbacks": ["Zac Gallen", "Merrill Kelly"],
    "Colorado Rockies": ["Cal Quantrill"],
    "Los Angeles Dodgers": ["Tyler Glasnow", "Yoshinobu Yamamoto"],
    "San Diego Padres": ["Dylan Cease", "Michael King"],
    "San Francisco Giants": ["Logan Webb", "Blake Snell"]
}

batters_2025 = {
    "Baltimore Orioles": ["Gunnar Henderson", "Adley Rutschman", "Ryan O'Hearn"],
    "Boston Red Sox": ["Rafael Devers", "Jarren Duran", "Triston Casas"],
    "New York Yankees": ["Aaron Judge", "Juan Soto"],
    "Tampa Bay Rays": ["Yandy Díaz", "Randy Arozarena", "Isaac Paredes"],
    "Toronto Blue Jays": ["Vladimir Guerrero Jr.", "Bo Bichette"],

    "Chicago White Sox": ["Luis Robert Jr.", "Andrew Vaughn"],
    "Cleveland Guardians": ["José Ramírez", "Steven Kwan", "Andrés Giménez"],
    "Detroit Tigers": ["Riley Greene", "Spencer Torkelson"],
    "Kansas City Royals": ["Bobby Witt Jr.", "Salvador Perez", "Vinnie Pasquantino"],
    "Minnesota Twins": ["Carlos Correa", "Royce Lewis", "Byron Buxton"],

    "Houston Astros": ["Yordan Alvarez", "Jose Altuve", "Kyle Tucker"],
    "Los Angeles Angels": ["Shohei Ohtani", "Mike Trout"], # Ohtani continues to bat well
    "Oakland Athletics": ["Brent Rooker", "Shea Langeliers", "Jacob Wilson"],
    "Seattle Mariners": ["Cal Raleigh", "Julio Rodríguez", "Ty France"],
    "Texas Rangers": ["Corey Seager", "Marcus Semien", "Adolis García"],

    "Atlanta Braves": ["Ronald Acuña Jr.", "Matt Olson", "Austin Riley"],
    "Miami Marlins": ["Luis Arraez", "Bryan De La Cruz"],
    "New York Mets": ["Pete Alonso", "Francisco Lindor", "Brandon Nimmo"],
    "Philadelphia Phillies": ["Bryce Harper", "Kyle Schwarber", "Alec Bohm"],
    "Washington Nationals": ["CJ Abrams", "Lane Thomas", "Joey Meneses"],

    "Chicago Cubs": ["Cody Bellinger", "Dansby Swanson", "Seiya Suzuki"],
    "Cincinnati Reds": ["Elly De La Cruz", "Spencer Steer", "Matt McLain"],
    "Milwaukee Brewers": ["William Contreras", "Christian Yelich", "Willy Adames"],
    "Pittsburgh Pirates": ["Bryan Reynolds", "Oneil Cruz"],
    "St. Louis Cardinals": ["Paul Goldschmidt", "Nolan Arenado", "Willson Contreras"],

    "Arizona Diamondbacks": ["Corbin Carroll", "Ketel Marte", "Christian Walker"],
    "Colorado Rockies": ["Elias Díaz", "Nolan Jones", "Ryan McMahon"],
    "Los Angeles Dodgers": ["Mookie Betts", "Freddie Freeman", "Will Smith"],
    "San Diego Padres": ["Juan Soto", "Fernando Tatis Jr.", "Manny Machado"],
    "San Francisco Giants": ["LaMonte Wade Jr.", "Patrick Bailey", "Matt Chapman"]
}

pitchers_2025 = {
    "Baltimore Orioles": ["Corbin Burnes"],
    "Boston Red Sox": ["Garrett Crochet"],
    "New York Yankees": ["Gerrit Cole", "Luis Gil", "Max Fried"], # Fried moved to Yankees, all pitching well
    "Tampa Bay Rays": ["Zach Eflin", "Shane McClanahan"],
    "Toronto Blue Jays": ["Kevin Gausman", "José Berríos"],

    "Chicago White Sox": ["Garrett Crochet", "Erick Fedde"],
    "Cleveland Guardians": ["Emmanuel Clase"],
    "Detroit Tigers": ["Tarik Skubal", "Jack Flaherty"],
    "Kansas City Royals": ["Cole Ragans", "Seth Lugo"],
    "Minnesota Twins": ["Pablo López", "Jhoan Duran"],

    "Houston Astros": ["Framber Valdez", "Hunter Brown", "Justin Verlander"],
    "Los Angeles Angels": [], # Ohtani is primarily batting impact
    "Oakland Athletics": ["Mason Miller", "Paul Blackburn"],
    "Seattle Mariners": ["Luis Castillo", "George Kirby", "Logan Gilbert"],
    "Texas Rangers": ["Nathan Eovaldi", "José Leclerc"],

    "Atlanta Braves": ["Spencer Strider", "Chris Sale"],
    "Miami Marlins": ["Jesús Luzardo", "Sandy Alcántara", "Eury Pérez"],
    "New York Mets": ["Kodai Senga", "Luis Severino", "Edwin Díaz"],
    "Philadelphia Phillies": ["Zack Wheeler", "Aaron Nola", "Ranger Suárez"],
    "Washington Nationals": ["Josiah Gray", "MacKenzie Gore"],

    "Chicago Cubs": ["Shota Imanaga", "Justin Steele"],
    "Cincinnati Reds": ["Hunter Greene", "Nick Lodolo", "Alexis Díaz"],
    "Milwaukee Brewers": ["Freddy Peralta", "Devin Williams"],
    "Pittsburgh Pirates": ["Paul Skenes", "Mitch Keller", "David Bednar"],
    "St. Louis Cardinals": ["Sonny Gray", "Ryan Helsley"],

    "Arizona Diamondbacks": ["Zac Gallen", "Merrill Kelly"],
    "Colorado Rockies": ["Cal Quantrill", "Austin Gomber"],
    "Los Angeles Dodgers": ["Tyler Glasnow", "Yoshinobu Yamamoto", "Bobby Miller"],
    "San Diego Padres": ["Dylan Cease", "Yu Darvish", "Michael King"],
    "San Francisco Giants": ["Logan Webb", "Jordan Hicks", "Blake Snell"]
}

STAR_BATTERS = {
    "2020": batters_2020,
    "2021": batters_2021,
    "2022": batters_2022, 
    "2023": batters_2023,
    "2024": batters_2024,
    "2025": batters_2025
}

STAR_PITCHERS = {
    "2020": pitchers_2020,
    "2021": pitchers_2021,
    "2022": pitchers_2022, 
    "2023": pitchers_2023,
    "2024": pitchers_2024,
    "2025": pitchers_2025
}

def get_star_players(
        team_name: str,
        year: int,
        star_dict: dict
        ) -> list[str]:
    """Return the star players listed for a team in a given season.

    Args:
        team_name: Team name used as the key inside the per-season mapping
            (e.g. 'New York Yankees').
        year: Season; converted with ``str`` to match the string keys of
            ``star_dict``.
        star_dict: Mapping of season string -> {team name -> list of player
            names}, such as STAR_BATTERS or STAR_PITCHERS.

    Returns:
        A new list of player names. Empty when either the season or the team
        is not present in ``star_dict``.
    """
    season_rosters = star_dict.get(str(year), {})
    # Return a copy: handing out the stored list would let a caller's
    # append/remove silently modify the shared lookup table.
    return list(season_rosters.get(team_name, []))

def count_missing_star_players(
    player_names: list[str],
    team_name: str,
    year: int,
    star_dict: dict
    ) -> int:
    """Count a team's star players that do not appear in ``player_names``.

    Args:
        player_names: Names of the players to check against (any iterable of
            names; a list or a pandas Series of names both work).
        team_name: Team whose star list is looked up.
        year: Season whose star list is looked up.
        star_dict: Mapping of season string -> {team name -> list of player
            names}, such as STAR_BATTERS or STAR_PITCHERS.

    Returns:
        Number of star players for ``team_name``/``year`` whose name is not in
        ``player_names``. 0 when the team or season has no star list.
    """
    # Build a set from the *values*: `in` on a pandas Series tests the index
    # labels, not the values, and a set also makes each lookup O(1).
    present = set(player_names)
    stars = get_star_players(team_name, year, star_dict)
    return sum(1 for star in stars if star not in present)
    



In [43]:
# Smoke test: look up the Yankees' star pitchers for the 2020 season.
get_star_players(team_name='New York Yankees', year=2020, star_dict=STAR_PITCHERS)

['Gerrit Cole', 'Chad Green', 'Zack Britton']

In [64]:
# Count star pitchers absent from the full home roster of game 634642.
# NOTE(review): df_g lists game 634642 as played on 2021-04-01, so the 2020
# star list may be the wrong season for this game — confirm.
count_missing_star_players(
    player_names=df_ps2.query("game_pk==634642 & team_side=='home'")['player_name'].tolist(),
    team_name='New York Yankees',
    year=2020,
    star_dict=STAR_PITCHERS,
)

1

In [56]:
# Cast innings_pitched to float so it can be compared numerically (the cells
# below filter on innings_pitched > 0).
# NOTE(review): this overwrites the column on df_ps2 itself, so later cells
# depend on this cell having been run first — presumably the loader returns a
# non-float dtype; confirm.
# NOTE(review): displayed values such as 5.1 look like baseball innings notation
# (5 innings plus 1 out) rather than true decimals — confirm before doing
# arithmetic on this column.
df_ps2['innings_pitched'] = df_ps2['innings_pitched'].astype(float)

In [60]:
# Names of the home-side players who recorded innings pitched in game 634642.
nyy_pitchers = df_ps2.loc[
    (df_ps2['game_pk'] == 634642)
    & (df_ps2['team_side'] == 'home')
    & (df_ps2['innings_pitched'] > 0),
    'player_name',
].tolist()

In [61]:
nyy_pitchers

['Jonathan Loáisiga',
 "Darren O'Day",
 'Chad Green',
 'Nick Nelson',
 'Gerrit Cole']

In [62]:
# Count star pitchers who did not pitch for the home side in game 634642.
# NOTE(review): df_g lists game 634642 as played on 2021-04-01, so the 2020
# star list may be the wrong season for this game — confirm.
count_missing_star_players(
    player_names=nyy_pitchers,
    team_name='New York Yankees',
    year=2020,
    star_dict=STAR_PITCHERS,
)

1

In [None]:
# For every game, count how many of the home team's star pitchers did not pitch.
# The home-side pitchers are grouped once up front instead of re-filtering the
# whole player-stats frame for each game.
pitched_at_home = df_ps2[
    (df_ps2['team_side'] == 'home') &
    (df_ps2['innings_pitched'] > 0)
]
home_pitchers_by_game = (
    pitched_at_home.groupby('game_pk')['player_name'].agg(list).to_dict()
)

# game_id -> number of home star pitchers with no innings pitched in that game.
missing_home_star_pitchers = {}
for game in df_g.itertuples(index=False):
    # df_g's key column is `game_id`; there is no `game_ids` column.
    home_pitchers = home_pitchers_by_game.get(game.game_id, [])
    missing_home_star_pitchers[game.game_id] = count_missing_star_players(
        home_pitchers,
        # NOTE(review): a team name absent from that season's star dict (e.g. a
        # franchise listed under a different name) yields 0 — verify names match.
        game.home_team,
        # NOTE(review): season is taken as the calendar year of game_date — confirm.
        pd.to_datetime(game.game_date).year,
        STAR_PITCHERS,
    )
    

In [65]:
d2.query("player_name=='Gerrit Cole' & season==2021").head()

Unnamed: 0,season,game_pk,team_id,player_id,player_name,innings_pitched
22,2021,634642,147,543037,Gerrit Cole,5.1
1110,2021,634644,147,543037,Gerrit Cole,
1939,2021,634607,147,543037,Gerrit Cole,
2720,2021,634587,147,543037,Gerrit Cole,
3445,2021,634599,147,543037,Gerrit Cole,7.0


In [66]:
df_ps2.head(2)

Unnamed: 0,game_pk,team_id,team_side,player_id,player_name,at_bats,runs_scored,hits,home_runs,rbis,walks_batting,strikeouts_batting,left_on_base,stolen_bases,innings_pitched,hits_allowed,runs_allowed,earned_runs,strikeouts_pitching,walks_pitching,pitches_thrown,putouts,assists,errors,game_id,game_date_time,game_date,season
0,634642,147,home,476595,Lucas Luetge,,,,,,,,,,,,,,,,,,,,634642.0,2021-04-01 17:05:00+00:00,2021-04-01,2021
1,634642,147,home,518934,DJ LeMahieu,4.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,,,,,,,,4.0,3.0,0.0,634642.0,2021-04-01 17:05:00+00:00,2021-04-01,2021


In [37]:
df_g.head()

Unnamed: 0,game_id,game_date,game_date_time,home_team_id,away_team_id,home_team,away_team,home_score,away_score,state,venue,game_type
0,634642,2021-04-01,2021-04-01 17:05:00+00:00,147,141,New York Yankees,Toronto Blue Jays,2,3,Final,Yankee Stadium,R
1,634645,2021-04-01,2021-04-01 17:10:00+00:00,116,114,Detroit Tigers,Cleveland Indians,3,2,Final,Comerica Park,R
2,634638,2021-04-01,2021-04-01 18:10:00+00:00,158,142,Milwaukee Brewers,Minnesota Twins,6,5,Final,American Family Field,R
3,634634,2021-04-01,2021-04-01 18:20:00+00:00,112,134,Chicago Cubs,Pittsburgh Pirates,3,5,Final,Wrigley Field,R
4,634622,2021-04-01,2021-04-01 19:05:00+00:00,143,144,Philadelphia Phillies,Atlanta Braves,3,2,Final,Citizens Bank Park,R


In [41]:
df_ps.query("game_pk==634642 & team_side=='home'")['player_name']

0          Lucas Luetge
1           DJ LeMahieu
2          Michael King
3        Domingo Germán
4         Mike Tauchman
5         Clint Frazier
6           Aaron Judge
7     Jonathan Loáisiga
8       Jameson Taillon
9           Gio Urshela
10         Darren O'Day
11          Aaron Hicks
12            Jay Bruce
13      Kyle Higashioka
14       Gleyber Torres
15        Brett Gardner
16           Tyler Wade
17           Chad Green
18    Giancarlo Stanton
19           Luis Cessa
20          Nick Nelson
21         Gary Sánchez
22          Gerrit Cole
23    Jordan Montgomery
24         Corey Kluber
Name: player_name, dtype: object