In [1]:
import sys
import os

# Put the parent of the notebook's working directory on the import path
# (presumably the repository root that holds the `features` package — confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
project_root = os.path.dirname(os.getcwd())
if project_root not in sys.path:
    sys.path.append(project_root)

import pandas as pd
from datetime import datetime, timedelta

# Disable pandas' row/column truncation so frames and value counts are
# rendered in full (None means "no limit" for both options).
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [2]:
from features.load_team_stats_from_db import load_team_stats_from_db
from features.load_games_from_db import load_games_from_db
from features.load_player_stats_from_db import load_player_stats_from_db

In [3]:
# Load the three raw tables used throughout the notebook. Each loader is a
# project helper imported from `features`; judging by the cell output, each
# one logs and prints how many rows it loaded.
df_g = load_games_from_db()          # games (queried by game_id below)
df_ts = load_team_stats_from_db()    # team-level stats (previewed at the end of the notebook)
df_ps = load_player_stats_from_db()  # player-level stats (keyed by game_pk / player_id below)

2025-07-02 21:51:24,558 [INFO] Successully loaded 10641 games from database


There are 10641 games for  modeling...


2025-07-02 21:51:24,806 [INFO] Successully loaded 25560 rows from database


There are 25560 team-games for  modeling...


2025-07-02 21:51:27,533 [INFO] Successully loaded 933365 rows from database


There are 933365 players for  modeling...


In [4]:
# Peek at the first three player-stat rows.
df_ps.iloc[:3]

Unnamed: 0,game_pk,team_id,team_side,player_id,player_name,at_bats,runs_scored,hits,home_runs,rbis,walks_batting,strikeouts_batting,left_on_base,stolen_bases,innings_pitched,hits_allowed,runs_allowed,earned_runs,strikeouts_pitching,walks_pitching,pitches_thrown,putouts,assists,errors
0,634642,147,home,476595,Lucas Luetge,,,,,,,,,,,,,,,,,,,
1,634642,147,home,518934,DJ LeMahieu,4.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,,,,,,,,4.0,3.0,0.0
2,634642,147,home,650633,Michael King,,,,,,,,,,,,,,,,,,,


In [5]:
# Count missing values in every column of the player stats.
df_ps.isnull().sum(axis=0)

game_pk                     0
team_id                     0
team_side                   0
player_id                   0
player_name                 0
at_bats                633560
runs_scored            633560
hits                   633560
home_runs              633560
rbis                   633560
walks_batting          633560
strikeouts_batting     633560
left_on_base           633560
stolen_bases           633560
innings_pitched        817144
hits_allowed           817144
runs_allowed           817144
earned_runs            817144
strikeouts_pitching    817144
walks_pitching         817144
pitches_thrown         817144
putouts                529765
assists                529765
errors                 529765
dtype: int64

In [6]:
# Look up the game whose id matches the game_pk of the player rows shown above.
df_g.loc[df_g['game_id'] == 634642]

Unnamed: 0,game_id,game_date,game_date_time,home_team_id,away_team_id,home_team,away_team,home_score,away_score,state,venue,game_type
0,634642,2021-04-01,2021-04-01 17:05:00+00:00,147,141,New York Yankees,Toronto Blue Jays,2,3,Final,Yankee Stadium,R


In [7]:
# First few stat rows for player_id 476595 (Lucas Luetge in the preview above).
df_ps.loc[df_ps['player_id'] == 476595].head()

Unnamed: 0,game_pk,team_id,team_side,player_id,player_name,at_bats,runs_scored,hits,home_runs,rbis,walks_batting,strikeouts_batting,left_on_base,stolen_bases,innings_pitched,hits_allowed,runs_allowed,earned_runs,strikeouts_pitching,walks_pitching,pitches_thrown,putouts,assists,errors
0,634642,147,home,476595,Lucas Luetge,,,,,,,,,,,,,,,,,,,
1088,634644,147,home,476595,Lucas Luetge,,,,,,,,,,1.0,1.0,1.0,1.0,1.0,0.0,15.0,0.0,0.0,0.0
1917,634607,147,home,476595,Lucas Luetge,,,,,,,,,,,,,,,,,,,
2698,634587,147,home,476595,Lucas Luetge,,,,,,,,,,,,,,,,,,,
3424,634599,147,home,476595,Lucas Luetge,,,,,,,,,,1.0,3.0,2.0,2.0,1.0,0.0,25.0,0.0,0.0,0.0


In [8]:
# Attach each game's timestamp and date to the player rows.
# Join keys: game_pk on the player side, game_id on the games side.
df_ps1 = df_ps.merge(
    df_g[['game_id', 'game_date_time', 'game_date']],
    left_on='game_pk',
    right_on='game_id',
    how='left',
)

# The left join leaves game_date_time null when no game row matched, so
# keeping the non-null rows restricts the data to games present in df_g.
df_ps2 = df_ps1.loc[df_ps1['game_date_time'].notna()].copy()

In [9]:
# Season is taken as the calendar year of the game's timestamp.
df_ps2 = df_ps2.assign(season=df_ps2['game_date_time'].dt.year)

In [10]:
# Working copy limited to the identifying columns plus innings_pitched.
d2 = df_ps2.loc[
    :,
    ['season', 'game_pk', 'team_id', 'player_id', 'player_name', 'innings_pitched'],
].copy()

In [11]:
# Preview the first five rows of the pitching subset.
d2.iloc[:5]

Unnamed: 0,season,game_pk,team_id,player_id,player_name,innings_pitched
0,2021,634642,147,476595,Lucas Luetge,
1,2021,634642,147,518934,DJ LeMahieu,
2,2021,634642,147,650633,Michael King,
3,2021,634642,147,593334,Domingo Germán,
4,2021,634642,147,643565,Mike Tauchman,


In [12]:
# Number of player rows in each season.
d2['player_name'].groupby(d2['season']).size()

season
2021    123307
2022    126278
2023    124935
2024    125803
2025     57914
Name: player_name, dtype: int64

In [13]:
# Rows per season for player_id 476595, keeping any missing seasons visible.
d2.loc[d2['player_id'] == 476595, 'season'].value_counts(dropna=False)

season
2021    157
2022    155
2023     34
Name: count, dtype: int64

In [14]:
# Distribution of innings_pitched (missing values included) for
# player_id 476595 in the 2021 season.
d2.loc[
    d2['player_id'].eq(476595) & d2['season'].eq(2021),
    'innings_pitched',
].value_counts(dropna=False)

innings_pitched
None    101
1.0      21
1.1       9
2.0       8
0.2       6
0.1       5
1.2       4
3.0       2
4.0       1
Name: count, dtype: int64

- The output above suggests that available players are listed for a game even if they do not play in it.
- Cross-referencing Lucas Luetge on Baseball-Reference shows that he played 57 games in the 2021 season.
- The previous cell shows that, of the 157 game rows in which this player appears, 101 have `None` for `innings_pitched`. That leaves 157 - 101 = 56 games with recorded innings, just one short of the 57 games listed on Baseball-Reference.

In [15]:
# Batting-side working copy: identifying columns, game date and at_bats.
d3 = df_ps2.loc[
    :,
    ['season', 'game_date', 'game_pk', 'team_id', 'player_id', 'player_name', 'at_bats'],
].copy()

# Preview the rows whose player_name matches 'Bryce Harper'.
d3.loc[d3['player_name'].str.contains('Bryce Harper')].head()

Unnamed: 0,season,game_date,game_pk,team_id,player_id,player_name,at_bats
211,2021,2021-04-01,634622,143,547180,Bryce Harper,4.0
1351,2021,2021-04-03,634580,143,547180,Bryce Harper,3.0
1868,2021,2021-04-04,634617,143,547180,Bryce Harper,3.0
2858,2021,2021-04-05,634606,143,547180,Bryce Harper,1.0
3582,2021,2021-04-06,634608,143,547180,Bryce Harper,4.0


In [16]:
# Independent copy of the rows whose player_name matches 'Bryce Harper'.
d_bh = d3.loc[d3['player_name'].str.contains('Bryce Harper')].copy()

In [17]:
# Rows per season for the Bryce Harper subset (missing seasons included).
d_bh.loc[:, 'season'].value_counts(dropna=False)

season
2024    149
2021    146
2023    128
2022    108
2025     60
Name: count, dtype: int64

In [18]:
# Per Baseball-Reference, Bryce Harper missed 5 games with an injury
# between May 28th and June 2nd.
# Distribution of at_bats across his 2021 rows, with missing values counted.
d_bh.loc[d_bh['season'] == 2021, 'at_bats'].value_counts(dropna=False)

at_bats
4.0    57
3.0    46
5.0    14
2.0    13
NaN    10
1.0     5
6.0     1
Name: count, dtype: int64

In [19]:
# 2021 rows in the Bryce Harper subset that have no at_bats recorded.
d_bh.loc[d_bh['season'].eq(2021) & d_bh['at_bats'].isna()]

Unnamed: 0,season,game_date,game_pk,team_id,player_id,player_name,at_bats
10990,2021,2021-04-17,634470,143,547180,Bryce Harper,
19250,2021,2021-04-29,634352,143,547180,Bryce Harper,
19640,2021,2021-04-30,634336,143,547180,Bryce Harper,
20724,2021,2021-05-01,634276,143,547180,Bryce Harper,
21909,2021,2021-05-03,634244,143,547180,Bryce Harper,
22741,2021,2021-05-04,634324,143,547180,Bryce Harper,
23575,2021,2021-05-05,634294,143,547180,Bryce Harper,
24035,2021,2021-05-06,634283,143,547180,Bryce Harper,
52991,2021,2021-06-16,633677,143,547180,Bryce Harper,
59952,2021,2021-06-27,633504,143,547180,Bryce Harper,


In [29]:
# 2021 rows dated on or before 1 July 2021. The cutoff Timestamp is converted
# with .date() before being compared against the game_date column.
d_bh.loc[
    d_bh['season'].eq(2021)
    & (d_bh['game_date'] <= pd.Timestamp('2021-07-01').date())
]

Unnamed: 0,season,game_date,game_pk,team_id,player_id,player_name,at_bats
211,2021,2021-04-01,634622,143,547180,Bryce Harper,4.0
1351,2021,2021-04-03,634580,143,547180,Bryce Harper,3.0
1868,2021,2021-04-04,634617,143,547180,Bryce Harper,3.0
2858,2021,2021-04-05,634606,143,547180,Bryce Harper,1.0
3582,2021,2021-04-06,634608,143,547180,Bryce Harper,4.0
4410,2021,2021-04-07,634626,143,547180,Bryce Harper,4.0
5473,2021,2021-04-09,634534,143,547180,Bryce Harper,2.0
6250,2021,2021-04-10,634524,143,547180,Bryce Harper,4.0
7101,2021,2021-04-11,632207,143,547180,Bryce Harper,4.0
8096,2021,2021-04-13,632189,143,547180,Bryce Harper,3.0


In [26]:
# Check what type pd.to_datetime yields for a single game_date value.
type(pd.to_datetime(d_bh['game_date'].iat[0]))

pandas._libs.tslibs.timestamps.Timestamp

In [30]:
# Preview the first five rows of the team-level stats.
df_ts.iloc[:5]

Unnamed: 0,game_pk,team_side,team_id,runs_batting,hits_batting,strikeouts_batting,baseonballs_batting,avg,obp,slg,pitchesthrown,balls_pitching,strikes_pitching,strikeouts_pitching,baseonballs_pitching,hits_pitching,earnedruns,homeruns_pitching,runs_pitching,era,whip,groundouts_pitching,airouts_pitching,total,putouts,assists,errors,doubleplays,tripleplays,rangefactor,caughtstealing,passedball,innings
0,634642,home,147,2,6,13,6,0.171,0.293,0.257,159,55,104,13,2,8,2,1,3,1.8,1.0,4,11,0,30,7,0,0,0,0.0,1,0,0.0
1,634642,away,141,3,8,13,2,0.222,0.263,0.333,181,78,103,13,6,6,2,1,2,1.8,1.2,10,6,0,30,10,0,0,0,0.0,0,0,0.0
2,634645,home,116,3,5,14,5,0.167,0.286,0.3,154,60,94,5,6,6,2,1,2,2.0,1.33,11,10,0,27,12,0,0,0,0.0,0,0,0.0
3,634645,away,114,2,6,5,6,0.188,0.316,0.313,125,48,77,14,5,5,3,1,3,3.38,1.25,5,6,0,24,6,1,0,0,0.0,0,0,0.0
4,634638,home,158,6,11,9,2,0.275,0.341,0.3,189,74,115,17,7,10,5,1,5,4.5,1.7,7,6,0,30,6,0,0,0,0.0,0,0,0.0
