In [1]:
import sys
import os

sys.path.append(os.path.dirname(os.getcwd())) 

import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
from features.load_games_from_db import load_games_from_db

In [3]:
df = load_games_from_db()

2025-06-26 19:35:12,767 [INFO] Successully loaded 10641 games from database


There 10641 for  modeling...


In [4]:
df.head()

Unnamed: 0,game_id,game_date,game_date_time,home_team_id,away_team_id,home_team,away_team,home_score,away_score,state,venue,game_type
0,634642,2021-04-01,2021-04-01 17:05:00+00:00,147,141,New York Yankees,Toronto Blue Jays,2,3,Final,Yankee Stadium,R
1,634645,2021-04-01,2021-04-01 17:10:00+00:00,116,114,Detroit Tigers,Cleveland Indians,3,2,Final,Comerica Park,R
2,634638,2021-04-01,2021-04-01 18:10:00+00:00,158,142,Milwaukee Brewers,Minnesota Twins,6,5,Final,American Family Field,R
3,634634,2021-04-01,2021-04-01 18:20:00+00:00,112,134,Chicago Cubs,Pittsburgh Pirates,3,5,Final,Wrigley Field,R
4,634622,2021-04-01,2021-04-01 19:05:00+00:00,143,144,Philadelphia Phillies,Atlanta Braves,3,2,Final,Citizens Bank Park,R


In [5]:
df.shape

(10641, 12)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10641 entries, 0 to 10640
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   game_id         10641 non-null  int64              
 1   game_date       10641 non-null  object             
 2   game_date_time  10641 non-null  datetime64[ns, UTC]
 3   home_team_id    10641 non-null  int64              
 4   away_team_id    10641 non-null  int64              
 5   home_team       10641 non-null  object             
 6   away_team       10641 non-null  object             
 7   home_score      10641 non-null  int64              
 8   away_score      10641 non-null  int64              
 9   state           10641 non-null  object             
 10  venue           10641 non-null  object             
 11  game_type       10641 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(5), object(6)
memory usage: 997.7+ KB


In [7]:
df['game_date'] = pd.to_datetime(df['game_date'])

In [8]:
df.loc[df['home_team']=='Cleveland Indians', 'home_team'] = 'Cleveland Guardians'
df.loc[df['away_team']=='Cleveland Indians', 'away_team'] = 'Cleveland Guardians'

df.loc[df['home_team']=='Athletics', 'home_team'] = 'Oakland Athletics'
df.loc[df['away_team']=='Athletics', 'away_team'] = 'Oakland Athletics'

In [9]:
def team_schedule(
    df: pd.DataFrame, 
    date_col: str = 'game_date', 
    date_time_col: str = 'game_date_time'
) -> pd.DataFrame:
    # Extract home and away games for every team
    home_schedule = df[['home_team',date_col, date_time_col]].rename(columns={'home_team': 'team'}).assign(home_ind=1)
    away_schedule = df[['away_team', date_col, date_time_col]].rename(columns={'away_team': 'team'}).assign(home_ind=0)

    # Join them into one 'team' column
    team_schedule = (
        pd.concat([home_schedule, away_schedule])
        .sort_values(['team',date_time_col])
    )

    return team_schedule
def team_rest_days(
    df: pd.DataFrame,
    date_col: str = 'game_date',
    date_time_col: str = 'game_date_time'
):
    df_team_sched = team_schedule(df, date_time_col=date_time_col, date_col=date_col)
    return (
        df_team_sched
        .groupby('team')[date_col]
        .diff()
        .dt.days
        .fillna(value=0)
        -1
    ).clip(lower=0)

def get_sched_features(df: pd.DataFrame, date_col: str = 'game_date', date_time_col: str = 'game_date_time'):
    # create team schedule data
    df_team_sched = team_sched(df, date_col=date_col, date_time_col=date_time_col)
    # add team_rest_days
    df_team_sched['team_rest_days'] = team_rest_days(df)
    # add rest_days_7day_avg
    df_team_sched['team_rest_days_7day_avg'] = team_rest_days_7day_avg(df)

In [10]:
d1 = team_schedule(df)

In [11]:
df.head()

Unnamed: 0,game_id,game_date,game_date_time,home_team_id,away_team_id,home_team,away_team,home_score,away_score,state,venue,game_type
0,634642,2021-04-01,2021-04-01 17:05:00+00:00,147,141,New York Yankees,Toronto Blue Jays,2,3,Final,Yankee Stadium,R
1,634645,2021-04-01,2021-04-01 17:10:00+00:00,116,114,Detroit Tigers,Cleveland Guardians,3,2,Final,Comerica Park,R
2,634638,2021-04-01,2021-04-01 18:10:00+00:00,158,142,Milwaukee Brewers,Minnesota Twins,6,5,Final,American Family Field,R
3,634634,2021-04-01,2021-04-01 18:20:00+00:00,112,134,Chicago Cubs,Pittsburgh Pirates,3,5,Final,Wrigley Field,R
4,634622,2021-04-01,2021-04-01 19:05:00+00:00,143,144,Philadelphia Phillies,Atlanta Braves,3,2,Final,Citizens Bank Park,R


In [12]:
d1.head()

Unnamed: 0,team,game_date,game_date_time,home_ind
6,Arizona Diamondbacks,2021-04-01,2021-04-01 20:10:00+00:00,0
17,Arizona Diamondbacks,2021-04-02,2021-04-03 02:10:00+00:00,0
30,Arizona Diamondbacks,2021-04-03,2021-04-04 00:40:00+00:00,0
43,Arizona Diamondbacks,2021-04-04,2021-04-04 20:10:00+00:00,0
68,Arizona Diamondbacks,2021-04-06,2021-04-07 00:40:00+00:00,0


In [14]:
d1['team_rest_days'] = team_rest_days(df)

In [15]:
d1.head()

Unnamed: 0,team,game_date,game_date_time,home_ind,team_rest_days
6,Arizona Diamondbacks,2021-04-01,2021-04-01 20:10:00+00:00,0,0.0
17,Arizona Diamondbacks,2021-04-02,2021-04-03 02:10:00+00:00,0,0.0
30,Arizona Diamondbacks,2021-04-03,2021-04-04 00:40:00+00:00,0,0.0
43,Arizona Diamondbacks,2021-04-04,2021-04-04 20:10:00+00:00,0,0.0
68,Arizona Diamondbacks,2021-04-06,2021-04-07 00:40:00+00:00,0,1.0


In [21]:
d2 = df.merge(d1[['team','game_date','team_rest_days']], how='left', left_on=['home_team','game_date'], right_on=['team','game_date'])
d2.rename(columns={'team_rest_days': 'home_team_rest_days'}, inplace=True)
d3 = d2.merge(d1[['team','game_date','team_rest_days']], how='left', left_on=['away_team','game_date'], right_on=['team','game_date'])
d3.rename(columns={'team_rest_days': 'away_team_rest_days'},inplace=True)

In [22]:
d3.head()

Unnamed: 0,game_id,game_date,game_date_time,home_team_id,away_team_id,home_team,away_team,home_score,away_score,state,venue,game_type,team_x,home_team_rest_days,team_y,away_team_rest_days
0,634642,2021-04-01,2021-04-01 17:05:00+00:00,147,141,New York Yankees,Toronto Blue Jays,2,3,Final,Yankee Stadium,R,New York Yankees,0.0,Toronto Blue Jays,0.0
1,634645,2021-04-01,2021-04-01 17:10:00+00:00,116,114,Detroit Tigers,Cleveland Guardians,3,2,Final,Comerica Park,R,Detroit Tigers,0.0,Cleveland Guardians,0.0
2,634638,2021-04-01,2021-04-01 18:10:00+00:00,158,142,Milwaukee Brewers,Minnesota Twins,6,5,Final,American Family Field,R,Milwaukee Brewers,0.0,Minnesota Twins,0.0
3,634634,2021-04-01,2021-04-01 18:20:00+00:00,112,134,Chicago Cubs,Pittsburgh Pirates,3,5,Final,Wrigley Field,R,Chicago Cubs,0.0,Pittsburgh Pirates,0.0
4,634622,2021-04-01,2021-04-01 19:05:00+00:00,143,144,Philadelphia Phillies,Atlanta Braves,3,2,Final,Citizens Bank Park,R,Philadelphia Phillies,0.0,Atlanta Braves,0.0


In [17]:
d1.tail(20)

Unnamed: 0,team,game_date,game_date_time,home_ind,team_rest_days
10359,Washington Nationals,2025-05-30,2025-05-31 01:40:00+00:00,0,0.0
10377,Washington Nationals,2025-05-31,2025-06-01 02:10:00+00:00,0,0.0
10389,Washington Nationals,2025-06-01,2025-06-01 20:10:00+00:00,0,0.0
10402,Washington Nationals,2025-06-03,2025-06-03 22:45:00+00:00,1,1.0
10419,Washington Nationals,2025-06-04,2025-06-04 22:45:00+00:00,1,0.0
10438,Washington Nationals,2025-06-05,2025-06-05 22:45:00+00:00,1,0.0
10443,Washington Nationals,2025-06-06,2025-06-06 22:45:00+00:00,1,0.0
10459,Washington Nationals,2025-06-07,2025-06-07 20:05:00+00:00,1,0.0
10473,Washington Nationals,2025-06-08,2025-06-08 17:35:00+00:00,1,0.0
10499,Washington Nationals,2025-06-10,2025-06-10 23:10:00+00:00,0,1.0


In [14]:
d1.head()

Unnamed: 0,team,game_date,game_date_time,home_ind
6,Arizona Diamondbacks,2021-04-01,2021-04-01 20:10:00+00:00,0
17,Arizona Diamondbacks,2021-04-02,2021-04-03 02:10:00+00:00,0
30,Arizona Diamondbacks,2021-04-03,2021-04-04 00:40:00+00:00,0
43,Arizona Diamondbacks,2021-04-04,2021-04-04 20:10:00+00:00,0
68,Arizona Diamondbacks,2021-04-06,2021-04-07 00:40:00+00:00,0


In [27]:
d1['rest_days'] = (
    d1
    .sort_values(['team','game_date_time'])
    .groupby('team')['game_date_time'].diff().dt.days.fillna(value=0)
    -1
).clip(lower=0)

In [28]:
d1.head(30)

Unnamed: 0,team,game_date,game_date_time,home_ind,rest_days
6,Arizona Diamondbacks,2021-04-01,2021-04-01 20:10:00+00:00,0,0.0
17,Arizona Diamondbacks,2021-04-02,2021-04-03 02:10:00+00:00,0,0.0
30,Arizona Diamondbacks,2021-04-03,2021-04-04 00:40:00+00:00,0,0.0
43,Arizona Diamondbacks,2021-04-04,2021-04-04 20:10:00+00:00,0,0.0
68,Arizona Diamondbacks,2021-04-06,2021-04-07 00:40:00+00:00,0,1.0
85,Arizona Diamondbacks,2021-04-07,2021-04-08 00:40:00+00:00,0,0.0
89,Arizona Diamondbacks,2021-04-08,2021-04-08 19:10:00+00:00,0,0.0
103,Arizona Diamondbacks,2021-04-09,2021-04-10 01:40:00+00:00,1,0.0
116,Arizona Diamondbacks,2021-04-10,2021-04-11 00:10:00+00:00,1,0.0
128,Arizona Diamondbacks,2021-04-11,2021-04-11 20:10:00+00:00,1,0.0


In [30]:
d1[d1['rest_days']>1].head(20)

Unnamed: 0,team,game_date,game_date_time,home_ind,rest_days
1319,Arizona Diamondbacks,2021-07-16,2021-07-17 01:40:00+00:00,1,4.0
2362,Arizona Diamondbacks,2022-04-07,2022-04-08 01:40:00+00:00,1,185.0
3722,Arizona Diamondbacks,2022-07-22,2022-07-23 01:40:00+00:00,1,4.0
4752,Arizona Diamondbacks,2023-03-30,2023-03-31 02:10:00+00:00,0,175.0
6079,Arizona Diamondbacks,2023-07-14,2023-07-14 23:07:00+00:00,0,4.0
7142,Arizona Diamondbacks,2024-03-28,2024-03-29 02:10:00+00:00,1,178.0
8559,Arizona Diamondbacks,2024-07-19,2024-07-19 18:20:00+00:00,0,3.0
9538,Arizona Diamondbacks,2025-03-27,2025-03-28 02:10:00+00:00,1,178.0
744,Atlanta Braves,2021-05-29,2021-05-29 23:15:00+00:00,0,2.0
1313,Atlanta Braves,2021-07-16,2021-07-16 23:20:00+00:00,1,4.0


In [32]:
d1[(d1['game_date']>='2021-07-09') & (d1['game_date']<='2021-07-19')]

Unnamed: 0,team,game_date,game_date_time,home_ind,rest_days
1276,Arizona Diamondbacks,2021-07-09,2021-07-10 02:10:00+00:00,0,0.0
1291,Arizona Diamondbacks,2021-07-10,2021-07-11 02:10:00+00:00,0,0.0
1304,Arizona Diamondbacks,2021-07-11,2021-07-11 20:10:00+00:00,0,0.0
1319,Arizona Diamondbacks,2021-07-16,2021-07-17 01:40:00+00:00,1,4.0
1323,Arizona Diamondbacks,2021-07-17,2021-07-17 20:10:00+00:00,1,0.0
1348,Arizona Diamondbacks,2021-07-18,2021-07-18 20:10:00+00:00,1,0.0
1359,Arizona Diamondbacks,2021-07-19,2021-07-20 01:40:00+00:00,1,0.0
1270,Atlanta Braves,2021-07-09,2021-07-09 23:10:00+00:00,0,1.0
1286,Atlanta Braves,2021-07-10,2021-07-10 20:10:00+00:00,0,0.0
1298,Atlanta Braves,2021-07-11,2021-07-11 17:10:00+00:00,0,0.0


In [18]:
df[df['away_team']=='Arizona Diamondbacks'].head(10)

Unnamed: 0,game_id,game_date,game_date_time,home_team_id,away_team_id,home_team,away_team,home_score,away_score,state,venue,game_type
6,634618,2021-04-01,2021-04-01 20:10:00+00:00,135,109,San Diego Padres,Arizona Diamondbacks,8,7,Final,Petco Park,R
17,634576,2021-04-02,2021-04-03 02:10:00+00:00,135,109,San Diego Padres,Arizona Diamondbacks,4,2,Final,Petco Park,R
30,634623,2021-04-03,2021-04-04 00:40:00+00:00,135,109,San Diego Padres,Arizona Diamondbacks,7,0,Final,Petco Park,R
43,634572,2021-04-04,2021-04-04 20:10:00+00:00,135,109,San Diego Padres,Arizona Diamondbacks,1,3,Final,Petco Park,R
68,634619,2021-04-06,2021-04-07 00:40:00+00:00,115,109,Colorado Rockies,Arizona Diamondbacks,8,10,Final,Coors Field,R
85,634542,2021-04-07,2021-04-08 00:40:00+00:00,115,109,Colorado Rockies,Arizona Diamondbacks,8,0,Final,Coors Field,R
89,634564,2021-04-08,2021-04-08 19:10:00+00:00,115,109,Colorado Rockies,Arizona Diamondbacks,7,3,Final,Coors Field,R
175,634481,2021-04-15,2021-04-15 23:05:00+00:00,120,109,Washington Nationals,Arizona Diamondbacks,6,11,Final,Nationals Park,R
181,634469,2021-04-16,2021-04-16 23:05:00+00:00,120,109,Washington Nationals,Arizona Diamondbacks,1,0,Final,Nationals Park,R
192,634507,2021-04-17,2021-04-17 17:05:00+00:00,120,109,Washington Nationals,Arizona Diamondbacks,6,2,Final,Nationals Park,R


In [16]:
df[df['home_team']=='Arizona Diamondbacks'].head()

Unnamed: 0,game_id,game_date,game_date_time,home_team_id,away_team_id,home_team,away_team,home_score,away_score,state,venue,game_type
103,634540,2021-04-09,2021-04-10 01:40:00+00:00,109,113,Arizona Diamondbacks,Cincinnati Reds,5,6,Final,Chase Field,R
116,634484,2021-04-10,2021-04-11 00:10:00+00:00,109,113,Arizona Diamondbacks,Cincinnati Reds,8,3,Final,Chase Field,R
128,632209,2021-04-11,2021-04-11 20:10:00+00:00,109,113,Arizona Diamondbacks,Cincinnati Reds,7,0,Final,Chase Field,R
140,632231,2021-04-12,2021-04-13 01:40:00+00:00,109,133,Arizona Diamondbacks,Oakland Athletics,5,9,Final,Chase Field,R
143,632206,2021-04-13,2021-04-13 19:40:00+00:00,109,133,Arizona Diamondbacks,Oakland Athletics,5,7,Final,Chase Field,R


In [22]:
df[['away_team','game_date','game_date_time']].head(20)

Unnamed: 0,away_team,game_date,game_date_time
0,Toronto Blue Jays,2021-04-01,2021-04-01 17:05:00+00:00
1,Cleveland Indians,2021-04-01,2021-04-01 17:10:00+00:00
2,Minnesota Twins,2021-04-01,2021-04-01 18:10:00+00:00
3,Pittsburgh Pirates,2021-04-01,2021-04-01 18:20:00+00:00
4,Atlanta Braves,2021-04-01,2021-04-01 19:05:00+00:00
5,Los Angeles Dodgers,2021-04-01,2021-04-01 20:10:00+00:00
6,Arizona Diamondbacks,2021-04-01,2021-04-01 20:10:00+00:00
7,St. Louis Cardinals,2021-04-01,2021-04-01 20:10:00+00:00
8,Texas Rangers,2021-04-01,2021-04-01 20:10:00+00:00
9,Tampa Bay Rays,2021-04-01,2021-04-01 20:10:00+00:00
