In [2]:
import sys
import os

# Put the parent of the current working directory on sys.path
# (presumably so the project's `features` package imported in a later cell resolves).
sys.path.append(os.path.dirname(os.getcwd()))

import pandas as pd
from datetime import datetime, timedelta

# Disable pandas' row/column truncation so displayed frames are shown in full.
# NOTE(review): with max_rows=None, displaying a whole frame dumps every row; stick to .head()/.tail().
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
from features.load_games_from_db import load_games_from_db

In [4]:
# Load the games table from the database into a DataFrame (one row per game).
df = load_games_from_db()

2025-06-29 16:16:25,677 [INFO] Successully loaded 10641 games from database


There are 10641 games for  modeling...


In [5]:
# Preview the first rows of the loaded games.
df.head()

Unnamed: 0,game_id,game_date,game_date_time,home_team_id,away_team_id,home_team,away_team,home_score,away_score,state,venue,game_type
0,634642,2021-04-01,2021-04-01 17:05:00+00:00,147,141,New York Yankees,Toronto Blue Jays,2,3,Final,Yankee Stadium,R
1,634645,2021-04-01,2021-04-01 17:10:00+00:00,116,114,Detroit Tigers,Cleveland Indians,3,2,Final,Comerica Park,R
2,634638,2021-04-01,2021-04-01 18:10:00+00:00,158,142,Milwaukee Brewers,Minnesota Twins,6,5,Final,American Family Field,R
3,634634,2021-04-01,2021-04-01 18:20:00+00:00,112,134,Chicago Cubs,Pittsburgh Pirates,3,5,Final,Wrigley Field,R
4,634622,2021-04-01,2021-04-01 19:05:00+00:00,143,144,Philadelphia Phillies,Atlanta Braves,3,2,Final,Citizens Bank Park,R


In [6]:
# (rows, columns) of the games frame.
df.shape

(10641, 12)

In [7]:
# Column dtypes and non-null counts; note game_date is loaded as `object`, not datetime.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10641 entries, 0 to 10640
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   game_id         10641 non-null  int64              
 1   game_date       10641 non-null  object             
 2   game_date_time  10641 non-null  datetime64[ns, UTC]
 3   home_team_id    10641 non-null  int64              
 4   away_team_id    10641 non-null  int64              
 5   home_team       10641 non-null  object             
 6   away_team       10641 non-null  object             
 7   home_score      10641 non-null  int64              
 8   away_score      10641 non-null  int64              
 9   state           10641 non-null  object             
 10  venue           10641 non-null  object             
 11  game_type       10641 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(5), object(6)
memory usage: 997.7+ KB


In [8]:
# Convert game_date to datetime64: the feature functions below use the `.dt` accessor,
# date differences and time-based rolling windows on this column.
df['game_date'] = pd.to_datetime(df['game_date'])

In [9]:
# Collapse historical / alternate franchise names onto a single name so that each
# team's games are grouped under one label in the team-level features.
df[['home_team', 'away_team']] = df[['home_team', 'away_team']].replace({
    'Cleveland Indians': 'Cleveland Guardians',
    'Athletics': 'Oakland Athletics',
})

In [14]:
def team_schedule(
    df: pd.DataFrame, 
    date_col: str = 'game_date', 
    date_time_col: str = 'game_date_time'
) -> pd.DataFrame:
    """
    Transform a games DataFrame into a team-centric schedule view.

    Each input row (one game, home vs. away) becomes two output rows: one from the
    home team's point of view and one from the away team's point of view.

    Args:
        df (pd.DataFrame): DataFrame containing game data with columns:
            - 'home_team': Name of the home team
            - 'away_team': Name of the away team
            - 'home_score': Runs scored by the home team
            - 'away_score': Runs scored by the away team
            - date_col: Column containing game dates
            - date_time_col: Column containing game start datetimes
        date_col (str, optional): Name of the date column. Defaults to 'game_date'.
        date_time_col (str, optional): Name of the datetime column. Defaults to 'game_date_time'.

    Returns:
        pd.DataFrame: Team schedule with a fresh 0..n-1 index and columns:
            - 'team': Team name
            - date_col: Game date (same as input)
            - date_time_col: Game datetime (same as input)
            - 'team_score': Runs scored by this team
            - 'opp_score': Runs scored by the opponent
            - 'home_ind': Binary indicator (1 if home game, 0 if away game)
            - 'team_win': 1 if team_score > opp_score, else 0
            - 'team_run_diff': team_score - opp_score
            - 'season': Calendar year of date_col (int64), used as a grouping key

        The DataFrame is sorted by team name and then by game datetime.
    """
    shared_cols = [date_col, date_time_col, 'home_score', 'away_score']

    # Home perspective: the home score is the team's own score
    home_games = (
        df[['home_team'] + shared_cols]
        .rename(columns={
            'home_team': 'team',
            'home_score': 'team_score',
            'away_score': 'opp_score'
        })
        .assign(home_ind=1)
    )

    # Away perspective: the score columns swap roles
    away_games = (
        df[['away_team'] + shared_cols]
        .rename(columns={
            'away_team': 'team',
            'home_score': 'opp_score',
            'away_score': 'team_score'
        })
        .assign(home_ind=0)
    )

    # Stack both perspectives; sorting on the datetime (not the date) keeps the two
    # games of a doubleheader in their actual playing order
    schedule = (
        pd.concat([home_games, away_games])
        .sort_values(['team', date_time_col])
        .reset_index(drop=True)
    )

    schedule['team_win'] = (schedule['team_score'] > schedule['opp_score']).astype(int)
    schedule['team_run_diff'] = schedule['team_score'] - schedule['opp_score']

    # Season key comes from the caller-supplied date column (previously hard-coded to
    # 'game_date', which raised KeyError for any other date_col). to_datetime is a no-op
    # for datetime64 input and lets string/object dates work here as well.
    schedule['season'] = pd.to_datetime(schedule[date_col]).dt.year.astype('int64')

    return schedule

def team_rest_days(
        df: pd.DataFrame,
        date_time_col: str = 'game_date_time',
        date_col: str = 'game_date'
        ) -> pd.Series:
    """
    Compute the number of full rest days each team had before each of its games.

    The games frame is first reshaped with ``team_schedule`` (one row per team per
    game, ordered by team and game datetime). Within each (team, season) group the
    calendar-day gap to the previous game is taken and reduced by one, so games on
    consecutive days give 0 rest days and a single off day gives 1.

    Parameters
    ----------
    df : pd.DataFrame
        Games DataFrame in home/away format, as accepted by ``team_schedule``.
    date_time_col : str, default 'game_date_time'
        Datetime column used by ``team_schedule`` to order each team's games
        (needed to order the games of a doubleheader).
    date_col : str, default 'game_date'
        Date column whose day-to-day differences define the rest days.

    Returns
    -------
    pd.Series
        Rest days indexed like the rows of ``team_schedule(df)`` (not like ``df``).
        A team's first game of a season and the second game of a same-day
        doubleheader both yield 0; the result is never negative.
    """
    schedule = team_schedule(df, date_time_col=date_time_col, date_col=date_col)

    # Calendar days since the same team's previous game in the same season
    # (NaN for the first game of each group)
    day_gaps = schedule.groupby(['team', 'season'])[date_col].diff().dt.days

    # Gap of N days means N-1 days off; first games (NaN -> 0) and same-day games
    # would go negative, so floor at zero
    rest = day_gaps.fillna(value=0) - 1
    return rest.clip(lower=0)

def team_games_previous_7days(
        df: pd.DataFrame, 
        date_col: str = 'game_date',
        date_time_col: str = 'game_date_time'
        ) -> pd.Series:
    """
    Count how many games each team played in the 7 days before each of its games.

    The games frame is reshaped with ``team_schedule`` and, per team, a time-based
    rolling window of 7 days over ``date_col`` is counted. The window is closed on
    the left only, so the current game's own date is excluded from the count.

    Parameters
    ----------
    df : pd.DataFrame
        Games DataFrame in home/away format, as accepted by ``team_schedule``.
    date_col : str, default 'game_date'
        Datetime-typed date column the 7-day window is measured on.
    date_time_col : str, default 'game_date_time'
        Datetime column used to order each team's games; also the column whose
        non-null entries are counted inside the window.

    Returns
    -------
    pd.Series
        Game counts (float) indexed like the rows of ``team_schedule(df)``
        (not like ``df``). Games with no earlier game inside the window get 0.
    """
    df_team_sched = team_schedule(df, date_time_col=date_time_col, date_col=date_col)
    
    return (
        df_team_sched
        .groupby(['team'])
        .rolling(window='7D', on=date_col, min_periods=1, closed='left')
        # Count the caller-supplied datetime column (was hard-coded to 'game_date_time',
        # which raised KeyError for any other date_time_col)
        .count()[date_time_col]
        # An empty look-back window yields NaN; report it as zero games
        .fillna(value=0)
        # Drop the 'team' group level so the index lines up with the schedule rows again
        .reset_index(drop=True, level=0)
    )
def merge_team_features_into_games(
    df_games: pd.DataFrame,
    df_team_schedule: pd.DataFrame,
    team_schedule_cols: list[str],
    date_time_col: str = 'game_date_time',
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Attach team-level features to a games frame, once for the home team and once
    for the away team.

    For each side, ``df_team_schedule`` is left-joined onto ``df_games`` on
    (team name, game datetime) and every feature column is prefixed with
    ``home_`` or ``away_``.

    Parameters
    ----------
    df_games : pd.DataFrame
        Games frame with 'home_team', 'away_team' and ``date_time_col`` columns.
        It is not modified.
    df_team_schedule : pd.DataFrame
        Team-centric frame with a 'team' column, ``date_time_col`` and the
        feature columns.
    team_schedule_cols : list[str]
        Columns of ``df_team_schedule`` to bring over. The join keys ('team' and
        ``date_time_col``) may be listed or omitted; they are always used for the
        join and never emitted as features.
    date_time_col : str, default 'game_date_time'
        Datetime column present in both frames, used as the second join key.
    verbose : bool, default True
        When True, print the frame shape and show its first 3 rows before the
        merges and after each one.

    Returns
    -------
    pd.DataFrame
        A new frame: ``df_games`` plus ``home_<col>`` and ``away_<col>`` for every
        feature column.
    """
    if verbose:
        # Use the notebook's rich display when IPython is available, plain print
        # otherwise. (The previous print(display(...)) also printed a stray "None".)
        try:
            from IPython.display import display as show
        except ImportError:
            show = print

    def _report(label: str, frame: pd.DataFrame) -> None:
        # Progress output: shape plus a three-row preview
        if verbose:
            print(f"{label}: {frame.shape}")
            show(frame.head(3))

    join_keys = ['team', date_time_col]
    # Everything that is not a join key is a feature to prefix and keep
    feature_cols = [col for col in team_schedule_cols if col not in join_keys]
    team_features = df_team_schedule[join_keys + feature_cols]

    _report("df_games", df_games)

    for side, label in (('home', 'Home'), ('away', 'Away')):
        df_games = (
            df_games
            .merge(
                team_features,
                how='left',
                left_on=[f'{side}_team', date_time_col],
                right_on=join_keys
            )
            # 'team' duplicates the '<side>_team' column after the join
            .drop(columns='team')
            .rename(columns={col: f'{side}_{col}' for col in feature_cols})
        )
        _report(f"df_games AFTER {label} Merge", df_games)

    return df_games

def get_sched_features(df: pd.DataFrame, date_col: str = 'game_date', date_time_col: str = 'game_date_time') -> pd.DataFrame:
    """
    Add schedule-based features for the home and away team to each game.

    Builds the team-centric schedule, computes per-team rest days and the number
    of games played in the previous 7 days, then merges both back onto the games
    frame once per side.

    Parameters
    ----------
    df : pd.DataFrame
        Games DataFrame in home/away format, as accepted by ``team_schedule``.
    date_col : str, default 'game_date'
        Date column name, forwarded to every helper.
    date_time_col : str, default 'game_date_time'
        Datetime column name, forwarded to every helper and used as merge key.

    Returns
    -------
    pd.DataFrame
        ``df`` plus 'home_team_rest_days', 'home_team_games_prev_7days',
        'away_team_rest_days' and 'away_team_games_prev_7days'.
    """
    # Team-centric schedule that the per-team features are attached to
    df_team_sched = team_schedule(df, date_col=date_col, date_time_col=date_time_col)

    # Both helpers rebuild the same schedule internally, so their results share its
    # row index. The column names must be forwarded: previously the helpers silently
    # fell back to their defaults when the caller used non-default names.
    df_team_sched['team_rest_days'] = team_rest_days(
        df, date_time_col=date_time_col, date_col=date_col
    )
    df_team_sched['team_games_prev_7days'] = team_games_previous_7days(
        df, date_col=date_col, date_time_col=date_time_col
    )

    # date_time_col is deliberately not listed here; the merge helper adds it as a key
    team_sched_cols = ['team', 'team_rest_days', 'team_games_prev_7days']
    return merge_team_features_into_games(
        df_games=df,
        df_team_schedule=df_team_sched,
        team_schedule_cols=team_sched_cols,
        date_time_col=date_time_col
    )

In [10]:
# Scratch check of the rename mapping the merge helper builds for the away side
team_schedule_cols = [
    'team',
    'team_rest_days',
    'team_games_prev_7days',
]
away_cols_rename = dict(zip(team_schedule_cols, map('away_{}'.format, team_schedule_cols)))
away_cols_rename

{'team': 'away_team',
 'team_rest_days': 'away_team_rest_days',
 'team_games_prev_7days': 'away_team_games_prev_7days'}

In [11]:
# Team-centric schedule: every game appears twice, once per participating team.
dts = team_schedule(df)

In [12]:
# Preview the team schedule (sorted by team, then game datetime).
dts.head()

Unnamed: 0,team,game_date,game_date_time,team_score,opp_score,home_ind,team_win,team_run_diff,season
0,Arizona Diamondbacks,2021-04-01,2021-04-01 20:10:00+00:00,7,8,0,0,-1,2021
1,Arizona Diamondbacks,2021-04-02,2021-04-03 02:10:00+00:00,2,4,0,0,-2,2021
2,Arizona Diamondbacks,2021-04-03,2021-04-04 00:40:00+00:00,0,7,0,0,-7,2021
3,Arizona Diamondbacks,2021-04-04,2021-04-04 20:10:00+00:00,3,1,0,1,2,2021
4,Arizona Diamondbacks,2021-04-06,2021-04-07 00:40:00+00:00,10,8,0,1,2,2021


In [15]:
# Games plus home/away rest-day and games-in-previous-7-days features.
# The shapes and previews printed below come from the merge helper's progress output.
df_games = get_sched_features(df)

df_games: (10641, 12)


Unnamed: 0,game_id,game_date,game_date_time,home_team_id,away_team_id,home_team,away_team,home_score,away_score,state,venue,game_type
0,634642,2021-04-01,2021-04-01 17:05:00+00:00,147,141,New York Yankees,Toronto Blue Jays,2,3,Final,Yankee Stadium,R
1,634645,2021-04-01,2021-04-01 17:10:00+00:00,116,114,Detroit Tigers,Cleveland Guardians,3,2,Final,Comerica Park,R
2,634638,2021-04-01,2021-04-01 18:10:00+00:00,158,142,Milwaukee Brewers,Minnesota Twins,6,5,Final,American Family Field,R


None
df_games AFTER Home Merge: (10641, 14)


Unnamed: 0,game_id,game_date,game_date_time,home_team_id,away_team_id,home_team,away_team,home_score,away_score,state,venue,game_type,home_team_rest_days,home_team_games_prev_7days
0,634642,2021-04-01,2021-04-01 17:05:00+00:00,147,141,New York Yankees,Toronto Blue Jays,2,3,Final,Yankee Stadium,R,0.0,0.0
1,634645,2021-04-01,2021-04-01 17:10:00+00:00,116,114,Detroit Tigers,Cleveland Guardians,3,2,Final,Comerica Park,R,0.0,0.0
2,634638,2021-04-01,2021-04-01 18:10:00+00:00,158,142,Milwaukee Brewers,Minnesota Twins,6,5,Final,American Family Field,R,0.0,0.0


None
df_games AFTER Away Merge: (10641, 16)


Unnamed: 0,game_id,game_date,game_date_time,home_team_id,away_team_id,home_team,away_team,home_score,away_score,state,venue,game_type,home_team_rest_days,home_team_games_prev_7days,away_team_rest_days,away_team_games_prev_7days
0,634642,2021-04-01,2021-04-01 17:05:00+00:00,147,141,New York Yankees,Toronto Blue Jays,2,3,Final,Yankee Stadium,R,0.0,0.0,0.0,0.0
1,634645,2021-04-01,2021-04-01 17:10:00+00:00,116,114,Detroit Tigers,Cleveland Guardians,3,2,Final,Comerica Park,R,0.0,0.0,0.0,0.0
2,634638,2021-04-01,2021-04-01 18:10:00+00:00,158,142,Milwaukee Brewers,Minnesota Twins,6,5,Final,American Family Field,R,0.0,0.0,0.0,0.0


None


In [79]:
# Preview the games frame with the four schedule feature columns appended.
df_games.head()

Unnamed: 0,game_id,game_date,game_date_time,home_team_id,away_team_id,home_team,away_team,home_score,away_score,state,venue,game_type,home_team_rest_days,home_team_games_prev_7days,away_team_rest_days,away_team_games_prev_7days
0,634642,2021-04-01,2021-04-01 17:05:00+00:00,147,141,New York Yankees,Toronto Blue Jays,2,3,Final,Yankee Stadium,R,0.0,0.0,0.0,0.0
1,634645,2021-04-01,2021-04-01 17:10:00+00:00,116,114,Detroit Tigers,Cleveland Guardians,3,2,Final,Comerica Park,R,0.0,0.0,0.0,0.0
2,634638,2021-04-01,2021-04-01 18:10:00+00:00,158,142,Milwaukee Brewers,Minnesota Twins,6,5,Final,American Family Field,R,0.0,0.0,0.0,0.0
3,634634,2021-04-01,2021-04-01 18:20:00+00:00,112,134,Chicago Cubs,Pittsburgh Pirates,3,5,Final,Wrigley Field,R,0.0,0.0,0.0,0.0
4,634622,2021-04-01,2021-04-01 19:05:00+00:00,143,144,Philadelphia Phillies,Atlanta Braves,3,2,Final,Citizens Bank Park,R,0.0,0.0,0.0,0.0


In [36]:
# Identifying columns plus the four schedule features, for compact inspection
cols = [
    'game_date',
    'game_date_time',
    'home_team',
    'away_team',
    'home_team_rest_days',
    'home_team_games_prev_7days',
    'away_team_rest_days',
    'away_team_games_prev_7days',
]
df_games.loc[:, cols].head(10)

Unnamed: 0,game_date,game_date_time,home_team,away_team,home_team_rest_days,home_team_games_prev_7days,away_team_rest_days,away_team_games_prev_7days
0,2021-04-01,2021-04-01 17:05:00+00:00,New York Yankees,Toronto Blue Jays,0.0,0.0,0.0,0.0
1,2021-04-01,2021-04-01 17:10:00+00:00,Detroit Tigers,Cleveland Guardians,0.0,0.0,0.0,0.0
2,2021-04-01,2021-04-01 18:10:00+00:00,Milwaukee Brewers,Minnesota Twins,0.0,0.0,0.0,0.0
3,2021-04-01,2021-04-01 18:20:00+00:00,Chicago Cubs,Pittsburgh Pirates,0.0,0.0,0.0,0.0
4,2021-04-01,2021-04-01 19:05:00+00:00,Philadelphia Phillies,Atlanta Braves,0.0,0.0,0.0,0.0
5,2021-04-01,2021-04-01 20:10:00+00:00,Colorado Rockies,Los Angeles Dodgers,0.0,0.0,0.0,0.0
6,2021-04-01,2021-04-01 20:10:00+00:00,San Diego Padres,Arizona Diamondbacks,0.0,0.0,0.0,0.0
7,2021-04-01,2021-04-01 20:10:00+00:00,Cincinnati Reds,St. Louis Cardinals,0.0,0.0,0.0,0.0
8,2021-04-01,2021-04-01 20:10:00+00:00,Kansas City Royals,Texas Rangers,0.0,0.0,0.0,0.0
9,2021-04-01,2021-04-01 20:10:00+00:00,Miami Marlins,Tampa Bay Rays,0.0,0.0,0.0,0.0


In [39]:
# Spot-check one team across a season boundary: the last 20 of the Yankees'
# first 170 games, ordered by date
(
    df_games
    .query("home_team=='New York Yankees' | away_team=='New York Yankees'")
    [cols]
    .sort_values('game_date')
    .head(170)
    .tail(20)
)

Unnamed: 0,game_date,game_date_time,home_team,away_team,home_team_rest_days,home_team_games_prev_7days,away_team_rest_days,away_team_games_prev_7days
2266,2021-09-26,2021-09-26 23:08:00+00:00,Boston Red Sox,New York Yankees,0.0,5.0,0.0,6.0
2272,2021-09-28,2021-09-28 23:07:00+00:00,Toronto Blue Jays,New York Yankees,1.0,6.0,1.0,5.0
2287,2021-09-29,2021-09-29 23:07:00+00:00,Toronto Blue Jays,New York Yankees,0.0,6.0,0.0,5.0
2303,2021-09-30,2021-09-30 23:07:00+00:00,Toronto Blue Jays,New York Yankees,0.0,6.0,0.0,5.0
2313,2021-10-01,2021-10-01 23:05:00+00:00,New York Yankees,Tampa Bay Rays,0.0,6.0,0.0,6.0
2326,2021-10-02,2021-10-02 17:05:00+00:00,New York Yankees,Tampa Bay Rays,0.0,6.0,0.0,6.0
2345,2021-10-03,2021-10-03 19:05:00+00:00,New York Yankees,Tampa Bay Rays,0.0,6.0,0.0,6.0
2381,2022-04-09,2022-04-09 20:05:00+00:00,New York Yankees,Boston Red Sox,0.0,0.0,0.0,0.0
2402,2022-04-10,2022-04-10 23:08:00+00:00,New York Yankees,Boston Red Sox,0.0,1.0,0.0,1.0
2409,2022-04-11,2022-04-11 23:05:00+00:00,New York Yankees,Toronto Blue Jays,0.0,2.0,0.0,3.0


In [65]:
# Rebuild the team schedule used by the rolling-window experiments below.
dts = team_schedule(df)

In [66]:
# Win rate over each team's previous 10 games within a season.
# Dropping only index level 1 (season) leaves the team name in the result's index.
(
    dts
    .groupby(['team', 'season'])['team_win']
    .rolling(window=10, min_periods=1, closed='left')
    .mean()
    .fillna(value=0)
    .reset_index(level=1, drop=True)
    .head()
)

team                   
Arizona Diamondbacks  0    0.00
                      1    0.00
                      2    0.00
                      3    0.00
                      4    0.25
Name: team_win, dtype: float64

In [67]:
# Same rolling win rate, but with the whole index discarded and replaced by 0..n-1
(
    dts
    .groupby(['team', 'season'])['team_win']
    .rolling(window=10, min_periods=1, closed='left')
    .mean()
    .fillna(value=0)
    .reset_index(drop=True)
    .head()
)

0    0.00
1    0.00
2    0.00
3    0.00
4    0.25
Name: team_win, dtype: float64

In [68]:
def _team_rolling_mean_prev_games(
    df: pd.DataFrame,
    value_col: str,
    date_col: str = 'game_date',
    date_time_col: str = 'game_date_time',
    window: int = 10
) -> pd.Series:
    """
    Mean of ``value_col`` over each team's previous ``window`` games in a season.

    The window is closed on the left only, so the current game never contributes
    to its own feature. A team's first game of a season has no history and gets 0.

    Returns a Series indexed like the rows of ``team_schedule(df)``, so it can be
    assigned straight onto that frame.
    """
    df_team_sched = team_schedule(df, date_time_col=date_time_col, date_col=date_col)
    return (
        df_team_sched
        .groupby(['team', 'season'])[value_col]
        .rolling(window=window, min_periods=1, closed='left')
        .mean()
        .fillna(value=0)
        # Drop only the two group-key levels so each value keeps its schedule row
        # label. A blanket reset_index(drop=True) renumbers rows in group order and
        # only lines up while group order happens to equal row order.
        .reset_index(level=[0, 1], drop=True)
        .sort_index()
    )


def team_win_rate_last_10(
    df: pd.DataFrame,
    date_col: str = 'game_date',
    date_time_col: str = 'game_date_time'
) -> pd.Series:
    """
    Win rate over each team's previous 10 games of the season.

    Returns a Series aligned with the rows of ``team_schedule(df)``; 0 when the
    team has no earlier game in that season.
    """
    return _team_rolling_mean_prev_games(
        df, 'team_win', date_col=date_col, date_time_col=date_time_col
    )


def team_avg_run_diff_last_10(
        df: pd.DataFrame,
        date_col: str = 'game_date',
        date_time_col: str = 'game_date_time'
) -> pd.Series:
    """
    Average run differential (team_score - opp_score) over each team's previous
    10 games of the season.

    Returns a Series aligned with the rows of ``team_schedule(df)``; 0 when the
    team has no earlier game in that season.
    """
    return _team_rolling_mean_prev_games(
        df, 'team_run_diff', date_col=date_col, date_time_col=date_time_col
    )


def team_avg_runs_score_last_10(
        df: pd.DataFrame,
        date_col: str = 'game_date',
        date_time_col: str = 'game_date_time'
) -> pd.Series:
    """
    Average runs scored over each team's previous 10 games of the season.

    Returns a Series aligned with the rows of ``team_schedule(df)``; 0 when the
    team has no earlier game in that season.
    """
    return _team_rolling_mean_prev_games(
        df, 'team_score', date_col=date_col, date_time_col=date_time_col
    )


def team_avg_runs_allowed_last_10(
        df: pd.DataFrame,
        date_col: str = 'game_date',
        date_time_col: str = 'game_date_time'
) -> pd.Series:
    """
    Average runs allowed (opponent's score) over each team's previous 10 games of
    the season.

    Returns a Series aligned with the rows of ``team_schedule(df)``; 0 when the
    team has no earlier game in that season.
    """
    return _team_rolling_mean_prev_games(
        df, 'opp_score', date_col=date_col, date_time_col=date_time_col
    )

def get_outcome_features(
        df: pd.DataFrame,
        date_col: str = 'game_date',
        date_time_col: str = 'game_date_time'
):
    """
    Add recent-performance features for the home and away team to each game.

    For every team and game, four "previous 10 games" statistics are computed on
    the team-centric schedule (win rate, average run differential, average runs
    scored, average runs allowed) and then merged back onto the games frame once
    per side, prefixed with ``home_`` / ``away_``.

    Parameters
    ----------
    df : pd.DataFrame
        Games DataFrame in home/away format, as accepted by ``team_schedule``.
    date_col : str, default 'game_date'
        Date column name, forwarded to every helper.
    date_time_col : str, default 'game_date_time'
        Datetime column name, forwarded to every helper and used as merge key.

    Returns
    -------
    pd.DataFrame
        ``df`` plus ``home_*`` and ``away_*`` versions of 'win_rate_last_10',
        'avg_run_diff_last_10', 'avg_runs_scored_last_10' and
        'avg_runs_allowed_last_10'.
    """
    schedule = team_schedule(df, date_time_col=date_time_col, date_col=date_col)

    # Feature column -> function that computes it; insertion order fixes column order
    feature_builders = {
        'win_rate_last_10': team_win_rate_last_10,
        'avg_run_diff_last_10': team_avg_run_diff_last_10,
        'avg_runs_scored_last_10': team_avg_runs_score_last_10,
        'avg_runs_allowed_last_10': team_avg_runs_allowed_last_10,
    }
    for feature_name, build_feature in feature_builders.items():
        schedule[feature_name] = build_feature(df, date_time_col=date_time_col, date_col=date_col)

    merge_cols = ['team', *feature_builders]
    return merge_team_features_into_games(
        df_games=df,
        df_team_schedule=schedule,
        team_schedule_cols=merge_cols,
        date_time_col=date_time_col
    )

In [69]:
# Games plus home/away "previous 10 games" performance features.
df_outcomes = get_outcome_features(df)

In [73]:
# Columns for eyeballing the rolling win rate and runs-scored features
cols = [
    'game_date',
    'home_team',
    'away_team',
    'home_win_rate_last_10',
    'away_win_rate_last_10',
    'home_avg_runs_scored_last_10',
    'away_avg_runs_scored_last_10',
]
(
    df_outcomes
    .query("home_team=='New York Yankees' | away_team=='New York Yankees'")
    [cols]
    .head(10)
)

Unnamed: 0,game_date,home_team,away_team,home_win_rate_last_10,away_win_rate_last_10,home_avg_runs_scored_last_10,away_avg_runs_scored_last_10
0,2021-04-01,New York Yankees,Toronto Blue Jays,0.0,0.0,0.0,0.0
19,2021-04-03,New York Yankees,Toronto Blue Jays,0.0,1.0,2.0,3.0
34,2021-04-04,New York Yankees,Toronto Blue Jays,0.5,0.5,3.5,3.0
48,2021-04-05,New York Yankees,Baltimore Orioles,0.333333,1.0,2.666667,7.5
61,2021-04-06,New York Yankees,Baltimore Orioles,0.5,0.666667,3.75,5.0
84,2021-04-07,New York Yankees,Baltimore Orioles,0.6,0.5,4.4,4.25
95,2021-04-09,Tampa Bay Rays,New York Yankees,0.333333,0.5,3.833333,4.166667
105,2021-04-10,Tampa Bay Rays,New York Yankees,0.428571,0.428571,4.714286,4.285714
122,2021-04-11,Tampa Bay Rays,New York Yankees,0.5,0.375,4.625,3.75
132,2021-04-12,Toronto Blue Jays,New York Yankees,0.444444,0.444444,4.555556,4.222222


In [75]:
# Keep the Yankees-only slice (home or away) of the outcome features for further checks.
df_nyy = df_outcomes.query("home_team=='New York Yankees' | away_team=='New York Yankees'")[cols]

In [76]:
# Last 20 of the first 170 Yankees games; presumably chosen to straddle the 2021 -> 2022
# boundary and confirm the rolling features restart at 0 in the new season.
df_nyy.head(170).tail(20)

Unnamed: 0,game_date,home_team,away_team,home_win_rate_last_10,away_win_rate_last_10,home_avg_runs_scored_last_10,away_avg_runs_scored_last_10
2266,2021-09-26,Boston Red Sox,New York Yankees,0.7,0.7,6.9,4.9
2272,2021-09-28,Toronto Blue Jays,New York Yankees,0.5,0.7,3.7,5.1
2287,2021-09-29,Toronto Blue Jays,New York Yankees,0.5,0.8,3.6,5.6
2303,2021-09-30,Toronto Blue Jays,New York Yankees,0.5,0.7,3.6,5.3
2313,2021-10-01,New York Yankees,Tampa Bay Rays,0.8,0.6,5.6,4.5
2326,2021-10-02,New York Yankees,Tampa Bay Rays,0.8,0.7,5.8,4.9
2345,2021-10-03,New York Yankees,Tampa Bay Rays,0.7,0.7,5.6,5.5
2381,2022-04-09,New York Yankees,Boston Red Sox,0.0,0.0,0.0,0.0
2402,2022-04-10,New York Yankees,Boston Red Sox,1.0,0.0,4.0,2.0
2409,2022-04-11,New York Yankees,Toronto Blue Jays,0.5,0.666667,3.5,6.666667


In [23]:
# First Yankees games of 2022.
# NOTE(review): the saved output of this cell shows home_score/away_score columns that the
# current `cols` selection does not contain, and non-zero rolling values on the season's
# first game, so it appears to predate the season grouping. Re-run to refresh.
df_nyy.loc[df_nyy['game_date'].dt.year == 2022].head()

Unnamed: 0,game_date,home_team,away_team,home_score,away_score,home_avg_runs_scored_last_10,away_avg_runs_scored_last_10
2381,2022-04-09,New York Yankees,Boston Red Sox,4,2,5.0,4.7
2402,2022-04-10,New York Yankees,Boston Red Sox,3,4,4.7,3.7
2409,2022-04-11,New York Yankees,Toronto Blue Jays,0,3,4.2,6.3
2420,2022-04-12,New York Yankees,Toronto Blue Jays,4,0,3.7,6.1
2438,2022-04-13,New York Yankees,Toronto Blue Jays,4,6,3.5,5.9


In [28]:
# Earliest 2022 games league-wide, ordered by start time
(
    df
    .loc[df['game_date'].dt.year == 2022]
    .sort_values('game_date_time')
    .head()
)

Unnamed: 0,game_id,game_date,game_date_time,home_team_id,away_team_id,home_team,away_team,home_score,away_score,state,venue,game_type
2356,663178,2022-04-07,2022-04-07 18:20:00+00:00,112,158,Chicago Cubs,Milwaukee Brewers,5,4,Final,Wrigley Field,R
2357,662766,2022-04-07,2022-04-07 20:10:00+00:00,118,114,Kansas City Royals,Cleveland Guardians,3,1,Final,Kauffman Stadium,R
2358,662021,2022-04-07,2022-04-07 20:15:00+00:00,138,134,St. Louis Cardinals,Pittsburgh Pirates,9,0,Final,Busch Stadium,R
2359,662571,2022-04-07,2022-04-07 23:05:00+00:00,120,121,Washington Nationals,New York Mets,1,5,Final,Nationals Park,R
2360,661577,2022-04-07,2022-04-08 00:08:00+00:00,144,113,Atlanta Braves,Cincinnati Reds,3,6,Final,Truist Park,R


In [None]:
def create_game_features(df: pd.DataFrame, date_col: str = 'game_date',date_time_col: str = 'game_date_time') -> pd.DataFrame:
    """
    Create comprehensive game features including schedule, outcome, and temporal features.

    This is the main feature engineering function that generates multiple categories of 
    features. It combines schedule-based features (rest days, game frequency), 
    outcome-based features (recent performance metrics), and temporal features.
    
    Parameters
    ----------
    df : pd.DataFrame
        Input games DataFrame.
    date_col : str, default 'game_date'
        Column name containing date information.
    date_time_col : str, default 'game_date_time'
        Column name containing datetime information.
        
    Returns
    -------
    pd.DataFrame
        Original games DataFrame with the following new columns added:
        
        Schedule Features:
        - 'home_team_rest_days': Days of rest for home team before each game
        - 'away_team_rest_days': Days of rest for away team before each game
        - 'home_team_games_prev_7days': Games played by home team in previous 7 days
        - 'away_team_games_prev_7days': Games played by away team in previous 7 days
        - 'home_back2back': Binary indicator (1 if home team has ≤1 rest day)
        - 'away_back2back': Binary indicator (1 if away team has ≤1 rest day)
        - 'rest_difference': Home rest days minus away rest days
        
        Outcome Features (previous 10 games of the season for each team):
        - 'home_win_rate_last_10': Home team win rate
        - 'away_win_rate_last_10': Away team win rate
        - 'home_avg_run_diff_last_10': Home team average run differential
        - 'away_avg_run_diff_last_10': Away team average run differential
        - 'home_avg_runs_scored_last_10': Home team average runs scored
        - 'away_avg_runs_scored_last_10': Away team average runs scored
        - 'home_avg_runs_allowed_last_10': Home team average runs allowed
        - 'away_avg_runs_allowed_last_10': Away team average runs allowed
        
        Temporal Features:
        - Whatever ``get_date_time_features`` adds. That function is not defined in
          the visible part of this notebook; the intended columns are presumably
          'game_month', 'game_day_of_week' and 'game_hour' (TODO confirm).
    """
    # Schedule Features
    # The notebook defines this helper as `get_sched_features`; the previous call to
    # `get_schedule_features` referenced a name that is not defined here.
    df_games = get_sched_features(df, date_col=date_col, date_time_col=date_time_col)

    # Flag teams with at most one rest day before the game
    df_games['home_back2back'] = (df_games['home_team_rest_days'] <= 1).astype(int)
    df_games['away_back2back'] = (df_games['away_team_rest_days'] <= 1).astype(int)

    # Positive values mean the home team is the better-rested side
    df_games['rest_difference'] = df_games['home_team_rest_days'] - df_games['away_team_rest_days']

    # Outcome Features
    df_games = get_outcome_features(df_games, date_col=date_col, date_time_col=date_time_col)

    # Date-Time Features
    # NOTE(review): get_date_time_features must be defined or imported before this runs
    df_games = get_date_time_features(df_games)

    return df_games