#### Rolling Averages for Game Statistics

In this section, we define a function to compute rolling averages for various game statistics over a specified number of past games (`game_window`). This is useful for tracking team performance trends over time while smoothing out short-term fluctuations.

##### Methodology
1. **Load the Data:** We read game logs from a CSV file into a Pandas DataFrame.
2. **Preprocess Data:**
   - Convert the `date` column to a datetime format.
   - Sort the dataset by `team` and `date` to ensure chronological order.
3. **Calculate Rolling Averages:**
   - Blend a simple rolling mean over the last `game_window` games (30% weight) with an exponentially weighted moving average (EWMA) whose `span` equals `game_window` (70% weight).
   - Shift the rolling values by one game to ensure that each row only reflects past performance.
   - Retain the first game's original values to avoid NaNs in the output.
4. **Post-processing:**
   - Round all numerical values to two decimal places for readability.
   - Save the processed data to an output CSV file, overwriting any existing file if necessary.

This approach ensures that rolling averages are computed efficiently and can be easily used for further predictive modeling or analysis.

In [None]:
# Importing libraries
import math
import os
import pandas as pd
from dataclasses import dataclass
from tqdm import tqdm
from typing import Dict, List, Optional, Tuple


# Templating the configuration for the ELO rating process
@dataclass
class EloConfig:
    """Configuration for Elo calculation.

    Every field has a default, so ``EloConfig()`` yields a ready-to-use
    configuration; individual values can be overridden by keyword.
    """
    # Rating given to a team that has no rating yet; also the value ratings
    # regress toward at a season boundary.
    base_rating: float = 1500.0
    # Scale factor applied to every per-game rating change.
    k_factor: float = 20.0
    # Rating points added to the home team when computing the expected score.
    home_advantage: float = 0.0
    # Fraction of a team's rating kept when a new season starts; the remaining
    # fraction is replaced by ``base_rating``.
    carry_over: float = 0.75
    # When True, rating changes are scaled by a margin-of-victory multiplier.
    apply_mov_multiplier: bool = True
    # Lower bound applied to the absolute point margin of a game.
    min_margin: float = 1.0
    # Month (1-12) in which a new season starts.
    season_start_month: int = 10  # October by default for NBA


# Default game window size: the number of past games per team used both as
# the rolling-mean window and as the EWMA span. Passed to
# compute_rolling_averages by the script entry point at the bottom of the file.
game_window: int = 25


# Extract the season based on the month
def season_key(game_date: pd.Timestamp, season_start_month: int) -> int:
    """Map a game date to the calendar year in which its season started.

    Parameters
    ----------
    game_date : pandas.Timestamp
        Date of the game.
    season_start_month : int
        Month number in which a new season begins.

    Returns
    -------
    int
        ``game_date.year`` when the game falls in or after the start month,
        otherwise the previous year.
    """
    # Games before the start month still belong to the season that opened
    # in the previous calendar year.
    if game_date.month < season_start_month:
        return game_date.year - 1
    return game_date.year


# Function to load the results.csv file
def load_results(
    results_file: Optional[str],
) -> Optional[pd.DataFrame]:
    """Read the results CSV when a usable path is given.

    Parameters
    ----------
    results_file : str, optional
        Path to the results CSV. It must contain a ``date`` column (parsed
        as datetime) and a ``winning_team`` column (cast to ``int``).

    Returns
    -------
    pandas.DataFrame or None
        The loaded results, or ``None`` when no path is given or the file
        does not exist.
    """
    # Nothing to load: empty/None path or a file that is not on disk.
    if not results_file or not os.path.exists(results_file):
        return None

    loaded = pd.read_csv(results_file, parse_dates=["date"])
    loaded["winning_team"] = loaded["winning_team"].astype(int)
    return loaded


# Function to search if the result of the match is present in the already populated results file
def lookup_result(
    results_df: Optional[pd.DataFrame],
    game_date: pd.Timestamp,
    home_team: str,
    away_team: str,
) -> Optional[int]:
    """Find the recorded ``winning_team`` value for one game.

    Parameters
    ----------
    results_df : pandas.DataFrame, optional
        Results table with ``date``, ``home_team``, ``away_team`` and
        ``winning_team`` columns, or ``None`` when no results are available.
    game_date : pandas.Timestamp
        Date of the game to look up.
    home_team : str
        Home team identifier.
    away_team : str
        Away team identifier.

    Returns
    -------
    int or None
        The ``winning_team`` value of the first matching row (1 if home won,
        0 if away won), or ``None`` if there is no table or no matching row.
    """
    if results_df is None:
        return None

    same_date = results_df["date"] == game_date
    same_home = results_df["home_team"] == home_team
    same_away = results_df["away_team"] == away_team

    matches = results_df.loc[same_date & same_home & same_away, "winning_team"]
    if matches.empty:
        return None

    # Only the first matching row is used when several rows match.
    return int(matches.iloc[0])

# Given team game logs, a schedule and optional results, compute the pre-game Elo rating of both teams for every scheduled game with a known outcome
def compute_team_elos(
    gamelogs_df: pd.DataFrame,
    schedule_file: str,
    results_file: Optional[str] = None,
    config: Optional[EloConfig] = None,
) -> pd.DataFrame:
    """
    Compute Elo ratings per team for each played game.

    Games are processed in date order. For every game with a known outcome
    the pre-game rating of both teams is recorded, then both ratings are
    updated by the same amount in opposite directions. Home advantage,
    an optional margin-of-victory multiplier and a season-to-season
    carry-over are taken from the configuration.

    Parameters
    ----------
    gamelogs_df : pandas.DataFrame
        Game-level statistics for each team. Must contain `date`, `team`, `pts`.
    schedule_file : str
        CSV file with `date`, `home_team`, `away_team`.
    results_file : str, optional
        CSV file with `date`, `home_team`, `away_team`, `winning_team`.
        If missing or if a row is absent, outcomes are derived from points
        in `gamelogs_df`.
    config : EloConfig, optional
        Configuration for base rating, K-factor, home advantage, carry-over,
        and margin-of-victory multiplier. Defaults to ``EloConfig()``.

    Returns
    -------
    pandas.DataFrame
        Columns: `date`, `team`, `elo` (pre-game rating, rounded to two
        decimals). The three columns are always present, even when no game
        could be scored, so the frame can always be merged on
        `date` and `team`.

    Notes
    -----
    - Pre-game ratings are recorded before updating based on game outcomes.
    - Ratings regress toward the base rating between seasons using the
      carry-over factor.
    - The margin-of-victory multiplier shrinks as the absolute rating gap
      between the two teams grows.
    - Missing (NaN) points are treated as unknown: such a game gets no
      margin-of-victory scaling and its outcome must come from the results
      file, otherwise the game is skipped.
    - When the outcome is derived from points, equal points count as an
      away win.
    """

    cfg = config or EloConfig()

    games = pd.read_csv(schedule_file, parse_dates=["date"])
    games = games.sort_values("date")

    results_df = load_results(results_file)

    # Lookup table for points. NaN entries are dropped so that a missing
    # score reads as "unknown" (None) instead of leaking NaN into the margin
    # and, from there, into every rating that depends on this game.
    pts_lookup: Dict[Tuple[pd.Timestamp, str], float] = (
        gamelogs_df.assign(date=pd.to_datetime(gamelogs_df["date"]))
        .set_index(["date", "team"])["pts"]
        .astype(float)
        .dropna()
        .to_dict()
    )

    ratings: Dict[str, float] = {}
    records: List[Dict[str, object]] = []
    current_season: Optional[int] = None

    # Iterate the three needed columns directly; cheaper than iterrows and
    # yields the same Timestamp / team values.
    for game_date, home_team, away_team in zip(
        games["date"], games["home_team"], games["away_team"]
    ):
        season = season_key(game_date, cfg.season_start_month)
        if current_season is None:
            current_season = season
        elif season != current_season:
            # Apply carry-over regression at season boundary
            ratings = {
                team: cfg.carry_over * rating
                + (1 - cfg.carry_over) * cfg.base_rating
                for team, rating in ratings.items()
            }
            current_season = season

        home_rating: float = ratings.get(home_team, cfg.base_rating)
        away_rating: float = ratings.get(away_team, cfg.base_rating)

        home_pts = pts_lookup.get((game_date, home_team))
        away_pts = pts_lookup.get((game_date, away_team))
        have_points = home_pts is not None and away_pts is not None

        margin: Optional[float] = None
        if have_points:
            margin = max(abs(home_pts - away_pts), cfg.min_margin)

        # Prefer the explicit result; fall back to comparing points.
        actual_home: Optional[float] = lookup_result(
            results_df, game_date, home_team, away_team
        )
        if actual_home is None and have_points:
            actual_home = 1.0 if home_pts > away_pts else 0.0

        # Skip games without a known outcome
        if actual_home is None:
            continue

        # Standard logistic Elo expectation, with the home bonus applied
        # to the home side only.
        expected_home = 1 / (
            1
            + math.pow(
                10,
                ((away_rating - (home_rating + cfg.home_advantage)) / 400),
            )
        )

        mov_multiplier: float = 1.0
        if cfg.apply_mov_multiplier and margin is not None:
            mov_multiplier = math.log(margin + 1) * (
                2.2 / (abs(home_rating - away_rating) * 0.001 + 2.2)
            )

        delta = cfg.k_factor * mov_multiplier * (actual_home - expected_home)

        # Record pre-game ratings
        records.append(
            {"date": game_date, "team": home_team, "elo": round(home_rating, 2)}
        )
        records.append(
            {"date": game_date, "team": away_team, "elo": round(away_rating, 2)}
        )

        # Update ratings for next game (zero-sum between the two teams)
        ratings[home_team] = home_rating + delta
        ratings[away_team] = away_rating - delta

    # Always expose the documented columns; a bare DataFrame([]) would have
    # none and break a downstream merge on date/team.
    elo_history = pd.DataFrame(records, columns=["date", "team", "elo"])
    if elo_history.empty:
        # Give the empty frame the same key/value dtypes as a populated one.
        elo_history = elo_history.astype(
            {"date": "datetime64[ns]", "elo": "float64"}
        )
    return elo_history


# Pipeline function to execute the calculations
def compute_rolling_averages(
    game_window: int,
    gamelogs_file: str,
    output_file: str,
    schedule_file: str = "./csv/schedule.csv",
    results_file: str = "./csv/results.csv",
    elo_config: EloConfig | None = None,
) -> None:
    """
    Compute rolling and exponentially weighted averages for team game statistics.

    This pipeline function loads a game logs CSV file, sorts the data by team
    and date, performs feature engineering, and computes a combined rolling
    average for each numerical statistic on a per-team basis.

    The combined average is calculated using:
    - A simple rolling mean over a fixed game window
    - An exponential weighted moving average (EWMA) that emphasizes recent games

    The final value is a weighted combination of both methods, shifted by one
    game to prevent data leakage from the current match.

    Additionally, Elo ratings are computed for each team based on game outcomes
    and attached as an `elo` column that is never smoothed.

    Parameters
    ----------
    game_window : int
        Number of games to include in the rolling window; also used as the
        EWMA span. Must be at least 1.
    gamelogs_file : str
        Path to the input CSV file containing per-game team statistics.
        Must contain `date`, `team`, `pts`, `ast`, `tov` and `fg`.
    output_file : str
        Path where the computed rolling averages CSV will be saved.
    schedule_file : str, optional
        Path to a schedule CSV with `date`, `home_team`, `away_team`.
    results_file : str, optional
        Path to a results CSV with `date`, `home_team`, `away_team`,
        `winning_team`. If missing, results are inferred from game logs.
    elo_config : EloConfig, optional
        Configuration controlling K-factor, home advantage, carry-over,
        and margin-of-victory multiplier. Defaults to ``EloConfig()``.

    Returns
    -------
    None
        This function does not return a value. The processed data is written
        directly to the output CSV file.

    Raises
    ------
    FileNotFoundError
        If the input CSV file does not exist.
    ValueError
        If `game_window` is less than 1.
    KeyError
        If required statistical columns are missing from the dataset.
    OSError
        If the output file cannot be created or overwritten.

    Notes
    -----
    - All statistics are shifted by one game to avoid using information from
      the current game; the first game of each team keeps its observed values.
    - Feature engineering includes assist-to-turnover ratio and assist ratio.
      A game with zero turnovers has an undefined (NaN) assist-to-turnover
      ratio rather than an infinite one, so it cannot corrupt later averages.
    - Only numeric columns are averaged; non-numeric columns pass through
      unchanged.
    - The combined average weights are currently fixed at 30% rolling mean
      and 70% EWMA.
    - An `elo` column already present in the input is replaced by the freshly
      computed ratings.
    - Existing output files are removed before writing new results.
    """

    # Fail fast, before any file is read, on an unusable window size.
    if game_window < 1:
        raise ValueError(f"game_window must be at least 1, got {game_window!r}")

    cfg = elo_config or EloConfig()

    # Load the CSV file
    tqdm.write("Loading CSV file...")
    df: pd.DataFrame = pd.read_csv(gamelogs_file)
    tqdm.write(f"   Loaded {len(df)} game records")

    # Sort by team and date
    tqdm.write("   Sorting data by team and date...")
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values(by=["team", "date"])

    # Feature Engineering
    tqdm.write("   Engineering additional features...")
    # Mask zero turnovers: ast / 0 would give inf, and a single inf makes
    # every later rolling mean / EWMA value of that team inf or NaN.
    nonzero_tov = df["tov"].where(df["tov"] != 0)
    df["ast_tov"] = (df["ast"] / nonzero_tov).round(2)
    df["ast_ratio"] = (df["ast"] / (df["fg"] + df["ast"] + df["tov"])).round(2)

    # Preserve raw points for Elo calculation before smoothing
    elo_input = df[["date", "team", "pts"]].copy()

    # Identify stat columns: numeric columns that are not metadata.
    # Keep Elo separate so it never gets rolled/averaged.
    metadata_columns = {"date", "team", "elo"}
    stat_columns: list[str] = [
        col
        for col, dtype in df.dtypes.items()
        if col not in metadata_columns and pd.api.types.is_numeric_dtype(dtype)
    ]
    tqdm.write(f"   Processing {len(stat_columns)} statistical columns")

    # Get unique teams for progress tracking
    teams = df["team"].unique()
    tqdm.write(f"\nComputing rolling averages for {len(teams)} teams...\n")

    # Combining the 2 averages methods to obtain a more neutral overview
    def compute_combined_avg(group: pd.DataFrame) -> pd.DataFrame:
        """
        Compute a combined rolling and exponentially weighted average for a team.

        Parameters
        ----------
        group : pandas.DataFrame
            Game-by-game statistics for a single team, ordered
            chronologically.

        Returns
        -------
        pandas.DataFrame
            Same index and columns as the team's statistics. Each row holds
            0.3 * rolling mean + 0.7 * EWMA of the games *before* it; the
            first row holds the team's actual first-game values.

        Notes
        -----
        - `game_window` and `stat_columns` come from the enclosing scope.
        - This function is intended to be used with `groupby().apply()`.
        """
        stats = group[stat_columns]

        # Rolling average (simple mean), shifted so row i sees games < i
        rolling_mean: pd.DataFrame = (
            stats.rolling(window=game_window, min_periods=1).mean().shift(1)
        )

        # Exponential weighted mean (recent games weighted more)
        ewma: pd.DataFrame = (
            stats.ewm(span=game_window, adjust=False).mean().shift(1)
        )

        # Combined average: 30% rolling mean, 70% EWMA
        combined: pd.DataFrame = 0.3 * rolling_mean + 0.7 * ewma

        # Fill the first row with actual values to avoid NaNs from the shift
        combined.iloc[0] = group.iloc[0][stat_columns]
        return combined

    # Apply per team
    tqdm.write("Computing averages per team...")
    df[stat_columns] = df.groupby("team", group_keys=False, observed=True)[
        stat_columns
    ].apply(compute_combined_avg)

    # Round the results
    tqdm.write("\nRounding values...")
    df[stat_columns] = df[stat_columns].round(2)

    # Elo ratings
    tqdm.write("\nComputing Elo ratings...")
    elo_history = compute_team_elos(
        gamelogs_df=elo_input,
        schedule_file=schedule_file,
        results_file=results_file,
        config=cfg,
    )
    # An input `elo` column would make the merge emit elo_x / elo_y and leave
    # no `elo` column to post-process, so it is replaced by the new ratings.
    df = df.drop(columns="elo", errors="ignore")
    if elo_history.empty:
        # No game could be scored: every team is still at the base rating.
        df["elo"] = round(cfg.base_rating, 2)
    else:
        df = df.merge(elo_history, on=["date", "team"], how="left")
        # Games without an Elo record inherit the team's previous pre-game
        # rating; games before the first record get the base rating.
        df["elo"] = (
            df.groupby("team")["elo"]
            .ffill()
            .fillna(cfg.base_rating)
            .round(2)
        )

    # Save to CSV
    if os.path.exists(output_file):
        tqdm.write(f"File {output_file} already exists. Removing...")
        os.remove(output_file)

    tqdm.write(f"Saving results to {output_file}...")
    df.to_csv(output_file, index=False)
    tqdm.write("Rolling averages saved successfully!")
    tqdm.write(f"Output: {output_file}")


if __name__ == "__main__":
    # Script entry point: run the pipeline on the default CSV locations
    # using the module-level window size.
    gamelogs_path = "./csv/gamelogs.csv"
    averages_path = "./csv/averages.csv"

    print(f"Game window size: {game_window}\n")
    compute_rolling_averages(
        game_window=game_window,
        gamelogs_file=gamelogs_path,
        output_file=averages_path,
    )
    print("\nDone!\n")