# Super Bowl LX Prediction Notebook (Patriots vs. Seahawks)

This notebook builds a **time-aware** machine learning workflow to predict the outcome of the Patriots vs. Seahawks Super Bowl (neutral site, San Francisco). It:

1. Pulls and aggregates 2025 regular + postseason NFL data.
2. Computes **ELO** ratings and advanced team features.
3. Visualizes correlations and exploratory statistics.
4. Adjusts for **key injuries**.
5. Trains an **XGBoost classifier** with a time-series split.
6. Explains the model with feature importance.
7. Trains a second model to predict **player receiving yards**.

> **Note:** Run the notebook top-to-bottom to reproduce the analysis.


In [None]:
# If running in a clean environment, install dependencies
# (You can comment these out if you already have them.)
!pip -q install nfl_data_py xgboost scikit-learn seaborn matplotlib pandas numpy

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier, XGBRegressor

import nfl_data_py as nfl

pd.set_option("display.max_columns", 200)


## 1. Load 2025 Season Data

We use the `nfl_data_py` package to pull schedules, season-level team stats, and injury reports.


In [None]:
# Seasons to pull (2025 regular season + postseason games).
seasons = [2025]

# Fetch the three raw tables in order: schedules, team stats, injury reports.
# NOTE(review): confirm that the installed nfl_data_py version actually exposes
# `import_team_stats`; the later cells assume it returns one row per team/season.
schedules, team_stats, injuries = (
    fetch(seasons)
    for fetch in (nfl.import_schedules, nfl.import_team_stats, nfl.import_injuries)
)

schedules.head()

## 2. Feature Engineering

We aggregate team offensive/defensive metrics and build an ELO rating time series.


In [None]:
# Basic cleanup: work on copies of the loaded frames and parse the game date so
# games can be ordered chronologically later on.
schedules = schedules.copy()
schedules["game_date"] = pd.to_datetime(schedules["gameday"])

team_stats = team_stats.copy()

# Columns the feature-engineering cells below read from team_stats.
# Checking them here gives one clear error message instead of a KeyError deep
# inside a merge or an arithmetic expression.
required_team_stat_cols = [
    "season",
    "team",
    "games",
    "points",
    "points_allowed",
    "pass_yds",
    "pass_yds_allowed",
    "rush_yds",
    "rush_yds_allowed",
    "turnovers",
    "takeaways",
]
missing_team_stat_cols = [c for c in required_team_stat_cols if c not in team_stats.columns]
if missing_team_stat_cols:
    raise KeyError(f"team_stats is missing expected columns: {missing_team_stat_cols}")



In [None]:
# Build ELO ratings per game.
# For every scheduled game we record the ratings each team had BEFORE kickoff,
# then update the ratings from the result. Games without a final score are
# recorded but do not change any rating.

# ELO parameters
K = 20  # step size of each rating update
HOME_FIELD = 0  # neutral site for SB model training; use 0 to avoid leakage
INITIAL_ELO = 1500  # rating every team starts from

# Initialize ratings
teams = pd.unique(schedules[["home_team", "away_team"]].values.ravel("K"))
elo = {team: INITIAL_ELO for team in teams}

elo_history = []

# Stable sort keeps the schedule's own order for games on the same date.
for game in schedules.sort_values("game_date", kind="stable").itertuples(index=False):
    home = game.home_team
    away = game.away_team

    # Stored (unadjusted) pre-game ratings.
    home_pre = elo[home]
    away_pre = elo[away]

    # The home-field bonus only enters the expectation; it must never be
    # written back into the stored rating, or it would accumulate every game.
    home_elo = home_pre + HOME_FIELD
    away_elo = away_pre

    elo_history.append({
        "game_id": game.game_id,
        "home_team": home,
        "away_team": away,
        "home_elo": home_elo,
        "away_elo": away_elo,
    })

    # Unplayed games have missing scores; treating them as ties would move
    # both ratings toward each other for a game that never happened.
    if pd.isna(game.home_score) or pd.isna(game.away_score):
        continue

    # Expected win probability for the home team
    exp_home = 1 / (1 + 10 ** ((away_elo - home_elo) / 400))

    # Actual result (1 = home win, 0 = home loss, 0.5 = tie)
    if game.home_score > game.away_score:
        actual_home = 1.0
    elif game.home_score < game.away_score:
        actual_home = 0.0
    else:
        actual_home = 0.5

    # Zero-sum update: whatever the home team gains, the away team loses.
    delta = K * (actual_home - exp_home)
    elo[home] = home_pre + delta
    elo[away] = away_pre - delta

elo_df = pd.DataFrame(elo_history)
elo_df.head()

In [None]:
# Merge ELO into schedules.
# Drop ELO columns left over from an earlier run of this cell first; otherwise a
# re-run would produce home_elo_x / home_elo_y and break later column lookups.
# validate="one_to_one" fails loudly if game keys are duplicated on either side
# instead of silently multiplying rows.
schedules = (
    schedules
    .drop(columns=["home_elo", "away_elo"], errors="ignore")
    .merge(
        elo_df,
        on=["game_id", "home_team", "away_team"],
        how="left",
        validate="one_to_one",
    )
)


## 3. Injury Features

We will create a simple indicator for **key QB injuries** based on the injury report.


In [None]:
# Filter injuries for QBs and mark if they were out
injuries_qb = injuries[(injuries["position"] == "QB") & (injuries["report_status"] == "Out")]

# Count QB "Out" report rows per team/week.
# NOTE(review): this counts every QB listed as Out, including backups; it does
# not by itself identify whether the starter is missing.
qb_outs = (
    injuries_qb
    .groupby(["season", "week", "team"])
    .size()
    .reset_index(name="qb_out_count")
)

# Merge into schedules (home and away). Any existing *_qb_out column is dropped
# first so re-running the cell does not create *_x / *_y duplicates.
for side in ("home", "away"):
    team_col = f"{side}_team"
    out_col = f"{side}_qb_out"
    side_outs = qb_outs.rename(columns={"team": team_col, "qb_out_count": out_col})

    schedules = (
        schedules
        .drop(columns=[out_col], errors="ignore")
        .merge(
            side_outs,
            on=["season", "week", team_col],
            how="left",
            validate="many_to_one",  # qb_outs has one row per season/week/team
        )
    )
    # Teams with no QB listed as Out have no matching row -> count of 0.
    schedules[out_col] = schedules[out_col].fillna(0)



## 4. Build Modeling Dataset

We combine ELO, team stats, and outcome labels. We make sure the split is **time-aware**.


In [None]:
# Derive win label for home team.
# Games without a final score (not yet played) get NaN so they are dropped
# below instead of being trained on as home losses. Ties count as "no home win".
played = schedules["home_score"].notna() & schedules["away_score"].notna()
schedules["home_win"] = (
    (schedules["home_score"] > schedules["away_score"]).astype(float).where(played)
)

# Add team stats for home/away
team_stats_home = team_stats.add_prefix("home_")
team_stats_away = team_stats.add_prefix("away_")

# validate="many_to_one" requires exactly one team_stats row per (season, team);
# if team_stats ever holds several rows per team the merge raises instead of
# silently duplicating games.
model_df = schedules.merge(
    team_stats_home,
    left_on=["season", "home_team"],
    right_on=["home_season", "home_team"],
    how="left",
    validate="many_to_one",
)
model_df = model_df.merge(
    team_stats_away,
    left_on=["season", "away_team"],
    right_on=["away_season", "away_team"],
    how="left",
    validate="many_to_one",
)

# Advanced derived features (season-level), built identically for both sides.
# NOTE(review): these are season totals joined onto every game of that season,
# so a game's features include stats accumulated in that game and in later
# games. Only the ELO columns are strictly pre-game. Treat validation accuracy
# as optimistic until the stats are rebuilt as "up to the previous week" values.
for side in ("home", "away"):
    pass_yds = model_df[f"{side}_pass_yds"]
    rush_yds = model_df[f"{side}_rush_yds"]
    pass_allowed = model_df[f"{side}_pass_yds_allowed"]
    rush_allowed = model_df[f"{side}_rush_yds_allowed"]
    games = model_df[f"{side}_games"]

    model_df[f"{side}_off_yards"] = pass_yds + rush_yds
    model_df[f"{side}_def_yards_allowed"] = pass_allowed + rush_allowed

    model_df[f"{side}_points_per_game"] = model_df[f"{side}_points"] / games
    model_df[f"{side}_points_allowed_per_game"] = model_df[f"{side}_points_allowed"] / games

    model_df[f"{side}_turnover_margin"] = (
        model_df[f"{side}_takeaways"] - model_df[f"{side}_turnovers"]
    )

    # "Pass rate" here is the share of yards gained (or allowed) through the air.
    model_df[f"{side}_pass_rate"] = pass_yds / (pass_yds + rush_yds)
    model_df[f"{side}_def_pass_rate_allowed"] = pass_allowed / (pass_allowed + rush_allowed)

# Home-minus-away differences
model_df["elo_diff"] = model_df["home_elo"] - model_df["away_elo"]
model_df["points_diff"] = model_df["home_points"] - model_df["away_points"]
model_df["off_yards_diff"] = model_df["home_off_yards"] - model_df["away_off_yards"]
model_df["def_yards_allowed_diff"] = model_df["home_def_yards_allowed"] - model_df["away_def_yards_allowed"]
model_df["turnover_margin_diff"] = model_df["home_turnover_margin"] - model_df["away_turnover_margin"]

# Basic + advanced features
feature_cols = [
    "home_elo", "away_elo", "elo_diff",
    "home_points", "away_points", "points_diff",
    "home_points_allowed", "away_points_allowed",
    "home_pass_yds", "away_pass_yds",
    "home_pass_yds_allowed", "away_pass_yds_allowed",
    "home_rush_yds", "away_rush_yds",
    "home_rush_yds_allowed", "away_rush_yds_allowed",
    "home_turnovers", "away_turnovers",
    "home_takeaways", "away_takeaways",
    "home_qb_out", "away_qb_out",
    "home_off_yards", "away_off_yards", "off_yards_diff",
    "home_def_yards_allowed", "away_def_yards_allowed", "def_yards_allowed_diff",
    "home_points_per_game", "away_points_per_game",
    "home_points_allowed_per_game", "away_points_allowed_per_game",
    "home_turnover_margin", "away_turnover_margin", "turnover_margin_diff",
    "home_pass_rate", "away_pass_rate",
    "home_def_pass_rate_allowed", "away_def_pass_rate_allowed",
]

# Keep only fully observed, already-played games, in chronological order:
# TimeSeriesSplit splits by row position, so row order must follow time.
model_df = (
    model_df
    .dropna(subset=feature_cols + ["home_win"])
    .sort_values("game_date", kind="stable")
    .reset_index(drop=True)
)
model_df["home_win"] = model_df["home_win"].astype(int)

X = model_df[feature_cols]
y = model_df["home_win"]



## 5. Correlation Matrix

In [None]:
# Heatmap of pairwise feature correlations, colour scale centred on zero.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(X.corr(), cmap="coolwarm", center=0, ax=ax)
ax.set_title("Feature Correlation Matrix")
plt.show()

## 6. XGBoost Classifier (Time-Series Split)

In [None]:
# TimeSeriesSplit (no shuffling): folds are taken by row position, so each fold
# validates on rows that come after its training rows in X.
tscv = TimeSeriesSplit(n_splits=5)

# Hyperparameters shared by every fold. `use_label_encoder` is intentionally
# not passed: current XGBoost releases no longer use it and warn when it is set.
clf_params = dict(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42,
)

accuracies = []

for train_idx, val_idx in tscv.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # A fresh model per fold so nothing learned on one fold carries over.
    model = XGBClassifier(**clf_params)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    acc = accuracy_score(y_val, preds)
    accuracies.append(acc)

print("Fold Accuracies:", accuracies)
print("Mean Accuracy:", np.mean(accuracies))

## 7. Feature Importance

In [None]:
# Fit on all data for interpretation.
# `use_label_encoder` is intentionally not passed: current XGBoost releases no
# longer use it and warn when it is set.
final_model = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42,
)
final_model.fit(X, y)

# feature_importances_ is ordered like the training columns (feature_cols).
importance = pd.DataFrame({
    "feature": feature_cols,
    "importance": final_model.feature_importances_
}).sort_values("importance", ascending=False)

importance.head(10)

## 8. Super Bowl Prediction (Patriots vs. Seahawks)

We create a feature row using **neutral-site** conditions (no home-field advantage).


In [None]:
# Build the feature rows for Super Bowl LX (Patriots vs. Seahawks)
sb_home = "NE"  # Patriots
sb_away = "SEA" # Seahawks

# Number of QBs expected to be ruled out for the game, per team. The injury
# report for an unplayed game is not in the data, so this is set by hand.
sb_qb_out = {sb_home: 0, sb_away: 0}

# Difference features and the (home, away) columns they are computed from.
sb_diff_features = {
    "elo_diff": ("home_elo", "away_elo"),
    "points_diff": ("home_points", "away_points"),
    "off_yards_diff": ("home_off_yards", "away_off_yards"),
    "def_yards_allowed_diff": ("home_def_yards_allowed", "away_def_yards_allowed"),
    "turnover_margin_diff": ("home_turnover_margin", "away_turnover_margin"),
}


def build_matchup_features(home, away):
    """Return a one-row feature frame for a hypothetical `home` vs. `away` game.

    Each team's per-side features are taken from its most recent game in
    `model_df` (whether it played at home or away there) and relabelled to the
    requested side. ELO is replaced with the team's current rating from `elo`,
    the QB-out count comes from `sb_qb_out`, and the home-minus-away difference
    features are recomputed for this pairing.

    Raises:
        ValueError: if a team has no games in `model_df`.
    """
    side_rows = []
    for side, team in (("home", home), ("away", away)):
        side_cols = [c for c in feature_cols if c.startswith(f"{side}_")]

        # Collect the team's rows from both its home and its away games,
        # renaming the columns so they all carry the requested side's prefix.
        history = []
        for src in ("home", "away"):
            src_cols = [f"{src}_" + c[len(side) + 1:] for c in side_cols]
            games = model_df.loc[model_df[f"{src}_team"] == team, ["game_date"] + src_cols]
            history.append(games.rename(columns=dict(zip(src_cols, side_cols))))
        history = pd.concat(history).sort_values("game_date", kind="stable")
        if history.empty:
            raise ValueError(f"No games found for team {team!r} in model_df")

        latest = history[side_cols].iloc[-1].astype(float)
        latest[f"{side}_elo"] = elo.get(team, 1500)  # rating after the last rated game
        latest[f"{side}_qb_out"] = sb_qb_out.get(team, 0)
        side_rows.append(latest)

    row = pd.concat(side_rows)
    for diff_col, (home_col, away_col) in sb_diff_features.items():
        row[diff_col] = row[home_col] - row[away_col]

    # Same column order as the training matrix.
    return row.to_frame().T.reindex(columns=feature_cols).astype(float)


sb_features = build_matchup_features(sb_home, sb_away)
sb_features_swapped = build_matchup_features(sb_away, sb_home)
sb_row = sb_features.assign(home_team=sb_home, away_team=sb_away)

# Neutral site: the model was trained on games with a real home team, so score
# the matchup with each team in the home slot and average the two estimates of
# the Patriots' win probability.
p_as_home = final_model.predict_proba(sb_features)[:, 1]
p_as_away = 1 - final_model.predict_proba(sb_features_swapped)[:, 1]
sb_pred = (p_as_home + p_as_away) / 2

print(f"Predicted Patriots win probability: {sb_pred[0]:.3f}")

## 9. Player Receiving Yards Model

We build a regression model to predict total receiving yards for individual players.
Features include:
- Player recent performance (rolling mean)
- Opponent pass defense allowed
- Player target volume


In [None]:
# Load weekly player stats for 2025
player_stats = nfl.import_weekly_data(seasons, columns=[
    "season", "week", "player_id", "player_name", "team", "opponent_team",
    "position", "receiving_yards", "targets", "receiving_tds"
])

# Filter for pass-catchers
player_stats = player_stats[player_stats["position"].isin(["WR", "TE", "RB"])].copy()

# Merge opponent pass defense, keyed by season as well as team so that loading
# more than one season cannot cross-join; many_to_one fails loudly if
# team_stats has several rows per (season, team).
# NOTE(review): pass_yds_allowed is a season total, so it includes yards from
# the game being predicted.
team_def = (
    team_stats[["season", "team", "pass_yds_allowed"]]
    .rename(columns={"team": "opponent_team"})
)
player_stats = player_stats.merge(
    team_def, on=["season", "opponent_team"], how="left", validate="many_to_one"
)


def lagged_rolling_mean(values, window=3):
    """Mean of the `window` rows before each row (the current row is excluded)."""
    return values.shift(1).rolling(window).mean()


# Rolling means over each player's previous 3 games. The shift AND the rolling
# window both run inside the per-player group (via transform), so a player's
# value never mixes in rows from the player sorted before them.
player_stats = player_stats.sort_values(["player_id", "season", "week"])
player_stats["rec_yards_rolling"] = (
    player_stats.groupby("player_id")["receiving_yards"].transform(lagged_rolling_mean)
)
# Same-week targets are only known after the game, so the model gets the
# lagged rolling mean of targets instead.
player_stats["targets_rolling"] = (
    player_stats.groupby("player_id")["targets"].transform(lagged_rolling_mean)
)

player_stats = player_stats.dropna(
    subset=["rec_yards_rolling", "targets_rolling", "pass_yds_allowed", "receiving_yards"]
)

# Chronological row order: the time-series split below cuts by row position.
player_stats = (
    player_stats
    .sort_values(["season", "week", "player_id"])
    .reset_index(drop=True)
)

# Model features
rec_features = ["rec_yards_rolling", "targets_rolling", "pass_yds_allowed"]
X_rec = player_stats[rec_features]
y_rec = player_stats["receiving_yards"]



In [None]:
# Time-series split: evaluate the regressor fold by fold and collect RMSE.
rec_tscv = TimeSeriesSplit(n_splits=5)

# One hyperparameter set, reused for every fold.
rec_params = {
    "n_estimators": 200,
    "max_depth": 4,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}

rec_scores = []
for fold_train, fold_val in rec_tscv.split(X_rec):
    X_train, y_train = X_rec.iloc[fold_train], y_rec.iloc[fold_train]
    X_val, y_val = X_rec.iloc[fold_val], y_rec.iloc[fold_val]

    rec_model = XGBRegressor(**rec_params)
    rec_model.fit(X_train, y_train)
    preds = rec_model.predict(X_val)

    # Root-mean-squared error on the held-out rows of this fold.
    residuals = y_val - preds
    rmse = np.sqrt(np.mean(residuals ** 2))
    rec_scores.append(rmse)

print("Fold RMSE:", rec_scores)
print("Mean RMSE:", np.mean(rec_scores))

## 10. Receiving Yards Feature Importance

In [None]:
# Train the receiving-yards regressor on every available row, then rank the
# features by the importance scores the fitted model reports.
final_rec_params = {
    "n_estimators": 200,
    "max_depth": 4,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
rec_model = XGBRegressor(**final_rec_params)
rec_model.fit(X_rec, y_rec)

rec_importance = pd.DataFrame(
    {"feature": rec_features, "importance": rec_model.feature_importances_}
)
rec_importance = rec_importance.sort_values("importance", ascending=False)

rec_importance