# Super Bowl Predictor — Exploratory Data Analysis

This notebook explores the NFL data used to train the Super Bowl prediction model.
We examine team season stats, a Seahawks–Patriots historical comparison, feature distributions, feature correlations, and home win rate by game type.

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data.fetch_data import build_team_season_stats, build_game_dataset, fetch_schedules
from features.engineering import build_matchup_features, build_team_features, get_feature_names

# Plot styling shared by every figure in this notebook.
sns.set_theme(style='whitegrid', palette='deep')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Seasons under analysis: 2010 through 2024 inclusive (range end is exclusive).
SEASONS = [*range(2010, 2025)]
print(f'Analyzing {len(SEASONS)} seasons: {SEASONS[0]}-{SEASONS[-1]}')

## 1. Team Season Stats Overview

In [None]:
# Build the team-season table for all seasons under analysis.
team_stats = build_team_season_stats(SEASONS)

# Quick size check before looking at the numeric summary.
for summary_line in (
    f'Total team-season records: {len(team_stats)}',
    f'Unique teams: {team_stats["team"].nunique()}',
):
    print(summary_line)

# Last expression of the cell: rendered as the cell's rich output.
team_stats.describe()

## 2. Seahawks vs Patriots — Historical Comparison

In [None]:
# Per-team slices of the season table, ordered chronologically for line plots.
sea = team_stats[team_stats['team'] == 'SEA'].sort_values('season')
ne = team_stats[team_stats['team'] == 'NE'].sort_values('season')

# (frame, line/marker format, colour, legend label) for each team.
franchises = [
    (sea, 'o-', '#69BE28', 'Seahawks'),
    (ne, 's-', '#C60C30', 'Patriots'),
]

# (column, panel title) in row-major order across the 2x2 grid.
panels = [
    ('win_pct', 'Win Percentage by Season'),
    ('points_per_game', 'Points Per Game'),
    ('points_allowed_per_game', 'Points Allowed Per Game'),
    ('point_diff_per_game', 'Point Differential Per Game'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, (metric, panel_title) in zip(axes.flat, panels):
    for frame, fmt, colour, team_label in franchises:
        ax.plot(frame['season'], frame[metric], fmt, color=colour, label=team_label)
    if metric == 'point_diff_per_game':
        # Zero line separates positive from negative differentials.
        ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
    ax.set_title(panel_title)
    ax.legend()
    if metric == 'win_pct':
        # Win percentage is a fraction, so pin the axis to [0, 1].
        ax.set_ylim(0, 1)

plt.suptitle('Seahawks vs Patriots — Historical Comparison', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 3. Feature Distributions

In [None]:
# Build the matchup table and fetch the list of feature columns to plot.
matchups = build_matchup_features(SEASONS)
feature_names = get_feature_names()

print(f'Total matchups: {len(matchups)}')
print(f'Features: {feature_names}')
print(f'Home win rate: {matchups["home_win"].mean():.3f}')

# Size the grid from the number of features instead of assuming exactly eight:
# a fixed 2x4 grid raises IndexError with more than 8 features and leaves
# empty panels with fewer.
n_cols = 4
n_rows = max(1, -(-len(feature_names) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows), squeeze=False)
axes = axes.flatten()

# The outcome masks do not depend on the feature, so build them once.
home_won = matchups['home_win'] == 1
home_lost = matchups['home_win'] == 0

for ax, feat in zip(axes, feature_names):
    # Overlay the feature's distribution for home wins and home losses.
    ax.hist(matchups.loc[home_won, feat], bins=30, alpha=0.6,
            color='#69BE28', label='Home Win', density=True)
    ax.hist(matchups.loc[home_lost, feat], bins=30, alpha=0.6,
            color='#C60C30', label='Home Loss', density=True)
    ax.set_title(feat.replace('_diff', '').replace('_', ' ').title())
    ax.legend(fontsize=8)

# Hide panels that have no feature assigned to them.
for unused_ax in axes[len(feature_names):]:
    unused_ax.set_visible(False)

plt.suptitle('Feature Distributions by Outcome', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 4. Correlation Matrix

In [None]:
# Pairwise correlations between the model's feature columns.
corr = matchups[feature_names].corr()

# Tick labels: feature names with every '_diff' occurrence removed.
short_labels = [name.replace('_diff', '') for name in feature_names]

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr,
    annot=True, fmt='.2f',
    cmap='RdBu_r', center=0,
    square=True, linewidths=0.5,
    xticklabels=short_labels,
    yticklabels=short_labels,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# List each unordered feature pair (upper triangle only) whose correlation
# magnitude exceeds the threshold.
print('\nHighly correlated pairs (|r| > 0.7):')
n_features = len(feature_names)
index_pairs = [(i, j) for i in range(n_features) for j in range(i + 1, n_features)]
for i, j in index_pairs:
    r = corr.iloc[i, j]
    if abs(r) > 0.7:
        print(f'  {feature_names[i]} <-> {feature_names[j]}: r={r:.3f}')

## 5. Home Win Rate by Game Type

In [None]:
# Number of games and mean home-win indicator for each game type.
game_type_stats = (
    matchups.groupby('game_type')
    .agg(count=('home_win', 'count'), home_win_rate=('home_win', 'mean'))
    .reset_index()
)

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(game_type_stats['game_type'], game_type_stats['home_win_rate'],
              color=['#002244', '#69BE28', '#C60C30', '#FFD700'][:len(game_type_stats)])
# Reference line at a 50% home win rate.
ax.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)
ax.set_xlabel('Game Type')
ax.set_ylabel('Home Win Rate')
ax.set_title('Home Win Rate by Game Type')

# Keep the original 0.8 ceiling as a floor, but grow it when a game type's rate
# would otherwise be clipped or its 'n=' label pushed outside the axes.
# If the max is NaN (no data), max() keeps the 0.8 default.
top_rate = game_type_stats['home_win_rate'].max()
ax.set_ylim(0, max(0.8, top_rate + 0.08))

# Annotate each bar with its sample size just above the bar top.
for bar, count in zip(bars, game_type_stats['count']):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
            f'n={count}', ha='center', fontsize=10)

plt.tight_layout()
plt.show()

## 6. Super Bowl XLIX Matchup — 2014 Season Stats

In [None]:
# Team-level features for the 2014 season only.
team_features = build_team_features([2014])

# First matching row for each team (raises IndexError if a team is absent).
sea_2014 = team_features[team_features['team'] == 'SEA'].iloc[0]
ne_2014 = team_features[team_features['team'] == 'NE'].iloc[0]

compare_cols = ['points_per_game', 'points_allowed_per_game', 'point_diff_per_game',
                'win_pct', 'pythagorean_wins']

# Stats where the smaller value is the better one; every other stat in
# compare_cols is treated as higher-is-better.
lower_is_better = {'points_allowed_per_game'}


def _advantage(col, sea_value, ne_value):
    """Return which team holds the edge for one stat.

    Args:
        col: Column name of the stat being compared.
        sea_value: Seahawks' value for that stat.
        ne_value: Patriots' value for that stat.

    Returns:
        'SEA' or 'NE' for the team with the better value, 'Even' on an exact
        tie, or 'N/A' when either value is missing.
    """
    if pd.isna(sea_value) or pd.isna(ne_value):
        return 'N/A'
    if sea_value == ne_value:
        return 'Even'
    if col in lower_is_better:
        sea_ahead = sea_value < ne_value
    else:
        sea_ahead = sea_value > ne_value
    return 'SEA' if sea_ahead else 'NE'


comparison = pd.DataFrame({
    'Stat': [c.replace('_', ' ').title() for c in compare_cols],
    'Seahawks': [sea_2014[c] for c in compare_cols],
    'Patriots': [ne_2014[c] for c in compare_cols],
})
# Rows are in compare_cols order, so the per-column verdicts line up one-to-one.
comparison['Advantage'] = [
    _advantage(c, sea_2014[c], ne_2014[c]) for c in compare_cols
]

print('2014 Season — Seahawks vs Patriots')
print('=' * 60)
print(comparison.to_string(index=False))