# NFL Game Prediction - Exploratory Analysis

This notebook explores the NFL data and demonstrates the prediction capabilities.

In [None]:
# Notebook setup: make the project packages importable, pull in the analysis
# stack, and configure plotting.
import sys
# Put the parent directory on the import path so `config` (imported below)
# resolves. NOTE(review): assumes the kernel's working directory is one level
# below the project root — confirm.
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Project-level directory constants; used below with the `/` operator to
# build parquet file paths.
from config.config import RAW_DATA_DIR, PROCESSED_DATA_DIR

# Render figures inline in the notebook output.
%matplotlib inline
# NOTE(review): the 'seaborn-v0_8-*' style names are only available in newer
# matplotlib releases — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-darkgrid')

## Load Data

In [None]:
# Read the three raw parquet datasets from RAW_DATA_DIR.
schedule_path = RAW_DATA_DIR / 'schedule.parquet'
pbp_path = RAW_DATA_DIR / 'pbp.parquet'
team_stats_path = RAW_DATA_DIR / 'team_stats.parquet'

schedule = pd.read_parquet(schedule_path)
pbp = pd.read_parquet(pbp_path)
team_stats = pd.read_parquet(team_stats_path)

# Quick size check: one summary line per dataset.
for label, frame, unit in (
    ("Schedule", schedule, "games"),
    ("Play-by-play", pbp, "plays"),
    ("Team stats", team_stats, "records"),
):
    print(f"{label}: {len(frame)} {unit}")

## Basic Statistics

In [None]:
# Keep only rows that have a recorded home score.
has_home_score = schedule['home_score'].notna()
completed_games = schedule[has_home_score]

home_scores = completed_games['home_score']
away_scores = completed_games['away_score']

# Fraction of those games in which the home side strictly outscored the
# visitors (a tie therefore counts as "not a home win").
home_wins = (home_scores > away_scores).mean()
print(f"Home win rate: {home_wins:.2%}")

# Mean points scored by each side.
print(f"Average home score: {home_scores.mean():.1f}")
print(f"Average away score: {away_scores.mean():.1f}")

## Load Feature Data

In [None]:
# Per-game feature table stored under PROCESSED_DATA_DIR.
features_path = PROCESSED_DATA_DIR / 'game_features.parquet'
features = pd.read_parquet(features_path)

# Report (rows, columns), then preview the first rows as the cell output.
print(f"Features shape: {features.shape}")
features.head()

## Feature Correlations

In [None]:
# Rank numeric features by their correlation with the home-win target.
target_col = 'home_win'

# Numeric columns only, with the target itself removed: correlating the
# target with itself is always 1.0 and would otherwise take the top slot
# in both the printed list and the chart.
numeric_cols = (
    features.select_dtypes(include=[np.number])
    .columns
    .drop(target_col, errors='ignore')
)

# Calculate correlation with target (skipped when the target is absent).
# NOTE(review): any column derived from the final score would also correlate
# trivially with the outcome — verify the feature table contains none.
if target_col in features.columns:
    correlations = (
        features[numeric_cols]
        .corrwith(features[target_col])
        .sort_values(ascending=False)
    )
    print("Top 10 features correlated with home win:")
    print(correlations.head(10))

    # Plot
    plt.figure(figsize=(10, 6))
    correlations.head(15).plot(kind='barh')
    # barh draws the first row at the bottom; flip so the strongest
    # correlation is the top bar.
    plt.gca().invert_yaxis()
    plt.title('Top 15 Features Correlated with Home Win')
    plt.xlabel('Correlation')
    plt.tight_layout()
    plt.show()

## Load Model and Evaluate

In [None]:
from models.trainer import ModelTrainer
from sklearn.metrics import classification_report, confusion_matrix

# Display names for the two classes, shared by the text report and the
# heatmap so both read the same way.
# NOTE(review): assumes home_win is encoded 0 = away win, 1 = home win — confirm.
class_names = ['Away Win', 'Home Win']

# Load model: fetch the 'ensemble' entry from the trainer's model registry.
trainer = ModelTrainer()
trainer.load_model('ensemble')
model = trainer.models['ensemble']

# Prepare data: split the feature table into model inputs X and target y.
X, y = trainer.prepare_data(features, target_col='home_win')

# Make predictions: hard class labels plus the probability of class 1
# (second column of predict_proba).
# NOTE(review): this scores the full `features` table; if those rows were
# also used to fit the model, the metrics below are in-sample — confirm.
predictions = model.predict(X)
probabilities = model.predict_proba(X)[:, 1]

# Evaluate
print("Classification Report:")
print(classification_report(y, predictions, target_names=class_names))

# Confusion matrix, with class names on both axes instead of bare 0/1 ticks.
cm = confusion_matrix(y, predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

## Calibration Plot

In [None]:
# Reliability diagram: group predictions into equal-width probability bins
# and compare each bin's observed home-win rate with its predicted level.
n_bins = 10
bins = np.linspace(0, 1, n_bins + 1)
bin_centers = (bins[:-1] + bins[1:]) / 2

# np.digitize places a probability of exactly 1.0 past the last edge
# (index n_bins after the -1 shift). Left as-is, bincount would return
# n_bins + 1 entries and no longer line up with bin_centers, so fold any
# out-of-range index into the nearest valid bin.
bin_indices = np.clip(np.digitize(probabilities, bins) - 1, 0, n_bins - 1)
bin_sums = np.bincount(bin_indices, weights=y, minlength=n_bins)
bin_counts = np.bincount(bin_indices, minlength=n_bins)

# Observed positive rate per bin; the maximum() guard avoids 0/0 for bins
# that received no predictions.
bin_means = bin_sums / np.maximum(bin_counts, 1)

# Empty bins carry no information; plotting them as 0.0 would look like a
# badly miscalibrated point, so only populated bins are drawn.
has_data = bin_counts > 0

plt.figure(figsize=(10, 6))
plt.plot([0, 1], [0, 1], 'k--', label='Perfect calibration')
plt.plot(bin_centers[has_data], bin_means[has_data], 'o-', label='Model calibration')
plt.xlabel('Predicted Probability')
plt.ylabel('Actual Probability')
plt.title('Calibration Plot')
plt.legend()
plt.grid(True)
plt.show()

## Backtest Analysis

In [None]:
from backtesting.engine import BacktestEngine
from backtesting.metrics import BettingMetrics
from backtesting.strategies import KellyCriterionStrategy

# Backtest parameters, named once so the strategy and the reference line on
# the chart cannot drift apart.
STARTING_BANKROLL = 10000
KELLY_FRACTION = 0.25

# Run backtest using the predictions and probabilities computed in the
# evaluation cell.
# NOTE(review): if those predictions are in-sample, the simulated returns
# will be optimistic — confirm.
strategy = KellyCriterionStrategy(fraction=KELLY_FRACTION, bankroll=STARTING_BANKROLL)
engine = BacktestEngine()
results = engine.run_backtest(strategy, features, predictions, probabilities)

# Plot bankroll history against the starting bankroll.
plt.figure(figsize=(12, 6))
plt.plot(results['bankroll_history'])
plt.axhline(y=STARTING_BANKROLL, color='r', linestyle='--', label='Starting bankroll')
plt.xlabel('Bet Number')
plt.ylabel('Bankroll ($)')
plt.title('Kelly Criterion Strategy - Bankroll Over Time')
plt.legend()
plt.grid(True)
plt.show()

# Display metrics
BettingMetrics.print_metrics_report(results['metrics'])