# ELO Model Training - Hyperparameter Grid Search

This notebook:
1. Loads the 648 hyperparameter configs from Ruby
2. Trains ELO model for each config
3. Tracks RMSE, MAE, R² for each
4. Saves results and identifies best config

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from tqdm import tqdm

# Import EloModel from elo_model.ipynb (or copy the class here)
# NOTE(review): the import below is commented out and no EloModel class is
# defined in the visible part of this notebook, yet later cells call
# EloModel(...). Enable the import (presumably this requires elo_model to be
# available as an importable .py module - confirm) or paste the class in
# before running the notebook top to bottom.
# from elo_model import EloModel

# Use seaborn's 'darkgrid' style for the figures drawn later in the notebook.
sns.set_style('darkgrid')

## Load Data

In [None]:
# Read the Ruby-generated hyperparameter grid; each row is treated as one
# configuration by the grid-search cell.
grid_csv = '../output/hyperparams/model3_elo_grid.csv'
configs_df = pd.read_csv(grid_csv)

n_configs = len(configs_df)
print(f"Loaded {n_configs} hyperparameter configurations")
print(configs_df.head())

In [None]:
# Load hockey game data
# REPLACE WITH YOUR ACTUAL DATA PATH
games_df = pd.read_csv('data/hockey_data.csv')

# CRITICAL: ELO updates are order-dependent, so games must be processed
# chronologically. A stable sort (mergesort) keeps games that share a
# game_date in their original file order instead of an arbitrary one.
# NOTE(review): game_date is sorted exactly as loaded; if it is stored as text
# in a non-ISO format (e.g. M/D/YYYY) the ordering will not be chronological -
# confirm the format or parse the dates before sorting.
games_df = games_df.sort_values('game_date', kind='mergesort').reset_index(drop=True)

print(f"Loaded {len(games_df)} games")
print(games_df.head())

## Time Series Split for Validation

In [None]:
# Chronological hold-out: the first 80% of the date-sorted games are used for
# training and the remaining 20% for evaluation (no shuffling).
split_idx = int(0.8 * len(games_df))
train_df, test_df = games_df.iloc[:split_idx], games_df.iloc[split_idx:]

print(f"Train: {len(train_df)} games")
print(f"Test: {len(test_df)} games")

## Grid Search Loop

In [None]:
# Grid search: fit one EloModel per configuration on the training split and
# score it on the held-out test split. A failure inside one configuration is
# logged and recorded with NaN metrics so the remaining runs still execute.

# Fail fast when the model class is missing. Otherwise every iteration would
# raise NameError, be swallowed by the per-config handler below, and the
# search would "finish" with nothing but failed rows.
if 'EloModel' not in globals():
    raise NameError(
        "EloModel is not defined - import it or paste the class into this "
        "notebook before running the grid search"
    )

results = []

# Loop through all configs (this will take a while - one full fit per row)
for idx, row in tqdm(configs_df.iterrows(), total=len(configs_df), desc="Training ELO models"):
    # Extract the parameters before entering the try block so the failure
    # handler always reports this row's experiment_id and params, never an
    # unbound name or stale values left over from the previous iteration.
    params = row.to_dict()
    experiment_id = params.pop('experiment_id')

    try:
        model = EloModel(params)
        model.fit(train_df)                 # train on the training set
        metrics = model.evaluate(test_df)   # evaluate on the test set
        outcome = {
            'rmse': metrics['rmse'],
            'mae': metrics['mae'],
            'r2': metrics['r2'],
            'status': 'completed',
        }
    except Exception as e:
        # Best-effort by design: keep going, but leave a visible trace.
        print(f"Error in experiment {experiment_id}: {e}")
        outcome = {
            'rmse': np.nan,
            'mae': np.nan,
            'r2': np.nan,
            'status': 'failed',
        }

    # Same column order for both outcomes: id, metrics, status, then params.
    results.append({'experiment_id': experiment_id, **outcome, **params})

# Convert to DataFrame
results_df = pd.DataFrame(results)

# Count failures from the raw records so an empty grid (a DataFrame without a
# 'status' column) reports 0 instead of raising KeyError.
n_failed = sum(1 for record in results if record['status'] == 'failed')
print(f"\nCompleted {len(results_df)} experiments")
print(f"Failed: {n_failed}")

## Save Results

In [None]:
# Persist every experiment (metrics, status, and the parameters tried) so the
# results can be revisited without re-running the grid search.
results_path = '../output/hyperparams/model3_elo_results.csv'
results_df.to_csv(results_path, index=False)
# Print the same path that was written so the message cannot drift from the
# real output location.
print(f"Saved results to: {results_path}")

## Analyze Best Configurations

In [None]:
# Rank experiments by RMSE (ascending) and show the ten lowest, restricted to
# the metric columns and a subset of the hyperparameters.
best_configs = results_df.nsmallest(10, 'rmse')

summary_cols = [
    'experiment_id', 'rmse', 'mae', 'r2',
    'k_factor', 'home_advantage', 'mov_multiplier',
    'rest_advantage_per_day', 'b2b_penalty',
]
print("\nTop 10 Configurations by RMSE:")
print(best_configs[summary_cols])

In [None]:
# Best overall config: the row with the lowest RMSE (NaN rows from failed
# experiments are ignored by idxmin).
if not results_df['rmse'].notna().any():
    # Without this guard an all-NaN column makes idxmin/.loc fail with an
    # error that does not point at the real cause.
    raise ValueError(
        "No experiment completed successfully - there is no best configuration to report"
    )

best = results_df.loc[results_df['rmse'].idxmin()]

print("\n🏆 BEST CONFIGURATION:")
print(f"   Experiment ID: {best['experiment_id']}")
print(f"   RMSE: {best['rmse']:.3f}")
print(f"   MAE: {best['mae']:.3f}")
print(f"   R²: {best['r2']:.3f}")

print("\n   Parameters:")
reported_params = (
    'k_factor',
    'home_advantage',
    'mov_multiplier',
    'mov_method',
    'rest_advantage_per_day',
    'b2b_penalty',
)
for param_name in reported_params:
    print(f"   - {param_name}: {best[param_name]}")

## Visualize Results

In [None]:
# Distribution of RMSE and R² across the experiments that produced metrics.
# The dashed red line marks the configuration stored in `best` (selected by
# lowest RMSE), so on the R² panel it shows that configuration's R².
fig, (ax_rmse, ax_r2) = plt.subplots(1, 2, figsize=(12, 5))

ax_rmse.hist(results_df['rmse'].dropna(), bins=50, edgecolor='black')
ax_rmse.axvline(best['rmse'], color='red', linestyle='--', linewidth=2, label='Best')
ax_rmse.set_xlabel('RMSE')
ax_rmse.set_ylabel('Frequency')
ax_rmse.set_title('Distribution of RMSE Scores')
ax_rmse.legend()

ax_r2.hist(results_df['r2'].dropna(), bins=50, edgecolor='black', color='green', alpha=0.7)
ax_r2.axvline(best['r2'], color='red', linestyle='--', linewidth=2, label='Best')
ax_r2.set_xlabel('R² Score')
ax_r2.set_ylabel('Frequency')
ax_r2.set_title('Distribution of R² Scores')
ax_r2.legend()

fig.tight_layout()

# savefig does not create missing directories, so make sure the target
# folder exists before writing the image.
distribution_png = Path('../output/reports/elo_results_distribution.png')
distribution_png.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(distribution_png, dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Rough hyperparameter sensitivity: absolute correlation (pandas default,
# Pearson) between each listed hyperparameter and RMSE, drawn as a horizontal
# bar chart sorted from weakest to strongest.
# NOTE(review): mov_method is not in param_cols - presumably because it is
# categorical; confirm.
param_cols = ['k_factor', 'home_advantage', 'mov_multiplier', 'rest_advantage_per_day', 'b2b_penalty']
corr = results_df[param_cols + ['rmse']].corr()['rmse'].drop('rmse')

fig, ax = plt.subplots(figsize=(8, 5))
corr.abs().sort_values().plot(kind='barh', color='steelblue', ax=ax)
ax.set_xlabel('Correlation with RMSE (absolute)')
ax.set_title('Hyperparameter Importance')
fig.tight_layout()

# savefig does not create missing directories, so make sure the target
# folder exists before writing the image.
importance_png = Path('../output/reports/elo_parameter_importance.png')
importance_png.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(importance_png, dpi=150, bbox_inches='tight')
plt.show()

## Train Final Model with Best Config

In [None]:
# Refit on every available game using the hyperparameters of the best row.
# The id, metric, and status fields are bookkeeping, not model parameters,
# so they are removed before the dict is handed to EloModel.
non_param_fields = ['experiment_id', 'rmse', 'mae', 'r2', 'status']
best_params = best.drop(non_param_fields).to_dict()

final_model = EloModel(best_params)
final_model.fit(games_df)

print("Final model trained on full dataset")
print("Final team ratings:")

# Highest rating first; only the first ten entries are printed.
sorted_ratings = sorted(
    final_model.ratings.items(),
    key=lambda team_rating: team_rating[1],
    reverse=True,
)
for team, rating in sorted_ratings[:10]:
    print(f"  {team}: {rating:.1f}")

## Generate Predictions for Submission

In [None]:
# NOTE: this cell is an inactive template - every statement is commented out.
# Uncomment it once a separate test set is available; it expects the
# `final_model` fitted in the previous section.

# Load test set (if provided separately)
# test_games = pd.read_csv('data/test_set.csv')

# predictions = []
# for _, game in test_games.iterrows():
#     home_pred, away_pred = final_model.predict_goals(game)
#     predictions.append({
#         'game_id': game['game_id'],
#         'home_goals_pred': home_pred,
#         'away_goals_pred': away_pred
#     })

# predictions_df = pd.DataFrame(predictions)
# predictions_df.to_csv('../output/predictions/model3_elo_predictions.csv', index=False)
# print("Predictions saved!")