# Model 1: Baseline Models - Training and Comparison

This notebook trains and compares multiple baseline models for hockey goal prediction.

## Baseline Models Included

| Model | Description | Use Case |
|-------|-------------|----------|
| GlobalMeanBaseline | League-wide average | Sanity check lower bound |
| TeamMeanBaseline | Per-team offense/defense averages | Standard baseline |
| HomeAwayBaseline | Location-aware averages | Captures home advantage |
| MovingAverageBaseline | Recent N games only | Captures team form |
| WeightedHistoryBaseline | Exponential decay weighting | Balances history and recency |
| PoissonBaseline | Statistical Poisson model | Academic standard |

## Table of Contents

1. Setup and Imports
2. Load Data
3. Train All Baselines
4. Compare Performance
5. Analyze Best Baseline
6. Hyperparameter Search
7. Final Evaluation
8. Save Results

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
import sys
import json
from itertools import product

# Put the notebook's parent directory first on the import path so that
# project modules living one level up can be imported.
sys.path.insert(0, os.path.dirname(os.getcwd()))

# Plot defaults shared by every figure in this notebook.
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})
sns.set_style('whitegrid')

print("Setup complete.")

In [None]:
# Baseline Model Classes (self-contained for portability)

# Canonical field name -> column names it may appear under, in priority order.
COLUMN_ALIASES = {
    'home_team': ['home_team', 'home', 'team_home', 'h_team'],
    'away_team': ['away_team', 'away', 'team_away', 'a_team', 'visitor', 'visiting_team'],
    'home_goals': ['home_goals', 'home_score', 'h_goals', 'goals_home', 'home_pts'],
    'away_goals': ['away_goals', 'away_score', 'a_goals', 'goals_away', 'away_pts', 'visitor_goals'],
    'game_date': ['game_date', 'date', 'Date', 'game_datetime', 'datetime', 'game_time'],
}


def get_value(game, field, default=None):
    """Read ``field`` from a game record, trying each known column alias.

    Args:
        game: Mapping-like record (dict or pandas Series) supporting ``in``
            and item access.
        field: Canonical field name; names without an entry in
            ``COLUMN_ALIASES`` are looked up under their own name.
        default: Returned when no alias is present, or when the first alias
            that is present holds a missing (NaN/None) value.

    Returns:
        The stored value, or ``default``.
    """
    for name in COLUMN_ALIASES.get(field, [field]):
        if name not in game:
            continue
        # Only the first alias found is consulted; later aliases are not
        # tried even if this one holds a missing value.
        value = game[name]
        return default if pd.isna(value) else value
    return default


def get_column(df, field):
    """Return the first alias of ``field`` that is a column of ``df``.

    Returns ``None`` when the DataFrame has none of the candidate columns.
    """
    candidates = COLUMN_ALIASES.get(field, [field])
    return next((name for name in candidates if name in df.columns), None)


class BaselineModel:
    """Abstract base class for baseline goal-prediction models.

    Subclasses provide ``fit(games_df)``, which must set ``is_fitted`` to
    True, and ``predict_goals(game)``, which returns a
    ``(home_goals, away_goals)`` pair. ``evaluate`` is shared by all of them.
    """

    def __init__(self, params=None):
        """Store the hyperparameter dict (an empty dict when none is given)."""
        self.params = params or {}
        self.is_fitted = False

    def fit(self, games_df):
        """Learn model state from historical games; subclasses must override."""
        raise NotImplementedError(f"{type(self).__name__} must implement fit()")

    def predict_goals(self, game):
        """Return ``(home_goals, away_goals)`` for one game; subclasses must override."""
        raise NotImplementedError(f"{type(self).__name__} must implement predict_goals()")

    def evaluate(self, games_df):
        """Score the fitted model against the actual results in ``games_df``.

        Args:
            games_df: DataFrame of games with team and goal columns. A missing
                actual goal value is counted as 0.

        Returns:
            dict with ``rmse``, ``mae`` and ``r2`` computed on home goals
            only, plus ``combined_rmse`` computed over home and away goals
            together.

        Raises:
            RuntimeError: If the model has not been fitted.
            ValueError: If ``games_df`` contains no games.
        """
        if not self.is_fitted:
            raise RuntimeError("Model must be fitted before evaluation")

        home_preds, away_preds = [], []
        home_actuals, away_actuals = [], []

        for _, game in games_df.iterrows():
            home_pred, away_pred = self.predict_goals(game)
            home_preds.append(home_pred)
            away_preds.append(away_pred)
            home_actuals.append(get_value(game, 'home_goals', 0))
            away_actuals.append(get_value(game, 'away_goals', 0))

        if not home_actuals:
            raise ValueError("Cannot evaluate on an empty set of games")

        # Take the square root of the MSE ourselves: the ``squared=False``
        # argument of mean_squared_error was deprecated in scikit-learn 1.4
        # and removed in 1.6, so relying on it breaks on current releases.
        rmse = float(np.sqrt(mean_squared_error(home_actuals, home_preds)))
        mae = mean_absolute_error(home_actuals, home_preds)
        # R2 is undefined when every actual value is identical; report 0.0.
        r2 = r2_score(home_actuals, home_preds) if len(set(home_actuals)) > 1 else 0.0

        all_preds = home_preds + away_preds
        all_actuals = home_actuals + away_actuals
        combined_rmse = float(np.sqrt(mean_squared_error(all_actuals, all_preds)))

        return {'rmse': rmse, 'mae': mae, 'r2': r2, 'combined_rmse': combined_rmse}


class GlobalMeanBaseline(BaselineModel):
    """Predict the same league-wide mean score for every game."""

    def fit(self, games_df):
        """Record the mean home goals and mean away goals of ``games_df``.

        Returns:
            The fitted model (``self``), to allow chaining.
        """
        home_mean, away_mean = (
            games_df[get_column(games_df, field)].mean()
            for field in ('home_goals', 'away_goals')
        )
        self.global_mean_home = home_mean
        self.global_mean_away = away_mean
        self.n_games = len(games_df)
        self.is_fitted = True
        return self

    def predict_goals(self, game):
        """Return ``(mean home goals, mean away goals)``; ``game`` is ignored."""
        return (self.global_mean_home, self.global_mean_away)

    def get_summary(self):
        """Return the model name and the fitted home-goal mean (3 decimals)."""
        summary = {'model': 'GlobalMeanBaseline'}
        summary['global_mean_home'] = round(self.global_mean_home, 3)
        return summary


class TeamMeanBaseline(BaselineModel):
    """Predict from each team's average goals scored and conceded per game."""

    def fit(self, games_df):
        """Compute per-team offense/defense averages over all games played.

        Each game contributes to both participants, regardless of venue.
        ``global_mean`` (the mean of the home-goals column) is kept as the
        fallback for teams that never appear in ``games_df``.

        Returns:
            The fitted model (``self``).
        """
        home_team_col = get_column(games_df, 'home_team')
        away_team_col = get_column(games_df, 'away_team')
        home_goals_col = get_column(games_df, 'home_goals')
        away_goals_col = get_column(games_df, 'away_goals')

        # team -> [goals for, goals against, games played]
        totals = {}
        for _, row in games_df.iterrows():
            home_side, away_side = row[home_team_col], row[away_team_col]
            home_score, away_score = row[home_goals_col], row[away_goals_col]

            for team, scored, conceded in (
                (home_side, home_score, away_score),
                (away_side, away_score, home_score),
            ):
                record = totals.setdefault(team, [0, 0, 0])
                record[0] += scored
                record[1] += conceded
                record[2] += 1

        self.team_offense = {team: gf / played for team, (gf, _, played) in totals.items()}
        self.team_defense = {team: ga / played for team, (_, ga, played) in totals.items()}
        self.global_mean = games_df[home_goals_col].mean()
        self.n_teams = len(totals)
        self.is_fitted = True
        return self

    def predict_goals(self, game):
        """Average the scoring side's offense with the opponent's defense.

        Returns:
            ``(home_goals, away_goals)``; unseen teams use ``global_mean``.
        """
        home = get_value(game, 'home_team')
        away = get_value(game, 'away_team')
        fallback = self.global_mean

        home_attack = self.team_offense.get(home, fallback)
        home_conceded = self.team_defense.get(home, fallback)
        away_attack = self.team_offense.get(away, fallback)
        away_conceded = self.team_defense.get(away, fallback)

        home_pred = (home_attack + away_conceded) / 2
        away_pred = (away_attack + home_conceded) / 2
        return home_pred, away_pred

    def get_summary(self):
        """Return the model name and the number of teams seen during fit."""
        return {'model': 'TeamMeanBaseline', 'n_teams': self.n_teams}


class HomeAwayBaseline(BaselineModel):
    """Team averages kept separately for home games and away games."""

    def fit(self, games_df):
        """Compute each team's offense/defense averages per venue.

        A team's home figures use only games it hosted; its away figures use
        only games it played as the visitor. League-wide home and away goal
        means are stored as fallbacks for unseen team/venue combinations.

        Returns:
            The fitted model (``self``).
        """
        home_team_col = get_column(games_df, 'home_team')
        away_team_col = get_column(games_df, 'away_team')
        home_goals_col = get_column(games_df, 'home_goals')
        away_goals_col = get_column(games_df, 'away_goals')

        # team -> [goals for, goals against, games], one table per venue.
        at_home, on_road = {}, {}
        for _, row in games_df.iterrows():
            host, visitor = row[home_team_col], row[away_team_col]
            host_score, visitor_score = row[home_goals_col], row[away_goals_col]

            for table, team, scored, conceded in (
                (at_home, host, host_score, visitor_score),
                (on_road, visitor, visitor_score, host_score),
            ):
                record = table.setdefault(team, [0, 0, 0])
                record[0] += scored
                record[1] += conceded
                record[2] += 1

        self.home_offense = {t: gf / n for t, (gf, _, n) in at_home.items()}
        self.home_defense = {t: ga / n for t, (_, ga, n) in at_home.items()}
        self.away_offense = {t: gf / n for t, (gf, _, n) in on_road.items()}
        self.away_defense = {t: ga / n for t, (_, ga, n) in on_road.items()}

        self.global_home_mean = games_df[home_goals_col].mean()
        self.global_away_mean = games_df[away_goals_col].mean()
        self.is_fitted = True
        return self

    def predict_goals(self, game):
        """Blend the host's home record with the visitor's away record.

        Returns:
            ``(home_goals, away_goals)``. Quantities that measure home-side
            scoring fall back to the league home mean, and those measuring
            away-side scoring fall back to the league away mean.
        """
        host = get_value(game, 'home_team')
        visitor = get_value(game, 'away_team')

        # Home-side scoring: what the host scores at home and what the
        # visitor concedes on the road.
        host_attack = self.home_offense.get(host, self.global_home_mean)
        visitor_conceded = self.away_defense.get(visitor, self.global_home_mean)
        # Away-side scoring: what the visitor scores on the road and what
        # the host concedes at home.
        visitor_attack = self.away_offense.get(visitor, self.global_away_mean)
        host_conceded = self.home_defense.get(host, self.global_away_mean)

        home_pred = (host_attack + visitor_conceded) / 2
        away_pred = (visitor_attack + host_conceded) / 2
        return home_pred, away_pred

    def get_summary(self):
        """Return the model name and the league home-minus-away goal gap."""
        advantage = self.global_home_mean - self.global_away_mean
        return {'model': 'HomeAwayBaseline', 'home_advantage': round(advantage, 3)}


class MovingAverageBaseline(BaselineModel):
    """Predict from each team's last ``window`` games only.

    ``fit`` records games in the row order of ``games_df``, so the frame must
    already be sorted chronologically for "last N games" to mean recent form.
    """

    def __init__(self, params=None):
        """Read the ``window`` hyperparameter (default 5).

        Raises:
            ValueError: If ``window`` is not a positive integer.
        """
        super().__init__(params)
        window = self.params.get('window', 5)
        # A window of 0 would slice ``history[-0:]`` -- the *entire* history --
        # and a negative one would drop the oldest games instead of keeping
        # the newest, so reject both rather than silently mis-predict.
        if int(window) != window or window < 1:
            raise ValueError(f"window must be a positive integer, got {window!r}")
        self.window = int(window)

    def fit(self, games_df):
        """Record every team's (goals for, goals against) pairs in row order.

        ``global_mean`` (the mean of the home-goals column) is kept as the
        fallback for teams with no recorded games.

        Returns:
            The fitted model (``self``).
        """
        home_team_col = get_column(games_df, 'home_team')
        away_team_col = get_column(games_df, 'away_team')
        home_goals_col = get_column(games_df, 'home_goals')
        away_goals_col = get_column(games_df, 'away_goals')

        self.team_history = {}

        for _, game in games_df.iterrows():
            ht, at = game[home_team_col], game[away_team_col]
            hg, ag = game[home_goals_col], game[away_goals_col]

            # Each entry is stored from the owning team's point of view.
            self.team_history.setdefault(ht, []).append((hg, ag))
            self.team_history.setdefault(at, []).append((ag, hg))

        self.global_mean = games_df[home_goals_col].mean()
        self.is_fitted = True
        return self

    def _get_recent_avg(self, team):
        """Return (avg goals for, avg goals against) over the last ``window`` games."""
        history = self.team_history.get(team)
        if not history:
            return self.global_mean, self.global_mean
        recent = history[-self.window:]
        return np.mean([g[0] for g in recent]), np.mean([g[1] for g in recent])

    def predict_goals(self, game):
        """Average the scoring side's recent offense with the opponent's recent defense.

        Returns:
            ``(home_goals, away_goals)``.
        """
        ht, at = get_value(game, 'home_team'), get_value(game, 'away_team')
        home_off, home_def = self._get_recent_avg(ht)
        away_off, away_def = self._get_recent_avg(at)
        return (home_off + away_def) / 2, (away_off + home_def) / 2

    def get_summary(self):
        """Return the model label and its window size."""
        return {'model': f'MovingAverage(window={self.window})', 'window': self.window}


class WeightedHistoryBaseline(BaselineModel):
    """Exponentially down-weight older games when averaging a team's results.

    Game order is taken from the row order of the frame passed to ``fit``.
    """

    def __init__(self, params=None):
        """Read the ``decay`` hyperparameter (default 0.9)."""
        super().__init__(params)
        self.decay = self.params.get('decay', 0.9)

    def fit(self, games_df):
        """Record every team's (goals for, goals against) pairs in row order.

        ``global_mean`` (the mean of the home-goals column) is kept as the
        fallback for teams with no recorded games.

        Returns:
            The fitted model (``self``).
        """
        home_team_col = get_column(games_df, 'home_team')
        away_team_col = get_column(games_df, 'away_team')
        home_goals_col = get_column(games_df, 'home_goals')
        away_goals_col = get_column(games_df, 'away_goals')

        self.team_history = {}

        for _, row in games_df.iterrows():
            host, visitor = row[home_team_col], row[away_team_col]
            host_score, visitor_score = row[home_goals_col], row[away_goals_col]

            for team in (host, visitor):
                if team not in self.team_history:
                    self.team_history[team] = []

            # Stored from each team's own point of view.
            self.team_history[host].append((host_score, visitor_score))
            self.team_history[visitor].append((visitor_score, host_score))

        self.global_mean = games_df[home_goals_col].mean()
        self.is_fitted = True
        return self

    def _get_weighted_avg(self, team):
        """Return decay-weighted (avg goals for, avg goals against) for ``team``.

        The most recent game has weight 1 and each step back in time
        multiplies the weight by ``decay``. Teams without history get
        ``global_mean`` for both values.
        """
        games = self.team_history.get(team)
        if not games:
            return self.global_mean, self.global_mean

        newest = len(games) - 1
        sum_for = sum_against = sum_weights = 0

        # Accumulate oldest-to-newest, one game at a time.
        for position, (scored, conceded) in enumerate(games):
            w = self.decay ** (newest - position)
            sum_for += scored * w
            sum_against += conceded * w
            sum_weights += w

        return sum_for / sum_weights, sum_against / sum_weights

    def predict_goals(self, game):
        """Average the scoring side's weighted offense with the opponent's weighted defense.

        Returns:
            ``(home_goals, away_goals)``.
        """
        host = get_value(game, 'home_team')
        visitor = get_value(game, 'away_team')
        host_attack, host_conceded = self._get_weighted_avg(host)
        visitor_attack, visitor_conceded = self._get_weighted_avg(visitor)
        home_pred = (host_attack + visitor_conceded) / 2
        away_pred = (visitor_attack + host_conceded) / 2
        return home_pred, away_pred

    def get_summary(self):
        """Return the model label and its decay factor."""
        return {'model': f'WeightedHistory(decay={self.decay})', 'decay': self.decay}


class PoissonBaseline(BaselineModel):
    """Multiplicative attack/defense-strength model of expected goals.

    Expected goals for a side are ``league_avg * attack * opponent_defense *
    venue_multiplier``. No regression is fitted: strengths are simple ratios
    of a team's per-game averages to the league average.
    """

    def fit(self, games_df):
        """Estimate league scoring rates and per-team strengths.

        Returns:
            The fitted model (``self``).
        """
        home_team_col = get_column(games_df, 'home_team')
        away_team_col = get_column(games_df, 'away_team')
        home_goals_col = get_column(games_df, 'home_goals')
        away_goals_col = get_column(games_df, 'away_goals')

        home_mean = games_df[home_goals_col].mean()
        away_mean = games_df[away_goals_col].mean()

        # Team strengths below pool home and away games, so they must be
        # normalised by the scoring rate over BOTH venues. Normalising by the
        # home-only mean (and then multiplying by the home/away ratio on top)
        # systematically under-predicts away goals.
        # The 0.01 floor avoids division by zero on a goalless data set.
        self.league_avg = max((home_mean + away_mean) / 2, 0.01)
        # Kept for reporting: ratio of league home scoring to away scoring.
        self.home_factor = home_mean / max(away_mean, 0.01)
        # Venue multipliers relative to the all-venue average; an average
        # matchup therefore predicts exactly (home_mean, away_mean).
        self.home_multiplier = home_mean / self.league_avg
        self.away_multiplier = away_mean / self.league_avg

        # team -> [goals for, goals against, games played]
        totals = {}
        for _, game in games_df.iterrows():
            ht, at = game[home_team_col], game[away_team_col]
            hg, ag = game[home_goals_col], game[away_goals_col]

            for team, scored, conceded in ((ht, hg, ag), (at, ag, hg)):
                record = totals.setdefault(team, [0, 0, 0])
                record[0] += scored
                record[1] += conceded
                record[2] += 1

        self.attack_strength = {
            t: (gf / n) / self.league_avg for t, (gf, _, n) in totals.items()
        }
        self.defense_strength = {
            t: (ga / n) / self.league_avg for t, (_, ga, n) in totals.items()
        }
        self.is_fitted = True
        return self

    def predict_goals(self, game):
        """Return expected ``(home_goals, away_goals)`` for one game.

        Teams not seen during ``fit`` are treated as league-average
        (strength 1.0).
        """
        ht, at = get_value(game, 'home_team'), get_value(game, 'away_team')

        home_att = self.attack_strength.get(ht, 1.0)
        home_def = self.defense_strength.get(ht, 1.0)
        away_att = self.attack_strength.get(at, 1.0)
        away_def = self.defense_strength.get(at, 1.0)

        home_goals = self.league_avg * home_att * away_def * self.home_multiplier
        away_goals = self.league_avg * away_att * home_def * self.away_multiplier

        return home_goals, away_goals

    def get_summary(self):
        """Return the model name and the league home/away scoring ratio."""
        return {'model': 'PoissonBaseline', 'home_factor': round(self.home_factor, 3)}


# Confirms that the cell defining the helpers and baseline classes finished running.
print("All baseline model classes loaded.")

## 2. Load Data

In [None]:
# Try multiple possible data paths; the first one that exists wins.
data_paths = [
    '../data/hockey_data.csv',
    '../../data/hockey_data.csv',
    'hockey_data.csv',
]

games_df = None
for path in data_paths:
    if os.path.exists(path):
        games_df = pd.read_csv(path)
        print(f"Loaded data from: {path}")
        break

if games_df is None:
    print("No data file found. Creating synthetic data for demonstration...")

    # Generate realistic synthetic hockey data (seeded, so it is reproducible)
    np.random.seed(42)
    n_games = 500
    teams = ['Team A', 'Team B', 'Team C', 'Team D', 'Team E', 'Team F', 'Team G', 'Team H']

    # Team strength (affects goals scored)
    team_strength = {t: np.random.uniform(0.8, 1.2) for t in teams}

    games = []
    for i in range(n_games):
        home, away = np.random.choice(teams, 2, replace=False)

        # Base goals with team strength and home advantage
        home_lambda = 3.0 * team_strength[home] * 1.1  # Home advantage
        away_lambda = 3.0 * team_strength[away] * 0.9

        home_goals = np.random.poisson(home_lambda)
        away_goals = np.random.poisson(away_lambda)

        games.append({
            # Three games per calendar day.
            'game_date': pd.Timestamp('2025-10-01') + pd.Timedelta(days=i//3),
            'home_team': home,
            'away_team': away,
            'home_goals': home_goals,
            'away_goals': away_goals
        })

    games_df = pd.DataFrame(games)
    print(f"Generated {len(games_df)} synthetic games")

# Sort by date. The chronological train/test split and the history-based
# baselines all rely on row order, so this has to be a true date ordering.
date_col = get_column(games_df, 'game_date')
if date_col:
    # Dates read from CSV are plain strings; sorting those is only
    # chronological for ISO-style formats, so parse them first.
    try:
        games_df[date_col] = pd.to_datetime(games_df[date_col])
    except (ValueError, TypeError) as exc:
        print(f"Warning: could not parse '{date_col}' as dates ({exc}); sorting raw values instead.")
    # mergesort is stable: games sharing a date keep their original order
    # instead of being shuffled arbitrarily.
    games_df = games_df.sort_values(date_col, kind='mergesort').reset_index(drop=True)

print(f"\nDataset: {len(games_df)} games")
print(f"Columns: {list(games_df.columns)}")
games_df.head()

In [None]:
# Chronological 80/20 split: the earliest 80% of rows train the models and
# the remaining, most recent 20% are held out for testing.
split_idx = int(len(games_df) * 0.8)
train_df, test_df = (
    games_df.iloc[:split_idx].copy(),
    games_df.iloc[split_idx:].copy(),
)

print(f"Training set: {len(train_df)} games")
print(f"Test set: {len(test_df)} games")

## 3. Train All Baselines

In [None]:
# Candidate baselines, keyed by the label shown in the comparison tables.
models = dict([
    ('GlobalMean', GlobalMeanBaseline()),
    ('TeamMean', TeamMeanBaseline()),
    ('HomeAway', HomeAwayBaseline()),
    ('MovingAvg_3', MovingAverageBaseline({'window': 3})),
    ('MovingAvg_5', MovingAverageBaseline({'window': 5})),
    ('MovingAvg_10', MovingAverageBaseline({'window': 10})),
    ('Weighted_0.85', WeightedHistoryBaseline({'decay': 0.85})),
    ('Weighted_0.90', WeightedHistoryBaseline({'decay': 0.90})),
    ('Weighted_0.95', WeightedHistoryBaseline({'decay': 0.95})),
    ('Poisson', PoissonBaseline()),
])

# Fit every candidate on the training split only.
print("Training baseline models...\n")
for name, model in models.items():
    model.fit(train_df)
    print(f"  {name}: trained")

print("\nAll models trained.")

## 4. Compare Performance

In [None]:
# Score every fitted model on the held-out test split.
results = []

for name, model in models.items():
    metrics = model.evaluate(test_df)
    row = {'model': name}
    row.update((key, metrics[key]) for key in ('rmse', 'mae', 'r2', 'combined_rmse'))
    results.append(row)

# Best (lowest home-goal RMSE) model first.
results_df = pd.DataFrame(results).sort_values(by='rmse')
print("Model Comparison (sorted by RMSE):")
print("="*60)
results_df

In [None]:
# Visualize comparison: one horizontal bar chart per metric, with the
# best-scoring model highlighted in green.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (metric column, x-axis label, panel title, whether larger values are better)
panels = [
    ('rmse', 'RMSE', 'RMSE by Model (lower is better)', False),
    ('mae', 'MAE', 'MAE by Model (lower is better)', False),
    ('r2', 'R-squared', 'R-squared by Model (higher is better)', True),
]

for ax, (metric, xlabel, title, higher_is_better) in zip(axes, panels):
    values = results_df[metric]
    best_value = values.max() if higher_is_better else values.min()
    colors = ['green' if x == best_value else 'steelblue' for x in values]
    ax.barh(results_df['model'], values, color=colors)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    # Put the first row of results_df at the top of the chart.
    ax.invert_yaxis()

plt.tight_layout()
plt.show()

## 5. Analyze Best Baseline

In [None]:
# results_df is sorted by RMSE, so its first row is the best model.
best_metrics = results_df.iloc[0]
best_name = best_metrics['model']
best_model = models[best_name]

print(f"Best Baseline Model: {best_name}")
print("=" * 40)
for label, key in (("RMSE: ", 'rmse'), ("MAE:  ", 'mae'), ("R2:   ", 'r2')):
    print(f"{label}{best_metrics[key]:.4f}")
print(f"\nModel Details:")
print(best_model.get_summary())

In [None]:
# Per-game predictions of the best model on the test split, alongside the
# actual scores and the absolute home-goal error.
predictions = []
for _, game in test_df.iterrows():
    home_pred, away_pred = best_model.predict_goals(game)
    home_actual = get_value(game, 'home_goals')
    record = {
        'home_team': get_value(game, 'home_team'),
        'away_team': get_value(game, 'away_team'),
        'home_goals_actual': home_actual,
        'away_goals_actual': get_value(game, 'away_goals'),
        'home_goals_pred': round(home_pred, 2),
        'away_goals_pred': round(away_pred, 2),
        'home_error': round(abs(home_actual - home_pred), 2),
    }
    predictions.append(record)

pred_df = pd.DataFrame(predictions)
print("Sample Predictions:")
pred_df.head(10)

In [None]:
# Prediction error distribution (home goals only)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
hist_ax, scatter_ax = axes[0], axes[1]

# Left panel: histogram of signed errors, with a marker at zero error.
errors = pred_df['home_goals_actual'] - pred_df['home_goals_pred']
ax = hist_ax
ax.hist(errors, bins=20, edgecolor='black', alpha=0.7)
ax.axvline(0, color='red', linestyle='--', linewidth=2)
ax.set(
    xlabel='Prediction Error (Actual - Predicted)',
    ylabel='Frequency',
    title='Distribution of Prediction Errors',
)

# Right panel: predicted vs actual, with the y = x reference line.
ax = scatter_ax
ax.scatter(pred_df['home_goals_pred'], pred_df['home_goals_actual'], alpha=0.5)
ax.plot([0, 8], [0, 8], 'r--', linewidth=2, label='Perfect prediction')
ax.set(
    xlabel='Predicted Goals',
    ylabel='Actual Goals',
    title='Predicted vs Actual Home Goals',
)
ax.legend()

plt.tight_layout()
plt.show()

## 6. Hyperparameter Search

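For the baselines that have hyperparameters (the moving-average window and the weighted-history decay), search for the value that minimizes RMSE. Note that the searches below score each candidate on the test set, so the tuned results are optimistic; use a separate validation split if an unbiased estimate is needed.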

In [None]:
# Grid search for MovingAverage window: try every size from 1 to 20.
window_results = []

for window in range(1, 21):
    model = MovingAverageBaseline({'window': window})
    model.fit(train_df)
    metrics = model.evaluate(test_df)
    window_results.append(
        {'window': window, 'rmse': metrics['rmse'], 'mae': metrics['mae']}
    )

window_df = pd.DataFrame(window_results)
# idxmin returns the first row with the lowest RMSE.
best_row_label = window_df['rmse'].idxmin()
best_window = window_df.loc[best_row_label, 'window']

plt.figure(figsize=(10, 5))
plt.plot(window_df['window'], window_df['rmse'], 'o-', linewidth=2)
plt.axvline(
    best_window,
    color='red',
    linestyle='--',
    label=f'Best: window={int(best_window)}',
)
plt.xlabel('Window Size')
plt.ylabel('RMSE')
plt.title('MovingAverage: Window Size vs RMSE')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"Best window size: {int(best_window)}")

In [None]:
# Grid search for WeightedHistory decay
decay_results = []

# Build the grid with linspace rather than arange: with a floating-point step,
# arange(0.70, 1.00, 0.02) can overshoot and emit an unintended extra value
# at ~1.0. The grid is 0.70, 0.72, ..., 0.98 (15 values), rounded so that the
# value fed to the model is exactly the value recorded in the table.
decay_grid = np.round(np.linspace(0.70, 0.98, 15), 2)

for decay in decay_grid:
    decay = float(decay)
    model = WeightedHistoryBaseline({'decay': decay})
    model.fit(train_df)
    metrics = model.evaluate(test_df)
    decay_results.append({
        'decay': decay,
        'rmse': metrics['rmse'],
        'mae': metrics['mae']
    })

decay_df = pd.DataFrame(decay_results)
# idxmin returns the first row with the lowest RMSE.
best_decay = decay_df.loc[decay_df['rmse'].idxmin(), 'decay']

plt.figure(figsize=(10, 5))
plt.plot(decay_df['decay'], decay_df['rmse'], 'o-', linewidth=2)
plt.axvline(best_decay, color='red', linestyle='--', label=f'Best: decay={best_decay}')
plt.xlabel('Decay Factor')
plt.ylabel('RMSE')
plt.title('WeightedHistory: Decay Factor vs RMSE')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"Best decay factor: {best_decay}")

## 7. Final Evaluation

Compare best tuned baselines against the simple GlobalMean reference.

In [None]:
# Final comparison with tuned hyperparameters: one model per baseline
# family, each refitted on the training split and scored on the test split.
final_models = {
    'GlobalMean (reference)': GlobalMeanBaseline(),
    'TeamMean': TeamMeanBaseline(),
    'HomeAway': HomeAwayBaseline(),
    f'MovingAvg (window={int(best_window)})': MovingAverageBaseline({'window': int(best_window)}),
    f'Weighted (decay={best_decay})': WeightedHistoryBaseline({'decay': best_decay}),
    'Poisson': PoissonBaseline()
}

final_results = []
for name, model in final_models.items():
    metrics = model.fit(train_df).evaluate(test_df)
    final_results.append({
        'model': name,
        **{key: round(metrics[key], 4) for key in ('rmse', 'mae', 'r2')},
    })

final_df = pd.DataFrame(final_results).sort_values(by='rmse')

# Calculate improvement over baseline: percentage RMSE reduction relative
# to the GlobalMean reference row.
is_reference = final_df['model'].str.contains('GlobalMean')
baseline_rmse = final_df.loc[is_reference, 'rmse'].values[0]
relative_gain = (baseline_rmse - final_df['rmse']) / baseline_rmse * 100
final_df['improvement'] = relative_gain.round(1)

print("Final Model Comparison:")
print("="*70)
final_df

In [None]:
# final_df is sorted by RMSE, so its first row is the overall best baseline.
top_row = final_df.iloc[0]
overall_best_name = top_row['model']
overall_best_rmse = top_row['rmse']
overall_best_improvement = top_row['improvement']

print(f"\nBest Baseline Model: {overall_best_name}")
print(f"RMSE: {overall_best_rmse}")
print(f"Improvement over GlobalMean: {overall_best_improvement}%")

## 8. Save Results

In [None]:
# Save comparison results
output_path = '../data/model1_baseline_results.csv'
# The data directory may not exist yet (for example when the notebook ran on
# synthetic data because no data file was found), so create it first.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
final_df.to_csv(output_path, index=False)
print(f"Results saved to: {output_path}")

# Save best configuration. Values are cast to built-in types because
# the json module cannot serialize NumPy scalars.
best_config = {
    'model': 'baseline',
    'best_variant': overall_best_name,
    'metrics': {
        'rmse': float(overall_best_rmse),
        'improvement_pct': float(overall_best_improvement)
    },
    'hyperparameters': {
        'best_moving_avg_window': int(best_window),
        'best_weighted_decay': float(best_decay)
    }
}

config_path = '../data/best_baseline_config.json'
os.makedirs(os.path.dirname(config_path), exist_ok=True)
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(best_config, f, indent=2)
print(f"Best config saved to: {config_path}")

In [None]:
# Final summary: restate the winning baseline and list the files written.
banner = "=" * 60
print("\n" + banner)
print("BASELINE MODEL TRAINING COMPLETE")
print(banner)
print(f"\nBest Model: {overall_best_name}")
print(f"RMSE: {overall_best_rmse}")
print(f"\nThis baseline RMSE ({overall_best_rmse:.3f}) is the benchmark.")
print("More complex models (ELO, XGBoost, etc.) should beat this.")
print("\nFiles saved:")
for saved_path in (output_path, config_path):
    print(f"  - {saved_path}")