# Training Linear Regression Models

This notebook trains and tunes linear regression models for hockey goal prediction.

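**Workflow:**
1. Load the hyperparameter configuration and prepare the data
2. Grid search for optimal hyperparameters
3. Random search for broader exploration
4. Compare regularization types (Ridge/Lasso/ElasticNet)
5. Train final model with best parameters
6. Evaluate on the held-out test set and analyze feature importance
7. Save the model and results, then reload the model to verify it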

In [None]:
# Imports
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yaml
from pathlib import Path
from sklearn.model_selection import train_test_split

from utils.linear_model import (
    LinearRegressionModel,
    LinearGoalPredictor,
    grid_search_linear,
    random_search_linear,
    compare_regularization
)

## 1. Load Configuration

In [None]:
# Load hyperparameter config
# Built-in search space and single-model defaults, used whenever the YAML
# file (or one of its sections) is missing or empty.
FALLBACK_HYPERPARAMS = {
    'alpha': [0.001, 0.01, 0.1, 1.0, 10.0],
    'l1_ratio': [0.0, 0.5, 1.0],
    'poly_degree': [1, 2],
    'scaling': ['standard', 'robust'],
}
FALLBACK_DEFAULTS = {'alpha': 1.0, 'l1_ratio': 0.5, 'poly_degree': 1, 'scaling': 'standard'}


def resolve_tuning_config(config):
    """Return ``(hyperparams, defaults)`` for a parsed config mapping.

    Args:
        config: The object produced by parsing the YAML file, or ``None``.
            An empty YAML document parses to ``None``, which is treated
            like an empty mapping.

    Returns:
        A 2-tuple: the ``'hyperparameters'`` section and the ``'defaults'``
        section. A section that is absent or empty is replaced by a fresh
        copy of the built-in fallback, so callers may mutate the result
        without touching the module-level constants.
    """
    config = config or {}
    search_space = config.get('hyperparameters') or {
        name: list(values) for name, values in FALLBACK_HYPERPARAMS.items()
    }
    single_defaults = config.get('defaults') or dict(FALLBACK_DEFAULTS)
    return search_space, single_defaults


config_path = Path('../../config/hyperparams/model2_linear_regression.yaml')

if config_path.exists():
    # Explicit encoding: do not depend on the platform's default codec.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f) or {}
    print("Loaded configuration:")
    # .get() keeps a config without these informational keys usable.
    print(f"  Model: {config.get('model_name', 'unknown')}")
    print(f"  Description: {config.get('description', 'n/a')}")
    hyperparams, defaults = resolve_tuning_config(config)
else:
    print("Config not found, using defaults")
    hyperparams, defaults = resolve_tuning_config(None)

## 2. Prepare Training Data

In [None]:
# Synthetic demo dataset (replace this cell with real data loading).
# The global seed plus the ORDER of the sampler calls below determine every
# value, so the specs are listed in the exact order they must be drawn.
np.random.seed(42)
n_games = 1000

# (column name, sampler, positional arguments that precede `size`)
feature_specs = [
    ('home_elo', np.random.normal, (1500, 100)),
    ('away_elo', np.random.normal, (1500, 100)),
    ('home_recent_form', np.random.uniform, (0, 1)),
    ('away_recent_form', np.random.uniform, (0, 1)),
    ('home_rest_days', np.random.choice, ([1, 2, 3, 4, 5],)),
    ('away_rest_days', np.random.choice, ([1, 2, 3, 4, 5],)),
    ('home_avg_goals', np.random.normal, (3.0, 0.5)),
    ('away_avg_goals', np.random.normal, (2.8, 0.5)),
    ('home_avg_against', np.random.normal, (2.7, 0.5)),
    ('away_avg_against', np.random.normal, (2.9, 0.5)),
    ('home_pp_pct', np.random.uniform, (0.15, 0.30)),
    ('away_pp_pct', np.random.uniform, (0.15, 0.30)),
    ('home_pk_pct', np.random.uniform, (0.75, 0.90)),
    ('away_pk_pct', np.random.uniform, (0.75, 0.90)),
]
data = pd.DataFrame({
    column: sampler(*args, n_games) for column, sampler, args in feature_specs
})


def _goal_signal(team, opponent):
    """Noise-free scoring signal for `team` ('home' or 'away') vs `opponent`.

    Combines the Elo gap, the team's recent form, its scoring average, the
    opponent's scoring average and the team's power-play percentage.
    """
    return (
        0.5 * (data[f'{team}_elo'] - data[f'{opponent}_elo']) / 100 +
        0.3 * data[f'{team}_recent_form'] +
        0.5 * data[f'{team}_avg_goals'] -
        0.2 * data[f'{opponent}_avg_goals'] +
        0.3 * data[f'{team}_pp_pct'] * 10
    )


home_base = _goal_signal('home', 'away')
away_base = _goal_signal('away', 'home')

# Targets: intercept + signal + unit Gaussian noise, rounded to whole goals
# and floored at zero. Home noise is drawn before away noise.
home_noise = np.random.normal(0, 1, n_games)
data['home_goals'] = (2.8 + home_base + home_noise).round().clip(lower=0).astype(int)
away_noise = np.random.normal(0, 1, n_games)
data['away_goals'] = (2.6 + away_base + away_noise).round().clip(lower=0).astype(int)

n_targets = 2  # home_goals and away_goals are not features
print(f"Dataset: {len(data)} games, {data.shape[1] - n_targets} features")
print("\nTarget statistics:")
for label, column in (('Home', 'home_goals'), ('Away', 'away_goals')):
    goals = data[column]
    print(f"  {label} goals: mean={goals.mean():.2f}, std={goals.std():.2f}")

In [None]:
# Split 70 / 15 / 15: hold out 30% of the games first, then cut that
# hold-out in half for validation and test (fixed seeds for repeatability).
train_df, temp_df = train_test_split(data, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Training:   {len(train_df)} games")
print(f"Validation: {len(val_df)} games")
print(f"Test:       {len(test_df)} games")

# Every column except the two goal targets is a model input.
target_cols = ('home_goals', 'away_goals')
feature_cols = [name for name in data.columns if name not in target_cols]

X_train, y_home_train, y_away_train = (
    train_df[feature_cols],
    train_df['home_goals'],
    train_df['away_goals'],
)
X_val, y_home_val, y_away_val = (
    val_df[feature_cols],
    val_df['home_goals'],
    val_df['away_goals'],
)

## 3. Grid Search Hyperparameter Tuning

In [None]:
# Build the search grid from the loaded hyperparameters; any key that is
# absent falls back to the candidate list given here.
grid_fallbacks = {
    'alpha': [0.001, 0.01, 0.1, 1.0, 10.0],
    'l1_ratio': [0.0, 0.5, 1.0],
    'poly_degree': [1, 2],
    'scaling': ['standard', 'robust'],
}
param_grid = {
    name: hyperparams.get(name, fallback)
    for name, fallback in grid_fallbacks.items()
}

# Grid size is the product of the number of candidates per parameter.
total = int(np.prod([len(candidates) for candidates in param_grid.values()]))
print(f"Total grid search combinations: {total}")

In [None]:
# Search param_grid against the home-goals target (cv=5).
print("Grid Search for Home Goals...")
home_search_options = {'param_grid': param_grid, 'cv': 5, 'verbose': True}
home_grid_results = grid_search_linear(X_train, y_home_train, **home_search_options)

# Report the winning configuration and the score stored alongside it.
home_best_params = home_grid_results['best_params']
home_best_rmse = home_grid_results['best_score']
print("\nBest Home Goals Parameters:")
print(f"  {home_best_params}")
print(f"  RMSE: {home_best_rmse:.4f}")

In [None]:
# Same grid as the home search, fitted against the away-goals target (cv=5).
print("Grid Search for Away Goals...")
away_search_options = {'param_grid': param_grid, 'cv': 5, 'verbose': True}
away_grid_results = grid_search_linear(X_train, y_away_train, **away_search_options)

# Report the winning configuration and the score stored alongside it.
away_best_params = away_grid_results['best_params']
away_best_rmse = away_grid_results['best_score']
print("\nBest Away Goals Parameters:")
print(f"  {away_best_params}")
print(f"  RMSE: {away_best_rmse:.4f}")

In [None]:
# Show the first ten rows of the home-goals grid-search results table.
# NOTE(review): "top" assumes the helper returns rows sorted best-first — confirm.
print("Top 10 Home Goals Configurations:")
top_home_configs = home_grid_results['all_results'].head(10)
top_home_configs

## 4. Random Search (Broader Exploration)

In [None]:
# Random search: a denser alpha / l1_ratio lattice than the grid search,
# plus max_iter as an extra candidate dimension.
param_distributions = {
    'alpha': [
        1e-4, 5e-4,
        1e-3, 5e-3,
        1e-2, 5e-2,
        1e-1, 5e-1,
        1.0, 5.0,
        10.0, 50.0,
    ],
    # 0.0, 0.1, ..., 1.0
    'l1_ratio': [step / 10 for step in range(11)],
    'poly_degree': [1, 2],
    'scaling': ['standard', 'robust'],
    'max_iter': [1000, 5000],
}

print("Random Search for Home Goals (100 iterations)...")
home_random_results = random_search_linear(
    X_train,
    y_home_train,
    param_distributions=param_distributions,
    n_iter=100,
    cv=5,
    verbose=True,
)

random_best_params = home_random_results['best_params']
random_best_rmse = home_random_results['best_score']
print("\nBest Random Search Parameters:")
print(f"  {random_best_params}")
print(f"  RMSE: {random_best_rmse:.4f}")

In [None]:
# Put the two home-goals searches side by side and keep the better one.
# NOTE(review): only the home-goals results are consulted here; the
# away-goals grid search does not influence best_params.
grid_rmse = home_grid_results['best_score']
random_rmse = home_random_results['best_score']

print("Comparison:")
print(f"  Grid Search Best:   RMSE = {grid_rmse:.4f}")
print(f"  Random Search Best: RMSE = {random_rmse:.4f}")

# Strictly lower RMSE wins, so a tie keeps the grid-search parameters.
random_wins = random_rmse < grid_rmse
winning_results = home_random_results if random_wins else home_grid_results
winning_label = "Random" if random_wins else "Grid"

best_params = winning_results['best_params']
print(f"\n→ Using {winning_label} Search parameters")

print(f"Best params: {best_params}")

## 5. Regularization Analysis

In [None]:
# Score the regularization variants over a log-spaced alpha sweep (cv=5),
# using the home-goals target.
regularization_alphas = [1e-3, 1e-2, 1e-1, 1e0, 1e1]
comparison = compare_regularization(
    X_train,
    y_home_train,
    alphas=regularization_alphas,
    cv=5,
)

# Bare expression so the notebook renders the result.
comparison

In [None]:
# One line per model type: rmse_mean against alpha, with rmse_std drawn as
# error bars. Alpha spans several orders of magnitude, hence the log x-axis.
fig, ax = plt.subplots(figsize=(10, 6))

for model_name in ('Ridge', 'Lasso', 'ElasticNet'):
    rows = comparison.loc[comparison['model'] == model_name]
    ax.errorbar(
        rows['alpha'],
        rows['rmse_mean'],
        yerr=rows['rmse_std'],
        marker='o',
        label=model_name,
        capsize=3,
    )

ax.set(
    xscale='log',
    xlabel='Alpha (Regularization Strength)',
    ylabel='RMSE (5-fold CV)',
    title='Regularization Comparison',
)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 6. Train Final Models

In [None]:
# Build the final predictor from best_params. The values below are used
# only for settings that best_params does not contain.
final_fallbacks = {
    'alpha': 0.1,
    'l1_ratio': 0.5,
    'scaling': 'standard',
    'poly_degree': 1,
    'max_iter': 1000,
}
final_settings = {
    name: best_params.get(name, fallback)
    for name, fallback in final_fallbacks.items()
}
final_predictor = LinearGoalPredictor(**final_settings)

# Fit on train + validation rows; the test split is not part of this frame.
full_train = pd.concat([train_df, val_df])
final_predictor.fit(full_train)

print("Final predictor trained!")
print(final_predictor)

In [None]:
# Score the final predictor on the test split and print one RMSE / MAE / R²
# section per target, followed by the win-prediction accuracy.
test_metrics = final_predictor.evaluate(test_df)

banner = "=" * 50
print(banner)
print("FINAL TEST SET RESULTS")
print(banner)

metric_sections = (
    ("Home Goals Prediction", 'home'),
    ("Away Goals Prediction", 'away'),
    ("Combined Metrics", 'combined'),
)
for section_title, section_key in metric_sections:
    section = test_metrics[section_key]
    print(f"\n{section_title}:")
    print(f"  RMSE: {section['rmse']:.4f}")
    print(f"  MAE:  {section['mae']:.4f}")
    print(f"  R²:   {section['r2']:.4f}")

print(f"\nWin Prediction Accuracy: {test_metrics['win_accuracy']:.2%}")

## 7. Feature Importance Analysis

In [None]:
# Horizontal bar chart of what get_feature_importance returns for the
# 'combined' target, limited to top_n=15 entries.
importance = final_predictor.get_feature_importance(target='combined', top_n=15)

fig, ax = plt.subplots(figsize=(10, 8))
importance.plot(kind='barh', ax=ax)
ax.set(
    xlabel='Importance (|coefficient|)',
    title='Top 15 Feature Importances',
)
fig.tight_layout()
plt.show()

In [None]:
# Coefficient analysis for home model: fetch whatever get_coefficients
# returns for target='home', print a heading, and leave the result as the
# cell's last expression so the notebook renders it.
home_coefs = final_predictor.get_coefficients(target='home')
print("Home Goals Model Coefficients:")
home_coefs

## 8. Prediction Examples

In [None]:
# Predict the test split, then attach the true goals and the signed error
# (prediction minus actual) for each side.
predictions = final_predictor.predict_batch(test_df)

sides = ('home', 'away')
for side in sides:
    # Raw arrays, so values are assigned by position rather than by index label.
    predictions[f'{side}_actual'] = test_df[f'{side}_goals'].to_numpy()
for side in sides:
    predictions[f'{side}_error'] = predictions[f'{side}_pred'] - predictions[f'{side}_actual']

print("Sample Predictions:")
predictions.head(10)

In [None]:
# Side-by-side histograms of the signed prediction errors; the dashed red
# line marks zero error.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

error_panels = (
    ('home_error', 'Home Goals Prediction Error'),
    ('away_error', 'Away Goals Prediction Error'),
)
for panel, (error_col, panel_title) in zip(axes, error_panels):
    panel.hist(predictions[error_col], bins=20, edgecolor='black', alpha=0.7)
    panel.axvline(x=0, color='red', linestyle='--')
    panel.set_xlabel('Prediction Error')
    panel.set_ylabel('Frequency')
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Predicted vs actual goals, one panel per side, with a y = x reference line.
# NOTE(review): the reference line is hard-coded to span 0–10 regardless of
# the data range.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

scatter_panels = (
    ('home', 'Home'),
    ('away', 'Away'),
)
for panel, (side, side_label) in zip(axes, scatter_panels):
    panel.scatter(predictions[f'{side}_actual'], predictions[f'{side}_pred'], alpha=0.5)
    panel.plot([0, 10], [0, 10], 'r--', label='Perfect prediction')
    panel.set_xlabel(f'Actual {side_label} Goals')
    panel.set_ylabel(f'Predicted {side_label} Goals')
    panel.set_title(f'{side_label} Goals: Predicted vs Actual')
    panel.legend()

plt.tight_layout()
plt.show()

## 9. Save Models and Results

In [None]:
# Make sure the output directory exists (parents included; no error if it
# is already there), then save the trained predictor inside it.
output_dir = Path('../models/saved/linear_regression')
output_dir.mkdir(parents=True, exist_ok=True)

model_path = output_dir / 'linear_predictor'
final_predictor.save(model_path)
print(f"Model saved to {model_path}")

In [None]:
# Write the full results table of each home-goals search to CSV.
# NOTE(review): the away-goals grid search results are not written here.
search_exports = (
    (home_grid_results, 'home_grid_search_results.csv'),
    (home_random_results, 'home_random_search_results.csv'),
)
for search_results, csv_name in search_exports:
    search_results['all_results'].to_csv(output_dir / csv_name, index=False)

print("Search results saved!")

In [None]:
# Save training summary
import json


def _json_fallback(value):
    """Convert objects the ``json`` encoder rejects into JSON-friendly values.

    NumPy scalars become the matching Python number/bool and NumPy arrays
    become nested lists, so numeric results stay numeric in the file
    instead of being written as quoted strings. Anything else (for example
    a ``Path``) is stringified, which is what ``default=str`` did before.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return str(value)


summary = {
    'model_name': 'LinearGoalPredictor',
    'best_params': best_params,
    'test_metrics': test_metrics,
    'training_samples': len(full_train),
    'test_samples': len(test_df),
    'n_features': len(feature_cols),
}

# Serialize once so the file and the printed copy cannot drift apart.
summary_json = json.dumps(summary, indent=2, default=_json_fallback)
with open(output_dir / 'training_summary.json', 'w', encoding='utf-8') as f:
    f.write(summary_json)

print("\nTraining Summary:")
print(summary_json)

## 10. Load and Verify Saved Model

In [None]:
# Reload the predictor from disk and re-evaluate it on the test split.
loaded = LinearGoalPredictor.load(output_dir / 'linear_predictor')
loaded_metrics = loaded.evaluate(test_df)

# Round-trip check: combined RMSE before saving vs after reloading must
# agree to within 1e-4.
original_rmse = test_metrics['combined']['rmse']
reloaded_rmse = loaded_metrics['combined']['rmse']
rmse_gap = abs(original_rmse - reloaded_rmse)

print("Verification:")
print(f"  Original RMSE:  {original_rmse:.6f}")
print(f"  Loaded RMSE:    {reloaded_rmse:.6f}")
print(f"  Match: {rmse_gap < 0.0001}")

## Summary

This notebook:
1. ✅ Loaded configuration from YAML
2. ✅ Prepared training/validation/test splits
3. ✅ Ran grid search for optimal hyperparameters
4. ✅ Ran random search for broader exploration
5. ✅ Compared regularization types (Ridge/Lasso/ElasticNet)
6. ✅ Trained final model with best parameters
7. ✅ Evaluated on held-out test set
8. ✅ Analyzed feature importance
9. ✅ Saved model and results
10. ✅ Reloaded the saved model and verified its test metrics

### Next Steps:
- Train on real hockey data
- Compare with baseline and ELO models
- Combine in ensemble model