# Model 4: Random Forest - Training and Hyperparameter Tuning

This notebook trains and tunes Random Forest models for hockey goal prediction.

## Table of Contents

1. Setup and Imports
2. Load or Generate Data
3. Baseline Random Forest Model
4. Random Search Hyperparameter Tuning
5. Grid Search (Fine-tuning)
6. Cross-Validation Analysis
7. Feature Importance
8. Final Model Evaluation
9. Save Best Model

## Random Forest vs XGBoost

- **Random Forest**: Bagging ensemble, parallel trees, less prone to overfitting
- **XGBoost**: Boosting ensemble, sequential trees, often higher accuracy but needs more tuning
- We use both in Model 4 to compare performance

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
import sys
import json
import yaml
import pickle
import pathlib
from datetime import datetime
import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Locate the python/ folder relative to wherever the kernel was started.
# Each layout is a (lazy check, resulting root) pair; the checks run in order
# and the first one that matches wins.
_cwd = pathlib.Path(os.path.abspath('')).resolve()
_layouts = (
    (lambda: (_cwd / 'python').is_dir(), _cwd / 'python'),
    (lambda: _cwd.name == 'random_forest' and (_cwd.parent.parent / 'data').is_dir(),
     _cwd.parent.parent),
    (lambda: _cwd.name == 'training' and (_cwd.parent / 'data').is_dir(),
     _cwd.parent),
    (lambda: (_cwd / 'data').is_dir(), _cwd),
)
_python_dir = next((_root for _matches, _root in _layouts if _matches()), None)
if _python_dir is None:
    raise RuntimeError(f'Cannot locate python/ directory from {_cwd}')

# Make the located folder both the working directory and the first import root.
os.chdir(_python_dir)
sys.path.insert(0, str(_python_dir))

# Plot defaults shared by every figure in the notebook.
plt.rcParams.update({'figure.figsize': (12, 6), 'font.size': 11})
sns.set_style('whitegrid')

print(f"CWD: {os.getcwd()}")
print(f"Scikit-learn RandomForest ready")
print("Setup complete.")

In [None]:
# Load the optional hyperparameter configuration. The path is relative to the
# working directory chosen by the setup cell. `config` ends up as a dict when a
# usable file is found and as None otherwise.
config_path = '../config/hyperparams/model4_random_forest.yaml'

config = None
if os.path.exists(config_path):
    with open(config_path, 'r') as f:
        # safe_load returns None for an empty document, so validate before indexing.
        config = yaml.safe_load(f)

    if isinstance(config, dict):
        # .get keeps a config without these informational keys from aborting the run.
        print(f"Loaded config: {config.get('model_name', '<unnamed>')}")
        print(f"Description: {config.get('description', '<no description>')}")
    else:
        print(f"Config at {config_path} is empty or not a mapping, using defaults")
        config = None
else:
    print(f"Config not found at {config_path}, using defaults")

## 2. Load or Generate Data

In [None]:
# Try to load real data, otherwise generate synthetic
# Either branch leaves a DataFrame named `data`; the summary prints at the end
# read its 'home_goals' and 'away_goals' columns.
data_path = 'data/hockey_features.csv'

if os.path.exists(data_path):
    data = pd.read_csv(data_path)
    print(f"Loaded {len(data)} games from {data_path}")
else:
    print("Generating synthetic hockey data for demonstration...")
    
    # The global NumPy seed is set only in this synthetic branch. The draws
    # below consume the global stream in order, so reordering them changes the
    # generated data.
    np.random.seed(42)
    n_games = 2000
    
    data = pd.DataFrame({
        # Team strength metrics
        'home_win_pct': np.random.uniform(0.3, 0.7, n_games),
        'away_win_pct': np.random.uniform(0.3, 0.7, n_games),
        'home_points_pct': np.random.uniform(0.4, 0.8, n_games),
        'away_points_pct': np.random.uniform(0.4, 0.8, n_games),
        
        # Offensive metrics
        'home_goals_avg': np.random.uniform(2.5, 3.8, n_games),
        'away_goals_avg': np.random.uniform(2.5, 3.8, n_games),
        'home_shots_avg': np.random.uniform(28, 35, n_games),
        'away_shots_avg': np.random.uniform(28, 35, n_games),
        
        # Defensive metrics
        'home_goals_against_avg': np.random.uniform(2.2, 3.5, n_games),
        'away_goals_against_avg': np.random.uniform(2.2, 3.5, n_games),
        'home_save_pct': np.random.uniform(0.88, 0.93, n_games),
        'away_save_pct': np.random.uniform(0.88, 0.93, n_games),
        
        # Special teams
        'home_pp_pct': np.random.uniform(0.15, 0.28, n_games),
        'away_pp_pct': np.random.uniform(0.15, 0.28, n_games),
        'home_pk_pct': np.random.uniform(0.75, 0.88, n_games),
        'away_pk_pct': np.random.uniform(0.75, 0.88, n_games),
        
        # Context
        # randint's upper bound is exclusive: rest days are drawn from 1..4.
        'home_rest_days': np.random.randint(1, 5, n_games),
        'away_rest_days': np.random.randint(1, 5, n_games),
        'home_b2b': np.random.binomial(1, 0.15, n_games),
        'away_b2b': np.random.binomial(1, 0.15, n_games),
        
        # Recent form (last 5 games)
        'home_goals_last5': np.random.uniform(2.0, 4.0, n_games),
        'away_goals_last5': np.random.uniform(2.0, 4.0, n_games),
        # Exclusive upper bound again: wins are drawn from 0..5.
        'home_wins_last5': np.random.randint(0, 6, n_games),
        'away_wins_last5': np.random.randint(0, 6, n_games),
    })
    
    # Generate realistic goal totals
    # Constant bonus that is added to the home formula only.
    home_advantage = 0.35
    
    # Home goals: weighted own scoring, opponent's defence, power play, the home
    # bonus, a rest-day differential term and Gaussian noise; the result is
    # rounded, clipped to the 0..9 range and cast to int.
    data['home_goals'] = np.round(
        data['home_goals_avg'] * 0.3 +
        data['home_goals_last5'] * 0.2 +
        (4 - data['away_goals_against_avg']) * 0.3 +
        data['home_pp_pct'] * 3 +
        home_advantage +
        (data['home_rest_days'] - data['away_rest_days']) * 0.1 +
        np.random.normal(0, 0.8, n_games)
    ).clip(0, 9).astype(int)
    
    # Away goals: same structure, but without the home bonus and without the
    # rest-day differential term.
    data['away_goals'] = np.round(
        data['away_goals_avg'] * 0.3 +
        data['away_goals_last5'] * 0.2 +
        (4 - data['home_goals_against_avg']) * 0.3 +
        data['away_pp_pct'] * 3 +
        np.random.normal(0, 0.8, n_games)
    ).clip(0, 9).astype(int)
    
    print(f"Generated {n_games} synthetic games")

# Quick sanity summary of whichever dataset is in use.
print(f"\nDataset shape: {data.shape}")
print(f"Home goals mean: {data['home_goals'].mean():.2f}")
print(f"Away goals mean: {data['away_goals'].mean():.2f}")

In [None]:
# Split the frame into a feature matrix and one target series per side.
# Targets and identifier-style columns are kept out of the feature set; names
# in the exclusion list that are absent from the frame are simply ignored.
target_cols = ['home_goals', 'away_goals']
exclude_cols = target_cols + ['home_team', 'away_team', 'date', 'game_id', 'season']

# Boolean mask over the column index keeps the original column order.
feature_cols = data.columns[~data.columns.isin(exclude_cols)].tolist()
print(f"Features ({len(feature_cols)}): {feature_cols[:10]}...")

X = data.loc[:, feature_cols]
y_home, y_away = data['home_goals'], data['away_goals']

In [None]:
# Two-stage split giving 60/20/20 train/validation/test:
#   stage 1 holds out 20% of all rows as the test set;
#   stage 2 takes 25% of the remaining 80% (20% overall) for validation.
# Both targets are passed through the same call so their rows stay aligned with X.
_stage1 = train_test_split(X, y_home, y_away, test_size=0.2, random_state=42)
(X_trainval, X_test,
 y_home_trainval, y_home_test,
 y_away_trainval, y_away_test) = _stage1

_stage2 = train_test_split(
    X_trainval, y_home_trainval, y_away_trainval, test_size=0.25, random_state=42
)
(X_train, X_val,
 y_home_train, y_home_val,
 y_away_train, y_away_val) = _stage2

print(f"Training set: {len(X_train)} games")
print(f"Validation set: {len(X_val)} games")
print(f"Test set: {len(X_test)} games")

## 3. Baseline Random Forest Model

In [None]:
# Hard-coded baseline hyperparameters. The `config` loaded earlier is not
# consulted here; these values are used as-is for the baseline models.
default_params = dict([
    ('n_estimators', 200),
    ('max_depth', 10),
    ('min_samples_split', 5),
    ('min_samples_leaf', 2),
    ('max_features', 'sqrt'),
    ('bootstrap', True),
    ('random_state', 42),
    ('n_jobs', -1),
])

print("Default Random Forest Parameters:")
for k, v in default_params.items():
    print(f"  {k}: {v}")

In [None]:
# Fit one baseline forest per target on the training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
baseline_home = RandomForestRegressor(**default_params).fit(X_train, y_home_train)
baseline_away = RandomForestRegressor(**default_params).fit(X_train, y_away_train)

print("Baseline models trained!")

In [None]:
# Evaluate baseline on validation set
def evaluate_models(home_model, away_model, X, y_home, y_away):
    """Score a home/away model pair on one feature matrix.

    Args:
        home_model: Fitted estimator whose ``predict(X)`` yields home-goal predictions.
        away_model: Fitted estimator whose ``predict(X)`` yields away-goal predictions.
        X: Feature matrix passed to both models.
        y_home: Actual home goals, aligned with the rows of ``X``.
        y_away: Actual away goals, aligned with the rows of ``X``.

    Returns:
        dict with RMSE, MAE and R² for each side (``home_*``, ``away_*``) and
        for both sides pooled into a single sample (``combined_*``).
    """
    def _rmse(actual, predicted):
        # Root of the mean squared error.
        return np.sqrt(mean_squared_error(actual, predicted))

    pred_home = home_model.predict(X)
    pred_away = away_model.predict(X)

    # Pool both sides so the combined scores treat every prediction equally.
    pooled_pred = np.concatenate([pred_home, pred_away])
    pooled_true = np.concatenate([y_home, y_away])

    return {
        'home_rmse': _rmse(y_home, pred_home),
        'away_rmse': _rmse(y_away, pred_away),
        'home_mae': mean_absolute_error(y_home, pred_home),
        'away_mae': mean_absolute_error(y_away, pred_away),
        'home_r2': r2_score(y_home, pred_home),
        'away_r2': r2_score(y_away, pred_away),
        'combined_rmse': _rmse(pooled_true, pooled_pred),
        'combined_mae': mean_absolute_error(pooled_true, pooled_pred),
        'combined_r2': r2_score(pooled_true, pooled_pred),
    }

# Score the baseline pair on the validation split and print a per-side report.
baseline_metrics = evaluate_models(baseline_home, baseline_away, X_val, y_home_val, y_away_val)

# Assemble the report first and emit it in one go; joining with newlines gives
# the same output as printing each line separately.
_baseline_report = [
    "\n Baseline Validation Performance",
    "=" * 45,
    f"\nHome Goals:",
    f"  RMSE: {baseline_metrics['home_rmse']:.4f}",
    f"  MAE:  {baseline_metrics['home_mae']:.4f}",
    f"  R²:   {baseline_metrics['home_r2']:.4f}",
    f"\nAway Goals:",
    f"  RMSE: {baseline_metrics['away_rmse']:.4f}",
    f"  MAE:  {baseline_metrics['away_mae']:.4f}",
    f"  R²:   {baseline_metrics['away_r2']:.4f}",
    f"\nCombined:",
    f"  RMSE: {baseline_metrics['combined_rmse']:.4f}",
    f"  MAE:  {baseline_metrics['combined_mae']:.4f}",
    f"  R²:   {baseline_metrics['combined_r2']:.4f}",
]
print("\n".join(_baseline_report))

## 4. Random Search Hyperparameter Tuning

In [None]:
# Candidate values per hyperparameter for the random search. Each list is a
# discrete set to pick from; note that `max_depth` and `max_features` mix
# value types (ints with None, strings with floats).
param_distributions = dict([
    ('n_estimators', [100, 200, 300, 500]),
    ('max_depth', [5, 10, 15, 20, None]),
    ('min_samples_split', [2, 5, 10, 20]),
    ('min_samples_leaf', [1, 2, 4, 8]),
    ('max_features', ['sqrt', 'log2', 0.5, 0.8]),
])

print(f"Parameters to tune: {len(param_distributions)}")
for k, v in param_distributions.items():
    print(f"  {k}: {v}")

In [None]:
def _sample_params(param_dist, rng):
    """Pick one candidate per hyperparameter, returning the original Python objects."""
    sampled = {}
    for name, options in param_dist.items():
        options = list(options)
        # Draw an index rather than calling `rng.choice(options)`: choice first
        # converts the list to an array, which turns a mixed list such as
        # ['sqrt', 0.5] into strings ('0.5') that the estimator then rejects.
        sampled[name] = options[rng.randint(len(options))]
    return sampled


def random_search(X_train, y_train, X_val, y_val, param_dist, n_iter=50, random_state=None):
    """
    Perform random search for hyperparameters.

    Args:
        X_train, y_train: Data each candidate forest is fitted on.
        X_val, y_val: Held-out data used to score each candidate.
        param_dist: Mapping of hyperparameter name to a sequence of candidate
            values. Values are passed to the estimator unchanged, so mixed-type
            lists (e.g. ``['sqrt', 0.5]`` or ``[10, None]``) are supported.
        n_iter: Number of random configurations to try.
        random_state: Seed for the configuration sampler. ``None`` (default)
            draws from NumPy's global random state; an int makes the sequence
            of sampled configurations reproducible.

    Returns:
        pandas.DataFrame with one row per successfully evaluated configuration:
        ``iteration``, ``train_rmse``, ``val_rmse``, ``overfit_ratio``
        (train RMSE divided by validation RMSE, or 0 when the latter is 0) and
        the parameters used. Configurations that raise are reported and skipped.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    results = []

    for i in range(n_iter):
        # Sample random parameters, then pin the settings that are not tuned.
        params = _sample_params(param_dist, rng)
        params['random_state'] = 42
        params['n_jobs'] = -1
        params['bootstrap'] = True

        try:
            # Train model
            model = RandomForestRegressor(**params)
            model.fit(X_train, y_train)

            # Evaluate on both splits so over-fitting is visible in the results.
            train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
            val_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))

            results.append({
                'iteration': i + 1,
                'train_rmse': train_rmse,
                'val_rmse': val_rmse,
                'overfit_ratio': train_rmse / val_rmse if val_rmse > 0 else 0,
                **params
            })

            if (i + 1) % 10 == 0:
                print(f"  Iteration {i + 1}/{n_iter}: Val RMSE = {val_rmse:.4f}")

        except Exception as e:
            # Best effort: one bad configuration must not abort the whole search.
            print(f"  Iteration {i + 1} failed: {e}")

    return pd.DataFrame(results)

# Search each target separately on the same train/validation split;
# both runs draw from the same candidate lists.
print("Starting Random Search for Home Goals...")
home_random_results = random_search(
    X_train, y_home_train,
    X_val, y_home_val,
    param_dist=param_distributions,
    n_iter=50,
)

print("\nStarting Random Search for Away Goals...")
away_random_results = random_search(
    X_train, y_away_train,
    X_val, y_away_val,
    param_dist=param_distributions,
    n_iter=50,
)

In [None]:
# For each target, pick the row with the lowest validation RMSE.
best_home_idx, best_away_idx = (
    home_random_results['val_rmse'].idxmin(),
    away_random_results['val_rmse'].idxmin(),
)
best_home_config = home_random_results.loc[best_home_idx]
best_away_config = away_random_results.loc[best_away_idx]

# Report the winning score followed by every tuned parameter
# (iterating the dict yields its keys).
print("\nBest Home Goals Configuration:")
print(f"  Val RMSE: {best_home_config['val_rmse']:.4f}")
for param in param_distributions:
    print(f"  {param}: {best_home_config[param]}")

print("\nBest Away Goals Configuration:")
print(f"  Val RMSE: {best_away_config['val_rmse']:.4f}")
for param in param_distributions:
    print(f"  {param}: {best_away_config[param]}")

## 5. Grid Search (Fine-tuning)

In [None]:
# Fine-tune around best parameters found
fine_tune_params = {
    'n_estimators': [200, 300, 400],
    'max_depth': [8, 10, 12, 15],
    'min_samples_split': [3, 5, 7],
    'min_samples_leaf': [1, 2, 3],
}

from itertools import product

def grid_search(X_train, y_train, X_val, y_val, param_grid):
    """Perform grid search for hyperparameters.

    Args:
        X_train, y_train: Data each candidate forest is fitted on.
        X_val, y_val: Held-out data used to score each candidate.
        param_grid: Mapping of hyperparameter name to a list of values; every
            combination is evaluated. ``random_state`` (42), ``n_jobs`` (-1)
            and ``max_features`` ('sqrt') are filled in only when the grid does
            not supply them, so a grid may tune those settings as well.

    Returns:
        pandas.DataFrame with one row per successfully evaluated combination:
        ``val_rmse`` plus the parameters used. Combinations that raise are
        reported and skipped.
    """
    results = []

    # Generate all combinations
    keys = list(param_grid.keys())
    combinations = list(product(*[param_grid[k] for k in keys]))

    print(f"Testing {len(combinations)} combinations...")

    for i, combo in enumerate(combinations):
        params = dict(zip(keys, combo))
        # setdefault instead of assignment: values coming from the grid must
        # not be silently replaced by the fixed defaults.
        params.setdefault('random_state', 42)
        params.setdefault('n_jobs', -1)
        params.setdefault('max_features', 'sqrt')

        try:
            model = RandomForestRegressor(**params)
            model.fit(X_train, y_train)

            val_pred = model.predict(X_val)
            val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))

            results.append({'val_rmse': val_rmse, **params})

            if (i + 1) % 20 == 0:
                print(f"  Progress: {i + 1}/{len(combinations)}")

        except Exception as e:
            # Best effort: one bad combination must not abort the whole grid.
            print(f"  Combo {i + 1} failed: {e}")

    return pd.DataFrame(results)

# Run the fixed fine-tuning grid once per target on the same split.
print("Grid Search for Home Goals:")
home_grid_results = grid_search(
    X_train, y_home_train,
    X_val, y_home_val,
    param_grid=fine_tune_params,
)

print("\nGrid Search for Away Goals:")
away_grid_results = grid_search(
    X_train, y_away_train,
    X_val, y_away_val,
    param_grid=fine_tune_params,
)

In [None]:
# Row with the lowest validation RMSE for each target (first one on ties).
_home_best_label = home_grid_results['val_rmse'].idxmin()
_away_best_label = away_grid_results['val_rmse'].idxmin()
best_home_grid = home_grid_results.loc[_home_best_label]
best_away_grid = away_grid_results.loc[_away_best_label]

print("\n".join([
    "Best Grid Search Results:",
    f"\nHome Goals - Val RMSE: {best_home_grid['val_rmse']:.4f}",
    f"Away Goals - Val RMSE: {best_away_grid['val_rmse']:.4f}",
]))

## 6. Cross-Validation Analysis

In [None]:
# Use best parameters for cross-validation.
# NOTE(review): the parameters come from the HOME grid search only and are
# used for the away model as well; `best_away_grid` is not consulted here.
_best_depth = best_home_grid['max_depth']
best_params = {
    'n_estimators': int(best_home_grid['n_estimators']),
    # A None depth stored in a DataFrame can come back as NaN, which is truthy
    # and makes int() raise; test for missing values explicitly.
    'max_depth': int(_best_depth) if (pd.notna(_best_depth) and _best_depth) else None,
    'min_samples_split': int(best_home_grid['min_samples_split']),
    'min_samples_leaf': int(best_home_grid['min_samples_leaf']),
    'max_features': 'sqrt',
    'random_state': 42,
    'n_jobs': -1,
}

# 5-fold cross-validation; the same shuffled folds are used for both targets.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate on train + validation together; the test split stays untouched.
X_full = pd.concat([X_train, X_val])
y_home_full = pd.concat([y_home_train, y_home_val])
y_away_full = pd.concat([y_away_train, y_away_val])

# Cross-validation for home goals
home_cv_scores = cross_val_score(
    RandomForestRegressor(**best_params),
    X_full, y_home_full,
    cv=kfold,
    scoring='neg_root_mean_squared_error'
)

# Cross-validation for away goals
away_cv_scores = cross_val_score(
    RandomForestRegressor(**best_params),
    X_full, y_away_full,
    cv=kfold,
    scoring='neg_root_mean_squared_error'
)

# Scores are negated RMSE values, so flip the sign of the mean for reporting.
print("5-Fold Cross-Validation Results:")
print(f"\nHome Goals RMSE: {-home_cv_scores.mean():.4f} (+/- {home_cv_scores.std():.4f})")
print(f"Away Goals RMSE: {-away_cv_scores.mean():.4f} (+/- {away_cv_scores.std():.4f})")

## 7. Feature Importance

In [None]:
def _rank_importances(model):
    """Return the model's feature importances as a frame, largest first."""
    table = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    })
    return table.sort_values('importance', ascending=False)


# Refit on train + validation with the selected parameters; these are the
# models that get evaluated on the test split and saved.
final_home = RandomForestRegressor(**best_params).fit(X_full, y_home_full)
final_away = RandomForestRegressor(**best_params).fit(X_full, y_away_full)

home_importance = _rank_importances(final_home)
away_importance = _rank_importances(final_away)

print("Top 10 Features for Home Goals:")
print(home_importance.head(10).to_string(index=False))

print("\nTop 10 Features for Away Goals:")
print(away_importance.head(10).to_string(index=False))

In [None]:
# Side-by-side horizontal bar charts of the ten most important features.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

_panels = (
    (axes[0], home_importance, 'Top 10 Features - Home Goals'),
    (axes[1], away_importance, 'Top 10 Features - Away Goals'),
)
for _ax, _ranking, _title in _panels:
    # Reverse the top ten so the most important feature ends up at the top,
    # since barh draws the first entry at the bottom.
    _top10 = _ranking.head(10).iloc[::-1]
    _ax.barh(_top10['feature'], _top10['importance'])
    _ax.set_xlabel('Importance')
    _ax.set_title(_title)

plt.tight_layout()
plt.show()

## 8. Final Model Evaluation

In [None]:
# Score the final pair on the held-out test split and print a per-side report.
test_metrics = evaluate_models(final_home, final_away, X_test, y_home_test, y_away_test)

# Joining with newlines gives the same output as printing each line separately.
_test_report = [
    "\n Final Test Set Performance",
    "=" * 45,
    f"\nHome Goals:",
    f"  RMSE: {test_metrics['home_rmse']:.4f}",
    f"  MAE:  {test_metrics['home_mae']:.4f}",
    f"  R²:   {test_metrics['home_r2']:.4f}",
    f"\nAway Goals:",
    f"  RMSE: {test_metrics['away_rmse']:.4f}",
    f"  MAE:  {test_metrics['away_mae']:.4f}",
    f"  R²:   {test_metrics['away_r2']:.4f}",
    f"\nCombined:",
    f"  RMSE: {test_metrics['combined_rmse']:.4f}",
    f"  MAE:  {test_metrics['combined_mae']:.4f}",
    f"  R²:   {test_metrics['combined_r2']:.4f}",
]
print("\n".join(_test_report))

In [None]:
# Compare baseline vs tuned models on the same test split.
print("\n Improvement Over Baseline")
print("=" * 45)

baseline_test = evaluate_models(baseline_home, baseline_away, X_test, y_home_test, y_away_test)


def _rmse_gain_pct(metric_key):
    """Relative RMSE reduction versus the baseline, in percent (positive = better)."""
    before = baseline_test[metric_key]
    return (before - test_metrics[metric_key]) / before * 100


home_improvement = _rmse_gain_pct('home_rmse')
away_improvement = _rmse_gain_pct('away_rmse')
combined_improvement = _rmse_gain_pct('combined_rmse')

print("\n".join([
    f"Home Goals RMSE: {baseline_test['home_rmse']:.4f} -> {test_metrics['home_rmse']:.4f} ({home_improvement:+.1f}%)",
    f"Away Goals RMSE: {baseline_test['away_rmse']:.4f} -> {test_metrics['away_rmse']:.4f} ({away_improvement:+.1f}%)",
    f"Combined RMSE:   {baseline_test['combined_rmse']:.4f} -> {test_metrics['combined_rmse']:.4f} ({combined_improvement:+.1f}%)",
]))

## 9. Save Best Model

In [None]:
# Persist both fitted models and a JSON description of the run.
output_dir = 'output/models/random_forest'
os.makedirs(output_dir, exist_ok=True)

# One pickle per target, written in home-then-away order.
_artifacts = (
    (f'{output_dir}/random_forest_home.pkl', final_home),
    (f'{output_dir}/random_forest_away.pkl', final_away),
)
for _path, _model in _artifacts:
    with open(_path, 'wb') as f:
        pickle.dump(_model, f)

# Parameters, scores and the feature list needed to reuse the models.
model_info = {
    'model_type': 'RandomForestRegressor',
    'best_params': best_params,
    'test_metrics': test_metrics,
    'cv_home_rmse': float(-home_cv_scores.mean()),
    'cv_away_rmse': float(-away_cv_scores.mean()),
    'feature_cols': feature_cols,
    'trained_at': datetime.now().isoformat(),
}

with open(f'{output_dir}/random_forest_info.json', 'w') as f:
    # default=str stringifies anything json cannot encode natively.
    json.dump(model_info, f, indent=2, default=str)

print("\n".join([
    f"Models saved to {output_dir}/",
    f"  - random_forest_home.pkl",
    f"  - random_forest_away.pkl",
    f"  - random_forest_info.json",
]))

In [None]:
# Closing summary: the selected parameters and the pooled test scores.
_summary = [
    "\n" + "=" * 60,
    " RANDOM FOREST TRAINING COMPLETE",
    "=" * 60,
    f"\nBest Parameters:",
]
_summary.extend(f"  {k}: {v}" for k, v in best_params.items())
_summary.extend([
    f"\nFinal Test Performance:",
    f"  Combined RMSE: {test_metrics['combined_rmse']:.4f}",
    f"  Combined MAE:  {test_metrics['combined_mae']:.4f}",
    f"  Combined R²:   {test_metrics['combined_r2']:.4f}",
])
print("\n".join(_summary))