# Random Forest Model Validation

Comprehensive validation tests for the Random Forest model.

**Tests:**
1. Basic functionality (fit, predict, evaluate)
2. Hyperparameter effects (n_estimators, max_depth, min_samples_split)
3. Feature importance
4. Out-of-bag estimation
5. Serialization (save/load)
6. Cross-validation stability
7. Edge cases and error handling
8. Dual goal predictor
9. Reproducibility

Run this BEFORE using the model in production to catch bugs.

In [None]:
# Setup
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import tempfile
import os
import pickle
from pathlib import Path

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Test tracking: every check appends one record here; the summary cell reads it.
test_results = []

def record_test(name, passed, message=""):
    """Record the outcome of one validation check and echo it to stdout.

    Args:
        name: Label of the check (e.g. "1a. Model creation").
        passed: Truthy if the check succeeded. NumPy booleans are accepted.
        message: Optional detail printed on an indented line below the status.
    """
    # Comparisons on NumPy values yield numpy.bool_; normalise to a built-in
    # bool so every stored record has the same, predictable type.
    passed = bool(passed)
    status = "✅ PASS" if passed else "❌ FAIL"
    test_results.append({'test': name, 'passed': passed, 'message': message})
    print(f"{status}: {name}")
    if message:
        print(f"       {message}")

print("Validation setup complete!")

## Generate Test Data

In [None]:
# Synthetic, hockey-flavoured fixture data. The global NumPy RNG is seeded, and
# the random draws below happen in a fixed order, so every run yields the same data.
np.random.seed(42)
n = 500

X_test = pd.DataFrame(dict(
    home_win_pct=np.random.uniform(0.3, 0.7, n),
    away_win_pct=np.random.uniform(0.3, 0.7, n),
    home_goals_avg=np.random.uniform(2.5, 3.8, n),
    away_goals_avg=np.random.uniform(2.5, 3.8, n),
    home_goals_against_avg=np.random.uniform(2.2, 3.5, n),
    away_goals_against_avg=np.random.uniform(2.2, 3.5, n),
    home_pp_pct=np.random.uniform(0.15, 0.28, n),
    away_pp_pct=np.random.uniform(0.15, 0.28, n),
    home_rest_days=np.random.randint(1, 5, n),
    away_rest_days=np.random.randint(1, 5, n),
))

# Targets follow a known linear recipe plus Gaussian noise, then are clipped
# to [0, 8] and rounded to whole goals.
y_home = (
    0.4 * X_test['home_goals_avg']
    + 0.3 * (4 - X_test['away_goals_against_avg'])
    + 5 * X_test['home_pp_pct']
    + 0.3  # home advantage
    + np.random.normal(0, 0.5, n)
)
y_home = y_home.clip(0, 8).round().astype(int)

y_away = (
    0.4 * X_test['away_goals_avg']
    + 0.3 * (4 - X_test['home_goals_against_avg'])
    + 5 * X_test['away_pp_pct']
    + np.random.normal(0, 0.5, n)
)
y_away = y_away.clip(0, 8).round().astype(int)

# Ordered 80/20 split: the first 80% of rows train, the remainder validate.
split_idx = int(n * 0.8)
X_train = X_test.iloc[:split_idx]
X_val = X_test.iloc[split_idx:]
y_home_train = y_home[:split_idx]
y_home_val = y_home[split_idx:]
y_away_train = y_away[:split_idx]
y_away_val = y_away[split_idx:]

print(f"Train: {len(X_train)}, Validation: {len(X_val)}")
print(f"Home goals range: {y_home.min()} - {y_home.max()}")
print(f"Away goals range: {y_away.min()} - {y_away.max()}")

## Test 1: Basic Functionality

In [None]:
# 1a - constructing the estimator must not raise
try:
    model = RandomForestRegressor(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42)
    record_test("1a. Model creation", True)
except Exception as exc:
    record_test("1a. Model creation", False, str(exc))

# 1b - fitting on the training split (home-goals target)
try:
    model.fit(X_train, y_home_train)
    record_test("1b. Model fit", True, f"n_features={model.n_features_in_}")
except Exception as exc:
    record_test("1b. Model fit", False, str(exc))

# 1c - exactly one non-NaN prediction per validation row
try:
    predictions = model.predict(X_val)
    valid = not (len(predictions) != len(y_home_val) or np.isnan(predictions).any())
    record_test("1c. Model predict", valid, f"n_predictions={len(predictions)}")
except Exception as exc:
    record_test("1c. Model predict", False, str(exc))

# 1d - the standard regression metrics can be computed from those predictions
try:
    mae = mean_absolute_error(y_home_val, predictions)
    r2 = r2_score(y_home_val, predictions)
    rmse = np.sqrt(mean_squared_error(y_home_val, predictions))
    record_test(
        "1d. Model evaluate",
        True,
        f"RMSE={rmse:.4f}, MAE={mae:.4f}, R²={r2:.4f}",
    )
except Exception as exc:
    record_test("1d. Model evaluate", False, str(exc))

## Test 2: Hyperparameter Effects

In [None]:
# 2a - a larger forest should not be meaningfully worse than a smaller one
try:
    model_50, model_200 = (
        RandomForestRegressor(n_estimators=n_trees, max_depth=10, random_state=42)
        for n_trees in (50, 200)
    )
    model_50.fit(X_train, y_home_train)
    model_200.fit(X_train, y_home_train)

    rmse_50, rmse_200 = (
        np.sqrt(mean_squared_error(y_home_val, forest.predict(X_val)))
        for forest in (model_50, model_200)
    )

    # Pass when the 200-tree error is at most 10% above the 50-tree error
    improved = rmse_50 * 1.1 >= rmse_200
    record_test(
        "2a. n_estimators effect",
        improved,
        f"50 trees: {rmse_50:.4f}, 200 trees: {rmse_200:.4f}",
    )
except Exception as exc:
    record_test("2a. n_estimators effect", False, str(exc))

# 2b - a deeper forest should fit the training data at least as well
try:
    model_d3, model_d20 = (
        RandomForestRegressor(n_estimators=100, max_depth=depth, random_state=42)
        for depth in (3, 20)
    )
    model_d3.fit(X_train, y_home_train)
    model_d20.fit(X_train, y_home_train)

    # Compare training-set error, not validation error
    train_rmse_d3, train_rmse_d20 = (
        np.sqrt(mean_squared_error(y_home_train, forest.predict(X_train)))
        for forest in (model_d3, model_d20)
    )

    deeper_fits_better = train_rmse_d3 >= train_rmse_d20
    record_test(
        "2b. max_depth effect on training",
        deeper_fits_better,
        f"depth=3 train RMSE: {train_rmse_d3:.4f}, depth=20: {train_rmse_d20:.4f}",
    )
except Exception as exc:
    record_test("2b. max_depth effect on training", False, str(exc))

# 2c - a smaller min_samples_split should fit the training data at least as well
try:
    model_s2, model_s20 = (
        RandomForestRegressor(n_estimators=100, min_samples_split=min_split, random_state=42)
        for min_split in (2, 20)
    )
    model_s2.fit(X_train, y_home_train)
    model_s20.fit(X_train, y_home_train)

    train_rmse_s2, train_rmse_s20 = (
        np.sqrt(mean_squared_error(y_home_train, forest.predict(X_train)))
        for forest in (model_s2, model_s20)
    )

    record_test(
        "2c. min_samples_split effect",
        train_rmse_s20 >= train_rmse_s2,
        f"split=2: {train_rmse_s2:.4f}, split=20: {train_rmse_s20:.4f}",
    )
except Exception as exc:
    record_test("2c. min_samples_split effect", False, str(exc))

## Test 3: Feature Importance

In [None]:
try:
    model = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100)
    model.fit(X_train, y_home_train)

    # Pair each importance with its column name, most important first
    importance = model.feature_importances_
    feature_imp = pd.DataFrame({'feature': X_train.columns, 'importance': importance})
    feature_imp = feature_imp.sort_values('importance', ascending=False)

    # 3a: one importance value per input column
    top_features = feature_imp['feature'].head(3).tolist()
    record_test(
        "3a. Feature importance extraction",
        len(X_train.columns) == len(importance),
        f"Top 3: {top_features}",
    )

    # 3b: the importances add up to 1 (within 0.01)
    sum_importance = importance.sum()
    record_test(
        "3b. Importance normalization",
        abs(sum_importance - 1.0) < 0.01,
        f"Sum: {sum_importance:.4f}",
    )

    # 3c: no importance is below zero
    all_positive = np.all(importance >= 0)
    record_test("3c. Non-negative importances", all_positive)

    print("\nFeature Importance:")
    print(feature_imp.to_string(index=False))
except Exception as exc:
    record_test("3. Feature importance", False, str(exc))

## Test 4: Out-of-Bag Estimation

In [None]:
try:
    # Fit a forest that also computes its out-of-bag score
    model_oob = RandomForestRegressor(
        n_estimators=200, max_depth=10, oob_score=True, n_jobs=-1, random_state=42
    )
    model_oob.fit(X_train, y_home_train)

    # 4a: the OOB score attribute is populated
    oob_score = model_oob.oob_score_
    record_test(
        "4a. OOB score computation",
        oob_score is not None,
        f"OOB R²: {oob_score:.4f}",
    )

    # 4b: one OOB prediction per training row
    oob_predictions = model_oob.oob_prediction_
    record_test(
        "4b. OOB predictions",
        len(X_train) == len(oob_predictions),
        f"n_predictions: {len(oob_predictions)}",
    )

    # 4c: OOB R² and held-out R² should be within 0.2 of each other
    val_r2 = r2_score(y_home_val, model_oob.predict(X_val))
    oob_val_diff = abs(oob_score - val_r2)
    record_test(
        "4c. OOB vs validation similarity",
        oob_val_diff < 0.2,
        f"OOB R²: {oob_score:.4f}, Val R²: {val_r2:.4f}, Diff: {oob_val_diff:.4f}",
    )
except Exception as exc:
    record_test("4. OOB estimation", False, str(exc))

## Test 5: Serialization (Save/Load)

In [None]:
# Round-trip a fitted model through pickle and confirm it predicts identically.
# The temp file is created with delete=False so it can be reopened by path;
# cleanup therefore lives in `finally` so a failing step cannot leak the file.
temp_path = None
try:
    # Train model
    model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
    model.fit(X_train, y_home_train)
    original_pred = model.predict(X_val)

    # Save with pickle (record the path first so a failed dump is still cleaned up)
    with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as f:
        temp_path = f.name
        pickle.dump(model, f)

    # Load -- safe here because the file was written by this cell a moment ago;
    # never unpickle data from an untrusted source.
    with open(temp_path, 'rb') as f:
        loaded_model = pickle.load(f)

    loaded_pred = loaded_model.predict(X_val)

    # Predictions should match
    match = np.allclose(original_pred, loaded_pred)
    record_test("5a. Pickle save/load", match,
               f"Max diff: {np.abs(original_pred - loaded_pred).max():.6f}")

    # Check model attributes preserved
    attrs_match = (
        loaded_model.n_estimators == model.n_estimators and
        loaded_model.max_depth == model.max_depth
    )
    record_test("5b. Attributes preserved", attrs_match,
               f"n_estimators: {loaded_model.n_estimators}, max_depth: {loaded_model.max_depth}")

except Exception as e:
    record_test("5. Serialization", False, str(e))
finally:
    # Clean up on success and on failure alike
    if temp_path is not None and os.path.exists(temp_path):
        os.unlink(temp_path)

## Test 6: Cross-Validation Stability

In [None]:
try:
    model = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100)

    # Shuffled 5-fold CV over the full synthetic data set
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(
        model, X_test, y_home,
        scoring='neg_root_mean_squared_error',
        cv=kfold,
    )

    # The scorer returns negated RMSE, so flip the sign of the mean
    mean_rmse, std_rmse = -cv_scores.mean(), cv_scores.std()

    # 6a: fold-to-fold spread must stay below 30% of the mean RMSE
    cv_ratio = std_rmse / mean_rmse
    is_stable = cv_ratio < 0.3
    record_test(
        "6a. Cross-validation stability",
        is_stable,
        f"RMSE: {mean_rmse:.4f} (+/- {std_rmse:.4f}), CV ratio: {cv_ratio:.2%}",
    )

    # 6b: mean R² over the same folds must be positive
    cv_r2 = cross_val_score(model, X_test, y_home, scoring='r2', cv=kfold)
    record_test(
        "6b. CV R² positive",
        cv_r2.mean() > 0,
        f"Mean R²: {cv_r2.mean():.4f}",
    )
except Exception as exc:
    record_test("6. Cross-validation stability", False, str(exc))

## Test 7: Edge Cases

In [None]:
# 7a - predicting on a one-row frame returns exactly one value.
# The `model` fitted here is reused by checks 7c-7e.
try:
    model = RandomForestRegressor(random_state=42, n_estimators=50)
    model.fit(X_train, y_home_train)

    single_pred = model.predict(X_val.head(1))
    record_test(
        "7a. Single sample prediction",
        len(single_pred) == 1,
        f"Prediction: {single_pred[0]:.4f}",
    )
except Exception as exc:
    record_test("7a. Single sample prediction", False, str(exc))

# 7b - a forest fitted on only the first 20 rows still predicts every validation row
try:
    small_model = RandomForestRegressor(random_state=42, n_estimators=10)
    small_model.fit(X_train.head(20), y_home_train.head(20))
    small_pred = small_model.predict(X_val)
    record_test("7b. Small training set", len(X_val) == len(small_pred))
except Exception as exc:
    record_test("7b. Small training set", False, str(exc))

# Test 7c: Empty prediction set
# scikit-learn's input validation requires at least one sample, so predict()
# on a zero-row frame is expected to raise ValueError. Either outcome -- an
# empty result or that explicit rejection -- counts as graceful handling;
# any other exception is a failure.
try:
    empty_pred = model.predict(X_val.iloc[:0])
    record_test("7c. Empty prediction set", len(empty_pred) == 0)
except ValueError as e:
    # NotFittedError also derives from ValueError, so only accept the
    # rejection when the model has actually been fitted.
    record_test("7c. Empty prediction set", hasattr(model, "estimators_"),
               f"Rejected with ValueError: {e}")
except Exception as e:
    record_test("7c. Empty prediction set", False, str(e))

# 7d - predictions over the full data set stay within the observed target
# range, allowing a margin of 0.5 on either side
try:
    all_pred = model.predict(X_test)
    min_pred = all_pred.min()
    max_pred = all_pred.max()

    y_min = y_home.min()
    y_max = y_home.max()
    in_range = (y_min - 0.5) <= min_pred and (y_max + 0.5) >= max_pred
    record_test(
        "7d. Predictions in reasonable range",
        in_range,
        f"Pred range: [{min_pred:.2f}, {max_pred:.2f}], Target range: [{y_min}, {y_max}]",
    )
except Exception as exc:
    record_test("7d. Predictions in reasonable range", False, str(exc))

# 7e - the fitted model remembers the training column names, in order
try:
    feature_names = model.feature_names_in_
    match = list(X_train.columns) == list(feature_names)
    record_test("7e. Feature names preserved", match)
except Exception as exc:
    record_test("7e. Feature names preserved", False, str(exc))

## Test 8: Dual Goal Predictor

In [None]:
# Two independent forests: one for home goals, one for away goals
try:
    home_model, away_model = (
        RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
        for _ in range(2)
    )

    home_model.fit(X_train, y_home_train)
    away_model.fit(X_train, y_away_train)

    home_pred = home_model.predict(X_val)
    away_pred = away_model.predict(X_val)

    home_rmse = np.sqrt(mean_squared_error(y_home_val, home_pred))
    away_rmse = np.sqrt(mean_squared_error(y_away_val, away_pred))

    # Pool both targets into one vector to get a single combined RMSE
    all_pred = np.concatenate((home_pred, away_pred))
    all_actual = np.concatenate((y_home_val, y_away_val))
    combined_rmse = np.sqrt(mean_squared_error(all_actual, all_pred))

    record_test(
        "8a. Dual model training",
        True,
        f"Home RMSE: {home_rmse:.4f}, Away RMSE: {away_rmse:.4f}",
    )
    record_test(
        "8b. Combined performance",
        combined_rmse < 2.0,
        f"Combined RMSE: {combined_rmse:.4f}",
    )

    # 8c: the two models should weight at least one feature differently
    home_imp = home_model.feature_importances_
    away_imp = away_model.feature_importances_
    imp_diff = np.abs(home_imp - away_imp).max()
    record_test(
        "8c. Different feature importance",
        imp_diff > 0.01,
        f"Max importance diff: {imp_diff:.4f}",
    )
except Exception as exc:
    record_test("8. Dual goal predictor", False, str(exc))

## Test 9: Reproducibility

In [None]:
try:
    # 9a: two forests built with the same seed must predict the same values
    model1, model2 = (
        RandomForestRegressor(n_estimators=100, random_state=42)
        for _ in range(2)
    )

    model1.fit(X_train, y_home_train)
    model2.fit(X_train, y_home_train)

    pred1 = model1.predict(X_val)
    pred2 = model2.predict(X_val)

    identical = np.allclose(pred1, pred2)
    record_test(
        "9a. Reproducibility with same seed",
        identical,
        f"Max diff: {np.abs(pred1 - pred2).max():.6f}",
    )

    # 9b: changing the seed should change the predictions
    model3 = RandomForestRegressor(n_estimators=100, random_state=123)
    model3.fit(X_train, y_home_train)
    pred3 = model3.predict(X_val)

    different = not np.allclose(pred1, pred3)
    record_test(
        "9b. Different seeds give different results",
        different,
        f"Max diff: {np.abs(pred1 - pred3).max():.4f}",
    )
except Exception as exc:
    record_test("9. Reproducibility", False, str(exc))

## Test Summary

In [None]:
# Summary: tabulate everything collected in `test_results` and report the
# pass rate, the failing checks (if any), and the full result table.
print("\n" + "=" * 60)
print(" RANDOM FOREST VALIDATION SUMMARY")
print("=" * 60)

# Name the columns explicitly so the frame still has a 'passed' column when
# no checks were recorded (an empty list would otherwise give a column-less frame).
results_df = pd.DataFrame(test_results, columns=['test', 'passed', 'message'])
# Force a real boolean dtype: `~` is only a valid row mask on a bool column.
results_df['passed'] = results_df['passed'].astype(bool)
passed = int(results_df['passed'].sum())
total = len(results_df)

if total == 0:
    # Avoids a division by zero in the percentage below
    print("\nNo tests were recorded - run the test cells above first.")
else:
    print(f"\nPassed: {passed}/{total} ({passed/total*100:.1f}%)")

    if passed < total:
        print("\n❌ FAILED TESTS:")
        for _, row in results_df[~results_df['passed']].iterrows():
            print(f"   - {row['test']}: {row['message']}")
    else:
        print("\n✅ All tests passed!")

    # Show all results
    print("\nDetailed Results:")
    print(results_df.to_string(index=False))