# XGBoost Model Validation

Comprehensive validation tests for the XGBoost model.

**Tests:**
1. Basic functionality (fit, predict, evaluate)
2. Hyperparameter effects
3. Feature importance
4. Regularization behavior
5. Serialization (save/load)
6. Cross-validation stability
7. Edge cases and error handling
8. Dual goal predictor

Run this BEFORE using the model in production to catch bugs.

In [None]:
# Setup
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import tempfile
import os
import pickle
from pathlib import Path

try:
    import xgboost as xgb
    XGB_AVAILABLE = True
except ImportError:
    XGB_AVAILABLE = False
    print("WARNING: XGBoost not installed. Some tests will be skipped.")

from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Test tracking: one dict per recorded check, consumed by the summary cell
test_results = []

def record_test(name, passed, message=""):
    """Record the outcome of a single validation check and print it.

    Args:
        name: Label of the check, e.g. "1a. Model creation".
        passed: Truthy if the check succeeded. NumPy booleans (the result of
            comparing NumPy scalars) are accepted and stored as plain ``bool``.
        message: Optional detail printed on an indented second line.

    Appends ``{'test', 'passed', 'message'}`` to the module-level
    ``test_results`` list.
    """
    # Normalize to a built-in bool so every stored flag has the same type;
    # the summary relies on boolean masking (`~results_df['passed']`).
    passed = bool(passed)
    status = "✅ PASS" if passed else "❌ FAIL"
    test_results.append({'test': name, 'passed': passed, 'message': message})
    print(f"{status}: {name}")
    if message:
        print(f"       {message}")

print(f"XGBoost available: {XGB_AVAILABLE}")
print("Validation setup complete!")

## Generate Test Data

In [None]:
# Synthetic hockey-style dataset shared by all validation cells below
np.random.seed(42)
n = 500

# Uniformly distributed team statistics as (column, low, high).
# Every draw advances the global RNG, so this order must stay fixed
# for the generated data to be reproducible.
_uniform_ranges = [
    ('home_win_pct', 0.3, 0.7),
    ('away_win_pct', 0.3, 0.7),
    ('home_goals_avg', 2.5, 3.8),
    ('away_goals_avg', 2.5, 3.8),
    ('home_goals_against_avg', 2.2, 3.5),
    ('away_goals_against_avg', 2.2, 3.5),
    ('home_pp_pct', 0.15, 0.28),
    ('away_pp_pct', 0.15, 0.28),
]
_feature_columns = {
    column: np.random.uniform(low, high, n)
    for column, low, high in _uniform_ranges
}
# Rest days are whole numbers in 1..4 (randint's upper bound is exclusive)
_feature_columns['home_rest_days'] = np.random.randint(1, 5, n)
_feature_columns['away_rest_days'] = np.random.randint(1, 5, n)

X_test = pd.DataFrame(_feature_columns)


def _goal_signal(attack, opponent_defence, power_play):
    """Deterministic part of the goal target built from three feature columns."""
    return attack * 0.4 + (4 - opponent_defence) * 0.3 + power_play * 5


def _as_goal_counts(raw_scores):
    """Clamp raw scores to [0, 8] and round them to integer goal counts."""
    return raw_scores.clip(0, 8).round().astype(int)


# Targets with known relationships to the features; the home side gets a
# constant 0.3 advantage, and both sides get Gaussian noise (std 0.5).
y_home = _as_goal_counts(
    _goal_signal(
        X_test['home_goals_avg'],
        X_test['away_goals_against_avg'],
        X_test['home_pp_pct'],
    )
    + 0.3
    + np.random.normal(0, 0.5, n)
)

y_away = _as_goal_counts(
    _goal_signal(
        X_test['away_goals_avg'],
        X_test['home_goals_against_avg'],
        X_test['away_pp_pct'],
    )
    + np.random.normal(0, 0.5, n)
)

# Ordered 80/20 split: the first rows train, the remainder validate
split_idx = int(n * 0.8)
X_train = X_test.iloc[:split_idx]
X_val = X_test.iloc[split_idx:]
y_home_train = y_home.iloc[:split_idx]
y_home_val = y_home.iloc[split_idx:]
y_away_train = y_away.iloc[:split_idx]
y_away_val = y_away.iloc[split_idx:]

print(f"Train: {len(X_train)}, Validation: {len(X_val)}")
print(f"Home goals range: {y_home.min()} - {y_home.max()}")
print(f"Away goals range: {y_away.min()} - {y_away.max()}")

## Test 1: Basic Functionality

In [None]:
if XGB_AVAILABLE:
    # 1a - constructing the regressor with typical hyperparameters must not raise
    try:
        model = xgb.XGBRegressor(n_estimators=100, max_depth=6,
                                 learning_rate=0.1, random_state=42)
        record_test("1a. Model creation", True)
    except Exception as exc:
        record_test("1a. Model creation", False, str(exc))

    # 1b - training on the home-goals target must not raise
    try:
        model.fit(X_train, y_home_train)
        record_test("1b. Model fit", True, f"n_features={X_train.shape[1]}")
    except Exception as exc:
        record_test("1b. Model fit", False, str(exc))

    # 1c - one finite prediction is expected per validation row
    try:
        predictions = model.predict(X_val)
        if len(predictions) != len(y_home_val):
            valid = False
        else:
            valid = not np.isnan(predictions).any()
        record_test("1c. Model predict", valid, f"n_predictions={len(predictions)}")
    except Exception as exc:
        record_test("1c. Model predict", False, str(exc))

    # 1d - the standard regression metrics can be computed on those predictions
    try:
        rmse = np.sqrt(mean_squared_error(y_home_val, predictions))
        mae = mean_absolute_error(y_home_val, predictions)
        r2 = r2_score(y_home_val, predictions)
        record_test("1d. Model evaluate", True,
                    f"RMSE={rmse:.4f}, MAE={mae:.4f}, R²={r2:.4f}")
    except Exception as exc:
        record_test("1d. Model evaluate", False, str(exc))
else:
    # Without the library nothing else in this cell can run
    record_test("1. XGBoost import", False, "XGBoost not installed")

## Test 2: Hyperparameter Effects

In [None]:
if XGB_AVAILABLE:
    def _rmse(estimator, features, target):
        """Root-mean-squared error of a fitted estimator on (features, target)."""
        return np.sqrt(mean_squared_error(target, estimator.predict(features)))

    # 2a - quadrupling the number of trees must not hurt validation RMSE
    #      by more than 10%
    try:
        model_50 = xgb.XGBRegressor(n_estimators=50, max_depth=6, random_state=42)
        model_200 = xgb.XGBRegressor(n_estimators=200, max_depth=6, random_state=42)

        for booster in (model_50, model_200):
            booster.fit(X_train, y_home_train)

        rmse_50 = _rmse(model_50, X_val, y_home_val)
        rmse_200 = _rmse(model_200, X_val, y_home_val)

        improved = rmse_200 <= 1.1 * rmse_50
        record_test("2a. n_estimators effect", improved,
                    f"50 trees: {rmse_50:.4f}, 200 trees: {rmse_200:.4f}")
    except Exception as exc:
        record_test("2a. n_estimators effect", False, str(exc))

    # 2b - deeper trees have more capacity, so the training error should
    #      be no worse than with shallow trees
    try:
        model_d3 = xgb.XGBRegressor(n_estimators=100, max_depth=3, random_state=42)
        model_d10 = xgb.XGBRegressor(n_estimators=100, max_depth=10, random_state=42)

        for booster in (model_d3, model_d10):
            booster.fit(X_train, y_home_train)

        train_rmse_d3 = _rmse(model_d3, X_train, y_home_train)
        train_rmse_d10 = _rmse(model_d10, X_train, y_home_train)

        deeper_fits_better = train_rmse_d10 <= train_rmse_d3
        record_test("2b. max_depth effect on training", deeper_fits_better,
                    f"depth=3 train RMSE: {train_rmse_d3:.4f}, depth=10: {train_rmse_d10:.4f}")
    except Exception as exc:
        record_test("2b. max_depth effect on training", False, str(exc))

    # 2c - with the same number of rounds, the larger learning rate should
    #      reach a strictly lower training error
    try:
        model_fast = xgb.XGBRegressor(n_estimators=100, learning_rate=0.3, random_state=42)
        model_slow = xgb.XGBRegressor(n_estimators=100, learning_rate=0.01, random_state=42)

        for booster in (model_fast, model_slow):
            booster.fit(X_train, y_home_train)

        train_rmse_fast = _rmse(model_fast, X_train, y_home_train)
        train_rmse_slow = _rmse(model_slow, X_train, y_home_train)

        record_test("2c. learning_rate effect", train_rmse_fast < train_rmse_slow,
                    f"lr=0.3: {train_rmse_fast:.4f}, lr=0.01: {train_rmse_slow:.4f}")
    except Exception as exc:
        record_test("2c. learning_rate effect", False, str(exc))

## Test 3: Feature Importance

In [None]:
if XGB_AVAILABLE:
    try:
        model = xgb.XGBRegressor(n_estimators=100, max_depth=6, random_state=42)
        model.fit(X_train, y_home_train)

        # One importance value per input column, ranked from most to least important
        importance = model.feature_importances_
        feature_imp = pd.DataFrame({
            'feature': X_train.columns,
            'importance': importance
        }).sort_values('importance', ascending=False)

        top_features = feature_imp.head(3)['feature'].tolist()

        # 3a: exactly one importance value per feature
        record_test("3a. Feature importance extraction", len(importance) == len(X_train.columns),
                   f"Top 3: {top_features}")

        # 3b: importances are expected to be normalized (sum to 1)
        sum_importance = importance.sum()
        record_test("3b. Importance normalization", abs(sum_importance - 1.0) < 0.01,
                   f"Sum: {sum_importance:.4f}")

        # 3c: the synthetic target is driven by goals-related columns, so at
        # least one of them should rank in the top 3. This flag used to be
        # computed and then discarded, so the expectation was never checked.
        has_goals_feature = any('goals' in f for f in top_features)
        record_test("3c. Goals feature among top 3", has_goals_feature,
                   f"Top 3: {top_features}")

        print("\nFeature Importance:")
        print(feature_imp.to_string(index=False))
    except Exception as e:
        record_test("3. Feature importance", False, str(e))

## Test 4: Regularization

In [None]:
if XGB_AVAILABLE:
    # Test 4a: L1 regularization (reg_alpha)
    try:
        model_no_reg = xgb.XGBRegressor(n_estimators=100, reg_alpha=0, reg_lambda=0, random_state=42)
        model_l1 = xgb.XGBRegressor(n_estimators=100, reg_alpha=10, reg_lambda=0, random_state=42)

        model_no_reg.fit(X_train, y_home_train)
        model_l1.fit(X_train, y_home_train)

        train_no_reg = np.sqrt(mean_squared_error(y_home_train, model_no_reg.predict(X_train)))
        train_l1 = np.sqrt(mean_squared_error(y_home_train, model_l1.predict(X_train)))

        # Validation errors are reported for context only; the pass/fail
        # criterion is the training error below.
        val_no_reg = np.sqrt(mean_squared_error(y_home_val, model_no_reg.predict(X_val)))
        val_l1 = np.sqrt(mean_squared_error(y_home_val, model_l1.predict(X_val)))

        # A penalized model should fit the training data no better than the
        # unpenalized one (1% tolerance for numerical noise).
        record_test("4a. L1 regularization (reg_alpha)", train_l1 >= train_no_reg * 0.99,
                   f"No reg train: {train_no_reg:.4f}, L1 train: {train_l1:.4f}, "
                   f"No reg val: {val_no_reg:.4f}, L1 val: {val_l1:.4f}")
    except Exception as e:
        record_test("4a. L1 regularization", False, str(e))

    # Test 4b: L2 regularization (reg_lambda)
    try:
        # Fit a separate unregularized baseline so this check does not depend
        # on 4a having completed successfully.
        model_l2_baseline = xgb.XGBRegressor(n_estimators=100, reg_alpha=0, reg_lambda=0, random_state=42)
        model_l2 = xgb.XGBRegressor(n_estimators=100, reg_alpha=0, reg_lambda=10, random_state=42)

        model_l2_baseline.fit(X_train, y_home_train)
        model_l2.fit(X_train, y_home_train)

        train_l2_baseline = np.sqrt(mean_squared_error(y_home_train, model_l2_baseline.predict(X_train)))
        train_l2 = np.sqrt(mean_squared_error(y_home_train, model_l2.predict(X_train)))

        # Same criterion as 4a. Previously this check recorded an
        # unconditional pass and therefore validated nothing.
        record_test("4b. L2 regularization (reg_lambda)", train_l2 >= train_l2_baseline * 0.99,
                   f"No reg train: {train_l2_baseline:.4f}, L2 train: {train_l2:.4f}")
    except Exception as e:
        record_test("4b. L2 regularization", False, str(e))

## Test 5: Serialization (Save/Load)

In [None]:
if XGB_AVAILABLE:
    # Test 5a: pickle round trip must reproduce the original predictions
    temp_path = None
    try:
        # Train model
        model = xgb.XGBRegressor(n_estimators=100, max_depth=6, random_state=42)
        model.fit(X_train, y_home_train)
        original_pred = model.predict(X_val)

        # Save with pickle. The path is captured before writing so the file
        # is still cleaned up if pickle.dump itself fails.
        with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as f:
            temp_path = f.name
            pickle.dump(model, f)

        # Load. Only a file written by this cell is unpickled here;
        # pickle must never be used on untrusted files.
        with open(temp_path, 'rb') as f:
            loaded_model = pickle.load(f)

        loaded_pred = loaded_model.predict(X_val)

        # Predictions should match
        match = np.allclose(original_pred, loaded_pred)
        record_test("5a. Pickle save/load", match,
                   f"Max diff: {np.abs(original_pred - loaded_pred).max():.6f}")
    except Exception as e:
        record_test("5a. Pickle save/load", False, str(e))
    finally:
        # Always remove the temp file, including on failure paths
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)

    # Test 5b: native XGBoost save/load (JSON) must reproduce the predictions.
    # Reuses `model` and `original_pred` from 5a; if 5a failed before creating
    # them, the resulting error is recorded as a failure of this check.
    temp_path = None
    try:
        # Reserve a unique .json path; the handle is closed right away so
        # save_model can write to it by name.
        with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as f:
            temp_path = f.name

        model.save_model(temp_path)

        loaded_model = xgb.XGBRegressor()
        loaded_model.load_model(temp_path)

        native_pred = loaded_model.predict(X_val)
        match = np.allclose(original_pred, native_pred)

        record_test("5b. Native XGBoost save/load", match,
                   f"Max diff: {np.abs(original_pred - native_pred).max():.6f}")
    except Exception as e:
        record_test("5b. Native XGBoost save/load", False, str(e))
    finally:
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)

## Test 6: Cross-Validation Stability

In [None]:
if XGB_AVAILABLE:
    try:
        model = xgb.XGBRegressor(n_estimators=100, max_depth=6, random_state=42)

        # Shuffled 5-fold CV over the full synthetic dataset. The scorer
        # returns negated RMSE values (higher is better).
        kfold = KFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_val_score(
            model,
            X_test,
            y_home,
            cv=kfold,
            scoring='neg_root_mean_squared_error',
        )

        # Flip the sign of the mean to get a positive RMSE; the spread is
        # unaffected by the sign.
        mean_rmse = -np.mean(cv_scores)
        std_rmse = np.std(cv_scores)

        # Stable means the fold-to-fold spread stays below 30% of the mean RMSE
        cv_ratio = std_rmse / mean_rmse
        is_stable = cv_ratio < 0.3

        summary = f"RMSE: {mean_rmse:.4f} (+/- {std_rmse:.4f}), CV ratio: {cv_ratio:.2%}"
        record_test("6. Cross-validation stability", is_stable, summary)
    except Exception as exc:
        record_test("6. Cross-validation stability", False, str(exc))

## Test 7: Edge Cases

In [None]:
if XGB_AVAILABLE:
    # Set only after a successful fit, so later checks can tell a usable
    # model apart from an unfitted or missing one.
    model_fitted = False

    # Test 7a: Single sample prediction
    try:
        model = xgb.XGBRegressor(n_estimators=50, random_state=42)
        model.fit(X_train, y_home_train)
        model_fitted = True

        single_pred = model.predict(X_val.iloc[[0]])
        record_test("7a. Single sample prediction", len(single_pred) == 1,
                   f"Prediction: {single_pred[0]:.4f}")
    except Exception as e:
        record_test("7a. Single sample prediction", False, str(e))

    # Test 7b: Missing features (predict is expected to raise).
    # Every exception counts as a pass here, so the check only runs against a
    # fitted model; otherwise an unfitted-model error would be a false pass.
    if not model_fitted:
        record_test("7b. Missing feature handling", False,
                   "Model was not fitted; cannot test missing-feature handling")
    else:
        try:
            X_missing = X_val.drop(columns=['home_win_pct'])
        except Exception as e:
            record_test("7b. Missing feature handling", False,
                       f"Could not build test input: {e}")
        else:
            try:
                model.predict(X_missing)
            except Exception:
                record_test("7b. Missing feature handling", True, "Correctly raised error")
            else:
                record_test("7b. Missing feature handling", False, "Should have raised error")

    # Test 7c: Empty prediction set
    try:
        empty_pred = model.predict(X_val.iloc[:0])
        record_test("7c. Empty prediction set", len(empty_pred) == 0)
    except Exception as e:
        record_test("7c. Empty prediction set", False, str(e))

    # Test 7d: Predictions should be reasonable
    try:
        all_pred = model.predict(X_test)
        min_pred, max_pred = all_pred.min(), all_pred.max()

        # Goals should stay in a plausible range: the regressor is unbounded,
        # so a small negative overshoot (down to -1) is tolerated; values
        # above 15 are not.
        reasonable = min_pred >= -1 and max_pred <= 15
        record_test("7d. Reasonable prediction range", reasonable,
                   f"Range: [{min_pred:.2f}, {max_pred:.2f}]")
    except Exception as e:
        record_test("7d. Reasonable prediction range", False, str(e))

## Test 8: Dual Goal Predictor

In [None]:
if XGB_AVAILABLE:
    # Two independent regressors with identical settings: one per goal target
    try:
        home_model = xgb.XGBRegressor(n_estimators=100, max_depth=6, random_state=42)
        away_model = xgb.XGBRegressor(n_estimators=100, max_depth=6, random_state=42)

        # Home side: fit, predict, score
        home_model.fit(X_train, y_home_train)
        home_pred = home_model.predict(X_val)
        home_rmse = np.sqrt(mean_squared_error(y_home_val, home_pred))

        # Away side: fit, predict, score
        away_model.fit(X_train, y_away_train)
        away_pred = away_model.predict(X_val)
        away_rmse = np.sqrt(mean_squared_error(y_away_val, away_pred))

        # Pool both targets to score the pair as a single predictor
        all_pred = np.concatenate((home_pred, away_pred))
        all_actual = np.concatenate((y_home_val, y_away_val))
        combined_rmse = np.sqrt(mean_squared_error(all_actual, all_pred))

        record_test(
            "8a. Dual model training",
            True,
            f"Home RMSE: {home_rmse:.4f}, Away RMSE: {away_rmse:.4f}",
        )
        record_test(
            "8b. Combined performance",
            combined_rmse < 2.0,
            f"Combined RMSE: {combined_rmse:.4f}",
        )
    except Exception as exc:
        record_test("8. Dual goal predictor", False, str(exc))

## Test Summary

In [None]:
# Summary: aggregate everything recorded via record_test() into a report
print("\n" + "=" * 60)
print(" XGBOOST VALIDATION SUMMARY")
print("=" * 60)

# Explicit columns keep the frame well-formed even when nothing was recorded
# (a DataFrame built from an empty list would have no 'passed' column).
results_df = pd.DataFrame(test_results, columns=['test', 'passed', 'message'])
total = len(results_df)

if total == 0:
    # Nothing to report; also avoids dividing by zero in the pass rate
    passed = 0
    print("\nNo tests were recorded. Run the test cells above first.")
else:
    # Force a real boolean dtype so `~` below is a logical NOT, not a
    # bitwise inversion of Python objects.
    results_df['passed'] = results_df['passed'].astype(bool)
    passed = int(results_df['passed'].sum())

    print(f"\nPassed: {passed}/{total} ({passed/total*100:.1f}%)")

    if passed < total:
        print("\n❌ FAILED TESTS:")
        for _, row in results_df[~results_df['passed']].iterrows():
            print(f"   - {row['test']}: {row['message']}")
    else:
        print("\n✅ All tests passed!")

    # Show all results
    print("\nDetailed Results:")
    print(results_df.to_string(index=False))