# Neural Network Model Validation

Comprehensive validation tests for the Neural Network (MLPRegressor) model.

**Tests:**
1. Basic functionality (fit, predict, evaluate)
2. Architecture effects (hidden layers)
3. Activation functions
4. Regularization behavior
5. Learning rate effects
6. Early stopping
7. Serialization (save/load)
8. Edge cases and error handling
9. Dual goal predictor
10. Cross-validation stability

Run this BEFORE using the model in production to catch bugs.

In [None]:
# Setup
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import tempfile
import os
import pickle
from pathlib import Path

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

# Test tracking: every record_test() call appends one entry to this list,
# and the summary cell at the end of the notebook reads it back.
test_results = []


def record_test(name, passed, message=""):
    """Record the outcome of one validation check and echo it to stdout.

    Args:
        name: Label of the check, e.g. "1a. Model creation".
        passed: Truthy when the check succeeded. The value is coerced to a
            plain ``bool`` so that NumPy booleans (produced by comparisons
            on NumPy floats) and other truthy values are stored uniformly.
        message: Optional detail, printed indented below the status line.
    """
    # Normalise once so both the printed status and the stored record agree.
    passed = bool(passed)
    status = "✅ PASS" if passed else "❌ FAIL"
    test_results.append({'test': name, 'passed': passed, 'message': message})
    print(f"{status}: {name}")
    if message:
        print(f"       {message}")


print("Validation setup complete!")

## Generate Test Data

In [None]:
# Build a synthetic, hockey-like data set with a fixed seed so that every
# run of the notebook sees the same numbers.
np.random.seed(42)
n = 500


def _draw_features(n_rows):
    """Draw the ten feature columns, alternating home/away for each statistic."""
    uniform_stats = [
        ('win_pct', 0.3, 0.7),
        ('goals_avg', 2.5, 3.8),
        ('goals_against_avg', 2.2, 3.5),
        ('pp_pct', 0.15, 0.28),
    ]
    columns = {}
    for stat, low, high in uniform_stats:
        for side in ('home', 'away'):
            columns[f'{side}_{stat}'] = np.random.uniform(low, high, n_rows)
    # Rest days are whole numbers drawn from 1..4 (upper bound is exclusive).
    for side in ('home', 'away'):
        columns[f'{side}_rest_days'] = np.random.randint(1, 5, n_rows)
    return pd.DataFrame(columns)


X_data = _draw_features(n)

# Targets follow a known linear relationship plus Gaussian noise, then get
# clipped to 0..8 and rounded to whole goals.
home_signal = X_data['home_goals_avg'] * 0.4 + (4 - X_data['away_goals_against_avg']) * 0.3
home_signal = home_signal + X_data['home_pp_pct'] * 5
home_signal = home_signal + 0.3  # home advantage
home_noise = np.random.normal(0, 0.5, n)
y_home = (home_signal + home_noise).clip(0, 8).round().astype(int)

away_signal = X_data['away_goals_avg'] * 0.4 + (4 - X_data['home_goals_against_avg']) * 0.3
away_signal = away_signal + X_data['away_pp_pct'] * 5
away_noise = np.random.normal(0, 0.5, n)
y_away = (away_signal + away_noise).clip(0, 8).round().astype(int)

# Ordered 80/20 split: the first rows train, the remaining rows validate.
split_idx = int(n * 0.8)
X_train = X_data.iloc[:split_idx]
X_val = X_data.iloc[split_idx:]
y_home_train = y_home.iloc[:split_idx]
y_home_val = y_home.iloc[split_idx:]
y_away_train = y_away.iloc[:split_idx]
y_away_val = y_away.iloc[split_idx:]

# Standardise the features (CRITICAL for neural networks). The scaler is
# fitted on the training rows only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Quick sanity report on the split sizes, target range and scaled statistics.
print(f"Train: {len(X_train)}, Validation: {len(X_val)}")
print(f"Home goals range: {y_home.min()} - {y_home.max()}")
print(f"Scaled mean: {X_train_scaled.mean():.4f}, std: {X_train_scaled.std():.4f}")

## Test 1: Basic Functionality

In [None]:
# Baseline hyperparameters used by checks 1a-1d.
baseline_params = {
    'hidden_layer_sizes': (100, 50),
    'activation': 'relu',
    'solver': 'adam',
    'max_iter': 200,
    'random_state': 42,
}

# Test 1a: the estimator can be constructed
try:
    model = MLPRegressor(**baseline_params)
    record_test("1a. Model creation", True)
except Exception as e:
    record_test("1a. Model creation", False, str(e))

# Test 1b: fitting on the scaled training data completes
try:
    model.fit(X_train_scaled, y_home_train)
    record_test("1b. Model fit", True, f"Iterations: {model.n_iter_}")
except Exception as e:
    record_test("1b. Model fit", False, str(e))

# Test 1c: one finite prediction is produced per validation row
try:
    predictions = model.predict(X_val_scaled)
    same_length = len(predictions) == len(y_home_val)
    has_nan = np.isnan(predictions).any()
    valid = same_length and not has_nan
    record_test("1c. Model predict", valid, f"n_predictions={len(predictions)}")
except Exception as e:
    record_test("1c. Model predict", False, str(e))

# Test 1d: the standard regression metrics can be computed
try:
    mse = mean_squared_error(y_home_val, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_home_val, predictions)
    r2 = r2_score(y_home_val, predictions)
    record_test("1d. Model evaluate", True, f"RMSE={rmse:.4f}, MAE={mae:.4f}, R²={r2:.4f}")
except Exception as e:
    record_test("1d. Model evaluate", False, str(e))

## Test 2: Architecture Effects

In [None]:
# Train the same model with several hidden-layer layouts and compare the
# validation RMSE of each.
architectures = [
    (50,),           # Single layer
    (100, 50),       # Two layers
    (100, 50, 25),   # Three layers
    (200,),          # Wide single layer
]

arch_results = []

for arch in architectures:
    try:
        model = MLPRegressor(
            hidden_layer_sizes=arch,
            max_iter=200,
            random_state=42
        )
        model.fit(X_train_scaled, y_home_train)
        val_pred = model.predict(X_val_scaled)
        rmse = np.sqrt(mean_squared_error(y_home_val, val_pred))
        arch_results.append({'arch': str(arch), 'rmse': rmse, 'n_iter': model.n_iter_})
    except Exception as e:
        # Keep a row for the failed layout so it shows up in the table below.
        arch_results.append({'arch': str(arch), 'rmse': None, 'error': str(e)})

arch_df = pd.DataFrame(arch_results)
print("Architecture Comparison:")
print(arch_df.to_string(index=False))

# All architectures should train successfully
all_trained = all(r.get('rmse') is not None for r in arch_results)
record_test("2a. All architectures train", all_trained)

# Performance should be reasonably similar: the worst RMSE must stay below
# twice the best RMSE (worst / best < 2.0).
rmses = [r['rmse'] for r in arch_results if r['rmse'] is not None]
if rmses:
    rmse_range = max(rmses) / min(rmses)
    record_test("2b. Architecture performance variation", rmse_range < 2.0,
               f"Range: {min(rmses):.4f} - {max(rmses):.4f} (ratio: {rmse_range:.2f})")
else:
    # Nothing trained, so there is nothing to compare; record that explicitly
    # instead of silently dropping the check from the summary.
    record_test("2b. Architecture performance variation", False,
               "No architecture trained successfully")

## Test 3: Activation Functions

In [None]:
# Train the same architecture with each activation function and compare the
# validation RMSE.
activations = ['relu', 'tanh', 'logistic', 'identity']
activation_results = []

for act in activations:
    try:
        model = MLPRegressor(
            hidden_layer_sizes=(100, 50),
            activation=act,
            max_iter=200,
            random_state=42
        )
        model.fit(X_train_scaled, y_home_train)
        val_pred = model.predict(X_val_scaled)
        rmse = np.sqrt(mean_squared_error(y_home_val, val_pred))
        activation_results.append({'activation': act, 'rmse': rmse})
    except Exception as e:
        # Keep a row for the failed activation so it appears in the table.
        activation_results.append({'activation': act, 'rmse': None, 'error': str(e)})

act_df = pd.DataFrame(activation_results)
print("Activation Function Comparison:")
print(act_df.to_string(index=False))

all_trained = all(r.get('rmse') is not None for r in activation_results)
record_test("3a. All activations train", all_trained)

# ReLU should be among the best (within 20% of the lowest RMSE). Only runs
# that actually produced an RMSE take part in the comparison, so a failed
# run is reported as a failed check rather than crashing the cell.
trained_rmse = {
    r['activation']: r['rmse']
    for r in activation_results
    if r['rmse'] is not None
}
if 'relu' in trained_rmse:
    relu_rmse = trained_rmse['relu']
    best_rmse = min(trained_rmse.values())
    relu_competitive = relu_rmse <= best_rmse * 1.2
    record_test("3b. ReLU is competitive", relu_competitive,
               f"ReLU: {relu_rmse:.4f}, Best: {best_rmse:.4f}")
else:
    record_test("3b. ReLU is competitive", False,
               "ReLU model did not train; see the error column above")

## Test 4: Regularization (alpha)

In [None]:
# Sweep the L2 penalty (alpha) and record train/validation RMSE for each value.
alphas = [0.0001, 0.001, 0.01, 0.1, 1.0]
alpha_results = []

for alpha in alphas:
    try:
        model = MLPRegressor(
            hidden_layer_sizes=(100, 50),
            alpha=alpha,
            max_iter=200,
            random_state=42
        )
        model.fit(X_train_scaled, y_home_train)

        train_pred = model.predict(X_train_scaled)
        val_pred = model.predict(X_val_scaled)

        train_rmse = np.sqrt(mean_squared_error(y_home_train, train_pred))
        val_rmse = np.sqrt(mean_squared_error(y_home_val, val_pred))

        alpha_results.append({
            'alpha': alpha,
            'train_rmse': train_rmse,
            'val_rmse': val_rmse,
            # Ratio of train to validation RMSE; 0 when validation RMSE is 0.
            'overfit': train_rmse / val_rmse if val_rmse > 0 else 0
        })
    except Exception as e:
        # Failed runs carry only the alpha and the error text.
        alpha_results.append({'alpha': alpha, 'error': str(e)})

alpha_df = pd.DataFrame(alpha_results)
print("Alpha (L2 Regularization) Comparison:")
print(alpha_df.to_string(index=False))

# Higher alpha should increase training error (less overfitting)
try:
    low_alpha = next(r for r in alpha_results if r['alpha'] == 0.0001)
    high_alpha = next(r for r in alpha_results if r['alpha'] == 1.0)

    more_regularized = high_alpha['train_rmse'] >= low_alpha['train_rmse']
    record_test("4a. Alpha increases training error", more_regularized,
               f"α=0.0001: {low_alpha['train_rmse']:.4f}, α=1.0: {high_alpha['train_rmse']:.4f}")
except Exception as e:
    record_test("4a. Alpha increases training error", False, str(e))

# Report which alpha gave the best validation RMSE. This check is
# informational: it passes whenever at least one run produced a score.
scored_runs = [r for r in alpha_results if 'val_rmse' in r]
if scored_runs:
    best_val_alpha = min(scored_runs, key=lambda r: r['val_rmse'])
    record_test("4b. Best validation alpha", True,
               f"Best α={best_val_alpha['alpha']}: val RMSE={best_val_alpha['val_rmse']:.4f}")
else:
    # Every run failed, so there is no 'val_rmse' to format.
    record_test("4b. Best validation alpha", False,
               "No alpha value trained successfully")

## Test 5: Learning Rate Effects

In [None]:
# Sweep the initial learning rate and record RMSE, iteration count and the
# final training loss for each value.
learning_rates = [0.0001, 0.001, 0.01]
lr_results = []

for lr in learning_rates:
    try:
        model = MLPRegressor(
            hidden_layer_sizes=(100, 50),
            learning_rate_init=lr,
            max_iter=200,
            random_state=42
        )
        model.fit(X_train_scaled, y_home_train)
        val_pred = model.predict(X_val_scaled)
        rmse = np.sqrt(mean_squared_error(y_home_val, val_pred))

        lr_results.append({
            'learning_rate': lr,
            'rmse': rmse,
            'n_iter': model.n_iter_,
            'final_loss': model.loss_
        })
    except Exception as e:
        # Failed runs carry only the learning rate and the error text.
        lr_results.append({'learning_rate': lr, 'error': str(e)})

lr_df = pd.DataFrame(lr_results)
print("Learning Rate Comparison:")
print(lr_df.to_string(index=False))

# All should train
all_trained = all(r.get('rmse') is not None for r in lr_results)
record_test("5a. All learning rates train", all_trained)

# Higher LR should converge faster (fewer iterations) or have lower loss
try:
    low_lr = next(r for r in lr_results if r['learning_rate'] == 0.0001)
    high_lr = next(r for r in lr_results if r['learning_rate'] == 0.01)

    # Pass when the high-LR run needed no more iterations than the low-LR
    # run, or when its final loss is at most 1.5x the low-LR loss.
    faster = high_lr['n_iter'] <= low_lr['n_iter'] or high_lr['final_loss'] <= low_lr['final_loss'] * 1.5
    record_test("5b. Higher LR converges faster or similar", faster,
               f"LR=0.0001: {low_lr['n_iter']} iters, LR=0.01: {high_lr['n_iter']} iters")
except Exception as e:
    # Same label as the success path so the check has a single name in the summary.
    record_test("5b. Higher LR converges faster or similar", False, str(e))

## Test 6: Early Stopping

In [None]:
try:
    # Both models share the architecture, iteration cap and seed; only the
    # early-stopping settings differ.
    es_max_iter = 500
    shared_params = {
        'hidden_layer_sizes': (100, 50),
        'max_iter': es_max_iter,
        'random_state': 42,
    }

    # Variant that holds out 10% of the training rows and stops after 10
    # iterations without improvement
    model_es = MLPRegressor(
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
        **shared_params
    )
    model_es.fit(X_train_scaled, y_home_train)

    # Reference variant with early stopping switched off
    model_no_es = MLPRegressor(early_stopping=False, **shared_params)
    model_no_es.fit(X_train_scaled, y_home_train)

    ran_at_all = model_es.n_iter_ > 0
    record_test("6a. Early stopping trains", ran_at_all,
               f"Stopped at iteration {model_es.n_iter_}")

    # The early-stopping run is expected to finish before the iteration cap
    stopped_early = model_es.n_iter_ < es_max_iter
    record_test("6b. Early stopping before max_iter", stopped_early,
               f"ES: {model_es.n_iter_}, No ES: {model_no_es.n_iter_}")

    # The per-iteration validation scores should be exposed on the model
    has_val_score = hasattr(model_es, 'validation_scores_')
    record_test("6c. Validation scores tracked", has_val_score,
               f"N validation scores: {len(model_es.validation_scores_) if has_val_score else 0}")

    # ... and so should the best validation score
    has_best = hasattr(model_es, 'best_validation_score_')
    record_test("6d. Best validation score stored", has_best,
               f"Best: {model_es.best_validation_score_:.4f}" if has_best else "")
except Exception as e:
    record_test("6. Early stopping", False, str(e))

## Test 7: Serialization (Save/Load)

In [None]:
def _pickle_round_trip(obj, path):
    """Pickle *obj* to *path* and return the object loaded back from that file."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
    # Only files written by this notebook are loaded here; pickle must not
    # be used on files from untrusted sources.
    with open(path, 'rb') as f:
        return pickle.load(f)


try:
    # Train model
    model = MLPRegressor(
        hidden_layer_sizes=(100, 50),
        max_iter=200,
        random_state=42
    )
    model.fit(X_train_scaled, y_home_train)
    original_pred = model.predict(X_val_scaled)

    # All files live in a temporary directory that is removed when the
    # `with` block exits, even if one of the checks below raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        loaded_model = _pickle_round_trip(model, Path(tmp_dir) / 'model.pkl')
        loaded_pred = loaded_model.predict(X_val_scaled)

        # Predictions should match (np.allclose, i.e. within floating-point tolerance)
        match = np.allclose(original_pred, loaded_pred)
        record_test("7a. Pickle save/load predictions match", match,
                   f"Max diff: {np.abs(original_pred - loaded_pred).max():.6f}")

        # Model attributes preserved
        attrs_match = (
            loaded_model.hidden_layer_sizes == model.hidden_layer_sizes and
            loaded_model.n_iter_ == model.n_iter_
        )
        record_test("7b. Attributes preserved", attrs_match,
                   f"Layers: {loaded_model.hidden_layer_sizes}, Iterations: {loaded_model.n_iter_}")

        # Save scaler too (important for neural nets)
        loaded_scaler = _pickle_round_trip(scaler, Path(tmp_dir) / 'scaler.pkl')

        scaled_match = np.allclose(
            scaler.transform(X_val),
            loaded_scaler.transform(X_val)
        )
        record_test("7c. Scaler save/load", scaled_match)
except Exception as e:
    record_test("7. Serialization", False, str(e))

## Test 8: Edge Cases

In [None]:
def _validation_rmse(estimator, features):
    """Return the RMSE of *estimator* on the home-goal validation targets."""
    return np.sqrt(mean_squared_error(y_home_val, estimator.predict(features)))


# Test 8a: predicting a single row returns exactly one value
try:
    model = MLPRegressor(hidden_layer_sizes=(50,), max_iter=200, random_state=42)
    model.fit(X_train_scaled, y_home_train)

    single_pred = model.predict(X_val_scaled[:1])
    record_test("8a. Single sample prediction", len(single_pred) == 1,
               f"Prediction: {single_pred[0]:.4f}")
except Exception as e:
    record_test("8a. Single sample prediction", False, str(e))

# Test 8b: a network with only five hidden units still trains
try:
    tiny_model = MLPRegressor(hidden_layer_sizes=(5,), max_iter=200, random_state=42)
    tiny_model.fit(X_train_scaled, y_home_train)
    tiny_rmse = _validation_rmse(tiny_model, X_val_scaled)
    record_test("8b. Tiny network trains", True, f"RMSE: {tiny_rmse:.4f}")
except Exception as e:
    record_test("8b. Tiny network trains", False, str(e))

# Test 8c: predictions over the full data set stay in a plausible goal range
try:
    all_features_scaled = scaler.transform(X_data)
    all_pred = model.predict(all_features_scaled)
    min_pred = all_pred.min()
    max_pred = all_pred.max()

    # Accept anything between -2 and 12 (goals are roughly 0-10)
    reasonable = min_pred >= -2 and max_pred <= 12
    record_test("8c. Reasonable prediction range", reasonable,
               f"Range: [{min_pred:.2f}, {max_pred:.2f}]")
except Exception as e:
    record_test("8c. Reasonable prediction range", False, str(e))

# Test 8d: the same model on scaled inputs should not do worse than on raw inputs
try:
    comparison_params = {'hidden_layer_sizes': (100,), 'max_iter': 200, 'random_state': 42}

    model_scaled = MLPRegressor(**comparison_params)
    model_scaled.fit(X_train_scaled, y_home_train)
    scaled_rmse = _validation_rmse(model_scaled, X_val_scaled)

    model_unscaled = MLPRegressor(**comparison_params)
    model_unscaled.fit(X_train, y_home_train)
    unscaled_rmse = _validation_rmse(model_unscaled, X_val)

    scaling_helps = scaled_rmse <= unscaled_rmse
    record_test("8d. Scaling improves performance", scaling_helps,
               f"Scaled: {scaled_rmse:.4f}, Unscaled: {unscaled_rmse:.4f}")
except Exception as e:
    record_test("8d. Scaling improves performance", False, str(e))

# Test 8e: the model from 8a exposes a non-empty training loss curve
try:
    has_loss = len(getattr(model, 'loss_curve_', ())) > 0
    record_test("8e. Loss curve available", has_loss,
               f"Length: {len(model.loss_curve_) if has_loss else 0}")
except Exception as e:
    record_test("8e. Loss curve available", False, str(e))

## Test 9: Dual Goal Predictor

In [None]:
try:
    # One network per target (home goals, away goals), same settings for both
    dual_params = {
        'hidden_layer_sizes': (100, 50),
        'max_iter': 200,
        'random_state': 42,
    }
    home_model = MLPRegressor(**dual_params)
    away_model = MLPRegressor(**dual_params)

    home_model.fit(X_train_scaled, y_home_train)
    away_model.fit(X_train_scaled, y_away_train)

    home_pred = home_model.predict(X_val_scaled)
    away_pred = away_model.predict(X_val_scaled)

    home_rmse = np.sqrt(mean_squared_error(y_home_val, home_pred))
    away_rmse = np.sqrt(mean_squared_error(y_away_val, away_pred))

    # RMSE over both targets stacked end to end
    stacked_pred = np.concatenate([home_pred, away_pred])
    stacked_actual = np.concatenate([y_home_val, y_away_val])
    combined_rmse = np.sqrt(mean_squared_error(stacked_actual, stacked_pred))

    record_test("9a. Dual model training", True,
               f"Home RMSE: {home_rmse:.4f}, Away RMSE: {away_rmse:.4f}")
    record_test("9b. Combined performance", combined_rmse < 2.0,
               f"Combined RMSE: {combined_rmse:.4f}")

    # The two networks share architecture and seed but fit different
    # targets, so the first ten first-layer weights should not coincide.
    home_weights = home_model.coefs_[0].ravel()[:10]
    away_weights = away_model.coefs_[0].ravel()[:10]
    weights_close = np.allclose(home_weights, away_weights, rtol=0.1)
    weights_differ = not weights_close
    record_test("9c. Models learned different weights", weights_differ)
except Exception as e:
    record_test("9. Dual goal predictor", False, str(e))

## Test 10: Cross-Validation Stability

In [None]:
try:
    # Local import: only this cell needs the pipeline helper.
    from sklearn.pipeline import make_pipeline

    model = MLPRegressor(
        hidden_layer_sizes=(100, 50),
        max_iter=200,
        random_state=42
    )

    # Scale inside the pipeline so each fold fits its own StandardScaler on
    # that fold's training rows only. This avoids leaking statistics from the
    # held-out rows and leaves the shared `scaler` (fitted on X_train) intact.
    cv_pipeline = make_pipeline(StandardScaler(), model)

    # 5-fold cross-validation on the unscaled features
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(cv_pipeline, X_data, y_home, cv=kfold,
                                scoring='neg_root_mean_squared_error')

    # Scores are negated RMSE values, so flip the sign of the mean.
    mean_rmse = -cv_scores.mean()
    std_rmse = cv_scores.std()

    # Stability criterion: fold-to-fold std below 30% of the mean RMSE.
    cv_ratio = std_rmse / mean_rmse
    is_stable = cv_ratio < 0.3

    record_test("10. Cross-validation stability", is_stable,
               f"RMSE: {mean_rmse:.4f} (+/- {std_rmse:.4f}), CV ratio: {cv_ratio:.2%}")
except Exception as e:
    record_test("10. Cross-validation stability", False, str(e))

## Test Summary

In [None]:
# Summary of every check recorded in `test_results`
print("\n" + "=" * 60)
print(" NEURAL NETWORK VALIDATION SUMMARY")
print("=" * 60)

# Naming the columns keeps the frame well-formed even when nothing was recorded.
results_df = pd.DataFrame(test_results, columns=['test', 'passed', 'message'])
total = len(results_df)

if total == 0:
    # Avoid dividing by zero when the summary runs before any test cell.
    print("\nNo tests were recorded - run the test cells above first.")
else:
    # Force a boolean dtype so that `.sum()` counts passes and `~` negates
    # logically, whatever truthy/falsy values were recorded.
    passed_mask = results_df['passed'].astype(bool)
    passed = int(passed_mask.sum())

    print(f"\nPassed: {passed}/{total} ({passed/total*100:.1f}%)")

    if passed < total:
        print("\n❌ FAILED TESTS:")
        for _, row in results_df[~passed_mask].iterrows():
            print(f"   - {row['test']}: {row['message']}")
    else:
        print("\n✅ All tests passed!")

    # Show all results
    print("\nDetailed Results:")
    print(results_df.to_string(index=False))