# Model Validation Patterns

This notebook demonstrates common model validation patterns including:
- Cross-validation
- Performance metrics calculation
- Overfitting detection
- Model comparison

In [None]:
# Import required libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    mean_squared_error,
    r2_score,
    get_scorer
)
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

## 1. Cross-Validation Pattern

In [None]:
def perform_cross_validation(model, X, y, cv_folds=5, scoring='accuracy'):
    """
    Run shuffled k-fold cross-validation and summarise the fold scores.

    Parameters:
    -----------
    model : sklearn estimator
        Estimator handed to ``cross_val_score``
    X : array-like
        Feature matrix
    y : array-like
        Target vector
    cv_folds : int
        Number of splits
    scoring : str
        Scoring metric forwarded to ``cross_val_score``

    Returns:
    --------
    dict : ``mean_score``, ``std_score``, the raw per-fold ``scores`` and
        the ``cv_folds`` value that was used
    """
    # These metric names are treated as classification and get stratified
    # folds; every other metric gets a plain KFold.
    classification_metrics = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
    splitter_cls = StratifiedKFold if scoring in classification_metrics else KFold

    # Fixed seed keeps the fold assignment reproducible between calls.
    splitter = splitter_cls(n_splits=cv_folds, shuffle=True, random_state=42)
    fold_scores = cross_val_score(model, X, y, cv=splitter, scoring=scoring)

    summary = {}
    summary['mean_score'] = fold_scores.mean()
    summary['std_score'] = fold_scores.std()
    summary['scores'] = fold_scores
    summary['cv_folds'] = cv_folds
    return summary

## 2. Performance Metrics Calculation

In [None]:
def calculate_classification_metrics(y_true, y_pred, y_pred_proba=None):
    """
    Calculate comprehensive classification metrics.

    Parameters:
    -----------
    y_true : array-like
        Ground-truth labels
    y_pred : array-like
        Predicted labels
    y_pred_proba : array-like, optional
        Predicted scores/probabilities. May be a 1-D vector of positive-class
        scores, the two-column ``predict_proba`` output of a binary
        classifier, or an (n_samples, n_classes) matrix for multiclass.

    Returns:
    --------
    dict : ``accuracy``, weighted ``precision``/``recall``/``f1_score`` and,
        when ``y_pred_proba`` is given, ``roc_auc`` (``None`` if the AUC
        cannot be computed for the supplied inputs)
    """
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'f1_score': f1_score(y_true, y_pred, average='weighted', zero_division=0)
    }

    # Add ROC AUC if probability predictions available
    if y_pred_proba is not None:
        try:
            scores = np.asarray(y_pred_proba)
            # A binary predict_proba result has one column per class, but
            # roc_auc_score expects only the positive-class column there.
            # Restricted to 1-D targets so multilabel input is left alone.
            if scores.ndim == 2 and scores.shape[1] == 2 and np.ndim(y_true) == 1:
                scores = scores[:, 1]
            # multi_class is ignored for binary targets and is required for
            # multiclass targets (without it roc_auc_score raises).
            metrics['roc_auc'] = roc_auc_score(
                y_true, scores, average='weighted', multi_class='ovr'
            )
        except (ValueError, TypeError, IndexError):
            # Best effort: e.g. only one class present in y_true, or scores
            # whose shape does not match the targets. Unlike a bare except,
            # this lets KeyboardInterrupt/SystemExit propagate.
            metrics['roc_auc'] = None

    return metrics

def calculate_regression_metrics(y_true, y_pred):
    """
    Summarise regression quality.

    Returns a dict with the mean squared error (``mse``), its square root
    (``rmse``) and the coefficient of determination (``r2_score``).
    """
    squared_error = mean_squared_error(y_true, y_pred)
    report = {'mse': squared_error}
    # RMSE is derived from the MSE value computed above.
    report['rmse'] = np.sqrt(squared_error)
    report['r2_score'] = r2_score(y_true, y_pred)
    return report

## 3. Overfitting Detection

In [None]:
def detect_overfitting(model, X_train, y_train, X_test, y_test, scoring='accuracy',
                       overfit_threshold=0.1, high_severity_threshold=0.2):
    """
    Detect overfitting by comparing train and test performance.

    The model is fitted in place on the training data and then scored on
    both splits. The relative drop from train to test score
    (``(train - test) / train``) is compared against the thresholds.

    Parameters:
    -----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``; it is fitted
        by this call
    X_train, y_train : array-like
        Training data
    X_test, y_test : array-like
        Held-out data
    scoring : str
        ``'accuracy'``, ``'precision'``, ``'recall'`` or ``'f1'`` (the last
        three use weighted averaging). Any other value falls back to the
        estimator's own ``model.score``.
    overfit_threshold : float
        Relative drop above which the model is flagged as overfitting
        (also the lower bound for ``'medium'`` severity)
    high_severity_threshold : float
        Relative drop above which severity is ``'high'``; must not be
        smaller than ``overfit_threshold``

    Returns:
    --------
    dict : ``train_score``, ``test_score``, ``score_difference``,
        ``overfitting_ratio``, ``is_overfitting`` and ``severity``
        (``'low'``, ``'medium'`` or ``'high'``)

    Raises:
    -------
    ValueError
        If ``high_severity_threshold`` is smaller than ``overfit_threshold``
    """
    # Validate before fitting so a bad call does not mutate the model.
    if high_severity_threshold < overfit_threshold:
        raise ValueError(
            "high_severity_threshold must be >= overfit_threshold"
        )

    # Train the model
    model.fit(X_train, y_train)

    # Calculate train and test scores
    if scoring in ('accuracy', 'precision', 'recall', 'f1'):
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)

        if scoring == 'accuracy':
            train_score = accuracy_score(y_train, train_pred)
            test_score = accuracy_score(y_test, test_pred)
        else:
            # The remaining metrics share one call shape; look the function
            # up here so it is only resolved when actually needed.
            metric = {
                'precision': precision_score,
                'recall': recall_score,
                'f1': f1_score,
            }[scoring]
            train_score = metric(y_train, train_pred, average='weighted', zero_division=0)
            test_score = metric(y_test, test_pred, average='weighted', zero_division=0)
    else:
        # Unrecognised metric names use the estimator's default score.
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)

    # Calculate overfitting metrics
    score_difference = train_score - test_score
    # A non-positive train score gives no meaningful relative drop; report 0.
    overfitting_ratio = score_difference / train_score if train_score > 0 else 0

    # Determine if overfitting is present
    is_overfitting = overfitting_ratio > overfit_threshold

    if overfitting_ratio > high_severity_threshold:
        severity = 'high'
    elif is_overfitting:
        severity = 'medium'
    else:
        severity = 'low'

    return {
        'train_score': train_score,
        'test_score': test_score,
        'score_difference': score_difference,
        'overfitting_ratio': overfitting_ratio,
        'is_overfitting': is_overfitting,
        'severity': severity
    }

## 4. Model Comparison

In [None]:
def compare_models(models_dict, X_train, y_train, X_test, y_test, scoring='accuracy'):
    """
    Compare multiple models using cross-validation and test set performance.

    Each model is cross-validated on the training data, then fitted in
    place on the full training set and scored on the test set with the
    same ``scoring`` metric, so ``cv_mean`` and ``test_score`` are
    directly comparable.

    Parameters:
    -----------
    models_dict : dict
        Dictionary of model names and model instances
    X_train, y_train : array-like
        Training data
    X_test, y_test : array-like
        Test data
    scoring : str
        Scoring metric, used for both cross-validation and the test score

    Returns:
    --------
    pd.DataFrame : One row per model with columns ``model``, ``cv_mean``,
        ``cv_std`` and ``test_score``, sorted by ``test_score`` descending.
        Empty (but with those columns) when ``models_dict`` is empty.
    """
    columns = ['model', 'cv_mean', 'cv_std', 'test_score']
    results = []

    for name, model in models_dict.items():
        # Cross-validation
        cv_results = perform_cross_validation(model, X_train, y_train, scoring=scoring)

        # Test set performance: resolve the scorer from `scoring` rather
        # than using model.score, which always reports the estimator's
        # default metric regardless of what was requested.
        model.fit(X_train, y_train)
        test_score = get_scorer(scoring)(model, X_test, y_test)

        results.append({
            'model': name,
            'cv_mean': cv_results['mean_score'],
            'cv_std': cv_results['std_score'],
            'test_score': test_score
        })

    # Passing the columns explicitly keeps sort_values from raising a
    # KeyError when no models were supplied.
    return pd.DataFrame(results, columns=columns).sort_values('test_score', ascending=False)

## Example Usage

```python
# Load your data
# X, y = load_data()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Example: Cross-validation
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier()
# cv_results = perform_cross_validation(model, X_train, y_train)
# print(f"CV Score: {cv_results['mean_score']:.3f} (+/- {cv_results['std_score']:.3f})")

# Example: Overfitting detection
# overfitting_results = detect_overfitting(model, X_train, y_train, X_test, y_test)
# print(f"Overfitting detected: {overfitting_results['is_overfitting']}")

# Example: Model comparison
# from sklearn.svm import SVC
# from sklearn.linear_model import LogisticRegression
# models = {
#     'RandomForest': RandomForestClassifier(),
#     'SVM': SVC(),
#     'LogisticRegression': LogisticRegression()
# }
# comparison = compare_models(models, X_train, y_train, X_test, y_test)
# print(comparison)
```