## Model Evaluation

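Evaluate every trained model on the held-out test set: regression models are scored with MAE, RMSE and R², and classification models with precision, recall, F1, ROC-AUC and a confusion matrix.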


In [4]:
import pandas as pd
import numpy as np
import joblib
import json
from pathlib import Path
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
)

import os

def get_project_root(start=None, markers=('.git', 'requirements.txt', 'data', 'notebooks')):
    """Locate the project root by walking upward until a marker entry is found.

    Args:
        start: Directory to begin the search from (str or path-like). When
            omitted, the current working directory is used, which is the
            original behavior of this helper.
        markers: Names of files or directories whose presence inside a
            directory identifies it as the project root. A single string is
            treated as one marker name.

    Returns:
        pathlib.Path: The nearest directory (the start directory is checked
        first, then each ancestor in turn) that contains at least one marker.
        If no directory on the way up contains a marker, the parent of the
        start directory is returned when the start directory is named
        ``notebooks``; otherwise the start directory itself is returned.
    """
    # Anchor the search; abspath keeps relative inputs from having an empty
    # ancestor chain (Path('.').parents is empty).
    if start is None:
        current = Path(os.getcwd())
    else:
        current = Path(os.path.abspath(start))

    # Materialize the markers once so one-shot iterables can be re-checked
    # for every candidate directory, and a bare string is not split into
    # characters.
    if isinstance(markers, str):
        marker_names = (markers,)
    else:
        marker_names = tuple(markers)

    # Walk up the directory tree, nearest directory first
    for path in (current, *current.parents):
        # Check if this looks like the project root
        if any((path / marker).exists() for marker in marker_names):
            return path

    # Fallback: if we're in notebooks/, go up one level
    if current.name == 'notebooks':
        return current.parent

    # Last resort: the start directory
    return current

# Resolve the directory layout once; every path below hangs off the detected root.
project_root = get_project_root()
processed_dir = project_root.joinpath("data", "processed")
models_dir = project_root.joinpath("models")
reports_dir = project_root.joinpath("reports")
reports_dir.mkdir(exist_ok=True)  # no-op when the folder is already present

print("Loading test data...")
test_df = pd.read_csv(processed_dir.joinpath("test.csv"))

# Column names used to select the model inputs
# (presumably persisted by the training step — verify against that notebook).
feature_cols = joblib.load(models_dir.joinpath("feature_columns.pkl"))

# Targets first, then the feature matrix; the three extractions are independent.
y_test_clf = test_df['overcapacity'].values
y_test_reg = test_df['OCCUPANCY_RATE_BEDS'].values
X_test = test_df[feature_cols].values

# Quick sanity summary of what is about to be evaluated.
print(f"Evaluating on {X_test.shape[0]} test samples")
print(f"Regression target range: {np.min(y_test_reg):.2f} to {np.max(y_test_reg):.2f}")
print(f"Classification target distribution: {np.bincount(y_test_clf)}")


Loading test data...
Evaluating on 33161 test samples
Regression target range: 2.00 to 101.64
Classification target distribution: [ 5916 27245]


### Regression Model Evaluation


In [6]:
# Container for every metric produced in this notebook; the regression
# section is created here and the classification section is added later.
results = {'regression': {}}

# Short model key -> pickle file name inside models_dir.
regression_models = dict(
    lr='regression_lr.pkl',
    rf='regression_rf.pkl',
    xgb='regression_xgb.pkl',
)

for name in regression_models:
    print(f"\nEvaluating {name.upper()}...")
    model = joblib.load(models_dir / regression_models[name])
    y_pred = model.predict(X_test)

    # Store plain Python floats so the results stay JSON-serializable.
    scores = {
        'MAE': float(mean_absolute_error(y_test_reg, y_pred)),
        'RMSE': float(np.sqrt(mean_squared_error(y_test_reg, y_pred))),
        'R2': float(r2_score(y_test_reg, y_pred)),
    }
    results['regression'][name] = scores

    print(f"  MAE:  {scores['MAE']:.4f}")
    print(f"  RMSE: {scores['RMSE']:.4f}")
    print(f"  R²:   {scores['R2']:.4f}")



Evaluating LR...
  MAE:  3.2917
  RMSE: 8.0086
  R²:   0.4423

Evaluating RF...
  MAE:  1.9590
  RMSE: 3.2544
  R²:   0.9079

Evaluating XGB...
  MAE:  1.4441
  RMSE: 2.5280
  R²:   0.9444


### Classification Model Evaluation


In [9]:
# Short model key -> pickle file name inside models_dir.
classification_models = dict(
    lr='classification_lr.pkl',
    xgb='classification_xgb.pkl',
    rf='classification_rf.pkl',
)

results['classification'] = {}

for name in classification_models:
    print(f"\nEvaluating {name.upper()}...")
    model = joblib.load(models_dir / classification_models[name])

    # Hard labels feed the threshold metrics; column 1 of predict_proba is
    # used as the score for ROC-AUC.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Plain floats / nested lists keep the entry JSON-serializable.
    scores = {
        'Precision': float(precision_score(y_test_clf, y_pred, zero_division=0)),
        'Recall': float(recall_score(y_test_clf, y_pred, zero_division=0)),
        'F1': float(f1_score(y_test_clf, y_pred, zero_division=0)),
        'ROC-AUC': float(roc_auc_score(y_test_clf, y_pred_proba)),
        'Confusion_Matrix': confusion_matrix(y_test_clf, y_pred).tolist(),
    }
    results['classification'][name] = scores

    print(f"  Precision: {scores['Precision']:.4f}")
    print(f"  Recall:    {scores['Recall']:.4f} (most important for overcapacity detection)")
    print(f"  F1:        {scores['F1']:.4f}")
    print(f"  ROC-AUC:   {scores['ROC-AUC']:.4f}")
    print(f"  Confusion Matrix: {scores['Confusion_Matrix']}")



Evaluating LR...
  Precision: 0.9996
  Recall:    0.9941 (most important for overcapacity detection)
  F1:        0.9969
  ROC-AUC:   0.9998
  Confusion Matrix: [[5906, 10], [161, 27084]]

Evaluating XGB...
  Precision: 0.9689
  Recall:    1.0000 (most important for overcapacity detection)
  F1:        0.9842
  ROC-AUC:   0.9966
  Confusion Matrix: [[5041, 875], [0, 27245]]

Evaluating RF...
  Precision: 0.8508
  Recall:    0.9993 (most important for overcapacity detection)
  F1:        0.9191
  ROC-AUC:   0.9159
  Confusion Matrix: [[1143, 4773], [20, 27225]]


In [10]:
# Persist every collected metric as pretty-printed JSON under reports/.
results_path = reports_dir / "evaluation_results.json"
with open(results_path, 'w') as out_file:
    out_file.write(json.dumps(results, indent=2))

print(f"\n✓ Evaluation results saved to {results_path}")

# Banner for the summary that follows.
rule = "=" * 50
print("\n" + rule)
print("Best Models")
print(rule)

# Regression winner: smallest RMSE (the first entry wins a tie).
reg_scores = results['regression']
top_reg_name = min(reg_scores, key=lambda model_name: reg_scores[model_name]['RMSE'])
best_reg = (top_reg_name, reg_scores[top_reg_name])
print(f"Best Regression Model: {best_reg[0].upper()} (RMSE: {best_reg[1]['RMSE']:.4f})")

# Classification winner: rank by recall first, using F1 only to break ties.
clf_scores = results['classification']
top_clf_name = max(
    clf_scores,
    key=lambda model_name: (clf_scores[model_name]['Recall'], clf_scores[model_name]['F1']),
)
best_clf = (top_clf_name, clf_scores[top_clf_name])
print(f"Best Classification Model: {best_clf[0].upper()} (Recall: {best_clf[1]['Recall']:.4f}, F1: {best_clf[1]['F1']:.4f})")



✓ Evaluation results saved to /Users/qadeermac/workspace/ML-Toronto-Shelter-Occupancy /ML-Toronto-Shelter-Occupancy/reports/evaluation_results.json

Best Models
Best Regression Model: XGB (RMSE: 2.5280)
Best Classification Model: XGB (Recall: 1.0000, F1: 0.9842)
