# Model Evaluation - Stress Level Prediction

This notebook provides comprehensive evaluation and visualization of the trained models.

## Objectives:
1. Load trained models and test data
2. Generate detailed performance metrics
3. Create comprehensive visualizations
4. Analyze model strengths and weaknesses
5. Generate evaluation report
6. Inspect model predictions on sample test cases

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import joblib
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Set up plotting: matplotlib style sheet plus seaborn's "husl" colour palette.
# NOTE(review): the 'seaborn-v0_8' style name is only available in newer
# matplotlib releases — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Silences every warning for the rest of the session, including numerical
# warnings (e.g. divide-by-zero) that later cells could otherwise surface.
warnings.filterwarnings('ignore')

# Import custom modules
# '../src' is appended to sys.path so that the `utils` and `models` packages
# imported below can be resolved.
import sys
sys.path.append('../src')
# NOTE(review): later cells use PROCESSED_DATA_DIR, RESULTS_DIR, MODELS_DIR,
# FIGURES_DIR, REPORTS_DIR, get_figure_save_path and get_results_save_path,
# none of which are defined in this notebook — presumably they all come from
# this star import; verify against utils/config.py.
from utils.config import *
from models.model_evaluator import ModelEvaluator

print("Libraries imported successfully!")

## 1. Load Data and Models

In [None]:
# Load the final dataset; when the CSV is missing, synthesise a small
# demonstration dataset instead so the rest of the notebook can still run.
try:
    final_dataset_path = PROCESSED_DATA_DIR / "final_dataset.csv"
    df = pd.read_csv(final_dataset_path)
    print(f"Loaded dataset from: {final_dataset_path}")
except FileNotFoundError:
    print("Dataset not found. Creating sample data for demonstration.")
    np.random.seed(42)
    n_samples = 800
    feature_names = (
        'heart_rate', 'work_hours', 'sleep_hours',
        'exercise_minutes', 'bmi', 'caffeine_intake',
    )
    # Draw order matters for the seeded stream: every feature column first
    # (in the order listed above), then the target column.
    sample_data = {
        name: np.random.normal(0, 1, n_samples) for name in feature_names
    }
    sample_data['stress_level'] = np.random.choice(
        [0, 1, 2], n_samples, p=[0.3, 0.5, 0.2]
    )
    df = pd.DataFrame(sample_data)
    # Shift selected features per stress level so they carry signal
    predictive_shifts = (
        (2, 'heart_rate', 1.5),
        (2, 'work_hours', 1.2),
        (0, 'sleep_hours', 1.0),
    )
    for level, column, shift in predictive_shifts:
        df.loc[df['stress_level'] == level, column] += shift

# Split the frame into the target column and everything else
target_col = 'stress_level'
y = df[target_col]
X = df.drop(columns=[target_col])

# Recreate the stratified 80/20 split with the fixed seed
# (intended to match the split used in model development)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Dataset shape: {df.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Features: {list(X.columns)}")

In [None]:
# Load model training results; fall back to hard-coded mock scores when the
# results file does not exist.
try:
    results_path = RESULTS_DIR / "model_training_results.json"
    with open(results_path, 'r') as results_file:
        model_results = json.load(results_file)
except FileNotFoundError:
    print("Model results not found. Using mock results for demonstration.")
    # One row per mock model: (name, (accuracy, precision, recall, f1_score))
    metric_names = ('accuracy', 'precision', 'recall', 'f1_score')
    mock_rows = (
        ('random_forest', (0.85, 0.84, 0.85, 0.84)),
        ('gradient_boosting', (0.82, 0.81, 0.82, 0.81)),
        ('decision_tree', (0.78, 0.77, 0.78, 0.77)),
        ('logistic_regression', (0.75, 0.74, 0.75, 0.74)),
        ('svm', (0.73, 0.72, 0.73, 0.72)),
    )
    best_model_name = 'random_forest'
    all_model_scores = {
        name: dict(zip(metric_names, values)) for name, values in mock_rows
    }
else:
    print(f"Loaded model results from: {results_path}")
    best_model_name = model_results['best_model']['name']
    all_model_scores = model_results['all_model_scores']

print(f"Best model: {best_model_name}")
print(f"Models evaluated: {list(all_model_scores.keys())}")

In [None]:
# Load trained models and collect their test-set predictions.
#
# For every model named in `all_model_scores` this cell fills:
#   trained_models[name]      - the loaded estimator (only when the file exists)
#   model_predictions[name]   - predicted labels for X_test
#   model_probabilities[name] - per-class probabilities for X_test (when available)
# When a model file is missing, mock predictions are synthesised that hit the
# model's recorded accuracy, together with probabilities that agree with them.
trained_models = {}
model_predictions = {}
model_probabilities = {}

# Sorted class labels; mock probability arrays use one column per label in
# this order.
class_labels = np.unique(y)
n_classes = len(class_labels)
label_to_column = {label: col for col, label in enumerate(class_labels)}

for model_name in all_model_scores.keys():
    model_path = MODELS_DIR / f"{model_name}_model.joblib"
    try:
        # NOTE: joblib.load unpickles arbitrary objects - only load model
        # files that come from a trusted source.
        model = joblib.load(model_path)
    except FileNotFoundError:
        print(f"Model file not found for {model_name}. Creating mock predictions.")
        # Re-seed per model so each model's mock output is reproducible
        # regardless of which other models were found on disk.
        np.random.seed(42)
        accuracy = all_model_scores[model_name]['accuracy']
        # Clamp so an out-of-range recorded accuracy cannot yield a negative
        # (or oversized) number of samples to corrupt.
        n_correct = min(len(y_test), max(0, int(len(y_test) * accuracy)))

        # Start from the ground truth and corrupt just enough samples to
        # reach the target accuracy.
        predictions = y_test.copy().values
        incorrect_indices = np.random.choice(len(predictions),
                                             size=len(predictions) - n_correct,
                                             replace=False)
        for idx in incorrect_indices:
            # Replace the label with a randomly chosen *different* class,
            # using the classes actually present in the data.
            alternatives = class_labels[class_labels != predictions[idx]]
            if len(alternatives) > 0:
                predictions[idx] = np.random.choice(alternatives)

        model_predictions[model_name] = predictions

        # Mock probabilities: draw a random distribution per sample, then move
        # each row's largest value into the column of the predicted class so
        # the argmax of the probabilities agrees with the mock prediction.
        probs = np.random.dirichlet(np.ones(n_classes), size=len(X_test))
        rows = np.arange(len(probs))
        predicted_cols = np.array(
            [label_to_column[label] for label in predictions], dtype=int
        )
        top_cols = probs.argmax(axis=1)
        top_values = probs[rows, top_cols].copy()
        probs[rows, top_cols] = probs[rows, predicted_cols]
        probs[rows, predicted_cols] = top_values
        model_probabilities[model_name] = probs
    else:
        trained_models[model_name] = model
        model_predictions[model_name] = model.predict(X_test)
        # Not every estimator exposes probabilities
        if hasattr(model, 'predict_proba'):
            model_probabilities[model_name] = model.predict_proba(X_test)
        print(f"Loaded {model_name} from {model_path}")

print(f"\nLoaded/created predictions for {len(model_predictions)} models")

## 2. Initialize Model Evaluator

In [None]:
# Human-readable names for the encoded stress levels (position = label value)
class_names = [f'{level} Stress' for level in ('Low', 'Medium', 'High')]

# Evaluator helper used for the reports and figures below
evaluator = ModelEvaluator(figsize=(12, 8))

print("Model evaluator initialized!")
print(f"Class names: {class_names}")

## 3. Detailed Classification Reports

In [None]:
# Build one classification report per model, keyed by model name
classification_reports = {}

print("DETAILED CLASSIFICATION REPORTS")
print("=" * 60)

for model_name in model_predictions:
    heading = model_name.upper().replace('_', ' ')
    print(f"\n{heading}")
    print("-" * 40)

    # The evaluator's return value is kept so it can be saved later
    classification_reports[model_name] = evaluator.generate_classification_report(
        y_test,
        model_predictions[model_name],
        target_names=class_names,
    )
    print()

## 4. Confusion Matrices

In [None]:
# Generate confusion matrices for all models in a single overview grid,
# then save one standalone figure per model.

# Labels are encoded 0..n-1 in the same order as class_names; passing them
# explicitly keeps every matrix the same shape as the tick labels even when a
# class is missing from both y_test and a model's predictions.
label_ids = list(range(len(class_names)))

# Size the grid from the number of models so none are silently dropped
n_models = len(model_predictions)
n_cols = 3
n_rows = max(1, (n_models + n_cols - 1) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, (model_name, predictions) in zip(axes, model_predictions.items()):
    cm = confusion_matrix(y_test, predictions, labels=label_ids)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name.replace("_", " ").title()}')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')

# Hide every unused panel, not just the last one
for ax in axes[n_models:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

# Save individual confusion matrices
for model_name, predictions in model_predictions.items():
    save_path = get_figure_save_path(f'confusion_matrix_{model_name}')
    evaluator.plot_confusion_matrix(
        y_test, predictions, labels=class_names, save_path=str(save_path)
    )
    plt.close()  # Close to save memory

## 5. Model Performance Comparison

In [None]:
# Comparison figure across all models (also written to the figures directory)
comparison_save_path = str(get_figure_save_path('model_performance_comparison'))
comparison_fig = evaluator.compare_models_performance(
    all_model_scores, save_path=comparison_save_path
)
plt.show()

# Score table: one row per model, rounded to 4 decimals, ranked by accuracy
# (ties share the lowest rank) and sorted best-first.
scores_df = (
    pd.DataFrame(all_model_scores)
    .T
    .round(4)
    .assign(
        rank=lambda table: table['accuracy']
        .rank(ascending=False, method='min')
        .astype(int)
    )
    .sort_values('rank')
)

metric_columns = ['accuracy', 'precision', 'recall', 'f1_score']

print("MODEL PERFORMANCE RANKING")
print("=" * 50)
print(scores_df[['rank'] + metric_columns])

# Report the top model for each individual metric
print("\nBest Performance per Metric:")
for metric in metric_columns:
    metric_scores = scores_df[metric]
    best_model = metric_scores.idxmax()
    best_score = metric_scores.max()
    print(f"  {metric.title():12s}: {best_model:20s} ({best_score:.4f})")

## 6. ROC Curves (One-vs-Rest for Multiclass Classification)

In [None]:
# One-vs-rest ROC analysis: each class is scored against all the others
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from itertools import cycle

# One indicator column per class, as needed for per-class ROC curves
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
n_classes = y_test_bin.shape[1]

# Only the best model is plotted, and only if probabilities exist for it
if best_model_name not in model_probabilities:
    print(f"Probability predictions not available for {best_model_name}")
else:
    y_proba_best = model_probabilities[best_model_name]

    fig, ax = plt.subplots(figsize=(12, 8))
    color_cycle = cycle(['aqua', 'darkorange', 'cornflowerblue'])

    for class_idx in range(n_classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, class_idx], y_proba_best[:, class_idx])
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, color=next(color_cycle), lw=2,
                label=f'{class_names[class_idx]} (AUC = {roc_auc:.2f})')

    # Diagonal reference line for a classifier with no skill
    ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curves - {best_model_name.replace("_", " ").title()} (One-vs-Rest)')
    ax.legend(loc="lower right")
    ax.grid(True, alpha=0.3)

    # Write the figure to disk before showing it
    save_path = get_figure_save_path(f'roc_curves_{best_model_name}')
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"ROC curves saved to: {save_path}")

## 7. Learning Curves (if models available)

In [None]:
# Learning curves need a real estimator, so they are drawn only when the
# best model was actually loaded from disk.
if best_model_name not in trained_models:
    print(f"Trained model not available for {best_model_name}. Skipping learning curves.")
else:
    best_model = trained_models[best_model_name]

    # The evaluator is handed the full dataset (train rows followed by test rows)
    X_combined = pd.concat([X_train, X_test])
    y_combined = pd.concat([y_train, y_test])

    curve_title = best_model_name.replace('_', ' ').title()
    curve_path = str(get_figure_save_path(f'learning_curve_{best_model_name}'))
    learning_curve_fig = evaluator.plot_learning_curves(
        best_model,
        X_combined,
        y_combined,
        model_name=curve_title,
        save_path=curve_path,
    )
    plt.show()

## 8. Per-Class Performance Analysis

In [None]:
# Analyze per-class performance (precision / recall / F1) for all models and
# plot one grouped bar chart per metric.
per_class_metrics = {}

# Labels are encoded 0..n-1 in class_names order; passing them explicitly
# guarantees one confusion-matrix row/column per entry of class_names.
label_ids = list(range(len(class_names)))


def _zero_safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator that yields 0.0 where the denominator is 0.

    Every ratio computed here has a zero numerator whenever its denominator is
    zero (no predictions / no samples for a class), so replacing a zero
    denominator with 1 gives 0.0 instead of NaN.
    """
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    return numerator / (denominator + (denominator == 0))


for model_name, predictions in model_predictions.items():
    cm = confusion_matrix(y_test, predictions, labels=label_ids)
    true_positives = cm.diagonal()

    # Columns are predicted labels, rows are true labels
    class_precision = _zero_safe_ratio(true_positives, cm.sum(axis=0))
    class_recall = _zero_safe_ratio(true_positives, cm.sum(axis=1))
    class_f1 = _zero_safe_ratio(2 * class_precision * class_recall,
                                class_precision + class_recall)

    per_class_metrics[model_name] = {
        'precision': class_precision,
        'recall': class_recall,
        'f1_score': class_f1
    }

# Create per-class performance visualization
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
metrics = ['precision', 'recall', 'f1_score']

for i, metric in enumerate(metrics):
    # Long-format records: one row per (model, class)
    metric_data = [
        {
            'Model': model_name.replace('_', ' ').title(),
            'Class': class_name,
            'Value': per_class_metrics[model_name][metric][class_idx],
        }
        for model_name in model_predictions.keys()
        for class_idx, class_name in enumerate(class_names)
    ]
    metric_df = pd.DataFrame(metric_data, columns=['Model', 'Class', 'Value'])

    # Grouped bar plot; pivot sorts columns alphabetically, so restore the
    # Low -> Medium -> High order of class_names.
    pivot_df = (
        metric_df
        .pivot(index='Model', columns='Class', values='Value')
        .reindex(columns=class_names)
    )
    pivot_df.plot(kind='bar', ax=axes[i], rot=45)
    axes[i].set_title(f'Per-Class {metric.title()}')
    axes[i].set_ylabel(metric.title())
    axes[i].legend(title='Stress Level')
    axes[i].grid(True, alpha=0.3)

plt.tight_layout()
save_path = get_figure_save_path('per_class_performance')
plt.savefig(save_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"Per-class performance analysis saved to: {save_path}")

## 9. Error Analysis

In [None]:
# Perform error analysis for the best model: count the misclassified test
# samples, list which true -> predicted confusions occur, and plot them.
best_predictions = np.asarray(model_predictions[best_model_name])

# Boolean Series (indexed like y_test) marking the misclassified samples
misclassified_mask = y_test != best_predictions
misclassified_indices = y_test[misclassified_mask].index

print(f"ERROR ANALYSIS - {best_model_name.upper()}")
print("=" * 50)
print(f"Total test samples: {len(y_test)}")
print(f"Correctly classified: {(~misclassified_mask).sum()}")
print(f"Misclassified: {misclassified_mask.sum()}")
print(f"Error rate: {misclassified_mask.sum() / len(y_test):.4f}")

# Analyze error patterns. The predictions are a plain array, so they are
# filtered with the positional (numpy) form of the mask rather than the
# index-carrying Series.
error_patterns = pd.DataFrame({
    'True_Label': y_test[misclassified_mask],
    'Predicted_Label': best_predictions[misclassified_mask.to_numpy()]
})

print(f"\nError Patterns:")
error_counts = error_patterns.groupby(['True_Label', 'Predicted_Label']).size().reset_index(name='Count')
for row in error_counts.itertuples(index=False):
    true_class = class_names[int(row.True_Label)]
    pred_class = class_names[int(row.Predicted_Label)]
    print(f"  {true_class} -> {pred_class}: {row.Count} cases")

# Visualize error patterns. A heatmap cannot be drawn from an empty table,
# so the plot is skipped when the model made no mistakes.
if error_patterns.empty:
    print("  No misclassified samples - skipping error pattern matrix.")
else:
    plt.figure(figsize=(8, 6))
    error_matrix = pd.crosstab(error_patterns['True_Label'], error_patterns['Predicted_Label'],
                               rownames=['True'], colnames=['Predicted'])
    sns.heatmap(error_matrix, annot=True, fmt='d', cmap='Reds')
    plt.title(f'Error Pattern Matrix - {best_model_name.replace("_", " ").title()}')
    plt.show()

## 10. Feature Impact Analysis

In [None]:
# Analyze feature impact on misclassifications: compare feature means and
# distributions between correctly and incorrectly classified test samples.
if len(misclassified_indices) > 0:
    X_test_reset = X_test.reset_index(drop=True)

    # The mask from the error analysis carries y_test's original row labels,
    # which no longer match the reset 0..n-1 index. Convert it to positional
    # boolean arrays so rows are selected by position instead of by label.
    incorrect_mask = np.asarray(misclassified_mask, dtype=bool)
    correct_mask = ~incorrect_mask

    X_correct = X_test_reset[correct_mask]
    X_incorrect = X_test_reset[incorrect_mask]

    print(f"\nFEATURE IMPACT ON MISCLASSIFICATIONS")
    print("=" * 40)

    # Compare feature means
    correct_means = X_correct.mean()
    incorrect_means = X_incorrect.mean()
    feature_comparison = pd.DataFrame({
        'Correct_Mean': correct_means,
        'Incorrect_Mean': incorrect_means,
        'Difference': incorrect_means - correct_means
    })

    # Rank features by how far the two group means are apart
    feature_comparison['Abs_Difference'] = feature_comparison['Difference'].abs()
    feature_comparison = feature_comparison.sort_values('Abs_Difference', ascending=False)

    print("Feature differences (Misclassified vs Correct):")
    print(feature_comparison.round(4))

    # Visualize feature distributions
    n_features = min(4, len(X.columns))  # Show top 4 features
    top_features = feature_comparison.head(n_features).index

    plt.figure(figsize=(16, 4))
    for i, feature in enumerate(top_features, 1):
        plt.subplot(1, n_features, i)

        plt.hist(X_correct[feature], alpha=0.7,
                 label='Correct', bins=20, density=True)
        plt.hist(X_incorrect[feature], alpha=0.7,
                 label='Misclassified', bins=20, density=True)

        plt.title(f'{feature}')
        plt.xlabel('Feature Value')
        plt.ylabel('Density')
        plt.legend()
        plt.grid(True, alpha=0.3)

    plt.tight_layout()
    save_path = get_figure_save_path('feature_impact_analysis')
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Feature impact analysis saved to: {save_path}")
else:
    print("Perfect classification! No misclassified samples to analyze.")

## 11. Model Predictions on Sample Data

In [None]:
# Show every model's prediction for a few randomly chosen test samples,
# alongside the true label and the sample's feature values.

# Never ask for more samples than the test set holds
n_samples_to_show = min(5, len(X_test))
sample_indices = np.random.choice(X_test.index, size=n_samples_to_show, replace=False)
sample_data = X_test.loc[sample_indices]
sample_true = y_test.loc[sample_indices]

# Row label -> position within X_test, built once. Prediction arrays are
# positional, so each sampled label must be translated to its position.
# setdefault keeps the first position should a label ever repeat.
test_positions = {}
for position, row_label in enumerate(X_test.index):
    test_positions.setdefault(row_label, position)

print("SAMPLE PREDICTIONS")
print("=" * 50)

for idx in sample_indices:
    test_idx = test_positions[idx]
    # .loc makes the lookup explicitly label-based
    true_label = sample_true.loc[idx]

    print(f"\nSample {idx}:")
    print(f"True Label: {class_names[int(true_label)]}")

    # Show predictions from all models
    for model_name, predictions in model_predictions.items():
        pred_label = predictions[test_idx]
        pred_class = class_names[int(pred_label)]

        # Add confidence (highest class probability) if available
        confidence = ""
        if model_name in model_probabilities:
            proba = model_probabilities[model_name][test_idx]
            max_proba = np.max(proba)
            confidence = f" (confidence: {max_proba:.3f})"

        correct = "✓" if pred_label == true_label else "✗"
        print(f"  {model_name:20s}: {pred_class:15s} {correct}{confidence}")

    # Show feature values
    print(f"  Features: {sample_data.loc[idx].to_dict()}")

## 12. Generate Evaluation Summary

In [None]:
# Generate comprehensive evaluation summary and extend it with the error
# analysis and dataset facts computed earlier in the notebook.
evaluation_summary = evaluator.generate_evaluation_summary(
    all_model_scores, best_model_name
)

# Count each (true, predicted) confusion. The grouped result is keyed by
# tuples, which the json module rejects as dictionary keys, so the keys are
# rendered as "True Class -> Predicted Class" strings.
error_pair_counts = error_patterns.groupby(['True_Label', 'Predicted_Label']).size()
per_class_errors = {
    f"{class_names[int(true_label)]} -> {class_names[int(pred_label)]}": int(count)
    for (true_label, pred_label), count in error_pair_counts.items()
}

# Plain Python numbers so they are stored as JSON numbers, not strings
n_misclassified = int(misclassified_mask.sum())

# Add additional analysis results
evaluation_summary['error_analysis'] = {
    'total_test_samples': len(y_test),
    'misclassified_samples': n_misclassified,
    'error_rate': float(n_misclassified / len(y_test)),
    'per_class_errors': per_class_errors
}

evaluation_summary['dataset_info'] = {
    'n_features': int(X.shape[1]),
    'n_samples': len(df),
    'n_test_samples': len(X_test),
    'feature_names': list(X.columns),
    'class_names': class_names,
    'class_distribution': {
        int(label): int(count) for label, count in y_test.value_counts().items()
    }
}

print("\nEVALUATION SUMMARY GENERATED")
print("=" * 40)
print(f"Best Model: {evaluation_summary['best_model']}")
print(f"Best Accuracy: {evaluation_summary['best_model_scores']['accuracy']:.4f}")
print(f"Error Rate: {evaluation_summary['error_analysis']['error_rate']:.4f}")
print(f"Models Evaluated: {len(evaluation_summary['all_model_scores'])}")

## 13. Save Evaluation Report

In [None]:
# Text report produced by the evaluator
report_path = get_results_save_path('model_evaluation_report')
evaluator.save_evaluation_report(str(report_path), evaluation_summary)

# Summary as JSON; values the encoder cannot handle are stringified
json_path = RESULTS_DIR / "evaluation_summary.json"
with open(json_path, 'w') as summary_file:
    json.dump(evaluation_summary, summary_file, indent=2, default=str)

# Detailed results: numpy arrays become lists so they can be serialised
per_class_metrics_lists = {}
for model_key, metric_arrays in per_class_metrics.items():
    per_class_metrics_lists[model_key] = {
        metric: metric_arrays[metric].tolist() for metric in metric_arrays.keys()
    }

# The feature comparison only exists when something was misclassified
if len(misclassified_indices) > 0:
    feature_comparison_dict = feature_comparison.to_dict()
else:
    feature_comparison_dict = {}

detailed_results = {
    'classification_reports': classification_reports,
    'per_class_metrics': per_class_metrics_lists,
    'feature_comparison': feature_comparison_dict,
    'model_scores_df': scores_df.to_dict()
}

detailed_path = RESULTS_DIR / "detailed_evaluation_results.json"
with open(detailed_path, 'w') as detailed_file:
    json.dump(detailed_results, detailed_file, indent=2, default=str)

print(f"\nEvaluation reports saved:")
for label, location in (
    ("Text report", report_path),
    ("Summary JSON", json_path),
    ("Detailed JSON", detailed_path),
):
    print(f"  {label}: {location}")
print(f"  Figures saved to: {FIGURES_DIR}")

## 14. Final Recommendations

In [None]:
# Generate final recommendations from the best model's recorded scores and
# the error rate measured on the test set.
print("FINAL RECOMMENDATIONS")
print("=" * 50)

best_accuracy = evaluation_summary['best_model_scores']['accuracy']
best_f1 = evaluation_summary['best_model_scores']['f1_score']
best_model_title = best_model_name.replace('_', ' ').title()

print(f"✓ Best Model: {best_model_title}")
print(f"✓ Model Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.1f}%)")
print(f"✓ Model F1-Score: {best_f1:.4f}")

# Verdict bands on accuracy; anything below 0.75 is not considered deployable
deployable = best_accuracy >= 0.75
if best_accuracy >= 0.85:
    print("\n🎯 EXCELLENT: Model shows excellent performance for deployment")
elif best_accuracy >= 0.80:
    print("\n✅ GOOD: Model shows good performance, suitable for most applications")
elif deployable:
    print("\n⚠️  FAIR: Model shows fair performance, consider improvements")
else:
    print("\n❌ POOR: Model needs significant improvement before deployment")

print("\nActionable Insights:")

# Performance insights
if best_f1 < best_accuracy - 0.05:
    print("• Consider class balancing techniques (F1 significantly lower than accuracy)")

# Error analysis insights
error_rate = evaluation_summary['error_analysis']['error_rate']
if error_rate > 0.2:
    print(f"• High error rate ({error_rate:.1%}) - consider feature engineering or data collection")

# Model selection insights: compare against the highest-ranked model that is
# not the best model itself, rather than assuming the best model sits at
# position 0 of the ranking.
model_ranking = evaluation_summary['model_ranking']
runners_up = [name for name in model_ranking if name != best_model_name]
if runners_up:
    second_best = runners_up[0]
    second_best_acc = all_model_scores[second_best]['accuracy']
    if abs(best_accuracy - second_best_acc) < 0.02:
        print(f"• Consider ensemble methods - {second_best} performs similarly")

print("\nNext Steps:")
# Do not recommend deployment right after declaring the model not ready
if deployable:
    print(f"1. Deploy {best_model_title} for production use")
else:
    print(f"1. Improve {best_model_title} before considering production deployment")
print("2. Set up monitoring for model performance in production")
print("3. Collect more data to improve model performance")
print("4. Consider ensemble methods for improved accuracy")
print("5. Implement model retraining pipeline for continuous improvement")

print("\n" + "=" * 50)
print("MODEL EVALUATION COMPLETED SUCCESSFULLY!")
print(f"All results and visualizations saved to: {REPORTS_DIR}")