# Model Development - Stress Level Prediction

This notebook develops and compares different machine learning models for stress level prediction.

## Objectives:
1. Load processed dataset with selected features
2. Split data into training and testing sets
3. Train multiple classification models
4. Compare model performances
5. Select the best model
6. Tune the best model's hyperparameters
7. Save the trained models

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Set up plotting
# NOTE(review): the 'seaborn-v0_8' style name is only known to newer matplotlib
# releases (3.6+ as far as I know) — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Silences every warning category for the rest of the kernel session, including
# convergence/deprecation warnings emitted while models are trained below.
warnings.filterwarnings('ignore')

# Import custom modules
import sys
# Relative entry: it only resolves when the kernel's working directory is the
# folder that has ../src next to it (presumably the notebooks folder — confirm).
sys.path.append('../src')
# NOTE(review): later cells use PROCESSED_DATA_DIR, MODELS_DIR and RESULTS_DIR,
# none of which are defined in this notebook — presumably they arrive through
# this star import; consider importing them by name.
from utils.config import *
from models.model_trainer import ModelTrainer

print("Libraries imported successfully!")

## 1. Load Processed Dataset

In [None]:
# Read the feature-selected dataset; if the CSV is missing, build a seeded
# synthetic stand-in so the remaining cells can still be executed.
final_dataset_path = PROCESSED_DATA_DIR / "final_dataset.csv"
try:
    df = pd.read_csv(final_dataset_path)
    print(f"Loaded final dataset from: {final_dataset_path}")
except FileNotFoundError:
    print("Final dataset not found. Please run the feature selection notebook first.")
    # Demonstration data only: seed the global NumPy generator for repeatability
    np.random.seed(42)
    n_samples = 800

    # Standard-normal features, drawn in this exact order from the seeded stream,
    # followed by the three-level target
    feature_names = ['heart_rate', 'work_hours', 'sleep_hours',
                     'exercise_minutes', 'bmi', 'caffeine_intake']
    sample_data = {name: np.random.normal(0, 1, n_samples) for name in feature_names}
    sample_data['stress_level'] = np.random.choice([0, 1, 2], n_samples, p=[0.3, 0.5, 0.2])
    df = pd.DataFrame(sample_data)

    # Shift selected features for specific classes so the target becomes learnable
    class_shifts = [
        (2, 'heart_rate', 1.5),
        (2, 'work_hours', 1.2),
        (0, 'sleep_hours', 1.0),
        (0, 'exercise_minutes', 0.8),
    ]
    for level, column, delta in class_shifts:
        df.loc[df['stress_level'] == level, column] += delta

print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
df.head()

In [None]:
# Split the frame into the feature matrix X and the target vector y
target_col = 'stress_level'
y = df[target_col]
X = df.drop(columns=[target_col])

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns: {list(X.columns)}")

# Class balance as absolute counts and as shares, both ordered by class label
target_counts = y.value_counts().sort_index()
target_shares = y.value_counts(normalize=True).sort_index()
print("\nTarget distribution:")
print(target_counts)
print("\nTarget proportions:")
print(target_shares)

# Visualize target distribution: counts on the left, shares on the right
fig, (ax_counts, ax_shares) = plt.subplots(1, 2, figsize=(10, 4))

target_counts.plot(kind='bar', ax=ax_counts)
ax_counts.set_title('Target Distribution (Counts)')
ax_counts.set_xlabel('Stress Level')
ax_counts.set_ylabel('Count')
plt.setp(ax_counts.get_xticklabels(), rotation=0)

target_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax_shares)
ax_shares.set_title('Target Distribution (Proportions)')
ax_shares.set_ylabel('')

fig.tight_layout()
plt.show()

## 2. Train-Test Split

In [None]:
# Hold out a stratified test partition; the fixed seed makes the split repeatable
test_size = 0.2
random_state = 42

split_parts = train_test_split(
    X, y, test_size=test_size, random_state=random_state, stratify=y
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
print(f"Test size: {test_size*100}%")

# Per-class sample counts in each partition
for split_label, split_target in (("Training", y_train), ("Testing", y_test)):
    print(f"\n{split_label} set target distribution:")
    print(split_target.value_counts().sort_index())

# Check if stratification worked: class shares should agree within 5 points
train_props = y_train.value_counts(normalize=True).sort_index()
test_props = y_test.value_counts(normalize=True).sort_index()

print(f"\nProportions in training set: {train_props.values}")
print(f"Proportions in testing set: {test_props.values}")
shares_match = np.allclose(train_props.values, test_props.values, atol=0.05)
print(f"Stratification successful: {shares_match}")

## 3. Initialize Model Trainer

In [None]:
# Create the trainer and let it build its models with default parameters
trainer = ModelTrainer()
models = trainer.initialize_models()

# One line per model: registry key and estimator class
model_lines = [
    f"  - {name}: {type(estimator).__name__}"
    for name, estimator in models.items()
]
print(f"Initialized {len(models)} models:")
for line in model_lines:
    print(line)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Training target shape: {y_train.shape}")

## 4. Train All Models

In [None]:
# Fit every initialized model on the training partition
print("Training all models...", "=" * 30, sep="\n")

trained_models = trainer.train_all_models(X_train, y_train)

trained_names = list(trained_models.keys())
print(f"\nAll {len(trained_models)} models trained successfully!")
print(f"Trained models: {trained_names}")

## 5. Evaluate All Models

In [None]:
# Score every trained model on the held-out test partition
print("Evaluating all models...", "=" * 30, sep="\n")

model_scores = trainer.evaluate_all_models(X_test, y_test)

n_evaluated = len(model_scores)
print("\nModel evaluation completed!")
print(f"Evaluated {n_evaluated} models")

## 6. Model Performance Comparison

In [None]:
# Create a comprehensive comparison of model performances.
# model_scores is transposed so that each row is a model and each column a metric.
scores_df = pd.DataFrame(model_scores).T
scores_df = scores_df.round(4)

print("MODEL PERFORMANCE COMPARISON")
print("=" * 50)
print(scores_df)

# Visualize model comparison: one panel per metric, models sorted best-first
metrics = ['accuracy', 'precision', 'recall', 'f1_score']
n_metrics = len(metrics)

plt.figure(figsize=(16, 4))

for i, metric in enumerate(metrics, 1):
    plt.subplot(1, n_metrics, i)
    metric_scores = scores_df[metric].sort_values(ascending=False)
    bars = plt.bar(range(len(metric_scores)), metric_scores.values)
    plt.title(f'{metric.title()} Comparison')
    plt.ylabel(metric.title())
    plt.xticks(range(len(metric_scores)), metric_scores.index, rotation=45)
    plt.ylim(0, 1)

    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{height:.3f}', ha='center', va='bottom', fontsize=8)

    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Find best model for each metric.
# The loop variables deliberately avoid the names `best_model` / `best_scores`:
# those are bound to the selected estimator and its scores in the "Get Best Model"
# cell, and rebinding `best_model` to a string here would break later
# `best_model.predict(...)` calls whenever this cell is re-run afterwards.
print("\nBest models for each metric:")
for metric in metrics:
    metric_column = scores_df[metric]
    top_model_name = metric_column.idxmax()
    top_score = metric_column.max()
    print(f"  {metric:12s}: {top_model_name:20s} ({top_score:.4f})")

## 7. Cross-Validation Analysis

In [None]:
# Run 5-fold cross-validation on the training partition for every model
print("Performing cross-validation analysis...")

cv_results = {
    model_name: trainer.cross_validate_model(model_name, X_train, y_train, cv=5)
    for model_name in models.keys()
}

# Keep only the mean/std summary per model, one row per model, best mean first
summary_keys = ['mean_cv_score', 'std_cv_score']
cv_summary = {
    name: {key: result[key] for key in summary_keys}
    for name, result in cv_results.items()
}
cv_df = pd.DataFrame(cv_summary).T
cv_df = cv_df.round(4).sort_values('mean_cv_score', ascending=False)

print("\nCROSS-VALIDATION RESULTS")
print("=" * 40)
print(cv_df)

# Bar chart of mean CV score with the fold standard deviation as error bars
fig, ax = plt.subplots(figsize=(12, 6))
x_pos = np.arange(len(cv_df))
mean_scores = cv_df['mean_cv_score']
std_scores = cv_df['std_cv_score']

ax.bar(x_pos, mean_scores, yerr=std_scores, capsize=5, alpha=0.7)
ax.set_xlabel('Models')
ax.set_ylabel('Cross-Validation Accuracy')
ax.set_title('5-Fold Cross-Validation Results')
ax.set_xticks(x_pos)
ax.set_xticklabels(cv_df.index, rotation=45)
ax.grid(True, alpha=0.3)

# Print each mean just above the top of its error bar
for position, mean_score, std_score in zip(x_pos, mean_scores, std_scores):
    ax.text(position, mean_score + std_score + 0.01, f'{mean_score:.3f}',
            ha='center', va='bottom', fontsize=9)

fig.tight_layout()
plt.show()

## 8. Get Best Model

In [None]:
# Ask the trainer for its best model: (name, fitted estimator, test-set scores)
best_model_name, best_model, best_scores = trainer.get_best_model()

print("BEST MODEL SELECTION")
print("=" * 30)
print(f"Best Model: {best_model_name}")
print(f"Model Type: {type(best_model).__name__}")

print("\nBest Model Performance:")
for metric_name in best_scores:
    print(f"  {metric_name:12s}: {best_scores[metric_name]:.4f}")

# Cross-validation summary recorded earlier for the same model
best_cv = cv_results[best_model_name]
print(f"\nBest Model CV Score: {best_cv['mean_cv_score']:.4f} "
      f"(±{best_cv['std_cv_score']:.4f})")

## 9. Detailed Classification Report

In [None]:
# Generate detailed classification report for the best model
y_pred_best = best_model.predict(X_test)

# Labels that occur in the ground truth or in the predictions, in sorted order.
# Passing them explicitly keeps report rows, confusion-matrix rows and display
# names aligned with each other.
labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred_best)]))

# Human-readable names apply positionally when exactly three classes are present
# (adjust based on your target encoding). For any other class count fall back to
# generic names instead of failing on a length mismatch.
default_class_names = ['Low Stress', 'Medium Stress', 'High Stress']
if len(labels) == len(default_class_names):
    class_names = default_class_names
else:
    class_names = [f"Class {label}" for label in labels]

# Classification report
report = classification_report(
    y_test, y_pred_best, labels=labels, target_names=class_names
)

print(f"DETAILED CLASSIFICATION REPORT - {best_model_name.upper()}")
print("=" * 60)
print(report)

# Confusion Matrix (rows = true label, columns = predicted label)
cm = confusion_matrix(y_test, y_pred_best, labels=labels)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title(f'Confusion Matrix - {best_model_name.title()}')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.tight_layout()
plt.show()

# Calculate per-class accuracy (diagonal / row total). A class with no true
# samples in the test set is reported as 0.0 rather than dividing by zero.
row_totals = cm.sum(axis=1)
class_accuracy = np.divide(
    cm.diagonal(), row_totals,
    out=np.zeros(len(row_totals), dtype=float),
    where=row_totals > 0,
)
print(f"\nPer-class Accuracy:")
for i, (class_name, acc) in enumerate(zip(class_names, class_accuracy)):
    print(f"  {class_name:15s}: {acc:.4f} ({cm[i,i]}/{row_totals[i]})")

## 10. Hyperparameter Tuning for Best Model

In [None]:
# Perform hyperparameter tuning for the best model
# Looks up a search grid by the best model's name, asks the trainer to tune it,
# re-evaluates on the test set and keeps the tuned model only if accuracy improved.
print(f"Performing hyperparameter tuning for {best_model_name}...")

# Define parameter grids for different models
# Keys must match the model names the trainer uses; a best model without an
# entry here is reported and skipped (see the final else branch).
param_grids = {
    'random_forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'gradient_boosting': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    'decision_tree': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'logistic_regression': {
        'C': [0.1, 1.0, 10.0],
        'solver': ['liblinear', 'lbfgs']
    },
    'svm': {
        'C': [0.1, 1.0, 10.0],
        'kernel': ['linear', 'rbf']
    }
}

if best_model_name in param_grids:
    param_grid = param_grids[best_model_name]
    
    print(f"Tuning hyperparameters: {list(param_grid.keys())}")
    
    # Perform hyperparameter tuning
    # NOTE(review): the returned `tuned_model` is never referenced below; the
    # code relies on the trainer's own state instead. Confirm that
    # hyperparameter_tuning stores the tuned estimator on the trainer —
    # otherwise the train_model call below would re-fit the untuned model.
    tuned_model = trainer.hyperparameter_tuning(
        best_model_name, param_grid, X_train, y_train, cv=3
    )
    
    # Re-train and evaluate the tuned model
    trainer.train_model(best_model_name, X_train, y_train)
    tuned_scores = trainer.evaluate_model(best_model_name, X_test, y_test)
    
    print(f"\nPerformance comparison:")
    print(f"Original {best_model_name} accuracy: {best_scores['accuracy']:.4f}")
    print(f"Tuned {best_model_name} accuracy: {tuned_scores['accuracy']:.4f}")
    print(f"Improvement: {tuned_scores['accuracy'] - best_scores['accuracy']:.4f}")
    
    # Update best model if tuning improved performance
    # NOTE(review): the tuned-vs-original decision is made on test-set accuracy,
    # so the accuracy reported afterwards is optimistically biased; comparing
    # cross-validation scores would keep the test set untouched.
    # NOTE(review): when tuning does not help, `best_model` keeps the earlier
    # object, but the trainer presumably already holds the re-trained model under
    # this name — verify which one the save cell ends up persisting.
    if tuned_scores['accuracy'] > best_scores['accuracy']:
        best_model = trainer.trained_models[best_model_name]
        best_scores = tuned_scores
        print(f"✓ Model performance improved with hyperparameter tuning!")
    else:
        print(f"• Original model performs better or equal.")
else:
    print(f"No parameter grid defined for {best_model_name}")

## 11. Feature Importance Analysis

In [None]:
# Analyze feature importance for the best model (if available).
# Tree-style models expose `feature_importances_`; linear models expose `coef_`.
if hasattr(best_model, 'feature_importances_'):
    # Get feature importances, largest first
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print(f"FEATURE IMPORTANCE - {best_model_name.upper()}")
    print("=" * 40)
    print(feature_importance)

    # Visualize feature importance (most important feature at the top)
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['feature'], feature_importance['importance'])
    plt.xlabel('Feature Importance')
    plt.title(f'Feature Importance - {best_model_name.title()}')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

elif hasattr(best_model, 'coef_'):
    # For linear models, show coefficients.
    # coef_ may be 1-D or 2-D with one row per class / class pair. With a single
    # row the signed coefficients are shown as-is. With several rows, taking only
    # row 0 would silently describe just one class, so the rows are summarised
    # as the mean absolute coefficient per feature instead.
    coef_matrix = np.atleast_2d(best_model.coef_)
    is_aggregated = coef_matrix.shape[0] > 1
    if is_aggregated:
        coef_values = np.abs(coef_matrix).mean(axis=0)
        value_label = 'Mean |Coefficient| Across Classes'
    else:
        coef_values = coef_matrix[0]
        value_label = 'Coefficient Value'

    coefficients = pd.DataFrame({
        'feature': X.columns,
        'coefficient': coef_values
    })
    coefficients['abs_coefficient'] = np.abs(coefficients['coefficient'])
    coefficients = coefficients.sort_values('abs_coefficient', ascending=False)

    print(f"FEATURE COEFFICIENTS - {best_model_name.upper()}")
    print("=" * 40)
    if is_aggregated:
        print(f"(mean absolute coefficient over {coef_matrix.shape[0]} coefficient rows)")
    print(coefficients)

    # Visualize coefficients: red = negative, blue = non-negative
    plt.figure(figsize=(10, 6))
    colors = ['red' if x < 0 else 'blue' for x in coefficients['coefficient']]
    plt.barh(coefficients['feature'], coefficients['coefficient'], color=colors, alpha=0.7)
    plt.xlabel(value_label)
    plt.title(f'Feature Coefficients - {best_model_name.title()}')
    plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

else:
    print(f"Feature importance not available for {best_model_name}")

## 12. Save Trained Models

In [None]:
# Save all trained models and a JSON summary of the training run
import json


def _to_jsonable(value):
    """Fallback encoder for json.dump: convert values the encoder cannot handle.

    NumPy scalars become plain Python numbers/bools and NumPy arrays become
    nested lists, so scores stay numeric in the file. Anything else (e.g.
    Path objects) falls back to its string form.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return str(value)


print("Saving trained models...")

# Make sure both output directories exist before anything is written to them
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)
Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)

# NOTE(review): this persists whatever the trainer currently holds under each
# name; if the tuning cell re-trained the best model, confirm that is the
# version intended to be saved.
model_save_paths = {}
for model_name in trained_models.keys():
    model_path = MODELS_DIR / f"{model_name}_model.joblib"
    trainer.save_model(model_name, str(model_path))
    model_save_paths[model_name] = str(model_path)

print(f"\nSaved {len(model_save_paths)} models to {MODELS_DIR}")

# Save model comparison results
model_results = {
    'best_model': {
        'name': best_model_name,
        'scores': best_scores,
        'cv_score': cv_results[best_model_name]['mean_cv_score'],
        'cv_std': cv_results[best_model_name]['std_cv_score']
    },
    'all_model_scores': model_scores,
    'cv_results': dict(cv_results),
    'model_ranking': scores_df.sort_values('accuracy', ascending=False).index.tolist(),
    'dataset_info': {
        'n_features': X.shape[1],
        'n_samples': len(df),
        'n_train': len(X_train),
        'n_test': len(X_test),
        'feature_names': list(X.columns)
    },
    'model_save_paths': model_save_paths
}

results_path = RESULTS_DIR / "model_training_results.json"
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(model_results, f, indent=2, default=_to_jsonable)

print(f"Model training results saved to: {results_path}")

## 13. Model Development Summary

In [None]:
# Final recap: dataset size, split, trained models, best model and ranking
print("MODEL DEVELOPMENT SUMMARY")
print("=" * 50)
print(f"Dataset: {len(df)} samples, {X.shape[1]} features")

train_pct = (1-test_size)*100
test_pct = test_size*100
print(f"Train/Test Split: {len(X_train)}/{len(X_test)} ({train_pct:.0f}%/{test_pct:.0f}%)")

print(f"\nModels Trained: {len(trained_models)}")
for trained_name in trained_models:
    print(f"  ✓ {trained_name}")

print(f"\nBest Model: {best_model_name}")
print(f"Best Accuracy: {best_scores['accuracy']:.4f}")
print(f"Best F1-Score: {best_scores['f1_score']:.4f}")
best_cv = cv_results[best_model_name]
print(f"Cross-Validation: {best_cv['mean_cv_score']:.4f} (±{best_cv['std_cv_score']:.4f})")

# Models ordered by test accuracy, highest first
print("\nModel Rankings (by accuracy):")
ranked_accuracy = scores_df.sort_values('accuracy', ascending=False)['accuracy']
for rank, (ranked_name, accuracy) in enumerate(ranked_accuracy.items(), 1):
    print(f"  {rank}. {ranked_name:20s}: {accuracy:.4f}")

print(f"\nAll models saved to: {MODELS_DIR}")
print(f"Results saved to: {results_path}")

next_steps = (
    "1. Model evaluation and visualization",
    "2. Generate prediction reports",
    "3. Deploy the best model for real-time predictions",
)
print("\nNext steps:")
for step in next_steps:
    print(step)