# Student Mental Health Status Classification
## Multiclass Classification Project

**Project Goal:** Develop and deploy a machine learning model to predict student mental health stress levels (Low/Medium/High) based on behavioral and academic factors.

**Dataset:** A synthetic dataset, generated in this notebook, that simulates student mental health indicators

**Models to Compare:** Naive Bayes, Logistic Regression, Decision Tree, Random Forest, XGBoost, LightGBM, CatBoost

## 1. IMPORTS AND SETUP

In [None]:
# Data manipulation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# Optional gradient-boosting backends: the notebook still runs without them,
# so a failed import is reported instead of aborting the cell.
# OSError is caught too because a package whose native library cannot be
# loaded may surface that way rather than as ImportError. A bare `except:`
# would additionally swallow KeyboardInterrupt/SystemExit.
try:
    import lightgbm as lgb
except (ImportError, OSError):
    print("LightGBM not installed, will skip")
try:
    import catboost as cb
except (ImportError, OSError):
    print("CatBoost not installed, will skip")

# Metrics and evaluation
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score,
    roc_curve, auc, ConfusionMatrixDisplay
)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Model saving
import joblib
import pickle

# Silence library warnings so cell output stays readable
import warnings
warnings.simplefilter('ignore')

# Plot defaults: seaborn grid theme and a wide default figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ All imports successful")

## 2. DATASET CREATION AND LOADING

In [None]:
# Set random seed for reproducibility
np.random.seed(42)

# Create synthetic dataset
n_samples = 1000

data = {
    'sleep_hours': np.random.uniform(4, 10, n_samples),
    'study_hours_per_day': np.random.uniform(1, 8, n_samples),
    'social_interaction_score': np.random.randint(1, 10, n_samples),
    'exercise_hours_per_week': np.random.uniform(0, 10, n_samples),
    'academic_performance': np.random.uniform(50, 100, n_samples),
    'exam_anxiety_level': np.random.randint(1, 10, n_samples),
    'family_income_level': np.random.choice(['low', 'medium', 'high'], n_samples),
    'caffeine_intake': np.random.uniform(0, 5, n_samples),
    'assignment_overload': np.random.randint(1, 10, n_samples),
    'extracurricular_activities': np.random.randint(0, 6, n_samples),
}

df = pd.DataFrame(data)

# Create target variable with some logical relationships.
# The score is computed on whole columns instead of looping over rows with
# iterrows(); terms are added in a fixed left-to-right order and the noise
# is drawn once, after all features, with one value per row.
noise = np.random.normal(0, 2, n_samples)
stress_score = (
    (10 - df['sleep_hours']) * 1.5               # Less sleep = more stress
    + df['exam_anxiety_level'] * 1.2
    + df['assignment_overload'] * 1.3
    + (100 - df['academic_performance']) * 0.1   # Lower grades = more stress
    + (10 - df['social_interaction_score']) * 0.8
    + (10 - df['exercise_hours_per_week']) * 0.5
    - df['extracurricular_activities'] * 0.5     # Activities reduce stress
    + noise
)

# Bucket the score: below 15 is Low, below 30 is Medium, anything else High.
# np.select takes the first matching condition, mirroring an if/elif chain.
score_values = stress_score.to_numpy()
stress_level = np.select(
    [score_values < 15, score_values < 30],
    ['Low', 'Medium'],
    default='High',
)

df['stress_level'] = stress_level

print("Dataset created successfully!")
print(f"Shape: {df.shape}")
print(f"\nFirst few rows:")
df.head()

## 3. EXPLORATORY DATA ANALYSIS (EDA)

In [None]:
# Structural overview of the dataset: size, nulls, duplicates and dtypes
print("\n=== DATASET STATISTICS ===")
missing_per_column = df.isnull().sum()
duplicate_row_count = df.duplicated().sum()
print(f"Dataset shape: {df.shape}")
print(f"\nMissing values:\n{missing_per_column}")
print(f"\nDuplicate rows: {duplicate_row_count}")
print(f"\nData types:\n{df.dtypes}")

In [None]:
# Target variable distribution
print("\n=== TARGET VARIABLE DISTRIBUTION ===")
print(df['stress_level'].value_counts())
print(f"\nPercentage distribution:")
print(df['stress_level'].value_counts(normalize=True) * 100)

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Pin the class order so green/orange/red always mean Low/Medium/High.
# value_counts() on its own orders by frequency, which would pair the
# colours with whichever class happens to be most common.
class_order = ['Low', 'Medium', 'High']
class_colors = ['#2ecc71', '#f39c12', '#e74c3c']
stress_counts = df['stress_level'].value_counts().reindex(class_order, fill_value=0)

# Count plot
axes[0].bar(stress_counts.index, stress_counts.values, color=class_colors)
axes[0].set_title('Stress Level Distribution', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Count')
axes[0].set_xlabel('Stress Level')
for i, v in enumerate(stress_counts.values):
    axes[0].text(i, v + 5, str(v), ha='center')

# Pie chart
axes[1].pie(stress_counts.values, labels=stress_counts.index, autopct='%1.1f%%',
             colors=class_colors)
axes[1].set_title('Stress Level Proportion', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

# Report balance from the data instead of asserting it. The 1.5 ratio between
# the largest and smallest class is a rule-of-thumb cut-off, not a standard.
smallest_class, largest_class = stress_counts.min(), stress_counts.max()
if smallest_class > 0 and largest_class / smallest_class <= 1.5:
    print("✓ Class distribution is balanced - good for multiclass classification")
else:
    print("⚠ Class distribution is imbalanced - prefer macro/weighted metrics over accuracy")

In [None]:
# Summary statistics for every numeric column; numerical_cols is reused by
# the correlation cell that follows
print("\n=== NUMERICAL FEATURES STATISTICS ===")
numerical_cols = df.select_dtypes(include=[np.number]).columns
numeric_summary = df[numerical_cols].describe()
print(numeric_summary)

In [None]:
# Correlation with target
print("\n=== FEATURE ANALYSIS ===")

# Encode the target by severity (Low < Medium < High). A LabelEncoder would
# number the labels alphabetically (High, Low, Medium), which is not an
# ordering, so a linear correlation against it would be meaningless.
stress_rank = {'Low': 0, 'Medium': 1, 'High': 2}
stress_level_encoded = df['stress_level'].map(stress_rank)

# Correlate each numeric feature with the encoded target directly, without
# adding (and later dropping) a temporary column on df.
numerical_cols_list = list(numerical_cols)
correlation = (
    df[numerical_cols_list]
    .corrwith(stress_level_encoded)
    .sort_values(ascending=False)
)
print("\nCorrelation with target:")
print(correlation)

# Visualize correlations for every feature; nothing is sliced off the end
plt.figure(figsize=(10, 6))
correlation.plot(kind='barh', color='steelblue')
plt.title('Feature Correlation with Stress Level', fontsize=12, fontweight='bold')
plt.xlabel('Correlation Coefficient')
plt.tight_layout()
plt.show()

In [None]:
# Histograms of six key features, laid out on a 2x3 grid
features_to_plot = ['sleep_hours', 'study_hours_per_day', 'exam_anxiety_level', 
                     'academic_performance', 'assignment_overload', 'exercise_hours_per_week']

fig, axes = plt.subplots(2, 3, figsize=(14, 8))
axes = axes.ravel()

# Pair each flattened axis with the feature it displays
for panel, feature in zip(axes, features_to_plot):
    panel.hist(df[feature], bins=30, color='steelblue', edgecolor='black', alpha=0.7)
    panel.set_title(f'Distribution of {feature}', fontsize=10, fontweight='bold')
    panel.set_xlabel(feature)
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 4. DATA PREPROCESSING AND FEATURE ENGINEERING

In [None]:
print("=== DATA PREPROCESSING ===")

# 1. Handle missing values. Nothing is dropped or imputed in this step, so
#    the total null count is simply reported before and after it.
print(f"\nStep 1: Missing Values Handling")
for stage in ("before", "after"):
    print(f"Missing values {stage}: {df.isnull().sum().sum()}")
print("✓ No missing values")

In [None]:
# 2. Encode categorical features
print("\nStep 2: Categorical Feature Encoding")
categorical_columns = df.select_dtypes(include='object').columns.tolist()
print(f"Categorical columns: {categorical_columns}")

# One-hot encode family_income_level; drop_first=True omits the dummy column
# of the first category, which then acts as the baseline
df_encoded = pd.get_dummies(df, columns=['family_income_level'], drop_first=True)
print(f"✓ One-hot encoding applied for family_income_level")
encoded_shape = df_encoded.shape
print(f"Shape after encoding: {encoded_shape}")

In [None]:
# 3. Prepare X and y: the target column is y, every other column is a feature
print("\nStep 3: Feature and Target Separation")

y = df_encoded['stress_level']
X = df_encoded.drop(columns=['stress_level'])

feature_name_list = X.columns.tolist()
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature names: {feature_name_list}")

In [None]:
# 4. Encode target variable as integer codes; `le` is kept so the codes can
#    be translated back to class names later
print("\nStep 4: Target Variable Encoding")
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"Classes: {le.classes_}")
print(f"Encoded mapping:")
# A class's integer code is its position in le.classes_
for code, label in enumerate(le.classes_):
    print(f"  {label}: {code}")

In [None]:
# 5. Train-test split (80/20, stratified on the encoded target)
print("\nStep 5: Train-Test Split")
split_parts = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
X_train, X_test, y_train, y_test = split_parts

n_total = len(X)
for subset_label, subset in (("Training", X_train), ("Test", X_test)):
    n_rows = subset.shape[0]
    print(f"{subset_label} set size: {n_rows} ({n_rows/n_total*100:.1f}%)")

# Per-class sample counts in the training portion
print(f"\nTraining set class distribution:")
class_codes, class_counts = np.unique(y_train, return_counts=True)
for code, n_in_class in zip(class_codes, class_counts):
    print(f"  Class {le.classes_[code]}: {n_in_class} samples")

print(f"✓ Stratified split ensures class balance in both sets")

In [None]:
# 6. Feature normalization/standardization
print("\nStep 6: Feature Normalization")

# Fit on the training rows only, then apply the same transform to the test rows
scaler = StandardScaler()
train_matrix = scaler.fit_transform(X_train)
test_matrix = scaler.transform(X_test)

print(f"✓ StandardScaler applied")
train_means = train_matrix.mean(axis=0)[:5].round(4)
train_stds = train_matrix.std(axis=0)[:5].round(4)
print(f"Mean of scaled training features: {train_means}")
print(f"Std of scaled training features: {train_stds}")

# Wrap the arrays in DataFrames so the feature names travel with the data
X_train_scaled = pd.DataFrame(train_matrix, columns=X.columns)
X_test_scaled = pd.DataFrame(test_matrix, columns=X.columns)

## 5. MODEL TRAINING AND EVALUATION

In [None]:
print("=== MODEL TRAINING ===")

# Registries shared by every training cell: fitted estimators and their
# test-set predictions, both keyed by the model's display name
models, results = {}, {}

# 1. Gaussian Naive Bayes
print("\n1. Training Gaussian Naive Bayes...")
gnb = GaussianNB().fit(X_train_scaled, y_train)
y_pred_gnb = gnb.predict(X_test_scaled)
models['Gaussian Naive Bayes'], results['Gaussian Naive Bayes'] = gnb, y_pred_gnb
print("✓ Completed")

In [None]:
# 2. Logistic Regression
# With the default lbfgs solver scikit-learn already fits a multinomial
# (softmax) model for a multiclass target, so the `multi_class` argument,
# which newer scikit-learn releases deprecate, is not passed.
print("2. Training Multinomial Logistic Regression...")
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
models['Logistic Regression'] = lr
results['Logistic Regression'] = y_pred_lr
print("✓ Completed")

In [None]:
# 3. Decision Tree (depth capped at 10, fixed seed)
print("3. Training Decision Tree...")
dt = DecisionTreeClassifier(random_state=42, max_depth=10).fit(X_train_scaled, y_train)
y_pred_dt = dt.predict(X_test_scaled)
models['Decision Tree'], results['Decision Tree'] = dt, y_pred_dt
print("✓ Completed")

In [None]:
# 4. Random Forest (100 trees, depth capped at 10, all CPU cores)
print("4. Training Random Forest...")
rf_settings = dict(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf = RandomForestClassifier(**rf_settings).fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)
models['Random Forest'], results['Random Forest'] = rf, y_pred_rf
print("✓ Completed")

In [None]:
# 5. XGBoost
print("5. Training XGBoost...")
xgb_settings = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    eval_metric='mlogloss',
    verbosity=0,
)
xgb = XGBClassifier(**xgb_settings)
xgb.fit(X_train_scaled, y_train)
y_pred_xgb = xgb.predict(X_test_scaled)
models['XGBoost'], results['XGBoost'] = xgb, y_pred_xgb
print("✓ Completed")

In [None]:
# 6. LightGBM (optional dependency)
# Skip only when the import cell could not bind `lgb`. A bare `except:`
# around the training code would also report genuine fitting errors as
# "not available", so those are now allowed to surface.
if 'lgb' in globals():
    print("6. Training LightGBM...")
    lgbm = lgb.LGBMClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, 
                               random_state=42, verbose=-1)
    lgbm.fit(X_train_scaled, y_train)
    y_pred_lgbm = lgbm.predict(X_test_scaled)
    models['LightGBM'] = lgbm
    results['LightGBM'] = y_pred_lgbm
    print("✓ Completed")
else:
    print("⚠ LightGBM not available")

In [None]:
# 7. CatBoost (optional dependency)
# Skip only when the import cell could not bind `cb`. A bare `except:`
# around the training code would also report genuine fitting errors as
# "not available", so those are now allowed to surface.
if 'cb' in globals():
    print("7. Training CatBoost...")
    catb = cb.CatBoostClassifier(iterations=100, max_depth=6, learning_rate=0.1,
                                   random_state=42, verbose=False)
    catb.fit(X_train_scaled, y_train)
    # CatBoost may return multiclass predictions as an (n, 1) column rather
    # than a flat vector; flatten so they match the other models' output.
    y_pred_catb = np.ravel(catb.predict(X_test_scaled))
    models['CatBoost'] = catb
    results['CatBoost'] = y_pred_catb
    print("✓ Completed")
else:
    print("⚠ CatBoost not available")

print(f"\n✓ All {len(models)} models trained successfully!")

## 6. MODEL EVALUATION

In [None]:
print("=== MODEL EVALUATION ===")

# Shared keyword sets for the two averaging modes used below
macro_avg = dict(average='macro', zero_division=0)
weighted_avg = dict(average='weighted', zero_division=0)

# Score every model's test-set predictions against y_test
evaluation_results = {}
for model_name, y_pred in results.items():
    evaluation_results[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision (Macro)': precision_score(y_test, y_pred, **macro_avg),
        'Recall (Macro)': recall_score(y_test, y_pred, **macro_avg),
        'F1 (Macro)': f1_score(y_test, y_pred, **macro_avg),
        'F1 (Weighted)': f1_score(y_test, y_pred, **weighted_avg),
    }

# One row per model, one column per metric
results_df = pd.DataFrame(evaluation_results).transpose()
print("\n" + results_df.to_string())

# The best model is the one with the highest weighted F1
weighted_f1 = results_df['F1 (Weighted)']
best_model_name = weighted_f1.idxmax()
print(f"\n🏆 Best Model: {best_model_name}")
print(f"   F1-Score (Weighted): {weighted_f1.loc[best_model_name]:.4f}")

In [None]:
# Visualize model comparison: one horizontal bar chart per metric
comparison_panels = [
    # (results_df column, bar colour, panel title, x-axis label)
    ('Accuracy', 'steelblue', 'Model Accuracy Comparison', 'Accuracy'),
    ('F1 (Weighted)', 'green', 'Model F1-Score (Weighted) Comparison', 'F1-Score'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for panel, (metric, bar_color, panel_title, x_label) in zip(axes, comparison_panels):
    ranked = results_df[metric].sort_values(ascending=True)
    ranked.plot(kind='barh', ax=panel, color=bar_color)
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.set_xlabel(x_label)
    panel.set_xlim([0, 1])
    # Write each score just to the right of its bar
    for position, score in enumerate(ranked.values):
        panel.text(score + 0.02, position, f'{score:.4f}', va='center')

plt.tight_layout()
plt.show()

## 7. DETAILED ANALYSIS OF BEST MODEL

In [None]:
print(f"\n=== DETAILED ANALYSIS: {best_model_name} ===")

# Fetch the winning estimator and its stored test-set predictions
best_model = models[best_model_name]
y_pred_best = results[best_model_name]

# Per-class precision/recall/F1, labelled with the original class names
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred_best, target_names=le.classes_)
print(report_text)

In [None]:
# Confusion Matrix for the best model's test-set predictions
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred_best)
print(cm)

# Draw the same matrix as a heatmap with class names on the axes
fig, ax = plt.subplots(figsize=(8, 6))
matrix_title = f'Confusion Matrix - {best_model_name}'
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
disp.plot(ax=ax, cmap='Blues', values_format='d')
plt.title(matrix_title, fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Feature importance (if available)
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)

    # Show at most ten features. The bar positions are sized from the rows
    # actually available, so a dataset with fewer than ten features no
    # longer triggers a length mismatch in barh/yticks.
    top_features = feature_importance.head(10)
    n_top = len(top_features)

    print(f"\nTop {n_top} Most Important Features:")
    print(top_features.to_string(index=False))

    # Visualize
    plt.figure(figsize=(10, 6))
    plt.barh(range(n_top), top_features['Importance'].values, color='steelblue')
    plt.yticks(range(n_top), top_features['Feature'].values)
    plt.xlabel('Importance Score')
    plt.title(f'Top {n_top} Feature Importances - {best_model_name}', fontsize=12, fontweight='bold')
    plt.tight_layout()
    plt.show()
else:
    # Say so explicitly instead of producing no output at all
    print(f"\n{best_model_name} does not expose feature_importances_; skipping importance plot")

## 8. MODEL PERSISTENCE (SAVING)

In [None]:
print("=== SAVING MODELS ===")

import os

# Create directory for models
model_dir = 'ml_models'
os.makedirs(model_dir, exist_ok=True)

# Destination of each artifact; the best model's file name is derived from
# its display name with spaces replaced by underscores
safe_model_name = best_model_name.replace(" ", "_")
model_path = os.path.join(model_dir, f'{safe_model_name}_model.pkl')
scaler_path = os.path.join(model_dir, 'scaler.pkl')
encoder_path = os.path.join(model_dir, 'label_encoder.pkl')
features_path = os.path.join(model_dir, 'feature_names.pkl')
all_models_path = os.path.join(model_dir, 'all_models.pkl')

# (label shown in the log, object to persist, destination path)
artifacts_to_save = [
    ("Best model", best_model, model_path),
    ("Scaler", scaler, scaler_path),
    ("Label encoder", le, encoder_path),
    ("Feature names", X.columns.tolist(), features_path),
    ("All models", models, all_models_path),
]

for artifact_label, artifact, destination in artifacts_to_save:
    joblib.dump(artifact, destination)
    print(f"✓ {artifact_label} saved: {destination}")

print(f"\n✓ All artifacts saved to '{model_dir}' directory")

## 9. MODEL INFERENCE AND TESTING

In [None]:
print("=== LOADING AND TESTING SAVED MODEL ===")

# Load model and the preprocessing artifacts saved alongside it.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files
# this notebook wrote itself.
loaded_model = joblib.load(model_path)
loaded_scaler = joblib.load(scaler_path)
loaded_encoder = joblib.load(encoder_path)
loaded_features = joblib.load(features_path)

print("✓ All artifacts loaded successfully")

# Test on some samples
print("\nTesting on sample predictions:")
# X_test_scaled is already scaled, so loaded_scaler is not re-applied here.
# Selecting by the saved feature names checks that the persisted column
# list still matches the data the model was trained on.
test_samples = X_test_scaled[loaded_features].head(5)

# Some estimators return class predictions as an (n, 1) column; flatten to
# plain integer codes so each one indexes a single class name.
predictions = np.ravel(loaded_model.predict(test_samples)).astype(int)
predictions_proba = loaded_model.predict_proba(test_samples)

for sample_no, (predicted_code, class_probs) in enumerate(
        zip(predictions, predictions_proba), start=1):
    print(f"\nSample {sample_no}:")
    print(f"  Predicted class: {loaded_encoder.classes_[predicted_code]}")
    print(f"  Confidence: {class_probs.max():.4f}")
    print(f"  Class probabilities:")
    for cls, prob in zip(loaded_encoder.classes_, class_probs):
        print(f"    {cls}: {prob:.4f}")

## 10. SUMMARY AND CONCLUSIONS

In [None]:
banner = "=" * 60
print("\n" + banner)
print("PROJECT SUMMARY - STUDENT MENTAL HEALTH CLASSIFICATION")
print(banner)

# Dataset facts
class_names = ', '.join(le.classes_)
print("\n📊 DATASET INFORMATION:")
print(f"  • Total samples: {len(df)}")
print(f"  • Features: {X.shape[1]}")
print(f"  • Classes: {len(le.classes_)} - {class_names}")
print(f"  • Train-test split: 80-20")

# Preprocessing steps, in the order they are listed in the report
print(f"\n🔧 PREPROCESSING STEPS:")
for step in (
    "Missing value handling",
    "Categorical feature encoding (One-Hot)",
    "Feature normalization (StandardScaler)",
    "Stratified train-test split",
):
    print(f"  ✓ {step}")

# Every model that made it into the registry
print(f"\n🤖 MODELS TRAINED: {len(models)}")
for name in models:
    print(f"  • {name}")

# Headline metrics of the winning model: (label shown, evaluation_results key)
best_metrics = evaluation_results[best_model_name]
print(f"\n🏆 BEST MODEL: {best_model_name}")
for metric_label, metric_key in (
    ("Accuracy", "Accuracy"),
    ("Precision (Macro)", "Precision (Macro)"),
    ("Recall (Macro)", "Recall (Macro)"),
    ("F1-Score (Weighted)", "F1 (Weighted)"),
):
    print(f"  • {metric_label}: {best_metrics[metric_key]:.4f}")

# Files written by the persistence cell
print(f"\n💾 SAVED ARTIFACTS:")
for artifact_label, artifact_path in (
    ("Trained model", model_path),
    ("Scaler", scaler_path),
    ("Label encoder", encoder_path),
    ("Feature names", features_path),
    ("All models", all_models_path),
):
    print(f"  ✓ {artifact_label}: {artifact_path}")

print(f"\n✅ PROJECT COMPLETED SUCCESSFULLY!")
print(banner)

## 11. NEXT STEPS: WEB DEPLOYMENT (FastAPI)

The following code creates a FastAPI application for model deployment:

In [None]:
# Save FastAPI deployment code to separate file
from pathlib import Path

# The template below is written to app.py. __MODEL_PATH__ is a placeholder
# that is replaced after the literal with the path of the model that was
# actually saved as "best", so the API does not assume XGBoost won.
fastapi_code = '''from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import joblib
import numpy as np
import pandas as pd
from typing import Dict, Literal

app = FastAPI(title="Student Mental Health Classifier", version="1.0.0")

# Load model and preprocessing objects
model = joblib.load('__MODEL_PATH__')
scaler = joblib.load('ml_models/scaler.pkl')
encoder = joblib.load('ml_models/label_encoder.pkl')
feature_names = joblib.load('ml_models/feature_names.pkl')

# Prefix of the one-hot columns created for family_income_level in training
INCOME_PREFIX = 'family_income_level_'

class PredictionInput(BaseModel):
    sleep_hours: float
    study_hours_per_day: float
    social_interaction_score: int
    exercise_hours_per_week: float
    academic_performance: float
    exam_anxiety_level: int
    family_income_level: Literal['low', 'medium', 'high']
    caffeine_intake: float
    assignment_overload: int
    extracurricular_activities: int

class PredictionOutput(BaseModel):
    stress_level: str
    confidence: float
    probabilities: Dict[str, float]

@app.get("/")
def read_root():
    return {
        "message": "Student Mental Health Classifier API",
        "version": "1.0.0",
        "endpoints": {
            "predict": "/predict",
            "health": "/health"
        }
    }

@app.get("/health")
def health_check():
    return {"status": "healthy"}

@app.post("/predict", response_model=PredictionOutput)
def predict(input_data: PredictionInput):
    try:
        # pydantic v2 renamed .dict() to .model_dump(); support both
        if hasattr(input_data, 'model_dump'):
            data_dict = input_data.model_dump()
        else:
            data_dict = input_data.dict()

        # Rebuild the one-hot columns from the saved feature names so the
        # encoding matches whichever baseline category training dropped
        income_level = data_dict.pop('family_income_level')
        for column in feature_names:
            if column.startswith(INCOME_PREFIX):
                data_dict[column] = int(column == INCOME_PREFIX + income_level)

        # Single-row frame with columns in the training order
        df_input = pd.DataFrame([data_dict])[feature_names]

        # Scale features, keeping the column names the model was fitted with
        scaled_input = pd.DataFrame(scaler.transform(df_input), columns=feature_names)

        # Some estimators return predictions as an (n, 1) column; flatten
        predicted_index = int(np.ravel(model.predict(scaled_input))[0])
        probabilities = np.ravel(model.predict_proba(scaled_input)[0])

        # Convert NumPy scalars to plain Python types for the response model
        prob_dict = {
            str(class_name): float(probability)
            for class_name, probability in zip(encoder.classes_, probabilities)
        }

        return PredictionOutput(
            stress_level=str(encoder.classes_[predicted_index]),
            confidence=float(probabilities.max()),
            probabilities=prob_dict
        )

    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
'''

# Point the generated app at the saved best model; as_posix() keeps forward
# slashes so the path embedded in app.py is valid on every platform.
fastapi_code = fastapi_code.replace('__MODEL_PATH__', Path(model_path).as_posix())

with open('app.py', 'w', encoding='utf-8') as f:
    f.write(fastapi_code)

print("✓ FastAPI application code saved to 'app.py'")
print("\nTo run the API:")
print("  1. Install FastAPI and Uvicorn: pip install fastapi uvicorn")
print("  2. Run: python app.py")
print("  3. Access API at: http://localhost:8000")
print("  4. View API docs at: http://localhost:8000/docs")

In [None]:
# Create requirements.txt for deployment: one pinned "name==version" per line
pinned_packages = [
    'pandas==1.5.3',
    'numpy==1.24.3',
    'scikit-learn==1.3.0',
    'xgboost==2.0.0',
    'lightgbm==4.0.0',
    'catboost==1.2.0',
    'joblib==1.3.0',
    'matplotlib==3.7.2',
    'seaborn==0.12.2',
    'fastapi==0.104.1',
    'uvicorn==0.24.0',
    'pydantic==2.4.2',
]
# The file content ends with a newline after the last pin
requirements = '\n'.join(pinned_packages) + '\n'

with open('requirements.txt', 'w') as req_file:
    req_file.write(requirements)

print("✓ requirements.txt created")