In [34]:
# importing the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# helper method that changes '7-8 hours' to 7.5
def parse_time_range(time_str):
    """Convert a duration label such as '7-8 hours' into a single number.

    Numeric inputs are returned unchanged. For strings, surrounding
    whitespace and quote characters are removed and only the first
    whitespace-separated token is examined: a 'start-end' token yields the
    midpoint of the two bounds, a plain numeric token yields that number.

    Args:
        time_str: A number, or a string such as '7-8 hours' or '6 hours'.

    Returns:
        The input itself when it is already numeric, the parsed float for a
        recognised string, or ``np.nan`` when the value is missing, is
        neither a number nor a string, or does not fit the pattern (for
        example 'Less than 5 hours').
    """
    # Already numeric (Python or NumPy scalar): nothing to parse.
    if isinstance(time_str, (int, float, np.integer, np.floating)):
        return time_str

    # None and other non-string objects have no .split(); treat as missing
    # instead of letting an AttributeError escape.
    if not isinstance(time_str, str):
        return np.nan

    # Tolerate labels padded with whitespace or wrapped in quotes.
    tokens = time_str.strip().strip("'\"").split()
    if not tokens:
        return np.nan
    time_part = tokens[0]

    try:
        if '-' in time_part:
            # Unpacking fails with ValueError unless there are exactly
            # two bounds, which is handled below.
            start, end = map(float, time_part.split('-'))
            return (start + end) / 2
        return float(time_part)
    except ValueError:
        # Handle cases that don't fit the pattern
        return np.nan

# helper that maps a yes/no answer to 1/0 and keeps missing answers missing
def _encode_yes_no(answer):
    """Return 1 for 'yes' (any case, surrounding spaces ignored), 0 for any
    other string, and ``np.nan`` for non-string values such as a missing cell."""
    if not isinstance(answer, str):
        return np.nan
    return 1 if answer.strip().lower() == 'yes' else 0


# loads and preprocesses the data
def load_and_preprocess_data(csv_path='Student-Depression-Dataset.csv'):
    """Load the student depression CSV and return a model-ready DataFrame.

    The function drops the columns that are not used as features, renames
    the remaining ones to snake_case, encodes the suicidal-thoughts answer
    as 0/1, converts the sleep-duration and work/study-hours columns to
    numbers with ``parse_time_range``, one-hot encodes dietary habits, and
    finally drops every row that contains a missing value.

    Args:
        csv_path: Path of the CSV file to read. Defaults to
            'Student-Depression-Dataset.csv' in the working directory.

    Returns:
        The preprocessed DataFrame, or None when ``csv_path`` does not
        exist (an error message is printed in that case).
    """
    print("Loading the dataset...")

    # Only the read can raise FileNotFoundError, so keep the try minimal.
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        # Report the path that was actually requested.
        print(f"Error: The file '{csv_path}' was not found.")
        print("Please download it from Kaggle and place it in this directory.")
        return None

    # Drop irrelevant columns
    df = df.drop(columns=[
        'id', 'Gender', 'Age', 'City', 'Profession', 'CGPA',
        'Study Satisfaction', 'Job Satisfaction', 'Degree',
        'Family History of Mental Illness',
    ])

    # Rename the remaining independent variables for clarity
    # ('Depression' already has its final name and is left untouched).
    df = df.rename(columns={
        'Academic Pressure': 'academic_pressure',
        'Work Pressure': 'work_pressure',
        'Sleep Duration': 'sleep_duration',
        'Dietary Habits': 'dietary_habits',
        'Work/Study Hours': 'work_study_hours',
        'Financial Stress': 'financial_stress',
        'Have you ever had suicidal thoughts ?': 'suicidal_thoughts',
    })

    # Convert the 'suicidal_thoughts' variable to numerical format (0 and 1);
    # missing answers stay NaN so the dropna() below removes those rows.
    df['suicidal_thoughts'] = df['suicidal_thoughts'].apply(_encode_yes_no)

    # Clean the time-based columns; values parse_time_range cannot
    # interpret come back as NaN.
    df['sleep_duration'] = df['sleep_duration'].apply(parse_time_range)
    df['work_study_hours'] = df['work_study_hours'].apply(parse_time_range)

    # Convert the dietary habits column into numerical features using
    # one-hot encoding (one 'diet_<value>' column per distinct value).
    df = pd.get_dummies(df, columns=['dietary_habits'], prefix='diet', dtype=int)

    # Remove rows with missing values, including those introduced by the
    # conversions above.
    df = df.dropna()

    print("Dataset loaded and preprocessed!")
    return df
    
# helper that scores one fitted model on the held-out data
def evaluate_model(model, X_test, y_test, model_name):
    """Print a header and the test-set accuracy for a fitted model.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature rows to predict on.
        y_test: True labels matching ``X_test``.
        model_name: Label shown in the printed header.

    Returns:
        None. The result is only printed.
    """
    header = f"\n--- Evaluating {model_name} ---"
    print(header)

    # Score the model's predictions against the true labels.
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    print(f"Accuracy: {accuracy:.4f}")
    
def main():
    """Train a Random Forest and an SVM on the dataset and print their accuracy.

    Loads the preprocessed data, selects the feature columns and the
    'Depression' target, makes a stratified 80/20 train/test split, then
    fits and evaluates each model in turn. Returns early (after printing a
    message) when the data cannot be loaded or an expected column is absent.
    """
    df = load_and_preprocess_data()
    if df is None:
        return

    # Feature columns, including the one-hot encoded dietary habits.
    feature_columns = [
        'academic_pressure',
        'work_pressure',
        'sleep_duration',
        'work_study_hours',
        'financial_stress',
        'suicidal_thoughts',
        'diet_Healthy',
        'diet_Moderate',
        'diet_Unhealthy',
    ]

    # Selecting the columns doubles as a check that they all exist.
    try:
        X = df[feature_columns]
        y = df['Depression']
    except KeyError as e:
        print(f"Error: Missing column in the dataset: {e}. Please check if the one-hot encoding was successful.")
        print("Available columns:", df.columns.tolist())
        return

    # Stratify on the target so both splits keep the same class balance.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    # The two models to compare, in evaluation order.
    candidates = [
        ('Random Forest', RandomForestClassifier(n_estimators=80, random_state=42)),
        ('SVM', SVC(kernel='rbf', C=0.8, random_state=42)),
    ]

    for label, estimator in candidates:
        estimator.fit(X_train, y_train)
        evaluate_model(estimator, X_test, y_test, label)
        
# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Loading the dataset...
Dataset loaded and preprocessed!

--- Evaluating Random Forest ---
Accuracy: 0.7868

--- Evaluating SVM ---
Accuracy: 0.8337
