In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import re

# Load the dataset
df = pd.read_csv('Student Mental health.csv')
# Draw a reproducible random sample, capped at 50,000 rows (or every row when
# the file is smaller than that).
subsetted_data = df.sample(n=min(50000, len(df)), random_state=42)

# Clean column names to avoid inconsistencies: trim surrounding whitespace,
# lowercase, turn spaces into underscores and remove question marks.
# regex=False keeps both replacements literal regardless of the default used
# by the installed pandas version.
subsetted_data.columns = (
    subsetted_data.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('?', '', regex=False)
)

# Drop the timestamp column, as it cannot be used directly in the model
subsetted_data = subsetted_data.drop(columns=['timestamp'])

# Handle missing values. A forward fill cannot fill a gap in the first row
# (there is nothing above it to copy), so follow it with a backward fill.
subsetted_data = subsetted_data.ffill().bfill()

# Convert CGPA ranges to numeric (average of the range)
def convert_cgpa_range(cgpa_str):
    """Convert a CGPA entry to a single number.

    Args:
        cgpa_str: Either a range such as ``'3.00 - 3.49'`` (integer bounds
            like ``'3 - 4'`` are accepted too), a single number given as a
            string or as a numeric value, or a missing value (``None``/NaN).

    Returns:
        The midpoint of the range, or the number itself, as a ``float``.
        ``None`` when the entry is missing or cannot be interpreted.
    """
    if isinstance(cgpa_str, str):
        # Match ranges like '3.00 - 3.49'; text after the upper bound
        # (for example trailing spaces) is ignored because re.match only
        # anchors at the start.
        match = re.match(r'\s*(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)', cgpa_str)
        if match:
            low, high = match.groups()
            return (float(low) + float(high)) / 2

    # Handle single numbers. Non-string inputs reach this point as well, so
    # TypeError (e.g. None) must be caught alongside ValueError.
    try:
        value = float(cgpa_str)
    except (TypeError, ValueError):
        return None

    # NaN is the only float that is not equal to itself; report it as missing.
    if value != value:
        return None
    return value

# Apply the conversion to the CGPA column
subsetted_data['what_is_your_cgpa'] = subsetted_data['what_is_your_cgpa'].apply(convert_cgpa_range)

# Handle any remaining missing values created from invalid CGPA strings.
# Forward fill alone leaves a gap when the very first row is missing, so a
# backward fill follows it.
subsetted_data = subsetted_data.ffill().bfill()

# Convert the "do you" questions to binary values
do_you_columns = ['do_you_have_depression', 'do_you_have_anxiety', 'do_you_have_panic_attack', 'did_you_seek_any_specialist_for_a_treatment']
for col in do_you_columns:
    # Normalise whitespace and letter case first so that answers such as
    # 'yes' or 'Yes ' are not silently encoded as 0. Anything that is not a
    # "yes" (including missing answers) becomes 0, as before.
    answers = subsetted_data[col].astype(str).str.strip().str.lower()
    subsetted_data[col] = (answers == 'yes').astype(int)

# Convert the selected categorical variables into dummy/indicator variables
subsetted_data = pd.get_dummies(subsetted_data, columns=[
    'choose_your_gender', 'what_is_your_course', 'your_current_year_of_study',
    'marital_status'
])

# Shared training/evaluation routine used by the three condition models below
def _train_random_forest_for_target(data, target_column, label):
    """Fit a random forest for one target column and print its test metrics.

    Args:
        data: DataFrame holding the target column together with the feature
            columns. Every column other than ``target_column`` is used as a
            feature.
        target_column: Name of the column to predict.
        label: Human-readable condition name used in the printed accuracy line.

    Returns:
        None. The accuracy and the classification report are printed.
    """
    X = data.drop(columns=[target_column])
    Y = data[target_column]

    # Hold out 20% of the rows for evaluation; the fixed seed keeps the split
    # reproducible between runs.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)
    print("{} Model Accuracy: {:.2f}".format(label, accuracy_score(Y_test, Y_pred)))
    print(classification_report(Y_test, Y_pred))

# Model for Depression
def train_random_forest_for_depression(data):
    """Train and report a random forest predicting 'do_you_have_depression'."""
    _train_random_forest_for_target(data, 'do_you_have_depression', "Depression")

# Model for Anxiety
def train_random_forest_for_anxiety(data):
    """Train and report a random forest predicting 'do_you_have_anxiety'."""
    _train_random_forest_for_target(data, 'do_you_have_anxiety', "Anxiety")

# Model for Panic Attack
def train_random_forest_for_panic_attack(data):
    """Train and report a random forest predicting 'do_you_have_panic_attack'."""
    _train_random_forest_for_target(data, 'do_you_have_panic_attack', "Panic Attack")

# Train and evaluate each model in turn, printing a banner before its report.
# Banners after the first start with a newline to separate the reports.
for banner, trainer in (
    ("----- Depression Random Forest -----", train_random_forest_for_depression),
    ("\n----- Anxiety Random Forest -----", train_random_forest_for_anxiety),
    ("\n----- Panic Attack Random Forest -----", train_random_forest_for_panic_attack),
):
    print(banner)
    trainer(subsetted_data)


----- Depression Random Forest -----
Depression Model Accuracy: 0.81
              precision    recall  f1-score   support

           0       0.81      0.93      0.87        14
           1       0.80      0.57      0.67         7

    accuracy                           0.81        21
   macro avg       0.81      0.75      0.77        21
weighted avg       0.81      0.81      0.80        21


----- Anxiety Random Forest -----
Anxiety Model Accuracy: 0.62
              precision    recall  f1-score   support

           0       0.60      1.00      0.75        12
           1       1.00      0.11      0.20         9

    accuracy                           0.62        21
   macro avg       0.80      0.56      0.47        21
weighted avg       0.77      0.62      0.51        21


----- Panic Attack Random Forest -----
Panic Attack Model Accuracy: 0.62
              precision    recall  f1-score   support

           0       0.75      0.75      0.75        16
           1       0.20      0