In [1]:
#This notebook was created with the help of ChatGPT
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
import numpy as np


In [2]:
# Load dataset
filtered_df = pd.read_csv('filtered_mental_health_data.csv')

In [3]:
#Features and Target
feature_columns = [col for col in filtered_df.columns if not col.startswith('DSM')]
X = filtered_df[feature_columns]
y = filtered_df['DSM_MJD'].apply(lambda x: 1 if x == 1 else 0)

In [4]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
# Pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  
    ('logreg', LogisticRegression(solver='liblinear', random_state=42))])

In [8]:
# Hyperparameter tuning
param_grid = {
    'logreg__C': [0.01, 0.1, 1, 10, 100],
    'logreg__penalty': ['l1', 'l2']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)
print("Best CV ROC-AUC Score:", grid_search.best_score_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'logreg__C': 0.01, 'logreg__penalty': 'l1'}
Best CV ROC-AUC Score: 0.7867985581687418


In [9]:
#Testset
y_pred = grid_search.predict(X_test)
y_pred_proba = grid_search.predict_proba(X_test)[:, 1]

In [10]:
#Metrics
roc_auc_scores = cross_val_score(
    grid_search.best_estimator_, X_test, y_test,
    cv=5, scoring='roc_auc'
)


accuracy_scores = cross_val_score(
    grid_search.best_estimator_, X_test, y_test,
    cv=5, scoring='accuracy'
)

roc_auc_mean = np.mean(roc_auc_scores)
roc_auc_std = np.std(roc_auc_scores)

accuracy_mean = np.mean(accuracy_scores)
accuracy_std = np.std(accuracy_scores)

print("\nClassification Report:\n", classification_report(y_test, y_pred))

print(f"Test ROC-AUC: {roc_auc_mean:.4f} ± {roc_auc_std:.4f}")
print(f"Test Accuracy: {accuracy_mean:.4f} ± {accuracy_std:.4f}")


Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.97      0.89       802
           1       0.56      0.18      0.27       199

    accuracy                           0.81      1001
   macro avg       0.69      0.57      0.58      1001
weighted avg       0.77      0.81      0.77      1001

Test ROC-AUC: 0.7482 ± 0.0289
Test Accuracy: 0.8082 ± 0.0084
