In [12]:
##This notebook was created with the help of ChatGPT
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np

In [13]:
# Load dataset
filtered_df = pd.read_csv('filtered_mental_health_data.csv')

In [14]:
# Prepare original dataset
feature_columns = [col for col in filtered_df.columns if not col.startswith('DSM')]
X = filtered_df[feature_columns]
y = filtered_df['DSM_MJD'].apply(lambda x: 1 if x == 1 else 0)

In [15]:
# Drop "PH4" 
X = X.drop(columns=['PH4'])

In [16]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [17]:
#Hyperparameter Tuning
param_grid = {
    'n_estimators': [100],
    'learning_rate': [0.1],
    'max_depth': [3]
}

grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

In [18]:
#Fit the model
grid_search.fit(X_train, y_train)
print("\nBest Hyperparameters (without 'PD4'):", grid_search.best_params_)
print("Best CV ROC-AUC Score (without 'PD4'):", grid_search.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Best Hyperparameters (without 'PD4'): {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best CV ROC-AUC Score (without 'PD4'): 0.789646802240014


In [19]:
# Evaluate on test set
y_pred = grid_search.predict(X_test)
y_pred_proba = grid_search.predict_proba(X_test)[:, 1]

In [20]:
#Metrics
roc_auc_scores = cross_val_score(
    grid_search.best_estimator_, X_test, y_test, cv=5, scoring='roc_auc')

accuracy_scores = cross_val_score(
    grid_search.best_estimator_, X_test, y_test, cv=5, scoring='accuracy')

# Compute mean and standard deviation
roc_auc_mean = np.mean(roc_auc_scores)
roc_auc_std = np.std(roc_auc_scores)
accuracy_mean = np.mean(accuracy_scores)
accuracy_std = np.std(accuracy_scores)

# Print classification report and metrics with standard deviation
print("\nClassification Report (without 'PD4'):\n", classification_report(y_test, y_pred))
print(f"Test ROC-AUC (without 'PD4'): {roc_auc_mean:.4f} ± {roc_auc_std:.4f}")
print(f"Test Accuracy (without 'PD4'): {accuracy_mean:.4f} ± {accuracy_std:.4f}")



Classification Report (without 'PD4'):
               precision    recall  f1-score   support

           0       0.84      0.95      0.89       802
           1       0.56      0.25      0.35       199

    accuracy                           0.81      1001
   macro avg       0.70      0.60      0.62      1001
weighted avg       0.78      0.81      0.78      1001

Test ROC-AUC (without 'PD4'): 0.7529 ± 0.0503
Test Accuracy (without 'PD4'): 0.7912 ± 0.0185
