In [15]:
#This notebook was created with the help of ChatGPT
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score
import numpy as np

In [16]:
# Load dataset
filtered_df = pd.read_csv('filtered_mental_health_data.csv')

In [17]:
# Prepare dataset
feature_columns = [col for col in filtered_df.columns if not col.startswith('DSM')]
X = filtered_df[feature_columns]
y = filtered_df['DSM_MJD'].apply(lambda x: 1 if x == 1 else 0)

In [18]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [19]:
#Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

In [20]:
# Fit the model
grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)
print("Best CV ROC-AUC Score:", grid_search.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best CV ROC-AUC Score: 0.7939835385514478


In [21]:
# Evaluate on test set
y_pred = grid_search.predict(X_test)
y_pred_proba = grid_search.predict_proba(X_test)[:, 1]

In [22]:
# Metrics

roc_auc_scores = cross_val_score(
    grid_search.best_estimator_, X_test, y_test,
    cv=5, scoring='roc_auc'
)

# Calculate cross-validated Accuracy scores
accuracy_scores = cross_val_score(
    grid_search.best_estimator_, X_test, y_test,
    cv=5, scoring='accuracy'
)

# Compute mean and standard deviation
roc_auc_mean = np.mean(roc_auc_scores)
roc_auc_std = np.std(roc_auc_scores)

accuracy_mean = np.mean(accuracy_scores)
accuracy_std = np.std(accuracy_scores)

# Print detailed classification report on test set
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Print ROC-AUC and Accuracy with standard deviation
print(f"Test ROC-AUC: {roc_auc_mean:.4f} ± {roc_auc_std:.4f}")
print(f"Test Accuracy: {accuracy_mean:.4f} ± {accuracy_std:.4f}")



Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.95      0.89       802
           1       0.55      0.27      0.36       199

    accuracy                           0.81      1001
   macro avg       0.70      0.61      0.62      1001
weighted avg       0.78      0.81      0.78      1001

Test ROC-AUC: 0.7600 ± 0.0345
Test Accuracy: 0.7872 ± 0.0335


In [23]:
#Feature Importance
best_gb_model = grid_search.best_estimator_
feature_importances_top = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_gb_model.feature_importances_
}).sort_values(by='importance', ascending=False)

print("Top 10 Important Features:")
print(feature_importances_top.head(10))

feature_importances_bottom = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_gb_model.feature_importances_
}).sort_values(by='importance', ascending=True)  


print("10 Least Important Features:")
print(feature_importances_bottom.head(10))

Top 10 Important Features:
       feature  importance
119        PH4    0.097139
232      PEC52    0.068884
333       SA15    0.040146
94          M5    0.035889
309      NSD1E    0.035313
268         M9    0.027730
10         PD9    0.026710
96   IR1INTRO2    0.022511
98        PD1B    0.021600
103       PD1G    0.020215
10 Least Important Features:
    feature  importance
554  INC_HI         0.0
207    SA1C         0.0
206    SA1B         0.0
205    SA1A         0.0
204    CD18         0.0
203   CD16H         0.0
202   CD16G         0.0
201   CD16F         0.0
200   CD16E         0.0
199   CD16D         0.0


In [24]:
#Zero Importance Features
zero_importance_features = feature_importances[feature_importances['importance'] == 0.0]
num_zero_importance = zero_importance_features.shape[0]

print(f"Number of features with zero importance: {num_zero_importance}")


Number of features with zero importance: 297


In [26]:
#
import pandas as pd

results_df = X_test.copy()
results_df['actual'] = y_test.values
results_df['predicted'] = y_pred

# Top 10 important features
top_features = ['PH4', 'PEC52', 'SA15', 'M5', 'NSD1E', 
                'M9', 'PD9', 'IR1INTRO2', 'PD1B', 'PD1G']

# Misclassification examples
false_negatives = results_df[(results_df['actual'] == 1) & (results_df['predicted'] == 0)].iloc[[0, 1]]
false_positive = results_df[(results_df['actual'] == 0) & (results_df['predicted'] == 1)].iloc[[0]]

print("\nFalse Negatives (actual=1, predicted=0) top features:\n", false_negatives[top_features])
print("\nFalse Positive (actual=0, predicted=1) top features:\n", false_positive[top_features])

# Compute and print mode separately for correct positives and correct negatives
correct_positives = results_df[(results_df['actual'] == 1) & (results_df['predicted'] == 1)]
correct_negatives = results_df[(results_df['actual'] == 0) & (results_df['predicted'] == 0)]

correct_pos_modes = correct_positives[top_features].mode().iloc[0]
correct_neg_modes = correct_negatives[top_features].mode().iloc[0]

print("\nMode values of top features for correctly predicted POSITIVE cases:\n", correct_pos_modes)
print("\nMode values of top features for correctly predicted NEGATIVE cases:\n", correct_neg_modes)



False Negatives (actual=1, predicted=0) top features:
       PH4  PEC52  SA15  M5  NSD1E  M9  PD9  IR1INTRO2  PD1B  PD1G
1073   -9      5    -7   1      4   3    1         -7     1     1
501    -9      5     1  -7      3   1    1         -7     1    -5

False Positive (actual=0, predicted=1) top features:
       PH4  PEC52  SA15  M5  NSD1E  M9  PD9  IR1INTRO2  PD1B  PD1G
1501   -9      5    -7  -7      4   2    5         -7     5     5

Mode values of top features for correctly predicted POSITIVE cases:
 PH4         -9.0
PEC52        5.0
SA15        -7.0
M5          -7.0
NSD1E        2.0
M9          -7.0
PD9         -7.0
IR1INTRO2   -7.0
PD1B        -7.0
PD1G        -7.0
Name: 0, dtype: float64

Mode values of top features for correctly predicted NEGATIVE cases:
 PH4         -9
PEC52        5
SA15        -7
M5          -7
NSD1E        4
M9          -7
PD9         -7
IR1INTRO2   -7
PD1B         1
PD1G        -5
Name: 0, dtype: int64
