In [1]:
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, \
                            confusion_matrix, ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score

In [2]:
## Load Dataset
df = pd.read_csv("student_depression_dataset.csv")
df.drop(columns=['id', 'City'], inplace=True)

## Data Preprocessing

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 16 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Gender                                 27901 non-null  int64  
 1   Age                                    27901 non-null  float64
 2   Profession                             27901 non-null  int64  
 3   Academic Pressure                      27901 non-null  float64
 4   Work Pressure                          27901 non-null  float64
 5   CGPA                                   27901 non-null  float64
 6   Study Satisfaction                     27901 non-null  float64
 7   Job Satisfaction                       27901 non-null  float64
 8   Sleep Duration                         27901 non-null  int64  
 9   Dietary Habits                         27901 non-null  int64  
 10  Degree                                 27901 non-null  int64  
 11  Ha

In [31]:
df.isnull().sum()

Gender                                   0
Age                                      0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64

In [33]:
df.head()

Unnamed: 0,Gender,Age,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,1,33.0,12,5.0,0.0,8.97,2.0,0.0,0,0,4,1,3.0,0,0,1
1,0,24.0,12,2.0,0.0,5.9,5.0,0.0,0,1,11,0,3.0,1,1,0
2,1,31.0,12,3.0,0.0,7.03,5.0,0.0,2,0,6,0,9.0,0,1,0
3,0,28.0,12,3.0,0.0,5.59,2.0,0.0,1,1,8,1,4.0,4,1,1
4,0,25.0,12,4.0,0.0,8.13,3.0,0.0,0,1,17,1,1.0,0,0,0


In [34]:
df['Profession'].value_counts()

Profession
12    27870
5         8
13        6
2         3
1         2
6         2
7         2
11        2
0         1
4         1
3         1
10        1
9         1
8         1
Name: count, dtype: int64

In [35]:
df['Work Pressure'].value_counts()

Work Pressure
0.0    27898
5.0        2
2.0        1
Name: count, dtype: int64

In [36]:
df.shape

(27901, 16)

In [37]:
df.dtypes

Gender                                     int64
Age                                      float64
Profession                                 int64
Academic Pressure                        float64
Work Pressure                            float64
CGPA                                     float64
Study Satisfaction                       float64
Job Satisfaction                         float64
Sleep Duration                             int64
Dietary Habits                             int64
Degree                                     int64
Have you ever had suicidal thoughts ?      int64
Work/Study Hours                         float64
Financial Stress                           int64
Family History of Mental Illness           int64
Depression                                 int64
dtype: object

In [3]:
## Encode Categorical Features
label_encoder = {}
for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoder[col] = le

In [4]:
## Feature Scaling
X = df.drop(columns=['Depression'])
y = df['Depression']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [6]:
## Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [10]:
## Model Dictionary (Initial Testing)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC


models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boost": GradientBoostingClassifier(),
    "Adaboost": AdaBoostClassifier()
}

In [12]:
## Evaluate Models Before Tuning
def evaluate_models(models, X_train, y_train, X_test, y_test):
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        print(f"----- {name} -----")
        print("Training:")
        print("- Accuracy: {:.4f}".format(accuracy_score(y_train, y_train_pred)))
        print("- F1 Score: {:.4f}".format(f1_score(y_train, y_train_pred, average='weighted')))
        print("- Precision: {:.4f}".format(precision_score(y_train, y_train_pred)))
        print("- Recall: {:.4f}".format(recall_score(y_train, y_train_pred)))
        print("- ROC AUC: {:.4f}".format(roc_auc_score(y_train, y_train_pred)))

        print("Testing:")
        print("- Accuracy: {:.4f}".format(accuracy_score(y_test, y_test_pred)))
        print("- F1 Score: {:.4f}".format(f1_score(y_test, y_test_pred, average='weighted')))
        print("- Precision: {:.4f}".format(precision_score(y_test, y_test_pred)))
        print("- Recall: {:.4f}".format(recall_score(y_test, y_test_pred)))
        print("- ROC AUC: {:.4f}".format(roc_auc_score(y_test, y_test_pred)))
        print("="*40)

evaluate_models(models, X_train, y_train, X_test, y_test)

----- Logistic Regression -----
Training:
- Accuracy: 0.8482
- F1 Score: 0.8476
- Precision: 0.8579
- Recall: 0.8885
- ROC AUC: 0.8397
Testing:
- Accuracy: 0.8371
- F1 Score: 0.8367
- Precision: 0.8509
- Recall: 0.8721
- ROC AUC: 0.8304
----- Decision Tree -----
Training:
- Accuracy: 1.0000
- F1 Score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- ROC AUC: 1.0000
Testing:
- Accuracy: 0.7457
- F1 Score: 0.7463
- Precision: 0.7875
- Recall: 0.7693
- ROC AUC: 0.7412
----- Random Forest -----
Training:
- Accuracy: 1.0000
- F1 Score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- ROC AUC: 1.0000
Testing:
- Accuracy: 0.8296
- F1 Score: 0.8292
- Precision: 0.8445
- Recall: 0.8657
- ROC AUC: 0.8227
----- Gradient Boost -----
Training:
- Accuracy: 0.8548
- F1 Score: 0.8542
- Precision: 0.8640
- Recall: 0.8931
- ROC AUC: 0.8467
Testing:
- Accuracy: 0.8375
- F1 Score: 0.8369
- Precision: 0.8484
- Recall: 0.8765
- ROC AUC: 0.8300
----- Adaboost -----
Training:
- Accuracy: 0.8494
- F1 Score: 0.8488


In [13]:
## Hyperparameter Tuning with RandomizedSearchCV
randomcv_models = [
    ("RandomForest", RandomForestClassifier(), {
        'n_estimators': [100, 200, 500, 1000],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'max_features': ['auto', 5, 7, 8]
    }),
    ("SVC", SVC(), {
        'C': [0.1, 1, 10],
        'gamma': [0.01, 0.1, 1],
        'kernel': ['rbf', 'linear']
    }),
    ("Adaboost", AdaBoostClassifier(), {
        'n_estimators': [50, 60, 70, 80, 90],
        'algorithm': ['SAMME', 'SAMME.R']
    })
]

best_models = {}
for name, model, params in randomcv_models:
    search = RandomizedSearchCV(model, param_distributions=params, n_iter=10, cv=3, verbose=1, n_jobs=-1)
    search.fit(X_train, y_train)
    best_models[name] = search.best_estimator_
    print(f"\nBest parameters for {name}: {search.best_params_}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits

Best parameters for RandomForest: {'n_estimators': 200, 'min_samples_split': 10, 'max_features': 7, 'max_depth': 10}
Fitting 3 folds for each of 10 candidates, totalling 30 fits

Best parameters for SVC: {'kernel': 'rbf', 'gamma': 0.01, 'C': 10}
Fitting 3 folds for each of 10 candidates, totalling 30 fits





Best parameters for Adaboost: {'n_estimators': 80, 'algorithm': 'SAMME'}


In [14]:
## Final Evaluation with Tuned Models
evaluate_models(best_models, X_train, y_train, X_test, y_test)

----- RandomForest -----
Training:
- Accuracy: 0.8887
- F1 Score: 0.8883
- Precision: 0.8915
- Recall: 0.9226
- ROC AUC: 0.8816
Testing:
- Accuracy: 0.8321
- F1 Score: 0.8315
- Precision: 0.8437
- Recall: 0.8721
- ROC AUC: 0.8245
----- SVC -----
Training:
- Accuracy: 0.8528
- F1 Score: 0.8513
- Precision: 0.8490
- Recall: 0.9111
- ROC AUC: 0.8405
Testing:
- Accuracy: 0.8362
- F1 Score: 0.8349
- Precision: 0.8362
- Recall: 0.8925
- ROC AUC: 0.8255
----- Adaboost -----
Training:
- Accuracy: 0.8494
- F1 Score: 0.8488
- Precision: 0.8597
- Recall: 0.8882
- ROC AUC: 0.8412
Testing:
- Accuracy: 0.8400
- F1 Score: 0.8395
- Precision: 0.8516
- Recall: 0.8771
- ROC AUC: 0.8329


In [16]:
print(df.columns)
print(X.columns)


Index(['Gender', 'Age', 'Profession', 'Academic Pressure', 'Work Pressure',
       'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration',
       'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?',
       'Work/Study Hours', 'Financial Stress',
       'Family History of Mental Illness', 'Depression'],
      dtype='object')
Index(['Gender', 'Age', 'Profession', 'Academic Pressure', 'Work Pressure',
       'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration',
       'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?',
       'Work/Study Hours', 'Financial Stress',
       'Family History of Mental Illness'],
      dtype='object')


In [29]:
sample_input_dict = {
    'Gender': 1,         
    'Age': 21,
    'Profession': 2,
    'Academic Pressure': 2,
    'Work Pressure': 2,
    'CGPA': 5.67,
    'Study Satisfaction': 2,
    'Job Satisfaction': 1,
    'Sleep Duration': 6,
    'Dietary Habits': 3,
    'Social Support': 1,
    'Degree': 2,
    'Have you ever had suicidal thoughts ?': 1,
    'Work/Study Hours': 1,
    'Financial Stress': 1,
    'Family History of Mental Illness': 0,
}

# Convert to DataFrame
sample_input_df = pd.DataFrame([sample_input_dict])

# Ensure column order matches training data
sample_input_df = sample_input_df[X.columns]

# Scale the input
sample_input_scaled = scaler.transform(sample_input_df)

final_model = best_models["RandomForest"] 
pred = final_model.predict(sample_input_scaled)

print("Predicted Depression (0 = No, 1 = Yes):", pred[0])




Predicted Depression (0 = No, 1 = Yes): 0
