In [None]:
#Importing libraries for machine learning models
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings

In [None]:
# Ignore warnings
warnings.filterwarnings('ignore')

In [None]:
file_path = '/content/sample_data/student_depression_dataset.csv'
df = pd.read_csv(file_path)

In [None]:
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No,0


In [None]:
df.info() #Checking for NULL values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

In [None]:
#Data Preprocessing
df = df.drop('id', axis = 1)
df.head()

Unnamed: 0,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0
2,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0
3,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes,1
4,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No,0


In [None]:
#Imputing missing values
#Imputing with mean
numerical_features = df.select_dtypes(include=['number']).columns
imputer_numerical = SimpleImputer(strategy='mean')
df[numerical_features] = imputer_numerical.fit_transform(df[numerical_features])

In [None]:
#Imputing with mode
categorical_features = df.select_dtypes(include=['object']).columns
imputer_categorical = SimpleImputer(strategy='most_frequent')
df[categorical_features] = imputer_categorical.fit_transform(df[categorical_features])

In [None]:
#Encoding categorical variables
onehot_features = ['Gender']
encoder_onehot = OneHotEncoder(sparse_output=False, drop='first')
encoded_data = encoder_onehot.fit_transform(df[onehot_features])
encoded_df = pd.DataFrame(encoded_data, columns=encoder_onehot.get_feature_names_out(onehot_features))
df = df.drop(onehot_features, axis=1)
df = pd.concat([df, encoded_df], axis=1)

In [None]:
#Label encoding for ordinal features
label_features = ['Degree']
encoder_label = LabelEncoder()
for feature in label_features:
  df[feature] = encoder_label.fit_transform(df[feature])

In [None]:
#Standardizing numerical features
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [None]:
#Creating Age group categories
df['age_group'] = pd.cut(df['Age'], bins=[0, 18, 25, 35, float('inf')],
                         labels=['Teens', 'Young Adult', 'Adult', 'Senior'])

In [1]:
#Splitting data into train and test set
X = df.drop('Depression', axis=1)
y = df['Depression']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#Checking the unique values in y_train
print("Unique values in y_train:", y_train.unique())

NameError: name 'df' is not defined

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
print("Original Depression values:", df['Depression'].unique())

# Convert Depression to categorical using Binning with proper thresholds
# Adjust the bins to ensure multiple classes are present
df['Depression'] = pd.cut(df['Depression'],
                         #bins=[-float('inf'), -0.5, 0.5, float('inf')],
                         bins=[-float('inf'), 0, float('inf')],
                          labels=[0, 1])

print("Unique Depression values after binning:", df['Depression'].unique())

# Ensure there are multiple classes in the training data
print("\nDistribution of Depression in training data:")
print(y_train.value_counts())

Original Depression values: [ 0.84139506 -1.18850235]
Unique Depression values after binning: [1, 0]
Categories (2, int64): [0 < 1]

Distribution of Depression in training data:
Depression
 0.841395    13068
-1.188502     9252
Name: count, dtype: int64


In [None]:
#Defining preprocessor
numerical_features = X.select_dtypes(include=['number']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

In [None]:
#Creating transformers for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # Apply StandardScaler to numerical features
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

In [None]:
#Combining transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
#Training and evaluating classifiers
classifiers = {
    "Random Forest": RandomForestClassifier(random_state=42, class_weight='balanced'),
    "SVM": SVC(random_state=42, class_weight='balanced'),
    "XGBoost": XGBClassifier(random_state=42, scale_pos_weight=1),
    "Logistic Regression": LogisticRegression(random_state=42, class_weight='balanced')
}

for name, clf in classifiers.items():
    model = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', clf)])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=0)
    print(f"Classifier: {name}")
    print(f"Accuracy: {accuracy}")
    print(f"Classification Report:\n{report}\n")
    print(y_train.unique())

Classifier: Random Forest
Accuracy: 0.8369467837305142
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.78      0.80      2313
           1       0.85      0.88      0.86      3268

    accuracy                           0.84      5581
   macro avg       0.83      0.83      0.83      5581
weighted avg       0.84      0.84      0.84      5581


[1, 0]
Categories (2, int64): [0 < 1]
Classifier: SVM
Accuracy: 0.8416054470524995
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.83      0.81      2313
           1       0.88      0.85      0.86      3268

    accuracy                           0.84      5581
   macro avg       0.84      0.84      0.84      5581
weighted avg       0.84      0.84      0.84      5581


[1, 0]
Categories (2, int64): [0 < 1]
Classifier: XGBoost
Accuracy: 0.8358717075792869
Classification Report:
              precision    recall  f1-score   supp

In [None]:
#Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[1938  375]
 [ 509 2759]]


In [None]:
# Define models and their hyperparameters
models = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'params': {
            'penalty': ['l1', 'l2'],
            'C': [0.1, 1, 10],
            'solver': ['liblinear', 'lbfgs', 'saga']
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(),
        'params': {
            'model__n_estimators': [100, 200],
            'model__max_depth': [None, 5, 10],
            'model__min_samples_split': [2, 5]
        }
    },
    'SVM': {
        'model': SVC(),
        'params': {
            'model__kernel': ['linear', 'rbf'],
            'model__C': [0.1, 1, 10]
        }
    },
    'XGBoost': {
        'model': XGBClassifier(),
        'params': {
            'model__learning_rate': [0.1, 0.5],
            'model__max_depth': [3, 5],
            'model__n_estimators': [50, 100]
        }
    }
}

In [None]:
# Perform grid search for each model
for model_name, model_info in models.items():
    model = model_info['model']
    param_grid = model_info['params']

    # Create pipeline with preprocessing
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Initialize GridSearchCV
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy'
    )

In [None]:
# Fit the grid search with training data
grid_search.fit(X_train, y_train)

# Print results
print(f"Model: {model_name}")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Accuracy: {grid_search.best_score_}")
print("-" * 50)

Model: XGBoost
Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100}
Best Accuracy: 0.8485215053763442
--------------------------------------------------


In [None]:
# Define parameter distributions for Randomized Search
param_dist = {
    'model': ['Logistic Regression', 'Random Forest', 'SVM', 'XGBoost'],
    'model__penalty': ['l1', 'l2'],
    'model__C': [0.1, 1, 10],
    'model__solver': ['liblinear', 'lbfgs', 'saga'],
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 5, 10],
    'model__min_samples_split': [2, 5],
    'model__kernel': ['linear', 'rbf'],
    'model__learning_rate': [0.1, 0.5]
}

In [None]:
# Create a list of models with their corresponding parameter distributions
model_param_list = [
    (LogisticRegression(), {
        'model__penalty': ['l1', 'l2'],
        'model__C': [0.1, 1, 10],
        'model__solver': ['liblinear', 'lbfgs', 'saga']
    }),
    (RandomForestClassifier(), {
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 5, 10],
        'model__min_samples_split': [2, 5]
    }),
    (SVC(), {
        'model__kernel': ['linear', 'rbf'],
        'model__C': [0.1, 1, 10]
    }),
    (XGBClassifier(enable_categorical=True, eval_metric='logloss'), {
        'model__learning_rate': [0.1, 0.5],
        'model__max_depth': [3, 5],
        'model__n_estimators': [50, 100]
    })
]

In [None]:
# Perform randomized search for each model
for model_name, model_info in models.items():
    model = model_info['model']
    param = model_info['params']

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param,
        n_iter=5,  # Reduced number of iterations
        cv=5,
        scoring='accuracy',
        random_state=42,
        error_score='raise'
    )

    try:
        random_search.fit(X_train, y_train)
        print(f"Model: {model_name}")
        print(f"Best Parameters: {random_search.best_params_}")
        print(f"Best Accuracy: {random_search.best_score_}")
        print("-" * 50)
    except Exception as e:
        print(f"\nError occurred during {model_name} training: {str(e)}")
        continue


Error occurred during Logistic Regression training: Invalid parameter 'solver' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
       'Study Satisfaction', 'Job Satisfaction', 'Degree', 'Work/Study Hours',
       'Gender_Male'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  Index(['City', 'Pro