In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.combine import SMOTEENN, SMOTETomek
from tqdm import tqdm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve, precision_score,recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import KNNImputer
import pickle



In [None]:
# Load the raw survey CSV into a DataFrame.
# NOTE(review): the path is absolute ('/content/...') — confirm the file
# location before running this notebook in another environment.
data = pd.read_csv('/content/Student Depression Dataset.csv')
# Bare last expression so the notebook renders the loaded frame.
data

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Health Condition,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33,Visakhapatnam,Student,5,0,8.97,2,0,5-6 hours,Healthy,B.Pharm,Yes,3,1.0,No,1
1,8,Female,24,Bangalore,Student,2,0,5.90,5,0,5-6 hours,Moderate,BSc,No,3,2.0,Yes,0
2,26,Male,31,Srinagar,Student,3,0,7.03,5,0,Less than 5 hours,Healthy,BA,No,9,1.0,Yes,0
3,30,Female,28,Varanasi,Student,3,0,5.59,2,0,7-8 hours,Moderate,BCA,Yes,4,5.0,Yes,1
4,32,Female,25,Jaipur,Student,4,0,8.13,3,0,5-6 hours,Moderate,M.Tech,Yes,1,1.0,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27896,140685,Female,27,Surat,Student,5,0,5.75,5,0,5-6 hours,Unhealthy,Class 12,Yes,7,1.0,Yes,0
27897,140686,Male,27,Ludhiana,Student,2,0,9.40,3,0,Less than 5 hours,Healthy,MSc,No,0,3.0,Yes,0
27898,140689,Male,31,Faridabad,Student,3,0,6.61,4,0,5-6 hours,Unhealthy,MD,No,12,2.0,No,0
27899,140690,Female,18,Ludhiana,Student,5,0,6.88,2,0,Less than 5 hours,Healthy,Class 12,Yes,10,5.0,No,1


In [None]:
# Remove columns that are not part of the feature set selected below.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps the
# cell idempotent: re-running it once the columns are already gone no longer
# raises KeyError.
data = data.drop(columns=["Job Satisfaction", "Work Pressure", "id"], errors="ignore")

In [None]:
# Feature matrix: the nine predictor columns, in this order.
X = data[[
    'Gender',
    'Age',
    'Have you ever had suicidal thoughts ?',
    'Financial Stress',
    'CGPA',
    'Academic Pressure',
    'Degree',
    'Work/Study Hours',
    'Sleep Duration',
]]
# Target vector: the "Health Condition" column.
Y = data["Health Condition"]

In [None]:
# Check the shape of the feature matrix (rows, columns).

X.shape

(27901, 9)

In [None]:
# Check the shape of the target vector.
Y.shape

(27901,)

In [None]:
# Split the feature columns by dtype so each group gets its own preprocessing.
numeric_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(include="object").columns)

# Independent copy of the numeric column names, kept alongside the pipeline.
original_numeric_feature_names = list(numeric_features)

# Numeric columns: KNN-based imputation followed by standardisation.
num_pipline = Pipeline([
    ("imputer", KNNImputer()),
    ("scaler", StandardScaler()),
])

# Categorical columns: ordinal (integer) encoding.
# NOTE(review): OrdinalEncoder is left at its defaults — confirm how unseen
# categories should be handled when the pickled preprocessor is reused.
cat_pipline = Pipeline([("encoder", OrdinalEncoder())])

# Route each column group through its pipeline.
preprocessor = ColumnTransformer([
    ("num", num_pipline, numeric_features),
    ("cat", cat_pipline, categorical_features),
])


In [None]:
# Render the assembled ColumnTransformer in the notebook.
preprocessor

In [None]:
# Fit the preprocessor on the full feature matrix X and transform it.
# NOTE(review): this runs before any train/test split, so the imputer and
# scaler are fitted on rows that later end up in the test set.

X_pre_transformed = preprocessor.fit_transform(X)

In [None]:
# Check the shape of the transformed feature matrix.
X_pre_transformed.shape

(27901, 9)

In [None]:
# Persist the fitted preprocessing pipeline so it can be reloaded later.
with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

In [None]:
# Turn the string target labels into integer codes.
# NOTE: despite its name, y_resampled holds the encoded labels *before* any
# resampling; the resampling step runs in a later cell.
encode = LabelEncoder()
y_resampled = encode.fit_transform(Y)

# Label -> integer code lookup, following the order of encode.classes_.
class_mapping = {label: code for code, label in enumerate(encode.classes_)}

print("Encoded values:", y_resampled)
print("Class mapping:", class_mapping)

Encoded values: [0 1 0 ... 3 0 0]
Class mapping: {'Healthy': 0, 'Moderate': 1, 'Others': 2, 'Unhealthy': 3}


In [None]:
# Persist the encoded labels together with the label -> code lookup.
# NOTE(review): the output file name 'enoded' has no extension and may be a
# typo for 'encoded' — confirm against whatever reads this file.
with open('enoded', 'wb') as encoded_file:
    pickle.dump(
        {
            'encoded_values': y_resampled,
            'class_mapping': class_mapping,
        },
        encoded_file,
    )

In [None]:
# Display the integer-encoded target labels.
y_resampled

array([0, 1, 0, ..., 3, 0, 0])

In [None]:
# Rebalance the classes with SMOTEENN.
# - random_state makes the resampled dataset reproducible across runs.
# - The labels are re-derived from Y through the already-fitted encoder rather
#   than read from y_resampled, so re-running this cell does not feed
#   already-resampled labels (of a different length) back into fit_resample.
# NOTE(review): resampling is applied to the whole dataset, i.e. before the
# train/test split performed later, so resampled rows can land in the test set.
smt = SMOTEENN(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X_pre_transformed, encode.transform(Y))
X_resampled.shape, y_resampled.shape

((13563, 9), (13563,))

In [None]:
def evaluate_clf(true, predicted):
    """Score multiclass predictions against the ground-truth labels.

    Parameters:
        true (array-like): Ground-truth labels.
        predicted (array-like): Labels predicted by a classifier.

    Returns:
        tuple: (accuracy, f1, precision, recall). F1, precision and recall are
        computed with average='weighted'. ROC-AUC is not computed.
    """
    accuracy = accuracy_score(true, predicted)
    # The remaining three metrics share the same call signature, so evaluate
    # them in a fixed order: F1, precision, recall.
    weighted = tuple(
        scorer(true, predicted, average='weighted')
        for scorer in (f1_score, precision_score, recall_score)
    )
    return (accuracy,) + weighted

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import pickle  # For saving models in pickle format
import os  # For handling directories

def evaluate_models(X, y, models, params, save_dir='saved_models'):
    """
    Tune, fit, save and score each model, returning a comparison report.

    For every entry in ``models`` a grid search selects the best
    hyperparameters on the training split, the estimator is updated and fitted
    in place (so the objects in ``models`` are trained when this returns),
    pickled to ``save_dir`` and scored on both the training and test splits.

    Parameters:
        X (array-like): Features dataset.
        y (array-like): Target labels.
        models (dict): Dictionary of model names and instances.
        params (dict): Dictionary of hyperparameter grids, keyed by the same
            names as ``models``.
        save_dir (str): Directory to save the best models in pickle format.

    Returns:
        pd.DataFrame: Evaluation report sorted by Test Accuracy (descending).
        Empty (but with the report columns) when ``models`` is empty.

    Raises:
        KeyError: If a model name has no entry in ``params``.
    """
    report_columns = [
        'Model Name',
        'Train Accuracy', 'Test Accuracy',
        'Train F1 Score', 'Test F1 Score',
        'Train Precision', 'Test Precision',
        'Train Recall', 'Test Recall',
    ]

    # Ensure the save directory exists
    os.makedirs(save_dir, exist_ok=True)

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    results = []

    for model_name, model in models.items():
        print(f"Processing model: {model_name}")

        # Grid search for hyperparameter tuning. refit=False because the model
        # is fitted explicitly right below; letting GridSearchCV refit as well
        # would train the winning configuration twice.
        gs = GridSearchCV(model, params[model_name], cv=3, verbose=3, refit=False)
        gs.fit(X_train, y_train)

        # Apply the best parameters and fit the caller's estimator in place
        model.set_params(**gs.best_params_)
        model.fit(X_train, y_train)

        # Save the trained model in pickle format
        model_path = os.path.join(save_dir, f"{model_name}_best_model.pkl")
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
        print(f"Best model for {model_name} saved to: {model_path}")

        # Make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Evaluate performance on training and test sets
        train_metrics = evaluate_clf(y_train, y_train_pred)
        test_metrics = evaluate_clf(y_test, y_test_pred)

        # Store results
        result_entry = {
            'Model Name': model_name,
            'Train Accuracy': train_metrics[0],
            'Test Accuracy': test_metrics[0],
            'Train F1 Score': train_metrics[1],
            'Test F1 Score': test_metrics[1],
            'Train Precision': train_metrics[2],
            'Test Precision': test_metrics[2],
            'Train Recall': train_metrics[3],
            'Test Recall': test_metrics[3],
        }

        results.append(result_entry)

        # Print the results for the current model
        print(f"Results for {model_name}:")
        for metric_name, value in result_entry.items():
            if isinstance(value, str):
                # A '.4' precision on a string truncates it to 4 characters,
                # so the model name is printed verbatim.
                print(f"{metric_name}: {value}")
            else:
                print(f"{metric_name}: {value:.4}")
        print("=" * 40)

    # Passing the columns explicitly keeps sort_values working when no model
    # was evaluated (an empty frame would otherwise lack 'Test Accuracy').
    report = pd.DataFrame(results, columns=report_columns).sort_values(
        by='Test Accuracy', ascending=False
    )

    return report


In [None]:
# Candidate estimators. Every estimator with a stochastic component is seeded
# with the same random_state so repeated runs give the same ranking.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
}

# Define hyperparameter grids (keys must match the names used in `models`)
param_grids = {
    'LogisticRegression': {
        "class_weight": ["balanced"],
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga']
    },
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7, 9]

    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'GradientBoostingClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
        'min_samples_split': [2]
    },
    'DecisionTreeClassifier': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2]
    },

}

In [None]:
# Tune, fit, save and score every candidate model on the resampled data.
report = evaluate_models(X_resampled, y_resampled, models, param_grids)

Processing model: LogisticRegression
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END C=0.001, class_weight=balanced, penalty=l1, solver=liblinear;, score=0.766 total time=   0.0s
[CV 2/3] END C=0.001, class_weight=balanced, penalty=l1, solver=liblinear;, score=0.766 total time=   0.0s
[CV 3/3] END C=0.001, class_weight=balanced, penalty=l1, solver=liblinear;, score=0.767 total time=   0.0s
[CV 1/3] END C=0.001, class_weight=balanced, penalty=l1, solver=saga;, score=0.103 total time=   0.2s
[CV 2/3] END C=0.001, class_weight=balanced, penalty=l1, solver=saga;, score=0.102 total time=   0.1s
[CV 3/3] END C=0.001, class_weight=balanced, penalty=l1, solver=saga;, score=0.099 total time=   0.1s
[CV 1/3] END C=0.001, class_weight=balanced, penalty=l2, solver=liblinear;, score=0.798 total time=   0.0s
[CV 2/3] END C=0.001, class_weight=balanced, penalty=l2, solver=liblinear;, score=0.811 total time=   0.0s
[CV 3/3] END C=0.001, class_weight=balanced, penalty=l2, solv

In [None]:
# Display the per-model evaluation report.
report

Unnamed: 0,Model Name,Train Accuracy,Test Accuracy,Train F1 Score,Test F1 Score,Train Precision,Test Precision,Train Recall,Test Recall
1,KNeighborsClassifier,0.972995,0.941025,0.972188,0.937503,0.973121,0.937853,0.972995,0.941025
2,RandomForestClassifier,1.0,0.935864,1.0,0.931529,1.0,0.934326,1.0,0.935864
3,GradientBoostingClassifier,0.999078,0.931073,0.999078,0.92843,0.999079,0.928282,0.999078,0.931073
4,DecisionTreeClassifier,1.0,0.90564,1.0,0.905439,1.0,0.905287,1.0,0.90564
0,LogisticRegression,0.841567,0.836712,0.813502,0.807061,0.813005,0.806138,0.841567,0.836712


In [None]:
# prompt: save best model

import pickle
from sklearn.model_selection import GridSearchCV

# NOTE: this cell intentionally defines nothing.
# It used to hold a placeholder `evaluate_models` stub that parsed as valid
# Python and therefore silently replaced the working function defined earlier
# in this notebook with a body referencing undefined names (`model_name`,
# `model`). The earlier `evaluate_models` already pickles each tuned model to
# `<save_dir>/<model_name>_best_model.pkl`, so no redefinition is needed here.

In [None]:
# Persist the top-ranked model from the evaluation report.
# `report` is sorted by Test Accuracy in descending order, so its first row
# names the best model; evaluate_models fits the estimators held in `models`
# in place, so the matching entry is the tuned, trained estimator.
# Resolving the model *before* opening the file also avoids leaving behind an
# empty 'Best_model' file if the lookup fails.
best_model_name = report.iloc[0]['Model Name']
Best_model = models[best_model_name]
with open('Best_model', 'wb') as f:
    pickle.dump(Best_model, f)

In [None]:
# Recreate the split used inside evaluate_models (same data, test_size and
# random_state) so X_test / y_test match the held-out set of the saved models.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size = 0.2, random_state = 42)

In [None]:
# Load the tuned random-forest model from disk.
# The context manager closes the file handle, which a bare open() passed
# straight to pickle.load would leave open.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('/content/saved_models/RandomForestClassifier_best_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
model


In [None]:
# Predict on the held-out split with the reloaded model and print its accuracy.
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))


0.9338155515370705
