##### Hyperparameter Tuning

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [37]:
# Read the already-cleaned student depression data from the CSV in the
# working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='cleaned_student_depression.csv')


In [38]:
# Separate the target from the predictors:
#   y -> the 'Depression' column (dependent variable)
#   X -> every other column (independent variables)
y = df['Depression']
X = df.drop('Depression', axis=1)
 

In [39]:
# Hold out 20% of the rows as a test set, stratified on the target so the
# train and test splits keep the class proportions of y.
# No separate validation set is carved out: model selection is done with
# k-fold cross-validation on the training portion instead.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



In [40]:
# Column groups
# Categorical columns without an inherent order: one-hot encoded.
onehot_cols = ['Gender', 'Profession', 'Degree', 'Suicidal_Thoughts', 'Mental_Illness_History']
# Categorical columns with a natural order: mapped to integer ranks.
ordinal_cols = ['Sleep_Duration', 'Dietary_Habits']

# Category orders for the ordinal features, listed from lowest to highest.
# The outer list is positional: entry i describes ordinal_cols[i].
ordinal_categories = [
    ['Less than 5 hours', '5-6 hours', '7-8 hours', 'More than 8 hours'],  # Sleep_Duration
    ['Unhealthy', 'Moderate', 'Healthy']                                    # Dietary_Habits
]

# Encode the two column groups; every column not listed above is passed
# through unchanged (remainder='passthrough').
# NOTE(review): with drop='first' and handle_unknown='ignore', a category not
# seen during fit is encoded as all zeros, the same as the dropped first
# category -- confirm that is acceptable for this data.
encoder = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), onehot_cols),
        ('ordinal', OrdinalEncoder(categories=ordinal_categories), ordinal_cols)
    ],
    remainder='passthrough'
)



In [41]:
# Build the base model pipeline and the cross-validation splitter.
# Nothing is fitted or scored in this cell.

# Pipeline steps, in order: categorical encoding -> standardization of the
# encoder's full output -> logistic regression. The step name 'model' is the
# prefix used for the `model__*` keys in the hyperparameter grids.
pipeline = Pipeline([
    ('preprocessing', encoder),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=10000, random_state=42))
])

# Stratified k-fold cross-validator with k=5; shuffling is seeded so the
# folds are reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)



In [42]:
# Hyperparameter search space, expressed as three sub-grids so that every
# penalty is only paired with a solver it is listed with here.
# Candidates: 5*2 (liblinear) + 6*2 (saga, l1/l2) + 5*5 (saga, elasticnet) = 47.
param_grid = [
    # liblinear with L1 or L2 regularization
    dict(
        model__C=[0.01, 0.1, 1, 10, 100],
        model__penalty=["l1", "l2"],
        model__solver=["liblinear"],
    ),
    # saga with L1 or L2 regularization (C range extended down to 0.001)
    dict(
        model__C=[0.001, 0.01, 0.1, 1, 10, 100],
        model__penalty=["l1", "l2"],
        model__solver=["saga"],
    ),
    # saga with elastic-net; l1_ratio sets the L1/L2 mix
    dict(
        model__C=[0.001, 0.01, 0.1, 1, 10],
        model__penalty=["elasticnet"],
        model__solver=["saga"],
        model__l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
    ),
]


In [43]:

# Exhaustive search over param_grid. Each candidate is scored with F1 using
# the skf cross-validator; training-fold scores are recorded as well.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    return_train_score=True,
    verbose=1,
    n_jobs=-1,
)

In [44]:
# Run the grid search on the training split only; the test split stays untouched.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 47 candidates, totalling 235 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"[{'model__C': [0.01, 0.1, ...], 'model__penalty': ['l1', 'l2'], 'model__solver': ['liblinear']}, {'model__C': [0.001, 0.01, ...], 'model__penalty': ['l1', 'l2'], 'model__solver': ['saga']}, ...]"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('onehot', ...), ('ordinal', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['Less than 5 hours', '5-6 hours', ...], ['Unhealthy', 'Moderate', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.001
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'saga'
,max_iter,10000


In [45]:
# Best parameters and score.
# best_score_ is the mean F1 over the held-out cross-validation folds for the
# best candidate -- it is not a score computed on the training set, so the
# printed label says so explicitly.
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Mean Cross-Validated F1 Score:", grid_search.best_score_)

Best Hyperparameters: {'model__C': 0.001, 'model__penalty': 'l2', 'model__solver': 'saga'}
Best F1 Score on Training Set: 0.8707007127511769


In [46]:
## Comparison with the F1 scores from the cross-validation done on the base model:
###                 F1 score of each fold: [0.87244035 0.87469287 0.87224505 0.86523475 0.86318173]
###                       Average F1 score: 0.8695589513910111 -- base model
###                       Best mean cross-validated F1 score: 0.8707007127511769 -- model after hyperparameter tuning
## There is a slight improvement (about 0.001) compared to the previous one.
## Note that this gain is smaller than the fold-to-fold spread of the base model (0.863 to 0.875), so it is marginal.


In [47]:
### Refine the search around the best hyperparameters found so far.
### Solver and penalty are pinned to the previous winner (saga + l2); only C
### varies, on a finer grid around the previous best value of 0.001.
fine_tuned_param_grid = dict(
    model__C=[0.0001, 0.0005, 0.001, 0.005, 0.01],
    model__penalty=['l2'],
    model__solver=['saga'],
)

In [48]:
# Second grid search, configured like the first one (same pipeline, same
# cross-validator, F1 scoring) but restricted to the narrowed C grid.
fine_tune_search = GridSearchCV(
    estimator=pipeline,
    param_grid=fine_tuned_param_grid,
    scoring='f1',
    cv=skf,
    return_train_score=True,
    verbose=1,
    n_jobs=-1,
)

In [49]:

# Run the refined search, again on the training split only.
fine_tune_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__C': [0.0001, 0.0005, ...], 'model__penalty': ['l2'], 'model__solver': ['saga']}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('onehot', ...), ('ordinal', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['Less than 5 hours', '5-6 hours', ...], ['Unhealthy', 'Moderate', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.0005
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'saga'
,max_iter,10000


In [50]:
# Report the winner of the refined search. `!s` formats each value with
# str(), which is what print() applies to its arguments.
print(f"1st iterative Fine-Tuned Best Parameters: {fine_tune_search.best_params_!s}")
print(f"1st iterative Fine-Tuned Best F1 Score: {fine_tune_search.best_score_!s}")

1st iterative Fine-Tuned Best Parameters: {'model__C': 0.0005, 'model__penalty': 'l2', 'model__solver': 'saga'}
1st iterative Fine-Tuned Best F1 Score: 0.8711489178430754


In [None]:
### There is a slight improvement, and the finer grid narrowed the best C value from 0.001 down to 0.0005.