Importing Libraries

In [1]:
from pathlib import Path

import joblib as jb
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV

Model Training

In [None]:
# NOTE : In medical screening tasks like heart disease prediction, false negatives are far more dangerous than false
#        positives. False negative -> model predicts 'no disease' but the patient has the disease -> problematic
#                   False positive -> model predicts 'disease' when the patient is healthy
# THEREFORE, it is better to slightly over-predict (higher recall, lower precision) than to miss severe cases

In [4]:
# Load the pre-split feature matrices / target vectors and define the class weights
# used for model training ->

X_train = pd.read_csv("../data/processed/X_train.csv")
X_test = pd.read_csv("../data/processed/X_test.csv")
# squeeze("columns") converts the single-column DataFrame into a 1-D Series and, unlike a
# bare squeeze(), never collapses a one-row file all the way down to a scalar.
y_train = pd.read_csv("../data/processed/y_train.csv").squeeze("columns")
y_test = pd.read_csv("../data/processed/y_test.csv").squeeze("columns")

# Per-class weights handed to every estimator below that accepts `class_weight`;
# a larger weight gives samples of that class more influence during training.
# NOTE(review): the values look like precomputed "balanced" weights
# (n_samples / (n_classes * class_count)) — confirm against the preprocessing step.
class_weights = {
    0: 0.4475609756097561,
    1: 0.6924528301886792,
    2: 1.7069767441860466,
    3: 1.7069767441860466,
    4: 6.672727272727273,
}

Logistic Regression Classifier

In [3]:
from sklearn.linear_model import LogisticRegression

# Base estimator: class-weighted logistic regression; solver and C are tuned by the grid search.
lr_model = LogisticRegression(
    class_weight=class_weights,
    max_iter=1000,  # upper bound on the number of optimizer iterations
)

# Candidate hyperparameters.
# NOTE(review): recent scikit-learn releases deprecate 'liblinear' for multiclass targets —
# confirm against the installed version before relying on that candidate.
param_grid = {
    'solver': ['lbfgs', 'liblinear'],  # optimization algorithm used to fit the coefficients
    'C': [0.01, 0.1, 1, 5, 10],  # inverse regularization strength: smaller C -> stronger regularization
}

grid_search = GridSearchCV(
    estimator=lr_model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    # recall_macro = unweighted mean of the per-class recalls, so minority classes
    # count as much as the majority class (0) -> suited to imbalanced multi-class data
    scoring='recall_macro',
    n_jobs=-1,  # use all cores of the processor
    verbose=2,  # print progress messages while computing
)

grid_search.fit(X_train, y_train)  # evaluates every candidate, then refits the best one

print("best parameters : ", grid_search.best_params_)
print(f"best score : {grid_search.best_score_*100:.2f}%")

lr_model = grid_search.best_estimator_
y_pred_lr = lr_model.predict(X_test)

print("first 10 predictions :", y_pred_lr[:10])
# Support-weighted recall is numerically identical to plain accuracy, so it is dominated
# by the majority class and is NOT comparable to the cross-validated score above.
print(f"recall score : {recall_score(y_test,y_pred_lr,average='weighted')*100:.2f}%")
# Macro recall is the metric the grid search optimized -> compare this one with "best score".
print(f"macro recall score : {recall_score(y_test,y_pred_lr,average='macro')*100:.2f}%")
# optional refinement -> to focus on how well patients are detected, restrict recall to the
# diseased classes only, e.g. recall_score(y_test, y_pred_lr, labels=[1, 2, 3, 4], average='macro'),
# which ignores the healthy class (0) entirely

Fitting 5 folds for each of 10 candidates, totalling 50 fits
best parameters :  {'C': 1, 'solver': 'lbfgs'}
best score : 42.79%
first 10 predictions : [1 3 1 4 0 1 2 4 0 0]
recall score : 53.26%


KNN classifier

In [5]:
# a KNN classifier's performance depends on the value of k, the distance metric and the neighbour weighting ->
# hence we tune these hyperparameters with a grid search to select the best combination ->

# the grid below lists the candidate values for each of these hyperparameters ->
from sklearn.neighbors import KNeighborsClassifier

# Candidate hyperparameters for KNN. No class weights are passed to this estimator.
param_grid = {
    'metric': ['euclidean', 'manhattan'],  # distance function between samples
    'n_neighbors': [5, 7, 9, 11, 13, 15, 17, 19],  # k: number of neighbours that vote
    'weights': ['uniform', 'distance'],  # equal votes vs. votes weighted by inverse distance
}

grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
    scoring='recall_macro',  # focus on balanced (per-class averaged) recall
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X_train, y_train)

print("best parameters : ", grid_search.best_params_)
print(f"best score : {grid_search.best_score_*100:.2f}%")

knn_model = grid_search.best_estimator_
y_pred_knn = knn_model.predict(X_test)
print("first 10 predictions : ", y_pred_knn[:10])
# Support-weighted recall equals plain accuracy; it is kept for reference only.
print(f"recall score : {recall_score(y_test,y_pred_knn,average='weighted')*100:.2f}%")
# Macro recall matches the search's scoring metric, so it is comparable to "best score".
print(f"macro recall score : {recall_score(y_test,y_pred_knn,average='macro')*100:.2f}%")

Fitting 5 folds for each of 32 candidates, totalling 160 fits
best parameters :  {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
best score : 35.71%
first 10 predictions :  [1 2 1 3 0 0 0 3 0 1]
recall score : 60.33%


Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Base estimator: class-weighted decision tree with a fixed seed for reproducible splits.
dt_model = DecisionTreeClassifier(
    criterion='entropy',  # choose each split's feature/threshold by information gain
    class_weight=class_weights,
    random_state=42,
)

param_grid = {
    # maximum depth of the tree (levels from root to deepest leaf), not a count of splits
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 11, 13],
}

grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    cv=5,
    scoring='recall_macro',
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X_train, y_train)
print("best parameters : ", grid_search.best_params_)
print(f"best score : {grid_search.best_score_*100:.2f}%")

dt_model = grid_search.best_estimator_

y_pred_dt = dt_model.predict(X_test)
print(y_pred_dt[:10])
# Support-weighted recall equals plain accuracy; it is kept for reference only.
print(f"recall score : {recall_score(y_test,y_pred_dt,average='weighted')*100:.2f}%")
# Macro recall matches the search's scoring metric, so it is comparable to "best score".
print(f"macro recall score : {recall_score(y_test,y_pred_dt,average='macro')*100:.2f}%")

Fitting 5 folds for each of 9 candidates, totalling 45 fits
best parameters :  {'max_depth': 9}
best score : 34.38%
[1 3 3 3 0 0 1 2 0 0]
recall score : 49.46%


Random Forest Classifier

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Base estimator: class-weighted random forest with a fixed seed for reproducibility.
rf_model = RandomForestClassifier(
    criterion='entropy',
    class_weight=class_weights,
    random_state=42,
)

param_grid = {
    'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80],  # number of trees in the forest
    # maximum depth allowed for each tree (levels, not a count of splits)
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
}

grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='recall_macro',
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X_train, y_train)
print("best parameters : ", grid_search.best_params_)
print(f"best score : {grid_search.best_score_*100:.2f}%")

rf_model = grid_search.best_estimator_

y_pred_rf = rf_model.predict(X_test)
print("first 10 predictions : ", y_pred_rf[:10])
# Support-weighted recall equals plain accuracy; it is kept for reference only.
print(f"recall score : {recall_score(y_test,y_pred_rf,average='weighted')*100:.2f}%")
# Macro recall matches the search's scoring metric, so it is comparable to "best score".
print(f"macro recall score : {recall_score(y_test,y_pred_rf,average='macro')*100:.2f}%")

Fitting 5 folds for each of 88 candidates, totalling 440 fits
best parameters :  {'max_depth': 3, 'n_estimators': 70}
best score : 40.63%
first 10 predictions :  [1 3 3 4 0 0 1 4 0 1]
recall score : 55.98%


Gradient Boosting Classifier

In [5]:
from sklearn.ensemble import GradientBoostingClassifier

param_grid = {
    'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90],  # number of boosting stages
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],  # maximum depth of each individual tree
}

# Base estimator: gradient boosting with a fixed learning rate and row subsampling.
# No class weights are passed to this estimator.
gb_model = GradientBoostingClassifier(
    learning_rate=0.05,
    subsample=0.9,  # each stage is fit on 90% of the training rows
    random_state=42,
)

grid_search = GridSearchCV(
    estimator=gb_model,
    param_grid=param_grid,
    cv=5,
    scoring='recall_macro',
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X_train, y_train)

print("best parameters : ", grid_search.best_params_)
print(f"best score : {grid_search.best_score_*100:.2f}%")

gb_model = grid_search.best_estimator_

y_pred_gb = gb_model.predict(X_test)

print("first 10 predictions : ", y_pred_gb[:10])
# recall_score expects (y_true, y_pred). The arguments were previously swapped, which only
# went unnoticed because support-weighted recall (== accuracy) is symmetric in its inputs.
print(f"recall score : {recall_score(y_test,y_pred_gb,average='weighted')*100:.2f}%")
# Macro recall matches the search's scoring metric, so it is comparable to "best score".
print(f"macro recall score : {recall_score(y_test,y_pred_gb,average='macro')*100:.2f}%")

Fitting 5 folds for each of 99 candidates, totalling 495 fits
best parameters :  {'max_depth': 5, 'n_estimators': 90}
best score : 36.85%
first 10 predictions :  [1 1 3 3 0 0 0 3 0 1]
recall score : 60.33%


Extreme Gradient Boosting Classifier

In [6]:
from xgboost import XGBClassifier

# Base estimator: XGBoost with row subsampling and a fixed seed; the remaining
# hyperparameters are tuned by the grid search. No class weights are passed here.
xgb_model = XGBClassifier(
    subsample=0.9,
    random_state=42,
)

param_grid = {
    'n_estimators': [20, 30, 40, 50, 60, 70, 80, 90],  # number of boosting rounds
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],  # maximum depth of each tree
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],  # shrinkage applied to each round's update
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='recall_macro',
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X_train, y_train)

print("best parameters : ", grid_search.best_params_)
print(f"best score : {grid_search.best_score_*100:.2f}%")

xgb_model = grid_search.best_estimator_

y_pred_xgb = xgb_model.predict(X_test)

print("first 10 predictions : ", y_pred_xgb[:10])
# Support-weighted recall equals plain accuracy; it is kept for reference only.
print(f"recall score : {recall_score(y_test,y_pred_xgb,average='weighted')*100:.2f}%")
# Macro recall matches the search's scoring metric, so it is comparable to "best score".
print(f"macro recall score : {recall_score(y_test,y_pred_xgb,average='macro')*100:.2f}%")

Fitting 5 folds for each of 440 candidates, totalling 2200 fits
best parameters :  {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 80}
best score : 38.04%
first 10 predictions :  [1 2 3 3 0 0 0 3 0 1]
recall score : 61.41%


Naive Bayes Classifier

In [6]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes is fit directly with its default settings (no grid search here).
nb_model = GaussianNB()

nb_model.fit(X_train, y_train)

y_pred_nb = nb_model.predict(X_test)

print("first 10 predictions : ", y_pred_nb[:10])
# Support-weighted recall equals plain accuracy; it is kept for reference only.
print(f"recall score : {recall_score(y_test,y_pred_nb,average='weighted')*100:.2f}%")
# Macro recall weights every class equally, matching the metric used to tune the other models.
print(f"macro recall score : {recall_score(y_test,y_pred_nb,average='macro')*100:.2f}%")

first 10 predictions :  [1 2 1 3 0 1 0 3 0 1]
recall score : 55.98%


In [None]:
import joblib as jb
# saving the models using joblib for integration and deployment ->

# Create the output directory up front so the dumps do not fail on a fresh checkout.
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

# File stem -> fitted estimator; each one is written to ../models/<stem>.pkl
trained_models = {
    "lr_model": lr_model,
    "knn_model": knn_model,
    "dt_model": dt_model,
    "rf_model": rf_model,
    "gb_model": gb_model,
    "xgb_model": xgb_model,
    "nb_model": nb_model,
}

for model_name, fitted_model in trained_models.items():
    jb.dump(fitted_model, models_dir / f"{model_name}.pkl")