In [None]:
# since the problem involves multi-class classification, we will train the following classification models ->
# logistic regression
# KNN classifier
# decision trees

# the goal is to see which model performs best when evaluated on the testing data

Importing Libraries

In [1]:
from pathlib import Path

import joblib as jb
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

Importing Processed Data

In [2]:
# importing dataset and class weights required for model training ->

X_train = pd.read_csv("../data/processed/X_train.csv")
X_test = pd.read_csv("../data/processed/X_test.csv")

# squeeze("columns") converts a single-column df -> 1D Series. Unlike a bare
# squeeze(), it only drops the column axis, so a label file that happens to
# contain a single row still yields a Series instead of a scalar.
y_train = pd.read_csv("../data/processed/y_train.csv").squeeze("columns")
y_test = pd.read_csv("../data/processed/y_test.csv").squeeze("columns")

# fail early, with a readable message, if features and labels are misaligned
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"
assert len(X_test) == len(y_test), "X_test and y_test have different row counts"

# per-class weights handed to the models below through their `class_weight` argument
# NOTE(review): the values look like the "balanced" heuristic
# n_samples / (n_classes * class_count) -- TODO confirm where they were computed
class_weights = {
    0: 0.4475609756097561,
    1: 0.6924528301886792,
    2: 1.7069767441860466,
    3: 1.7069767441860466,
    4: 6.672727272727273
}

Model Training

In [3]:
# LOGISTIC REGRESSION FOR MULTI-CLASS CLASSIFICATION ->
# the class weights are passed in so that errors on the classes with larger
# weights cost more during fitting; max_iter is raised to 1000 for the lbfgs solver

lr_params = dict(
    solver='lbfgs',
    max_iter=1000,
    class_weight=class_weights,
)

# fit() returns the fitted estimator itself, so construction and training can be chained
lr_model = LogisticRegression(**lr_params).fit(X_train, y_train)

# predictions on the held-out data, and accuracy on both splits
y_pred_lr = lr_model.predict(X_test)
train_accuracy_lr = lr_model.score(X_train, y_train)
accuracy_lr = accuracy_score(y_test, y_pred_lr)

print("first 10 predictions :", y_pred_lr[:10])
print(f"train accuracy : {train_accuracy_lr*100:.2f}%")
print(f"test accuracy : {accuracy_lr*100:.2f}%")

first 10 predictions : [1 3 1 4 0 0 2 4 0 0]
train accuracy : 57.36%
test accuracy : 53.80%


In [None]:
# K-NEAREST NEIGHBOURS CLASSIFIER FOR MULTI-CLASS CLASSIFICATION ->
# knn-classifier's performance depends on the value of k and on the distance metric,
# so both are tuned here before the model is evaluated ->
#
# The tuning is done with cross-validation on the TRAINING data only. Choosing k and
# the metric by their accuracy on the test set would leak the test set into model
# selection and make the reported test accuracy optimistically biased. The test set
# is therefore scored exactly once, with the already-selected model.

# candidate values for the two hyper-parameters
knn_param_grid = {
    'metric': ['manhattan','euclidean'], # possible distance metrics
    'n_neighbors': [3,5,7,11,13,15,17,19,21,23,25,27,30,31,33], # possible k values
}

knn_search = GridSearchCV(
    estimator=KNeighborsClassifier(weights='uniform'),
    param_grid=knn_param_grid,
    scoring='accuracy',
    cv=5,        # an integer cv uses stratified folds for classifiers
    refit=True,  # retrain the winning configuration on the full training set
)
knn_search.fit(X_train,y_train)

# these are the parameters which were selected as the best values ->
best_k = knn_search.best_params_['n_neighbors']
best_distance = knn_search.best_params_['metric']
best_knn_model = knn_search.best_estimator_

# single, final evaluation on the held-out test data
y_pred_knn = best_knn_model.predict(X_test)
best_accuracy_knn = accuracy_score(y_test,y_pred_knn)

print(f"best k value : {best_k}")
print(f"best distance metric : {best_distance}")
print(f"cross-validation accuracy : {knn_search.best_score_*100:.2f}%")
print(f"train accuracy : {best_knn_model.score(X_train,y_train)*100:.2f}%")
print(f"test accuracy : {best_accuracy_knn*100:.2f}%")

In [None]:
# DECISION TREE CLASSIFIER FOR MULTI-CLASS CLASSIFICATION ->
# an entropy-based tree limited to depth 3, trained with the class weights;
# random_state is fixed so that re-running the cell gives the same tree

dt_params = dict(
    criterion='entropy',
    max_depth=3,
    class_weight=class_weights,
    random_state=42,
)

# fit() returns the fitted estimator itself, so construction and training can be chained
dt_model = DecisionTreeClassifier(**dt_params).fit(X_train, y_train)

# predictions on the held-out data, and accuracy on both splits
y_pred_dt = dt_model.predict(X_test)
train_accuracy_dt = dt_model.score(X_train, y_train)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

print("first 10 predictions :", y_pred_dt[:10])
print(f"train accuracy : {train_accuracy_dt*100:.2f}%")
print(f"test accuracy : {accuracy_dt*100:.2f}%")

In [None]:
# saving the models using joblib for integration and deployment ->
# (joblib and Path are imported in the imports cell at the top of the notebook)

MODELS_DIR = Path("../models")

# create the target folder first: jb.dump does not create missing directories,
# so on a fresh checkout without ../models the dump would fail
MODELS_DIR.mkdir(parents=True, exist_ok=True)

jb.dump(lr_model, MODELS_DIR / "lr_model.pkl")
jb.dump(best_knn_model, MODELS_DIR / "knn_model.pkl")
jb.dump(dt_model, MODELS_DIR / "dt_model.pkl");