Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import joblib as jb
from sklearn.metrics import accuracy_score

Model Training

IMPORTANT NOTE :

In [None]:
# 1. while model training, I have tuned the hyperparameters of each model such that the train-test accuracies 
#    are not only high but also approximately equal.
# 2. This ensured that the models did not overfit the training data while still maintaining good predictive 
#    performance on unseen samples. 
# => Therefore the goal/focus is to maximize generalization rather than memorization.

In [3]:
# importing dataset and class weights required for model training ->

X_train = pd.read_csv("../data/processed/X_train.csv")
X_test = pd.read_csv("../data/processed/X_test.csv")
y_train = pd.read_csv("../data/processed/y_train.csv").squeeze() # squeeze method is used to convert df -> 1D array
y_test = pd.read_csv("../data/processed/y_test.csv").squeeze()

class_weights = {
    0: 0.4475609756097561,
    1: 0.6924528301886792,
    3: 1.7069767441860466,
    2: 1.7069767441860466,
    4: 6.672727272727273
}

Logistic Regression Classifier

In [4]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(
    class_weight=class_weights,
    max_iter=1000,
    solver='lbfgs',
)

lr_model.fit(X_train,y_train)

y_pred_lr = lr_model.predict(X_test)

print("first 10 predictions :",y_pred_lr[:10])

print(f"train accuracy : {lr_model.score(X_train,y_train)*100:.2f}%")
accuracy_lr = accuracy_score(y_test,y_pred_lr)
print(f"test accuracy : {accuracy_lr*100:.2f}%")

first 10 predictions : [1 3 1 4 0 0 2 4 0 0]
train accuracy : 57.36%
test accuracy : 53.80%


KNN classifier

In [5]:
# knn-classifier's performance depends on the values of k and the type of distance we are choosing ->
# hence we will do enhancement of the model to select the best option ->

# these are the parameters which we want to assign the best values to ->
from sklearn.neighbors import KNeighborsClassifier

best_k = None
best_distance = None
best_accuracy_knn = 0
best_knn_model = None

# example execution of Grid Search for calculation of best hyperparameters ->
for dist in ['manhattan','euclidean']: # possible distance metrics 
    for k in [3,5,7,11,13,15,17,19,21,23,25,27]: # possible k values
        knn_model = KNeighborsClassifier(
            n_neighbors=k,
            weights='uniform',
            metric=dist
        )
        knn_model.fit(X_train,y_train)
        y_pred_knn = knn_model.predict(X_test)
        accuracy_knn = accuracy_score(y_test,y_pred_knn)
        if(accuracy_knn>best_accuracy_knn):
            best_accuracy_knn=accuracy_knn
            best_k = k
            best_distance = dist
            best_knn_model = knn_model

print(f"best k value : {best_k}")
print(f"best distance metric : {best_distance}")
print(f"train accuracy : {best_knn_model.score(X_train,y_train)*100:.2f}%")
print(f"test accuracy : {best_accuracy_knn*100:.2f}%")

best k value : 19
best distance metric : manhattan
train accuracy : 58.99%
test accuracy : 59.24%


Analysis of Entropy of the Dataset

In [123]:
import math
# Entropy is the measure of impurity of a dataset 
# Entropy of the dataset -> Summation{pi*log2(pi)}
y_train.value_counts() # we find the number of samples to each class
counts = [328,212,86,86,22]
total = sum(counts)

# computing probabilities ->
probs = [count/total for count in counts]

# compute entropy ->
entropy = -sum(p * math.log2(p) for p in probs if p>0)

print(f"max entropy possible : {math.log2(5):.4f}")
print(f"entropy of the dataset : {entropy:.4f}")
print(f"purity of the dataset : {(1-(entropy/math.log2(5)))*100:.2f}%")

max entropy possible : 2.3219
entropy of the dataset : 1.9133
purity of the dataset : 17.60%


Decision Tree Classifier

In [124]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(
    criterion='entropy',# the model with choose Information Gain to decide best feature and threshold for each split
    class_weight=class_weights,
    max_depth=3,# the model will choose the best 3 attributes in terms of IG for growing the tree
    random_state=42
)

# note :  as stated earlier, I have calculated max_depth hyperparameter of the decisionTreeClassifier by manual search 
#         to minimize difference in train-test accuracy and at the same time keep them high for generalization

dt_model.fit(X_train,y_train)

y_pred_dt = dt_model.predict(X_test)

print("first 10 predictions :",y_pred_dt[:10])
print(f"train accuracy : {dt_model.score(X_train,y_train)*100:.2f}%",)
accuracy_dt = accuracy_score(y_test,y_pred_dt)
print(f"test accuracy : {accuracy_dt*100:.2f}%")

first 10 predictions : [1 1 0 4 0 0 1 1 0 1]
train accuracy : 55.45%
test accuracy : 55.98%


Random Forest Classifier

In [5]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    criterion='entropy',
    n_estimators=11,# 
    max_depth=3,
    class_weight=class_weights,
    random_state=42
)

rf_model.fit(X_train,y_train)

y_pred_rf = rf_model.predict(X_test)

print("first 10 predictions :",y_pred_rf[:10])
print(f"train accuracy : {rf_model.score(X_train,y_train)*100:.2f}%",)
accuracy_rf = accuracy_score(y_test,y_pred_rf)
print(f"test accuracy : {accuracy_rf*100:.2f}%")

first 10 predictions : [1 3 3 4 0 0 0 4 0 1]
train accuracy : 59.95%
test accuracy : 59.24%


Gradient Boosting Classifier

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(
    n_estimators=14,
    learning_rate=0.05,
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=0.9,
    random_state=42
)

gb_model.fit(X_train, y_train)

y_pred_gb = gb_model.predict(X_test)

train_acc = gb_model.score(X_train, y_train)
test_acc = accuracy_score(y_test, y_pred_gb)

print(f"Train Accuracy: {train_acc*100:.2f}%")
print(f"Test Accuracy: {test_acc*100:.2f}%")

Train Accuracy: 62.67%
Test Accuracy: 63.59%


Support Vector Machines

In [127]:
from sklearn.svm import SVC
svm_model = SVC(
    kernel='rbf',         # RBF kernel (non-linear)
    C=1.0,                # Regularization strength
    gamma='scale',        # Auto gamma for RBF
    decision_function_shape='ovo',  # one-vs-one (for multi-class)
    random_state=42
)

# Train
svm_model.fit(X_train, y_train)

# Predict
y_pred_svm = svm_model.predict(X_test)

# Evaluate
train_acc = svm_model.score(X_train, y_train)
test_acc = accuracy_score(y_test, y_pred_svm)

print(f"Train Accuracy: {train_acc*100:.2f}%")
print(f"Test Accuracy: {test_acc*100:.2f}%")

Train Accuracy: 59.81%
Test Accuracy: 59.78%


In [None]:
import joblib as jb
# saving the models using joblib for integration and deployment ->

jb.dump(lr_model,"../models/lr_model.pkl")
jb.dump(best_knn_model,"../models/knn_model.pkl")
jb.dump(dt_model,"../models/dt_model.pkl")
jb.dump(rf_model,"../models/rf_model.pkl")
jb.dump(gb_model,"../models/gb_model.pkl")
jb.dump(svm_model,"../models/svm_model.pkl")