# Model Trainer

In [None]:
import warnings
# Install a blanket 'ignore' filter so library warnings do not clutter cell output.
warnings.simplefilter('ignore')

import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
# from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix

# catboost is treated as optional and imported last: an unguarded import raised
# ModuleNotFoundError when the package was missing, which aborted this cell
# before the remaining imports could run.
try:
    from catboost import CatBoostClassifier
except ImportError:
    # Keep the name defined so the notebook still runs; any code that wants
    # CatBoost must check for None (or install catboost) before using it.
    CatBoostClassifier = None

ModuleNotFoundError: No module named 'catboost'

In [None]:
# Load the heart dataset from its CSV file; the path is relative to the
# notebook's working directory.
df = pd.read_csv("../data/heart.csv")

## Data Splitting

In [None]:
# Separate the label column from the predictors: `y` is the 'target' column,
# `X` is every other column of the frame.
y = df['target']
X = df.drop(columns=['target'])

In [None]:
# Display the feature frame (all columns of df except 'target').
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2


In [None]:
# Display the 'target' label column.
y

0       0
1       0
2       0
3       0
4       0
       ..
1020    1
1021    0
1022    0
1023    1
1024    0
Name: target, Length: 1025, dtype: int64

In [None]:
# Train/test split: hold out 20% of the rows for testing. The fixed
# random_state makes the split repeatable across runs.
_split_parts = train_test_split(X, y, random_state=42, test_size=0.20)
X_train, X_test, y_train, y_test = _split_parts

In [None]:
# Sanity-check the split: print the shape of each full set followed by the
# shapes of its train and test parts.
for _label, _data in (
    ("X", X),
    ("X_train", X_train),
    ("X_test", X_test),
    ("y", y),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{_label} shape : ", _data.shape)

X shape :  (1025, 13)
X_train shape :  (820, 13)
X_test shape :  (205, 13)
y shape :  (1025,)
y_train shape :  (820,)
y_test shape :  (205,)


## Data Normalization

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test split does not influence them.
# NOTE: X_train / X_test are overwritten with their scaled versions, so
# re-running this cell on its own would refit the scaler on already-scaled data.
scaler = StandardScaler()
scaler.fit(X_train)
X_test = scaler.transform(X_test)
X_train = scaler.transform(X_train)

In [None]:
# Inspect the scaled training features produced by the scaler.
X_train

array([[-0.58584022,  0.65465367,  1.008275  , ...,  1.00526437,
         2.17169136, -0.54519316],
       [ 1.05147737, -1.52752523, -0.91672034, ...,  1.00526437,
        -0.7254674 , -0.54519316],
       [-0.04006769, -1.52752523,  1.008275  , ...,  1.00526437,
        -0.7254674 , -0.54519316],
       ...,
       [-0.36753121,  0.65465367, -0.91672034, ...,  1.00526437,
        -0.7254674 ,  1.11057867],
       [-1.24076726,  0.65465367, -0.91672034, ...,  1.00526437,
        -0.7254674 ,  1.11057867],
       [-0.2583767 ,  0.65465367, -0.91672034, ...,  1.00526437,
         0.24025219, -0.54519316]])

In [None]:
# Inspect the test features after applying the scaler fitted on the training split.
X_test

array([[ 0.83316836, -1.52752523, -0.91672034, ...,  1.00526437,
        -0.7254674 , -0.54519316],
       [-0.1492222 , -1.52752523,  1.008275  , ...,  1.00526437,
        -0.7254674 , -3.85673683],
       [ 0.06908682,  0.65465367, -0.91672034, ..., -0.64007851,
         0.24025219,  1.11057867],
       ...,
       [-1.13161275,  0.65465367,  1.008275  , ...,  1.00526437,
        -0.7254674 , -0.54519316],
       [ 0.72401385,  0.65465367, -0.91672034, ...,  1.00526437,
         0.24025219,  1.11057867],
       [ 0.39655033,  0.65465367,  1.008275  , ..., -0.64007851,
         0.24025219,  1.11057867]])

## Defining the Models

In [None]:
# Candidate classifiers keyed by display name. Insertion order is kept, so the
# evaluation loop reports them in the order listed here.
models = dict([
    ('Logistic Regression', LogisticRegression()),
    ('Naive Bayes', GaussianNB()),
    (
        'Random Forest Classifier',
        RandomForestClassifier(n_estimators=20, random_state=12, max_depth=5),
    ),
    ('K Nearest Neighbors', KNeighborsClassifier(n_neighbors=10)),
    (
        'Decision Tree',
        DecisionTreeClassifier(criterion='entropy', random_state=0, max_depth=6),
    ),
    ('Support Vector Machine', SVC(kernel='rbf', C=2)),
])


In [None]:
# Per-model results. The five lists are index-aligned: position i in each
# list belongs to the i-th model that was evaluated.
model_list, accuracy_list = [], []
precision_list, recall_list, f1_list = [], [], []

for model_name, model in models.items():
    # Fit on the training split and predict the held-out test split.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Score the predictions against the true test labels.
    test_accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1_sc = f1_score(y_test, y_pred)
    confusion_mt = confusion_matrix(y_test, y_pred)

    # Report the metrics for this model, one per line.
    print(
        f"Model Name : {model_name}",
        f"Testing Accuracy : {test_accuracy}",
        f"Precision : {precision}",
        f"Recall : {recall}",
        f"F1 Score : {f1_sc}",
        sep="\n",
    )
    print(f"Confusion Matrix : \n", confusion_mt)

    # Record this model's results in the aligned lists.
    for _bucket, _value in (
        (model_list, model_name),
        (accuracy_list, test_accuracy),
        (precision_list, precision),
        (recall_list, recall),
        (f1_list, f1_sc),
    ):
        _bucket.append(_value)

# Pick the model with the highest test accuracy; on a tie the first one
# evaluated wins because list.index returns the first match.
_best_accuracy = max(accuracy_list)
max_accuracy_index = accuracy_list.index(_best_accuracy)
print(f"The best model based on accuracy is {model_list[max_accuracy_index]} with Testing accuracy : {accuracy_list[max_accuracy_index]} ")


Model Name : Logistic Regression
Testing Accuracy : 0.7951219512195122
Precision : 0.7563025210084033
Recall : 0.8737864077669902
F1 Score : 0.8108108108108109
Confusion Matrix : 
 [[73 29]
 [13 90]]
Model Name : Naive Bayes
Testing Accuracy : 0.8
Precision : 0.7540983606557377
Recall : 0.8932038834951457
F1 Score : 0.8177777777777778
Confusion Matrix : 
 [[72 30]
 [11 92]]
Model Name : Random Forest Classifier
Testing Accuracy : 0.8829268292682927
Precision : 0.8319327731092437
Recall : 0.9611650485436893
F1 Score : 0.8918918918918919
Confusion Matrix : 
 [[82 20]
 [ 4 99]]
Model Name : K Nearest Neighbors
Testing Accuracy : 0.8390243902439024
Precision : 0.8431372549019608
Recall : 0.8349514563106796
F1 Score : 0.8390243902439024
Confusion Matrix : 
 [[86 16]
 [17 86]]
Model Name : Decision Tree
Testing Accuracy : 0.8731707317073171
Precision : 0.8235294117647058
Recall : 0.9514563106796117
F1 Score : 0.8828828828828829
Confusion Matrix : 
 [[81 21]
 [ 5 98]]
Model Name : Support Vec