In [110]:
#Basic Import
import pandas as pd
import numpy as np

#Importing the Scaler for numerical features
from sklearn.preprocessing import RobustScaler


#Importing Train Test Split and GridSearchCV for hyperparamater tuning
from sklearn.model_selection import train_test_split, GridSearchCV

#Importing various models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

#Importing model evaluation metrics
from sklearn.metrics import accuracy_score, classification_report

from sklearn.model_selection import GridSearchCV


In [111]:
#Reading the cleaned dataset
df=pd.read_csv('data/heart_without_duplicate.csv')  

In [112]:
#Display first 5 rows of dataset
df.head()

Unnamed: 0.1,Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [113]:
#Removing the first column (unamed column)
df = df.iloc[: , 1:]
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [114]:
#Creating a copy of the dataframe
df1 = df

In [115]:
#Creating a list of categorical and numberical features
categorcial_features = ['sex','exng','caa','cp','fbs','restecg','slp','thall']
numerical_features = ["age","trtbps","chol","thalachh","oldpeak"]

In [116]:
#Creating a dataframe with the Independent variables
X = df1.drop(columns=['output'],axis=1)
X.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [117]:
#Display the Independent variables
X.columns

Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'oldpeak', 'slp', 'caa', 'thall'],
      dtype='object')

In [118]:
#Create a dataframe with the Dependent variable
Y = df1[['output']]
Y.head()

Unnamed: 0,output
0,1
1,1
2,1
3,1
4,1


In [119]:
#Checking the shape of Y
Y.shape

(302, 1)

The categorical variables are already label encoded in the dataset hence we will not require any further encoding

In [120]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2, random_state = 40)
print("The shape of X_train is      ", X_train.shape)
print("The shape of X_test is       ",X_test.shape)
print("The shape of y_train is      ",y_train.shape)
print("The shape of y_test is       ",y_test.shape)

The shape of X_train is       (241, 13)
The shape of X_test is        (61, 13)
The shape of y_train is       (241, 1)
The shape of y_test is        (61, 1)


In [121]:
#Instantiating the scaler for numerical variable
scaler = RobustScaler()

In [122]:
#Scaling the numerical features
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [123]:
#Define classifiers and their respective parameter grids for hyperparameter tuning
classifiers = {
    'Random Forest': (RandomForestClassifier(), {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }),
    'Gradient Boosting': (GradientBoostingClassifier(), {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5]
    }),
    'SVM': (SVC(), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }),
    'KNN': (KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance']
    }),
    'Logistic Regression': (LogisticRegression(), {
        'C': [0.1, 1, 10],
    }),
    'XGBoost': (XGBClassifier(), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5]
    }),
    'CatBoost': (CatBoostClassifier(), {
        'iterations': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'depth': [3, 4, 5]
    }),
    'AdaBoost': (AdaBoostClassifier(), {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2]
    }),
    'Decision Tree': (DecisionTreeClassifier(), {
        'max_depth': [None, 5, 10, 15],
        'min_samples_split': [2, 5, 10]
    })
}

#Hyperparameter tuning and evaluation for each classifier
for clf_name, (clf, param_grid) in classifiers.items():
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Print the best hyperparameters for the current classifier
    print(f'Best Hyperparameters for {clf_name}: {grid_search.best_params_}')

    # Make predictions on the test set
    y_pred = grid_search.predict(X_test)
    
    # Calculate accuracy score
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {clf_name}: {accuracy:.2f}')

    # Generate and print the classification report
    report = classification_report(y_test, y_pred)
    print(f'Classification Report for {clf_name}:\n{report}\n')

  return fit_method(estimator, *args, **kwargs)


Best Hyperparameters for Random Forest: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy for Random Forest: 0.89
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.88      0.85      0.86        26
           1       0.89      0.91      0.90        35

    accuracy                           0.89        61
   macro avg       0.88      0.88      0.88        61
weighted avg       0.89      0.89      0.88        61




  y = column_or_1d(y, warn=True)


Best Hyperparameters for Gradient Boosting: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 100}
Accuracy for Gradient Boosting: 0.84
Classification Report for Gradient Boosting:
              precision    recall  f1-score   support

           0       0.83      0.77      0.80        26
           1       0.84      0.89      0.86        35

    accuracy                           0.84        61
   macro avg       0.84      0.83      0.83        61
weighted avg       0.84      0.84      0.84        61




  y = column_or_1d(y, warn=True)
  return self._fit(X, y)


Best Hyperparameters for SVM: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy for SVM: 0.89
Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.88      0.85      0.86        26
           1       0.89      0.91      0.90        35

    accuracy                           0.89        61
   macro avg       0.88      0.88      0.88        61
weighted avg       0.89      0.89      0.88        61


Best Hyperparameters for KNN: {'n_neighbors': 5, 'weights': 'uniform'}
Accuracy for KNN: 0.92
Classification Report for KNN:
              precision    recall  f1-score   support

           0       0.96      0.85      0.90        26
           1       0.89      0.97      0.93        35

    accuracy                           0.92        61
   macro avg       0.93      0.91      0.91        61
weighted avg       0.92      0.92      0.92        61




  y = column_or_1d(y, warn=True)


Best Hyperparameters for Logistic Regression: {'C': 1}
Accuracy for Logistic Regression: 0.89
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.88      0.85      0.86        26
           1       0.89      0.91      0.90        35

    accuracy                           0.89        61
   macro avg       0.88      0.88      0.88        61
weighted avg       0.89      0.89      0.88        61


Best Hyperparameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}
Accuracy for XGBoost: 0.85
Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.84      0.81      0.82        26
           1       0.86      0.89      0.87        35

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.85      0.85      0.85        61


0:	learn: 0.6642611	total: 1.04ms	remaining: 51

  y = column_or_1d(y, warn=True)


Best Hyperparameters for AdaBoost: {'learning_rate': 0.1, 'n_estimators': 50}
Accuracy for AdaBoost: 0.90
Classification Report for AdaBoost:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88        26
           1       0.91      0.91      0.91        35

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61


Best Hyperparameters for Decision Tree: {'max_depth': 5, 'min_samples_split': 2}
Accuracy for Decision Tree: 0.80
Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.77      0.77      0.77        26
           1       0.83      0.83      0.83        35

    accuracy                           0.80        61
   macro avg       0.80      0.80      0.80        61
weighted avg       0.80      0.80      0.80        61




CatBoost has given the best accuracy (93%) out of all the models we have tried. 
Best Hyperparameters for CatBoost: {'depth': 4, 'iterations': 50, 'learning_rate': 0.1}