In [None]:
# Import necessary libraries

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier,StackingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,f1_score

import os
import time

import warnings
warnings.filterwarnings("ignore")

In [None]:
# list every file available under the Kaggle input directory

for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# read the heart dataset CSV from the Kaggle input directory into a DataFrame
heart_df  = pd.read_csv("/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv")

In [None]:
# preview the first few rows to sanity-check that the CSV was parsed as expected
heart_df.head()

In [None]:
# (rows, columns) of the raw dataset
print(f"The shape of the dataset is : {heart_df.shape}")

In [None]:
# per-column dtype and non-null count
heart_df.info()

It seems that there are no null values present in the dataset.

In [None]:
# number of distinct values in each column
heart_df.nunique()

In [None]:
# show every row that exactly repeats an earlier row
duplicate_mask = heart_df.duplicated()
heart_df[duplicate_mask]

In [None]:
# keep only the first occurrence of each repeated row
heart_df = heart_df.drop_duplicates()

In [None]:
# categorical features that are stored as discrete numeric codes
num_cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']

# relative frequency of every level, one feature at a time
for feature in num_cat_cols:
    print(f"column : {feature}")
    print(heart_df[feature].value_counts(normalize=True))
    print("\n")

In [None]:
# target vector: the 'output' column
y = heart_df['output']

# feature matrix: every remaining column
X = heart_df.drop(columns="output")

In [None]:
# splitting the dataset into train (80%) and test (20%) sets, stratified on the target.
# random_state is fixed so the split, and every score computed from it, is the same
# after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
# (rows, columns) of each feature matrix after the split
print(f"The shape of the training set : {X_train.shape}")
print(f"The shape of the test set : {X_test.shape}")

In [None]:
# class proportions in the train and test target vectors
for heading, target in (("The training set :", y_train), ("\n The test set :", y_test)):
    print(heading)
    print(target.value_counts(normalize=True))

In [None]:
# inspect the container type of the training target
type(y_train)

In [None]:
def build_pipeline(model):

    """
    Wrap a classifier in a two-step pipeline:

           1. 'scaler' : a StandardScaler (removes the mean and scales to unit variance).
           2. 'clf' : the classifier passed in by the caller.

    The step names matter: hyperparameter grids address the classifier as 'clf__<param>'.

    Parameters:
    ------------
    model : a classifier object

    Returns:
    ----------
    a Pipeline object with the steps ('scaler', 'clf')

    """

    steps = [('scaler', StandardScaler()), ('clf', model)]
    return Pipeline(steps)


def scan_model(pipeline,param_dict,X_train,y_train,scoring='accuracy',cv=5,verbose=1):

    """
    A function to find the optimal set of parameters for a classifier using GridSearchCV.

    Parameters:
    ------------
    pipeline : a pipeline object

    param_dict : dict
          a dictionary with the names of hyperparameters as keys and the corresponding list of values to be scanned as values

    X_train : pandas dataframe
          the feature matrix

    y_train : pandas series
          the target vector

    scoring : str or callable, default 'accuracy'
          the metric used to rank the candidate parameter sets (forwarded to GridSearchCV)

    cv : int or cross-validation splitter, default 5
          the cross-validation strategy (forwarded to GridSearchCV)

    verbose : int, default 1
          the verbosity of the search (forwarded to GridSearchCV)

    Returns:
    ----------
    grid_cv : a GridSearchCV object fitted on the training data


    """

    grid_cv = GridSearchCV(pipeline,param_grid=param_dict,scoring=scoring,cv=cv,verbose=verbose)
    grid_cv.fit(X_train,y_train)

    return grid_cv


def eval_model(model,X_test,y_test):

    """
    A function to evaluate the performance of a binary classifier on unseen data.

    Parameters:
    ------------
    model : the classifier fitted on the training data

    X_test : pandas dataframe
         the feature matrix

    y_test : pandas series
         the target vector

    Returns:
    ----------
    acc_test : float
         the accuracy score on the test set

    f1_test : float
         the f1 score on the test set

    tn,fp,fn,tp : integer
         true negative,false positive,false negative and true positive respectively (for the test set)

    Raises:
    ----------
    ValueError
         if the confusion matrix is not 2x2, i.e. y_test and the predictions together
         do not contain exactly two classes

    """

    pred_test = model.predict(X_test)

    # unpacking into tn, fp, fn, tp only makes sense for a 2x2 matrix; fail with a
    # readable message instead of an opaque "values to unpack" error
    conf_mat = confusion_matrix(y_test,pred_test)
    if conf_mat.shape != (2, 2):
        raise ValueError(
            "eval_model expects exactly two classes across y_test and the predictions; "
            "got a confusion matrix of shape {}".format(conf_mat.shape)
        )
    tn, fp, fn, tp = conf_mat.ravel()

    acc_test = accuracy_score(y_test,pred_test)
    f1_test = f1_score(y_test,pred_test)

    return acc_test,f1_test,tn,fp,fn,tp

In [None]:
# fixed seed so that the stochastic estimators give the same result on every run
RANDOM_STATE = 42

# one estimator per entry, in the same order as param_list below.
# KNeighborsClassifier takes no seed; CatBoost is silenced because the grid search
# would otherwise print its per-iteration log for every single fit.
model_list = [LogisticRegression(random_state=RANDOM_STATE),
        DecisionTreeClassifier(random_state=RANDOM_STATE),
        RandomForestClassifier(random_state=RANDOM_STATE),
        KNeighborsClassifier(),
        LGBMClassifier(random_state=RANDOM_STATE),
        GradientBoostingClassifier(random_state=RANDOM_STATE),
        AdaBoostClassifier(random_state=RANDOM_STATE),
        CatBoostClassifier(random_seed=RANDOM_STATE, verbose=0),
        XGBClassifier(random_state=RANDOM_STATE)]

# every key is prefixed with 'clf__' because the classifier is the 'clf' step of the pipeline

lr_dict = {
           'clf__C' : np.arange(0.1,6,0.5),
           'clf__solver': ['liblinear','lbfgs']}

dt_dict = {
           'clf__criterion' : ['gini', 'entropy'],
           'clf__max_features' : ['sqrt','log2'],
           'clf__max_depth' : np.arange(10,60,10)}

rf_dict = {
           'clf__n_estimators': np.arange(20,100,20),
           'clf__criterion' : ['gini', 'entropy'],
           'clf__max_features' : ['sqrt','log2'],
           'clf__max_depth' : np.arange(10,60,10)}

knn_dict = {
            'clf__n_neighbors': np.arange(5,30,5),
            'clf__weights' : ['uniform','distance'],
            'clf__algorithm' : ['auto','ball_tree','kd_tree','brute'],
            'clf__metric' : ['minkowski','euclidean']}

# NOTE(review): boosting_type 'rf' presumably needs bagging to be configured on the
# estimator; with the defaults those candidates are likely to fail and be scored NaN
# by GridSearchCV - confirm and drop 'rf' if so.
lgbm_dict = {
             'clf__boosting_type': ['gbdt','dart','goss','rf'],
             'clf__n_estimators': np.arange(50,150,10),
             'clf__learning_rate': np.arange(0.1,5,0.5),
             'clf__max_depth': np.arange(2,10,1)}

# NOTE(review): 'deviance', 'mse' and 'mae' are names from older scikit-learn releases;
# check them against the installed version, otherwise those candidates fail silently.
gb_dict = {
           'clf__loss' : ['deviance','exponential'],
           'clf__learning_rate': np.arange(0.1,5,0.5),
           'clf__n_estimators': np.arange(20,100,20),
           'clf__criterion' : ['friedman_mse','mse','mae'],
           'clf__max_features' : ['sqrt','log2']}

# NOTE(review): 'SAMME.R' is not available in every scikit-learn release - confirm.
ada_dict = {
           'clf__learning_rate': np.arange(0.1,5,0.5),
           'clf__n_estimators': np.arange(20,100,20),
           'clf__algorithm' : ['SAMME','SAMME.R']}

catb_dict = {
           'clf__learning_rate': np.arange(0.1,5,0.5),
           'clf__n_estimators': np.arange(50,100,10),
           'clf__auto_class_weights': [None,'balanced','SqrtBalanced']}

xgb_dict = {
           'clf__learning_rate': np.arange(0.1,5,0.5),
           'clf__n_estimators': np.arange(20,100,20),
           'clf__booster':['gbtree','gblinear','dart'],
           'clf__gamma': np.arange(0.1,5,0.5)}

# must stay aligned, position by position, with model_list
param_list = [lr_dict, dt_dict, rf_dict, knn_dict, lgbm_dict, gb_dict, ada_dict, catb_dict, xgb_dict]

In [None]:
# list to hold the information about classifiers after grid search
scan_list = []

# list to hold the values of TN,FP,FN and TP for each classifier
pred_list = []

for model, param_dict in zip(model_list,param_list):
    # use the class name rather than parsing str(model): an estimator that falls back
    # to the default "<... object at 0x...>" repr would otherwise produce a garbage name
    model_name = type(model).__name__
    print("building model : {}\n".format(model_name))

    # build the model
    pipeline = build_pipeline(model)
    start_time = time.time()

    # perform grid search and fit the best model to the training set
    grid_cv = scan_model(pipeline,param_dict,X_train,y_train)
    print("\n model fitted!")
    # wall-clock time of the whole grid search for this classifier, in seconds
    elapsed_time = time.time() - start_time

    # evaluate the model (GridSearchCV predicts with the refitted best estimator)
    acc_test,f1_test,tn,fp,fn,tp = eval_model(grid_cv,X_test,y_test)

    scan_list.append({
              'model': model_name,
              'best_score': grid_cv.best_score_,
              'test_acc': acc_test,
              'test_f1': f1_test,
              'best_params':grid_cv.best_params_,
              'time':elapsed_time
    })


    pred_list.append({
              'model': model_name,
              'true_neg':tn,
              'false_pos':fp,
              'false_neg':fn,
              'true_pos':tp
    })

In [None]:
# one row per classifier: best CV score, test accuracy/F1, best parameters and search time
scan_df = pd.DataFrame(scan_list)
scan_df

In [None]:
# one row per classifier: confusion-matrix counts on the test set
pred_df = pd.DataFrame(pred_list)
pred_df

In [None]:
# let's look at the top 3 models

# rows 0, 4 and 8 of scan_df follow the order of model_list.
# The cell is selected by column label: integer access such as row[4] on a
# label-indexed Series is positional and deprecated in recent pandas.
print("Logistic Regression")
print(scan_df.iloc[0]['best_params'])

print("\n LGBMClassifier")
print(scan_df.iloc[4]['best_params'])

print("\n XGBClassifier")
print(scan_df.iloc[8]['best_params'])

The best performing model is XGBClassifier, with an accuracy score of 0.836066 and an F1 score of 0.857143 on the test set. It also gives the lowest values of FN and FP (3 and 7 respectively). Note that an FN (an individual predicted to have a low chance of heart attack who is actually prone to heart disease) has a much more severe effect than an FP (an individual predicted to have a high chance of heart attack who is actually not prone to heart disease). So, XGBClassifier is performing quite well.