In [14]:
# Basic Import
import pandas as pd
import numpy as np

#Importing the Scaler and Encoder for numerical and categorical features
from sklearn.preprocessing import RobustScaler, OneHotEncoder


#Importing Train Test Split
from sklearn.model_selection import train_test_split

#Importing various models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

#Importing model evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, roc_curve

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


In [15]:
#Reading the cleaned dataset
df=pd.read_csv('data/heart_without_duplicate.csv')  

In [16]:
#Display first 5 rows of dataset
df.head()

Unnamed: 0.1,Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [17]:
#Removing the first column (unamed column)
df = df.iloc[: , 1:]
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [18]:
#Creating a copy of the dataframe
df1 = df

In [19]:
#Creating a list of categorical and numberical features
categorcial_features = ['sex','exng','caa','cp','fbs','restecg','slp','thall']
numerical_features = ["age","trtbps","chol","thalachh","oldpeak"]

In [20]:
#Creating a dataframe with the Independent variables
X = df1.drop(columns=['output'],axis=1)
X.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [21]:
#Display the Independent variables
X.columns

Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'oldpeak', 'slp', 'caa', 'thall'],
      dtype='object')

In [22]:
#Create a dataframe with the Dependent variable
Y = df1[['output']]
Y.head()

Unnamed: 0,output
0,1
1,1
2,1
3,1
4,1


In [23]:
#Checking the shape of Y
Y.shape

(302, 1)

In [24]:
#Encoding the categorical variables
X = pd.get_dummies(X, columns = categorcial_features, drop_first = True)

In [25]:
#Train test split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 40)
print("The shape of X_train is      ", X_train.shape)
print("The shape of X_test is       ",X_test.shape)
print("The shape of y_train is      ",Y_train.shape)
print("The shape of y_test is       ",Y_test.shape)

The shape of X_train is       (241, 22)
The shape of X_test is        (61, 22)
The shape of y_train is       (241, 1)
The shape of y_test is        (61, 1)


In [26]:
#Instantiating the scaler for numerical variable
scaler = RobustScaler()

In [27]:
#Scaling the numerical features
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [28]:
#Defining the funcation for model evaluation
def evaluate_model(true, predicted):
    acc_score=accuracy_score(true, predicted)
    return acc_score

In [29]:

#Model training and evaluation
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boost Classifier": GradientBoostingClassifier()

}

model_list = []
metric_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, Y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    accuracy_train = evaluate_model(Y_train, y_train_pred)
    
    accuracy_test = evaluate_model(Y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("classification_report_train: {:.4f}".format(accuracy_train))
    print('----------------------------------')
    print('Model performance for Test set')
    print("classification_report_test: {:.4f}".format(accuracy_test))
    metric_list.append(accuracy_test)

    print('='*35)
    print('\n')



  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


Logistic Regression
Model performance for Training set
classification_report_train: 0.8589
----------------------------------
Model performance for Test set
classification_report_test: 0.9508


Decision Tree Classifier
Model performance for Training set
classification_report_train: 1.0000
----------------------------------
Model performance for Test set
classification_report_test: 0.6393


Random Forest Classifier
Model performance for Training set
classification_report_train: 1.0000
----------------------------------
Model performance for Test set
classification_report_test: 0.8033


AdaBoost Classifier
Model performance for Training set
classification_report_train: 0.9419
----------------------------------
Model performance for Test set
classification_report_test: 0.7869




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Gradient Boost Classifier
Model performance for Training set
classification_report_train: 0.9959
----------------------------------
Model performance for Test set
classification_report_test: 0.8033




The Logisitic Regression Model has perfromed well with both Train and Test data hence we will choose that model 