In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc #for model evaluation
import joblib

import warnings
warnings.filterwarnings("ignore")

dt = pd.read_csv('heart.csv')
dt.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [2]:
dt.columns = ['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol', 'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate_achieved',
       'exercise_induced_angina', 'st_depression', 'st_slope', 'num_major_vessels', 'thalassemia', 'target']

In [3]:
dt['sex'][dt['sex'] == 0] = 'female'
dt['sex'][dt['sex'] == 1] = 'male'

dt['chest_pain_type'][dt['chest_pain_type'] == 1] = 'typical angina'
dt['chest_pain_type'][dt['chest_pain_type'] == 2] = 'atypical angina'
dt['chest_pain_type'][dt['chest_pain_type'] == 3] = 'non-anginal pain'
dt['chest_pain_type'][dt['chest_pain_type'] == 4] = 'asymptomatic'

dt['fasting_blood_sugar'][dt['fasting_blood_sugar'] == 0] = 'lower than 120mg/ml'
dt['fasting_blood_sugar'][dt['fasting_blood_sugar'] == 1] = 'greater than 120mg/ml'

dt['rest_ecg'][dt['rest_ecg'] == 0] = 'normal'
dt['rest_ecg'][dt['rest_ecg'] == 1] = 'ST-T wave abnormality'
dt['rest_ecg'][dt['rest_ecg'] == 2] = 'left ventricular hypertrophy'

dt['exercise_induced_angina'][dt['exercise_induced_angina'] == 0] = 'no'
dt['exercise_induced_angina'][dt['exercise_induced_angina'] == 1] = 'yes'

dt['st_slope'][dt['st_slope'] == 1] = 'upsloping'
dt['st_slope'][dt['st_slope'] == 2] = 'flat'
dt['st_slope'][dt['st_slope'] == 3] = 'downsloping'

dt['thalassemia'][dt['thalassemia'] == 1] = 'normal'
dt['thalassemia'][dt['thalassemia'] == 2] = 'fixed defect'
dt['thalassemia'][dt['thalassemia'] == 3] = 'reversable defect'

In [4]:
dt['sex'] = dt['sex'].astype('object')
dt['chest_pain_type'] = dt['chest_pain_type'].astype('object')
dt['fasting_blood_sugar'] = dt['fasting_blood_sugar'].astype('object')
dt['rest_ecg'] = dt['rest_ecg'].astype('object')
dt['exercise_induced_angina'] = dt['exercise_induced_angina'].astype('object')
dt['st_slope'] = dt['st_slope'].astype('object')
dt['thalassemia'] = dt['thalassemia'].astype('object')

In [5]:
dt = pd.get_dummies(dt, drop_first=True)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(dt.drop('target', 1), dt['target'], test_size = .2, random_state=10) #split the data

# RF

In [8]:
rf_model = RandomForestClassifier(max_depth=5)
rf_model.fit(X_train, y_train)

In [9]:
estimator = rf_model.estimators_[1]
feature_names = [i for i in X_train.columns]

y_train_str = y_train.astype('str')
y_train_str[y_train_str == '0'] = 'no disease'
y_train_str[y_train_str == '1'] = 'disease'
y_train_str = y_train_str.values

y_predict_rf = rf_model.predict(X_test)
y_pred_quant_rf = rf_model.predict_proba(X_test)[:, 1]
y_pred_bin_rf = rf_model.predict(X_test)

In [10]:
y_pred_bin_rf

array([0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1])

In [11]:
confusion_matrix = confusion_matrix(y_test, y_pred_bin_rf)
confusion_matrix

total=sum(sum(confusion_matrix))

sensitivity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[1,0])
print('Sensitivity : ', sensitivity )

specificity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[0,1])
print('Specificity : ', specificity)

# from sklearn.metrics import roc_curve, auc #for model evaluation

fpr, tpr, thresholds = roc_curve(y_test, y_pred_quant_rf)


auc(fpr, tpr)


Sensitivity :  0.875
Specificity :  0.7586206896551724


0.9054945054945055

# XGB

In [7]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
xgb=XGBClassifier(objective= 'binary:logistic', random_state=42)


In [8]:
xgb.fit(X_train, y_train)


In [9]:
# estimator = xgb.estimators_[1]
feature_names = [i for i in X_train.columns]

y_train_str = y_train.astype('str')
y_train_str[y_train_str == '0'] = 'no disease'
y_train_str[y_train_str == '1'] = 'disease'
y_train_str = y_train_str.values

y_predict_xgb = xgb.predict(X_test)
y_pred_quant_xgb = xgb.predict_proba(X_test)[:, 1]
y_pred_bin_xgb = xgb.predict(X_test)

In [10]:
y_pred_bin_xgb

array([0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1])

In [11]:
# y_test=np.array(y_test)
y_test



246    0
183    0
229    0
126    1
184    0
      ..
277    0
121    1
187    0
301    0
283    0
Name: target, Length: 61, dtype: int64

In [12]:
confusion_matrix1 = confusion_matrix(y_test, y_pred_bin_xgb)
confusion_matrix1

total=sum(sum(confusion_matrix1))

sensitivity = confusion_matrix1[0,0]/(confusion_matrix1[0,0]+confusion_matrix1[1,0])
print('Sensitivity : ', sensitivity )

specificity = confusion_matrix1[1,1]/(confusion_matrix1[1,1]+confusion_matrix1[0,1])
print('Specificity : ', specificity)

# from sklearn.metrics import roc_curve, auc #for model evaluation

fpr, tpr, thresholds = roc_curve(y_test, y_pred_quant_xgb)


auc(fpr, tpr)


Sensitivity :  0.8571428571428571
Specificity :  0.8076923076923077


0.8560439560439561

# logistic Regression

In [18]:
logit_clf = LogisticRegression()
logit_clf.fit(X_train, y_train)

y_pred = logit_clf.predict(X_test)
print('Accuracy Score: ', str(accuracy_score(y_test, y_pred)))
print('Classification Report: ')
print(classification_report(y_test, y_pred))    

Accuracy Score:  0.7704918032786885
Classification Report: 
              precision    recall  f1-score   support

           0       0.82      0.77      0.79        35
           1       0.71      0.77      0.74        26

    accuracy                           0.77        61
   macro avg       0.77      0.77      0.77        61
weighted avg       0.77      0.77      0.77        61



In [14]:
import statistics

ensemble_pred = statistics.mode([int(y_pred_bin_xgb[0]), int(y_pred_bin_xgb[10])])
ensemble_pred

0

Accuracy Score:  0.7704918032786885
Classification Report: 
              precision    recall  f1-score   support

           0       0.82      0.77      0.79        35
           1       0.71      0.77      0.74        26

    accuracy                           0.77        61
   macro avg       0.77      0.77      0.77        61
weighted avg       0.77      0.77      0.77        61

