In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
heart = pd.read_csv("dataset\heart_2020_cleaned.csv")

In [3]:
heart.head(10)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
5,Yes,28.87,Yes,No,No,6.0,0.0,Yes,Female,75-79,Black,No,No,Fair,12.0,No,No,No
6,No,21.63,No,No,No,15.0,0.0,No,Female,70-74,White,No,Yes,Fair,4.0,Yes,No,Yes
7,No,31.64,Yes,No,No,5.0,0.0,Yes,Female,80 or older,White,Yes,No,Good,9.0,Yes,No,No
8,No,26.45,No,No,No,0.0,0.0,No,Female,80 or older,White,"No, borderline diabetes",No,Fair,5.0,No,Yes,No
9,No,40.69,No,No,No,0.0,0.0,Yes,Male,65-69,White,No,Yes,Good,10.0,No,No,No


# Profiling Data 

In [4]:
from pandas_profiling import ProfileReport

In [5]:
#pip install pandas_profiling

In [6]:
rawProfile = ProfileReport(heart, title = "Raw Data Profile", explorative = True)

In [7]:
rawProfile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [8]:
rawProfile.to_file("rawProfle.html")

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
# Converting Age variable from categorical value to a continous value

encode_AgeCategory = {'55-59':57, '80 or older':80, '65-69':67,
                      '75-79':77,'40-44':42,'70-74':72,'60-64':62,
                      '50-54':52,'45-49':47,'18-24':21,'35-39':37,
                      '30-34':32,'25-29':27}
heart['AgeCategory'] = heart['AgeCategory'].apply(lambda x: encode_AgeCategory[x])

In [10]:
# 
heart.shape

(319795, 18)

In [11]:
# Import label encoder
from sklearn import preprocessing
 
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
 
# Encode labels in column 'species'.
heart['HeartDisease']= label_encoder.fit_transform(heart['HeartDisease'])
heart['Smoking']= label_encoder.fit_transform(heart['Smoking'])
heart['AlcoholDrinking']= label_encoder.fit_transform(heart['AlcoholDrinking'])
heart['Stroke']= label_encoder.fit_transform(heart['Stroke'])
heart['DiffWalking']= label_encoder.fit_transform(heart['DiffWalking'])
heart['Sex']= label_encoder.fit_transform(heart['Sex'])
heart['AgeCategory']= label_encoder.fit_transform(heart['AgeCategory'])
heart['Race']= label_encoder.fit_transform(heart['Race'])
heart['Diabetic']= label_encoder.fit_transform(heart['Diabetic'])
heart['PhysicalActivity']= label_encoder.fit_transform(heart['PhysicalActivity'])
heart['GenHealth']= label_encoder.fit_transform(heart['GenHealth'])
heart['Asthma']= label_encoder.fit_transform(heart['Asthma'])
heart['KidneyDisease']= label_encoder.fit_transform(heart['KidneyDisease'])
heart['SkinCancer']= label_encoder.fit_transform(heart['SkinCancer'])

In [12]:
X, y = heart.loc[:, heart.columns != 'HeartDisease'], heart['HeartDisease']

In [13]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [14]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [15]:
import imblearn

In [16]:
from imblearn.under_sampling import NeighbourhoodCleaningRule 
ncr = NeighbourhoodCleaningRule(n_neighbors=20, threshold_cleaning=0.5)

In [17]:
X_ncr, y_ncr =ncr.fit_resample(X,y)

In [18]:
sampledProfile = ProfileReport(X_ncr, title = "Sampled Data Profile", explorative = True)

In [19]:
sampledProfile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [62]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(X_ncr,y_ncr,test_size=0.40,random_state=42)

In [63]:
# Feature Scaling

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [64]:
# XGBoost Classifier with Bagging and Boosting

import xgboost as xgb
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score


# Init classifier
xgb_cl = xgb.XGBClassifier()

# The baggging ensemble classifier is initialized with:

bagging = BaggingClassifier(base_estimator=xgb_cl, n_estimators=5, max_samples=50, bootstrap=True)

# Training
bagging.fit(X_train, y_train)

# Evaluating
print(f"Train score: {bagging.score(X_train, y_train)}")
print(f"Test score: {bagging.score(X_test, y_test)}")

# Fit
xgb_cl.fit(X_train, y_train)


# Predict
preds = xgb_cl.predict(X_test)

# Score
accuracy_score(y_test, preds)




Train score: 0.8937249799839871
Test score: 0.8945416879334674




0.9362746569790135

In [65]:
# Precision and recall for XGBoost:

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Precision score:', precision_score(y_test, preds))
print('Recall score:', recall_score(y_test, preds))
print('Accuracy Score:', accuracy_score(y_test, preds))
print('F1 Score:', f1_score(y_test, preds))



Precision score: 0.8400657827114811
Recall score: 0.7524396980298288
Accuracy Score: 0.9362746569790135
F1 Score: 0.7938419697926278


In [67]:
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import roc_curve, auc
print(classification_report(y_test,preds))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96     55752
           1       0.84      0.75      0.79     10862

    accuracy                           0.94     66614
   macro avg       0.90      0.86      0.88     66614
weighted avg       0.93      0.94      0.93     66614



# XG Boost Plot

In [98]:
# Precision and recall for XGBoost:

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Precision score:', precision_score(y_test, preds))
print('Recall score:', recall_score(y_test, preds))
print('Accuracy Score:', accuracy_score(y_test, preds))
print('F1 Score:', f1_score(y_test, preds))

#  Precision-Recall Curve

from sklearn.metrics import precision_recall_curve

# S1. Get probability
xgb_curve = xgb_cl.predict_proba(X_test)[ : , 1 ]

# S2. Collect Necessary Measures
precision, recall, threshold = precision_recall_curve(y_test, xgb_curve)

# S3. Plot Figure
plt.clf()
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title('XG Boost Precision-Recall Curve')
plt.savefig("XG Boost precision_recall curve.png")

# AUC 
from sklearn.metrics import roc_auc_score
print("AUC for XG Boost:",  roc_auc_score(y_test, xgb_curve) )


# ROC Curve
from sklearn.metrics import roc_curve
fpr, tpr, threshold= roc_curve(y_test, xgb_curve)


# Generate Figure
plt.clf()
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title('XG Boost ROC Curve')
plt.savefig("XG Boost roc curve.png")

Precision score: 0.8400657827114811
Recall score: 0.7524396980298288
Accuracy Score: 0.9362746569790135
F1 Score: 0.7938419697926278
AUC for XG Boost: 0.9574755680118379


In [52]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation


# The base learner will be a decision tree with depth = 2
xgb_cl = xgb.XGBClassifier()

adaboost = AdaBoostClassifier(base_estimator=xgb_cl, n_estimators=5, learning_rate=0.1, random_state=23)

# Train!
adaboost.fit(X_train, y_train)

# Evaluation
print(f"Train score: {adaboost.score(X_train, y_train)}")
print(f"Test score: {adaboost.score(X_test, y_test)}")

# Fit
adaboost.fit(X_train, y_train)

# Predict
preds = adaboost.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, preds))



Train score: 0.834757806244996
Test score: 0.8369411835349927




Accuracy: 0.8369411835349927


In [33]:
# Decision Tree Classifier with Bagging and Boosting

from sklearn.tree import DecisionTreeClassifier 
# Import Decision Tree Classifier
from sklearn.ensemble import BaggingClassifier
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics 


tree = DecisionTreeClassifier(max_depth=3, random_state=23)

# The baggging ensemble classifier is initialized with:

bagging = BaggingClassifier(base_estimator=tree, n_estimators=5, max_samples=50, bootstrap=True)

# Training
bagging.fit(X_train, y_train)

# Evaluating
print(f"Train score: {bagging.score(X_train, y_train)}")
print(f"Test score: {bagging.score(X_test, y_test)}")


# Train Decision Tree Classifer
tree = tree.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = tree.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Train score: 0.8838370696557246
Test score: 0.8850541928123218
Accuracy: 0.9013270483682109


In [34]:
from sklearn.ensemble import AdaBoostClassifier

# The base learner will be a decision tree with depth = 2
tree = DecisionTreeClassifier(max_depth=2, random_state=23)


adaboost = AdaBoostClassifier(base_estimator=tree, n_estimators=5, learning_rate=0.1, random_state=23)

# Train!
adaboost.fit(X_train, y_train)

# Evaluation
print(f"Train score: {adaboost.score(X_train, y_train)}")
print(f"Test score: {adaboost.score(X_test, y_test)}")

# Train Decision Tree Classifer
tree = tree.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = tree.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Train score: 0.8939151321056845
Test score: 0.8953373164800192
Accuracy: 0.8737202389888011


In [140]:
#Import Gaussian Naive Bayes model

from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
gnb = GaussianNB()

#Train the model using the training sets
gnb.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = gnb.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

Accuracy: 0.861545621040622
              precision    recall  f1-score   support

           0       0.93      0.90      0.92     55752
           1       0.56      0.66      0.61     10862

    accuracy                           0.86     66614
   macro avg       0.75      0.78      0.76     66614
weighted avg       0.87      0.86      0.87     66614



In [133]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print ("Accuracy : ", accuracy_score(y_test, y_pred))

Accuracy :  0.9192662203140481


In [134]:
from sklearn import metrics
cf_matrix = metrics.confusion_matrix(y_test, y_pred)

In [136]:
import seaborn as sns
import matplotlib.pyplot as plt 
ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues')
plt.savefig("Logistic Reg confusion_matrix")
#plt.show()

  plt.show()


In [137]:
# Precision and recall for Logistic Regression:

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Precision score:', precision_score(y_test, y_pred))
print('Recall score:', recall_score(y_test, y_pred))
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))

Precision score: 0.8076750448833034
Recall score: 0.6626772233474498
Accuracy Score: 0.9192662203140481
F1 Score: 0.7280267017295439


# Logistic Regression Plot

In [138]:
#  Precision-Recall Curve

from sklearn.metrics import precision_recall_curve

# S1. Get probability
classifier_curve = classifier.predict_proba(X_test)[ : , 1 ]

# S2. Collect Necessary Measures
precision, recall, threshold = precision_recall_curve(y_test, classifier_curve)

# S3. Plot Figure
plt.clf()
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title('Logistic Regression Precision-Recall Curve')
plt.savefig("Logistic Regression precision_recall curve.png")

# AUC 
from sklearn.metrics import roc_auc_score
print("AUC for KNN:",  roc_auc_score(y_test, classifier_curve) )


# ROC Curve
from sklearn.metrics import roc_curve
fpr, tpr, threshold= roc_curve(y_test, classifier_curve)


# Generate Figure
plt.clf()
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title('Logistic Regression ROC Curve')
plt.savefig("Logistic Regression roc curve.png")

AUC for KNN: 0.9406178820921407


# KNN PLOT

In [139]:
# K Nearest Classifiers

from sklearn.neighbors import KNeighborsClassifier
 
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

0.9207223706728316
              precision    recall  f1-score   support

           0       0.94      0.97      0.95     55752
           1       0.82      0.66      0.73     10862

    accuracy                           0.92     66614
   macro avg       0.88      0.82      0.84     66614
weighted avg       0.92      0.92      0.92     66614



In [107]:
# Precision and recall for KNN:

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Precision score:', precision_score(y_test, y_pred))
print('Recall score:', recall_score(y_test, y_pred))
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))

Precision score: 0.8175714123136452
Recall score: 0.6613883262750875
Accuracy Score: 0.9207223706728316
F1 Score: 0.7312331416357066


In [108]:
#  Precision-Recall Curve

from sklearn.metrics import precision_recall_curve

# S1. Get probability
knn_curve = knn.predict_proba(X_test)[ : , 1 ]

# S2. Collect Necessary Measures
precision, recall, threshold = precision_recall_curve(y_test, knn_curve)

# S3. Plot Figure
plt.clf()
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title('KNN  Precision-Recall Curve')
plt.savefig("KNN  precision_recall curve.png")

# AUC 
from sklearn.metrics import roc_auc_score
print("AUC for KNN:",  roc_auc_score(y_test, knn_curve) )


# ROC Curve
from sklearn.metrics import roc_curve
fpr, tpr, threshold= roc_curve(y_test, knn_curve)


# Generate Figure
plt.clf()
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title('KNN ROC Curve')
plt.savefig("KNN roc curve.png")

AUC for KN: 0.9265025784678809


In [112]:
# KNN with gridserach

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# define function

knn = KNeighborsClassifier()


# define a list of parameters
param_knn = {'n_neighbors': range(3, 26, 2)}  # exactly the same as the input variable name. 

#apply grid search

grid_knn = GridSearchCV( knn , param_knn, cv = 5)

grid_knn.fit(X_train, y_train)

# the best hyperparameter chosen:
print(grid_knn.best_params_)

# When k=25 (best case), the validation score of through CV is:
print(grid_knn.best_score_)


grid_knn.score(X_test, y_test)

print(accuracy_score(y_test, y_pred))

{'n_neighbors': 7}
0.9186148919135307
0.9207223706728316


In [None]:
y_pred = grid_knn.predict(X_test)

# Grid Search KNN Plot

In [117]:
# Precision and recall for KNN Grid Search:

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Precision score:', precision_score(y_test, y_pred))
print('Recall score:', recall_score(y_test, y_pred))
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))

Precision score: 0.8175714123136452
Recall score: 0.6613883262750875
Accuracy Score: 0.9207223706728316
F1 Score: 0.7312331416357066


In [None]:
#  Precision-Recall Curve Grid KNN

from sklearn.metrics import precision_recall_curve

# S1. Get probability
grid_knn_curve = grid_knn.predict_proba(X_test)[ : , 1 ]

# S2. Collect Necessary Measures
precision, recall, threshold = precision_recall_curve(y_test, grid_knn_curve)

# S3. Plot Figure
plt.clf()
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title('Grid Search KNN Precision-Recall Curve')
plt.savefig("Grid Search KNN precision_recall curve.png")

# AUC 
from sklearn.metrics import roc_auc_score
print("AUC for Grid Search KNN:",  roc_auc_score(y_test, grid_knn_curve) )


# ROC Curve
from sklearn.metrics import roc_curve
fpr, tpr, threshold= roc_curve(y_test, grid_knn_curve)


# Generate Figure
plt.clf()
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title('Grid Search KNN ROC Curve')
plt.savefig("Grid Search KNN roc curve.png")

In [116]:
# S3. Plot Figure
plt.clf()
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title('Grid Search KNN Precision-Recall Curve')
plt.savefig("Grid Search KNN precision_recall curve.png")

In [118]:
# Linear SVM with GridSearchCV

from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

# Define Function
linear_svc = LinearSVC(random_state = 22) # must specify random state here

# Define a list of hyperparameters
params_svc = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 100000]   }

grid_lr_svc = GridSearchCV(linear_svc,  params_svc, n_jobs = 2 )

grid_lr_svc.fit(X_train, y_train)

GridSearchCV(estimator=LinearSVC(random_state=22), n_jobs=2,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 100000]})

In [119]:
grid_lr_svc.score(X_test, y_test)

0.9182904494550695

In [120]:
grid_lr_svc.best_params_

{'C': 0.1}

In [125]:
y_pred = grid_lr_svc.predict(X_test)

In [126]:
# Precision and recall for SVM:

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Precision score:', precision_score(y_test, y_pred))
print('Recall score:', recall_score(y_test, y_pred))
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))

Precision score: 0.8174575278265964
Recall score: 0.6423310624194439
Accuracy Score: 0.9182904494550695
F1 Score: 0.7193895963293293
