# Continued from Part 1...

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Ask the inline backend for the higher-resolution 'retina' figure format
%config InlineBackend.figure_formats = ['retina']

# Start with seaborn's "white" style; later cells switch to other styles
sns.set_style("white")

## Read Data

In [2]:
# Location of the cleaned dataset, relative to the notebook's working directory.
csv_path = 'detect cardiovascular-cleaned 2.csv'
df = pd.read_csv(csv_path)
# Leave the frame as the last expression so the notebook renders it.
df

Unnamed: 0,age,ap_hi,ap_lo,cholesterol,gluc,active,cardio,bmi
0,61,130,80,2,2,1,0,28.7
1,40,110,70,1,1,1,1,26.7
2,60,120,80,1,1,1,0,25.7
3,39,120,80,1,1,1,0,31.2
4,64,120,80,1,1,1,0,25.8
...,...,...,...,...,...,...,...,...
68256,52,120,80,1,1,1,0,26.9
68257,61,140,90,2,2,1,1,50.5
68258,52,180,90,3,1,0,1,31.4
68259,61,135,80,1,2,0,1,27.1


### Features:
1.	Age | Objective Feature | age | int (days in the raw data; the cleaned file loaded above stores age in years) |
2.	Height | Objective Feature | height | int (cm) |
3.	Weight | Objective Feature | weight | float (kg) |
4.	Gender | Objective Feature | gender | categorical code |
5.	Systolic blood pressure | Examination Feature | ap_hi | int |
6.	Diastolic blood pressure | Examination Feature | ap_lo | int |
7.	Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |
8.	Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |
9.	Smoking | Subjective Feature | smoke | binary |
10.	Alcohol intake | Subjective Feature | alco | binary |
11.	Physical activity | Subjective Feature | active | binary |
12.	Presence or absence of cardiovascular disease | Target Variable | cardio | binary | (TARGET)

## Creating Model 
<br>
We can use the sklearn library or write the functions ourselves. Let's do both: first we will write our own functions, and after that we'll use the sklearn library to calculate the scores.

In [3]:
# Basic data checks: print the frame's dimensions, then show per-column
# summary statistics as the cell output.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.describe()

(68261, 8)


Unnamed: 0,age,ap_hi,ap_lo,cholesterol,gluc,active,cardio,bmi
count,68261.0,68261.0,68261.0,68261.0,68261.0,68261.0,68261.0,68261.0
mean,52.828789,126.290239,81.331068,1.364498,1.225956,0.803548,0.494909,27.463916
std,6.767836,17.887949,9.883714,0.678776,0.571888,0.397317,0.499978,5.284978
min,39.0,-150.0,-70.0,1.0,1.0,0.0,0.0,3.5
25%,48.0,120.0,80.0,1.0,1.0,1.0,0.0,23.9
50%,53.0,120.0,80.0,1.0,1.0,1.0,0.0,26.3
75%,58.0,140.0,90.0,1.0,1.0,1.0,1.0,30.1
max,64.0,240.0,190.0,3.0,3.0,1.0,1.0,86.8


In [4]:
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
# Switch matplotlib to the 'ggplot' style sheet for the figures that follow
plt.style.use('ggplot')
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Print NumPy floats without scientific notation
np.set_printoptions(suppress=True)

In [5]:
# Creating the Training and Test Datasets
target_column = ['cardio']
# Keep the DataFrame's own column order. Building the list from a set
# difference gives an arbitrary order that can change between kernel
# sessions, which makes the column layout of X irreproducible.
predictors = [col for col in df.columns if col not in target_column]

# Scaling by normalizing: divide every predictor by its column maximum.
# This has to run BEFORE X is extracted; otherwise X keeps the raw values
# and the scaling never reaches the models. Re-running the cell is harmless
# because each column maximum is 1 after the first pass.
# NOTE(review): the maxima are taken over all rows, including those that
# later end up in the test split.
df[predictors] = df[predictors]/df[predictors].max()

# Feature matrix (scaled) and flat 1-D target vector.
X = df[predictors].values
y = df[target_column].values
y = np.ravel(y,order = 'C')

df.describe()

Unnamed: 0,age,ap_hi,ap_lo,cholesterol,gluc,active,cardio,bmi
count,68261.0,68261.0,68261.0,68261.0,68261.0,68261.0,68261.0,68261.0
mean,0.82545,0.526209,0.428058,0.454833,0.408652,0.803548,0.494909,0.316405
std,0.105747,0.074533,0.05202,0.226259,0.190629,0.397317,0.499978,0.060887
min,0.609375,-0.625,-0.368421,0.333333,0.333333,0.0,0.0,0.040323
25%,0.75,0.5,0.421053,0.333333,0.333333,1.0,0.0,0.275346
50%,0.828125,0.5,0.421053,0.333333,0.333333,1.0,0.0,0.302995
75%,0.90625,0.583333,0.473684,0.333333,0.333333,1.0,1.0,0.346774
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


We will split our data. 80% of our data will be train data and 20% of it will be test data.

In [6]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=9
)
# Report the shapes in the order: train features, test features, train target, test target.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(54608, 7)
(13653, 7)
(54608,)
(13653,)


## Support Vector Machine (SVM) Algorithm 

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn import svm

#SVM
# Search space: 4 values of C x 4 values of gamma x 3 kernels = 48 combinations.
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
# The randomized search evaluates a sample of these combinations (the recorded
# output shows 10 candidates x 5 folds). Seeding both the search and the
# estimator makes the sampled candidates and the probability calibration
# repeatable across runs; 9 matches the seed used for the train/test split.
# probability=True is needed because later cells call predict_proba.
# NOTE: this search is slow -- the recorded output shows about 220 minutes for
# the first 25 of 50 fits.
clf = RandomizedSearchCV(SVC(probability=True, random_state=9),param_grid,verbose=2, cv=5, n_jobs = -1, random_state=9)
best_clf_svm = clf.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 219.7min


In [None]:
# Show the hyper-parameter combination the SVM search selected
best_clf_svm.best_params_

In [None]:
# best_score_ is the search's mean cross-validated score for the selected
# candidate (computed on the training split); shown as a percentage.
svm_cv_pct = 100*best_clf_svm.best_score_
print("The score for SVM Algorithm is")
print("Training: {:6.2f}%".format(svm_cv_pct))

In [None]:
# `accuracies` maps model name -> test accuracy in percent and feeds the
# comparison chart. SVM is the first model evaluated, so the dict must be
# created here; without it this cell raises NameError on a fresh kernel.
accuracies = {}

acc = best_clf_svm.score(X_test,y_test)*100
accuracies['SVM'] = acc
print("Test Accuracy of SVM Algorithm: {:.2f}%".format(acc))

In [None]:
#AUC
from sklearn.metrics import roc_auc_score, roc_curve
# Model name -> ROC AUC on the test set; later model sections add their own entries.
ROC_AUC_score = {}
# Column 1 of predict_proba holds the probability of class 1 (the positive class).
auc = roc_auc_score(y_test, best_clf_svm.predict_proba(X_test)[:,1])
ROC_AUC_score['SVM'] = auc
# Print the value computed above instead of running predict_proba a second time.
print("ROC AUC score = {:.3f}".format(auc))

In [None]:
#Log loss
from sklearn.metrics import log_loss
# Model name -> log loss on the test set; later model sections add their own entries.
Log_loss ={}
ll=log_loss(y_test, best_clf_svm.predict_proba(X_test))
Log_loss['SVM'] =ll
# Print the value computed above instead of running predict_proba a second time.
print("Log-loss: {:.3f}".format(ll))

In [None]:
#F1 score
# Model name -> F1 score on the test set; later model sections add their own entries.
F1_score = {}
svm_f1_score = f1_score(y_test,best_clf_svm.predict(X_test))
F1_score['SVM'] =svm_f1_score
# Print the value computed above instead of predicting the test set again.
print("F1 score: {:.3f}".format(svm_f1_score))

## Sklearn Logistic Regression

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Search space: L1 or L2 regularisation, with C on a log scale from 1 to 10^4.
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C, penalty=penalty)

# The default 'lbfgs' solver does not support the L1 penalty, so every 'l1'
# candidate would fail to fit. 'liblinear' handles both penalties in the grid.
logit = LogisticRegression(solver='liblinear', max_iter=100000)
# Seed the search so the sampled candidates are the same on every run
# (9 matches the seed used for the train/test split).
clf = RandomizedSearchCV(logit, hyperparameters,cv=5, n_jobs = -1, verbose = 2, random_state=9)
best_clf_logit = clf.fit(X_train,y_train)

In [None]:
# Show the hyper-parameter combination the logistic-regression search selected
best_clf_logit.best_params_

In [None]:
# best_score_ is the search's mean cross-validated score for the selected
# candidate (computed on the training split); shown as a percentage.
logit_cv_pct = 100*best_clf_logit.best_score_
print("The score for Logistics Regression is")
print("Training: {:6.2f}%".format(logit_cv_pct))

In [None]:
# Create the results dict only if no earlier cell has done so. Rebinding it
# unconditionally here would throw away accuracies that were already recorded
# (for example the 'SVM' entry).
if 'accuracies' not in globals():
    accuracies = {}

acc = best_clf_logit.score(X_test,y_test)*100
accuracies['Logistic Regression'] = acc
print("Test Accuracy {:.2f}%".format(acc))

In [None]:
#AUC
from sklearn.metrics import roc_auc_score, roc_curve
# Column 1 of predict_proba holds the probability of class 1 (the positive class).
auc = roc_auc_score(y_test, best_clf_logit.predict_proba(X_test)[:,1])
ROC_AUC_score['Logistic Regression'] = auc
# Print the value computed above instead of running predict_proba a second time.
print("ROC AUC score = {:.3f}".format(auc))

In [None]:
#Log-loss
from sklearn.metrics import log_loss
ll=log_loss(y_test, best_clf_logit.predict_proba(X_test))
Log_loss['Logistic Regression'] =ll
# Print the value computed above instead of running predict_proba a second time.
print("Log-loss: {:.3f}".format(ll))

In [None]:
#F1 score 
logit_f1_score = f1_score(y_test,best_clf_logit.predict(X_test))
F1_score['Logistic Regression'] =logit_f1_score
# Print the value computed above instead of predicting the test set again.
print("F1 score: {:.3f}".format(logit_f1_score))

## Naive Bayes Algorithm

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB,  BernoulliNB

gnb = GaussianNB()
bnb = BernoulliNB()

# Run each 5-fold cross-validation once and keep the fold scores, rather than
# discarding the results and recomputing them inside the print calls.
# The scores are computed on the training split only, like the other model
# sections, so the held-out test rows play no part in the figure reported as
# "Training".
gnb_cv_scores = cross_val_score(gnb, X_train, y_train, cv=5, scoring='accuracy')
bnb_cv_scores = cross_val_score(bnb, X_train, y_train, cv=5, scoring='accuracy')

print("The score for GaussianNB is")
print("Training: {:6.2f}%".format(100*np.mean(gnb_cv_scores)))
print("The score for BernolliNB is")
print("Training: {:6.2f}%".format(100*np.mean(bnb_cv_scores)))

In [None]:
# Fit both Naive Bayes variants on the training split only. Fitting on the
# full X, y would let the models see the test rows they are scored on below,
# which inflates the reported test accuracy.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

bnb = BernoulliNB()
bnb.fit(X_train, y_train)

# Test-set accuracy in percent: `acc` is the Gaussian model, `acc2` the Bernoulli model.
acc = gnb.score(X_test,y_test)*100
acc2 = bnb.score(X_test,y_test)*100
print("Accuracy of Naive Bayes (Gaussian): {:.2f}%".format(acc))
print("Accuracy of Naive Bayes (Bernolli): {:.2f}%".format(acc2))

In [None]:
# Record the value currently held in `acc` under 'Naive Bayes'. In notebook
# order `acc` was last assigned from gnb.score(...), i.e. the Gaussian model.
accuracies['Naive Bayes'] = acc

In [None]:
#AUC
# Column 1 of predict_proba holds the probability of class 1 (the positive class).
auc = roc_auc_score(y_test, gnb.predict_proba(X_test)[:,1])
ROC_AUC_score['Naive Bayes'] = auc
# Print the value computed above instead of running predict_proba a second time.
print("ROC AUC score = {:.2f}".format(auc))

In [None]:
#Log loss
ll=log_loss(y_test, gnb.predict_proba(X_test))
Log_loss['Naive Bayes'] =ll
# Print the value computed above instead of running predict_proba a second time.
print("Log-loss: {:.3f}".format(ll))

In [None]:
#F1 score
gnb_f1_score = f1_score(y_test,gnb.predict(X_test))
F1_score['Naive Bayes'] =gnb_f1_score
# Print the value computed above instead of predicting the test set again.
print("F1 score: {:.3f}".format(gnb_f1_score))

## Decision Tree Algorithm

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Search space for the tree.
# max_features: the Python object None means "consider all features"; the
# string 'None' is not an accepted value and makes those candidates fail.
# 'sqrt' is used in place of the 'auto' alias (which meant sqrt for
# classifiers and is no longer accepted by recent scikit-learn releases).
parameters = {'criterion' : ['gini','entropy'],
                'max_depth': [2,4,6,8,10,12,14,16,18,20],
                'min_samples_leaf': [1, 2, 4],
              'max_features': ['sqrt', 'log2', None],
                'min_samples_split': [2, 5, 10]}
# Seed the estimator and the search so the sampled candidates and the fitted
# trees are repeatable (9 matches the seed used for the train/test split).
dtc = DecisionTreeClassifier(random_state=9)
clf = RandomizedSearchCV(dtc, parameters,cv=5, n_jobs = -1, verbose = 2, random_state=9)
best_clf_dtc = clf.fit(X_train,y_train)

In [None]:
# Show the hyper-parameter combination the decision-tree search selected
best_clf_dtc.best_params_

In [None]:
# best_score_ is the search's mean cross-validated score for the selected
# candidate (computed on the training split); shown as a percentage.
dtc_cv_pct = 100*best_clf_dtc.best_score_
print("The score for Decision Tree is")
print("Training: {:6.2f}%".format(dtc_cv_pct))

In [None]:
# Test-set accuracy in percent, stored for the model comparison and echoed.
acc = 100 * best_clf_dtc.score(X_test, y_test)
accuracies.update({'Decision Tree': acc})
print("Decision Tree Test Accuracy {:.2f}%".format(accuracies['Decision Tree']))

In [None]:
#AUC
# Column 1 of predict_proba holds the probability of class 1 (the positive class).
auc = roc_auc_score(y_test, best_clf_dtc.predict_proba(X_test)[:,1])
ROC_AUC_score['Decision Tree'] = auc
# Print the value computed above instead of running predict_proba a second time.
print("ROC AUC score = {:.3f}".format(auc))

In [None]:
#Log loss
ll=log_loss(y_test, best_clf_dtc.predict_proba(X_test))
Log_loss['Decision Tree'] =ll
# Print the value computed above instead of running predict_proba a second time.
print("Log-loss: {:.3f}".format(ll))

In [None]:
#F1 score
dtc_f1_score = f1_score(y_test,best_clf_dtc.predict(X_test))
F1_score['Decision Tree'] =dtc_f1_score
# Print the value computed above instead of predicting the test set again.
print("F1 score: {:.3f}".format(dtc_f1_score))

## Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Search space for the forest.
# max_features: the Python object None means "consider all features"; the
# string 'None' is not an accepted value and makes those candidates fail.
# 'sqrt' is used in place of the 'auto' alias (which meant sqrt for
# classifiers and is no longer accepted by recent scikit-learn releases).
parameters = {'bootstrap': [True, False],
              'criterion' : ['gini','entropy'],
                'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                'max_features': ['sqrt', 'log2', None],
                'min_samples_leaf': [1, 2, 4],
                 'min_samples_split': [2, 5, 10],
                'n_estimators': [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000]}

# Seed the estimator and the search so the sampled candidates and the fitted
# forests are repeatable (9 matches the seed used for the train/test split).
rf = RandomForestClassifier(random_state=9)
clf = RandomizedSearchCV(rf, parameters,cv=5, n_jobs = -1, verbose = 2, random_state=9)
best_clf_rf = clf.fit(X_train,y_train)

In [None]:
# Show the hyper-parameter combination the random-forest search selected
best_clf_rf.best_params_

In [None]:
# best_score_ is the search's mean cross-validated score for the selected
# candidate (computed on the training split); shown as a percentage.
rf_cv_pct = 100*best_clf_rf.best_score_
print("The score for Random Forest is")
print("Training: {:6.2f}%".format(rf_cv_pct))

In [None]:
# Test-set accuracy in percent, stored for the model comparison and echoed.
acc = 100 * best_clf_rf.score(X_test,y_test)
accuracies.update({'Random Forest': acc})
print("Random Forest Algorithm Accuracy Score : {:.2f}%".format(accuracies['Random Forest']))

In [None]:
#AUC
# Column 1 of predict_proba holds the probability of class 1 (the positive class).
auc = roc_auc_score(y_test, best_clf_rf.predict_proba(X_test)[:,1])
ROC_AUC_score['Random Forest'] = auc
# Print the value computed above instead of running predict_proba a second time.
print("ROC AUC score = {:.3f}".format(auc))

In [None]:
#Log loss
ll=log_loss(y_test, best_clf_rf.predict_proba(X_test))
Log_loss['Random Forest'] =ll
# Print the value computed above instead of running predict_proba a second time.
print("Log-loss: {:.3f}".format(ll))

In [None]:
#F1 score
rf_f1_score = f1_score(y_test,best_clf_rf.predict(X_test))
F1_score['Random Forest'] =rf_f1_score
# Print the value computed above instead of predicting the test set again.
print("F1 score: {:.3f}".format(rf_f1_score))

## K-Nearest Neighbour (KNN) Classification

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Exhaustive 5-fold grid search over every neighbour count from 1 to 29.
knn = KNeighborsClassifier()
parameters = dict(n_neighbors=list(np.arange(1,30,1)))
clf = GridSearchCV(
    estimator=knn,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
best_clf_knn = clf.fit(X_train,y_train)

In [None]:
# Show the neighbour count the KNN grid search selected
best_clf_knn.best_params_

In [None]:
# best_score_ is the search's mean cross-validated score for the selected
# candidate (computed on the training split); shown as a percentage.
knn_cv_pct = 100*best_clf_knn.best_score_
print("The score for K-Nearest Neighbour (KNN) is")
print("Training: {:6.2f}%".format(knn_cv_pct))

In [None]:
# Test-set accuracy in percent, stored for the model comparison and echoed.
acc = 100 * best_clf_knn.score(X_test,y_test)
accuracies.update({'KNN': acc})
print("Test Accuracy {:.2f}%".format(accuracies['KNN']))

In [None]:
#AUC
# Column 1 of predict_proba holds the probability of class 1 (the positive class).
auc = roc_auc_score(y_test, best_clf_knn.predict_proba(X_test)[:,1])
ROC_AUC_score['KNN'] = auc
# Print the value computed above instead of running predict_proba a second time.
print("ROC AUC score = {:.3f}".format(auc))

In [None]:
#Log loss
ll=log_loss(y_test, best_clf_knn.predict_proba(X_test))
Log_loss['KNN'] =ll
# Print the value computed above instead of running predict_proba a second time.
print("Log-loss: {:.3f}".format(ll))

In [None]:
# F1 score
knn_f1_score = f1_score(y_test,best_clf_knn.predict(X_test))
F1_score['KNN'] =knn_f1_score
# Print the value computed above instead of predicting the test set again.
print("F1 score: {:0.3f}".format(knn_f1_score))

## Comparing Models

In [None]:
# Model name -> test accuracy in percent, as recorded by the model sections above
accuracies

In [None]:
# Model name -> ROC AUC on the test set, as recorded by the model sections above
ROC_AUC_score

In [None]:
# Model name -> log loss on the test set, as recorded by the model sections above
Log_loss

In [None]:
# Model name -> F1 score on the test set, as recorded by the model sections above
F1_score

In [None]:
# One bar colour per model.
colors = ["purple", "green", "orange", "magenta","#CFC60E","#0FBBAE"]

# Bar chart of test accuracy (percent) per algorithm, drawn on an explicit Axes.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(16,5))
ax.set_yticks(np.arange(0,100,10))
ax.set_ylabel("Accuracy %")
ax.set_xlabel("Algorithms")
sns.barplot(
    x=list(accuracies.keys()),
    y=list(accuracies.values()),
    palette=colors,
    ax=ax,
)
ax.set_title('Comparison on Accuracy')
plt.show()

In [None]:
# One bar colour per model.
colors = ["purple", "green", "orange", "magenta","#CFC60E","#0FBBAE"]

# Bar chart of test-set ROC AUC per algorithm, drawn on an explicit Axes.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(16,5))
ax.set_yticks(np.arange(0,1,0.1))
ax.set_ylabel("ROC-AUC Score")
ax.set_xlabel("Algorithms")
sns.barplot(
    x=list(ROC_AUC_score.keys()),
    y=list(ROC_AUC_score.values()),
    palette=colors,
    ax=ax,
)
ax.set_title('Comparison of ROC-AUC Score')
plt.show()

In [None]:
# One bar colour per model.
colors = ["purple", "green", "orange", "magenta","#CFC60E","#0FBBAE"]

# Bar chart of test-set log loss per algorithm.
# Log loss has no upper bound, so the y ticks are left to matplotlib: a fixed
# 0.0-0.9 tick grid would leave any bar taller than 0.9 without tick labels.
sns.set_style("whitegrid")
plt.figure(figsize=(16,5))
plt.ylabel("Log_loss")
plt.xlabel("Algorithms")
sns.barplot(x=list(Log_loss.keys()), y=list(Log_loss.values()), palette=colors)
plt.title('Comparison of Log-Loss')
plt.show()

In [None]:
# One bar colour per model.
colors = ["purple", "green", "orange", "magenta","#CFC60E","#0FBBAE"]

# Bar chart of test-set F1 score per algorithm, drawn on an explicit Axes.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(16,5))
ax.set_yticks(np.arange(0,1,0.1))
ax.set_ylabel("F1_score")
ax.set_xlabel("Algorithms")
sns.barplot(
    x=list(F1_score.keys()),
    y=list(F1_score.values()),
    palette=colors,
    ax=ax,
)
ax.set_title('Comparison of F1 score')
plt.show()

## Confusion Matrix

In [None]:
# Predicted values
# Test-set class predictions from each fitted model, used for the confusion matrices.
y_head_logit = best_clf_logit.predict(X_test)
y_head_knn = best_clf_knn.predict(X_test)
# Use the fitted search object: the name `svm` is bound to the sklearn.svm
# module (from `from sklearn import svm`), which has no predict().
y_head_svm = best_clf_svm.predict(X_test)
y_head_gnb = gnb.predict(X_test)
y_head_dtc = best_clf_dtc.predict(X_test)
y_head_rf = best_clf_rf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# One confusion matrix per model, each comparing y_test against that model's
# test-set predictions. The target names on the left line up positionally with
# the prediction arrays in the tuple below.
cm_logit, cm_knn, cm_svm, cm_gnb, cm_dtc, cm_rf = (
    confusion_matrix(y_test, y_pred)
    for y_pred in (
        y_head_logit,
        y_head_knn,
        y_head_svm,
        y_head_gnb,
        y_head_dtc,
        y_head_rf,
    )
)

In [None]:
plt.figure(figsize=(24,12))

plt.suptitle("Confusion Matrixes",fontsize=24)
plt.subplots_adjust(wspace = 0.4, hspace= 0.4)

# (panel title, confusion matrix) pairs in the order they fill the 2x3 grid,
# left to right and top to bottom.
panels = [
    ("Logistic Regression Confusion Matrix", cm_logit),
    ("K Nearest Neighbors Confusion Matrix", cm_knn),
    ("Support Vector Machine Confusion Matrix", cm_svm),
    ("Naive Bayes Confusion Matrix", cm_gnb),
    ("Decision Tree Classifier Confusion Matrix", cm_dtc),
    ("Random Forest Confusion Matrix", cm_rf),
]

# Draw each matrix as an annotated heatmap of integer counts in its own subplot.
for position, (panel_title, matrix) in enumerate(panels, start=1):
    plt.subplot(2,3,position)
    plt.title(panel_title)
    sns.heatmap(matrix,annot=True,cmap="Blues",fmt="d",cbar=False, annot_kws={"size": 24})

plt.show()

#### Basic terminology

0: negative class
1: positive class

True positive (TP): Prediction is +ve and X has CV diseases, we want that

True negative (TN): Prediction is -ve and X is healthy, we want that too

False positive (FP): Prediction is +ve and X is healthy, false alarm, bad

False negative (FN): Prediction is -ve and X has CV diseases, the worst

## Model chosen to evaluate further: Random Forest Classification (To continue in Part 3)