# **PREDICTING HEART DISEASE FROM CLINICAL AND LABORATORY DATA USING KNN**

## **INTRODUCTION**

Imports:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
%matplotlib inline

Loading the dataset:

In [None]:
# Read the heart-disease CSV into a DataFrame and preview its first rows.
csv_path = "/kaggle/input/heart-disease-uci/heart.csv"
data = pd.read_csv(csv_path)

data.head()

Shape:

In [None]:
# (rows, columns) of the DataFrame loaded from the CSV.
data.shape

Variable types:

In [None]:
# Per-column dtypes as inferred by pandas when reading the CSV.
data.dtypes

## **EDA AND PRE-PROCESSING**




### Outcome variable count:

In [None]:
# Bar chart of how many rows fall into each value of the outcome column 'target'.
sns.catplot(
    data=data,
    x='target',
    kind='count',
    palette='ch:.25',
)

### Categorical predictive variables:

sex:

In [None]:
# Row counts per 'sex' value, with separate bars for each 'target' class.
sns.catplot(
    data=data,
    x='sex',
    hue='target',
    kind='count',
    palette='ch:.25',
)

cp:

In [None]:
# Row counts per 'cp' value, with separate bars for each 'target' class.
sns.catplot(
    data=data,
    x='cp',
    hue='target',
    kind='count',
    palette='ch:.25',
)

fbs:

In [None]:
# Row counts per 'fbs' value, with separate bars for each 'target' class.
sns.catplot(
    data=data,
    x='fbs',
    hue='target',
    kind='count',
    palette='ch:.25',
)

restecg:

In [None]:
# Row counts per 'restecg' value, with separate bars for each 'target' class.
sns.catplot(
    data=data,
    x='restecg',
    hue='target',
    kind='count',
    palette='ch:.25',
)

exang:

In [None]:
# Row counts per 'exang' value, with separate bars for each 'target' class.
sns.catplot(
    data=data,
    x='exang',
    hue='target',
    kind='count',
    palette='ch:.25',
)

slope:

In [None]:
# Row counts per 'slope' value, with separate bars for each 'target' class.
sns.catplot(
    data=data,
    x='slope',
    hue='target',
    kind='count',
    palette='ch:.25',
)

ca:

In [None]:
# Row counts per 'ca' value, with separate bars for each 'target' class.
sns.catplot(
    data=data,
    x='ca',
    hue='target',
    kind='count',
    palette='ch:.25',
)

thal:

In [None]:
# Row counts per 'thal' value, with separate bars for each 'target' class.
sns.catplot(
    data=data,
    x='thal',
    hue='target',
    kind='count',
    palette='ch:.25',
)

### Distributional predictive variables:




In [None]:
# Summary statistics for the columns that are plotted as histograms below.
numeric_columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
data[numeric_columns].describe()

In [None]:
# Histogram of 'age' with the 'target' classes stacked within each bin.
sns.displot(
    data=data,
    x='age',
    hue='target',
    multiple='stack',
    palette='ch:.25',
)

In [None]:
# Histogram of 'trestbps' with the 'target' classes stacked within each bin.
sns.displot(
    data=data,
    x='trestbps',
    hue='target',
    multiple='stack',
    palette='ch:.25',
)

In [None]:
# Histogram of 'chol' with the 'target' classes stacked within each bin.
sns.displot(
    data=data,
    x='chol',
    hue='target',
    multiple='stack',
    palette='ch:.25',
)

In [None]:
# Histogram of 'thalach' with the 'target' classes stacked within each bin.
sns.displot(
    data=data,
    x='thalach',
    hue='target',
    multiple='stack',
    palette='ch:.25',
)

In [None]:
# Histogram of 'oldpeak' with the 'target' classes stacked within each bin.
sns.displot(
    data=data,
    x='oldpeak',
    hue='target',
    multiple='stack',
    palette='ch:.25',
)

## **SPLITTING AND PRE-PROCESSING:**

Defining x_train, x_test, y_train and y_test:

In [None]:
# Features are every column except the outcome; the outcome column is the label.
x = data.drop(columns='target')
y = data['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

Scaling the data:

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both the training and the test features.
sc = StandardScaler()
sc.fit(x_train)
x_train, x_test = sc.transform(x_train), sc.transform(x_test)

## **TRAINING THE MODEL:**

Parameters for grid search:

In [None]:
# Base estimator plus the hyperparameter grid to search:
# odd neighbour counts from 3 to 11 and both neighbour-weighting schemes.
knn = KNeighborsClassifier()
parameters = {
    'n_neighbors': list(range(3, 12, 2)),
    'weights': ['uniform', 'distance'],
}

Fitting the training data with grid search:

In [None]:
# 4-fold cross-validated search over the grid, ranking candidates by accuracy.
grid = GridSearchCV(
    estimator=knn,
    param_grid=parameters,
    scoring='accuracy',
    cv=4,
)
grid.fit(x_train, y_train)



Displaying the best parameters:

In [None]:
# Show the hyperparameter combination selected by the grid search.
print(grid.best_params_)

Picking the best model:

In [None]:
# Use the best estimator found by the grid search for all evaluation below.
model=grid.best_estimator_

## **EVALUATING THE MODEL:**

Model score on test data:

In [None]:
# Score of the selected classifier on the held-out (scaled) test split.
model.score(x_test,y_test)

Confusion matrix:

In [None]:
# Predict on the held-out test features and tabulate predictions against the true labels.
predictions = model.predict(x_test)
cm = metrics.confusion_matrix(y_test, predictions)
cm = pd.DataFrame(cm)

# scikit-learn's confusion matrix has true labels on the rows and predicted
# labels on the columns; label the axes so the heatmap cannot be misread.
# fmt='d' prints the cell counts as plain integers instead of the default
# '.2g' format, which switches to scientific notation for counts >= 100.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion matrix (test set)')
plt.show()

Calculating sensitivity, specificity, PPV and NPV:

In [None]:
# Derive the four confusion-matrix counts from the actual test-set predictions
# instead of hard-coding numbers read off a previous run, so the metrics stay
# correct when the data, the split or the selected model changes.
# For binary labels ordered [0, 1], ravel() yields TN, FP, FN, TP.
# NOTE(review): assumes 'target' is encoded 0/1 with 1 as the positive class - confirm.
TN, FP, FN, TP = (
    int(count)
    for count in metrics.confusion_matrix(y_test, predictions, labels=[0, 1]).ravel()
)

# All four rates are expressed as percentages.
sensitivity = TP / (TP + FN) * 100  # true positive rate
specificity = TN / (TN + FP) * 100  # true negative rate
ppv = TP / (TP + FP) * 100          # positive predictive value (precision)
npv = TN / (TN + FN) * 100          # negative predictive value


Printing sensitivity, specificity, PPV and NPV:

In [None]:
# Report the four rates (already expressed as percentages) on a single line.
print(
    'Sensitivity:', sensitivity, '% ',
    'Specificity:', specificity, '% ',
    'positive predictive value:', ppv, '% ',
    'negative predictive value:', npv, '%',
)

AUC score:

In [None]:
# Take column index 1 of the predicted class probabilities as the score,
# then compute the area under the ROC curve against the true test labels.
probs = model.predict_proba(x_test)[:, 1]
auc = metrics.roc_auc_score(y_true=y_test, y_score=probs)
print(auc)

ROC curve:

In [None]:
# False/true positive rates over all decision thresholds of the predicted scores.
fpr, tpr, _ = metrics.roc_curve(y_test, probs)

# Use an explicit figure/axes pair and label everything so the plot stands on
# its own: the model's curve against the diagonal of a chance-level classifier.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='KNN')
ax.plot([0, 1], [0, 1], linestyle='--', label='Chance')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROC curve (test set)')
ax.grid()
ax.legend()
plt.show()