# Heart Disease UCI

https://archive.ics.uci.edu/ml/datasets/Heart+Disease

Dataset information
- age
- sex
- chest pain type (4 values)
- resting blood pressure
- serum cholesterol in mg/dl
- fasting blood sugar > 120 mg/dl
- resting electrocardiographic results (values 0,1,2)
- maximum heart rate achieved
- exercise induced angina
- oldpeak = ST depression induced by exercise relative to rest
- the slope of the peak exercise ST segment
- number of major vessels (0-3) colored by fluoroscopy
- thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

The "target" field refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4.

### Importing libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline

# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

### Importing Dataset

In [None]:
# Read the heart-disease CSV at the relative path below into a DataFrame.
raw_data = pd.read_csv('../input/heart-disease-uci/heart.csv')

### Exploratory Data Analysis (EDA)

In [None]:
# Preview the first rows of the table.
raw_data.head()

In [None]:
# Preview the last rows of the table.
raw_data.tail()

In [None]:
# Count missing values per column.
raw_data.isna().sum()

In [None]:
# (rows, columns) of the raw table.
raw_data.shape

In [None]:
# Summary statistics for each column.
raw_data.describe()

In [None]:
# Plot how the values of the 'target' column are distributed.
sns.histplot(raw_data['target'])
# Only label the x axis at 0 and 1.
plt.xticks([0,1])
plt.show()

In [None]:
# Number of rows per distinct 'target' value.
raw_data['target'].value_counts()

It looks like the dataset is roughly balanced.

In [None]:
# Data type of each column.
raw_data.dtypes

In [None]:
# Pairwise correlation between the columns of the raw table.
corr = raw_data.corr()
# Render the correlation matrix as a heatmap.
sns.heatmap(corr)
plt.show()

#### Dealing with categorical variables

In [None]:
# One-hot encode the listed columns; all other columns are passed through unchanged.
data_with_dummies = pd.get_dummies(data=raw_data, columns=['sex','cp','fbs','restecg','exang','slope','ca','thal'])

#### Training and testing data

In [None]:
# Separate the label column ('target') from the predictor columns.
y = data_with_dummies['target']
x = data_with_dummies.drop(columns='target')

In [None]:
# Hold out 20% of the rows for testing. The split uses the same seed as the
# final-model split further down (random_state=3), so the run is reproducible
# and the rows the final model is evaluated on are never part of the training
# data used to compare algorithms.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=3)

In [None]:
# (rows, columns) of the training split.
X_train.shape

### Scale columns

In [None]:
# Candidate classifiers, keyed by the short label used as the pipeline step name.
estimators = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Wrap every classifier in its own pipeline behind a fresh StandardScaler and
# name the pipeline 'Scaled<label>'.
pipelines = [
    (f'Scaled{label}', Pipeline([('Scaler', StandardScaler()), (label, estimator)]))
    for label, estimator in estimators
]


In [None]:
# Cross-validation settings shared by every candidate pipeline.
n_splits = 10
seed = 7
results = []
names = []

# Build the splitter once, outside the loop, and seed it from `seed` instead of
# a repeated literal. With an integer random_state the splitter yields the same
# shuffled folds on every call, so all pipelines are scored on identical folds.
kfold = KFold(n_splits=n_splits, random_state=seed, shuffle=True)

for name, model in pipelines:
    # One accuracy score per fold for this pipeline.
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')

    results.append(cv_results)
    names.append(name)

    # Report mean and standard deviation of the fold accuracies.
    msg = f'Resultado: {name} - {cv_results.mean()} - {cv_results.std()}'
    print(msg)

In [None]:
# One box per pipeline, showing the spread of its per-fold accuracies.
fig, ax = plt.subplots()
fig.suptitle('Scaled Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

### Tuning and GridSearch KNN for better results

In [None]:
x = data_with_dummies.drop('target', axis=1)
y = data_with_dummies['target']
# Seed the split with the same value as the final-model cell (random_state=3)
# so the rows held out for the final evaluation are never seen while tuning.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=3)

# Scale inside a pipeline: GridSearchCV then refits the scaler on the training
# part of every fold, instead of scaling with statistics that already include
# the validation fold.
model = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsClassifier())])

# n_neighbors must be at least 1 and cannot exceed the number of samples the
# classifier is fitted on, i.e. the smallest training fold produced by KFold.
largest_validation_fold = -(-X_train.shape[0] // n_splits)  # ceiling division
max_neighbors = X_train.shape[0] - largest_validation_fold
n_neighbors = list(range(1, max_neighbors + 1, 5))
param_grid = {'KNN__n_neighbors': n_neighbors}

kfold = KFold(n_splits=n_splits, random_state=seed, shuffle=True)

grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=kfold)
grid_result = grid.fit(X_train, y_train)

# The pipeline prefixes the parameter with its step name; expose the plain
# value for the final-model cell.
best_n_neighbors = grid_result.best_params_['KNN__n_neighbors']

print('Mean Accuracy: %.3f' % grid_result.best_score_)
print('Config: %s' % {'n_neighbors': best_n_neighbors})

## Define Final Model

In [None]:
x = data_with_dummies.drop('target', axis=1)
y = data_with_dummies['target']
# Fixed seed so the final train/test split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=3)


# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# KNN with the neighbour count selected by the grid search.
model = KNeighborsClassifier(n_neighbors=best_n_neighbors)
kfold = KFold(n_splits=n_splits, random_state=seed, shuffle=True)

model.fit(X_train_scaled, y_train)

# Accuracy on the held-out test split.
accuracy = model.score(X_test_scaled, y_test)
print(f"The accuracy of the model is {accuracy}")

y_pred = model.predict(X_test_scaled)
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted classes. Passing them the other way round transposes
# the matrix and swaps false positives with false negatives.
conf_matrix = confusion_matrix(y_test, y_pred)