## Introduction

The goal of working with the Heart Disease UCI dataset is to develop a model that classifies whether a person has heart disease, based on a number of clinical variables and a labelled target column.

In this notebook, we will go through some necessary steps of cleaning and visualizing the data to get a better understanding of our problem and finally choose the best model for our dataset.

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Matplotlib style


In [None]:
# Use the 'fivethirtyeight' matplotlib style sheet for the figures drawn below
plt.style.use('fivethirtyeight')

## Import dataset


In [None]:
import os

# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Location of the heart-disease CSV inside the Kaggle input directory
DATA_PATH = "/kaggle/input/heart-disease-uci/heart.csv"
dataset = pd.read_csv(DATA_PATH)

## Data Describing 

In [None]:
# Check the data type of every column
dataset.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dataset.describe()

In [None]:
# Number of missing values per column
dataset.isna().sum()

In [None]:
# Number of rows that repeat an earlier row exactly
dataset.duplicated(keep='first').sum()


In [None]:
# Keep only the first occurrence of every duplicated row
dataset = dataset.drop_duplicates()


In [None]:
# Move the current row labels into a new 'index' column; the plotting code
# below counts samples through that column
dataset = dataset.reset_index(level=0)

## Target distribution by features

In [None]:
def plot_target(dataframe, column, title,  fontsize=11, figsize_=(18, 10), percentage=False, rot_=0):
    '''
    Plot the target distribution for each value of a feature as stacked bars.

    Parameters:
        dataframe: Dataset that holds the feature and a 'target' column
        column: Name of the feature whose distribution is plotted
        title: Title of the plot
        fontsize: Font size of the x-axis tick labels
        figsize_: Figure size (width, height) passed to the pandas bar plot
        percentage: If True, label every bar segment with its share of the
            samples that have the same feature value
        rot_: Rotation of the x-axis tick labels, in degrees
    '''
    # Count samples per (feature value, target) pair. size() counts rows
    # directly, so the dataframe does not need a helper 'index' column.
    feature = dataframe.groupby(
        [column, 'target']).size().unstack(level=1, fill_value=0)

    ax_1 = feature.plot(kind='bar',
                        figsize=figsize_,
                        stacked=True,
                        rot=rot_,
                        title=title)

    # Add percentage to the plot
    if percentage:
        row_totals = feature.sum(axis=1)
        n_rows = len(feature)
        for position, bar in enumerate(ax_1.patches):
            height = bar.get_height()
            # An empty segment has nothing to label; a "0%" text would only
            # overlap the neighbouring label.
            if height == 0:
                continue
            # The patch list is walked with a row index that wraps around
            # after n_rows patches (one pass per target column).
            total = row_totals.iloc[position % n_rows]
            ax_1.text(bar.get_x() + bar.get_width() / 2,
                      bar.get_y() + height / 2,
                      '{:.0f}%'.format((height / total) * 100),
                      ha='center',
                      va='center')

    # Rename legend entries (target 0 -> Healthy, target 1 -> Patient)
    ax_1.legend(['Healthy', 'Patient'])
    # Labels
    ax_1.set_ylabel('Number of samples')
    # Set x tick label font size
    ax_1.tick_params(axis='x', labelsize=fontsize)
    ax_1.figure.tight_layout()


In [None]:
# Target distribution for every age value in the data
plot_target(dataframe=dataset, column='age', title='Age Feature')

In [None]:
plot_target(dataset, 'sex', 'Gender', figsize_=(12, 10), percentage=True)
# 0 for female, 1 for male

### We can clearly see that, in this dataset, females are more likely to have heart disease than males

In [None]:
# Target distribution per chest pain type, with percentage labels
plot_target(dataframe=dataset, column='cp', title='Chest pain type',
            percentage=True, figsize_=(15, 10))

### People with chest pain type 1 are more likely to have heart disease

In [None]:
# Many distinct pressure values, so the tick labels are rotated to stay readable
plot_target(dataframe=dataset, column='trestbps',
            title='Resting blood pressure', rot_=70)

## The chol feature has many unique values, so plotting it this way won't be useful

In [None]:
plot_target(dataset, 'fbs', 'Fasting blood sugar', percentage=True)
# 1 True, 0 False

In [None]:
# Target distribution per resting ECG result (plot title spelling corrected)
plot_target(dataset, 'restecg', 'Resting electrocardiographic results', percentage=True)

## The thalach feature has a lot of unique values, so we only plot the maximum heart rates with the highest number of patients

In [None]:
# Count samples per (maximum heart rate, target) pair, then keep only the
# heart rates whose patient count (target == 1) is at least the average
# patient count across all heart rates
heart_rate = dataset[['thalach', 'target', 'index']].groupby(
        ['thalach', 'target'])['index'].count().unstack(level=1, fill_value=0)
heart_rate = heart_rate[heart_rate[1] >= heart_rate[1].mean()]

# Draw on an explicit axes instead of the implicit pyplot state
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(heart_rate[1], color='red')
ax.set_xlabel('Maximum heart rate achieved')  # label typo "reate" fixed
ax.set_ylabel('Num of people')
ax.set_title('Has heart disease')
fig.tight_layout()

In [None]:
plot_target(dataset, 'exang', 'Exercise induced angina(chest pain)', percentage=True, figsize_=(8,7))
# No: 0, Yes: 1

### Most of the patients didn't have exercise induced angina

In [None]:
# Old peak feature: number of samples per oldpeak value, one line per target class
feature = dataset[['oldpeak', 'target', 'index']].groupby(
        ['oldpeak', 'target'])['index'].count().unstack(level=1, fill_value=0)
plt.figure(figsize=(15, 5))
# Label both lines so the two classes can be told apart in the figure;
# the names match the legend used by plot_target (0 -> Healthy, 1 -> Patient)
plt.plot(feature[0], label='Healthy')
plt.plot(feature[1], label='Patient')
plt.legend()
plt.title('ST depression induced by exercise relative to rest')
plt.xlabel('old peak')
plt.ylabel('Num of people')
plt.tight_layout()

In [None]:
# Target distribution per ST-segment slope value, with percentage labels
plot_target(dataframe=dataset, column='slope',
            title='The slope of the peak exercise ST segment',
            percentage=True, figsize_=(8, 7))

In [None]:
# Target distribution per value of the ca feature
plot_target(dataframe=dataset, column='ca', title='Number of major vessels')

In [None]:
# Target distribution per thal value, with percentage labels
plot_target(dataframe=dataset, column='thal', title='Thal(Blood disorder)',
            percentage=True)

## Check for outliers



In [None]:
# Box-plot every feature that has more than two distinct values.
# The columns are selected by name instead of by a fixed position range, so
# the helper 'index' column and the 'target' label are skipped and no feature
# at the end of the frame (such as 'thal') is left out.
feature_cols = [name for name in dataset.columns
                if name not in ('index', 'target') and dataset[name].nunique() > 2]

# Three panels per row; the number of rows follows from the number of features
n_cols = 3
n_rows = max(1, -(-len(feature_cols) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 10), squeeze=False)
axes = axes.ravel()

for ax, name in zip(axes, feature_cols):
    ax.boxplot(dataset[name], vert=False)
    ax.set_title(name)

# Hide the panels that received no feature
for ax in axes[len(feature_cols):]:
    ax.set_visible(False)

# Lay out after the titles exist so they are taken into account
fig.tight_layout(h_pad=2, w_pad=2)

In [None]:
# Horizontal box plot of the thal feature on the current axes
thal_ax = plt.gca()
thal_ax.boxplot(dataset['thal'], vert=False)
thal_ax.set_title('thal')

In [None]:
# Summary statistics again, to compare the min/max of each column with the box plots
dataset.describe()

In [None]:
# How often each distinct thal value occurs
dataset.loc[:, 'thal'].value_counts()

In [None]:
# Replace outliers in thal feature (value 0) with the median.
# The chained form dataset[cond]['thal'] = ... assigns into a temporary copy
# and leaves the dataset unchanged; Series.mask builds the corrected column,
# which is then assigned back as a whole.
thal_outliers = dataset['thal'] == 0
dataset['thal'] = dataset['thal'].mask(thal_outliers, dataset['thal'].median())
plt.boxplot(dataset['thal'], vert=False)
plt.title('thal')

In [None]:
# Distribution of thal values after the replacement step
dataset.loc[:, 'thal'].value_counts()

In [None]:
# How often each distinct ca value occurs
dataset.loc[:, 'ca'].value_counts()

In [None]:
# Values must be between (0-3), replace values equal 4 with the median.
# Assign the corrected column back as a whole: the chained form
# dataset[cond]['ca'] = ... only modifies a temporary copy.
ca_outliers = dataset['ca'] == 4
dataset['ca'] = dataset['ca'].mask(ca_outliers, dataset['ca'].median())
plt.boxplot(dataset['ca'], vert=False)
plt.title('ca')

In [None]:
# Occurrences of each oldpeak value, ordered by the value itself
oldpeak_counts = dataset['oldpeak'].value_counts(sort=False)
oldpeak_counts.sort_index()

In [None]:
# Replace values greater than or equal 4 with the median value.
# Assign the corrected column back as a whole: the chained form
# dataset[cond]['oldpeak'] = ... only modifies a temporary copy.
oldpeak_outliers = dataset['oldpeak'] >= 4
dataset['oldpeak'] = dataset['oldpeak'].mask(oldpeak_outliers, dataset['oldpeak'].median())
plt.boxplot(dataset['oldpeak'], vert=False)
plt.title('oldpeak')

In [None]:
# Occurrences of the 20 lowest chol values
chol_counts = dataset['chol'].value_counts(sort=False)
chol_counts.sort_index().iloc[:20]

In [None]:
# Replace values greater than or equal to 350 or less than 120 with the median.
# Assign the corrected column back as a whole: the chained form
# dataset[cond]['chol'] = ... only modifies a temporary copy. mask() also
# copes with a fractional median in an integer column.
chol_outliers = (dataset['chol'] >= 350) | (dataset['chol'] < 120)
dataset['chol'] = dataset['chol'].mask(chol_outliers, dataset['chol'].median())
plt.boxplot(dataset['chol'], vert=False)
plt.title('chol')

In [None]:
# Occurrences of each trestbps value, highest pressure first
trestbps_counts = dataset['trestbps'].value_counts(sort=False)
trestbps_counts.sort_index(ascending=False)

In [None]:
# Replace values greater than 170 or less than 90 with the median value.
# Two fixes: the target column is 'trestbps' (the original wrote to a
# non-existent 'trest' column), and the corrected column is assigned back as a
# whole because the chained form dataset[cond][col] = ... only modifies a
# temporary copy.
trestbps_outliers = (dataset['trestbps'] > 170) | (dataset['trestbps'] < 90)
dataset['trestbps'] = dataset['trestbps'].mask(trestbps_outliers, dataset['trestbps'].median())
plt.boxplot(dataset['trestbps'], vert=False)
plt.title('trestbps')

In [None]:
# Display the dataset after the outlier-handling cells
dataset

## Check for variables correlation

In [None]:
# Pairwise correlation between the features and the target.
# The helper 'index' column only stores row labels, so it is excluded;
# errors='ignore' keeps the cell working if that column is absent.
correlation = dataset.drop(columns='index', errors='ignore').corr()

# Plot correlation
plt.figure(figsize=(19, 15))
sns.heatmap(correlation, xticklabels=correlation.columns.values,
            yticklabels=correlation.columns.values, annot=True, annot_kws={'size':10})

# Axis ticks size
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show()

### There are no highly correlated variables

# Model selection

In [None]:
# Shuffle rows and reset index.
# drop=True discards the old row labels. Without it, reset_index() stores them
# in a new 'level_0' column (the name 'index' is already taken), and that
# row-position column would remain in the feature matrix below.
# A fixed seed keeps the shuffle reproducible.
dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Drop the helper index column; errors='ignore' lets the cell be re-run safely
dataset = dataset.drop(columns='index', errors='ignore')

# Features and label
x = dataset.drop(columns='target')
y = dataset['target']

## Perceptron algorithm

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Perceptron

# shuffle=True reorders the training data every epoch; a fixed random_state
# makes the cross-validated score reproducible between runs
clf = Perceptron(shuffle=True, penalty='l2', max_iter=1000, random_state=42)
# Mean accuracy over the 10 cross-validation folds
scores = cross_val_score(clf, x, y, cv=10).mean()
scores

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Average accuracy of a default logistic regression over 10 folds
clf = LogisticRegression()
fold_scores = cross_val_score(estimator=clf, X=x, y=y, cv=10)
scores = fold_scores.sum() / 10
scores

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Average accuracy of Gaussian naive Bayes over 10 folds
clf = GaussianNB()
fold_scores = cross_val_score(estimator=clf, X=x, y=y, cv=10)
scores = fold_scores.sum() / 10
scores

## K-nearest neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Average accuracy of a 3-nearest-neighbours classifier over 10 folds
clf = KNeighborsClassifier(n_neighbors=3)
fold_scores = cross_val_score(estimator=clf, X=x, y=y, cv=10)
scores = fold_scores.sum() / 10
scores

## Support vector machines

In [None]:
from sklearn import svm

# Average accuracy of a linear-kernel SVM over 10 folds
clf = svm.SVC(kernel='linear')
fold_scores = cross_val_score(estimator=clf, X=x, y=y, cv=10)
scores = fold_scores.sum() / 10
scores

## Decision tree classification

In [None]:
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted trees, and therefore the
# cross-validated score, reproducible between runs
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
# Mean accuracy over the 10 cross-validation folds
scores = cross_val_score(clf, x, y, cv=10).mean()
scores

In [None]:
import xgboost as xgb

# Average accuracy of a gradient-boosted tree ensemble over 10 folds
clf = xgb.XGBClassifier(
    learning_rate=0.1,
    max_depth=6,
    n_estimators=500,
    n_jobs=-1,
)
fold_scores = cross_val_score(estimator=clf, X=x, y=y, cv=10)
scores = fold_scores.sum() / 10
scores

### We choose the SVM model

## Model evaluation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing.
# random_state makes the split reproducible; stratify=y keeps the class ratio
# the same in the training and test parts, which matters on a small dataset.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, shuffle=True, random_state=42, stratify=y)

In [None]:
from sklearn import svm
from sklearn.metrics import confusion_matrix

# Fit a linear SVM on the training part and compare its predictions on the
# held-out part with the true labels
clf = svm.SVC(kernel='linear').fit(x_train, y_train)
y_pred = clf.predict(x_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

## Findings

#### With the SVM model we obtained nearly 99% accuracy. Because such a high accuracy can be a sign of overfitting, we shuffled the data and split it into training and test sets to check; the accuracy did not change much, which may be due to the small size of the dataset. An accuracy this high on so small a dataset should also be checked for target leakage (for example, a row-index column left among the features).