In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from pandas.plotting import parallel_coordinates
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from pylab import rcParams
# Default size (width, height) for every matplotlib figure created in this notebook.
rcParams['figure.figsize'] = 15, 10

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import plot_tree

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the heart-disease table from the Kaggle input mount.
heart_csv_path = "../input/heart-disease-uci/heart.csv"
df = pd.read_csv(heart_csv_path)

# Summary statistics, rendered as the cell output.
df.describe()

The categorical variables are stored as integer codes, which are not meaningful until we look up what each code stands for. We also notice an anomaly: the variable thal takes values from 0 to 3, i.e. 4 distinct values, but it is defined for only 3 categories. The value 0 for thal is out of place, so let's remove those rows before proceeding further.

In [None]:
# Report the table size before any rows are removed.
row_count, column_count = df.shape
print(f'There are {row_count} rows and {column_count} columns in the dataframe')

In [None]:
# Remove the rows whose `thal` code is 0; the index is not reset, so the
# surviving rows keep their original labels.
invalid_thal_rows = df.loc[df.thal == 0].index
df = df.drop(index=invalid_thal_rows)

In [None]:
# Report the table size again so the effect of the row removal is visible.
remaining_rows, remaining_columns = df.shape
print(f'There are {remaining_rows} rows and {remaining_columns} columns in the dataframe')

Thankfully, we lost 2 rows only.

In [None]:
# Eyeball ten random rows; the fixed seed makes the displayed rows identical
# on every Restart & Run All, so the commentary below always matches them.
df.sample(10, random_state=42)

We don't see any blank spaces or any other abnormalities, so we can proceed further.

In [None]:
# Number of missing values in each column.
df.isnull().sum()

All clear. Let's proceed by giving the categorical variables meaningful names in accordance with the following labelling.

<ol>
    <li> age - age in years </li>
    <li> sex - (categorical)
        1: male, 0: female </li>
    <li> cp - chest pain type (categorical; stored as 0-3 in this file)
        0: typical angina, 1: atypical angina, 2: non-anginal pain, 3: asymptomatic </li>
    <li> trestbps - resting blood pressure in mm Hg</li>
    <li> chol - serum cholestoral in mg/dl  </li>
    <li> fbs - fasting blood sugar > 120 mg/dl (categorical) 1 = true and 0 = false</li>
    <li> restecg - resting electrocardiographic results (categorical)
        0: normal, 1: having ST-T wave abnormality, 2: showing probable or definite left ventricular hypertrophy by Estes' criteria</li>
    <li> thalach - maximum heart rate achieved  </li>
    <li> exang -  exercise induced angina (categorical)
        0:no, 1:yes</li>
    <li> oldpeak - ST depression induced by exercise relative to rest </li>
    <li> slope - the slope of the peak exercise ST segment (categorical; stored as 0-2 in this file)
         0: upsloping, 1: flat, 2: downsloping</li>
    <li> ca - number of major vessels (0-3) colored by fluoroscopy</li>
    <li> thal -  (categorical)
        1: normal, 2: fixed defect, 3: reversable defect </li>
    <li> target - disease is found or not 1 = yes and 0 = no</li>
</ol>

In [None]:
# Human-readable labels for the integer-coded categorical columns.
#
# The previous form, df[col][mask] = value, is chained indexing: pandas warns
# about it (SettingWithCopyWarning) and under copy-on-write the assignment
# never reaches `df`. Mapping each column in a single `replace` call writes the
# result back explicitly and leaves any code that is not listed here untouched.
#
# NOTE(review): the column description gives the fbs threshold in mg/dl while
# these labels say mg/ml; the labels are kept unchanged here because they turn
# into dummy-column names later. Confirm the intended unit.
category_labels = {
    'sex': {0: 'female', 1: 'male'},
    'cp': {
        0: 'typical angina',
        1: 'atypical angina',
        2: 'non-anginal pain',
        3: 'asymptomatic',
    },
    'fbs': {0: 'lower than 120mg/ml', 1: 'greater than 120mg/ml'},
    'restecg': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        2: 'left ventricular hypertrophy',
    },
    'exang': {0: 'no', 1: 'yes'},
    'slope': {0: 'upsloping', 1: 'flat', 2: 'downsloping'},
    'thal': {1: 'normal', 2: 'fixed defect', 3: 'reversable defect'},
}

for column_name, code_to_label in category_labels.items():
    df[column_name] = df[column_name].replace(code_to_label)

# Confirm the recoded columns are no longer integer typed.
df.info()

In [None]:
# Four pies on a 2x2 grid: overall diagnosis split, overall gender split, and
# the gender split among people with / without the disease.
#
# Every label is derived from the index of the counts it annotates, so a slice
# cannot be mislabelled if the class balance (and therefore the order returned
# by value_counts / groupby) ever changes.
target_names = {1: "Disease Found", 0: "Disease not found"}


def _labelled_pie(ax, counts, labels, title):
    """Draw `counts` as a two-slice exploded pie on `ax` with percentage annotations."""
    ax.pie(x=counts, labels=labels, autopct='%1.2f%%', explode=(0, 0.1))
    ax.set_title(title)


def _sex_labels(counts):
    """Capitalised slice labels taken from the index of `counts`."""
    return [str(value).capitalize() for value in counts.index]


fig, axes = plt.subplots(2, 2)

target_counts = df.target.value_counts()
_labelled_pie(axes[0, 0], target_counts,
              [target_names.get(value, str(value)) for value in target_counts.index],
              "Disease found or not")

sex_counts = df.sex.value_counts()
_labelled_pie(axes[0, 1], sex_counts, _sex_labels(sex_counts),
              "Gender distibution in the data")

sex_counts_diseased = df.loc[df.target == 1].groupby(['sex']).sex.count()
_labelled_pie(axes[1, 0], sex_counts_diseased, _sex_labels(sex_counts_diseased),
              "Gender distibution of people with the disease")

sex_counts_healthy = df.loc[df.target == 0].groupby(['sex']).sex.count()
_labelled_pie(axes[1, 1], sex_counts_healthy, _sex_labels(sex_counts_healthy),
              "Gender distibution of people without the disease")

This sample consists mostly of men, but a disproportionate share of the women in it are diagnosed with the disease. Let us calculate what percentage of each gender is affected by the disease.

In [None]:
# Boolean masks shared by the four head-counts below.
is_male = df.sex == 'male'
is_female = df.sex == 'female'
is_diagnosed = df.target == 1

total_male = int(is_male.sum())
total_female = int(is_female.sum())
diagnosed_male = int((is_male & is_diagnosed).sum())
diagnosed_female = int((is_female & is_diagnosed).sum())

# Percentage of each sex with a positive diagnosis.
male_rate = diagnosed_male / total_male * 100
female_rate = diagnosed_female / total_female * 100
print(f'{male_rate:.2f}% of men are diagnosed whereas {female_rate:.2f}% women are diagnosed with the heart disease.')

We can see that gender clearly affects the likelihood of being diagnosed with the disease. We will dive into why this happens later in the notebook.

In [None]:
# Age distribution drawn twice on stacked axes: split by diagnosis (top) and
# split by sex (bottom).
fig, (ax_by_target, ax_by_sex) = plt.subplots(nrows=2, ncols=1)
sns.histplot(data=df, x='age', hue='target', element='poly', ax=ax_by_target)
sns.histplot(data=df, x='age', hue='sex', element='poly', ax=ax_by_sex)

Surprisingly, way more younger people are diagnosed with the disease than older people. This may seem odd from a biological point of view but we could think of some reasons for this phenomenon. Since the risk of heart diseases increases with age, older individuals might go to get checked even for minor symptoms whereas younger individuals might only go to the doctor for severe symptoms which might actually be caused by them contracting the disease.

In [None]:
# Correlations are computed only on the columns left after the categorical
# ones are dropped.
categorical_columns = ['sex', 'cp', 'ca', 'fbs', 'restecg', 'exang', 'slope', 'thal']
numeric_corr = df.drop(columns=categorical_columns).corr()

# Colour scale pinned to [-1, 1] and centred on 0 so the sign is readable.
sns.heatmap(data=numeric_corr, annot=True, cmap="vlag", vmin=-1, vmax=1, center=0)

It is well known that higher cholesterol levels lead to higher chances of contracting a heart disease, but this correlation matrix shows us that there is little to no correlation. We will analyse the reason for this later. A lot of the correlations here might not make sense from a biological point of view because of the way this sample was selected from the population dataset. The imbalance in the variables does not clearly reflect the real reasons for heart diseases. Hence it is very important to do sample testing while taking smaller datasets for analysis.

In [None]:
# Max heart rate vs. ST depression; colour encodes the diagnosis and marker
# size encodes age.
sns.relplot(
    data=df,
    x='thalach',
    y='oldpeak',
    hue='target',
    size='age',
    sizes=(10, 500),
    alpha=0.5,
    aspect=3,
)

In [None]:
# One line per patient across four continuous measurements, coloured by sex.
plotted_measurements = ['trestbps', 'chol', 'thalach', 'oldpeak']
line_colours = ['#00FF00', '#FF0000']
parallel_coordinates(df, 'sex', cols=plotted_measurements, color=line_colours)

This is why the cholesterol levels didn't affect the target value! Women are more likely to have elevated cholesterol levels, which cancels out the lower cholesterol levels in men. This explains why women were more likely to be diagnosed with the disease as well.

In [None]:
# Two stacked split-violin plots, each split by sex: ST depression by
# exercise-induced angina (top) and max heart rate by ST-segment slope (bottom).
fig, (ax_oldpeak, ax_thalach) = plt.subplots(nrows=2, ncols=1)
sns.violinplot(data=df, x='exang', y='oldpeak', hue='sex', split=True, ax=ax_oldpeak)
sns.violinplot(data=df, x='slope', y='thalach', hue='sex', split=True, ax=ax_thalach)

In [None]:
# Pairwise scatter grid of the columns, coloured by diagnosis.
sns.pairplot(df, hue='target')

There are so many overlapping points in the data! It won't be possible to use logistic regression on this data because of the overlap. KNN or Random Forests might be a good choice for such data. I will go with the Random Forest Classifier this time.

# Machine Learning

In [None]:
# One-hot encode the string-valued columns; drop_first removes one level per
# column so the remaining indicator columns are not redundant.
df = pd.get_dummies(data=df, drop_first=True)
df

The categorical variables are one hot encoded, time to normalize them.

In [None]:
# Rescale every feature column to [0, 1] and separate the label.
#
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows, so test-set minima/maxima influence the transform. Fit it on
# the training split only if strict train/test isolation is required.
scaler = MinMaxScaler()
X = df.loc[:, df.columns != 'target']

# fit_transform returns a bare array. Rebuilding the frame without `index=`
# would give X a fresh 0..n-1 index while y keeps df's original labels (which
# have gaps from the rows dropped earlier), leaving X and y misaligned for any
# label-based operation. Carrying the index over keeps them aligned.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
y = df.target

Now we will split them into testing and training samples and let Random Forest do its magic.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Base forest handed to the grid search. The search grid supplies its own
# n_estimators values, so the 1000 set here only matters if rf_clf is fitted
# directly.
rf_clf = RandomForestClassifier(
    random_state=666,
    n_jobs=-1,
    n_estimators=1000,
)

In [None]:
# Candidate hyper-parameters: 3 x 3 x 7 = 63 combinations, each scored by
# accuracy under 5-fold cross-validation.
params_grid = {
    "max_features": [1, 2, 3],
    "min_samples_split": [10, 20, 30],
    "n_estimators": [100, 200, 300, 400, 500, 600, 700],
}
grid_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=params_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

Grid search fits a model for every combination of the given parameters and reports, on its own, which combination gives the best cross-validated accuracy!

In [None]:
# Run the full search on the training split only; the test split stays unseen.
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Parameter combination with the best mean cross-validated accuracy.
grid_search.best_params_

In [None]:
# Take the estimator selected by the grid search and score the held-out rows.
cvrf_clf = grid_search.best_estimator_
predictions = cvrf_clf.predict(X=X_test)

In [None]:
# Confusion matrix of the held-out predictions. scikit-learn puts the true
# classes on the rows and the predicted classes on the columns; the axes are
# labelled accordingly so the plot can be read on its own.
test_confusion = confusion_matrix(y_test, predictions)

# fmt='d' prints the cells as plain integers; the default format would switch
# to scientific notation for counts of 100 or more.
ax = sns.heatmap(data=test_confusion, annot=True, fmt='d', cmap="coolwarm",
                 vmax=df.shape[0]//8, center=0)
ax.set_xlabel("Predicted target")
ax.set_ylabel("True target");

In [None]:
# Fraction of held-out rows classified correctly, reported as a percentage.
test_accuracy = accuracy_score(y_test, predictions)
print(f'The accuracy score for the given model is {test_accuracy*100:.2f}%')
# cvrf_clf.score(X_test, y_test) can be used as well

In [None]:
# Draw the first tree of the selected forest. Feature and class names are
# passed explicitly so the nodes show column names and diagnosis labels
# instead of positional placeholders.
plot_tree(cvrf_clf.estimators_[0],
          feature_names=list(X.columns),
          class_names=["no disease", "disease"],  # ascending class order: target 0, then 1
          filled=True, impurity=True,
          rounded=True)
plt.title("One of the trees from the Random Forest Method")