In [None]:
# Basic 
import numpy as np
import pandas as pd

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Splitting
from sklearn.model_selection import train_test_split

# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.ensemble import VotingClassifier

# Evaluation metrics
from sklearn.metrics import jaccard_score, f1_score, log_loss, accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Cross validation
from sklearn.model_selection import cross_val_score

In [None]:
# Location of the heart-failure CSV (directory + file name, joined into one string).
path = (
    '/kaggle/input/heart-failure-clinical-data/'
    'heart_failure_clinical_records_dataset.csv'
)

# index_col=False: do not use the first CSV column as the row index.
records = pd.read_csv(filepath_or_buffer=path, index_col=False)
# Preview the first three rows.
records.head(n=3)

In [None]:
# Descriptive statistics for the columns of `records`.
records.describe()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
records.info()

In [None]:
# Number of missing values in each column.
records.isna().sum()

## EDA

Let us go through the correlation first to quickly understand where we can see the relations pop up.

In [None]:
# Pairwise correlation matrix; it is reused later to rank columns against DEATH_EVENT.
records_corr = records.corr()

# Draw the annotated heatmap on an explicitly created 12x12 figure.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(records_corr, annot=True, cmap="YlGnBu", ax=ax)
ax.set_title('Correlation')
plt.show()

Let us go through all the features one by one and visualise the various relations that can be established using this correlation table above.

### 1. Age

Most of the correlations with other indicators are not significant.

Significant ones: 
1. Positively correlated: 'serum_creatinine' and 'DEATH_EVENT' 
2. Negatively correlated: 'time' and 'diabetes'


In [None]:
# Age and death_event: age distribution per outcome, annotated with group sizes.
ax = sns.violinplot(x='DEATH_EVENT', y='age', data=records)

# Take median and count from the same groupby so both follow the sorted group
# keys. value_counts() orders by frequency, so its counts only line up with the
# groupby medians when the most frequent group also has the smallest key.
group_stats = records.groupby('DEATH_EVENT')['age'].agg(['median', 'size'])

# NOTE(review): assumes the violins are drawn in ascending key order, which is
# the order groupby uses -- confirm if the column dtype ever changes.
for x_pos, (median, n_obs) in enumerate(zip(group_stats['median'], group_stats['size'])):
    # Put the label just above the group's median.
    ax.text(x_pos, median + 0.10, 'n: {}'.format(n_obs),
            horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('1. Age vs Death Event')
plt.show()

In [None]:
# Age and serum creatinine: scatter plot with a fitted regression line.

plt.figure(figsize=(8,6))
sns.regplot(x='age', y='serum_creatinine', data = records)
plt.minorticks_on()
# Pass the on/off flag positionally: the `b` keyword was renamed to `visible`
# in Matplotlib 3.5 and later removed, so `b=True` fails on current releases,
# while the positional form works on old and new versions alike.
plt.grid(True, which='both', axis='both', alpha=0.1)
plt.title('2. Age and Serum_creatinine')
plt.show()

In [None]:
# Age and diabetes: age distribution per diabetes status, annotated with group sizes.
ax = sns.violinplot(x='diabetes', y='age', data=records)

# Median and count come from one groupby so they share the sorted key order;
# value_counts() is ordered by frequency and could pair a count with the wrong group.
group_stats = records.groupby('diabetes')['age'].agg(['median', 'size'])

# NOTE(review): assumes the violins are drawn in ascending key order -- confirm.
for x_pos, (median, n_obs) in enumerate(zip(group_stats['median'], group_stats['size'])):
    # Put the label just above the group's median.
    ax.text(x_pos, median + 0.10, 'n: {}'.format(n_obs),
            horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('3. Age and Diabetes')
plt.show()

In [None]:
# Age, diabetes and death_event: age per diabetes status, split by outcome.

ax = sns.violinplot(data=records, x='diabetes', y='age', hue='DEATH_EVENT')
ax.set_title('4. Age, Diabetes vs Death Event')
plt.show()

OBSERVATION:

1. The median age of people dying of heart failure is slightly higher than the median age of those who did not. A disparity can be seen between the number of people who had a heart failure and the number who did not. That could be due either to skewed data or to the fraction of people suffering from heart failure actually being smaller.

2. The correlation between age and creatinine levels in serum can be visualised. We can see a slight positive correlation.

3. The number of people having diabetes or not are comparable. We can see however that the median age for people suffering from diabetes is at around 60.

4. People with diabetes and had a heart failure have the same median age.

5. People who do not have diabetes but have had a heart failure are relatively older than the ones who have diabetes + heart failure.

### 2. Anaemia

Most of the correlations with other indicators are not significant.

Significant ones: 

1. Positively correlated: 'DEATH_EVENT'

2. Negatively Correlated: 'creatinine_phosphokinase', 'smoking', 'time'

In [None]:
# anaemia and death-event: rows per outcome, split by anaemia status.

ax = sns.countplot(data=records, x='DEATH_EVENT', hue='anaemia')
ax.set_title('1. Anaemia and death_event')
ax.set_xlabel('Death-event')
ax.set_ylabel('Number of people')
plt.show()


In [None]:
# anaemia and creatinine phosphokinase: distribution per anaemia status,
# annotated with group sizes.
ax = sns.violinplot(x='anaemia', y='creatinine_phosphokinase', data=records)

# Median and count come from one groupby so they share the sorted key order;
# value_counts() is ordered by frequency and could pair a count with the wrong group.
group_stats = records.groupby('anaemia')['creatinine_phosphokinase'].agg(['median', 'size'])

# NOTE(review): assumes the violins are drawn in ascending key order -- confirm.
for x_pos, (median, n_obs) in enumerate(zip(group_stats['median'], group_stats['size'])):
    # Put the label just above the group's median.
    ax.text(x_pos, median + 0.10, 'n: {}'.format(n_obs),
            horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('2. Anaemia and Creatinine phosphokinase')
plt.show()

In [None]:
# Anaemia and smoking: rows per smoking status, split by anaemia status.
ax = sns.countplot(data=records, x='smoking', hue='anaemia')
ax.set_title('3. Anaemia and smoking')
ax.set_xlabel('Smoking')
ax.set_ylabel('Number of people')
plt.show()


OBSERVATION:

1. Percentage wise - anaemia seems to be a good indicator of 'heart failure'.

2. There is no particular indication of a level of creatinine phosphokinase that might lead to anaemia.

3. Smoking is not a good detector for anaemia. Percentage wise, out of people who did not smoke, more people became anaemic. 

### 3. Creatinine Phosphokinase

Most of the correlations with other indicators are not significant.

Significant ones: 
1. Positively correlated: 'DEATH_EVENT'
2. Negatively correlated: 'anaemia'

In [None]:
# creatinine phosphokinase and death_event: distribution per outcome,
# annotated with group sizes.
ax = sns.violinplot(x='DEATH_EVENT', y='creatinine_phosphokinase', data=records)

# Median and count come from one groupby so they share the sorted key order;
# value_counts() is ordered by frequency and could pair a count with the wrong group.
group_stats = records.groupby('DEATH_EVENT')['creatinine_phosphokinase'].agg(['median', 'size'])

# NOTE(review): assumes the violins are drawn in ascending key order -- confirm.
for x_pos, (median, n_obs) in enumerate(zip(group_stats['median'], group_stats['size'])):
    # Put the label just above the group's median.
    ax.text(x_pos, median + 0.10, 'n: {}'.format(n_obs),
            horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('Creatinine phosphokinase and death event')
plt.show()

OBSERVATION:
1. The median level of creatinine phosphokinase is roughly the same for those who suffered a heart failure and those who didn't.
2. The range of creatinine phosphokinase seen in the people who suffered from a heart failure is a lot wider than for the ones who did not have a heart failure.

### 4. Diabetes

Most of the correlations with other indicators are not significant.

Significant ones:

1. Positively correlated: No particular strong correlation
2. Negatively correlated: 'DEATH_EVENT', 'age', 'sex' and 'smoking'

In [None]:
# 'diabetes' and 'DEATH_EVENT': rows per diabetes status, split by outcome.
ax = sns.countplot(data=records, x='diabetes', hue='DEATH_EVENT')
ax.set_title('1. Diabetes and death event')
ax.set_xlabel('diabetes')
ax.set_ylabel('Number of people')
plt.show()


In [None]:
# diabetes and sex: rows per sex, split by diabetes status.
ax = sns.countplot(data=records, x='sex', hue='diabetes')
ax.set_title('2. diabetes and sex')
ax.set_xlabel('sex')
ax.set_ylabel('Number of people')
plt.show()

In [None]:
# diabetes and smoking: rows per diabetes status, split by smoking status.
ax = sns.countplot(data=records, x='diabetes', hue='smoking')
ax.set_title('3. diabetes and smoking')
ax.set_xlabel('diabetes')
ax.set_ylabel('Number of people')
plt.show()

OBSERVATIONS:

1. The ratio of death events happening for people who have and don't have diabetes is >1. But this is a weak indicator of a death event.
2. The number of males in the dataset are much more than the number of females. Almost all the females have diabetes and about 50% of males have it.
3. The percentage of smokers who have diabetes is very low. **Smoking here would seem like a negative indicator of diabetes, but we know better than just statistics. Smoking will definitely not lead to less diabetes!**


### 5. Ejection Fraction

Most of the correlations with other indicators are not significant.

Significant ones:

1. Positively correlated: 'serum_sodium'
2. Negatively correlated: 'DEATH_EVENT' and 'sex'

In [None]:
# Median ejection fraction over the whole dataset.
overall_median_ef = records['ejection_fraction'].median()
print(overall_median_ef)

In [None]:
# ejection fraction and death event: distribution per outcome, annotated with group sizes.
ax = sns.violinplot(x='DEATH_EVENT', y='ejection_fraction', data=records)

# Median and count come from one groupby so they share the sorted key order;
# value_counts() is ordered by frequency and could pair a count with the wrong group.
group_stats = records.groupby('DEATH_EVENT')['ejection_fraction'].agg(['median', 'size'])

# NOTE(review): assumes the violins are drawn in ascending key order -- confirm.
for x_pos, (median, n_obs) in enumerate(zip(group_stats['median'], group_stats['size'])):
    # Put the label just above the group's median.
    ax.text(x_pos, median + 0.10, 'n: {}'.format(n_obs),
            horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('1. Ejection fraction vs Death Event')
plt.show()

In [None]:
# ejection fraction and serum sodium: scatter plot with a fitted regression line.

plt.figure(figsize=(8,6))
sns.regplot(x='serum_sodium', y='ejection_fraction', data = records)
plt.minorticks_on()
# Pass the on/off flag positionally: the `b` keyword was renamed to `visible`
# in Matplotlib 3.5 and later removed, so `b=True` fails on current releases,
# while the positional form works on old and new versions alike.
plt.grid(True, which='both', axis='both', alpha=0.1)
plt.title('2. Ejection fraction and Serum sodium')
plt.show()

In [None]:
# ejection fraction and sex: distribution per sex, annotated with group sizes.
ax = sns.violinplot(x='sex', y='ejection_fraction', data=records)

# Median and count come from one groupby so they share the sorted key order.
# value_counts() is ordered by frequency instead: whenever the larger group is
# not the lowest-valued key, its counts end up on the wrong violin. The notes in
# this notebook report far more males than females (presumably sex == 1 is
# male -- confirm), which is exactly that situation.
group_stats = records.groupby('sex')['ejection_fraction'].agg(['median', 'size'])

# NOTE(review): assumes the violins are drawn in ascending key order -- confirm.
for x_pos, (median, n_obs) in enumerate(zip(group_stats['median'], group_stats['size'])):
    # Put the label just above the group's median.
    ax.text(x_pos, median + 0.10, 'n: {}'.format(n_obs),
            horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('3. Ejection fraction vs Sex')
plt.show()

OBSERVATIONS:

1. The median ejection fraction for people with no heart failure is 38. The normal is usually >40.

2. We can see that the people suffering from heart failure have a median ejection fraction of around 20. We can also see that there are cases where people with normal ejection fraction also suffer from heart failure. That could be owing to the other factors affecting a death-event.

3. A decent positive correlation can be seen between the ejection fraction and serum sodium. 

4. The major peaks that can be observed for ejection fraction is around 38 and 60 for both males and females. 

### 6. High Blood Pressure

Most of the correlations with other indicators are not significant.

Significant ones:

1. Positively correlated: 'DEATH_EVENT'
2. Negatively correlated: 'time' and 'sex'

In [None]:
# High BP and death-event: rows per outcome, split by high blood pressure status.
ax = sns.countplot(data=records, x='DEATH_EVENT', hue='high_blood_pressure')
ax.set_title('1. High BP and death-event')
ax.set_xlabel('Death Event')
ax.set_ylabel('Number of people')
plt.show()

In [None]:
# high BP and sex: rows per sex, split by high blood pressure status.
ax = sns.countplot(data=records, x='sex', hue='high_blood_pressure')
ax.set_title('2. High BP and sex')
ax.set_xlabel('sex')
ax.set_ylabel('Number of people')
plt.show()

In [None]:
# high BP and time: distribution of `time` per high blood pressure status,
# annotated with group sizes.
ax = sns.violinplot(x='high_blood_pressure', y='time', data=records)

# Median and count come from one groupby so they share the sorted key order;
# value_counts() is ordered by frequency and could pair a count with the wrong group.
group_stats = records.groupby('high_blood_pressure')['time'].agg(['median', 'size'])

# NOTE(review): assumes the violins are drawn in ascending key order -- confirm.
for x_pos, (median, n_obs) in enumerate(zip(group_stats['median'], group_stats['size'])):
    # Put the label just above the group's median.
    ax.text(x_pos, median + 0.10, 'n: {}'.format(n_obs),
            horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('3. High BP vs time')
plt.show()

OBSERVATIONS:

1. The relationship between High BP and a death event cannot be surely determined. People with or without a high BP condition have passed away and people with a high BP have also not suffered a heart failure.

2. The percentage of females having a high BP condition is a lot more than the percentage of males having a high BP condition.

3. The median follow up period for people with a high BP condition is considerably lower than the follow up period of people with no such condition.

### 7. Platelets

Most of the correlations with other indicators are not significant.

Significant ones:

1. Positively correlated: No particular strong correlation
2. Negatively correlated: 'DEATH_EVENT' and 'sex'

In [None]:
# platelets and death event: distribution per outcome, annotated with group sizes.
ax = sns.violinplot(x='DEATH_EVENT', y='platelets', data=records)

# Median and count come from one groupby so they share the sorted key order;
# value_counts() is ordered by frequency and could pair a count with the wrong group.
group_stats = records.groupby('DEATH_EVENT')['platelets'].agg(['median', 'size'])

# NOTE(review): assumes the violins are drawn in ascending key order -- confirm.
for x_pos, (median, n_obs) in enumerate(zip(group_stats['median'], group_stats['size'])):
    # Put the label just above the group's median.
    ax.text(x_pos, median + 0.10, 'n: {}'.format(n_obs),
            horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('1. Platelet count vs death event')
plt.show()

In [None]:
# platelets and sex: distribution per sex, annotated with group sizes.
ax = sns.violinplot(x='sex', y='platelets', data=records)

# Median and count come from one groupby so they share the sorted key order.
# value_counts() is ordered by frequency instead: whenever the larger group is
# not the lowest-valued key, its counts end up on the wrong violin. The notes in
# this notebook report far more males than females (presumably sex == 1 is
# male -- confirm), which is exactly that situation.
group_stats = records.groupby('sex')['platelets'].agg(['median', 'size'])

# NOTE(review): assumes the violins are drawn in ascending key order -- confirm.
for x_pos, (median, n_obs) in enumerate(zip(group_stats['median'], group_stats['size'])):
    # Put the label just above the group's median.
    ax.text(x_pos, median + 0.10, 'n: {}'.format(n_obs),
            horizontalalignment='center', size='x-small', color='w', weight='semibold')

# Numbered 2: this is the second plot of the Platelets section.
plt.title('2. Platelet count vs sex')
plt.show()

OBSERVATIONS:

1. No significant correlation between platelets and death-event. The median platelet count in case of either death-event is about the same.

2. Females can be seen to have a slightly higher median platelet count than males.

### 8. Serum creatinine

Most of the correlations with other indicators are not significant.

Significant ones:

1. Positively correlated: 'age' and 'DEATH_EVENT'
2. Negatively correlated: 'serum sodium' and 'time'

In [None]:
# serum creatinine and death event: distribution per outcome, annotated with group sizes.
ax = sns.violinplot(x='DEATH_EVENT', y='serum_creatinine', data=records)

# Median and count come from one groupby so they share the sorted key order;
# value_counts() is ordered by frequency and could pair a count with the wrong group.
group_stats = records.groupby('DEATH_EVENT')['serum_creatinine'].agg(['median', 'size'])

# NOTE(review): assumes the violins are drawn in ascending key order -- confirm.
for x_pos, (median, n_obs) in enumerate(zip(group_stats['median'], group_stats['size'])):
    # Put the label just above the group's median.
    ax.text(x_pos, median + 0.10, 'n: {}'.format(n_obs),
            horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('1. Serum creatinine vs death event')
plt.show()

In [None]:
# serum creatinine and age: scatter plot with a fitted regression line.
ax = sns.regplot(data=records, x='age', y='serum_creatinine')
ax.set_title('2. Serum creatinine and age')
ax.set_xlabel('age')
ax.set_ylabel('Serum creatinine')
plt.show()

In [None]:
# serum creatinine and serum sodium: scatter plot with a fitted regression line.
ax = sns.regplot(data=records, x='serum_sodium', y='serum_creatinine')
ax.set_title('3. Serum creatinine and Serum sodium')
ax.set_xlabel('Serum sodium')
ax.set_ylabel('Serum creatinine')
plt.show()

In [None]:
# serum creatinine and time: scatter plot with a fitted regression line.
ax = sns.regplot(data=records, x='time', y='serum_creatinine')
ax.set_title('4. Serum creatinine and Follow up time')
ax.set_xlabel('time')
ax.set_ylabel('Serum creatinine')
plt.show()

OBSERVATIONS:

1. The median serum creatinine level is same for both people with and without a heart failure. The range of serum creatinine level for people with a heart failure is considerably larger than for the ones who did not have a heart failure.

2. A slight positive correlation can be visualised between the age of a person and their serum creatinine level.

3. A negative correlation can be visualised between the levels of serum creatinine and serum sodium in the body.

4. A very small negative correlation can be seen between the serum creatinine level and the follow-up period of the person.

### 9. Serum sodium

Most of the correlations with other indicators are not significant.

Significant ones:

1. Positively correlated: 'ejection_fraction'
2. Negatively correlated: 'serum creatinine' and 'DEATH_EVENT'

In [None]:
# serum sodium and death event: distribution per outcome, annotated with group sizes.
ax = sns.violinplot(x='DEATH_EVENT', y='serum_sodium', data=records)

# Median and count come from one groupby so they share the sorted key order;
# value_counts() is ordered by frequency and could pair a count with the wrong group.
group_stats = records.groupby('DEATH_EVENT')['serum_sodium'].agg(['median', 'size'])

# NOTE(review): assumes the violins are drawn in ascending key order -- confirm.
for x_pos, (median, n_obs) in enumerate(zip(group_stats['median'], group_stats['size'])):
    # Put the label just above the group's median.
    ax.text(x_pos, median + 0.10, 'n: {}'.format(n_obs),
            horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('1. Serum sodium vs death event')
plt.show()

OBSERVATIONS:

1. The median serum sodium level is lower for people who had a heart failure.

2. The range of serum sodium in the body for people with or without a heart failure is the same.

### 10. Sex

Most of the correlations with other indicators are not significant.

Significant ones:

1. Positively correlated: 'smoking'
2. Negatively correlated: 'diabetes', 'ejection_fraction', 'high_blood_pressure', 'platelets', 'DEATH_EVENT' 

In [None]:
# sex and death event: rows per sex, split by outcome.
ax = sns.countplot(data=records, x='sex', hue='DEATH_EVENT')
ax.set_title('1. Sex and death-event')
ax.set_xlabel('Sex')
ax.set_ylabel('Number of people')
plt.show()

In [None]:
# sex and smoking: rows per sex, split by smoking status.
ax = sns.countplot(data=records, x='sex', hue='smoking')
ax.set_title('2. Sex and smoking')
ax.set_xlabel('Sex')
ax.set_ylabel('Number of people')
plt.show()

OBSERVATIONS:

1. The percentage of women and men suffering a heart failure is around the same. A dependence cannot be established.

2. About 50 % of males smoke and only about 6% of women smoke as per the data.

### 11. Smoking

Most of the correlations with other indicators are not significant.

Significant ones:

1. Positively correlated: 'sex'
2. Negatively correlated: 'diabetes', 'anaemia', 'DEATH_EVENT' 

In [None]:
# smoking and death event: rows per outcome, split by smoking status.
ax = sns.countplot(data=records, x='DEATH_EVENT', hue='smoking')
ax.set_title('1. Smoking and death event')
ax.set_xlabel('Death event')
ax.set_ylabel('Number of people')
plt.show()

In [None]:
# Non-null row counts (taken from the first column) for each
# (DEATH_EVENT, smoking) pair, in sorted key order.
# NOTE(review): the positional unpacking assumes both columns are binary 0/1,
# i.e. exactly four groups ordered (0,0), (0,1), (1,0), (1,1) -- confirm.
pair_counts = records.groupby(['DEATH_EVENT', 'smoking']).count().values[:, 0]
no_event_nonsmokers, no_event_smokers = pair_counts[0], pair_counts[1]
event_nonsmokers, event_smokers = pair_counts[2], pair_counts[3]

# Share of smokers within each outcome group, as a percentage.
print('Percentge smoking but no heart failure: ', (no_event_smokers/(no_event_nonsmokers+no_event_smokers))*100)
print('Percentge smoking and heart failure: ', (event_smokers/(event_nonsmokers+event_smokers))*100)

OBSERVATION:

As you can see from above, there is not much dependence between smoking and a heart failure.

### 12. Time

Most of the correlations with other indicators are not significant.

Significant ones:

1. Positively correlated: No strong positive correlation
2. Negatively correlated: 'age', 'anaemia', 'high_blood_pressure', 'serum_creatinine', 'DEATH_EVENT' 

In [None]:
# time and death_event: distribution of `time` per outcome, annotated with group sizes.
ax = sns.violinplot(x='DEATH_EVENT', y='time', data=records)

# Median and count come from one groupby so they share the sorted key order;
# value_counts() is ordered by frequency and could pair a count with the wrong group.
group_stats = records.groupby('DEATH_EVENT')['time'].agg(['median', 'size'])

# NOTE(review): assumes the violins are drawn in ascending key order -- confirm.
for x_pos, (median, n_obs) in enumerate(zip(group_stats['median'], group_stats['size'])):
    # Put the label just above the group's median.
    ax.text(x_pos, median + 0.10, 'n: {}'.format(n_obs),
            horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('1. Follow up period vs death event')
plt.show()

OBSERVATION:

1. The median follow up period for someone who later suffered from a heart failure is comparatively very low.

## MODEL

In [None]:
# Correlation of every column with DEATH_EVENT, largest value first
# (uses the correlation matrix computed in the EDA section).
records_corr['DEATH_EVENT'].sort_values(ascending=False)

In [None]:
# Predictor columns fed to the model.
# NOTE(review): the notes above describe `time` as the follow-up period; check
# whether it would be known at prediction time before keeping it as a feature.
feature_columns = ['serum_creatinine', 'age', 'time', 'ejection_fraction', 'serum_sodium']
X = records[feature_columns].values

# Select the target by name rather than by position, so that reordering or
# appending columns cannot silently change what the model learns to predict.
y = records['DEATH_EVENT'].values

print('Shape of X ', X.shape)
print('Shape of y ', y.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Report the size of each partition.
for split_name, split_features in (('training', X_train), ('test', X_test)):
    print('Shape of {} set '.format(split_name), split_features.shape)

In [None]:
# k-nearest-neighbours settings: 6 neighbours, Minkowski metric with p=2.
knn_settings = {'n_neighbors': 6, 'metric': 'minkowski', 'p': 2}
classifier = KNeighborsClassifier(**knn_settings)

In [None]:
# Fit on the training split, then score on the held-out split.
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# Predicted class probabilities. Log loss is defined on probabilities; feeding
# it hard 0/1 labels treats every prediction as fully confident and gives a
# number that says nothing about the model's calibration.
y_proba = classifier.predict_proba(X_test)

# Class name of the estimator, used for the printout and the plot title.
model_name = type(classifier).__name__
print(model_name)

# Accuracy Score
print('Accuracy Score: {}'.format(accuracy_score(y_test, y_pred)))

# jaccard Score
print('\nJaccard Score: {}'.format(jaccard_score(y_test, y_pred)))

# F1 score
print('\nF1 Score: {}'.format(f1_score(y_test, y_pred)))

# Log Loss -- `labels` keeps the probability columns aligned with the classes
# even if the test split happens to contain a single class.
print('\nLog Loss: {}'.format(log_loss(y_test, y_proba, labels=classifier.classes_)))

# 10-fold cross-validated accuracy on the training split only.
print('CROSS VALIDATION')
accuracy = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv=10)
print('Accuracies after CV: ', accuracy)
print('\nMean Accuracy of the model: ', accuracy.mean()*100)

# confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the cell counts as plain integers instead of the default
# general format, which switches to scientific notation for larger counts.
sns.heatmap(cm, annot=True, fmt='d', lw = 2, cbar=False)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix: {}'.format(model_name))
plt.show()

### Previous Work: Check this link [out](https://www.kaggle.com/mani97/quick-classification-rundown-93-33-k-nn#Tuning-Hyperparameter) to see how I fine tuned my k-NN model to attain 93% accuracy.

Do comment and let me know if I missed out anything.