In [None]:
# Basic
import numpy as np 
import pandas as pd

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Splitting
from sklearn.model_selection import train_test_split

# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Evaluation metrics
from sklearn.metrics import jaccard_score, f1_score, log_loss, accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Cross validation
from sklearn.model_selection import cross_val_score

In [None]:
# Load the heart-failure clinical records from the Kaggle input directory.
# index_col=False tells pandas not to use the first column as the index.
DATA_PATH = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
records = pd.read_csv(DATA_PATH, index_col=False)

# (rows, columns) followed by a peek at the first three records.
print(records.shape)
records.head(n=3)

In [None]:
# Column names, dtypes and non-null counts for the loaded frame.
records.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
records.describe()

In [None]:
# Number of missing values in each column.
records.isnull().sum()

There are no null values in the dataset. All the data is in the right format. We are good to go forward!

## Quick correlation check

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
records_corr = records.corr()

fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(records_corr, annot=True, ax=ax)
ax.set_title('Correlation between features')
plt.show()

Check the correlation of all features with just the 'DEATH_EVENT' result

In [None]:
# Correlation of every column with the target, strongest positive first.
records_corr.loc[:, 'DEATH_EVENT'].sort_values(ascending=False)

We can see columns:
1. 'serum_creatinine' and 
2. 'age'

show a **positive significant correlation** with the Heart Failure Status

Columns:
1. 'time'
2. 'ejection_fraction'
3. 'serum_sodium'

show a **negative significant correlation** with the Heart Failure Status.

## Model Training

In [None]:
# Features picked from the correlation check, and the target column.
# Both are selected by name so a change in the CSV's column order cannot
# silently swap in the wrong target (the original used the last column by position).
FEATURE_COLUMNS = ['serum_creatinine', 'age', 'time', 'ejection_fraction', 'serum_sodium']
TARGET_COLUMN = 'DEATH_EVENT'

X = records[FEATURE_COLUMNS].values
y = records[TARGET_COLUMN].values

print('Shape of X ', X.shape)
print('Shape of y ', y.shape)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

for label, part in (('Shape of training set ', X_train), ('Shape of test set ', X_test)):
    print(label, part.shape)

### Model Construction and Evaluation (CV)

In [None]:
# Candidate models, all trained on the same selected features.
# random_state is pinned on the tree-based sklearn models so that a
# Restart & Run All reproduces the same scores.
classifiers = [
    XGBClassifier(),
    LogisticRegression(max_iter=1000),
    KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=2),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    DecisionTreeClassifier(criterion='entropy', random_state=1),
    RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=1),
]

for classifier in classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # Human-readable model name; the two SVC variants are told apart by kernel.
    model_name = type(classifier).__name__
    if isinstance(classifier, SVC):
        model_name = '{} ({} kernel)'.format(model_name, classifier.kernel)
    print(model_name)

    # Accuracy Score
    print('Accuracy Score: {}'.format(accuracy_score(y_test, y_pred)))

    # Jaccard Score
    print('\nJaccard Score: {}'.format(jaccard_score(y_test, y_pred)))

    # F1 score
    print('\nF1 Score: {}'.format(f1_score(y_test, y_pred)))

    # Log loss is defined on predicted class probabilities; feeding it hard
    # 0/1 predictions gives a meaningless value. Models without
    # predict_proba (SVC with its default probability=False) are skipped.
    if hasattr(classifier, 'predict_proba'):
        y_proba = classifier.predict_proba(X_test)
        print('\nLog Loss: {}'.format(log_loss(y_test, y_proba, labels=classifier.classes_)))
    else:
        print('\nLog Loss: n/a (model does not provide predict_proba)')

    # 10-fold cross-validation on the training split only.
    print('CROSS VALIDATION')
    accuracy = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
    print('Accuracies after CV: ', accuracy)
    print('Mean Accuracy of the model: ', accuracy.mean() * 100)

    # Confusion matrix of the test-set predictions; fmt='d' shows integer counts.
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', lw=2, cbar=False)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix: {}'.format(model_name))
    plt.show()

### Model Selection

We can see that the **KNeighbors Classifier** gives us good results overall. Let's see if we can improve the performance of this model.

Let us first implement the KNeighbors Classification model separately.

In [None]:
# Baseline KNN model (k=10, Euclidean distance) evaluated on the held-out test set.
classifier = KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=2)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
# ROC-AUC ranks samples by score, so it needs the positive-class probability
# rather than hard labels. Column 1 is the second entry of classifier.classes_
# (assumes the target is binary 0/1 with 1 as the positive class).
y_score = classifier.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))

print('ROC-AUC Score: ', roc_auc_score(y_test, y_score))

In [None]:
# confusion matrix
# Confusion matrix for the current classifier's test-set predictions.
cm = confusion_matrix(y_test, y_pred)
model_name = type(classifier).__name__

ax = sns.heatmap(cm, annot=True, lw=2, cbar=False)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix: {}'.format(model_name))
plt.show()

## Tuning Hyperparameter

In [None]:
# Mean 10-fold cross-validated accuracy for k = 1..19, computed on the
# training split only. Scoring each k on the test set would let the test
# data choose the hyperparameter and bias the final reported score upward.
acc = []
for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)
    fold_scores = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=10)
    acc.append(fold_scores.mean())

In [None]:
# Accuracy for each candidate n_neighbors value (1..19).
k_values = np.arange(1, 20, step=1)

plt.figure(figsize=(10, 8))
plt.scatter(k_values, acc)
plt.xticks(k_values)
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
# The grid toggle is passed positionally: its keyword was renamed from `b`
# to `visible`, and `b=` fails on current Matplotlib releases.
plt.grid(True, which='major', axis='both', color='#999999', linestyle='-', alpha=0.1)
plt.show()

**We can see that, after tuning the hyperparameter, the model performs best when `n_neighbors` is 6**

## Final Model

In [None]:
# Final KNN model with the tuned neighbourhood size, evaluated on the test set.
classifier = KNeighborsClassifier(n_neighbors=6, metric='minkowski', p=2)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
# ROC-AUC ranks samples by score, so it needs the positive-class probability
# rather than hard labels. Column 1 is the second entry of classifier.classes_
# (assumes the target is binary 0/1 with 1 as the positive class).
y_score = classifier.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))

print('ROC-AUC Score: ', roc_auc_score(y_test, y_score))
print('Accuracy: ', accuracy_score(y_test, y_pred) * 100)

In [None]:
# confusion matrix
# Confusion matrix for the current classifier's test-set predictions.
cm = confusion_matrix(y_test, y_pred)
model_name = type(classifier).__name__

ax = sns.heatmap(cm, annot=True, lw=2, cbar=False)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix: {}'.format(model_name))
plt.show()

## Conclusion

We can see that the accuracy of the model was bumped up from 90% to 93% by hyperparameter tuning. We also saw that there were 5 significant features that helped predict the death event.

### Future Work

EDA was not implemented in this notebook as I wanted to run quick classification models to see the results. Further work will include that.

Do comment and let me know if this was of some help, or how I could improve this kind of quick modelling.