In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory,
# one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

**Importing the dataset**

In [None]:
# Load the heart-failure clinical records CSV into a DataFrame.
DATA_PATH = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
dataset = pd.read_csv(DATA_PATH)

**With the .info() method, we can check the information of the dataset. For instance, the names of the columns, the number of data points, their type and how many null elements exist.**

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
dataset.info()

**With the .head() method, we get to check the first n elements of the dataset. The default number is 5.**

In [None]:
# Preview the first five rows (5 is the default, spelled out here).
dataset.head(n=5)

**In this stage of the project, Exploratory Data Analysis (EDA) was performed, in order to identify potential predictors for death events. Moreover, we can thus explore correlations in the data, which could be useful for our analysis and would provide us with a more detailed picture of our data.**

In [None]:
# Count of patients per smoking status, split by outcome (DEATH_EVENT).
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=dataset, x='smoking', hue='DEATH_EVENT', ax=ax)

In [None]:
# Cross-tabulate smoking status against the outcome.
# groupby(...).count() reports the non-null count of every remaining column,
# so the same group size is repeated across the whole table; a crosstab shows
# each count once, and margins=True adds the "All" totals needed to derive
# the percentages discussed in the text.
pd.crosstab(dataset['smoking'], dataset['DEATH_EVENT'], margins=True)

**The first parameter which would be interesting to investigate is how many people included in this dataset are smokers. Afterwards, we can also determine the ratio of people who were smokers and did not survive. From the graph and the table above, we can deduce that approximately 68% of people did not smoke and, out of them, about 32.5% did not survive. On the other hand, about 31% of people who did smoke did not survive.**

In [None]:
# Count of patients with / without high blood pressure, split by outcome.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=dataset, x='high_blood_pressure', hue='DEATH_EVENT', ax=ax)

In [None]:
# Cross-tabulate high blood pressure against the outcome.
# groupby(...).count() repeats the same group size in every remaining column;
# a crosstab shows each count once, and margins=True adds the "All" totals
# needed to derive the percentages discussed in the text.
pd.crosstab(dataset['high_blood_pressure'], dataset['DEATH_EVENT'], margins=True)

**The next potential indicator to be explored is the level of blood pressure of the patient. In the above graph, it can be shown that approximately 68% of people did not have high blood pressure and, out of them, 29% did not survive. On the other hand, about 37% who did have high blood pressure did not survive.**

In [None]:
# Count of patients with / without anaemia, split by outcome.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=dataset, x='anaemia', hue='DEATH_EVENT', ax=ax)

In [None]:
# Cross-tabulate anaemia against the outcome.
# groupby(...).count() repeats the same group size in every remaining column;
# a crosstab shows each count once, and margins=True adds the "All" totals
# needed to derive the percentages discussed in the text.
pd.crosstab(dataset['anaemia'], dataset['DEATH_EVENT'], margins=True)

**In the case of anaemia, about 57% people did not have anaemia and out of them, about 29% did not survive. In contrast, about 36% of patients who did have anaemia did not survive.**

In [None]:
# Count of patients with / without diabetes, split by outcome.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=dataset, x='diabetes', hue='DEATH_EVENT', ax=ax)

In [None]:
# Cross-tabulate diabetes against the outcome.
# groupby(...).count() repeats the same group size in every remaining column;
# a crosstab shows each count once, and margins=True adds the "All" totals
# needed to derive the percentages discussed in the text.
pd.crosstab(dataset['diabetes'], dataset['DEATH_EVENT'], margins=True)

**In the case of diabetes, approximately 68% of patients without diabetes did survive, with the percent of patients with diabetes who survived being the same.**

*Thus far, we have investigated the categorical variables of this dataset. In the next step, we are going to investigate those parameters which show continuous, numerical values.*

In [None]:
# Distribution of creatinine phosphokinase for each outcome group.
sns.boxplot(data=dataset, x='DEATH_EVENT', y='creatinine_phosphokinase')

*The first variable to be explored is the level of creatinine phosphokinase in the blood of the patients. From the graph above, it can be deduced that people who survived and people who did not survive exhibit similar levels of creatinine phosphokinase.*

In [None]:
# Distribution of ejection fraction for each outcome group.
sns.boxplot(data=dataset, x='DEATH_EVENT', y='ejection_fraction')

*The second variable to be explored is the level of ejection fraction of the patients. From the graph above, it can be deduced that people who survived show significantly higher ejection fraction compared to patients who did not survive. Therefore, it could be considered as a potential indicator for the possibility of survival.*

In [None]:
# Distribution of platelet levels for each outcome group.
sns.boxplot(data=dataset, x='DEATH_EVENT', y='platelets')

*In the graph above, we can notice that the levels of platelets in the blood of the patients who survived and those who did not survive are similar. Therefore, we can conclude that the level of platelets in the blood of patients is not a strong indicator.*

In [None]:
# Distribution of serum creatinine for each outcome group.
sns.boxplot(data=dataset, x='DEATH_EVENT', y='serum_creatinine')

*In the above boxplot, it is displayed that patients who did not survive show significantly higher levels of serum creatinine in their blood compared to patients who survived.*

In [None]:
# Distribution of serum sodium for each outcome group.
sns.boxplot(data=dataset, x='DEATH_EVENT', y='serum_sodium')

*In the above graph, it is shown that patients that did not survive had on average lower levels of serum sodium in their blood.*

In [None]:
# Distribution of follow-up time for each outcome group.
sns.boxplot(data=dataset, x='DEATH_EVENT', y='time')

*From the graph above, it can be identified that patients who survived had on average much more follow-up time compared to people that did not survive.*

In [None]:
# Distribution of age for each outcome group.
sns.boxplot(data=dataset, x='DEATH_EVENT', y='age')

*From the above figure, it can be deduced that patients that did not survive were on average older compared to patients who actually survived.*

*In the following heatmap, we can see how each variable is correlated to one another.*

In [None]:
# Pairwise correlation between all columns; `corr` is reused by the
# per-feature bar chart further down.
corr = dataset.corr()

# Boolean mask that hides the upper triangle (diagonal included) so each pair
# of variables is annotated only once. Built directly as a bool array instead
# of writing True into a float array of zeros.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Draw on an explicit Axes rather than relying on pyplot's implicit
# "current figure" state.
fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(corr, mask=mask, annot=True, cmap='viridis', ax=ax)

*In the following figure, it is shown that age and levels of serum creatinine have a considerable positive correlation to the possibility of a death event. In contrast, ejection fraction level and follow-up time have a high negative correlation with a possibility of a death event.*

In [None]:
# Correlation of every other column with DEATH_EVENT, lowest to highest.
death_event_corr = corr['DEATH_EVENT'].drop(labels='DEATH_EVENT')
death_event_corr = death_event_corr.sort_values()  # ascending is the default
death_event_corr.plot.bar()

**Selecting Features for Training the Classification algorithm**

*Given the results from the EDA above, the variables that were selected to train the model were the levels of serum creatinine in the patients' blood, the level of ejection fraction of the patients and the follow-up time. Several classification algorithms were tested and the results are shown below.*

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features chosen from the EDA above.
# NOTE(review): `time` is the follow-up duration, which may only be known once
# the outcome has been observed -- confirm it is a legitimate predictor.
FEATURE_COLUMNS = ['serum_creatinine', 'ejection_fraction', 'time']
X = dataset.loc[:, FEATURE_COLUMNS].values

# Select the target by name rather than by position (iloc[:, -1]), so the
# label cannot silently change if the column order of the CSV changes.
y = dataset.loc[:, 'DEATH_EVENT'].values

# Hold out 30% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Standardise the features. The scaler is fitted on the training split only
# and then applied to the test split, so no test statistics leak into training.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Support Vector Classifier with linear kernel

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, fitted on the scaled
# training split.
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

# Score the most recently fitted `classifier` (the linear-kernel SVC) on the
# held-out test split: confusion matrix, then accuracy and recall in percent.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f} %")
print(f"Recall: {recall_score(y_test, y_pred) * 100:.2f} %")

Support Vector Classifier with rbf kernel

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel, fitted on the scaled
# training split.
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

# Score the most recently fitted `classifier` (the RBF-kernel SVC) on the
# held-out test split: confusion matrix, then accuracy and recall in percent.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f} %")
print(f"Recall: {recall_score(y_test, y_pred) * 100:.2f} %")

Logistic Regression Classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted on the scaled training split.
classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

# Score the most recently fitted `classifier` (logistic regression) on the
# held-out test split: confusion matrix, then accuracy and recall in percent.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f} %")
print(f"Recall: {recall_score(y_test, y_pred) * 100:.2f} %")

KNN Classifier, with k = 5 and distance metric equal to 'minkowski'

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with k = 5, using the Minkowski metric with p = 2.
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X=X_train, y=y_train)

Evaluation of the KNN Classifier (the Naive Bayes Classifier follows in the next cells)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

# Score the most recently fitted `classifier` (the KNN model fitted in the
# preceding code cell) on the held-out test split: confusion matrix, then
# accuracy and recall in percent.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f} %")
print(f"Recall: {recall_score(y_test, y_pred) * 100:.2f} %")

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes fitted on the scaled training split.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

# Score the most recently fitted `classifier` (Gaussian naive Bayes) on the
# held-out test split: confusion matrix, then accuracy and recall in percent.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f} %")
print(f"Recall: {recall_score(y_test, y_pred) * 100:.2f} %")

Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree using the entropy split criterion, fitted on the scaled
# training split.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

# Score the most recently fitted `classifier` (the decision tree) on the
# held-out test split: confusion matrix, then accuracy and recall in percent.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f} %")
print(f"Recall: {recall_score(y_test, y_pred) * 100:.2f} %")

Random Forest Classifier, with 50 estimators

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 entropy-criterion trees, fitted on the scaled
# training split.
classifier = RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

# Score the most recently fitted `classifier` (the random forest) on the
# held-out test split: confusion matrix, then accuracy and recall in percent.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f} %")
print(f"Recall: {recall_score(y_test, y_pred) * 100:.2f} %")

CatBoost Classifier

In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier fitted on the scaled training split.
# verbose=False silences the per-iteration training log, which otherwise
# floods the cell output. random_seed=0 pins the seed explicitly so reruns are
# reproducible, matching the random_state=0 used for the other models.
classifier = CatBoostClassifier(verbose=False, random_seed=0)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

# Score the most recently fitted `classifier` (CatBoost) on the held-out
# test split: confusion matrix, then accuracy and recall in percent.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f} %")
print(f"Recall: {recall_score(y_test, y_pred) * 100:.2f} %")

XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier with its default hyper-parameters, fitted on the scaled
# training split.
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

# Score the most recently fitted `classifier` (XGBoost) on the held-out
# test split: confusion matrix, then accuracy and recall in percent.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f} %")
print(f"Recall: {recall_score(y_test, y_pred) * 100:.2f} %")

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with depth-2 trees, fitted on the scaled training split.
# Note that this model uses seed 4, unlike the seed 0 used by the other models.
classifier = GradientBoostingClassifier(max_depth=2, random_state=4)
classifier.fit(X=X_train, y=y_train)

Gradient Boosting Classifier

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

# Score the most recently fitted `classifier` (gradient boosting) on the
# held-out test split: confusion matrix, then accuracy and recall in percent.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f} %")
print(f"Recall: {recall_score(y_test, y_pred) * 100:.2f} %")

k-fold validation of the CatBoost model

In [None]:
from catboost import CatBoostClassifier

# Re-create the CatBoost model that the cross-validation cells below pass as
# their estimator.
# verbose=False silences the per-iteration training log, which would otherwise
# be printed for this fit and again for every cross-validation fold.
# random_seed=0 pins the seed explicitly so reruns are reproducible, matching
# the random_state=0 used for the other models.
classifier = CatBoostClassifier(verbose=False, random_seed=0)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of `classifier` on the training split; report the
# mean and standard deviation of the fold scores in percent.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
print(f"Accuracy: {accuracies.mean() * 100:.2f} %")
print(f"Standard Deviation: {accuracies.std() * 100:.2f} %")

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of `classifier` on the training split; report the
# mean and standard deviation of the fold scores in percent.
accuracies = cross_val_score(classifier, X_train, y_train, cv=5)
print(f"Accuracy: {accuracies.mean() * 100:.2f} %")
print(f"Standard Deviation: {accuracies.std() * 100:.2f} %")

After this analysis, the algorithm that outperformed all others was CatBoost, with an accuracy of 90% and a recall of 79%. Moreover, when performing k-fold cross validation of this approach, we had the following results:
1. k = 5
* Mean Accuracy =  83%
* Accuracy Standard Deviation = 3.8%
2. k = 10
* Mean Accuracy =  83%
* Accuracy Standard Deviation = 6.5%