In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
# NOTE: `dirname` and `filename` stay bound to the last file visited once the
# loop finishes, so renaming them can affect later cells that read those names.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import visualisation libraries
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# read data
# The CSV is located explicitly so that this cell does not depend on loop
# variables left bound by another cell and still works when run on its own.
csv_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
    if name.lower().endswith('.csv')
]
if not csv_paths:
    raise FileNotFoundError("No CSV file found under /kaggle/input")
# The last match in walk order is used; with a single dataset file this is
# simply that file.
data = pd.read_csv(csv_paths[-1])

In [None]:
# describe the data in general terms:
# per-column summary statistics (count, mean, std, min, quartiles, max)
data.describe()

In [None]:
# get some info: column names, non-null counts and dtypes of the DataFrame
data.info()

The data seems clean, nothing is missing and everything is either integer or float; no need to create dummy variables and the preprocessing seems rapid. Still need to select our features carefully!

# 1. Data exploration

In [None]:
# Share of rows with DEATH_EVENT == 0 ("alive") vs. DEATH_EVENT == 1 ("deceased"),
# shown as a pie chart with percentages.
n_alive = int(data["DEATH_EVENT"].eq(0).sum())
n_deceased = int(data["DEATH_EVENT"].eq(1).sum())
plt.figure(figsize=(10, 10))
plt.pie(
    (n_alive, n_deceased),
    labels=("alive", "deceased"),
    autopct='%1.2f%%',
)
plt.show()

About one third of the cases resulted in death. The distribution is not heavily skewed towards either alive or deceased.

In [None]:
# Let's plot the number of deceased vs. alive people sorted by gender
# (one bar per DEATH_EVENT value and per value of the "sex" column)
sns.countplot(x="DEATH_EVENT", data=data, hue="sex")

In [None]:
# number of male vs. female in the dataset (row count per value of "sex")
sns.countplot(x="sex", data=data)

Women are more present in both categories, which indicates that there are more women in the dataset. The distribution looks similar between deceased and alive, which suggests that gender is not very important. A correlation analysis might show this more precisely.

In [None]:
# Let's plot the correlation degree wrt the death event:
# take the DEATH_EVENT column of the pairwise correlation matrix, sort it in
# ascending order and draw it as a bar chart (DEATH_EVENT itself is included).
data.corr()["DEATH_EVENT"].sort_values().plot(kind="bar")

As we suspected, gender is not correlated with the death event, so it's probably not super relevant to use it as a feature for our machine learning model. Diabetes also looks like it's not an important factor. We will drop sex, diabetes and smoking based on this correlation bar plot. We will also drop time, as it's irrelevant for future pre-diagnosis (time is set by the death_event, not the other way around!). 

In [None]:
# Build the modelling frame by removing the columns judged uninformative.
dropped = ["sex", "time", "smoking", "diabetes"]
data_notime = data.drop(columns=["time"])  # all columns except "time"
final_data = data.drop(columns=dropped)
final_data

In [None]:
# Split the rows by outcome, then compare the age distribution of the deceased
# group with and without high blood pressure.
deceased = final_data.loc[final_data["DEATH_EVENT"].eq(1)]
alive = final_data.loc[final_data["DEATH_EVENT"].eq(0)]
sns.boxplot(x="high_blood_pressure", y="age", data=deceased)
plt.show()

Logically, high blood pressure is an aggravating factor and leads to deaths at a younger median age, while broadening the age range as well.

In [None]:
# Overlay the age distributions of both outcome groups in a single figure.
for group_label, group in (("alive", alive), ("deceased", deceased)):
    sns.distplot(group["age"], label=group_label)
plt.legend()
plt.show()

This distribution plot seems to show that older people are more prone to suffer heart failure, typically at age > 65-70, while still being at a real risk after 40.

In [None]:
# Finally, let's group by status and look at the mean values of the different features
# (one row per DEATH_EVENT value, one column per remaining feature)
final_data.groupby("DEATH_EVENT").mean()

This also points out that people prone to suffer heart failure tend to have higher levels of creatinine_phosphokinase and serum_creatinine.

### This data exploration showed that the remaining features (8 in total) seem to be of relative importance to determine whether or not a patient will suffer heart failure. Let's now build a model that learns how to predict this outcome!

# 2. Model

In [None]:
# Let's first preprocess the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Target: the DEATH_EVENT column. Features: every other column of final_data.
y = final_data["DEATH_EVENT"]
X = final_data.drop(columns=["DEATH_EVENT"])

In [None]:
# We split the dataset into training (80%) and testing (20%)
# random_state is fixed so that the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
# Learn each feature's min/max on the training split only, then rescale both
# splits with those training statistics.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# First model: a k-nearest-neighbours classifier with scikit-learn's default settings
from sklearn.neighbors import KNeighborsClassifier
kNeighbors = KNeighborsClassifier().fit(X_train, y_train)
test_pred_kneigh = kNeighbors.predict(X_test)
train_pred_kneigh = kNeighbors.predict(X_train)

In [None]:
# Let's print some metrics
# ConfusionMatrixDisplay is used directly because the plot_confusion_matrix
# helper was removed from scikit-learn in version 1.2.
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
print(f"KneighborsClassifier:\nTest:\n{classification_report(y_test, test_pred_kneigh)}\nTrain:\n{classification_report(y_train, train_pred_kneigh)}")
# Draw on an explicitly created axes: a bare plt.figure() would stay empty
# while the matrix gets rendered in a second, default-sized figure.
fig, ax = plt.subplots(figsize=(12, 12))
ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, test_pred_kneigh),
    display_labels=kNeighbors.classes_,
).plot(cmap="coolwarm", ax=ax)
plt.show()

The k-neighbors classifier has a decent degree of accuracy (70 to 78% on test/training sets) but we can surely do better, especially on the false negatives (which we want to minimize here!). Let's try a logistic regression algorithm.

In [None]:
# Second model: logistic regression with default hyper-parameters
from sklearn.linear_model import LogisticRegression
LogRegression = LogisticRegression().fit(X_train, y_train)
test_pred_logreg = LogRegression.predict(X_test)
train_pred_logreg = LogRegression.predict(X_train)

In [None]:
# Let's print some metrics
# ConfusionMatrixDisplay is used directly because the plot_confusion_matrix
# helper was removed from scikit-learn in version 1.2.
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, accuracy_score, f1_score
print(f"Logistic Regression:\nTest:\n{classification_report(y_test, test_pred_logreg)}\nTrain:\n{classification_report(y_train, train_pred_logreg)}")
# Draw on an explicitly created axes: a bare plt.figure() would stay empty
# while the matrix gets rendered in a second, default-sized figure.
fig, ax = plt.subplots(figsize=(12, 12))
ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, test_pred_logreg),
    display_labels=LogRegression.classes_,
).plot(cmap="coolwarm", ax=ax)
plt.show()

The algorithm is a bit better: there are fewer false positives, but still too many false negatives. A possible solution would be to decrease the probability threshold. Let's evaluate the different metrics as a function of the threshold parameter.

In [None]:
# Predict probabilities and sweep a manual cut-off, recording the test metrics.
# Column 0 of predict_proba belongs to the first entry of LogRegression.classes_
# (the smaller label, i.e. DEATH_EVENT == 0); a sample is labelled 1 when that
# probability is at or below the cut-off.
probabilities = pd.DataFrame(LogRegression.predict_proba(X_test))
thresholds = np.arange(0.1, 0.91, 0.02)
proba_first_class = probabilities.iloc[:, 0].to_numpy()

# Collect the scores in plain lists and convert once at the end, instead of
# reallocating an array on every iteration.
accuracies, f1scores, precisions, recalls = [], [], [], []
for thresh in thresholds:
    # Vectorised comparison over all test rows; kept as float labels (0.0 / 1.0).
    pred_logreg_newthresh = (proba_first_class <= thresh).astype(float)
    accuracies.append(accuracy_score(y_test, pred_logreg_newthresh))
    # zero_division=0 yields the same value as the default (0) when no sample is
    # predicted positive, but without emitting a warning at every such cut-off.
    f1scores.append(f1_score(y_test, pred_logreg_newthresh, zero_division=0))
    precisions.append(precision_score(y_test, pred_logreg_newthresh, zero_division=0))
    recalls.append(recall_score(y_test, pred_logreg_newthresh))
accuracies = np.array(accuracies)
f1scores = np.array(f1scores)
precisions = np.array(precisions)
recalls = np.array(recalls)

# DataFrame containing thresholds, accuracies, f1 scores, precisions and recalls,
# all multiplied by 100 so that every column is expressed in percent
d = {"thresholds": thresholds, "accuracies": accuracies, "f1scores": f1scores, "precision": precisions, "recall": recalls}
df = pd.DataFrame(d)*100

In [None]:
# One scatter series per metric, all drawn against the threshold column of df.
plt.figure(figsize=(10, 6))
for column, colour, legend_label in (
    ("accuracies", "blue", "accuracy"),
    ("f1scores", "orange", "f1 score"),
    ("precision", "green", "precision"),
    ("recall", "red", "recall"),
):
    plt.scatter(df["thresholds"], df[column], color=colour, label=legend_label)
plt.xlabel("probability threshold [%]")
plt.ylabel("metric [%]")
# Dashed vertical marker at x = 50, drawn behind the scatter points.
plt.axvline(x=50, color="black", linestyle="--", linewidth=0.5, zorder=-34)
plt.legend()
plt.show()

We want to avoid false negatives. I think it's always better to have false positives when it comes to medical diagnosis. We can trade off recall against precision by changing the threshold in order to minimize false negatives. The Logistic Regression model is far from perfect, and adapting the threshold is a limited approach. Another algorithm might be better suited to this problem, or even Deep Learning.

In [None]:
# Apply the chosen cut-off to the probability in column 0 of predict_proba:
# a sample is labelled 1 when that probability is at or below the cut-off.
probabilities = pd.DataFrame(LogRegression.predict_proba(X_test))
threshold = 0.55
# Vectorised comparison over all test rows; kept as float labels (0.0 / 1.0).
pred_logreg_bestthresh = (probabilities.iloc[:, 0] <= threshold).to_numpy().astype(float)
print(confusion_matrix(y_test, pred_logreg_bestthresh))
print(f"Accuracy score: {accuracy_score(y_test, pred_logreg_bestthresh)*100:.2f}%")
print(f"f1 score: {f1_score(y_test, pred_logreg_bestthresh)*100:.2f}%")

## 3. Deep Learning - Neural Network

In [None]:
# We use tensforflow with the keras API
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Sequential neural network that performs classification:
# five ReLU hidden layers (8-16-32-16-8 units), each followed by a Dropout(0.3)
# layer, and a single sigmoid output unit.
nn = Sequential()
for hidden_units in (8, 16, 32, 16, 8):
    nn.add(Dense(units=hidden_units, activation="relu"))
    nn.add(Dropout(0.3))
nn.add(Dense(units=1, activation="sigmoid"))
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])

In [None]:
# Early stop if the monitored validation accuracy ('val_acc') does not improve over 15 epochs
early_stop = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=15)

In [None]:
# Train for at most 100 epochs with the early-stopping callback attached.
# NOTE(review): the test split is passed as validation data, so the stopping
# decision is made on the same rows that are later used to judge the model.
nn.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test), callbacks=[early_stop])

In [None]:
# plot the metrics: one line per entry recorded in the training history,
# with one point per epoch
metrics = pd.DataFrame(nn.history.history)
metrics.plot()
plt.show()

Deep Learning doesn't really help here, maybe because we lack more informative features that would help to better estimate the chances of heart failure.

### Although models that use time as a feature can reach accuracies of 90-95%, without it the few models tested here reached maximum accuracies of about 70 to 80%. Let's try automatic feature selection/dimensionality reduction combined with a hyperparameter sweep to increase accuracy!

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import GenericUnivariateSelect
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Unscaled features and target for the pipeline search, taken from final_data.
y = final_data["DEATH_EVENT"]
X = final_data.drop(columns=["DEATH_EVENT"])

In [None]:
# Two-step pipeline template: a "feature_selec" step (a no-op "passthrough" by
# default) followed by a "clf" step (LogisticRegression by default).
pipeline = Pipeline([("feature_selec","passthrough"), ("clf", LogisticRegression())])

In [None]:
# Search space for the pipeline, as two alternative sub-grids:
#   1. univariate feature selection keeping the top 10..100 percent of features
#   2. PCA keeping 2..5 components
# Each is combined with a random forest or a decision tree and
# min_samples_split in {2, 4, 6}.
# The tree-based estimators get a fixed random_state (the same seed as the
# train/test split) so that repeated runs of the search give the same scores.
grid = [
    {
        "feature_selec": [GenericUnivariateSelect()],
        "feature_selec__mode": ["percentile"],
        "feature_selec__param": np.arange(10, 101, 10),
        "clf": [RandomForestClassifier(random_state=101), DecisionTreeClassifier(random_state=101)],
        "clf__min_samples_split": np.arange(2, 8, 2),
    },
    {
        "feature_selec": [PCA()],
        "feature_selec__n_components": np.arange(2, 6, 1),
        "clf": [RandomForestClassifier(random_state=101), DecisionTreeClassifier(random_state=101)],
        "clf__min_samples_split": np.arange(2, 8, 2),
    },
]
# 5-fold cross-validated search scored on accuracy; training-fold scores are
# also kept so they can be compared with the validation-fold scores.
models = GridSearchCV(pipeline, grid, scoring="accuracy", cv=5, return_train_score=True)

In [None]:
# Run the grid search on the full feature matrix and target.
models_results = models.fit(X, y)

In [None]:
# Tabulate the cross-validation results, one row per parameter combination.
final_results = pd.DataFrame(models_results.cv_results_)

In [None]:
# Show the results ordered by rank (rank 1 = best mean validation score first).
final_results.sort_values(by="rank_test_score")

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
models_results.best_params_

In [None]:
# Mean cross-validated accuracy of the best parameter combination.
models_results.best_score_

The best results show more than 90% mean accuracy on the training folds and 76% on the held-out validation folds of the 5-fold cross-validation.