# Introduction to the Data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Absolute path of the CSV as used in this notebook; adjust it when the file lives elsewhere.
data_filepath = "/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
# Read the whole file into a DataFrame; every later cell starts from `raw_data`.
raw_data = pd.read_csv(data_filepath)

Dataset from [here](https://www.kaggle.com/andrewmvd/heart-failure-clinical-data)<br><br>
This is a dataset of patients with Cardiovascular diseases (CVDs), tracking different conditions (below), and if the patient died from heart failure.
<br>
* **age** - *Age of the patient (years)*
* **anaemia** - *Decrease of red blood cells or hemoglobin (boolean)*
* **creatinine_phosphokinase** - *Level of the CPK enzyme in the blood (mcg/L)*
* **diabetes** - *If the patient has diabetes (boolean)*
* **ejection_fraction** - *Percentage of blood leaving the heart at each contraction (percentage)*
* **high_blood_pressure** - *If the patient has hypertension (boolean)*
* **platelets** - *Platelets in the blood (kiloplatelets/mL)*
* **serum_creatinine** - *Level of serum creatinine in the blood (mg/dL)*
* **serum_sodium** - *Level of serum sodium in the blood (mEq/L)*
* **sex** - *Woman or man (binary)*
* **smoking** - *If the patient smokes (boolean)*
<br><br>
* **time** - *This variable captures the time at which DEATH_EVENT happened in days. For example; if the patient died, then it tells how many days it took to happen, if the patient survives, it tells how long recovery took. We could, in theory, use time as another feature to predict DEATH_EVENT, more on this later*
* **DEATH_EVENT** - *If the patient died from heart failure*

<h4><strong><i>Our goal is to predict the probability of death from heart failure as early warning for patients in hospitals with CVD.</i></strong></h4>

In [None]:
# Show the first rows as a quick sanity check of the loaded columns.
raw_data.head()

We can see that the data seems to have ordered `DEATH_EVENT`, let's just quickly shuffle the dataset.

In [None]:
# Draw every row exactly once in a random (but seeded) order, then renumber the index from 0.
raw_data = (
    raw_data
    .sample(n=len(raw_data), random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Summary statistics for each numeric column.
raw_data.describe()

In [None]:
# Column dtypes and non-null counts.
raw_data.info()

# Graphing the Data
***

## Violin Plots of the Numerical Data

In [None]:
# Single-panel figure: the axes are created up front and handed to seaborn explicitly.
fig, ax = plt.subplots(figsize=(8, 6))

sns.violinplot(data=raw_data, x="DEATH_EVENT", y="ejection_fraction",
               split=True, inner="quart", linewidth=1,
               palette={0: "orangered", 1: "lightcoral"}, ax=ax)
sns.despine(offset=10, trim=True)
ax.set_xlabel("Died");
ax.set_ylabel("Ejection Fraction");

The graph above shows the density of points at different values of `ejection_fraction` for those who died and those who didn't, we call it the *distribution*.<br>
<br>
We can see that, for those who survived, the large majority had an `ejection_fraction` of around 40, but there is also another peak at about 60 (which is interesting, I wonder if anyone can explain it). For those who didn't survive, however, we can see that the distribution of `ejection_fraction` is much more spread out, and the median is lower.

In [None]:
fig, axes = plt.subplots(3, 2, figsize=(16, 16));
fig.suptitle('Distribution of features for different DEATH_EVENT');

# (column, y-axis label) for each panel, listed in row-major order over the 3x2 grid.
violin_panels = [
    ("ejection_fraction", "Ejection Fraction"),
    ("age", "Age"),
    ("creatinine_phosphokinase", "Creatinine Phosphokinase"),
    ("platelets", "Platelets"),
    ("serum_creatinine", "Serum Creatinine"),
    ("serum_sodium", "Serum Sodium"),
]

# One violin plot per numerical feature, each split by outcome on the x axis.
for (column, y_label), ax in zip(violin_panels, axes.flat):
    sns.violinplot(x="DEATH_EVENT", y=column, data=raw_data, split=True, inner="quart", linewidth=1,
                   palette={0: "orangered", 1: "lightcoral"}, ax=ax);
    ax.set_ylabel(y_label);

sns.despine(offset=10, trim=True);
# Every panel shares the same x-axis meaning, so label them all in one call.
plt.setp(axes, xlabel="Died");

A general conclusion to make from all of this is that it seems the people that survived had much less variation in their characteristics than for those who didn't, in other words, only specific people were able to survive, but almost anyone could've died (note though that two thirds of patients lived, which means that a lot of people were this "specific person" that could survive, in these aspects, at least...).

In [None]:
import matplotlib.patches as mpatches

In [None]:
def get_cat_percent_matrix(feature, mode, data=None):
    """
    Cross-tabulate a 0/1 feature against DEATH_EVENT and return percentage strings.

    feature : name of a column holding 0/1 values.
    mode    : which percentages to return -
        'individual' - each (feature, outcome) cell as a share of all rows;
        'in_column'  - each cell as a share of the rows with the same feature value;
        'column'     - the share of rows with feature==1 and with feature==0.
    data    : DataFrame containing `feature` and 'DEATH_EVENT'; when omitted, the
        notebook-level `raw_data` is used.

    Returns a 4-tuple (feature_died, not_feature_died, feature_lived,
    not_feature_lived) for 'individual' and 'in_column', or a 2-tuple
    (feature, not_feature) for 'column'. Every entry is a string such as '33.33%'.

    Raises ValueError when `mode` is not one of the three names above.
    """
    if data is None:
        data = raw_data

    has_feature = data[feature] == 1
    no_feature = data[feature] == 0
    died = data["DEATH_EVENT"] == 1
    lived = data["DEATH_EVENT"] == 0

    # Plain Python ints keep the formatted output independent of numpy's scalar printing.
    total_amount = len(data)
    feature_died_amount = int((has_feature & died).sum())
    not_feature_died_amount = int((no_feature & died).sum())
    feature_lived_amount = int((has_feature & lived).sum())
    not_feature_lived_amount = int((no_feature & lived).sum())

    feature_amount = feature_died_amount + feature_lived_amount
    not_feature_amount = not_feature_died_amount + not_feature_lived_amount

    def percent(part, whole):
        # An empty group has no meaningful share; report 0.0% rather than dividing by zero.
        value = round(part*100/whole, 2) if whole else 0.0
        return str(value)+"%"

    if mode == "individual":
        return (percent(feature_died_amount, total_amount),
                percent(not_feature_died_amount, total_amount),
                percent(feature_lived_amount, total_amount),
                percent(not_feature_lived_amount, total_amount))
    if mode == "in_column":
        return (percent(feature_died_amount, feature_amount),
                percent(not_feature_died_amount, not_feature_amount),
                percent(feature_lived_amount, feature_amount),
                percent(not_feature_lived_amount, not_feature_amount))
    if mode == "column":
        return (percent(feature_amount, total_amount),
                percent(not_feature_amount, total_amount))
    raise ValueError(f"'mode' must be 'individual', 'in_column' or 'column', got {mode!r}")

def make_count_plot(x, ax, x_label):
    """
    Draw a stacked count plot of the 0/1 column `x` of `raw_data`, coloured by
    DEATH_EVENT, on `ax`, and annotate it with percentages.

    x       : name of a 0/1 column in `raw_data`.
    ax      : matplotlib axes to draw on.
    x_label : text for the x axis.

    Three sets of labels are written, all taken from `get_cat_percent_matrix`:
    bold 'in_column' percentages, white 'individual' percentages 9 count-units
    below them, and 'column' percentages just above each bar.
    """
    sns.histplot(data=raw_data, x=x, hue="DEATH_EVENT", binwidth=0.5, stat="count", multiple="stack",
                 palette={0: "firebrick", 1: "maroon", 2: "darkturquoise", 3:"teal"}, ax=ax)
    sns.despine(left=True, ax=ax)
    ax.set_xticks([0, 1])
    ax.set_xlabel(x_label)

    def count_rows(feature_value, outcome):
        # Number of rows with this feature value and this outcome.
        return sum((raw_data[x]==feature_value) & (raw_data["DEATH_EVENT"]==outcome))

    feature_died_amount = count_rows(1, 1)
    not_feature_died_amount = count_rows(0, 1)
    feature_lived_amount = count_rows(1, 0)
    not_feature_lived_amount = count_rows(0, 0)

    # Label anchors, in drawing order. x=0.2 is the feature==0 bar, x=0.7 the feature==1 bar.
    # The y positions assume the DEATH_EVENT==1 segment is the lower one in each stack.
    anchors = [
        (0.2, not_feature_died_amount/2),
        (0.7, feature_died_amount/2),
        (0.7, (feature_lived_amount/2)+feature_died_amount),
        (0.2, (not_feature_lived_amount/2)+not_feature_died_amount),
    ]

    def labels_in_anchor_order(mode):
        # get_cat_percent_matrix returns (feature_died, not_feature_died, feature_lived, not_feature_lived).
        with_died, without_died, with_lived, without_lived = get_cat_percent_matrix(x, mode)
        return [without_died, with_died, with_lived, without_lived]

    for (x_pos, y_pos), label in zip(anchors, labels_in_anchor_order("in_column")):
        ax.text(x_pos, y_pos, label, style='normal', fontsize=12, fontweight="bold")
    for (x_pos, y_pos), label in zip(anchors, labels_in_anchor_order("individual")):
        ax.text(x_pos, y_pos-9, label, style='normal', fontsize=12, color="white")

    # Whole-bar shares, written just above the top of each bar.
    feature_percent, not_feature_percent = get_cat_percent_matrix(x, "column")
    ax.text(0.2, not_feature_died_amount+not_feature_lived_amount+2, not_feature_percent, style='normal', fontsize=12)
    ax.text(0.7, feature_died_amount+feature_lived_amount+2, feature_percent, style='normal', fontsize=12)

    # Hand-built legend entries replace the one seaborn generates from `hue`.
    legend_handles = [
        mpatches.Patch(color='firebrick', label='Survived'),
        mpatches.Patch(color='maroon', label='Died'),
    ]
    ax.legend(handles=legend_handles)

## Count Plots of the Categorical Data

In [None]:
fig, axes = plt.subplots(3, 2, figsize=(16, 16));

# (column, x-axis label) for the five binary features; they fill the grid in row-major order.
binary_features = [
    ("anaemia", "Anaemia"),
    ("diabetes", "Diabetes"),
    ("high_blood_pressure", "High Blood Pressure"),
    ("sex", "Sex"),
    ("smoking", "Smoking"),
]
for (column, label), ax in zip(binary_features, axes.flat):
    make_count_plot(column, ax, label)

# The last panel shows the overall outcome counts with their share of all patients.
outcome_ax = axes[2, 1]
sns.countplot(x="DEATH_EVENT", data=raw_data, palette={0: "firebrick", 1: "maroon"}, ax=outcome_ax);
n_patients = len(raw_data)
for outcome, text_x in ((0, -0.1), (1, 0.9)):
    n_outcome = len(raw_data[raw_data["DEATH_EVENT"]==outcome])
    outcome_ax.text(text_x, n_outcome+2, str(round(n_outcome*100/n_patients, 2))+"%", style='normal', fontsize=12);
sns.despine(left=True, right=True, top=True, ax=outcome_ax);
outcome_ax.set_xlabel("Died");

From this we can see that the only features that really make a difference to **DEATH_EVENT** are `anaemia` and `high_blood_pressure`. Also, (around) two thirds of patients lived.

## Histograms of the Numerical Data

Note in this next table of histograms, that `creatinine_phosphokinase` and `serum_creatinine` are on a logarithmic scale.

In [None]:
fig, axes = plt.subplots(3, 2, figsize=(16, 16));

# (column, x-axis label, log_scale) per panel, in row-major grid order.
# None leaves seaborn's `log_scale` argument at its default, i.e. a linear axis.
histogram_panels = [
    ("ejection_fraction", "Ejection Fraction", None),
    ("age", "Age", None),
    ("creatinine_phosphokinase", "Creatinine Phosphokinase", True),
    ("platelets", "Platelets", None),
    ("serum_creatinine", "Serum Creatinine", True),
    ("serum_sodium", "Serum Sodium", None),
]

# Stacked histogram with a KDE overlay for each numerical feature, coloured by outcome.
for (column, x_label, log_scale), ax in zip(histogram_panels, axes.flat):
    sns.histplot(data=raw_data, x=column, ax=ax, hue="DEATH_EVENT", multiple="stack", bins=10, kde=True,
                 palette={0: "orangered", 1: "lightcoral"}, log_scale=log_scale);
    sns.despine(left=True, right=True, top=True, ax=ax);
    ax.set_xlabel(x_label);

# Feature Engineering

***

## Making New Features

In [None]:
# Work on a copy so the new columns never touch `raw_data`.
engineered_data = raw_data.copy()

In [None]:
# NOTE: despite the "_ratio" suffix, the first two features are products, not quotients.
# CPK level times ejection fraction expressed as a fraction (0-1).
engineered_data["heart_function1_ratio"] = engineered_data["creatinine_phosphokinase"] * (engineered_data["ejection_fraction"]/100)
# Platelet count times ejection fraction expressed as a fraction (0-1).
engineered_data["heart_function2_ratio"] = engineered_data["platelets"] * (engineered_data["ejection_fraction"]/100)
# Serum sodium divided by CPK level.
engineered_data["blood_function1_ratio"] = engineered_data["serum_sodium"] / engineered_data["creatinine_phosphokinase"]
# Age times serum creatinine.
engineered_data["general_healthiness"] = engineered_data["age"] * engineered_data["serum_creatinine"]
# Ejection fraction divided by the feature above, so this line must run after it.
engineered_data["heart_healthiness"] = engineered_data["ejection_fraction"] / (engineered_data["general_healthiness"])
# The three "auto" features are further combinations of age, ejection fraction, serum sodium, platelets and serum creatinine.
engineered_data["auto1"] = engineered_data["age"] / (engineered_data["ejection_fraction"]*engineered_data["serum_sodium"])
engineered_data["auto2"] = (engineered_data["age"]*engineered_data["serum_sodium"]) / engineered_data["ejection_fraction"]
engineered_data["auto3"] = (engineered_data["platelets"]*engineered_data["serum_creatinine"])/engineered_data["ejection_fraction"]

Note that most of these new (engineered) features were made by hand, but the features `auto 1` - `3` were made by a search algorithm I made. I haven't shown it in a notebook yet, but if you want to see it, it's in <a href="https://stackoverflow.com/questions/65448806/deep-feature-synthesis-depth-for-transformation-primitives-featuretools/65470819#65470819">this</a> StackOverflow post I made.

## Feature Correlations

In [None]:
# Reordering the columns so that DEATH_EVENT is last.
# The target is located by name, so this keeps working if columns are added or removed upstream.
num_df = engineered_data.copy()
cols = [col for col in num_df.columns if col != "DEATH_EVENT"] + ["DEATH_EVENT"]
num_df = num_df[cols]

In [None]:
# Absolute pairwise correlations: only the strength of each relationship is kept, not its sign.
all_corrs = abs(num_df.corr())

In [None]:
# Correlation of every feature with DEATH_EVENT (the target's own entry removed), strongest first.
corr_with_death = all_corrs["DEATH_EVENT"].drop("DEATH_EVENT").to_dict()
death_corrs = dict(sorted(corr_with_death.items(), key=lambda item: item[1], reverse=True))
for feature, corr_value in death_corrs.items():
    print(feature+"'s correlation with 'DEATH_EVENT': "+str(corr_value))

In [None]:
# Absolute correlations among the engineered columns only (every raw column is dropped), with DEATH_EVENT added back as the target.
all_engineered_corrs = abs(pd.concat([engineered_data.drop(raw_data.columns, axis=1), engineered_data["DEATH_EVENT"]], axis=1).corr())
# Absolute correlations among the raw columns, for comparison.
all_raw_corrs = abs(raw_data.corr())

In [None]:
# Heatmap of the engineered-feature correlations; the colour scale is fixed to [0, 1] because the values are absolute.
plt.figure(figsize=(16, 6));
sns.heatmap(all_engineered_corrs, linewidths=0, vmin=0, vmax=1, annot=True);

In [None]:
# Same heatmap for the raw features, on the same [0, 1] colour scale.
plt.figure(figsize=(16, 6));
sns.heatmap(all_raw_corrs, linewidths=0, vmin=0, vmax=1, annot=True);

Looking into our engineered features, they seem to have more clearly pronounced the relations of the features with **DEATH_EVENT** in the correlation metric, which means it should also be easier for machine learning models to use, which is great.<br>
Looking into the raw features, most of them seem relatively independent from each other, which is good (except that it also applies to **DEATH_EVENT**). For the few correlations that do exist; `time` looks very correlated with **DEATH_EVENT**, which is good (we'll look into it next), and `sex` is very correlated with `smoking`. Other than that, there are some small nonsensical correlations (e.g. between `time` and `high_blood_pressure`).<br> 


## Looking at time

Now's the time to talk about `time`. This variable appears to be the time which the patient already sees the doctor for treatment. And while it was said that this feature is meant to be a target variable, as others have said, this is open to interpretation... So let's see how it relates to **DEATH_EVENT**:

In [None]:
# NOTE: the labels say "average", but the statistic computed here is the median of `time` per outcome group.
print("Time of the average survivor: "+str(raw_data.loc[raw_data['DEATH_EVENT']==0]['time'].median()))
print("Time of the average deceased: "+str(raw_data.loc[raw_data['DEATH_EVENT']==1]['time'].median()))

In [None]:
# Distribution of `time` for each outcome, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 6))

sns.violinplot(data=raw_data, x="DEATH_EVENT", y="time",
               split=True, inner="quart", linewidth=1,
               palette={0: "orangered", 1: "lightcoral"}, ax=ax)
sns.despine(offset=10, trim=True)
ax.set_xlabel("Died");
ax.set_ylabel("Time");

In [None]:
# Stacked histogram of `time` (15 bins) with a KDE overlay, coloured by outcome.
sns.histplot(data=raw_data, x="time", bins=15, hue="DEATH_EVENT", multiple="stack", kde=True, 
             palette={0: "orangered", 1: "lightcoral"});

In [None]:
# [0, 1] picks the off-diagonal entry of the 2x2 correlation matrix; abs() drops its sign.
print("Correlation of time with DEATH_EVENT: "+str(abs(raw_data[["time", "DEATH_EVENT"]].corr().to_numpy()[0, 1])))

We see it is very correlated, those who perished did so quickly, and those who survived had to wait for a long recovery.<br>
But should we use it?<br>
<br>
To answer this, we need to think about how the final model will be used in production (yes, I'm aware the final model will probably never *actually* make it to production, but I think we should still think like it will nonetheless).<br>
So, how will the model be used? The limit is your imagination! However, I will pretend it'll be used as [vaguely] described in the 
<a href="https://www.kaggle.com/andrewmvd/heart-failure-clinical-data" target="_blank">dataset's description</a>, which is using it for early detection. That would mean this model will be used in a hospital setting, in order to give early warning for patients with CVD for heart failure. Now we know how it's going to be used, we can know (or at least guess) what features will be possible to provide.<br>
Since it'll be used as early warning, we won't know how long a patient is *going to* stay in contact (which is what `time` is), and thus we can't provide `time`... Well, we can't provide `time` *exactly*, but we could in that situation instead provide the time in contact *so far*. Doing this will essentially bias the model to think all new patients will die, but will get less and less confident as said patients keeps surviving. This would make it a pretty terrible predictor of death probability, but instead it'll give a good "how much attention you should give a patient". However, the way it does it (give new patients more attention) is already pretty obvious to hospitals and we probably don't need to skew a model's predictions to say the same thing.<br>
In conclusion, for how I want the model to be used, the use of `time` will only skew the results in ways we don't want. Thus, I will *not* use the `time` variable in this notebook.

In [None]:
# errors="ignore" makes this cell safe to re-run: once `time` is gone, dropping it again is a no-op instead of a KeyError.
engineered_data = engineered_data.drop(columns="time", errors="ignore")

# Feature Selection

Now, we need to perform feature selection (remove the useless features for prediction).<br>
Here, I've decided to do a small variation of the [Sequential Feature Selector](https://scikit-learn.org/stable/auto_examples/release_highlights/plot_release_highlights_0_24_0.html). Essentially, it starts with the dataset with all the features, and then gets the (average)* test score of a model on that dataset (the code below uses an SVC, scored with an F1 computed from the average precision and recall), but with one of the features removed. It does this for all the features, and then properly removes the feature it got the highest score without (if it managed to get a high score without a feature, that means it didn't *need* that feature to get a good score). Then, just rinse and repeat, (properly) removing features until there's only one left, at which point, we find the point in the process where the (average)* score on the test set was the highest, returning that as the best selection of features it found.<br>
<br>
\* - When I say "*average*" test score, I mean I go from the raw data to the trained model scored on the test set multiple times, eventually finishing all the runs and returning the average test score. I do this because the dataset is very small, which means the test set is always very small; it will rarely be representative of the population, and is thus not a good way to score a model... To resolve this, I just take the average over many runs, each time doing the train-test split, data preparation, defining the model, training it and scoring it.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import warnings

In [None]:
class ScaleData(BaseEstimator, TransformerMixin):
    """
    Scale each column of a DataFrame by a high quantile of its absolute values.

    `fit` stores, per column, the `quantile`-th quantile of the absolute values
    in `self.max_features`; `transform` divides each of those columns by its
    stored value. Because the divisor is a quantile rather than the maximum,
    scaled values can be larger than 1.

    Parameters
    ----------
    quantile : float, default 0.95
        Quantile of the absolute column values used as the divisor.
    ignore_missing : bool, default False
        When False, `transform` raises if a column seen during `fit` is absent
        from its input. When True, absent columns are skipped and the output
        simply has fewer columns.
    """
    def __init__(self, quantile=0.95, ignore_missing=False):
        self.quantile = quantile
        self.ignore_missing = ignore_missing

    def fit(self, X, y=None):
        """Learn the per-column divisors from the DataFrame `X`; `y` is ignored. Returns self."""
        if not isinstance(X, pd.DataFrame):
            raise TypeError("'X' must be a pd.DataFrame")
        self.max_features = X.abs().quantile(self.quantile)
        return self

    def transform(self, X):
        """
        Return a new DataFrame with every fitted column of `X` divided by its divisor.

        Columns come out in the order seen during `fit` and keep `X`'s index.
        Columns of `X` that were not seen during `fit` are dropped with a warning.

        Raises
        ------
        TypeError    if `X` is not a DataFrame.
        RuntimeError if `fit` has not been called.
        ValueError   if fitted columns are missing and `ignore_missing` is False.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("'X' must be a pd.DataFrame")
        if not hasattr(self, "max_features"):
            raise RuntimeError("fit method wasn't called.")

        fitted_cols = list(self.max_features.keys())
        missing_cols = [col for col in fitted_cols if col not in X.columns]
        if missing_cols and not self.ignore_missing:
            raise ValueError(f"{len(missing_cols)} columns missing: {missing_cols}")

        # One vectorised division; the divisors are aligned to the columns by label.
        present_cols = [col for col in fitted_cols if col in X.columns]
        X_scaled = X[present_cols].div(self.max_features[present_cols], axis="columns")

        # Warn only when input columns were actually discarded, not when they were merely reordered.
        unrecognized = [col for col in X.columns if col not in fitted_cols]
        if unrecognized:
            warnings.warn(f"{len(unrecognized)} column(s) were removed, as they weren't recognized: {np.setdiff1d(X.columns, X_scaled.columns)}", Warning)
        return X_scaled
    
def model_metrics(make_model_func, X, y, num_iters=200, silence=False, random_state=None):
    """
    Estimate accuracy, precision and recall over repeated random train/test splits.

    Each iteration makes a fresh 80/20 split of (X, y), fits a new `ScaleData`
    on the training part only, applies it to both parts, builds a model with
    `make_model_func` and scores it on the held-out part.

    Parameters
    ----------
    make_model_func : callable
        Called as `make_model_func(X_train, y_train)`; must return a fitted
        model exposing `score` and `predict`.
    X : pd.DataFrame
        Feature table (`ScaleData` only accepts DataFrames).
    y : labels aligned with `X`.
    num_iters : int, default 200
        Number of split/fit/score repetitions.
    silence : bool, default False
        When True, nothing is printed.
    random_state : None, int or np.random.RandomState, default None
        Seed for the splits. None keeps the splits unseeded; an int (or an
        existing RandomState) makes the whole sequence of splits reproducible
        while still giving a different split on every iteration.

    Returns
    -------
    (accuracies, precisions, recalls) : three lists with one entry per iteration.
    """
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        # One generator shared by all iterations: each split advances it, so splits differ but replay exactly.
        rng = np.random.RandomState(random_state)

    def mean_percent(values):
        return round( np.array(values).mean()*100, 3 )

    accuracies, precisions, recalls = [], [], []
    for i in range(num_iters):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rng)

        # The scaler is fitted on the training part only, so nothing leaks from the held-out rows.
        scale_data = ScaleData()
        X_train = scale_data.fit_transform(X_train)
        X_test = scale_data.transform(X_test)

        model = make_model_func(X_train, y_train)
        accuracies.append(model.score(X_test, y_test))
        y_pred = model.predict(X_test)
        precisions.append(precision_score(y_test, y_pred, zero_division=0))
        recalls.append(recall_score(y_test, y_pred, zero_division=0))

        # Refresh the single-line progress bar every 10th iteration ("\r" rewinds the line).
        if (i%10 == 0) and not silence:
            filled = round( (i/num_iters)*20 )
            print(f"\r|{'█'*filled}{' '*(20-filled)}|"+\
                  f" - {round( (i/num_iters)*100, 1 )}%"+\
                  f" - Running Accuracy: {mean_percent(accuracies)}%"+\
                  f" - Running Precision: {mean_percent(precisions)}%"+\
                  f" - Running Recall: {mean_percent(recalls)}%            ", end="")

    if not silence:
        # Trailing spaces overwrite whatever is left of the longer progress line.
        print(f"\rFinished! - Average Accuracy: {mean_percent(accuracies)}%"+\
              f" - Average Precision: {mean_percent(precisions)}%"+\
              f" - Average Recall: {mean_percent(recalls)}%                                     ", end="")
    return accuracies, precisions, recalls

# Extra misc function that's never actually used in the notebook
from sklearn.model_selection import GridSearchCV
def model_tuner(model, param_grid, X, y, num_iters=200, silence=False):
    """
    Run a grid search `num_iters` times and collect the winning parameters.

    Parameters
    ----------
    model : callable
        Zero-argument factory (typically an estimator class); `model()` is
        passed to `GridSearchCV`.
    param_grid : dict or list of dicts
        Search space handed to `GridSearchCV`.
    X : pd.DataFrame
        Features; scaled once with `ScaleData` before searching.
    y : labels aligned with `X`.
    num_iters : int, default 200
        Number of grid-search repetitions.
    silence : bool, default False
        When True, nothing is printed.

    Returns
    -------
    list of str : `str(search.best_params_)` from every repetition.
    """
    # Scale once, into a new name: the caller's X is never overwritten and the
    # already-scaled data is not scaled again on every pass.
    X_scaled = ScaleData().fit_transform(X)

    def most_common(params):
        # Most frequent winner so far; None when no search has run yet.
        return pd.Series(params).value_counts().keys()[0] if params else None

    best_params = []
    for i in range(num_iters):
        search = GridSearchCV(model(), param_grid)
        search.fit(X_scaled, y)
        best_params.append(str(search.best_params_))

        if (i%10 == 0) and not silence:
            filled = round( (i/num_iters)*20 )
            print(f"\r|{'█'*filled}{' '*(20-filled)}|"+\
                  f" - {round( (i/num_iters)*100, 1 )}%"+\
                  f" - Best Params: {most_common(best_params)}            ", end="")

    if not silence:
        print(f"\rFinished! - Best Params: {most_common(best_params)}"+\
              f"                                     ", end="")
    return best_params

def select_features(X, y, num_iters=2000):
    """
    Backward feature elimination scored with an SVC.

    Starting from all columns of `X`, every round scores the data with each
    single column left out, permanently removes the column whose absence gave
    the best score, and records that score for the reduced column set. This
    continues until one column is left; the best-scoring column set seen along
    the way (including the full set) is returned.

    The score is 2*P*R/(P+R), where P and R are the mean precision and mean
    recall returned by `model_metrics`.

    Parameters
    ----------
    X : pd.DataFrame of candidate features.
    y : labels aligned with `X`.
    num_iters : int, default 2000
        Repetitions passed to `model_metrics` for every evaluation. Lower it
        for a quicker, noisier search.

    Returns
    -------
    list : column names of the best-scoring feature set.
    """
    def make_svc(X, y):
        svc = SVC()
        svc.fit(X, y)
        return svc

    def score(features):
        _, precisions, recalls = model_metrics(make_svc, features, y, num_iters=num_iters, silence=True)
        precision, recall = np.array(precisions).mean(), np.array(recalls).mean()
        if precision + recall == 0:
            # No positive prediction was ever right: score it 0 instead of producing 0/0 = NaN,
            # which would make the max() comparisons below unreliable.
            return 0.0
        return (2*precision*recall)/(precision+recall)

    print("\rStarting...                             ", end="")
    feature_combination_scores = {tuple(X.columns): score(X)}

    num_columns = len(X.columns)
    # Rounds evaluate n, n-1, ..., 2 candidate removals respectively.
    total_rounds = sum(range(2, num_columns+1))
    evaluations_done = 0
    round_number = 0
    while len(X.columns) > 1:
        scores_without = {}
        for j, feature in enumerate(X.columns):
            # Overall progress counts finished evaluations, so the bar never moves backwards between rounds.
            filled = round(evaluations_done*20/total_rounds)
            print(f"\r|{'█'*filled}{' '*(20-filled)}| - {round(evaluations_done*100/total_rounds, 2)}%"+\
                  f" - Round {round_number+1}/{num_columns-1} - { round(j*100/len(X.columns), 2) }%                               ", end="")
            scores_without[feature] = score(X.drop(feature, axis=1))
            evaluations_done += 1
        # The feature whose removal hurt least (or helped most) is the one to discard.
        least_needed = max(scores_without, key=scores_without.get)
        X = X.drop(least_needed, axis=1)
        feature_combination_scores[tuple(X.columns)] = scores_without[least_needed]
        round_number += 1
    print("\rFinished!                                                                      ")
    return list(max(feature_combination_scores, key=feature_combination_scores.get))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score

In [None]:
# Previously found to be the best set of features.
# Hard-coded so the notebook does not have to repeat the slow search; these names must exist as columns of `engineered_data`.
selected_features = ["creatinine_phosphokinase", "sex", "heart_function1_ratio", "general_healthiness", "heart_healthiness", "auto1", "auto2"]

In [None]:
### IF YOU WANT TO RUN THE FEATURE SELECTION ALGORITHM; UNCOMMENT AND RUN THE FOLLOWING LINE:
# selected_features = select_features(engineered_data.drop("DEATH_EVENT",axis=1), engineered_data["DEATH_EVENT"])
### BE WARNED, IT TAKES A LONG TIME

In [None]:
# Show which features the rest of the notebook will use.
print(selected_features)

In [None]:
# Absolute correlations, like the earlier heatmaps: with the colour scale fixed to [0, 1],
# signed values would make every negative correlation look like "no correlation".
plt.figure(figsize=(16, 6));
sns.heatmap(abs(engineered_data[selected_features+["DEATH_EVENT"]].corr()), linewidths=0, vmin=0, vmax=1, annot=True);

# Train-test Split

Now, let's put aside some of the data for testing models later on. I'll use a stratified train-test split, as the dataset size is small and I want to make sure `DEATH_EVENT`'s classes have the same proportion in each split.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
engineered_data = engineered_data.reset_index(drop=True)
# Only the selected features and the target take part in the split.
model_data = engineered_data[selected_features+["DEATH_EVENT"]]

# A single stratified 80/20 partition is all that is used below, so only one split is generated.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(model_data, model_data["DEATH_EVENT"]))

# The splitter yields positional row numbers, hence .iloc.
strat_train_set = model_data.iloc[train_index]
strat_test_set = model_data.iloc[test_index]

Okay! Let's check the proportions:

In [None]:
# ANSI escape sequences that switch bold text on and off in the printed output.
start_bold = "\033[1m"
end_bold = "\033[0m"

In [None]:
def class_percentages(labels):
    """Return (percent lived, percent died) of a DEATH_EVENT column, each rounded to 3 decimals."""
    total = len(labels)
    lived = int((labels == 0).sum())
    died = int((labels == 1).sum())
    # Both shares are computed the same way: count first, then scale.
    return round(lived*100 / total, 3), round(died*100 / total, 3)

all_lived, all_died = class_percentages(engineered_data["DEATH_EVENT"])
train_lived, train_died = class_percentages(strat_train_set["DEATH_EVENT"])
test_lived, test_died = class_percentages(strat_test_set["DEATH_EVENT"])

print(f"{start_bold}Original proportions of DEATH_EVENT:{end_bold}")
print(f'Percent that lived: {all_lived}% - Percent that died: {all_died}%')
print("------------------------------------")
print(f"{start_bold}Train/test set proportions:{end_bold}")
print(f'Train - Percent that lived: {train_lived}% - Percent that died: {train_died}%')
print(f'Test  - Percent that lived: {test_lived}% - Percent that died: {test_died}%')

Looks good! The stratification seems to have worked.

In [None]:
# Separate the feature columns from the DEATH_EVENT target in each partition.
X_train = strat_train_set.drop(columns="DEATH_EVENT")
y_train = strat_train_set["DEATH_EVENT"]
X_test = strat_test_set.drop(columns="DEATH_EVENT")
y_test = strat_test_set["DEATH_EVENT"]

In [None]:
# Renumber all four pieces from 0 so they no longer carry row labels from the full table.
X_train, y_train, X_test, y_test = (
    part.reset_index(drop=True) for part in (X_train, y_train, X_test, y_test)
)

# Data Preprocessing

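Now, we just need to preprocess the data to make it better for machine learning models. There's not much we actually have to do (the data came pretty prepared already), I'll just be scaling each feature by the 95th percentile of its absolute values, which puts most values roughly in the range `[0. - 1.]` (the largest 5% end up above 1).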

In [None]:
# Scaler with its default settings (divides by the 0.95 quantile; missing columns are an error).
scale_data = ScaleData()

In [None]:
# Learn the divisors from the training set and scale it in one step.
X_train = scale_data.fit_transform(X_train)
# Reuse the training-set divisors for the test set.
X_test = scale_data.transform(X_test) # Make sure not to call the `fit` method on the test set!

In [None]:
# Check the scaled training features.
X_train.head()

# Testing Different Models

Since the dataset size is small, scoring from a test set is always very random. So, I've automated the process, from raw data to fine-tuned model so that I can get an average of the test set results.<br>
First let's try the KNN:

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def make_knn(X_train, y_train):
    """
    Fit a KNN classifier, choosing `n_neighbors` with a sliding grid search.

    The search starts on the window [6, 7, 8] (5-fold CV, accuracy). While the
    best value lies on an edge of the window, the window is shifted past that
    edge and the search is repeated. It stops when the best value is inside the
    window, repeats the previous best, or hits a bound (1 at the bottom,
    int(0.7*len(X_train)) at the top). Returns the best estimator of the last search.
    """
    upper_bound = int(0.7*len(X_train))
    window = [6, 7, 8]
    previous_best = None
    while True:
        knn_search = GridSearchCV(
            KNeighborsClassifier(),
            [{"n_neighbors": window}],
            cv=5, scoring="accuracy", return_train_score=True,
        )
        knn_search.fit(X_train, y_train)
        best_k = knn_search.best_params_["n_neighbors"]
        low, high, width = min(window), max(window), len(window)

        if best_k == low:
            # Best value on the lower edge: stop at the bound or when bouncing back, otherwise look lower.
            if best_k == previous_best or best_k == 1:
                break
            window = [k for k in range(low-width+1, low+1) if k >= 1] # Filter illegal values
        elif best_k == high:
            # Best value on the upper edge: same idea, looking higher.
            if best_k == previous_best or best_k == upper_bound:
                break
            window = [k for k in range(high, high+width) if k <= upper_bound] # Filter illegal values
        else:
            break

        previous_best = best_k
    return knn_search.best_estimator_

In [None]:
# Score the KNN over 2000 random train/test splits; the unscaled data is passed in because model_metrics scales each split itself.
knn_accuracies, knn_precisions, knn_recalls = model_metrics(make_knn, engineered_data[selected_features], 
                                                            engineered_data["DEATH_EVENT"], num_iters=2000)

Certainly not too great... but that's to be expected. Next up is logistic regression:

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
def make_log_reg(X_train, y_train):
    """Fit and return an elastic-net logistic regression on the given training data.

    No hyperparameter search happens here: C=1, l1_ratio=0.5, the saga solver and
    max_iter=1000 are fixed.
    """
    return LogisticRegression(
        C=1,
        penalty="elasticnet",
        l1_ratio=0.5,
        solver="saga",
        max_iter=1000,
    ).fit(X_train, y_train)

In [None]:
# Collect accuracy, precision and recall for logistic regression over 2000 iterations of `model_metrics`.
log_accuracies, log_precisions, log_recalls = model_metrics(
    make_log_reg,
    engineered_data[selected_features],
    engineered_data["DEATH_EVENT"],
    num_iters=2000,
)

Finally, let's do the SVM:

In [None]:
from sklearn.svm import SVC

In [None]:
def make_svc(X_train, y_train):
    """Fit an SVC with scikit-learn's default settings and return the fitted model."""
    return SVC().fit(X_train, y_train)

In [None]:
# Collect accuracy, precision and recall for the SVC over 2000 iterations of `model_metrics`.
svc_accuracies, svc_precisions, svc_recalls = model_metrics(
    make_svc,
    engineered_data[selected_features],
    engineered_data["DEATH_EVENT"],
    num_iters=2000,
)

As we can see, the SVC performed best.<br>
**Note** - I've also tried Decision Trees, Random Forests and XGBoost, but they weren't very successful. They are very parametric and complex, and after spending all that time on feature engineering and selection, I don't think we need a very complex model in the first place.

# Making the Final Model

Since the SVC performed the best, we'll use the SVC...

In [None]:
# Final classifier. probability=True is required for the `predict_proba` calls
# made in the error-analysis cells. X_train / y_train come from an earlier cell.
model = SVC(probability=True).fit(X_train, y_train)

## Explaining Decisions with SHAP

Here's something you might not have seen before: SHAP (SHapley Additive exPlanations). Essentially, it can explain why a model made its decision.

In [None]:
import shap

In [None]:
# Per-feature median of the training data as a single row (shape 1 x n_features);
# it is passed to shap.Explainer as the background sample.
med = np.reshape(X_train.median().values, (1, X_train.shape[1]))

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Bag 100 SVCs (with probability estimates enabled) and explain the ensemble with SHAP.
# The base model is passed positionally on purpose: its keyword was `base_estimator`
# before scikit-learn 1.2 and is `estimator` from 1.2 on (the old name was removed
# in 1.4), so the positional form is the only spelling that runs on every version.
svc = BaggingClassifier(SVC(probability=True), n_estimators=100)
svc.fit(X_train, np.ravel(y_train.to_numpy()))


def svc_f(x):
    """Return the bagged ensemble's predicted probability of the positive class."""
    return svc.predict_proba(x)[:, 1]


# `med` (the per-feature training median) is the background the explanations are measured against.
svc_explainer = shap.Explainer(svc_f, med)
svc_shap_values = svc_explainer(X_test)

In [None]:
import ipywidgets as widgets
from IPython.display import clear_output

In [None]:
output = widgets.Output()

def explain_prediction(change):
    """Redraw the SHAP waterfall plot and true label for the selected test row.

    `change` follows the ipywidgets observer convention: only `change["new"]`
    (the newly selected row index) is read.
    """
    with output:
        # Replace whatever explanation was shown before instead of stacking plots.
        clear_output(wait=True)
        row = change["new"]
        plt.show(shap.plots.waterfall(svc_shap_values[row]))
        print(f"DEATH_EVENT: {y_test.iloc[row]}")

# Integer input for picking which explained test row to show.
# Valid positions are 0 .. len(svc_shap_values) - 1; BoundedIntText's `max` is
# inclusive, so using the bare length would let the user select an index that
# raises IndexError inside explain_prediction.
index_chooser = widgets.BoundedIntText(
    value=0,
    min=0,
    max=max(len(svc_shap_values) - 1, 0),
    step=1,
    description='Index:',
    disabled=False
)
# Re-render the explanation every time the selected index changes.
index_chooser.observe(explain_prediction, names="value")

In [None]:
# Show the index selector together with its output area, then draw the
# explanation for index 0 so a plot is visible before any interaction.
display(index_chooser, output)
explain_prediction({"new": 0})

In [None]:
import math

This is how different values for different features affected the model's output:

In [None]:
# One SHAP scatter plot per feature, laid out two per row.
num_cols = 2
num_rows = int(np.ceil(len(X_train.columns) / num_cols))
# squeeze=False keeps `axes` two-dimensional even when there is only one row;
# otherwise the [row, column] indexing below fails for one or two features.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(16, 12), squeeze=False)

for i, col in enumerate(X_train.columns):
    # Fill the grid row by row: features 0 and 1 on the first row, 2 and 3 on the second, ...
    shap.plots.scatter(svc_shap_values[:, col], hist=False, ax=axes[i // num_cols, i % num_cols])
    # NOTE(review): presumably discards intermediate renders so only the finished grid remains — confirm.
    clear_output(wait=True)

And a more compact version of above:

In [None]:
# Beeswarm summary of the SHAP values of all features in a single figure.
shap.plots.beeswarm(svc_shap_values, plot_size=(16, 6))

As we can see, the features don't affect the output much, but they have outliers where they do. Also, the features seem to only ever *raise* the predicted probability.

## Error Analysis

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve

In [None]:
# Per-iteration F1 = 2PR / (P + R).
# When an iteration has precision == recall == 0 the plain formula evaluates 0/0 = NaN,
# and a single NaN turns the reported mean and deviation into NaN. Such iterations
# are scored 0 instead, matching scikit-learn's zero_division=0 convention.
_svc_p = np.asarray(svc_precisions, dtype=float)
_svc_r = np.asarray(svc_recalls, dtype=float)
_svc_pr_sum = _svc_p + _svc_r
svc_f1s = np.divide(2*_svc_p*_svc_r, _svc_pr_sum, out=np.zeros_like(_svc_pr_sum), where=_svc_pr_sum > 0)

In [None]:
# Report mean and standard deviation ("Deviation") of each SVC metric across all
# evaluation iterations, as percentages rounded to 3 decimals.
# start_bold / end_bold are defined earlier in the notebook (presumably terminal
# formatting codes — confirm).
print(f"{start_bold}SVC Metrics{end_bold}\n‾‾‾‾‾‾‾‾‾‾‾")
print(f"Accuracy:  {round(np.array(svc_accuracies).mean()*100, 3)}% - Deviation: {round(np.array(svc_accuracies).std()*100, 3)}%")
print(f"Precision: {round(np.array(svc_precisions).mean()*100, 3)}% - Deviation: {round(np.array(svc_precisions).std()*100, 3)}%")
print(f"Recall:    {round(np.array(svc_recalls).mean()*100, 3)}% - Deviation: {round(np.array(svc_recalls).std()*100, 3)}%")
print(f"F1:        {round(np.array(svc_f1s).mean()*100, 3)}% - Deviation: {round(np.array(svc_f1s).std()*100, 3)}%")

In [None]:
# Positive-class probabilities of the final `model` on X_test; the cells below use
# them only to derive candidate decision thresholds.
y_pred_proba = model.predict_proba(X_test)[:, 1]

In [None]:
def svc_metrics_for_threshold(X, y, threshold, num_iters=200, silence=False):
    """Evaluate an SVC at a fixed probability threshold over repeated random splits.

    Each iteration draws a fresh 80/20 train/test split, scales the features with
    `ScaleData` (fitted on the training part only), fits `SVC(probability=True)`
    and labels a test row positive when its predicted probability is strictly
    greater than `threshold`.

    Parameters:
        X, y: features and binary target to split.
        threshold: probability cut-off for predicting the positive class.
        num_iters: number of random splits to evaluate.
        silence: when True, no progress bar or summary line is printed.

    Returns:
        Five lists with one entry per iteration:
        (accuracies, precisions, recalls, tprs, fprs).
    """
    accuracies, precisions, recalls, tprs, fprs = [], [], [], [], []
    for i in range(num_iters):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        scale_data = ScaleData()
        X_train = scale_data.fit_transform(X_train)
        X_test = scale_data.transform(X_test)

        model = SVC(probability=True)
        model.fit(X_train, y_train)
        positive_proba = model.predict_proba(X_test)[:, 1]
        y_pred = (positive_proba > threshold).astype(np.int64)

        # Confusion-matrix counts for this split.
        TP = sum((y_pred == 1) & (y_test == 1))
        FP = sum((y_pred == 1) & (y_test == 0))
        TN = sum((y_pred == 0) & (y_test == 0))
        FN = sum((y_pred == 0) & (y_test == 1))

        accuracies.append(sum(y_pred == y_test)/len(y_pred))
        precisions.append(precision_score(y_test, y_pred, zero_division=0))
        recalls.append(recall_score(y_test, y_pred, zero_division=0))
        tprs.append(TP / (TP + FN))
        fprs.append(FP / (FP + TN))

        # Refresh the single-line progress bar every 10th iteration.
        if not silence and (i%10 == 0):
            print(f"\r|{''.join('█' for _ in range(round( (i/num_iters)*20 )))}"+\
                  f"{''.join(' ' for _ in range(20-round( (i/num_iters)*20 )))}|"+\
                  f" - {round( (i/num_iters)*100, 1 )}%"+\
                  f" - Running Accuracy: {round( np.array(accuracies).mean()*100, 3 )}%"+\
                  f" - Running Precision: {round( np.array(precisions).mean()*100, 3 )}%"+\
                  f" - Running Recall: {round( np.array(recalls).mean()*100, 3 )}%            ", end="")

    if not silence:
        # Overwrite the progress bar with the final averages.
        print(f"\rFinished! - Average Accuracy: {round( np.array(accuracies).mean()*100, 3 )}%"+\
              f" - Average Precision: {round( np.array(precisions).mean()*100, 3 )}%"+\
              f" - Average Recall: {round( np.array(recalls).mean()*100, 3 )}%                                     ", end="")
    return accuracies, precisions, recalls, tprs, fprs

In [None]:
# Candidate thresholds come from the single fitted `model`; each one is then
# re-evaluated over 500 fresh random splits and the per-split metrics are averaged.
_, _, thresholds = precision_recall_curve(y_test, y_pred_proba)
accuracies, precisions, recalls = [], [], []
for thresh in thresholds:
    accs, precs, recs, _, _ = svc_metrics_for_threshold(
        engineered_data[selected_features],
        engineered_data["DEATH_EVENT"],
        thresh,
        num_iters=500,
        silence=True,
    )
    accuracies.append(np.mean(accs))
    precisions.append(np.mean(precs))
    recalls.append(np.mean(recs))

In [None]:
# Same procedure for the ROC curve: thresholds from the single fitted `model`,
# then mean TPR / FPR over 500 fresh random splits per threshold.
_, _, roc_thresholds = roc_curve(y_test, y_pred_proba)
tprs, fprs = [], []
for thresh in roc_thresholds:
    _, _, _, tps, fps = svc_metrics_for_threshold(
        engineered_data[selected_features],
        engineered_data["DEATH_EVENT"],
        thresh,
        num_iters=500,
        silence=True,
    )
    tprs.append(np.mean(tps))
    fprs.append(np.mean(fps))

In [None]:
def plot_precisions_recalls_vs_thresholds(precisions, recalls, thresholds):
    """Plot precision (blue) and recall (green) against the decision threshold.

    The last element of `precisions` and `recalls` is dropped before plotting, so
    both sequences must be exactly one element longer than `thresholds`.
    """
    fig = plt.figure(figsize=(10,5))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])

    curves = ((precisions, "b-", "Precision"), (recalls, "g-", "Recall"))
    for values, fmt, label in curves:
        ax.plot(thresholds, values[:-1], fmt, label=label)

    ax.set_title("Precision & Recall vs. Threshold")
    ax.set_xlabel("Threshold")
    ax.legend()
    ax.grid(True)
    
def plot_precision_vs_recall(precisions, recalls):
    """Plot recall (y-axis) against precision (x-axis) as a single blue line."""
    fig = plt.figure(figsize=(10,5))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])

    ax.plot(precisions, recalls, "b-")

    ax.set_title("Precision vs. Recall")
    ax.set_xlabel("Precision")
    ax.set_ylabel("Recall")
    ax.grid(True)

def plot_roc_curve(fpr, tpr):
    """Plot the ROC curve (TPR against FPR) with the diagonal of a random classifier."""
    fig = plt.figure(figsize=(10,5))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])

    ax.plot(fpr, tpr, linewidth=2)
    # Dashed diagonal: the line on which TPR equals FPR.
    ax.plot([0,1], [0,1], "k--")

    ax.set_title("FPR vs. TPR (a.k.a. ROC curve)")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.grid(True)

In [None]:
# `precisions` / `recalls` hold one value per threshold, but the helper drops their
# last element before plotting, so one threshold is dropped here to keep lengths equal.
plot_precisions_recalls_vs_thresholds(precisions, recalls, thresholds[:-1])

In [None]:
# Trade-off between the averaged precision and recall across all thresholds.
plot_precision_vs_recall(precisions, recalls)

In [None]:
# ROC curve built from the averaged false/true positive rates per threshold.
plot_roc_curve(fprs, tprs)

We now have a better idea of how our model will do in the wild, and it's not great... The features have very low correlations with `DEATH_EVENT`, so we can't have high hopes to begin with. Things like `ejection_fraction` and `serum_creatinine` don't have very good predictive power, and there really isn't that much data. Although, that's what makes this dataset a good challenge.

# How Much Would More Data Help?

Here's a little side-question: "*How much would more data improve the accuracy of our models?*" It's a surprisingly tough question to answer, and I might not even have done it right in the following code, but I tried my best:

In [None]:
def model_metrics_with_subsample(make_model_func, X, y, num_iters=200, subsample=None, silence=False):
    """Evaluate a model over repeated random splits, optionally on a smaller training set.

    Each iteration holds out 60 rows for testing, optionally reduces the remaining
    training rows to `subsample` rows, scales the features with `ScaleData`
    (fitted on the training part only), builds a model with `make_model_func`
    and scores it on the held-out rows.

    Parameters:
        make_model_func: callable `(X_train, y_train) -> fitted model` exposing
            `score` and `predict`.
        X, y: features and binary target to split.
        num_iters: number of random splits to evaluate.
        subsample: number of training rows to keep per iteration, or None to
            train on all of them.
        silence: when True, no progress bar is printed.

    Returns:
        Three lists with one entry per iteration: (accuracies, precisions, recalls).
    """
    accuracies = []
    precisions = []
    recalls = []
    for i in range(num_iters):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=60)
        if subsample is not None:
            # Draw distinct rows so that `subsample` really is the number of training
            # examples; sampling with replacement would include duplicates and
            # overstate the dataset size. Replacement is used only as a fallback when
            # more rows are requested than are available.
            inds = np.random.choice(len(X_train), size=subsample, replace=subsample > len(X_train))
            X_train, y_train = X_train.iloc[inds], y_train.iloc[inds]

        scale_data = ScaleData()
        X_train = scale_data.fit_transform(X_train)
        X_test = scale_data.transform(X_test)

        model = make_model_func(X_train, y_train)
        accuracies.append(model.score(X_test, y_test))
        y_pred = model.predict(X_test)
        precisions.append(precision_score(y_test, y_pred, zero_division=0))
        recalls.append(recall_score(y_test, y_pred, zero_division=0))

        # Refresh the single-line progress bar every 10th iteration.
        if (i%10 == 0) and not silence:
            print(f"\r|{''.join('█' for _ in range(round( (i/num_iters)*20 )))}"+\
                  f"{''.join(' ' for _ in range(20-round( (i/num_iters)*20 )))}|"+\
                  f" - {round( (i/num_iters)*100, 1 )}%"+\
                  f" - Running Accuracy: {round( np.array(accuracies).mean()*100, 3 )}%"+\
                  f" - Running Precision: {round( np.array(precisions).mean()*100, 3 )}%"+\
                  f" - Running Recall: {round( np.array(recalls).mean()*100, 3 )}%            ", end="")

    if not silence:
        # Return the cursor to the line start so the caller's next print overwrites the bar.
        print("\r", end="")
    return accuracies, precisions, recalls

In [None]:
import warnings
# Silences every warning for the rest of the kernel session, not only those
# raised by the long-running cells below.
warnings.filterwarnings("ignore")

**UNCOMMENT THE FOLLOWING CELLS IF YOU WISH TO RUN THE ALGORITHM, PLEASE NOTE THAT THEY TAKE A LONG TIME TO RUN.**

In [None]:
# scores_for_n = {}
# for num_points in range(20, 240, 10):
#     scores_for_n[num_points], _, _ = model_metrics_with_subsample(make_knn, engineered_data[selected_features], 
#                                                                   engineered_data["DEATH_EVENT"], 
#                                                                   subsample=num_points, num_iters=2000, silence=False)
#     print("Finished "+str(num_points)+" points. Got a mean accuracy of "+str(round(np.array(scores_for_n[num_points]).mean()*100, 3))+\
#           "% With a standard deviation of: "+str(round( np.array(scores_for_n[num_points]).std()*100 ,3))+\
#           "%                                                                                                                            ")

In [None]:
# data_points = []
# accuracies = []
# for key in scores_for_n.keys():
#     data_points.append(int(key))
#     accuracies.append(np.array(scores_for_n[key]).mean())

In [None]:
# plt.figure(figsize=(12, 8))
# plt.plot(data_points, accuracies, alpha=0.25)
# plt.scatter(data_points, accuracies)
# plt.xlabel("Dataset Size")
# plt.ylabel("Mean Accuracy")
# plt.grid(True)

<img src="https://imgur.com/gK5BGK3.png">

# The End