In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1><center>Heart Failure Prediction</center></h1>
<center><img src="https://afmc.org/wp-content/uploads/2017/02/heartfailure.jpg" width="600"></center>

The dataset studied here has 12 features that can be used to predict mortality from heart failure (the `DEATH_EVENT` column, which is our label). More information about this dataset can be found here: https://www.kaggle.com/andrewmvd/heart-failure-clinical-data

Other notebooks that helped me prepare this one and which I would like to give credit to:

https://www.kaggle.com/shivarajmishra/ann-vs-rest-endgame-for-heart-failure-prediction

https://www.kaggle.com/karnikakapoor/heart-failure-prediction-ann


In [None]:
# Import necessary libraries
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import hypertools as hyp
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from keras.layers import Dense, BatchNormalization, LSTM
from keras.models import Sequential
from keras.utils import to_categorical
from keras import callbacks
from keras.wrappers.scikit_learn import KerasClassifier
import eli5
from eli5.sklearn import PermutationImportance
import matplotlib.pyplot as plt

<h2>Part 1: EDA (Data Quality Check, Exploration, Visualization)</h2>

In [None]:
# Read the heart-failure records CSV into a DataFrame and preview its first rows
DATA_PATH = "../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Count the missing values in every column (all zeros means no gaps to handle)
data.isna().sum()

In [None]:
# Generate descriptive statistics for the columns of the dataset, to get a
# feel for the scale and spread of each one before modelling
data.describe()

In [None]:
# Check the degree of class imbalance: number of rows for each value of the
# DEATH_EVENT label
data.groupby('DEATH_EVENT').size()

In [None]:
# 3D scatter for PCA using Hypertools, coloured by outcome.
# DEATH_EVENT is used only for colouring: it is dropped from the data handed
# to PCA so that the target does not leak into the embedding and make the two
# classes look more separable than the features alone justify.
labels = data['DEATH_EVENT']
pca_features = data.drop(columns=['DEATH_EVENT'])
hyp.plot(pca_features, '.', hue=labels, reduce='PCA', legend=labels.unique().tolist())

In [None]:
# Same PCA scatter, but with the features normalized first
# (see https://sebastianraschka.com/Articles/2015_pca_in_3_steps.html#standardizing).
# DEATH_EVENT is excluded from the PCA input: once columns are put on a common
# scale the binary target would weigh as much as any feature, so leaving it in
# would leak the label straight into the embedding it is used to colour.
hyp.plot(data.drop(columns=['DEATH_EVENT']), '.', hue=labels, reduce='PCA', normalize='within', legend=labels.unique().tolist())

In [None]:
# Pairwise correlations between all columns, rendered as a heatmap
corr = data.corr()
sns.heatmap(data=corr)

<h2> Part 2: Data Preparation, Model Training and Assessment </h2> 

In [None]:
# Separate the data into a target vector (y) and a feature matrix (X_raw)
TARGET_COLUMN = "DEATH_EVENT"
y = data[TARGET_COLUMN].to_numpy()
feature_frame = data.drop(columns=[TARGET_COLUMN])
# Remember the feature names before converting to a bare array; they are used
# later to label the feature importances
col_names = list(feature_frame.columns)
X_raw = feature_frame.to_numpy()

In [None]:
# Hold out 20% of the rows as a test set, then standardize the features.
# The scaler is fitted on the training rows only and afterwards applied to both
# splits, so no statistics from the test set leak into training.
X_raw_train, X_raw_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=1
)
# fit() returns the scaler itself, so construction and fitting can be chained
scaler = StandardScaler().fit(X_raw_train)
X_train, X_test = (scaler.transform(split) for split in (X_raw_train, X_raw_test))

In [None]:
# Baseline: a decision tree trained on the standardized training split and
# scored on the test split (no validation split is needed because no
# hyperparameters are tuned)
DT_model = DecisionTreeClassifier(random_state=0)
DT_model.fit(X_train, y_train)
# Predict once and reuse the predictions for both metrics
DT_pred = DT_model.predict(X_test)
DT_MCC = matthews_corrcoef(y_test, DT_pred)
DT_ACC = accuracy_score(y_test, DT_pred)
print(DT_MCC, DT_ACC, sep="\n")


# Control experiment: the same tree trained and tested on the un-normalized
# features, to see whether normalization had any impact
DT_model_raw = DecisionTreeClassifier(random_state=0)
DT_model_raw.fit(X_raw_train, y_train)
DT_pred_raw = DT_model_raw.predict(X_raw_test)
print(matthews_corrcoef(y_test, DT_pred_raw), accuracy_score(y_test, DT_pred_raw), sep="\n")


In [None]:
# Train ANN/NN (neural network). Use early stopping to mitigate overfitting. Use sklearn wrapper to be able to more easily investigate important features. 
# Note that sometimes training will get stuck at validation accuracy of around 0.7083. If this happens, re-run this cell. 
# Define early stopping behaviour
# No `monitor` argument is passed, so the callback watches Keras' default
# quantity (val_loss according to the Keras documentation -- confirm for the
# installed version).
early_stopping = callbacks.EarlyStopping(
    min_delta=0.001,  # smaller changes than this do not count as an improvement
    patience=20,  # number of epochs without improvement tolerated before stopping
    restore_best_weights=True)  # roll the model back to its best epoch's weights

# Function to create model 
def create_model():
    """Build and compile the feed-forward network used for the classifier.

    The network takes 12 input features, passes them through four ReLU hidden
    layers of 9, 9, 7 and 5 units, and ends in a single sigmoid unit. Every
    layer uses the 'uniform' kernel initializer.

    Returns:
        A Sequential model compiled with the Adam optimizer, binary
        cross-entropy loss and accuracy as the tracked metric.
    """
    hidden_layer_widths = (9, 9, 7, 5)

    network = Sequential()
    for position, width in enumerate(hidden_layer_widths):
        # Only the first layer needs to declare the width of the input
        first_layer_kwargs = {'input_dim': 12} if position == 0 else {}
        network.add(Dense(units=width, kernel_initializer='uniform', activation='relu', **first_layer_kwargs))
    # Output layer: one sigmoid unit for the binary label
    network.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

# Create model using Keras wrapper for sklearn
model = KerasClassifier(build_fn=create_model)
# Train model; Keras holds out 20% of the training rows for validation
history = model.fit(X_train, y_train, batch_size = 32, epochs = 500, validation_split=0.2, callbacks=[early_stopping])
# Report the validation accuracy at the epoch with the lowest validation loss.
# That is the epoch early stopping is expected to restore the weights from
# (assuming it monitors val_loss, the Keras default), so this number describes
# the model that is actually kept. A mean over all epochs would be dragged down
# by the early, barely trained epochs and describes no real model.
best_epoch = int(np.argmin(history.history['val_loss']))
val_accuracy = history.history['val_accuracy'][best_epoch]
print("\n%s: %.2f%%" % ('val_accuracy', val_accuracy*100))

In [None]:
# Evaluate the trained network on the held-out test set; predict once and
# score the same predictions with both metrics
NN_pred = model.predict(X_test)
NN_MCC = matthews_corrcoef(y_test, NN_pred)
NN_ACC = accuracy_score(y_test, NN_pred)

print(NN_MCC, NN_ACC, sep="\n")


In [None]:
# A quick visualization comparing model performance. Save to PDF.
dict_to_plot = {"DT_MCC": DT_MCC, "DT_ACC": DT_ACC, "NN_MCC": NN_MCC, "NN_ACC": NN_ACC}
# The two colours alternate across the bars: black = MCC, blue = accuracy
color_to_plot = ['black', 'blue']
# Materialize names, values and per-bar colours as plain lists so matplotlib
# receives ordinary sequences instead of dict views and a too-short colour list
bar_names = list(dict_to_plot.keys())
bar_values = list(dict_to_plot.values())
bar_colors = [color_to_plot[i % len(color_to_plot)] for i in range(len(bar_names))]
f, ax = plt.subplots()
ax.bar(bar_names, bar_values, color=bar_colors)
ax.set_title("Performance of DT (Decision Tree) and NN (Neural Network)")
# Both metrics are computed on the test split
ax.set_ylabel("Score on test set")
f.savefig("Results.pdf", bbox_inches='tight')

<h2> Part 3: Potentially Important Features </h2>

In [None]:
# Performance is relatively good and competitive with the decision tree, so use
# the neural network to investigate potentially important features by
# computing permutation importances on the test split
perm = PermutationImportance(model, random_state=1)
perm.fit(X_test, y_test)
eli5.show_weights(perm, feature_names=col_names)

<h2> Thank you for your time! Hope you learned something :) </h2>