In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.weightstats import ztest
from scipy.stats import sem

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

## Read in the Data

This data was taken from the Kaggle competition [Heart Failure Prediction](https://www.kaggle.com/andrewmvd/heart-failure-clinical-data). The purpose of this analysis is to determine what factors contribute to heart failure and to develop a model that can help predict whether or not a heart failure will occur based on the data provided.

In [None]:
# Load the heart-failure clinical records CSV (Kaggle input path) into a DataFrame
data = pd.read_csv("../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv")

## Explore the Data

The descriptions of the columns are as follows:
- age: Age of the patient
- anaemia: Decrease of red blood cells or hemoglobin (boolean)
- creatinine_phosphokinase: Level of CPK enzyme
- diabetes: If the patient has diabetes (boolean)
- ejection_fraction: Percentage of blood leaving the heart at each contraction (percentage)
- high_blood_pressure: If the patient has hypertension (boolean)
- platelets: Platelets in the blood
- serum_creatinine: Level of serum creatinine
- serum_sodium: Level of serum sodium
- sex: 0 = Female, 1 = Male
- smoking: If the patient smokes (boolean)
- time: Follow-up period in days
- DEATH_EVENT: If the patient died during the follow-up period (boolean)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the columns
data.describe()

In [None]:
# Column dtypes and non-null counts, used to check for missing values
data.info()

In [None]:
# Preview the first rows of the data
data.head()

Based on a quick glance, there do not appear to be any null values or any obvious faults in the data.

# Data Analysis

As stated above, the purpose is to determine if there are any factors that could contribute, or signify, a higher chance of a heart failure. Therefore I want to see how the averages for each feature compare between death events and non-death events then see if those averages are statistically different from each other.

In [None]:
# Calculate the average of features grouped by 'DEATH_EVENT'.
# After the transpose, features are rows and the two DEATH_EVENT values are columns.
avg_by_death = data.groupby('DEATH_EVENT').mean().transpose()

# Count the deaths once and reuse the value for both printed lines
n_deaths = data['DEATH_EVENT'].sum()
print('Total number of deaths: %i' %n_deaths)
# Scale the fraction by 100 so the printed number matches its "Percent" label
print('Percent of DEATH_EVENTS: %f' %(100 * n_deaths/len(data)))
avg_by_death

In [None]:
# Standard error of the mean for every feature within each DEATH_EVENT group,
# laid out with features as rows and DEATH_EVENT values as columns
grouped_by_death = data.groupby('DEATH_EVENT')
sem_by_death = grouped_by_death.sem().T

Now that the averages have been calculated, I want to test their statistical significance to determine how different the averages are across `DEATH_EVENT`. The following takes the averages of each feature, split by death event, and tests whether the differences are statistically significant.

In [None]:
# Print every column name except the last one (expected to be the DEATH_EVENT target), one per line
print('\n'.join(data.columns[:-1]))

In [None]:
# Two-sample z-test per feature: is the feature mean different between
# patients who died (DEATH_EVENT == 1) and those who did not (DEATH_EVENT == 0)?
ALPHA = 0.05  # significance level used for every test

# Build the group masks once instead of on every loop iteration
survived_mask = data['DEATH_EVENT'] == 0
died_mask = data['DEATH_EVENT'] == 1

# Drop the target by name so the loop does not depend on column order
for col in data.columns.drop('DEATH_EVENT'):
    non_death_events = data.loc[survived_mask, col]
    death_events = data.loc[died_mask, col]

    # ztest returns the z statistic and the p-value; only the p-value is needed here
    z_stat, pval = ztest(x1 = non_death_events, x2 = death_events)

    if pval <= ALPHA:
        print('The null hypothesis can be rejected for %s. The averages are statistically different' %col)
    else:
        print('The null hypothesis cannot be rejected for %s. The averages are not statistically different' %col)

The only features that seem to have statistically different means across `DEATH_EVENT` are `age`, `time`, `serum_creatinine`, `serum_sodium`, and `ejection_fraction`. Interestingly, `high_blood_pressure`, `diabetes`, `smoking`, or `sex` don't appear to be significant factors that could help predict heart failures based on the data. The above analysis is visualized below.

In [None]:
# One bar chart per feature: the group mean for DEATH_EVENT = 1 and DEATH_EVENT = 0,
# with the standard error of the mean drawn as the error bar
death_groups = ((1, 'DEATH_EVENT = 1'), (0, 'DEATH_EVENT = 0'))

for col in data.columns[:-1]:
    bars = [
        go.Bar(name = group_label,
               x = [group_label],
               y = [avg_by_death.loc[col, group_flag]],
               error_y = dict(type='data', array = [sem_by_death.loc[col, group_flag]]))
        for group_flag, group_label in death_groups
    ]
    figure = go.Figure(data = bars)
    figure.update_layout(title_text = 'Mean ' + col)
    figure.show()

96 out of 299 observations are death events.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(data.corr(), cmap='coolwarm', annot=True, ax=ax)
plt.show()

Again, `age`, `ejection_fraction`, `serum_creatinine`, `serum_sodium`, and `time` appear to have the highest correlation with `DEATH_EVENT`, so they will be the focus of the following analyses. Also worth noting:
- There's a slight correlation between `age` and `serum_creatinine`
- There's a slight correlation between `sex` and `ejection_fraction`
- There's a slight correlation between `age` and `time`


### Gender Analysis

As stated above, it's interesting to note that `sex`, `high_blood_pressure`, `diabetes`, and `smoking` don't play a role in determining a `DEATH_EVENT` according to this data. The following investigates these variables.

In [None]:
# Create a function to plot pie charts for boolean variables colored by DEATH_EVENT
def cat_pie_by_de(category, category_true_label, category_false_label):
    '''Show two side-by-side pie charts of death vs. non-death counts for a 0/1 column.

    The left pie covers rows where data[category] == 1 and the right pie covers
    rows where data[category] == 0. Reads the notebook-level DataFrame `data`.

    category: name of the 0/1 column in `data` to split on
    category_true_label: label prefix and pie title for the category == 1 group
    category_false_label: label prefix and pie title for the category == 0 group
    '''
    died = data['DEATH_EVENT'] == 1
    survived = data['DEATH_EVENT'] == 0

    # One entry per pie: (value of `category`, label prefix, subplot column)
    pies = ((1, category_true_label, 1),
            (0, category_false_label, 2))

    # Two equally wide pie subplots on a single row
    figure = make_subplots(rows = 1, cols = 2,
                           column_widths=[0.5,0.5],
                           specs = [[{"type":"pie"}, {"type":"pie"}]])

    for flag, label, column in pies:
        in_group = data[category] == flag

        # Slice order is fixed: death events first, then non-death events
        counts = [(in_group & died).sum(), (in_group & survived).sum()]
        slice_labels = ['%s Death Events' %label, '%s non-Death Events' %label]

        figure.add_trace(
            go.Pie(labels = slice_labels, values = counts, title = label),
            row = 1, col = column)

    # Title the whole figure, then render it
    figure.update_layout(title = "Death Event by %s" %category)
    figure.show()

In [None]:
# Plot the pie charts for sex
# Left pie: sex == 1 (Male); right pie: sex == 0 (Female)
cat_pie_by_de(category = 'sex',
              category_true_label = 'Male',
              category_false_label = 'Female')

From the pie charts above, the `DEATH_EVENT` percentages for the two genders don't differ much, which might explain why `sex` is not helpful in determining `DEATH_EVENT` according to this data.

### High Blood Pressure Analysis

In [None]:
# Plot the pie charts for high blood pressure
# Left pie: high_blood_pressure == 1; right pie: high_blood_pressure == 0
cat_pie_by_de(category = 'high_blood_pressure',
              category_true_label = 'High Blood Pressure',
              category_false_label = 'Non-High Blood Pressure')

Interestingly, the percentages are noticeably different but somehow not enough to make this a reliable feature for distinguishing `DEATH_EVENT`. Based on the `high_blood_pressure` bar chart above, the means do seem to be different, but, possibly due to the small sample size, there isn't enough data to confidently say that these means are statistically different. I would like to see if numerical data, instead of binary, would be a better determinant in predicting `DEATH_EVENT`. It's worth noting that of the four categories being investigated now, `high_blood_pressure` has the highest absolute correlation with `DEATH_EVENT`.

### Diabetes Analysis

In [None]:
# Plot the pie charts for diabetes
# Left pie: diabetes == 1; right pie: diabetes == 0
cat_pie_by_de(category = 'diabetes',
             category_true_label = 'Diabetic',
             category_false_label = 'Non-Diabetic')

As with `sex`, the means aren't too different which explains why it's not a significant feature in predicting `DEATH_EVENT`.

### Smoking Analysis

In [None]:
# Plot the pie charts for smoking
# Left pie: smoking == 1; right pie: smoking == 0
cat_pie_by_de(category='smoking',
             category_true_label = 'Smoker',
             category_false_label = 'Non-Smoker')

Of the four features investigated, `smoking` was the most surprising since it's well known that smoking increases one's chance of developing heart disease. But according to the data, the means aren't statistically different, resulting in a weak correlation with `DEATH_EVENT`. That could be due to how the data was collected and by whom; perhaps the data was collected from a group of individuals with known heart conditions.

## Data Modeling

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, confusion_matrix, accuracy_score

In [None]:
# List features to be included in model
# NOTE(review): `time` is the follow-up period in days; confirm it would be
# known at prediction time, otherwise it may leak information about the outcome
features = ['age', 'ejection_fraction', 'serum_creatinine', 'time', 'serum_sodium']

# Split data into features and target variable
X = data[features]
y = data['DEATH_EVENT']

In [None]:
# Split the data into training and test sets while keeping ratio of DEATH_EVENT
# 20% of the rows are held out for testing; the fixed random_state makes the
# split repeatable, and stratify = y preserves the class balance in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.2, 
                                                    random_state = 159,
                                                    stratify = y)

In [None]:
def model_data(model_name, probability = False, grid_search = False, param_grid = None):
    '''Fit a classifier on the training split and report its test-set metrics.

    Reads the notebook-level X_train, y_train, X_test and y_test. Prints the
    classification report and the AUROC score, and shows a confusion-matrix
    heatmap and a ROC curve.

    model_name: unfitted estimator instance; it must provide fit, predict and
        predict_proba
    probability: currently unused; kept so existing calls keep working
    grid_search: when True, tune the estimator with a 2-fold GridSearchCV over
        param_grid and evaluate the best estimator found
    param_grid: parameter grid passed to GridSearchCV; required when
        grid_search is True

    Returns a tuple (accuracy, AUROC), both measured on the test split.

    Raises ValueError if grid_search is True but param_grid is None.
    '''
    # Validate up front with a real exception: an assert would be stripped under -O
    if grid_search and param_grid is None:
        raise ValueError('param_grid must be provided when grid_search is True')

    model = model_name

    if grid_search:
        print('Performing Grid Search Cross Validation')

        # Perform grid search
        model_cv = GridSearchCV(model, param_grid, cv = 2)
        model_cv.fit(X_train, y_train)

        # Print best parameters & score
        print('Best model parameters:')
        for param, value in model_cv.best_params_.items():
            print('\t %s = %s' % (param, str(value)))
        print('Best score = %f' %model_cv.best_score_)

        # With the default refit = True, best_estimator_ has already been refit
        # on the full training split, so no second fit is needed
        model = model_cv.best_estimator_
    else:
        # Fit training data to model
        model.fit(X_train, y_train)

    # Make predictions from the trained model
    predictions = model.predict(X_test)

    # Print the classification report
    print(classification_report(y_test, predictions))

    # Plot the confusion matrix (rows = true values, columns = predicted values)
    conf_matrix = confusion_matrix(y_test, predictions)

    figure1 = ff.create_annotated_heatmap(conf_matrix,
                                         x = [0, 1],
                                         y = [0, 1],
                                         colorscale = 'Blues')
    figure1.update_layout(title='Confusion Matrix',
                         xaxis = dict(title='Predicted Values', side='bottom'),
                         yaxis = dict(title='True Values', autorange='reversed'))

    figure1.show()

    # Probability of the positive class (second column) drives the ROC curve and AUROC
    y_pred_prob = model.predict_proba(X_test)[:,1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)

    figure2 = px.area(x = fpr, y = tpr,
                 title = 'ROC Curve',
                 labels=dict(x = 'False Positive Rate', y = 'True Positive Rate'),
                 width = 700, height = 700)

    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line
    figure2.add_shape(type = 'line',
                      line = dict(dash = 'dash'),
                      x0 = 0, y0 = 0,
                      x1 = 1, y1 = 1)

    figure2.show()

    # Compute the AUROC once and reuse it for both the printout and the return value
    auroc = roc_auc_score(y_test, y_pred_prob)
    print('AUROC score = %f' %auroc)

    model_accuracy = accuracy_score(y_test, predictions)

    return model_accuracy, auroc

In [None]:
# Dictionary that collects [accuracy, AUROC] for each model, keyed by model name
model_metrics = {}

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline: logistic regression with default settings, no grid search
# NOTE(review): the features are passed unscaled; if a ConvergenceWarning
# appears, consider raising max_iter or standardizing the inputs
acc, auc = model_data(LogisticRegression())

In [None]:
# Record [accuracy, AUROC] for the results table
model_metrics['Logistic Regression'] = [acc, auc]

### Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest search space: 6 x 2 x 3 = 36 parameter combinations
param_grid = {'n_estimators':[5, 10, 20, 50, 75, 100],
             'min_samples_split':[2, 5], 
             'max_depth':[5, 10, 15]}

In [None]:
# Grid-search the forest over param_grid, then evaluate the best estimator on the test split
# (the fixed random_state keeps the forest itself repeatable)
acc, auc = model_data(RandomForestClassifier(random_state = 789), grid_search = True, param_grid = param_grid)

In [None]:
# Record [accuracy, AUROC] for the results table
model_metrics['Random Forest'] = [acc, auc]

### XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost classifier with default settings, no grid search
acc, auc = model_data(XGBClassifier())

In [None]:
# Record [accuracy, AUROC] for the results table
model_metrics['XGBoost Classifier'] = [acc, auc]

### Support Vector Machine

In [None]:
from sklearn import svm

In [None]:
# probability = True is required here because model_data calls predict_proba for the ROC curve
acc, auc = model_data(svm.SVC(probability = True))

In [None]:
# Record [accuracy, AUROC] for the results table
model_metrics['Support Vector Machine'] = [acc, auc]

### K-Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-NN search space: 3 neighbour counts x 2 weighting schemes = 6 combinations
# (this rebinds the param_grid name used earlier for the random forest)
param_grid = {'n_neighbors':[3, 5, 7],
             'weights':['uniform', 'distance']}

In [None]:
# Grid-search k-NN over param_grid, then evaluate the best estimator on the test split
# NOTE(review): the features are unscaled and k-NN is distance-based; consider standardizing — confirm impact
acc, auc = model_data(KNeighborsClassifier(), grid_search = True, param_grid = param_grid)

In [None]:
# Record [accuracy, AUROC] for the results table
model_metrics['K-Nearest Neighbors'] = [acc, auc]

## Results

In [None]:
# Results table: one row per model, with its accuracy and AUROC as columns
pd.DataFrame(list(model_metrics.values()),
             index = list(model_metrics.keys()),
             columns = ['Model Accuracy', 'AUROC'])

Of the models developed, the XGBoost classifier seems to have performed best in terms of accuracy and AUROC score, with $90\%$ and $0.959$ respectively, followed closely by the Random Forest model at $\sim 87\%$ accuracy and $0.937$ AUROC score.