In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.graph_objects as go
import plotly.express as px
import plotly
import os

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<div style="text-align:center"><img src="https://www.verywellhealth.com/thmb/PjK68gYYU57mdcDj0XdIcTe7GHQ=/1001x1001/smart/filters:no_upscale()/heart-failure-causes-40-5ae0bcdec673350037cb2ddd.png" width="500" height="500"></div>

## Index

1. <a href='#1'> Data Ingestion and Exploration </a>
    - <a href='#1.1'>1.1 Data Preparation </a>
    - <a href='#1.2'>1.2 Visualize the dataset </a>
    - <a href='#1.3'>1.3 Feature Engineering </a>
2. <a href='#2'> Integrating Classification Labels </a>
3. <a href='#3'> Resampling Techniques </a>
4. <a href='#4'> Prediction </a>
5. <a href='#5'> Evaluation </a>

In [None]:
# Load the heart-failure clinical records into a DataFrame.
# `on_bad_lines='skip'` is the replacement for the `error_bad_lines=False`
# keyword, which was deprecated in pandas 1.3 and removed in pandas 2.0:
# malformed rows are skipped instead of raising a parser error.
df = pd.read_csv(r'/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv',
                 on_bad_lines='skip')


In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df['DEATH_EVENT'].unique()

In [None]:
df.describe()

In [None]:
# Per-column count of missing values, largest first, as a tidy table with the
# share of affected rows expressed as a percentage of the dataset length.
missing_data = (
    df.isna()
      .sum()
      .sort_values(ascending=False)
      .reset_index(drop=False)
      .rename(columns={"index": "Columns", 0: "Value"})
)
missing_data['Proportion'] = missing_data['Value'] / len(df) * 100
missing_data

In [None]:
# Pie chart of how the rows split across the values of the DEATH_EVENT column.
fig = px.pie(df, names='DEATH_EVENT',
             color_discrete_sequence=px.colors.sequential.Viridis_r,
             title='Proportion of data for Class column')
# Draw the percentage and the class label inside each slice.
fig.update_traces(textposition='inside', textinfo='percent+label')
# Fully transparent paper/plot backgrounds (alpha 0) and a black 12pt font.
fig.update_layout(paper_bgcolor='rgba(0,0,0,0)',
                  plot_bgcolor='rgba(0,0,0,0)',
                  font=dict(family='Cambria, monospace', size=12, color='#000000'))
fig.show()

The proportion of Class 0 is about double that of Class 1, so the dataset is imbalanced. We therefore need to use resampling techniques and performance metrics suited to imbalanced data to evaluate the results.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Compute the pairwise correlation matrix of the DataFrame columns
corr = df.corr()

# Generate a mask for the upper triangle (diagonal included) so that each pair
# of columns is drawn only once. The builtin `bool` is used because the
# `np.bool` alias was removed in NumPy 1.24 and raises AttributeError there.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Draw the heatmap with the mask and correct aspect ratio
plt.figure(figsize=(12,12))
sns.heatmap(corr, mask=mask, cmap="GnBu", vmax=.3, center=0,
            square=True, linewidths=.7, cbar_kws={"shrink": .7})
plt.show()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    """Tabulate the variance inflation factor of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are passed, one index at a time, to
        ``variance_inflation_factor`` together with ``X.values``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``"variables"`` (the column labels of ``X``) and
        ``"VIF"`` (the factor computed for the column at the same position).
    """
    design = X.values
    factors = [variance_inflation_factor(design, position)
               for position in range(X.shape[1])]
    return pd.DataFrame({"variables": X.columns, "VIF": factors})

In [None]:
# Use every column except the last one as the input of the VIF check
# (presumably the last column is the DEATH_EVENT target — TODO confirm column order).
X = df.iloc[:,:-1]
new_X = calc_vif(X)

In [None]:
new_X

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Configure the seaborn theme BEFORE drawing: style and context only affect
# artists created afterwards, so the original trailing `set_context` call had
# no effect on this figure. The resulting global theme (darkgrid style,
# "paper" context, font_scale 1.4) is the same as before.
sns.set(context="paper", style="darkgrid", font_scale=1.4)

# Number of records per age value, split by DEATH_EVENT.
plt.figure(figsize=(14,6))
sns.countplot(x='age',data = df, hue = 'DEATH_EVENT',palette='RdBu')
plt.title("Count Plot of DEATH EVENT per age\n", fontsize=16)
plt.show()

In [None]:
# Configure the seaborn theme BEFORE drawing: style and context only affect
# artists created afterwards, so the original trailing `set_context` call had
# no effect on this figure. The resulting global theme (darkgrid style,
# "paper" context, font_scale 1.4) is the same as before.
sns.set(context="paper", style="darkgrid", font_scale=1.4)

# Number of records per value of the `sex` column, split by DEATH_EVENT.
plt.figure(figsize=(14,6))
sns.countplot(x='sex',data = df, hue = 'DEATH_EVENT',palette='RdBu')
plt.title("Count Plot of DEATH EVENT per sex\n", fontsize=16)
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
# Using 5 folds cross-validation
def CrossVal(trainX, trainY, model, cv=5, scoring='f1'):
    """Cross-validate ``model`` on the training data and return the fold scores.

    Parameters
    ----------
    trainX, trainY
        Training features and labels, forwarded unchanged to
        ``cross_val_score``.
    model
        Estimator to evaluate.
    cv : default 5
        Cross-validation setting forwarded to ``cross_val_score``; the default
        keeps the original 5-fold behaviour.
    scoring : default 'f1'
        Scoring setting forwarded to ``cross_val_score``; the default keeps the
        original F1 scoring.

    Returns
    -------
    Whatever ``cross_val_score`` returns (one score per fold).
    """
    return cross_val_score(model, trainX, trainY, cv=cv, scoring=scoring)

In [None]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from imblearn.ensemble import BalancedRandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,recall_score,precision_recall_curve,average_precision_score
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier

In [None]:
# Meta learner that combines the base-model predictions in the stacking ensemble
level1 = SVC()

# Base estimators shared by the stacking and the voting ensembles.
# Notes on the parameters:
#   * `max_features` / `min_samples_leaf` are scikit-learn forest parameters.
#     XGBoost and LightGBM do not know them and ignore them with a warning, so
#     the feature subsampling is expressed with `colsample_bytree` (the
#     closest supported analogue) and `min_samples_leaf` is dropped.
#     NOTE(review): `min_child_weight` (XGBoost) / `min_child_samples`
#     (LightGBM) are the native leaf-size controls if that constraint is wanted.
#   * `eval_metric` is a fit-time argument for LightGBM, not a constructor
#     argument, so it is not passed to LGBMClassifier.
#   * The bagging base learner is passed positionally because its keyword was
#     renamed from `base_estimator` to `estimator` across imbalanced-learn
#     versions; the first positional slot is the same in both.
#   * Every model gets an explicit seed so a re-run reproduces the results.
estimates = [
    ('rf', RandomForestClassifier(n_estimators=500,
                                  max_depth=8,
                                  min_samples_leaf=10,
                                  max_features=0.7,
                                  class_weight={0:1, 1:5},
                                  random_state=330,
                                  n_jobs=4)),
    ('brf', BalancedRandomForestClassifier(n_estimators=500,
                                           max_depth=8,
                                           random_state=330,
                                           max_features=0.7,
                                           class_weight={0:1, 1:5},
                                           n_jobs=4)),
    ('xgb', XGBClassifier(max_depth=8,
                          learning_rate=0.7,
                          n_estimators=500,
                          colsample_bytree=0.7,
                          eval_metric="logloss",
                          random_state=330,
                          n_jobs=4)),
    ('lgbm', LGBMClassifier(boosting_type='gbdt',
                            num_leaves=10,
                            max_depth=5,
                            learning_rate=0.7,
                            n_estimators=500,
                            colsample_bytree=0.7,
                            class_weight={0:1, 1:5},
                            random_state=330,
                            n_jobs=4)),
    ('bbc', BalancedBaggingClassifier(DecisionTreeClassifier(),
                                      n_estimators=500,
                                      sampling_strategy='auto',
                                      max_features=0.5,
                                      replacement=False,
                                      random_state=330)),
]

# Stacking Classifier: base estimators combined by the SVC meta learner
stack = StackingClassifier(estimators = estimates, final_estimator=level1)
# Voting Classifier with hard (majority-label) voting
vot_hard = VotingClassifier(estimators = estimates, voting ='hard')

In [None]:
def get_models():
    """Build the dictionary of candidate classifiers, keyed by a short name.

    Keys, in insertion order: ``'rf'``, ``'brf'``, ``'xgb'``, ``'lgbm'``,
    ``'bbc'``, ``'stack'``, ``'vot_hard'``. The first five are freshly
    constructed estimators; ``'stack'`` and ``'vot_hard'`` are the module-level
    ``stack`` and ``vot_hard`` objects, which must already be defined.

    Parameter notes:
      * ``max_features`` / ``min_samples_leaf`` are scikit-learn forest
        parameters that XGBoost and LightGBM ignore with a warning, so feature
        subsampling is expressed with ``colsample_bytree`` for those two
        models and ``min_samples_leaf`` is not passed to them.
      * ``eval_metric`` is set on XGBoost only; for LightGBM it is a fit-time
        argument rather than a constructor argument.
      * The bagging base learner is passed positionally because its keyword
        was renamed from ``base_estimator`` to ``estimator`` across
        imbalanced-learn versions.
      * Every freshly built model gets an explicit ``random_state``.

    Returns
    -------
    dict
        Mapping of short model name to estimator.
    """
    models = {}
    models['rf'] = RandomForestClassifier(n_estimators=500,
                                          max_depth=8,
                                          min_samples_leaf=10,
                                          max_features=0.7,
                                          class_weight={0:1, 1:5},
                                          random_state=330,
                                          n_jobs=4)
    models['brf'] = BalancedRandomForestClassifier(n_estimators=500,
                                                   max_depth=8,
                                                   random_state=330,
                                                   max_features=0.7,
                                                   class_weight={0:1, 1:5},
                                                   n_jobs=4)
    models['xgb'] = XGBClassifier(max_depth=8,
                                  learning_rate=0.7,
                                  n_estimators=500,
                                  colsample_bytree=0.7,
                                  eval_metric="logloss",
                                  random_state=330,
                                  n_jobs=4)
    models['lgbm'] = LGBMClassifier(boosting_type='gbdt',
                                    num_leaves=10,
                                    max_depth=5,
                                    learning_rate=0.7,
                                    n_estimators=500,
                                    colsample_bytree=0.7,
                                    class_weight={0:1, 1:5},
                                    random_state=330,
                                    n_jobs=4)
    models['bbc'] = BalancedBaggingClassifier(DecisionTreeClassifier(),
                                              n_estimators=500,
                                              sampling_strategy='auto',
                                              max_features=0.5,
                                              replacement=False,
                                              random_state=330)
    models['stack'] = stack
    models['vot_hard'] = vot_hard
    return models

In [None]:
from sklearn.utils import resample
# Separate input features and target
Y = df['DEATH_EVENT']
X = df.drop('DEATH_EVENT', axis=1)

# Hold out 30% of the rows as the test set. The target is imbalanced, so the
# split is stratified on it: train and test keep the same class proportions
# instead of depending on the luck of the draw.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=2727, stratify=Y)

# Concatenate the training features and labels back together so rows can be
# resampled as a unit. Only the training part is resampled; the test set is
# left untouched.
X = pd.concat([X_train, Y_train], axis=1)

# Majority class (DEATH_EVENT == 0) vs. minority class (everything else)
normal = X[X['DEATH_EVENT']==0]
anamoly = X[X['DEATH_EVENT']!=0]

# Upsample the minority class by drawing rows with replacement
anamoly_upsampled = resample(anamoly,
                          replace=True, # sample with replacement
                          n_samples=len(normal), # match number in majority class
                          random_state=2727) # reproducible results

# Combine majority and oversampled minority
oversampled = pd.concat([normal, anamoly_upsampled])

# Check new class counts
oversampled['DEATH_EVENT'].value_counts()

In [None]:
# Proportion before Oversampling
print(Y_train.value_counts())

In [None]:
# Unseen data proportion of class
print(Y_test.value_counts())

In [None]:
models = get_models()

In [None]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()

# Features and labels of the class-balanced (randomly oversampled) training set
y_train = oversampled['DEATH_EVENT']
X_train = oversampled.drop('DEATH_EVENT', axis=1)
print("Class Proportion after oversampling",y_train.value_counts())

print("Train Size", X_train.shape)
print("Test Size", X_test.shape)

# Fit the scaler on the training features only and reuse it for the test set.
# The column names are carried over so that downstream tools (e.g. feature
# importance plots) show real feature names instead of positional indices.
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# NOTE(review): cross-validation runs on rows that were already oversampled, so
# copies of the same minority row can land in both the fitting and the
# validation folds; treat the CV score as optimistic and rely on the held-out
# test metrics.
# The `smote` names are kept for compatibility with later cells; the resampling
# used above is random oversampling with replacement.
prediction_smote = {}
for model in models.keys():
    print("Model {0}".format(model))
    smote = models[model]
    f1 = CrossVal(X_train_scaled, y_train, smote)
    # cross-validation scores are fractions in [0, 1]; scale to match the "%" label
    print("Cross-Validation F1 Score is {:.2f}%".format(f1.mean() * 100.0))
    smote.fit(X_train_scaled, y_train)
    # Predict on the held-out test set
    smote_pred = smote.predict(X_test_scaled)
    prediction_smote[model] = smote_pred

In [None]:
prediction_smote

In [None]:
import shap

# Explain the fitted XGBoost model on the scaled test set and plot the mean
# absolute SHAP value per feature as a bar chart.
explainer = shap.TreeExplainer(models['xgb'])
shap_values = explainer.shap_values(X_test_scaled)
# Pass the original column names explicitly so the bars are labelled with real
# feature names even when the scaled frame only carries positional columns.
shap.summary_plot(shap_values, X_test_scaled,
                  feature_names=list(X_test.columns), plot_type="bar")

In [None]:
# Checking Balanced accuracy
from sklearn.metrics import balanced_accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate(Y_test, smote_pred):
    """Print and plot test-set metrics for one set of predictions.

    Prints the balanced accuracy and F1 score (both as percentages), draws the
    confusion matrix as a heatmap, and prints the mean per-class recall and
    precision derived from that matrix.

    Parameters
    ----------
    Y_test
        True labels.
    smote_pred
        Predicted labels, aligned with ``Y_test``.

    Returns
    -------
    tuple
        ``(balanced_accuracy, f1, recall, precision)`` where the first two are
        the raw (unscaled) scores and the last two are arrays with one entry
        per confusion-matrix row/column. A class with no true samples (recall)
        or no predicted samples (precision) gets 0.0 instead of NaN.
    """
    b_a = balanced_accuracy_score(Y_test, smote_pred)
    print("Balanced Test Accuracy is {:.2f}%".format(b_a * 100.0))
    f1_over = f1_score(Y_test, smote_pred)
    # The score is a fraction; scale it so the printed "%" is accurate.
    print("F1 Score is {:.2f}%".format(f1_over * 100.0))
    # Rows of the matrix are the expected classes, columns the predicted ones
    cnf_matrix = confusion_matrix(Y_test, smote_pred)
    # Draw the confusion matrix as an annotated heat map
    sns.heatmap(pd.DataFrame(cnf_matrix), annot = True, cmap = 'Blues', fmt = 'd')
    plt.xlabel('Predicted')
    plt.ylabel('Expected')
    plt.show()
    correct = np.diag(cnf_matrix).astype(float)
    expected_totals = np.sum(cnf_matrix, axis = 1)
    predicted_totals = np.sum(cnf_matrix, axis = 0)
    # Guarded division: an empty row/column yields 0.0 rather than a 0/0 NaN
    # that would also poison the means printed below.
    recall = np.divide(correct, expected_totals,
                       out=np.zeros_like(correct), where=expected_totals != 0)
    precision = np.divide(correct, predicted_totals,
                          out=np.zeros_like(correct), where=predicted_totals != 0)
    print("Mean Recall", np.mean(recall))
    print("Mean Precision", np.mean(precision))
    return b_a, f1_over, recall, precision


In [None]:
# Evaluate every model's test predictions and collect the metrics in four
# parallel lists that follow the iteration order of `prediction_smote`.
b_accuracy, f1_scores, recalls, precisions = [], [], [], []
y_true = np.array(Y_test.astype(int))
for model_name, y_pred in prediction_smote.items():
    print("\nModel is {0}".format(model_name))
    bal_acc, f1_value, class_recall, class_precision = evaluate(y_true, y_pred)
    b_accuracy.append(bal_acc)
    f1_scores.append(f1_value)
    recalls.append(class_recall)
    precisions.append(class_precision)

In [None]:
# Display names of the evaluated models, in the same order in which their
# metrics were appended. A dedicated name is used so the `models` dictionary of
# estimators is not overwritten and earlier cells can still be re-run.
model_names = ['Random Forest', 'Balanced Random Forest', 'XGB', 'LGBM', 'Balanced Bagging', 'Stacking', 'Voting']
assert len(model_names) == len(b_accuracy) == len(f1_scores), "one metric per model expected"
y_pos = np.arange(len(model_names)) #Position = 0,1,2,3,4,5,6

# Plot balanced accuracy on the test set. The palette is sized to the number
# of bars so that no colour is reused.
plt.figure(figsize=(12, 6))
plt.bar(y_pos, b_accuracy, align='center', alpha=0.8,
        color=sns.color_palette("PuRd", n_colors=len(model_names)))
plt.xticks(y_pos, model_names)
plt.ylabel('Balanced Accuracy')
plt.title('Performance based on Balanced Accuracy')
plt.show()

# Plot F1 score on the test set
plt.figure(figsize=(12, 6))
plt.bar(y_pos, f1_scores, align='center', alpha=0.8,
        color=sns.color_palette("RdPu", n_colors=len(model_names)))
plt.xticks(y_pos, model_names)
plt.ylabel('F1 Score')
plt.title('Performance based on F1 Score')
plt.show()