## Predicting Heart Failure Mortality

In [34]:
from path import Path
import pandas as pd
import matplotlib.pyplot as plt

In [35]:
data = Path('Resources/heart_failure_clinical_records_dataset.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [36]:
# Drop less important features
df = df.drop(columns=['high_blood_pressure', 'diabetes', 'anaemia', 'sex'])
df.head(10)

Unnamed: 0,age,creatinine_phosphokinase,ejection_fraction,platelets,serum_creatinine,serum_sodium,smoking,time,DEATH_EVENT
0,75.0,582,20,265000.0,1.9,130,0,4,1
1,55.0,7861,38,263358.03,1.1,136,0,6,1
2,65.0,146,20,162000.0,1.3,129,1,7,1
3,50.0,111,20,210000.0,1.9,137,0,7,1
4,65.0,160,20,327000.0,2.7,116,0,8,1
5,90.0,47,40,204000.0,2.1,132,1,8,1
6,75.0,246,15,127000.0,1.2,137,0,10,1
7,60.0,315,60,454000.0,1.1,131,1,10,1
8,65.0,157,65,263358.03,1.5,138,0,10,1
9,80.0,123,35,388000.0,9.4,133,1,10,1


## Separate the Features (X) from the Target (y)

In [37]:
y = df["DEATH_EVENT"]
X = df.drop(columns="DEATH_EVENT")

## Split our data into training and testing

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=1,
                                                    stratify=y)
X_train.shape

(224, 8)

## Create a Logistic Regression Model

In [39]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                               max_iter=200,
                               random_state=1)

## Fit (train) our model using the training data

In [40]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

## Make Predictions

In [41]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,1,0
5,0,1
6,0,0
7,0,0
8,0,0
9,0,0


## Obtain Accuracy Score

In [42]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.7733333333333333


## Run Confusion Matrix and Classification Report

In [43]:
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[44  7]
 [10 14]]


In [44]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.86      0.84        51
           1       0.67      0.58      0.62        24

    accuracy                           0.77        75
   macro avg       0.74      0.72      0.73        75
weighted avg       0.77      0.77      0.77        75



## Balanced Random Forest Classifier

In [45]:
# Scale data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Resample the training data with the RandomOversampler
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators = 1000, random_state =1)
model = brfc.fit(X_train_scaled, y_train)
BalancedRandomForestClassifier()

BalancedRandomForestClassifier()

In [46]:
# Calculate the balanced accuracy score
predictions = model.predict(X_test_scaled)
accuracy_score(y_test, predictions)

0.7866666666666666

In [47]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

array([[39, 12],
       [ 4, 20]])

In [48]:
# Print the imbalanced classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.91      0.76      0.83        51
           1       0.62      0.83      0.71        24

    accuracy                           0.79        75
   macro avg       0.77      0.80      0.77        75
weighted avg       0.82      0.79      0.79        75



In [49]:
# Calculate feature importance in the Random Forest model.
importances = model.feature_importances_
importances

array([0.08810839, 0.08706681, 0.14812984, 0.08558144, 0.16335594,
       0.08282914, 0.01175226, 0.33317618])

In [50]:
# Sort the features by their importance.
sorted(zip(model.feature_importances_, X.columns), reverse=True)

[(0.33317618212752714, 'time'),
 (0.16335593729165884, 'serum_creatinine'),
 (0.14812983667582866, 'ejection_fraction'),
 (0.08810838950321483, 'age'),
 (0.08706680659312555, 'creatinine_phosphokinase'),
 (0.08558144449816783, 'platelets'),
 (0.08282913992864546, 'serum_sodium'),
 (0.011752263381831714, 'smoking')]

## Easy Ensemble AdaBoost Classifier

In [51]:
# Train the EasyEnsemble Classifier
from imblearn.ensemble import EasyEnsembleClassifier

model = EasyEnsembleClassifier(base_estimator=None, n_estimators=100, n_jobs=1, random_state=1,
                                  replacement=False, sampling_strategy='auto', verbose=0,
                                  warm_start=False)

model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

EasyEnsembleClassifier()

EasyEnsembleClassifier()

In [52]:
# Calculate the random accuracy score
from sklearn.metrics import balanced_accuracy_score
predictions = model.predict(X_train_scaled)
balanced_accuracy_score(y_train, predictions)

0.9177631578947368

In [53]:
# Calculate the random accuracy score
from sklearn.metrics import balanced_accuracy_score
predictions = model.predict(X_test_scaled)
balanced_accuracy_score(y_test, predictions)

0.821078431372549

In [54]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

array([[37, 14],
       [ 2, 22]])

In [55]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.95      0.73      0.82        51
           1       0.61      0.92      0.73        24

    accuracy                           0.79        75
   macro avg       0.78      0.82      0.78        75
weighted avg       0.84      0.79      0.79        75



## SMOTE Oversampling

In [56]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy=1.0).fit_resample(
    X_train, y_train
)
from collections import Counter

Counter(y_resampled)

Counter({1: 152, 0: 152})

In [57]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [58]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6850490196078431

In [59]:
# Calculated the balanced accuracy score
#y_pred = model.predict(X_train)
#balanced_accuracy_score(y_train, y_pred)

In [60]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[38, 13],
       [ 9, 15]])

In [61]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.81      0.75      0.62      0.78      0.68      0.47        51
          1       0.54      0.62      0.75      0.58      0.68      0.46        24

avg / total       0.72      0.71      0.66      0.71      0.68      0.47        75



## Combination (Over and Under) Sampling

In [62]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

sm = SMOTEENN(random_state=1)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 30, 1: 35})

In [63]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [64]:
# Calculate the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6519607843137255

In [65]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[41, 10],
       [12, 12]])

In [66]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.77      0.80      0.50      0.79      0.63      0.41        51
          1       0.55      0.50      0.80      0.52      0.63      0.39        24

avg / total       0.70      0.71      0.60      0.70      0.63      0.41        75

