## Predicting Heart Failure Mortality

In [108]:
from path import Path
import pandas as pd

In [109]:
data = Path('Resources/heart_failure_clinical_records_dataset.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


## Separate the Features (X) from the Target (y)

In [110]:
y = df["DEATH_EVENT"]
X = df.drop(columns="DEATH_EVENT")

## Split our data into training and testing

In [111]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=1,
                                                    stratify=y)
X_train.shape

(224, 12)

## Create a Logistic Regression Model

In [112]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                               max_iter=200,
                               random_state=1)

## Fit (train) our model using the training data

In [113]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

## Make Predictions

In [114]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,1
6,0,0
7,0,0
8,0,0
9,0,0


## Obtain Accuracy Score

In [115]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.7733333333333333


## Run Confusion Matrix and Classification Report

In [116]:
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[43  8]
 [ 9 15]]


In [117]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.83      0.84      0.83        51
           1       0.65      0.62      0.64        24

    accuracy                           0.77        75
   macro avg       0.74      0.73      0.74        75
weighted avg       0.77      0.77      0.77        75



## Balanced Random Forest Classifier

In [118]:
# Scale data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Resample the training data with the RandomOversampler
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators = 1000, random_state =1)
model = brfc.fit(X_train_scaled, y_train)
BalancedRandomForestClassifier()

BalancedRandomForestClassifier()

In [119]:
# Calculate the balanced accuracy score
predictions = model.predict(X_test_scaled)
accuracy_score(y_test, predictions)

0.7866666666666666

In [120]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

array([[40, 11],
       [ 5, 19]])

In [121]:
# Print the imbalanced classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.89      0.78      0.83        51
           1       0.63      0.79      0.70        24

    accuracy                           0.79        75
   macro avg       0.76      0.79      0.77        75
weighted avg       0.81      0.79      0.79        75



## Easy Ensemble AdaBoost Classifier

In [122]:
# Train the EasyEnsemble Classifier
from imblearn.ensemble import EasyEnsembleClassifier

model = EasyEnsembleClassifier(base_estimator=None, n_estimators=100, n_jobs=1, random_state=1,
                                  replacement=False, sampling_strategy='auto', verbose=0,
                                  warm_start=False)

model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

EasyEnsembleClassifier()

EasyEnsembleClassifier()

In [123]:
# Calculate the random accuracy score
#from sklearn.metrics import balanced_accuracy_score
#predictions = model.predict(X_train_scaled)
#balanced_accuracy_score(y_train, predictions)

0.924342105263158

In [124]:
# Calculate the random accuracy score
from sklearn.metrics import balanced_accuracy_score
predictions = model.predict(X_test_scaled)
balanced_accuracy_score(y_test, predictions)

0.7683823529411764

In [125]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

array([[38, 13],
       [ 5, 19]])

In [126]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.88      0.75      0.81        51
           1       0.59      0.79      0.68        24

    accuracy                           0.76        75
   macro avg       0.74      0.77      0.74        75
weighted avg       0.79      0.76      0.77        75



## SMOTE Oversampling

In [127]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy=1.0).fit_resample(
    X_train, y_train
)
from collections import Counter

Counter(y_resampled)

Counter({1: 152, 0: 152})

In [128]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [129]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6850490196078431

In [130]:
# Calculated the balanced accuracy score
#y_pred = model.predict(X_train)
#balanced_accuracy_score(y_train, y_pred)

In [131]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[38, 13],
       [ 9, 15]])

In [132]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.81      0.75      0.62      0.78      0.68      0.47        51
          1       0.54      0.62      0.75      0.58      0.68      0.46        24

avg / total       0.72      0.71      0.66      0.71      0.68      0.47        75



## Combination (Over and Under) Sampling

In [133]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

sm = SMOTEENN(random_state=1)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 30, 1: 35})

In [134]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [135]:
# Calculate the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6519607843137255

In [136]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[41, 10],
       [12, 12]])

In [137]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.77      0.80      0.50      0.79      0.63      0.41        51
          1       0.55      0.50      0.80      0.52      0.63      0.39        24

avg / total       0.70      0.71      0.60      0.70      0.63      0.41        75

