# Heart Failure Prediction

## Problem statement
See the corresponding [Kaggle link](https://www.kaggle.com/code/ragishehab/eda-of-heart-disease-prediction) for details of the problem.

In [123]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [124]:
data = pd.read_csv("../data/data_heart-failure.csv")
data.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## Let us evaluate the data a little bit

In [125]:
print(f"total data shape: {data.shape}")

nums = []# unique possibilities for each feature
for feature in data.columns:
    num = len(data.loc[:,feature].unique())
    nums.append(num)
    print(f"feature {feature:20s}: unique values = {num}")

total data shape: (918, 12)
feature Age                 : unique values = 50
feature Sex                 : unique values = 2
feature ChestPainType       : unique values = 4
feature RestingBP           : unique values = 67
feature Cholesterol         : unique values = 222
feature FastingBS           : unique values = 2
feature RestingECG          : unique values = 3
feature MaxHR               : unique values = 119
feature ExerciseAngina      : unique values = 2
feature Oldpeak             : unique values = 53
feature ST_Slope            : unique values = 3
feature HeartDisease        : unique values = 2


### Binary labels to 0/1

In [126]:
data = data.replace({'Sex': {'M': 1, 'F': 0}})
data = data.replace({'ExerciseAngina': {'Y': 1, 'N': 0}})
data = data.replace({'ST_Slope': {'Up': 1, 'Flat': 0}})

data.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,0,0.0,1,0
1,49,0,NAP,160,180,0,Normal,156,0,1.0,0,1
2,37,1,ATA,130,283,0,ST,98,0,0.0,1,0
3,48,0,ASY,138,214,0,Normal,108,1,1.5,0,1
4,54,1,NAP,150,195,0,Normal,122,0,0.0,1,0


### One-hot encoding
We first translate categorical features with more than 2 classes into several binary features.

In [127]:
cat_features = ['ChestPainType','RestingECG','ST_Slope']
print(f"The multiclass categorical features are:\n{cat_features}")
dataOHE = pd.get_dummies(data = data, prefix = cat_features, columns = cat_features)

print(f"modified data shape: {dataOHE.shape}")
print(f"number of features: {dataOHE.shape[1]-1}")
print(f"number of examples: {dataOHE.shape[0]}")

dataOHE.head(5)

The multiclass categorical features are:
['ChestPainType', 'RestingECG', 'ST_Slope']
modified data shape: (918, 19)
number of features: 18
number of examples: 918


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_0,ST_Slope_1,ST_Slope_Down
0,40,1,140,289,0,172,0,0.0,0,0,1,0,0,0,1,0,0,1,0
1,49,0,160,180,0,156,0,1.0,1,0,0,1,0,0,1,0,1,0,0
2,37,1,130,283,0,98,0,0.0,0,0,1,0,0,0,0,1,0,1,0
3,48,0,138,214,0,108,1,1.5,1,1,0,0,0,0,1,0,1,0,0
4,54,1,150,195,0,122,0,0.0,0,0,0,1,0,0,1,0,0,1,0


In [128]:
features = [x for x in dataOHE.columns if x != 'HeartDisease'] ## Removing our target variable
print(f"total number of features after one-hot encoding: {len(features)}")

total number of features after one-hot encoding: 18


### Split the data using Scikit-learn

In [131]:
X_train, X_dev, y_train, y_dev = train_test_split(dataOHE[features],dataOHE['HeartDisease'],train_size=0.8, random_state=1)
print(f"X_train.shape = {X_train.shape}, X_dev.shape = {X_dev.shape}")
print(f"y_train.shape = {y_train.shape}, y_dev.shape = {y_dev.shape}")

X_train.shape = (734, 18), X_dev.shape = (184, 18)
y_train.shape = (734,), y_dev.shape = (184,)


## XGBoost

In [149]:
n_estimators = 100 # default val (number of decision trees in the ensemble)
early_stopping_rounds = 20 # patience on stopping criteria
learning_rate = 0.1 # learning rate of the Gradient Descent
model = XGBClassifier(n_estimators = n_estimators, learning_rate = learning_rate,
                      verbosity = 0, random_state = 1, early_stopping_rounds = early_stopping_rounds)
model.fit(X_train,y_train, eval_set = [(X_dev,y_dev)],verbose=False)

# yhat = model.predict(X_test)
# print(f"Prediction Accuracy = {np.mean(yhat==y_test)*100:.1f}%")

from sklearn.metrics import accuracy_score # the same np.mean(yhat==y)*100
print(f"train: Accuracy score: {accuracy_score(model.predict(X_train),y_train)*100:.2f}%")
print(f"dev  : Accuracy score: {accuracy_score(model.predict(X_dev),y_dev)*100:.2f}%")

train: Accuracy score: 98.23%
dev  : Accuracy score: 88.04%
