In [8]:
import kagglehub
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.cm import rainbow
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
#pip install scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit 

## Step 1: Download the Heart Failure Prediction Dataset


In [9]:

# Load the dataset.
# The CSV is resolved through kagglehub rather than a machine-specific absolute
# path: dataset_download() returns the local cache directory for the dataset,
# fetching the files first when they are not cached yet. The handle pins
# version 1, the version this notebook was developed against.
from pathlib import Path

dataset_dir = Path(kagglehub.dataset_download("fedesoriano/heart-failure-prediction/versions/1"))
dataset = pd.read_csv(dataset_dir / "heart.csv")
dataset.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## Step 2: Dataset Preparation

In [10]:
# Seed every random number generator the notebook relies on, so that reruns
# produce the same results. Seeding Python's `random` alone is not enough:
# scikit-learn estimators and utilities called without `random_state` draw
# from NumPy's global generator, which has to be seeded separately.
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

### Feature Preprocessing

In [11]:
# Separate the predictors from the label.
X = dataset.drop(columns="HeartDisease")  # All columns except the target
y = dataset["HeartDisease"]               # Target column

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Normalize only numeric columns.
# The scaler is fitted ONCE on all numeric columns (MinMaxScaler rescales each
# column independently, so the values are the same as scaling column by column).
# Refitting the same scaler inside a per-column loop would leave it fitted on
# the last column only, making it useless for transforming new data later.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/validation/test split further down, so held-out statistics leak into
# the features -- consider fitting on the training split only.
numeric_cols = [col for col in X.columns if pd.api.types.is_numeric_dtype(X[col])]
if numeric_cols:  # fit_transform rejects a frame with zero columns
    X[numeric_cols] = scaler.fit_transform(X[numeric_cols])
X

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,0.244898,M,ATA,0.70,0.479270,0.0,Normal,0.788732,N,0.295455,Up
1,0.428571,F,NAP,0.80,0.298507,0.0,Normal,0.676056,N,0.409091,Flat
2,0.183673,M,ATA,0.65,0.469320,0.0,ST,0.267606,N,0.295455,Up
3,0.408163,F,ASY,0.69,0.354892,0.0,Normal,0.338028,Y,0.465909,Flat
4,0.530612,M,NAP,0.75,0.323383,0.0,Normal,0.436620,N,0.295455,Up
...,...,...,...,...,...,...,...,...,...,...,...
913,0.346939,M,TA,0.55,0.437811,0.0,Normal,0.507042,N,0.431818,Flat
914,0.816327,M,ASY,0.72,0.320066,1.0,Normal,0.570423,N,0.681818,Flat
915,0.591837,M,ASY,0.65,0.217247,0.0,Normal,0.387324,Y,0.431818,Flat
916,0.591837,F,ATA,0.65,0.391376,0.0,LVH,0.802817,N,0.295455,Flat


In [12]:
# Categorical feature encoding: one-hot encode every object-typed column.
# drop_first=True drops the first level of each feature, so a feature with k
# levels becomes k-1 indicator columns.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (without it, pandas >= 2.0 produces boolean columns instead).
cat_cols = X.select_dtypes(include='object').columns
X = pd.get_dummies(X, columns=cat_cols, drop_first=True, dtype=int)
print(X.head())

        Age  RestingBP  Cholesterol  FastingBS     MaxHR   Oldpeak  Sex_M  \
0  0.244898       0.70     0.479270        0.0  0.788732  0.295455      1   
1  0.428571       0.80     0.298507        0.0  0.676056  0.409091      0   
2  0.183673       0.65     0.469320        0.0  0.267606  0.295455      1   
3  0.408163       0.69     0.354892        0.0  0.338028  0.465909      0   
4  0.530612       0.75     0.323383        0.0  0.436620  0.295455      1   

   ChestPainType_ATA  ChestPainType_NAP  ChestPainType_TA  RestingECG_Normal  \
0                  1                  0                 0                  1   
1                  0                  1                 0                  1   
2                  1                  0                 0                  0   
3                  0                  0                 0                  1   
4                  0                  1                 0                  1   

   RestingECG_ST  ExerciseAngina_Y  ST_Slope_Flat  ST_Sl

### Train/Validation/Test Split

In [13]:
# Stage 1: stratified 70/30 split -- 70% of the rows for training, 30% held out.
# random_state pins the shuffle that train_test_split performs, so the split
# is the same on every run.
X_train, X_test_val, y_train, y_test_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Stage 2: divide the held-out 30% in a 2:1 ratio. The first ("train") outputs
# of this call hold 2/3 of it and become the test set (20% of the whole
# dataset); the second ("test") outputs hold 1/3 and become the validation set
# (10% of the whole dataset).
X_test, X_val, y_test, y_val = train_test_split(
    X_test_val,
    y_test_val,
    test_size=1/3,
    stratify=y_test_val,
    random_state=42,
)

# Sanity check: the ratio of zeros to ones should be the same in the test,
# validation and training labels (printed in that order).
for split_labels in (y_test, y_val, y_train):
    print(split_labels.value_counts())

1    102
0     82
Name: HeartDisease, dtype: int64
1    51
0    41
Name: HeartDisease, dtype: int64
1    355
0    287
Name: HeartDisease, dtype: int64


## Bagging Ensemble

In [14]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

# Create bootstrap samples randomly
def create_samples(X, y, num_of_resamples, random_state=None):
    """Draw bootstrap replicates of a dataset.

    Every replicate comes from ``sklearn.utils.resample(X, y, replace=True)``,
    so the rows of ``X`` and ``y`` are sampled jointly, with replacement.

    Parameters
    ----------
    X, y
        Features and labels to resample together.
    num_of_resamples : int
        Number of bootstrap replicates to draw.
    random_state : None, int or numpy.random.RandomState, optional
        Seed or generator for the draws. ``None`` (the default) keeps the
        original behaviour of passing no ``random_state`` to ``resample``.

    Returns
    -------
    list of tuple
        ``(resampled_X, resampled_y)`` pairs, one per replicate.
    """
    # A single generator is shared by all draws: consecutive replicates differ
    # from one another, yet the whole sequence is reproducible for a given seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    return [
        tuple(resample(X, y, replace=True, random_state=rng))
        for _ in range(num_of_resamples)
    ]

# Obtain a decision tree for each bootstrap sample
def train_models(bootstrap_samples, max_depth=5):
    """Fit one decision tree per bootstrap sample.

    Parameters
    ----------
    bootstrap_samples : iterable of (resampled_X, resampled_y)
        Training sets, e.g. the list returned by ``create_samples``.
    max_depth : int or None, optional
        Depth limit handed to every ``DecisionTreeClassifier`` (default 5,
        the value that used to be hard-coded).

    Returns
    -------
    list of DecisionTreeClassifier
        The fitted trees, in the same order as ``bootstrap_samples``.
    """
    # TODO: DecisionTreeClassifier is to be replaced by our own implementation.
    # fit() returns the fitted estimator itself, so it can be collected directly.
    return [
        DecisionTreeClassifier(max_depth=max_depth).fit(resampled_X, resampled_y)
        for resampled_X, resampled_y in bootstrap_samples
    ]

# Calculate majority vote for predictions
def majority_predictions(base_learners, X):
    """Combine the predictions of several fitted learners by majority vote.

    Parameters
    ----------
    base_learners : iterable
        Fitted models exposing ``predict(X)``; each call is expected to
        return one label per sample.
    X
        Samples to classify; passed unchanged to every learner's ``predict``.

    Returns
    -------
    numpy.ndarray
        One label per sample: the label predicted by the most learners.
        Ties go to the smallest label (in sorted order).

    Raises
    ------
    ValueError
        If ``base_learners`` is empty.
    """
    learners = list(base_learners)
    if not learners:
        raise ValueError("majority_predictions needs at least one base learner")

    # Shape (n_learners, n_samples): each row holds one learner's predictions,
    # each column holds all the votes cast for one sample.
    all_predictions = np.array([tree.predict(X) for tree in learners])

    # Map labels to 0..n_classes-1 so that any label type can be counted
    # (np.bincount only accepts non-negative integers). The reshape keeps this
    # working whether np.unique returns the inverse flattened or input-shaped.
    classes, encoded = np.unique(all_predictions, return_inverse=True)
    encoded = encoded.reshape(all_predictions.shape)

    # votes[s, c] = number of learners that predicted classes[c] for sample s.
    votes = (encoded[:, :, np.newaxis] == np.arange(classes.size)).sum(axis=0)

    # argmax returns the first maximum and `classes` is sorted, so a tie
    # resolves to the smallest label.
    return classes[votes.argmax(axis=1)]


    
    
def bagging_ensemble(X, y, num_of_datasets, X_test_or_val):
    """Train a bagged ensemble on (X, y) and predict labels for new samples.

    The pipeline draws ``num_of_datasets`` bootstrap samples from ``(X, y)``
    with ``create_samples``, fits one base learner per sample with
    ``train_models``, and returns the majority-vote prediction of those
    learners for ``X_test_or_val``.
    """
    learners = train_models(create_samples(X, y, num_of_datasets))
    return majority_predictions(learners, X_test_or_val)


# Seed NumPy's global generator at the top of the search: the bootstrap draws
# and the decision trees are created without an explicit random_state, so
# without this the selected "best n" would change every time the cell is run.
np.random.seed(42)

num_of_datasets = [10, 15, 20, 25, 30, 35, 100]
highest_accuracy = 0
best_n_dataset = None
best_base_learners = []

# Evaluate accuracy for different values of the hyperparameter n (the number of
# bootstrap datasets, i.e. of trees in the ensemble) using the validation set.
for n in num_of_datasets:
    # Train the ensemble once per n and reuse it for both accuracy measurements.
    bootstrap_samples = create_samples(X_train, y_train, n)
    base_learners = train_models(bootstrap_samples)

    predicted_y_training = majority_predictions(base_learners, X_train)
    training_accuracy = accuracy_score(y_train, predicted_y_training)
    print(f"Number of datasets = {n}, Training Accuracy = {training_accuracy}")

    predicted_y_val = majority_predictions(base_learners, X_val)
    val_accuracy = accuracy_score(y_val, predicted_y_val)
    print(f"Number of datasets = {n}, Validation Accuracy = {val_accuracy}")

    print("\n")
    # Strict '>' means that, among equally accurate ensembles, the one tried
    # first (the smallest n, since the list is ascending) is kept.
    if val_accuracy > highest_accuracy:
        highest_accuracy = val_accuracy
        best_n_dataset = n
        best_base_learners = base_learners

print(f"\nBest n_datasets: {best_n_dataset} with Validation Accuracy: {highest_accuracy}")

# Test the model: the ensemble selected on the validation set is evaluated once
# on the held-out test set.
predicted_y_test = majority_predictions(best_base_learners, X_test)
accuracy = accuracy_score(y_test, predicted_y_test)
print(f"\nTest accuracy with best n_datasets {best_n_dataset} =  {accuracy}")

    

Number of datasets = 10, Training Accuracy = 0.9330218068535826
Number of datasets = 10, Validation Accuracy = 0.8913043478260869


Number of datasets = 15, Training Accuracy = 0.9299065420560748
Number of datasets = 15, Validation Accuracy = 0.8586956521739131


Number of datasets = 20, Training Accuracy = 0.9361370716510904
Number of datasets = 20, Validation Accuracy = 0.8478260869565217


Number of datasets = 25, Training Accuracy = 0.9283489096573209
Number of datasets = 25, Validation Accuracy = 0.8695652173913043

Number of datasets = 30, Training Accuracy = 0.9345794392523364
Number of datasets = 30, Validation Accuracy = 0.8586956521739131

Number of datasets = 35, Training Accuracy = 0.9376947040498442
Number of datasets = 35, Validation Accuracy = 0.8695652173913043

Number of datasets = 100, Training Accuracy = 0.9392523364485982
Number of datasets = 100, Validation Accuracy = 0.8586956521739131



Best n_datasets: 10 with Validation Accuracy: 0.8913043478260869

Test accur