Load the data

In [1]:
import pandas as pd

# Read the CSV file into a DataFrame; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../data/heart_2020_cleaned.csv')

Normalize variables

In [2]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every float64 column of `data`, overwriting the columns in place.
# NOTE(review): the scaler is fit on the full dataset, i.e. before any train/test
# split, so the test rows contribute to the min/max used for scaling.
scaler = MinMaxScaler()
columns_to_normalize = data.select_dtypes(include=['float64']).columns
float_block = data[columns_to_normalize]
data[columns_to_normalize] = scaler.fit(float_block).transform(float_block)


Separate the target variable from the independent variables

In [3]:
# `obj` is the target column; `char` holds every remaining column as predictors.
obj = data['HeartDisease']
char = data.drop('HeartDisease', axis=1)


Split the data into training and test sets

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
char_train, char_test, obj_train, obj_test = train_test_split(
    char,
    obj,
    test_size=0.2,
    random_state=42,
)


Create a Random Forest model and run forward feature selection

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Column positions picked so far by the forward selection, in the order they were chosen.
selected_features = []
# A single forest that is refit for every candidate feature subset.
# n_jobs=-1 trains the trees on all available cores, which matters because the
# selection loop refits the model many times; random_state=42 keeps runs reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Evaluate the features chosen
def evaluate_model(features):
    """Refit the shared forest on a subset of columns and score it on the test split.

    Args:
        features: list of integer column positions, used with ``.iloc`` on both
            ``char_train`` and ``char_test``.

    Returns:
        A ``(accuracy, recall, f1)`` tuple computed from ``obj_test`` and the
        predictions for the selected test columns.

    Note:
        The module-level ``rf_model`` is refit on every call, so after the call
        it holds the model trained on ``features``.
    """
    train_subset = char_train.iloc[:, features]
    test_subset = char_test.iloc[:, features]

    rf_model.fit(train_subset, obj_train)
    predicted = rf_model.predict(test_subset)

    return (
        accuracy_score(obj_test, predicted),
        recall_score(obj_test, predicted),
        f1_score(obj_test, predicted),
    )

# Forward selection loop
#
# Each round tries every not-yet-selected column (by position) on top of the
# current selection, keeps the candidate with the highest accuracy, and repeats
# until every column has been ranked. Because the loop always ends with all
# columns selected, the best-scoring prefix is tracked separately in
# `best_subset` so the run actually yields a feature subset.
#
# NOTE(review): evaluate_model scores candidates on char_test/obj_test, so the
# metrics printed here are also the selection criterion and are optimistically
# biased. Consider scoring on a validation split or with cross-validation.
best_subset = []
best_subset_accuracy = None

while len(selected_features) < char_train.shape[1]:
    best_accuracy = 0
    best_recall = 0
    best_f1 = 0
    best_feature = None

    # Iterate over the non-selected features
    for feature_index in range(char_train.shape[1]):
        if feature_index in selected_features:
            continue

        current_features = selected_features + [feature_index]
        accuracy, recall, f1 = evaluate_model(current_features)

        # Keep the candidate with the highest accuracy (first one wins on ties).
        # The `is None` check guarantees a pick even if every candidate scores
        # 0 accuracy, so None is never appended to the selection.
        if best_feature is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_recall = recall
            best_f1 = f1
            best_feature = feature_index

    # Adds the best feature to the other best ones
    selected_features.append(best_feature)

    # Remember the smallest prefix that reaches the highest accuracy seen so far
    # (strict comparison, so a later tie does not replace a shorter subset).
    if best_subset_accuracy is None or best_accuracy > best_subset_accuracy:
        best_subset_accuracy = best_accuracy
        best_subset = list(selected_features)

    # Progress
    print(f"Selected Features: {selected_features}, Accuracy: {best_accuracy:.2f}, Recall: {best_recall:.2f}, F1-Score: {best_f1:.2f}")

print("Final Selected Features:", selected_features)
# Print the column names of the selected features
print("Final Selected Feature Names:", [char_train.columns[i] for i in selected_features])

# Report the best-scoring subset found along the way (skipped if there were no columns).
if best_subset:
    print(f"Best Subset: {best_subset}, Accuracy: {best_subset_accuracy:.2f}")
    print("Best Subset Feature Names:", [char_train.columns[i] for i in best_subset])

Selected Features: [8], Accuracy: 0.70, Recall: 0.80, F1-Score: 0.73
Selected Features: [8, 11], Accuracy: 0.74, Recall: 0.82, F1-Score: 0.77
Selected Features: [8, 11, 7], Accuracy: 0.76, Recall: 0.83, F1-Score: 0.78
Selected Features: [8, 11, 7, 3], Accuracy: 0.76, Recall: 0.83, F1-Score: 0.79
Selected Features: [8, 11, 7, 3, 1], Accuracy: 0.77, Recall: 0.84, F1-Score: 0.79
Selected Features: [8, 11, 7, 3, 1, 16], Accuracy: 0.77, Recall: 0.83, F1-Score: 0.79
Selected Features: [8, 11, 7, 3, 1, 16, 18], Accuracy: 0.77, Recall: 0.82, F1-Score: 0.79
Selected Features: [8, 11, 7, 3, 1, 16, 18, 6], Accuracy: 0.77, Recall: 0.83, F1-Score: 0.79
Selected Features: [8, 11, 7, 3, 1, 16, 18, 6, 19], Accuracy: 0.77, Recall: 0.83, F1-Score: 0.79
Selected Features: [8, 11, 7, 3, 1, 16, 18, 6, 19, 9], Accuracy: 0.76, Recall: 0.82, F1-Score: 0.78
Selected Features: [8, 11, 7, 3, 1, 16, 18, 6, 19, 9, 2], Accuracy: 0.76, Recall: 0.82, F1-Score: 0.78
Selected Features: [8, 11, 7, 3, 1, 16, 18, 6, 19, 9

: 