In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import numpy as np
import pandas as pd

In [2]:

# read dataset from csv file
df = pd.read_csv("heart_failure_clinical_records_dataset.csv")

# shuffle dataset for preventing bias
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [3]:
# separate features and labels of dataset
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# identify features with continuous values.
lst1 = list()
for col in X:
    if len(X[col].unique()) > 2:
        lst1.append(col)

# scale continuous features with standard scaler
scaler = StandardScaler()
X[lst1] = scaler.fit_transform(X[lst1])

In [4]:
# split dataset for training and testing randomly
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [5]:
# create random forest classifier with entropy criterion
tree = RandomForestClassifier(criterion="gini")

# fit train data to model (training)
model = tree.fit(X_train, y_train)

# make predictions with test data
y_pred = model.predict(X_test)

# calculate results and scores with test data
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88        51
           1       0.82      0.58      0.68        24

    accuracy                           0.83        75
   macro avg       0.83      0.76      0.78        75
weighted avg       0.83      0.83      0.82        75

[[48  3]
 [10 14]]


In [6]:
# perform cross validation on dataset with 5-fold cross validation
scores = cross_val_score(tree, X, y, cv=5)

# print cv scores
for s in range(len(scores)):
    print(f"Cross validation {s+1} score: {scores[s]}")

print("Average score:", np.mean(scores))

Cross validation 1 score: 0.7666666666666667
Cross validation 2 score: 0.8333333333333334
Cross validation 3 score: 0.85
Cross validation 4 score: 0.8833333333333333
Cross validation 5 score: 0.847457627118644
Average score: 0.8361581920903955


In [9]:
# perform grid search on random forest classifier to find parameters with highest scores
clf = GridSearchCV(tree, {
    "n_estimators": [10, 50, 100, 200],
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 1, 3, 5, 10, 20],
    "min_samples_split": [2, 3, 5]}, cv=5)
clf.fit(X, y)

# print best results
print("Best score of grid search:", clf.best_score_)
print("Best parameters in grid search:")
for u, v in clf.best_params_.items():
    print(u, ":", v)

Best score of grid search: 0.8630508474576271
Best parameters in grid search:
criterion : gini
max_depth : 10
min_samples_split : 5
n_estimators : 100
