# Heart Failure Prediction - Basic Machine Learning Models with Scikit-learn

![](https://www.redcross.org.au/kenticoimage.axd/bc07808c-0e79-4572-b601-7e91e0839090.jpg?v=010505&width=585&height=329&mode=crop)

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:red; border:0' role="tab" aria-controls="home"><center>Quick Navigation</center></h3>

* [1. Data Loading and Splitting](#1)
* [2. Data Scaling](#2)
* [3. Logistic Regression](#3)
* [4. Support Vector Machine](#4)
* [5. Decision Tree](#5)    
* [6. Random Forest](#6)
* [7. K-Nearest Neighbor](#7)
* [8. LightGBM](#8)
* [9. Final Comparison](#9)

In [None]:
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection as sk_model_selection
from sklearn import preprocessing as sk_preprocessing
from sklearn import linear_model as sk_linear_model
from sklearn import svm as sk_svm
from sklearn import tree as sk_tree
from sklearn import ensemble as sk_ensemble
from sklearn import neighbors as sk_neighbors
from sklearn import metrics as sk_metrics
import lightgbm as lgbm

In [None]:
# Single seed shared by every randomised step in this notebook.
SEED = 42


def set_seed(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and stores ``seed`` (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    NOTE(review): the interpreter presumably reads ``PYTHONHASHSEED`` only
    at start-up, so setting it here likely affects child processes rather
    than the running kernel -- confirm.

    Parameters
    ----------
    seed : int
        Value handed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_generator in (random.seed, np.random.seed):
        seed_generator(seed)


set_seed(SEED)

In [None]:
def print_metrics(y_true, y_pred):
    """Print and return accuracy, F1, precision and recall for predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    dict
        Scores keyed by ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    scores = {
        'accuracy': sk_metrics.accuracy_score(y_true, y_pred),
        'f1': sk_metrics.f1_score(y_true, y_pred),
        'precision': sk_metrics.precision_score(y_true, y_pred),
        'recall': sk_metrics.recall_score(y_true, y_pred),
    }

    # Row prefixes carry their own tab padding so the '|' column lines up.
    report_rows = (
        ('Accuracy (test set)\t| ', 'accuracy'),
        ('F1 (test set)\t\t| ', 'f1'),
        ('Precision (test set)\t| ', 'precision'),
        ('Recall (test set)\t| ', 'recall'),
    )
    for prefix, key in report_rows:
        print(f'{prefix}{scores[key]:.4f}')

    return scores

In [None]:
def fit_model_with_grid_search(model, parameters, scoring='f1', verbose=1,
                               X=None, y=None):
    """Tune ``model`` with an exhaustive grid search and return the search.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible estimator to tune.
    parameters : dict or list of dict
        Parameter grid forwarded to ``GridSearchCV``.
    scoring : str, default 'f1'
        Metric forwarded to ``GridSearchCV`` and named in the printed summary.
    verbose : int or bool, default 1
        When truthy, print the best parameters and the best mean
        cross-validated score.
    X, y : array-like, optional
        Training features and labels. When both are omitted the module-level
        ``X_train`` / ``y_train`` are used, which is the original behaviour.

    Returns
    -------
    GridSearchCV
        The fitted search object (exposes ``predict``, ``best_params_``,
        ``best_score_``).

    Raises
    ------
    ValueError
        If only one of ``X`` and ``y`` is supplied.
    """
    if (X is None) != (y is None):
        raise ValueError('X and y must be provided together or both omitted')
    if X is None:
        # Backward-compatible default: fall back to the notebook's globals.
        X, y = X_train, y_train

    search = sk_model_selection.GridSearchCV(
        model,
        parameters,
        scoring=scoring,
    )
    search.fit(X, y)

    if verbose:
        # Name the metric that was actually optimised instead of always
        # claiming "F1"; the default keeps its historical capitalisation.
        metric_name = 'F1' if scoring == 'f1' else str(scoring)
        print(f'best_params_: {search.best_params_}')
        print(
            f'Mean cross-validated {metric_name} score of the best_estimator: '
            f'{search.best_score_:.4f}'
        )

    return search

In [None]:
# Collects the metrics dict returned by print_metrics for each model, keyed by
# the model's display name; rendered as a table in the final comparison cell.
dict_results = {}

<a id="1"></a>
<h2 style='background:red; border:0; color:white'><center>Data Loading and Splitting</center></h2>

In [None]:
# Load the clinical records, report the table's dimensions and preview
# the first five rows.
df = pd.read_csv(
    '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)
print(df.shape)
df.head()

In [None]:
# DEATH_EVENT is the prediction target; every other column is a feature.
target_column = 'DEATH_EVENT'
X_data = df.drop(columns=[target_column]).values
y_data = df[target_column].values
(X_data.shape, y_data.shape)

In [None]:
# Hold out 20% of the rows as a test set, stratified on the target.
# The split is seeded with SEED (rather than a separate hard-coded 42) so a
# single constant controls the reproducibility of the whole notebook.
X_train, X_test, y_train, y_test = sk_model_selection.train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=SEED,
    shuffle=True,
    stratify=y_data,
)
print(f'X_train shape: {X_train.shape} y_train.shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape} y_test.shape: {y_test.shape}')

<a id="2"></a>
<h2 style='background:red; border:0; color:white'><center>Data Scaling</center></h2>

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = sk_preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

<a id="3"></a>
<h2 style='background:red; border:0; color:white'><center>Logistic Regression</center></h2>

In [None]:
# Grid over the inverse regularisation strength C.
parameters = {'C': [0.01, 0.1, 1]}

# Grid-search a class-weight-balanced logistic regression, optimising F1.
model_logistic_regression = fit_model_with_grid_search(
    sk_linear_model.LogisticRegression(class_weight='balanced', random_state=SEED),
    parameters,
    scoring='f1',
)

# Score the tuned model on the held-out test split and record the result.
y_test_pred = model_logistic_regression.predict(X_test)
print()
dict_results['Logistic Regression'] = print_metrics(y_test, y_test_pred)

<a id="4"></a>
<h2 style='background:red; border:0; color:white'><center>Support Vector Machine</center></h2>

In [None]:
# Grid over regularisation strength, kernel type and kernel coefficient.
parameters = {
    'C': [0.01, 0.1, 1],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}

# Grid-search a class-weight-balanced SVC, optimising F1.
model_svc = fit_model_with_grid_search(
    sk_svm.SVC(class_weight='balanced', random_state=SEED),
    parameters,
    scoring='f1',
)

# Score the tuned model on the held-out test split and record the result.
y_test_pred = model_svc.predict(X_test)
print()
dict_results['SVC'] = print_metrics(y_test, y_test_pred)

<a id="5"></a>
<h2 style='background:red; border:0; color:white'><center>Decision Tree</center></h2>

In [None]:
# Grid over tree depth (None = unlimited) and minimum leaf size.
parameters = {
    'max_depth': [1, 2, 3, 5, 10, None],
    'min_samples_leaf': [1, 5, 10, 20],
}

# Grid-search a class-weight-balanced decision tree, optimising F1.
model_decision_tree = fit_model_with_grid_search(
    sk_tree.DecisionTreeClassifier(class_weight='balanced', random_state=SEED),
    parameters,
    scoring='f1',
)

# Score the tuned model on the held-out test split and record the result.
y_test_pred = model_decision_tree.predict(X_test)
print()
dict_results['Decision Tree'] = print_metrics(y_test, y_test_pred)

<a id="6"></a>
<h2 style='background:red; border:0; color:white'><center>Random Forest</center></h2>

In [None]:
# Grid over forest size, tree depth (None = unlimited) and minimum leaf size.
parameters = {
    'n_estimators': [5, 10, 15, 20],
    'max_depth': [1, 2, 3, 5, 10, None],
    'min_samples_leaf': [1, 5, 10, 20],
}

# Grid-search a class-weight-balanced random forest, optimising F1.
model_random_forest = fit_model_with_grid_search(
    sk_ensemble.RandomForestClassifier(class_weight='balanced', random_state=SEED),
    parameters,
    scoring='f1',
)

# Score the tuned model on the held-out test split and record the result.
y_test_pred = model_random_forest.predict(X_test)
print()
dict_results['Random Forest'] = print_metrics(y_test, y_test_pred)

<a id="7"></a>
<h2 style='background:red; border:0; color:white'><center>K-Nearest Neighbor</center></h2>

In [None]:
# Grid over neighbourhood size (1 through 10) and vote weighting scheme.
parameters = {
    'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'weights': ['uniform', 'distance'],
}

# Grid-search a k-nearest-neighbours classifier, optimising F1.
model_k_neighbors = fit_model_with_grid_search(
    sk_neighbors.KNeighborsClassifier(),
    parameters,
    scoring='f1',
)

# Score the tuned model on the held-out test split and record the result.
y_test_pred = model_k_neighbors.predict(X_test)
print()
dict_results['K-Neighbors'] = print_metrics(y_test, y_test_pred)

<a id="8"></a>
<h2 style='background:red; border:0; color:white'><center>LightGBM</center></h2>

In [None]:
# Grid over tree complexity, learning rate, number of boosting rounds and
# per-tree column subsampling; both regularisation terms are pinned to 1.
parameters = {
    'num_leaves': [7, 15, 31],
    'learning_rate': [0.001, 0.01, 0.1],
    'n_estimators': [100, 200, 300],
    'reg_alpha': [1],
    'reg_lambda': [1],
    'colsample_bytree': [0.5, 0.75, 1.0],
}

# Grid-search a class-weight-balanced LightGBM classifier, optimising F1.
model_lgbm = fit_model_with_grid_search(
    lgbm.LGBMClassifier(class_weight='balanced', random_state=SEED),
    parameters,
    scoring='f1',
)

# Score the tuned model on the held-out test split and record the result.
y_test_pred = model_lgbm.predict(X_test)
print()
dict_results['LightGBM'] = print_metrics(y_test, y_test_pred)

<a id="9"></a>
<h2 style='background:red; border:0; color:white'><center>Final Comparison</center></h2>

In [None]:
# One row per model, one column per metric.
pd.DataFrame(dict_results).transpose()