In [None]:
! pip install lazypredict

In [None]:
import numpy as np
import pandas as pd
import inspect # Debugging 

import warnings

# Suppress all warnings emitted by the cells below.
warnings.filterwarnings(action="ignore")

# Read the clinical-records CSV into a DataFrame and keep its column labels,
# which are reused as tick labels when plotting.
DATA_PATH = "../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
data      = pd.read_csv(DATA_PATH)
columns   = data.columns.values

# Author's note: the dataset has no NaNs.
# Can be checked with: data.isna().any()

In [None]:
# Correlations
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Pairwise correlation matrix of the dataset's columns; the tick labels are taken
# from the matrix itself so labels and cells can never get out of sync.
corr        = data.corr()
corr_labels = corr.columns.values

# PCA
# Only the features are projected: the DEATH_EVENT label is left out so that it
# cannot shape the components it is later used to colour. The features are
# standardised first because PCA is variance-driven and would otherwise be
# dominated by the columns with the largest raw scale.
pca_features = data.drop(columns=["DEATH_EVENT"]).values
pca          = PCA(n_components=2)
reduced_data = pca.fit_transform(StandardScaler().fit_transform(pca_features))

fig, (corr_ax, pca_ax) = plt.subplots(1, 2, figsize=(16,8))

# Left panel: correlation heat-map with one labelled tick per column.
corr_ax.imshow(corr)
corr_ax.set_xticks(np.arange(len(corr_labels)))
corr_ax.set_yticks(np.arange(len(corr_labels)))
corr_ax.set_xticklabels(corr_labels)
corr_ax.set_yticklabels(corr_labels)
corr_ax.set_title("Correlations")
plt.setp(corr_ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

# Right panel: the two principal components, coloured by the DEATH_EVENT label.
scatter = pca_ax.scatter(reduced_data[:, 0], reduced_data[:, 1],
               c=data["DEATH_EVENT"].values)
# Removing the ticks also removes their labels, so no separate
# set_*ticklabels([]) call is needed.
pca_ax.set_xticks([])
pca_ax.set_yticks([])
pca_ax.legend(*scatter.legend_elements(), title="Legend")
pca_ax.set_title("PCA")
plt.show()

In [None]:
# Splitting

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing   import StandardScaler
from sklearn.neighbors       import LocalOutlierFactor

# Raw feature matrix and label vector.
train  = data.drop(columns=["DEATH_EVENT"]).values
target = data["DEATH_EVENT"].values

# 1) Split FIRST, so that no preprocessing step below can see the test rows.
#    A single stratified 70/30 shuffle split with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=68)
train_index, test_index = next(sss.split(train, target))
x_train, y_train = train[train_index, :], target[train_index]
x_test, y_test   = train[test_index, :], target[test_index]

# 2) Fit the scaler on the training rows only and reuse those statistics for the
#    test rows; fitting on all rows would leak test-set mean/variance into training.
scaler  = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test  = scaler.transform(x_test)

# 3) Drop outliers from the training rows only, on the standardised features so
#    that the neighbour distances are not dominated by the largest-scale columns.
#    The test set is left untouched so the evaluation reflects unfiltered data.
lof         = LocalOutlierFactor(n_neighbors=5)
outliers    = lof.fit_predict(x_train)      # -1 marks an outlier, 1 an inlier
inlier_mask = outliers != -1
# The row counts reported here refer to the training split.
print(f"Initial num. of rows: {len(x_train)}, rows dropped: {np.count_nonzero(~inlier_mask)}")

x_train = x_train[inlier_mask]
y_train = y_train[inlier_mask]

In [None]:
# Quick models evaluation

from lazypredict.Supervised import LazyClassifier
from sklearn.metrics        import recall_score

# Fit LazyClassifier on the train/test split, reporting recall as an extra
# custom metric, and print the resulting model table.
lazy_options = dict(verbose=0, ignore_warnings=True, custom_metric=recall_score)
clf          = LazyClassifier(**lazy_options)

fit_result          = clf.fit(x_train, x_test, y_train, y_test)
models, predictions = fit_result

print(models)

In [None]:
# Model(s) testing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics         import make_scorer, accuracy_score, recall_score

from sklearn.linear_model  import LogisticRegression
from sklearn.svm           import SVC
from sklearn.ensemble      import BaggingClassifier
from sklearn.neighbors     import NearestCentroid

from xgboost import XGBClassifier

# Candidate estimators and the hyper-parameter grids explored for each of them.
# For every entry, "func" is the estimator instance handed to GridSearchCV and
# "params" is the matching param_grid (a dict, or a list of dicts).

# BaggingClassifier settings shared by both sub-grids below.
bagging_common = {"n_estimators"      : [10, 20, 30, 40, 50],
                  "max_samples"       : [0.2, 0.4, 0.6, 0.8, 1.0],
                  "max_features"      : [0.2, 0.4, 0.6, 0.8, 1.0],
                  "bootstrap_features": [True, False]}

estimators = {"LogisticRegression": {"func"  : LogisticRegression(),
                                     "params": {"C": [0.1, 0.5, 1, 1.5, 2]}},
              "SVC"               : {"func"  : SVC(),
                                     "params": {"kernel": ["poly",
                                                           "rbf"],
                                                "C"     : [0.1, 0.5, 1, 1.5, 2],
                                                "gamma" : ["scale", "auto"],
                                                "shrinking"  : [True, False],
                                                "probability": [True, False]}},
              # Out-of-bag scoring is only accepted by BaggingClassifier together
              # with bootstrap=True and warm_start=False, so the grid is split in
              # two to enumerate only combinations that can actually be fitted.
              "BaggingClassifier" : {"func"  : BaggingClassifier(),
                                     "params": [{**bagging_common,
                                                 "bootstrap" : [True, False],
                                                 "oob_score" : [False],
                                                 "warm_start": [True, False]},
                                                {**bagging_common,
                                                 "bootstrap" : [True],
                                                 "oob_score" : [True],
                                                 "warm_start": [False]}]},
              # Grid keys must match the estimator's parameter names exactly: the
              # former "n_estimators " / "max_depth " keys carried a trailing space,
              # so the intended values were never applied to the real parameters.
              "XGBClassifier"     : {"func"  : XGBClassifier(),
                                     "params": {"n_estimators"    : [20],
                                                "max_depth"       : [6],
                                                "learning_rate"   : [0.05],
                                                "booster"         : ["gbtree", "gblinear", "dart"],
                                                "min_child_weight": [0.1, 0.5, 0.8],
                                                "colsample_bytree": [0.4, 0.6, 0.8, 1],
                                                "subsample"       : [0.5, 0.75, 1],
                                                "n_jobs"          : [-1]}},
              "NearestCentroid"   : {"func"  : NearestCentroid(),
                                     "params": {"metric": ["euclidean", "manhattan"]}}}

# Names of the estimators to tune. To tune every candidate instead, use:
# models_to_test = estimators.keys()
models_to_test = ["NearestCentroid"]

# Every search is scored by recall, so the scorer is built once up front.
recall_scorer = make_scorer(recall_score)

for estimator_name in models_to_test:
    spec = estimators[estimator_name]

    # Cross-validated grid search over this estimator's grid, using all cores.
    model = GridSearchCV(
        estimator=spec["func"],
        param_grid=spec["params"],
        scoring=recall_scorer,
        n_jobs=-1,
    )
    model.fit(x_train, y_train)

    # Evaluate the tuned search object on the held-out split.
    preds       = model.predict(x_test)
    recall, acc = recall_score(y_test, preds), accuracy_score(y_test, preds)

    print(f"{estimator_name}: \n REC: {recall}, \n ACC: {acc}, \n BEST PARAM: {model.best_params_} \n")

In [None]:
# Confusion matrix of `model` (the last fitted grid search) on the held-out split.
from sklearn.metrics import ConfusionMatrixDisplay

if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    # scikit-learn >= 1.0; plot_confusion_matrix was removed in 1.2.
    ConfusionMatrixDisplay.from_estimator(model, x_test, y_test)
else:
    # Older scikit-learn releases only provide the function-style helper.
    from sklearn.metrics import plot_confusion_matrix
    plot_confusion_matrix(model, x_test, y_test)