# In [None]:
import numpy as np
import pandas as pd
from tabulate import tabulate
import scipy.stats as stats
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, matthews_corrcoef, \
    f1_score, roc_auc_score, confusion_matrix, precision_score, recall_score, precision_recall_curve, auc
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from mlxtend.plotting import plot_decision_regions
from sklearn.tree import DecisionTreeClassifier
import random


class HF_Pred:
    """
    By Sadegh Soleimani
    Statistical Pattern Recognition
    Implementation of https://doi.org/10.1186/s12911-020-1023-5
    Date: Feb 2021
    
    This work implements parts of the article.
    All rights reserved.
    """

    def __init__(self):
        self.filepath = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'

    def __str__(self):
        str = "Implementation of https://doi.org/10.1186/s12911-020-1023-5"
        print(str)

    def dataset(self, data):
        """Print descriptive statistics of the data set (Tables 1-3).

        Table 1 lists the minimum and maximum of every column, Table 2 the
        number of samples and of deceased patients for each value of the
        binary features, and Table 3 the median / mean / standard deviation
        of the numeric features for all, deceased and surviving patients.

        Args:
            data: DataFrame containing the 0/1 columns ``anaemia``,
                ``high_blood_pressure``, ``diabetes``, ``sex``, ``smoking``
                and ``DEATH_EVENT`` plus the numeric columns ``age``,
                ``creatinine_phosphokinase``, ``ejection_fraction``,
                ``platelets``, ``serum_creatinine``, ``serum_sodium`` and
                ``time``.

        Returns:
            None. The tables are written to stdout.
        """
        death = data.DEATH_EVENT.values

        # Table 1: value range of every column, whatever the column count is.
        table_1 = [list(row) for row in zip(data.columns.values, data.min(), data.max())]
        print(tabulate(table_1, headers=['Feature', 'Min', 'Max']))  # Table 1 from page 3

        # Table 2: (column, label for value 0, label for value 1).
        binary_features = [
            ('anaemia', 'anaemia_0', 'anaemia_1'),
            ('high_blood_pressure', 'high_blood_pressure_0', 'high_blood_pressure_1'),
            ('diabetes', 'diabetes_0', 'diabetes_1'),
            ('sex', 'woman', 'man'),
            ('smoking', 'smoking_0', 'smoking_1'),
        ]
        table_2 = []
        for column, label_0, label_1 in binary_features:
            values = data[column].values
            for flag, label in ((0, label_0), (1, label_1)):
                members = values == flag
                # Count the rows actually present instead of assuming a
                # fixed sample size of 299.
                table_2.append([label, int(members.sum()), int(death[members].sum())])

        print('\n', tabulate(table_2, headers=['Feature', 'Sample Counts', 'Dead patients'],
                             tablefmt='orgtbl'))  # Table 2 from page 4

        # Table 3: summary statistics for all / deceased / surviving patients.
        numeric_features = ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets',
                            'serum_creatinine', 'serum_sodium', 'time']
        dead = death == 1
        alive = death == 0
        table_3 = []
        for column in numeric_features:
            values = data[column].values
            row = [column]
            for subset in (values, values[dead], values[alive]):
                # np.std with its default arguments, exactly as before.
                row.extend([np.median(subset), np.mean(subset), np.std(subset)])
            table_3.append(row)

        # One header per column: the leading feature-name column is labelled too.
        print('\n', tabulate(table_3, headers=['Feature',
                                               'all_median', 'all_mean', 'all_std',
                                               'dead_median', 'dead_mean', 'dead_std',
                                               'alive_median', 'alive_mean', 'alive_std'],
                             tablefmt='orgtbl'))  # Table 3 from page 4

    def feature_ranking(self, data):
        """Print univariate feature rankings (Tables 5 and 6).

        Three tables are written to stdout:

        * Mann-Whitney U p-values comparing every feature between deceased
          and surviving patients, smallest p-value first.
        * Absolute Pearson correlation of every feature with
          ``DEATH_EVENT``, strongest correlation first.
        * Shapiro-Wilk p-values of every feature and of ``DEATH_EVENT``
          itself, smallest p-value first.

        Args:
            data: DataFrame holding the eleven feature columns named in
                ``feature_name`` below and a 0/1 ``DEATH_EVENT`` column.

        Returns:
            None.
        """
        feature_name = ['age', 'serum_creatinine', 'ejection_fraction', 'serum_sodium',
                        'high_blood_pressure', 'anaemia', 'platelets', 'creatinine_phosphokinase',
                        'smoking', 'sex', 'diabetes']
        death = data.DEATH_EVENT.values
        dead = death == 1
        alive = death == 0

        # Mann Whitney U Test, Table 5
        # The two samples are the feature values of the deceased and of the
        # surviving patients. Testing a feature against the 0/1 label vector
        # itself only measures that the feature is not distributed like the
        # labels, which says nothing about its relation to survival.
        m_w_u = np.array([
            stats.mannwhitneyu(data[name].values[dead], data[name].values[alive],
                               alternative='two-sided')[1]
            for name in feature_name
        ])
        table_5 = [[feature_name[i], m_w_u[i]] for i in np.argsort(m_w_u)]
        print('\n', tabulate(table_5, headers=['Feature Name', 'Mann–Whitney U'],
                             tablefmt='orgtbl'))  # Table 5 from page 4 Mann–Whitney U test

        # Pearson Correlation Coefficient, Table 6
        pearson = np.abs([stats.pearsonr(data[name], data.DEATH_EVENT)[0] for name in feature_name])
        # Descending order so that, like the p-value tables, the most
        # relevant feature is listed first.
        table_6_A = [[feature_name[i], pearson[i]] for i in np.argsort(-pearson)]
        print('\n', tabulate(table_6_A, headers=['Feature Name', 'Pearson Correlation Coefficient'],
                             tablefmt='orgtbl'))  # Table 6 from page 8 Pearson Correlation Coefficient

        # Shapiro-Wilk Test , Table 6
        shapiro_columns = feature_name + ['DEATH_EVENT']
        shapiro_labels = feature_name + ['Dead Event']
        shapiro = np.array([stats.shapiro(data[name])[1] for name in shapiro_columns])
        table_6_B = [[shapiro_labels[i], shapiro[i]] for i in np.argsort(shapiro)]
        print('\n', tabulate(table_6_B, headers=['Feature Name', 'Shapiro–Wilk tests'],
                             tablefmt='orgtbl'))  # Table 6 from page 8 Shapiro–Wilk tests

    def feature_importance_gini(self, data):
        """Plot the mean Gini importance of every feature over 100 forests.

        Args:
            data: DataFrame with the feature columns plus ``time`` and
                ``DEATH_EVENT``; the latter two are excluded from training.

        Returns:
            None. A horizontal bar chart is shown.
        """
        # Loop-invariant: build the training frame once and take the labels
        # from the very columns the forest is fitted on.
        features = data.drop(['time', 'DEATH_EVENT'], axis=1)
        target = data.DEATH_EVENT
        feature_name = features.columns.values
        importances = np.zeros(len(feature_name))
        n = 100
        for seed in range(n):
            # A different (but reproducible) seed per run: with a constant
            # seed every iteration would fit the identical forest and the
            # repetition would average nothing.
            rnd_clf = RandomForestClassifier(n_estimators=100, n_jobs=-1,
                                             random_state=seed, criterion='gini')
            rnd_clf.fit(features, target)
            importances += rnd_clf.feature_importances_
        importances /= n  # mean importance across the runs
        plt.title('Feature Importance gini')
        index = np.argsort(importances)
        plt.barh(range(len(index)), importances[index], color='b', align='center')
        plt.yticks(range(len(index)), [feature_name[i] for i in index])
        plt.xlabel('Relative Importance')
        plt.show()

    def feature_importance_accuracy_reduction(self, data):
        """Plot the accuracy lost when a feature is removed and print Table 9.

        Ten random train/test splits (80 test rows each) are drawn. On every
        split a random forest is trained once with all features and once
        with each single feature excluded; the mean accuracy drop relative
        to the full model is plotted per feature. The metrics of the full
        model, averaged over the splits, are printed as one table row.

        Args:
            data: DataFrame with the eleven feature columns named in
                ``feature_name`` below, ``time`` (ignored) and a 0/1
                ``DEATH_EVENT`` column.

        Returns:
            None.
        """
        feature_name = ['No feature Excluded', 'sex', 'smoking', 'diabetes', 'high_blood_pressure', 'anaemia',
                        'age', 'ejection_fraction', 'serum_sodium', 'serum_creatinine', 'platelets',
                        'creatinine_phosphokinase']
        accuracy = np.zeros(len(feature_name))
        # Running sums for the all-feature model, in table order:
        # MCC, F1, accuracy, TP rate, TN rate, PR AUC, ROC AUC.
        totals = np.zeros(7)
        n = 10
        for _ in range(n):
            train_data, test_data = train_test_split(data, test_size=80)
            train_data = train_data.drop(['time'], axis=1)
            test_data = test_data.drop(['time'], axis=1)
            y_train = train_data.DEATH_EVENT
            y_test = test_data.DEATH_EVENT
            for j, excluded in enumerate(feature_name):
                # Entry 0 is the baseline that keeps every feature.
                dropped = ['DEATH_EVENT'] if j == 0 else ['DEATH_EVENT', excluded]
                x_train = train_data.drop(dropped, axis=1)
                x_test = test_data.drop(dropped, axis=1)
                rnd_clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0, criterion='entropy')
                rnd_clf.fit(x_train, y_train)
                y_pred = rnd_clf.predict(x_test)
                run_accuracy = accuracy_score(y_test, y_pred)
                accuracy[j] += run_accuracy
                if j == 0:
                    # Curve-based metrics need continuous scores; column 1
                    # is the positive class provided both classes occur in
                    # the training split.
                    y_score = rnd_clf.predict_proba(x_test)[:, 1]
                    pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_score)
                    totals += [
                        matthews_corrcoef(y_test, y_pred),
                        f1_score(y_test, y_pred),
                        run_accuracy,
                        recall_score(y_test, y_pred),
                        # TN rate (specificity) is the recall of class 0,
                        # not the precision of class 1.
                        recall_score(y_test, y_pred, pos_label=0),
                        auc(pr_recall, pr_precision),
                        roc_auc_score(y_test, y_score),
                    ]

        # Every metric, ROC AUC included, is averaged over the n splits.
        totals /= n
        accuracy_reduction = -(accuracy - accuracy[0]) / n
        index = np.argsort(accuracy_reduction)
        plt.title('Feature Importance accuracy reduction')
        plt.barh(range(len(index)), accuracy_reduction[index], color='b', align='center')
        plt.yticks(range(len(index)), [feature_name[i] for i in index])
        plt.xlabel('Relative Importance')
        plt.show()
        table_9 = [['Random Forest', *totals]]
        print('\n', tabulate(table_9,
                             headers=[
                                 'Method', 'MCC', 'F 1 score', 'Accuracy', 'TP rate', 'TN rate', 'PR AUC', 'ROC AUC'
                             ], tablefmt='orgtbl'))

    def ML_prediction(self, data):
        """Compare seven classifiers over 100 random splits and print the result.

        In every repetition the hyper-parameters of the SVMs, k-NN and the
        random forest are drawn at random, all models are fitted on the
        training part of a fresh split (120 test rows) and scored on the
        test part. The table shows the mean of each metric, best MCC first.

        Args:
            data: DataFrame whose columns are the predictors plus a 0/1
                ``DEATH_EVENT`` target column.

        Returns:
            None. The table is written to stdout.
        """
        models_name = ['neural_network', 'support_vector_linear', 'support_vector_rbf', 'k_nearest_neighbour',
                       'random_forest', 'naive_bayes', 'decision_tree']
        report = np.zeros((len(models_name), 7))  # 7 classifiers and 7 metrics
        n = 100
        for _ in range(n):
            k = random.choice([1, 3, 5])
            c_l = abs(random.gauss(0.1, 0.01)) + 0.0000001
            c_r = abs(random.gauss(10, 2)) + 0.00001
            r_es = random.randint(500, 700)
            models = [
                MLPClassifier(max_iter=300, validation_fraction=0.20, activation='relu', solver='adam'),
                SVC(C=c_l, kernel='linear'),
                SVC(C=c_r, kernel='rbf'),
                KNeighborsClassifier(n_neighbors=k),
                RandomForestClassifier(n_estimators=r_es, n_jobs=-1, criterion='entropy'),
                GaussianNB(),
                DecisionTreeClassifier(criterion='entropy'),
            ]
            # train_test_split returns (train, test) in that order; unpacking
            # it the other way round trains on the 120-row hold-out set.
            train_data, test_data = train_test_split(data, test_size=120)
            x_train = train_data.drop(['DEATH_EVENT'], axis=1)
            y_train = train_data.DEATH_EVENT
            x_test = test_data.drop(['DEATH_EVENT'], axis=1)
            y_test = test_data.DEATH_EVENT

            for j, model in enumerate(models):
                model.fit(x_train, y_train)
                y_pred = model.predict(x_test)
                # Curve-based metrics need continuous scores: class-1
                # probability where available, otherwise the decision value
                # (SVC without probability=True).
                if hasattr(model, 'predict_proba'):
                    y_score = model.predict_proba(x_test)[:, 1]
                else:
                    y_score = model.decision_function(x_test)
                pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_score)
                report[j] += [
                    matthews_corrcoef(y_test, y_pred),
                    f1_score(y_test, y_pred),
                    accuracy_score(y_test, y_pred),
                    recall_score(y_test, y_pred),
                    # TN rate (specificity) is the recall of class 0.
                    recall_score(y_test, y_pred, pos_label=0),
                    auc(pr_recall, pr_precision),
                    roc_auc_score(y_test, y_score),
                ]
        index = np.argsort(-1 * report[:, 0])
        table_4 = [[models_name[i], *(report[i] / n)] for i in index]
        print('\n', tabulate(table_4, headers=[
            'Method', 'MCC', 'F 1 score', 'Accuracy', 'TP rate', 'TN rate', 'PR AUC', 'ROC AUC'],
                             tablefmt='orgtbl'))

    def ML_prediction_2features(self, data):
        """Compare seven fixed-configuration classifiers over 100 random splits.

        Same evaluation as the full comparison but with constant
        hyper-parameters; the caller is expected to pass only the columns
        that should be used as predictors.

        Args:
            data: DataFrame whose columns are the predictors plus a 0/1
                ``DEATH_EVENT`` target column.

        Returns:
            None. The table of mean metrics (best MCC first) is printed.
        """
        models_name = ['neural_network', 'support_vector_linear', 'support_vector_rbf', 'k_nearest_neighbour',
                       'random_forest', 'naive_bayes', 'decision_tree']
        report = np.zeros((len(models_name), 7))  # 7 classifiers and 7 metrics
        n = 100
        for _ in range(n):
            models = [
                MLPClassifier(random_state=1, max_iter=300,
                              validation_fraction=0.20, activation='relu', solver='adam'),
                SVC(C=0.1, kernel='linear'),
                SVC(C=10, kernel='rbf'),
                KNeighborsClassifier(n_neighbors=3),
                RandomForestClassifier(n_estimators=700, n_jobs=-1, random_state=0, criterion='entropy'),
                GaussianNB(),
                DecisionTreeClassifier(random_state=0),
            ]
            # train_test_split returns (train, test) in that order; unpacking
            # it the other way round trains on the 120-row hold-out set.
            train_data, test_data = train_test_split(data, test_size=120)
            x_train = train_data.drop(['DEATH_EVENT'], axis=1)
            y_train = train_data.DEATH_EVENT
            x_test = test_data.drop(['DEATH_EVENT'], axis=1)
            y_test = test_data.DEATH_EVENT

            for j, model in enumerate(models):
                model.fit(x_train, y_train)
                y_pred = model.predict(x_test)
                # Curve-based metrics need continuous scores: class-1
                # probability where available, otherwise the decision value
                # (SVC without probability=True).
                if hasattr(model, 'predict_proba'):
                    y_score = model.predict_proba(x_test)[:, 1]
                else:
                    y_score = model.decision_function(x_test)
                pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_score)
                report[j] += [
                    matthews_corrcoef(y_test, y_pred),
                    f1_score(y_test, y_pred),
                    accuracy_score(y_test, y_pred),
                    recall_score(y_test, y_pred),
                    # TN rate (specificity) is the recall of class 0.
                    recall_score(y_test, y_pred, pos_label=0),
                    auc(pr_recall, pr_precision),
                    roc_auc_score(y_test, y_score),
                ]
        index = np.argsort(-1 * report[:, 0])
        table_4 = [[models_name[i], *(report[i] / n)] for i in index]
        print('\n', tabulate(table_4, headers=[
            'Method', 'MCC', 'F 1 score', 'Accuracy', 'TP rate', 'TN rate', 'PR AUC', 'ROC AUC'],
                             tablefmt='orgtbl'))  # classifier comparison on the reduced feature set

    def leave_one_feature_out(self, data):
        """Score a random forest on the given columns over 100 random splits.

        Args:
            data: DataFrame whose columns are the predictors to use plus a
                0/1 ``DEATH_EVENT`` target column (the caller removes the
                feature under test beforehand).

        Returns:
            ndarray of shape ``(1, 3)`` holding the mean MCC, F1 score and
            accuracy on the test splits.
        """
        # Only the three returned metrics are accumulated.
        report = np.zeros((1, 3))
        n = 100
        for _ in range(n):
            random_forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0, criterion='entropy')
            # train_test_split returns (train, test) in that order; unpacking
            # it the other way round trains on the 120-row hold-out set.
            train_data, test_data = train_test_split(data, test_size=120)
            random_forest.fit(train_data.drop(['DEATH_EVENT'], axis=1), train_data.DEATH_EVENT)
            y_test = test_data.DEATH_EVENT
            y_pred = random_forest.predict(test_data.drop(['DEATH_EVENT'], axis=1))
            report[0] += [
                matthews_corrcoef(y_test, y_pred),
                f1_score(y_test, y_pred),
                accuracy_score(y_test, y_pred),
            ]
        return report / n

    def plot_feature_ranking(self, report, feature_name):
        """Plot MCC, F1 and accuracy obtained with each feature left out.

        Args:
            report: Sequence of per-feature results, each holding the three
                values (MCC, F1 score, accuracy); any nesting that reshapes
                to ``(n_features, 3)`` is accepted.
            feature_name: Names of the left-out features; entry ``i`` labels
                row ``i`` of ``report``. It may be longer than ``report``.

        Returns:
            None. A horizontal bar chart sorted by accuracy is shown.
        """
        # -1 lets the row count follow the input instead of fixing it at 10.
        report = np.array(report).reshape(-1, 3)
        feature_name = np.asarray(feature_name)  # allow plain lists as well
        mcc = report[:, 0]
        f1 = report[:, 1]
        acc = report[:, 2]
        index = np.argsort(-1 * acc)
        df = pd.DataFrame({'MCC': mcc[index], 'F1-score': f1[index], 'Accuracy': acc[index]}, index=feature_name[index])
        df.plot.barh()
        plt.title('Leaving One feature out for determining its value')
        plt.show()

    def plot_fig_3(self, data):
        """Show the decision regions of a linear SVM on two features.

        A linear-kernel SVC (C=0.1) is fitted on ``serum_creatinine`` and
        ``ejection_fraction`` against ``DEATH_EVENT`` using all rows of
        ``data``; the samples and the fitted regions are then plotted.

        Args:
            data: DataFrame containing the three columns named above.

        Returns:
            None.
        """
        predictors = data[['serum_creatinine', 'ejection_fraction']].values
        labels = data.DEATH_EVENT.values

        classifier = SVC(C=0.1, kernel='linear')
        classifier.fit(predictors, labels)

        plot_decision_regions(predictors, labels, clf=classifier, legend=2)
        plt.title('Scatterplot of serum creatinine versus ejection fraction')
        plt.show()

    def plot_fig_4(self, data):
        """Bar-plot the percentage of survivors per 30-day follow-up bucket.

        Bucket ``k`` (labelled ``'k'`` on the 'Months' axis, k = 0..9)
        collects the patients whose ``time`` lies in ``[30*k, 30*(k+1))``;
        bucket 0 also takes any value below 30.

        Args:
            data: DataFrame with a numeric ``time`` column and a 0/1
                ``DEATH_EVENT`` column.

        Returns:
            None. The bar chart is shown.
        """
        time = data.time.values
        survived = abs(1 - data.DEATH_EVENT.values)

        # Boolean masks are used per bucket: stacking the differently sized
        # np.where results into one array is rejected by current NumPy
        # releases (ragged input).
        months = []
        percentages = []
        for k in range(10):
            upper = 30 * (k + 1)
            in_bucket = time < upper
            if k > 0:
                in_bucket &= time >= 30 * k
            count = in_bucket.sum()
            months.append(str(k))
            # An empty bucket has no defined percentage; NaN leaves its bar
            # out instead of dividing by zero.
            percentages.append(survived[in_bucket].sum() / count * 100 if count else float('nan'))

        df = pd.DataFrame({'Months': months, 'Survived Percentage': percentages})
        df.plot.bar(x='Months', y='Survived Percentage', rot=0)
        plt.show()


# Script entry point: load the CSV and run every analysis step in order.
if __name__ == '__main__':
    obj = HF_Pred()
    data = pd.read_csv(obj.filepath)
    data.platelets = data.platelets / 1000
    # The binary columns and DEATH_EVENT are deliberately left as loaded.
    # The former bare `column.astype(bool)` statements discarded their
    # result and therefore never changed the frame; they are dropped here.
    obj.dataset(data)
    obj.feature_ranking(data)
    obj.feature_importance_gini(data)
    feature_name = data.columns.values[:11]
    # NOTE(review): only the first ten of the eleven selected names are
    # left out in turn, so the eleventh one is never ranked - confirm
    # whether that omission is intended.
    report_one_feature_out = [
        obj.leave_one_feature_out(data.drop([feature_name[i], 'time'], axis=1))
        for i in range(10)
    ]
    obj.plot_feature_ranking(report_one_feature_out, feature_name)
    obj.feature_importance_accuracy_reduction(data)
    obj.ML_prediction(data.drop(['time'], axis=1))
    obj.ML_prediction_2features(data[['serum_creatinine', 'ejection_fraction', 'DEATH_EVENT']])
    obj.plot_fig_3(data)
    obj.plot_fig_4(data)
