## Predicting Heart Disease mortality using machine learning

### Comparing 3 Models


###  Problem definition
> Given a patient's clinical parameters, can we predict mortality in congestive heart failure (CHF)?

###  Data
https://www.kaggle.com/andrewmvd/heart-failure-clinical-data

Chicco, D., Jurman, G. Machine learning can predict survival of patients with heart failure from serum creatinine and ejection fraction alone. BMC Med Inform Decis Mak 20, 16 (2020). https://doi.org/10.1186/s12911-020-1023-5


In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline  

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# model evaluation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve

In [None]:
# Load the heart-failure clinical records CSV into a DataFrame.
df = pd.read_csv("/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv")

In [None]:
# (rows, columns) of the loaded dataset.
df.shape

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Number of rows for each value of the target column DEATH_EVENT.
df["DEATH_EVENT"].value_counts()

In [None]:
# Bar chart of the target value counts; the trailing semicolon suppresses the cell's text output.
df["DEATH_EVENT"].value_counts().plot(kind="bar", color=["green", "red"]);

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics for each column.
df.describe()

In [None]:
# Grouped bar chart: DEATH_EVENT counts broken down by the `sex` column.
pd.crosstab(df.DEATH_EVENT, df.sex).plot(
    kind="bar",
    color=["salmon", "lightblue"],
    figsize=(10, 6),
)
plt.xlabel("0 = No Death_Event, 1 = Death_Event")
plt.ylabel("Subjects")
plt.title("Death_Event Frequency by Sex")
plt.xticks(rotation=0)
# Replace the default legend entries with readable names for the two `sex` values.
plt.legend(["Female", "Male"]);

In [None]:
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
# The matrix is computed once and reused for plotting.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 10))
# Draw on the axes created above explicitly rather than relying on
# pyplot's implicit "current axes".
ax = sns.heatmap(corr_matrix,
                 annot=True,
                 linewidths=0.5,
                 fmt=".2f",
                 cmap="YlGnBu",
                 ax=ax);

In [None]:
# Features are every column except the target; the target is DEATH_EVENT.
X = df.drop(columns="DEATH_EVENT")
y = df.DEATH_EVENT

In [None]:
# Seed NumPy's global RNG so the shuffle below is repeatable: train_test_split
# is given no random_state, so it draws from that global generator.
np.random.seed(36)
# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Display the training features.
X_train

In [None]:
# Display the training labels together with how many there are.
y_train, len(y_train)

### Comparing 3 classification models

* Logistic Regression
* K-Nearest Neighbours Classifier
* Random Forest Classifier

In [None]:
# The three candidate classifiers, each with default hyperparameters,
# keyed by the display name used in the results.
models = {"Logistic Regression": LogisticRegression(),
         "KNN": KNeighborsClassifier(),
         "Random Forest": RandomForestClassifier()}

def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Train each model on the training split and score it on the test split.

    models: dict mapping a display name to an estimator that exposes
        fit(X, y) and score(X, y)
    X_train, y_train: features and labels passed to fit()
    X_test, y_test: features and labels passed to score()

    Returns a dict mapping each model name to the value returned by its
    score() call.
    """
    # Reseed NumPy's global RNG so every call starts from the same state.
    np.random.seed(36)

    results = {}
    for label, estimator in models.items():
        estimator.fit(X_train, y_train)
        results[label] = estimator.score(X_test, y_test)
    return results

In [None]:
# Fit all candidate models and collect their test-set scores.
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)
model_scores

In [None]:
# One-row table of scores (row label "accuracy"), transposed so that
# each model becomes one bar.
model_compare = pd.DataFrame(model_scores, index=["accuracy"])
model_compare.T.plot(kind="bar");

In [None]:
# Sweep n_neighbors from 1 to 20, recording train and test scores for each value.
neighbors = range(1, 21)
knn = KNeighborsClassifier()
train_scores, test_scores = [], []

for k in neighbors:
    # Reuse the same estimator object; only the neighbor count changes.
    knn.set_params(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

In [None]:
# KNN scores on the training data, one per n_neighbors value tried.
train_scores

In [None]:
# KNN scores on the test data, one per n_neighbors value tried.
test_scores

In [None]:
# Train vs. test score as a function of the number of neighbors.
for scores, label in ((train_scores, "train score"), (test_scores, "test score")):
    plt.plot(neighbors, scores, label=label)
plt.xlabel("number of neighbors")
plt.ylabel("Model score")
plt.xticks(np.arange(1, 21))
plt.legend()

# Best test-set score over all neighbor counts tried.
print(f"Maximum KNN score on the test data:{max(test_scores)*100:.2f}%")

In [None]:
# Hyperparameter search space for LogisticRegression, used with RandomizedSearchCV:
# 20 values of C spaced logarithmically between 1e-4 and 1e4, with a single solver.

log_reg_grid = {"C":np.logspace(-4, 4, 20),
               "solver": ["liblinear"]}

In [None]:
# Show the 20 candidate C values used in the search space above.
np.logspace(-4, 4, 20)

In [None]:
# Hyperparameter search space for RandomForestClassifier.
rf_grid = {"n_estimators": np.arange(10, 1000, 50),
          "max_depth": [None, 3, 5, 10],
          "min_samples_split": np.arange(2, 20, 2),
          "min_samples_leaf": np.arange(1, 20, 2)}

In [None]:
# Seed NumPy's global RNG so the randomized search is repeatable.
np.random.seed(36)

# Randomized hyperparameter search for LogisticRegression with 5-fold CV.
# The search space holds only len(C) x 1 solver = 20 combinations, so asking
# for more iterations than that makes scikit-learn warn and fall back to the
# grid size; n_iter is therefore sized to the grid itself.
rs_log_reg = RandomizedSearchCV(LogisticRegression(),
                                param_distributions=log_reg_grid,
                                cv=5,
                                n_iter=len(log_reg_grid["C"]) * len(log_reg_grid["solver"]),
                                verbose=True)

# Fit the search on the training split only.
rs_log_reg.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found by the randomized search.
rs_log_reg.best_params_

In [None]:
# Score of the tuned logistic regression on the held-out test split.
rs_log_reg.score(X_test, y_test)

In [None]:
# Randomized hyperparameter search for RandomForestClassifier:
# 40 sampled combinations from rf_grid, each evaluated with 5-fold CV.
np.random.seed(36)
rs_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=rf_grid,
    n_iter=40,
    cv=5,
    verbose=True,
)

rs_rf.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found for the random forest.
rs_rf.best_params_

In [None]:
# Score of the tuned random forest on the held-out test split.
rs_rf.score(X_test, y_test)

In [None]:
# Exhaustive grid search for LogisticRegression over a finer C grid
# (30 log-spaced values) with 5-fold CV.
log_reg_grid = {
    "C": np.logspace(-4, 4, 30),
    "solver": ["liblinear"],
}

gs_log_reg = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=log_reg_grid,
    cv=5,
    verbose=True,
)

gs_log_reg.fit(X_train, y_train);

In [None]:
# Best hyperparameter combination found by the grid search.
gs_log_reg.best_params_

In [None]:
# Score of the grid-searched logistic regression on the held-out test split.
gs_log_reg.score(X_test, y_test)

In [None]:
# Predictions of the grid-searched model on the test features.
y_preds = gs_log_reg.predict(X_test)

In [None]:
# Display the predicted labels.
y_preds

In [None]:
# ROC curve of the grid-searched model on the test split.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions the replacement is RocCurveDisplay.from_estimator(gs_log_reg, X_test, y_test),
# and the import of plot_roc_curve at the top of the notebook must be changed as well.
plot_roc_curve(gs_log_reg, X_test, y_test);

In [None]:
# Raw confusion matrix of test labels vs. predictions.
print(confusion_matrix(y_test, y_preds))

In [None]:
# Enlarge seaborn's fonts for the plots that follow.
sns.set(font_scale=1.5)

def plot_conf_mat(y_test, y_preds):
    """
    Plot the confusion matrix of y_test vs. y_preds as a seaborn heatmap.

    y_test: true labels
    y_preds: predicted labels, in the same order as y_test

    confusion_matrix() puts the true labels on the rows and the predicted
    labels on the columns, so the heatmap's y-axis is the true label and
    its x-axis is the predicted label.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    ax = sns.heatmap(confusion_matrix(y_test, y_preds),
                     annot=True,
                     cbar=False)
    # Columns of the matrix are predictions, rows are the ground truth.
    plt.xlabel("Predicted label")
    plt.ylabel("True label")

plot_conf_mat(y_test, y_preds)

In [None]:
# Text report of classification metrics for the test-set predictions.
print(classification_report(y_test, y_preds)) 

#### Evaluation metrics using cross-validation (CV)
Computed with `cross_val_score()`.

In [None]:
# Recall the best hyperparameters found by the grid search.
gs_log_reg.best_params_

In [None]:
# using best parameters
clf = LogisticRegression(C=221.22162910704503,
                      solver='liblinear')

In [None]:
# cross-validated accuracy
cv_acc = cross_val_score(clf,
                        X,
                        y,
                        cv=5,
                         scoring="accuracy")
cv_acc = np.mean(cv_acc)
cv_acc
            

In [None]:
# cross-validated Precision
cv_precision = cross_val_score(clf,
                        X,
                        y,
                        cv=5,
                         scoring="precision")
cv_precision = np.mean(cv_precision)
cv_precision

In [None]:
# cross-validated recall
cv_recall = cross_val_score(clf,
                        X,
                        y,
                        cv=5,
                         scoring="recall")
cv_recall = np.mean(cv_recall)
cv_recall

In [None]:
# cross.validated f1_score
cv_f1 = cross_val_score(clf,
                        X,
                        y,
                        cv=5,
                         scoring="f1")
cv_f1 = np.mean(cv_f1)
cv_f1

In [None]:
# Gather the four cross-validated means into a one-row table, then plot
# them as one bar per metric.
cv_metrics = pd.DataFrame(
    {
        "accuracy": cv_acc,
        "precision": cv_precision,
        "recall": cv_recall,
        "f1": cv_f1,
    },
    index=[0],
)

cv_metrics.T.plot(kind="bar",
                  title="Cross validated classification metrics",
                  legend=False);

In [None]:
# Recall the best hyperparameters found by the grid search.
gs_log_reg.best_params_

In [None]:
# Refit a logistic regression on the training split using the grid search's
# best hyperparameters, so its coefficients can be inspected below. The
# parameters are read from gs_log_reg rather than hard-coded so they cannot
# drift from the actual search result.
clf = LogisticRegression(**gs_log_reg.best_params_)
clf.fit(X_train, y_train);

In [None]:
# Fitted coefficients of the logistic regression.
clf.coef_

In [None]:
# Map each feature name to its fitted coefficient. The names come from the
# training features the model was fitted on, not from df, because df also
# contains the target column DEATH_EVENT, which has no coefficient.
feature_dict = dict(zip(X_train.columns, list(clf.coef_[0])))
feature_dict

In [None]:
# One-row table of coefficients, transposed so each feature becomes one bar.
feature_df = pd.DataFrame(feature_dict, index=[0])
feature_df.T.plot(kind="bar", title="feature importance", legend=False);