# Predicting heart disease using Machine Learning

This notebook uses various Python-based machine learning and data science libraries to build a model capable of predicting whether or not someone has heart disease based on their medical attributes.

We're going to take the following approach:
1. Problem definition
2. Data
3. Evaluation
4. Features
5. Modelling
6. Experimentation

## Data

The original data comes from the Cleveland database in the UCI Machine Learning Repository.

Another version is also available at Kaggle.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


In [None]:
# Loading data
df = pd.read_csv('../input/heart-disease-uci/heart.csv')
df.shape

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df["target"].value_counts()

In [None]:
df.isna().sum()

In [None]:
df["target"].value_counts().plot(kind="bar", color=["salmon", "lightblue"]);

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.sex.value_counts()

In [None]:
pd.crosstab(df.target, df.sex)

In [None]:
pd.crosstab(df.target, df.sex).plot(kind="bar",
                                   figsize=(10,6),
                                   color=["salmon", "lightblue"])
plt.title("Heart Disease Frequency for Sex")
plt.xlabel("0 = No Disease, 1=Disease")
plt.ylabel("Amount")
plt.legend(["Female", "Male"]);
plt.xticks(rotation=0);

In [None]:
df["thalach"].value_counts()

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(df.age[df.target==1],
           df.thalach[df.target==1],
           c="salmon");

plt.scatter(df.age[df.target==0],
           df.thalach[df.target==0],
           c="lightblue");

plt.title("Heart disease in function of Age and max Heart Rate")
plt.xlabel("Age")
plt.ylabel("Max Heart Rate")
plt.legend(["Disease", "No Disease"]);


In [None]:
df.age.plot.hist();

In [None]:
pd.crosstab(df.cp, df.target)

In [None]:
pd.crosstab(df.cp, df.target).plot(kind="bar",
                                   figsize=(10, 6),
                                    color=["salmon", "lightblue"])

plt.title("Heart Disease Frequency Per Chest Pain type")
plt.xlabel("Chest Pain type")
plt.ylabel("Amount")
plt.legend(["No Disease", "Disease"])
plt.xticks(rotation=0);

In [None]:
df.head()

In [None]:
# Correlation matrix
df.corr()

In [None]:
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.heatmap(corr_matrix,
                annot=True,
                linewidths=0.5,
                fmt="0.2f",
                cmap="YlGnBu");


In [None]:
df.head()

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

X = df.drop("target", axis=1)
y = df["target"]

X

In [None]:
y

In [None]:
np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
X_train, len(X_train)

In [None]:
y_train, len(y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


In [None]:
models = {"Logistic Regression": LogisticRegression(),
         "KNN": KNeighborsClassifier(),
         "Random Forest": RandomForestClassifier()}

def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fit every model on the training split and score it on the test split.

    models : dict mapping a display name to an object exposing
             ``fit(X, y)`` and ``score(X, y)``
    X_train, y_train : data passed to each model's ``fit``
    X_test, y_test : data passed to each model's ``score``

    Returns a dict mapping each name in `models` to the value returned by
    that model's ``score`` call, in the same order as `models`.
    """
    # Reset NumPy's global RNG before any model is fitted.
    np.random.seed(42)

    def _train_then_evaluate(estimator):
        # Fit first, then score, one model at a time.
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _train_then_evaluate(estimator)
            for label, estimator in models.items()}

In [None]:
model_scores = fit_and_score(models,
                            X_train,
                            X_test,
                            y_train,
                            y_test)
model_scores

In [None]:
model_compare = pd.DataFrame(model_scores, index=["accuracy"])
model_compare.T.plot.bar();

### Tuning the knn model

In [None]:


# Sweep n_neighbors over 1..20, recording train and test accuracy for each k.
neighbors = range(1, 21)
knn = KNeighborsClassifier()

train_scores, test_scores = [], []

for i in neighbors:
    # Reuse one estimator: update k, then refit on the training split.
    knn.set_params(n_neighbors=i).fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

In [None]:
train_scores

In [None]:
test_scores

In [None]:
plt.plot(neighbors, train_scores, label="Train score")
plt.plot(neighbors, test_scores, label="Test score")
plt.xticks(np.arange(1, 21, 1))
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend()

print(f"Maximum KNN score on the test data: {max(test_scores)*100:.2f}%")

In [None]:
# Tuning the n_neighbors to (11) from default (5). score slightly increased

### Tuning with RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [None]:
log_reg_grid = {"C": np.logspace(-4, 4, 20),
               "solver": ["liblinear"]}

rf_grid = {"n_estimators": np.arange(10, 1000, 50),
          "max_depth": [None, 3, 5, 10],
          "min_samples_split": np.arange(2, 20, 2),
          "min_samples_leaf": np.arange(1, 20, 2)}

In [None]:
np.arange(10, 1000, 50)

In [None]:
np.random.seed(42)

rs_log_reg = RandomizedSearchCV(LogisticRegression(),
                               param_distributions=log_reg_grid,
                               cv=5,
                               n_iter=20,
                               verbose=True)
rs_log_reg.fit(X_train, y_train)

In [None]:
rs_log_reg.best_params_

In [None]:
rs_log_reg.score(X_test, y_test)

In [None]:
# RandomizedSearchCV
np.random.seed(42)

rs_rf = RandomizedSearchCV(RandomForestClassifier(), 
                          param_distributions=rf_grid,
                          cv=5,
                          n_iter=20,
                          verbose=True)

rs_rf.fit(X_train, y_train)

In [None]:
X_train


In [None]:
rs_rf.score(X_test, y_test)

In [None]:
# GridSearchCV
log_reg_grid = {"C": np.logspace(-4, 4, 30),
               "solver": ["liblinear"]}

gs_log_reg = GridSearchCV(LogisticRegression(),
                           param_grid=log_reg_grid,
                           cv=5,
                           verbose=True)

gs_log_reg.fit(X_train, y_train)

In [None]:
gs_log_reg

In [None]:
gs_log_reg.best_params_

In [None]:
gs_log_reg.score(X_test, y_test)

In [None]:
model_scores

In [None]:
y_preds = gs_log_reg.predict(X_test)
y_preds

In [None]:
y_test

In [None]:
# Evaluating the trained models

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve

In [None]:
plot_roc_curve(gs_log_reg, X_test, y_test)

In [None]:
print(confusion_matrix(y_test, y_preds))

In [None]:
sns.set(font_scale=1.5)

def plot_conf_mat(y_test, y_preds):
    """
    Plot the confusion matrix of `y_test` against `y_preds` as a heatmap.

    y_test : true labels, passed as the first argument to `confusion_matrix`
    y_preds : predicted labels, passed as the second argument

    Draws on a new 3x3-inch figure; returns nothing.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    ax = sns.heatmap(confusion_matrix(y_test, y_preds),
                    annot=True,
                    cbar=False)
    # confusion_matrix(y_true, y_pred) puts true labels on the rows and
    # predictions on the columns; the heatmap draws rows on the y-axis and
    # columns on the x-axis, so the x-axis is the predicted label.
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    

plot_conf_mat(y_test, y_preds)

In [None]:
print(classification_report(y_test, y_preds))

In [None]:
gs_log_reg.best_params_

In [None]:
clf = LogisticRegression(C=0.20433597178569418,
                        solver='liblinear')

In [None]:
# Cross-Validated Accuracy
cv_acc = cross_val_score(clf,
                        X,
                        y,
                        cv=5,
                        scoring="accuracy")
cv_acc = np.mean(cv_acc)
cv_acc

In [None]:
# Cross-Validated Precision
cv_precision = cross_val_score(clf,
                        X,
                        y,
                        cv=5,
                        scoring="precision")

cv_precision = np.mean(cv_precision)
cv_precision

In [None]:
# Cross-Validated recall
cv_recall = cross_val_score(clf,
                        X,
                        y,
                        cv=5,
                        scoring="recall")

cv_recall = np.mean(cv_recall)
cv_recall

In [None]:
# Cross-Validated f1-score
cv_f1 = cross_val_score(clf,
                        X,
                        y,
                        cv=5,
                        scoring="f1")

cv_f1 = np.mean(cv_f1)
cv_f1

In [None]:
cv_metrics = pd.DataFrame({"Accuracy": cv_acc,
                          "Precision": cv_precision,
                          "Recall": cv_recall,
                          "F1": cv_f1},
                         index=[0])

cv_metrics.T.plot.bar(title="Cross-validated classification metrics", legend=False);

In [None]:
clf =LogisticRegression(C=0.20433597178569418,
                       solver='liblinear')
clf.fit(X_train, y_train);

In [None]:
clf.coef_

In [None]:
# Map each feature name to its fitted coefficient.
# Use X.columns (df without "target") rather than df.columns, so the pairing
# does not depend on "target" being the last column and zip() truncating it.
feature_dict = dict(zip(X.columns, clf.coef_[0]))
feature_dict

In [None]:
feature_df = pd.DataFrame(feature_dict, index=[0])
feature_df.T.plot.bar(title="Feature Importance", legend=False);

In [None]:
pd.crosstab(y_test,y_preds)

In [None]:
cv_acc