# Predicting Heart Disease using Machine Learning 

**Objective:** Given clinical parameters about a patient, build a predictive model that can predict whether a patient has heart disease or not.

### Data Source

* Kaggle: https://www.kaggle.com/ronitf/heart-disease-uci
* UCI Machine Learning repository: https://archive.ics.uci.edu/ml/datasets/Heart+Disease

### Data Dictionary

1. age - age in years
2. sex - (1 = male; 0 = female)
3. cp - chest pain type
        0: Typical angina: chest pain related to decreased blood supply to the heart
        1: Atypical angina: chest pain not related to heart
        2: Non-anginal pain: typically esophageal spasms (non heart related)
        3: Asymptomatic: chest pain not showing signs of disease
4. trestbps - resting blood pressure (in mm Hg on admission to the hospital)
    - anything above 130-140 is typically cause for concern
5. chol - serum cholesterol in mg/dl
    - serum = LDL + HDL + .2 * triglycerides 
    - above 200 is cause for concern
6. fbs - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
    - '>126' mg/dL signals diabetes
7. restecg - resting electrocardiographic results
        0: Nothing to note
        1: ST-T Wave abnormality
            can range from mild symptoms to severe problems
            signals non-normal heart beat
        2: Possible or definite left ventricular hypertrophy
            Enlarged heart's main pumping chamber
8. thalach - maximum heart rate achieved
9. exang - exercise induced angina (1 = yes; 0 = no)
10. oldpeak - ST depression induced by exercise relative to rest
    - looks at stress of heart during exercise
    - unhealthy heart will stress more
11. slope - the slope of the peak exercise ST segment
        0: Upsloping: better heart rate with exercise (uncommon)
        1: Flatsloping: minimal change (typical healthy heart)
        2: Downsloping: signs of unhealthy heart
12. ca - number of major vessels (0-3) colored by fluoroscopy
    - colored vessel means the doctor can see the blood passing through
    - the more blood movement the better (no clots)
13. thal - thallium stress result
        1,3: normal
        6: fixed defect: used to be defect but ok now
        7: reversible defect: no proper blood movement when exercising
14. target - have disease or not (1=yes, 0=no) (= the predicted attribute)

In [None]:
# Regular EDA and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Scikit-Learn ML models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Model Evaluation tools
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve

# Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

## Data

In [None]:
# Data
# Read the heart-disease CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data_p = pd.read_csv('../input/heart-disease-uci/heart.csv')

In [None]:
# Preview the first rows of the loaded data
data_p.head()

In [None]:
# Print the per-column summary of the loaded data
data_p.info()

## EDA

In [None]:
# Visualizing the correlation matrix
corr_matrix = data_p.corr()
fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.heatmap(corr_matrix,
                annot=True,
                linewidths=0.5,
                fmt=".2f",
                cmap='YlGnBu');

In [None]:
type(data_p.corr())

In [None]:
# Count the positive (target == 1) and negative (target == 0) cases and
# report each one's share of the labelled rows.
n_true = int((data_p['target'] == 1).sum())
n_false = int((data_p['target'] == 0).sum())
print("Number of Positive cases: {0}. Percentage = {1}".format(n_true, n_true/(n_true + n_false) * 100))
print("Number of Negative cases: {0}. Percentage = {1}".format(n_false, n_false/(n_true + n_false) * 100))

# Visualization
dic_1 = {"Positive": n_true, "Negative": n_false}
fig, ax = plt.subplots(figsize=(4, 4))
# Pass concrete lists rather than dict views so the bar labels/heights are
# plain sequences.
ax.bar(list(dic_1.keys()), list(dic_1.values()), width=0.8, color=['salmon', 'lightblue'])
ax.set(title="Number of Cases",
       ylabel="Number of cases");

In [None]:
# Horizontal box plot of age, one box per value of sex
ax = sns.boxplot(data=data_p, x='age', y='sex', orient="h")
ax.set_title('Whisker Plot of Age vs. Sex')
ax.set_ylabel("M = 1| F = 0");

 This box-and-whisker plot shows the age quartiles of our dataset for each sex. The median age of the females in our dataset is higher than that of the males. There are no female subjects below the age of thirty in our sample, whereas the male subjects cover a wider age range than the female subjects.

In [None]:
# Counts of positive & negative cases for each sex
pd.crosstab(index=data_p['sex'], columns=data_p['target'])

## Modelling

In [None]:
# Treat every column with at most 10 distinct values as categorical
categorical_variables = [
    column
    for column in data_p
    if len(data_p[column].unique()) <= 10
]

# The target is the label to predict, so it must not be dummy-encoded
categorical_variables.remove('target')

print(categorical_variables)

In [None]:
# Make dummy variables for categorical variables
data = pd.get_dummies(data_p, columns=categorical_variables)

In [None]:
# Split the data
X = data.drop('target', axis=1)
y = data['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [None]:
# Non-scaled data
X_train_ws = X_train
X_test_ws = X_test

In [None]:
# Scaled data
# Fit the scaler on the training split only and apply those same statistics
# to the test split. Re-fitting on the test split would standardise it with
# its own mean/variance, putting train and test on different scales and
# using test data to learn preprocessing parameters.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Candidate classifiers, keyed by the name used in score tables and charts
models = dict([
    ("Logistic Regression", LogisticRegression(solver='liblinear', random_state=7)),
    ("Random Forest", RandomForestClassifier(random_state=7)),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("SVM Kernel", SVC(random_state=7, kernel="rbf")),
    ("Naive Bayes", GaussianNB()),
])

# Function to fit and score models
def fit_score(models, X_train, X_test, y_train, y_test):
    """
    Fit every model on the training split and score it on the test split.

    1. models: dict mapping a display name to an estimator exposing
       `fit(X, y)` and `score(X, y)`; each estimator is fitted in place
    2. X_train, X_test, y_train, y_test: the already-split data

    Returns a dict mapping each model name to the value returned by its
    `score(X_test, y_test)` call, in the same order as `models`.
    """
    # Seed NumPy's global RNG before any fitting happens
    np.random.seed(7)

    def _fit_then_score(estimator):
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _fit_then_score(estimator)
            for label, estimator in models.items()}

In [None]:
# Model accuracies with scaling
model_scores = fit_score(models=models,
                         X_train=X_train,
                         X_test=X_test,
                         y_train=y_train,
                         y_test=y_test)
model_scores

In [None]:
# Model accuracies without scaling
model_scores_ws = fit_score(models=models,
                       X_train=X_train_ws,
                       X_test=X_test_ws,
                       y_train=y_train,
                       y_test=y_test)
model_scores_ws

## Model Comparison

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(model_scores.keys(), model_scores.values())
ax.legend(['Accuracy'])
ax.set(ylabel="Accuracy",
       title='Model Score Comparision (With Scaling)');

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(model_scores_ws.keys(), model_scores_ws.values())
ax.legend(['Accuracy'])
ax.set(ylabel="Accuracy",
       title='Model Score Comparision (Without Scaling)');

#### Model performance on scaled data vs non-scaled data:
* KNN Model had a drastic change in accuracy score with scaling. 
> **Conclusion:** Euclidean distance based models require standardisation on data.

* Random Forest and Naive Bayes model performed better without scaling. 
> **Conclusion:** Random Forest and Naive Bayes don't require standardisation of the data (Gaussian Naive Bayes effectively standardises each feature internally, since it models every feature with its own mean and variance).

* Logistic Regression performed slightly better without scaling

* SVM scored better with scaling. 
> **Conclusion:** Yes, the SVM kernel performs better with standardised data. Because Support Vector Machine (SVM) optimization works by minimizing the decision vector w, the optimal hyperplane is influenced by the scale of the input features; it is therefore recommended that data be standardized (mean 0, variance 1) prior to SVM model training.

Other resource(s):
https://www.youtube.com/watch?v=mnKm3YP56PY

### EXTRA MODEL: CatBoost

In [None]:
# Install catboost
# import sys
# !conda config --add channels conda-forge
# !conda install --yes --prefix {sys.prefix} catboost

In [None]:
# Visualize confusion matrix
def plot_conf_matrix(con_mat):
    """
    Draw a confusion matrix as an annotated heatmap.

    con_mat: matrix of counts laid out as scikit-learn's `confusion_matrix`
             returns it (rows = true labels, columns = predicted labels),
             which is how every caller in this notebook builds it.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    ax = sns.heatmap(con_mat,
                     annot=True,
                     cbar=False)
    # Heatmap rows run down the y-axis and columns along the x-axis, so with
    # rows = true / columns = predicted the x-axis is the prediction.
    ax.set(xlabel="Predicted label",
           ylabel="True label")

In [None]:
# Apply Min-Max scaling For CatBoost
# Learn the per-feature min/max from the training split only and reuse them
# for the test split; re-fitting on the test split would map it onto a
# different range than the one the model is trained on.
mms = MinMaxScaler()
X_train_mms = mms.fit_transform(X_train_ws)
X_test_mms = mms.transform(X_test_ws)

In [None]:
from catboost import CatBoostClassifier

# Train a CatBoost classifier (only `verbose` is overridden) on the
# Min-Max scaled training split
cb_clf = CatBoostClassifier(verbose=False)
cb_clf.fit(X_train_mms, y_train)

In [None]:
# Predict labels for the Min-Max scaled test split
y_preds_cb = cb_clf.predict(X_test_mms)

In [None]:
# Confusion matrix of the test labels against the CatBoost predictions
cat_conf_mat = confusion_matrix(y_test, y_preds_cb)
plot_conf_matrix(cat_conf_mat)

In [None]:
# Score on the training split
cb_clf.score(X_train_mms, y_train)

In [None]:
# Score on the held-out test split
cb_clf.score(X_test_mms, y_test)

In [None]:
# Cross validation
# 5-fold accuracy of the CatBoost estimator over the full feature matrix.
# NOTE(review): this passes the unscaled X, whereas cb_clf above was fitted
# on Min-Max scaled features - confirm that is intended.
cb_clf_cv = cross_val_score(cb_clf,
                           X,
                           y,
                           cv=5,
                           scoring='accuracy',
                           verbose=False)

In [None]:
# Mean of the fold accuracies
cb_clf_cv.mean()

In [None]:
# Spread (standard deviation) of the fold accuracies
cb_clf_cv.std()

Let's look at the following:
* Hyperparameter tuning
* Feature importance
* Confusion matrix
* Cross-validation
* Precision
* Recall
* F1 score
* Classification report
* ROC curve
* Area under the curve (AUC)

## Evaluation

### Model 1: KNN

In [None]:
# Train/test scores for every k, kept both as dicts keyed by str(k) and as
# ordered lists (the lists are what gets plotted).
train_scores = {}
test_scores = {}

train_scores_l = []
test_scores_l = []

# Number of neighbors
neighbors = range(1, 50)

# Setup KNN instance
knn = KNeighborsClassifier()

# Loop through the different number of neighbors
for i in neighbors:
    knn.set_params(n_neighbors=i)
    knn.fit(X_train, y_train)
    # Score each split once and reuse the value for both containers instead
    # of running every prediction pass twice.
    train_acc = knn.score(X_train, y_train)
    test_acc = knn.score(X_test, y_test)
    train_scores[str(i)] = train_acc
    test_scores[str(i)] = test_acc
    train_scores_l.append(train_acc)
    test_scores_l.append(test_acc)

In [None]:
# Maximum train score value
max_train_val = max(train_scores.values())
max_train_val

In [None]:
# Calculate the mean of training scores
dict_vals = train_scores.values()

vals_list = []

for i in dict_vals:
    vals_list.append(i)
    
arr_1 = np.array(vals_list)
arr_1.mean()

In [None]:
# Maximum test score value
max_test_val = max(test_scores.values())
max_test_val

In [None]:
# Calculate the mean of test scores
dict_vals_2 = test_scores.values()

vals_list_2 = []

for i in dict_vals_2:
    vals_list_2.append(i)
    
arr_2 = np.array(vals_list)
arr_2.mean()

In [None]:
# Train and test score as a function of the number of neighbors
for scores, label in ((train_scores_l, "Train score"),
                      (test_scores_l, "Test score")):
    plt.plot(neighbors, scores, label=label)
plt.xticks(np.arange(0, 50, 5))
plt.xlabel("Number of neighbors")
plt.ylabel('Model score')
plt.legend()

print(f"Maximum KNN score on the test data: {max(test_scores_l)*100:.2f}%");

In [None]:
# Find number of neighbors with the highest accuracy
# (n_trees holds the best test score, despite its name)
n_trees = max_test_val

# Every k (as its string key) whose test score equals that maximum
max_key = [key for key, val in test_scores.items() if val == n_trees]
for key in max_key:
    print(key)

In [None]:
# Scale X for cross-validation
# Use a dedicated scaler so the shared `sc` instance is not re-fitted on the
# full dataset as a side effect.
# NOTE(review): scaling all of X before cross-validation lets every
# validation fold influence the scaling statistics; wrapping the scaler and
# the model in a Pipeline passed to cross_val_score would avoid that.
X_scaled = StandardScaler().fit_transform(X)

In [None]:
# Take the best number of neighbors
best_num = int(max_key[1])

In [None]:
# Initiate KNN instance
knn_cv = KNeighborsClassifier(n_neighbors=best_num)

In [None]:
# Cross-validated accuracy
# 5-fold accuracy of the KNN model (n_neighbors=best_num) on the scaled
# full feature matrix
cv_knn = cross_val_score(knn_cv, 
                         X_scaled, 
                         y, 
                         cv=5, 
                         scoring='accuracy') 

In [None]:
# Accuracy of each individual fold
cv_knn

In [None]:
# Average of cross-validated scores
cv_knn.mean()

In [None]:
# Standard deviation of cross-validated scores
cv_knn.std()

In [None]:
# Setting up parameters for Randomized Search CV and Grid Search CV
# n_neighbors spans every k from 3 to 40. Listing only the two endpoints
# would give 2 x 3 = 6 combinations, fewer than the n_iter=12 candidates the
# randomized search below asks for.
KNN_params = {'n_neighbors': list(range(3, 41)),
              'p': [1, 2, 5]}

In [None]:
# Perform Randomized Search on KNN model
rs_KNN = RandomizedSearchCV(estimator=KNeighborsClassifier(),
                     param_distributions=KNN_params,
                     cv=5,
                     verbose=True,
                     n_iter=12)

rs_KNN.fit(X_train, y_train)

In [None]:
rs_KNN.best_params_

In [None]:
rs_KNN.score(X_train, y_train)

In [None]:
rs_KNN.score(X_test, y_test)

In [None]:
# Final model
# Uses best_num (picked from the manual neighbor sweep), not the parameters
# found by the randomized search.
clf_knn = KNeighborsClassifier(n_neighbors=best_num)
clf_knn.fit(X_train, y_train)
# Model score on test data
clf_knn.score(X_test, y_test)

In [None]:
# Model score on train data
clf_knn.score(X_train, y_train)

In [None]:
# Predicted labels for the test split
y_preds_knn = clf_knn.predict(X_test)

In [None]:
# Confusion matrix
knn_conf_mat = confusion_matrix(y_test, y_preds_knn)
plot_conf_matrix(knn_conf_mat)

In [None]:
# Classification report
print(classification_report(y_test, y_preds_knn))

In [None]:
## Plot ROC curve and calculate AUC metric
plot_roc_curve(clf_knn, X_test, y_test);

### Model 2: Naive Bayes

P.S: I don't have sufficient knowledge of hyperparameter tuning for this particular model and so, I will skip it for now and implement the model maybe again in a future update.

### Model 3: Logistic Regression
Resources: https://www.kaggle.com/joparga3/2-tuning-parameters-for-logistic-regression
* Solver (According to Scikit-Learn): For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.

In [None]:
# Logistic Regression grid for hyperparameter tuning
log_reg_grid = {"C": np.logspace(-4, 4, 30),
               'solver': ['liblinear']}

In [None]:
# Perform RandomizedSearchCV on logistic regression
rs_log_reg = RandomizedSearchCV(LogisticRegression(random_state=7),
                                param_distributions=log_reg_grid,
                                cv=5,
                                n_iter=20,
                                verbose=True)

rs_log_reg.fit(X_train, y_train)

In [None]:
rs_log_reg.best_params_

In [None]:
rs_log_reg.score(X_test, y_test)

In [None]:
y_preds_rslg = rs_log_reg.predict(X_test)

In [None]:
lg_conf_mat = confusion_matrix(y_test, y_preds_rslg)
plot_conf_matrix(lg_conf_mat)

In [None]:
# Perform GridSearchCV on logistic Regression
# Exhaustively evaluates every candidate in log_reg_grid with 5-fold CV
gs_log_reg = GridSearchCV(LogisticRegression(random_state=7),
                         param_grid=log_reg_grid,
                         cv=5,
                         verbose=True)

gs_log_reg.fit(X_train, y_train)

In [None]:
# Parameter combination with the best cross-validated score
gs_log_reg.best_params_

In [None]:
# Score of the refitted best estimator on the test split
gs_log_reg.score(X_test, y_test)

In [None]:
cv_log_reg = cross_val_score(LogisticRegression(C=0.1082636733874054,
                                               solver='liblinear'),
                            X_scaled,
                            y,
                            cv=5,
                            scoring='accuracy')

In [None]:
cv_log_reg

In [None]:
cv_log_reg.mean()

In [None]:
cv_log_reg.std() * 100

In [None]:
# Final model
clf_log = LogisticRegression(C=0.1082636733874054,
                            solver='liblinear')

clf_log.fit(X_train, y_train)

In [None]:
clf_log.score(X_test, y_test)

In [None]:
y_preds_clf_log = clf_log.predict(X_test)

In [None]:
print(classification_report(y_test, y_preds_clf_log))

In [None]:
# Plot ROC Curve
plot_roc_curve(clf_log, X_test, y_test)

### Model 4: SVM Kernel

In [None]:
# GridSearch
svm_grid = {"C": np.logspace(-1, 2, 20),
            "gamma": np.logspace(-4, 2, 20)}

In [None]:
gs_svm = GridSearchCV(SVC(kernel='rbf', random_state=7),
                      param_grid=svm_grid,
                      cv=5,
                      verbose=True)

gs_svm.fit(X_train, y_train)

In [None]:
svm_params = gs_svm.best_params_

In [None]:
c_param = svm_params['C']
g_param = svm_params['gamma']

In [None]:
gs_svm.score(X_test, y_test)

In [None]:
svm_cv = cross_val_score(SVC(kernel='rbf', random_state=7, C=c_param, gamma=g_param),
                         X_scaled,
                         y,
                         cv=5,
                         scoring='accuracy')

In [None]:
svm_cv.mean()

In [None]:
svm_cv.std()

In [None]:
# Final model
svm_clf = SVC(kernel='rbf', random_state=7, C=c_param, gamma=g_param)
svm_clf.fit(X_train, y_train)

In [None]:
svm_preds = svm_clf.predict(X_test)

In [None]:
svm_conf_mat = confusion_matrix(y_test, svm_preds)
plot_conf_matrix(svm_conf_mat)

In [None]:
svm_clf.score(X_test, y_test)

### Model 5: Random Forest

In [None]:
# Dictionary of number of trees with its accuracy score
tas = {}

for i in range(1, 2000, 100):
    rf_clf = RandomForestClassifier(n_estimators=i)
    rf_clf.fit(X_train_ws, y_train)
    tas.update({str(i): rf_clf.score(X_test_ws, y_test)})

In [None]:
max(tas.values())

In [None]:
# Grid search for other parameters
rf_grid = {"max_depth": [5, 8, 15, 25, 30],
           "min_samples_split": [2, 5, 10, 15, 100],
           "min_samples_leaf": [2, 5, 10],
           "n_estimators": [100, 150, 200]} 

In [None]:
gs_rf = GridSearchCV(RandomForestClassifier(),
                     param_grid = rf_grid,
                     cv = 5,
                     verbose = True)

gs_rf.fit(X_train_ws, y_train)

In [None]:
gs_rf.score(X_test_ws, y_test)

In [None]:
rf_params = gs_rf.best_params_

In [None]:
rf_params

In [None]:
rf_cv = cross_val_score(RandomForestClassifier(random_state=7, max_depth=5, min_samples_leaf=2, min_samples_split=15, n_estimators= 150),
                        X,
                        y,
                        cv=5,
                        scoring='accuracy')

In [None]:
rf_cv.mean()

In [None]:
rf_cv.std() * 100

In [None]:
# Default model cross-validation
# 5-fold accuracy of a forest that only sets random_state, on the unscaled
# full feature matrix
rf_cv2 = cross_val_score(RandomForestClassifier(random_state=7),
                        X,
                        y,
                        cv=5,
                        scoring='accuracy')

In [None]:
# Mean of the fold accuracies
rf_cv2.mean()

In [None]:
# Standard deviation of the fold accuracies, in percentage points
rf_cv2.std() * 100

In [None]:
# Final model
# Seeded forest with n_estimators=100, fitted on the unscaled training split
rf_clf = RandomForestClassifier(random_state=7, n_estimators=100)
rf_clf.fit(X_train_ws, y_train)

In [None]:
# Score on the unscaled test split
rf_clf.score(X_test_ws, y_test)

In [None]:
# Score on the unscaled training split
rf_clf.score(X_train_ws, y_train)

In [None]:
# Predicted labels for the unscaled test split
rf_y_preds = rf_clf.predict(X_test_ws)

In [None]:
# Confusion matrix of the test labels against the forest's predictions
rf_conf_mat = confusion_matrix(y_test, rf_y_preds)
plot_conf_matrix(rf_conf_mat)

In [None]:
plot_roc_curve(rf_clf, X_test, y_test);

In [None]:
print(classification_report(y_test, rf_y_preds))

In [None]:
# Final scores with hyperparameter tuning
f_scores = {"Cat Boost:": cb_clf.score(X_test_mms, y_test),
            "KNN:": rs_KNN.score(X_test, y_test) ,
            "Logistic Regression:": rs_log_reg.score(X_test, y_test),
            "SVM Kernel:": gs_svm.score(X_test, y_test),
            "Random Forest:": rf_clf.score(X_test_ws, y_test)} 

In [None]:
for i in f_scores:
    print(i, f_scores[i])