In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Importing all the tools we need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score,f1_score
from sklearn.metrics import RocCurveDisplay

# plot_roc_curve was removed in scikit-learn 1.2 (RocCurveDisplay replaces it);
# keep the legacy name importable on old versions without crashing on new ones.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Definition
See if you can find any other trends in heart data to predict certain cardiovascular events or find any clear indications of heart health.
## 2. Data
Taken from https://www.kaggle.com/ronitf/heart-disease-uci
## 3. Features

This is where you'll get different information about each of the features in your data.

**Create data dictionary**

1. age - age in years
2. sex - (1 = male; 0 = female)
3. cp - chest pain type
    * 0: Typical angina: chest pain related to decreased blood supply to the heart
    * 1: Atypical angina: chest pain not related to heart
    * 2: Non-anginal pain: typically esophageal spasms (non heart related)
    * 3: Asymptomatic: chest pain not showing signs of disease
4. trestbps - resting blood pressure (in mm Hg on admission to the hospital); anything above 130-140 is typically cause for concern
5. chol - serum cholesterol in mg/dl
    * serum = LDL + HDL + .2 * triglycerides
    * above 200 is cause for concern
6. fbs - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
    * '>126' mg/dL signals diabetes
7. restecg - resting electrocardiographic results
    * 0: Nothing to note
    * 1: ST-T Wave abnormality
        * can range from mild symptoms to severe problems
        * signals non-normal heart beat
    * 2: Possible or definite left ventricular hypertrophy
        * Enlarged heart's main pumping chamber
8. thalach - maximum heart rate achieved
9. exang - exercise induced angina (1 = yes; 0 = no)
10. oldpeak - ST depression induced by exercise relative to rest; looks at the stress of the heart during exercise (an unhealthy heart will stress more)
11. slope - the slope of the peak exercise ST segment
    * 0: Upsloping: better heart rate with exercise (uncommon)
    * 1: Flatsloping: minimal change (typical healthy heart)
    * 2: Downsloping: signs of unhealthy heart
12. ca - number of major vessels (0-3) colored by fluoroscopy
    * colored vessel means the doctor can see the blood passing through
    * the more blood movement the better (no clots)
13. thal - thallium stress result
    * 1,3: normal
    * 6: fixed defect: used to be defect but ok now
    * 7: reversible defect: no proper blood movement when exercising
14. target - have disease or not (1=yes, 0=no) (= the predicted attribute)

In [None]:
# Load the heart-disease table from the Kaggle input CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="../input/heart-disease-uci/heart.csv")

In [None]:
# Preview the first five rows
df.head(n=5)

# Data exploration

In [None]:
# Print the DataFrame summary (.info()) to check that every column is numeric
df.info()

In [None]:
# Count missing values per column
df.isnull().sum()

In [None]:
# How many rows fall into each target class
df.target.value_counts()

In [None]:
# Same class counts, drawn as a bar chart
df.target.value_counts().plot.bar(color=["salmon", "lightblue"]);

In [None]:
 # Summary statistics of the DataFrame's columns
 df.describe()

In [None]:
# Bar chart of target counts (x axis) split by sex (one bar colour per sex value)
pd.crosstab(df["target"], df["sex"]).plot.bar(figsize=(10, 6),
                                              color=["salmon", "lightblue"])
plt.title("Heart Disease Frequency for Sex")
plt.xlabel("0 = No Disease, 1 = Disease")
# Columns are sex 0 then 1, which the data dictionary defines as female, male
plt.legend(["Female", "Male"])
plt.xticks(rotation=0);

In [None]:
# Compare age against maximum heart rate, coloured by diagnosis
plt.figure(figsize=(10, 6))

# Patients with heart disease (target == 1)
plt.scatter(df.age[df.target == 1], df.thalach[df.target == 1],
            color="salmon", label="Disease")

# Patients without heart disease (target == 0)
plt.scatter(df.age[df.target == 0], df.thalach[df.target == 0],
            color="lightblue", label="No Disease")

# Titles and labels; the legend text is bound to each scatter via `label=`
# so it cannot drift out of sync with the plotting order.
plt.title("Heart Disease in function of Age and Max Heart Rate")
plt.xlabel("Age")
plt.ylabel("Max Heart Rate")
plt.legend();

In [None]:
# Histogram of the age column to inspect its distribution
df["age"].plot(kind="hist");

In [None]:
# Bar chart of chest-pain type (x axis) split by target (one bar colour per class)
pd.crosstab(df.cp, df.target).plot(kind="bar",
                                   figsize=(10, 6),
                                   color=["salmon", "lightblue"])

# Add some communication
plt.title("Heart Disease Frequency Per Chest Pain Type")
plt.xlabel("Chest Pain Type")
plt.ylabel("Amount")
# Crosstab columns are ordered target 0, then 1, so the first bar series is
# "No Disease" and the second is "Disease".
plt.legend(["No Disease", "Disease"])
plt.xticks(rotation=0);

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.heatmap(data=corr_matrix,
                 cmap="YlGnBu",
                 fmt=".2f",
                 annot=True,
                 linewidths=0.5)

# Modelling

In [None]:
# Features are every column except the label; the label is the target column
X = df.drop(columns="target")
y = df.target

In [None]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so every
# score computed from this split is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

We're going to try 3 different machine learning models:
1. Logistic regression
2. K-Nearest Neighbours Classifier
3. Random Forest Classifier

In [None]:
# Baseline models to compare, keyed by display name.
# LogisticRegression's default max_iter=100 is too low for the lbfgs solver to
# converge on these unscaled features (it emits a ConvergenceWarning), so the
# iteration budget is raised.
models = {"Logistic Regression": LogisticRegression(max_iter=1000),
          "KNN": KNeighborsClassifier(),
          "Random Forest": RandomForestClassifier()}

# Create a function to fit and score models

def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Train each model on the training split and score it on the test split.

    models  : dict mapping a display name to an estimator exposing
              ``fit(X, y)`` and ``score(X, y)``
    X_train : training data (no labels)
    X_test  : testing data (no labels)
    y_train : training labels
    y_test  : testing labels

    Returns a dict mapping each model name to the value returned by its
    ``score(X_test, y_test)`` call, in the iteration order of ``models``.
    """
    # Seed NumPy's global RNG before any model is fitted
    np.random.seed(42)

    def _train_then_score(estimator):
        # Fit first, then evaluate the freshly fitted estimator
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _train_then_score(estimator)
            for label, estimator in models.items()}

In [None]:
# Fit and score every baseline model on the same train/test split
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)
model_scores

In [None]:
# One-row table of scores, transposed so each model becomes one bar
model_compare = pd.DataFrame(model_scores, index=["accuracy"])
model_compare.transpose().plot(kind="bar");

Results: KNN is lagging behind. We will check how its accuracy changes with a little tweaking. If it does not improve, KNN will be dropped from further tuning.

# Tuning

## Checking accuracy based on different parameters for KNN

In [None]:
# Candidate values for n_neighbors and one reusable KNN estimator
neighbors = range(1, 40)
knn = KNeighborsClassifier()

# Score of the estimator on each split, one entry per candidate value
train_scores, test_scores = [], []

for k in neighbors:
    # Reconfigure the same estimator and refit it on the training split
    knn.set_params(n_neighbors=k).fit(X_train, y_train)

    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

In [None]:
# Train vs. test score for every n_neighbors value that was evaluated
plt.plot(neighbors, train_scores, label = "train score")
plt.plot(neighbors, test_scores, label = "test scores")
# Derive the ticks from `neighbors` itself (every 2nd value keeps them legible)
# so the axis always matches the range that was actually swept.
plt.xticks(np.arange(min(neighbors), max(neighbors) + 1, 2))
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend()

print(f"Maximum KNN score on the test data: {max(test_scores)*100:.2f}%")

## Trying to get better parameters for LogisticRegression() and RandomForestClassifier() using RandomizedSearchCV

In [None]:
# Search space for LogisticRegression: 20 log-spaced C values from 1e-4 to 1e4
log_reg_grid = dict(C=np.logspace(-4, 4, num=20),
                    solver=["liblinear"])

# Search space for RandomForestClassifier
rf_grid = dict(n_estimators=np.arange(10, 1000, 50),
               max_depth=[None, 3, 5, 10],
               min_samples_split=np.arange(2, 20, 2),
               min_samples_leaf=np.arange(1, 20, 2))

Tune LogisticRegression

In [None]:
# Tune LogisticRegression

# Randomized search: 20 candidates drawn from log_reg_grid, 5-fold CV each
rs_log_reg = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=log_reg_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)

# Run the search on the training split
rs_log_reg.fit(X_train, y_train)

In [None]:
# Hyperparameter combination selected by the randomized search for LogisticRegression
rs_log_reg.best_params_

In [None]:
# Accuracy of the tuned LogisticRegression search object on the held-out test split
rs_log_reg.score(X_test,y_test)

Tune RandomForestClassifier

In [None]:
# Randomized search for RandomForestClassifier: 20 candidates drawn from
# rf_grid, 5-fold CV each, using all available cores.
# Both the forest and the candidate sampler get a fixed random_state: with
# n_jobs=-1 the fits run in worker processes, so a seed set on NumPy's global
# RNG elsewhere in the notebook would not make this search reproducible.
rs_rf = RandomizedSearchCV(RandomForestClassifier(random_state=42),
                           param_distributions=rf_grid,
                           cv=5,
                           n_iter=20,
                           verbose=True,
                           n_jobs=-1,
                           random_state=42)

# Run the search on the training split
rs_rf.fit(X_train, y_train)

In [None]:
# Hyperparameter combination selected by the randomized search for RandomForestClassifier
rs_rf.best_params_

In [None]:
# Accuracy of the tuned RandomForestClassifier search object on the held-out test split
rs_rf.score(X_test,y_test)

Results: Since our LogisticRegression model provides the best scores so far, we'll try to improve them again using GridSearchCV.

## Hyperparameter Tuning with GridSearchCV



In [None]:
# Finer search space for LogisticRegression: 30 log-spaced C values
log_reg_grid = dict(C=np.logspace(-4, 4, num=30),
                    solver=["liblinear"])

# Exhaustive grid search over that space with 5-fold cross-validation
gs_log_reg = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=log_reg_grid,
    cv=5,
    verbose=True,
)

# Run the search on the training split
gs_log_reg.fit(X_train, y_train)

In [None]:
# Hyperparameter combination selected by the grid search for LogisticRegression
gs_log_reg.best_params_

In [None]:
# Score the grid-searched LogisticRegression on the held-out test split
gs_log_reg.score(X_test,y_test)

# Evaluating our tuned machine learning classifier, beyond accuracy

After acquiring the best parameters for LogisticRegression, we will evaluate the following metrics:
* ROC curve and AUC score
* Confusion matrix
* Classification report
* Precision
* Recall
* F1-score

In [None]:
# Predict test-split labels with the grid-searched model; reused by the
# confusion matrix and classification report cells
y_preds = gs_log_reg.predict(X_test)

ROC curve and AUC metric

In [None]:
# Plot the ROC curve (the legend reports the AUC) for the tuned model on the
# test split. RocCurveDisplay.from_estimator replaces plot_roc_curve, which
# was removed in scikit-learn 1.2.
RocCurveDisplay.from_estimator(gs_log_reg, X_test, y_test);

Confusion matrix

In [None]:
sns.set(font_scale=1.5)

def plot_conf_mat(y_test, y_preds):
    """
    Draw the confusion matrix of true labels (`y_test`) against predicted
    labels (`y_preds`) as an annotated Seaborn heatmap on a 3x3-inch figure,
    with the colour bar hidden.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    counts = confusion_matrix(y_test, y_preds)
    ax = sns.heatmap(counts, annot=True, cbar=False)
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    
plot_conf_mat(y_test,y_preds)

Classification report

In [None]:
# Text report comparing the true test labels with the tuned model's predictions
print(classification_report(y_test,y_preds))

## Calculate evaluation metrics using cross-validation

In [None]:
# Re-display the grid search's selected hyperparameters; they configure the
# classifier used for the cross-validated metrics
gs_log_reg.best_params_

In [None]:
# Fresh (unfitted) LogisticRegression configured with whatever hyperparameters
# the grid search selected, so nothing is hard-coded
clf = LogisticRegression(**gs_log_reg.best_params_)


Cross validated accuracy

In [None]:
# Mean accuracy over 5 cross-validation folds on the full dataset
cv_acc = cross_val_score(clf, X, y, cv=5, scoring="accuracy").mean()

Cross validated precision

In [None]:
# Mean precision over 5 cross-validation folds on the full dataset
cv_precision = cross_val_score(clf, X, y, cv=5, scoring="precision").mean()

Cross validated recall

In [None]:
# Mean recall over 5 cross-validation folds on the full dataset
cv_recall = cross_val_score(clf, X, y, cv=5, scoring="recall").mean()

Cross validated f1-score

In [None]:
# Mean F1 score over 5 cross-validation folds on the full dataset
cv_f1 = cross_val_score(clf, X, y, cv=5, scoring="f1").mean()

In [None]:
# Report each cross-validated metric as a percentage, one per line
print(
    f"Accuracy: {cv_acc*100:.2f}%   Overall, how often is the classifier correct?",
    f"Recall (Sensitivity): {cv_recall*100:.2f}%    When it's actually yes, how often does it predict yes?",
    f"Precision: {cv_precision*100:.2f}%    When it predicts yes, how often is it correct?",
    f"F1 score: {cv_f1*100:.2f}%    good F1 score means that you have low false positives and low false negatives",
    sep="\n",
)


In [None]:
# Collect the four cross-validated means into a one-row table and plot them,
# transposed so each metric becomes one bar
cv_metrics = pd.DataFrame(
    {
        "Accuracy": cv_acc,
        "Precision": cv_precision,
        "Recall": cv_recall,
        "F1 score": cv_f1,
    },
    index=[0],
)
cv_metrics.transpose().plot(kind="bar", title="Cross-validated metrics", legend=False);

## Feature importance

In [None]:
# Fit the tuned classifier on the training split so its coefficients (coef_) can be inspected
clf.fit(X_train,y_train)

In [None]:
# Show the first rows again as a reminder of the column names and order
df.head(n=5)

In [None]:
 # Coefficients learned by the fitted classifier
 clf.coef_

In [None]:
# Pair each coefficient with the feature it belongs to.
# Use X.columns (the feature frame without the label) rather than df.columns:
# df also contains "target", and zipping against it only lines up by accident
# because "target" happens to be the last column.
feature_dict = dict(zip(X.columns, clf.coef_[0]))
feature_dict

In [None]:
# One-row table of coefficients, transposed so each feature becomes one bar
feature_df = pd.DataFrame(feature_dict, index=[0])
feature_df.transpose().plot(kind="bar", title="feature Importance", legend=False);

In [None]:
# Helper function for plotting feature importance
def plot_features(columns, importances, n=20):
    """
    Plot the ``n`` largest feature importances as a horizontal bar chart.

    columns     : feature names
    importances : one importance value per feature, aligned with ``columns``
    n           : how many of the top-ranked features to draw (default 20)

    Features are ranked by importance value in descending order, so the
    largest value is drawn at the top. Returns None; the chart is drawn on a
    new figure.
    """
    ranked = (pd.DataFrame({"features": columns,
                            "feature_importances": importances})
              .sort_values("feature_importances", ascending=False)
              .reset_index(drop=True)
              # Truncate names and values together so both always have the
              # same length, whatever `n` is.
              .iloc[:n])
    fig, ax = plt.subplots()
    ax.barh(ranked["features"], ranked["feature_importances"])
    ax.set_ylabel("Features")
    ax.set_xlabel("Feature importance")
    # barh draws the first row at the bottom; flip so the top-ranked is on top
    ax.invert_yaxis()

In [None]:
# Horizontal bar chart of the fitted classifier's coefficients, one bar per training feature
plot_features(columns=X_train.columns, importances=clf.coef_[0])

## Results (as a non expert in field):

1. The dataset has a strange distribution of heart disease frequency by sex

If the patient is female, she has a higher chance of heart disease.

Is it a dataset problem? Or do women generally go to hospital only with serious pain?

2. The correlation coefficient and feature importance for cholesterol are close to zero.

That's a surprising finding; the media commonly report a correlation between cholesterol levels and heart disease.

3. There is a positive correlation between cp (chest pain) and heart disease

cp - chest pain type
  * 0: Typical angina: chest pain related to decreased blood supply to the heart
  * 1: Atypical angina: chest pain not related to heart
  * 2: Non-anginal pain: typically esophageal spasms (non heart related)
  * 3: Asymptomatic: chest pain not showing signs of disease

So, basically, if there is no chest pain, it's more likely that the patient has heart disease, which may be a sign of a dataset problem.