# Diagnosing Heart Disease

Data contains;

* age - age in years
* sex - (1 = male; 0 = female)
* cp - chest pain type
* trestbps - resting blood pressure (in mm Hg on admission to the hospital)
* chol - serum cholesterol in mg/dl
* fbs - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
* restecg - resting electrocardiographic results
* thalach - maximum heart rate achieved
* exang - exercise induced angina (1 = yes; 0 = no)
* oldpeak - ST depression induced by exercise relative to rest
* slope - the slope of the peak exercise ST segment
* ca - number of major vessels (0-3) colored by fluoroscopy
* thal - 3 = normal; 6 = fixed defect; 7 = reversible defect
* target - have disease or not (1=yes, 0=no)

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
import os

# Print the full path of every file found below the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

In [None]:
%matplotlib inline

# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old names, so use whichever one is available.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)

# sns.set() re-applies its defaults for every argument that is not passed, so
# the grid style and the font scale have to be given in a single call;
# calling sns.set(style=...) afterwards would put font_scale back to 1.
sns.set(style="darkgrid", font_scale=1.8)

# Explicit rcParams come after sns.set() so the seaborn theme does not
# overwrite them.
plt.rcParams['figure.figsize'] = (10.0, 8.0)
plt.rcParams['xtick.labelsize'] = 14
plt.rcParams['ytick.labelsize'] = 14
plt.rcParams['axes.labelsize'] = 18

# pandas display settings: wide tables, 4-decimal floats.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
# NOTE: this silences SettingWithCopyWarning for the whole notebook.
pd.options.mode.chained_assignment = None
pd.set_option('display.float_format', lambda x: '%.4f' % x)
# Print numpy floats in fixed-point rather than scientific notation.
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Import the Data

In [None]:
raw_df = pd.read_csv("../input/heart-disease-uci/heart.csv")

raw_df.shape

# Exploring the data + Preprocessing

Let's change the column names to be a bit clearer


In [None]:
# Descriptive column names, assigned positionally (same order as the CSV columns).
raw_df.columns = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'cholesterol',
    'fasting_blood_sugar',
    'rest_ecg',
    'max_heart_rate_achieved',
    'exercise_induced_angina',
    'st_depression',
    'st_slope',
    'num_major_vessels',
    'thalassemia',
    'target',
]

### Get some basic feel for the data

In [None]:
raw_df.sample(10)

In [None]:
raw_df.info()

In [None]:
raw_df.describe()

Since 54% of the patients in the dataset have heart disease, we can't infer much from the raw frequency of the target within other features (such as age), because the sample itself is strongly biased towards positive cases. <br/>
That is, we can't say that (for example) people aged 25-35 have a higher probability of having heart disease than people aged 35-45.

### Understanding resting electrocardiographic results

From looking at the feature descriptions, I don't understand the nature of some of the features. <br/>
Those features are:
<br/>resting electrocardiographic results (values 0, 1, 2)

<br/> Is this feature ordinal? <br/> I need to know this in order to decide whether or not to one-hot-encode it (if it's ordinal, it's better to leave it as-is).

So, let's try to make sense of the features that are not yet understood.

In [None]:
# Mean of the target (share of positive cases) for each rest_ecg value.
fig, ax = plt.subplots(figsize=(14, 8))
sns.set(font_scale=1.5)
sns.barplot(data=raw_df, x="rest_ecg", y="target", ci=None, ax=ax)
ax.set(xlabel='rest_ecg', ylabel='target')
ax.set_title("target distribution for each rest ECG value".title(), fontsize=20)
plt.show()

It doesn't seem ordinal to me.
Let's one-hot-encode!

### One-Hot-Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder


# Replace the categorical columns by indicator columns; drop_first leaves out
# one level per feature.
df = pd.get_dummies(
    raw_df,
    columns=["chest_pain_type", "rest_ecg", "thalassemia", "st_slope"],
    drop_first=True,
)

df.describe()

### Outliers

Since the dataset is not big, outliers will have a significant impact. Let's box-plot some columns suspected of containing outliers.

Reminder: the box spans the 25th-75th percentiles (the interquartile range, IQR). Points are flagged as outliers when they fall outside these limits:
<br/>
top: 75th percentile + 1.5 * IQR
<br/>
bottom: 25th percentile - 1.5 * IQR


In [None]:
cols_to_box_plot = ["cholesterol", "max_heart_rate_achieved"]

# squeeze=False makes plt.subplots always return a 2-D array of Axes, so the
# indexing below also works when only one column is listed (otherwise a single
# bare Axes object is returned, which cannot be indexed).
fig, ax = plt.subplots(len(cols_to_box_plot), 1, squeeze=False, figsize=(12, 10 * len(cols_to_box_plot)))
for i, col in enumerate(cols_to_box_plot):
    sns.boxplot(y=col, data=df, ax=ax[i, 0])
    ax[i, 0].set_title(f"Box Plot for {col} column".title())


Let's remove the outliers

In [None]:
def remove_outliers(df, col_names, how="IQR", what_to_do="drop"):
    """
    Handle outliers in the given columns using the 1.5 * IQR rule.

    A value is an outlier when it lies outside
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] of its column. Columns are processed one
    after another, so with "drop" the quartiles of a later column are computed
    on the rows that survived the earlier columns.

    params:
    df: input DataFrame; it is not modified, a new DataFrame is returned
    col_names: iterable of column names to check
    how: detection rule; only the IQR rule is implemented and the argument is
         currently not consulted (kept for interface compatibility)
    what_to_do: "drop" removes the outlier rows, "median" / "mean" replace the
                outlier values by the column median / mean (computed before
                the replacement, outliers included)
    returns: a DataFrame object
    raises: ValueError if what_to_do is not one of the supported options
    """
    if what_to_do not in ("drop", "median", "mean"):
        raise ValueError(f"what_to_do must be 'drop', 'median' or 'mean', got {what_to_do!r}")

    # Work on a copy so the replacement branches never write into the caller's frame.
    df = df.copy()

    for col_name in col_names:
        q1 = df[col_name].quantile(0.25)
        q3 = df[col_name].quantile(0.75)
        iqr = q3 - q1  # interquartile range

        # NaN compares False on both sides, so missing values count as outliers.
        is_inlier = (df[col_name] >= q1 - 1.5 * iqr) & (df[col_name] <= q3 + 1.5 * iqr)

        if what_to_do == "drop":
            df = df[is_inlier]
        elif what_to_do == "median":
            df.loc[~is_inlier, col_name] = df[col_name].median()
        else:  # "mean"
            df.loc[~is_inlier, col_name] = df[col_name].mean()

    return df

In [None]:
# Keep an untouched snapshot so models can later be compared on data with and
# without the outliers.
df_with_outliers = df.copy()
df = remove_outliers(df, col_names=["cholesterol", "max_heart_rate_achieved"])
df

check for NAs

In [None]:
df.isna().any()

Great! no NAs at all                     

Let's explore some more

In [None]:
df.target.value_counts()

It's well known that older people tend to have a higher probability of heart disease. I wonder at what age this starts to show, and by how much. Let's check.

Let's create a temporary df with one-hot encoding that keeps the first categorical value, <br/>
to better understand the correlation matrix.


In [None]:
temp_df = pd.get_dummies(raw_df, drop_first=False, columns=["chest_pain_type", "rest_ecg", "thalassemia", "st_slope"])


### Correlations

In [None]:
# Pairwise correlations of all columns of temp_df, annotated to one decimal.
plt.figure(figsize=(22, 16))
sns.heatmap(
    temp_df.corr(),
    vmin=-1,
    vmax=1,
    cmap='BrBG',
    annot=True,
    fmt='.1f',
).set_title("DataFrame Correlation Matrix\n", fontsize=16)
plt.show()

There doesn't seem to be strong multicollinearity in the data. <br/>
Perhaps st_slope and st_depression (0.6) are strongly correlated and hence worth considering in terms of whether or not to remove one of them.

In [None]:
# Correlation of every column with the target, strongest positive first.
plt.figure(figsize=(8, 12))
target_corr = temp_df.corr()[['target']].sort_values(by='target', ascending=False)
heatmap = sns.heatmap(target_corr, annot=True, cmap='BrBG', vmin=-1, vmax=1)
heatmap.set_title('Features Correlating with the Target feature'.title(), fontdict={'fontsize': 18}, pad=16);

## Feature Engineering

In [None]:
# Engineered features: a quadratic age term and an age * sex interaction.
# They are added to df_with_outliers as well, so that comparing the two frames
# later isolates the effect of the outlier removal instead of also comparing
# two different feature sets.
for frame in (df, df_with_outliers):
    frame["age_sq"] = frame["age"] ** 2
    frame["age_sex"] = frame["age"] * frame["sex"]

# Feature Scaling

We'll try both normalization and standardization, and see which one performs better.

## Standardization

Standardization: Will scale the input to have mean of 0 and variance of 1. $$X_{stand} = \frac{X - \mu}{\sigma}$$


In [None]:
from sklearn.preprocessing import StandardScaler 

# Columns whose maximum is 1 are treated as 0/1 indicator variables and left
# unscaled. The maximum is read from the column itself rather than from
# df.describe(): describe() leaves out boolean columns (which get_dummies
# produces on recent pandas versions), so looking them up there raises KeyError.
cols = [col for col in df.columns if df[col].max() != 1.0]

# NOTE(review): the scaler is fitted on the full dataset before any
# cross-validation split, so the held-out folds influence the scaling.
scaler = StandardScaler()

stndrd_df = df.copy()
stndrd_df[cols] = scaler.fit_transform(df[cols])

stndrd_df.describe()

## Normalization

Min-Max Scaling: scales the input to have a minimum of 0 and a maximum of 1. <br/> That is, it scales the data into the range [0, 1]. This is useful when the parameters have to be on the same positive scale. However, because the range is fixed by the extreme values, outliers compress the remaining values into a narrow interval. $$X_{norm} = \frac{X - X_{min}}{X_{max} - X_{min}}$$

In [None]:
from sklearn.preprocessing import MinMaxScaler


# Min-max scale every column of df. cols is the complete column list, so the
# target column goes through the scaler together with the features.
cols = df.columns
scaler = MinMaxScaler()
# Scale into a copy so the unscaled df stays available.
norm_df = df.copy()
norm_df[cols] = scaler.fit_transform(df[cols])

# Summary statistics of the scaled frame.
norm_df.describe()

## Dummy df (for Bernoulli Naive bayes)

In [None]:
from sklearn.preprocessing import OrdinalEncoder


continues_cols = ["age", "age_sex", "age_sq", "resting_blood_pressure", "cholesterol", "max_heart_rate_achieved", "st_depression"]
category_cols = ["num_major_vessels"]
dummy_cols = [col for col in df.columns if col not in continues_cols + category_cols + ["target"]]

dummy_df = df.copy()

dummy_df = pd.get_dummies(dummy_df, drop_first=True, columns=category_cols)

for col in continues_cols:
    # Bin each continuous column into (up to) three quantile bins.
    # labels=False returns integer bin codes, which stays valid when
    # duplicates='drop' merges identical bin edges and fewer than three bins
    # remain. Passing a fixed list of three labels raises ValueError in that
    # case, and falling back to quantiles that do not span 0..1 would turn
    # every value outside them into NaN.
    dummy_df[col] = pd.qcut(dummy_df[col], 3, labels=False, duplicates='drop')

# Re-encode every column as ordinal codes (returned as a float array).
enc = OrdinalEncoder()
dummy_df_ = enc.fit_transform(dummy_df)

# fit_transform returns a bare array: rebuild the frame, then expand the binned
# columns into indicator columns.
dummy_df = pd.DataFrame(dummy_df_, columns=dummy_df.columns)
dummy_df = pd.get_dummies(dummy_df, drop_first=True, columns=continues_cols)


[col for col in dummy_df.columns if max(dummy_df[col]) != 1.0]  # make sure all cols are binary

# ML models

## First, Benchmark

The benchmark would be the most common label in the train set

In [None]:
counts = df.target.value_counts(dropna=False)
counts

The classes are roughly balanced, so the majority-class benchmark is only slightly better than a coin flip and should be easy to beat in terms of accuracy.

In [None]:
from sklearn.metrics import f1_score 


# Majority-class baseline: always predict the most frequent label. The label
# is taken from `counts` instead of being hard-coded to 1.
majority_label = counts.idxmax()
acc = counts.max() / counts.sum()
pred = np.full(len(df), majority_label)
f1 = f1_score(df.target, pred)

print('Beanchmark Accuracy:', acc)
# F1 is computed for the positive class (label 1): it is 0 only when the
# majority label is 0, otherwise recall is 1 and precision equals the accuracy.
print('Beanchmark F1:', f1)

#### Let's create some variables and dfs that will help with the help functions

In [None]:
# Preprocessing variants every model is tried on.
dfs = {
    "Normalized_df": norm_df,
    "Standadized_df": stndrd_df,
    "not_scaled_df": df,
    "not_scaled_with_outliers_df": df_with_outliers,
}
# Same variants plus the fully binarised frame. Not every model needs to be
# tried on it; it is mainly there for Naive Bayes.
dfs_w_dummy = {**dfs, "dummy_df": dummy_df}

target_var = "target"

evaluations = ["avg_test_accuracy", "avg_test_f1"]

# Two-level row index: (dataframe name, metric name).
index = [
    np.repeat(list(dfs_w_dummy), len(evaluations)),
    evaluations * len(dfs_w_dummy),
]

# The benchmark scores are repeated for every dataframe.
beanchmark_vals = np.tile(np.array([acc, f1]), len(dfs_w_dummy))

evaluation_df = pd.DataFrame(index=index)
evaluation_df["beanchmark"] = beanchmark_vals

evaluation_df

### Helper function, to evaluate a model

In [None]:
from numpy import mean
from numpy import std
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate



def evaluate(classifier, dfs=dfs, k=4):
    """
    Description: Cross-validates the classifier on every dataframe in dfs, prints the scores and writes the
    mean test accuracy and mean test f1 into the module-level evaluation_df (one column per classifier class name).
    params:
    classifier: classifier object, handed to cross_validate
    dfs: dict mapping a dataframe name to a dataframe that contains the target_var column. Several dataframes
         allow comparing differently preprocessed datasets (features, scaling, ...).
    k: value passed as cv to cross_validate
    returns: the module-level evaluation_df DataFrame, updated in place
    """
    clf_name = type(classifier).__name__
    print(f"For model: {clf_name}")

    for df_name, frame in dfs.items():
        # Every column except the target is a feature.
        is_feature = frame.columns != target_var
        X = frame.loc[:, is_feature]
        y = frame[target_var]

        cv_scores = cross_validate(
            classifier, X, y,
            cv=k,
            scoring=('accuracy', 'f1'),
            return_train_score=True,
        )
        fold_accuracy = cv_scores['test_accuracy']
        fold_f1 = cv_scores['test_f1']
        avg_test_accuracy = fold_accuracy.mean()
        avg_test_f1 = fold_f1.mean()

        print(f"\nfor dataframe {df_name}:")
        print(f"{avg_test_accuracy:.3f} accuracy with a standard deviation of {fold_accuracy.std():.3f}")
        print(f"{avg_test_f1:.3f} f1 with a standard deviation of {fold_f1.std():.3f}")
        print("Train accuracy:", cv_scores["train_accuracy"])
        print((df_name, evaluations), clf_name)

        # Rows (df_name, metric) of this classifier's column.
        evaluation_df.loc[(df_name, evaluations), clf_name] = [avg_test_accuracy, avg_test_f1]

    print("\n")

    return evaluation_df



### Helper function, to perform a grid search

In [None]:
from sklearn.model_selection import GridSearchCV


def grid_search(model, grid: dict, dfs=dfs, k=4, to_print=True):
    """
    Description: Runs a GridSearchCV (scored by accuracy) for the model on every dataframe in dfs and
    returns the parameters of the best-scoring search.
    params:
    model: estimator object that works with GridSearchCV
    grid: parameter grid handed to GridSearchCV as param_grid (param name -> values to try)
    dfs: dict mapping a dataframe name to a dataframe that contains the target_var column. Several dataframes
         allow comparing differently preprocessed datasets (features, scaling, ...).
    k: value passed as cv to GridSearchCV
    to_print: when truthy, the mean/std test score of every parameter combination is printed
    returns: a dict object, the best_params_ of the dataframe whose search reached the highest best_score_
             ***i.e. best on ONE of the given datasets (how it does on the others shows up in evaluation_df)***
    raises: ValueError if dfs is empty
    """
    if not dfs:
        raise ValueError("grid_search needs at least one dataframe in `dfs`")

    best_params = None
    max_acc = -1
    for df_name, df_ in dfs.items():
        X = df_.loc[:, df_.columns != target_var]
        y = df_[target_var]

        # Not named `grid_search`: that would shadow this function inside its own body.
        # error_score=0 scores parameter combinations that fail to fit as 0 instead of raising.
        searcher = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=k, scoring='accuracy', error_score=0)
        grid_result = searcher.fit(X, y)

        print(f"\nFor {df_name}:")
        print("Best: %.3f using %s" % (grid_result.best_score_, grid_result.best_params_))

        # Keep the parameters of the best-scoring dataframe seen so far.
        if grid_result.best_score_ > max_acc:
            best_params = grid_result.best_params_
            max_acc = grid_result.best_score_

        if to_print:
            cv_results = grid_result.cv_results_
            rows = zip(cv_results['mean_test_score'], cv_results['std_test_score'], cv_results['params'])
            for mean_score, stdev, param in rows:
                print("%.3f (%.3f) with: %r" % (mean_score, stdev, param))

    return best_params

# Naive Bayes

### There are different kinds of naive Bayes classifiers. <br/>
All naive Bayes classifiers obey this equation: $$p(y \mid x_1, \dots, x_n) = \frac{p(x_1, \dots, x_n \mid y)\, p(y)}{p(x_1, \dots, x_n)}$$
* Multinomial - the most common one, assumes discrete features. Simple conditional probability, with the probability being simply the frequency of each feature in each class.  <br/>
* Gaussian - assumes the likelihood probabilities follow a Gaussian distribution.  <br/>
* Bernoulli - assumes a Bernoulli distribution of the features (binary).
* Categorical - assumes the features are categorical.

For the Bernoulli naive Bayes, we need to binarize the values, so the standardized df is the natural candidate (it has both negative and positive values, with 0 at approximately the 50th percentile), tried with different binarize thresholds (e.g. binarize=0 cuts it into approximately half 0s and half 1s).

In [None]:
stndrd_df.describe()

In [None]:
%%time

from sklearn.naive_bayes import BernoulliNB


# Thresholds at which BernoulliNB binarizes its input features.
binarize = [-0.5, 0, 0.5]
standard_dfs = {"Standadized_df": stndrd_df}
grid = {"binarize": binarize}
best_params = grid_search(BernoulliNB(), grid, dfs=dfs_w_dummy, to_print=False)


# ---- evaluate the model with the best threshold found above ----


model = BernoulliNB(**best_params)

evaluate(model, dfs=dfs_w_dummy, k=4)

It makes sense that BernoulliNB works best on the standardized df (with binarize=0 or close to it). <br/> However, the difference doesn't seem significant, which probably means that the already-binary features (those the model didn't have to binarize) have the most impact on the model.

Now, let's try with the dummy df (the binarize parameter in BernoulliNB turns features into 2 categories). <br/>
Note: BernoulliNB on the standardized df has only 2 categories, while the dummy df has 3 categories for the continuous values. It shouldn't behave too differently.<br/>
Let's see if it's better that way. <br/>

In [None]:
from sklearn.naive_bayes import GaussianNB


model = GaussianNB()

evaluate(model, k=4, dfs=dfs_w_dummy)

Note to self: no need for categorical NB because Bernoulli NB takes its place.
<br/>
Also, multinomial NB is not relevant because it's not suited for a mix of continuous and binary data. It needs discrete values.

# Logistic regression

In [None]:
%%time

from sklearn.linear_model import LogisticRegression


model = LogisticRegression()

c_values = (0.1, 1, 2, 5, 10, 20, 30, 40, 50)
n_jobs = [-1]  # kept at module level: later cells may read it

# One sub-grid per solver, because the solvers do not support the same
# penalties: newton-cg only handles l2, liblinear handles l1 and l2. A single
# cross-product grid would also try newton-cg with l1, which fails to fit.
# GridSearchCV accepts a list of dicts as param_grid.
grid = [
    dict(solver=['newton-cg'], penalty=['l2'], C=c_values),
    dict(solver=['liblinear'], penalty=['l1', 'l2'], C=c_values),
]

best_params = grid_search(model, grid, to_print=False)

In [None]:
%%time


model = LogisticRegression(**best_params)

evaluate(model, k=4, dfs=dfs_w_dummy)

# Support Vector Machines

In [None]:
%%time

from sklearn.svm import SVC


model = SVC()

# Search space for the support vector classifier.
kernel = ['poly', 'rbf', 'sigmoid']
C = np.linspace(0.1, 20, num=4)
degree = [2, 3]
coef0 = [1, 5]

grid = {"kernel": kernel, "C": C, "degree": degree, "coef0": coef0}

best_params = grid_search(model, grid, to_print=False)

In [None]:
%%time


model = SVC(**best_params)

evaluate(model, k=4)

# KNN

In [None]:
%%time

from sklearn.neighbors import KNeighborsClassifier


model = KNeighborsClassifier()

n_neighbors = range(3, 24, 3)
weights = ['uniform', 'distance']
metric = ['minkowski']  # remember that minkowski can be manhattan or euclidean (p=1, p=2, respectively)
p = [1, 2, 3, 4]  # for minkowski


# n_jobs is spelled out here instead of being read from a variable defined in
# another cell, so this cell also runs on its own. It is a fixed setting, not
# a tuned hyperparameter: the single value just ends up in best_params.
grid = dict(n_neighbors=n_neighbors, weights=weights, metric=metric, p=p, n_jobs=[-1])

best_params = grid_search(model, grid)

In [None]:
%%time


model = KNeighborsClassifier(**best_params)

evaluate(model, k=4)

# Decision Tree

In [None]:
%%time

from sklearn.tree import DecisionTreeClassifier


model = DecisionTreeClassifier()

# Search space: tree depth and minimum number of samples per leaf (5, 10, 15, 20).
max_depth = [3, 7, 9]
min_samples_leaf = np.linspace(5, 20, num=4).astype(int)

grid = {"max_depth": max_depth, "min_samples_leaf": min_samples_leaf}

best_params = grid_search(model, grid)

In [None]:
%%time


model = DecisionTreeClassifier(**best_params)

evaluate(model, k=4, dfs=dfs_w_dummy)

# Random Forest

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier


model = RandomForestClassifier()

# Search space: forest size, tree depth and minimum number of samples per leaf.
n_estimators = [10, 100, 500, 1000]
max_depth = [3, 7, 9]
min_samples_leaf = np.linspace(5, 20, num=4).astype(int)

grid = {
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "min_samples_leaf": min_samples_leaf,
}

best_params = grid_search(model, grid, to_print=False)

In [None]:
%%time


model = RandomForestClassifier(**best_params)

evaluate(model, k=4)

# AdaBoost

In [None]:
%%time

from sklearn.ensemble import AdaBoostClassifier


model = AdaBoostClassifier()

# Search space: number of boosting rounds and learning rate.
n_estimators = [10, 100, 500, 700, 1000]
learning_rate = [0.001, 0.01, 0.1]

grid = {"learning_rate": learning_rate, "n_estimators": n_estimators}

best_params = grid_search(model, grid)

In [None]:
%%time


model = AdaBoostClassifier(**best_params)

evaluate(model, k=4)

# Gradient Boosting

In [None]:
%%time

from sklearn.ensemble import GradientBoostingClassifier


model = GradientBoostingClassifier()

# Search space: boosting rounds, learning rate, row subsampling and tree depth.
n_estimators = [10, 100, 1000]
learning_rate = [0.001, 0.01, 0.1]
subsample = [0.5, 0.7, 1.0]
max_depth = [3, 7, 9]

grid = {
    "learning_rate": learning_rate,
    "n_estimators": n_estimators,
    "subsample": subsample,
    "max_depth": max_depth,
}

best_params = grid_search(model, grid, to_print=False)

In [None]:
%%time


model = GradientBoostingClassifier(**best_params)

evaluate(model, k=4)

# Neural Networks

To be continued...

# Evaluation

In [None]:
evaluation_df

In [None]:
# Best score (in %) per metric across all dataframes, reshaped to one row per
# model with the model name in a "model" column.
evaluation_df1 = (
    (evaluation_df.groupby(level=[1]).max() * 100)
    .T
    .reset_index(col_fill="model")
    .rename(columns={'index': 'model'})
)
evaluation_df1

In [None]:
y = evaluation_df1.avg_test_accuracy

fig, ax = plt.subplots(figsize=(20, 10))
plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
plot = sns.barplot(x=evaluation_df1.model, y=y)
plt.title("Test set/s Mean accuracy %".title())

# Write each bar's height just above it and paint the bar grey.
for bar in plot.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    plot.annotate(format(bar_top, '.2f'), (bar_centre, bar_top),
                  ha='center', va='center', size=15,
                  xytext=(0, 5), textcoords='offset points')
    bar.set_facecolor('#888888')

# Re-colour the bar of the best-scoring model.
pos = y.idxmax()
highlight = evaluation_df1.iloc[pos].model
ax.patches[pos].set_facecolor('#aa3333')

Note that these accuracies are the result of the best-performing hyperparameters and dataset variant for each model.

In [None]:
y = evaluation_df1.avg_test_f1

fig, ax = plt.subplots(figsize=(20, 10))
plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
plot = sns.barplot(x=evaluation_df1.model, y=y)
plt.title("Test set/s Mean f1 score %".title())

# Write each bar's height (the f1 score) just above it and paint the bar grey.
for bar in plot.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    plot.annotate(format(bar_top, '.2f'), (bar_centre, bar_top),
                  ha='center', va='center', size=15,
                  xytext=(0, 5), textcoords='offset points')
    bar.set_facecolor('#888888')

# Re-colour the bar of the best-scoring model.
pos = y.idxmax()
highlight = evaluation_df1.iloc[pos].model
ax.patches[pos].set_facecolor('#aa3333')

## And the winner is ... (Drumroll...) ...

# Logistic Regression!