# End-to-end Heart Disease Prediction Project

[Front-end](https://predict-heart-diseases.herokuapp.com/)

[GitHub repo](https://github.com/MichaelBryantDS/heart-disease-pred)

**Import libraries and data**

In [None]:
#import libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
#suppress warnings: the "ignore" action is installed without a category or module filter,
#so every warning raised after this cell is hidden
warnings.filterwarnings("ignore")

In [None]:
#import data: read the heart disease CSV into a DataFrame
#(the path is relative, so it is resolved against the current working directory)
heart_data = pd.read_csv('../input/heart-disease-uci/heart.csv')

# EDA

In [None]:
#look at formatting of entries by previewing the first rows of the dataset
heart_data.head()

In [None]:
#display non-null counts and data types for every column
heart_data.info()

In [None]:
#null values for ca: rows where ca == 4, the value this notebook treats as a missing entry
heart_data[heart_data.ca==4]

In [None]:
#null percentage for ca: rows holding the placeholder value 4 divided by all rows, times 100
ca_null_count = len(heart_data.loc[heart_data.ca==4])
ca_row_count = len(heart_data.ca)
print('Percentage of ca null: {}%'.format(ca_null_count/ca_row_count*100))

In [None]:
#null values for thal: rows where thal == 0, the value this notebook treats as a missing entry
heart_data[heart_data.thal==0]

In [None]:
#null percentage for thal: rows holding the placeholder value 0 divided by all rows, times 100
thal_null_count = len(heart_data.loc[heart_data.thal==0])
thal_row_count = len(heart_data.thal)
print('Percentage of thal null: {}%'.format(thal_null_count/thal_row_count*100))

In [None]:
#numerical features (kept as a list: it is used later as a column selector)
numerical = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

#categorical features, with the target listed last (kept as a mutable list)
categorical = [
    'sex', 'cp', 'fbs', 'restecg',
    'exang', 'slope', 'ca', 'thal',
    'target',
]

**Data distribution and outliers**

In [None]:
#look at distribution of data through per-column summary statistics
heart_data.describe()

In [None]:
#look at number of outliers greater than or equal to 3 std from mean
#the element-wise z-score mask is two-dimensional; it is collapsed to one flag per row so that
#a patient with several outlying columns is listed once instead of once per outlying value
heart_data[(np.abs(stats.zscore(heart_data)) >= 3).any(axis=1)]

In [None]:
#look at number of outliers greater than or equal to 4 std from mean
#the element-wise z-score mask is two-dimensional; it is collapsed to one flag per row so that
#a patient with several outlying columns is listed once instead of once per outlying value
heart_data[(np.abs(stats.zscore(heart_data)) >= 4).any(axis=1)]

In [None]:
#an outlier who is a 67 year old female with a cholesterol greater than six std from mean
#the element-wise z-score mask is two-dimensional; it is collapsed to one flag per row so that
#each matching patient is listed exactly once
heart_data[(np.abs(stats.zscore(heart_data)) >= 6).any(axis=1)]

In [None]:
#oldpeak outlier visualized with a horizontal box plot
sns.boxplot(x=heart_data['oldpeak'], palette='Set1')
#label the x axis explicitly with the column name
plt.xlabel('oldpeak')

In [None]:
#cholesterol outlier visualized with a horizontal box plot
sns.boxplot(x=heart_data['chol'], palette='Set1')
#label the x axis explicitly with the column name
plt.xlabel('chol')

In [None]:
#look at numerical data distribution: one histogram per numerical column
for column_name in numerical:
    column_values = heart_data[column_name]
    plt.hist(column_values, color='steelblue', edgecolor='black')
    plt.xticks()
    plt.xlabel(column_name)
    plt.ylabel('number of people')
    plt.show()

In [None]:
#look at categorical data distribution: one bar chart of level counts per categorical column
for column_name in categorical:
    level_counts = heart_data[column_name].value_counts()
    sns.barplot(x=level_counts.index, y=level_counts, palette='Set1', edgecolor='black')
    plt.xlabel(column_name)
    plt.ylabel('number of people')
    plt.show()

**Finding correlations with a heat map and visualizations**

In [None]:
#heat map to see numerical correlations; pearson measures the strength of the linear relationship
#between two numerical variables (a rank-based method would be needed for monotonic relationships)
plt.figure(figsize=(16, 6))
#fix the colour scale to the full [-1, 1] correlation range and print each coefficient in its cell
sns.heatmap(heart_data[numerical].corr(method='pearson'), vmin=-1, vmax=1, annot=True,cmap='coolwarm')
plt.title('Pearson Correlation Heatmap for Numerical Variables', fontdict={'fontsize':12}, pad=12);

In [None]:
#look at how target is distributed among variables: pairwise plots of all columns coloured by target
sns.pairplot(heart_data,hue='target',palette='Set1')
#NOTE(review): this adds a pyplot legend to the current axes on top of whatever legend the pair plot draws - confirm it is wanted
plt.legend()
plt.show()

In [None]:
#thalach vs age: scatter plot with a fitted regression line
sns.lmplot(x='age', y='thalach', data=heart_data, palette='Set1')

#settings to display all markers
#read the current tick positions, then widen the x limits by half a tick step on each side:
#(3*first - second)/2 equals first - step/2 and (3*last - previous)/2 equals last + step/2
xticks, xticklabels = plt.xticks()
xmin = (3*xticks[0] - xticks[1])/2.
xmax = (3*xticks[-1] - xticks[-2])/2.
plt.xlim(xmin, xmax)
#restore the original tick positions after changing the limits
plt.xticks(xticks)

plt.show()

In [None]:
#thalach vs age with target hue: one regression line per target class
sns.lmplot(x='age', y='thalach', hue='target', data=heart_data,palette='Set1')

#settings to display all markers
#widen the x limits by half a tick step on each side of the outermost ticks:
#(3*first - second)/2 equals first - step/2 and (3*last - previous)/2 equals last + step/2
#(the lower bound previously used a factor of 8, which pushed the left limit into or past the data)
xticks, xticklabels = plt.xticks()
xmin = (3*xticks[0] - xticks[1])/2.
xmax = (3*xticks[-1] - xticks[-2])/2.
plt.xlim(xmin, xmax)
#restore the original tick positions after changing the limits
plt.xticks(xticks)

plt.show()

In [None]:
#age vs target: distribution of age within each target class
sns.violinplot(x='target', y='age', data=heart_data, palette='Set1')
plt.show()

In [None]:
#cp distribution with exang hue: stacked counts per cp value, split by exang
sns.histplot(discrete=True,x="cp", hue="exang", data=heart_data, stat="count", multiple="stack",palette='Set1')

plt.ylabel('number of people')
plt.show()

In [None]:
#cp distribution with target hue: stacked counts per cp value, split by target
sns.histplot(discrete=True, x="cp", hue="target", data=heart_data, stat="count", multiple="stack",palette='Set1')

plt.ylabel('number of people')
#show one tick per cp value
plt.xticks(ticks=[0,1,2,3])
plt.show()

In [None]:
#thalach vs exang: distribution of thalach within each exang value
sns.violinplot(x='exang', y='thalach', data=heart_data, palette='Set1')
plt.show()

In [None]:
#thalach vs exang with target hue: violin plots further split by target class
sns.violinplot(x='exang', y='thalach', data=heart_data, palette='Set1', hue='target')
plt.show()

In [None]:
#thalach vs exang with target hue, drawn as a swarm plot so every patient is an individual marker
sns.swarmplot(y=heart_data['thalach'],
              x=heart_data['exang'], hue=heart_data['target'],palette='Set1')

plt.show()

In [None]:
#thalach vs target: distribution of thalach within each target class
sns.violinplot(x='target', y='thalach', data=heart_data, palette='Set1')
plt.show()

In [None]:
#exang distribution with target hue: stacked counts per exang value, split by target
sns.histplot(discrete=True, x="exang", hue="target", data=heart_data, stat="count", multiple="stack",palette='Set1')
plt.ylabel('number of people')
#show one tick per exang value
plt.xticks(ticks=[0,1])
plt.show()

In [None]:
#oldpeak vs slope: distribution of oldpeak within each slope value
sns.violinplot(x='slope', y='oldpeak', data=heart_data, palette='Set1')
#call show() like every other plotting cell (the bare attribute reference did not render the figure)
plt.show()

In [None]:
#oldpeak vs target: distribution of oldpeak within each target class
sns.violinplot(x='target', y='oldpeak', data=heart_data, palette='Set1')
plt.show()

In [None]:
#distribution of slope with target hue: stacked counts per slope value, split by target
sns.histplot(discrete=True, x="slope", hue="target", data=heart_data, stat="count", multiple="stack",palette='Set1')

plt.ylabel('number of people')
#show one tick per slope value
plt.xticks(ticks=[0,1,2])
plt.show()

In [None]:
#distribution of ca with target hue: stacked counts per ca value, split by target
sns.histplot(discrete=True, x="ca", hue="target", data=heart_data, stat="count", multiple="stack",palette='Set1')

plt.ylabel('number of people')
#show one tick per ca value (4 is the value treated as null earlier in the notebook)
plt.xticks(ticks=[0,1,2,3,4])
plt.show()

In [None]:
#distribution of thal with target hue: stacked counts per thal value, split by target
sns.histplot(discrete=True, x="thal", hue="target", data=heart_data, stat="count", multiple="stack",palette='Set1')

plt.ylabel('number of people')
#show one tick per thal value (0 is the value treated as null earlier in the notebook)
plt.xticks(ticks=[0,1,2,3])
plt.show()

**Risk factors feature**

In [None]:
#creating sub-frames with the patients that meet the criteria for each risk factor
#age/sex: sex 0 from age 50, sex 1 from age 45 (the two cases are combined with OR)
sex0_age_mask = (heart_data.sex == 0) & (heart_data.age >= 50)
sex1_age_mask = (heart_data.sex == 1) & (heart_data.age >= 45)
age_sex_risk = heart_data.loc[sex0_age_mask | sex1_age_mask]

#resting blood pressure of 130 or more
high_blood_pressure_risk = heart_data.loc[heart_data.trestbps >= 130]

#cholesterol of 240 or more
high_cholesterol_risk = heart_data.loc[heart_data.chol >= 240]

#fasting blood sugar flag set
diabetes_risk = heart_data.loc[heart_data.fbs == 1]

In [None]:
#creating a new column called 'risk factors' which counts the number of risk factors each patient has
#every risk group contributes the row labels of its patients, so a label appears once per risk factor met
risk_factors_indices = np.concatenate((age_sex_risk.index,
                                       high_blood_pressure_risk.index,
                                       high_cholesterol_risk.index,
                                       diabetes_risk.index))

#count occurrences per row label; minlength keeps one entry per patient even when the last
#patients have no risk factor at all (bincount otherwise stops at the largest label seen)
#assumes heart_data has the default 0..n-1 integer index produced when the CSV was read
risk_factor_counts = np.bincount(risk_factors_indices, minlength=len(heart_data))

risk_factors = pd.DataFrame(risk_factor_counts)

risk_factors['risk factors'] = risk_factor_counts

#target is aligned on the row index, so each count sits next to its patient's label
risk_factors['target'] = heart_data['target'].copy()

In [None]:
#distribution of risk factors with target hue: stacked counts per number of risk factors, split by target
sns.histplot(discrete=True, x="risk factors", hue="target", data=risk_factors, stat="count", multiple="stack",palette='Set1')

plt.ylabel('number of people')
#four risk factors are defined, so a patient has between 0 and 4 of them
plt.xticks(ticks=[0,1,2,3,4])
plt.show()

**Mutual information**

In [None]:
#remove target variable from categorical list
#guarded so that re-running this cell does not raise ValueError once 'target' is already gone
if 'target' in categorical:
    categorical.remove('target')

#change dtype of categorical features to object
heart_data[categorical]=heart_data[categorical].astype('object')

#copy of variables and target: X keeps the features, y receives the popped target column
X = heart_data.copy()
y = X.pop('target')

In [None]:
#check column dtypes of the feature frame after the categorical columns were cast to object
X.info()

In [None]:
#separate copy of the features for mutual information, so the encoding below leaves X untouched
X_mi = X.copy()

In [None]:
#label encoding for categorical variables: replace each object column by its integer codes
for object_column in X_mi.select_dtypes("object"):
    X_mi[object_column] = X_mi[object_column].factorize()[0]

#all discrete features have int dtypes, so flag the columns whose dtype equals int
#NOTE(review): comparing against the builtin int depends on the platform's default integer width - confirm on non-64-bit-int platforms
discrete_features = X_mi.dtypes == int

In [None]:
#display the boolean flag computed for each feature column
discrete_features

In [None]:
#some continuous variables also have int dtypes, so the numerical columns are explicitly marked as not discrete
discrete_features[numerical] = False

In [None]:
#use classification since the target variable is discrete
from sklearn.feature_selection import mutual_info_classif

#define a function to produce mutual information scores
def make_mi_scores(X_mi, y, discrete_features):
    """Return the mutual information of every feature with the target, highest first.

    X_mi is the encoded feature frame, y the target and discrete_features the
    per-column flag forwarded to mutual_info_classif. The result is a Series
    named "MI Scores" indexed by the column names of X_mi.
    """
    raw_scores = mutual_info_classif(X_mi, y, discrete_features=discrete_features)
    labelled_scores = pd.Series(raw_scores, name="MI Scores", index=X_mi.columns)
    return labelled_scores.sort_values(ascending=False)

#compute mutual information scores for the encoded features and display them (sorted, highest first)
mi_scores = make_mi_scores(X_mi, y, discrete_features)
mi_scores

In [None]:
#define a function to plot mutual information scores
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the scores on the current figure, largest bar on top.

    scores is a Series whose index holds the feature names used as tick labels.
    """
    ordered = scores.sort_values(ascending=True)
    bar_positions = np.arange(len(ordered))
    feature_labels = list(ordered.index)
    plt.barh(bar_positions, ordered, color='steelblue', edgecolor='black')
    plt.yticks(bar_positions, feature_labels)
    plt.title("Mutual Information Scores")

#plot the scores on a new 8x5 inch figure at 100 dpi
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores)

In [None]:
#plot the label-encoded thal values against the target
sns.violinplot(y=X_mi.thal, x=y, palette='Set1');

# Preparing data for ML

In [None]:
#import libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
#build the model matrix: numerical columns as they are plus one-hot encoded categorical columns
#(drop_first removes the first level of every categorical feature), then keep the feature names
X = pd.concat([X[numerical],pd.get_dummies(X[categorical], drop_first=True)],axis=1)
feature_names = X.columns

# train/test split with stratify making sure classes are evenly represented across splits
# 75% of the rows go to training; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=0.75, random_state=1)

#define scaler
scaler=MinMaxScaler()

#apply preprocessing to split data with scaler
#the scaler is fitted on the training rows only and then reused, unchanged, on the test rows
X_train[numerical] = scaler.fit_transform(X_train[numerical])
X_test[numerical] = scaler.transform(X_test[numerical])

# Modeling

In [None]:
#import ml algorithms
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from numpy import mean, std
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

**Baseline**

In [None]:
#naive Bayes with five-fold cross validation: print mean and std of the fold scores
gnb = GaussianNB()
cv = cross_val_score(gnb, X_train, y_train, cv=5)
print(cv.mean(), '+/-', cv.std())

In [None]:
#logistic regression with five-fold cross validation: print mean and std of the fold scores
lr = LogisticRegression(max_iter=2000)
cv = cross_val_score(lr, X_train, y_train, cv=5)
print(cv.mean(), '+/-', cv.std())

In [None]:
#decision tree with five-fold cross validation: print mean and std of the fold scores
dt = tree.DecisionTreeClassifier(random_state=1)
cv = cross_val_score(dt, X_train, y_train, cv=5)
print(cv.mean(), '+/-', cv.std())

In [None]:
#k-nearest neighbors classifier with five-fold cross validation: print mean and std of the fold scores
knn = KNeighborsClassifier()
cv = cross_val_score(knn, X_train, y_train, cv=5)
print(cv.mean(), '+/-', cv.std())

In [None]:
#random forest classifier with five-fold cross validation: print mean and std of the fold scores
rf = RandomForestClassifier(random_state=1)
cv = cross_val_score(rf, X_train, y_train, cv=5)
print(cv.mean(), '+/-', cv.std())

In [None]:
#support vector classifier with five-fold cross validation: print mean and std of the fold scores
svc = SVC(probability=True)
cv = cross_val_score(svc, X_train, y_train, cv=5)
print(cv.mean(), '+/-', cv.std())

In [None]:
#xgboost classifier with five-fold cross validation: print mean and std of the fold scores
xgb = XGBClassifier(use_label_encoder=False, eval_metric='error', random_state=1)
cv = cross_val_score(xgb, X_train, y_train, cv=5)
print(cv.mean(), '+/-', cv.std())

**Hyperparameter tuning**

In [None]:
#ml algorithm tuner
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV 

#performance reporting function
def clf_performance(classifier, model_name):
    """Print a short report for a fitted hyperparameter search.

    classifier must expose best_score_, best_index_, best_params_ and a
    cv_results_ mapping with a 'std_test_score' entry; model_name is printed
    as the report heading. Nothing is returned.
    """
    print(model_name)
    #std of the fold scores for the best parameter combination
    best_std = classifier.cv_results_['std_test_score'][classifier.best_index_]
    score_line = 'Best Score: {} +/- {}'.format(str(classifier.best_score_), str(best_std))
    print(score_line)
    params_line = 'Best Parameters: ' + str(classifier.best_params_)
    print(params_line)

In [None]:
#naive Bayes performance tuner
#grid search over 100 log-spaced var_smoothing values between 1 and 1e-10, scored with 5-fold CV
gnb = GaussianNB()
param_grid = {
              'var_smoothing': np.logspace(0,-10, num=100)
             }
#named clf_gnb like the other tuners (clf_dt, clf_knn, ...) instead of reusing the
#logistic regression name clf_lr
clf_gnb = GridSearchCV(gnb, param_grid = param_grid, cv = 5, n_jobs = -1)
best_clf_gnb = clf_gnb.fit(X_train,y_train)
clf_performance(best_clf_gnb,'Naive Bayes')

In [None]:
#logistic regression performance tuner
#grid search over C from 0.5 up to (but excluding) 1.5 in steps of 0.1, with a fixed iteration cap
lr = LogisticRegression()
param_grid = {
    'max_iter': [15000],
    'C': np.arange(.5, 1.5, .1),
}
clf_lr = GridSearchCV(lr, param_grid=param_grid, cv=5, n_jobs=-1)
best_clf_lr = clf_lr.fit(X_train, y_train)
clf_performance(best_clf_lr, 'Logistic Regression')

In [None]:
#decision tree performance tuner
#grid search over both split criteria and tree depths 1 to 14
dt = tree.DecisionTreeClassifier(random_state=1)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(1, 15),
}
clf_dt = GridSearchCV(dt, param_grid=param_grid, cv=5, n_jobs=-1)
best_clf_dt = clf_dt.fit(X_train, y_train)
clf_performance(best_clf_dt, 'Decision Tree')

In [None]:
#k-nearest neighbors classifier performance tuner
#grid search over 15 to 19 neighbours, both weighting schemes, four search algorithms and four p values
knn = KNeighborsClassifier()
param_grid = {
    'n_neighbors': np.arange(15, 20, 1),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [2, 3, 4, 5],
}
clf_knn = GridSearchCV(knn, param_grid=param_grid, cv=5, n_jobs=-1)
best_clf_knn = clf_knn.fit(X_train, y_train)
clf_performance(best_clf_knn, 'K-Nearest Neighbors Classifier')

In [None]:
#random forest performance tuner
#grid search over bootstrap on/off for 310 depth-1 trees, scored with 5-fold CV
rf = RandomForestClassifier(random_state = 1)
param_grid =  {
                'n_estimators': [310], 
                'bootstrap': [True,False], #bagging (T) vs. pasting (F)
                'max_depth': [1],
                #'auto' meant 'sqrt' for classifiers, so listing both fitted the same model twice;
                #'auto' is also rejected by newer scikit-learn releases
                'max_features': ['sqrt'],
                #'min_samples_leaf': [1],
                #'min_samples_split': [1]
              }
clf_rf_rnd = GridSearchCV(rf, param_grid = param_grid, cv = 5, n_jobs = -1)
best_clf_rf_rnd = clf_rf_rnd.fit(X_train,y_train)
clf_performance(best_clf_rf_rnd,'Random Forest')

In [None]:
#support vector classifier performance tuner
#grid search over five gamma values and C from 70 to 84 for the rbf kernel
svc = SVC(probability=True, random_state=1)
param_grid = {
    'kernel': ['rbf'],
    'gamma': [1, 1e-1, 1e-2, 1e-3, 1e-4],
    'C': np.arange(70, 85, 1),
}
clf_svc = GridSearchCV(svc, param_grid=param_grid, cv=5, n_jobs=-1)
best_clf_svc = clf_svc.fit(X_train, y_train)
clf_performance(best_clf_svc, 'Support Vector Classifier')

In [None]:
#xgboost classifier performance tuner
#the grid holds a single combination, so the search only cross-validates that one setting
xgb = XGBClassifier(use_label_encoder=False, eval_metric='error', random_state=1)
param_grid = {
    'max_depth': [9],
    'n_estimators': [37],
    'learning_rate': [1.2],
}
clf_xgb = GridSearchCV(xgb, param_grid=param_grid, cv=5, n_jobs=-1)
best_clf_xgb = clf_xgb.fit(X_train, y_train)
clf_performance(best_clf_xgb, 'XGBoost Classifier')

# Stacking Classifier

**Baseline**

In [None]:
#stacking def
def get_stacking():
    """Return an unfitted stacking ensemble of the seven default base models.

    A logistic regression is the final estimator and the stacker uses cv=5.
    """
    # base models, all with default hyperparameters
    base_learners = [
        ('gnb', GaussianNB()),
        ('dt', tree.DecisionTreeClassifier(random_state=1)),
        ('lr', LogisticRegression()),
        ('knn', KNeighborsClassifier()),
        ('rf', RandomForestClassifier(random_state=1)),
        ('svc', SVC(probability=True, random_state=1)),
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='error', random_state=1)),
    ]
    # meta learner that combines the base model outputs
    meta_learner = LogisticRegression()
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)

#models def
def get_models():
    """Return a dict of the seven default models plus the stacking ensemble, keyed by short name."""
    return {
        'gnb': GaussianNB(),
        'dt': tree.DecisionTreeClassifier(random_state=1),
        'lr': LogisticRegression(),
        'knn': KNeighborsClassifier(),
        'rf': RandomForestClassifier(random_state=1),
        'svc': SVC(probability=True, random_state=1),
        'xgb': XGBClassifier(use_label_encoder=False, eval_metric='error', random_state=1),
        'stacking': get_stacking(),
    }

#cross validate models and print results
#each model is scored with 5-fold accuracy on the training split; scores and names are kept in parallel lists
models = get_models()
results, names = [], []
print('Mean accuracy:')
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=5, n_jobs=-1)
    results += [scores]
    names += [name]
    print('>%s %.3f +/- %.3f' % (name, mean(scores), std(scores)))

**Hyperparameter tuning**

In [None]:
#stacking def
def get_stacking():
    """Return an unfitted stacking ensemble of the tuned naive Bayes and SVC models.

    A logistic regression is the final estimator and the stacker uses cv=5.
    The commented entries are the other tuned models, currently left out.
    """
    base_learners = [
        ('gnb', GaussianNB(var_smoothing=0.19630406500402708)),
        #('dt', tree.DecisionTreeClassifier(random_state = 1, criterion= 'gini', max_depth= 1)),
        #('lr', LogisticRegression(C= 0.9999999999999999, max_iter= 15000)),
        #('knn', KNeighborsClassifier(algorithm= 'auto', n_neighbors= 19, p= 4, weights= 'uniform')),
        #('rf', RandomForestClassifier(random_state = 1, bootstrap= False, max_depth= 1, max_features= 'auto', n_estimators= 310)),
        ('svc', SVC(probability=True, random_state=1, C=70, gamma=0.001, kernel='rbf')),
        #('xgb', XGBClassifier(use_label_encoder=False, eval_metric='error', random_state =1, learning_rate= 1.2, max_depth= 9, n_estimators= 37)),
    ]
    # meta learner that combines the base model outputs
    meta_learner = LogisticRegression()
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)

#models def
def get_models():
    """Return a dict of the tuned naive Bayes and SVC models plus the stacking ensemble.

    The commented entries are the other tuned models, currently left out.
    """
    return {
        'gnb': GaussianNB(var_smoothing=0.19630406500402708),
        #'dt': tree.DecisionTreeClassifier(random_state = 1, criterion= 'gini', max_depth= 1),
        #'lr': LogisticRegression(C= 0.9999999999999999, max_iter= 15000),
        #'knn': KNeighborsClassifier(algorithm= 'auto', n_neighbors= 19, p= 4, weights= 'uniform'),
        #'rf': RandomForestClassifier(random_state = 1, bootstrap= False, max_depth= 1, max_features= 'auto', n_estimators= 310),
        'svc': SVC(probability=True, random_state=1, C=70, gamma=0.001, kernel='rbf'),
        #'xgb': XGBClassifier(use_label_encoder=False, eval_metric='error', random_state =1, learning_rate= 1.2, max_depth= 9, n_estimators= 37),
        'stacking': get_stacking(),
    }

#cross validate models and print results
#each model is scored with 5-fold accuracy on the training split; scores and names are kept in parallel lists
models = get_models()
results, names = [], []
print('Mean accuracy:')
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=5, n_jobs=-1)
    results += [scores]
    names += [name]
    print('>%s %.3f +/- %.3f' % (name, mean(scores), std(scores)))

# Hard VotingClassifier

**Baseline**

In [None]:
#hard voting def
def get_hard_voting():
    """Return an unfitted hard-voting ensemble of the seven default models."""
    # voters, all with default hyperparameters
    voters = [
        ('gnb', GaussianNB()),
        ('dt', tree.DecisionTreeClassifier(random_state=1)),
        ('lr', LogisticRegression()),
        ('knn', KNeighborsClassifier()),
        ('rf', RandomForestClassifier(random_state=1)),
        ('svc', SVC(probability=True, random_state=1)),
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='error', random_state=1)),
    ]
    # combine the voters with voting='hard'
    return VotingClassifier(estimators=voters, voting='hard')

#models def
def get_models():
    """Return a dict of the seven default models plus the hard-voting ensemble, keyed by short name."""
    return {
        'gnb': GaussianNB(),
        'dt': tree.DecisionTreeClassifier(random_state=1),
        'lr': LogisticRegression(),
        'knn': KNeighborsClassifier(),
        'rf': RandomForestClassifier(random_state=1),
        'svc': SVC(probability=True, random_state=1),
        'xgb': XGBClassifier(use_label_encoder=False, eval_metric='error', random_state=1),
        'hv': get_hard_voting(),
    }

#cross validate models and print results
#each model is scored with 5-fold accuracy on the training split; scores and names are kept in parallel lists
models = get_models()
results, names = [], []
print('Mean accuracy:')
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=5, n_jobs=-1)
    results += [scores]
    names += [name]
    print('>%s %.3f +/- %.3f' % (name, mean(scores), std(scores)))

**Hyperparameter tuning**

In [None]:
#hard voting def
def get_hard_voting():
    """Return an unfitted hard-voting ensemble of the tuned naive Bayes, logistic regression and SVC models.

    The commented entries are the other tuned models, currently left out.
    """
    voters = [
        ('gnb', GaussianNB(var_smoothing=0.19630406500402708)),
        #('dt', tree.DecisionTreeClassifier(random_state = 1, criterion= 'gini', max_depth= 1)),
        ('lr', LogisticRegression(C=0.9999999999999999, max_iter=15000)),
        #('knn', KNeighborsClassifier(algorithm= 'auto', n_neighbors= 19, p= 4, weights= 'uniform')),
        #('rf', RandomForestClassifier(random_state = 1, bootstrap= False, max_depth= 1, max_features= 'auto', n_estimators= 310)),
        ('svc', SVC(probability=True, random_state=1, C=70, gamma=0.001, kernel='rbf')),
        #('xgb', XGBClassifier(use_label_encoder=False, eval_metric='error', random_state =1, learning_rate= 1.2, max_depth= 9, n_estimators= 37)),
    ]
    return VotingClassifier(estimators=voters, voting='hard')

#models def
def get_models():
    """Return a dict of the tuned naive Bayes, logistic regression and SVC models plus the hard-voting ensemble.

    The commented entries are the other tuned models, currently left out.
    """
    return {
        'gnb': GaussianNB(var_smoothing=0.19630406500402708),
        #'dt': tree.DecisionTreeClassifier(random_state = 1, criterion= 'gini', max_depth= 1),
        'lr': LogisticRegression(C=0.9999999999999999, max_iter=15000),
        #'knn': KNeighborsClassifier(algorithm= 'auto', n_neighbors= 19, p= 4, weights= 'uniform'),
        #'rf': RandomForestClassifier(random_state = 1, bootstrap= False, max_depth= 1, max_features= 'auto', n_estimators= 310),
        'svc': SVC(probability=True, random_state=1, C=70, gamma=0.001, kernel='rbf'),
        #'xgb': XGBClassifier(use_label_encoder=False, eval_metric='error', random_state =1, learning_rate= 1.2, max_depth= 9, n_estimators= 37),
        'hv': get_hard_voting(),
    }

#cross validate models and print results
#each model is scored with 5-fold accuracy on the training split; scores and names are kept in parallel lists
models = get_models()
results, names = [], []
print('Mean accuracy:')
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=5, n_jobs=-1)
    results += [scores]
    names += [name]
    print('>%s %.3f +/- %.3f' % (name, mean(scores), std(scores)))

# Soft VotingClassifier

**Baseline**

In [None]:
#soft voting def
def get_soft_voting():
    """Return an unfitted soft-voting ensemble of the seven default models."""
    # voters, all with default hyperparameters
    voters = [
        ('gnb', GaussianNB()),
        ('dt', tree.DecisionTreeClassifier(random_state=1)),
        ('lr', LogisticRegression()),
        ('knn', KNeighborsClassifier()),
        ('rf', RandomForestClassifier(random_state=1)),
        ('svc', SVC(probability=True, random_state=1)),
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='error', random_state=1)),
    ]
    # combine the voters with voting='soft'
    return VotingClassifier(estimators=voters, voting='soft')

#models def
def get_models():
    """Return a dict of the seven default models plus the soft-voting ensemble, keyed by short name."""
    return {
        'gnb': GaussianNB(),
        'dt': tree.DecisionTreeClassifier(random_state=1),
        'lr': LogisticRegression(),
        'knn': KNeighborsClassifier(),
        'rf': RandomForestClassifier(random_state=1),
        'svc': SVC(probability=True, random_state=1),
        'xgb': XGBClassifier(use_label_encoder=False, eval_metric='error', random_state=1),
        'sv': get_soft_voting(),
    }

#cross validate models and print results
#each model is scored with 5-fold accuracy on the training split; scores and names are kept in parallel lists
models = get_models()
results, names = [], []
print('Mean accuracy:')
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=5, n_jobs=-1)
    results += [scores]
    names += [name]
    print('>%s %.3f +/- %.3f' % (name, mean(scores), std(scores)))

**Hyperparameter tuning**

In [None]:
#soft voting def
def get_soft_voting():
    """Return an unfitted soft-voting ensemble of the tuned naive Bayes and SVC models.

    The commented entries are the other tuned models, currently left out.
    """
    voters = [
        ('gnb', GaussianNB(var_smoothing=0.19630406500402708)),
        #('dt', tree.DecisionTreeClassifier(random_state = 1, criterion= 'gini', max_depth= 1)),
        #('lr', LogisticRegression(C= 0.9999999999999999, max_iter= 15000)),
        #('knn', KNeighborsClassifier(algorithm= 'auto', n_neighbors= 19, p= 4, weights= 'uniform')),
        #('rf', RandomForestClassifier(random_state = 1, bootstrap= False, max_depth= 1, max_features= 'auto', n_estimators= 310)),
        ('svc', SVC(probability=True, random_state=1, C=70, gamma=0.001, kernel='rbf')),
        #('xgb', XGBClassifier(use_label_encoder=False, eval_metric='error', random_state =1, learning_rate= 1.2, max_depth= 9, n_estimators= 37)),
    ]
    # combine the voters with voting='soft'
    return VotingClassifier(estimators=voters, voting='soft')

#models def
def get_models():
    """Return a dict of the tuned naive Bayes and SVC models plus the soft-voting ensemble.

    The commented entries are the other tuned models, currently left out.
    """
    return {
        'gnb': GaussianNB(var_smoothing=0.19630406500402708),
        #'dt': tree.DecisionTreeClassifier(random_state = 1, criterion= 'gini', max_depth= 1),
        #'lr': LogisticRegression(C= 0.9999999999999999, max_iter= 15000),
        #'knn': KNeighborsClassifier(algorithm= 'auto', n_neighbors= 19, p= 4, weights= 'uniform'),
        #'rf': RandomForestClassifier(random_state = 1, bootstrap= False, max_depth= 1, max_features= 'auto', n_estimators= 310),
        'svc': SVC(probability=True, random_state=1, C=70, gamma=0.001, kernel='rbf'),
        #'xgb': XGBClassifier(use_label_encoder=False, eval_metric='error', random_state =1, learning_rate= 1.2, max_depth= 9, n_estimators= 37),
        'sv': get_soft_voting(),
    }

#cross validate models and print results
#each model is scored with 5-fold accuracy on the training split; scores and names are kept in parallel lists
models = get_models()
results, names = [], []
print('Mean accuracy:')
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=5, n_jobs=-1)
    results += [scores]
    names += [name]
    print('>%s %.3f +/- %.3f' % (name, mean(scores), std(scores)))

# BaggingClassifier

**Baseline**

In [None]:
#baggingclassifier baseline: bagging (bootstrap=True) of default random forests
#the wrapped estimator is passed positionally because its keyword was renamed from
#base_estimator to estimator in newer scikit-learn releases; the first positional slot works with both
bagging_model = BaggingClassifier(RandomForestClassifier(),
                                     bootstrap=True,
                                     random_state=1,
                                     n_jobs=-1
                                     )

bagging_model.fit(X_train , y_train)

#five-fold cross validation on the training split: print mean and std of the fold scores
cv = cross_val_score(bagging_model, X_train, y_train, cv=5)
print(mean(cv), '+/-', std(cv))

**Hyperparameter tuning**

In [None]:
#baggingclassifier tuning
#tuned bagging: 20 default random forests, each fitted on max_samples=50 rows drawn with replacement
#the wrapped estimator is passed positionally because its keyword was renamed from
#base_estimator to estimator in newer scikit-learn releases; the first positional slot works with both
bagging_model = BaggingClassifier(RandomForestClassifier(),
                                     bootstrap=True,
                                     random_state=1,
                                     n_estimators=20,
                                     max_samples=50,
                                     n_jobs=-1,
                                     )

bagging_model.fit(X_train , y_train)

#five-fold cross validation on the training split: print mean and std of the fold scores
cv = cross_val_score(bagging_model, X_train, y_train, cv=5)
print(mean(cv), '+/-', std(cv))

# BaggingClassifier (Pasting)

**Baseline**

In [None]:
#baggingclassifier (pasting) baseline: bootstrap=False draws the samples without replacement
#the wrapped estimator is passed positionally because its keyword was renamed from
#base_estimator to estimator in newer scikit-learn releases; the first positional slot works with both
pasting_model = BaggingClassifier(RandomForestClassifier(),
                                     bootstrap=False,
                                     random_state=1,
                                     n_jobs=-1
                                     )

pasting_model.fit(X_train , y_train)

#five-fold cross validation on the training split: print mean and std of the fold scores
cv = cross_val_score(pasting_model, X_train, y_train, cv=5)
print(mean(cv), '+/-', std(cv))

**Hyperparameter tuning**

In [None]:
#baggingclassifier (pasting) tuner: 20 tuned random forests, each fitted on max_samples=50 rows drawn without replacement
#the wrapped estimator is passed positionally because its keyword was renamed from
#base_estimator to estimator in newer scikit-learn releases; max_features uses 'sqrt',
#which is what 'auto' meant for classifiers and is accepted by old and new releases alike
pasting_model = BaggingClassifier(RandomForestClassifier(random_state = 1, bootstrap=True,max_depth=7, max_features='sqrt', n_estimators=340),
                                     bootstrap=False,
                                     random_state=1,
                                     n_estimators=20,
                                     max_samples=50,
                                     n_jobs=-1,
                                     )

pasting_model.fit(X_train , y_train)

#five-fold cross validation on the training split: print mean and std of the fold scores
cv = cross_val_score(pasting_model, X_train, y_train, cv=5)
print(mean(cv), '+/-', std(cv))

# AdaBoostClassifier

**Baseline**

In [None]:
#adaboostclassifier baseline: boosting with default random forests as the boosted estimator
#the wrapped estimator is passed positionally because its keyword was renamed from
#base_estimator to estimator in newer scikit-learn releases; the first positional slot works with both
adaboost_model = AdaBoostClassifier(RandomForestClassifier(),
                                       random_state=1)

adaboost_model.fit(X_train , y_train)

#five-fold cross validation on the training split: print mean and std of the fold scores
cv = cross_val_score(adaboost_model, X_train, y_train, cv=5)
print(mean(cv), '+/-', std(cv))

**Hyperparameter tuning**

In [None]:
#adaboostclassifier tuning: boosting with the tuned random forest as the boosted estimator
#the wrapped estimator is passed positionally because its keyword was renamed from
#base_estimator to estimator in newer scikit-learn releases; max_features uses 'sqrt',
#which is what 'auto' meant for classifiers and is accepted by old and new releases alike
adaboost_model = AdaBoostClassifier(RandomForestClassifier(random_state = 1, bootstrap=True,max_depth=7, max_features='sqrt', n_estimators=340),
#                                        learning_rate=1,
                                       random_state=1)

adaboost_model.fit(X_train , y_train)

#five-fold cross validation on the training split: print mean and std of the fold scores
cv = cross_val_score(adaboost_model, X_train, y_train, cv=5)
print(mean(cv), '+/-', std(cv))

# Evaluating the best models

In [None]:
#import evaluation tools
from sklearn.metrics import accuracy_score,precision_score, matthews_corrcoef, confusion_matrix, classification_report
import scikitplot as skplt

**SVC**

In [None]:
#create support vector classifier model with tuned parameters, fit it on the training split
#and predict the held-out test split
svc = SVC(
    probability=True,
    random_state=1,
    C=70,
    gamma=0.001,
    kernel='rbf',
)
svc.fit(X_train, y_train)
y_pred1 = svc.predict(X_test)

#assess accuracy on the test split
svc_test_accuracy = accuracy_score(y_test, y_pred1)
print('SVC test accuracy: {}'.format(svc_test_accuracy))

In [None]:
#support vector classifier confusion matrix
#create and reshape confusion matrix data
#each row (true class) is divided by its own total, so every row of the matrix sums to 1
matrix = confusion_matrix(y_test, y_pred1)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

#plot as heatmap
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=sns.color_palette('Reds'), linewidths=0.2, vmin=0, vmax=1)

#plot settings
#NOTE(review): the names are attached to matrix indices 0 and 1 in this order - confirm index 0 really is the disease class
class_names = ['Heart disease', 'No heart disease']
tick_marks = np.arange(len(class_names))
tick_marks2 = tick_marks + 0.5
#NOTE(review): y ticks use the +0.5 positions while x ticks use the unshifted ones - confirm the x labels line up with the cells
plt.xticks(tick_marks, class_names, rotation=25)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Support Vector Classifier')
plt.show()

In [None]:
#support vector classifier sensitivity and specificity calculations
#`matrix` is the row-normalised confusion matrix (rows = true class, columns = predicted class),
#so each rate is computed along a single row; mixing cells from different rows would combine
#values that were divided by different totals
#index 0 is treated as the positive class, following the class_names order used for the heatmap
#NOTE(review): confirm that label 0 is the class that should count as positive
total=sum(sum(matrix))

print('SVC')
#share of true class-0 samples that were predicted as class 0
sensitivity = matrix[0,0]/(matrix[0,0]+matrix[0,1])
print('Sensitivity : ', sensitivity )

#share of true class-1 samples that were predicted as class 1
specificity = matrix[1,1]/(matrix[1,1]+matrix[1,0])
print('Specificity : ', specificity)

In [None]:
#view the support vector classification report for the test predictions
print('SVC')
print(classification_report(y_test, y_pred1))

In [None]:
#lift curve for support vector classifier, drawn from the predicted class probabilities of the test split
target_prob = svc.predict_proba(X_test)
skplt.metrics.plot_lift_curve(y_test, target_prob)
plt.show()

In [None]:
#Matthews correlation coefficient for SVC, computed on the test predictions
print('SVC MCC: {}'.format(matthews_corrcoef(y_test, y_pred1)))

**StackingClassifier**

In [None]:
#create stacking classifier model from the most recently defined get_stacking(),
#fit it on the training split and predict the held-out test split
stacking_model = get_stacking()
stacking_model.fit(X_train, y_train)
y_pred2 = stacking_model.predict(X_test)

#assess accuracy on the test split
stacking_test_accuracy = accuracy_score(y_test, y_pred2)
print('StackingClassifier test accuracy: {}'.format(stacking_test_accuracy))

In [None]:
#stacking classifier confusion matrix
#create and reshape confusion matrix data
#rows are true labels and columns are predicted labels; dividing each row by
#its total turns the counts into per-true-class rates (every row sums to 1)
matrix = confusion_matrix(y_test, y_pred2)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

#plot as heatmap
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=sns.color_palette('Reds'), linewidths=0.2, vmin=0, vmax=1)

#plot settings
#NOTE(review): the first name is attached to row/column 0 of the matrix -
#confirm that label 0 really is the heart disease class in the target encoding
class_names = ['Heart disease', 'No heart disease']
tick_marks = np.arange(len(class_names))
#heatmap cells are one unit wide, so cell centres sit at 0.5, 1.5, ...
tick_marks2 = tick_marks + 0.5
#use the cell centres on both axes so each label lines up with its cell
plt.xticks(tick_marks2, class_names, rotation=25)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Stacking Classifier')
plt.show()

In [None]:
#stacking classifier sensitivity and specificity calculations
#matrix is the confusion matrix built in the confusion matrix cell: rows are
#true labels, columns are predicted labels. Both rates are defined per true
#class, so each one is taken along a row rather than down a column.
#class 0 (first row) is treated as the positive class.

print('StackingClassifier')
#sensitivity = TP / (TP + FN): share of the true positive class that was predicted positive
sensitivity = matrix[0,0]/(matrix[0,0]+matrix[0,1])
print('Sensitivity : ', sensitivity )

#specificity = TN / (TN + FP): share of the true negative class that was predicted negative
specificity = matrix[1,1]/(matrix[1,1]+matrix[1,0])
print('Specificity : ', specificity)

In [None]:
#print the model name followed by the per-class precision/recall/f1 report for the stacking model
print('StackingClassifier', classification_report(y_test, y_pred2), sep='\n')

In [None]:
#lift curve for the stacking model, drawn from its predicted class probabilities
target_prob = stacking_model.predict_proba(X_test)
skplt.metrics.plot_lift_curve(y_true=y_test, y_probas=target_prob)
plt.show()

In [None]:
#Matthews correlation coefficient between the true test labels and the stacking predictions
print('StackingClassifier MCC: {}'.format(
    matthews_corrcoef(y_true=y_test, y_pred=y_pred2)
))

**Hard VotingClassifier**

In [None]:
#build the hard voting ensemble, train it, then predict labels for X_test
hv_model = get_hard_voting()
hv_model.fit(X_train, y_train)
y_pred3 = hv_model.predict(X_test)

#report accuracy of the predictions made on X_test
print('Hard VotingClassifier test accuracy: {}'.format(accuracy_score(y_true=y_test, y_pred=y_pred3)))

In [None]:
#hard voting classifier confusion matrix
#create and reshape confusion matrix data
#rows are true labels and columns are predicted labels; dividing each row by
#its total turns the counts into per-true-class rates (every row sums to 1)
matrix = confusion_matrix(y_test, y_pred3)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

#plot as heatmap
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=sns.color_palette('Reds'), linewidths=0.2, vmin=0, vmax=1)

#plot settings
#NOTE(review): the first name is attached to row/column 0 of the matrix -
#confirm that label 0 really is the heart disease class in the target encoding
class_names = ['Heart disease', 'No heart disease']
tick_marks = np.arange(len(class_names))
#heatmap cells are one unit wide, so cell centres sit at 0.5, 1.5, ...
tick_marks2 = tick_marks + 0.5
#use the cell centres on both axes so each label lines up with its cell
plt.xticks(tick_marks2, class_names, rotation=25)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Hard VotingClassifier')
plt.show()

In [None]:
#hard voting classifier sensitivity and specificity calculations
#matrix is the confusion matrix built in the confusion matrix cell: rows are
#true labels, columns are predicted labels. Both rates are defined per true
#class, so each one is taken along a row rather than down a column.
#class 0 (first row) is treated as the positive class.

print('Hard VotingClassifier')
#sensitivity = TP / (TP + FN): share of the true positive class that was predicted positive
sensitivity = matrix[0,0]/(matrix[0,0]+matrix[0,1])
print('Sensitivity : ', sensitivity )

#specificity = TN / (TN + FP): share of the true negative class that was predicted negative
specificity = matrix[1,1]/(matrix[1,1]+matrix[1,0])
print('Specificity : ', specificity)

In [None]:
#print the model name followed by the per-class precision/recall/f1 report for the hard voting model
print('Hard VotingClassifier', classification_report(y_test, y_pred3), sep='\n')

In [None]:
#Matthews correlation coefficient between the true test labels and the hard voting predictions
print('Hard VotingClassifier MCC: {}'.format(
    matthews_corrcoef(y_true=y_test, y_pred=y_pred3)
))

**Soft VotingClassifier**

In [None]:
#build the soft voting ensemble, train it, then predict labels for X_test
sv_model = get_soft_voting()
sv_model.fit(X_train, y_train)
y_pred4 = sv_model.predict(X_test)

#report accuracy of the predictions made on X_test
print('Soft VotingClassifier test accuracy: {}'.format(accuracy_score(y_true=y_test, y_pred=y_pred4)))

In [None]:
#soft voting classifier confusion matrix
#create and reshape confusion matrix data
#rows are true labels and columns are predicted labels; dividing each row by
#its total turns the counts into per-true-class rates (every row sums to 1)
matrix = confusion_matrix(y_test, y_pred4)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

#plot as heatmap
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=sns.color_palette('Reds'), linewidths=0.2, vmin=0, vmax=1)

#plot settings
#NOTE(review): the first name is attached to row/column 0 of the matrix -
#confirm that label 0 really is the heart disease class in the target encoding
class_names = ['Heart disease', 'No heart disease']
tick_marks = np.arange(len(class_names))
#heatmap cells are one unit wide, so cell centres sit at 0.5, 1.5, ...
tick_marks2 = tick_marks + 0.5
#use the cell centres on both axes so each label lines up with its cell
plt.xticks(tick_marks2, class_names, rotation=25)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Soft VotingClassifier')
plt.show()

In [None]:
#soft voting classifier sensitivity and specificity calculations
#matrix is the confusion matrix built in the confusion matrix cell: rows are
#true labels, columns are predicted labels. Both rates are defined per true
#class, so each one is taken along a row rather than down a column.
#class 0 (first row) is treated as the positive class.

print('Soft VotingClassifier')
#sensitivity = TP / (TP + FN): share of the true positive class that was predicted positive
sensitivity = matrix[0,0]/(matrix[0,0]+matrix[0,1])
print('Sensitivity : ', sensitivity )

#specificity = TN / (TN + FP): share of the true negative class that was predicted negative
specificity = matrix[1,1]/(matrix[1,1]+matrix[1,0])
print('Specificity : ', specificity)

In [None]:
#print the model name followed by the per-class precision/recall/f1 report for the soft voting model
print('Soft VotingClassifier', classification_report(y_test, y_pred4), sep='\n')

In [None]:
#lift curve for the soft voting model, drawn from its predicted class probabilities
target_prob = sv_model.predict_proba(X_test)
skplt.metrics.plot_lift_curve(y_true=y_test, y_probas=target_prob)
plt.show()

In [None]:
#Matthews correlation coefficient between the true test labels and the soft voting predictions
print('Soft VotingClassifier MCC: {}'.format(
    matthews_corrcoef(y_true=y_test, y_pred=y_pred4)
))

**Naive Bayes**

In [None]:
#train the Gaussian naive Bayes model with its chosen smoothing value, then predict labels for X_test
gnb = GaussianNB(var_smoothing=0.19630406500402708)
gnb.fit(X_train, y_train)
y_pred5 = gnb.predict(X_test)

#report accuracy of the predictions made on X_test
print('GaussianNB test accuracy: {}'.format(accuracy_score(y_true=y_test, y_pred=y_pred5)))

In [None]:
#naive bayes confusion matrix
#create and reshape confusion matrix data
#rows are true labels and columns are predicted labels; dividing each row by
#its total turns the counts into per-true-class rates (every row sums to 1)
matrix = confusion_matrix(y_test, y_pred5)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

#plot as heatmap
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=sns.color_palette('Reds'), linewidths=0.2, vmin=0, vmax=1)

#plot settings
#NOTE(review): the first name is attached to row/column 0 of the matrix -
#confirm that label 0 really is the heart disease class in the target encoding
class_names = ['Heart disease', 'No heart disease']
tick_marks = np.arange(len(class_names))
#heatmap cells are one unit wide, so cell centres sit at 0.5, 1.5, ...
tick_marks2 = tick_marks + 0.5
#use the cell centres on both axes so each label lines up with its cell
plt.xticks(tick_marks2, class_names, rotation=25)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for GaussianNB')
plt.show()

In [None]:
#naive bayes sensitivity and specificity calculations
#matrix is the confusion matrix built in the confusion matrix cell: rows are
#true labels, columns are predicted labels. Both rates are defined per true
#class, so each one is taken along a row rather than down a column.
#class 0 (first row) is treated as the positive class.

print('GaussianNB')
#sensitivity = TP / (TP + FN): share of the true positive class that was predicted positive
sensitivity = matrix[0,0]/(matrix[0,0]+matrix[0,1])
print('Sensitivity : ', sensitivity )

#specificity = TN / (TN + FP): share of the true negative class that was predicted negative
specificity = matrix[1,1]/(matrix[1,1]+matrix[1,0])
print('Specificity : ', specificity)

In [None]:
#print the model name followed by the per-class precision/recall/f1 report for the naive Bayes model
print('GaussianNB', classification_report(y_test, y_pred5), sep='\n')

In [None]:
#lift curve for the naive Bayes model, drawn from its predicted class probabilities
target_prob = gnb.predict_proba(X_test)
skplt.metrics.plot_lift_curve(y_true=y_test, y_probas=target_prob)
plt.show()

In [None]:
#Matthews correlation coefficient between the true test labels and the naive Bayes predictions
print('GaussianNB MCC: {}'.format(
    matthews_corrcoef(y_true=y_test, y_pred=y_pred5)
))

**ROC/AUC**

In [None]:
#plot ROC curve for best classifiers
from sklearn import metrics

#class probabilities on X_test for each model that exposes predict_proba
pred_prob1 = svc.predict_proba(X_test)
pred_prob2 = stacking_model.predict_proba(X_test)
pred_prob4 = sv_model.predict_proba(X_test)
#the fifth curve is labelled and reported as GaussianNB, so it must be scored with gnb
pred_prob5 = gnb.predict_proba(X_test)

#column 1 holds the probability of class 1, which is the positive label for the ROC
#each model keeps its own thresholds variable so no result is overwritten
fpr1, tpr1, thresholds1 = metrics.roc_curve(y_test, pred_prob1[:,1],pos_label=1)
fpr2, tpr2, thresholds2 = metrics.roc_curve(y_test, pred_prob2[:,1],pos_label=1)
fpr4, tpr4, thresholds4 = metrics.roc_curve(y_test, pred_prob4[:,1],pos_label=1)
fpr5, tpr5, thresholds5 = metrics.roc_curve(y_test, pred_prob5[:,1],pos_label=1)

fig, ax = plt.subplots(figsize=(16, 10))
ax.plot(fpr1, tpr1, label='SVC')
ax.plot(fpr2, tpr2, label='Stacking')
ax.plot(fpr4, tpr4, label='Soft Voting')
ax.plot(fpr5, tpr5, label='GaussianNB')
#dashed diagonal from corner to corner of the axes as the chance-level reference
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
plt.legend()
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
plt.title('ROC curve for heart disease classifiers')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)
plt.show()

In [None]:
#area under each ROC curve, computed from the false/true positive rates of the ROC cell
print('SVC AUC: {}'.format(metrics.auc(x=fpr1, y=tpr1)))
print('Stacking AUC: {}'.format(metrics.auc(x=fpr2, y=tpr2)))
print('Soft Voting AUC: {}'.format(metrics.auc(x=fpr4, y=tpr4)))
print('GaussianNB AUC: {}'.format(metrics.auc(x=fpr5, y=tpr5)))

# Feature Importance

In [None]:
#import libraries
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots
import eli5
from eli5.sklearn import PermutationImportance
from sklearn import tree
import graphviz
import shap

**Hard VotingClassifier**

In [None]:
#fit the model
#trains the hard voting ensemble on the training split so the fitted model
#is available to the permutation importance cell
hv_model.fit(X_train,y_train)

#make prediction
#NOTE(review): y_pred is not read by the permutation importance cell, which
#scores hv_model against X_test/y_test itself
y_pred = hv_model.predict(X_test)

In [None]:
#permutation importance of each feature for the hard voting model, measured on the test split
perm = PermutationImportance(hv_model, random_state=1)
perm.fit(X_test, y_test)

#display the weight of every feature (top is set to the full feature count)
eli5.show_weights(perm, top=len(feature_names), feature_names=list(feature_names))

**SVC**

In [None]:
#build and fit the SVC used for the SHAP explanation
#NOTE(review): C and gamma differ from the SVC evaluated in the model
#evaluation section (C=70, gamma=0.001) - confirm which setting is intended
svc_model = SVC(
    kernel='rbf', C=100, gamma=0.01, probability=True, random_state=1
).fit(X_train, y_train)

In [None]:
#kernel explainer built on the SVC's predict_proba, with X_train as the background data
explainer = shap.KernelExplainer(svc_model.predict_proba, X_train)

#wrap the test features in a DataFrame and name its columns;
#pred_data and data_for_prediction are two names for the same frame
pred_data = data_for_prediction = pd.DataFrame(X_test)
pred_data.columns = feature_names

#calculate Shap values for every test row
shap_values = explainer.shap_values(data_for_prediction)

#create summary plot
#NOTE(review): index 1 presumably selects the values for class 1 - confirm against the installed shap version
shap.initjs()
shap.summary_plot(shap_values[1], data_for_prediction)

**StackingClassifier**

In [None]:
#fit the model
#trains the stacking ensemble on the training split so the fitted model
#is available to the permutation importance cell
stacking_model.fit(X_train,y_train)

#make prediction
#NOTE(review): y_pred is not read by the permutation importance cell, which
#scores stacking_model against X_test/y_test itself
y_pred = stacking_model.predict(X_test)

In [None]:
#permutation importance of each feature for the stacking model, measured on the test split
perm = PermutationImportance(stacking_model, random_state=1)
perm.fit(X_test, y_test)

#display the weight of every feature (top is set to the full feature count)
eli5.show_weights(perm, top=len(feature_names), feature_names=list(feature_names))

**Decision Tree**

In [None]:
#create decision tree model with tuned parameters
#random_state is fixed (same seed as the SVC models) so repeated runs build
#the same tree; the tree graph and SHAP plot below would otherwise be able
#to change between runs
dt = tree.DecisionTreeClassifier(criterion= 'entropy', max_depth= 3, random_state=1)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)

#dt accuracy print
print('dt test accuracy: {}'.format(accuracy_score(y_test, y_pred)))

In [None]:
#render the fitted decision tree
#in each box, value tells how many records from each category entered it (i.e., [# of records = 0, # of records = 1])
#out_file=None makes export_graphviz return the DOT source as a string
tree_graph = tree.export_graphviz(dt, feature_names=feature_names, out_file=None)
graphviz.Source(tree_graph)

In [None]:
#tree explainer built on the fitted decision tree
explainer = shap.TreeExplainer(dt)

#wrap the test features in a DataFrame and name its columns;
#pred_data and data_for_prediction are two names for the same frame
pred_data = data_for_prediction = pd.DataFrame(X_test)
pred_data.columns = feature_names

#calculate Shap values for every test row
shap_values = explainer.shap_values(data_for_prediction)

#create summary plot
#NOTE(review): index 1 presumably selects the values for class 1 - confirm against the installed shap version
shap.initjs()
shap.summary_plot(shap_values[1], data_for_prediction)

**Best model**: Hard VotingClassifier

* Accuracy: 0.8553
* Sensitivity: 0.8717
* Specificity: 0.8367
* Precision: 0.8571
* MCC: 0.7084

# Productionization

I built a [front-end](https://predict-heart-diseases.herokuapp.com/) for this model with Flask and deployed it on Heroku to help doctors diagnose heart disease in patients with angina.

See the [GitHub repo](https://github.com/MichaelBryantDS/heart-disease-pred) for more information.