# Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, plot_roc_curve 

# Load data

In [2]:
dataset = pd.read_csv("./heart.csv")
dataset

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


# Data preprocessing

In [3]:
y = dataset["HeartDisease"]
X = dataset.drop(["HeartDisease"], axis=1)

print(X.shape, y.shape)

(918, 11) (918,)


In [4]:
cat_attribs = [col for col in X.columns if X[col].dtype=="object"]
num_attribs = X.columns.drop(cat_attribs)

print("cat_cols:", cat_attribs)
print("num_cols:", num_attribs)

cat_cols: ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
num_cols: Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak'], dtype='object')


## train-test split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape)

(734, 11) (184, 11)


## Build transformation pipelines

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("ohe", OneHotEncoder(drop="if_binary"))
])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs)
])

In [7]:
X_train_tr = full_pipeline.fit_transform(X_train)
X_test_tr = full_pipeline.transform(X_test)

print(X_train_tr.shape, X_test_tr.shape)

(734, 18) (184, 18)


In [8]:
pd.DataFrame(X_train_tr, columns=full_pipeline.get_feature_names_out())

Unnamed: 0,num__Age,num__RestingBP,num__Cholesterol,num__FastingBS,num__MaxHR,num__Oldpeak,cat__Sex_M,cat__ChestPainType_ASY,cat__ChestPainType_ATA,cat__ChestPainType_NAP,cat__ChestPainType_TA,cat__RestingECG_LVH,cat__RestingECG_Normal,cat__RestingECG_ST,cat__ExerciseAngina_Y,cat__ST_Slope_Down,cat__ST_Slope_Flat,cat__ST_Slope_Up
0,-1.245067,-0.708985,0.372803,1.842609,2.284353,-0.097061,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,-1.886236,-0.166285,0.086146,-0.542709,1.652241,-0.836286,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.250993,0.919115,0.123134,1.842609,-0.441628,0.087745,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,-1.779375,-0.166285,0.104640,-0.542709,0.229991,-0.836286,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,-0.283314,-0.708985,-1.846478,1.842609,-1.271274,-0.836286,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,-0.603898,-0.708985,0.502261,-0.542709,-1.034232,-0.836286,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
730,-0.924483,-0.708985,0.234098,-0.542709,0.150977,-0.836286,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
731,0.678439,-0.166285,0.493014,-0.542709,0.309005,0.457358,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
732,0.678439,1.027656,-1.846478,-0.542709,-0.718176,-0.836286,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


# Build model

In [14]:
import time 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [17]:
log_reg = LogisticRegression(solver="lbfgs", random_state=42)
knn = KNeighborsClassifier()
svc = SVC(random_state=42)
gnb = GaussianNB()
dtc = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42)
extratrees_clf = ExtraTreesClassifier(random_state=42)

classifiers = zip(
    ["log_reg", "knn", "gnb", "dtc", "rfc"],
    [log_reg, knn, gnb, dtc, rfc]
)

In [18]:
end = 0

for label, model in classifiers:
    start = time.time()
    scores = cross_val_score(model, X_train_tr, y_train, scoring="accuracy", cv=5)
    elapsed_time = time.time() - start
    print("[%s] accuracy: %0.3f (+/- %0.3f) - %f sec" % (label, scores.mean(), scores.std(), elapsed_time))
    end += elapsed_time

print("Elasped time : %f sec" %  end)

[log_reg] accuracy: 0.862 (+/- 0.021) - 0.081017 sec
[knn] accuracy: 0.858 (+/- 0.022) - 0.046849 sec
[gnb] accuracy: 0.851 (+/- 0.017) - 0.015625 sec
[dtc] accuracy: 0.779 (+/- 0.027) - 0.031272 sec
[rfc] accuracy: 0.865 (+/- 0.020) - 0.999751 sec
Elasped time : 1.174513 sec


# Fine-Tune Model

## Grid Search

In [19]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 6, 9],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ["auto"]
}

grid_search = GridSearchCV(estimator=rfc, param_grid=parameters, scoring="accuracy", cv=5, n_jobs=-1)
grid_search.fit(X_train_tr, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [None, 6, 9], 'max_features': ['auto'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 4, 8],
                         'n_estimators': [50, 100, 150]},
             scoring='accuracy')

In [20]:
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print("best_score", best_score)
print("best_params", best_params)

best_score 0.871922467617184
best_params {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


In [21]:
df = pd.DataFrame(grid_search.cv_results_)
print(df[["params", "mean_test_score", "rank_test_score"]].sort_values("rank_test_score").head(25).to_string())

                                                                                                             params  mean_test_score  rank_test_score
0    {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}         0.871922                1
2   {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}         0.870553                2
29     {'max_depth': 6, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}         0.870553                2
63      {'max_depth': 9, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}         0.870553                2
66      {'max_depth': 9, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 50}         0.870553                2
65     {'max_depth': 9, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_es

In [22]:
# final_model
final_model = grid_search.best_estimator_
y_pred = final_model.predict(X_test_tr)

cm = confusion_matrix(y_test, y_pred)

print(cm)
print(classification_report(y_test, y_pred))

[[69  8]
 [12 95]]
              precision    recall  f1-score   support

           0       0.85      0.90      0.87        77
           1       0.92      0.89      0.90       107

    accuracy                           0.89       184
   macro avg       0.89      0.89      0.89       184
weighted avg       0.89      0.89      0.89       184



In [25]:
from sklearn.ensemble import RandomForestClassifier

params =  {
  'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50
}

rfc = RandomForestClassifier(**params)
rfc.fit(X_train_tr, y_train)

RandomForestClassifier(n_estimators=50)

In [26]:
# final_model
final_model = rfc
y_pred = final_model.predict(X_test_tr)

cm = confusion_matrix(y_test, y_pred)

print(cm)
print(classification_report(y_test, y_pred))

[[69  8]
 [11 96]]
              precision    recall  f1-score   support

           0       0.86      0.90      0.88        77
           1       0.92      0.90      0.91       107

    accuracy                           0.90       184
   macro avg       0.89      0.90      0.89       184
weighted avg       0.90      0.90      0.90       184

