# Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

# Load data

In [2]:
dataset = pd.read_csv("./data/heart.csv")
dataset

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


# Data preprocessing

In [3]:
y = dataset["HeartDisease"]
X = dataset.drop(["HeartDisease"], axis=1)

print(X.shape, y.shape)

(918, 11) (918,)


In [4]:
cat_attribs = [col for col in X.columns if X[col].dtype=="object"]
num_attribs = X.columns.drop(cat_attribs)

print("cat_cols:", cat_attribs)
print("num_cols:", num_attribs)

cat_cols: ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
num_cols: Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak'], dtype='object')


## train-test split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape)

(734, 11) (184, 11)


## Build transformation pipelines

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


num_pipeline = Pipeline([
    ("scaler", StandardScaler())
])
cat_pipeline = Pipeline([
    ("ohe", OneHotEncoder(drop="if_binary"))
])
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs)
])

In [7]:
X_train_tr = full_pipeline.fit_transform(X_train)
X_test_tr = full_pipeline.transform(X_test)

print(X_train_tr.shape, X_test_tr.shape)

(734, 18) (184, 18)


In [8]:
# pd.DataFrame(X_train_tr, columns=full_pipeline.get_feature_names_out())

# Build model

In [9]:
import time 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [10]:
log_reg = LogisticRegression(solver="lbfgs", random_state=42)
knn = KNeighborsClassifier()
gnb = GaussianNB()
dtc = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42)

classifiers = zip(
    ["LogisticRegression", "KNeighborsClassifier", "GaussianNB", "DecisionTreeClassifier", "RandomForestClassifier"],
    [log_reg, knn, gnb, dtc, rfc]
)

In [11]:
end = 0
for label, model in classifiers:
    start = time.time()
    scores = cross_val_score(model, X_train_tr, y_train, scoring="accuracy", cv=5)
    elapsed_time = time.time() - start
    print("[%s] accuracy: %0.3f (+/- %0.3f) - %f sec" % (label, scores.mean(), scores.std(), elapsed_time))
    end += elapsed_time

print("Elasped time : %f sec" %  end)

[LogisticRegression] accuracy: 0.862 (+/- 0.021) - 0.521527 sec
[KNeighborsClassifier] accuracy: 0.858 (+/- 0.022) - 1.156719 sec
[GaussianNB] accuracy: 0.851 (+/- 0.017) - 0.036075 sec
[DecisionTreeClassifier] accuracy: 0.779 (+/- 0.027) - 0.402703 sec
[RandomForestClassifier] accuracy: 0.865 (+/- 0.020) - 2.197342 sec
Elasped time : 4.314365 sec


# Fine-Tune Model

## Grid Search

In [12]:
from sklearn.model_selection import GridSearchCV


parameters = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 6, 9],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [0.8]
}

grid_search = GridSearchCV(estimator=rfc, param_grid=parameters, scoring="accuracy", cv=5, n_jobs=-1)
grid_search.fit(X_train_tr, y_train)

In [13]:
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print("best_score", best_score)
print("best_params", best_params)

best_score 0.8637405647190383
best_params {'max_depth': 6, 'max_features': 0.8, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}


In [14]:
df = pd.DataFrame(grid_search.cv_results_)
print(df[["params", "mean_test_score", "rank_test_score"]].sort_values("rank_test_score").head().to_string())

                                                                                                       params  mean_test_score  rank_test_score
41  {'max_depth': 6, 'max_features': 0.8, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 150}         0.863741                1
38  {'max_depth': 6, 'max_features': 0.8, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}         0.863741                1
57   {'max_depth': 9, 'max_features': 0.8, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 50}         0.862389                3
34  {'max_depth': 6, 'max_features': 0.8, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 100}         0.862371                4
37  {'max_depth': 6, 'max_features': 0.8, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}         0.861010                5


In [15]:
# final_model
final_model = grid_search.best_estimator_
y_pred = final_model.predict(X_test_tr)
cm = confusion_matrix(y_test, y_pred)

print(cm)
print(classification_report(y_test, y_pred))

[[67 10]
 [14 93]]
              precision    recall  f1-score   support

           0       0.83      0.87      0.85        77
           1       0.90      0.87      0.89       107

    accuracy                           0.87       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.87      0.87       184



# Build full pipeline with the estimator

In [16]:
# final model
params = {
    'n_estimators': 150,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'min_samples_split': 2,
    'max_features': 0.8
}
final_model = RandomForestClassifier(**params)

In [17]:
# full pipeline
num_pipeline = Pipeline([
    ("scaler", StandardScaler())
])
cat_pipeline = Pipeline([
    ("ohe", OneHotEncoder(drop="if_binary"))
])
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs)
])

In [18]:
# full pipeline with estimator
full_pipeline_with_estimator = Pipeline([    
    ("preprocessing", full_pipeline),
    ("final_model", final_model)
])
full_pipeline_with_estimator.fit(X_train, y_train)

In [19]:
y_pred = full_pipeline_with_estimator.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

print(cm)
print(classification_report(y_test, y_pred))

[[67 10]
 [12 95]]
              precision    recall  f1-score   support

           0       0.85      0.87      0.86        77
           1       0.90      0.89      0.90       107

    accuracy                           0.88       184
   macro avg       0.88      0.88      0.88       184
weighted avg       0.88      0.88      0.88       184



# Save model

In [20]:
import pickle


filename = "final_model.pkl"
pickle.dump(full_pipeline_with_estimator, open(filename, "wb"))

In [21]:
# #load model
# loaded_model = pickle.load(open(filename, "rb"))