In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# read the heart-disease CSV into a DataFrame and show its (rows, columns) size
csv_path = 'heart_v2.csv'
data = pd.read_csv(csv_path)
data.shape

(270, 5)

In [3]:
# preview the first five rows of the dataset
data.head(n=5)

Unnamed: 0,age,sex,BP,cholestrol,heart disease
0,70,1,130,322,1
1,67,0,115,564,0
2,57,1,124,261,1
3,64,1,128,263,0
4,74,0,120,269,0


In [4]:
# separate the target column (y) from the remaining predictor columns (X)
target_col = 'heart disease'
y = data[target_col]
X = data.drop(columns=[target_col])

In [5]:
# share of positive (1) cases in the target; for a 0/1 column the mean is exactly
# that proportion, and Series.mean() is vectorised instead of iterating in Python
y.mean()

0.4444444444444444

In [6]:
# hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

In [7]:
# baseline model: a small forest of 10 shallow trees (depth <= 3), seeded for
# reproducibility and trained on all available cores
rfc = RandomForestClassifier(
    n_estimators=10,
    max_depth=3,
    random_state=42,
    n_jobs=-1,
)

In [8]:
# fit the baseline forest on the training split
rfc.fit(X=X_train, y=y_train)

In [9]:
def evaluateModel(model, X_eval, y_eval):
    """Print classification metrics for a fitted binary classifier.

    Predicts on ``X_eval`` and prints, in order: accuracy, the confusion
    matrix, sensitivity, specificity, recall and precision. Every scalar is
    rounded to 4 decimals.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_eval : feature matrix to predict on.
    y_eval : true labels, assumed to be encoded as 0 (negative) and
        1 (positive); this matches the default ``pos_label=1`` used by
        ``recall_score`` and ``precision_score``.

    Returns
    -------
    None
        The metrics are only printed.
    """
    y_eval_pred = model.predict(X_eval)
    print('Model accuracy =', round(accuracy_score(y_eval, y_eval_pred), 4))

    # Pin the label order so the matrix is always 2x2 (rows = actual 0/1,
    # columns = predicted 0/1). Without it, an evaluation set that contains a
    # single class yields a 1x1 matrix and the indexing below would fail.
    mat = confusion_matrix(y_eval, y_eval_pred, labels=[0, 1])
    print('\nconfusion matrix\n', mat)

    tn, fp, fn, tp = mat.ravel()

    # A class with no actual samples has an undefined rate: report NaN
    # explicitly rather than triggering a 0/0 division warning.
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else float('nan')
    print('\nsensitivity =', round(sensitivity, 4))

    specificity = tn / (tn + fp) if (tn + fp) > 0 else float('nan')
    print('\nspecificity =', round(specificity, 4))

    # recall is printed alongside sensitivity on purpose: for the positive
    # class they are the same quantity, computed here by sklearn.
    print('\nrecall =', round(recall_score(y_eval, y_eval_pred), 4))

    print('\nprecision =', round(precision_score(y_eval, y_eval_pred), 4))

- **Sensitivity:** True positive rate
- **Specificity:** Probability that an actual 'No' case is predicted correctly (True negative rate)
- **Recall:** Probability that an actual 'Yes' case is predicted correctly; it is the same as sensitivity.
- **Precision:** Probability that a predicted 'Yes' is actually a 'Yes'

In [10]:
# baseline forest: metrics on the data it was trained on
evaluateModel(model=rfc, X_eval=X_train, y_eval=y_train)

Model accuracy = 0.7407

confusion matrix
 [[80 21]
 [28 60]]

sensitivity = 0.6818

specificity = 0.7921

recall = 0.6818

precision = 0.7407


In [11]:
# baseline forest: metrics on the held-out test split
evaluateModel(model=rfc, X_eval=X_test, y_eval=y_test)

Model accuracy = 0.6173

confusion matrix
 [[35 14]
 [17 15]]

sensitivity = 0.4688

specificity = 0.7143

recall = 0.4688

precision = 0.5172


### Random forest with hyperparameter tuning

In [12]:
# 5-fold cross-validation splitter; rows are shuffled once with a fixed seed
folds = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [13]:
# hyperparameter grid for the exhaustive search:
# 5 depths x 5 leaf sizes x 3 feature counts x 5 forest sizes = 375 candidates
params = dict(
    max_depth=[1, 2, 5, 10, 20],
    min_samples_leaf=[5, 10, 20, 50, 100],
    max_features=[2, 3, 4],
    n_estimators=[10, 30, 50, 100, 200],
)

In [14]:
# exhaustive grid search over `params`, scoring each candidate by accuracy
# on the K-fold splits defined in `folds`
cv = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
    param_grid=params,
    cv=folds,
    scoring='accuracy',
    verbose=1,
)

In [15]:
%%time
# cross validation on train set
cv.fit(X_train, y_train)

Fitting 5 folds for each of 375 candidates, totalling 1875 fits
CPU times: total: 4min 49s
Wall time: 4min 58s


In [16]:
# GridSearchCV is created with the default refit=True, so best_estimator_ has
# already been re-trained on the whole training set with the winning
# hyperparameters; fitting it a second time would only repeat that work.
rf_best = cv.best_estimator_
rf_best

In [17]:
# tuned forest: metrics on the training split
evaluateModel(model=rf_best, X_eval=X_train, y_eval=y_train)

Model accuracy = 0.7249

confusion matrix
 [[82 19]
 [33 55]]

sensitivity = 0.625

specificity = 0.8119

recall = 0.625

precision = 0.7432


In [18]:
# tuned forest: metrics on the held-out test split
evaluateModel(model=rf_best, X_eval=X_test, y_eval=y_test)

Model accuracy = 0.6296

confusion matrix
 [[36 13]
 [17 15]]

sensitivity = 0.4688

specificity = 0.7347

recall = 0.4688

precision = 0.5357
