# Model Evaluation

Read in the heart disease dataset from `data/heart.csv`

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
data = pd.read_csv('data/heart.csv')
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


Extract and scale (using a StandardScaler) the first 13 columns as a features set. Extract the last column ('num') as a target variable, in particular, whether num != 0 ('heart disease is present')

In [3]:
from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(data[data.columns[:-1]])
y = data.num > 0

Train and evaluate a RandomForest(n_estimators=10) on the data set.

Use StratifiedKFold cross-validation to generate scores (use n_splits=10).

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

rfc = RandomForestClassifier(n_estimators=10)
cv = StratifiedKFold(n_splits=10)
scores = cross_val_score(rfc, X, y, cv=cv)
scores

array([0.9       , 0.86666667, 0.83333333, 0.9       , 0.83333333,
       0.8       , 0.7       , 0.83333333, 0.76666667, 0.82758621])

In [9]:
scores.mean()

0.8260919540229885

Use a GridSearchCV to find ideal parameters for the random forest:

- try n_estimators = 1, 2, 4, 8, 16, 32, 64, 128, 256
- try max_depth = 1, 2, 3, 4, 5



In [5]:
from sklearn.model_selection import GridSearchCV

grid = {
    'n_estimators': [ 1, 2, 4, 8, 16, 32, 64, 128, 256],
    'max_depth': [1,2,3,4,5]
}

model = GridSearchCV(rfc, grid, cv=cv, n_jobs=-1)
model.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                         

In [6]:
model.best_params_

{'max_depth': 3, 'n_estimators': 128}

Print classification reports for the 'tuned' RandomForestClassifier

In [7]:
from sklearn.metrics import classification_report

In [8]:
print(classification_report(y, model.predict(X)))

              precision    recall  f1-score   support

       False       0.84      0.93      0.88       160
        True       0.90      0.80      0.85       139

    accuracy                           0.87       299
   macro avg       0.87      0.86      0.86       299
weighted avg       0.87      0.87      0.87       299



In [10]:
from sklearn import model_selection

Xtrain, Xtest, ytrain, ytest = model_selection.train_test_split(X, y)
model = GridSearchCV(rfc, grid, cv=cv, n_jobs=-1)
model.fit(Xtrain, ytrain)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                         

In [11]:
model.best_params_

{'max_depth': 2, 'n_estimators': 16}

In [13]:
final_model = model.best_estimator_
final_model.fit(Xtrain, ytrain)
pd.crosstab(ytest, final_model.predict(Xtest))

col_0,False,True
num,Unnamed: 1_level_1,Unnamed: 2_level_1
False,32,5
True,4,34


In [15]:
print(classification_report(ytest, final_model.predict(Xtest)))

              precision    recall  f1-score   support

       False       0.89      0.86      0.88        37
        True       0.87      0.89      0.88        38

    accuracy                           0.88        75
   macro avg       0.88      0.88      0.88        75
weighted avg       0.88      0.88      0.88        75

