In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

heart_dataset = pd.read_csv('../datasets/uci-heart-disease/heart.csv')

heart_dataset.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [2]:
# Split the data into input and labels
labels = heart_dataset['target']
input_data = heart_dataset.drop(columns=['target'])

In [3]:
input_data.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [5]:
from sklearn.model_selection import GridSearchCV

# Note, we don't split the data. GridSearchCV will automatically apply 5-fold cross validation by default.

tuned_parameters = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 3, 5, 10, 20],
    'min_samples_split': [2, 4, 8, 16, 32],
    'max_leaf_nodes': [None, 10, 20, 40, 80]
}

# These two lines will result in every possible combo of the above paramters to be fit and scored
# which can take a LONG TIME with large datasets.
clf = DecisionTreeClassifier()
grid_tree = GridSearchCV(clf, tuned_parameters)
grid_tree.fit(input_data, labels)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [None, 3, 5, 10, 20],
                         'max_leaf_nodes': [None, 10, 20, 40, 80],
                         'min_samples_split': [2, 4, 8, 16, 32],
                         'splitter': ['best', 'random']})

In [6]:
print("Best parameters set found on development set:")
print()
print(grid_tree.best_params_, grid_tree.best_score_)
print()
print("Grid scores on development set:")
print()
means = grid_tree.cv_results_['mean_test_score']
stds = grid_tree.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_tree.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

Best parameters set found on development set:

{'criterion': 'gini', 'max_depth': 10, 'max_leaf_nodes': 20, 'min_samples_split': 4, 'splitter': 'random'} 0.8284153005464482

Grid scores on development set:

0.756 (+/-0.119) for {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_split': 2, 'splitter': 'best'}
0.752 (+/-0.068) for {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_split': 2, 'splitter': 'random'}
0.746 (+/-0.118) for {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_split': 4, 'splitter': 'best'}
0.743 (+/-0.110) for {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_split': 4, 'splitter': 'random'}
0.779 (+/-0.082) for {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_split': 8, 'splitter': 'best'}
0.795 (+/-0.110) for {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_split': 8, 'splitter': 'random'}
0.752 (

In [10]:
print(grid_tree.cv_results_.keys())

18


In [14]:
results_df = pd.DataFrame({
    'mean_test_score': grid_tree.cv_results_['mean_test_score'],
})

params_df = pd.DataFrame(grid_tree.cv_results_['params'], columns=grid_tree.cv_results_['params'][0].keys())

all_results = pd.concat([results_df, params_df], axis=1).sort_values('mean_test_score', ascending=False)

# Lets filter out the ones with nans, then list them!
# (36 is the total number of models, so I know 36 is enough to display them all.)
all_results.dropna().head(5)

Unnamed: 0,mean_test_score,criterion,max_depth,max_leaf_nodes,min_samples_split,splitter
173,0.828415,gini,10.0,20.0,4,random
379,0.821585,entropy,5.0,20.0,32,random
488,0.815027,entropy,20.0,40.0,32,best
398,0.815027,entropy,5.0,80.0,32,best
388,0.815027,entropy,5.0,40.0,32,best


In [16]:
all_results.dropna().tail(5)

Unnamed: 0,mean_test_score,criterion,max_depth,max_leaf_nodes,min_samples_split,splitter
494,0.738962,entropy,20.0,80.0,8,best
195,0.735956,gini,10.0,80.0,8,random
495,0.735902,entropy,20.0,80.0,8,random
245,0.725902,gini,20.0,80.0,8,random
243,0.702678,gini,20.0,80.0,4,random


In [21]:
from sklearn.ensemble import RandomForestClassifier

tuned_parameters = {
    'n_estimators': [10, 100, 200, 500],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 3, 5, 10, 20],
    'min_samples_split': [2, 4, 8, 16, 32],
    'max_leaf_nodes': [None, 10, 40, 80],
    'n_jobs': [-1]
}

model = RandomForestClassifier()
grid_tree = GridSearchCV(model, tuned_parameters)
grid_tree.fit(input_data, labels)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [None, 3, 5, 10, 20],
                         'max_leaf_nodes': [None, 10, 40, 80],
                         'min_samples_split': [2, 4, 8, 16, 32],
                         'n_estimators': [10, 100, 200, 500], 'n_jobs': [-1]})