In [50]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

fish_dataset = pd.read_csv('../datasets/fish/Fish.csv')
fish_dataset = shuffle (fish_dataset)
fish_dataset.head(100)

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672
138,Pike,567.0,43.2,46.0,48.7,7.7920,4.8700
16,Bream,700.0,30.4,33.0,38.3,14.8604,5.2854
8,Bream,450.0,27.6,30.0,35.1,14.0049,4.8438
140,Pike,950.0,48.3,51.7,55.1,8.9262,6.1712
...,...,...,...,...,...,...,...
115,Perch,690.0,34.6,37.0,39.3,10.5717,6.3666
123,Perch,1100.0,39.0,42.0,44.6,12.8002,6.8684
112,Perch,685.0,34.0,36.5,39.0,10.8810,6.8640
120,Perch,900.0,37.0,40.0,42.5,11.7300,7.2250


In [51]:
labels = fish_dataset['Species']
input_data = fish_dataset.drop(columns=['Species'])


In [52]:
from scipy.stats import randint as sp_randint

# Lists will be uniformly sampled.
# Distributions from scipy will follow the sampling distribution 
# (uniform in this case, but you could use any other provided distribution)
param_dist = {"max_depth": [None, 3, 5, 10, 20, 50, 100],
              "min_samples_split": sp_randint(2, 50),
              "max_leaf_nodes": sp_randint(2, 100),
              "criterion": ["gini", "entropy"]}


In [53]:
clf = DecisionTreeClassifier()
n_iter_search = 1000
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)
random_search.fit(input_data, labels)

print("Best parameters set found on development set:")
print()
print(random_search.best_params_, random_search.best_score_)
print()
print("Scores on development set:")
print()
means = random_search.cv_results_['mean_test_score']
stds = random_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, random_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': 5, 'max_leaf_nodes': 67, 'min_samples_split': 6} 0.7360887096774194

Scores on development set:

0.604 (+/-0.087) for {'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 46, 'min_samples_split': 28}
0.723 (+/-0.084) for {'criterion': 'entropy', 'max_depth': 50, 'max_leaf_nodes': 39, 'min_samples_split': 2}
0.691 (+/-0.152) for {'criterion': 'gini', 'max_depth': 50, 'max_leaf_nodes': 27, 'min_samples_split': 25}
0.610 (+/-0.084) for {'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 22, 'min_samples_split': 49}
0.585 (+/-0.111) for {'criterion': 'entropy', 'max_depth': 3, 'max_leaf_nodes': 77, 'min_samples_split': 38}
0.635 (+/-0.120) for {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': 41, 'min_samples_split': 49}
0.641 (+/-0.111) for {'criterion': 'gini', 'max_depth': 100, 'max_leaf_nodes': 43, 'min_samples_split': 43}
0.736 (+/-0.066) for {'criterion': 'entropy', 'max_depth': 5, 'm

In [49]:
scoring = {"Specificity": make_scorer(specificity_score)}
tuned_parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 3, 5, 10, 20],
    'min_samples_split': [2, 4, 8, 16, 32, 64, 128],
    'max_leaf_nodes': [None, 10, 20, 40, 80],
}

# These two lines will result in every possible combo of the above paramters to be fit and scored
# which can take a LONG TIME with large datasets.
clf = DecisionTreeClassifier()
grid_tree = GridSearchCV(clf, tuned_parameters,     scoring=scoring, refit="Specificity")
grid_tree.fit(input_data, labels) 

print("Best parameters set found on development set:")
print()
print(grid_tree.best_params_, grid_tree.best_score_)
print()
print("Grid scores on development set:")
print()
#print (grid_tree.cv_results_)
# Plus, check out this trick making the DF a bit easier to work with
results = pd.DataFrame({
    'mean': grid_tree.cv_results_['mean_test_Specificity'],
    'std': grid_tree.cv_results_['std_test_Specificity'],
})

params_df = pd.DataFrame(grid_tree.cv_results_['params'], columns=grid_tree.cv_results_['params'][0].keys())
all_results = pd.concat([results, params_df], axis=1)

# Lots of interesting analysis could be done on this dataframe now to learn
# about the trained models...
all_results
all_results.sort_values('mean', ascending=False)

Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': None, 'max_leaf_nodes': 20, 'min_samples_split': 8} 0.7673387096774194

Grid scores on development set:



Unnamed: 0,mean,std,criterion,max_depth,max_leaf_nodes,min_samples_split
191,0.767339,0.050506,entropy,,20.0,8
113,0.761089,0.024084,gini,10.0,10.0,4
147,0.761089,0.024084,gini,20.0,10.0,2
175,0.754839,0.053179,entropy,,,2
126,0.748992,0.072690,gini,10.0,40.0,2
...,...,...,...,...,...,...
286,0.384476,0.066930,entropy,10.0,,128
293,0.384476,0.066930,entropy,10.0,10.0,128
300,0.384476,0.066930,entropy,10.0,20.0,128
307,0.384476,0.066930,entropy,10.0,40.0,128


In [23]:
grid_tree.cv_results_

{'mean_fit_time': array([0.00190125, 0.00159783, 0.00157704, 0.00153208, 0.00139523,
        0.00130515, 0.00150328, 0.00156574, 0.00141249, 0.00157385,
        0.00154781, 0.00141311, 0.00128784, 0.00109572, 0.00159373,
        0.00141792, 0.00133252, 0.00143185, 0.00171776, 0.001267  ,
        0.00121069, 0.00158825, 0.00147638, 0.00138688, 0.00158882,
        0.00138588, 0.00134273, 0.00110226, 0.00135622, 0.00153608,
        0.00143104, 0.00139737, 0.00130959, 0.00171409, 0.001159  ,
        0.00144577, 0.0011806 , 0.00150695, 0.00121603, 0.00121746,
        0.00135012, 0.00145707, 0.00126367, 0.00128331, 0.00141444,
        0.00160937, 0.00122995, 0.00122533, 0.00112243, 0.0014451 ,
        0.00130377, 0.00127478, 0.00128751, 0.00146375, 0.00128374,
        0.0011682 , 0.00128465, 0.00149741, 0.00131555, 0.00123544,
        0.00124393, 0.00178256, 0.00121384, 0.00127864, 0.00136662,
        0.00150795, 0.00125489, 0.00120344, 0.00139608, 0.00125179,
        0.00131397, 0.00133448,

In [32]:

scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score,  pos_label=0)}
gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid={"min_samples_split": range(2, 403, 20)},
    scoring=scoring,
    refit="AUC",
    n_jobs=2,
    return_train_score=True,
)
gs.fit(input_data, labels)
results = gs.cv_results_

Traceback (most recent call last):
  File "/Users/fsharafi/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/fsharafi/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/Users/fsharafi/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 349, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported

Traceback (most recent call last):
  File "/Users/fsharafi/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/fsharafi/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *

alidation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/fsharafi/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/Users/fsharafi/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 349, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported

Traceback (most recent call last):
  File "/Users/fsharafi/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/fsharafi/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/Users/fsharafi/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 349, in _sc