In [104]:
from sklearn.datasets import load_digits
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
import warnings # supress warnings
warnings.filterwarnings('ignore')
import timeit
data = load_digits()
df_digit = pd.DataFrame(data.data, columns=data.feature_names)
df_digit['class'] = pd.DataFrame(data.target)
df_digit.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,class
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


In [105]:
X = df_digit.drop(columns='class')
y = df_digit['class']

In [106]:
#Use the get_params() function to see what parameters you can set for a decision tree classifier
dt_clf_digit = DecisionTreeClassifier()
dt_clf_digit.get_params()


{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [107]:
#Use GridSearchCV to find the best parameter settings for a decision tree classifier, used on the digits dataset.  
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)
scores=['precision','recall','f1']


In [108]:
# Utility function to report best scores
import numpy as np
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
# reference: https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html

In [110]:
parameters = {'criterion':['gini','entropy'],'max_depth': [1,2,3,4,5,6,7,8,9,10],'splitter': ['best', 'random']}
# run randomized search
total_time=0
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    start = timeit.default_timer()
    clf = GridSearchCV(dt_clf_digit, parameters, scoring='%s_macro' % score)
    clf.fit(X_train,y_train)
    end = timeit.default_timer()
    total_time= total_time + (end-start)
    print("Time it takes to perfrom parameter search: %0.3f sec" % (end-start))
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Top 3 best parameters are :")
    report(clf.cv_results_)
    print("Detailed classification report:")
    y_pred =clf.predict(X_test)
    print(classification_report(y_test, y_pred))
print("Total running time for Grid search is :%0.3f" % total_time)

# Tuning hyper-parameters for precision
Time it takes to perfrom parameter search: 7.435 sec
Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': 10, 'splitter': 'random'}

Top 3 best parameters are :
Model with rank: 1
Mean validation score: 0.847 (std: 0.020)
Parameters: {'criterion': 'entropy', 'max_depth': 10, 'splitter': 'random'}

Model with rank: 2
Mean validation score: 0.846 (std: 0.022)
Parameters: {'criterion': 'entropy', 'max_depth': 9, 'splitter': 'best'}

Model with rank: 3
Mean validation score: 0.845 (std: 0.024)
Parameters: {'criterion': 'entropy', 'max_depth': 8, 'splitter': 'best'}

Detailed classification report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        53
           1       0.85      0.90      0.87        50
           2       0.75      0.83      0.79        47
           3       0.83      0.80      0.81        54
           4       0.94      0.83      0.88        60
  

In [114]:
parameters = {'criterion':['gini','entropy'],'max_depth': [1,2,3,4,5,6,7,8,9,10],'splitter': ['best', 'random']}
# run randomized search
n_iter_search = 20 #20 candidates parameter settings.
total_time=0
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    start = timeit.default_timer()
    clf_random = RandomizedSearchCV(dt_clf_digit, parameters, scoring='%s_macro' % score,n_iter=n_iter_search)
    clf_random.fit(X_train,y_train)
    end = timeit.default_timer()
    total_time= total_time + (end-start)
    print("Time it takes to perfrom parameter search: %0.3f sec" % (end-start))
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Top 3 best parameters are :")
    report(clf_random.cv_results_)
    print("Detailed classification report:")
    y_pred =clf_random.predict(X_test)
    print(classification_report(y_test, y_pred))
print("Total running time for randomized search is :%0.3f" % total_time)

# Tuning hyper-parameters for precision
Time it takes to perfrom parameter search: 4.043 sec
Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': 10, 'splitter': 'best'}

Top 3 best parameters are :
Model with rank: 1
Mean validation score: 0.855 (std: 0.020)
Parameters: {'splitter': 'best', 'max_depth': 8, 'criterion': 'entropy'}

Model with rank: 2
Mean validation score: 0.850 (std: 0.030)
Parameters: {'splitter': 'best', 'max_depth': 10, 'criterion': 'entropy'}

Model with rank: 3
Mean validation score: 0.842 (std: 0.023)
Parameters: {'splitter': 'best', 'max_depth': 7, 'criterion': 'entropy'}

Detailed classification report:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94        53
           1       0.94      0.92      0.93        50
           2       0.83      0.85      0.84        47
           3       0.91      0.94      0.93        54
           4       0.86      0.83      0.85        60
      

### Grid search exhaustively considers all parameters while searching for best parameter combination. However Randomized search takes the samples randomly from the set of candidates in order to look for best parameters. 
### Running time for randomized search is almost half of the grid search running time and results are similar from both hyper paramter tuning strategies.


In [115]:
#to look at all paramter combinations we can use below code
print("Grid scores on Decision tree's training set:")
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for params in clf.cv_results_['params']:
    print("%r"% ( params)) 
 

Grid scores on Decision tree's training set:
{'criterion': 'gini', 'max_depth': 1, 'splitter': 'best'}
{'criterion': 'gini', 'max_depth': 1, 'splitter': 'random'}
{'criterion': 'gini', 'max_depth': 2, 'splitter': 'best'}
{'criterion': 'gini', 'max_depth': 2, 'splitter': 'random'}
{'criterion': 'gini', 'max_depth': 3, 'splitter': 'best'}
{'criterion': 'gini', 'max_depth': 3, 'splitter': 'random'}
{'criterion': 'gini', 'max_depth': 4, 'splitter': 'best'}
{'criterion': 'gini', 'max_depth': 4, 'splitter': 'random'}
{'criterion': 'gini', 'max_depth': 5, 'splitter': 'best'}
{'criterion': 'gini', 'max_depth': 5, 'splitter': 'random'}
{'criterion': 'gini', 'max_depth': 6, 'splitter': 'best'}
{'criterion': 'gini', 'max_depth': 6, 'splitter': 'random'}
{'criterion': 'gini', 'max_depth': 7, 'splitter': 'best'}
{'criterion': 'gini', 'max_depth': 7, 'splitter': 'random'}
{'criterion': 'gini', 'max_depth': 8, 'splitter': 'best'}
{'criterion': 'gini', 'max_depth': 8, 'splitter': 'random'}
{'criterion

In [116]:
#to look at all paramter combinations we can use below code
print("Grid scores on Decision tree's training set:")
means = clf_random.cv_results_['mean_test_score']
stds = clf_random.cv_results_['std_test_score']
for params in clf_random.cv_results_['params']:
    print("%r"% (params)) 

Grid scores on Decision tree's training set:
{'splitter': 'best', 'max_depth': 1, 'criterion': 'entropy'}
{'splitter': 'best', 'max_depth': 3, 'criterion': 'entropy'}
{'splitter': 'random', 'max_depth': 4, 'criterion': 'entropy'}
{'splitter': 'best', 'max_depth': 8, 'criterion': 'gini'}
{'splitter': 'random', 'max_depth': 9, 'criterion': 'gini'}
{'splitter': 'random', 'max_depth': 4, 'criterion': 'gini'}
{'splitter': 'random', 'max_depth': 2, 'criterion': 'gini'}
{'splitter': 'best', 'max_depth': 5, 'criterion': 'entropy'}
{'splitter': 'random', 'max_depth': 8, 'criterion': 'entropy'}
{'splitter': 'best', 'max_depth': 10, 'criterion': 'entropy'}
{'splitter': 'best', 'max_depth': 10, 'criterion': 'gini'}
{'splitter': 'best', 'max_depth': 2, 'criterion': 'entropy'}
{'splitter': 'best', 'max_depth': 9, 'criterion': 'gini'}
{'splitter': 'random', 'max_depth': 6, 'criterion': 'entropy'}
{'splitter': 'best', 'max_depth': 4, 'criterion': 'gini'}
{'splitter': 'best', 'max_depth': 6, 'criterion