In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [2]:
from sklearn.datasets import load_digits

In [3]:
data = load_digits()

In [8]:
X_train = data['data']
y_train = data['target']

In [12]:
tuned_parameters = {'n_estimators':[10,50,100,300]}
scores = ['precision','recall']
rf = RandomForestClassifier()


In [13]:
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    #y_true, y_pred = y_test, clf.predict(X_test)
    #print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'n_estimators': 100}

Grid scores on development set:

0.911 (+/-0.053) for {'n_estimators': 10}
0.939 (+/-0.045) for {'n_estimators': 50}
0.943 (+/-0.030) for {'n_estimators': 100}
0.942 (+/-0.035) for {'n_estimators': 300}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.


# Tuning hyper-parameters for recall

Best parameters set found on development set:

{'n_estimators': 300}

Grid scores on development set:

0.898 (+/-0.062) for {'n_estimators': 10}
0.930 (+/-0.058) for {'n_estimators': 50}
0.933 (+/-0.045) for {'n_estimators': 100}
0.939 (+/-0.038) for {'n_estimators': 300}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.




In [14]:
tuned_parameters = {'n_estimators':[10,50,100,300], 'criterion':['gini','entropy'], 'max_depth':[5,10,15], 'min_samples_split':[2,4,9]}
scores = ['precision','recall']

In [16]:
%%time
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    #y_true, y_pred = y_test, clf.predict(X_test)
    #print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 300}

Grid scores on development set:

0.848 (+/-0.108) for {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 10}
0.901 (+/-0.067) for {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50}
0.902 (+/-0.057) for {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 100}
0.909 (+/-0.052) for {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 300}
0.852 (+/-0.074) for {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 4, 'n_estimators': 10}
0.897 (+/-0.051) for {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 4, 'n_estimators': 50}
0.901 (+/-0.057) for {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 4, 'n_estimators': 100}
0.904 (+/-0.051) for {'criterion': 'gini', 'max_depth': 5, 'min_sam

In [17]:
from sklearn.model_selection import RandomizedSearchCV

In [20]:
%%time
tuned_parameters = {'n_estimators':[10,50,100,300], 'criterion':['gini','entropy'], 'max_depth':[5,10,15], 'min_samples_split':[2,4,9]}
scores = ['precision','recall']
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = RandomizedSearchCV(RandomForestClassifier(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score, n_iter=10)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    #y_true, y_pred = y_test, clf.predict(X_test)
    #print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'n_estimators': 300, 'min_samples_split': 2, 'max_depth': 15, 'criterion': 'gini'}

Grid scores on development set:

0.939 (+/-0.045) for {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': 10, 'criterion': 'entropy'}
0.859 (+/-0.063) for {'n_estimators': 10, 'min_samples_split': 4, 'max_depth': 5, 'criterion': 'entropy'}
0.940 (+/-0.037) for {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': 15, 'criterion': 'entropy'}
0.899 (+/-0.059) for {'n_estimators': 10, 'min_samples_split': 9, 'max_depth': 10, 'criterion': 'entropy'}
0.934 (+/-0.039) for {'n_estimators': 50, 'min_samples_split': 4, 'max_depth': 10, 'criterion': 'entropy'}
0.900 (+/-0.055) for {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': 5, 'criterion': 'gini'}
0.901 (+/-0.056) for {'n_estimators': 50, 'min_samples_split': 2, 'max_depth': 5, 'criterion': 'gini'}
0.943 (+/-0.033) for {'n_estimators': 300, 'min_sam