In [1]:
import numpy as np

from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

In [2]:
# get some data
digits = load_digits()
X, y = digits.data, digits.target

In [3]:
# build a classifier
clf = RandomForestClassifier(n_estimators=20)

In [4]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [32]:
# specify parameters and distributions to sample from
param_dist = {"max_depth": sp_randint(1, 101),
              "max_features": [None, 'auto', 'sqrt', 'log2'],
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [31]:
# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, scoring='f1_weighted')

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

TypeError: object of type 'int' has no len()

In [18]:
top_scores = sorted(random_search.grid_scores_, key=itemgetter(1), reverse=True)[:3][0]
print random_search.grid_scores_[0]
# clf = RandomForestClassifier(n_estimators=20, kwargs=)

mean: 0.78130, std: 0.00826, params: {'bootstrap': False, 'min_samples_leaf': 3, 'min_samples_split': 4, 'criterion': 'gini', 'max_features': 10, 'max_depth': 3}


In [26]:
clf = random_search.best_estimator_

In [24]:
clf.fit(X, y)

RandomForestClassifier(bootstrap=False, class_weight=None,
            criterion='entropy', max_depth=None, max_features=4,
            max_leaf_nodes=None, min_samples_leaf=9, min_samples_split=6,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [27]:
clf.predict_proba(X)

array([[  9.85602837e-01,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   3.54609929e-04,   9.16666667e-03],
       [  0.00000000e+00,   7.35366774e-01,   3.04314204e-02, ...,
          3.55092593e-02,   1.13046673e-01,   1.05092593e-02],
       [  9.72222222e-03,   1.07586233e-01,   5.79600538e-01, ...,
          1.56250000e-02,   1.77108100e-01,   1.54166667e-02],
       ..., 
       [  0.00000000e+00,   1.21489899e-01,   1.80822650e-02, ...,
          1.58173077e-02,   6.34445873e-01,   3.65384615e-02],
       [  3.33333333e-02,   1.89850427e-02,   2.78409091e-02, ...,
          3.33333333e-03,   1.64608375e-01,   6.67968312e-01],
       [  0.00000000e+00,   2.12337662e-02,   1.92456294e-01, ...,
          1.11111111e-02,   4.53387030e-01,   1.19966422e-01]])