In [66]:
from sklearn import datasets, cross_validation, tree, ensemble, grid_search
import numpy as np

In [48]:
# Load the handwritten-digits dataset that ships with scikit-learn.
digits = datasets.load_digits()

In [49]:
# Feature matrix (one row per sample) and the matching class labels.
X = digits['data']
y = digits['target']

In [77]:
# Show the first three samples and then their labels, one array per line.
print(X[:3], y[:3], sep="\n")
# Number of features in a single sample (displayed as the cell result).
len(X[0])

[[  0.   0.   5.  13.   9.   1.   0.   0.   0.   0.  13.  15.  10.  15.
    5.   0.   0.   3.  15.   2.   0.  11.   8.   0.   0.   4.  12.   0.
    0.   8.   8.   0.   0.   5.   8.   0.   0.   9.   8.   0.   0.   4.
   11.   0.   1.  12.   7.   0.   0.   2.  14.   5.  10.  12.   0.   0.
    0.   0.   6.  13.  10.   0.   0.   0.]
 [  0.   0.   0.  12.  13.   5.   0.   0.   0.   0.   0.  11.  16.   9.
    0.   0.   0.   0.   3.  15.  16.   6.   0.   0.   0.   7.  15.  16.
   16.   2.   0.   0.   0.   0.   1.  16.  16.   3.   0.   0.   0.   0.
    1.  16.  16.   6.   0.   0.   0.   0.   1.  16.  16.   6.   0.   0.
    0.   0.   0.  11.  16.  10.   0.   0.]
 [  0.   0.   0.   4.  15.  12.   0.   0.   0.   0.   3.  16.  15.  14.
    0.   0.   0.   0.   8.  13.   8.  16.   0.   0.   0.   0.   1.   6.
   15.  11.   0.   0.   0.   1.   8.  13.  15.   1.   0.   0.   0.   9.
   16.  16.   5.   0.   0.   0.   0.   3.  13.  16.  16.  11.   5.   0.
    0.   0.   0.   3.  11.  16.   9.   0.]]
[0 1 2]

64

In [51]:
# Baseline model: a single decision tree with default hyperparameters.
dtree = tree.DecisionTreeClassifier()

In [52]:
# 10-fold cross-validation of the single tree; `scores` holds one value per fold.
scores = cross_validation.cross_val_score(dtree, X, y=y, cv=10)

In [53]:
# Per-fold scores first, then their mean on the next line.
print(scores, scores.mean(), sep="\n")

[ 0.78918919  0.8579235   0.83977901  0.80555556  0.79888268  0.87150838
  0.89385475  0.79775281  0.83050847  0.8125    ]
0.829745434116


In [54]:
# Bagging ensemble of 100 estimators. No base estimator is passed, so
# scikit-learn falls back to its default (a decision tree).
bagging_classifier = ensemble.BaggingClassifier(n_estimators=100)

In [55]:
# 10-fold cross-validation of the plain bagging ensemble (overwrites `scores`).
scores = cross_validation.cross_val_score(bagging_classifier, X, y=y, cv=10)

In [56]:
# Per-fold scores first, then their mean on the next line.
print(scores, scores.mean(), sep="\n")

[ 0.87027027  0.95628415  0.90055249  0.94444444  0.90502793  0.97765363
  0.96089385  0.91011236  0.87570621  0.93181818]
0.923276352896


In [57]:
bagging_subclasses = ensemble.BaggingClassifier(n_estimators=100, 
                                                max_features=np.sqrt(len(X[0])))

In [58]:
scores = cross_validation.cross_val_score(bagging_classifier, X, y=y, cv=10)

In [46]:
# Per-fold scores first, then their mean on the next line.
print(scores, scores.mean(), sep="\n")

[ 0.89189189  0.96174863  0.92265193  0.93888889  0.92178771  0.98324022
  0.96648045  0.92696629  0.8700565   0.92613636]
0.93098488812


In [59]:
bagging_subclasses = ensemble.BaggingClassifier(base_estimator=dtree, 
                                                n_estimators=100, 
                                                max_features=np.sqrt(len(X[0])))

In [60]:
scores = cross_validation.cross_val_score(bagging_classifier, X, y=y, cv=10)

In [61]:
# Per-fold scores first, then their mean on the next line.
print(scores, scores.mean(), sep="\n")

[ 0.87567568  0.94535519  0.92265193  0.92777778  0.91620112  0.98324022
  0.96648045  0.90449438  0.87570621  0.91477273]
0.923235569011


In [62]:
# 10-fold cross-validation of a random forest with default hyperparameters.
scores = cross_validation.cross_val_score(
    ensemble.RandomForestClassifier(),
    X,
    y=y,
    cv=10,
)

In [63]:
# Per-fold scores first, then their mean on the next line.
print(scores, scores.mean(), sep="\n")

[ 0.88108108  0.93989071  0.88950276  0.93888889  0.92178771  0.97765363
  0.95530726  0.94382022  0.88700565  0.92613636]
0.926107428421


In [64]:
# Random forest with default hyperparameters, kept in a variable so it can
# be inspected and handed to the grid search.
rforest = ensemble.RandomForestClassifier()

In [65]:
# List the names of the forest's hyperparameters (candidates for tuning).
rforest.get_params().keys()

dict_keys(['min_samples_split', 'n_estimators', 'min_weight_fraction_leaf', 'class_weight', 'criterion', 'max_leaf_nodes', 'max_depth', 'bootstrap', 'min_samples_leaf', 'max_features', 'n_jobs', 'random_state', 'verbose', 'oob_score', 'warm_start'])

In [79]:
# Hyperparameter grid for the random forest: forest size, number of
# features considered (sqrt(d) vs. all d), and maximum tree depth.
pgrid = dict(
    n_estimators=[5, 10, 50, 100],
    max_features=[int(np.sqrt(len(X[0]))), len(X[0])],
    max_depth=[5, 10, 50, 100],
)

In [80]:
# Exhaustive search over `pgrid`, scoring each combination by accuracy
# with 10-fold cross-validation.
grid_cv = grid_search.GridSearchCV(
    rforest,
    param_grid=pgrid,
    scoring='accuracy',
    cv=10,
)

In [81]:
%%time
# Run the full grid search; timed because every parameter combination is
# cross-validated, which makes this the slowest cell in the notebook.
grid_cv.fit(X, y)

Wall time: 1min 32s


GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [5, 10, 50, 100], 'max_features': [8, 64], 'n_estimators': [5, 10, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [83]:
# Best mean cross-validated accuracy, then the parameters that produced it.
print(grid_cv.best_score_, grid_cv.best_params_, sep="\n")

0.954368391764
{'max_depth': 50, 'n_estimators': 100, 'max_features': 8}


In [84]:
# The estimator refit with the winning parameter combination
# (available because the search was built with refit=True, the default).
grid_cv.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features=8, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [85]:
# First ten entries of the per-combination results: mean and standard
# deviation of the cross-validation score plus the parameters used.
grid_cv.grid_scores_[:10]

[mean: 0.81302, std: 0.04715, params: {'max_depth': 5, 'n_estimators': 5, 'max_features': 8},
 mean: 0.86366, std: 0.03844, params: {'max_depth': 5, 'n_estimators': 10, 'max_features': 8},
 mean: 0.90206, std: 0.03123, params: {'max_depth': 5, 'n_estimators': 50, 'max_features': 8},
 mean: 0.91263, std: 0.03116, params: {'max_depth': 5, 'n_estimators': 100, 'max_features': 8},
 mean: 0.78798, std: 0.04289, params: {'max_depth': 5, 'n_estimators': 5, 'max_features': 64},
 mean: 0.82026, std: 0.05400, params: {'max_depth': 5, 'n_estimators': 10, 'max_features': 64},
 mean: 0.84252, std: 0.05644, params: {'max_depth': 5, 'n_estimators': 50, 'max_features': 64},
 mean: 0.84697, std: 0.05668, params: {'max_depth': 5, 'n_estimators': 100, 'max_features': 64},
 mean: 0.85531, std: 0.05524, params: {'max_depth': 10, 'n_estimators': 5, 'max_features': 8},
 mean: 0.92432, std: 0.03076, params: {'max_depth': 10, 'n_estimators': 10, 'max_features': 8}]