In [1]:
import sklearn.model_selection as cv
import sklearn.ensemble        as ensem
from sklearn.datasets import make_classification

# 1. Dataset

In [2]:
# Synthetic binary classification problem: 500 samples, 50 features of which
# 20 are informative, and 10 clusters per class. The seed is fixed so the
# same dataset is generated on every run.
X_all, y_all = make_classification(
    n_samples=500,
    n_features=50,
    n_informative=20,
    n_classes=2,
    n_clusters_per_class=10,
    random_state=42,
)

In [3]:
# Hold out a test split with a fixed seed. No test_size is given, so
# scikit-learn's default split proportion applies.
X, X_test, y, y_test = cv.train_test_split(X_all, y_all, random_state=42)

# 2. Model

In [40]:
from randomForestCV import RandomForestClassifierCV, GridSearch

# 3. Fit

In [6]:
# Hyper-parameter grid for the custom search: 3 x 2 x 2 x 3 = 36 combinations
# (criterion and class_weight are held fixed at a single value each).
param_grid = dict(
    criterion=['entropy'],
    max_depth=[None, 1, 3],
    min_samples_leaf=[1, 5],
    max_features=[5, 'sqrt'],
    max_leaf_nodes=[None, 10, 50],
    class_weight=[None],
)



# Wrap the project-local forest estimator (default constructor arguments) in
# the project-local GridSearch over param_grid.
# NOTE(review): no random_state is passed to the estimator, so the selected
# parameters and the reported score may vary between runs; confirm whether
# that is intended.
model = GridSearch(RandomForestClassifierCV(), param_grid)

In [7]:
%%time
# Run the custom grid search on the training split (X, y); %%time reports the
# cost of the whole search. The cell magic must remain the first line.
model.fit(X, y)

CPU times: user 57.3 s, sys: 651 ms, total: 57.9 s
Wall time: 58.8 s


RandomForestClassifierCV(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_jobs=None, oob_score=False, random_state=None,
                         verbose=0, warm_start=False)

In [8]:
# Display the parameter combination stored by the project-local GridSearch
# after fitting (note: attribute is `best_params`, without the trailing
# underscore used by scikit-learn's GridSearchCV).
model.best_params

{'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': 50,
 'min_samples_leaf': 1}

In [9]:
# Evaluate the fitted search on the held-out split.
# NOTE(review): the metric is whatever GridSearch.score implements;
# presumably classification accuracy, confirm in randomForestCV.
model.score(X_test, y_test)

0.56

# 4. Testing/Profiling

In [35]:
# Profiling grid: ten forest sizes (100, 200, ..., 1000) crossed with two
# depth settings; every other hyper-parameter is pinned to a single value,
# giving 10 x 2 = 20 combinations.
param_grid = dict(
    n_estimators=list(range(100, 1001, 100)),
    criterion=['entropy'],
    max_depth=[None, 5],
    min_samples_leaf=[1],
    max_features=['sqrt'],
    max_leaf_nodes=[None],
    class_weight=[None],
)

In [11]:
# Baseline: scikit-learn's own grid search over a stock random forest,
# using the profiling grid defined in the previous cell.
model = cv.GridSearchCV(
    estimator=ensem.RandomForestClassifier(),
    param_grid=param_grid,
)

In [12]:
%%time
# Time the baseline scikit-learn grid search on the training split; this is
# the reference figure for the comparisons in the following cells.
model.fit(X, y)

CPU times: user 2min 28s, sys: 2.07 s, total: 2min 30s
Wall time: 2min 44s


GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'class_weight': [None], 'criterion': ['entropy'],
                         'max_depth': [None, 5], 'max_features': ['sqrt'],
                         'max_leaf_nodes': [None], 'min_samples_leaf': [1],
                         'n_estimators': [100, 200, 300, 400, 500, 600, 700,
                                          800, 900, 1000]})

In [13]:
%%time
model = cv.GridSearchCV(ensem.RandomForestClassifier(warm_start=True), param_grid)
model.fit(X, y)

CPU times: user 2min 15s, sys: 1.05 s, total: 2min 16s
Wall time: 2min 18s


GridSearchCV(estimator=RandomForestClassifier(warm_start=True),
             param_grid={'class_weight': [None], 'criterion': ['entropy'],
                         'max_depth': [None, 5], 'max_features': ['sqrt'],
                         'max_leaf_nodes': [None], 'min_samples_leaf': [1],
                         'n_estimators': [100, 200, 300, 400, 500, 600, 700,
                                          800, 900, 1000]})

In [36]:
# Split the forest-size ladder out of the grid so it can be handed to
# RandomForestClassifierCV separately.
#
# The guard makes the cell idempotent: re-running it after 'n_estimators' has
# already been removed is a no-op instead of a KeyError. param_grid is rebound
# to a filtered copy rather than edited with `del`, so the dict object that
# the previously fitted GridSearchCV was constructed with stays intact.
if 'n_estimators' in param_grid:
    n_estimators_range = list(param_grid['n_estimators'])
    param_grid = {
        name: values
        for name, values in param_grid.items()
        if name != 'n_estimators'
    }

In [38]:
%%time
# Custom search: the forest sizes are passed to RandomForestClassifierCV via
# n_estimators_range instead of being a grid dimension, and param_grid no
# longer contains 'n_estimators'.
# NOTE(review): presumably the estimator evaluates every size in the range
# from one incrementally grown forest, which would explain the much shorter
# wall time than the GridSearchCV runs above; confirm in randomForestCV.
model = GridSearch(RandomForestClassifierCV(n_estimators_range = n_estimators_range), param_grid)
model.fit(X, y)

CPU times: user 13 s, sys: 236 ms, total: 13.2 s
Wall time: 14.5 s


RandomForestClassifierCV(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_estimators_range=[100, 200, 300, 400, 500, 600, 700,
                                             800, 900, 1000],
                         n_jobs=None, oob_score=False, random_state=None,
                         verbose=0, warm_start=False)