In [1]:
from sklearn.datasets import make_classification
import numpy as np 
import pandas as pd

# Generate a synthetic classification problem: 10,000 samples with 20 features,
# 4 of them informative and none redundant. The fixed random_state makes the
# generated data repeatable.
X, y = make_classification(
    n_samples=10_000,
    n_features=20,
    n_informative=4,
    n_redundant=0,
    random_state=11,
)

# Wrap the feature matrix in a DataFrame and attach the labels as 'target'.
df = pd.DataFrame(X).assign(target=y)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,target
0,0.520216,-0.299523,1.697775,0.152835,-0.071976,0.002353,0.057001,1.656589,0.059377,0.634026,...,0.230848,-2.133668,-0.658056,0.227366,-1.005542,-0.533868,-0.656252,-1.167656,-0.902226,0
1,0.252605,1.432791,1.561181,-1.456888,-0.325153,-1.757407,1.183243,0.931166,0.967256,-1.833468,...,-1.644497,1.259892,1.355751,-1.085283,-1.34722,-0.073796,0.718362,-2.33463,1.531651,0
2,-1.118205,-0.335938,-0.979303,0.188338,-0.346252,-1.263341,-1.037886,-0.870959,2.105311,0.892956,...,0.794894,0.796176,0.193527,-2.070266,-1.183444,-0.231885,1.581976,1.110054,1.610723,1
3,0.334311,1.568198,-0.423843,-0.962124,1.060851,-3.596107,-0.416077,-0.602925,-0.523378,0.834385,...,-0.636568,-2.537476,-0.355572,1.03274,0.195867,-0.227352,-0.332308,0.813405,-1.037039,1
4,-0.803574,-0.573973,2.605967,0.600801,0.823409,0.494084,-0.398244,1.332191,0.273173,1.08931,...,-1.030162,-1.252967,1.109795,-1.197247,-0.681647,-0.78671,0.833898,-0.258752,0.161887,0


In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Candidate forest sizes: 10 evenly spaced tree counts from 200 to 2000.
n_estimators = np.linspace(200, 2000, num=10).astype(int).tolist()

# Candidate depth limits: 10, 20, ..., 110, followed by None.
max_depth = [*np.linspace(10, 110, num=11).astype(int).tolist(), None]

# Hyperparameter grid: every combination of the two lists above is a candidate.
param_grid = {
    'max_depth': max_depth,
    'n_estimators': n_estimators,
}

In [4]:
# Show the grid to confirm the candidate values before launching the search.
param_grid

{'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

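Now I will find the best hyperparameters for the random forest using GridSearchCV. The search is fitted twice: first with the default `n_jobs` (one fit at a time), then with `n_jobs=-1` (fits run in parallel), so the wall-clock times can be compared.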

In [4]:
# Base estimator that the grid search clones for every candidate.
# random_state is fixed so the forest's internal randomness is repeatable;
# without it, repeated runs of the same search can rank candidates differently.
clf = RandomForestClassifier(random_state=11)

In [8]:
# Exhaustive, 3-fold cross-validated search over param_grid. n_jobs is left at
# its default here, so the fits run one at a time.
rf_gridsearch = GridSearchCV(
    clf,
    param_grid,
    cv=3,
    verbose=1,
)

In [9]:
%%time
rf_gridsearch.fit(df.drop('target', axis = 1), df['target'])

Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed: 137.8min finished


CPU times: user 2h 13min 33s, sys: 51.8 s, total: 2h 14min 25s
Wall time: 2h 18min 46s


GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
                                       110, None],
                         'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400,
                                          1600, 1800, 2000]},
             verbose=1)

In [10]:
# Same 3-fold search over param_grid, now with n_jobs=-1 so the fits run in
# parallel. This rebinds rf_gridsearch, replacing the earlier search object.
rf_gridsearch = GridSearchCV(
    clf,
    param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
)

In [11]:
%%time
rf_gridsearch.fit(df.drop('target', axis = 1), df['target'])

Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 31.1min finished


CPU times: user 8.24 s, sys: 717 ms, total: 8.96 s
Wall time: 31min 13s


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
                                       110, None],
                         'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400,
                                          1600, 1800, 2000]},
             verbose=2)