In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification


In [12]:
#extract data and create train/test subsets

train = pd.read_csv('train.csv')
X = train.iloc[:,2:]
species = train.iloc[:,1]
y = pd.factorize(species)[0]


xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2, random_state = 0)



In [19]:
#search for best parameters using random search

rf = RandomForestClassifier()

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter=100, cv=3, verbose=2, random_state=0, n_jobs=-1)
rf_random.fit(xTrain, yTrain)

print (rf_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 18.0min finished


{'n_estimators': 2000, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 90, 'bootstrap': True}


In [28]:
#conduct grid search with cross validation based on results from random search

param_grid = {
    'n_estimators': [1000, 1500, 2000, 2500],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [80,90,100],
    'bootstrap': [True],
    'min_samples_split': [2,3,5],
    'min_samples_leaf': [1,2]
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,cv=3,n_jobs=-1, verbose=10)
grid_search.fit(xTrain,yTrain)

best_grid = grid_search.best_estimator_
print(best_grid)
best_grid.fit(xTrain,yTrain)
pred = best_grid.predict(xTest)
grid_accuracy = accuracy_score(pred,yTest)
print('Accuracy = ', grid_accuracy)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   37.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed: 18

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=80, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.9494949494949495


In [27]:
best_param = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=80, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=None,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

best_param1 = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=80, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

best_param2=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=80, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

best_param.fit(xTrain,yTrain)
pred = best_param.predict(xTest)
accuracy = accuracy_score(pred, yTest)
best_param1.fit(xTrain,yTrain)
pred1 = best_param1.predict(xTest)
accuracy1 = accuracy_score(pred1, yTest)
best_param2.fit(xTrain,yTrain)
pred2 = best_param2.predict(xTest)
accuracy2 = accuracy_score(pred2, yTest)

print(accuracy)
print(accuracy1)
print(accuracy1)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1200 out of 1200 | elapsed:   12.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1200 out of 1200 | elapsed:    0.3s finished


0.9545454545454546
0.9545454545454546
0.9545454545454546


In [17]:
default_param = RandomForestClassifier()
default_param.fit(xTrain,yTrain)
def_pred = default_param.predict(xTest)
def_accuracy = accuracy_score(def_pred,yTest)
print(def_accuracy)

0.803030303030303


