In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
%matplotlib inline

In [25]:
# Load the combined dataset, using the 'date' column as a parsed datetime index.
# infer_datetime_format is intentionally not passed: pandas 2.0 deprecated it
# (strict format inference is now the default), so it only produced a warning.
combined_df = pd.read_csv('combined_binary_df.csv', index_col='date', parse_dates=True)

In [26]:
# Separate the target column ('class') from the remaining feature columns.
y = combined_df['class']
X = combined_df.drop(columns=['class'])

In [27]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the frame is indexed by date and train_test_split shuffles by
# default -- confirm that mixing time periods across train/test is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [28]:
# Hyperparameter grid for the search; every combination of the lists is tried.
# 'random_state' and 'n_jobs' hold a single value each, so they act as fixed
# settings applied to every candidate forest.
search_params = {
    'n_estimators': [5, 10, 20, 30, 50, 100, 300],
    # 1 .. (number of columns - 1): range() excludes its stop value.
    'max_features': list(range(1, X_train.shape[1])),
    'random_state': [1],
    'n_jobs': [1],
    'min_samples_split': [3, 5, 10, 15, 20, 25, 30, 40, 50, 100],
    'max_depth': [3, 5, 10, 15, 20, 25, 30, 40, 50, 100],
}

In [29]:
# Exhaustive grid search over search_params using 3-fold cross-validation.
# n_jobs=-1 lets the search itself run candidate fits in parallel.
gsc = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=search_params,
    cv=3,
    n_jobs=-1,
    verbose=True,
)

In [30]:
# Fit every candidate on the training split; the returned (fitted) search
# object is the cell's displayed output.
gsc.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 4900 candidates, totalling 14700 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [3, 5, 10, 15, 20, 25, 30, 40, 50, 100],
                         'max_features': [1, 2, 3, 4, 5, 6, 7],
                         'min_samples_split': [3, 5, 10, 15, 20, 25, 30, 40, 50,
                                               100],
                         'n_estimators': [5, 10, 20, 30, 50, 100, 300],
                         'n_jobs': [1], 'random_state': [1]},
             verbose=True)

In [31]:
# Display the best estimator found by the search; its repr shows the
# hyperparameter values that were selected.
gsc.best_estimator_

RandomForestClassifier(max_depth=25, max_features=2, min_samples_split=3,
                       n_estimators=5, n_jobs=1, random_state=1)

In [32]:
# Score the best estimator on the held-out test split and report it
# rounded to two decimals.
best_model = gsc.best_estimator_
score = best_model.score(X_test, y_test)
print("%0.2f accuracy" % score)

0.52 accuracy


In [33]:
# Class predictions of the best estimator for the held-out test features;
# used below to build the classification report.
predicted = gsc.best_estimator_.predict(X_test)

In [34]:
from sklearn.metrics import classification_report

In [35]:
# classification_report returns a multi-line string. Print it so the line
# breaks render as a table; a bare expression would display the string's repr
# with literal "\n" escapes instead.
print(classification_report(y_test, predicted))

'              precision    recall  f1-score   support\n\n         0.0       0.47      0.47      0.47       132\n         1.0       0.56      0.56      0.56       158\n\n    accuracy                           0.52       290\n   macro avg       0.52      0.52      0.52       290\nweighted avg       0.52      0.52      0.52       290\n'