In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter 
%matplotlib inline

In [2]:
combined_df = pd.read_csv('combined_df.csv', index_col='date', infer_datetime_format=True, parse_dates=True)

In [3]:
X = combined_df.drop(columns = 'class')
y = combined_df['class']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [7]:
# resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN
X_resampled, y_resampled = SMOTEENN(random_state=1).fit_resample(X_train, y_train)
# view the count of target classes with Counter
Counter(y_resampled)

Counter({-1: 437, 0: 330, 1: 528})

In [8]:
# set parameters to compare
search_params = {
    'n_estimators': [5, 10, 20, 30, 50, 100, 300],
    'max_features': [i for i in range(1,X_train.shape[1])],
    'random_state': [1],
    'n_jobs': [1],
    'min_samples_split': [3, 5, 10, 15, 20, 25, 30, 40, 50, 100],
    'max_depth': [3, 5, 10, 15, 20, 25, 30, 40, 50, 100]
}

In [9]:
# define gridsearch
gsc = GridSearchCV(
    RandomForestClassifier(),
    search_params,
    cv = 3,
    n_jobs = -1,
    verbose=True)

In [16]:
gsc.fit(X_resampled, y_resampled)

Fitting 3 folds for each of 4900 candidates, totalling 14700 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [3, 5, 10, 15, 20, 25, 30, 40, 50, 100],
                         'max_features': [1, 2, 3, 4, 5, 6, 7],
                         'min_samples_split': [3, 5, 10, 15, 20, 25, 30, 40, 50,
                                               100],
                         'n_estimators': [5, 10, 20, 30, 50, 100, 300],
                         'n_jobs': [1], 'random_state': [1]},
             verbose=True)

In [17]:
# show best parameters
gsc.best_estimator_

RandomForestClassifier(max_depth=15, max_features=2, min_samples_split=3,
                       n_estimators=50, n_jobs=1, random_state=1)

In [18]:
# show score
score = gsc.best_estimator_.score(X_test, y_test)
print("%0.2f accuracy" % (score))

0.82 accuracy


In [19]:
predicted = gsc.best_estimator_.predict(X_test)

In [21]:
classification_report(y_test, predicted)

'              precision    recall  f1-score   support\n\n          -1       0.12      0.40      0.19         5\n           0       0.97      0.84      0.90       277\n           1       0.09      0.38      0.14         8\n\n    accuracy                           0.82       290\n   macro avg       0.40      0.54      0.41       290\nweighted avg       0.94      0.82      0.87       290\n'