In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

%load_ext autoreload
%autoreload 2

import ml_project.data as d
import ml_project.helpers as h

In [2]:
ds = d.Dataset(Path("../data/15s.arff"))

In [None]:
import pickle

from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

df = ds.flagged

X = df.drop(columns="cls")
y = df["cls"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=69)

# Random Forest with all its default (hyper)parameters 
clf = RandomForestClassifier(max_depth=100, min_samples_split=5, n_estimators=800)
clf.fit(X_train, y_train)

# Persist the model
with open("rf.pkl", "wb") as f:
    pickle.dump(clf, f, protocol=pickle.HIGHEST_PROTOCOL) # pickle.HIGHEST_PROTOCOL equals 5 (python>=3.8). Using protocol=5 is recommended to reduce memory usage and make it faster to store and load any large NumPy array stored as a fitted attribute in the model. Source: scikit-learn docs.

# Predict and calculate accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"{accuracy=}")
print(classification_report(y_pred, y_test))

'''
accuracy=0.8035988588983981
              precision    recall  f1-score   support

           1       0.78      0.67      0.72       702
           2       0.59      0.70      0.64       195
           3       0.70      0.77      0.74       105
           4       0.68      0.76      0.72        62
           5       0.99      0.99      0.99       687
           6       0.91      0.89      0.90       251
           7       0.80      0.88      0.84       208
           8       0.99      0.98      0.98       594
           9       0.58      0.58      0.58       283
          10       0.90      0.81      0.85       124
          11       0.74      0.77      0.76       475
          12       0.71      0.72      0.71       577
          13       0.63      0.77      0.70       159
          14       0.75      0.62      0.68       135

    accuracy                           0.80      4557
   macro avg       0.77      0.78      0.77      4557
weighted avg       0.81      0.80      0.80      4557
'''

accuracy=0.8084265964450297
              precision    recall  f1-score   support

           1       0.79      0.67      0.73       712
           2       0.59      0.73      0.65       190
           3       0.65      0.77      0.70        98
           4       0.65      0.79      0.71        57
           5       0.99      0.99      0.99       688
           6       0.92      0.90      0.91       251
           7       0.79      0.89      0.84       203
           8       0.99      0.98      0.99       592
           9       0.62      0.61      0.61       285
          10       0.89      0.80      0.84       124
          11       0.75      0.77      0.76       482
          12       0.71      0.72      0.72       578
          13       0.66      0.79      0.72       162
          14       0.76      0.63      0.69       135

    accuracy                           0.81      4557
   macro avg       0.77      0.79      0.78      4557
weighted avg       0.81      0.81      0.81      455

'\naccuracy=0.8035988588983981\n              precision    recall  f1-score   support\n\n           1       0.78      0.67      0.72       702\n           2       0.59      0.70      0.64       195\n           3       0.70      0.77      0.74       105\n           4       0.68      0.76      0.72        62\n           5       0.99      0.99      0.99       687\n           6       0.91      0.89      0.90       251\n           7       0.80      0.88      0.84       208\n           8       0.99      0.98      0.98       594\n           9       0.58      0.58      0.58       283\n          10       0.90      0.81      0.85       124\n          11       0.74      0.77      0.76       475\n          12       0.71      0.72      0.71       577\n          13       0.63      0.77      0.70       159\n          14       0.75      0.62      0.68       135\n\n    accuracy                           0.80      4557\n   macro avg       0.77      0.78      0.77      4557\nweighted avg       0.81      

In [57]:
# Cross-validation results

from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, cv=5)
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[0.61601755 0.69190672 0.67654321 0.69410151 0.51632373]
0.64 accuracy with a standard deviation of 0.07


<b><i>TODO:</i></b><br>
It's interesting that the accuracy drops this much between using a 80-20 train/test split and a cross-validation with 5 folds (meaning, each fold takes 20% of the dataset). 

This might indicate that the dataset is (heavily) unbalanced? What other explanations are there? 

<b><i>TODO:</b></i><br>
For Random Forests, play around with the OOB score, as it enables us to not necessarily require a validation set and skip cross-validation as a whole. 

Source: https://scikit-learn.org/stable/modules/grid_search.html#out-of-bag-estimates

In [None]:
# Grid Search: could theoretically be done, but the best bet is probably doing a randomized search. Because here, how do you define what set of parameters work the best? 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Random parameters selected just to make the point of how Grid Search in scikit-learn works. The results from this search are not relevant. 
param_grid = {
    'n_estimators': [100, 200, 300], # 3
    'max_depth': [10, 2, 1], # 3
    'min_samples_split': [2, 3], #  2
    'min_samples_leaf': [2, 3],
    'bootstrap': [True]
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Estimator:", grid_search.best_estimator_)

Best Parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best Estimator: RandomForestClassifier(max_depth=10, min_samples_leaf=2, n_estimators=200)


In [3]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=23, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


NameError: name 'RandomForestClassifier' is not defined