In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

%load_ext autoreload
%autoreload 2

import ml_project.data as d
import ml_project.helpers as h

In [2]:
ds = d.Dataset(Path("../data/15s.arff"))

In [None]:
import pickle
from pickle import dump

from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def try_rf(df):
    X = df.drop(columns="cls")
    y = df["cls"]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

    # Random Forest with all its default (hyper)parameters 
    clf = RandomForestClassifier(
        n_estimators=200,
        criterion='entropy',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features='sqrt',
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        bootstrap=True,
        oob_score=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
        warm_start=False,
        class_weight=None,
        ccp_alpha=0.0,
        max_samples=None,
        monotonic_cst=None
    )
    clf.fit(X_train, y_train)

    # Persist the model
    with open("rf.pkl", "wb") as f:
        dump(clf, f, protocol=pickle.HIGHEST_PROTOCOL) # pickle.HIGHEST_PROTOCOL equals 5 as of python 3.8 and later. Using protocol=5 is recommended to reduce memory usage and make it faster to store and load any large NumPy array stored as a fitted attribute in the model. Source: scikit-learn docs.

    # Predict and calculate accuracy
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print("Test accuracy:", accuracy)

[try_rf(ds.flagged) for _ in range(1)]

Test accuracy: 0.8115743280307186


[None]

In [48]:
# Cross-validation results

from sklearn.model_selection import cross_val_score

df = ds.flagged
X = df.drop(columns="cls")
y = df["cls"]

clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, cv=5)
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[0.61930883 0.69657064 0.67901235 0.68916324 0.5138546 ]
0.64 accuracy with a standard deviation of 0.07


<b><i>TODO:</i></b><br>
It's interesting that the accuracy drops this much between using a 80-20 train/test split and a cross-validation with 5 folds (meaning, each fold takes 20% of the dataset). 

This might indicate that the dataset is (heavily) unbalanced? What other explanations are there? 

<b><i>TODO:</b></i><br>
For Random Forests, play around with the OOB score, as it enables us to not necessarily require a validation set and skip cross-validation as a whole. 

Source: https://scikit-learn.org/stable/modules/grid_search.html#out-of-bag-estimates