In [3]:
import numpy as np
import pandas as pd

%load_ext autoreload
%autoreload 2

import ml_project.data as d
import ml_project.helpers as h

In [4]:
ds = pd.read_csv("../data/filtered.csv")

X = ds.drop(columns="cls")
y = ds["cls"]

X.shape, y.shape

((15146, 26), (15146,))

In [5]:
import pickle

from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

def try_rf(params: dict = "default"):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=12)

    if params == "default":
        print("training with default params")
        clf = RandomForestClassifier()  # default
    else:
        print(f"training rf with custom params: {params}")
        clf = RandomForestClassifier(**params)  # whatever params we passed
    
    clf.fit(X_train, y_train)

    importances = clf.feature_importances_
    print(f"{importances=}")

    # Persist the model (we probably don't need this, as training just takes ~30 seconds)
    with open("rf.pkl", "wb") as f:
        pickle.dump(clf, f, protocol=pickle.HIGHEST_PROTOCOL) # pickle.HIGHEST_PROTOCOL equals 5 (python>=3.8). Using protocol=5 is recommended to reduce memory usage and make it faster to store and load any large NumPy array stored as a fitted attribute in the model. Source: scikit-learn docs.

    # Predict and calculate accuracy
    y_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    # Some stats
    print(f"{test_accuracy=}")
    print(classification_report(y_pred, y_test))

    # Save stats in an sqlite db
    params = ", ".join([f"{key}: {val}" for key, val in clf.get_params().items()])
    report = classification_report(y_pred, y_test)
    h.save_metrics(alg="rf", params=params, report=report)

    y_pred = clf.predict(X_train)
    train_accuracy = accuracy_score(y_pred, y_train)
    print(f"{train_accuracy=}")

try_rf()

training with default params
importances=array([0.03907216, 0.06015807, 0.08496782, 0.03511352, 0.02994906,
       0.05020424, 0.04830738, 0.06065636, 0.05329999, 0.05266511,
       0.07956834, 0.06295916, 0.03659097, 0.0569092 , 0.04516059,
       0.0429289 , 0.0520958 , 0.05056895, 0.00726797, 0.01423971,
       0.01252564, 0.01266288, 0.0087384 , 0.00110021, 0.00066922,
       0.00162038])
test_accuracy=0.8798679867986798
              precision    recall  f1-score   support

           1       0.86      0.87      0.86       508
           2       0.68      0.79      0.73       110
           3       0.78      0.78      0.78        64
           4       0.90      0.90      0.90        31
           5       0.99      1.00      0.99       530
           6       0.83      0.78      0.81        37
           7       0.85      0.95      0.90       151
           8       0.99      0.99      0.99       464
           9       0.73      0.71      0.72       203
          10       0.91      0

In [None]:
# previously best hyperparameters with the old dataset
params = {
  "bootstrap": False, 
  "class_weight": 'balanced', 
  "max_depth": 80,
  "max_features": 'log2', 
  "n_estimators": 400
}

try_rf(params)

training rf with custom params: {'bootstrap': False, 'class_weight': 'balanced', 'max_depth': 80, 'max_features': 'log2', 'n_estimators': 400}
importances=array([0.04285341, 0.06478113, 0.07008177, 0.03599795, 0.03506323,
       0.05124002, 0.05712972, 0.04797067, 0.04193507, 0.05359486,
       0.09132468, 0.06647887, 0.04246704, 0.05714711, 0.04461639,
       0.04338886, 0.04155668, 0.03974977, 0.00957558, 0.01513691,
       0.01614094, 0.01677837, 0.01025914, 0.00181124, 0.00073511,
       0.00218546])
accuracy=0.8894389438943895
              precision    recall  f1-score   support

           1       0.91      0.85      0.88       487
           2       0.77      0.84      0.80       120
           3       0.72      0.93      0.81        60
           4       0.86      0.97      0.91        32
           5       1.00      1.00      1.00       515
           6       0.82      0.80      0.81        35
           7       0.85      0.95      0.90       162
           8       1.00      

In [None]:
# See all models you've trained so far. 
h.retrieve_metrics(alg="rf") 

In [None]:
# Cross-validation results

from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, cv=5)
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[0.72310231 0.83955101 0.82271377 0.78243645 0.62925058]
0.76 accuracy with a standard deviation of 0.08


<b><i>TODO:</i></b><br>
It's interesting that the accuracy drops this much between using a 80-20 train/test split and a cross-validation with 5 folds (meaning, each fold takes 20% of the dataset). 

This might indicate that the dataset is (heavily) unbalanced? What other explanations are there? 

In [None]:
# Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Random parameters selected to make a template of how grid search works in scikit-learn. The results from this search are not relevant or used anywhere. 
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 2, 1], 
    'min_samples_split': [2, 3],
    'min_samples_leaf': [2, 3],
    'bootstrap': [True]
}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=12)

grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Estimator:", grid_search.best_estimator_)

### Finding relevant hyperparameters to do a RandomizedGridSearch on
*Parameters for RandomForestClassifier*

```
sklearn.ensemble.RandomForestClassifier(
  n_estimators=100, # important for obvious reasons
  *, 
  criterion='gini', # keeping this as is is fine in most cases. I'm keeping it unchanged. 
  max_depth=None, # imp
  min_samples_split=2, # imp
  min_samples_leaf=1, # imp
  min_weight_fraction_leaf=0.0, # not relevant in our case
  max_features='sqrt', # imp
  max_leaf_nodes=None, # can be kept to the default, as we're setting the min values before
  min_impurity_decrease=0.0, # okay to leave untouched
  bootstrap=True, # whether the entire dataset is used or not, relevant for OOB scoring and some other hyperparameters
  oob_score=False, # I'm not making use of it. How it might be useful: https://scikit-learn.org/stable/modules/grid_search.html#out-of-bag-estimates 
  n_jobs=None, # could trying setting to -1 if i want all CPU cores, but for now, it's fine 
  random_state=None, 
  verbose=0, # whether info is printed. 0 is fine
  warm_start=False, # not for us 
  class_weight=None, # using bc our dataset is somewhat unbalanced. so why not 
  ccp_alpha=0.0, # not tweaking
  max_samples=None, # keeping as is
  monotonic_cst=None # keeping as is 
)
```

In [17]:
# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameters we're choosing to optimize
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_features = ['log2', 'sqrt'] # not using None bc apparently that's good in larger datasets
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
max_depth.append(None)
min_samples_split = [2, 5, 10, 20, 50]
min_samples_leaf = [1, 2, 5, 10, 20, 50]
bootstrap = [True, False]
class_weight = [None, 'balanced', 'balanced_subsample'] # not typical to optimize this. but as our dataset is somewhat unbalanced, let's see what this brings. 

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'class_weight': class_weight}

print(random_grid)

# Before we start, some feature selection
# Using the VarianceThreshold, as our data has binary features. Source: https://scikit-learn.org/stable/modules/feature_selection.html#removing-features-with-low-variance
from sklearn.feature_selection import VarianceThreshold

# original data
X = ds.drop(columns="cls")
y = ds["cls"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=12)
sel = VarianceThreshold(threshold=(.8 * (1 - .8))) # remove columns where > 80% rows have the same value.

print("original: ", X.shape)

# removes no_fiat and no_biat
X = sel.fit_transform(X) # updated X
print("after VarianceThreshold: ", X.shape)

# see which features got yeeted out
mask = sel.get_support()
removed_features = X_train.columns[~mask]
print("removed features:", removed_features)

# Now we start with the RandomizedGridSearch to find the best hyperparameters
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 120, cv = 5, verbose=2, random_state=12, n_jobs = -1)

# Fit the random search model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=12)
rf_random.fit(X_train, y_train)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_features': ['log2', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'min_samples_split': [2, 5, 10, 20, 50], 'min_samples_leaf': [1, 2, 5, 10, 20, 50], 'bootstrap': [True, False], 'class_weight': [None, 'balanced', 'balanced_subsample']}
original:  (15146, 26)
after VarianceThreshold:  (15146, 24)
removed features: Index(['no_fiat', 'no_biat'], dtype='object')
Fitting 5 folds for each of 120 candidates, totalling 600 fits


0,1,2
,estimator,RandomForestClassifier()
,param_distributions,"{'bootstrap': [True, False], 'class_weight': [None, 'balanced', ...], 'max_depth': [10, 20, ...], 'max_features': ['log2', 'sqrt'], ...}"
,n_iter,120
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,12

0,1,2
,n_estimators,600
,criterion,'gini'
,max_depth,50
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


In [6]:
# trying a test run with the suggested hyperparameters
params = {
  "bootstrap": False, 
  "max_depth": 50, 
  "n_estimators": 600
}

try_rf(params)

training rf with custom params: {'bootstrap': False, 'max_depth': 50, 'n_estimators': 600}
importances=array([0.03911767, 0.06254405, 0.08543776, 0.03351733, 0.0293652 ,
       0.04698297, 0.05118042, 0.06603417, 0.0498227 , 0.05498713,
       0.07910834, 0.06704376, 0.03814702, 0.05619364, 0.04315911,
       0.04680134, 0.04565165, 0.04732231, 0.00723011, 0.01243783,
       0.01290611, 0.0130714 , 0.00841459, 0.0010554 , 0.00070011,
       0.00176789])
test_accuracy=0.8891089108910891
              precision    recall  f1-score   support

           1       0.86      0.87      0.87       508
           2       0.74      0.84      0.79       113
           3       0.78      0.81      0.79        62
           4       0.94      0.91      0.92        32
           5       0.99      0.99      0.99       532
           6       0.86      0.79      0.82        38
           7       0.88      0.96      0.92       156
           8       0.99      0.99      0.99       464
           9       0.7