In [52]:
from pathlib import Path

import numpy as np
import pandas as pd

%load_ext autoreload
%autoreload 2

import ml_project.data as d
import ml_project.helpers as h

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [53]:
ds = pd.read_csv("../data/filtered.csv")

X = ds.drop(columns="cls")
y = ds["cls"]

In [None]:
import pickle

from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

def try_rf():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=69)

    # Random Forest with current "best" hyperparameters 
    clf = RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=80, max_features='log2', n_estimators=400)
    clf.fit(X_train, y_train)

    importances = clf.feature_importances_
    print(f"{importances=}")

    model = SelectFromModel(clf, prefit=True)
    print(f"{model.transform(X)=}") # suggests removing no_biat, as it has an importance close to 0

    # Persist the model (we probably don't need this, as training just takes ~30 seconds)
    with open("rf.pkl", "wb") as f:
        pickle.dump(clf, f, protocol=pickle.HIGHEST_PROTOCOL) # pickle.HIGHEST_PROTOCOL equals 5 (python>=3.8). Using protocol=5 is recommended to reduce memory usage and make it faster to store and load any large NumPy array stored as a fitted attribute in the model. Source: scikit-learn docs.

    # Predict and calculate accuracy
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # some stats
    print(f"{accuracy=}")
    print(classification_report(y_pred, y_test))

    # Save stats in an sqlite db
    params = ", ".join([f"{key}: {val}" for key, val in clf.get_params().items()])
    report = classification_report(y_pred, y_test)
    h.save_metrics(alg="rf", params=params, report=report)

try_rf()

importances=array([0.04779311, 0.07527648, 0.07917351, 0.04284901, 0.03901709,
       0.06359894, 0.06962897, 0.05723611, 0.04901928, 0.08012909,
       0.05045351, 0.07786501, 0.05100361, 0.04537861, 0.04703413,
       0.04668525, 0.01018996, 0.01692735, 0.01815933, 0.0192147 ,
       0.01161244, 0.00175451])
X_new.shape=(15146, 13)
accuracy=0.8698177977290732
              precision    recall  f1-score   support

           1       0.87      0.84      0.86       574
           2       0.75      0.75      0.75       165
           3       0.70      0.85      0.77        75
           4       0.84      0.97      0.90        38
           5       0.99      1.00      0.99       659
           6       0.76      0.74      0.75        47
           7       0.82      0.90      0.86       193
           8       0.99      0.99      0.99       582
           9       0.68      0.65      0.66       257
          10       0.93      0.97      0.95       105
          11       0.75      0.76      0.

In [None]:
# See all models you've trained so far. 
h.retrieve_metrics(alg="rf") 

In [6]:
# Cross-validation results

from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, cv=5)
print(scores)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[0.71683168 0.81842192 0.80620667 0.76361836 0.63387257]
0.75 accuracy with a standard deviation of 0.07


<b><i>TODO:</i></b><br>
It's interesting that the accuracy drops this much between using a 80-20 train/test split and a cross-validation with 5 folds (meaning, each fold takes 20% of the dataset). 

This might indicate that the dataset is (heavily) unbalanced? What other explanations are there? 

In [None]:
# Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Random parameters selected just to make a template of how grid search works in scikit-learn. The results from this search are not relevant. 
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 2, 1], 
    'min_samples_split': [2, 3],
    'min_samples_leaf': [2, 3],
    'bootstrap': [True]
}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=69)

grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Estimator:", grid_search.best_estimator_)

### Finding relevant hyperparameters to do a RandomizedGridSearch on
*Parameters for RandomForestClassifier*

```
sklearn.ensemble.RandomForestClassifier(
  n_estimators=100, # important for obvious reasons
  *, 
  criterion='gini', # keeping this as is is fine in most cases. I'm keeping it unchanged. 
  max_depth=None, # imp
  min_samples_split=2, # imp
  min_samples_leaf=1, # imp
  min_weight_fraction_leaf=0.0, # not relevant in our case
  max_features='sqrt', # imp
  max_leaf_nodes=None, # can be kept to the default, as we're setting the min values before
  min_impurity_decrease=0.0, # okay to leave untouched
  bootstrap=True, # whether the entire dataset is used or not, relevant for OOB scoring and some other hyperparameters
  oob_score=False, # I'm not making use of it. How it might be useful: https://scikit-learn.org/stable/modules/grid_search.html#out-of-bag-estimates 
  n_jobs=None, # could trying setting to -1 if i want all CPU cores, but for now, it's fine 
  random_state=None, 
  verbose=0, # whether info is printed. 0 is fine
  warm_start=False, # not for us 
  class_weight=None, # using bc our dataset is somewhat unbalanced. so why not 
  ccp_alpha=0.0, # not tweaking
  max_samples=None, # keeping as is
  monotonic_cst=None # keeping as is 
)
```

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameters we're choosing to optimize
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_features = ['log2', 'sqrt'] # not using None bc apparently that's good in larger datasets
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
max_depth.append(None)
min_samples_split = [2, 5, 10, 20, 50]
min_samples_leaf = [1, 2, 5, 10]
bootstrap = [True, False]
class_weight = [None, 'balanced', 'balanced_subsample'] # as our dataset is somewhat unbalanced. let's see what this brings. 

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'class_weight': class_weight}

print(random_grid)

# Use the random grid to search for best hyperparameters
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 60, cv = 5, verbose=2, random_state=23, n_jobs = -1)

# Fit the random search model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=69)
rf_random.fit(X_train, y_train)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_features': ['log2', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'min_samples_split': [2, 5, 10, 20, 50], 'min_samples_leaf': [1, 2, 5, 10], 'bootstrap': [True, False], 'class_weight': [None, 'balanced', 'balanced_subsample']}
Fitting 5 folds for each of 60 candidates, totalling 300 fits


0,1,2
,estimator,RandomForestClassifier()
,param_distributions,"{'bootstrap': [True, False], 'class_weight': [None, 'balanced', ...], 'max_depth': [10, 20, ...], 'max_features': ['log2', 'sqrt'], ...}"
,n_iter,60
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,23

0,1,2
,n_estimators,400
,criterion,'gini'
,max_depth,80
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'log2'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


**Note**<br>
I will not use Scaling for the RandomForestClassifier, as its invariant to scaling, as it splits nodes based on feature thresholds and not distance or vector norms. What we could do is doing a further grid search based off of the values received from the randomized_grid_search, but I doubt it'll improve anything drastically. Might still be worth looking into.

Maybe try making a pipeline? But for what. <br>
https://scikit-learn.org/stable/modules/feature_selection.html#feature-selection-as-part-of-a-pipeline

In [None]:
# Chat, are we onto something? Source: https://scikit-learn.org/stable/modules/feature_selection.html#removing-features-with-low-variance

X = ds.drop(columns="cls")
y = ds["cls"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=69)

from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))

# removing features with low variance. 
# removes no_fiat and no_biat, and a new training can be done using our new X. doesn't increase accuracy by much, though. 
print(X.shape)
X = sel.fit_transform(X)
print(X.shape)

try_rf()

(15146, 24)
(15146, 22)


KeyboardInterrupt: 