Overview

This is a notebook that shows you how to tune plankton ML models using the 'tune' class.

This is the first notebook in a set of three:

    - tune.ipynb: tune hyper-parameters to find the best model configuration

    - predict.ipynb: make predictions using the best fitting model

    - post.ipynb: analyse predictions and calculate metrics such as diversity

There are several dependencies that need to be installed prior to running this notebook:

    pandas
    numpy
    scikit-learn
    xgboost
    joblib
    pyyaml
    

Tuned models and scoring are saved using the following directory structure:

    
    /your_base_path/scoring/xgb/sppA_reg.sav
    /your_base_path/scoring/rf/sppA_reg.sav
    /your_base_path/scoring/knn/sppA_reg.sav

    
    /your_base_path/tuning/xgb/sppA_reg.sav


In [1]:
# import required packages
import pandas as pd
import numpy as np
from tune import tune 
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_regression

from yaml import safe_load, load, dump
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper
    

In [8]:
# Shared setup: model configuration, run settings and synthetic training data
# used by every tuning cell in this notebook.

# Scoring metrics and hyper-parameter grids are read from the YAML config file.
# NOTE(review): `load` with the full (C)Loader can construct arbitrary Python
# objects from tagged YAML -- only point this at a config file you trust.
with open('/home/phyto/planktonSDM/model_config.yml', 'r') as f:
    model_config = load(f, Loader=Loader)

# Run settings handed to `tune` in the cells below.
seed = 1       # random seed
n_threads = 2  # how many cpu threads to use
n_spp = 0      # which species to model
cv = 3
verbose = 0
path_out = "/home/phyto/ModelOutput/test/"  # where to save model output

# Synthetic regression problem: 500 samples, 5 features.
X, y = make_regression(n_samples=500, n_features=5, noise=20, random_state=59)

# Min-max scale the target (reshaped to a single column for the scaler) into
# MinMaxScaler's default [0, 1] range, so the values are non-negative.
scaler = MinMaxScaler()
y = scaler.fit_transform(y.reshape(-1, 1))

# Make the distribution exponential; exp(y) - 1 keeps the minimum at 0.
y = np.exp(y) - 1

# Cut the tail: every value at or below 0.5 becomes an exact zero.
y = np.where(y <= 0.5, 0, y)

# Back to a 1-D target vector.
y = np.squeeze(y)

# name y with species name (placeholder from the original cell; no code yet)

{'clf_scoring': {'accuracy': 'balanced_accuracy'}, 'reg_scoring': {'R2': 'r2', 'MAE': 'neg_mean_absolute_error', 'RMSE': 'neg_root_mean_squared_error'}, 'rf_param_grid': {'reg_param_grid': {'regressor__n_estimators': [100], 'regressor__max_features': [2, 3, 4], 'regressor__max_depth': [3, 5], 'regressor__min_samples_leaf': [0.2, 0.5, 0.8], 'regressor__max_samples': [0.2, 0.5, 0.8]}, 'clf_param_grid': {'n_estimators': [100], 'max_features': [2, 3, 4], 'max_depth': [3, 5], 'min_samples_leaf': [0.2, 0.5, 0.8], 'max_samples': [0.5]}}, 'xgb_param_grid': {'clf_param_grid': {'eta': [0.01], 'n_estimators': [10], 'max_depth': [4], 'subsample': [0.6], 'colsample_bytree': [0.6], 'gamma': [1], 'alpha': [1]}, 'reg_param_grid': {'regressor__eta': [0.01], 'regressor__n_estimators': [10], 'regressor__max_depth': [4], 'regressor__subsample': [0.6], 'regressor__colsample_bytree': [0.6], 'regressor__gamma': [1], 'regressor__alpha': [1]}}, 'knn_param_grid': {'clf_param_grid': {'max_samples': [0.5], 'max_f

In [3]:
'''
1-phase Random forest 
'''
# Regressor-only run: regression metrics plus the random-forest regressor grid,
# both taken from the loaded config.
reg_scoring, reg_param_grid = (
    model_config['reg_scoring'],
    model_config['rf_param_grid']['reg_param_grid'],
)

# Despite its name, `XGB` is the method called here with model="rf";
# zir=False means no classifier settings are passed.
m = tune(X, y, seed, n_threads, verbose, cv, path_out)
m.XGB(
    reg_scoring,
    reg_param_grid,
    cv=cv,
    model="rf",
    zir=False,
    log="yes",
)

finished tuning model
reg rRMSE: -0.24979607584108895
reg rMAE: -0.24979607584108895
reg R2: 0.3765433093248478
execution time: 12.329315900802612 seconds


In [4]:
'''
2-phase Random forest 
note: for the 2-phase model we need to define the model configuration for both the classifier and the regressor
'''

# Scoring metrics for the regressor and the classifier.
reg_scoring, clf_scoring = (
    model_config['reg_scoring'],
    model_config['clf_scoring'],
)

# Random-forest hyper-parameter grids for the classifier and the regressor.
clf_param_grid, reg_param_grid = (
    model_config['rf_param_grid']['clf_param_grid'],
    model_config['rf_param_grid']['reg_param_grid'],
)

# zir=True: classifier scoring and grid are supplied alongside the regressor's.
m = tune(X, y, seed, n_threads, verbose, cv, path_out)
m.XGB(
    reg_scoring,
    reg_param_grid,
    clf_scoring=clf_scoring,
    clf_param_grid=clf_param_grid,
    cv=cv,
    model="rf",
    zir=True,
    log="yes",
)

finished tuning model
reg rRMSE: -0.24979607584108895
reg rMAE: -0.24979607584108895
reg R2: 0.3765433093248478
zir rRMSE: -0.24979607584108895
zir rMAE: -0.24979607584108895
zir R2: 0.3765433093248478
execution time: 16.189189434051514 seconds


In [5]:
'''
Testing the impact of log transformation on the 1-phase Random forest 

note: we test both log and no-log by defining log="both"
'''

# Same regressor-only random-forest inputs as the plain 1-phase run.
reg_scoring, reg_param_grid = (
    model_config['reg_scoring'],
    model_config['rf_param_grid']['reg_param_grid'],
)

# Only the `log` argument differs from the 1-phase cell: "both" instead of "yes".
m = tune(X, y, seed, n_threads, verbose, cv, path_out)
m.XGB(
    reg_scoring,
    reg_param_grid,
    cv=cv,
    model="rf",
    zir=False,
    log="both",
)

finished tuning model
reg rRMSE: -0.24817451463290974
reg rMAE: -0.24817451463290974
reg R2: 0.3845985809620281
execution time: 22.195815086364746 seconds


In [6]:
'''
1-phase Gradient boosting with XGBoost:
'''

# Regressor-only run: regression metrics plus the XGBoost regressor grid.
reg_scoring, reg_param_grid = (
    model_config['reg_scoring'],
    model_config['xgb_param_grid']['reg_param_grid'],
)

# model="xgb" selects the gradient-boosting grid; zir=False skips the classifier inputs.
m = tune(X, y, seed, n_threads, verbose, cv, path_out)
m.XGB(
    reg_scoring,
    reg_param_grid,
    cv=cv,
    model="xgb",
    zir=False,
    log="yes",
)

finished tuning model
reg rRMSE: -0.6049248181787253
reg rMAE: -0.6049248181787253
reg R2: -2.6547241748838233
execution time: 0.2293400764465332 seconds


In [9]:
'''
2-phase Gradient boosting with XGBoost:
'''

# Scoring metrics for the regressor and the classifier.
reg_scoring, clf_scoring = (
    model_config['reg_scoring'],
    model_config['clf_scoring'],
)

# XGBoost hyper-parameter grids for the classifier and the regressor.
clf_param_grid, reg_param_grid = (
    model_config['xgb_param_grid']['clf_param_grid'],
    model_config['xgb_param_grid']['reg_param_grid'],
)

# zir=True: classifier scoring and grid are supplied alongside the regressor's.
m = tune(X, y, seed, n_threads, verbose, cv, path_out)
m.XGB(
    reg_scoring,
    reg_param_grid,
    clf_scoring=clf_scoring,
    clf_param_grid=clf_param_grid,
    cv=cv,
    model="xgb",
    zir=True,
    log="yes",
)

finished tuning model
reg rRMSE: -0.6065273458128824
reg rMAE: -0.6065273458128824
reg R2: 0.05456661376035166
zir rRMSE: -0.5274685344635704
zir rMAE: -0.5274685344635704
zir R2: 0.2844541176880262
execution time: 0.59261155128479 seconds


In [10]:
'''
1-phase nearest neighbors with a bagged KNN
note: we need to define the number of bags when running KNN by defining bagging_estimators=30
'''

# Regressor-only run: regression metrics plus the KNN regressor grid.
reg_scoring, reg_param_grid = (
    model_config['reg_scoring'],
    model_config['knn_param_grid']['reg_param_grid'],
)

# The KNN run additionally passes bagging_estimators (30 bags here).
m = tune(X, y, seed, n_threads, verbose, cv, path_out)
m.XGB(
    reg_scoring,
    reg_param_grid,
    cv=cv,
    model="knn",
    zir=False,
    log="yes",
    bagging_estimators=30,
)

finished tuning model
reg rRMSE: -0.44664058917447036
reg rMAE: -0.44664058917447036
reg R2: 0.48736673651578083
execution time: 0.5374126434326172 seconds


In [12]:
'''
2-phase nearest neighbors with a bagged KNN
note: we need to define the number of bags when running KNN by defining bagging_estimators=30
'''

# Scoring metrics for the regressor and the classifier.
reg_scoring, clf_scoring = (
    model_config['reg_scoring'],
    model_config['clf_scoring'],
)

# KNN hyper-parameter grids for the classifier and the regressor.
clf_param_grid, reg_param_grid = (
    model_config['knn_param_grid']['clf_param_grid'],
    model_config['knn_param_grid']['reg_param_grid'],
)

# zir=True with classifier inputs and 30 bags; note this cell passes
# log="both" rather than "yes".
m = tune(X, y, seed, n_threads, verbose, cv, path_out)
m.XGB(
    reg_scoring,
    reg_param_grid,
    clf_scoring=clf_scoring,
    clf_param_grid=clf_param_grid,
    cv=cv,
    model="knn",
    zir=True,
    log="both",
    bagging_estimators=30,
)

finished tuning model
reg rRMSE: -0.48138705366221834
reg rMAE: -0.48138705366221834
reg R2: 0.4045149637539427
zir rRMSE: -0.4574185755799064
zir rMAE: -0.4574185755799064
zir R2: 0.45665439310040346
execution time: 1.6946873664855957 seconds


TO DO:
    
Add print statement for log="both"

Add tau scoring

Add one_hot_encoding