Overview

This is a notebook that shows you how to tune plankton ML models using the 'tune' class.

This is the first notebook in a set of three:

    - tune.ipynb: tune hyper-parameters to find the best model configuration

    - predict.ipynb: make predictions using the best fitting model

    - post.ipynb: analyse predictions and calculate metrics such as diversity

There are several dependencies that need to be install prior to running this notebook:

    pandas
    numpy
    scikit-learn
    xgboost
    joblib
    

Tuned models and scoring are saved using the following directory structure:

    
    /your_base_path/scoring/xgb/sppA_reg.sav
    /your_base_path/scoring/rf/sppA_reg.sav
    /your_base_path/scoring/rf/sppA_reg.sav

    
    /your_base_path/tuning/xgb/sppA_reg.sav


In [1]:
# import required packages
import pandas as pd
import numpy as np
from tune import tune 

In [2]:
'''
Setting up the model framework.

Here we define the training data and model config within Python, but you could import your own training data 
from a .csv and load your model_config from a YAML.
'''

seed = 1 #random seed
n_threads = 2 # how many cpu threads to use
n_spp = 0 # which species to model
path_out = "/home/phyto/ModelOutput/test/" #where to save model output

d = pd.DataFrame({"mld":[50, 100, 120, 50, 50, 100, 120, 50, 200], 
                    "temperature":[25, 20, 15, 45, 25, 20, 15, 45, 10],
                    "Emiliania huxleyi":[300000, 100000, 0, 6000, 9000, 5000, 3000, 0, 0],
                    "Coccolithus pelagicus":[50000, 30000, 500, 800, 900, 0, 1000, 5000, 0]
})

model_config =  {

    "X_vars" : ["mld", "temperature"],

    "species" : ["Emiliania huxleyi", "Coccolithus pelagicus"],

    "reg_scoring" : {
                "R2":"r2",
                "MAE": "neg_mean_absolute_error", 
                "RMSE":"neg_root_mean_squared_error",
                },

    "clf_scoring" : {
                "accuracy": "balanced_accuracy",
                },


    "rf_param_grid":{

        "reg_param_grid" : {
                    'regressor__n_estimators': [100],
                    'regressor__max_features': [5],
                    'regressor__max_depth': [5],
                    'regressor__min_samples_leaf': [0.5],
                    'regressor__max_samples':[0.5]       
                    },

        "clf_param_grid" : {'n_estimators': [100],
                    'max_features': [5],
                    'max_depth': [5],
                    'min_samples_leaf': [0.5],
                    'max_samples': [0.5]
                    }

    },

    "xgb_param_grid":{

        "clf_param_grid" : {    
                'eta':[0.01],       
                'n_estimators': [10],
                'max_depth': [4],
                'subsample': [0.6],  
                'colsample_bytree': [0.6],  
                'gamma':[1],     
                'alpha':[1]   
                },     

        "reg_param_grid" : {
                        'regressor__eta': [0.01],
                        'regressor__n_estimators': [10],
                        'regressor__max_depth': [4],
                        'regressor__colsample_bytree': [0.6],               
                        'regressor__subsample': [0.6],          
                        'regressor__gamma': [1],
                        'regressor__alpha': [1]
                        },

    },

    "knn_param_grid":{

        "clf_param_grid" : {'max_samples': [0.5],
                        'max_features': [0.5],
                        'estimator__leaf_size': [30],
                        'estimator__n_neighbors': [3],
                        'estimator__p':  [1],           
                        'estimator__weights': ["uniform"],                                       
                        },


        "reg_param_grid" : {'regressor__max_samples': [0.5],
                        'regressor__max_features': [0.5],
                        'regressor__estimator__leaf_size': [30],
                        'regressor__estimator__n_neighbors': [3],
                        'regressor__estimator__p':  [1],
                        'regressor__estimator__weights': ["uniform"],                           
                        }
    }

}

X_vars = model_config["X_vars"] 
species = model_config["species"][n_spp]
cv = 3
verbose = 0

In [3]:
'''
1-phase Random forest 
'''
reg_scoring = model_config['reg_scoring']
reg_param_grid = model_config['rf_param_grid']['reg_param_grid']

m = tune(d, X_vars, species, seed, n_threads, verbose, cv, path_out, hot_encode=False)
m.XGB(reg_scoring, reg_param_grid, cv=cv, model="rf", zir=False, log="yes")

finished tuning model
zir rRMSE: -1.3654907602127317
zir rMAE: -1.3654907602127317
zir R2: -6.913892048232192
execution time: 2.217594623565674 seconds


In [4]:
'''
2-phase Random forest 
note: for the 2-phase model we need to define the model configuration for both the classifier and the regressor
'''

reg_scoring = model_config['reg_scoring']
clf_scoring = model_config['clf_scoring']

clf_param_grid = model_config['rf_param_grid']['clf_param_grid']
reg_param_grid = model_config['rf_param_grid']['reg_param_grid']

m = tune(d, X_vars, species, seed, n_threads, verbose, cv, path_out, hot_encode=False)
m.XGB(reg_scoring, reg_param_grid, clf_scoring = clf_scoring, clf_param_grid = clf_param_grid, 
      cv=cv, model="rf", zir=True, log="yes")

finished tuning model
zir rRMSE: -1.366898625553064
zir rMAE: -1.366898625553064
zir R2: -7.2245129036588205
execution time: 5.151210069656372 seconds


In [5]:
'''
Testing the impact of log transformation on the 1-phase Random forest 

note: we test both log and no-log by defining log="both"
'''

reg_scoring = model_config['reg_scoring']
reg_param_grid = model_config['rf_param_grid']['reg_param_grid']

m = tune(d, X_vars, species, seed, n_threads, verbose, cv, path_out, hot_encode=False)
m.XGB(reg_scoring, reg_param_grid, cv=cv, model="rf", zir=False, log="both")

finished tuning model
zir rRMSE: -1.3654907602127317
zir rMAE: -1.3654907602127317
zir R2: -6.913892048232192
execution time: 3.7091381549835205 seconds


In [6]:
'''
1-phase Gradient boosting with XGBoost:
'''

reg_scoring = model_config['reg_scoring']
reg_param_grid = model_config['rf_param_grid']['reg_param_grid']

m = tune(d, X_vars, species, seed, n_threads, verbose, cv, path_out, hot_encode=False)
m.XGB(reg_scoring, reg_param_grid, cv=cv, model="rf", zir=False, log="yes")

finished tuning model
zir rRMSE: -1.3654907602127317
zir rMAE: -1.3654907602127317
zir R2: -6.913892048232192
execution time: 2.3106985092163086 seconds


In [7]:
'''
2-phase Gradient boosting with XGBoost:
'''

reg_scoring = model_config['reg_scoring']
clf_scoring = model_config['clf_scoring']

clf_param_grid = model_config['xgb_param_grid']['clf_param_grid']
reg_param_grid = model_config['xgb_param_grid']['reg_param_grid']

m = tune(d, X_vars, species, seed, n_threads, verbose, cv, path_out, hot_encode=False)
m.XGB(reg_scoring, reg_param_grid, clf_scoring = clf_scoring, clf_param_grid = clf_param_grid,
      cv=cv, model="xgb", zir=True, log="yes")

finished tuning model
zir rRMSE: -1.3559173585491615
zir rMAE: -1.3559173585491615
zir R2: -5.67547992661585
execution time: 1.121462106704712 seconds


In [8]:
'''
1-phase nearest neighbors with a bagged KNN
note: we need to define the number of bags when running KNN by defining bagging_estimators=30
'''

reg_scoring = model_config['reg_scoring']
reg_param_grid = model_config['knn_param_grid']['reg_param_grid']

m = tune(d, X_vars, species, seed, n_threads, verbose, cv, path_out, hot_encode=False)
m.XGB(reg_scoring, reg_param_grid, cv=cv, model="knn", zir=False, log="yes", bagging_estimators=30)

finished tuning model
zir rRMSE: -1.3920747906582163
zir rMAE: -1.3920747906582163
zir R2: -13.9488704843248
execution time: 1.3500545024871826 seconds


In [9]:
'''
2-phase nearest neighbors with a bagged KNN
note: we need to define the number of bags when running KNN by defining bagging_estimators=30
'''

reg_scoring = model_config['reg_scoring']
clf_scoring = model_config['clf_scoring']

clf_param_grid = model_config['knn_param_grid']['clf_param_grid']
reg_param_grid = model_config['knn_param_grid']['reg_param_grid']

m = tune(d, X_vars, species, seed, n_threads, verbose, cv, path_out, hot_encode=False)
m.XGB(reg_scoring, reg_param_grid,  clf_scoring = clf_scoring, clf_param_grid = clf_param_grid,  
      cv=cv, model="knn", zir=True, log="yes", bagging_estimators=30)

finished tuning model
zir rRMSE: -1.3594291344985059
zir rMAE: -1.3594291344985059
zir R2: -6.184075971103664
execution time: 3.033029556274414 seconds


In [None]:
'''
TO DO:
    
Add print statement for log="both"

Add tau scoring

Add one_hot_encoding

'''