In [1]:
# import required packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_regression
import sys

from yaml import load
from yaml import CLoader as Loader

# Put the project's functions directory at the front of the import search path so
# that the two project-local imports below resolve.
# NOTE(review): this is an absolute, machine-specific path; it must be adjusted
# when the notebook is run from a different location.
sys.path.insert(0, '/home/phyto/planktonSDM/functions/')
from tune import tune 
from functions import example_data

In [2]:
# Model setup: read the configuration, build the example data, create the tuner.

# Read the example model configuration from its YAML file.
# NOTE(review): `Loader` is yaml.CLoader here, which is not the safe loader.
# That is acceptable for a trusted local file; prefer a safe loader for any
# configuration that comes from an untrusted source.
with open('/home/phyto/planktonSDM/configuration/example_model_config.yml', 'r') as f:
    model_config = load(f, Loader=Loader)

# Build the example predictors (X) and target (y); the random state is taken
# from the 'seed' entry of the configuration.
X, y = example_data(
    y_name="Coccolithus pelagicus",
    n_samples=500,
    n_features=5,
    noise=20,
    random_state=model_config['seed'],
)

# Tuner object whose train() method is called in the cells that follow.
m = tune(X, y, model_config)

In [4]:
# 1-phase Random forest
m.train(model="rf")

{'regressor__n_estimators': [100], 'regressor__max_features': [2, 3, 4], 'regressor__max_depth': [3, 5], 'regressor__min_samples_leaf': [0.2, 0.5], 'regressor__max_samples': [0.2, 0.5]}
Fitting 3 folds for each of 24 candidates, totalling 72 fits
finished tuning model
reg rRMSE: 49%
reg rMAE: 36%
reg R2: 0.14
execution time: 8.792784929275513 seconds


In [5]:
# 2-phase Random forest
#
# Note: the 2-phase model requires the model configuration to be defined for
# both the classifier and the regressor.
m.train(model="rf", zir=True)

{'regressor__n_estimators': [100], 'regressor__max_features': [2, 3, 4], 'regressor__max_depth': [3, 5], 'regressor__min_samples_leaf': [0.2, 0.5], 'regressor__max_samples': [0.2, 0.5]}
Fitting 3 folds for each of 24 candidates, totalling 72 fits
finished tuning model
reg rRMSE: 49%
reg rMAE: 36%
reg R2: 0.14
Fitting 3 folds for each of 12 candidates, totalling 36 fits
zir rRMSE: 49%
zir rMAE: 36%
zir R2: 0.14
execution time: 14.214104175567627 seconds


In [6]:
# Adding log transformation

# First run: log option set to "yes".
m.train(model="rf", log="yes")

# Second run: log option set to "both"; a label is printed first so the two
# runs can be told apart in the cell output.
print("both models:")
m.train(model="rf", log="both")

{'regressor__n_estimators': [100], 'regressor__max_features': [2, 3, 4], 'regressor__max_depth': [3, 5], 'regressor__min_samples_leaf': [0.2, 0.5], 'regressor__max_samples': [0.2, 0.5]}
Fitting 3 folds for each of 24 candidates, totalling 72 fits
finished tuning model
reg rRMSE: 49%
reg rMAE: 38%
reg R2: 0.14
execution time: 8.988041400909424 seconds
both models:
{'regressor__n_estimators': [100], 'regressor__max_features': [2, 3, 4], 'regressor__max_depth': [3, 5], 'regressor__min_samples_leaf': [0.2, 0.5], 'regressor__max_samples': [0.2, 0.5]}
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Fitting 3 folds for each of 24 candidates, totalling 72 fits
finished tuning model
reg rRMSE: 49%
reg rMAE: 36%
reg R2: 0.14
execution time: 17.228796005249023 seconds


In [7]:
# 1-phase Gradient boosting with XGBoost
m.train(model="xgb")

{'regressor__eta': [0.01], 'regressor__n_estimators': [100], 'regressor__max_depth': [4], 'regressor__subsample': [0.6], 'regressor__colsample_bytree': [0.6], 'regressor__gamma': [1], 'regressor__alpha': [1]}
Fitting 3 folds for each of 1 candidates, totalling 3 fits
finished tuning model
reg rRMSE: 44%
reg rMAE: 35%
reg R2: 0.31
execution time: 0.3430941104888916 seconds


In [8]:
# 2-phase Gradient boosting with XGBoost
m.train(model="xgb", zir=True)

{'regressor__eta': [0.01], 'regressor__n_estimators': [100], 'regressor__max_depth': [4], 'regressor__subsample': [0.6], 'regressor__colsample_bytree': [0.6], 'regressor__gamma': [1], 'regressor__alpha': [1]}
Fitting 3 folds for each of 1 candidates, totalling 3 fits
finished tuning model
reg rRMSE: 44%
reg rMAE: 35%
reg R2: 0.31
Fitting 3 folds for each of 1 candidates, totalling 3 fits
zir rRMSE: 42%
zir rMAE: 30%
zir R2: 0.38
execution time: 0.5884270668029785 seconds


In [9]:
# 1-phase bagged nearest neighbors
m.train(model="knn")

{'regressor__max_samples': [0.5], 'regressor__max_features': [0.5], 'regressor__estimator__leaf_size': [30], 'regressor__estimator__n_neighbors': [3], 'regressor__estimator__p': [1], 'regressor__estimator__weights': ['uniform']}
Fitting 3 folds for each of 1 candidates, totalling 3 fits
finished tuning model
reg rRMSE: 39%
reg rMAE: 28%
reg R2: 0.44
execution time: 1.529144048690796 seconds


In [10]:
# 2-phase bagged nearest neighbors
m.train(model="knn", zir=True)

{'regressor__max_samples': [0.5], 'regressor__max_features': [0.5], 'regressor__estimator__leaf_size': [30], 'regressor__estimator__n_neighbors': [3], 'regressor__estimator__p': [1], 'regressor__estimator__weights': ['uniform']}
Fitting 3 folds for each of 1 candidates, totalling 3 fits
finished tuning model
reg rRMSE: 39%
reg rMAE: 28%
reg R2: 0.46
Fitting 3 folds for each of 1 candidates, totalling 3 fits
zir rRMSE: 40%
zir rMAE: 28%
zir R2: 0.42
execution time: 4.300597667694092 seconds
