In [1]:
import pandas as pd
from yaml import load
from yaml import CLoader as Loader
from sklearn.preprocessing import OneHotEncoder

In [2]:
# To debug the package without repackaging, import its modules straight from a
# local source directory by putting that directory first on sys.path.
# NOTE(review): the path below is absolute and machine-specific — adjust it to
# your own checkout before running.
import sys
# Must run before the two imports below, which rely on this directory being
# on sys.path.
sys.path.insert(0, '/home/phyto/Abil/abil/')
from tune import tune
from functions import upsample

In [3]:
# Build the training set and the tuner object (`m`) that the training cells
# below call `.train()` on.
# NOTE(review): the absolute /home/phyto/... path is machine-specific; adjust
# it before running on another machine.
with open('/home/phyto/Abil/configuration/2-phase.yml', 'r') as f:
    # NOTE(review): yaml's (C)Loader can construct arbitrary Python objects,
    # so only load configuration files you trust.
    model_config = load(f, Loader=Loader)

predictors = model_config['predictors']
target = "Emiliania huxleyi"

# Upper bound on the number of training rows (keeps tuning fast) and the seed
# that makes the random subsample repeatable after Restart & Run All.
N_TRAIN_ROWS = 1000
SAMPLE_SEED = 42

# Read the target + environmental data.
d = pd.read_csv(model_config['local_root'] + model_config['training'])

# In this example we introduce pseudo-absences: missing target values are
# filled with 0 before upsampling.
d[target] = d[target].fillna(0)
d = upsample(d, target, ratio=10)

# Drop every row that is missing the target or any predictor.
d = d.dropna(subset=[target] + list(predictors))
if d.empty:
    raise ValueError(
        "no rows left for '" + target + "' after dropping missing values"
    )

# Randomly subsample to speed up training. The sample size is capped at the
# number of available rows so small datasets do not raise, and the seed keeps
# the selection reproducible.
d = d.sample(n=min(N_TRAIN_ROWS, len(d)), random_state=SAMPLE_SEED)

X_train = d[predictors]
y = d[target]

m = tune(X_train, y, model_config)

length of y:
250


In [4]:
# 1-phase RF classifier: train only the classification stage with the
# random-forest ("rf") model.
m.train(model="rf", classifier=True)

training classifier
length of y_clf:
250
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Fitting 3 folds for each of 16 candidates, totalling 48 fits
exported model to:/home/phyto/Abil/ModelOutput/rf/model/Emiliania huxleyi_clf.sav
exported scoring to: /home/phyto/Abil/ModelOutput/rf/s

In [5]:
# 1-phase RF regressor: train only the regression stage with the
# random-forest ("rf") model.
m.train(model="rf", regressor=True)

training regressor
{'R2': 'r2', 'MAE': 'neg_mean_absolute_error', 'RMSE': 'neg_root_mean_squared_error', 'tau': make_scorer(tau_scoring), 'tau_p': make_scorer(tau_scoring_p)}
{'regressor__estimator__n_estimators': [100], 'regressor__estimator__max_features': [4], 'regressor__estimator__max_depth': [50], 'regressor__estimator__min_samples_leaf': [0.5, 1], 'regressor__estimator__max_samples': [0.5, 1]}
Fitting 3 folds for each of 4 candidates, totalling 12 fits
exported model to: /home/phyto/Abil/ModelOutput/rf/model/Emiliania huxleyi_reg.sav
exported scoring to: /home/phyto/Abil/ModelOutput/rf/scoring/Emiliania huxleyi_reg.sav
reg rRMSE: 836%
reg rMAE: 192%
reg R2: -0.04
reg tau: nan
execution time: 7.152557373046875e-07 seconds


In [6]:
# 2-phase RF regressor: request both the classifier and the regressor stage
# with the random-forest ("rf") model.
m.train(model="rf", classifier=True, regressor=True)

training classifier
length of y_clf:
250
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Fitting 3 folds for each of 16 candidates, totalling 48 fits
exported model to:/home/phyto/Abil/ModelOutput/rf/model/Emiliania huxleyi_clf.sav
exported scoring to: /home/phyto/Abil/ModelOutput/rf/s

In [7]:
# 1-phase KNN classifier: train only the classification stage with the
# "knn" model.
m.train(model="knn", classifier=True)

training classifier
length of y_clf:
250
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Fitting 3 folds for each of 1 candidates, totalling 3 fits
exported model to:/home/phyto/Abil/ModelOutput/knn/model/Emiliania huxleyi_clf.sav
exported scoring to: /home/phyto/Abil/ModelOutput/knn/s

In [8]:
# 1-phase KNN regressor: train only the regression stage with the
# "knn" model.
m.train(model="knn", regressor=True)

training regressor
{'R2': 'r2', 'MAE': 'neg_mean_absolute_error', 'RMSE': 'neg_root_mean_squared_error', 'tau': make_scorer(tau_scoring), 'tau_p': make_scorer(tau_scoring_p)}
{'regressor__estimator__max_samples': [0.2], 'regressor__estimator__max_features': [0.2], 'regressor__estimator__estimator__leaf_size': [25], 'regressor__estimator__estimator__n_neighbors': [5]}
Fitting 3 folds for each of 1 candidates, totalling 3 fits
exported model to: /home/phyto/Abil/ModelOutput/knn/model/Emiliania huxleyi_reg.sav
exported scoring to: /home/phyto/Abil/ModelOutput/knn/scoring/Emiliania huxleyi_reg.sav
reg rRMSE: 834%
reg rMAE: 144%
reg R2: -0.01
reg tau: 0.07
execution time: 9.5367431640625e-07 seconds


In [9]:
# 2-phase KNN regressor: request both the classifier and the regressor stage
# with the "knn" model.
# NOTE(review): the saved output of this cell shows joblib/sklearn bagging
# tracebacks and ends with "cannot convert float NaN to integer" — the
# regressor stage appears to fail here; investigate before relying on it.
m.train(model="knn", classifier=True, regressor=True)

training classifier
length of y_clf:
250
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Fitting 3 folds for each of 1 candidates, totalling 3 fits
exported model to:/home/phyto/Abil/ModelOutput/knn/model/Emiliania huxleyi_clf.sav
exported scoring to: /home/phyto/Abil/ModelOutput/knn/s

joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/phyto/anaconda3/envs/my-geopandas-env-2/lib/python3.11/site-packages/joblib/_parallel_backends.py", line 273, in _wrap_func_call
    return func()
           ^^^^^^
  File "/home/phyto/anaconda3/envs/my-geopandas-env-2/lib/python3.11/site-packages/joblib/parallel.py", line 588, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/phyto/anaconda3/envs/my-geopandas-env-2/lib/python3.11/site-packages/joblib/parallel.py", line 588, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  File "/home/phyto/anaconda3/envs/my-geopandas-env-2/lib/python3.11/site-packages/sklearn/utils/parallel.py", line 127, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/phyto/anaconda3/envs/my-geopandas-env-2/lib/python3.11/site-packages/sklearn/ensemble/_bagging.py", lin

exported model to: /home/phyto/Abil/ModelOutput/knn/model/Emiliania huxleyi_reg.sav


joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/phyto/anaconda3/envs/my-geopandas-env-2/lib/python3.11/site-packages/joblib/_parallel_backends.py", line 273, in _wrap_func_call
    return func()
           ^^^^^^
  File "/home/phyto/anaconda3/envs/my-geopandas-env-2/lib/python3.11/site-packages/joblib/parallel.py", line 588, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/phyto/anaconda3/envs/my-geopandas-env-2/lib/python3.11/site-packages/joblib/parallel.py", line 588, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  File "/home/phyto/anaconda3/envs/my-geopandas-env-2/lib/python3.11/site-packages/sklearn/utils/parallel.py", line 127, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/phyto/anaconda3/envs/my-geopandas-env-2/lib/python3.11/site-packages/sklearn/ensemble/_bagging.py", lin

exported scoring to: /home/phyto/Abil/ModelOutput/knn/scoring/Emiliania huxleyi_reg.sav


type: cannot convert float NaN to integer

In [None]:
# 1-phase XGB classifier: train only the classification stage with the
# "xgb" model.
m.train(model="xgb", classifier=True)

In [None]:
# 1-phase XGB regressor: train only the regression stage with the
# "xgb" model.
# NOTE(review): the output saved under this cell looks stale — it lists knn
# hyper-parameters and knn export paths for "Florisphaera profunda" rather
# than xgb results for the target set above. Re-run and re-save.
m.train(model="xgb", regressor=True)

training regressor
{'R2': 'r2', 'MAE': 'neg_mean_absolute_error', 'RMSE': 'neg_root_mean_squared_error', 'tau': make_scorer(tau_scoring), 'tau_p': make_scorer(tau_scoring_p)}
{'regressor__estimator__max_samples': [0.2], 'regressor__estimator__max_features': [0.2], 'regressor__estimator__estimator__leaf_size': [25], 'regressor__estimator__estimator__n_neighbors': [5]}
Fitting 3 folds for each of 1 candidates, totalling 3 fits
exported model to: /home/phyto/Abil/ModelOutput/knn/model/Florisphaera profunda_reg.sav
exported scoring to: /home/phyto/Abil/ModelOutput/knn/scoring/Florisphaera profunda_reg.sav
reg rRMSE: 174%
reg rMAE: 109%
reg R2: -0.08
reg tau: 0.01
execution time: 9.5367431640625e-07 seconds


In [None]:
# 2-phase XGB regressor: request both the classifier and the regressor stage
# with the "xgb" model.
m.train(model="xgb", classifier=True, regressor=True)