In [1]:
# import required packages
import pandas as pd
import numpy as np
from yaml import load, Loader

from tune import tune 
from predict import predict
from post import post

In [None]:
# Setting up the model: load the YAML configuration, define run-wide settings
# and read the input tables used by the tuning / prediction cells.

# The configuration supplies the scoring settings and the per-model
# hyper-parameter grids that later cells look up by key.
with open('/home/phyto/planktonSDM/devries2023_model_config.yml', 'r') as f:
    model_config = load(f, Loader=Loader)

seed = 1 # random seed
n_threads = 8 # how many cpu threads to use
path_out = "/home/phyto/ModelOutput/deVries2023/" #where to save model output
path_in = "" #make it the same as "path_out"?

# Input tables.
# NOTE(review): every path below is still an empty placeholder and must be
# filled in before this cell can run.
traits = pd.read_csv("")  # fixed: was `pd.read_cv`, which does not exist in pandas
X = pd.read_csv("")
y = pd.read_csv("")
envdata = pd.read_csv("")

cv = 10  # forwarded to tune/predict as `cv` (presumably the number of CV folds - confirm)
verbose = 3  # forwarded to tune/predict as `verbose`

# Scoring settings for the regressor and the classifier, taken from the config.
reg_scoring = model_config['reg_scoring']
clf_scoring = model_config['clf_scoring']

# TODO: print the main parameters

In [None]:
'''
2-phase Random forest 
'''

# Hyper-parameter grids for the random-forest classifier and regressor.
clf_param_grid = model_config['rf_param_grid']['clf_param_grid']
reg_param_grid = model_config['rf_param_grid']['reg_param_grid']

# Tune one model per entry in `traits`.
# Fixed: the loop previously passed the whole `y` on every iteration, so the
# loop index was unused and every pass was identical. It now passes `y[n]`,
# matching the XGBoost tuning and prediction cells.
# NOTE(review): on a DataFrame, `y[n]` selects the column *labelled* n; confirm
# that y's columns are integer-labelled, or switch to positional `y.iloc[:, n]`.
for n in range(len(traits)):
    m = tune(X, y[n], seed, n_threads, verbose, cv, path_out)
    # The tuning entry point is named `XGB`; the estimator is chosen via `model`.
    m.XGB(reg_scoring, reg_param_grid, clf_scoring = clf_scoring, clf_param_grid = clf_param_grid,
          cv=cv, model="rf", zir=True, log="yes")

In [None]:
'''
2-phase Gradient boosting with XGBoost:
'''

# Pull the classifier and regressor grids for XGBoost out of the config.
xgb_grids = model_config['xgb_param_grid']
clf_param_grid = xgb_grids['clf_param_grid']
reg_param_grid = xgb_grids['reg_param_grid']

# One tuning run per entry in `traits`, each on its own target `y[n]`.
for n in range(len(traits)):
    m = tune(X, y[n], seed, n_threads, verbose, cv, path_out)
    m.XGB(
        reg_scoring,
        reg_param_grid,
        clf_scoring=clf_scoring,
        clf_param_grid=clf_param_grid,
        cv=cv,
        model="xgb",
        zir=True,
        log="yes",
    )

In [None]:
# Running KNN

# Hyper-parameter grids for the KNN classifier and regressor.
clf_param_grid = model_config['knn_param_grid']['clf_param_grid']
reg_param_grid = model_config['knn_param_grid']['reg_param_grid']

# Tune one model per entry in `traits`.
for n in range(len(traits)):

    # NOTE(review): on a DataFrame, `y[n]` selects the column *labelled* n;
    # confirm that y's columns are integer-labelled.
    species_y = y[n]

    # Fixed: `species_y` was computed but the full `y` was passed to `tune`,
    # so every iteration tuned on the same data.
    m = tune(X, species_y, seed, n_threads, verbose, cv, path_out)
    # NOTE(review): this cell uses log="both" while the RF/XGBoost cells use
    # log="yes" - confirm the difference is intentional.
    m.XGB(reg_scoring, reg_param_grid,  clf_scoring = clf_scoring, clf_param_grid = clf_param_grid,
        cv=cv, model="knn", zir=True, log="both", bagging_estimators=30)

In [None]:
# Predicting with the tuned ensemble.

# Per-model path and configuration tag handed to `predict`.
# Fixed: this dict used to be bound to `model_config`, shadowing the YAML
# configuration loaded in the setup cell, so re-running any tuning cell after
# this one failed with a KeyError. It now has its own name.
# NOTE(review): these absolute paths differ from `path_out`; confirm they point
# at the directories the tuning step actually wrote to.
ensemble_config = {
    "rf": {
        "path":"/user/work/ba18321/CoccoRandomForestBP/rf/",
        "config": "zir"
    },
    "xgb": {
        "path":"/user/work/ba18321/CoccoRandomForestBP/xgb/",
        "config": "zir"
    },
    "knn": {
        "path":"/user/work/ba18321/CoccoRandomForestBP/knn/",
        "config": "zir"
    }
}

# One prediction run per entry in `traits`, each on its own target `y[n]`.
for n in range(len(traits)):
    m = predict(X, y[n], envdata, ensemble_config, seed, n_threads, verbose, cv, path_out, scale=True)
    m.make_prediction()

In [None]:
# Merge the SDMs:
m = post(path_in, path_out)

# Apply calculations:
m.cwm(traits, "cell diameter")
for richness_metric in ('observed_otus', 'simpson'):
    m.richness(richness_metric)
m.total()

# Export the merged dataset to netCDF:
m.export_ds()