# Get parameters of best performing random forest models

In [None]:
# Libraries
import os
import pandas as pd
import dask.dataframe as dd
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from dask_ml.model_selection import GridSearchCV as gscv 

In [None]:
# Input / output locations (relative paths)
dir02 = '../paper_deficit/output/02_dbase/'   # database files
dir03 = '../paper_deficit/output/03_rf/'      # random forest output
dir03p = dir03 + 'files_params/'              # ranked parameter tables

---

In [None]:
# Libraries
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
import dask

# Initialize dask: describe the SLURM job that backs the dask workers
cluster = SLURMCluster(
    queue='compute',                      # SLURM queue to use
    cores=48,                             # Number of CPU cores per job
    memory='256 GB',                      # Memory per job
    account='bm0891',                     # Account allocation
    interface="ib0",                      # Network interface for communication
    walltime='04:00:00',                  # Maximum runtime per job
    local_directory='../dask/',           # Directory for local storage
    job_extra_directives=[                # Additional SLURM directives for logging
        '-o ../dask/LOG_worker_%j.o',     # Output log
        '-e ../dask/LOG_worker_%j.e'      # Error log
    ]
)

# Scale dask cluster to 15 SLURM jobs
cluster.scale(jobs=15)

# Configure dashboard url through the public config API instead of
# mutating the nested ``dask.config.config`` dict by hand
dask.config.set(
    {'distributed.dashboard.link':
         '{JUPYTERHUB_SERVICE_PREFIX}/proxy/{port}/status'}
)

# Create client connected to the cluster
client = Client(cluster)

# Display the client summary as the cell output
client

In [None]:
# Open the database as a dask dataframe
dbase = dd.read_parquet(f'{dir02}df_dbase.parquet')

# Explanatory variables (model features), grouped by column-name prefix
vars_exp = [
    # geom90m
    'geom90m_convergence',
    'geom90m_cti',
    'geom90m_eastness',
    'geom90m_northness',
    'geom90m_slope',
    'geom90m_spi',
    # soilgrids2017
    'soilgrids2017_bdricm',
    'soilgrids2017_bdrlog',
    'soilgrids2017_bdticm',
    # soilgrids2020
    'soilgrids2020_cec',
    'soilgrids2020_cfvo',
    'soilgrids2020_clay',
    'soilgrids2020_phh2o',
    'soilgrids2020_sand',
    'soilgrids2020_silt',
    # worldclim
    'worldclim_bio1',
    'worldclim_bio3',
    'worldclim_bio4',
    'worldclim_bio5',
    'worldclim_bio6',
    'worldclim_bio12',
    'worldclim_bio13',
    'worldclim_bio14',
    'worldclim_bio15',
    'worldclim_elev',
]

In [None]:
def random_forest_params_rank(var_tar, scen):
    """Rank random forest parameters by cross-validated R2 and export them.

    Runs a grid search (five-fold cross validation, scored with R2) over a
    pre-defined parameter space on the rows of the module-level ``dbase``
    selected by the ``'train_' + scen`` column, using ``vars_exp`` as
    features and ``var_tar`` as target. All evaluated parameter
    combinations, sorted by ``rank_test_score`` (best first), are written
    to ``dir03p + 'df_params_rank_' + var_tar + '_' + scen + '.csv'``.

    Parameters
    ----------
    var_tar : str
        Name of the target column in ``dbase`` (e.g. ``'agbc_mean'``).
    scen : str
        Scenario suffix of the training-mask column (e.g. ``'prim'``).

    Returns
    -------
    None
        The ranked parameter table is written to disk.
    """

    # Make sure the output directory exists *before* the expensive grid
    # search, so a missing folder cannot discard hours of computation
    os.makedirs(dir03p, exist_ok=True)

    # Get training data (only target + feature columns), repartitioned
    # and persisted on the cluster
    df_train = dbase[dbase['train_' + scen] == True][[var_tar, *vars_exp]] \
        .repartition(partition_size='200 MiB') \
        .persist()

    # Split training data in features and target/label
    X_train = df_train[vars_exp]
    y_train = df_train[var_tar]

    # Create pipeline for random forest regressor; make_pipeline names the
    # step 'randomforestregressor', which prefixes the grid keys below
    pipe = make_pipeline(
        RandomForestRegressor(),
        )

    # Select parameters of grid
    param_grid = dict(
        randomforestregressor__n_estimators = [100, 200, 300],
        randomforestregressor__min_samples_leaf = [2, 3, 4, 5],
        randomforestregressor__max_features = [2, 3, 4, 5, 6, 7, 8, 9, 10],
        randomforestregressor__random_state = [42]
    )

    # Grid search with cross validation
    grid_results = gscv(pipe, param_grid,
                        cv=5,  # five-fold cross validation
                        scoring='r2',  # r2 as evaluation criterion
                        refit=False)  # only the ranking is needed

    # Fit models
    grid_results_fit = grid_results.fit(X_train, y_train)

    # Extract rank_test_score and the per-parameter columns by name, so
    # the selection does not depend on the number of grid parameters or
    # on the column order of cv_results_
    df_results = pd.DataFrame.from_dict(grid_results_fit.cv_results_)
    keep_cols = [
        col for col in df_results.columns
        if col == 'rank_test_score' or col.startswith('param_')
    ]
    df_params_rank = df_results \
        .sort_values('rank_test_score') \
        .loc[:, keep_cols] \
        .reset_index(drop=True)

    # Rename columns: strip the pipeline-step prefix from parameter names
    prefix = 'param_randomforestregressor__'
    df_params_rank.columns = [
        col[len(prefix):] if col.startswith(prefix) else col
        for col in df_params_rank.columns
        ]

    # Export
    df_params_rank.to_csv(
        dir03p + 'df_params_rank_' + var_tar + '_' + scen + '.csv',
        index=False)

In [None]:
# Get parameters for agbc
for var_tar in ['agbc_min', 'agbc_mean', 'agbc_max']:
    for scen in ['prim', 'secd']:
        %time random_forest_params_rank(var_tar, scen)

In [None]:
# Get parameters for bgbc
for var_tar in ['bgbc_min', 'bgbc_mean', 'bgbc_max']:
    for scen in ['prim', 'secd']:
        %time random_forest_params_rank(var_tar, scen)

In [None]:
# Get paramters for soc
for var_tar in ['soc_min', 'soc_mean', 'soc_max']:
    for scen in ['prim', 'secd']:
        %time random_forest_params_rank(var_tar, scen)

In [None]:
# Disconnect the client first so it does not keep a connection to a
# scheduler that is about to disappear, then shut down the cluster
client.close()
cluster.close()

---