# Predict carbon with random forest models

In [None]:
# Libraries
import os, shutil, time
import pandas as pd
import dask.dataframe as dd
from dask_ml.wrappers import ParallelPostFit
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Input/output locations, given relative to the working directory:
#   dir02  - folder the database parquet file is read from
#   dir03  - root folder of the random forest step (holds the parameter tables)
#   dir03p - folder the prediction parquet files are written to
dir02 = "../paper_deficit/output/02_dbase/"
dir03 = "../paper_deficit/output/03_rf/"
dir03p = "../paper_deficit/output/03_rf/files_predicted/"

In [None]:
import warnings

# Silence every UserWarning from here on, whatever its message or origin
warnings.simplefilter("ignore", category=UserWarning)

---

In [None]:
# Libraries
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
import dask

# Start a dask cluster whose workers are submitted as SLURM jobs
cluster = SLURMCluster(
    # Scheduling: queue, account allocation and maximum runtime per job
    queue='compute',
    account='bm0891',
    walltime='08:00:00',
    # Resources per job: CPU cores and memory
    cores=24,
    memory='256 GB',
    # Network interface for communication and directory for local storage
    interface="ib0",
    local_directory='../dask/',
    # Additional SLURM directives: worker output log and error log
    job_extra_directives=[
        '-o ../dask/LOG_worker_%j.o',
        '-e ../dask/LOG_worker_%j.e',
    ],
)

# Ask for two jobs
cluster.scale(jobs=2)

# Configure the dashboard url template
dask.config.set(
    {'distributed.dashboard.link': '{JUPYTERHUB_SERVICE_PREFIX}/proxy/{port}/status'}
)

# Connect a client to the cluster; the trailing expression shows it
# as the cell output
client = Client(cluster)

client

In [None]:
# Open the dbase parquet file, re-chunk it to partitions of about
# 1000 MiB and persist the result
df_dbase = (
    dd.read_parquet(os.path.join(dir02, "df_dbase.parquet"))
    .repartition(partition_size='1000 MiB')
    .persist()
)

# lat/lon columns computed into an in-memory dataframe
df_latlon = df_dbase[['lat', 'lon']].compute()

# Column names of the explanatory variables (model features).
# The order is kept fixed; entries are grouped by name prefix.
vars_exp = [
    # geom90m_*
    'geom90m_convergence',
    'geom90m_cti',
    'geom90m_eastness',
    'geom90m_northness',
    'geom90m_slope',
    'geom90m_spi',
    # soilgrids2017_*
    'soilgrids2017_bdricm',
    'soilgrids2017_bdrlog',
    'soilgrids2017_bdticm',
    # soilgrids2020_*
    'soilgrids2020_cec',
    'soilgrids2020_cfvo',
    'soilgrids2020_clay',
    'soilgrids2020_phh2o',
    'soilgrids2020_sand',
    'soilgrids2020_silt',
    # worldclim_*
    'worldclim_bio1',
    'worldclim_bio3',
    'worldclim_bio4',
    'worldclim_bio5',
    'worldclim_bio6',
    'worldclim_bio12',
    'worldclim_bio13',
    'worldclim_bio14',
    'worldclim_bio15',
    'worldclim_elev',
]

In [None]:
def parallel_postfit_predict(X_train, y_train, X_predict, params):
    """
    Fit a ParallelPostFit-wrapped RandomForestRegressor with the given
    parameters and return its predictions for ``X_predict``.

    Nothing is written to disk here; exporting the predictions is left to
    the caller.

    Parameters:
        X_train: Training feature set
        y_train: Training target set
        X_predict: Features to make predictions on
        params: Dictionary of model parameters; must contain the keys
            "min_samples_leaf", "max_features", "n_estimators" and
            "random_state"

    Returns:
        The result of ``ParallelPostFit.predict(X_predict)``. It is not
        computed here (the caller in this file passes a dask array and
        calls ``.compute()`` on the result).

    Raises:
        KeyError: If one of the required keys is missing from ``params``.
    """
    # Build the forest from the supplied hyper-parameters, using all
    # available cores (n_jobs=-1)
    forest = RandomForestRegressor(
        min_samples_leaf=params["min_samples_leaf"],
        max_features=params["max_features"],
        n_estimators=params["n_estimators"],
        random_state=params["random_state"],
        n_jobs=-1
    )

    # Wrap the estimator in ParallelPostFit, then fit on the training data
    rfr = ParallelPostFit(estimator=forest, scoring='r2')
    rfr.fit(X_train, y_train)

    # Return predictions
    return rfr.predict(X_predict)


def random_forest_predict(var_tar, scen):
    """Predict carbon values for the 10 best performing models and
    export them as a parquet file.

    Reads the ranked parameter table
    ``files_params/df_params_rank_{var_tar}_{scen}.csv`` below ``dir03``.
    For each rank 1..10 it fits a model on the rows of ``df_dbase`` flagged
    by the ``train_{scen}`` column and predicts ``var_tar`` for all rows.
    The output ``df_rfpred_{var_tar}_{scen}.parquet`` in ``dir03p`` holds
    the lat/lon columns plus one ``rfr_{rank}`` column per model.

    Parameters:
        var_tar: Name of the target column in ``df_dbase``
        scen: Scenario suffix; selects the ``train_{scen}`` flag column
            and the parameter table

    Returns:
        None. The result is only written to disk.

    Raises:
        ValueError: If the parameter table does not contain exactly one
            row for one of the ranks 1..10 (raised by ``.item()``).
    """
    # Get dataframe with parameters and ranks
    df_params = pd.read_csv(
        os.path.join(dir03, f"files_params/df_params_rank_{var_tar}_{scen}.csv"))

    # Get training data. The explicit comparison is kept as-is because the
    # dtype of the flag column is not visible here.
    df_train = df_dbase[df_dbase['train_' + scen] == True]

    # Training features and target, computed into memory for fitting
    X_train = df_train[vars_exp].to_dask_array(lengths=True).compute()
    y_train = df_train[var_tar].to_dask_array(lengths=True).compute()

    # Features of all rows, persisted once and reused for every model
    X_predict = df_dbase[vars_exp].to_dask_array(lengths=True).persist()

    # Work on a copy: adding the prediction columns to the module-level
    # df_latlon itself would modify it for all later calls and cells
    df_pred = df_latlon.copy()

    # Predict values for the best 10 models
    for rank in range(1, 11):
        # Filter parameters for the current rank
        df_params_rank = df_params[df_params.rank_test_score == rank]

        # Extract parameters as a dictionary; .item() requires exactly
        # one matching row
        params = {"min_samples_leaf": df_params_rank.min_samples_leaf.item(),
                  "max_features": df_params_rank.max_features.item(),
                  "n_estimators": df_params_rank.n_estimators.item(),
                  "random_state": df_params_rank.random_state.item()
                 }

        # Run the prediction
        rfr_predict = parallel_postfit_predict(
            X_train, y_train, X_predict, params)

        # Add the computed predictions as a column next to lat and lon
        df_pred['rfr_' + str(rank)] = rfr_predict.compute()

    # Construct output file name
    output_file = os.path.join(dir03p, f"df_rfpred_{var_tar}_{scen}.parquet")

    # Remove output_file if already exists
    if os.path.exists(output_file):
        os.remove(output_file)

    # Export as parquet file
    df_pred.to_parquet(output_file)

In [None]:
# Predict values of agbc
for var_tar in ['agbc_min', 'agbc_mean', 'agbc_max']:
    for scen in ['prim', 'secd']:
        %time random_forest_predict(var_tar, scen)

In [None]:
# Predict values of bgbc
for var_tar in ['bgbc_min', 'bgbc_mean', 'bgbc_max']:
    for scen in ['prim', 'secd']:
        %time random_forest_predict(var_tar, scen)

In [None]:
# Predict values of soc
for var_tar in ['soc_min', 'soc_mean', 'soc_max']:
    for scen in ['prim', 'secd']:
        %time random_forest_predict(var_tar, scen)

In [None]:
# Shut down dask: close the client first, then the cluster it is
# connected to
client.close()
cluster.close()