# Calculate statistical metrics

In [None]:
# Libraries
import os
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import dask.dataframe as dd

In [None]:
# Directories
# Input/output locations, relative to the notebook's working directory.
dir02 = '../paper_deficit/output/02_dbase/'
dir03 = '../paper_deficit/output/03_rf/'
# Pass the components separately so os.path.join inserts the separator
# itself; the paths no longer depend on dir03 ending with a slash.
dir03p = os.path.join(dir03, 'files_predicted/')
dir03a = os.path.join(dir03, 'files_adjusted/')
dir03s = os.path.join(dir03, 'files_scores/')

---

In [None]:
# Libraries
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
import dask

# Initialize dask: describe the SLURM job that hosts the dask workers
cluster = SLURMCluster(
    queue='compute',                      # SLURM queue to use
    cores=24,                             # Number of CPU cores per job
    memory='256 GB',                      # Memory per job
    account='bm0891',                     # Account allocation
    interface="ib0",                      # Network interface for communication
    walltime='02:00:00',                  # Maximum runtime per job
    local_directory='../dask/',           # Directory for local storage
    job_extra_directives=[                # Additional SLURM directives for logging
        '-o ../dask/LOG_worker_%j.o',     # Output log
        '-e ../dask/LOG_worker_%j.e'      # Error log
    ]
)

# Scale dask cluster (submit one SLURM job)
cluster.scale(jobs=1)

# Configure the dashboard URL template so the link goes through the
# JupyterHub proxy. Use the public config API instead of mutating the
# internal dask.config.config dictionary in place.
dask.config.set({
    'distributed.dashboard.link':
        '{JUPYTERHUB_SERVICE_PREFIX}/proxy/{port}/status'
})

# Create client
client = Client(cluster)

# Last expression of the cell: renders the client summary in the notebook
client

In [None]:
# Read dbase file: open the parquet dataset lazily, rebalance it into
# partitions of roughly 1000 MiB and keep the result in cluster memory
dbase_path = os.path.join(dir02, 'df_dbase.parquet')
df_dbase = dd.read_parquet(dbase_path)
df_dbase = df_dbase.repartition(partition_size='1000 MiB')
df_dbase = df_dbase.persist()

In [None]:
def get_scores(var_tar, scen):
    """
    Calculate statistical metrics for Random Forest predictions.

    The prediction columns (names starting with ``rfr_``) are averaged
    row-wise, joined to the database rows on ``lat``/``lon`` and compared
    with the target variable on the testing rows, i.e. rows where
    ``pot_<scen>`` is set and ``train_<scen>`` is not.

    Parameters:
        var_tar (str): Target variable name (a column of ``df_dbase``).
        scen (str): Scenario name; selects the ``pot_<scen>`` and
            ``train_<scen>`` columns and the prediction file.

    Returns:
        list: [R2, RMSE, MAE, MSE, Max Error] rounded to 2 decimal places.
    """
    # Load Random Forest prediction data
    input_file = os.path.join(dir03p, f"df_rfpred_{var_tar}_{scen}.parquet")
    df_rfpred = dd.read_parquet(input_file)

    # Calculate the mean of Random Forest prediction columns
    rfr_columns = [col for col in df_rfpred.columns if col.startswith('rfr_')]
    df_rfpred['rfr_mean'] = df_rfpred[rfr_columns].mean(axis=1)

    # Select relevant columns from df_dbase
    pot_col = f'pot_{scen}'
    train_col = f'train_{scen}'
    df_dbase_sel = df_dbase[['lat', 'lon', pot_col, train_col, var_tar]]

    # Filter for testing data (potential but not training) BEFORE merging.
    # The mask only uses database columns and a left join keeps every left
    # row, so this selects the same rows as filtering after the merge while
    # joining far fewer of them.
    is_test = (df_dbase_sel[pot_col]) & (~df_dbase_sel[train_col])
    df_test = df_dbase_sel[is_test][['lat', 'lon', var_tar]]

    # Merge testing rows with Random Forest predictions
    # NOTE(review): rows without a matching prediction get NaN from the left
    # join and sklearn metrics reject NaN - confirm predictions cover all
    # testing rows.
    df_merged = df_test.merge(
        df_rfpred[['lat', 'lon', 'rfr_mean']],
        on=['lat', 'lon'],
        how='left'
    )

    # Materialise both columns in a single pass. Persisting them separately
    # can evaluate the merge twice, and handing dask Series to sklearn would
    # convert them to NumPy again for every metric.
    test_values = df_merged[[var_tar, 'rfr_mean']].compute()
    test_orig = test_values[var_tar].to_numpy()
    test_pred = test_values['rfr_mean'].to_numpy()

    # Compute statistical metrics (dict order defines the returned order)
    metrics_dict = {
        "R2": metrics.r2_score(test_orig, test_pred),
        "RMSE": metrics.root_mean_squared_error(test_orig, test_pred),
        "MAE": metrics.mean_absolute_error(test_orig, test_pred),
        "MSE": metrics.mean_squared_error(test_orig, test_pred),
        "Max Error": metrics.max_error(test_orig, test_pred)
    }

    # Round metrics to 2 decimal places and return as a list
    return [round(value, 2) for value in metrics_dict.values()]


In [None]:
%%time
# Create dataframe with score values
# Collect one row per (target variable, scenario) and build the frame once;
# enlarging an empty DataFrame with .loc re-allocates on every insert and
# leaves the numeric columns with object dtype.
score_rows = []
for var_tar in ['agbc_min', 'agbc_mean', 'agbc_max',
                'bgbc_min', 'bgbc_mean', 'bgbc_max',
                'soc_min', 'soc_mean', 'soc_max',]:
    for scen in ['prim', 'secd']:
        score_rows.append([var_tar, scen, *get_scores(var_tar, scen)])

df_score = pd.DataFrame(
    score_rows,
    columns=['var_tar', 'scen', 'r2', 'rmse', 'mae', 'mse', 'max_error'])

# Export dataframe (create the output directory first so to_csv cannot
# fail on a missing folder)
os.makedirs(dir03s, exist_ok=True)
df_score.to_csv(os.path.join(dir03s, 'df_score_rfr_mean.csv'), index=False)

In [None]:
# Disconnect the client first so it is not left attached to a scheduler
# that has already been shut down, then release the SLURM jobs.
client.close()
cluster.close()