# Figures: Scatter Plots of Random Forest Model Performance

In [None]:
# Libraries
import os
import numpy as np
import dask.dataframe as dd
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

In [None]:
# Directories: database input (02), random forest predictions (03),
# and evaluation output (06)
dir02, dir03p, dir06 = (
    '../paper_deficit/output/02_dbase/',
    '../paper_deficit/output/03_rf/files_predicted/',
    '../paper_deficit/output/06_eval/',
)

---

In [None]:
# Libraries
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
import dask

# Initialize dask
cluster = SLURMCluster(
    queue='compute',                      # SLURM queue to use
    cores=2,                             # Number of CPU cores per job
    memory='256 GB',                      # Memory per job
    account='bm0891',                     # Account allocation
    interface="ib0",                      # Network interface for communication
    walltime='00:30:00',                  # Maximum runtime per job
    local_directory='../dask/',           # Directory for local storage
    job_extra_directives=[                # Additional SLURM directives for logging
        '-o ../dask/LOG_worker_%j.o',     # Output log
        '-e ../dask/LOG_worker_%j.e'      # Error log
    ]
)

# Scale dask cluster
cluster.scale(jobs=4)

# Configure dashboard url.
# Use the public configuration API instead of mutating the internal
# dask.config.config dictionary; the dotted key addresses the nested
# distributed -> dashboard -> link entry.
dask.config.set(
    {'distributed.dashboard.link':
        '{JUPYTERHUB_SERVICE_PREFIX}/proxy/{port}/status'}
)

# Create client
client = Client(cluster)

# Last expression of the cell: shows the client summary in the notebook
client

In [None]:
# Open the database table (parquet) as a dask dataframe
path_dbase = os.path.join(dir02, 'df_dbase.parquet')
df_dbase = dd.read_parquet(path_dbase)

In [None]:
def prep_data(var_tar, scen):
    """Join testing values with the mean random forest prediction.

    Parameters
    ----------
    var_tar : str
        Name of the target column in ``df_dbase``; also part of the
        prediction file name.
    scen : str
        Scenario suffix used in the prediction file name and in the
        ``pot_<scen>`` / ``train_<scen>`` columns of ``df_dbase``.

    Returns
    -------
    The computed (in-memory) result of left-joining the testing rows with
    the predictions on ``lat``/``lon``; columns are ``lat``, ``lon``,
    ``var_tar`` and ``rfr_mean``.
    """
    keys = ['lat', 'lon']

    # Predicted values written by the random forest step
    pred_path = os.path.join(dir03p, f'df_rfpred_{var_tar}_{scen}.parquet')
    pred = dd.read_parquet(pred_path)

    # Average the ten prediction columns rfr_1 ... rfr_10 row by row
    member_cols = [f'rfr_{k}' for k in range(1, 11)]
    pred = pred.assign(rfr_mean=pred[member_cols].mean(axis=1))
    pred = pred[keys + ['rfr_mean']]

    # Testing rows: flagged pot_<scen> but not used for training
    is_pot = df_dbase[f'pot_{scen}'] == True
    not_train = df_dbase[f'train_{scen}'] == False
    test = df_dbase[is_pot & not_train][keys + [var_tar]]

    # Left join keeps every testing row, then materialise the result
    merged = dd.merge(test, pred, how='left', on=keys)
    return merged.compute()

In [None]:
def plot_scatter(ax, var_tar, scen, title, n_sample=25000, random_state=None):
    """Draw a density-coloured scatter of testing values vs. predictions.

    The points are a random subsample of the data returned by
    ``prep_data``; the R2 and RMSE shown in the panel are computed on all
    rows.

    Parameters
    ----------
    ax : axes object
        Axes to draw on (the calling cells pass matplotlib subplot axes).
    var_tar : str
        Target column name; plotted on the x axis.
    scen : str
        Scenario identifier forwarded to ``prep_data``.
    title : str
        Panel title.
    n_sample : int, optional
        Maximum number of rows drawn for the scatter, the density estimate
        and the regression line. Defaults to 25000.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a fixed value to make the
        plotted subsample reproducible. Defaults to ``None`` (unseeded).

    Returns
    -------
    None
    """
    # Get data (prep_data already returns the computed, in-memory frame)
    df_sel = prep_data(var_tar, scen)

    # Sample the data for plotting. Sampling without replacement raises
    # when n exceeds the number of rows, so cap the request.
    n_draw = min(n_sample, len(df_sel))
    df_sam = df_sel.sample(n=n_draw, random_state=random_state)

    # x: testing values, y: mean prediction
    x_sam, y_sam = df_sam[var_tar], df_sam['rfr_mean']

    # Calculate the point density using Gaussian Kernel Density Estimation;
    # the log of the density is used as the point colour
    xy = np.vstack([x_sam, y_sam])
    z = np.log(gaussian_kde(xy)(xy))

    # Perform linear regression (on the plotted subsample)
    m, b = np.polyfit(x_sam, y_sam, deg=1)

    # Calculate metrics on the full dataset (testing values as y_true,
    # predictions as y_pred)
    x_full, y_full = df_sel[var_tar], df_sel.rfr_mean
    metrics_dict = {
        "R2": metrics.r2_score(x_full, y_full),
        "RMSE": metrics.root_mean_squared_error(x_full, y_full),
    }

    # Plotting: points, dashed 1:1 line, solid regression line
    ax.scatter(x_sam, y_sam, c=z, s=1)
    ax.axline((0, 0), (1, 1), c='#636363', linestyle='--')
    ax.axline(xy1=(0, b), slope=m, c='#000000')

    # Display metrics in the plot, anchored at the largest sampled prediction
    metric_text = f"R2: {metrics_dict['R2']:.2f}\nRMSE: {metrics_dict['RMSE']:.2f}"
    ax.text(0, y_sam.max(), va='top', bbox=dict(facecolor='#ffffff', alpha=0.5), s=metric_text)

    # Set titles and labels
    ax.set_title(title)
    ax.set_ylabel('Prediction (tC ha$^{-1}$)')
    ax.set_xlabel('Testing (tC ha$^{-1}$)')

In [None]:
# One (target column, scenario, panel title) tuple per panel, row by row
plot_params = [
    ('agbc_min', 'prim', 'AGBC, min'),
    ('agbc_mean', 'prim', 'AGBC, mean'),
    ('agbc_max', 'prim', 'AGBC, max'),
    ('bgbc_min', 'prim', 'BGBC, min'),
    ('bgbc_mean', 'prim', 'BGBC, mean'),
    ('bgbc_max', 'prim', 'BGBC, max'),
    ('soc_min', 'prim', 'SOC 0-30 cm, min'),
    ('soc_mean', 'prim', 'SOC 0-30 cm, mean'),
    ('soc_max', 'prim', 'SOC 0-30 cm, max'),
]

# 3 x 3 grid of panels, flattened so it can be paired with plot_params
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(9, 9), dpi=600)
axs = axs.flatten()

# Fill each panel with its scatter plot
for ax, (var_tar, scen, label) in zip(axs, plot_params):
    plot_scatter(ax=ax, var_tar=var_tar, scen=scen, title=label)

# Adjust layout
fig.tight_layout()

# Export as pdf and png
fig.savefig(os.path.join(dir06, 'pdf/figs13_scatter_performance_primary.pdf'), dpi=600)
fig.savefig(os.path.join(dir06, 'png/figs13_scatter_performance_primary.png'), dpi=600)

In [None]:
# One (target column, scenario, panel title) tuple per panel, row by row
plot_params = [
    ('agbc_min', 'secd', 'AGBC, min'),
    ('agbc_mean', 'secd', 'AGBC, mean'),
    ('agbc_max', 'secd', 'AGBC, max'),
    ('bgbc_min', 'secd', 'BGBC, min'),
    ('bgbc_mean', 'secd', 'BGBC, mean'),
    ('bgbc_max', 'secd', 'BGBC, max'),
    ('soc_min', 'secd', 'SOC 0-30 cm, min'),
    ('soc_mean', 'secd', 'SOC 0-30 cm, mean'),
    ('soc_max', 'secd', 'SOC 0-30 cm, max'),
]

# 3 x 3 grid of panels, flattened so it can be paired with plot_params
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(9, 9), dpi=600)
axs = axs.flatten()

# Fill each panel with its scatter plot
for ax, (var_tar, scen, label) in zip(axs, plot_params):
    plot_scatter(ax=ax, var_tar=var_tar, scen=scen, title=label)

# Adjust layout
fig.tight_layout()

# Export as pdf and png
fig.savefig(os.path.join(dir06, 'pdf/figs14_scatter_performance_secondary.pdf'), dpi=600)
fig.savefig(os.path.join(dir06, 'png/figs14_scatter_performance_secondary.png'), dpi=600)

In [None]:
# Close the client before shutting down the cluster so that the client
# connection is released as well
client.close()
cluster.close()