# Calculate numbers for publication

In [None]:
# Libraries
import os
import numpy as np
import pandas as pd
import xarray as xr

In [None]:
# Directories (relative to the notebook location).
# The trailing slash matters: the CSV exports further down build their paths
# by plain string concatenation, e.g. f'{dir06}csv/...'.
dir01 = '../paper_deficit/output/01_prep/'        # prepared hilda / lesiv zarr stores
dir04 = '../paper_deficit/output/04_out/'         # predicted agbc / bgbc / soc netCDF files
dir05 = '../paper_deficit/output/05_prep_other/'  # inputs under fig_dgvm/ and fig_ecozone/
dir06 = '../paper_deficit/output/06_eval/'        # CSV tables written by this notebook (csv/)

---

In [None]:
# Libraries
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
import dask

# Initialize dask: every SLURM job started for this cluster hosts dask workers
# configured with the resources below.
cluster = SLURMCluster(
    queue='compute',                      # SLURM queue to use
    cores=24,                             # Number of CPU cores per job
    memory='256 GB',                      # Memory per job
    account='bm0891',                     # Account allocation
    interface="ib0",                      # Network interface for communication
    walltime='02:00:00',                  # Maximum runtime per job
    local_directory='../dask/',           # Directory for local storage
    job_extra_directives=[                # Additional SLURM directives for logging
        '-o ../dask/LOG_worker_%j.o',     # Output log
        '-e ../dask/LOG_worker_%j.e'      # Error log
    ]
)

# Scale dask cluster to two SLURM jobs
cluster.scale(jobs=2)

# Configure the dashboard url through the public config API rather than by
# mutating dask's internal config dictionary in place
dask.config.set(
    {'distributed.dashboard.link': '{JUPYTERHUB_SERVICE_PREFIX}/proxy/{port}/status'}
)

# Create client
client = Client(cluster)

# Display the client summary as cell output
client

---

In [None]:
# Load the predicted carbon datasets, chunk them into 5000 x 5000 (lat, lon)
# blocks and persist them on the dask cluster
ds_agbc, ds_bgbc, ds_soc = (
    xr.open_dataset(os.path.join(dir04, filename))
    .chunk({'lat': 5000, 'lon': 5000})
    .persist()
    for filename in ('agbc.nc', 'bgbc.nc', 'soc.nc')
)

# Area array (taken from the agbc dataset)
da_area = ds_agbc['area_ha']

In [None]:
# Open the hilda 2015 stores (all zarr stores matching the glob) and the
# lesiv forest management store
hilda_pattern = os.path.join(dir01, 'ds_prep_hilda2015_*.zarr')
lesiv_store = os.path.join(dir01, 'ds_prep_lesiv_nat.zarr')

ds_hilda = xr.open_mfdataset(hilda_pattern, engine='zarr')
ds_lesiv = xr.open_zarr(lesiv_store)

In [None]:
# Land-sea mask, primary area and grid-cell area at luh2 resolution; all three
# files live below the fig_dgvm folder
dir_fig_dgvm = os.path.join(dir05, 'fig_dgvm')

ds_land_luh2res = xr.open_dataset(
    os.path.join(dir_fig_dgvm, 'pot', 'ds_pot_land_luh2res.nc'))
ds_prim_luh2res = xr.open_dataset(
    os.path.join(dir_fig_dgvm, 'luh2', 'ds_luh2_prim_1700.nc'))
ds_area_luh2res = xr.open_dataset(
    os.path.join(dir_fig_dgvm, 'luh2', 'ds_luh2_grid_cell_area.nc'))

---

### Get global carbon values

In [None]:
def get_carbon_values(ds):
    """Prepare a dataframe with global min, mean and max carbon values as PgC.

    Every variable of ``ds`` except ``area_ha`` is multiplied by ``ds.area_ha``,
    summed over all dimensions, scaled by 1e-9 and rounded to an integer.
    Variable names are split on ``'_'`` into ``<ctype>_<stype>_<atype>``, where
    ``stype`` has to cover ``min``, ``mean`` and ``max``.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with an ``area_ha`` variable and the carbon variables.

    Returns
    -------
    pandas.DataFrame
        One row per (ctype, atype) combination with the columns
        ``['ctype', 'atype', 'min', 'mean', 'max']``.

    Raises
    ------
    KeyError
        If one of the ``min`` / ``mean`` / ``max`` columns is missing.
    """
    # Remove the area before weighting so that area_ha * area_ha is not
    # evaluated over the whole grid just to be discarded afterwards
    da_area = ds.area_ha
    totals = ((ds.drop_vars('area_ha') * da_area).sum() * 1E-09).compute()

    # int64 instead of int16: int16 silently wraps for totals above 32767
    df = (
        totals.round().astype('int64')
        .to_pandas().to_frame('carbon_pgc').reset_index(names='v')
    )

    # Split each variable name once and spread the parts over three columns
    name_parts = [name.split('_') for name in df['v']]
    df = df.assign(
        ctype=[parts[0] for parts in name_parts],
        stype=[parts[1] for parts in name_parts],
        atype=[parts[2] for parts in name_parts],
    )

    # One row per (ctype, atype), one column per statistic
    df_pivot = (
        df.pivot(index=["ctype", "atype"], columns="stype", values="carbon_pgc")
        .reset_index()
        .rename_axis(None, axis=1)  # drop the leftover 'stype' columns label
    )

    return df_pivot[['ctype', 'atype', 'min', 'mean', 'max']]

# Global carbon tables for the three pools (agbc, bgbc, soc)
df_agbc_pgc, df_bgbc_pgc, df_soc_pgc = (
    get_carbon_values(ds_pool) for ds_pool in (ds_agbc, ds_bgbc, ds_soc)
)

# Stack the three tables into one dataframe with a fresh 0..n-1 index
df_carbon_pgc = pd.concat(
    [df_agbc_pgc, df_bgbc_pgc, df_soc_pgc],
    ignore_index=True,
)
df_carbon_pgc

In [None]:
# Create a dataframe with global min, mean, max and best estimates (PgC) for
# all carbon pools and for the aggregated carbon pools (cveg, call)

# Statistic that serves as best estimate of each single pool
best_metric = {'agbc': 'max', 'bgbc': 'max', 'soc': 'mean'}

# Aggregated pools and the single pools they add up
pool_groups = {'cveg': ['agbc', 'bgbc'], 'call': ['agbc', 'bgbc', 'soc']}

metrics = ['min', 'mean', 'max']

# Collect all rows first and build the dataframe once, instead of growing it
# row by row with .loc[len(df)]
records = []

for atype in ['act', 'prim', 'secd']:
    df_sel = df_carbon_pgc[df_carbon_pgc.atype == atype]

    # Single pools: .item() raises unless exactly one row matches, which
    # guards against missing or duplicated (ctype, atype) combinations
    estimates = {}
    for ctype in best_metric:
        df_sel_ctype = df_sel[df_sel.ctype == ctype]
        estimates[ctype] = {
            metric: df_sel_ctype[metric].item() for metric in metrics
        }
        estimates[ctype]['best'] = estimates[ctype][best_metric[ctype]]

    # Aggregated pools: sum of the member pools per metric; the best estimate
    # of an aggregate is the sum of the members' best estimates
    for group, members in pool_groups.items():
        estimates[group] = {
            metric: sum(estimates[member][metric] for member in members)
            for metric in metrics + ['best']
        }

    for ctype, by_metric in estimates.items():
        for metric, value in by_metric.items():
            records.append({'ctype': ctype, 'atype': atype, 'metric': metric,
                            'unit': 'pgc', 'value': value})

# Make best, max, mean and min columns
df_carbon_pgc2 = pd.DataFrame(
    records, columns=['ctype', 'atype', 'metric', 'unit', 'value']
).pivot(index=["ctype", "atype", "unit"], columns="metric", values="value").reset_index()

# Calculate the deficit of the actual against the potential (prim / secd)
# carbon and store it in two dataframes:
#   df_deficit_pgc: potential - actual
#   df_deficit_per: 1 - actual / potential, rounded to two decimals

# Address the metric columns by name rather than by position
id_cols = ['ctype', 'atype', 'unit']
metric_cols = [col for col in df_carbon_pgc2.columns if col not in id_cols]

# Collect the rows first; the dataframes are built once after the loop
rows_pgc = []
rows_per = []

for ctype in ['agbc', 'bgbc', 'soc', 'cveg', 'call']:
    df_carbon_pgc2_sel = df_carbon_pgc2[df_carbon_pgc2.ctype == ctype]

    row_act = df_carbon_pgc2_sel.loc[
        df_carbon_pgc2_sel.atype == 'act', metric_cols].iloc[0]

    for atype in ['prim', 'secd']:
        row_ctype = df_carbon_pgc2_sel.loc[
            df_carbon_pgc2_sel.atype == atype, metric_cols].iloc[0]
        row_new_pgc = row_ctype - row_act
        # float() keeps Python's rounding regardless of the column dtype
        row_new_per = [round(float(i), 2) for i in 1 - (row_act / row_ctype)]
        rows_pgc.append([ctype, f'{atype}_deficit', 'pgc', *row_new_pgc])
        rows_per.append([ctype, f'{atype}_deficit', 'per', *row_new_per])

# Dataframe with deficit values in PgC
df_deficit_pgc = pd.DataFrame(rows_pgc, columns=id_cols + metric_cols)

# Dataframe with deficit values as share of the potential
df_deficit_per = pd.DataFrame(rows_per, columns=id_cols + metric_cols)


# Calculate the deficit of agbc, bgbc, soc and cveg as share of the total
# (call) deficit, based on the best estimates
share_rows = []

for atype in ['prim_deficit', 'secd_deficit']:
    df_deficit_sel = df_deficit_pgc[df_deficit_pgc.atype == atype]

    # The total deficit only depends on atype, so look it up once per atype;
    # .item() raises unless exactly one row matches
    v_call = df_deficit_sel.loc[df_deficit_sel.ctype == 'call', 'best'].item()

    for ctype in ['agbc', 'bgbc', 'soc', 'cveg']:
        v_ctype = df_deficit_sel.loc[df_deficit_sel.ctype == ctype, 'best'].item()
        share_rows.append([ctype, atype, 'per', round(v_ctype / v_call, 2)])

df_deficit_share_per = pd.DataFrame(
    share_rows, columns=['ctype', 'atype', 'unit', 'best'])

In [None]:
# Export the global carbon table (unit label 'pgc') as CSV and display it
df_carbon_pgc2.to_csv(f'{dir06}csv/df_carbon_pgc.csv', index=False)
df_carbon_pgc2

In [None]:
# Export the deficit table in PgC (potential - actual) as CSV and display it
df_deficit_pgc.to_csv(f'{dir06}csv/df_deficit_pgc.csv', index=False)
df_deficit_pgc

In [None]:
# Export the relative deficit (1 - actual / potential, unit label 'per') as
# CSV and display it; the values are stored as fractions rounded to two decimals
df_deficit_per.to_csv(f'{dir06}csv/df_deficit_per.csv', index=False)
df_deficit_per

In [None]:
# Export the share of each carbon pool (ctype) in the total deficit as CSV
# and display it
df_deficit_share_per.to_csv(f'{dir06}csv/df_deficit_share_per.csv', index=False)
df_deficit_share_per

---

### Get LUC carbon values

In [None]:
# Actual, potential and deficit layers for vegetation, soil and both combined
def get_data(scen):
    """Assemble actual, potential and deficit carbon layers for one scenario.

    Reads the max layers of ``ds_agbc`` / ``ds_bgbc`` and the mean layers of
    ``ds_soc``; ``scen`` selects the potential layers via the variable name
    suffix (e.g. ``agbc_max_<scen>``).

    Returns an xarray.Dataset with the variables, in this order: agbc_act,
    agbc_pot, bgbc_act, bgbc_pot, cveg_act, cveg_pot, soc_act, soc_pot,
    cveg_def, soc_def, call_act, call_pot, call_def.
    """
    agbc_act = ds_agbc['agbc_max_act']
    agbc_pot = ds_agbc[f'agbc_max_{scen}']
    bgbc_act = ds_bgbc['bgbc_max_act']
    bgbc_pot = ds_bgbc[f'bgbc_max_{scen}']

    # Single pools plus vegetation carbon (cveg = agbc + bgbc)
    ds = xr.Dataset().assign(
        agbc_act=agbc_act,
        agbc_pot=agbc_pot,
        bgbc_act=bgbc_act,
        bgbc_pot=bgbc_pot,
        cveg_act=agbc_act + bgbc_act,
        cveg_pot=agbc_pot + bgbc_pot,
        soc_act=ds_soc['soc_mean_act'],
        soc_pot=ds_soc[f'soc_mean_{scen}'],
    )

    # Combined carbon (call = cveg + soc) and deficits (potential - actual)
    call_act = ds['cveg_act'] + ds['soc_act']
    call_pot = ds['cveg_pot'] + ds['soc_pot']

    return ds.assign(
        cveg_def=ds['cveg_pot'] - ds['cveg_act'],
        soc_def=ds['soc_pot'] - ds['soc_act'],
        call_act=call_act,
        call_pot=call_pot,
        call_def=call_pot - call_act,
    )


ds_prim = get_data('prim')

In [None]:
# Split the hilda forest layer in two: forest that lesiv marks as natural, and
# the remainder, which is treated as human influenced
da_forest = ds_hilda['hilda2015_forest']
da_forestn = da_forest * ds_lesiv['lesiv_nat']
da_foresth = da_forest - da_forestn

# Swap the combined forest layer for the two new layers
ds_hilda = ds_hilda.drop_vars('hilda2015_forest').assign(
    hilda2015_forestn=da_forestn,
    hilda2015_foresth=da_foresth,
)

In [None]:
# Copy of the hilda dataset without the "hilda2015_" prefix in variable names
ds_hilda2 = ds_hilda.rename(
    {name: name.replace("hilda2015_", "") for name in ds_hilda.data_vars}
)
luc_names = list(ds_hilda2.data_vars)

# One single-row dataframe per carbon variable: each hilda layer is weighted
# with the carbon layer and the cell area, summed over lat/lon and scaled by 1e-9
df_luc_list = []
for var in ds_prim.data_vars:
    totals = ((ds_hilda2 * ds_prim[var] * da_area).sum(['lat', 'lon']) * 1e-9).compute()
    df_luc_list.append(
        totals.to_pandas()
        .to_frame(var)
        .transpose()
        .reset_index(names='data_vars')
    )

# Stack the rows into one dataframe
df_luc = pd.concat(df_luc_list, ignore_index=True)

# Column luc_pgc holds the row-wise sum over all hilda layers
df_luc = df_luc.assign(luc_pgc=df_luc[luc_names].sum(axis=1).values)

# Append the '_pgc' suffix to the hilda layer columns
df_luc = df_luc.rename(columns={name: name + '_pgc' for name in luc_names})

# Export and display the rounded table
df_luc_rounded = df_luc.round()
df_luc_rounded.to_csv(f'{dir06}csv/df_luc_pgc.csv', index=False)
df_luc_rounded

In [None]:
# Create a new dataframe with each LUC column as a fraction of luc_pgc
df_per = df_luc.copy()
for col in [f'{i}_pgc' for i in list(ds_hilda2.data_vars)]:
    # Double quotes inside the f-string: reusing the enclosing quote character
    # is a SyntaxError on Python versions before 3.12
    df_per[f'{col.split("_")[0]}_per'] = df_luc[col] / df_luc['luc_pgc']

# Keep only the identifier column and the new '*per' columns
df_per = df_per[["data_vars"] + [i for i in df_per.columns if i.endswith('per')]]

# Export
df_per.round(2).to_csv(f'{dir06}csv/df_deficit_luc_per.csv', index=False)
df_per.round(2)

---

### Share of each LUC class in the deficit (as percentages)

In [None]:
# Read the table produced by the ecozone preparation
df = pd.read_csv(os.path.join(dir05, 'fig_ecozone', 'df_ecozone_data_prim.csv'))

# The cat column encodes <eco>_<luc>_<ctype>_<atype>; split it once and
# spread the parts over four new columns
cat_parts = [cat.split('_') for cat in df['cat']]
df = df.assign(
    eco=[parts[0] for parts in cat_parts],
    luc=[parts[1] for parts in cat_parts],
    ctype=[parts[2] for parts in cat_parts],
    atype=[parts[3] for parts in cat_parts],
)

# Deficit rows only, reshaped to one row per (eco, ctype) and one column per
# luc class; the leftover 'luc' columns label is removed
df_def = (
    df.loc[df['atype'] == 'def', ['carbon_pg', 'eco', 'luc', 'ctype']]
    .pivot(index=['eco', 'ctype'], columns='luc', values='carbon_pg')
    .rename_axis(None, axis=1)
)

# Total deficit per row as the sum over all luc columns
df_def = df_def.assign(luc_all=df_def.sum(axis=1))

# Every column as a fraction of the row total, rounded to two decimals
df_def_per = df_def.div(df_def['luc_all'], axis=0).round(2).reset_index()

# Keep the rows of the total carbon pool (call)
df_def_per = df_def_per[df_def_per['ctype'] == 'call']

# Export
df_def_per.to_csv(f'{dir06}csv/df_deficit_luc_eco_per.csv', index=False)
df_def_per

---

### Average actual, potential and deficit (tha, percentage) per area

In [None]:
# Stats of the luc classes per area (columns *_tha) for the rows with
# cat0 == 'all', plus the relative deficit def_p = 1 - act_tha / pot_tha
df = pd.read_csv(os.path.join(dir05, 'fig_ecozone', 'df_ecozone_scatter_data_prim.csv'))

# Sort by cat2 (descending) and break ties by cat1 (ascending) in one call.
# Two chained single-key sorts only give this order if the second sort is
# stable, which the default 'quicksort' does not guarantee.
df = (
    df.loc[df.cat0 == 'all', ['cat1', 'cat2', 'act_tha', 'pot_tha', 'def_tha']]
    .assign(def_p=lambda d: (1 - (d.act_tha / d.pot_tha)).round(2))
    .sort_values(['cat2', 'cat1'], ascending=[False, True])
)

# def_p above is computed from the unrounded values; round the per-area
# columns to whole numbers only afterwards
for i in ['act_tha', 'pot_tha', 'def_tha']:
    df[i] = round(df[i]).astype('int')

df.to_csv(f'{dir06}csv/df_deficit_luc_perarea.csv', index=False)
df

---

### Get DGVM data

In [None]:
# Read the global DGVM table (contains the DGVMs and other estimates)
df_dgvm = pd.read_csv(os.path.join(dir05, 'fig_dgvm', 'data_dgvm_global.csv'))

# Total difference = vegetation + soil
df_dgvm['call_d'] = df_dgvm['cveg_d'] + df_dgvm['csoil_d']

# Relative deviation of every row from the 'ganzenmueller' row;
# .item() raises unless exactly one such row exists
v_call_d_gan = df_dgvm.loc[df_dgvm['model'] == 'ganzenmueller', 'call_d'].item()
df_dgvm['call_d_diff2gan_per'] = 1 - (df_dgvm['call_d'] / v_call_d_gan)


def _with_unit_suffix(col):
    """Return the column name carrying its unit suffix ('_per' or '_pgc')."""
    if col == 'model' or col.endswith('_per'):
        return col
    if col.endswith('_dp'):
        return col.replace('_dp', '_per')
    return col + '_pgc'


# Rename columns
df_dgvm.columns = [_with_unit_suffix(col) for col in df_dgvm.columns]

df_dgvm.to_csv(f'{dir06}csv/df_deficit_dgvm.csv', index=False)
df_dgvm

In [None]:
# Print relevant numbers
list_dgvm = ['cablepop', 'classic', 'clm', 'dlem', 'ibis', 'isam', 'jsbach', 
             'jules', 'lpjguess', 'lpjwsl', 'orchidee']

# Rows of the individual DGVMs and the row holding the DGVM mean
df_dgvm_model = df_dgvm[df_dgvm.model.isin(list_dgvm)]
df_dgvm_mean = df_dgvm[df_dgvm.model == 'dgvm_mean']

# The row lookups are done outside the f-strings: quoting 'dgvm_mean' inside a
# single-quoted f-string is a SyntaxError on Python versions before 3.12.
# NOTE(review): call_d_diff2gan_per holds fractions (1 - ratio), although the
# labels below say '(%)'.
print(f'DGVM mean difference to our estimates (%): {round(np.mean(df_dgvm_model.call_d_diff2gan_per), 2)}')
print(f'DGVM minimum difference to our estimates (%): {round(np.min(df_dgvm_model.call_d_diff2gan_per), 2)}')
print(f'DGVM maximum difference to our estimates (%): {round(np.max(df_dgvm_model.call_d_diff2gan_per), 2)}')

print(f'DGVM mean (PgC): {round(df_dgvm_mean.call_d_pgc.item(), 2)}')
print(f'DGVM SD (PgC): {round(np.std(df_dgvm_model.call_d_pgc), 2)}')

print(f'DGVM cVeg mean (PgC): {round(df_dgvm_mean.cveg_d_pgc.item(), 2)}')
print(f'DGVM cVeg SD (PgC): {round(np.std(df_dgvm_model.cveg_d_pgc), 2)}')

print(f'DGVM SOC mean (PgC): {round(df_dgvm_mean.csoil_d_pgc.item(), 2)}')
print(f'DGVM SOC SD (PgC): {round(np.std(df_dgvm_model.csoil_d_pgc), 2)}')

# Evaluates to:
# DGVM mean difference to our estimates (%): 0.37
# DGVM minimum difference to our estimates (%): 0.02
# DGVM maximum difference to our estimates (%): 0.58
# DGVM mean (PgC): -171.15
# DGVM SD (PgC): 52.25
# DGVM cVeg mean (PgC): -134.07
# DGVM cVeg SD (PgC): 45.02
# DGVM SOC mean (PgC): -37.08
# DGVM SOC SD (PgC): 31.88

---

### LUH2 Primary land area

In [None]:
# Ratio of the 1700 primary land area to the total land area
da_land_luh2res = ds_land_luh2res['land_sea_mask']
da_prim_luh2res = ds_prim_luh2res['prim_1700']
da_area_luh2res = ds_area_luh2res['grid_cell_area_ha']

# Area-weighted sums over the grid; the common factor 0.01 cancels in the ratio
area_prim, area_land = (
    ((da_layer * da_area_luh2res).sum(['lat', 'lon']) * 0.01).compute().item()
    for da_layer in (da_prim_luh2res, da_land_luh2res)
)

# Print area ratio
area_ratio = round(area_prim / area_land, 3)
print(f'LUH2 primary 1700 area vs. total land area : {area_ratio}')

# Evaluates to: 
# LUH2 primary 1700 area vs. total land area : 0.824

---

In [None]:
# Disconnect the client first, then shut down the cluster and its SLURM jobs
client.close()
cluster.close()