# Prepare data for Figure "Total carbon barplot"

In [None]:
# Libraries
import os
import numpy as np
import pandas as pd
import xarray as xr
import rioxarray

In [None]:
# Directories
# Input rasters read below: erb2018/*.tif and gsoc/GSOCmap1.6.1.tif
dir_data = '../data/'
# Prepared zarr stores: grid cell area (ds_prep_area_ha.zarr) and the
# regridded reference datasets (soilgrids, esabio, spawn)
dir01 = '../paper_deficit/output/01_prep/'
# Predicted carbon files: agbc.nc, bgbc.nc, soc.nc
dir04 = '../paper_deficit/output/04_out/'
# Output location of the collected csv (sub-folder fig_total_carbon)
dir05 = '../paper_deficit/output/05_prep_other/'

---

In [None]:
# Libraries
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
import dask

# Initialize dask
# Describe one SLURM job; workers are started from this template once the
# cluster is scaled below.
cluster = SLURMCluster(
    queue='compute',                      # SLURM queue to use
    cores=24,                             # Number of CPU cores per job
    memory='256 GB',                      # Memory per job
    account='bm0891',                     # Account allocation
    interface="ib0",                      # Network interface for communication
    walltime='02:00:00',                  # Maximum runtime per job
    local_directory='../dask/',           # Directory for local storage
    job_extra_directives=[                # Additional SLURM directives for logging
        '-o ../dask/LOG_worker_%j.o',     # Output log
        '-e ../dask/LOG_worker_%j.e'      # Error log
    ]
)

# Scale dask cluster (request two SLURM jobs built from the template above)
cluster.scale(jobs=2)

# Configure dashboard url
# The loaded dask configuration dict is updated in place, so the dashboard
# link is rendered from the JupyterHub service prefix and the proxy port.
dask.config.config.get('distributed').get('dashboard').update(
    {'link': '{JUPYTERHUB_SERVICE_PREFIX}/proxy/{port}/status'}
)

# Create client (connects this notebook session to the cluster)
client = Client(cluster)

# Display the client summary as the cell output
client

---

In [None]:
# import functions to calculate grid cell area and grid cell boundaries
from xgrid_utils import grid_cell_areas

def prep_da_grid_cell_area(da):
    """Build a (lat, lon) DataArray of grid cell areas in hectares.

    The areas come from ``grid_cell_areas`` applied to the ``lon`` and ``lat``
    coordinate values of ``da``; the result carries those same coordinate
    values and is scaled from m2 to ha.
    """
    lat_values = da.lat.data
    lon_values = da.lon.data

    area_m2 = xr.DataArray(
        data=grid_cell_areas(lon_values, lat_values),
        dims=('lat', 'lon'),
        coords={'lat': lat_values, 'lon': lon_values},
    )

    # 1 m2 = 0.0001 ha
    return area_m2 * 0.0001

---

In [None]:
# Open the prepared grid cell area store and pick its area_ha variable
path_area = os.path.join(dir01, 'ds_prep_area_ha.zarr')
ds_area = xr.open_zarr(path_area)
da_area = ds_area['area_ha']

---

### Predicted carbon values

In [None]:
# Open the three predicted carbon files with identical chunking
chunks_pred = dict(lat=5000, lon=5000)
ds_agbc, ds_bgbc, ds_soc = (
    xr.open_dataset(os.path.join(dir04, file_name)).chunk(**chunks_pred)
    for file_name in ('agbc.nc', 'bgbc.nc', 'soc.nc')
)


def get_carbon_sum(ds, nodata=-32768):
    """Calculate area-weighted global sums and return them as a dataframe.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset whose data variables are multiplied by the notebook-level
        ``da_area`` (grid cell area) before being summed over all cells.
    nodata : int or float, optional
        Value marking missing cells; these cells are masked before the
        multiplication. Defaults to -32768.

    Returns
    -------
    pandas.DataFrame
        One row per data variable with the columns ``variable`` and
        ``carbon_pg`` (the global sum scaled by 1e-9).
    """
    ds_sum_lazy = (ds.where(ds != nodata) * da_area).sum() * 1E-09

    # 'area_ha' is not a carbon variable. Drop it while the result is still
    # lazy so it is never computed, and tolerate datasets that do not carry it.
    ds_sum = ds_sum_lazy.drop_vars('area_ha', errors='ignore').compute()

    return pd.DataFrame({
        'variable': list(ds_sum.data_vars),
        'carbon_pg': [ds_sum[var].item() for var in ds_sum.data_vars],
    })

# Global sums of the predicted datasets, one dataframe per input
df_agbc_sum, df_bgbc_sum, df_soc_sum = map(
    get_carbon_sum, (ds_agbc, ds_bgbc, ds_soc))

# Stack the three dataframes
df_pot_sum = pd.concat([df_agbc_sum, df_bgbc_sum, df_soc_sum])
df_pot_sum

---

### Erb et al. 2018

In [None]:
%%time
# Container that get_erb_data fills with one variable per figure panel
ds_erb = xr.Dataset()

def get_erb_data(fig, n):
    """Load one Erb et al. panel raster and store it in ``ds_erb``.

    Reads the first band of ``erb2018/ExtDat_Fig{fig}{n}_gcm.tif``, masks the
    raster's ``_FillValue`` and scales the values by 0.01 (to t/ha). The
    result is stored under ``erb_fig{fig}_{n}`` with ``n`` lower-cased.
    """
    tif_path = os.path.join(dir_data, f'erb2018/ExtDat_Fig{fig}{n}_gcm.tif')
    band = rioxarray.open_rasterio(tif_path, chunks=True) \
        .rename(y='lat', x='lon')[0]

    # mask non-land and convert to t/ha
    fill_value = band.attrs['_FillValue']
    scaled = band.where(band != fill_value) * 0.01

    var_name = f'erb_fig{fig}_{n.lower()}'
    if fig == 4 and n == 'E':
        # Panel 4E is stored from its bare array, so it takes the coordinates
        # already present in ds_erb instead of its own.
        # NOTE(review): presumably works around a coordinate mismatch - confirm
        ds_erb[var_name] = (('lat', 'lon'), scaled.data)
    else:
        ds_erb[var_name] = scaled

# Load figure 3 panels A-G and figure 4 panels A-F into ds_erb
for panel in 'ABCDEFG':
    get_erb_data(3, panel)
for panel in 'ABCDEF':
    get_erb_data(4, panel)

# Grid cell area on the Erb grid, held in a single chunk
da_erb_area = prep_da_grid_cell_area(ds_erb).chunk({'lat': -1, 'lon': -1})

# Area-weighted global sums
ds_erb_weighted = ds_erb * da_erb_area
ds_erb_sum = ds_erb_weighted.sum().compute() * 1e-9

# One row per panel variable
erb_vars = list(ds_erb_sum.data_vars)
df_erb_sum = pd.DataFrame({
    'variable': erb_vars,
    'carbon_pg': [ds_erb_sum[name].item() for name in erb_vars],
})

df_erb_sum

---

### GSOC

In [None]:
%%time 
# Open the first band of the GSOC map with lat/lon dimension names
path_gsoc = os.path.join(dir_data, 'gsoc/GSOCmap1.6.1.tif')
da_gsoc = rioxarray.open_rasterio(path_gsoc, chunks=True) \
    .rename(y='lat', x='lon')[0]

# Mask the fill value and load the result into memory
gsoc_fill = da_gsoc.attrs.get('_FillValue')
da_gsoc = da_gsoc.where(da_gsoc != gsoc_fill).compute()

# Grid cell area on the GSOC grid
da_gsoc_area = prep_da_grid_cell_area(da_gsoc)

# Area-weighted global sum
value_gsoc = (da_gsoc * da_gsoc_area).sum().item() * 1e-9

# Single-row dataframe
df_gsoc_sum = pd.DataFrame({'variable': ['gsoc'], 'carbon_pg': [value_gsoc]})

df_gsoc_sum

---

### Soilgrids 2020

In [None]:
%%time
# Open all regridded Soilgrids 2020 soc stores as one dataset
ds_sgrids2020ocs = xr.open_mfdataset(
    os.path.join(dir01, 'ds_regridded_soilgrids2020_ocs*.zarr'),
    engine='zarr')

# Container for the global sums, filled by get_sgrids2020ocs_sum
ds_sgrids2020ocs_sum = xr.Dataset()


def get_sgrids2020ocs_sum(da, var_str):
    """Store the area-weighted global sum of ``da`` under ``var_str``.

    ``da`` is multiplied by the notebook-level ``da_area``, summed, scaled by
    1e-9 and written into ``ds_sgrids2020ocs_sum``.
    """
    weighted_total = (da * da_area).sum() * 1e-9
    ds_sgrids2020ocs_sum[var_str] = weighted_total.compute()


# Source variable -> output name
for sg2020_src, sg2020_out in (
        ('regridded_soilgrids2020_ocs_0-30cm', 'soilgrids2020_soc_030'),
        ('regridded_soilgrids2020_ocsunc_0-30cm', 'soilgrids2020_socunc_030')):
    get_sgrids2020ocs_sum(ds_sgrids2020ocs[sg2020_src], sg2020_out)

# Uncertainty bounds: total minus / plus half of the summed uncertainty
sg2020_total = ds_sgrids2020ocs_sum['soilgrids2020_soc_030']
sg2020_half_unc = ds_sgrids2020ocs_sum['soilgrids2020_socunc_030'] / 2
ds_sgrids2020ocs_sum['soilgrids2020_soc_030_lower'] = sg2020_total - sg2020_half_unc
ds_sgrids2020ocs_sum['soilgrids2020_soc_030_upper'] = sg2020_total + sg2020_half_unc

# One row per variable
sg2020_vars = list(ds_sgrids2020ocs_sum.data_vars)
df_sgrids2020ocs_sum = pd.DataFrame({
    'variable': sg2020_vars,
    'carbon_pg': [ds_sgrids2020ocs_sum[name].item() for name in sg2020_vars],
})

df_sgrids2020ocs_sum

---

### Soilgrids 2017

In [None]:
# Open all regridded Soilgrids 2017 soc stores as one dataset
ds_sgrids2017ocs = xr.open_mfdataset(
    os.path.join(dir01, 'ds_regridded_soilgrids2017_ocs*.zarr'),
    engine='zarr')

# Container for the global sums, filled by get_sgrids2017ocs_sum
ds_sgrids2017ocs_sum = xr.Dataset()


def get_sgrids2017ocs_sum(da, var_str):
    """Store the area-weighted global sum of ``da`` under ``var_str``.

    ``da`` is multiplied by the notebook-level ``da_area``, summed, scaled by
    1e-9 and written into ``ds_sgrids2017ocs_sum``.
    """
    weighted_total = (da * da_area).sum() * 1e-9
    ds_sgrids2017ocs_sum[var_str] = weighted_total.compute()


# Depth layers (0-30cm, 30-100cm, 100-200cm): source variable -> output name
for sg2017_src, sg2017_out in (
        ('regridded_soilgrids2017_ocs_0-30cm', 'soilgrids2017_soc_030'),
        ('regridded_soilgrids2017_ocs_30-100cm', 'soilgrids2017_soc_100'),
        ('regridded_soilgrids2017_ocs_100-200cm', 'soilgrids2017_soc_200')):
    get_sgrids2017ocs_sum(ds_sgrids2017ocs[sg2017_src], sg2017_out)

# One row per depth layer
sg2017_vars = list(ds_sgrids2017ocs_sum.data_vars)
df_sgrids2017ocs_sum = pd.DataFrame({
    'variable': sg2017_vars,
    'carbon_pg': [ds_sgrids2017ocs_sum[name].item() for name in sg2017_vars],
})

df_sgrids2017ocs_sum

---

### ESA CCI Biomass

In [None]:
%%time

# Get data
ds_esabio_agb = xr.open_mfdataset(
    os.path.join(dir01, 'ds_regridded_esabio_agb20*.zarr'),
    engine='zarr')

# Years evaluated. The per-year sums and the across-year statistics below
# both use exactly this list.
esabio_years = [2015, 2016, 2017, 2018, 2019, 2020, 2021]

# Container for the global sums, filled by get_esabio_sum
ds_esabio_sum = xr.Dataset()


def get_esabio_sum(da, var_str):
    """Store the area-weighted global sum of ``da`` under ``var_str``.

    ``da`` is multiplied by the notebook-level ``da_area``, summed and scaled
    by 0.47 and 1e-9 before being written into ``ds_esabio_sum``.
    NOTE(review): 0.47 is presumably the biomass-to-carbon fraction - confirm.
    """
    ds_esabio_sum[var_str] = ((da * da_area).sum() * 0.47 * 1e-9).compute()


# Global sum for every single year
for i in esabio_years:
    da_esabio_agb_x = ds_esabio_agb[f'regridded_esabio_agb{i}']
    get_esabio_sum(da_esabio_agb_x, f'esabio_agbc{i}')

# Stack only the evaluated years (once, instead of once per statistic), so
# any additional variable matched by the glob above cannot enter the
# mean/min/max.
da_esabio_years = ds_esabio_agb[
    [f'regridded_esabio_agb{year}' for year in esabio_years]
].to_array('a')

# The statistics are taken per grid cell across years and summed afterwards;
# they are not the mean/min/max of the yearly global sums.
# Get mean of years
get_esabio_sum(da_esabio_years.mean('a'), 'esabio_agbc_mean')
# Get min of years
get_esabio_sum(da_esabio_years.min('a'), 'esabio_agbc_min')
# Get max of years
get_esabio_sum(da_esabio_years.max('a'), 'esabio_agbc_max')

# Create Dataframe with values
df_esabio_sum = pd.DataFrame({
    'variable': list(ds_esabio_sum.data_vars.keys()),
    'carbon_pg': [ds_esabio_sum[var].item() for var in ds_esabio_sum.data_vars]
})

df_esabio_sum

---

### Spawn Biomass

In [None]:
# Open all regridded Spawn stores as one dataset
ds_spawn = xr.open_mfdataset(
    os.path.join(dir01, 'ds_regridded_spawn_*.zarr'),
    engine='zarr')

# Container for the Spawn global sums, filled by get_spawn_sum
ds_spawn_sum = xr.Dataset()


def get_spawn_sum(da, var_str):
    """Store the area-weighted global sum of ``da`` under ``var_str``.

    ``da`` is multiplied by the notebook-level ``da_area``, summed, scaled by
    1e-9 and written into ``ds_spawn_sum``.
    """
    weighted_total = (da * da_area).sum() * 1e-9
    ds_spawn_sum[var_str] = weighted_total.compute()


# Totals and summed uncertainties for both biomass pools
for spawn_key in ('agbc', 'agbcunc', 'bgbc', 'bgbcunc'):
    get_spawn_sum(ds_spawn[f'regridded_spawn_{spawn_key}'], f'spawn_{spawn_key}')

# Bounds per pool: total minus / plus half of the summed uncertainty
for spawn_pool in ('agbc', 'bgbc'):
    spawn_total = ds_spawn_sum[f'spawn_{spawn_pool}']
    spawn_half_unc = ds_spawn_sum[f'spawn_{spawn_pool}unc'] / 2
    ds_spawn_sum[f'spawn_{spawn_pool}_lower'] = spawn_total - spawn_half_unc
    ds_spawn_sum[f'spawn_{spawn_pool}_upper'] = spawn_total + spawn_half_unc

# One row per variable
spawn_vars = list(ds_spawn_sum.data_vars)
df_spawn_sum = pd.DataFrame({
    'variable': spawn_vars,
    'carbon_pg': [ds_spawn_sum[name].item() for name in spawn_vars],
})

df_spawn_sum

---

In [None]:
# Merge and export data as csv
file_carbon_col = 'data_carbon_pg_collected.csv'

# to_csv does not create missing folders; make sure the target exists so the
# export cannot fail after all sums have been computed.
dir_carbon_col = os.path.join(dir05, 'fig_total_carbon')
os.makedirs(dir_carbon_col, exist_ok=True)

pd.concat([df_pot_sum, df_erb_sum, df_esabio_sum, df_spawn_sum, 
           df_gsoc_sum, df_sgrids2020ocs_sum, df_sgrids2017ocs_sum]) \
    .to_csv(os.path.join(dir_carbon_col, file_carbon_col),
           index=False)

---

In [None]:
# close dask: disconnect the client first so it does not keep trying to reach
# a scheduler that is gone, then shut down the cluster and its SLURM jobs
client.close()
cluster.close()