# Prepare data for Figure "Ecozone"

In [None]:
# Libraries
import os
import pandas as pd
import xarray as xr

In [None]:
# Directories: prepared inputs (01), analysis output (04), figure data (05)
dir01, dir04, dir05 = (
    f'../paper_deficit/output/{subdir}/'
    for subdir in ('01_prep', '04_out', '05_prep_other')
)

---

In [None]:
# Libraries
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
import dask

# Initialize dask: describe the SLURM job that hosts the dask workers
cluster = SLURMCluster(
    queue='compute',                      # SLURM queue to use
    cores=24,                             # Number of CPU cores per job
    memory='256 GB',                      # Memory per job
    account='bm0891',                     # Account allocation
    interface="ib0",                      # Network interface for communication
    walltime='02:00:00',                  # Maximum runtime per job
    local_directory='../dask/',           # Directory for local storage
    job_extra_directives=[                # Additional SLURM directives for logging
        '-o ../dask/LOG_worker_%j.o',     # Output log
        '-e ../dask/LOG_worker_%j.e'      # Error log
    ]
)

# Scale dask cluster to five SLURM jobs
cluster.scale(jobs=5)

# Configure the dashboard URL through the public configuration API instead of
# mutating the internal `dask.config.config` dictionary (the chained `.get()`
# calls fail with an AttributeError whenever one of the keys is missing)
dask.config.set(
    {'distributed.dashboard.link':
        '{JUPYTERHUB_SERVICE_PREFIX}/proxy/{port}/status'}
)

# Create client
client = Client(cluster)


In [None]:
# Open the prepared input layers from the 01_prep directory

# Grid cell area; the `area_ha` variable is the weight used for all sums below
ds_area = xr.open_zarr(os.path.join(dir01, 'ds_prep_area_ha.zarr'))
da_area = ds_area['area_ha']

# FAO ecozones (variable `fao2010`)
ds_fao = xr.open_zarr(os.path.join(dir01, 'ds_prep_fao2010.zarr'))

# HILDA land cover: every store matching the pattern is combined into one dataset
ds_hilda = xr.open_mfdataset(
    os.path.join(dir01, 'ds_prep_hilda2015_*.zarr'),
    engine='zarr',
)

# Lesiv layer (variable `lesiv_nat`), used below to split the forest class
ds_lesiv = xr.open_zarr(os.path.join(dir01, 'ds_prep_lesiv_nat.zarr'))

In [None]:
# Get carbon densities from analysis: each file is opened, rechunked to
# 5000 x 5000 (lat x lon) blocks and persisted on the dask cluster
ds_agbc, ds_bgbc, ds_soc = (
    xr.open_dataset(os.path.join(dir04, file_name))
    .chunk(dict(lat=5000, lon=5000))
    .persist()
    for file_name in ('agbc.nc', 'bgbc.nc', 'soc.nc')
)


# Calculate actual, potential and deficit for vegetation and soil and combined
def get_data(scen):
    """Build the actual / potential / deficit dataset for one scenario.

    Parameters
    ----------
    scen : str
        Suffix of the potential variables to use; the variables
        `agbc_max_<scen>`, `bgbc_max_<scen>` and `soc_mean_<scen>` are read.

    Returns
    -------
    xr.Dataset
        Variables `cveg_*` (sum of the agbc and bgbc variables), `soc_*` and
        `call_*` (cveg + soc), each as `_act`, `_pot` and `_def`, where the
        deficit is potential minus actual.
    """
    # Actual and potential values of both pools
    pools = xr.Dataset().assign({
        'cveg_act': ds_agbc['agbc_max_act'] + ds_bgbc['bgbc_max_act'],
        'cveg_pot': ds_agbc[f'agbc_max_{scen}'] + ds_bgbc[f'bgbc_max_{scen}'],
        'soc_act': ds_soc['soc_mean_act'],
        'soc_pot': ds_soc[f'soc_mean_{scen}'],
    })

    # Deficits and the combined (vegetation + soil) pool
    return pools.assign({
        'cveg_def': pools['cveg_pot'] - pools['cveg_act'],
        'soc_def': pools['soc_pot'] - pools['soc_act'],
        'call_act': pools['cveg_act'] + pools['soc_act'],
        'call_pot': pools['cveg_pot'] + pools['soc_pot'],
        'call_def': ((pools['cveg_pot'] + pools['soc_pot'])
                     - (pools['cveg_act'] + pools['soc_act'])),
    })


ds_prim = get_data('prim')
# NOTE: the 'secd' deficit contains negative grid cells. Actual values never
# exceed the primary potential, but there are cells where actual > secondary.
ds_secd = get_data('secd')

In [None]:
# Carbon stocks of our study: area-weighted global totals per scenario,
# scaled by 1e-9
ds_prim_sum, ds_secd_sum = (
    ((ds_scen * da_area).sum(['lat', 'lon']) * 1e-9).compute()
    for ds_scen in (ds_prim, ds_secd)
)

In [None]:
# Split the forest class in two: the part flagged as natural by Lesiv, and the
# remainder, which is defined as human influenced
da_forestn = ds_hilda['hilda2015_forest'] * ds_lesiv['lesiv_nat']
da_foresth = ds_hilda['hilda2015_forest'] - da_forestn

# Replace the original forest variable by the two new ones
ds_hilda = ds_hilda.assign(
    {'hilda2015_forestn': da_forestn, 'hilda2015_foresth': da_foresth}
).drop_vars('hilda2015_forest')

In [None]:
# Prepare fao2010: one boolean mask per ecozone code (1..6)
ds_fao_split = xr.Dataset({
    f'fao2010_{zone}': xr.where(ds_fao.fao2010 == code, 1, 0)
    for code, zone in enumerate(
        ('trop1', 'trop2', 'subt', 'temp', 'bore', 'pola'), start=1)
})

# Put the masks on the ds_prim grid (nearest neighbour), because
# ds_fao_split.lat.equals(ds_prim.lat) returns False. Not sure why
ds_fao_split = (
    ds_fao_split
    .drop_vars('spatial_ref')
    .astype('bool')
    .reindex_like(ds_prim, method='nearest')
)

---

In [None]:
ds = ds_prim

# Dataset with carbon stocks for each combination of ecozone and carbon
# variable; names are '<ecozone>_<pool>_<act|pot|def>' ('fao2010_' is stripped)
ds_split = xr.Dataset()
for zone_var in list(ds_fao_split.data_vars):
    for pool_var in list(ds.data_vars):
        ds_split[zone_var[8:] + '_' + pool_var] = (
            ds_fao_split[zone_var] * ds[pool_var])

# Dataframe with the area-weighted sum of each combination
df_split = (
    (ds_split * da_area * 1e-9)
    .sum(['lat', 'lon'])
    .compute()
    .to_pandas()
    .reset_index()
    .rename(columns={'index': 'cat', 0: 'carbon_pg'})
)

# Prepare categories: one column per underscore-separated part of the name
cat_parts = [name.split('_') for name in df_split.cat]
for pos in range(3):
    df_split[f'cat{pos}'] = [parts[pos] for parts in cat_parts]

# One row per ecozone, one column per '<pool>_<act|pot|def>'
df_split = (
    df_split
    .drop('cat', axis=1)
    .assign(cat12=df_split.cat1 + '_' + df_split.cat2)
    .drop(columns=['cat1', 'cat2'])
    .pivot(columns=['cat12'], index=['cat0'])
    .groupby(['cat0'])
    .sum()
    .droplevel(0, axis=1)
    .reset_index()
)
df_split = df_split[['cat0', 'cveg_act', 'cveg_pot', 'cveg_def',
                     'soc_act', 'soc_pot', 'soc_def',
                     'call_act', 'call_pot', 'call_def']]

In [None]:
# Datasets with carbon stocks for each combination of ecozones and land cover
def prep_ecozone_data(scen):

    # Select dataset
    if scen == 'prim':
        ds = ds_prim
    if scen == 'secd':
        ds = ds_secd
        
    # Dataset with carbon stocks for each combination of ecozones and land cover
    ds_split = xr.Dataset()
    for i in list(ds_fao_split.data_vars):
        for ii in [i for i in list(ds_hilda.data_vars)]:
            for iii in list(ds.data_vars):
                ds_split[i[8:] + '_' + ii[10:] + '_' + iii] = (
                    ds_fao_split[i] * ds_hilda[ii] * ds[iii])

    # Dataframe with carbons stocks of each combination
    df_split = (ds_split * da_area * 1e-9) \
        .sum(['lat', 'lon']) \
        .compute() \
        .to_pandas() \
        .reset_index() \
        .rename(columns={'index':'cat', 0: 'carbon_pg'})
    
    # Export
    file_out = f'df_ecozone_data_{scen}.csv'
    df_split.to_csv(os.path.join(dir05, 'fig4_ecozone', file_out), index=False)
    

%time prep_ecozone_data('prim')
%time prep_ecozone_data('secd')

In [None]:
%%time
def prep_ecozone_area_data():

    # Dataset with area for each combination of ecozones and land cover
    ds_split_area = xr.Dataset()
    for i in list(ds_fao_split.data_vars):
        for ii in [i for i in list(ds_hilda.data_vars)]:
            ds_split_area[i[8:] + '_' + ii[10:]] = ds_fao_split[i] * ds_hilda[ii]
            
    # dataframe with areas of each combination
    df_split_area = (ds_split_area  * da_area) \
        .sum(['lat', 'lon']) \
        .compute() \
        .to_pandas() \
        .reset_index() \
        .rename(columns={'index':'cat', 0: 'area_ha'})
    
    # export
    df_split_area.to_csv(
        os.path.join(dir05, 'fig_ecozone/df_ecozone_area_data.csv'), 
        index=False)


prep_ecozone_area_data()

In [None]:
%%time
def prep_ecozone_scatter():
    # Prepare scatter plot data
    # Get carbon data
    df_split = pd.read_csv(os.path.join(dir05, 'fig_ecozone/df_ecozone_data_prim.csv'))
        
    df_split['cat0'] = [i[0] for i in [i.split('_') for i in df_split.cat]]
    df_split['cat1'] = [i[1] for i in [i.split('_') for i in df_split.cat]]
    df_split['cat2'] = [i[2] for i in [i.split('_') for i in df_split.cat]]
    df_split['cat3'] = [i[3] for i in [i.split('_') for i in df_split.cat]]
    df_split = df_split.drop('cat', axis=1)
        
    # Get area data
    df_split_area = pd.read_csv(os.path.join(dir05, 'fig_ecozone/df_ecozone_area_data.csv'))
    df_split_area['cat0'] = [i[0] for i in [i.split('_') for i in df_split_area.cat]]
    df_split_area['cat1'] = [i[1] for i in [i.split('_') for i in df_split_area.cat]]
    df_split_area = df_split_area[['cat0', 'cat1', 'area_ha']]
        
    # Get global data
    df_split_all = df_split \
        .assign(carbon_t = df_split.carbon_pg / 1e-9) \
        .groupby(['cat1', 'cat2', 'cat3']) \
        .sum('carbon_t') \
        .reset_index() \
        .pivot(columns='cat3', index=['cat1', 'cat2'], values=['carbon_t']) \
        .droplevel(0, axis=1) \
        .reset_index() \
        .assign(cat0 = 'all')
    
    df_split_all = df_split_all[['cat0', 'cat1', 'cat2', 'act', 'pot', 'def']]
    df_split_all = df_split_all \
        .merge(df_split_area.groupby('cat1').sum('area_ha').reset_index())
        
    # Get ecozone data
    df_split_cat0 = df_split \
        .assign(carbon_t = df_split.carbon_pg / 1e-9) \
        .pivot(columns='cat3', index=['cat0', 'cat1', 'cat2'], values=['carbon_t']) \
        .droplevel(0, axis=1) \
        .reset_index()
    
    df_split_cat0 = df_split_cat0[['cat0', 'cat1', 'cat2', 'act', 'pot', 'def']]
    df_split_cat0 = df_split_cat0.merge(df_split_area, on=['cat0', 'cat1'])
        
    # Cancatenate global and ecozone data and export scatter plot data
    df_concat = pd.concat([df_split_all, df_split_cat0])
    df_concat.assign(act_tha = df_concat['act'] / df_concat.area_ha,
                     pot_tha = df_concat['pot'] / df_concat.area_ha,
                     def_tha = df_concat['def'] / df_concat.area_ha) \
        .reset_index(drop=True) \
        .sort_values('cat2', ascending=False) \
        .to_csv(os.path.join(dir05, 'fig_ecozone/df_ecozone_scatter_data_prim.csv'),
                index=False)


prep_ecozone_scatter()

In [None]:
# Ecozone map data (pgc)
# NOTE(review): a later cell defines another function with this same name
# (the bar data export), which replaces this one once that cell has run.
def ecozone_map_data_pgc(scen):
    """Aggregate the exported carbon data per ecozone and write the map data.

    Reads `fig_ecozone/df_ecozone_data_<scen>.csv` below `dir05`, sums
    `carbon_pg` over the land-cover classes for every ecozone and writes one
    row per ecozone to `fig_ecozone/df_ecozone_map_data_<scen>.csv`. The
    rounded result is printed.

    Parameters
    ----------
    scen : str
        Scenario suffix used in the input and output file names.
    """
    out_path = os.path.join(
        dir05, 'fig_ecozone', f'df_ecozone_map_data_{scen}.csv')

    # Get data
    df_split = pd.read_csv(
        os.path.join(dir05, f'fig_ecozone/df_ecozone_data_{scen}.csv'))

    # Prepare categories: one column per underscore-separated part of `cat`
    tokens = [name.split('_') for name in df_split.cat]
    for pos in range(4):
        df_split[f'cat{pos}'] = [tok[pos] for tok in tokens]
    df_split = df_split.drop('cat', axis=1)

    # Pivot to '<pool>_<act|pot|def>' columns and sum over land cover
    df_map = (
        df_split
        .assign(cat23=df_split.cat2 + '_' + df_split.cat3)
        .drop(columns=['cat2', 'cat3'])
        .pivot(columns=['cat23'], index=['cat0', 'cat1'])
        .groupby(['cat0'])
        .sum()
        .droplevel(0, axis=1)
        .reset_index()
    )

    # Export in a fixed column order
    df_map[['cat0', 'cveg_act', 'cveg_pot', 'cveg_def',
            'soc_act', 'soc_pot', 'soc_def',
            'call_act', 'call_pot', 'call_def']] \
        .to_csv(out_path, index=False)

    # Print what was written
    print(scen)
    print(round(pd.read_csv(out_path)))


#%time ecozone_map_data_pgc('prim')
#%time ecozone_map_data_pgc('secd')

In [None]:
# Ecozone map data (pgc)
def ecozone_map_data_pgc_new(scen):

    # Select dataset
    if scen == 'prim':
        ds = ds_prim
    if scen == 'secd':
        ds = ds_secd

    # Dataset with carbon stocks for each combination of ecozones
    ds_split = xr.Dataset()
    for i in list(ds_fao_split.data_vars):
            for iii in list(ds.data_vars):
                ds_split[i[8:] + '_' + iii] = (
                    ds_fao_split[i] * ds[iii])
    
    # Dataframe with carbons stocks of each combination
    df_split = (ds_split * da_area * 1e-9) \
        .sum(['lat', 'lon']) \
        .compute() \
        .to_pandas() \
        .reset_index() \
        .rename(columns={'index':'cat', 0: 'carbon_pg'})
    
    # Prepare categories
    df_split['cat0'] = [i[0] for i in [i.split('_') for i in df_split.cat]]
    df_split['cat1'] = [i[1] for i in [i.split('_') for i in df_split.cat]]
    df_split['cat2'] = [i[2] for i in [i.split('_') for i in df_split.cat]]
    
    # Pivot dataframe and prepare for export
    df_split \
        .drop('cat', axis=1) \
        .assign(cat12 = df_split.cat1 + '_' + df_split.cat2) \
        .drop(columns=['cat1', 'cat2']) \
        .pivot(columns=['cat12'], index=['cat0']) \
        .groupby(['cat0']) \
        .sum() \
        .droplevel(0, axis=1) \
        .reset_index() \
        [['cat0', 'cveg_act', 'cveg_pot', 'cveg_def', 
          'soc_act', 'soc_pot', 'soc_def', 
          'call_act', 'call_pot', 'call_def']] \
        .to_csv(
            os.path.join(dir05, 'fig_ecozone', f'df_ecozone_map_data_{scen}.csv'), 
            index=False)
    
    # Print
    print(scen)
    fstr = f'df_ecozone_map_data_{scen}.csv'
    print(round(pd.read_csv(os.path.join(dir05, 'fig_ecozone', fstr))))
    

%time ecozone_map_data_pgc_new('prim')
%time ecozone_map_data_pgc_new('secd')

In [None]:
# Ecozone bar data (pgc)
def ecozone_map_data_pgc(scen):
    """Export the deficit per ecozone, pool and land cover for the bar plot.

    Reads `fig_ecozone/df_ecozone_data_<scen>.csv` below `dir05`, keeps the
    'def' rows, spreads the land-cover classes into columns, adds the column
    `ambig` (forestn + other + shrub) and writes
    `fig_ecozone/df_ecozone_bar_data_<scen>.csv`. The rounded result is
    printed.

    NOTE(review): despite its name this function writes the *bar* data; it
    reuses the name of the map data function defined in an earlier cell.
    The name is kept so that existing calls keep working.

    Parameters
    ----------
    scen : str
        Scenario suffix used in the input and output file names.
    """
    # Build paths from components instead of concatenating onto `dir05`,
    # so the result does not depend on `dir05` ending with a separator
    dir_fig = os.path.join(dir05, 'fig_ecozone')
    path_out = os.path.join(dir_fig, f'df_ecozone_bar_data_{scen}.csv')

    # Get data
    df_split = pd.read_csv(
        os.path.join(dir_fig, f'df_ecozone_data_{scen}.csv'))

    # Prepare categories: one column per underscore-separated part of `cat`
    parts = [name.split('_') for name in df_split.cat]
    df_split = df_split.assign(
        cat0=[p[0] for p in parts],
        cat1=[p[1] for p in parts],
        cat2=[p[2] for p in parts],
        cat3=[p[3] for p in parts]) \
        .drop('cat', axis=1)

    # Pivot: deficit rows only, one column per land-cover class
    df_split_bar = df_split[df_split.cat3 == 'def'] \
        .pivot(columns='cat1', index=['cat0', 'cat2'], values='carbon_pg')

    # Create ambiguous class
    df_split_bar = df_split_bar \
        .assign(ambig=(df_split_bar.forestn
                       + df_split_bar.other
                       + df_split_bar.shrub)) \
        .reset_index() \
        .sort_values(['cat2', 'cat0'], ascending=[False, True])

    # Export (kept separate so the dataframe is not replaced by the `None`
    # that `to_csv` returns)
    df_split_bar.to_csv(path_out, index=False)

    # Print what was written
    print(scen)
    print(round(pd.read_csv(path_out), 0))


ecozone_map_data_pgc('prim')
ecozone_map_data_pgc('secd')

---

### Values

In [None]:
# Average carbon per area for each carbon pool and each luc class (tha and %)
# Get data
df = pd.read_csv(
    os.path.join(dir05, 'fig_ecozone/df_ecozone_scatter_data_prim.csv'))

# Keep the global ('all') rows and add the relative deficit 1 - act / pot.
# Sorting uses both keys in one call: two chained single-key sorts only give
# this order if the second sort happens to be stable, which the default
# algorithm does not guarantee.
df = (
    df.loc[df.cat0 == 'all', ['cat1', 'cat2', 'act_tha', 'pot_tha', 'def_tha']]
    .assign(def_p=lambda d: (1 - (d.act_tha / d.pot_tha)).round(2))
    .sort_values(['cat2', 'cat1'], ascending=[False, True])
)

# Show the per-area values as whole numbers
for col in ['act_tha', 'pot_tha', 'def_tha']:
    df[col] = df[col].round().astype('int')

df

In [None]:
# Average carbon per area for each carbon pool and each ecozone (tha and %)
# Rows of the 'foresth' land-cover class only, rounded to whole numbers
df = pd.read_csv(
    os.path.join(dir05, 'fig_ecozone/df_ecozone_scatter_data_prim.csv'))
df.loc[df.cat1 == 'foresth',
       ['cat0', 'cat2', 'act_tha', 'pot_tha', 'def_tha']].round()

In [None]:
# Global deficit stats luc classes (PgC)
df = pd.read_csv(
    os.path.join(dir05, 'fig_ecozone/df_ecozone_scatter_data_prim.csv'))

# Global ('all') rows with the deficit scaled by 1e-9 and rounded to whole
# numbers. The conversion is done on the selected rows only, and both sort
# keys are given in one call (two chained single-key sorts are only correct
# if the second one is stable, which the default algorithm does not guarantee).
(
    df.loc[df.cat0 == 'all', ['cat1', 'cat2', 'def']]
    .assign(d=lambda t: (t['def'] * 1e-9).round().astype('int'))
    [['cat1', 'cat2', 'd']]
    .sort_values(['cat2', 'cat1'], ascending=[False, True])
    .reset_index(drop=True)
)

In [None]:
# Global deficit stats ecozone (PgC)
# Deficit scaled by 1e-9, shown for the 'foresth' land-cover class only
df = pd.read_csv(
    os.path.join(dir05, 'fig_ecozone/df_ecozone_scatter_data_prim.csv'))
df = df.assign(**{'def': df['def'] * 1e-9})
df.loc[df.cat1 == 'foresth', ['cat0', 'cat1', 'cat2', 'def']].round()

---

In [None]:
# Shut down dask: disconnect the client first, then stop the cluster
client.close()
cluster.close()