# Create database for random forest models

In [None]:
# Libraries
import os, shutil
import numpy as np
import pandas as pd
import xarray as xr
import dask.dataframe as dd

In [None]:
# Directories
# dir01: prepared predictor datasets (only read in this notebook)
# dir02: prepared carbon / potential-area datasets (read) and the exported
#        database 'df_dbase.parquet' (written)
dir01 = '../paper_deficit/output/01_prep/'
dir02 = '../paper_deficit/output/02_dbase/'

---

In [None]:
# Libraries
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
import dask

# Initialize dask
cluster = SLURMCluster(
    queue='compute',                      # SLURM queue to use
    cores=24,                             # Number of CPU cores per job
    memory='256 GB',                      # Memory per job
    account='bm0891',                     # Account allocation
    interface="ib0",                      # Network interface for communication
    walltime='00:30:00',                  # Maximum runtime per job
    local_directory='../dask/',           # Directory for local storage
    job_extra_directives=[                # Additional SLURM directives for logging
        '-o ../dask/LOG_worker_%j.o',     # Output log
        '-e ../dask/LOG_worker_%j.e'      # Error log
    ]
)

# Scale dask cluster
cluster.scale(jobs=19)

# Configure dashboard url via the public config API (dotted key) rather than
# mutating the nested config dictionary in place
dask.config.set(
    {'distributed.dashboard.link': '{JUPYTERHUB_SERVICE_PREFIX}/proxy/{port}/status'}
)

# Create client
client = Client(cluster)

client

In [None]:
# Delete the parquet directory of a previous run, if there is one, before
# the database is written again
if os.path.exists(os.path.join(dir02, 'df_dbase.parquet')):
    shutil.rmtree(os.path.join(dir02, 'df_dbase.parquet'))

In [None]:
# Read data
# Predictor datasets: all zarr stores matching the glob are opened as one
# dataset each
ds_prep_geom90m = xr.open_mfdataset(
    f'{dir01}ds_prep_geom90m_*.zarr', engine='zarr')
ds_prep_soilgrids2017 = xr.open_mfdataset(
    f'{dir01}ds_prep_soilgrids2017_*.zarr', engine='zarr')
ds_prep_soilgrids2020 = xr.open_mfdataset(
    f'{dir01}ds_prep_soilgrids2020_*.zarr', engine='zarr')
ds_prep_worldclim = xr.open_mfdataset(
    f'{dir01}ds_prep_worldclim_*.zarr', engine='zarr')

# Carbon pools and potential primary/secondary masks: single zarr stores
ds_prep_agbc = xr.open_zarr(f'{dir02}ds_prep_agbc.zarr')
ds_prep_bgbc = xr.open_zarr(f'{dir02}ds_prep_bgbc.zarr')
ds_prep_soc = xr.open_zarr(f'{dir02}ds_prep_soc.zarr')
ds_prep_pot_prim = xr.open_zarr(f'{dir02}ds_prep_pot_prim.zarr')
ds_prep_pot_secd = xr.open_zarr(f'{dir02}ds_prep_pot_secd.zarr')

In [None]:
ds_prep_geom90m

In [None]:
ds_prep_soilgrids2017

In [None]:
ds_prep_soilgrids2020

In [None]:
ds_prep_worldclim

In [None]:
ds_prep_agbc

In [None]:
ds_prep_bgbc

In [None]:
ds_prep_soc

In [None]:
ds_prep_pot_prim

In [None]:
ds_prep_pot_secd

In [None]:
# Explanatory variables used in the random forest models
vars_exp = [
    'geom90m_convergence', 'geom90m_cti', 'geom90m_eastness',
    'geom90m_northness', 'geom90m_slope', 'geom90m_spi',
    'soilgrids2017_bdricm', 'soilgrids2017_bdrlog',
    'soilgrids2017_bdticm',
    'soilgrids2020_cec', 'soilgrids2020_cfvo', 'soilgrids2020_clay',
    'soilgrids2020_phh2o', 'soilgrids2020_sand', 'soilgrids2020_silt',
    'worldclim_bio1', 'worldclim_bio3', 'worldclim_bio4',
    'worldclim_bio5', 'worldclim_bio6', 'worldclim_bio12',
    'worldclim_bio13', 'worldclim_bio14', 'worldclim_bio15',
    'worldclim_elev',
]

# Combine the predictor datasets, keep only the explanatory variables and
# drop the 'band' / 'spatial_ref' entries
ds_var = (
    xr.merge([ds_prep_geom90m, ds_prep_soilgrids2017,
              ds_prep_soilgrids2020, ds_prep_worldclim])[vars_exp]
    .drop_vars(['band', 'spatial_ref'])
)

# Merge with biomass, soil carbon and potential primary/secondary data,
# then rechunk to 5000 x 5000 (lat x lon) blocks
ds_dbase = (
    xr.merge([ds_prep_agbc, ds_prep_bgbc, ds_prep_soc,
              ds_prep_pot_prim, ds_prep_pot_secd,
              ds_var])
    .drop_vars(['band', 'spatial_ref'])
    .chunk(lat=5000, lon=5000)
)

ds_dbase

In [None]:
# Flatten the gridded dataset into a dask dataframe with 'lat' and 'lon'
# as the leading coordinate columns
df_dbase = ds_dbase.to_dask_dataframe(dim_order=['lat', 'lon'])

# Drop rows containing any missing value (non-land grid cells) and persist
# the result so the following steps reuse it instead of recomputing it
df_dbase = df_dbase.dropna().persist()

df_dbase

In [None]:
def get_train_cells(prim_secd, ncells=50, random_state=None, df=None):
    """Select training grid cells for primary or secondary land.

    Candidate cells are the rows whose ``'pot_' + prim_secd`` column equals 1.
    They are binned into 1 x 1 degree tiles (floor of ``lat`` and ``lon``).
    Tiles holding at most ``ncells`` candidates contribute all of them;
    larger tiles contribute a random sample of exactly ``ncells`` candidates.

    Parameters
    ----------
    prim_secd : str
        Suffix of the candidate column, e.g. ``'prim'`` or ``'secd'``.
    ncells : int, default 50
        Maximum number of training cells taken from one tile.
    random_state : int, numpy random state or None, default None
        Forwarded to the pandas group-wise ``sample``. Pass an integer to
        make the selection reproducible; ``None`` keeps it unseeded.
    df : dask or pandas dataframe, optional
        Table with ``lat``, ``lon`` and the candidate column. Defaults to
        the module-level ``df_dbase``.

    Returns
    -------
    pandas.DataFrame
        Columns ``lat``, ``lon`` and ``'train_' + prim_secd`` (always 1),
        one row per selected training cell.
    """

    if df is None:
        df = df_dbase

    train_col = 'train_' + prim_secd

    # Keep only the coordinates of potential training grid cells
    df_sel = df[df['pot_' + prim_secd] == 1][['lat', 'lon']]
    if hasattr(df_sel, 'compute'):
        # Dask input: bring the (much smaller) candidate table into memory
        df_sel = df_sel.compute()

    # Unique 0..n-1 row labels; the labels coming out of dask can repeat
    # across partitions
    df_sel = df_sel.reset_index(drop=True).rename_axis('idx')

    if df_sel.empty:
        return df_sel.assign(**{train_col: 1})

    # Integer identifiers of the 1deg*1deg tile each candidate falls into
    tile_keys = ['tile_lat', 'tile_lon']
    tiles = pd.DataFrame({
        'tile_lat': np.floor(df_sel['lat']).astype('int'),
        'tile_lon': np.floor(df_sel['lon']).astype('int'),
    })

    # Number of candidates in the tile of each row
    tile_size = tiles.groupby(tile_keys)['tile_lat'].transform('count')
    in_large_tile = tile_size > ncells

    # Tiles with <= ncells candidates: keep every candidate
    idx_small = tiles.index[~in_large_tile]

    # Tiles with > ncells candidates: random sample of ncells per tile
    idx_large = (
        tiles[in_large_tile]
        .groupby(tile_keys)
        .sample(n=ncells, random_state=random_state)
        .index
    )

    # Select rows for training data and flag them
    return df_sel.loc[idx_small.append(idx_large)].assign(**{train_col: 1})

In [None]:
# Flag primary and secondary training grid cells; the left join keeps every
# row of the database and leaves the flag missing where nothing matched
df_dbase = df_dbase.merge(
    get_train_cells('prim'), on=['lat', 'lon'], how='left')
df_dbase = df_dbase.merge(
    get_train_cells('secd'), on=['lat', 'lon'], how='left')

# Cast the flag columns to boolean (missing flags become False)
for i in ['pot_prim', 'pot_secd', 'train_prim', 'train_secd']:
    df_dbase[i] = df_dbase[i].fillna(0).astype('bool')

# Column order: coordinates, flags, carbon pools, explanatory variables
df_dbase = df_dbase[[
    'lat', 'lon',
    'pot_prim', 'pot_secd', 'train_prim', 'train_secd',
    'agbc_max', 'agbc_mean', 'agbc_min',
    'bgbc_max', 'bgbc_mean', 'bgbc_min',
    'soc_max', 'soc_mean', 'soc_min',
    *vars_exp,
]]

# Export as parquet in partitions of about 200 MiB
(
    df_dbase
    .repartition(partition_size='200 MiB')
    .to_parquet(dir02 + 'df_dbase.parquet')
)

In [None]:
# Close the client before the cluster, so that the dask computations in the
# check cells below do not target a scheduler that has been shut down
client.close()
cluster.close()

---

### Check

In [None]:
# Re-open the exported database from disk and preview its first rows
df_dbase = dd.read_parquet(f'{dir02}df_dbase.parquet')
df_dbase.head()

In [None]:
# Number of grid cells flagged True in each potential / training column
df_dbase[['pot_prim', 'pot_secd', 'train_prim', 'train_secd']].sum().compute()

---

### Check - Plots

In [None]:
# Directory with the coastline shapefile used as background in the check plots
dir_nearth = '../data/naturalearth/'

In [None]:
import geopandas as gpd
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.pyplot as plt

In [None]:
# Load the coastline shapefile that is drawn underneath the hexbin maps
coastline110 = gpd.read_file(
    f'{dir_nearth}ne_110m_coastline/ne_110m_coastline.shp')

In [None]:
def plot_hexbin(df):
    """Draw a hexbin map of grid-cell counts on top of the coastline.

    Parameters
    ----------
    df : dataframe
        Must provide ``lon`` and ``lat`` columns; the column at position 2
        holds the values that are summed within each hexagon.

    A new 20 x 10 inch figure is created with a log-scaled colour bar;
    nothing is returned.
    """

    fig, ax = plt.subplots(figsize=(20, 10), ncols=1, nrows=1)

    # Coastline as thin black background lines
    coastline110.plot(ax=ax, color='#000000', linewidth=0.5)

    # np.sum reduces the values of each hexagon in compiled code; the builtin
    # sum would add them one by one in Python
    im = ax.hexbin(x=df.lon, y=df.lat, C=df.iloc[:, 2],
                   gridsize=150, reduce_C_function=np.sum, linewidths=0.2,
                   cmap='inferno', bins='log')

    # Fixed, readable tick positions on the logarithmic colour scale
    cbar_ticks = [1, 5, 10, 25, 50]

    # Narrow colour bar attached to the right edge of the map axes
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="2%", pad=0.2)
    cbar = fig.colorbar(im, cax=cax, label='Grid cells for training')
    cbar.set_ticks(cbar_ticks)
    cbar.set_ticklabels(cbar_ticks)

In [None]:
plot_hexbin(df_dbase[['lat', 'lon', 'train_prim']][df_dbase.train_prim == 1])

In [None]:
plot_hexbin(df_dbase[['lat', 'lon', 'pot_prim']][df_dbase.pot_prim == 1])

In [None]:
plot_hexbin(df_dbase[['lat', 'lon', 'pot_secd']][df_dbase.pot_secd == 1])

In [None]:
plot_hexbin(df_dbase[['lat', 'lon', 'train_secd']][df_dbase.train_secd == 1])