# Prepare Copernicus LUC data

In [None]:
# Libraries
import os, time, shutil, rioxarray
import numpy as np
import xarray as xr
from scipy import ndimage
from dask.distributed import Lock
import matplotlib.pyplot as plt

In [None]:
# Directories
# NOTE: both paths end with a slash; file names are appended to them by plain
# string concatenation further down (e.g. dir01 + name + '.tif').
# Root of the raw input data; the Copernicus TIFs are read from
# <dir_data>/copernicus_luc/<year>/
dir_data =  '../data/'
# Output directory of this preparation step (tifs and zarr stores)
dir01 = '../paper_deficit/output/01_prep/'

---

### Pre-processing

In [None]:
# Libraries
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
import dask

# Initialize dask: describe the SLURM job that backs the dask workers
cluster = SLURMCluster(
    queue='compute',                      # SLURM queue to use
    cores=24,                             # Number of CPU cores per job
    memory='256 GB',                      # Memory per job
    account='bm0891',                     # Account allocation
    interface="ib0",                      # Network interface for communication
    walltime='02:00:00',                  # Maximum runtime per job
    local_directory='../dask/',           # Directory for local storage
    job_extra_directives=[                # Additional SLURM directives for logging
        '-o ../dask/LOG_worker_%j.o',     # Output log
        '-e ../dask/LOG_worker_%j.e'      # Error log
    ]
)

# Scale dask cluster: request 19 SLURM jobs with the resources set above
cluster.scale(jobs=19)

# Configure the dashboard URL so that the link is routed through the
# JupyterHub proxy
dask.config.config.get('distributed').get('dashboard').update(
    {'link': '{JUPYTERHUB_SERVICE_PREFIX}/proxy/{port}/status'}
)

# Create client connected to the cluster; `client` is used as a global by
# prep_copernicus() below
client = Client(cluster)

# Show the client summary as cell output
client

In [None]:
# Function to trim memory of workers
def trim_memory() -> int:
    """Ask glibc to hand unused heap memory back to the operating system.

    Meant to be executed on the dask workers (see ``client.run(trim_memory)``).
    Loads ``libc.so.6`` directly, so it only works where that library exists.

    Returns:
        int: The return value of ``malloc_trim(0)``.
    """
    from ctypes import CDLL

    return CDLL("libc.so.6").malloc_trim(0)

In [None]:
# Functions to prepare copernicus luc data

def _open_da_cop_luc(year, ltype):
    """ Open Copernicus LUC TIF file and change NaN-default value to NaN """

    # dictionary with luc types and orig names
    dict_cop_fract = {
        'cop_bare': 'Bare-CoverFraction',
        'cop_builtup': 'BuiltUp-CoverFraction', 
        'cop_crops': 'Crops-CoverFraction',
        'cop_grass': 'Grass-CoverFraction',
        'cop_moss_lichen': 'MossLichen-CoverFraction',
        'cop_permanent_water': 'PermanentWater-CoverFraction',
        'cop_seasonal_water': 'SeasonalWater-CoverFraction',
        'cop_shrub': 'Shrub-CoverFraction',
        'cop_snow': 'Snow-CoverFraction',
        'cop_tree': 'Tree-CoverFraction'
    }
    
    #read data
    if year == 2015:
        str_add = '-base_'
    if year in [2016, 2017, 2018]:
        str_add = '-conso_'
    if year == 2019:
        str_add = '-nrt_'
    
    if ltype != 'forest_type':
        file_path = (f"{dir_data}/copernicus_luc/{year}"
                     f"/PROBAV_LC100_global_v3.0.1_{year}{str_add}"
                     f"{dict_cop_fract[ltype]}-layer_EPSG-4326.tif")
    if ltype == 'forest_type':
        file_path = (f"{dir_data}/copernicus_luc/{year}"
                     f"/PROBAV_LC100_global_v3.0.1_{year}{str_add}"
                     f"Forest-Type-layer_EPSG-4326.tif")
        
    da = rioxarray.open_rasterio(file_path,
                                 chunks=dict(y=5000, x=5000), lock=False)

    # set nan-default value to nan
    da = da.where(da!=255).squeeze('band', drop=True)

    return da


def _prep_da_cop_luc(ltype):
    """Build the 2015-2019 mean layer of one fractional Copernicus LUC type.

    Args:
        ltype (str): Cover-fraction key understood by ``_open_da_cop_luc``
            (e.g. 'cop_tree').

    Returns:
        The per-cell mean over the five years, rounded to whole numbers and
        stored as uint8 with 255 marking cells that are NaN in the mean. The
        attributes are taken from the stacked input.
    """

    # Open every year and tag it with a length-1 'year' dimension
    yearly_layers = []
    for year in range(2015, 2020):
        layer = _open_da_cop_luc(year, ltype)
        yearly_layers.append(layer.expand_dims(year=[year]))

    # Stack the yearly layers along 'year'
    stacked = xr.concat(yearly_layers, dim='year')

    # Mean over the years, rounded; NaN goes back to the 255 fill value so
    # that the result fits into uint8
    averaged = stacked.mean('year')
    averaged = averaged.round(0)
    averaged = averaged.fillna(255)
    da_mean = averaged.astype(np.uint8)

    # Carry over the attributes found on the 2015 slice
    da_mean.attrs = stacked.sel(year=2015).attrs

    return da_mean


def _prep_ds_forest_types():
    """Build one 0/1 layer per Copernicus forest type from the 2015-2019 TIFs.

    For every year the forest type raster is split into one 0/1 mask per
    type code. The masks are averaged over the five years and rounded, so a
    cell ends up as 1 when the type was present in most of the years. Cells
    that ``_open_da_cop_luc`` masked as NaN compare unequal to every code
    and are therefore 0 in all masks.

    Returns:
        xarray.Dataset: uint8 variables 'cop_forest_unknown',
        'cop_forest_enf', 'cop_forest_ebf', 'cop_forest_dnf',
        'cop_forest_dbf' and 'cop_forest_mixed', each with descriptive
        attributes.
    """

    # (variable name, type code in the raster, short_name attribute)
    forest_classes = (
        ('cop_forest_unknown', 0, 'Unknown forest type'),
        ('cop_forest_enf', 1, 'Evergreen needle-leaved'),
        ('cop_forest_ebf', 2, 'Evergreen broadleaved'),
        ('cop_forest_dnf', 3, 'Deciduous needle-leaved'),
        ('cop_forest_dbf', 4, 'Deciduous broadleaved'),
        ('cop_forest_mixed', 5, 'Mixed'),
    )

    def prep_da_cop_forest(year):
        # forest type raster of one year
        da_type = _open_da_cop_luc(year, 'forest_type')
        # one 0/1 mask per forest type code
        ds_year = xr.Dataset()
        for name, code, _ in forest_classes:
            ds_year[name] = xr.where(da_type == code, 1, 0).astype('uint8')

        return ds_year

    # Yearly mask datasets, each tagged with a length-1 'year' dimension
    yearly_masks = []
    for year in range(2015, 2020):
        yearly_masks.append(prep_da_cop_forest(year).expand_dims(year=[year]))
    ds_years = xr.concat(yearly_masks, dim='year')

    # Average over the five years, round, and convert to uint8
    ds = ds_years.mean('year').round(0).astype(np.uint8)

    # Attributes shared by all forest type variables
    shared_attrs = dict(
        band_crs='WGS84 (EPSG:4326)',
        missing_value=255,
        _FillValue=255.0,
        scale_factor=1.0,
        add_offset=0.0,
        unit='None',
        note='Forest Type for all pixels with tree PVC bigger than 1%'
    )

    # Attach the shared attributes plus the type-specific short name
    for name, _, short_name in forest_classes:
        ds[name].attrs = dict(shared_attrs, short_name=short_name)

    return ds


def prep_copernicus():
    """Prepare the Copernicus LUC layers and export each one as a tiled TIF.

    Writes ``<dir01><name>.tif`` for every fractional cover type and every
    forest type. Relies on the module-level ``client`` (dask) and ``dir01``.
    After each export the workers are asked to trim their memory.

    Returns:
        None
    """

    # list with copernicus luc types except forest types
    list_cop_fract = ['cop_bare', 'cop_builtup', 'cop_crops', 'cop_grass',
                      'cop_moss_lichen', 'cop_permanent_water',
                      'cop_seasonal_water', 'cop_shrub', 'cop_snow', 'cop_tree']

    # list with copernicus forest types
    list_cop_ftype = ['cop_forest_unknown', 'cop_forest_enf', 'cop_forest_ebf',
                      'cop_forest_dnf', 'cop_forest_dbf', 'cop_forest_mixed']

    # NOTE(review): no `lock` is passed to to_raster below; according to the
    # rioxarray documentation the write then runs in a single process.
    # Confirm this is intended.

    # prepare and export luc types except forest types as tif
    for ltype in list_cop_fract:
        # Prepare data
        da = _prep_da_cop_luc(ltype)
        # Export as tif
        da.rio.to_raster(dir01 + ltype + '.tif', tiled=True)
        # Trim memory
        client.run(trim_memory)

    # Build the forest type dataset once instead of once per variable;
    # it does not depend on the loop variable
    ds_forest = _prep_ds_forest_types()

    # prepare and export luc forest types as tif
    for ftype in list_cop_ftype:
        # Select data
        da = ds_forest[ftype]
        # Export as tif
        da.rio.to_raster(dir01 + ftype + '.tif', tiled=True)
        # Trim memory
        client.run(trim_memory)

In [None]:
# Prepare copernicus luc data: exports one tif per luc type / forest type
# to dir01 and reports the run time
%time prep_copernicus()

In [None]:
# Close dask client first so it disconnects cleanly, then shut down the
# cluster (and with it the SLURM worker jobs)
client.close()
cluster.close()

In [None]:
# Wait for 60s for dask client to completely disconnect
# NOTE(review): presumably needed because the regridding step below starts
# its own SLURM jobs - confirm
time.sleep(60)

---

### Regridding

In [None]:
# Import regridding function
from regrid_high_res_v1_01 import regrid_high_res, prep_tif

In [None]:
def regrid_da(f_source, dir_target, dir_source, dir_out,
              size_tiles, fill_value=None, olap=1,
              account='bm0891', partition='compute'):
    """Regrid a large source raster to the target grid.

    Thin wrapper around ``regrid_high_res``: the target grid is read from
    ``<dir_target>target_grid.tif`` and the source from
    ``<dir_source><f_source>.tif``. Results are exported as zarr and the
    intermediate files are kept (``del_interm=False``).

    Args:
        f_source (str): The filename (without extension) of the source .tif file to be regridded.
        dir_target (str): Directory containing the target grid .tif file (with trailing slash).
        dir_source (str): Directory containing the source .tif file (with trailing slash).
        dir_out (str): Directory to store the output and intermediate files.
        size_tiles (int): Size of the regridding tiles in degrees.
        fill_value (float, optional): Fill value to use in the regridding process. Defaults to None.
        olap (int, optional): Overlap size in degrees for regridding tiles. Defaults to 1.
        account (str, optional): Account passed on to ``regrid_high_res``. Defaults to 'bm0891'.
        partition (str, optional): Partition passed on to ``regrid_high_res``. Defaults to 'compute'.

    Returns:
        None: The result is written to disk by ``regrid_high_res``.
    """
    # Prepare the target and source data arrays from TIFF files
    da_target = prep_tif(dir_target + 'target_grid.tif', 'target_grid')
    da_source = prep_tif(dir_source + f_source + '.tif', f_source)
    # Regrid source array to target grid
    regrid_high_res(da_target, da_source, dir_out,
                    account=account, partition=partition,
                    size_tiles=size_tiles, olap=olap, fill_value=fill_value,
                    type_export='zarr', del_interm=False)

In [None]:
# list with copernicus luc types except forest types
list_cop_fract = ['cop_bare', 'cop_builtup', 'cop_crops', 'cop_grass',
                  'cop_moss_lichen', 'cop_permanent_water',
                  'cop_seasonal_water', 'cop_shrub', 'cop_snow', 'cop_tree']


# Regrid every fractional cover tif; target grid, source tifs and outputs
# all live in dir01. Arguments: 15 degree tiles, fill value 255, 0.2 degree
# tile overlap.
for i in list_cop_fract:
    print(i)
    %time regrid_da(i, dir01, dir01, dir01, 15, 255, 0.2)

In [None]:
# list with copernicus forest types
list_cop_ftype = ['cop_forest_unknown', 'cop_forest_enf', 'cop_forest_ebf', 
                  'cop_forest_dnf', 'cop_forest_dbf', 'cop_forest_mixed']

# Regrid every forest type tif with the same settings as the fractional
# cover layers: 15 degree tiles, fill value 255, 0.2 degree tile overlap
for i in list_cop_ftype:
    print(i)
    %time regrid_da(i, dir01, dir01, dir01, 15, 255, 0.2)

---

### Fill NaNs

In [None]:
def fill_nans(var, dir_out):
    """
    Fills NaN values in the specified variable's dataset using the nearest valid
    data, applies a land mask, and exports the result to a new Zarr dataset.

    Reads ``ds_regridded_<var>.zarr`` (variable ``regridded_<var>``) and
    ``ds_prep_copernicus_land_mask.zarr`` (variable ``copernicus_land_mask``)
    from ``dir_out`` and writes ``ds_prep_<var>.zarr`` (variable ``<var>``),
    overwriting an existing store. The variable is expected to have the
    dimensions ('lat', 'lon').

    Args:
        var (str): Name of the variable to process (e.g., 'cop_tree', 'cop_forest_enf').
        dir_out (str): Directory where prepared data is stored and the filled dataset will be exported.

    Returns:
        None
    """

    def fill_nans_array(data, invalid):
        """
        Replace invalid (NaN) data cells by the value of the nearest valid data
        cell.
        """
        # For every cell, get the index of the nearest cell where `invalid`
        # is False, then look the data up at those indices
        ind = ndimage.distance_transform_edt(invalid,
                                             return_distances=False,
                                             return_indices=True)
        return data[tuple(ind)]

    # Paths for input and output
    land_mask_path = os.path.join(dir_out, 'ds_prep_copernicus_land_mask.zarr')
    var_data_path = os.path.join(dir_out, f'ds_regridded_{var}.zarr')
    output_path = os.path.join(dir_out, f'ds_prep_{var}.zarr')

    # Read land mask data
    da_land = xr.open_zarr(land_mask_path) \
                .chunk(dict(lat=5000, lon=5000)) \
                .copernicus_land_mask \
                .compute()

    # Read variable data and load it into memory once; the NaN check and the
    # filling below then work on the in-memory array instead of re-reading
    # the store several times
    da_var = xr.open_zarr(var_data_path)['regridded_' + var].compute()
    da_fill = da_var.values
    invalid = da_var.isnull().values

    # Fill NaNs using function fill_nans_array.
    # Skip if there is nothing to fill, or if there is no valid cell to
    # copy values from.
    if invalid.any() and not invalid.all():
        da_fill = fill_nans_array(da_fill, invalid)

    # Create a new Dataset with filled data
    ds_filled = xr.Dataset(dict(lat=da_var.lat, lon=da_var.lon))
    ds_filled[var] = (('lat', 'lon'), da_fill)

    # Apply land mask to the filled data
    ds_filled = ds_filled.where(da_land)

    # Export the filled dataset to Zarr format
    ds_filled.chunk(dict(lat=5000, lon=5000)) \
             .to_zarr(output_path, mode='w')

In [None]:
%%time
# The cell magic above times the whole cell; the line magic in the loop
# times every single variable.
# List of luc variables
list_cop_fract = ['cop_bare', 'cop_builtup', 'cop_crops', 'cop_grass',
                  'cop_moss_lichen', 'cop_permanent_water',
                  'cop_seasonal_water', 'cop_shrub', 'cop_snow', 'cop_tree']

# List of forest variables
list_cop_ftype = ['cop_forest_unknown', 'cop_forest_enf', 'cop_forest_ebf', 
                  'cop_forest_dnf', 'cop_forest_dbf', 'cop_forest_mixed']

# Fill nans of all regridded luc and forest variables; inputs are read from
# and results written to dir01
for i in [*list_cop_fract, *list_cop_ftype]:
    %time fill_nans(i, dir01)

---

### Check

In [None]:
# Visual check: draw one map per prepared luc variable
list_cop_fract = [
    'cop_bare', 'cop_builtup', 'cop_crops', 'cop_grass',
    'cop_moss_lichen', 'cop_permanent_water',
    'cop_seasonal_water', 'cop_shrub', 'cop_snow', 'cop_tree',
]

for var_name in list_cop_fract:
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
    # Open the prepared zarr store of this variable and plot it
    da_check = xr.open_zarr(dir01 + 'ds_prep_' + var_name + '.zarr')[var_name]
    da_check.plot.imshow(ax=ax)
    ax.set_title(var_name)

In [None]:
# Visual check: draw one map per prepared forest type variable
list_cop_ftype = [
    'cop_forest_unknown', 'cop_forest_enf', 'cop_forest_ebf',
    'cop_forest_dnf', 'cop_forest_dbf', 'cop_forest_mixed',
]

for var_name in list_cop_ftype:
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
    # Open the prepared zarr store of this variable and plot it
    da_check = xr.open_zarr(dir01 + 'ds_prep_' + var_name + '.zarr')[var_name]
    da_check.plot.imshow(ax=ax)
    ax.set_title(var_name)