# Prepare Riggio data on human influence

In [None]:
# Libraries
import os, time, shutil, rioxarray
import numpy as np
import xarray as xr
from scipy import ndimage
import matplotlib.pyplot as plt

In [None]:
# Directories (relative to the notebook's working directory)
# Root folder of the raw input data; prep_riggio() reads the Riggio rasters
# from a sub-folder of it.
dir_data = '../data/'
# Folder for the outputs of this preparation step; every function below
# reads from and/or writes to it.
dir01 = '../paper_deficit/output/01_prep/'

---

### Pre-processing

In [None]:
def prep_riggio():
    """Split the Riggio human-influence rasters into per-class layers.

    For each of the two input GeoTIFFs (very low / low human influence) the
    raster is reprojected to EPSG:4326. For every class value 0-4 a float32
    raster is then written to ``dir01`` that is 1.0 where the input equals
    that value and 0.0 elsewhere. Output files are named
    ``riggio_<type><value>.tif`` (e.g. ``riggio_vlhi0.tif``).

    Reads the module-level ``dir_data`` and ``dir01``; returns nothing.
    """
    # Folder holding the downloaded Riggio rasters
    dir_riggio = dir_data + 'riggio2020/Global_Human_Influence_Dryad/'

    # Short tag used in output names -> input file name
    hi_types = {
        'vlhi': 'Figure2a_VeryLowHumanInfluence.tif',
        'lhi': 'Figure2b_LowHumanInfluence.tif'
    }

    for hi_type in hi_types:
        file_name = hi_types[hi_type]

        # Open the raster, bring it onto a lat/lon grid and chunk it
        raster = rioxarray.open_rasterio(dir_riggio + file_name)
        raster = raster.rio.reproject('EPSG:4326')
        da = raster.chunk({'y': 5000, 'x': 5000})

        # One indicator layer per class value
        for i in range(5):
            layer_name = 'riggio_' + hi_type + str(i)
            indicator = xr.where(da == i, True, False).astype('float32')
            indicator = indicator.rename(layer_name)
            indicator.rio.to_raster(dir01 + layer_name + '.tif')

In [None]:
# Run the pre-processing once and report how long it took
%time prep_riggio()

---

### Regridding

In [None]:
# Import regridding function
from regrid_high_res_v1_01 import regrid_high_res, prep_tif

In [None]:
def regrid_da(f_source, dir_target, dir_source, dir_out,
              size_tiles, fill_value=None, olap=1):
    """Regrid a large source raster onto the target grid.

    Loads ``target_grid.tif`` from ``dir_target`` and ``<f_source>.tif`` from
    ``dir_source`` with ``prep_tif`` and hands both to ``regrid_high_res``,
    requesting zarr export and keeping intermediate files
    (``del_interm=False``).

    Args:
        f_source (str): File name (without extension) of the source .tif
            file to be regridded.
        dir_target (str): Directory containing the target grid .tif file.
        dir_source (str): Directory containing the source .tif file.
        dir_out (str): Directory passed to ``regrid_high_res`` for output
            and intermediate files.
        size_tiles (int): Size of the regridding tiles in degrees.
        fill_value (float, optional): Fill value forwarded to the regridding
            process. Defaults to None.
        olap (int, optional): Overlap of the regridding tiles in degrees.
            Defaults to 1.

    Returns:
        None. The result is produced as a side effect of
        ``regrid_high_res`` (presumably written below ``dir_out`` - confirm
        against that function).
    """
    # Target grid
    path_target = dir_target + 'target_grid.tif'
    da_target = prep_tif(path_target, 'target_grid')

    # Source data, named after its file
    path_source = dir_source + f_source + '.tif'
    da_source = prep_tif(path_source, f_source)

    # NOTE(review): account/partition look like cluster job settings and are
    # hard-coded here - confirm they are valid for the machine in use.
    regrid_high_res(
        da_target, da_source, dir_out,
        account='bm0891', partition='compute',
        size_tiles=size_tiles, olap=olap, fill_value=fill_value,
        type_export='zarr', del_interm=False,
    )

In [None]:
# Regridding
for i in range(5):
    %time regrid_da('riggio_vlhi' + str(i), dir01, dir01, dir01, 30, 127, 0.1)
    %time regrid_da('riggio_lhi' + str(i), dir01, dir01, dir01, 30, 127, 0.1)
# takes about 4-6min for one file

---

### Fill nans

In [None]:
def fill_nans(var, dir_out):
    """
    Fill NaN cells of a regridded variable with the nearest valid value,
    apply the land mask, and export the result to a new Zarr store.

    Reads ``ds_regridded_{var}.zarr`` (variable ``regridded_{var}``) and
    ``ds_prep_copernicus_land_mask.zarr`` (variable ``copernicus_land_mask``)
    from ``dir_out`` and writes ``ds_prep_{var}.zarr`` (variable ``var``) to
    the same directory, overwriting an existing store (``mode='w'``).

    Args:
        var (str): Name of the variable to process (e.g., 'riggio_vlhi0').
        dir_out (str): Directory where the input stores are located and the
            filled dataset will be exported.

    Returns:
        None
    """

    def fill_nans_array(data, invalid):
        """
        Replace invalid (NaN) data cells by the value of the nearest valid
        data cell.
        """
        # For every cell, the index of the closest cell where `invalid` is
        # False; valid cells map to themselves.
        ind = ndimage.distance_transform_edt(invalid,
                                             return_distances=False,
                                             return_indices=True)
        return data[tuple(ind)]

    # Paths for input and output
    land_mask_path = os.path.join(dir_out, 'ds_prep_copernicus_land_mask.zarr')
    var_data_path = os.path.join(dir_out, f'ds_regridded_{var}.zarr')
    output_path = os.path.join(dir_out, f'ds_prep_{var}.zarr')

    # Read land mask data
    da_land = xr.open_zarr(land_mask_path) \
                .chunk(dict(lat=5000, lon=5000)) \
                .copernicus_land_mask \
                .compute()

    # Read variable data into memory once. The array is needed as a whole
    # for the nearest-neighbour fill anyway; keeping it lazy would re-read
    # the store for every later access (.values, .isnull()).
    da_var = xr.open_zarr(var_data_path)['regridded_' + var].compute()
    values = da_var.values
    invalid = da_var.isnull().values

    # Only run the (expensive) fill when there is something to fill
    if invalid.any():
        da_fill = fill_nans_array(values, invalid)
    else:
        da_fill = values

    # Create a new Dataset with filled data
    # (assumes the regridded variable has dimensions (lat, lon) in that order)
    ds_filled = xr.Dataset(dict(lat=da_var.lat, lon=da_var.lon))
    ds_filled[var] = (('lat', 'lon'), da_fill)

    # Apply land mask to the filled data
    ds_filled = ds_filled.where(da_land)

    # Export the filled dataset to Zarr format
    ds_filled.chunk(dict(lat=5000, lon=5000)) \
             .to_zarr(output_path, mode='w')

In [None]:
%%time
for i in range(5):
    %time fill_nans('riggio_vlhi' + str(i), dir01)
    %time fill_nans('riggio_lhi' + str(i), dir01)

---

### Check

In [None]:
# Visual check: one map per prepared very-low-human-influence layer
names_vlhi = (
    'riggio_vlhi0',
    'riggio_vlhi1',
    'riggio_vlhi2',
    'riggio_vlhi3',
    'riggio_vlhi4',
)
for i in names_vlhi:
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
    # Each prepared store holds a variable named like the layer itself
    ds_check = xr.open_zarr(dir01 + 'ds_prep_' + i + '.zarr')
    ds_check[i].plot.imshow(ax=ax)
    ax.set_title(i)

In [None]:
# Visual check: one map per prepared low-human-influence layer
names_lhi = (
    'riggio_lhi0',
    'riggio_lhi1',
    'riggio_lhi2',
    'riggio_lhi3',
    'riggio_lhi4',
)
for i in names_lhi:
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
    # Each prepared store holds a variable named like the layer itself
    ds_check = xr.open_zarr(dir01 + 'ds_prep_' + i + '.zarr')
    ds_check[i].plot.imshow(ax=ax)
    ax.set_title(i)