Skip to content

Commit

Permalink
Adding tests to warrant ATLAS dataset support (#319)
Browse files Browse the repository at this point in the history
* Adding C3S ATLAS test datasets
* Add fix for compressed str variables and cap compression level at 1
* Linting updates
---------

Co-authored-by: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com>
  • Loading branch information
sol1105 and Zeitsperre committed Feb 9, 2024
1 parent 4160b30 commit 640dbc3
Show file tree
Hide file tree
Showing 18 changed files with 281 additions and 27 deletions.
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ repos:
hooks:
- id: toml-sort-fix
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 23.12.1
rev: 24.1.1
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: 6.1.0
rev: 7.0.0
hooks:
- id: flake8
args: [ '--config=.flake8' ]
Expand All @@ -47,7 +47,7 @@ repos:
rev: v0.3.9
hooks:
- id: blackdoc
additional_dependencies: [ 'black==23.12.1' ]
additional_dependencies: [ 'black==24.1.1' ]
- repo: https://github.com/python-jsonschema/check-jsonschema
rev: 0.27.3
hooks:
Expand Down
5 changes: 5 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ Bug Fixes
* Fixed KeyError for temporal subsetting by components if not all components can be found in the dataset (#316).
* Raising KeyError for temporal subsetting by components when no time steps match the selection criteria (#316).
* Coordinate detection for remapping operator via standard_name if detection via cf-xarray fails / is ambiguous (#316).
* Remove encoding settings with regard to compression for string variables to avoid netCDF write errors with newer netcdf-c versions (>4.9.0) (#319).

Other Changes
^^^^^^^^^^^^^
* The compression level is capped at 1 to reduce write times (#319).

v0.12.2 (2024-01-03)
--------------------
Expand Down
2 changes: 1 addition & 1 deletion binder/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ dependencies:
- pyproj >=3.3.0
- requests >=2.0
- roocs-grids>=0.1.2
- roocs-utils >=0.6.4,<0.7
- roocs-utils >=0.6.7,<0.7
- shapely >=1.9
- xarray >=0.21,<2023.3.0 # https://github.com/pydata/xarray/issues/7794
- xesmf >=0.8.2
Expand Down
1 change: 1 addition & 0 deletions clisops/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""CLISOPS - Climate simulation operations."""

import os
import warnings

Expand Down
1 change: 1 addition & 0 deletions clisops/core/average.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Average module."""

import warnings
from pathlib import Path
from typing import Sequence, Tuple, Union
Expand Down
1 change: 1 addition & 0 deletions clisops/core/regrid.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Regrid module."""

from __future__ import annotations

import functools
Expand Down
1 change: 1 addition & 0 deletions clisops/core/subset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Subset module."""

import numbers
import re
import warnings
Expand Down
57 changes: 57 additions & 0 deletions clisops/ops/base_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,59 @@ def _calculate(self):
"""The `_calculate()` method is implemented within each operation subclass."""
raise NotImplementedError

def _remove_str_compression(self, ds):
    """Remove compression-related encoding entries from string variables.

    netCDF4 datatypes of variable length are decoded to str by xarray<2023.11.0.
    As of xarray 2023.11.0 they are decoded to one of np.dtypes.StrDType (eg. "<U20")
    of variable length and stripped of all encoding settings. In netcdf-c versions >= 4.9.0
    and xarray < 2023.11.0 the latter part needs to be conducted manually to avoid an
    Exception when writing the xarray.Dataset to disk.

    See issue: https://github.com/Unidata/netcdf4-python/issues/1205
    See PR: https://github.com/roocs/clisops/pull/319

    Parameters
    ----------
    ds : xr.Dataset or xr.DataArray
        Object whose variable/coordinate encodings are cleaned (modified in place).

    Returns
    -------
    xr.Dataset or xr.DataArray
        The input object, with compression settings removed from str-typed variables.
    """
    if isinstance(ds, xr.Dataset):
        varlist = list(ds.coords) + list(ds.data_vars)
    elif isinstance(ds, xr.DataArray):
        varlist = list(ds.coords)
    else:
        # Fix: previously `varlist` was left unbound for any other input type,
        # raising UnboundLocalError in the loop below. Pass such objects through.
        return ds

    # Encoding keys that trigger compression in the netCDF4/h5netcdf backends.
    compression_keys = (
        "compression",
        "complevel",
        "shuffle",
        "fletcher32",
        "endian",
        "zlib",
    )
    for var in varlist:
        # `.get` returns None for a missing "dtype" key, which compares unequal
        # to `str`, so this is equivalent to the membership-then-compare pattern.
        if ds[var].encoding.get("dtype") == str:
            for key in compression_keys:
                # pop with default avoids a separate membership check per key
                ds[var].encoding.pop(key, None)
    return ds

def _cap_deflate_level(self, ds):
    """Cap the netCDF deflate (compression) level at 1.

    For CMOR3 / CMIP6 it was investigated which netCDF4 deflate_level should be set
    to optimize the balance between reduction of file size and degradation in
    performance. The values found were deflate_level=1, shuffle=True. To keep the
    write times at a minimum, compression level 1 is not exceeded.

    See issue: https://github.com/PCMDI/cmor/issues/403

    Parameters
    ----------
    ds : xr.Dataset or xr.DataArray
        Object whose variable/coordinate encodings are capped (modified in place).

    Returns
    -------
    xr.Dataset or xr.DataArray
        The input object, with "complevel" / "compression_opts" capped at 1.
    """
    if isinstance(ds, xr.Dataset):
        varlist = list(ds.coords) + list(ds.data_vars)
    elif isinstance(ds, xr.DataArray):
        varlist = list(ds.coords)
    else:
        # Fix: previously `varlist` was left unbound for any other input type,
        # raising UnboundLocalError in the loop below. Pass such objects through.
        return ds

    for var in varlist:
        encoding = ds[var].encoding
        # Fix: the original used `elif`, so "compression_opts" (h5netcdf) was
        # never capped whenever "complevel" (netCDF4) exceeded 1. The two keys
        # belong to different backends and must be capped independently.
        if encoding.get("complevel", 0) > 1:
            encoding["complevel"] = 1
        if encoding.get("compression_opts", 0) > 1:
            encoding["compression_opts"] = 1

    return ds

def _remove_redundant_fill_values(self, ds):
"""
Get coordinate and data variables and remove fill values added by xarray
Expand Down Expand Up @@ -151,6 +204,10 @@ def process(self) -> List[Union[xr.Dataset, Path]]:
processed_ds = self._remove_redundant_fill_values(processed_ds)
# remove redundant coordinates from bounds
processed_ds = self._remove_redundant_coordinates_attr(processed_ds)
# remove compression for string variables (as it is not supported by netcdf-c >= 4.9.0)
processed_ds = self._remove_str_compression(processed_ds)
# cap deflate level at 1
processed_ds = self._cap_deflate_level(processed_ds)

# Work out how many outputs should be created based on the size
# of the array. Manage this as a list of time slices.
Expand Down
1 change: 1 addition & 0 deletions clisops/utils/tutorial.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Testing and tutorial utilities module."""

# Most of this code copied and adapted from xarray, xclim, and raven
import hashlib
import re
Expand Down
2 changes: 1 addition & 1 deletion docs/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ dependencies:
- pyproj >=3.3.0
- requests >=2.0
- roocs-grids >=0.1.2
- roocs-utils >=0.6.4,<0.7
- roocs-utils >=0.6.7,<0.7
- shapely >=1.9
- xarray >=0.21,<2023.3.0 # https://github.com/pydata/xarray/issues/7794
- xesmf >=0.8.2
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ dependencies:
- pyproj >=3.3.0
- requests >=2.0
- roocs-grids >=0.1.2
- roocs-utils >=0.6.4,<0.7
- roocs-utils >=0.6.7,<0.7
- shapely >=1.9
- xarray >=0.21,<2023.03.0 # See: https://github.com/pydata/xarray/issues/7794
- xesmf >=0.8.2
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ dependencies = [
"requests>=2.0",
# roocs_grids is differently named on conda-forge
"roocs_grids>=0.1.2",
"roocs-utils>=0.6.4,<0.7",
"roocs-utils>=0.6.7,<0.7",
"shapely>=1.9",
"xarray>=0.21"
]
Expand Down
44 changes: 44 additions & 0 deletions tests/_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,3 +354,47 @@ def cmip6_archive_base():
MINI_ESGF_CACHE_DIR,
"master/test_data/pool/data/CORDEX/data/cordex/output/EUR-11/KNMI/MPI-M-MPI-ESM-LR/rcp85/r1i1p1/KNMI-RACMO22E/v1/mon/tas/v20190625/tas_EUR-11_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_KNMI-RACMO22E_v1_mon_209101.nc",
).as_posix()

# C3S ATLAS test files cached under MINI_ESGF_CACHE_DIR; stored as POSIX
# path strings for use in the test suite.

# ATLAS v1 datasets with full time series
ATLAS_v1_CMIP5 = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-cica-atlas/CMIP5/rcp26/pr_CMIP5_rcp26_mon_200601-210012.nc",
).as_posix()

ATLAS_v1_EOBS = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-cica-atlas/E-OBS/sfcwind_E-OBS_mon_195001-202112.nc",
).as_posix()

ATLAS_v1_ERA5 = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-cica-atlas/ERA5/psl_ERA5_mon_194001-202212.nc",
).as_posix()

# ATLAS v1 datasets with full horizontal grid
ATLAS_v1_CORDEX = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-cica-atlas/CORDEX-CORE/historical/huss_CORDEX-CORE_historical_mon_197001.nc",
).as_posix()

ATLAS_v1_EOBS_GRID = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-cica-atlas/E-OBS/t_E-OBS_mon_195001.nc",
).as_posix()

# ATLAS v0 datasets with full time series
ATLAS_v0_CORDEX_NAM = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-ipcc-ar6-atlas/CORDEX-NAM/historical/rx1day_CORDEX-NAM_historical_mon_197001-200512.nc",
).as_posix()

ATLAS_v0_CMIP6 = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-ipcc-ar6-atlas/CMIP6/ssp245/sst_CMIP6_ssp245_mon_201501-210012.nc",
).as_posix()

# ATLAS v0 datasets with full horizontal grid
ATLAS_v0_CORDEX_ANT = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-ipcc-ar6-atlas/CORDEX-ANT/rcp45/tnn_CORDEX-ANT_rcp45_mon_200601.nc",
).as_posix()
21 changes: 3 additions & 18 deletions tests/test_core_regrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ def test_grid_init_ds_tos_curvilinear(load_esgf_test_data):
def test_grid_init_ds_tas_cordex(load_esgf_test_data):
ds = xr.open_dataset(CORDEX_TAS_ONE_TIMESTEP, use_cftime=True)
grid = Grid(ds=ds)
print(ds)

assert grid.format == "CF"
assert grid.source == "Dataset"
Expand All @@ -146,7 +145,6 @@ def test_grid_init_ds_tas_cordex(load_esgf_test_data):
assert grid.ncells == 45225

ds = ds.drop(["lat", "lon", "lat_vertices", "lon_vertices"])
print(ds)
with pytest.raises(
Exception,
match="The grid format is not supported.",
Expand Down Expand Up @@ -235,7 +233,6 @@ def test_grid_init_ds_tas_unstructured(load_esgf_test_data):
assert grid.lat_bnds == "latitude_bnds"
assert grid.lon_bnds == "longitude_bnds"
assert grid.ncells == 20480
print(grid.contains_collapsed_cells)

# not implemented yet
# assert self.mask
Expand Down Expand Up @@ -455,17 +452,8 @@ def test_grid_from_ds_adaptive_reproducibility():

gAa = Grid(ds=dsA, grid_id="adaptive")
gA = Grid(grid_id="0pt25deg")
print(repr(gAa))
print(repr(gA))
print(gAa.ds.lon[715:735])
print(gA.ds.lon[715:735])

gBa = Grid(ds=dsB, grid_id="adaptive")
gB = Grid(grid_id="1deg")
print(gBa.ds.lon[170:190])
print(gB.ds.lon[170:190])
print(repr(gBa))
print(repr(gB))

assert gA.extent == "global"
assert gA.compare_grid(gAa)
Expand Down Expand Up @@ -1122,15 +1110,13 @@ def test_adaptive_masking(self, load_esgf_test_data, tmp_path):
self._setup()
weights_cache_init(Path(tmp_path, "weights"))
w = Weights(grid_in=self.grid_in, grid_out=self.grid_out, method="conservative")
r = regrid(self.grid_in, self.grid_out, w, adaptive_masking_threshold=0.7)
print(r)
regrid(self.grid_in, self.grid_out, w, adaptive_masking_threshold=0.7)

def test_no_adaptive_masking(self, load_esgf_test_data, tmp_path):
self._setup()
weights_cache_init(Path(tmp_path, "weights"))
w = Weights(grid_in=self.grid_in, grid_out=self.grid_out, method="bilinear")
r = regrid(self.grid_in, self.grid_out, w, adaptive_masking_threshold=-1.0)
print(r)
regrid(self.grid_in, self.grid_out, w, adaptive_masking_threshold=-1.0)

def test_duplicated_cells_warning_issued(self, load_esgf_test_data, tmp_path):
self._setup()
Expand All @@ -1149,14 +1135,13 @@ def test_duplicated_cells_warning_issued(self, load_esgf_test_data, tmp_path):
"However, please be wary with the results and consider removing / masking "
"the duplicated cells before remapping.",
) as issuedWarnings:
r = regrid(self.grid_in, self.grid_out, w, adaptive_masking_threshold=0.0)
regrid(self.grid_in, self.grid_out, w, adaptive_masking_threshold=0.0)
if not issuedWarnings:
raise Exception(
"No warning issued regarding the duplicated cells in the grid."
)
else:
assert len(issuedWarnings) == 1
print(r)

def test_regrid_dataarray(self, load_esgf_test_data, tmp_path):
self._setup()
Expand Down
52 changes: 51 additions & 1 deletion tests/test_file_namers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
from roocs_utils.exceptions import InvalidProject
from roocs_utils.parameter.param_utils import time_interval

from _common import C3S_CORDEX_NAM_PR, CMIP5_TAS, CMIP6_SICONC
from _common import (
C3S_CORDEX_NAM_PR,
CMIP5_TAS,
CMIP6_SICONC,
ATLAS_v0_CORDEX_NAM,
ATLAS_v1_ERA5,
)
from clisops import CONFIG
from clisops.ops.subset import subset
from clisops.utils.file_namers import get_file_namer
Expand Down Expand Up @@ -188,3 +194,47 @@ def test_StandardFileNamer_c3s_cordex_use_default_attr_names(load_esgf_test_data
for ds, expected in checks:
resp = s.get_file_name(ds)
assert resp == expected


def test_StandardFileNamer_c3s_atlas_v0(load_esgf_test_data):
    """Check file naming for a C3S ATLAS v0 (c3s-ipcc-ar6-atlas) dataset."""
    namer = get_file_namer("standard")()

    dataset = xr.open_mfdataset(
        ATLAS_v0_CORDEX_NAM,
        use_cftime=True,
        combine="by_coords",
    )

    expected = "rx1day_CORDEX-NAM_historical_mon_19700101-20051201.nc"
    assert namer.get_file_name(dataset) == expected


def test_StandardFileNamer_c3s_atlas_v1(load_esgf_test_data):
    """Check file naming for a C3S ATLAS v1 (c3s-cica-atlas) dataset."""
    namer = get_file_namer("standard")()

    dataset = xr.open_mfdataset(
        ATLAS_v1_ERA5,
        use_cftime=True,
        combine="by_coords",
    )

    expected = "psl_ERA5_no-expt_mon_19400101-20221201.nc"
    assert namer.get_file_name(dataset) == expected
Loading

0 comments on commit 640dbc3

Please sign in to comment.