Skip to content

Commit

Permalink
Adding tests to warrant ATLAS dataset support (#319)
Browse files Browse the repository at this point in the history
* Adding C3S ATLAS test datasets
* Add fix for compressed str variables and cap compression level at 1
* Linting updates
---------

Co-authored-by: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com>
  • Loading branch information
sol1105 and Zeitsperre committed Feb 9, 2024
1 parent 4160b30 commit 640dbc3
Show file tree
Hide file tree
Showing 18 changed files with 281 additions and 27 deletions.
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ repos:
hooks:
- id: toml-sort-fix
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 23.12.1
rev: 24.1.1
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: 6.1.0
rev: 7.0.0
hooks:
- id: flake8
args: [ '--config=.flake8' ]
Expand All @@ -47,7 +47,7 @@ repos:
rev: v0.3.9
hooks:
- id: blackdoc
additional_dependencies: [ 'black==23.12.1' ]
additional_dependencies: [ 'black==24.1.1' ]
- repo: https://github.com/python-jsonschema/check-jsonschema
rev: 0.27.3
hooks:
Expand Down
5 changes: 5 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ Bug Fixes
* Fixed KeyError for temporal subsetting by components if not all components can be found in the dataset (#316).
* Raising KeyError for temporal subsetting by components when no time steps match the selection criteria (#316).
* Coordinate detection for remapping operator via standard_name if detection via cf-xarray fails / is ambiguous (#316).
* Remove encoding settings with regard to compression for string variables to avoid netCDF write errors with newer netcdf-c versions (>4.9.0) (#319).

Other Changes
^^^^^^^^^^^^^
* The compression level is capped at 1 to reduce write times (#319).

v0.12.2 (2024-01-03)
--------------------
Expand Down
2 changes: 1 addition & 1 deletion binder/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ dependencies:
- pyproj >=3.3.0
- requests >=2.0
- roocs-grids>=0.1.2
- roocs-utils >=0.6.4,<0.7
- roocs-utils >=0.6.7,<0.7
- shapely >=1.9
- xarray >=0.21,<2023.3.0 # https://github.com/pydata/xarray/issues/7794
- xesmf >=0.8.2
Expand Down
1 change: 1 addition & 0 deletions clisops/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""CLISOPS - Climate simulation operations."""

import os
import warnings

Expand Down
1 change: 1 addition & 0 deletions clisops/core/average.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Average module."""

import warnings
from pathlib import Path
from typing import Sequence, Tuple, Union
Expand Down
1 change: 1 addition & 0 deletions clisops/core/regrid.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Regrid module."""

from __future__ import annotations

import functools
Expand Down
1 change: 1 addition & 0 deletions clisops/core/subset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Subset module."""

import numbers
import re
import warnings
Expand Down
57 changes: 57 additions & 0 deletions clisops/ops/base_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,59 @@ def _calculate(self):
"""The `_calculate()` method is implemented within each operation subclass."""
raise NotImplementedError

def _remove_str_compression(self, ds):
    """Remove compression-related encoding entries from string variables.

    netCDF4 datatypes of variable length are decoded to str by xarray<2023.11.0.
    As of xarray 2023.11.0 they are decoded to one of np.dtypes.StrDType (eg. "<U20")
    of variable length and stripped of all encoding settings. In netcdf-c versions >= 4.9.0
    and xarray < 2023.11.0 the latter part needs to be conducted manually to avoid an
    Exception when writing the xarray.Dataset to disk.

    See issue: https://github.com/Unidata/netcdf4-python/issues/1205
    See PR: https://github.com/roocs/clisops/pull/319

    Parameters
    ----------
    ds : xr.Dataset or xr.DataArray
        Object whose variable/coordinate encodings are cleaned (modified in place).

    Returns
    -------
    xr.Dataset or xr.DataArray
        The input object, with compression settings removed from str-typed variables.
    """
    if isinstance(ds, xr.Dataset):
        varlist = list(ds.coords) + list(ds.data_vars)
    elif isinstance(ds, xr.DataArray):
        varlist = list(ds.coords)
    else:
        # Fix: previously `varlist` was left unbound for any other input type,
        # raising UnboundLocalError in the loop below. Pass such objects through.
        return ds

    # Encoding keys that trigger compression in the netCDF4/h5netcdf backends.
    compression_keys = (
        "compression",
        "complevel",
        "shuffle",
        "fletcher32",
        "endian",
        "zlib",
    )
    for var in varlist:
        # `.get` returns None for a missing "dtype" key, which compares unequal
        # to `str`, so this is equivalent to the membership-then-compare pattern.
        if ds[var].encoding.get("dtype") == str:
            for key in compression_keys:
                # pop with default avoids a separate membership check per key
                ds[var].encoding.pop(key, None)
    return ds

def _cap_deflate_level(self, ds):
    """Cap the netCDF deflate (compression) level at 1.

    For CMOR3 / CMIP6 it was investigated which netCDF4 deflate_level should be set
    to optimize the balance between reduction of file size and degradation in
    performance. The values found were deflate_level=1, shuffle=True. To keep the
    write times at a minimum, compression level 1 is not exceeded.

    See issue: https://github.com/PCMDI/cmor/issues/403

    Parameters
    ----------
    ds : xr.Dataset or xr.DataArray
        Object whose variable/coordinate encodings are capped (modified in place).

    Returns
    -------
    xr.Dataset or xr.DataArray
        The input object, with "complevel" / "compression_opts" capped at 1.
    """
    if isinstance(ds, xr.Dataset):
        varlist = list(ds.coords) + list(ds.data_vars)
    elif isinstance(ds, xr.DataArray):
        varlist = list(ds.coords)
    else:
        # Fix: previously `varlist` was left unbound for any other input type,
        # raising UnboundLocalError in the loop below. Pass such objects through.
        return ds

    for var in varlist:
        encoding = ds[var].encoding
        # Fix: the original used `elif`, so "compression_opts" (h5netcdf) was
        # never capped whenever "complevel" (netCDF4) exceeded 1. The two keys
        # belong to different backends and must be capped independently.
        if encoding.get("complevel", 0) > 1:
            encoding["complevel"] = 1
        if encoding.get("compression_opts", 0) > 1:
            encoding["compression_opts"] = 1

    return ds

def _remove_redundant_fill_values(self, ds):
"""
Get coordinate and data variables and remove fill values added by xarray
Expand Down Expand Up @@ -151,6 +204,10 @@ def process(self) -> List[Union[xr.Dataset, Path]]:
processed_ds = self._remove_redundant_fill_values(processed_ds)
# remove redundant coordinates from bounds
processed_ds = self._remove_redundant_coordinates_attr(processed_ds)
# remove compression for string variables (as it is not supported by netcdf-c >= 4.9.0)
processed_ds = self._remove_str_compression(processed_ds)
# cap deflate level at 1
processed_ds = self._cap_deflate_level(processed_ds)

# Work out how many outputs should be created based on the size
# of the array. Manage this as a list of time slices.
Expand Down
1 change: 1 addition & 0 deletions clisops/utils/tutorial.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Testing and tutorial utilities module."""

# Most of this code copied and adapted from xarray, xclim, and raven
import hashlib
import re
Expand Down
2 changes: 1 addition & 1 deletion docs/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ dependencies:
- pyproj >=3.3.0
- requests >=2.0
- roocs-grids >=0.1.2
- roocs-utils >=0.6.4,<0.7
- roocs-utils >=0.6.7,<0.7
- shapely >=1.9
- xarray >=0.21,<2023.3.0 # https://github.com/pydata/xarray/issues/7794
- xesmf >=0.8.2
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ dependencies:
- pyproj >=3.3.0
- requests >=2.0
- roocs-grids >=0.1.2
- roocs-utils >=0.6.4,<0.7
- roocs-utils >=0.6.7,<0.7
- shapely >=1.9
- xarray >=0.21,<2023.03.0 # See: https://github.com/pydata/xarray/issues/7794
- xesmf >=0.8.2
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ dependencies = [
"requests>=2.0",
# roocs_grids is differently named on conda-forge
"roocs_grids>=0.1.2",
"roocs-utils>=0.6.4,<0.7",
"roocs-utils>=0.6.7,<0.7",
"shapely>=1.9",
"xarray>=0.21"
]
Expand Down
44 changes: 44 additions & 0 deletions tests/_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,3 +354,47 @@ def cmip6_archive_base():
MINI_ESGF_CACHE_DIR,
"master/test_data/pool/data/CORDEX/data/cordex/output/EUR-11/KNMI/MPI-M-MPI-ESM-LR/rcp85/r1i1p1/KNMI-RACMO22E/v1/mon/tas/v20190625/tas_EUR-11_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_KNMI-RACMO22E_v1_mon_209101.nc",
).as_posix()

# C3S ATLAS test files cached under MINI_ESGF_CACHE_DIR; stored as POSIX
# path strings for use in the test suite.

# ATLAS v1 datasets with full time series
ATLAS_v1_CMIP5 = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-cica-atlas/CMIP5/rcp26/pr_CMIP5_rcp26_mon_200601-210012.nc",
).as_posix()

ATLAS_v1_EOBS = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-cica-atlas/E-OBS/sfcwind_E-OBS_mon_195001-202112.nc",
).as_posix()

ATLAS_v1_ERA5 = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-cica-atlas/ERA5/psl_ERA5_mon_194001-202212.nc",
).as_posix()

# ATLAS v1 datasets with full horizontal grid
ATLAS_v1_CORDEX = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-cica-atlas/CORDEX-CORE/historical/huss_CORDEX-CORE_historical_mon_197001.nc",
).as_posix()

ATLAS_v1_EOBS_GRID = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-cica-atlas/E-OBS/t_E-OBS_mon_195001.nc",
).as_posix()

# ATLAS v0 datasets with full time series
ATLAS_v0_CORDEX_NAM = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-ipcc-ar6-atlas/CORDEX-NAM/historical/rx1day_CORDEX-NAM_historical_mon_197001-200512.nc",
).as_posix()

ATLAS_v0_CMIP6 = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-ipcc-ar6-atlas/CMIP6/ssp245/sst_CMIP6_ssp245_mon_201501-210012.nc",
).as_posix()

# ATLAS v0 datasets with full horizontal grid
ATLAS_v0_CORDEX_ANT = Path(
    MINI_ESGF_CACHE_DIR,
    "master/test_data/pool/data/c3s-ipcc-ar6-atlas/CORDEX-ANT/rcp45/tnn_CORDEX-ANT_rcp45_mon_200601.nc",
).as_posix()
21 changes: 3 additions & 18 deletions tests/test_core_regrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ def test_grid_init_ds_tos_curvilinear(load_esgf_test_data):
def test_grid_init_ds_tas_cordex(load_esgf_test_data):
ds = xr.open_dataset(CORDEX_TAS_ONE_TIMESTEP, use_cftime=True)
grid = Grid(ds=ds)
print(ds)

assert grid.format == "CF"
assert grid.source == "Dataset"
Expand All @@ -146,7 +145,6 @@ def test_grid_init_ds_tas_cordex(load_esgf_test_data):
assert grid.ncells == 45225

ds = ds.drop(["lat", "lon", "lat_vertices", "lon_vertices"])
print(ds)
with pytest.raises(
Exception,
match="The grid format is not supported.",
Expand Down Expand Up @@ -235,7 +233,6 @@ def test_grid_init_ds_tas_unstructured(load_esgf_test_data):
assert grid.lat_bnds == "latitude_bnds"
assert grid.lon_bnds == "longitude_bnds"
assert grid.ncells == 20480
print(grid.contains_collapsed_cells)

# not implemented yet
# assert self.mask
Expand Down Expand Up @@ -455,17 +452,8 @@ def test_grid_from_ds_adaptive_reproducibility():

gAa = Grid(ds=dsA, grid_id="adaptive")
gA = Grid(grid_id="0pt25deg")
print(repr(gAa))
print(repr(gA))
print(gAa.ds.lon[715:735])
print(gA.ds.lon[715:735])

gBa = Grid(ds=dsB, grid_id="adaptive")
gB = Grid(grid_id="1deg")
print(gBa.ds.lon[170:190])
print(gB.ds.lon[170:190])
print(repr(gBa))
print(repr(gB))

assert gA.extent == "global"
assert gA.compare_grid(gAa)
Expand Down Expand Up @@ -1122,15 +1110,13 @@ def test_adaptive_masking(self, load_esgf_test_data, tmp_path):
self._setup()
weights_cache_init(Path(tmp_path, "weights"))
w = Weights(grid_in=self.grid_in, grid_out=self.grid_out, method="conservative")
r = regrid(self.grid_in, self.grid_out, w, adaptive_masking_threshold=0.7)
print(r)
regrid(self.grid_in, self.grid_out, w, adaptive_masking_threshold=0.7)

def test_no_adaptive_masking(self, load_esgf_test_data, tmp_path):
self._setup()
weights_cache_init(Path(tmp_path, "weights"))
w = Weights(grid_in=self.grid_in, grid_out=self.grid_out, method="bilinear")
r = regrid(self.grid_in, self.grid_out, w, adaptive_masking_threshold=-1.0)
print(r)
regrid(self.grid_in, self.grid_out, w, adaptive_masking_threshold=-1.0)

def test_duplicated_cells_warning_issued(self, load_esgf_test_data, tmp_path):
self._setup()
Expand All @@ -1149,14 +1135,13 @@ def test_duplicated_cells_warning_issued(self, load_esgf_test_data, tmp_path):
"However, please be wary with the results and consider removing / masking "
"the duplicated cells before remapping.",
) as issuedWarnings:
r = regrid(self.grid_in, self.grid_out, w, adaptive_masking_threshold=0.0)
regrid(self.grid_in, self.grid_out, w, adaptive_masking_threshold=0.0)
if not issuedWarnings:
raise Exception(
"No warning issued regarding the duplicated cells in the grid."
)
else:
assert len(issuedWarnings) == 1
print(r)

def test_regrid_dataarray(self, load_esgf_test_data, tmp_path):
self._setup()
Expand Down
52 changes: 51 additions & 1 deletion tests/test_file_namers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
from roocs_utils.exceptions import InvalidProject
from roocs_utils.parameter.param_utils import time_interval

from _common import C3S_CORDEX_NAM_PR, CMIP5_TAS, CMIP6_SICONC
from _common import (
C3S_CORDEX_NAM_PR,
CMIP5_TAS,
CMIP6_SICONC,
ATLAS_v0_CORDEX_NAM,
ATLAS_v1_ERA5,
)
from clisops import CONFIG
from clisops.ops.subset import subset
from clisops.utils.file_namers import get_file_namer
Expand Down Expand Up @@ -188,3 +194,47 @@ def test_StandardFileNamer_c3s_cordex_use_default_attr_names(load_esgf_test_data
for ds, expected in checks:
resp = s.get_file_name(ds)
assert resp == expected


def test_StandardFileNamer_c3s_atlas_v0(load_esgf_test_data):
    """Check file naming for a C3S ATLAS v0 (c3s-ipcc-ar6-atlas) dataset."""
    namer = get_file_namer("standard")()

    dataset = xr.open_mfdataset(
        ATLAS_v0_CORDEX_NAM,
        use_cftime=True,
        combine="by_coords",
    )

    expected = "rx1day_CORDEX-NAM_historical_mon_19700101-20051201.nc"
    assert namer.get_file_name(dataset) == expected


def test_StandardFileNamer_c3s_atlas_v1(load_esgf_test_data):
    """Check file naming for a C3S ATLAS v1 (c3s-cica-atlas) dataset."""
    namer = get_file_namer("standard")()

    dataset = xr.open_mfdataset(
        ATLAS_v1_ERA5,
        use_cftime=True,
        combine="by_coords",
    )

    expected = "psl_ERA5_no-expt_mon_19400101-20221201.nc"
    assert namer.get_file_name(dataset) == expected
Loading

0 comments on commit 640dbc3

Please sign in to comment.