Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for the new compression arguments. #7551

Merged
merged 37 commits into from Dec 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
f8a0014
Support for the new compression arguments.
garciampred Feb 17, 2023
42c5a97
Merge branch 'main' into fix_compression_options
dcherian Mar 16, 2023
7abf103
significant_digit and other missing keys added
markelg Mar 22, 2023
fa408c5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 22, 2023
1b9f6db
Merge branch 'main' into fix_compression_options
markelg Apr 24, 2023
8c0c584
Merge branch 'pydata:main' into fix_compression_options
markelg Jun 20, 2023
e489a32
test for the new compression argument
markelg Jun 21, 2023
fff18a8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 21, 2023
a89f816
move the new test to TestNetCDF4Data
markelg Jun 21, 2023
2feafc5
fixed conflict
markelg Jun 21, 2023
371d2d7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 21, 2023
e74ec56
Merge branch 'main' into fix_compression_options
rabernat Sep 28, 2023
34718f4
Merge branch 'pydata:main' into fix_compression_options
markelg Oct 30, 2023
f77d126
simplify this line (code review)
markelg Oct 30, 2023
2df3387
Added entry to whats-new
markelg Oct 30, 2023
011b939
Merge branch 'main' into fix_compression_options
markelg Nov 14, 2023
d39173c
Merge branch 'main' into fix_compression_options
dcherian Nov 15, 2023
da5dee8
Merge branch 'pydata:main' into fix_compression_options
markelg Dec 11, 2023
ee57f4a
Merge branch 'main' into fix_compression_options
kmuehlbauer Dec 18, 2023
c2ce8d5
bump netcdf4 to 1.6.2 in min-all-deps.yml
kmuehlbauer Dec 18, 2023
e5d0609
parametrize compression in test
kmuehlbauer Dec 18, 2023
d77e3fd
Revert "bump netcdf4 to 1.6.2 in min-all-deps.yml"
kmuehlbauer Dec 18, 2023
fbffef2
check netCDF4 version and skip test if netcdf4 version <1.6.2
kmuehlbauer Dec 18, 2023
5b271fa
fix typing
kmuehlbauer Dec 18, 2023
a2a41ab
Larger chunks to avoid random blosc errors
markelg Dec 18, 2023
b98c926
use decorator to skip old netCDF4 versions
markelg Dec 18, 2023
74b74d7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 18, 2023
5d69807
Merge branch 'main' into fix_compression_options
kmuehlbauer Dec 19, 2023
a4e4d8c
remove stale version-property
kmuehlbauer Dec 19, 2023
4ae2bca
fix whats-new.rst
kmuehlbauer Dec 19, 2023
f8f37f4
fix requires-decorator
kmuehlbauer Dec 19, 2023
b7e56fe
fix for asserts of other tests that use test data
markelg Dec 19, 2023
138a439
Apply suggestions from code review
kmuehlbauer Dec 20, 2023
9e25e6e
Update xarray/tests/__init__.py
kmuehlbauer Dec 20, 2023
4e59c9a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2023
674d35f
Merge branch 'main' into fix_compression_options
kmuehlbauer Dec 20, 2023
bc74cd8
Update xarray/tests/test_backends.py
kmuehlbauer Dec 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Expand Up @@ -26,6 +26,10 @@ New Features

- :py:meth:`xr.cov` and :py:meth:`xr.corr` now support using weights (:issue:`8527`, :pull:`7392`).
By `Llorenç Lledó <https://github.com/lluritu>`_.
- Accept the compression arguments new in netCDF4-python 1.6.0 in the netCDF4 backend.
See `netCDF4 documentation <https://unidata.github.io/netcdf4-python/#efficient-compression-of-netcdf-variables>`_ for details.
By `Markel García-Díez <https://github.com/markelg>`_. (:issue:`6929`, :pull:`7551`) Note that some
new compression filters need plugins to be installed, which may not be available in all netCDF distributions.

Breaking changes
~~~~~~~~~~~~~~~~
Expand Down
25 changes: 17 additions & 8 deletions xarray/backends/netCDF4_.py
Expand Up @@ -257,6 +257,12 @@ def _extract_nc4_variable_encoding(
"_FillValue",
"dtype",
"compression",
"significant_digits",
"quantize_mode",
"blosc_shuffle",
"szip_coding",
"szip_pixels_per_block",
"endian",
}
if lsd_okay:
valid_encodings.add("least_significant_digit")
Expand Down Expand Up @@ -497,20 +503,23 @@ def prepare_variable(
if name in self.ds.variables:
nc4_var = self.ds.variables[name]
else:
nc4_var = self.ds.createVariable(
default_args = dict(
varname=name,
datatype=datatype,
dimensions=variable.dims,
zlib=encoding.get("zlib", False),
complevel=encoding.get("complevel", 4),
shuffle=encoding.get("shuffle", True),
fletcher32=encoding.get("fletcher32", False),
contiguous=encoding.get("contiguous", False),
chunksizes=encoding.get("chunksizes"),
zlib=False,
complevel=4,
shuffle=True,
fletcher32=False,
contiguous=False,
chunksizes=None,
endian="native",
least_significant_digit=encoding.get("least_significant_digit"),
least_significant_digit=None,
fill_value=fill_value,
)
default_args.update(encoding)
markelg marked this conversation as resolved.
Show resolved Hide resolved
default_args.pop("_FillValue", None)
nc4_var = self.ds.createVariable(**default_args)

nc4_var.setncatts(attrs)

Expand Down
32 changes: 25 additions & 7 deletions xarray/tests/__init__.py
Expand Up @@ -2,6 +2,7 @@

import importlib
import platform
import string
import warnings
from contextlib import contextmanager, nullcontext
from unittest import mock # noqa: F401
Expand Down Expand Up @@ -112,6 +113,10 @@ def _importorskip(
not has_h5netcdf_ros3[0], reason="requires h5netcdf 1.3.0"
)

has_netCDF4_1_6_2_or_above, requires_netCDF4_1_6_2_or_above = _importorskip(
"netCDF4", "1.6.2"
)
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved

# change some global options for tests
set_options(warn_for_unclosed_files=True)

Expand Down Expand Up @@ -262,28 +267,41 @@ def assert_allclose(a, b, check_default_indexes=True, **kwargs):
xarray.testing._assert_internal_invariants(b, check_default_indexes)


def create_test_data(seed: int | None = None, add_attrs: bool = True) -> Dataset:
_DEFAULT_TEST_DIM_SIZES = (8, 9, 10)


def create_test_data(
seed: int | None = None,
add_attrs: bool = True,
dim_sizes: tuple[int, int, int] = _DEFAULT_TEST_DIM_SIZES,
) -> Dataset:
rs = np.random.RandomState(seed)
_vars = {
"var1": ["dim1", "dim2"],
"var2": ["dim1", "dim2"],
"var3": ["dim3", "dim1"],
}
_dims = {"dim1": 8, "dim2": 9, "dim3": 10}
_dims = {"dim1": dim_sizes[0], "dim2": dim_sizes[1], "dim3": dim_sizes[2]}

obj = Dataset()
obj["dim2"] = ("dim2", 0.5 * np.arange(_dims["dim2"]))
obj["dim3"] = ("dim3", list("abcdefghij"))
if _dims["dim3"] > 26:
raise RuntimeError(
f'Not enough letters for filling this dimension size ({_dims["dim3"]})'
)
obj["dim3"] = ("dim3", list(string.ascii_lowercase[0 : _dims["dim3"]]))
obj["time"] = ("time", pd.date_range("2000-01-01", periods=20))
for v, dims in sorted(_vars.items()):
data = rs.normal(size=tuple(_dims[d] for d in dims))
obj[v] = (dims, data)
if add_attrs:
obj[v].attrs = {"foo": "variable"}
obj.coords["numbers"] = (
"dim3",
np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3], dtype="int64"),
)

if dim_sizes == _DEFAULT_TEST_DIM_SIZES:
numbers_values = np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3], dtype="int64")
else:
numbers_values = np.random.randint(0, 3, _dims["dim3"], dtype="int64")
obj.coords["numbers"] = ("dim3", numbers_values)
obj.encoding = {"foo": "bar"}
assert all(obj.data.flags.writeable for obj in obj.variables.values())
return obj
Expand Down
73 changes: 71 additions & 2 deletions xarray/tests/test_backends.py
Expand Up @@ -72,6 +72,7 @@
requires_h5netcdf_ros3,
requires_iris,
requires_netCDF4,
requires_netCDF4_1_6_2_or_above,
requires_pydap,
requires_pynio,
requires_scipy,
Expand Down Expand Up @@ -1486,7 +1487,7 @@ def test_dump_and_open_encodings(self) -> None:
assert ds.variables["time"].getncattr("units") == units
assert_array_equal(ds.variables["time"], np.arange(10) + 4)

def test_compression_encoding(self) -> None:
def test_compression_encoding_legacy(self) -> None:
data = create_test_data()
data["var2"].encoding.update(
{
Expand Down Expand Up @@ -1767,6 +1768,74 @@ def test_setncattr_string(self) -> None:
assert_array_equal(one_element_list_of_strings, totest.attrs["bar"])
assert one_string == totest.attrs["baz"]

@pytest.mark.parametrize(
"compression",
[
None,
"zlib",
"szip",
"zstd",
"blosc_lz",
"blosc_lz4",
"blosc_lz4hc",
"blosc_zlib",
"blosc_zstd",
],
)
@requires_netCDF4_1_6_2_or_above
@pytest.mark.xfail(ON_WINDOWS, reason="new compression not yet implemented")
def test_compression_encoding(self, compression: str | None) -> None:
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
data = create_test_data(dim_sizes=(20, 80, 10))
encoding_params: dict[str, Any] = dict(compression=compression, blosc_shuffle=1)
data["var2"].encoding.update(encoding_params)
data["var2"].encoding.update(
{
"chunksizes": (20, 40),
"original_shape": data.var2.shape,
"blosc_shuffle": 1,
"fletcher32": False,
}
)
with self.roundtrip(data) as actual:
expected_encoding = data["var2"].encoding.copy()
# compression does not appear in the retrieved encoding, that differs
# from the input encoding. shuffle also changes. Here we modify the
# expected encoding to account for this
compression = expected_encoding.pop("compression")
blosc_shuffle = expected_encoding.pop("blosc_shuffle")
if compression is not None:
if "blosc" in compression and blosc_shuffle:
expected_encoding["blosc"] = {
"compressor": compression,
"shuffle": blosc_shuffle,
}
expected_encoding["shuffle"] = False
elif compression == "szip":
expected_encoding["szip"] = {
"coding": "nn",
"pixels_per_block": 8,
}
expected_encoding["shuffle"] = False
else:
# This will set a key like zlib=true which is what appears in
# the encoding when we read it.
expected_encoding[compression] = True
if compression == "zstd":
expected_encoding["shuffle"] = False
else:
expected_encoding["shuffle"] = False

actual_encoding = actual["var2"].encoding
assert expected_encoding.items() <= actual_encoding.items()
if (
encoding_params["compression"] is not None
and "blosc" not in encoding_params["compression"]
markelg marked this conversation as resolved.
Show resolved Hide resolved
):
# regression test for #156
expected = data.isel(dim1=0)
with self.roundtrip(expected) as actual:
assert_equal(expected, actual)

@pytest.mark.skip(reason="https://github.com/Unidata/netcdf4-python/issues/1195")
def test_refresh_from_disk(self) -> None:
super().test_refresh_from_disk()
Expand Down Expand Up @@ -4518,7 +4587,7 @@ def test_extract_nc4_variable_encoding(self) -> None:
assert {} == encoding

@requires_netCDF4
def test_extract_nc4_variable_encoding_netcdf4(self, monkeypatch):
def test_extract_nc4_variable_encoding_netcdf4(self):
# New netCDF4 1.6.0 compression argument.
var = xr.Variable(("x",), [1, 2, 3], {}, {"compression": "szlib"})
_extract_nc4_variable_encoding(var, backend="netCDF4", raise_on_invalid=True)
Expand Down