Skip to content

Commit

Permalink
Autodetect the engine only when engine=None (#4458)
Browse files Browse the repository at this point in the history
* Perform engine autodetection only when `engine=None`

* More specific "is it a file" condition

* Add test for wrong combination of file object and engine

* Add what's new

* Better wording of what's new

* Fix code style

* Move path normalization code in an helper function

* Move some autodetection logic back into _get_engine_from_magic_number

* Simplify input validation and move it all to the open function

* Fix code style

* Remove tests that never trigger and make error message more precise

* Update doc/whats-new.rst

Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>

* Update xarray/backends/api.py

Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>

* Update xarray/backends/api.py

Co-authored-by: Joe Hamman <jhamman1@gmail.com>

* Update xarray/backends/api.py

Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>

* Add tests for file-objects with `engine=None`

* Trivial code style

* Code style, use f-string in error message

Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>
Co-authored-by: Joe Hamman <jhamman1@gmail.com>
  • Loading branch information
3 people committed Sep 28, 2020
1 parent ed1b865 commit f821fe2
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 32 deletions.
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ New Features
Bug fixes
~~~~~~~~~

- Fix silently overwriting the `engine` key when passing :py:func:`open_dataset` a file object
to an incompatible netCDF (:issue:`4457`). Now incompatible combinations of files and engines raise
an exception instead. By `Alessandro Amici <https://github.com/alexamici>`_.


Documentation
~~~~~~~~~~~~~
Expand Down
50 changes: 21 additions & 29 deletions xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,7 @@ def _get_engine_from_magic_number(filename_or_obj):
if filename_or_obj.tell() != 0:
raise ValueError(
"file-like object read/write pointer not at zero "
"please close and reopen, or use a context "
"manager"
"please close and reopen, or use a context manager"
)
magic_number = filename_or_obj.read(8)
filename_or_obj.seek(0)
Expand All @@ -136,17 +135,10 @@ def _get_engine_from_magic_number(filename_or_obj):
engine = "scipy"
elif magic_number.startswith(b"\211HDF\r\n\032\n"):
engine = "h5netcdf"
if isinstance(filename_or_obj, bytes):
raise ValueError(
"can't open netCDF4/HDF5 as bytes "
"try passing a path or file-like object"
)
else:
if isinstance(filename_or_obj, bytes) and len(filename_or_obj) > 80:
filename_or_obj = filename_or_obj[:80] + b"..."
raise ValueError(
"{} is not a valid netCDF file "
"did you mean to pass a string for a path instead?".format(filename_or_obj)
f"{magic_number} is not the signature of any supported file format "
"did you mean to pass a string for a path instead?"
)
return engine

Expand All @@ -163,6 +155,14 @@ def _get_default_engine(path, allow_remote=False):
return engine


def _autodetect_engine(filename_or_obj):
if isinstance(filename_or_obj, str):
engine = _get_default_engine(filename_or_obj, allow_remote=True)
else:
engine = _get_engine_from_magic_number(filename_or_obj)
return engine


def _get_backend_cls(engine):
"""Select open_dataset method based on current engine"""
try:
Expand All @@ -175,10 +175,13 @@ def _get_backend_cls(engine):


def _normalize_path(path):
if is_remote_uri(path):
return path
else:
return os.path.abspath(os.path.expanduser(path))
if isinstance(path, Path):
path = str(path)

if isinstance(path, str) and not is_remote_uri(path):
path = os.path.abspath(os.path.expanduser(path))

return path


def _validate_dataset_names(dataset):
Expand Down Expand Up @@ -532,24 +535,13 @@ def maybe_decode_store(store, chunks, lock=False):
ds2._file_obj = ds._file_obj
return ds2

if isinstance(filename_or_obj, Path):
filename_or_obj = str(filename_or_obj)
filename_or_obj = _normalize_path(filename_or_obj)

if isinstance(filename_or_obj, AbstractDataStore):
store = filename_or_obj
else:
if isinstance(filename_or_obj, str):
filename_or_obj = _normalize_path(filename_or_obj)

if engine is None:
engine = _get_default_engine(filename_or_obj, allow_remote=True)
elif engine != "zarr":
if engine not in [None, "scipy", "h5netcdf"]:
raise ValueError(
"can only read bytes or file-like objects "
"with engine='scipy' or 'h5netcdf'"
)
engine = _get_engine_from_magic_number(filename_or_obj)
if engine is None:
engine = _autodetect_engine(filename_or_obj)

if engine in ["netcdf4", "h5netcdf"]:
extra_kwargs["group"] = group
Expand Down
19 changes: 19 additions & 0 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,25 @@ def open(
):
import h5netcdf

if isinstance(filename, bytes):
raise ValueError(
"can't open netCDF4/HDF5 as bytes "
"try passing a path or file-like object"
)
elif hasattr(filename, "tell"):
if filename.tell() != 0:
raise ValueError(
"file-like object read/write pointer not at zero "
"please close and reopen, or use a context manager"
)
else:
magic_number = filename.read(8)
filename.seek(0)
if not magic_number.startswith(b"\211HDF\r\n\032\n"):
raise ValueError(
f"{magic_number} is not the signature of a valid netCDF file"
)

if format not in [None, "NETCDF4"]:
raise ValueError("invalid format for h5netcdf backend")

Expand Down
6 changes: 6 additions & 0 deletions xarray/backends/netCDF4_.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,12 @@ def open(
):
import netCDF4

if not isinstance(filename, str):
raise ValueError(
"can only read bytes or file-like objects "
"with engine='scipy' or 'h5netcdf'"
)

if format is None:
format = "NETCDF4"

Expand Down
18 changes: 15 additions & 3 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -2224,7 +2224,7 @@ def test_engine(self):
open_dataset(tmp_file, engine="foobar")

netcdf_bytes = data.to_netcdf()
with raises_regex(ValueError, "can only read bytes or file-like"):
with raises_regex(ValueError, "unrecognized engine"):
open_dataset(BytesIO(netcdf_bytes), engine="foobar")

def test_cross_engine_read_write_netcdf3(self):
Expand Down Expand Up @@ -2494,13 +2494,13 @@ def test_open_badbytes(self):
with raises_regex(ValueError, "HDF5 as bytes"):
with open_dataset(b"\211HDF\r\n\032\n", engine="h5netcdf"):
pass
with raises_regex(ValueError, "not a valid netCDF"):
with raises_regex(ValueError, "not the signature of any supported file"):
with open_dataset(b"garbage"):
pass
with raises_regex(ValueError, "can only read bytes"):
with open_dataset(b"garbage", engine="netcdf4"):
pass
with raises_regex(ValueError, "not a valid netCDF"):
with raises_regex(ValueError, "not the signature of a valid netCDF file"):
with open_dataset(BytesIO(b"garbage"), engine="h5netcdf"):
pass

Expand All @@ -2526,11 +2526,23 @@ def test_open_fileobj(self):
with open_dataset(f, engine="h5netcdf") as actual:
assert_identical(expected, actual)

f.seek(0)
with open_dataset(f) as actual:
assert_identical(expected, actual)

f.seek(0)
with BytesIO(f.read()) as bio:
with open_dataset(bio, engine="h5netcdf") as actual:
assert_identical(expected, actual)

f.seek(0)
with raises_regex(TypeError, "not a valid NetCDF 3"):
open_dataset(f, engine="scipy")

f.seek(8)
with raises_regex(ValueError, "read/write pointer not at zero"):
open_dataset(f)


@requires_h5netcdf
@requires_dask
Expand Down

0 comments on commit f821fe2

Please sign in to comment.