From 2af21abbb9874799f3a48ce6865e815b7a5c0ee6 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Tue, 30 Sep 2025 12:58:58 -0400 Subject: [PATCH 01/17] fix: be more more caution when claiming a backend can open a URL --- xarray/backends/netCDF4_.py | 7 +++- xarray/backends/pydap_.py | 10 ++++- xarray/tests/test_backends.py | 71 +++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 2 deletions(-) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 8d4ca6441c9..cf398c11085 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -702,7 +702,12 @@ class NetCDF4BackendEntrypoint(BackendEntrypoint): def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj): - return True + # For remote URIs, check file extension to avoid claiming non-netCDF URLs + # (e.g., remote Zarr stores) + _, ext = os.path.splitext(filename_or_obj.rstrip("/")) + # Accept remote URIs with netCDF extensions or no extension + # (OPeNDAP endpoints often have no extension) + return ext in {".nc", ".nc4", ".cdf", ""} magic_number = ( bytes(filename_or_obj[:8]) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 4fbfe8ee210..88325b2c9d2 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os from collections.abc import Iterable from typing import TYPE_CHECKING, Any @@ -209,7 +210,14 @@ class PydapBackendEntrypoint(BackendEntrypoint): url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.PydapBackendEntrypoint.html" def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: - return isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj) + if not (isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj)): + return False + + # Check file extension to avoid claiming non-OPeNDAP URLs (e.g., remote Zarr stores) + _, 
ext = os.path.splitext(filename_or_obj.rstrip("/")) + # Pydap handles OPeNDAP endpoints, which typically have no extension or .nc/.nc4 + # Reject URLs with non-OPeNDAP extensions like .zarr + return ext not in {".zarr", ".zip", ".tar", ".gz"} def open_dataset( self, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7df9596b1ae..0190c0e4431 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -7252,6 +7252,77 @@ def test_zarr_entrypoint(tmp_path: Path) -> None: assert not entrypoint.guess_can_open("something.zarr.txt") +@requires_netCDF4 +@requires_pydap +@requires_zarr +def test_remote_url_backend_auto_detection() -> None: + """ + Test that remote URLs are correctly claimed by appropriate backends. + + This tests the fix for issue where netCDF4 and pydap backends were + claiming ALL remote URLs, preventing remote Zarr stores from being + auto-detected. + + See: https://github.com/pydata/xarray/issues/XXXXX + """ + from xarray.backends.netCDF4_ import NetCDF4BackendEntrypoint + from xarray.backends.pydap_ import PydapBackendEntrypoint + from xarray.backends.zarr import ZarrBackendEntrypoint + + netcdf4_entrypoint = NetCDF4BackendEntrypoint() + pydap_entrypoint = PydapBackendEntrypoint() + zarr_entrypoint = ZarrBackendEntrypoint() + + # Remote Zarr URLs should be claimed by Zarr backend, not netCDF4/pydap + remote_zarr_urls = [ + "https://example.com/store.zarr", + "http://example.com/data.zarr/", + "s3://bucket/path/to/data.zarr", + ] + + for url in remote_zarr_urls: + assert zarr_entrypoint.guess_can_open(url), f"Zarr should claim {url}" + assert not netcdf4_entrypoint.guess_can_open(url), ( + f"NetCDF4 should not claim {url}" + ) + assert not pydap_entrypoint.guess_can_open(url), f"Pydap should not claim {url}" + + # Remote netCDF URLs with extensions should be claimed by netCDF4, not Zarr + remote_netcdf_urls_with_ext = [ + "https://example.com/file.nc", + "http://example.com/data.nc4", + 
"https://example.com/test.cdf", + ] + + for url in remote_netcdf_urls_with_ext: + assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim {url}" + assert netcdf4_entrypoint.guess_can_open(url), f"NetCDF4 should claim {url}" + + # OPeNDAP endpoints (no extension) should be claimed by both netCDF4 and pydap + opendap_urls = [ + "http://opendap.example.com/data", + "https://test.opendap.org/dataset", + ] + + for url in opendap_urls: + assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim {url}" + assert netcdf4_entrypoint.guess_can_open(url), f"NetCDF4 should claim {url}" + assert pydap_entrypoint.guess_can_open(url), f"Pydap should claim {url}" + + # Other file types should not be claimed + other_urls = [ + "https://example.com/data.zip", + "https://example.com/data.tar.gz", + ] + + for url in other_urls: + assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim {url}" + assert not netcdf4_entrypoint.guess_can_open(url), ( + f"NetCDF4 should not claim {url}" + ) + assert not pydap_entrypoint.guess_can_open(url), f"Pydap should not claim {url}" + + @requires_netCDF4 @pytest.mark.parametrize("str_type", (str, np.str_)) def test_write_file_from_np_str(str_type: type[str | np.str_], tmpdir: str) -> None: From 1a3e7dfb3ad5f079df0c98e7a4182957ce845d26 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Tue, 30 Sep 2025 14:13:00 -0400 Subject: [PATCH 02/17] add whats new entry --- doc/whats-new.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b8ffab2889f..b7f6bce85ea 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,7 +24,9 @@ Deprecations Bug fixes ~~~~~~~~~ - +- ``netcdf`` and ``pydap`` engines no longer incorrectly claim to read all remote URLs preventing + the ``zarr`` backend from reading remote zarr stores without an explicit ``engine=`` argument. + (:pull:`10804`). 
By `Ian Hunt-Isaak Date: Tue, 30 Sep 2025 15:37:05 -0400 Subject: [PATCH 03/17] fixes from review --- doc/whats-new.rst | 2 +- xarray/tests/test_backends.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b7f6bce85ea..41a5f93a67d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -26,7 +26,7 @@ Bug fixes ~~~~~~~~~ - ``netcdf`` and ``pydap`` engines no longer incorrectly claim to read all remote URLs preventing the ``zarr`` backend from reading remote zarr stores without an explicit ``engine=`` argument. - (:pull:`10804`). By `Ian Hunt-Isaak `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 0190c0e4431..d7d56acbb2e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -7263,7 +7263,7 @@ def test_remote_url_backend_auto_detection() -> None: claiming ALL remote URLs, preventing remote Zarr stores from being auto-detected. - See: https://github.com/pydata/xarray/issues/XXXXX + See: https://github.com/pydata/xarray/issues/10801 """ from xarray.backends.netCDF4_ import NetCDF4BackendEntrypoint from xarray.backends.pydap_ import PydapBackendEntrypoint From 7ed1f0ad231c9a666fdc65cb921595ae67f88262 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 1 Oct 2025 11:16:14 -0400 Subject: [PATCH 04/17] more caution in scipy netcdf backend --- xarray/backends/scipy_.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 5ac5008098b..08c161a1926 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -330,7 +330,7 @@ class ScipyBackendEntrypoint(BackendEntrypoint): """ Backend for netCDF files based on the scipy package. 
- It can open ".nc", ".nc4", ".cdf" and ".gz" files but will only be + It can open ".nc", ".cdf", and "nc..gz" files but will only be selected as the default if the "netcdf4" and "h5netcdf" engines are not available. It has the advantage that is is a lightweight engine that has no system requirements (unlike netcdf4 and h5netcdf). @@ -347,7 +347,7 @@ class ScipyBackendEntrypoint(BackendEntrypoint): backends.H5netcdfBackendEntrypoint """ - description = "Open netCDF files (.nc, .nc4, .cdf and .gz) using scipy in Xarray" + description = "Open netCDF files (.nc, .cdf and .nc.gz) using scipy in Xarray" url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.ScipyBackendEntrypoint.html" def guess_can_open( @@ -364,7 +364,7 @@ def guess_can_open( if isinstance(filename_or_obj, str | os.PathLike): _, ext = os.path.splitext(filename_or_obj) - return ext in {".nc", ".nc4", ".cdf", ".gz"} + return ext in {".nc", ".cdf", ".nc.gz"} return False From 60c11585e07ec768833ffc6dcb155a6115b829a6 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 1 Oct 2025 11:27:19 -0400 Subject: [PATCH 05/17] correct suffix detection for scipy backend --- xarray/backends/scipy_.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 08c161a1926..59ed3743bbc 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -363,8 +363,10 @@ def guess_can_open( return magic_number.startswith(b"CDF") if isinstance(filename_or_obj, str | os.PathLike): - _, ext = os.path.splitext(filename_or_obj) - return ext in {".nc", ".cdf", ".nc.gz"} + from pathlib import Path + + suffix = "".join(Path(filename_or_obj).suffixes) + return suffix in {".nc", ".cdf", ".nc.gz"} return False From d2334e42781a729dfca505f6f159dd0e7d49bef1 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Fri, 3 Oct 2025 14:17:36 -0400 Subject: [PATCH 06/17] stricter URL detection for netcdf/dap --- doc/whats-new.rst | 8 ++- 
xarray/backends/h5netcdf_.py | 8 ++- xarray/backends/netCDF4_.py | 44 ++++++++------ xarray/backends/pydap_.py | 21 +++++-- xarray/core/utils.py | 34 +++++++++++ xarray/tests/test_backends.py | 111 ++++++++++++++++++---------------- 6 files changed, 146 insertions(+), 80 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 41a5f93a67d..e60b0417213 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,9 +24,11 @@ Deprecations Bug fixes ~~~~~~~~~ -- ``netcdf`` and ``pydap`` engines no longer incorrectly claim to read all remote URLs preventing - the ``zarr`` backend from reading remote zarr stores without an explicit ``engine=`` argument. - (:pull:`10804`). By `Ian Hunt-Isaak `_. +- ``netcdf4`` and ``pydap`` backends now use stricter URL detection to avoid incorrectly claiming + remote URLs. The ``pydap`` backend now only claims URLs with explicit DAP protocol indicators + (``dap2://`` or ``dap4://`` schemes, or ``/dap2/`` or ``/dap4/`` in the URL path). This prevents + both backends from claiming remote Zarr stores and other non-DAP URLs without an explicit + ``engine=`` argument. (:pull:`10804`). By `Ian Hunt-Isaak `_. 
Documentation ~~~~~~~~~~~~~ diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 28565f92de9..1801ab9b6f4 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -462,13 +462,19 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint): supports_groups = True def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: + from xarray.core.utils import is_remote_uri, strip_uri_params + filename_or_obj = _normalize_filename_or_obj(filename_or_obj) magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) if magic_number is not None: return magic_number.startswith(b"\211HDF\r\n\032\n") if isinstance(filename_or_obj, str | os.PathLike): - _, ext = os.path.splitext(filename_or_obj) + path = str(filename_or_obj) + # For remote URIs, strip query parameters and fragments before checking extension + if isinstance(filename_or_obj, str) and is_remote_uri(path): + path = strip_uri_params(path) + _, ext = os.path.splitext(path) return ext in {".nc", ".nc4", ".cdf"} return False diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index cf398c11085..e6782723f74 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -701,26 +701,36 @@ class NetCDF4BackendEntrypoint(BackendEntrypoint): supports_groups = True def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: + # Helper to check if magic number is netCDF or HDF5 + def _is_netcdf_magic(magic: bytes) -> bool: + return magic.startswith((b"CDF", b"\211HDF\r\n\032\n")) + + # Helper to check if extension is netCDF + def _has_netcdf_ext(path: str | os.PathLike, is_remote: bool = False) -> bool: + from xarray.core.utils import strip_uri_params + + path = str(path).rstrip("/") + # For remote URIs, strip query parameters and fragments + if is_remote: + path = strip_uri_params(path) + _, ext = os.path.splitext(path) + return ext in {".nc", ".nc4", ".cdf"} + if isinstance(filename_or_obj, str) and 
is_remote_uri(filename_or_obj): - # For remote URIs, check file extension to avoid claiming non-netCDF URLs - # (e.g., remote Zarr stores) - _, ext = os.path.splitext(filename_or_obj.rstrip("/")) - # Accept remote URIs with netCDF extensions or no extension - # (OPeNDAP endpoints often have no extension) - return ext in {".nc", ".nc4", ".cdf", ""} - - magic_number = ( - bytes(filename_or_obj[:8]) - if isinstance(filename_or_obj, bytes | memoryview) - else try_read_magic_number_from_path(filename_or_obj) - ) - if magic_number is not None: - # netcdf 3 or HDF5 - return magic_number.startswith((b"CDF", b"\211HDF\r\n\032\n")) + # For remote URIs, check extension (accounting for query params/fragments) + # Remote netcdf-c can handle both regular URLs and DAP URLs + return _has_netcdf_ext(filename_or_obj, is_remote=True) if isinstance(filename_or_obj, str | os.PathLike): - _, ext = os.path.splitext(filename_or_obj) - return ext in {".nc", ".nc4", ".cdf"} + # For local paths, check magic number first, then extension + magic_number = try_read_magic_number_from_path(filename_or_obj) + if magic_number is not None: + return _is_netcdf_magic(magic_number) + # No magic number available, fallback to extension + return _has_netcdf_ext(filename_or_obj) + + if isinstance(filename_or_obj, bytes | memoryview): + return _is_netcdf_magic(bytes(filename_or_obj[:8])) return False diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 88325b2c9d2..4883efe187f 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -210,14 +210,23 @@ class PydapBackendEntrypoint(BackendEntrypoint): url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.PydapBackendEntrypoint.html" def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: - if not (isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj)): + if not isinstance(filename_or_obj, str): return False - # Check file extension to avoid claiming non-OPeNDAP URLs (e.g., remote 
Zarr stores) - _, ext = os.path.splitext(filename_or_obj.rstrip("/")) - # Pydap handles OPeNDAP endpoints, which typically have no extension or .nc/.nc4 - # Reject URLs with non-OPeNDAP extensions like .zarr - return ext not in {".zarr", ".zip", ".tar", ".gz"} + # Check for explicit DAP protocol indicators: + # 1. DAP scheme: dap2:// or dap4:// (case-insensitive, may not be recognized by is_remote_uri) + # 2. Remote URI with /dap2/ or /dap4/ in URL path (case-insensitive) + # Note: We intentionally do NOT check for .dap suffix as that would match + # file extensions like .dap which trigger downloads of binary data + url_lower = filename_or_obj.lower() + if url_lower.startswith(("dap2://", "dap4://")): + return True + + # For standard remote URIs, check for DAP indicators in path + if is_remote_uri(filename_or_obj): + return "/dap2/" in url_lower or "/dap4/" in url_lower + + return False def open_dataset( self, diff --git a/xarray/core/utils.py b/xarray/core/utils.py index ec4edf255f6..db5827148ab 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -728,6 +728,40 @@ def is_remote_uri(path: str) -> bool: return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path)) +def strip_uri_params(uri: str) -> str: + """Strip query parameters and fragments from a URI. + + This is useful for extracting the file extension from URLs that + contain query parameters (e.g., OPeNDAP constraint expressions). + + Parameters + ---------- + uri : str + The URI to strip + + Returns + ------- + str + The URI without query parameters (?) 
or fragments (#) + + Examples + -------- + >>> strip_uri_params("http://example.com/file.nc?var=temp&time=0") + 'http://example.com/file.nc' + >>> strip_uri_params("http://example.com/file.nc#section") + 'http://example.com/file.nc' + >>> strip_uri_params("/local/path/file.nc") + '/local/path/file.nc' + """ + from urllib.parse import urlsplit, urlunsplit + + # Use urlsplit to properly parse the URI + # This handles both absolute URLs and relative paths + parsed = urlsplit(uri) + # Reconstruct without query and fragment using urlunsplit + return urlunsplit((parsed.scheme, parsed.netloc, parsed.path, "", "")) + + def read_magic_number_from_file(filename_or_obj, count=8) -> bytes: # check byte header to determine file type if not isinstance(filename_or_obj, io.IOBase): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index d7d56acbb2e..2feb31504b2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -7159,7 +7159,10 @@ def test_netcdf4_entrypoint(tmp_path: Path) -> None: _check_guess_can_open_and_open(entrypoint, path, engine="netcdf4", expected=ds) _check_guess_can_open_and_open(entrypoint, str(path), engine="netcdf4", expected=ds) - assert entrypoint.guess_can_open("http://something/remote") + # Remote URLs without extensions are no longer claimed (stricter detection) + assert not entrypoint.guess_can_open("http://something/remote") + # Remote URLs with netCDF extensions are claimed + assert entrypoint.guess_can_open("http://something/remote.nc") assert entrypoint.guess_can_open("something-local.nc") assert entrypoint.guess_can_open("something-local.nc4") assert entrypoint.guess_can_open("something-local.cdf") @@ -7202,6 +7205,10 @@ def test_scipy_entrypoint(tmp_path: Path) -> None: assert entrypoint.guess_can_open("something-local.nc.gz") assert not entrypoint.guess_can_open("not-found-and-no-extension") assert not entrypoint.guess_can_open(b"not-a-netcdf-file") + # Should not claim .gz files that aren't netCDF 
+ assert not entrypoint.guess_can_open("something.zarr.gz") + assert not entrypoint.guess_can_open("something.tar.gz") + assert not entrypoint.guess_can_open("something.txt.gz") @requires_h5netcdf @@ -7252,75 +7259,73 @@ def test_zarr_entrypoint(tmp_path: Path) -> None: assert not entrypoint.guess_can_open("something.zarr.txt") +@requires_h5netcdf @requires_netCDF4 @requires_pydap @requires_zarr def test_remote_url_backend_auto_detection() -> None: """ - Test that remote URLs are correctly claimed by appropriate backends. + Test that remote URLs are correctly selected by the backend resolution system. - This tests the fix for issue where netCDF4 and pydap backends were + This tests the fix for issue where netCDF4, h5netcdf, and pydap backends were claiming ALL remote URLs, preventing remote Zarr stores from being auto-detected. See: https://github.com/pydata/xarray/issues/10801 """ - from xarray.backends.netCDF4_ import NetCDF4BackendEntrypoint - from xarray.backends.pydap_ import PydapBackendEntrypoint - from xarray.backends.zarr import ZarrBackendEntrypoint - - netcdf4_entrypoint = NetCDF4BackendEntrypoint() - pydap_entrypoint = PydapBackendEntrypoint() - zarr_entrypoint = ZarrBackendEntrypoint() - - # Remote Zarr URLs should be claimed by Zarr backend, not netCDF4/pydap - remote_zarr_urls = [ - "https://example.com/store.zarr", - "http://example.com/data.zarr/", - "s3://bucket/path/to/data.zarr", + from xarray.backends.plugins import guess_engine + + # Test cases: (url, expected_backend) + test_cases = [ + # Remote Zarr URLs + ("https://example.com/store.zarr", "zarr"), + ("http://example.com/data.zarr/", "zarr"), + ("s3://bucket/path/to/data.zarr", "zarr"), + # Remote netCDF URLs (non-DAP) - h5netcdf wins (first in order) + ("https://example.com/file.nc", "h5netcdf"), + ("http://example.com/data.nc4", "h5netcdf"), + ("https://example.com/test.cdf", "h5netcdf"), + ("https://example.com/data.nc?var=temperature&time=0", "h5netcdf"), + # DAP URLs with query 
parameters - h5netcdf wins (has .nc4 ext, first in order) + ( + "http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4?dap4.ce=/time[0:1:0]", + "h5netcdf", + ), + # DAP URLs without extensions - pydap wins + ("dap2://opendap.earthdata.nasa.gov/collections/dataset", "pydap"), + ("dap4://opendap.earthdata.nasa.gov/collections/dataset", "pydap"), + ("DAP2://example.com/dataset", "pydap"), # uppercase scheme + ("DAP4://example.com/dataset", "pydap"), # uppercase scheme + ("https://example.com/services/DAP2/dataset", "pydap"), # uppercase in path + # DAP URLs with .nc extensions - h5netcdf wins (first in order) + ("http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4", "h5netcdf"), + ("https://example.com/DAP4/data.nc", "h5netcdf"), + ("http://example.com/data/Dap4/file.nc", "h5netcdf"), + ("s3://bucket/path/to/data.nc", "h5netcdf"), ] - for url in remote_zarr_urls: - assert zarr_entrypoint.guess_can_open(url), f"Zarr should claim {url}" - assert not netcdf4_entrypoint.guess_can_open(url), ( - f"NetCDF4 should not claim {url}" + for url, expected_backend in test_cases: + engine = guess_engine(url) + assert engine == expected_backend, ( + f"URL {url!r} should select {expected_backend!r} but got {engine!r}" ) - assert not pydap_entrypoint.guess_can_open(url), f"Pydap should not claim {url}" - # Remote netCDF URLs with extensions should be claimed by netCDF4, not Zarr - remote_netcdf_urls_with_ext = [ - "https://example.com/file.nc", - "http://example.com/data.nc4", - "https://example.com/test.cdf", + # URLs that should raise ValueError (no backend can open them) + invalid_urls = [ + "http://test.opendap.org/opendap/data/nc/coads_climatology.nc.dap", # .dap suffix + "https://example.com/data.dap", # .dap suffix + "http://opendap.example.com/data", # no extension, no DAP indicators + "https://test.opendap.org/dataset", # no extension, no DAP indicators ] - for url in remote_netcdf_urls_with_ext: - assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim 
{url}" - assert netcdf4_entrypoint.guess_can_open(url), f"NetCDF4 should claim {url}" - - # OPeNDAP endpoints (no extension) should be claimed by both netCDF4 and pydap - opendap_urls = [ - "http://opendap.example.com/data", - "https://test.opendap.org/dataset", - ] - - for url in opendap_urls: - assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim {url}" - assert netcdf4_entrypoint.guess_can_open(url), f"NetCDF4 should claim {url}" - assert pydap_entrypoint.guess_can_open(url), f"Pydap should claim {url}" - - # Other file types should not be claimed - other_urls = [ - "https://example.com/data.zip", - "https://example.com/data.tar.gz", - ] - - for url in other_urls: - assert not zarr_entrypoint.guess_can_open(url), f"Zarr should not claim {url}" - assert not netcdf4_entrypoint.guess_can_open(url), ( - f"NetCDF4 should not claim {url}" - ) - assert not pydap_entrypoint.guess_can_open(url), f"Pydap should not claim {url}" + for url in invalid_urls: + try: + engine = guess_engine(url) + raise AssertionError( + f"URL {url!r} should not be claimed by any backend, but {engine!r} claimed it" + ) + except ValueError: + pass # Expected @requires_netCDF4 From ef3e07c0b17adbd11174f145b66a1b77e154a8c4 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Fri, 3 Oct 2025 14:24:28 -0400 Subject: [PATCH 07/17] no query params for h5netcdf --- xarray/backends/h5netcdf_.py | 8 +------- xarray/tests/test_backends.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 1801ab9b6f4..4ea29014a68 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -462,19 +462,13 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint): supports_groups = True def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: - from xarray.core.utils import is_remote_uri, strip_uri_params - filename_or_obj = _normalize_filename_or_obj(filename_or_obj) 
magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) if magic_number is not None: return magic_number.startswith(b"\211HDF\r\n\032\n") if isinstance(filename_or_obj, str | os.PathLike): - path = str(filename_or_obj) - # For remote URIs, strip query parameters and fragments before checking extension - if isinstance(filename_or_obj, str) and is_remote_uri(path): - path = strip_uri_params(path) - _, ext = os.path.splitext(path) + _, ext = os.path.splitext(str(filename_or_obj)) return ext in {".nc", ".nc4", ".cdf"} return False diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2feb31504b2..5a420aed523 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -7281,15 +7281,18 @@ def test_remote_url_backend_auto_detection() -> None: ("https://example.com/store.zarr", "zarr"), ("http://example.com/data.zarr/", "zarr"), ("s3://bucket/path/to/data.zarr", "zarr"), - # Remote netCDF URLs (non-DAP) - h5netcdf wins (first in order) + # Remote netCDF URLs (non-DAP) - h5netcdf wins (first in order, no query params) ("https://example.com/file.nc", "h5netcdf"), ("http://example.com/data.nc4", "h5netcdf"), ("https://example.com/test.cdf", "h5netcdf"), - ("https://example.com/data.nc?var=temperature&time=0", "h5netcdf"), - # DAP URLs with query parameters - h5netcdf wins (has .nc4 ext, first in order) + ("s3://bucket/path/to/data.nc", "h5netcdf"), + # Remote netCDF URLs with query params - netcdf4 wins + # Note: Query params are typically indicative of DAP URLs (e.g., OPeNDAP constraint expressions), + # so we prefer netcdf4 (which has DAP support) over h5netcdf (which doesn't) + ("https://example.com/data.nc?var=temperature&time=0", "netcdf4"), ( "http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4?dap4.ce=/time[0:1:0]", - "h5netcdf", + "netcdf4", ), # DAP URLs without extensions - pydap wins ("dap2://opendap.earthdata.nasa.gov/collections/dataset", "pydap"), @@ -7297,11 +7300,10 @@ def 
test_remote_url_backend_auto_detection() -> None: ("DAP2://example.com/dataset", "pydap"), # uppercase scheme ("DAP4://example.com/dataset", "pydap"), # uppercase scheme ("https://example.com/services/DAP2/dataset", "pydap"), # uppercase in path - # DAP URLs with .nc extensions - h5netcdf wins (first in order) + # DAP URLs with .nc extensions (no query params) - h5netcdf wins (first in order) ("http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4", "h5netcdf"), ("https://example.com/DAP4/data.nc", "h5netcdf"), ("http://example.com/data/Dap4/file.nc", "h5netcdf"), - ("s3://bucket/path/to/data.nc", "h5netcdf"), ] for url, expected_backend in test_cases: From c07e7ea03d97a4eef67ebb61c6508111a8838698 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Fri, 3 Oct 2025 14:41:00 -0400 Subject: [PATCH 08/17] scipy no urls --- xarray/backends/scipy_.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 59ed3743bbc..c7e4956820c 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -335,7 +335,7 @@ class ScipyBackendEntrypoint(BackendEntrypoint): not available. It has the advantage that is is a lightweight engine that has no system requirements (unlike netcdf4 and h5netcdf). - Additionally it can open gizp compressed (".gz") files. + Additionally it can open gzip compressed (".gz") files. 
For more information about the underlying library, visit: https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.netcdf_file.html @@ -354,6 +354,8 @@ def guess_can_open( self, filename_or_obj: T_PathFileOrDataStore, ) -> bool: + from xarray.core.utils import is_remote_uri + filename_or_obj = _normalize_filename_or_obj(filename_or_obj) magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) if magic_number is not None and magic_number.startswith(b"\x1f\x8b"): @@ -365,6 +367,10 @@ def guess_can_open( if isinstance(filename_or_obj, str | os.PathLike): from pathlib import Path + # scipy can only handle local files + if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj): + return False + suffix = "".join(Path(filename_or_obj).suffixes) return suffix in {".nc", ".cdf", ".nc.gz"} From 9cf669be5e3a6b5cd866a8de22fba719747d6521 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Mon, 6 Oct 2025 13:01:15 -0400 Subject: [PATCH 09/17] don't try to read magic numbers for remote uris --- xarray/backends/h5netcdf_.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 4ea29014a68..d685a47b977 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -462,10 +462,16 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint): supports_groups = True def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: + from xarray.core.utils import is_remote_uri + filename_or_obj = _normalize_filename_or_obj(filename_or_obj) - magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) - if magic_number is not None: - return magic_number.startswith(b"\211HDF\r\n\032\n") + + # Try to read magic number for local files only + is_remote = isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj) + if not is_remote: + magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) + if magic_number is not 
None: + return magic_number.startswith(b"\211HDF\r\n\032\n") if isinstance(filename_or_obj, str | os.PathLike): _, ext = os.path.splitext(str(filename_or_obj)) From a50b2f68fc730a3df30b4707e12533d81eaa55da Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 8 Oct 2025 11:59:44 -0400 Subject: [PATCH 10/17] review comments --- xarray/backends/h5netcdf_.py | 2 +- xarray/tests/test_backends.py | 30 ++++++++++++++---------------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index d685a47b977..691c1c9b484 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -474,7 +474,7 @@ def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: return magic_number.startswith(b"\211HDF\r\n\032\n") if isinstance(filename_or_obj, str | os.PathLike): - _, ext = os.path.splitext(str(filename_or_obj)) + _, ext = os.path.splitext(filename_or_obj) return ext in {".nc", ".nc4", ".cdf"} return False diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index f3034400385..513421b7341 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -7289,11 +7289,11 @@ def test_remote_url_backend_auto_detection() -> None: ("https://example.com/store.zarr", "zarr"), ("http://example.com/data.zarr/", "zarr"), ("s3://bucket/path/to/data.zarr", "zarr"), - # Remote netCDF URLs (non-DAP) - h5netcdf wins (first in order, no query params) - ("https://example.com/file.nc", "h5netcdf"), - ("http://example.com/data.nc4", "h5netcdf"), - ("https://example.com/test.cdf", "h5netcdf"), - ("s3://bucket/path/to/data.nc", "h5netcdf"), + # Remote netCDF URLs (non-DAP) - netcdf4 wins (first in order, no query params) + ("https://example.com/file.nc", "netcdf4"), + ("http://example.com/data.nc4", "netcdf4"), + ("https://example.com/test.cdf", "netcdf4"), + ("s3://bucket/path/to/data.nc", "netcdf4"), # Remote netCDF URLs with query params - netcdf4 wins # 
Note: Query params are typically indicative of DAP URLs (e.g., OPeNDAP constraint expressions), # so we prefer netcdf4 (which has DAP support) over h5netcdf (which doesn't) @@ -7308,10 +7308,10 @@ def test_remote_url_backend_auto_detection() -> None: ("DAP2://example.com/dataset", "pydap"), # uppercase scheme ("DAP4://example.com/dataset", "pydap"), # uppercase scheme ("https://example.com/services/DAP2/dataset", "pydap"), # uppercase in path - # DAP URLs with .nc extensions (no query params) - h5netcdf wins (first in order) - ("http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4", "h5netcdf"), - ("https://example.com/DAP4/data.nc", "h5netcdf"), - ("http://example.com/data/Dap4/file.nc", "h5netcdf"), + # DAP URLs with .nc extensions (no query params) - netcdf4 wins (first in order) + ("http://test.opendap.org/opendap/dap4/StaggeredGrid.nc4", "netcdf4"), + ("https://example.com/DAP4/data.nc", "netcdf4"), + ("http://example.com/data/Dap4/file.nc", "netcdf4"), ] for url, expected_backend in test_cases: @@ -7329,13 +7329,11 @@ def test_remote_url_backend_auto_detection() -> None: ] for url in invalid_urls: - try: - engine = guess_engine(url) - raise AssertionError( - f"URL {url!r} should not be claimed by any backend, but {engine!r} claimed it" - ) - except ValueError: - pass # Expected + with pytest.raises( + ValueError, + match=r"did not find a match in any of xarray's currently installed IO backends", + ): + guess_engine(url) @requires_netCDF4 From 10d6edd5e36c22d6556f40651bbd4d7886b9daea Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 8 Oct 2025 12:21:12 -0400 Subject: [PATCH 11/17] fix windows failures --- xarray/backends/scipy_.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index c7e4956820c..dffb5ffbfe8 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -357,6 +357,11 @@ def guess_can_open( from xarray.core.utils import is_remote_uri 
filename_or_obj = _normalize_filename_or_obj(filename_or_obj) + + # scipy can only handle local files - check this before trying to read magic number + if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj): + return False + magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) if magic_number is not None and magic_number.startswith(b"\x1f\x8b"): with gzip.open(filename_or_obj) as f: # type: ignore[arg-type] @@ -367,10 +372,6 @@ def guess_can_open( if isinstance(filename_or_obj, str | os.PathLike): from pathlib import Path - # scipy can only handle local files - if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj): - return False - suffix = "".join(Path(filename_or_obj).suffixes) return suffix in {".nc", ".cdf", ".nc.gz"} From 8c77986e6e2708cffca648af9d7ed4cf6e1f03dc Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 8 Oct 2025 14:57:43 -0400 Subject: [PATCH 12/17] docs on backend resolution --- doc/user-guide/io.rst | 132 +++++++++++++++++++++++++++++++++++++ doc/user-guide/options.rst | 2 +- 2 files changed, 133 insertions(+), 1 deletion(-) diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index ccde3064e4e..ce6f08cd459 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -112,6 +112,136 @@ You can learn more about using and developing backends in the linkStyle default font-size:18pt,stroke-width:4 +.. _io.backend_resolution: + +Backend Selection +----------------- + +When opening a file or URL without explicitly specifying the ``engine`` parameter, +xarray automatically selects an appropriate backend based on the file path or URL. +The backends are tried in order: **netcdf4 → h5netcdf → scipy → pydap → zarr**. + +.. note:: + You can customize the order in which netCDF backends are tried using the + ``netcdf_engine_order`` option in :py:func:`~xarray.set_options`: + + .. 
code-block:: python + + # Prefer h5netcdf over netcdf4 + xr.set_options(netcdf_engine_order=['h5netcdf', 'netcdf4', 'scipy']) + + See :ref:`options` for more details on configuration options. + +The following tables show which backend will be selected for different types of URLs and files. + +.. important:: + ✅ means the backend will **guess it can open** the URL or file based on its path, extension, + or magic number, but this doesn't guarantee success. For example, not all Zarr stores are + xarray-compatible, and not all ``.nc`` files are valid netCDF. If opening fails, xarray will + try the next backend in the order. + + ❌ means the backend will not attempt to open it. + +Remote URL Resolution +~~~~~~~~~~~~~~~~~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 50 10 10 10 10 10 + + * - URL + - :ref:`netcdf4 ` + - :ref:`h5netcdf ` + - :ref:`scipy ` + - :ref:`pydap ` + - :ref:`zarr ` + * - ``https://example.com/store.zarr`` + - ❌ + - ❌ + - ❌ + - ❌ + - ✅ + * - ``https://example.com/data.nc`` + - ✅ + - ✅ + - ❌ + - ❌ + - ❌ + * - ``http://example.com/data.nc?var=temp`` + - ✅ + - ❌ + - ❌ + - ❌ + - ❌ + * - ``dap2://opendap.nasa.gov/dataset`` + - ❌ + - ❌ + - ❌ + - ✅ + - ❌ + * - ``https://example.com/DAP4/data`` + - ❌ + - ❌ + - ❌ + - ✅ + - ❌ + +Local File Resolution +~~~~~~~~~~~~~~~~~~~~~ + +.. list-table:: + :header-rows: 1 + :widths: 40 20 10 10 10 10 + + * - File Path + - Magic Number + - :ref:`netcdf4 ` + - :ref:`h5netcdf ` + - :ref:`scipy ` + - :ref:`zarr ` + * - ``/path/to/file.nc`` + - ``CDF\x01`` (netCDF3) + - ✅ + - ✅ + - ✅ + - ❌ + * - ``/path/to/file.nc4`` + - ``\x89HDF\r\n\x1a\n`` (HDF5/netCDF4) + - ✅ + - ✅ + - ❌ + - ❌ + * - ``/path/to/file.nc.gz`` + - ``\x1f\x8b`` + ``CDF`` inside + - ❌ + - ❌ + - ✅ + - ❌ + * - ``/path/to/store.zarr/`` + - (directory) + - ❌ + - ❌ + - ❌ + - ✅ + +.. 
note:: + Remote URLs ending in ``.nc`` are **ambiguous**: + + - They could be netCDF files stored on a remote HTTP server (readable by ``netcdf4`` or ``h5netcdf``) + - They could be OPeNDAP/DAP endpoints (readable by ``netcdf4`` with DAP support or ``pydap``) + + These interpretations are fundamentally incompatible. If xarray's automatic + selection chooses the wrong backend, you must explicitly specify the ``engine`` parameter: + + .. code-block:: python + + # Force interpretation as a DAP endpoint + ds = xr.open_dataset("http://example.com/data.nc", engine="pydap") + + # Force interpretation as a remote netCDF file + ds = xr.open_dataset("https://example.com/data.nc", engine="netcdf4") + + .. _io.netcdf: netCDF @@ -1213,6 +1343,8 @@ See for example : `ncdata usage examples`_ .. _Ncdata: https://ncdata.readthedocs.io/en/latest/index.html .. _ncdata usage examples: https://github.com/pp-mo/ncdata/tree/v0.1.2?tab=readme-ov-file#correct-a-miscoded-attribute-in-iris-input +.. _io.opendap: + OPeNDAP ------- diff --git a/doc/user-guide/options.rst b/doc/user-guide/options.rst index 12844eccbe4..f55348f825c 100644 --- a/doc/user-guide/options.rst +++ b/doc/user-guide/options.rst @@ -18,7 +18,7 @@ Xarray offers a small number of configuration options through :py:func:`set_opti 2. Control behaviour during operations: ``arithmetic_join``, ``keep_attrs``, ``use_bottleneck``. 3. Control colormaps for plots:``cmap_divergent``, ``cmap_sequential``. -4. Aspects of file reading: ``file_cache_maxsize``, ``warn_on_unclosed_files``. +4. Aspects of file reading: ``file_cache_maxsize``, ``netcdf_engine_order``, ``warn_on_unclosed_files``. 
You can set these options either globally From 079b2909345f9d4ec737e867254f5c8e6f6dd7b0 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 8 Oct 2025 15:05:49 -0400 Subject: [PATCH 13/17] more complete table --- doc/user-guide/io.rst | 47 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index ce6f08cd459..4bad0a2dd70 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -173,6 +173,12 @@ Remote URL Resolution - ❌ - ❌ - ❌ + * - ``http://test.opendap.org/dap4/file.nc4?dap4.ce=/time[0]`` + - ✅ + - ❌ + - ❌ + - ✅ + - ❌ * - ``dap2://opendap.nasa.gov/dataset`` - ❌ - ❌ @@ -185,10 +191,25 @@ Remote URL Resolution - ❌ - ✅ - ❌ + * - ``http://test.opendap.org/dap4/file.nc4`` + - ✅ + - ✅ + - ❌ + - ✅ + - ❌ + * - ``https://example.com/DAP4/data.nc`` + - ✅ + - ✅ + - ❌ + - ✅ + - ❌ Local File Resolution ~~~~~~~~~~~~~~~~~~~~~ +For local files, backends first try to read the file's **magic number** (first few bytes). +If the magic number cannot be read, they fall back to checking the file **extension**. + .. list-table:: :header-rows: 1 :widths: 40 20 10 10 10 10 @@ -202,7 +223,7 @@ Local File Resolution * - ``/path/to/file.nc`` - ``CDF\x01`` (netCDF3) - ✅ - - ✅ + - ❌ - ✅ - ❌ * - ``/path/to/file.nc4`` @@ -223,6 +244,30 @@ Local File Resolution - ❌ - ❌ - ✅ + * - ``/path/to/file.nc`` + - *(no magic number)* + - ✅ + - ✅ + - ✅ + - ❌ + * - ``/path/to/file.xyz`` + - ``CDF\x01`` (netCDF3) + - ✅ + - ❌ + - ✅ + - ❌ + * - ``/path/to/file.xyz`` + - ``\x89HDF\r\n\x1a\n`` (HDF5/netCDF4) + - ✅ + - ✅ + - ❌ + - ❌ + * - ``/path/to/file.xyz`` + - *(no magic number)* + - ❌ + - ❌ + - ❌ + - ❌ .. 
note:: Remote URLs ending in ``.nc`` are **ambiguous**: From 6ee2910aa48a04c6f5a3d968496618235a69f3cf Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 8 Oct 2025 15:23:00 -0400 Subject: [PATCH 14/17] no horizontal scroll on table --- doc/user-guide/io.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index 4bad0a2dd70..eea99e21063 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -173,7 +173,7 @@ Remote URL Resolution - ❌ - ❌ - ❌ - * - ``http://test.opendap.org/dap4/file.nc4?dap4.ce=/time[0]`` + * - ``http://example.com/dap4/data.nc?var=x`` - ✅ - ❌ - ❌ @@ -208,7 +208,9 @@ Local File Resolution ~~~~~~~~~~~~~~~~~~~~~ For local files, backends first try to read the file's **magic number** (first few bytes). -If the magic number cannot be read, they fall back to checking the file **extension**. +If the magic number **cannot be read** (e.g., file doesn't exist, no permissions), they fall +back to checking the file **extension**. If the magic number is readable but invalid, the +backend returns False (does not fall back to extension). .. list-table:: :header-rows: 1 From e32e93a63d0adf07f71ae6b5b16249b9c47d8c26 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 8 Oct 2025 15:37:51 -0400 Subject: [PATCH 15/17] fix whats new header --- doc/whats-new.rst | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 30a1550b8f9..26c5d29949f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -2,15 +2,6 @@ .. _whats-new: -Bug fixes -~~~~~~~~~ - -- ``netcdf4`` and ``pydap`` backends now use stricter URL detection to avoid incorrectly claiming - remote URLs. The ``pydap`` backend now only claims URLs with explicit DAP protocol indicators - (``dap2://`` or ``dap4://`` schemes, or ``/dap2/`` or ``/dap4/`` in the URL path). 
This prevents - both backends from claiming remote Zarr stores and other non-DAP URLs without an explicit - ``engine=`` argument. (:pull:`10804`). By `Ian Hunt-Isaak <https://github.com/ianhi>`_. - What's New ========== @@ -38,6 +29,11 @@ Deprecations Bug Fixes ~~~~~~~~~ +- ``netcdf4`` and ``pydap`` backends now use stricter URL detection to avoid incorrectly claiming + remote URLs. The ``pydap`` backend now only claims URLs with explicit DAP protocol indicators + (``dap2://`` or ``dap4://`` schemes, or ``/dap2/`` or ``/dap4/`` in the URL path). This prevents + both backends from claiming remote Zarr stores and other non-DAP URLs without an explicit + ``engine=`` argument. (:pull:`10804`). By `Ian Hunt-Isaak <https://github.com/ianhi>`_. Documentation ~~~~~~~~~~~~~ From f445045e409f0e2cff4085f432361d6cf3403784 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 8 Oct 2025 15:45:57 -0400 Subject: [PATCH 16/17] correct description --- doc/user-guide/io.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index eea99e21063..8f474cb99f1 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -137,8 +137,7 @@ The following tables show which backend will be selected for different types of .. important:: ✅ means the backend will **guess it can open** the URL or file based on its path, extension, or magic number, but this doesn't guarantee success. For example, not all Zarr stores are - xarray-compatible, and not all ``.nc`` files are valid netCDF. If opening fails, xarray will - try the next backend in the order. + xarray-compatible. ❌ means the backend will not attempt to open it. 
From 4a717e729f55900b0fba68436f94c6306fc7cf00 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 8 Oct 2025 16:07:57 -0400 Subject: [PATCH 17/17] case insensitivity to DAP: vs dap: --- xarray/core/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index db5827148ab..62914b87a8b 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -725,7 +725,7 @@ def is_remote_uri(path: str) -> bool: This also matches for http[s]://, which were the only remote URLs supported in <=v0.16.2. """ - return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path)) + return bool(re.search(r"^[a-zA-Z][a-zA-Z0-9]*(\://|\:\:)", path)) def strip_uri_params(uri: str) -> str: