Allow configuring open_dataset via backend instances #8520

Status: Open. Wants to merge 36 commits into the base branch `main`; the diff below shows changes from 23 of the 36 commits.

Commits
c32beaf
initial try with netcdf4 backend
headtr1ck Dec 4, 2023
2a55aa8
Merge branch 'main' into backend-instances
headtr1ck Dec 6, 2023
fbed422
move encoding options back to open_dataset + some typing
headtr1ck Dec 6, 2023
131e8e0
add an init to h5netcdf backend
headtr1ck Dec 6, 2023
7bd75af
add init to zarr backend
headtr1ck Dec 6, 2023
d02e653
remove uneccessary try-except in type-checking only imports
headtr1ck Dec 6, 2023
cda2fe8
align netcdf4 and h5netcdf backends
headtr1ck Dec 7, 2023
0242fed
add init to pydap backend
headtr1ck Dec 7, 2023
fa892fb
add init to scipy backend and disallow lock=True
headtr1ck Dec 7, 2023
7188a86
rename ZarrWriteModes to ZarrOpenModes
headtr1ck Dec 7, 2023
cb9ee06
type file manager
headtr1ck Dec 8, 2023
2f929f6
make FileManager generic
headtr1ck Dec 8, 2023
66b123d
add netcdf formats to types
headtr1ck Dec 8, 2023
a919de6
more typing and netcdf4 docstrings
headtr1ck Dec 8, 2023
1df97f9
add docstring to h5netcdf backend
headtr1ck Dec 8, 2023
e7519ca
improve typing of scipy backend
headtr1ck Dec 8, 2023
4c26d8f
add docstring to scipy backend
headtr1ck Dec 8, 2023
257983d
fix invalid syntax
headtr1ck Dec 8, 2023
bebef48
add some type hints to zarr backend
headtr1ck Dec 9, 2023
4c1b13f
type open_zarr
headtr1ck Dec 10, 2023
4aca90d
type store backend
headtr1ck Dec 10, 2023
1273430
add open_dataset_parameters
headtr1ck Dec 10, 2023
28e72d7
finish typing pydap backend
headtr1ck Dec 10, 2023
792eb36
docstring fixes
headtr1ck Dec 10, 2023
a4db518
add fallback for TypeAlias import
headtr1ck Dec 10, 2023
498a1ba
fix failing zarr test
headtr1ck Dec 10, 2023
b91b190
improve typing of backend tests
headtr1ck Dec 11, 2023
3b795c8
add tests for open_dataset with engine instances
headtr1ck Dec 11, 2023
3a406a9
Merge branch 'main' into backend-instances
headtr1ck Dec 11, 2023
f6638f2
fix docstrings
headtr1ck Dec 12, 2023
e85cd3d
use dataclasses
headtr1ck Dec 12, 2023
9b611be
Merge branch 'main' into backend-instances
headtr1ck Jan 14, 2024
6b3e1f9
improve typing of locks
headtr1ck Jan 14, 2024
ee973d4
remove unneccessary ImportError catches in type checking
headtr1ck Jan 14, 2024
041c357
Merge branch 'main' into backend-instances
headtr1ck Jan 14, 2024
cec1a88
raise Error if unsupported kwarg is passed
headtr1ck Jan 14, 2024
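Taken together, these commits allow a configured backend instance to be passed directly as `engine`, instead of threading backend-specific options through `**kwargs`. Below is a minimal sketch of the intended usage, assuming the PR gives `NetCDF4BackendEntrypoint` a constructor that accepts open-time options such as `group`; the exact constructor signature is inferred from the commit messages, not confirmed:

```python
import xarray as xr
from xarray.backends import NetCDF4BackendEntrypoint

# Hypothetical usage sketch: configure the backend once, then pass the
# instance as ``engine`` instead of engine="netcdf4" plus loose kwargs.
# The constructor argument ``group`` is assumed, not confirmed by this PR.
backend = NetCDF4BackendEntrypoint(group="measurements")
ds = xr.open_dataset("observations.nc", engine=backend)
```

Passing the class itself, or the plain string name, keeps working; the instance form simply moves backend-specific configuration into the constructor.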
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
xarray/backends/api.py: 38 additions, 34 deletions
@@ -39,7 +39,7 @@
from xarray.core.dataset import Dataset, _get_chunk, _maybe_chunk
from xarray.core.indexes import Index
from xarray.core.parallelcompat import guess_chunkmanager
from xarray.core.types import ZarrWriteModes
from xarray.core.types import ZarrOpenModes
from xarray.core.utils import is_remote_uri

if TYPE_CHECKING:
@@ -55,20 +55,20 @@
CompatOptions,
JoinOptions,
NestedSequence,
NetcdfFormats,
T_Chunks,
)

T_NetcdfEngine = Literal["netcdf4", "scipy", "h5netcdf"]
T_Engine = Union[
T_NetcdfEngine,
Literal["pydap", "pynio", "zarr"],
BackendEntrypoint,
type[BackendEntrypoint],
str, # no nice typing support for custom backends
None,
]
T_NetcdfTypes = Literal[
"NETCDF4", "NETCDF4_CLASSIC", "NETCDF3_64BIT", "NETCDF3_CLASSIC"
]


DATAARRAY_NAME = "__xarray_dataarray_name__"
DATAARRAY_VARIABLE = "__xarray_dataarray_variable__"
@@ -421,11 +421,10 @@ def open_dataset(
objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", \
"zarr", None}, installed backend \
or subclass of xarray.backends.BackendEntrypoint, optional
or instance or subclass of xarray.backends.BackendEntrypoint, optional
Engine to use when reading files. If not provided, the default engine
is chosen based on available dependencies, with a preference for
"netcdf4". A custom backend class (a subclass of ``BackendEntrypoint``)
can also be used.
"netcdf4".
chunks : int, dict, 'auto' or None, optional
If chunks is provided, it is used to load the new dataset into dask
arrays. ``chunks=-1`` loads the dataset with dask using a single
@@ -595,8 +594,8 @@ def open_dataset(
def open_dataarray(
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
*,
engine: T_Engine | None = None,
chunks: T_Chunks | None = None,
engine: T_Engine = None,
chunks: T_Chunks = None,
cache: bool | None = None,
decode_cf: bool | None = None,
mask_and_scale: bool | None = None,
@@ -628,7 +627,7 @@ def open_dataarray(
objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", \
"zarr", None}, installed backend \
or subclass of xarray.backends.BackendEntrypoint, optional
or instance or subclass of xarray.backends.BackendEntrypoint, optional
Engine to use when reading files. If not provided, the default engine
is chosen based on available dependencies, with a preference for
"netcdf4".
@@ -707,16 +706,20 @@ def open_dataarray(
in the values of the task graph. See :py:func:`dask.array.from_array`.
chunked_array_type: str, optional
Which chunked array type to coerce the underlying data array to.
Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system.
Defaults to 'dask' if installed, else whatever is registered via the
`ChunkManagerEnetryPoint` system.
Experimental API that should not be relied upon.
from_array_kwargs: dict
Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create
chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg.
For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed
to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array`
method used to create chunked arrays, via whichever chunk manager is
specified through the `chunked_array_type` kwarg.
For example if :py:func:`dask.array.Array` objects are used for chunking,
additional kwargs will be passed to :py:func:`dask.array.from_array`.
Experimental API that should not be relied upon.
backend_kwargs: dict
Additional keyword arguments passed on to the engine open function,
equivalent to `**kwargs`.
equivalent to `**kwargs`. Alternatively pass a configured Backend object
as engine.
**kwargs: dict
Additional keyword arguments passed on to the engine open function.
For example:
@@ -729,7 +732,8 @@
currently active dask scheduler. Supported by "netcdf4", "h5netcdf",
"scipy", "pynio".

See engine open function for kwargs accepted by each specific engine.
See engine open function for kwargs accepted by each specific engine or
create an instance of the Backend and configure it in the constructor.

Notes
-----
@@ -790,7 +794,7 @@ def open_dataarray(

def open_mfdataset(
paths: str | NestedSequence[str | os.PathLike],
chunks: T_Chunks | None = None,
chunks: T_Chunks = None,
concat_dim: str
| DataArray
| Index
@@ -800,7 +804,7 @@
| None = None,
compat: CompatOptions = "no_conflicts",
preprocess: Callable[[Dataset], Dataset] | None = None,
engine: T_Engine | None = None,
engine: T_Engine = None,
data_vars: Literal["all", "minimal", "different"] | list[str] = "all",
coords="different",
combine: Literal["by_coords", "nested"] = "by_coords",
@@ -868,7 +872,7 @@ def open_mfdataset(
``ds.encoding["source"]``.
engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", \
"zarr", None}, installed backend \
or subclass of xarray.backends.BackendEntrypoint, optional
or instance or subclass of xarray.backends.BackendEntrypoint, optional
Engine to use when reading files. If not provided, the default engine
is chosen based on available dependencies, with a preference for
"netcdf4".
@@ -1092,7 +1096,7 @@ def to_netcdf(
dataset: Dataset,
path_or_file: str | os.PathLike | None = None,
mode: Literal["w", "a"] = "w",
format: T_NetcdfTypes | None = None,
format: NetcdfFormats | None = None,
group: str | None = None,
engine: T_NetcdfEngine | None = None,
encoding: Mapping[Hashable, Mapping[str, Any]] | None = None,
@@ -1111,7 +1115,7 @@
dataset: Dataset,
path_or_file: None = None,
mode: Literal["w", "a"] = "w",
format: T_NetcdfTypes | None = None,
format: NetcdfFormats | None = None,
group: str | None = None,
engine: T_NetcdfEngine | None = None,
encoding: Mapping[Hashable, Mapping[str, Any]] | None = None,
@@ -1129,7 +1133,7 @@
dataset: Dataset,
path_or_file: str | os.PathLike,
mode: Literal["w", "a"] = "w",
format: T_NetcdfTypes | None = None,
format: NetcdfFormats | None = None,
group: str | None = None,
engine: T_NetcdfEngine | None = None,
encoding: Mapping[Hashable, Mapping[str, Any]] | None = None,
@@ -1148,7 +1152,7 @@
dataset: Dataset,
path_or_file: str | os.PathLike,
mode: Literal["w", "a"] = "w",
format: T_NetcdfTypes | None = None,
format: NetcdfFormats | None = None,
group: str | None = None,
engine: T_NetcdfEngine | None = None,
encoding: Mapping[Hashable, Mapping[str, Any]] | None = None,
@@ -1167,7 +1171,7 @@
dataset: Dataset,
path_or_file: str | os.PathLike,
mode: Literal["w", "a"] = "w",
format: T_NetcdfTypes | None = None,
format: NetcdfFormats | None = None,
group: str | None = None,
engine: T_NetcdfEngine | None = None,
encoding: Mapping[Hashable, Mapping[str, Any]] | None = None,
@@ -1186,7 +1190,7 @@
dataset: Dataset,
path_or_file: str | os.PathLike,
mode: Literal["w", "a"] = "w",
format: T_NetcdfTypes | None = None,
format: NetcdfFormats | None = None,
group: str | None = None,
engine: T_NetcdfEngine | None = None,
encoding: Mapping[Hashable, Mapping[str, Any]] | None = None,
@@ -1204,7 +1208,7 @@
dataset: Dataset,
path_or_file: str | os.PathLike | None,
mode: Literal["w", "a"] = "w",
format: T_NetcdfTypes | None = None,
format: NetcdfFormats | None = None,
group: str | None = None,
engine: T_NetcdfEngine | None = None,
encoding: Mapping[Hashable, Mapping[str, Any]] | None = None,
@@ -1220,7 +1224,7 @@
dataset: Dataset,
path_or_file: str | os.PathLike | None = None,
mode: Literal["w", "a"] = "w",
format: T_NetcdfTypes | None = None,
format: NetcdfFormats | None = None,
group: str | None = None,
engine: T_NetcdfEngine | None = None,
encoding: Mapping[Hashable, Mapping[str, Any]] | None = None,
@@ -1633,14 +1637,14 @@ def to_zarr(
dataset: Dataset,
store: MutableMapping | str | os.PathLike[str] | None = None,
chunk_store: MutableMapping | str | os.PathLike | None = None,
mode: ZarrWriteModes | None = None,
mode: ZarrOpenModes | None = None,
synchronizer=None,
group: str | None = None,
encoding: Mapping | None = None,
*,
compute: Literal[True] = True,
consolidated: bool | None = None,
append_dim: Hashable | None = None,
append_dim: str | None = None,
region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None,
safe_chunks: bool = True,
storage_options: dict[str, str] | None = None,
@@ -1657,14 +1661,14 @@
dataset: Dataset,
store: MutableMapping | str | os.PathLike[str] | None = None,
chunk_store: MutableMapping | str | os.PathLike | None = None,
mode: ZarrWriteModes | None = None,
mode: ZarrOpenModes | None = None,
synchronizer=None,
group: str | None = None,
encoding: Mapping | None = None,
*,
compute: Literal[False],
consolidated: bool | None = None,
append_dim: Hashable | None = None,
append_dim: str | None = None,
region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None,
safe_chunks: bool = True,
storage_options: dict[str, str] | None = None,
@@ -1679,14 +1683,14 @@
dataset: Dataset,
store: MutableMapping | str | os.PathLike[str] | None = None,
chunk_store: MutableMapping | str | os.PathLike | None = None,
mode: ZarrWriteModes | None = None,
mode: ZarrOpenModes | None = None,
synchronizer=None,
group: str | None = None,
encoding: Mapping | None = None,
*,
compute: bool = True,
consolidated: bool | None = None,
append_dim: Hashable | None = None,
append_dim: str | None = None,
region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None,
safe_chunks: bool = True,
storage_options: dict[str, str] | None = None,
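The `to_netcdf` and `to_zarr` signatures above rely on `typing.overload` with `Literal` types so a static checker can tie the `compute` flag to the return type: an eager write returns the finished result, while a deferred write returns a handle to compute later. A self-contained sketch of that pattern, using a stand-in `Delayed` class rather than xarray's real types:

```python
from __future__ import annotations

from typing import Literal, overload


class Delayed:
    """Stand-in for a deferred-computation handle (e.g. dask's Delayed)."""


@overload
def to_store(data: bytes, *, compute: Literal[True] = ...) -> None: ...
@overload
def to_store(data: bytes, *, compute: Literal[False]) -> Delayed: ...


def to_store(data: bytes, *, compute: bool = True) -> Delayed | None:
    if compute:
        # Eager path: write immediately; nothing for the caller to hold on to.
        return None
    # Deferred path: return a token the caller can compute later.
    return Delayed()
```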
xarray/backends/common.py: 27 additions, 16 deletions
@@ -6,30 +6,41 @@
import traceback
from collections.abc import Iterable
from glob import glob
from typing import TYPE_CHECKING, Any, ClassVar
from typing import TYPE_CHECKING, ClassVar, TypeVar, overload

import numpy as np

from xarray.conventions import cf_encoder
from xarray.core import indexing
from xarray.core.parallelcompat import get_chunked_array_type
from xarray.core.pycompat import is_chunked_array
from xarray.core.types import T_BackendDatasetLike
from xarray.core.utils import FrozenDict, NdimSizeLenMixin, is_remote_uri

if TYPE_CHECKING:
from io import BufferedIOBase

from xarray.core.dataset import Dataset
from xarray.core.types import NestedSequence
from xarray.core.types import NestedSequence, T_XarrayCanOpen

# Create a logger object, but don't add any handlers. Leave that to user code.
logger = logging.getLogger(__name__)


NONE_VAR_NAME = "__values__"

T = TypeVar("T")


@overload
def _normalize_path(path: os.PathLike) -> str: # type: ignore[overload-overlap]
...


@overload
def _normalize_path(path: T) -> T:
...


def _normalize_path(path):
def _normalize_path(path: os.PathLike | T) -> str | T:
"""
Normalize pathlikes to string.

@@ -52,9 +63,9 @@ def _normalize_path(path):
path = os.fspath(path)

if isinstance(path, str) and not is_remote_uri(path):
path = os.path.abspath(os.path.expanduser(path))
return os.path.abspath(os.path.expanduser(path))

return path
return path # type: ignore[return-value]


def _find_absolute_paths(
@@ -127,9 +138,9 @@ def _decode_variable_name(name):
return name


def find_root_and_group(ds):
def find_root_and_group(ds: T_BackendDatasetLike) -> tuple[T_BackendDatasetLike, str]:
"""Find the root and group name of a netCDF4/h5netcdf dataset."""
hierarchy = ()
hierarchy: tuple[str, ...] = ()
while ds.parent is not None:
hierarchy = (ds.name.split("/")[-1],) + hierarchy
ds = ds.parent
@@ -462,20 +473,21 @@ class BackendEntrypoint:
Attributes
----------

open_dataset_parameters : tuple, default: None
A list of ``open_dataset`` method parameters.
The setting of this attribute is not mandatory.
description : str, default: ""
A short string describing the engine.
The setting of this attribute is not mandatory.
url : str, default: ""
A string with the URL to the backend's documentation.
The setting of this attribute is not mandatory.
open_dataset_parameters : tuple, default: None
A list of ``open_dataset`` method parameters.
The setting of this attribute is only mandatory if the
open_dataset method contains *args or **kwargs.
"""

open_dataset_parameters: ClassVar[tuple | None] = None
description: ClassVar[str] = ""
url: ClassVar[str] = ""
open_dataset_parameters: ClassVar[tuple[str, ...] | None] = None

def __repr__(self) -> str:
txt = f"<{type(self).__name__}>"
@@ -487,10 +499,9 @@ def __repr__(self) -> str:

def open_dataset(
self,
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
filename_or_obj: T_XarrayCanOpen,
*,
drop_variables: str | Iterable[str] | None = None,
**kwargs: Any,
) -> Dataset:
"""
Backend open_dataset method used by Xarray in :py:func:`~xarray.open_dataset`.
@@ -500,7 +511,7 @@ def open_dataset(

def guess_can_open(
self,
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
filename_or_obj: T_XarrayCanOpen,
) -> bool:
"""
Backend open_dataset method used by Xarray in :py:func:`~xarray.open_dataset`.
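The `BackendEntrypoint` attributes documented above come together when writing a third-party engine. Below is a minimal sketch of a hypothetical backend; the class name, file format, and `delimiter` option are invented for illustration. Note that `open_dataset_parameters` is set explicitly because this `open_dataset` accepts `**kwargs`:

```python
import os

import xarray as xr
from xarray.backends import BackendEntrypoint


class CSVBackendEntrypoint(BackendEntrypoint):
    # Optional metadata used in error messages and documentation.
    description = "Open simple CSV files as a Dataset (illustrative only)"
    url = "https://example.com/csv-backend-docs"
    # Mandatory here, because open_dataset below takes **kwargs.
    open_dataset_parameters = ("filename_or_obj", "drop_variables", "delimiter")

    def open_dataset(self, filename_or_obj, *, drop_variables=None, **kwargs):
        delimiter = kwargs.get("delimiter", ",")
        drop = (
            {drop_variables}
            if isinstance(drop_variables, str)
            else set(drop_variables or ())
        )
        with open(filename_or_obj) as f:
            header = f.readline().strip().split(delimiter)
            rows = [line.strip().split(delimiter) for line in f if line.strip()]
        data_vars = {
            name: ("row", [float(row[i]) for row in rows])
            for i, name in enumerate(header)
            if name not in drop
        }
        return xr.Dataset(data_vars)

    def guess_can_open(self, filename_or_obj) -> bool:
        return str(filename_or_obj).endswith(".csv")


# Usage sketch: the class form already works; with this PR, an instance
# configured via a constructor could be passed as ``engine`` as well.
# ds = xr.open_dataset("table.csv", engine=CSVBackendEntrypoint)
```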