Merge remote-tracking branch 'upstream/main' into fix-zarr-append
* upstream/main:
  Raise an informative error message when object array has mixed types (pydata#4700)
  Start renaming `dims` to `dim` (pydata#8487)
  Reduce redundancy between namedarray and variable tests (pydata#8405)
  Fix Zarr region transpose (pydata#8484)
  Refine rolling_exp error messages (pydata#8485)
  Use numbagg for `ffill` by default (pydata#8389)
  Fix bug for categorical pandas index with categories with EA dtype (pydata#8481)
  Improve "variable not found" error message (pydata#8474)
  Add whatsnew for pydata#8475 (pydata#8478)
  Allow `rank` to run on dask arrays (pydata#8475)
  Fix mypy tests (pydata#8476)
  Use concise date format when plotting (pydata#8449)
  Fix `map_blocks` docs' formatting (pydata#8464)
  Consolidate `_get_alpha` func (pydata#8465)
dcherian committed Nov 28, 2023
2 parents 8cdce90 + dc0931a commit b438628
Showing 33 changed files with 884 additions and 648 deletions.
25 changes: 25 additions & 0 deletions doc/whats-new.rst
@@ -23,6 +23,13 @@ v2023.11.1 (unreleased)
New Features
~~~~~~~~~~~~

- Use a concise format when plotting datetime arrays. (:pull:`8449`).
By `Jimmy Westling <https://github.com/illviljan>`_.


- :py:meth:`~xarray.DataArray.rank` now operates on dask-backed arrays, assuming
the core dim has exactly one chunk. (:pull:`8475`).
By `Maximilian Roos <https://github.com/max-sixty>`_.
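
  A minimal sketch of the new behavior (assuming ``dask`` is installed and the
  ranked dimension is a single chunk)::

      import numpy as np
      import xarray as xr

      da = xr.DataArray(np.random.rand(100), dims="x").chunk({"x": -1})  # one chunk along "x"
      ranked = da.rank("x")  # now returns a lazy dask-backed result instead of raising
      ranked.compute()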

Breaking changes
~~~~~~~~~~~~~~~~
@@ -31,18 +38,36 @@ Breaking changes
Deprecations
~~~~~~~~~~~~

- As part of an effort to standardize the API, we're renaming the ``dims``
keyword arg to ``dim`` for the minority of functions which currently use
``dims``. This started with :py:func:`xarray.dot` & :py:meth:`DataArray.dot`
and we'll gradually roll this out across all functions. The warnings are
currently ``PendingDeprecationWarning``, which are silenced by default. We'll
convert these to ``DeprecationWarning`` in a future release.
By `Maximilian Roos <https://github.com/max-sixty>`_.
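
  For example (a sketch; the ``PendingDeprecationWarning`` is hidden unless
  warnings are turned on)::

      import warnings

      import numpy as np
      import xarray as xr

      da = xr.DataArray(np.arange(6).reshape(2, 3), dims=["a", "b"])

      xr.dot(da, da, dim="b")  # new spelling

      with warnings.catch_warnings():
          warnings.simplefilter("always")
          xr.dot(da, da, dims="b")  # old spelling warns (PendingDeprecationWarning)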

Bug fixes
~~~~~~~~~

- Fix dtype inference for ``pd.CategoricalIndex`` when categories are backed by a ``pd.ExtensionDtype`` (:pull:`8481`); see the sketch after this list.
- Fix writing a variable that requires transposing when not writing to a region (:pull:`8484`)
By `Maximilian Roos <https://github.com/max-sixty>`_.
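
  A hypothetical illustration of the ``pd.CategoricalIndex`` scenario above
  (assuming a recent pandas; the variable names are made up)::

      import pandas as pd
      import xarray as xr

      idx = pd.CategoricalIndex(pd.array([1, 2, 3], dtype="Int64"))  # EA-dtype categories
      ds = xr.Dataset(coords={"x": idx})  # dtype inference for this coordinate was previously wrong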


Documentation
~~~~~~~~~~~~~

- Improved error message when attempting to get a variable which doesn't exist from a Dataset.
(:pull:`8474`)
By `Maximilian Roos <https://github.com/max-sixty>`_.

Internal Changes
~~~~~~~~~~~~~~~~

- :py:meth:`DataArray.bfill` & :py:meth:`DataArray.ffill` now use numbagg by
default, which is up to 5x faster where parallelization is possible. (:pull:`8339`)
By `Maximilian Roos <https://github.com/max-sixty>`_.
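
  A small example of the affected methods (``numbagg`` is optional; as described
  above, it is simply the new default engine when installed)::

      import numpy as np
      import xarray as xr

      da = xr.DataArray([1.0, np.nan, np.nan, 4.0, np.nan], dims="x")
      da.ffill("x")  # forward-fill, dispatched to numbagg when available
      da.bfill("x")  # backward-fill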

.. _whats-new.2023.11.0:

v2023.11.0 (Nov 16, 2023)
12 changes: 5 additions & 7 deletions xarray/backends/zarr.py
@@ -178,8 +178,8 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks):
# DESIGN CHOICE: do not allow multiple dask chunks on a single zarr chunk
# this avoids the need to get involved in zarr synchronization / locking
# From zarr docs:
# "If each worker in a parallel computation is writing to a separate
# region of the array, and if region boundaries are perfectly aligned
# "If each worker in a parallel computation is writing to a
# separate region of the array, and if region boundaries are perfectly aligned
# with chunk boundaries, then no synchronization is required."
# TODO: incorporate synchronizer to allow writes from multiple dask
# threads
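
A rough illustration of the aligned-chunks case this comment describes (a
sketch; the store path and chunk sizes are made up)::

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"a": ("x", np.arange(8.0))}).chunk({"x": 4})
    # Each dask chunk maps onto exactly one zarr chunk, so parallel workers
    # never touch the same zarr chunk and no synchronization is required.
    ds.to_zarr("aligned.zarr", encoding={"a": {"chunks": (4,)}}, mode="w")
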
@@ -626,12 +626,10 @@ def store(
variables_encoded.update(vars_with_encoding)

for var_name in existing_variable_names:
-    new_var = variables_encoded[var_name]
-    existing_var = existing_vars[var_name]
-    new_var = _validate_and_transpose_existing_dims(
+    variables_encoded[var_name] = _validate_and_transpose_existing_dims(
        var_name,
-        new_var,
-        existing_var,
+        variables_encoded[var_name],
+        existing_vars[var_name],
        self._write_region,
        self._append_dim,
    )
24 changes: 20 additions & 4 deletions xarray/conventions.py
@@ -52,16 +52,32 @@ def _var_as_tuple(var: Variable) -> T_VarTuple:
return var.dims, var.data, var.attrs.copy(), var.encoding.copy()


-def _infer_dtype(array, name: T_Name = None) -> np.dtype:
-    """Given an object array with no missing values, infer its dtype from its
-    first element
-    """
+def _infer_dtype(array, name=None):
+    """Given an object array with no missing values, infer its dtype from all elements."""
if array.dtype.kind != "O":
raise TypeError("infer_type must be called on a dtype=object array")

if array.size == 0:
return np.dtype(float)

+    native_dtypes = set(np.vectorize(type, otypes=[object])(array.ravel()))
+    if len(native_dtypes) > 1 and native_dtypes != {bytes, str}:
+        raise ValueError(
+            "unable to infer dtype on variable {!r}; object array "
+            "contains mixed native types: {}".format(
+                name, ", ".join(x.__name__ for x in native_dtypes)
+            )
+        )

element = array[(0,) * array.ndim]
# We use the base types to avoid subclasses of bytes and str (which might
# not play nice with e.g. hdf5 datatypes), such as those from numpy
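
With this change, a mixed-type object array fails loudly instead of being
coerced from its first element (a sketch; ``_infer_dtype`` is a private
helper)::

    import numpy as np

    arr = np.array([1, "one"], dtype=object)
    _infer_dtype(arr, name="mixed")
    # ValueError: unable to infer dtype on variable 'mixed'; object array
    # contains mixed native types: int, str
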
14 changes: 7 additions & 7 deletions xarray/core/alignment.py
@@ -324,7 +324,7 @@ def assert_no_index_conflict(self) -> None:
"- they may be used to reindex data along common dimensions"
)

-    def _need_reindex(self, dims, cmp_indexes) -> bool:
+    def _need_reindex(self, dim, cmp_indexes) -> bool:
"""Whether or not we need to reindex variables for a set of
matching indexes.
@@ -340,14 +340,14 @@ def _need_reindex(self, dims, cmp_indexes) -> bool:
return True

unindexed_dims_sizes = {}
-        for dim in dims:
-            if dim in self.unindexed_dim_sizes:
-                sizes = self.unindexed_dim_sizes[dim]
+        for d in dim:
+            if d in self.unindexed_dim_sizes:
+                sizes = self.unindexed_dim_sizes[d]
if len(sizes) > 1:
# reindex if different sizes are found for unindexed dims
return True
else:
-                unindexed_dims_sizes[dim] = next(iter(sizes))
+                unindexed_dims_sizes[d] = next(iter(sizes))

if unindexed_dims_sizes:
indexed_dims_sizes = {}
@@ -356,8 +356,8 @@ def _need_reindex(self, dims, cmp_indexes) -> bool:
for var in index_vars.values():
indexed_dims_sizes.update(var.sizes)

-        for dim, size in unindexed_dims_sizes.items():
-            if indexed_dims_sizes.get(dim, -1) != size:
+        for d, size in unindexed_dims_sizes.items():
+            if indexed_dims_sizes.get(d, -1) != size:
# reindex if unindexed dimension size doesn't match
return True

28 changes: 15 additions & 13 deletions xarray/core/computation.py
Expand Up @@ -26,6 +26,7 @@
from xarray.core.types import Dims, T_DataArray
from xarray.core.utils import is_dict_like, is_scalar
from xarray.core.variable import Variable
+from xarray.util.deprecation_helpers import deprecate_dims

if TYPE_CHECKING:
from xarray.core.coordinates import Coordinates
@@ -1691,9 +1692,10 @@ def cross(
return c


+@deprecate_dims
def dot(
*arrays,
-    dims: Dims = None,
+    dim: Dims = None,
**kwargs: Any,
):
"""Generalized dot product for xarray objects. Like ``np.einsum``, but
@@ -1703,7 +1705,7 @@ def dot(
----------
*arrays : DataArray or Variable
Arrays to compute.
-    dims : str, iterable of hashable, "..." or None, optional
+    dim : str, iterable of hashable, "..." or None, optional
Which dimensions to sum over. Ellipsis ('...') sums over all dimensions.
If not specified, then all the common dimensions are summed over.
**kwargs : dict
@@ -1756,18 +1758,18 @@ def dot(
[3, 4, 5]])
Dimensions without coordinates: c, d
>>> xr.dot(da_a, da_b, dims=["a", "b"])
>>> xr.dot(da_a, da_b, dim=["a", "b"])
<xarray.DataArray (c: 2)>
array([110, 125])
Dimensions without coordinates: c
>>> xr.dot(da_a, da_b, dims=["a"])
>>> xr.dot(da_a, da_b, dim=["a"])
<xarray.DataArray (b: 2, c: 2)>
array([[40, 46],
[70, 79]])
Dimensions without coordinates: b, c
-    >>> xr.dot(da_a, da_b, da_c, dims=["b", "c"])
+    >>> xr.dot(da_a, da_b, da_c, dim=["b", "c"])
<xarray.DataArray (a: 3, d: 3)>
array([[ 9, 14, 19],
[ 93, 150, 207],
@@ -1779,7 +1781,7 @@ def dot(
array([110, 125])
Dimensions without coordinates: c
-    >>> xr.dot(da_a, da_b, dims=...)
+    >>> xr.dot(da_a, da_b, dim=...)
<xarray.DataArray ()>
array(235)
"""
@@ -1803,18 +1805,18 @@ def dot(
einsum_axes = "abcdefghijklmnopqrstuvwxyz"
dim_map = {d: einsum_axes[i] for i, d in enumerate(all_dims)}

-    if dims is ...:
-        dims = all_dims
-    elif isinstance(dims, str):
-        dims = (dims,)
-    elif dims is None:
+    if dim is ...:
+        dim = all_dims
+    elif isinstance(dim, str):
+        dim = (dim,)
+    elif dim is None:
# find dimensions that occur more than once
dim_counts: Counter = Counter()
for arr in arrays:
dim_counts.update(arr.dims)
-        dims = tuple(d for d, c in dim_counts.items() if c > 1)
+        dim = tuple(d for d, c in dim_counts.items() if c > 1)

-    dot_dims: set[Hashable] = set(dims)
+    dot_dims: set[Hashable] = set(dim)

# dimensions to be parallelized
broadcast_dims = common_dims - dot_dims
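
For reference, a minimal sketch of what the ``deprecate_dims`` helper could
look like (an assumed illustration, not the actual implementation in
``xarray.util.deprecation_helpers``)::

    import functools
    import warnings

    def deprecate_dims(func):
        @functools.wraps(func)
        def wrapper(*args, dims=None, **kwargs):
            if dims is not None:
                # The real helper presumably also guards against passing
                # both `dim` and `dims` at once.
                warnings.warn(
                    "'dims' has been renamed to 'dim'",
                    PendingDeprecationWarning,
                    stacklevel=2,
                )
                kwargs["dim"] = dims
            return func(*args, **kwargs)

        return wrapper
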
5 changes: 3 additions & 2 deletions xarray/core/dask_array_ops.py
@@ -59,10 +59,11 @@ def push(array, n, axis):
"""
Dask-aware bottleneck.push
"""
-import bottleneck
import dask.array as da
import numpy as np

+from xarray.core.duck_array_ops import _push

def _fill_with_last_one(a, b):
# cumreduction applies the push func over all the blocks first, so the only missing part is filling
# the missing values using the last data of the previous chunk
@@ -85,7 +86,7 @@ def _fill_with_last_one(a, b):

# The `method` parameter makes the tests fail on Python 3.7.
return da.reductions.cumreduction(
-        func=bottleneck.push,
+        func=_push,
binop=_fill_with_last_one,
ident=np.nan,
x=array,
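
A quick sketch of what this code path enables (assuming ``dask`` is
installed)::

    import numpy as np
    import xarray as xr

    da = xr.DataArray([1.0, np.nan, np.nan, np.nan], dims="x").chunk({"x": 2})
    # The filled value has to cross the chunk boundary, which is what the
    # cumreduction + _fill_with_last_one combination above handles.
    da.ffill("x").compute()
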
17 changes: 9 additions & 8 deletions xarray/core/dataarray.py
Expand Up @@ -70,7 +70,7 @@
)
from xarray.plot.accessor import DataArrayPlotAccessor
from xarray.plot.utils import _get_units_from_attrs
from xarray.util.deprecation_helpers import _deprecate_positional_args
from xarray.util.deprecation_helpers import _deprecate_positional_args, deprecate_dims

if TYPE_CHECKING:
from typing import TypeVar, Union
@@ -120,14 +120,14 @@
T_XarrayOther = TypeVar("T_XarrayOther", bound=Union["DataArray", Dataset])


-def _check_coords_dims(shape, coords, dims):
-    sizes = dict(zip(dims, shape))
+def _check_coords_dims(shape, coords, dim):
+    sizes = dict(zip(dim, shape))
for k, v in coords.items():
-        if any(d not in dims for d in v.dims):
+        if any(d not in dim for d in v.dims):
raise ValueError(
f"coordinate {k} has dimensions {v.dims}, but these "
"are not a subset of the DataArray "
f"dimensions {dims}"
f"dimensions {dim}"
)

for d, s in v.sizes.items():
@@ -4901,10 +4901,11 @@ def imag(self) -> Self:
"""
return self._replace(self.variable.imag)

+    @deprecate_dims
def dot(
self,
other: T_Xarray,
-        dims: Dims = None,
+        dim: Dims = None,
) -> T_Xarray:
"""Perform dot product of two DataArrays along their shared dims.
@@ -4914,7 +4915,7 @@ def dot(
----------
other : DataArray
The other array with which the dot product is performed.
-        dims : ..., str, Iterable of Hashable or None, optional
+        dim : ..., str, Iterable of Hashable or None, optional
Which dimensions to sum over. Ellipsis (`...`) sums over all dimensions.
If not specified, then all the common dimensions are summed over.
@@ -4953,7 +4954,7 @@ def dot(
if not isinstance(other, DataArray):
raise TypeError("dot only operates on DataArrays.")

-        return computation.dot(self, other, dims=dims)
+        return computation.dot(self, other, dim=dim)

def sortby(
self,
10 changes: 9 additions & 1 deletion xarray/core/dataset.py
@@ -1540,10 +1540,18 @@ def __getitem__(
Indexing with a list of names will return a new ``Dataset`` object.
"""
+from xarray.core.formatting import shorten_list_repr

if utils.is_dict_like(key):
return self.isel(**key)
if utils.hashable(key):
-    return self._construct_dataarray(key)
+    try:
+        return self._construct_dataarray(key)
+    except KeyError as e:
+        raise KeyError(
+            f"No variable named {key!r}. Variables on the dataset include {shorten_list_repr(list(self.variables.keys()), max_items=10)}"
+        ) from e

if utils.iterable_of_hashable(key):
return self._copy_listed(key)
raise ValueError(f"Unsupported key-type {type(key)}")
