Merge remote-tracking branch 'upstream/main' into fix-zarr-append
* upstream/main:
  Raise an informative error message when object array has mixed types (pydata#4700)
  Start renaming `dims` to `dim` (pydata#8487)
  Reduce redundancy between namedarray and variable tests (pydata#8405)
  Fix Zarr region transpose (pydata#8484)
  Refine rolling_exp error messages (pydata#8485)
  Use numbagg for `ffill` by default (pydata#8389)
  Fix bug for categorical pandas index with categories with EA dtype (pydata#8481)
  Improve "variable not found" error message (pydata#8474)
  Add whatsnew for pydata#8475 (pydata#8478)
  Allow `rank` to run on dask arrays (pydata#8475)
  Fix mypy tests (pydata#8476)
  Use concise date format when plotting (pydata#8449)
  Fix `map_blocks` docs' formatting (pydata#8464)
  Consolidate `_get_alpha` func (pydata#8465)
dcherian committed Nov 28, 2023
2 parents 8cdce90 + dc0931a commit b438628
Showing 33 changed files with 884 additions and 648 deletions.
25 changes: 25 additions & 0 deletions doc/whats-new.rst
@@ -23,6 +23,13 @@ v2023.11.1 (unreleased)
New Features
~~~~~~~~~~~~

- Use a concise format when plotting datetime arrays. (:pull:`8449`).
By `Jimmy Westling <https://github.com/illviljan>`_.


- :py:meth:`~xarray.DataArray.rank` now operates on dask-backed arrays, assuming
the core dim has exactly one chunk. (:pull:`8475`).
By `Maximilian Roos <https://github.com/max-sixty>`_.
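
  A minimal sketch of the new behavior (assuming ``dask`` is installed and the
  ranked dimension is a single chunk)::

      import numpy as np
      import xarray as xr

      da = xr.DataArray(np.random.rand(100), dims="x").chunk({"x": -1})  # one chunk along "x"
      ranked = da.rank("x")  # now returns a lazy dask-backed result instead of raising
      ranked.compute()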

Breaking changes
~~~~~~~~~~~~~~~~
@@ -31,18 +38,36 @@ Breaking changes
Deprecations
~~~~~~~~~~~~

- As part of an effort to standardize the API, we're renaming the ``dims``
keyword arg to ``dim`` for the minority of functions which currently use
``dims``. This started with :py:func:`xarray.dot` & :py:meth:`DataArray.dot`
and we'll gradually roll this out across all functions. The warnings are
currently ``PendingDeprecationWarning``, which are silenced by default. We'll
convert these to ``DeprecationWarning`` in a future release.
By `Maximilian Roos <https://github.com/max-sixty>`_.
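
  For example (a sketch; the ``PendingDeprecationWarning`` is hidden unless
  warnings are turned on)::

      import warnings

      import numpy as np
      import xarray as xr

      da = xr.DataArray(np.arange(6).reshape(2, 3), dims=["a", "b"])

      xr.dot(da, da, dim="b")  # new spelling

      with warnings.catch_warnings():
          warnings.simplefilter("always")
          xr.dot(da, da, dims="b")  # old spelling warns (PendingDeprecationWarning)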

Bug fixes
~~~~~~~~~

- Fix dtype inference for ``pd.CategoricalIndex`` when categories are backed by a ``pd.ExtensionDtype`` (:pull:`8481`); see the sketch after this list.
- Fix writing a variable that requires transposing when not writing to a region (:pull:`8484`)
By `Maximilian Roos <https://github.com/max-sixty>`_.
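
  A hypothetical illustration of the ``pd.CategoricalIndex`` scenario above
  (assuming a recent pandas; the variable names are made up)::

      import pandas as pd
      import xarray as xr

      idx = pd.CategoricalIndex(pd.array([1, 2, 3], dtype="Int64"))  # EA-dtype categories
      ds = xr.Dataset(coords={"x": idx})  # dtype inference for this coordinate was previously wrong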


Documentation
~~~~~~~~~~~~~

- Improved error message when attempting to get a variable which doesn't exist from a Dataset.
(:pull:`8474`)
By `Maximilian Roos <https://github.com/max-sixty>`_.

Internal Changes
~~~~~~~~~~~~~~~~

- :py:meth:`DataArray.bfill` & :py:meth:`DataArray.ffill` now use numbagg by
default, which is up to 5x faster where parallelization is possible. (:pull:`8339`)
By `Maximilian Roos <https://github.com/max-sixty>`_.
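
  A small example of the affected methods (``numbagg`` is optional; as described
  above, it is simply the new default engine when installed)::

      import numpy as np
      import xarray as xr

      da = xr.DataArray([1.0, np.nan, np.nan, 4.0, np.nan], dims="x")
      da.ffill("x")  # forward-fill, dispatched to numbagg when available
      da.bfill("x")  # backward-fill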

.. _whats-new.2023.11.0:

v2023.11.0 (Nov 16, 2023)
12 changes: 5 additions & 7 deletions xarray/backends/zarr.py
@@ -178,8 +178,8 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks):
# DESIGN CHOICE: do not allow multiple dask chunks on a single zarr chunk
# this avoids the need to get involved in zarr synchronization / locking
# From zarr docs:
# "If each worker in a parallel computation is writing to a separate
# region of the array, and if region boundaries are perfectly aligned
# "If each worker in a parallel computation is writing to a
# separate region of the array, and if region boundaries are perfectly aligned
# with chunk boundaries, then no synchronization is required."
# TODO: incorporate synchronizer to allow writes from multiple dask
# threads
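
A rough illustration of the aligned-chunks case this comment describes (a
sketch; the store path and chunk sizes are made up)::

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"a": ("x", np.arange(8.0))}).chunk({"x": 4})
    # Each dask chunk maps onto exactly one zarr chunk, so parallel workers
    # never touch the same zarr chunk and no synchronization is required.
    ds.to_zarr("aligned.zarr", encoding={"a": {"chunks": (4,)}}, mode="w")
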
@@ -626,12 +626,10 @@ def store(
variables_encoded.update(vars_with_encoding)

for var_name in existing_variable_names:
-    new_var = variables_encoded[var_name]
-    existing_var = existing_vars[var_name]
-    new_var = _validate_and_transpose_existing_dims(
+    variables_encoded[var_name] = _validate_and_transpose_existing_dims(
        var_name,
-        new_var,
-        existing_var,
+        variables_encoded[var_name],
+        existing_vars[var_name],
        self._write_region,
        self._append_dim,
    )
24 changes: 20 additions & 4 deletions xarray/conventions.py
@@ -52,16 +52,32 @@ def _var_as_tuple(var: Variable) -> T_VarTuple:
return var.dims, var.data, var.attrs.copy(), var.encoding.copy()


-def _infer_dtype(array, name: T_Name = None) -> np.dtype:
-    """Given an object array with no missing values, infer its dtype from its
-    first element
-    """
+def _infer_dtype(array, name=None):
+    """Given an object array with no missing values, infer its dtype from all elements."""
if array.dtype.kind != "O":
raise TypeError("infer_type must be called on a dtype=object array")

if array.size == 0:
return np.dtype(float)

+    native_dtypes = set(np.vectorize(type, otypes=[object])(array.ravel()))
+    if len(native_dtypes) > 1 and native_dtypes != {bytes, str}:
+        raise ValueError(
+            "unable to infer dtype on variable {!r}; object array "
+            "contains mixed native types: {}".format(
+                name, ", ".join(x.__name__ for x in native_dtypes)
+            )
+        )

element = array[(0,) * array.ndim]
# We use the base types to avoid subclasses of bytes and str (which might
# not play nice with e.g. hdf5 datatypes), such as those from numpy
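
With this change, a mixed-type object array fails loudly instead of being
coerced from its first element (a sketch; ``_infer_dtype`` is a private
helper)::

    import numpy as np

    arr = np.array([1, "one"], dtype=object)
    _infer_dtype(arr, name="mixed")
    # ValueError: unable to infer dtype on variable 'mixed'; object array
    # contains mixed native types: int, str
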
14 changes: 7 additions & 7 deletions xarray/core/alignment.py
@@ -324,7 +324,7 @@ def assert_no_index_conflict(self) -> None:
"- they may be used to reindex data along common dimensions"
)

-    def _need_reindex(self, dims, cmp_indexes) -> bool:
+    def _need_reindex(self, dim, cmp_indexes) -> bool:
"""Whether or not we need to reindex variables for a set of
matching indexes.
@@ -340,14 +340,14 @@ def _need_reindex(self, dims, cmp_indexes) -> bool:
return True

unindexed_dims_sizes = {}
-        for dim in dims:
-            if dim in self.unindexed_dim_sizes:
-                sizes = self.unindexed_dim_sizes[dim]
+        for d in dim:
+            if d in self.unindexed_dim_sizes:
+                sizes = self.unindexed_dim_sizes[d]
if len(sizes) > 1:
# reindex if different sizes are found for unindexed dims
return True
else:
-                unindexed_dims_sizes[dim] = next(iter(sizes))
+                unindexed_dims_sizes[d] = next(iter(sizes))

if unindexed_dims_sizes:
indexed_dims_sizes = {}
@@ -356,8 +356,8 @@ def _need_reindex(self, dims, cmp_indexes) -> bool:
for var in index_vars.values():
indexed_dims_sizes.update(var.sizes)

-        for dim, size in unindexed_dims_sizes.items():
-            if indexed_dims_sizes.get(dim, -1) != size:
+        for d, size in unindexed_dims_sizes.items():
+            if indexed_dims_sizes.get(d, -1) != size:
# reindex if unindexed dimension size doesn't match
return True

28 changes: 15 additions & 13 deletions xarray/core/computation.py
Expand Up @@ -26,6 +26,7 @@
from xarray.core.types import Dims, T_DataArray
from xarray.core.utils import is_dict_like, is_scalar
from xarray.core.variable import Variable
+from xarray.util.deprecation_helpers import deprecate_dims

if TYPE_CHECKING:
from xarray.core.coordinates import Coordinates
@@ -1691,9 +1692,10 @@ def cross(
return c


+@deprecate_dims
def dot(
*arrays,
-    dims: Dims = None,
+    dim: Dims = None,
**kwargs: Any,
):
"""Generalized dot product for xarray objects. Like ``np.einsum``, but
@@ -1703,7 +1705,7 @@ def dot(
----------
*arrays : DataArray or Variable
Arrays to compute.
-    dims : str, iterable of hashable, "..." or None, optional
+    dim : str, iterable of hashable, "..." or None, optional
Which dimensions to sum over. Ellipsis ('...') sums over all dimensions.
If not specified, then all the common dimensions are summed over.
**kwargs : dict
@@ -1756,18 +1758,18 @@ def dot(
[3, 4, 5]])
Dimensions without coordinates: c, d
>>> xr.dot(da_a, da_b, dims=["a", "b"])
>>> xr.dot(da_a, da_b, dim=["a", "b"])
<xarray.DataArray (c: 2)>
array([110, 125])
Dimensions without coordinates: c
>>> xr.dot(da_a, da_b, dims=["a"])
>>> xr.dot(da_a, da_b, dim=["a"])
<xarray.DataArray (b: 2, c: 2)>
array([[40, 46],
[70, 79]])
Dimensions without coordinates: b, c
-    >>> xr.dot(da_a, da_b, da_c, dims=["b", "c"])
+    >>> xr.dot(da_a, da_b, da_c, dim=["b", "c"])
<xarray.DataArray (a: 3, d: 3)>
array([[ 9, 14, 19],
[ 93, 150, 207],
@@ -1779,7 +1781,7 @@ def dot(
array([110, 125])
Dimensions without coordinates: c
-    >>> xr.dot(da_a, da_b, dims=...)
+    >>> xr.dot(da_a, da_b, dim=...)
<xarray.DataArray ()>
array(235)
"""
@@ -1803,18 +1805,18 @@ def dot(
einsum_axes = "abcdefghijklmnopqrstuvwxyz"
dim_map = {d: einsum_axes[i] for i, d in enumerate(all_dims)}

-    if dims is ...:
-        dims = all_dims
-    elif isinstance(dims, str):
-        dims = (dims,)
-    elif dims is None:
+    if dim is ...:
+        dim = all_dims
+    elif isinstance(dim, str):
+        dim = (dim,)
+    elif dim is None:
# find dimensions that occur more than once
dim_counts: Counter = Counter()
for arr in arrays:
dim_counts.update(arr.dims)
-        dims = tuple(d for d, c in dim_counts.items() if c > 1)
+        dim = tuple(d for d, c in dim_counts.items() if c > 1)

-    dot_dims: set[Hashable] = set(dims)
+    dot_dims: set[Hashable] = set(dim)

# dimensions to be parallelized
broadcast_dims = common_dims - dot_dims
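
For reference, a minimal sketch of what the ``deprecate_dims`` helper could
look like (an assumed illustration, not the actual implementation in
``xarray.util.deprecation_helpers``)::

    import functools
    import warnings

    def deprecate_dims(func):
        @functools.wraps(func)
        def wrapper(*args, dims=None, **kwargs):
            if dims is not None:
                # The real helper presumably also guards against passing
                # both `dim` and `dims` at once.
                warnings.warn(
                    "'dims' has been renamed to 'dim'",
                    PendingDeprecationWarning,
                    stacklevel=2,
                )
                kwargs["dim"] = dims
            return func(*args, **kwargs)

        return wrapper
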
5 changes: 3 additions & 2 deletions xarray/core/dask_array_ops.py
@@ -59,10 +59,11 @@ def push(array, n, axis):
"""
Dask-aware bottleneck.push
"""
-import bottleneck
import dask.array as da
import numpy as np

+from xarray.core.duck_array_ops import _push

def _fill_with_last_one(a, b):
# cumreduction applies the push func over all the blocks first, so the only missing part is filling
# the missing values using the last data of the previous chunk
@@ -85,7 +86,7 @@ def _fill_with_last_one(a, b):

# The `method` parameter makes the tests fail on Python 3.7.
return da.reductions.cumreduction(
-        func=bottleneck.push,
+        func=_push,
binop=_fill_with_last_one,
ident=np.nan,
x=array,
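
A quick sketch of what this code path enables (assuming ``dask`` is
installed)::

    import numpy as np
    import xarray as xr

    da = xr.DataArray([1.0, np.nan, np.nan, np.nan], dims="x").chunk({"x": 2})
    # The filled value has to cross the chunk boundary, which is what the
    # cumreduction + _fill_with_last_one combination above handles.
    da.ffill("x").compute()
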
17 changes: 9 additions & 8 deletions xarray/core/dataarray.py
Expand Up @@ -70,7 +70,7 @@
)
from xarray.plot.accessor import DataArrayPlotAccessor
from xarray.plot.utils import _get_units_from_attrs
from xarray.util.deprecation_helpers import _deprecate_positional_args
from xarray.util.deprecation_helpers import _deprecate_positional_args, deprecate_dims

if TYPE_CHECKING:
from typing import TypeVar, Union
@@ -120,14 +120,14 @@
T_XarrayOther = TypeVar("T_XarrayOther", bound=Union["DataArray", Dataset])


-def _check_coords_dims(shape, coords, dims):
-    sizes = dict(zip(dims, shape))
+def _check_coords_dims(shape, coords, dim):
+    sizes = dict(zip(dim, shape))
for k, v in coords.items():
-        if any(d not in dims for d in v.dims):
+        if any(d not in dim for d in v.dims):
raise ValueError(
f"coordinate {k} has dimensions {v.dims}, but these "
"are not a subset of the DataArray "
f"dimensions {dims}"
f"dimensions {dim}"
)

for d, s in v.sizes.items():
@@ -4901,10 +4901,11 @@ def imag(self) -> Self:
"""
return self._replace(self.variable.imag)

+    @deprecate_dims
def dot(
self,
other: T_Xarray,
-        dims: Dims = None,
+        dim: Dims = None,
) -> T_Xarray:
"""Perform dot product of two DataArrays along their shared dims.
@@ -4914,7 +4915,7 @@ def dot(
----------
other : DataArray
The other array with which the dot product is performed.
-        dims : ..., str, Iterable of Hashable or None, optional
+        dim : ..., str, Iterable of Hashable or None, optional
Which dimensions to sum over. Ellipsis (`...`) sums over all dimensions.
If not specified, then all the common dimensions are summed over.
@@ -4953,7 +4954,7 @@ def dot(
if not isinstance(other, DataArray):
raise TypeError("dot only operates on DataArrays.")

-        return computation.dot(self, other, dims=dims)
+        return computation.dot(self, other, dim=dim)

def sortby(
self,
10 changes: 9 additions & 1 deletion xarray/core/dataset.py
@@ -1540,10 +1540,18 @@ def __getitem__(
Indexing with a list of names will return a new ``Dataset`` object.
"""
+from xarray.core.formatting import shorten_list_repr

if utils.is_dict_like(key):
return self.isel(**key)
if utils.hashable(key):
-    return self._construct_dataarray(key)
+    try:
+        return self._construct_dataarray(key)
+    except KeyError as e:
+        raise KeyError(
+            f"No variable named {key!r}. Variables on the dataset include {shorten_list_repr(list(self.variables.keys()), max_items=10)}"
+        ) from e

if utils.iterable_of_hashable(key):
return self._copy_listed(key)
raise ValueError(f"Unsupported key-type {type(key)}")
