Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ Enhancements
using ``join=outer``. By `Guido Imperiale <https://github.com/crusaderky>`_.
- Better error message when assigning variables without dimensions
(:issue:`971`). By `Stephan Hoyer <https://github.com/shoyer>`_.
- Better error message when reindex/align fails due to duplicate index values
(:issue:`956`). By `Stephan Hoyer <https://github.com/shoyer>`_.

Bug fixes
~~~~~~~~~
Expand All @@ -53,6 +55,8 @@ Bug fixes
By `Stephan Hoyer <https://github.com/shoyer>`_.
- ``groupby_bins`` sorted bin labels as strings (:issue:`952`).
By `Stephan Hoyer <https://github.com/shoyer>`_.
- Fix a bug introduced in v0.8.0 that broke assignment to datasets when both
  the left- and right-hand sides have the same non-unique index values
  (:issue:`956`).

.. _whats-new.0.8.1:

Expand Down
51 changes: 28 additions & 23 deletions xarray/core/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,26 +25,6 @@ def _get_joiner(join):
raise ValueError('invalid value for join: %s' % join)


def _get_all_indexes(objects, exclude=frozenset()):
    """Collect indexes from every object, grouped by dimension name.

    Parameters
    ----------
    objects : iterable
        Objects exposing an ``indexes`` mapping of dimension name ->
        pandas.Index.
    exclude : set-like, optional
        Dimension names to skip.

    Returns
    -------
    defaultdict
        Maps each dimension name to the list of indexes found for it,
        in object order.
    """
    # frozenset() rather than the original mutable default set(): an
    # immutable default avoids the shared-mutable-default pitfall.
    all_indexes = defaultdict(list)
    for obj in objects:
        for k, v in iteritems(obj.indexes):
            if k not in exclude:
                all_indexes[k].append(v)
    return all_indexes


def _join_indexes(join, objects, exclude=frozenset()):
    """Join the indexes of ``objects`` along each shared dimension.

    Parameters
    ----------
    join : str
        Join method understood by ``_get_joiner`` (e.g. 'inner', 'outer').
    objects : iterable
        Objects exposing an ``indexes`` mapping.
    exclude : set-like, optional
        Dimension names to skip.

    Returns
    -------
    dict
        Maps dimension name -> joined pandas.Index, only for dimensions
        whose indexes actually differ between objects.
    """
    joiner = _get_joiner(join)
    indexes = _get_all_indexes(objects, exclude=exclude)
    # exclude dimensions with all equal indices (the usual case) to avoid
    # unnecessary reindexing work.
    # TODO: don't bother to check equals for left or right joins
    joined_indexes = dict((k, joiner(v)) for k, v in iteritems(indexes)
                          if any(not v[0].equals(idx) for idx in v[1:]))
    return joined_indexes


def align(*objects, **kwargs):
"""align(*objects, join='inner', copy=True)

Expand Down Expand Up @@ -87,15 +67,36 @@ def align(*objects, **kwargs):
copy = kwargs.pop('copy', True)
indexes = kwargs.pop('indexes', None)
exclude = kwargs.pop('exclude', None)
if indexes is None:
indexes = {}
if exclude is None:
exclude = set()
if kwargs:
raise TypeError('align() got unexpected keyword arguments: %s'
% list(kwargs))

joined_indexes = _join_indexes(join, objects, exclude=exclude)
if indexes is not None:
joined_indexes.update(indexes)
all_indexes = defaultdict(list)
for obj in objects:
for dim, index in iteritems(obj.indexes):
if dim not in exclude:
all_indexes[dim].append(index)

# We don't join over dimensions with all equal indexes for two reasons:
# - It's faster for the usual case (already aligned objects).
# - It ensures it's possible to do operations that don't require alignment
# on indexes with duplicate values (which cannot be reindexed with
# pandas). This is useful, e.g., for overwriting such duplicate indexes.
joiner = _get_joiner(join)
joined_indexes = {}
for dim, dim_indexes in iteritems(all_indexes):
if dim in indexes:
index = utils.safe_cast_to_index(indexes[dim])
if any(not index.equals(other) for other in dim_indexes):
joined_indexes[dim] = index
else:
if any(not dim_indexes[0].equals(other)
for other in dim_indexes[1:]):
joined_indexes[dim] = joiner(dim_indexes)

result = []
for obj in objects:
Expand Down Expand Up @@ -163,6 +164,10 @@ def reindex_variables(variables, indexes, indexers, method=None,
to_shape[name] = index.size
if name in indexers:
target = utils.safe_cast_to_index(indexers[name])
if not index.is_unique:
raise ValueError(
'cannot reindex or align along dimension %r because the '
'index has duplicate values' % name)
indexer = index.get_indexer(target, method=method,
**get_indexer_kwargs)

Expand Down
3 changes: 1 addition & 2 deletions xarray/core/coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,7 @@ def to_index(self, ordered_dims=None):
def update(self, other):
other_vars = getattr(other, 'variables', other)
coords = merge_coords([self.variables, other_vars],
priority_arg=1, indexes=self.indexes,
indexes_from_arg=0)
priority_arg=1, indexes=self.indexes)
self._update_coords(coords)

def _merge_raw(self, other):
Expand Down
28 changes: 6 additions & 22 deletions xarray/core/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,8 +281,7 @@ def merge_coords_without_align(objs, priority_vars=None):
return variables


def _align_for_merge(input_objects, join, copy, indexes=None,
indexes_from_arg=None):
def _align_for_merge(input_objects, join, copy, indexes=None):
"""Align objects for merging, recursing into dictionary values.
"""
if indexes is None:
Expand Down Expand Up @@ -314,14 +313,6 @@ def is_alignable(obj):
targets.append(v)
out.append(OrderedDict(variables))

if not targets or (len(targets) == 1 and
(indexes is None or positions == [indexes_from_arg])):
# Don't align if we only have one object to align and it's already
# aligned to itself. This ensures it's possible to overwrite index
# coordinates with non-unique values, which cannot be reindexed:
# https://github.com/pydata/xarray/issues/943
return input_objects

aligned = align(*targets, join=join, copy=copy, indexes=indexes)

for position, key, aligned_obj in zip(positions, keys, aligned):
Expand Down Expand Up @@ -366,7 +357,7 @@ def _get_priority_vars(objects, priority_arg, compat='equals'):


def merge_coords(objs, compat='minimal', join='outer', priority_arg=None,
indexes=None, indexes_from_arg=False):
indexes=None):
"""Merge coordinate variables.

See merge_core below for argument descriptions. This works similarly to
Expand All @@ -375,8 +366,7 @@ def merge_coords(objs, compat='minimal', join='outer', priority_arg=None,
"""
_assert_compat_valid(compat)
coerced = coerce_pandas_values(objs)
aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes,
indexes_from_arg=indexes_from_arg)
aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes)
expanded = expand_variable_dicts(aligned)
priority_vars = _get_priority_vars(aligned, priority_arg, compat=compat)
variables = merge_variables(expanded, priority_vars, compat=compat)
Expand All @@ -393,7 +383,7 @@ def merge_data_and_coords(data, coords, compat='broadcast_equals',


def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None,
explicit_coords=None, indexes=None, indexes_from_arg=False):
explicit_coords=None, indexes=None):
"""Core logic for merging labeled objects.

This is not public API.
Expand All @@ -412,10 +402,6 @@ def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None,
An explicit list of variables from `objs` that are coordinates.
indexes : dict, optional
Dictionary with values given by pandas.Index objects.
indexes_from_arg : boolean, optional
If indexes were provided, were these taken from one of the objects in
``objs``? This allows us to skip alignment if this object is the only
one to be aligned.

Returns
-------
Expand All @@ -435,8 +421,7 @@ def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None,
_assert_compat_valid(compat)

coerced = coerce_pandas_values(objs)
aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes,
indexes_from_arg=indexes_from_arg)
aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes)
expanded = expand_variable_dicts(aligned)

coord_names, noncoord_names = determine_coords(coerced)
Expand Down Expand Up @@ -552,5 +537,4 @@ def dataset_merge_method(dataset, other, overwrite_vars=frozenset(),

def dataset_update_method(dataset, other):
"""Guts of the Dataset.update method"""
return merge_core([dataset, other], priority_arg=1, indexes=dataset.indexes,
indexes_from_arg=0)
return merge_core([dataset, other], priority_arg=1, indexes=dataset.indexes)
41 changes: 35 additions & 6 deletions xarray/test/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1066,20 +1066,29 @@ def test_align(self):
align(left, right, foo='bar')

def test_align_exclude(self):
x = Dataset({'foo': DataArray([[1, 2],[3, 4]], dims=['x', 'y'], coords={'x': [1, 2], 'y': [3, 4]})})
y = Dataset({'bar': DataArray([[1, 2],[3, 4]], dims=['x', 'y'], coords={'x': [1, 3], 'y': [5, 6]})})
x = Dataset({'foo': DataArray([[1, 2],[3, 4]], dims=['x', 'y'],
coords={'x': [1, 2], 'y': [3, 4]})})
y = Dataset({'bar': DataArray([[1, 2],[3, 4]], dims=['x', 'y'],
coords={'x': [1, 3], 'y': [5, 6]})})
x2, y2 = align(x, y, exclude=['y'], join='outer')

expected_x2 = Dataset({'foo': DataArray([[1, 2], [3, 4], [np.nan, np.nan]], dims=['x', 'y'], coords={'x': [1, 2, 3], 'y': [3, 4]})})
expected_y2 = Dataset({'bar': DataArray([[1, 2], [np.nan, np.nan], [3, 4]], dims=['x', 'y'], coords={'x': [1, 2, 3], 'y': [5, 6]})})
expected_x2 = Dataset(
{'foo': DataArray([[1, 2], [3, 4], [np.nan, np.nan]],
dims=['x', 'y'],
coords={'x': [1, 2, 3], 'y': [3, 4]})})
expected_y2 = Dataset(
{'bar': DataArray([[1, 2], [np.nan, np.nan], [3, 4]],
dims=['x', 'y'],
coords={'x': [1, 2, 3], 'y': [5, 6]})})
self.assertDatasetIdentical(expected_x2, x2)
self.assertDatasetIdentical(expected_y2, y2)

def test_align_nocopy(self):
x = Dataset({'foo': DataArray([1, 2, 3], coords={'x': [1, 2, 3]})})
y = Dataset({'foo': DataArray([1, 2], coords={'x': [1, 2]})})
expected_x2 = x
expected_y2 = Dataset({'foo': DataArray([1, 2, np.nan], coords={'x': [1, 2, 3]})})
expected_y2 = Dataset({'foo': DataArray([1, 2, np.nan],
coords={'x': [1, 2, 3]})})

x2, y2 = align(x, y, copy=False, join='outer')
self.assertDatasetIdentical(expected_x2, x2)
Expand All @@ -1097,6 +1106,15 @@ def test_align_indexes(self):
expected_x2 = Dataset({'foo': DataArray([2, 3, 1], coords={'x': [2, 3, 1]})})
self.assertDatasetIdentical(expected_x2, x2)

def test_align_non_unique(self):
x = Dataset({'foo': ('x', [3, 4, 5]), 'x': [0, 0, 1]})
x1, x2 = align(x, x)
assert x1.identical(x) and x2.identical(x)

y = Dataset({'bar': ('x', [6, 7]), 'x': [0, 1]})
with self.assertRaisesRegexp(ValueError, 'cannot reindex or align'):
align(x, y)

def test_broadcast(self):
ds = Dataset({'foo': 0, 'bar': ('x', [1]), 'baz': ('y', [2, 3])},
{'c': ('x', [4])})
Expand Down Expand Up @@ -1573,7 +1591,7 @@ def test_assign(self):
expected = expected.set_coords('c')
self.assertDatasetIdentical(actual, expected)

def test_setitem_non_unique_index(self):
def test_setitem_original_non_unique_index(self):
# regression test for GH943
original = Dataset({'data': ('x', np.arange(5))},
coords={'x': [0, 1, 2, 0, 1]})
Expand All @@ -1591,6 +1609,17 @@ def test_setitem_non_unique_index(self):
actual.coords['x'] = list(range(5))
self.assertDatasetIdentical(actual, expected)

def test_setitem_both_non_unique_index(self):
# regression test for GH956
names = ['joaquin', 'manolo', 'joaquin']
values = np.random.randint(0, 256, (3, 4, 4))
array = DataArray(values, dims=['name', 'row', 'column'],
coords=[names, range(4), range(4)])
expected = Dataset({'first': array, 'second': array})
actual = array.rename('first').to_dataset()
actual['second'] = array
self.assertDatasetIdentical(expected, actual)

def test_delitem(self):
data = create_test_data()
all_items = set(data)
Expand Down