From c12a6ea9d31e08b0798b8bea454c3335e774c5ef Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 17 Aug 2016 09:32:11 -0700 Subject: [PATCH] Consistently skip identical indexes in align Also give a more informative error message identifying the offending dimension if reindex/align cannot succeed due to duplicate values along an index. Fixes GH956 --- doc/whats-new.rst | 4 +++ xarray/core/alignment.py | 51 ++++++++++++++++++++----------------- xarray/core/coordinates.py | 3 +-- xarray/core/merge.py | 28 +++++--------------- xarray/test/test_dataset.py | 41 ++++++++++++++++++++++++----- 5 files changed, 74 insertions(+), 53 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 93280ec4bc7..051cdcb5a1b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -35,6 +35,8 @@ Enhancements using ``join=outer``. By `Guido Imperiale `_. - Better error message when assigning variables without dimensions (:issue:`971`). By `Stephan Hoyer `_. +- Better error message when reindex/align fails due to duplicate index values + (:issue:`956`). By `Stephan Hoyer `_. Bug fixes ~~~~~~~~~ @@ -53,6 +55,8 @@ Bug fixes By `Stephan Hoyer `_. - ``groupby_bins`` sorted bin labels as strings (:issue:`952`). By `Stephan Hoyer `_. +- Fix bug introduced by v0.8.0 that broke assignment to datasets when both the + left and right side have the same non-unique index values (:issue:`956`). .. _whats-new.0.8.1: diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index b1f221420b8..2cbe49d5de6 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -25,26 +25,6 @@ def _get_joiner(join): raise ValueError('invalid value for join: %s' % join) -def _get_all_indexes(objects, exclude=set()): - all_indexes = defaultdict(list) - for obj in objects: - for k, v in iteritems(obj.indexes): - if k not in exclude: - all_indexes[k].append(v) - return all_indexes - - -def _join_indexes(join, objects, exclude=set()): - joiner = _get_joiner(join) - indexes = _get_all_indexes(objects, exclude=exclude) - # exclude dimensions with all equal indices (the usual case) to avoid - # unnecessary reindexing work. - # TODO: don't bother to check equals for left or right joins - joined_indexes = dict((k, joiner(v)) for k, v in iteritems(indexes) - if any(not v[0].equals(idx) for idx in v[1:])) - return joined_indexes - - def align(*objects, **kwargs): """align(*objects, join='inner', copy=True) @@ -87,15 +67,36 @@ def align(*objects, **kwargs): copy = kwargs.pop('copy', True) indexes = kwargs.pop('indexes', None) exclude = kwargs.pop('exclude', None) + if indexes is None: + indexes = {} if exclude is None: exclude = set() if kwargs: raise TypeError('align() got unexpected keyword arguments: %s' % list(kwargs)) - joined_indexes = _join_indexes(join, objects, exclude=exclude) - if indexes is not None: - joined_indexes.update(indexes) + all_indexes = defaultdict(list) + for obj in objects: + for dim, index in iteritems(obj.indexes): + if dim not in exclude: + all_indexes[dim].append(index) + + # We don't join over dimensions with all equal indexes for two reasons: + # - It's faster for the usual case (already aligned objects). + # - It ensures it's possible to do operations that don't require alignment + # on indexes with duplicate values (which cannot be reindexed with + # pandas). This is useful, e.g., for overwriting such duplicate indexes. + joiner = _get_joiner(join) + joined_indexes = {} + for dim, dim_indexes in iteritems(all_indexes): + if dim in indexes: + index = utils.safe_cast_to_index(indexes[dim]) + if any(not index.equals(other) for other in dim_indexes): + joined_indexes[dim] = index + else: + if any(not dim_indexes[0].equals(other) + for other in dim_indexes[1:]): + joined_indexes[dim] = joiner(dim_indexes) result = [] for obj in objects: @@ -163,6 +164,10 @@ def reindex_variables(variables, indexes, indexers, method=None, to_shape[name] = index.size if name in indexers: target = utils.safe_cast_to_index(indexers[name]) + if not index.is_unique: + raise ValueError( + 'cannot reindex or align along dimension %r because the ' + 'index has duplicate values' % name) indexer = index.get_indexer(target, method=method, **get_indexer_kwargs) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 1ec231e12df..73c7a3a109c 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -63,8 +63,7 @@ def to_index(self, ordered_dims=None): def update(self, other): other_vars = getattr(other, 'variables', other) coords = merge_coords([self.variables, other_vars], - priority_arg=1, indexes=self.indexes, - indexes_from_arg=0) + priority_arg=1, indexes=self.indexes) self._update_coords(coords) def _merge_raw(self, other): diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 7c25de45535..28eb8ecea84 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -281,8 +281,7 @@ def merge_coords_without_align(objs, priority_vars=None): return variables -def _align_for_merge(input_objects, join, copy, indexes=None, - indexes_from_arg=None): +def _align_for_merge(input_objects, join, copy, indexes=None): """Align objects for merging, recursing into dictionary values. """ if indexes is None: @@ -314,14 +313,6 @@ def is_alignable(obj): targets.append(v) out.append(OrderedDict(variables)) - if not targets or (len(targets) == 1 and - (indexes is None or positions == [indexes_from_arg])): - # Don't align if we only have one object to align and it's already - # aligned to itself. This ensures it's possible to overwrite index - # coordinates with non-unique values, which cannot be reindexed: - # https://github.com/pydata/xarray/issues/943 - return input_objects - aligned = align(*targets, join=join, copy=copy, indexes=indexes) for position, key, aligned_obj in zip(positions, keys, aligned): @@ -366,7 +357,7 @@ def _get_priority_vars(objects, priority_arg, compat='equals'): def merge_coords(objs, compat='minimal', join='outer', priority_arg=None, - indexes=None, indexes_from_arg=False): + indexes=None): """Merge coordinate variables. See merge_core below for argument descriptions. This works similarly to @@ -375,8 +366,7 @@ def merge_coords(objs, compat='minimal', join='outer', priority_arg=None, """ _assert_compat_valid(compat) coerced = coerce_pandas_values(objs) - aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes, - indexes_from_arg=indexes_from_arg) + aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes) expanded = expand_variable_dicts(aligned) priority_vars = _get_priority_vars(aligned, priority_arg, compat=compat) variables = merge_variables(expanded, priority_vars, compat=compat) @@ -393,7 +383,7 @@ def merge_data_and_coords(data, coords, compat='broadcast_equals', def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None, - explicit_coords=None, indexes=None, indexes_from_arg=False): + explicit_coords=None, indexes=None): """Core logic for merging labeled objects. This is not public API. @@ -412,10 +402,6 @@ def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None, An explicit list of variables from `objs` that are coordinates. indexes : dict, optional Dictionary with values given by pandas.Index objects. - indexes_from_arg : boolean, optional - If indexes were provided, were these taken from one of the objects in - ``objs``? This allows us to skip alignment if this object is the only - one to be aligned. Returns ------- @@ -435,8 +421,7 @@ def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None, _assert_compat_valid(compat) coerced = coerce_pandas_values(objs) - aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes, - indexes_from_arg=indexes_from_arg) + aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes) expanded = expand_variable_dicts(aligned) coord_names, noncoord_names = determine_coords(coerced) @@ -552,5 +537,4 @@ def dataset_merge_method(dataset, other, overwrite_vars=frozenset(), def dataset_update_method(dataset, other): """Guts of the Dataset.update method""" - return merge_core([dataset, other], priority_arg=1, indexes=dataset.indexes, - indexes_from_arg=0) + return merge_core([dataset, other], priority_arg=1, indexes=dataset.indexes) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 0067996634d..9b2c6c1e22d 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -1066,12 +1066,20 @@ def test_align(self): align(left, right, foo='bar') def test_align_exclude(self): - x = Dataset({'foo': DataArray([[1, 2],[3, 4]], dims=['x', 'y'], coords={'x': [1, 2], 'y': [3, 4]})}) - y = Dataset({'bar': DataArray([[1, 2],[3, 4]], dims=['x', 'y'], coords={'x': [1, 3], 'y': [5, 6]})}) + x = Dataset({'foo': DataArray([[1, 2],[3, 4]], dims=['x', 'y'], + coords={'x': [1, 2], 'y': [3, 4]})}) + y = Dataset({'bar': DataArray([[1, 2],[3, 4]], dims=['x', 'y'], + coords={'x': [1, 3], 'y': [5, 6]})}) x2, y2 = align(x, y, exclude=['y'], join='outer') - expected_x2 = Dataset({'foo': DataArray([[1, 2], [3, 4], [np.nan, np.nan]], dims=['x', 'y'], coords={'x': [1, 2, 3], 'y': [3, 4]})}) - expected_y2 = Dataset({'bar': DataArray([[1, 2], [np.nan, np.nan], [3, 4]], dims=['x', 'y'], coords={'x': [1, 2, 3], 'y': [5, 6]})}) + expected_x2 = Dataset( + {'foo': DataArray([[1, 2], [3, 4], [np.nan, np.nan]], + dims=['x', 'y'], + coords={'x': [1, 2, 3], 'y': [3, 4]})}) + expected_y2 = Dataset( + {'bar': DataArray([[1, 2], [np.nan, np.nan], [3, 4]], + dims=['x', 'y'], + coords={'x': [1, 2, 3], 'y': [5, 6]})}) self.assertDatasetIdentical(expected_x2, x2) self.assertDatasetIdentical(expected_y2, y2) @@ -1079,7 +1087,8 @@ def test_align_nocopy(self): x = Dataset({'foo': DataArray([1, 2, 3], coords={'x': [1, 2, 3]})}) y = Dataset({'foo': DataArray([1, 2], coords={'x': [1, 2]})}) expected_x2 = x - expected_y2 = Dataset({'foo': DataArray([1, 2, np.nan], coords={'x': [1, 2, 3]})}) + expected_y2 = Dataset({'foo': DataArray([1, 2, np.nan], + coords={'x': [1, 2, 3]})}) x2, y2 = align(x, y, copy=False, join='outer') self.assertDatasetIdentical(expected_x2, x2) @@ -1097,6 +1106,15 @@ def test_align_indexes(self): expected_x2 = Dataset({'foo': DataArray([2, 3, 1], coords={'x': [2, 3, 1]})}) self.assertDatasetIdentical(expected_x2, x2) + def test_align_non_unique(self): + x = Dataset({'foo': ('x', [3, 4, 5]), 'x': [0, 0, 1]}) + x1, x2 = align(x, x) + assert x1.identical(x) and x2.identical(x) + + y = Dataset({'bar': ('x', [6, 7]), 'x': [0, 1]}) + with self.assertRaisesRegexp(ValueError, 'cannot reindex or align'): + align(x, y) + def test_broadcast(self): ds = Dataset({'foo': 0, 'bar': ('x', [1]), 'baz': ('y', [2, 3])}, {'c': ('x', [4])}) @@ -1573,7 +1591,7 @@ def test_assign(self): expected = expected.set_coords('c') self.assertDatasetIdentical(actual, expected) - def test_setitem_non_unique_index(self): + def test_setitem_original_non_unique_index(self): # regression test for GH943 original = Dataset({'data': ('x', np.arange(5))}, coords={'x': [0, 1, 2, 0, 1]}) @@ -1591,6 +1609,17 @@ def test_setitem_non_unique_index(self): actual.coords['x'] = list(range(5)) self.assertDatasetIdentical(actual, expected) + def test_setitem_both_non_unique_index(self): + # regression test for GH956 + names = ['joaquin', 'manolo', 'joaquin'] + values = np.random.randint(0, 256, (3, 4, 4)) + array = DataArray(values, dims=['name', 'row', 'column'], + coords=[names, range(4), range(4)]) + expected = Dataset({'first': array, 'second': array}) + actual = array.rename('first').to_dataset() + actual['second'] = array + self.assertDatasetIdentical(expected, actual) + def test_delitem(self): data = create_test_data() all_items = set(data)