Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ Enhancements
using ``join=outer``. By `Guido Imperiale <https://github.com/crusaderky>`_.
- Better error message when assigning variables without dimensions
(:issue:`971`). By `Stephan Hoyer <https://github.com/shoyer>`_.
- Better error message when reindex/align fails due to duplicate index values
(:issue:`956`). By `Stephan Hoyer <https://github.com/shoyer>`_.

Bug fixes
~~~~~~~~~
Expand All @@ -53,6 +55,8 @@ Bug fixes
By `Stephan Hoyer <https://github.com/shoyer>`_.
- ``groupby_bins`` sorted bin labels as strings (:issue:`952`).
By `Stephan Hoyer <https://github.com/shoyer>`_.
- Fix a bug introduced in v0.8.0 that broke assignment to datasets when both
  the left- and right-hand sides have the same non-unique index values
  (:issue:`956`).

.. _whats-new.0.8.1:

Expand Down
51 changes: 28 additions & 23 deletions xarray/core/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,26 +25,6 @@ def _get_joiner(join):
raise ValueError('invalid value for join: %s' % join)


def _get_all_indexes(objects, exclude=frozenset()):
    """Collect indexes from every object, grouped by dimension name.

    Parameters
    ----------
    objects : iterable
        Objects exposing an ``indexes`` mapping of dimension name ->
        pandas.Index.
    exclude : set-like, optional
        Dimension names to skip.

    Returns
    -------
    defaultdict
        Maps each dimension name to the list of indexes found for it,
        in object order.
    """
    # frozenset() rather than the original mutable default set(): an
    # immutable default avoids the shared-mutable-default pitfall.
    all_indexes = defaultdict(list)
    for obj in objects:
        for k, v in iteritems(obj.indexes):
            if k not in exclude:
                all_indexes[k].append(v)
    return all_indexes


def _join_indexes(join, objects, exclude=frozenset()):
    """Join the indexes of ``objects`` along each shared dimension.

    Parameters
    ----------
    join : str
        Join method understood by ``_get_joiner`` (e.g. 'inner', 'outer').
    objects : iterable
        Objects exposing an ``indexes`` mapping.
    exclude : set-like, optional
        Dimension names to skip.

    Returns
    -------
    dict
        Maps dimension name -> joined pandas.Index, only for dimensions
        whose indexes actually differ between objects.
    """
    joiner = _get_joiner(join)
    indexes = _get_all_indexes(objects, exclude=exclude)
    # exclude dimensions with all equal indices (the usual case) to avoid
    # unnecessary reindexing work.
    # TODO: don't bother to check equals for left or right joins
    joined_indexes = dict((k, joiner(v)) for k, v in iteritems(indexes)
                          if any(not v[0].equals(idx) for idx in v[1:]))
    return joined_indexes


def align(*objects, **kwargs):
"""align(*objects, join='inner', copy=True)

Expand Down Expand Up @@ -87,15 +67,36 @@ def align(*objects, **kwargs):
copy = kwargs.pop('copy', True)
indexes = kwargs.pop('indexes', None)
exclude = kwargs.pop('exclude', None)
if indexes is None:
indexes = {}
if exclude is None:
exclude = set()
if kwargs:
raise TypeError('align() got unexpected keyword arguments: %s'
% list(kwargs))

joined_indexes = _join_indexes(join, objects, exclude=exclude)
if indexes is not None:
joined_indexes.update(indexes)
all_indexes = defaultdict(list)
for obj in objects:
for dim, index in iteritems(obj.indexes):
if dim not in exclude:
all_indexes[dim].append(index)

# We don't join over dimensions with all equal indexes for two reasons:
# - It's faster for the usual case (already aligned objects).
# - It ensures it's possible to do operations that don't require alignment
# on indexes with duplicate values (which cannot be reindexed with
# pandas). This is useful, e.g., for overwriting such duplicate indexes.
joiner = _get_joiner(join)
joined_indexes = {}
for dim, dim_indexes in iteritems(all_indexes):
if dim in indexes:
index = utils.safe_cast_to_index(indexes[dim])
if any(not index.equals(other) for other in dim_indexes):
joined_indexes[dim] = index
else:
if any(not dim_indexes[0].equals(other)
for other in dim_indexes[1:]):
joined_indexes[dim] = joiner(dim_indexes)

result = []
for obj in objects:
Expand Down Expand Up @@ -163,6 +164,10 @@ def reindex_variables(variables, indexes, indexers, method=None,
to_shape[name] = index.size
if name in indexers:
target = utils.safe_cast_to_index(indexers[name])
if not index.is_unique:
raise ValueError(
'cannot reindex or align along dimension %r because the '
'index has duplicate values' % name)
indexer = index.get_indexer(target, method=method,
**get_indexer_kwargs)

Expand Down
3 changes: 1 addition & 2 deletions xarray/core/coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,7 @@ def to_index(self, ordered_dims=None):
def update(self, other):
other_vars = getattr(other, 'variables', other)
coords = merge_coords([self.variables, other_vars],
priority_arg=1, indexes=self.indexes,
indexes_from_arg=0)
priority_arg=1, indexes=self.indexes)
self._update_coords(coords)

def _merge_raw(self, other):
Expand Down
28 changes: 6 additions & 22 deletions xarray/core/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,8 +281,7 @@ def merge_coords_without_align(objs, priority_vars=None):
return variables


def _align_for_merge(input_objects, join, copy, indexes=None,
indexes_from_arg=None):
def _align_for_merge(input_objects, join, copy, indexes=None):
"""Align objects for merging, recursing into dictionary values.
"""
if indexes is None:
Expand Down Expand Up @@ -314,14 +313,6 @@ def is_alignable(obj):
targets.append(v)
out.append(OrderedDict(variables))

if not targets or (len(targets) == 1 and
(indexes is None or positions == [indexes_from_arg])):
# Don't align if we only have one object to align and it's already
# aligned to itself. This ensures it's possible to overwrite index
# coordinates with non-unique values, which cannot be reindexed:
# https://github.com/pydata/xarray/issues/943
return input_objects

aligned = align(*targets, join=join, copy=copy, indexes=indexes)

for position, key, aligned_obj in zip(positions, keys, aligned):
Expand Down Expand Up @@ -366,7 +357,7 @@ def _get_priority_vars(objects, priority_arg, compat='equals'):


def merge_coords(objs, compat='minimal', join='outer', priority_arg=None,
indexes=None, indexes_from_arg=False):
indexes=None):
"""Merge coordinate variables.

See merge_core below for argument descriptions. This works similarly to
Expand All @@ -375,8 +366,7 @@ def merge_coords(objs, compat='minimal', join='outer', priority_arg=None,
"""
_assert_compat_valid(compat)
coerced = coerce_pandas_values(objs)
aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes,
indexes_from_arg=indexes_from_arg)
aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes)
expanded = expand_variable_dicts(aligned)
priority_vars = _get_priority_vars(aligned, priority_arg, compat=compat)
variables = merge_variables(expanded, priority_vars, compat=compat)
Expand All @@ -393,7 +383,7 @@ def merge_data_and_coords(data, coords, compat='broadcast_equals',


def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None,
explicit_coords=None, indexes=None, indexes_from_arg=False):
explicit_coords=None, indexes=None):
"""Core logic for merging labeled objects.

This is not public API.
Expand All @@ -412,10 +402,6 @@ def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None,
An explicit list of variables from `objs` that are coordinates.
indexes : dict, optional
Dictionary with values given by pandas.Index objects.
indexes_from_arg : boolean, optional
If indexes were provided, were these taken from one of the objects in
``objs``? This allows us to skip alignment if this object is the only
one to be aligned.

Returns
-------
Expand All @@ -435,8 +421,7 @@ def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None,
_assert_compat_valid(compat)

coerced = coerce_pandas_values(objs)
aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes,
indexes_from_arg=indexes_from_arg)
aligned = _align_for_merge(coerced, join=join, copy=False, indexes=indexes)
expanded = expand_variable_dicts(aligned)

coord_names, noncoord_names = determine_coords(coerced)
Expand Down Expand Up @@ -552,5 +537,4 @@ def dataset_merge_method(dataset, other, overwrite_vars=frozenset(),

def dataset_update_method(dataset, other):
"""Guts of the Dataset.update method"""
return merge_core([dataset, other], priority_arg=1, indexes=dataset.indexes,
indexes_from_arg=0)
return merge_core([dataset, other], priority_arg=1, indexes=dataset.indexes)
41 changes: 35 additions & 6 deletions xarray/test/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1066,20 +1066,29 @@ def test_align(self):
align(left, right, foo='bar')

def test_align_exclude(self):
x = Dataset({'foo': DataArray([[1, 2],[3, 4]], dims=['x', 'y'], coords={'x': [1, 2], 'y': [3, 4]})})
y = Dataset({'bar': DataArray([[1, 2],[3, 4]], dims=['x', 'y'], coords={'x': [1, 3], 'y': [5, 6]})})
x = Dataset({'foo': DataArray([[1, 2],[3, 4]], dims=['x', 'y'],
coords={'x': [1, 2], 'y': [3, 4]})})
y = Dataset({'bar': DataArray([[1, 2],[3, 4]], dims=['x', 'y'],
coords={'x': [1, 3], 'y': [5, 6]})})
x2, y2 = align(x, y, exclude=['y'], join='outer')

expected_x2 = Dataset({'foo': DataArray([[1, 2], [3, 4], [np.nan, np.nan]], dims=['x', 'y'], coords={'x': [1, 2, 3], 'y': [3, 4]})})
expected_y2 = Dataset({'bar': DataArray([[1, 2], [np.nan, np.nan], [3, 4]], dims=['x', 'y'], coords={'x': [1, 2, 3], 'y': [5, 6]})})
expected_x2 = Dataset(
{'foo': DataArray([[1, 2], [3, 4], [np.nan, np.nan]],
dims=['x', 'y'],
coords={'x': [1, 2, 3], 'y': [3, 4]})})
expected_y2 = Dataset(
{'bar': DataArray([[1, 2], [np.nan, np.nan], [3, 4]],
dims=['x', 'y'],
coords={'x': [1, 2, 3], 'y': [5, 6]})})
self.assertDatasetIdentical(expected_x2, x2)
self.assertDatasetIdentical(expected_y2, y2)

def test_align_nocopy(self):
x = Dataset({'foo': DataArray([1, 2, 3], coords={'x': [1, 2, 3]})})
y = Dataset({'foo': DataArray([1, 2], coords={'x': [1, 2]})})
expected_x2 = x
expected_y2 = Dataset({'foo': DataArray([1, 2, np.nan], coords={'x': [1, 2, 3]})})
expected_y2 = Dataset({'foo': DataArray([1, 2, np.nan],
coords={'x': [1, 2, 3]})})

x2, y2 = align(x, y, copy=False, join='outer')
self.assertDatasetIdentical(expected_x2, x2)
Expand All @@ -1097,6 +1106,15 @@ def test_align_indexes(self):
expected_x2 = Dataset({'foo': DataArray([2, 3, 1], coords={'x': [2, 3, 1]})})
self.assertDatasetIdentical(expected_x2, x2)

def test_align_non_unique(self):
x = Dataset({'foo': ('x', [3, 4, 5]), 'x': [0, 0, 1]})
x1, x2 = align(x, x)
assert x1.identical(x) and x2.identical(x)

y = Dataset({'bar': ('x', [6, 7]), 'x': [0, 1]})
with self.assertRaisesRegexp(ValueError, 'cannot reindex or align'):
align(x, y)

def test_broadcast(self):
ds = Dataset({'foo': 0, 'bar': ('x', [1]), 'baz': ('y', [2, 3])},
{'c': ('x', [4])})
Expand Down Expand Up @@ -1573,7 +1591,7 @@ def test_assign(self):
expected = expected.set_coords('c')
self.assertDatasetIdentical(actual, expected)

def test_setitem_non_unique_index(self):
def test_setitem_original_non_unique_index(self):
# regression test for GH943
original = Dataset({'data': ('x', np.arange(5))},
coords={'x': [0, 1, 2, 0, 1]})
Expand All @@ -1591,6 +1609,17 @@ def test_setitem_non_unique_index(self):
actual.coords['x'] = list(range(5))
self.assertDatasetIdentical(actual, expected)

def test_setitem_both_non_unique_index(self):
# regression test for GH956
names = ['joaquin', 'manolo', 'joaquin']
values = np.random.randint(0, 256, (3, 4, 4))
array = DataArray(values, dims=['name', 'row', 'column'],
coords=[names, range(4), range(4)])
expected = Dataset({'first': array, 'second': array})
actual = array.rename('first').to_dataset()
actual['second'] = array
self.assertDatasetIdentical(expected, actual)

def test_delitem(self):
data = create_test_data()
all_items = set(data)
Expand Down