combine_first by using apply_ufunc in ops.fillna (#1204)
* First commit of ops.fillna() using apply_ufunc().

* Saving preliminary changes to fillna using apply_ufunc; add set-attr test.

* _yield_applied for GroupBy.fillna().  Inspired by _binary_op().

* Finished introducing combine_first to DataArray/Dataset objects.  Remove _fillna from injection.

* Minor spacing changes to doc.

* Use np.where() in ops.fillna

* Separating data_vars_join from join in apply_ufunc, and accessing it through ops.fillna.  Also added docstring.

* Rewrote GroupBy's fillna, among other review comments.

* Initial attempt to introduce keep_attrs to apply_ufunc.

* Adding test for data_vars_join, also removing join kwarg in fillna of dataarray.py and dataset.py.

* Bringing keep_attrs from apply_ufunc to apply_dataset_ufunc and apply_dataarray_ufunc.

* Forcing explicit specification for dataset_fill_value.

* Added test for keep_attrs.  Changed kwarg name from data_vars_join to dataset_join.

* Updated docstrings.  Moving changes to v0.9.0

* Adjust dataset_fill_value default in apply_ufunc signature
chunweiyuan authored and shoyer committed Jan 23, 2017
1 parent 4bb630f commit c5146e8
Showing 11 changed files with 298 additions and 52 deletions.
35 changes: 35 additions & 0 deletions doc/combining.rst
@@ -13,6 +13,7 @@ Combining data
* For combining datasets or data arrays along a dimension, see concatenate_.
* For combining datasets with different variables, see merge_.
* For combining datasets or data arrays with different indexes or missing values, see combine_.

.. _concatenate:

@@ -116,6 +117,40 @@ used in the :py:class:`~xarray.Dataset` constructor:
    xr.Dataset({'a': arr[:-1], 'b': arr[1:]})

.. _combine:

Combine
~~~~~~~

The instance method ``combine_first`` combines two datasets/data arrays and
defaults to the non-null values in the calling object, using values from the
passed object to fill holes. The resulting coordinates are the union of the
coordinate labels. Cells left vacant by the outer join are filled with
``np.nan``.

This mimics the behavior of ``pandas.DataFrame.combine_first``.

For data arrays:

.. ipython:: python

    ar0 = DataArray([[0, 0], [0, 0]], [('x', ['a', 'b']), ('y', [-1, 0])])
    ar1 = DataArray([[1, 1], [1, 1]], [('x', ['b', 'c']), ('y', [0, 1])])
    ar2 = DataArray([2], [('x', ['d'])])
    ar0.combine_first(ar1)
    ar1.combine_first(ar0)
    ar0.combine_first(ar2)

For datasets, ``ds0.combine_first(ds1)`` works similarly to
``xr.merge([ds0, ds1])``, except that ``xr.merge`` raises a ``MergeError`` if
the data variables being merged have conflicting values, whereas
``combine_first`` defaults to the values of the calling object.

.. ipython:: python

    ds0 = Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})
    ds1 = Dataset({'a': ('x', [99, 3]), 'x': [1, 2]})
    ds0.combine_first(ds1)
    xr.merge([ds0, ds1])

.. _update:

Update
7 changes: 7 additions & 0 deletions doc/whats-new.rst
@@ -128,6 +128,13 @@ Deprecations

Enhancements
~~~~~~~~~~~~

- Added the xarray equivalent of ``pandas.DataFrame.combine_first`` as an
  instance method to DataArray/Dataset objects, facilitated by the new
  ``ops.fillna`` with ``join`` and ``dataset_join`` options
  (see :ref:`combine`).
  By `Chun-Wei Yuan <https://github.com/chunweiyuan>`_.

- Added the ability to change default automatic alignment (arithmetic_join="inner")
for binary operations via :py:func:`~xarray.set_options()`
(see :ref:`automatic alignment`).
87 changes: 68 additions & 19 deletions xarray/core/computation.py
@@ -8,6 +8,8 @@
import operator
import re

import numpy as np

from . import ops
from .alignment import deep_align
from .merge import expand_and_merge_variables
@@ -16,6 +18,8 @@


_DEFAULT_FROZEN_SET = frozenset()
_DEFAULT_FILL_VALUE = object()


# see http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
DIMENSION_NAME = r'\w+'
@@ -202,6 +206,7 @@ def apply_dataarray_ufunc(func, *args, **kwargs):
signature = kwargs.pop('signature')
join = kwargs.pop('join', 'inner')
exclude_dims = kwargs.pop('exclude_dims', _DEFAULT_FROZEN_SET)
keep_attrs = kwargs.pop('keep_attrs', False)
if kwargs:
raise TypeError('apply_dataarray_ufunc() got unexpected keyword '
'arguments: %s' % list(kwargs))
@@ -217,12 +222,18 @@ def apply_dataarray_ufunc(func, *args, **kwargs):
result_var = func(*data_vars)

if signature.n_outputs > 1:
return tuple(DataArray(variable, coords, name=name, fastpath=True)
for variable, coords in zip(result_var, result_coords))
out = tuple(DataArray(variable, coords, name=name, fastpath=True)
for variable, coords in zip(result_var, result_coords))
else:
coords, = result_coords
return DataArray(result_var, coords, name=name, fastpath=True)
out = DataArray(result_var, coords, name=name, fastpath=True)

if keep_attrs and isinstance(args[0], DataArray):
if isinstance(out, tuple):
for da in out:
    da._copy_attrs_from(args[0])  # copies attrs in place; returns None
else:
out._copy_attrs_from(args[0])
return out

def ordered_set_union(all_keys):
# type: List[Iterable] -> Iterable
@@ -326,32 +337,53 @@ def _fast_dataset(variables, coord_variables):

def apply_dataset_ufunc(func, *args, **kwargs):
"""apply_dataset_ufunc(func, *args, signature, join='inner',
fill_value=None, exclude_dims=frozenset()):
dataset_join='inner', fill_value=None,
exclude_dims=frozenset(), keep_attrs=False):
If dataset_join != 'inner', a non-default fill_value must be supplied
by the user. Otherwise a TypeError is raised.
"""
from .dataset import Dataset
signature = kwargs.pop('signature')
join = kwargs.pop('join', 'inner')
dataset_join = kwargs.pop('dataset_join', 'inner')
fill_value = kwargs.pop('fill_value', None)
exclude_dims = kwargs.pop('exclude_dims', _DEFAULT_FROZEN_SET)
keep_attrs = kwargs.pop('keep_attrs', False)
first_obj = args[0] # we'll copy attrs from this in case keep_attrs=True

if dataset_join != 'inner' and fill_value is _DEFAULT_FILL_VALUE:
raise TypeError('To apply an operation to datasets with different '
                'data variables, you must supply the '
                'dataset_fill_value argument.')

if kwargs:
raise TypeError('apply_dataset_ufunc() got unexpected keyword '
'arguments: %s' % list(kwargs))

if len(args) > 1:
args = deep_align(args, join=join, copy=False, exclude=exclude_dims,
raise_on_invalid=False)

list_of_coords = build_output_coords(args, signature, exclude_dims)

args = [getattr(arg, 'data_vars', arg) for arg in args]

result_vars = apply_dict_of_variables_ufunc(
func, *args, signature=signature, join=join, fill_value=fill_value)
func, *args, signature=signature, join=dataset_join,
fill_value=fill_value)

if signature.n_outputs > 1:
return tuple(_fast_dataset(*args)
for args in zip(result_vars, list_of_coords))
out = tuple(_fast_dataset(*args)
for args in zip(result_vars, list_of_coords))
else:
coord_vars, = list_of_coords
return _fast_dataset(result_vars, coord_vars)
out = _fast_dataset(result_vars, coord_vars)

if keep_attrs and isinstance(first_obj, Dataset):
if isinstance(out, tuple):
for ds in out:
    ds._copy_attrs_from(first_obj)  # copies attrs in place; returns None
else:
out._copy_attrs_from(first_obj)
return out


def _iter_over_selections(obj, dim, values):
@@ -530,7 +562,8 @@ def apply_array_ufunc(func, *args, **kwargs):

def apply_ufunc(func, *args, **kwargs):
"""apply_ufunc(func, *args, signature=None, join='inner',
exclude_dims=frozenset(), dataset_fill_value=None,
exclude_dims=frozenset(), dataset_join='inner',
dataset_fill_value=_DEFAULT_FILL_VALUE, keep_attrs=False,
kwargs=None, dask_array='forbidden')
Apply a vectorized function for unlabeled arrays to xarray objects.
@@ -581,14 +614,23 @@ def apply_ufunc(func, *args, **kwargs):
- 'inner': use the intersection of object indexes
- 'left': use indexes from the first object with each dimension
- 'right': use indexes from the last object with each dimension
dataset_join : {'outer', 'inner', 'left', 'right'}, optional
Method for joining variables of Dataset objects with mismatched
data variables.
- 'outer': take variables from both Dataset objects
- 'inner': take only overlapping variables
- 'left': take only variables from the first object
- 'right': take only variables from the last object
dataset_fill_value : optional
Value used in place of missing variables on Dataset inputs when the
datasets do not share the exact same ``data_vars``. Required if
``dataset_join != 'inner'``, otherwise ignored.
keep_attrs : bool, optional
Whether to copy attributes from the first argument to the output.
exclude_dims : set, optional
Dimensions to exclude from alignment and broadcasting. Any input
coordinates along these dimensions will be dropped. Each excluded
dimension must be a core dimension in the function signature.
dataset_fill_value : optional
Value used in place of missing variables on Dataset inputs when the
datasets do not share the exact same ``data_vars``. Only relevant if
``join != 'inner'``.
kwargs: dict, optional
Optional keyword arguments passed directly on to ``func``.
dask_array: 'forbidden' or 'allowed', optional
@@ -664,8 +706,10 @@ def stack(objects, dim, new_coord):

signature = kwargs.pop('signature', None)
join = kwargs.pop('join', 'inner')
dataset_join = kwargs.pop('dataset_join', 'inner')
keep_attrs = kwargs.pop('keep_attrs', False)
exclude_dims = kwargs.pop('exclude_dims', frozenset())
dataset_fill_value = kwargs.pop('dataset_fill_value', None)
dataset_fill_value = kwargs.pop('dataset_fill_value', _DEFAULT_FILL_VALUE)
kwargs_ = kwargs.pop('kwargs', None)
dask_array = kwargs.pop('dask_array', 'forbidden')
if kwargs:
@@ -697,16 +741,21 @@ def stack(objects, dim, new_coord):
this_apply = functools.partial(
apply_ufunc, func, signature=signature, join=join,
dask_array=dask_array, exclude_dims=exclude_dims,
dataset_fill_value=dataset_fill_value)
dataset_fill_value=dataset_fill_value,
dataset_join=dataset_join,
keep_attrs=keep_attrs)
return apply_groupby_ufunc(this_apply, *args)
elif any(is_dict_like(a) for a in args):
return apply_dataset_ufunc(variables_ufunc, *args, signature=signature,
join=join, exclude_dims=exclude_dims,
fill_value=dataset_fill_value)
fill_value=dataset_fill_value,
dataset_join=dataset_join,
keep_attrs=keep_attrs)
elif any(isinstance(a, DataArray) for a in args):
return apply_dataarray_ufunc(variables_ufunc, *args,
signature=signature,
join=join, exclude_dims=exclude_dims)
join=join, exclude_dims=exclude_dims,
keep_attrs=keep_attrs)
elif any(isinstance(a, Variable) for a in args):
return variables_ufunc(*args)
else:
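Note that ops.fillna itself sits in one of the changed files not shown on this page. Going only by the commit messages ("Use np.where() in ops.fillna") and by the combine_first call sites in the dataarray.py and dataset.py diffs below, here is a minimal sketch of how the new apply_ufunc keywords are presumably wired together; the helper name, signature defaults, and null test are assumptions, not the actual implementation:

    import numpy as np

    from xarray.core.computation import apply_ufunc  # private module at this commit


    def fillna(data, other, join='left', dataset_join='left'):
        # Keep values of `data` where they are non-null, otherwise take the
        # aligned values of `other` (np.where, per the commit message).
        def _fillna(x, y):
            return np.where(np.isnan(x), y, x)

        return apply_ufunc(_fillna, data, other,
                           join=join,                  # index alignment; 'outer' for combine_first
                           dataset_join=dataset_join,  # how mismatched data_vars are joined
                           dataset_fill_value=np.nan,  # required whenever dataset_join != 'inner'
                           keep_attrs=True)            # copy attrs from the first argument

DataArray.combine_first and Dataset.combine_first below then simply call this with join="outer" (and, for datasets, dataset_join="outer").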
21 changes: 19 additions & 2 deletions xarray/core/dataarray.py
@@ -1097,10 +1097,27 @@ def fillna(self, value):
if utils.is_dict_like(value):
raise TypeError('cannot provide fill value as a dictionary with '
'fillna on a DataArray')
out = self._fillna(value)
out.attrs = self.attrs
out = ops.fillna(self, value)
return out

def combine_first(self, other):
"""Combine two DataArray objects, with union of coordinates.
This operation follows the normal broadcasting and alignment rules of
``join='outer'``. Default to non-null values of array calling the
method. Use np.nan to fill in vacant cells after alignment.
Parameters
----------
other : DataArray
Used to fill all matching missing values in this array.
Returns
-------
DataArray
"""
return ops.fillna(self, other, join="outer")

def reduce(self, func, dim=None, axis=None, keep_attrs=False, **kwargs):
"""Reduce this array by applying `func` along some dimension(s).
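A short usage sketch of the two DataArray methods touched above, assuming (as the commit messages indicate) that ops.fillna passes keep_attrs=True, so attributes survive without the explicit out.attrs = self.attrs line this diff removes:

    import numpy as np
    import xarray as xr

    da = xr.DataArray([1.0, np.nan, 3.0], dims='x',
                      coords={'x': [0, 1, 2]}, attrs={'units': 'm'})
    other = xr.DataArray([10.0, 20.0, 30.0, 40.0], dims='x',
                         coords={'x': [1, 2, 3, 4]})

    filled = da.fillna(0.0)             # [1., 0., 3.]; attrs expected to carry over
    combined = da.combine_first(other)  # outer join on x -> labels [0, 1, 2, 3, 4]
    # da's non-null values win wherever both objects have data, other fills the
    # holes, and labels covered by neither object would be left as np.nan.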
48 changes: 32 additions & 16 deletions xarray/core/dataset.py
@@ -2001,8 +2001,31 @@ def fillna(self, value):
-------
Dataset
"""
out = self._fillna(value)
out._copy_attrs_from(self)
if utils.is_dict_like(value):
value_keys = getattr(value, 'data_vars', value).keys()
if not set(value_keys) <= set(self.data_vars.keys()):
raise ValueError('all variables in the argument to `fillna` '
'must be contained in the original dataset')
out = ops.fillna(self, value)
return out

def combine_first(self, other):
"""Combine two Datasets, default to data_vars of self.
The new coordinates follow the normal broadcasting and alignment rules
of ``join='outer'``. Vacant cells in the expanded coordinates are
filled with np.nan.
Parameters
----------
other : DataArray
Used to fill all matching missing values in this array.
Returns
-------
DataArray
"""
out = ops.fillna(self, other, join="outer", dataset_join="outer")
return out

def reduce(self, func, dim=None, keep_attrs=False, numeric_only=False,
@@ -2335,7 +2358,7 @@ def func(self, *args, **kwargs):
return func

@staticmethod
def _binary_op(f, reflexive=False, join=None, fillna=False):
def _binary_op(f, reflexive=False, join=None):
@functools.wraps(f)
def func(self, other):
if isinstance(other, groupby.GroupBy):
@@ -2344,8 +2367,7 @@ def func(self, other):
if hasattr(other, 'indexes'):
self, other = align(self, other, join=align_type, copy=False)
g = f if not reflexive else lambda x, y: f(y, x)
ds = self._calculate_binary_op(g, other, join=align_type,
fillna=fillna)
ds = self._calculate_binary_op(g, other, join=align_type)
return ds

return func
@@ -2370,14 +2392,9 @@ def func(self, other):
return func

def _calculate_binary_op(self, f, other, join='inner',
inplace=False, fillna=False):
inplace=False):

def apply_over_both(lhs_data_vars, rhs_data_vars, lhs_vars, rhs_vars):
if fillna and join != 'left':
raise ValueError('`fillna` must be accompanied by left join')
if fillna and not set(rhs_data_vars) <= set(lhs_data_vars):
raise ValueError('all variables in the argument to `fillna` '
'must be contained in the original dataset')
if inplace and set(lhs_data_vars) != set(rhs_data_vars):
raise ValueError('datasets must have the same data variables '
'for in-place arithmetic operations: %s, %s'
@@ -2389,12 +2406,10 @@ def apply_over_both(lhs_data_vars, rhs_data_vars, lhs_vars, rhs_vars):
if k in rhs_data_vars:
dest_vars[k] = f(lhs_vars[k], rhs_vars[k])
elif join in ["left", "outer"]:
dest_vars[k] = (lhs_vars[k] if fillna else
f(lhs_vars[k], np.nan))
dest_vars[k] = f(lhs_vars[k], np.nan)
for k in rhs_data_vars:
if k not in dest_vars and join in ["right", "outer"]:
dest_vars[k] = (rhs_vars[k] if fillna else
f(rhs_vars[k], np.nan))
dest_vars[k] = f(rhs_vars[k], np.nan)
return dest_vars

if utils.is_dict_like(other) and not isinstance(other, Dataset):
@@ -2421,7 +2436,8 @@ def apply_over_both(lhs_data_vars, rhs_data_vars, lhs_vars, rhs_vars):
def _copy_attrs_from(self, other):
self.attrs = other.attrs
for v in other.variables:
self.variables[v].attrs = other.variables[v].attrs
if v in self.variables:
self.variables[v].attrs = other.variables[v].attrs

def diff(self, dim, n=1, label='upper'):
"""Calculate the n-th order discrete difference along given axis.
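For illustration, a sketch of the new up-front guard in Dataset.fillna (the check that previously lived inside _calculate_binary_op behind the fillna=True flag): a dict or Dataset whose keys are not a subset of the data variables should now raise ValueError before any computation happens.

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({'a': ('x', [np.nan, 2.0])}, coords={'x': [0, 1]})

    ds.fillna({'a': -1.0})                  # fine: 'a' is a data variable of ds
    try:
        ds.fillna({'a': -1.0, 'b': 0.0})    # 'b' is not a data variable of ds
    except ValueError as err:
        print(err)  # "all variables in the argument to `fillna` must be contained ..."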
3 changes: 2 additions & 1 deletion xarray/core/groupby.py
@@ -382,7 +382,8 @@ def fillna(self, value):
Dataset.fillna
DataArray.fillna
"""
return self._fillna(value)
out = ops.fillna(self, value)
return out

def where(self, cond):
"""Return an object of the same shape with all entries where cond is
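GroupBy.fillna now routes through the same ops.fillna. A hedged usage sketch of per-group filling; the fill-with-group-means pattern is only an illustration and is not taken from this diff:

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({'a': ('x', [np.nan, 1.0, np.nan, 3.0])},
                    coords={'x': [0, 1, 2, 3],
                            'group': ('x', ['p', 'p', 'q', 'q'])})

    # Fill each group's missing values with that group's mean;
    # 'a' becomes [1., 1., 3., 3.].
    group_means = ds.groupby('group').mean()
    filled = ds.groupby('group').fillna(group_means)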
