combine_first by using apply_ufunc in ops.fillna (#1204)
* First commit of ops.fillna() using apply_ufunc().

* Saving preliminary changes to fillna using apply_ufunc; add set-attr test.

* _yield_applied for GroupBy.fillna().  Inspired by _binary_op().

* Finished introducing combine_first to DataArray/Dataset objects.  Remove _fillna from injection.

* Minor spacing changes to doc.

* Use np.where() in ops.fillna

* Separating data_vars_join from join in apply_ufunc, and accessing it through ops.fillna.  Also added docstring.

* Rewrote GroupBy's fillna, among other review comments.

* Initial attempt to introduce keep_attrs to apply_ufunc.

* Adding test for data_vars_join, also removing join kwarg in fillna of dataarray.py and dataset.py.

* Bringing keep_attrs from apply_ufunc to apply_dataset_ufunc and apply_dataarray_ufunc.

* Forcing explicit specification for dataset_fill_value.

* Added test for keep_attrs.  Changed kwarg name from data_vars_join to dataset_join.

* Updated docstrings.  Moving changes to v0.9.0

* Adjust dataset_fill_value default in apply_ufunc signature
chunweiyuan authored and shoyer committed Jan 23, 2017
1 parent 4bb630f commit c5146e8
Showing 11 changed files with 298 additions and 52 deletions.
35 changes: 35 additions & 0 deletions doc/combining.rst
@@ -13,6 +13,7 @@ Combining data
* For combining datasets or data arrays along a dimension, see concatenate_.
* For combining datasets with different variables, see merge_.
* For combining datasets or data arrays with different indexes or missing values, see combine_.

.. _concatenate:

@@ -116,6 +117,40 @@ used in the :py:class:`~xarray.Dataset` constructor:
    xr.Dataset({'a': arr[:-1], 'b': arr[1:]})

.. _combine:

Combine
~~~~~~~

The instance method ``combine_first`` combines two datasets/data arrays and
defaults to the non-null values in the calling object, using values from the
passed object to fill holes. The resulting coordinates are the union of the
coordinate labels. Cells left vacant by the outer join are filled with
``np.nan``.

This mimics the behavior of ``pandas.DataFrame.combine_first``.

For data arrays:

.. ipython:: python

    ar0 = DataArray([[0, 0], [0, 0]], [('x', ['a', 'b']), ('y', [-1, 0])])
    ar1 = DataArray([[1, 1], [1, 1]], [('x', ['b', 'c']), ('y', [0, 1])])
    ar2 = DataArray([2], [('x', ['d'])])
    ar0.combine_first(ar1)
    ar1.combine_first(ar0)
    ar0.combine_first(ar2)

For datasets, ``ds0.combine_first(ds1)`` works similarly to
``xr.merge([ds0, ds1])``, except that ``xr.merge`` raises a ``MergeError`` if
the data variables being merged have conflicting values, whereas
``combine_first`` defaults to the values of the calling object.

.. ipython:: python

    ds0 = Dataset({'a': ('x', [1, 2]), 'x': [0, 1]})
    ds1 = Dataset({'a': ('x', [99, 3]), 'x': [1, 2]})
    ds0.combine_first(ds1)
    xr.merge([ds0, ds1])

.. _update:

Update
7 changes: 7 additions & 0 deletions doc/whats-new.rst
@@ -128,6 +128,13 @@ Deprecations

Enhancements
~~~~~~~~~~~~

- Added the xarray equivalent of ``pandas.DataFrame.combine_first`` as an
  instance method to DataArray/Dataset objects, facilitated by the new
  ``ops.fillna`` with ``join`` and ``dataset_join`` options
  (see :ref:`combine`).
  By `Chun-Wei Yuan <https://github.com/chunweiyuan>`_.

- Added the ability to change default automatic alignment (arithmetic_join="inner")
for binary operations via :py:func:`~xarray.set_options()`
(see :ref:`automatic alignment`).
87 changes: 68 additions & 19 deletions xarray/core/computation.py
@@ -8,6 +8,8 @@
import operator
import re

import numpy as np

from . import ops
from .alignment import deep_align
from .merge import expand_and_merge_variables
@@ -16,6 +18,8 @@


_DEFAULT_FROZEN_SET = frozenset()
_DEFAULT_FILL_VALUE = object()


# see http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
DIMENSION_NAME = r'\w+'
@@ -202,6 +206,7 @@ def apply_dataarray_ufunc(func, *args, **kwargs):
signature = kwargs.pop('signature')
join = kwargs.pop('join', 'inner')
exclude_dims = kwargs.pop('exclude_dims', _DEFAULT_FROZEN_SET)
keep_attrs = kwargs.pop('keep_attrs', False)
if kwargs:
raise TypeError('apply_dataarray_ufunc() got unexpected keyword '
'arguments: %s' % list(kwargs))
@@ -217,12 +222,18 @@ def apply_dataarray_ufunc(func, *args, **kwargs):
result_var = func(*data_vars)

if signature.n_outputs > 1:
return tuple(DataArray(variable, coords, name=name, fastpath=True)
for variable, coords in zip(result_var, result_coords))
out = tuple(DataArray(variable, coords, name=name, fastpath=True)
for variable, coords in zip(result_var, result_coords))
else:
coords, = result_coords
return DataArray(result_var, coords, name=name, fastpath=True)
out = DataArray(result_var, coords, name=name, fastpath=True)

if keep_attrs and isinstance(args[0], DataArray):
if isinstance(out, tuple):
for da in out:
    da._copy_attrs_from(args[0])  # copies attrs in place; returns None
else:
out._copy_attrs_from(args[0])
return out

def ordered_set_union(all_keys):
# type: List[Iterable] -> Iterable
@@ -326,32 +337,53 @@ def _fast_dataset(variables, coord_variables):

def apply_dataset_ufunc(func, *args, **kwargs):
"""apply_dataset_ufunc(func, *args, signature, join='inner',
fill_value=None, exclude_dims=frozenset()):
dataset_join='inner', fill_value=None,
exclude_dims=frozenset(), keep_attrs=False):
If dataset_join != 'inner', a non-default fill_value must be supplied
by the user. Otherwise a TypeError is raised.
"""
from .dataset import Dataset
signature = kwargs.pop('signature')
join = kwargs.pop('join', 'inner')
dataset_join = kwargs.pop('dataset_join', 'inner')
fill_value = kwargs.pop('fill_value', None)
exclude_dims = kwargs.pop('exclude_dims', _DEFAULT_FROZEN_SET)
keep_attrs = kwargs.pop('keep_attrs', False)
first_obj = args[0] # we'll copy attrs from this in case keep_attrs=True

if dataset_join != 'inner' and fill_value is _DEFAULT_FILL_VALUE:
raise TypeError('To apply an operation to datasets with different '
                'data variables, you must supply the '
                'dataset_fill_value argument.')

if kwargs:
raise TypeError('apply_dataset_ufunc() got unexpected keyword '
'arguments: %s' % list(kwargs))

if len(args) > 1:
args = deep_align(args, join=join, copy=False, exclude=exclude_dims,
raise_on_invalid=False)

list_of_coords = build_output_coords(args, signature, exclude_dims)

args = [getattr(arg, 'data_vars', arg) for arg in args]

result_vars = apply_dict_of_variables_ufunc(
func, *args, signature=signature, join=join, fill_value=fill_value)
func, *args, signature=signature, join=dataset_join,
fill_value=fill_value)

if signature.n_outputs > 1:
return tuple(_fast_dataset(*args)
for args in zip(result_vars, list_of_coords))
out = tuple(_fast_dataset(*args)
for args in zip(result_vars, list_of_coords))
else:
coord_vars, = list_of_coords
return _fast_dataset(result_vars, coord_vars)
out = _fast_dataset(result_vars, coord_vars)

if keep_attrs and isinstance(first_obj, Dataset):
if isinstance(out, tuple):
for ds in out:
    ds._copy_attrs_from(first_obj)  # copies attrs in place; returns None
else:
out._copy_attrs_from(first_obj)
return out


def _iter_over_selections(obj, dim, values):
@@ -530,7 +562,8 @@ def apply_array_ufunc(func, *args, **kwargs):

def apply_ufunc(func, *args, **kwargs):
"""apply_ufunc(func, *args, signature=None, join='inner',
exclude_dims=frozenset(), dataset_fill_value=None,
exclude_dims=frozenset(), dataset_join='inner',
dataset_fill_value=_DEFAULT_FILL_VALUE, keep_attrs=False,
kwargs=None, dask_array='forbidden')
Apply a vectorized function for unlabeled arrays to xarray objects.
@@ -581,14 +614,23 @@ def apply_ufunc(func, *args, **kwargs):
- 'inner': use the intersection of object indexes
- 'left': use indexes from the first object with each dimension
- 'right': use indexes from the last object with each dimension
dataset_join : {'outer', 'inner', 'left', 'right'}, optional
Method for joining variables of Dataset objects with mismatched
data variables.
- 'outer': take variables from both Dataset objects
- 'inner': take only overlapping variables
- 'left': take only variables from the first object
- 'right': take only variables from the last object
dataset_fill_value : optional
Value used in place of missing variables on Dataset inputs when the
datasets do not share the exact same ``data_vars``. Required if
``dataset_join != 'inner'``, otherwise ignored.
keep_attrs : bool, optional
Whether to copy attributes from the first argument to the output.
exclude_dims : set, optional
Dimensions to exclude from alignment and broadcasting. Any input
coordinates along these dimensions will be dropped. Each excluded
dimension must be a core dimension in the function signature.
dataset_fill_value : optional
Value used in place of missing variables on Dataset inputs when the
datasets do not share the exact same ``data_vars``. Only relevant if
``join != 'inner'``.
kwargs: dict, optional
Optional keyword arguments passed directly on to ``func``.
dask_array: 'forbidden' or 'allowed', optional
@@ -664,8 +706,10 @@ def stack(objects, dim, new_coord):

signature = kwargs.pop('signature', None)
join = kwargs.pop('join', 'inner')
dataset_join = kwargs.pop('dataset_join', 'inner')
keep_attrs = kwargs.pop('keep_attrs', False)
exclude_dims = kwargs.pop('exclude_dims', frozenset())
dataset_fill_value = kwargs.pop('dataset_fill_value', None)
dataset_fill_value = kwargs.pop('dataset_fill_value', _DEFAULT_FILL_VALUE)
kwargs_ = kwargs.pop('kwargs', None)
dask_array = kwargs.pop('dask_array', 'forbidden')
if kwargs:
@@ -697,16 +741,21 @@ def stack(objects, dim, new_coord):
this_apply = functools.partial(
apply_ufunc, func, signature=signature, join=join,
dask_array=dask_array, exclude_dims=exclude_dims,
dataset_fill_value=dataset_fill_value)
dataset_fill_value=dataset_fill_value,
dataset_join=dataset_join,
keep_attrs=keep_attrs)
return apply_groupby_ufunc(this_apply, *args)
elif any(is_dict_like(a) for a in args):
return apply_dataset_ufunc(variables_ufunc, *args, signature=signature,
join=join, exclude_dims=exclude_dims,
fill_value=dataset_fill_value)
fill_value=dataset_fill_value,
dataset_join=dataset_join,
keep_attrs=keep_attrs)
elif any(isinstance(a, DataArray) for a in args):
return apply_dataarray_ufunc(variables_ufunc, *args,
signature=signature,
join=join, exclude_dims=exclude_dims)
join=join, exclude_dims=exclude_dims,
keep_attrs=keep_attrs)
elif any(isinstance(a, Variable) for a in args):
return variables_ufunc(*args)
else:
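Note that ops.fillna itself sits in one of the changed files not shown on this page. Going only by the commit messages ("Use np.where() in ops.fillna") and by the combine_first call sites in the dataarray.py and dataset.py diffs below, here is a minimal sketch of how the new apply_ufunc keywords are presumably wired together; the helper name, signature defaults, and null test are assumptions, not the actual implementation:

    import numpy as np

    from xarray.core.computation import apply_ufunc  # private module at this commit


    def fillna(data, other, join='left', dataset_join='left'):
        # Keep values of `data` where they are non-null, otherwise take the
        # aligned values of `other` (np.where, per the commit message).
        def _fillna(x, y):
            return np.where(np.isnan(x), y, x)

        return apply_ufunc(_fillna, data, other,
                           join=join,                  # index alignment; 'outer' for combine_first
                           dataset_join=dataset_join,  # how mismatched data_vars are joined
                           dataset_fill_value=np.nan,  # required whenever dataset_join != 'inner'
                           keep_attrs=True)            # copy attrs from the first argument

DataArray.combine_first and Dataset.combine_first below then simply call this with join="outer" (and, for datasets, dataset_join="outer").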
21 changes: 19 additions & 2 deletions xarray/core/dataarray.py
@@ -1097,10 +1097,27 @@ def fillna(self, value):
if utils.is_dict_like(value):
raise TypeError('cannot provide fill value as a dictionary with '
'fillna on a DataArray')
out = self._fillna(value)
out.attrs = self.attrs
out = ops.fillna(self, value)
return out

def combine_first(self, other):
"""Combine two DataArray objects, with union of coordinates.
This operation follows the normal broadcasting and alignment rules of
``join='outer'``. Default to non-null values of array calling the
method. Use np.nan to fill in vacant cells after alignment.
Parameters
----------
other : DataArray
Used to fill all matching missing values in this array.
Returns
-------
DataArray
"""
return ops.fillna(self, other, join="outer")

def reduce(self, func, dim=None, axis=None, keep_attrs=False, **kwargs):
"""Reduce this array by applying `func` along some dimension(s).
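A short usage sketch of the two DataArray methods touched above, assuming (as the commit messages indicate) that ops.fillna passes keep_attrs=True, so attributes survive without the explicit out.attrs = self.attrs line this diff removes:

    import numpy as np
    import xarray as xr

    da = xr.DataArray([1.0, np.nan, 3.0], dims='x',
                      coords={'x': [0, 1, 2]}, attrs={'units': 'm'})
    other = xr.DataArray([10.0, 20.0, 30.0, 40.0], dims='x',
                         coords={'x': [1, 2, 3, 4]})

    filled = da.fillna(0.0)             # [1., 0., 3.]; attrs expected to carry over
    combined = da.combine_first(other)  # outer join on x -> labels [0, 1, 2, 3, 4]
    # da's non-null values win wherever both objects have data, other fills the
    # holes, and labels covered by neither object would be left as np.nan.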
48 changes: 32 additions & 16 deletions xarray/core/dataset.py
@@ -2001,8 +2001,31 @@ def fillna(self, value):
-------
Dataset
"""
out = self._fillna(value)
out._copy_attrs_from(self)
if utils.is_dict_like(value):
value_keys = getattr(value, 'data_vars', value).keys()
if not set(value_keys) <= set(self.data_vars.keys()):
raise ValueError('all variables in the argument to `fillna` '
'must be contained in the original dataset')
out = ops.fillna(self, value)
return out

def combine_first(self, other):
"""Combine two Datasets, default to data_vars of self.
The new coordinates follow the normal broadcasting and alignment rules
of ``join='outer'``. Vacant cells in the expanded coordinates are
filled with np.nan.
Parameters
----------
other : DataArray
Used to fill all matching missing values in this array.
Returns
-------
DataArray
"""
out = ops.fillna(self, other, join="outer", dataset_join="outer")
return out

def reduce(self, func, dim=None, keep_attrs=False, numeric_only=False,
@@ -2335,7 +2358,7 @@ def func(self, *args, **kwargs):
return func

@staticmethod
def _binary_op(f, reflexive=False, join=None, fillna=False):
def _binary_op(f, reflexive=False, join=None):
@functools.wraps(f)
def func(self, other):
if isinstance(other, groupby.GroupBy):
@@ -2344,8 +2367,7 @@ def func(self, other):
if hasattr(other, 'indexes'):
self, other = align(self, other, join=align_type, copy=False)
g = f if not reflexive else lambda x, y: f(y, x)
ds = self._calculate_binary_op(g, other, join=align_type,
fillna=fillna)
ds = self._calculate_binary_op(g, other, join=align_type)
return ds

return func
@@ -2370,14 +2392,9 @@ def func(self, other):
return func

def _calculate_binary_op(self, f, other, join='inner',
inplace=False, fillna=False):
inplace=False):

def apply_over_both(lhs_data_vars, rhs_data_vars, lhs_vars, rhs_vars):
if fillna and join != 'left':
raise ValueError('`fillna` must be accompanied by left join')
if fillna and not set(rhs_data_vars) <= set(lhs_data_vars):
raise ValueError('all variables in the argument to `fillna` '
'must be contained in the original dataset')
if inplace and set(lhs_data_vars) != set(rhs_data_vars):
raise ValueError('datasets must have the same data variables '
'for in-place arithmetic operations: %s, %s'
@@ -2389,12 +2406,10 @@ def apply_over_both(lhs_data_vars, rhs_data_vars, lhs_vars, rhs_vars):
if k in rhs_data_vars:
dest_vars[k] = f(lhs_vars[k], rhs_vars[k])
elif join in ["left", "outer"]:
dest_vars[k] = (lhs_vars[k] if fillna else
f(lhs_vars[k], np.nan))
dest_vars[k] = f(lhs_vars[k], np.nan)
for k in rhs_data_vars:
if k not in dest_vars and join in ["right", "outer"]:
dest_vars[k] = (rhs_vars[k] if fillna else
f(rhs_vars[k], np.nan))
dest_vars[k] = f(rhs_vars[k], np.nan)
return dest_vars

if utils.is_dict_like(other) and not isinstance(other, Dataset):
@@ -2421,7 +2436,8 @@ def apply_over_both(lhs_data_vars, rhs_data_vars, lhs_vars, rhs_vars):
def _copy_attrs_from(self, other):
self.attrs = other.attrs
for v in other.variables:
self.variables[v].attrs = other.variables[v].attrs
if v in self.variables:
self.variables[v].attrs = other.variables[v].attrs

def diff(self, dim, n=1, label='upper'):
"""Calculate the n-th order discrete difference along given axis.
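For illustration, a sketch of the new up-front guard in Dataset.fillna (the check that previously lived inside _calculate_binary_op behind the fillna=True flag): a dict or Dataset whose keys are not a subset of the data variables should now raise ValueError before any computation happens.

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({'a': ('x', [np.nan, 2.0])}, coords={'x': [0, 1]})

    ds.fillna({'a': -1.0})                  # fine: 'a' is a data variable of ds
    try:
        ds.fillna({'a': -1.0, 'b': 0.0})    # 'b' is not a data variable of ds
    except ValueError as err:
        print(err)  # "all variables in the argument to `fillna` must be contained ..."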
3 changes: 2 additions & 1 deletion xarray/core/groupby.py
@@ -382,7 +382,8 @@ def fillna(self, value):
Dataset.fillna
DataArray.fillna
"""
return self._fillna(value)
out = ops.fillna(self, value)
return out

def where(self, cond):
"""Return an object of the same shape with all entries where cond is
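GroupBy.fillna now routes through the same ops.fillna. A hedged usage sketch of per-group filling; the fill-with-group-means pattern is only an illustration and is not taken from this diff:

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({'a': ('x', [np.nan, 1.0, np.nan, 3.0])},
                    coords={'x': [0, 1, 2, 3],
                            'group': ('x', ['p', 'p', 'q', 'q'])})

    # Fill each group's missing values with that group's mean;
    # 'a' becomes [1., 1., 3., 3.].
    group_means = ds.groupby('group').mean()
    filled = ds.groupby('group').fillna(group_means)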
