Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add quantile method to DataArray #1187

Merged
merged 12 commits into from
Jan 23, 2017
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ Computation
Dataset.groupby_bins
Dataset.resample
Dataset.diff
Dataset.quantile

**Aggregation**:
:py:attr:`~Dataset.all`
Expand Down Expand Up @@ -270,6 +271,7 @@ Computation
DataArray.get_axis_num
DataArray.diff
DataArray.dot
DataArray.quantile
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add this for Dataset, too

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done.


**Aggregation**:
:py:attr:`~DataArray.all`
Expand Down
3 changes: 3 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,9 @@ Enhancements
and attributes. The method prints to a buffer (e.g. ``stdout``) with output
similar to what the command line utility ``ncdump -h`` produces (:issue:`1150`).
By `Joe Hamman <https://github.com/jhamman>`_.
- New :py:meth:`~DataArray.quantile` method to calculate quantiles from
DataArray objects (:issue:`1187`).
By `Joe Hamman <https://github.com/jhamman>`_.

Bug fixes
~~~~~~~~~
Expand Down
47 changes: 47 additions & 0 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1736,5 +1736,52 @@ def dot(self, other):

return type(self)(new_data, new_coords, new_dims)

def quantile(self, q, dim=None, interpolation='linear', keep_attrs=False):
"""Compute the qth quantile of the data along the specified dimension.

Returns the qth quantiles(s) of the array elements.

Parameters
----------
q : float in range of [0,1] (or sequence of floats)
Quantile to compute, which must be between 0 and 1
inclusive.
dim : str or sequence of str, optional
Dimension(s) over which to apply quantile.
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
This optional parameter specifies the interpolation method to
use when the desired quantile lies between two data points
``i < j``:
* linear: ``i + (j - i) * fraction``, where ``fraction`` is
the fractional part of the index surrounded by ``i`` and
``j``.
* lower: ``i``.
* higher: ``j``.
* nearest: ``i`` or ``j``, whichever is nearest.
* midpoint: ``(i + j) / 2``.
keep_attrs : bool, optional
If True, the dataset's attributes (`attrs`) will be copied from
the original object to the new one. If False (default), the new
object will be returned without attributes.

Returns
-------
quantiles : DataArray
If `q` is a single quantile, then the result
is a scalar. If multiple percentiles are given, first axis of
the result corresponds to the quantile and a quantile dimension
is added to the return array. The other dimensions are the
dimensions that remain after the reduction of the array.

See Also
--------
np.nanpercentile, pd.Series.quantile, xr.Dataset.quantile
"""

ds = self._to_temp_dataset().quantile(q, dim=dim, keep_attrs=keep_attrs,
interpolation=interpolation)
return self._from_temp_dataset(ds)


# priority most be higher than Variable to properly work with binary ufuncs
ops.inject_all_ops_and_reduce_methods(DataArray, priority=60)
88 changes: 87 additions & 1 deletion xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
broadcast_variables)
from .pycompat import (iteritems, basestring, OrderedDict,
dask_array_type, range)
from .formatting import ensure_valid_repr
from .combine import concat
from .options import OPTIONS

Expand Down Expand Up @@ -2547,6 +2546,93 @@ def roll(self, **shifts):

return self._replace_vars_and_dims(variables)

def quantile(self, q, dim=None, interpolation='linear',
numeric_only=False, keep_attrs=False):
"""Compute the qth quantile of the data along the specified dimension.

Returns the qth quantiles(s) of the array elements for each variable
in the Dataset.

Parameters
----------
q : float in range of [0,1] (or sequence of floats)
Quantile to compute, which must be between 0 and 1
inclusive.
dim : str or sequence of str, optional
Dimension(s) over which to apply quantile.
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
This optional parameter specifies the interpolation method to
use when the desired quantile lies between two data points
``i < j``:
* linear: ``i + (j - i) * fraction``, where ``fraction`` is
the fractional part of the index surrounded by ``i`` and
``j``.
* lower: ``i``.
* higher: ``j``.
* nearest: ``i`` or ``j``, whichever is nearest.
* midpoint: ``(i + j) / 2``.
keep_attrs : bool, optional
If True, the dataset's attributes (`attrs`) will be copied from
the original object to the new one. If False (default), the new
object will be returned without attributes.
numeric_only : bool, optional
If True, only apply ``func`` to variables with a numeric dtype.

Returns
-------
quantiles : Dataset
If `q` is a single quantile, then the result is a scalar for each
variable in data_vars. If multiple percentiles are given, first
axis of the result corresponds to the quantile and a quantile
dimension is added to the return Dataset. The other dimensions are
the dimensions that remain after the reduction of the array.

See Also
--------
np.nanpercentile, pd.Series.quantile, xr.DataArray.quantile
"""

if isinstance(dim, basestring):
dims = set([dim])
elif dim is None:
dims = set(self.dims)
else:
dims = set(dim)

_assert_empty([dim for dim in dims if dim not in self.dims],
'Dataset does not contain the dimensions: %s')

q = np.asarray(q, dtype=np.float64)

variables = OrderedDict()
for name, var in iteritems(self.variables):
reduce_dims = [dim for dim in var.dims if dim in dims]
if reduce_dims or not var.dims:
if name not in self.coords:
if (not numeric_only or
np.issubdtype(var.dtype, np.number) or
var.dtype == np.bool_):
if len(reduce_dims) == var.ndim:
# prefer to aggregate over axis=None rather than
# axis=(0, 1) if they will be equivalent, because
# the former is often more efficient
reduce_dims = None
variables[name] = var.quantile(
q, dim=reduce_dims, interpolation=interpolation)

else:
variables[name] = var

# construct the new dataset
coord_names = set(k for k in self.coords if k in variables)
attrs = self.attrs if keep_attrs else None
new = self._replace_vars_and_dims(variables, coord_names, attrs=attrs)
if 'quantile' in new.dims:
new.coords['quantile'] = Variable('quantile', q)
else:
new.coords['quantile'] = q
return new

@property
def real(self):
return self._unary_op(lambda x: x.real, keep_attrs=True)(self)
Expand Down
1 change: 1 addition & 0 deletions xarray/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,7 @@ def inject_reduce_methods(cls):
extra_args=cls._reduce_extra_args_docstring)
setattr(cls, name, func)


def inject_cum_methods(cls):
methods = ([(name, globals()[name], True) for name in NAN_CUM_METHODS])
for name, f, include_skipna in methods:
Expand Down
21 changes: 12 additions & 9 deletions xarray/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import contextlib
import functools
import itertools
import os.path
import re
import warnings
from collections import Mapping, MutableMapping, Iterable
Expand Down Expand Up @@ -102,7 +101,9 @@ def equivalent(first, second):
if isinstance(first, np.ndarray) or isinstance(second, np.ndarray):
return ops.array_equiv(first, second)
else:
return first is second or first == second or (pd.isnull(first) and pd.isnull(second))
return (first is second or
first == second or
(pd.isnull(first) and pd.isnull(second)))


def peek_at(iterable):
Expand Down Expand Up @@ -179,12 +180,14 @@ def combine_pos_and_kw_args(pos_kwargs, kw_kwargs, func_name):


def is_scalar(value):
""" Whether to treat a value as a scalar. Any non-iterable, string, or 0-D array """
return (
getattr(value, 'ndim', None) == 0
or isinstance(value, (basestring, bytes_type))
or not isinstance(value, Iterable))
"""Whether to treat a value as a scalar.

Any non-iterable, string, or 0-D array
"""
return (
getattr(value, 'ndim', None) == 0 or
isinstance(value, (basestring, bytes_type)) or not
isinstance(value, Iterable))


def is_valid_numpy_dtype(dtype):
Expand All @@ -205,8 +208,8 @@ def to_0d_object_array(value):

def to_0d_array(value):
"""Given a value, wrap it in a 0-D numpy.ndarray."""
if np.isscalar(value) or (isinstance(value, np.ndarray)
and value.ndim == 0):
if np.isscalar(value) or (isinstance(value, np.ndarray) and
value.ndim == 0):
return np.array(value)
else:
return to_0d_object_array(value)
Expand Down
70 changes: 70 additions & 0 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from collections import defaultdict
import functools
import itertools
from distutils.version import LooseVersion

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -1031,6 +1032,75 @@ def no_conflicts(self, other):
"""
return self.broadcast_equals(other, equiv=ops.array_notnull_equiv)

def quantile(self, q, dim=None, interpolation='linear'):
"""Compute the qth quantile of the data along the specified dimension.

Returns the qth quantiles(s) of the array elements.

Parameters
----------
q : float in range of [0,1] (or sequence of floats)
Quantile to compute, which must be between 0 and 1
inclusive.
dim : str or sequence of str, optional
Dimension(s) over which to apply quantile.
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
This optional parameter specifies the interpolation method to
use when the desired quantile lies between two data points
``i < j``:
* linear: ``i + (j - i) * fraction``, where ``fraction`` is
the fractional part of the index surrounded by ``i`` and
``j``.
* lower: ``i``.
* higher: ``j``.
* nearest: ``i`` or ``j``, whichever is nearest.
* midpoint: ``(i + j) / 2``.

Returns
-------
quantiles : Variable
If `q` is a single quantile, then the result
is a scalar. If multiple percentiles are given, first axis of
the result corresponds to the quantile and a quantile dimension
is added to the return array. The other dimensions are the
dimensions that remain after the reduction of the array.

See Also
--------
np.nanpercentile, pd.Series.quantile,
xr.Dataset.quantile, xr.DataArray.quantile
"""

if isinstance(self.data, dask_array_type):
TypeError("quantile does not work for arrays stored as dask "
"arrays. Load the data via .compute() or .load() prior "
"to calling this method.")
if LooseVersion(np.__version__) < LooseVersion('1.10.0'):
raise NotImplementedError(
'quantile requres numpy version 1.10.0 or later')

q = np.asarray(q, dtype=np.float64)

new_dims = list(self.dims)
if dim is not None:
axis = self.get_axis_num(dim)
if utils.is_scalar(dim):
new_dims.remove(dim)
else:
for d in dim:
new_dims.remove(d)
else:
axis = None
new_dims = []

# only add the quantile dimension if q is array like
if q.ndim != 0:
new_dims = ['quantile'] + new_dims

qs = np.nanpercentile(self.data, q * 100., axis=axis,
interpolation=interpolation)
return Variable(new_dims, qs)

@property
def real(self):
return type(self)(self.dims, self.data.real, self._attrs)
Expand Down
13 changes: 13 additions & 0 deletions xarray/test/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pytest
from copy import deepcopy
from textwrap import dedent
from distutils.version import LooseVersion

import xarray as xr

Expand Down Expand Up @@ -1328,6 +1329,18 @@ def test_reduce(self):
expected = DataArray(5, {'c': -999})
self.assertDataArrayIdentical(expected, actual)

@pytest.mark.skipif(LooseVersion(np.__version__) < LooseVersion('1.10.0'),
reason='requires numpy version 1.10.0 or later')
# skip due to bug in older versions of numpy.nanpercentile
def test_quantile(self):
for q in [0.25, [0.50], [0.25, 0.75]]:
for axis, dim in zip([None, 0, [0], [0, 1]],
[None, 'x', ['x'], ['x', 'y']]):
actual = self.dv.quantile(q, dim=dim)
expected = np.nanpercentile(self.dv.values, np.array(q) * 100,
axis=axis)
np.testing.assert_allclose(actual.values, expected)

def test_reduce_keep_attrs(self):
# Test dropped attrs
vm = self.va.mean()
Expand Down