diff --git a/doc/api.rst b/doc/api.rst
index fd25a09de23..7fc18ae3bcc 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -122,6 +122,7 @@ Computation
    Dataset.groupby_bins
    Dataset.resample
    Dataset.diff
+   Dataset.quantile

 **Aggregation**:
 :py:attr:`~Dataset.all`
@@ -270,6 +271,7 @@ Computation
    DataArray.get_axis_num
    DataArray.diff
    DataArray.dot
+   DataArray.quantile

 **Aggregation**:
 :py:attr:`~DataArray.all`
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 79a629cce20..663ea883967 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -184,6 +184,9 @@ Enhancements
   and attributes. The method prints to a buffer (e.g. ``stdout``) with output
   similar to what the command line utility ``ncdump -h`` produces (:issue:`1150`).
   By `Joe Hamman `_.
+- New :py:meth:`~Dataset.quantile` and :py:meth:`~DataArray.quantile` methods
+  to calculate quantiles of Dataset and DataArray objects (:issue:`1187`).
+  By `Joe Hamman `_.

 Bug fixes
 ~~~~~~~~~
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 7953ad07747..ebc1c149423 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -1736,5 +1736,52 @@ def dot(self, other):
 
         return type(self)(new_data, new_coords, new_dims)
 
+    def quantile(self, q, dim=None, interpolation='linear', keep_attrs=False):
+        """Compute the qth quantile of the data along the specified dimension.
+
+        Returns the qth quantile(s) of the array elements.
+
+        Parameters
+        ----------
+        q : float in range of [0,1] (or sequence of floats)
+            Quantile to compute, which must be between 0 and 1
+            inclusive.
+        dim : str or sequence of str, optional
+            Dimension(s) over which to apply quantile.
+        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+            This optional parameter specifies the interpolation method to
+            use when the desired quantile lies between two data points
+            ``i < j``:
+                * linear: ``i + (j - i) * fraction``, where ``fraction`` is
+                  the fractional part of the index surrounded by ``i`` and
+                  ``j``.
+                * lower: ``i``.
+                * higher: ``j``.
+                * nearest: ``i`` or ``j``, whichever is nearest.
+                * midpoint: ``(i + j) / 2``.
+        keep_attrs : bool, optional
+            If True, the object's attributes (`attrs`) will be copied from
+            the original object to the new one. If False (default), the new
+            object will be returned without attributes.
+
+        Returns
+        -------
+        quantiles : DataArray
+            If `q` is a single quantile, then the result
+            is a scalar. If multiple quantiles are given, first axis of
+            the result corresponds to the quantile and a quantile dimension
+            is added to the return array. The other dimensions are the
+            dimensions that remain after the reduction of the array.
+
+        See Also
+        --------
+        np.nanpercentile, pd.Series.quantile, xr.Dataset.quantile
+        """
+
+        ds = self._to_temp_dataset().quantile(q, dim=dim, keep_attrs=keep_attrs,
+                                              interpolation=interpolation)
+        return self._from_temp_dataset(ds)
+
+
 # priority most be higher than Variable to properly work with binary ufuncs
 ops.inject_all_ops_and_reduce_methods(DataArray, priority=60)
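A rough usage sketch of the DataArray method added above (array and dimension names are illustrative, and output is elided):

    >>> import numpy as np
    >>> import xarray as xr
    >>> da = xr.DataArray(np.random.rand(4, 3), dims=['x', 'y'])
    >>> da.quantile(0.5)                    # scalar q: reduces over all dims
    >>> da.quantile([0.25, 0.75], dim='x')  # array-like q: adds a 'quantile' dim
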
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index d7a3631ba6c..c3da98944b1 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -28,7 +28,6 @@
                         broadcast_variables)
 from .pycompat import (iteritems, basestring, OrderedDict, dask_array_type,
                        range)
-from .formatting import ensure_valid_repr
 from .combine import concat
 from .options import OPTIONS
 
@@ -2547,6 +2546,93 @@ def roll(self, **shifts):
 
         return self._replace_vars_and_dims(variables)
 
+    def quantile(self, q, dim=None, interpolation='linear',
+                 numeric_only=False, keep_attrs=False):
+        """Compute the qth quantile of the data along the specified dimension.
+
+        Returns the qth quantile(s) of the array elements for each variable
+        in the Dataset.
+
+        Parameters
+        ----------
+        q : float in range of [0,1] (or sequence of floats)
+            Quantile to compute, which must be between 0 and 1
+            inclusive.
+        dim : str or sequence of str, optional
+            Dimension(s) over which to apply quantile.
+        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+            This optional parameter specifies the interpolation method to
+            use when the desired quantile lies between two data points
+            ``i < j``:
+                * linear: ``i + (j - i) * fraction``, where ``fraction`` is
+                  the fractional part of the index surrounded by ``i`` and
+                  ``j``.
+                * lower: ``i``.
+                * higher: ``j``.
+                * nearest: ``i`` or ``j``, whichever is nearest.
+                * midpoint: ``(i + j) / 2``.
+        numeric_only : bool, optional
+            If True, only apply quantile to variables with a numeric dtype.
+        keep_attrs : bool, optional
+            If True, the dataset's attributes (`attrs`) will be copied from
+            the original object to the new one. If False (default), the new
+            object will be returned without attributes.
+
+        Returns
+        -------
+        quantiles : Dataset
+            If `q` is a single quantile, then the result is a scalar for each
+            variable in data_vars. If multiple quantiles are given, first
+            axis of the result corresponds to the quantile and a quantile
+            dimension is added to the return Dataset. The other dimensions are
+            the dimensions that remain after the reduction of the array.
+
+        See Also
+        --------
+        np.nanpercentile, pd.Series.quantile, xr.DataArray.quantile
+        """
+
+        if isinstance(dim, basestring):
+            dims = set([dim])
+        elif dim is None:
+            dims = set(self.dims)
+        else:
+            dims = set(dim)
+
+        _assert_empty([dim for dim in dims if dim not in self.dims],
+                      'Dataset does not contain the dimensions: %s')
+
+        q = np.asarray(q, dtype=np.float64)
+
+        variables = OrderedDict()
+        for name, var in iteritems(self.variables):
+            reduce_dims = [dim for dim in var.dims if dim in dims]
+            if reduce_dims or not var.dims:
+                if name not in self.coords:
+                    if (not numeric_only or
+                            np.issubdtype(var.dtype, np.number) or
+                            var.dtype == np.bool_):
+                        if len(reduce_dims) == var.ndim:
+                            # prefer to aggregate over axis=None rather than
+                            # axis=(0, 1) if they will be equivalent, because
+                            # the former is often more efficient
+                            reduce_dims = None
+                        variables[name] = var.quantile(
+                            q, dim=reduce_dims, interpolation=interpolation)
+
+            else:
+                variables[name] = var
+
+        # construct the new dataset
+        coord_names = set(k for k in self.coords if k in variables)
+        attrs = self.attrs if keep_attrs else None
+        new = self._replace_vars_and_dims(variables, coord_names, attrs=attrs)
+        if 'quantile' in new.dims:
+            new.coords['quantile'] = Variable('quantile', q)
+        else:
+            new.coords['quantile'] = q
+        return new
+
     @property
     def real(self):
         return self._unary_op(lambda x: x.real, keep_attrs=True)(self)
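A similar sketch for the Dataset method (hypothetical variable names; output elided). With an array-like ``q`` every reduced data variable gains a leading ``quantile`` dimension, and non-numeric variables can be skipped with ``numeric_only=True``:

    >>> import numpy as np
    >>> import xarray as xr
    >>> ds = xr.Dataset({'a': (('x', 'y'), np.random.rand(4, 3)),
    ...                  'b': ('x', np.arange(4.0))})
    >>> ds.quantile(0.5, dim='x')            # scalar q: no 'quantile' dimension
    >>> ds.quantile([0.25, 0.75], dim='x')   # adds a 'quantile' dimension
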
diff --git a/xarray/core/ops.py b/xarray/core/ops.py
index 0a7284f2a75..e9da926b709 100644
--- a/xarray/core/ops.py
+++ b/xarray/core/ops.py
@@ -410,6 +410,7 @@ def inject_reduce_methods(cls):
                          extra_args=cls._reduce_extra_args_docstring)
         setattr(cls, name, func)
 
+
 def inject_cum_methods(cls):
     methods = ([(name, globals()[name], True) for name in NAN_CUM_METHODS])
     for name, f, include_skipna in methods:
diff --git a/xarray/core/utils.py b/xarray/core/utils.py
index dffacaf2eb0..148eb072ee4 100644
--- a/xarray/core/utils.py
+++ b/xarray/core/utils.py
@@ -6,7 +6,6 @@
 import contextlib
 import functools
 import itertools
-import os.path
 import re
 import warnings
 from collections import Mapping, MutableMapping, Iterable
@@ -102,7 +101,9 @@ def equivalent(first, second):
     if isinstance(first, np.ndarray) or isinstance(second, np.ndarray):
         return ops.array_equiv(first, second)
     else:
-        return first is second or first == second or (pd.isnull(first) and pd.isnull(second))
+        return (first is second or
+                first == second or
+                (pd.isnull(first) and pd.isnull(second)))
 
 
 def peek_at(iterable):
@@ -179,12 +180,14 @@ def combine_pos_and_kw_args(pos_kwargs, kw_kwargs, func_name):
 
 
 def is_scalar(value):
-    """ Whether to treat a value as a scalar. Any non-iterable, string, or 0-D array """
-    return (
-        getattr(value, 'ndim', None) == 0
-        or isinstance(value, (basestring, bytes_type))
-        or not isinstance(value, Iterable))
+    """Whether to treat a value as a scalar.
+    Any non-iterable, string, or 0-D array
+    """
+    return (
+        getattr(value, 'ndim', None) == 0 or
+        isinstance(value, (basestring, bytes_type)) or
+        not isinstance(value, Iterable))
 
 
 def is_valid_numpy_dtype(dtype):
@@ -205,8 +208,8 @@ def to_0d_object_array(value):
 
 def to_0d_array(value):
     """Given a value, wrap it in a 0-D numpy.ndarray."""
-    if np.isscalar(value) or (isinstance(value, np.ndarray)
-                              and value.ndim == 0):
+    if np.isscalar(value) or (isinstance(value, np.ndarray) and
+                              value.ndim == 0):
         return np.array(value)
     else:
         return to_0d_object_array(value)
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index c185c7b7127..80effeb64e2 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -5,6 +5,7 @@
 from collections import defaultdict
 import functools
 import itertools
+from distutils.version import LooseVersion
 
 import numpy as np
 import pandas as pd
@@ -1031,6 +1032,75 @@ def no_conflicts(self, other):
         """
         return self.broadcast_equals(other, equiv=ops.array_notnull_equiv)
 
+    def quantile(self, q, dim=None, interpolation='linear'):
+        """Compute the qth quantile of the data along the specified dimension.
+
+        Returns the qth quantile(s) of the array elements.
+
+        Parameters
+        ----------
+        q : float in range of [0,1] (or sequence of floats)
+            Quantile to compute, which must be between 0 and 1
+            inclusive.
+        dim : str or sequence of str, optional
+            Dimension(s) over which to apply quantile.
+        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+            This optional parameter specifies the interpolation method to
+            use when the desired quantile lies between two data points
+            ``i < j``:
+                * linear: ``i + (j - i) * fraction``, where ``fraction`` is
+                  the fractional part of the index surrounded by ``i`` and
+                  ``j``.
+                * lower: ``i``.
+                * higher: ``j``.
+                * nearest: ``i`` or ``j``, whichever is nearest.
+                * midpoint: ``(i + j) / 2``.
+
+        Returns
+        -------
+        quantiles : Variable
+            If `q` is a single quantile, then the result
+            is a scalar. If multiple quantiles are given, first axis of
+            the result corresponds to the quantile and a quantile dimension
+            is added to the return array. The other dimensions are the
+            dimensions that remain after the reduction of the array.
+
+        See Also
+        --------
+        np.nanpercentile, pd.Series.quantile,
+        xr.Dataset.quantile, xr.DataArray.quantile
+        """
+
+        if isinstance(self.data, dask_array_type):
+            raise TypeError("quantile does not work for arrays stored as dask "
+                            "arrays. Load the data via .compute() or .load() "
+                            "prior to calling this method.")
+        if LooseVersion(np.__version__) < LooseVersion('1.10.0'):
+            raise NotImplementedError(
+                'quantile requires numpy version 1.10.0 or later')
+
+        q = np.asarray(q, dtype=np.float64)
+
+        new_dims = list(self.dims)
+        if dim is not None:
+            axis = self.get_axis_num(dim)
+            if utils.is_scalar(dim):
+                new_dims.remove(dim)
+            else:
+                for d in dim:
+                    new_dims.remove(d)
+        else:
+            axis = None
+            new_dims = []
+
+        # only add the quantile dimension if q is array-like
+        if q.ndim != 0:
+            new_dims = ['quantile'] + new_dims
+
+        qs = np.nanpercentile(self.data, q * 100., axis=axis,
+                              interpolation=interpolation)
+        return Variable(new_dims, qs)
+
     @property
     def real(self):
         return type(self)(self.dims, self.data.real, self._attrs)
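At the Variable level, the dimension handling described in the docstring works out roughly as follows (illustrative names; dask-backed data would need to be loaded with ``.load()`` first):

    >>> import numpy as np
    >>> from xarray import Variable
    >>> v = Variable(['x', 'y'], np.random.rand(3, 4))
    >>> v.quantile(0.5, dim='x').dims          # ('y',)
    >>> v.quantile([0.1, 0.9], dim='x').dims   # ('quantile', 'y')
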
diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py
index e58fa131844..08706978698 100644
--- a/xarray/test/test_dataarray.py
+++ b/xarray/test/test_dataarray.py
@@ -7,6 +7,7 @@
 import pytest
 from copy import deepcopy
 from textwrap import dedent
+from distutils.version import LooseVersion
 
 import xarray as xr
 
@@ -1328,6 +1329,18 @@ def test_reduce(self):
         expected = DataArray(5, {'c': -999})
         self.assertDataArrayIdentical(expected, actual)
 
+    # skip due to bug in older versions of numpy.nanpercentile
+    @pytest.mark.skipif(LooseVersion(np.__version__) < LooseVersion('1.10.0'),
+                        reason='requires numpy version 1.10.0 or later')
+    def test_quantile(self):
+        for q in [0.25, [0.50], [0.25, 0.75]]:
+            for axis, dim in zip([None, 0, [0], [0, 1]],
+                                 [None, 'x', ['x'], ['x', 'y']]):
+                actual = self.dv.quantile(q, dim=dim)
+                expected = np.nanpercentile(self.dv.values, np.array(q) * 100,
+                                            axis=axis)
+                np.testing.assert_allclose(actual.values, expected)
+
     def test_reduce_keep_attrs(self):
         # Test dropped attrs
         vm = self.va.mean()
diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py
index 54f6cb273a9..9cff66d30fd 100644
--- a/xarray/test/test_dataset.py
+++ b/xarray/test/test_dataset.py
@@ -13,20 +13,20 @@
 except ImportError:
     pass
 from io import StringIO
+from distutils.version import LooseVersion
 
 import numpy as np
 import pandas as pd
 import xarray as xr
 import pytest
 
-from xarray import (align, broadcast, concat, merge, conventions, backends,
-                    Dataset, DataArray, Variable, IndexVariable, auto_combine,
-                    open_dataset, set_options, MergeError)
+from xarray import (align, broadcast, backends, Dataset, DataArray, Variable,
+                    IndexVariable, open_dataset, set_options, MergeError)
 from xarray.core import indexing, utils
 from xarray.core.pycompat import iteritems, OrderedDict, unicode_type
 from xarray.core.common import full_like
 
-from . import (TestCase, unittest, InaccessibleArray, UnexpectedDataAccess,
+from . import (TestCase, InaccessibleArray, UnexpectedDataAccess,
                requires_dask, source_ndarray)
 
@@ -2787,6 +2787,25 @@ def mean_only_one_axis(x, axis):
         with self.assertRaisesRegexp(TypeError, 'non-integer axis'):
             ds.reduce(mean_only_one_axis, ['x', 'y'])
 
+    @pytest.mark.skipif(LooseVersion(np.__version__) < LooseVersion('1.10.0'),
+                        reason='requires numpy version 1.10.0 or later')
+    def test_quantile(self):
+
+        ds = create_test_data(seed=123)
+
+        for q in [0.25, [0.50], [0.25, 0.75]]:
+            for dim in [None, 'dim1', ['dim1']]:
+                ds_quantile = ds.quantile(q, dim=dim)
+                assert 'quantile' in ds_quantile
+                for var, dar in ds.data_vars.items():
+                    assert var in ds_quantile
+                    self.assertDataArrayIdentical(
+                        ds_quantile[var], dar.quantile(q, dim=dim))
+            dim = ['dim1', 'dim2']
+            ds_quantile = ds.quantile(q, dim=dim)
+            assert 'dim3' in ds_quantile.dims
+            assert all(d not in ds_quantile.dims for d in dim)
+
     def test_count(self):
         ds = Dataset({'x': ('a', [np.nan, 1]), 'y': 0, 'z': np.nan})
         expected = Dataset({'x': 1, 'y': 1, 'z': 0})
diff --git a/xarray/test/test_variable.py b/xarray/test/test_variable.py
index e99e77abf99..8cfa5681276 100644
--- a/xarray/test/test_variable.py
+++ b/xarray/test/test_variable.py
@@ -5,13 +5,14 @@
 from copy import copy, deepcopy
 from datetime import datetime, timedelta
 from textwrap import dedent
+import pytest
 from distutils.version import LooseVersion
 
 import numpy as np
 import pytz
 import pandas as pd
 
-from xarray import Variable, IndexVariable, Coordinate, Dataset, DataArray
+from xarray import Variable, IndexVariable, Coordinate, Dataset
 from xarray.core import indexing
 from xarray.core.variable import as_variable, as_compatible_data
 from xarray.core.indexing import PandasIndexAdapter, LazilyIndexedArray
@@ -980,6 +981,19 @@ def test_reduce(self):
         with self.assertRaisesRegexp(ValueError, 'cannot supply both'):
             v.mean(dim='x', axis=0)
 
+    @pytest.mark.skipif(LooseVersion(np.__version__) < LooseVersion('1.10.0'),
+                        reason='requires numpy version 1.10.0 or later')
+    def test_quantile(self):
+        v = Variable(['x', 'y'], self.d)
+        for q in [0.25, [0.50], [0.25, 0.75]]:
+            for axis, dim in zip([None, 0, [0], [0, 1]],
+                                 [None, 'x', ['x'], ['x', 'y']]):
+                actual = v.quantile(q, dim=dim)
+
+                expected = np.nanpercentile(self.d, np.array(q) * 100,
+                                            axis=axis)
+                np.testing.assert_allclose(actual.values, expected)
+
     def test_big_endian_reduce(self):
         # regression test for GH489
         data = np.ones(5, dtype='>f4')
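Mirroring the tests above, a quick interactive check is that the new methods agree with ``np.nanpercentile`` (a sketch; assumes numpy >= 1.10 and the default linear interpolation):

    >>> import numpy as np
    >>> import xarray as xr
    >>> da = xr.DataArray(np.random.rand(4, 5), dims=['x', 'y'])
    >>> np.allclose(da.quantile(0.3, dim='x').values,
    ...             np.nanpercentile(da.values, 30, axis=0))
    True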