From c8253343bc5af6f9f06af4aac2babc7aa12a83bc Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Tue, 27 Dec 2016 16:56:38 -0800 Subject: [PATCH 01/11] add quantile method to DataArray --- doc/api.rst | 1 + doc/whats-new.rst | 3 ++ xarray/core/dataarray.py | 88 +++++++++++++++++++++++++++++++++++ xarray/core/dataset.py | 1 - xarray/core/ops.py | 1 + xarray/test/test_dataarray.py | 16 +++++++ 6 files changed, 109 insertions(+), 1 deletion(-) diff --git a/doc/api.rst b/doc/api.rst index fd25a09de23..1a86b82589e 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -270,6 +270,7 @@ Computation DataArray.get_axis_num DataArray.diff DataArray.dot + DataArray.quantile **Aggregation**: :py:attr:`~DataArray.all` diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 747d1f236bd..e21f35dacd1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -165,6 +165,9 @@ Enhancements and attributes. The method prints to a buffer (e.g. ``stdout``) with output similar to what the command line utility ``ncdump -h`` produces (:issue:`1150`). By `Joe Hamman `_. +- New :py:meth:`~DataArray.quantile` method to calculate quantiles from + DataArray objects (:issue:`xxxx`). + By `Joe Hamman `_. Bug fixes ~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 7953ad07747..66261111de4 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1736,5 +1736,93 @@ def dot(self, other): return type(self)(new_data, new_coords, new_dims) + def quantile(self, q, dim=None, axis=None, interpolation='linear'): + """ + + Compute the qth quantile of the data along the specified axis, + while ignoring nan values. + Returns the qth quantiles(s) of the array elements. + + Parameters + ---------- + q : float in range of [0,100] (or sequence of floats) + Quantile to compute, which must be between 0 and 100 + inclusive. + dim : str or sequence of str, optional + Dimension(s) over which to apply quantile. + axis : int or sequence of int, optional + Axis or axes along which the quantiles are computed. The + default is to compute the quantile(s) along a flattened + version of the array. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to + use when the desired quantile lies between two data points + ``i < j``: + * linear: ``i + (j - i) * fraction``, where ``fraction`` is + the fractional part of the index surrounded by ``i`` and + ``j``. + * lower: ``i``. + * higher: ``j``. + * nearest: ``i`` or ``j``, whichever is nearest. + * midpoint: ``(i + j) / 2``. + + Returns + ------- + quantiles : DataArray + If `q` is a single quantile and `axis=None`, then the result + is a scalar. If multiple percentiles are given, first axis of + the result corresponds to the quantile and a quantile dimension + is added to the return array. The other axes are the axes that + remain after the reduction of the array. If the input + contains integers or floats smaller than ``float64``, the output + data-type is ``float64``. Otherwise, the output data-type is the + same as that of the input. + See Also + -------- + np.nanpercentile + """ + + if dim is not None and axis is not None: + raise ValueError("cannot supply both 'axis' and 'dim' arguments") + + isscalar = np.isscalar(q) + if isscalar: + q = float(q) + else: + q = np.asarray(q, dtype=np.float64) + + new_dims = list(self.dims) + if dim is not None: + if isinstance(dim, basestring): + axis = self.get_axis_num(dim) + new_dims.remove(dim) + else: + axis = [self.get_axis_num(d) for d in dim] + for d in dim: + new_dims.remove(d) + elif axis is not None: + if hasattr(axis, '__iter__'): + for i in axis: + new_dims.remove(self.dims[i]) + else: + new_dims.remove(self.dims[axis]) + else: + new_dims = [] + # only add the quantile dimension if q is array like + if not isscalar: + new_dims = ['quantile'] + new_dims + + ps = np.nanpercentile(self.data, q, axis=axis, + interpolation=interpolation) + + # Construct the return DataArray + ps = DataArray(ps, dims=new_dims, name=self.name) + if not isscalar: + ps['quantile'] = DataArray(q, dims=('quantile', ), + name='quantile') + + return ps + + # priority most be higher than Variable to properly work with binary ufuncs ops.inject_all_ops_and_reduce_methods(DataArray, priority=60) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9d5618dfb7d..8c3fd3c91a1 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -28,7 +28,6 @@ broadcast_variables) from .pycompat import (iteritems, basestring, OrderedDict, dask_array_type, range) -from .formatting import ensure_valid_repr from .combine import concat from .options import OPTIONS diff --git a/xarray/core/ops.py b/xarray/core/ops.py index 5902d10aa9d..0babb05aa69 100644 --- a/xarray/core/ops.py +++ b/xarray/core/ops.py @@ -409,6 +409,7 @@ def inject_reduce_methods(cls): extra_args=cls._reduce_extra_args_docstring) setattr(cls, name, func) + def inject_cum_methods(cls): methods = ([(name, globals()[name], True) for name in NAN_CUM_METHODS]) for name, f, include_skipna in methods: diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index 6f9d84bdef4..044f9c3524b 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -1328,6 +1328,22 @@ def test_reduce(self): expected = DataArray(5, {'c': -999}) self.assertDataArrayIdentical(expected, actual) + def test_quantile(self): + for method in ['linear', 'lower', 'higher', 'nearest', 'midpoint']: + for q in [25, [50], [25, 75]]: + for axis, dim in zip([None, 0, [0], [0, 1]], + [None, 'x', ['x'], ['x', 'y']]): + a = self.dv.quantile(q, dim=dim, interpolation=method) + b = self.dv.quantile(q, axis=axis, interpolation=method) + self.assertDataArrayIdentical(a, b) + expected = np.nanpercentile(self.dv.values, q, axis=axis, + interpolation=method) + np.testing.assert_allclose(a.values, expected) + + # raises with both axis and dim + with self.assertRaisesRegexp(ValueError, 'cannot supply both'): + self.dv.quantile(q, axis=0, dim='x') + def test_reduce_keep_attrs(self): # Test dropped attrs vm = self.va.mean() From 3d87534f3af3bbf53a1c1517bcf0ea979b77fa5f Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Wed, 28 Dec 2016 12:49:15 -0800 Subject: [PATCH 02/11] pep utils.py --- xarray/core/utils.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 32c26bd02c1..dab6bdca4d7 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -6,7 +6,6 @@ import contextlib import functools import itertools -import os.path import re import warnings from collections import Mapping, MutableMapping, Iterable @@ -102,7 +101,9 @@ def equivalent(first, second): if isinstance(first, np.ndarray) or isinstance(second, np.ndarray): return ops.array_equiv(first, second) else: - return first is second or first == second or (pd.isnull(first) and pd.isnull(second)) + return (first is second or + first == second or + (pd.isnull(first) and pd.isnull(second))) def peek_at(iterable): @@ -179,12 +180,14 @@ def combine_pos_and_kw_args(pos_kwargs, kw_kwargs, func_name): def is_scalar(value): - """ Whether to treat a value as a scalar. Any non-iterable, string, or 0-D array """ - return ( - getattr(value, 'ndim', None) == 0 - or isinstance(value, (basestring, bytes_type)) - or not isinstance(value, Iterable)) + """Whether to treat a value as a scalar. + Any non-iterable, string, or 0-D array + """ + return ( + getattr(value, 'ndim', None) == 0 or + isinstance(value, (basestring, bytes_type)) or not + isinstance(value, Iterable)) def is_valid_numpy_dtype(dtype): @@ -205,8 +208,8 @@ def to_0d_object_array(value): def to_0d_array(value): """Given a value, wrap it in a 0-D numpy.ndarray.""" - if np.isscalar(value) or (isinstance(value, np.ndarray) - and value.ndim == 0): + if np.isscalar(value) or (isinstance(value, np.ndarray) and + value.ndim == 0): return np.array(value) else: return to_0d_object_array(value) From 52fd4d612db73db01e47317507ca58bd497860ee Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Wed, 28 Dec 2016 13:05:10 -0800 Subject: [PATCH 03/11] initial fixes after @shoyer's review --- xarray/core/dataarray.py | 60 ++++++++++++++--------------------- xarray/test/test_dataarray.py | 21 ++++-------- 2 files changed, 30 insertions(+), 51 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 66261111de4..71053e9c6e0 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -19,7 +19,8 @@ from .coordinates import (DataArrayCoordinates, LevelCoordinatesSource, Indexes) from .dataset import Dataset, merge_indexes, split_indexes -from .pycompat import iteritems, basestring, OrderedDict, zip, range +from .pycompat import (iteritems, basestring, OrderedDict, zip, range, + dask_array_type) from .variable import (as_variable, Variable, as_compatible_data, IndexVariable, assert_unique_multiindex_level_names) @@ -1736,24 +1737,18 @@ def dot(self, other): return type(self)(new_data, new_coords, new_dims) - def quantile(self, q, dim=None, axis=None, interpolation='linear'): - """ + def quantile(self, q, dim=None, interpolation='linear'): + """Compute the qth quantile of the data along the specified dimension. - Compute the qth quantile of the data along the specified axis, - while ignoring nan values. Returns the qth quantiles(s) of the array elements. Parameters ---------- - q : float in range of [0,100] (or sequence of floats) - Quantile to compute, which must be between 0 and 100 + q : float in range of [0,1] (or sequence of floats) + Quantile to compute, which must be between 0 and 1 inclusive. dim : str or sequence of str, optional Dimension(s) over which to apply quantile. - axis : int or sequence of int, optional - Axis or axes along which the quantiles are computed. The - default is to compute the quantile(s) along a flattened - version of the array. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} This optional parameter specifies the interpolation method to use when the desired quantile lies between two data points @@ -1769,57 +1764,48 @@ def quantile(self, q, dim=None, axis=None, interpolation='linear'): Returns ------- quantiles : DataArray - If `q` is a single quantile and `axis=None`, then the result + If `q` is a single quantile, then the result is a scalar. If multiple percentiles are given, first axis of the result corresponds to the quantile and a quantile dimension - is added to the return array. The other axes are the axes that - remain after the reduction of the array. If the input - contains integers or floats smaller than ``float64``, the output - data-type is ``float64``. Otherwise, the output data-type is the - same as that of the input. + is added to the return array. The other dimensions are the + dimensions that remain after the reduction of the array. + See Also -------- - np.nanpercentile + np.nanpercentile, pd.Series.quantile """ - if dim is not None and axis is not None: - raise ValueError("cannot supply both 'axis' and 'dim' arguments") + if isinstance(self.data, dask_array_type): + TypeError("quantile does not work for arrays stored as dask " + "arrays. Load the data via .load() prior to calling " + "this method.") - isscalar = np.isscalar(q) - if isscalar: - q = float(q) - else: - q = np.asarray(q, dtype=np.float64) + q = np.asarray(q, dtype=np.float64) new_dims = list(self.dims) if dim is not None: - if isinstance(dim, basestring): + if utils.is_scalar(dim): axis = self.get_axis_num(dim) new_dims.remove(dim) else: axis = [self.get_axis_num(d) for d in dim] for d in dim: new_dims.remove(d) - elif axis is not None: - if hasattr(axis, '__iter__'): - for i in axis: - new_dims.remove(self.dims[i]) - else: - new_dims.remove(self.dims[axis]) else: + axis = None new_dims = [] + # only add the quantile dimension if q is array like - if not isscalar: + if q.ndim != 0: new_dims = ['quantile'] + new_dims - ps = np.nanpercentile(self.data, q, axis=axis, + ps = np.nanpercentile(self.data, q * 100., axis=axis, interpolation=interpolation) # Construct the return DataArray ps = DataArray(ps, dims=new_dims, name=self.name) - if not isscalar: - ps['quantile'] = DataArray(q, dims=('quantile', ), - name='quantile') + if q.ndim != 0: + ps.coords['quantile'] = Variable('quantile', q) return ps diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index 044f9c3524b..8a830d724ca 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -1329,20 +1329,13 @@ def test_reduce(self): self.assertDataArrayIdentical(expected, actual) def test_quantile(self): - for method in ['linear', 'lower', 'higher', 'nearest', 'midpoint']: - for q in [25, [50], [25, 75]]: - for axis, dim in zip([None, 0, [0], [0, 1]], - [None, 'x', ['x'], ['x', 'y']]): - a = self.dv.quantile(q, dim=dim, interpolation=method) - b = self.dv.quantile(q, axis=axis, interpolation=method) - self.assertDataArrayIdentical(a, b) - expected = np.nanpercentile(self.dv.values, q, axis=axis, - interpolation=method) - np.testing.assert_allclose(a.values, expected) - - # raises with both axis and dim - with self.assertRaisesRegexp(ValueError, 'cannot supply both'): - self.dv.quantile(q, axis=0, dim='x') + for q in [0.25, [0.50], [0.25, 0.75]]: + for axis, dim in zip([None, 0, [0], [0, 1]], + [None, 'x', ['x'], ['x', 'y']]): + actual = self.dv.quantile(q, dim=dim) + expected = np.nanpercentile(self.dv.values, np.array(q) * 100, + axis=axis) + np.testing.assert_allclose(actual.values, expected) def test_reduce_keep_attrs(self): # Test dropped attrs From ddd52112180ad0e96ac6db8fb80a59357bd3d373 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Wed, 28 Dec 2016 14:55:51 -0800 Subject: [PATCH 04/11] move quantile to Variable, add wrapper methods to Dataset and DataArray --- xarray/core/dataarray.py | 38 +++------------------- xarray/core/dataset.py | 51 ++++++++++++++++++++++++++++++ xarray/core/variable.py | 68 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 123 insertions(+), 34 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 71053e9c6e0..4648f309807 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1772,42 +1772,12 @@ def quantile(self, q, dim=None, interpolation='linear'): See Also -------- - np.nanpercentile, pd.Series.quantile + np.nanpercentile, pd.Series.quantile, xr.Dataset.quantile """ - if isinstance(self.data, dask_array_type): - TypeError("quantile does not work for arrays stored as dask " - "arrays. Load the data via .load() prior to calling " - "this method.") - - q = np.asarray(q, dtype=np.float64) - - new_dims = list(self.dims) - if dim is not None: - if utils.is_scalar(dim): - axis = self.get_axis_num(dim) - new_dims.remove(dim) - else: - axis = [self.get_axis_num(d) for d in dim] - for d in dim: - new_dims.remove(d) - else: - axis = None - new_dims = [] - - # only add the quantile dimension if q is array like - if q.ndim != 0: - new_dims = ['quantile'] + new_dims - - ps = np.nanpercentile(self.data, q * 100., axis=axis, - interpolation=interpolation) - - # Construct the return DataArray - ps = DataArray(ps, dims=new_dims, name=self.name) - if q.ndim != 0: - ps.coords['quantile'] = Variable('quantile', q) - - return ps + ds = self._to_temp_dataset().quantile(q, dim=dim, + interpolation=interpolation) + return self._from_temp_dataset(ds) # priority most be higher than Variable to properly work with binary ufuncs diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 8c3fd3c91a1..da4a6b06fe2 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2537,6 +2537,57 @@ def roll(self, **shifts): return self._replace_vars_and_dims(variables) + def quantile(self, q, dim=None, interpolation='linear'): + """Compute the qth quantile of the data along the specified dimension. + + Returns the qth quantiles(s) of the array elements for each variable + in the Dataset. + + Parameters + ---------- + q : float in range of [0,1] (or sequence of floats) + Quantile to compute, which must be between 0 and 1 + inclusive. + dim : str or sequence of str, optional + Dimension(s) over which to apply quantile. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to + use when the desired quantile lies between two data points + ``i < j``: + * linear: ``i + (j - i) * fraction``, where ``fraction`` is + the fractional part of the index surrounded by ``i`` and + ``j``. + * lower: ``i``. + * higher: ``j``. + * nearest: ``i`` or ``j``, whichever is nearest. + * midpoint: ``(i + j) / 2``. + + Returns + ------- + quantiles : Dataset + If `q` is a single quantile, then the result is a scalar for each + variable in data_vars. If multiple percentiles are given, first + axis of the result corresponds to the quantile and a quantile + dimension is added to the return Dataset. The other dimensions are + the dimensions that remain after the reduction of the array. + + See Also + -------- + np.nanpercentile, pd.Series.quantile, xr.DataArray.quantile + """ + + q = np.asarray(q, dtype=np.float64) + + variables = OrderedDict() + for name, var in iteritems(self.variables): + variables[name] = var.quantile(q, dim=dim, + interpolation=interpolation) + new = self._replace_vars_and_dims(variables) + if q.ndim != 0: + new.coords['quantile'] = Variable('quantile', q) + + return new + @property def real(self): return self._unary_op(lambda x: x.real, keep_attrs=True)(self) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index fb19c5d2d31..ae5867fd5c5 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1029,6 +1029,74 @@ def no_conflicts(self, other): """ return self.broadcast_equals(other, equiv=ops.array_notnull_equiv) + def quantile(self, q, dim=None, interpolation='linear'): + """Compute the qth quantile of the data along the specified dimension. + + Returns the qth quantiles(s) of the array elements. + + Parameters + ---------- + q : float in range of [0,1] (or sequence of floats) + Quantile to compute, which must be between 0 and 1 + inclusive. + dim : str or sequence of str, optional + Dimension(s) over which to apply quantile. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to + use when the desired quantile lies between two data points + ``i < j``: + * linear: ``i + (j - i) * fraction``, where ``fraction`` is + the fractional part of the index surrounded by ``i`` and + ``j``. + * lower: ``i``. + * higher: ``j``. + * nearest: ``i`` or ``j``, whichever is nearest. + * midpoint: ``(i + j) / 2``. + + Returns + ------- + quantiles : Variable + If `q` is a single quantile, then the result + is a scalar. If multiple percentiles are given, first axis of + the result corresponds to the quantile and a quantile dimension + is added to the return array. The other dimensions are the + dimensions that remain after the reduction of the array. + + See Also + -------- + np.nanpercentile, pd.Series.quantile, + xr.Dataset.quantile, xr.DataArray.quantile + """ + + if isinstance(self.data, dask_array_type): + TypeError("quantile does not work for arrays stored as dask " + "arrays. Load the data via .load() prior to calling " + "this method.") + + q = np.asarray(q, dtype=np.float64) + + new_dims = list(self.dims) + if dim is not None: + if utils.is_scalar(dim): + axis = self.get_axis_num(dim) + new_dims.remove(dim) + else: + axis = [self.get_axis_num(d) for d in dim] + for d in dim: + new_dims.remove(d) + else: + axis = None + new_dims = [] + + # only add the quantile dimension if q is array like + if q.ndim != 0: + new_dims = ['quantile'] + new_dims + + qs = np.nanpercentile(self.data, q * 100., axis=axis, + interpolation=interpolation) + + return Variable(new_dims, qs) + @property def real(self): return type(self)(self.dims, self.data.real, self._attrs) From aa81a17fe085324d6d6ad4b6d9ca62e83661c8af Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Wed, 28 Dec 2016 15:01:51 -0800 Subject: [PATCH 05/11] unpin numpy/pandas for quick test --- .travis.yml | 6 +++--- ci/requirements-py27-min.yml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 51d6b886bb4..0421d7faaa2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -61,9 +61,9 @@ install: - source activate test_env # scipy should not have been installed, but it's included in older versions of # the conda pandas package - - if [[ "$CONDA_ENV" == "py27-min" ]]; then - conda remove scipy; - fi + # - if [[ "$CONDA_ENV" == "py27-min" ]]; then + # conda remove scipy; + # fi - python setup.py install script: diff --git a/ci/requirements-py27-min.yml b/ci/requirements-py27-min.yml index 7499157dbe9..4db56158a4a 100644 --- a/ci/requirements-py27-min.yml +++ b/ci/requirements-py27-min.yml @@ -2,8 +2,8 @@ name: test_env dependencies: - python=2.7 - pytest - - numpy==1.9.3 - - pandas==0.15.0 + - numpy + - pandas - pip: - coveralls - pytest-cov From 39394ecdf36d18b46c5a48d59092550146d89610 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Fri, 20 Jan 2017 19:41:24 -0700 Subject: [PATCH 06/11] further refinement of quantile methods and tests for dataset/dataarray/variable --- .travis.yml | 6 +++--- ci/requirements-py27-min.yml | 4 ++-- doc/whats-new.rst | 2 +- xarray/core/dataset.py | 42 ++++++++++++++++++++++++++++++------ xarray/core/variable.py | 8 +++---- xarray/test/test_dataset.py | 24 +++++++++++++++++---- xarray/test/test_variable.py | 13 ++++++++++- 7 files changed, 77 insertions(+), 22 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0421d7faaa2..51d6b886bb4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -61,9 +61,9 @@ install: - source activate test_env # scipy should not have been installed, but it's included in older versions of # the conda pandas package - # - if [[ "$CONDA_ENV" == "py27-min" ]]; then - # conda remove scipy; - # fi + - if [[ "$CONDA_ENV" == "py27-min" ]]; then + conda remove scipy; + fi - python setup.py install script: diff --git a/ci/requirements-py27-min.yml b/ci/requirements-py27-min.yml index 4db56158a4a..7499157dbe9 100644 --- a/ci/requirements-py27-min.yml +++ b/ci/requirements-py27-min.yml @@ -2,8 +2,8 @@ name: test_env dependencies: - python=2.7 - pytest - - numpy - - pandas + - numpy==1.9.3 + - pandas==0.15.0 - pip: - coveralls - pytest-cov diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e21f35dacd1..41c4a6b363c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -166,7 +166,7 @@ Enhancements similar to what the command line utility ``ncdump -h`` produces (:issue:`1150`). By `Joe Hamman `_. - New :py:meth:`~DataArray.quantile` method to calculate quantiles from - DataArray objects (:issue:`xxxx`). + DataArray objects (:issue:`1187`). By `Joe Hamman `_. Bug fixes diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index da4a6b06fe2..432235a1d69 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2537,7 +2537,8 @@ def roll(self, **shifts): return self._replace_vars_and_dims(variables) - def quantile(self, q, dim=None, interpolation='linear'): + def quantile(self, q, dim=None, numeric_only=False, keep_attrs=False, + interpolation='linear'): """Compute the qth quantile of the data along the specified dimension. Returns the qth quantiles(s) of the array elements for each variable @@ -2576,16 +2577,45 @@ def quantile(self, q, dim=None, interpolation='linear'): np.nanpercentile, pd.Series.quantile, xr.DataArray.quantile """ + if isinstance(dim, basestring): + dims = set([dim]) + elif dim is None: + dims = set(self.dims) + else: + dims = set(dim) + + _assert_empty([dim for dim in dims if dim not in self.dims], + 'Dataset does not contain the dimensions: %s') + q = np.asarray(q, dtype=np.float64) variables = OrderedDict() for name, var in iteritems(self.variables): - variables[name] = var.quantile(q, dim=dim, - interpolation=interpolation) - new = self._replace_vars_and_dims(variables) - if q.ndim != 0: - new.coords['quantile'] = Variable('quantile', q) + reduce_dims = [dim for dim in var.dims if dim in dims] + if reduce_dims or not var.dims: + if name not in self.coords: + if (not numeric_only or + np.issubdtype(var.dtype, np.number) or + var.dtype == np.bool_): + if len(reduce_dims) == var.ndim: + # prefer to aggregate over axis=None rather than + # axis=(0, 1) if they will be equivalent, because + # the former is often more efficient + reduce_dims = None + variables[name] = var.quantile( + q, dim=reduce_dims, interpolation=interpolation) + else: + variables[name] = var + + # construct the new dataset + coord_names = set(k for k in self.coords if k in variables) + attrs = self.attrs if keep_attrs else None + new = self._replace_vars_and_dims(variables, coord_names, attrs=attrs) + if 'quantile' in new.dims: + new.coords['quantile'] = Variable('quantile', q) + else: + new.coords['quantile'] = q return new @property diff --git a/xarray/core/variable.py b/xarray/core/variable.py index ae5867fd5c5..c9e284dd740 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1070,18 +1070,17 @@ def quantile(self, q, dim=None, interpolation='linear'): if isinstance(self.data, dask_array_type): TypeError("quantile does not work for arrays stored as dask " - "arrays. Load the data via .load() prior to calling " - "this method.") + "arrays. Load the data via .compute() or .load() prior " + "to calling this method.") q = np.asarray(q, dtype=np.float64) new_dims = list(self.dims) if dim is not None: + axis = self.get_axis_num(dim) if utils.is_scalar(dim): - axis = self.get_axis_num(dim) new_dims.remove(dim) else: - axis = [self.get_axis_num(d) for d in dim] for d in dim: new_dims.remove(d) else: @@ -1094,7 +1093,6 @@ def quantile(self, q, dim=None, interpolation='linear'): qs = np.nanpercentile(self.data, q * 100., axis=axis, interpolation=interpolation) - return Variable(new_dims, qs) @property diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 54f6cb273a9..94c33ee7844 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -19,14 +19,13 @@ import xarray as xr import pytest -from xarray import (align, broadcast, concat, merge, conventions, backends, - Dataset, DataArray, Variable, IndexVariable, auto_combine, - open_dataset, set_options, MergeError) +from xarray import (align, broadcast, backends, Dataset, DataArray, Variable, + IndexVariable, open_dataset, set_options, MergeError) from xarray.core import indexing, utils from xarray.core.pycompat import iteritems, OrderedDict, unicode_type from xarray.core.common import full_like -from . import (TestCase, unittest, InaccessibleArray, UnexpectedDataAccess, +from . import (TestCase, InaccessibleArray, UnexpectedDataAccess, requires_dask, source_ndarray) @@ -2787,6 +2786,23 @@ def mean_only_one_axis(x, axis): with self.assertRaisesRegexp(TypeError, 'non-integer axis'): ds.reduce(mean_only_one_axis, ['x', 'y']) + def test_quantile(self): + + ds = create_test_data(seed=123) + + for q in [0.25, [0.50], [0.25, 0.75]]: + for dim in [None, 'dim1', ['dim1']]: + ds_quantile = ds.quantile(q, dim=dim) + assert 'quantile' in ds_quantile + for var, dar in ds.data_vars.items(): + assert var in ds_quantile + self.assertDataArrayIdentical( + ds_quantile[var], dar.quantile(q, dim=dim)) + dim = ['dim1', 'dim2'] + ds_quantile = ds.quantile(q, dim=dim) + assert 'dim3' in ds_quantile.dims + assert all(d not in ds_quantile.dims for d in dim) + def test_count(self): ds = Dataset({'x': ('a', [np.nan, 1]), 'y': 0, 'z': np.nan}) expected = Dataset({'x': 1, 'y': 1, 'z': 0}) diff --git a/xarray/test/test_variable.py b/xarray/test/test_variable.py index e99e77abf99..47b158fce94 100644 --- a/xarray/test/test_variable.py +++ b/xarray/test/test_variable.py @@ -11,7 +11,7 @@ import pytz import pandas as pd -from xarray import Variable, IndexVariable, Coordinate, Dataset, DataArray +from xarray import Variable, IndexVariable, Coordinate, Dataset from xarray.core import indexing from xarray.core.variable import as_variable, as_compatible_data from xarray.core.indexing import PandasIndexAdapter, LazilyIndexedArray @@ -980,6 +980,17 @@ def test_reduce(self): with self.assertRaisesRegexp(ValueError, 'cannot supply both'): v.mean(dim='x', axis=0) + def test_quantile(self): + v = Variable(['x', 'y'], self.d) + for q in [0.25, [0.50], [0.25, 0.75]]: + for axis, dim in zip([None, 0, [0], [0, 1]], + [None, 'x', ['x'], ['x', 'y']]): + actual = v.quantile(q, dim=dim) + + expected = np.nanpercentile(self.d, np.array(q) * 100, + axis=axis) + np.testing.assert_allclose(actual.values, expected) + def test_big_endian_reduce(self): # regression test for GH489 data = np.ones(5, dtype='>f4') From fa802912513b5a4293ebf2702a8c41f671bbf5cb Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Sat, 21 Jan 2017 10:12:45 -0800 Subject: [PATCH 07/11] skip quantile tests when numpy version is less than 1.10 --- xarray/test/test_dataarray.py | 4 ++++ xarray/test/test_dataset.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index 8a830d724ca..35293e1e05d 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -7,6 +7,7 @@ import pytest from copy import deepcopy from textwrap import dedent +from distutils.version import StrictVersion import xarray as xr @@ -1328,6 +1329,9 @@ def test_reduce(self): expected = DataArray(5, {'c': -999}) self.assertDataArrayIdentical(expected, actual) + @pytest.mark.skipif(StrictVersion(np.__version__) < StrictVersion('1.10.0'), + reason='requires numpy version 1.10.0 or later') + # skip due to bug in older versions of numpy.nanpercentile def test_quantile(self): for q in [0.25, [0.50], [0.25, 0.75]]: for axis, dim in zip([None, 0, [0], [0, 1]], diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 94c33ee7844..c977f039cdd 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -13,6 +13,7 @@ except ImportError: pass from io import StringIO +from distutils.version import StrictVersion import numpy as np import pandas as pd @@ -2786,6 +2787,8 @@ def mean_only_one_axis(x, axis): with self.assertRaisesRegexp(TypeError, 'non-integer axis'): ds.reduce(mean_only_one_axis, ['x', 'y']) + @pytest.mark.skipif(StrictVersion(np.__version__) < StrictVersion('1.10.0'), + reason='requires numpy version 1.10.0 or later') def test_quantile(self): ds = create_test_data(seed=123) From c2d31fd56eecace10b103a147b0408fb856c542f Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Sat, 21 Jan 2017 10:18:53 -0800 Subject: [PATCH 08/11] require numpy version 1.10 or later for quantile --- xarray/core/variable.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index c9e284dd740..dbd60b54549 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -5,6 +5,7 @@ from collections import defaultdict import functools import itertools +from distutils.version import LooseVersion import numpy as np import pandas as pd @@ -1072,6 +1073,9 @@ def quantile(self, q, dim=None, interpolation='linear'): TypeError("quantile does not work for arrays stored as dask " "arrays. Load the data via .compute() or .load() prior " "to calling this method.") + if LooseVersion(np.__version__) < LooseVersion('1.10.0'): + raise NotImplementedError( + 'quantile requres numpy version 1.10.0 or later') q = np.asarray(q, dtype=np.float64) From 1f3a9907b91c7208a308f7920e1296a05063f239 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Sat, 21 Jan 2017 10:37:34 -0800 Subject: [PATCH 09/11] use LooseVersion, skip on Variable --- xarray/test/test_dataarray.py | 4 ++-- xarray/test/test_dataset.py | 4 ++-- xarray/test/test_variable.py | 3 +++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index 35293e1e05d..4b760845905 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -7,7 +7,7 @@ import pytest from copy import deepcopy from textwrap import dedent -from distutils.version import StrictVersion +from distutils.version import LooseVersion import xarray as xr @@ -1329,7 +1329,7 @@ def test_reduce(self): expected = DataArray(5, {'c': -999}) self.assertDataArrayIdentical(expected, actual) - @pytest.mark.skipif(StrictVersion(np.__version__) < StrictVersion('1.10.0'), + @pytest.mark.skipif(LooseVersion(np.__version__) < LooseVersion('1.10.0'), reason='requires numpy version 1.10.0 or later') # skip due to bug in older versions of numpy.nanpercentile def test_quantile(self): diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index c977f039cdd..9cff66d30fd 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -13,7 +13,7 @@ except ImportError: pass from io import StringIO -from distutils.version import StrictVersion +from distutils.version import LooseVersion import numpy as np import pandas as pd @@ -2787,7 +2787,7 @@ def mean_only_one_axis(x, axis): with self.assertRaisesRegexp(TypeError, 'non-integer axis'): ds.reduce(mean_only_one_axis, ['x', 'y']) - @pytest.mark.skipif(StrictVersion(np.__version__) < StrictVersion('1.10.0'), + @pytest.mark.skipif(LooseVersion(np.__version__) < LooseVersion('1.10.0'), reason='requires numpy version 1.10.0 or later') def test_quantile(self): diff --git a/xarray/test/test_variable.py b/xarray/test/test_variable.py index 47b158fce94..8cfa5681276 100644 --- a/xarray/test/test_variable.py +++ b/xarray/test/test_variable.py @@ -5,6 +5,7 @@ from copy import copy, deepcopy from datetime import datetime, timedelta from textwrap import dedent +import pytest from distutils.version import LooseVersion import numpy as np @@ -980,6 +981,8 @@ def test_reduce(self): with self.assertRaisesRegexp(ValueError, 'cannot supply both'): v.mean(dim='x', axis=0) + @pytest.mark.skipif(LooseVersion(np.__version__) < LooseVersion('1.10.0'), + reason='requires numpy version 1.10.0 or later') def test_quantile(self): v = Variable(['x', 'y'], self.d) for q in [0.25, [0.50], [0.25, 0.75]]: From f6507a16d83e837ff9bd3b9ab2a9dd4db6fddcaf Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Sun, 22 Jan 2017 17:47:58 -0800 Subject: [PATCH 10/11] update doc strings and pass keep_attrs from dataarray in quantile method --- xarray/core/dataarray.py | 11 +++++++---- xarray/core/dataset.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4648f309807..ebc1c149423 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -19,8 +19,7 @@ from .coordinates import (DataArrayCoordinates, LevelCoordinatesSource, Indexes) from .dataset import Dataset, merge_indexes, split_indexes -from .pycompat import (iteritems, basestring, OrderedDict, zip, range, - dask_array_type) +from .pycompat import iteritems, basestring, OrderedDict, zip, range from .variable import (as_variable, Variable, as_compatible_data, IndexVariable, assert_unique_multiindex_level_names) @@ -1737,7 +1736,7 @@ def dot(self, other): return type(self)(new_data, new_coords, new_dims) - def quantile(self, q, dim=None, interpolation='linear'): + def quantile(self, q, dim=None, interpolation='linear', keep_attrs=False): """Compute the qth quantile of the data along the specified dimension. Returns the qth quantiles(s) of the array elements. @@ -1760,6 +1759,10 @@ def quantile(self, q, dim=None, interpolation='linear'): * higher: ``j``. * nearest: ``i`` or ``j``, whichever is nearest. * midpoint: ``(i + j) / 2``. + keep_attrs : bool, optional + If True, the dataset's attributes (`attrs`) will be copied from + the original object to the new one. If False (default), the new + object will be returned without attributes. Returns ------- @@ -1775,7 +1778,7 @@ def quantile(self, q, dim=None, interpolation='linear'): np.nanpercentile, pd.Series.quantile, xr.Dataset.quantile """ - ds = self._to_temp_dataset().quantile(q, dim=dim, + ds = self._to_temp_dataset().quantile(q, dim=dim, keep_attrs=keep_attrs, interpolation=interpolation) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3f6527f0269..c3da98944b1 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2546,8 +2546,8 @@ def roll(self, **shifts): return self._replace_vars_and_dims(variables) - def quantile(self, q, dim=None, numeric_only=False, keep_attrs=False, - interpolation='linear'): + def quantile(self, q, dim=None, interpolation='linear', + numeric_only=False, keep_attrs=False): """Compute the qth quantile of the data along the specified dimension. Returns the qth quantiles(s) of the array elements for each variable @@ -2571,6 +2571,12 @@ def quantile(self, q, dim=None, numeric_only=False, keep_attrs=False, * higher: ``j``. * nearest: ``i`` or ``j``, whichever is nearest. * midpoint: ``(i + j) / 2``. + keep_attrs : bool, optional + If True, the dataset's attributes (`attrs`) will be copied from + the original object to the new one. If False (default), the new + object will be returned without attributes. + numeric_only : bool, optional + If True, only apply ``func`` to variables with a numeric dtype. Returns ------- From d8ba5694351b6948dee89748a8b4a38dfaa7f2bd Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Mon, 23 Jan 2017 10:13:40 -0800 Subject: [PATCH 11/11] add Dataset.quantile to docstring --- doc/api.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/api.rst b/doc/api.rst index 1a86b82589e..7fc18ae3bcc 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -122,6 +122,7 @@ Computation Dataset.groupby_bins Dataset.resample Dataset.diff + Dataset.quantile **Aggregation**: :py:attr:`~Dataset.all`