From 9d87ce3617d3c7858afc8a03e258ebdf18447549 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 15 Oct 2014 18:20:00 -0700 Subject: [PATCH 01/11] Add Dataset.dropna() method --- xray/core/dataset.py | 43 +++++++++++++++++++++++++++++++++++++++ xray/test/test_dataset.py | 32 +++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/xray/core/dataset.py b/xray/core/dataset.py index 13cce5b2794..57853860c45 100644 --- a/xray/core/dataset.py +++ b/xray/core/dataset.py @@ -1247,6 +1247,49 @@ def squeeze(self, dim=None): """ return common.squeeze(self, self.dims, dim) + def dropna(self, dim, how='any', vars=None): + """Returns a new dataset with dropped labels for missing values along + the provided dimension. + + Parameters + ---------- + dim : str + Dimension along which to drop missing values. Dropping along + multiple dimensions simultaneously is not yet supported. + how : {'any',}, optional + How to choose values to drop. The only currently supported choice + is 'any'. + vars : sequence, optional + Which variables to check for missing values. By default, all + variables in the dataset are checked. + + Returns + ------- + Dataset + """ + # TODO: consider supporting multiple dimensions? Or not, given that + # there are some ugly edge cases, e.g., pandas's dropna differs + # depending on the order of the supplied axes. + # TODO: support the thresh argument? + + if dim not in self.dims: + raise ValueError('%s must be a single dataset dimension' % dim) + + if how != 'any': + raise NotImplementedError("how only implemented for 'any'") + + if vars is None: + vars = list(self.vars) + + drop = np.zeros(self.dims[dim], dtype=bool) + for k in vars: + array = self._arrays[k] + if dim in array.dims: + dims = [d for d in array.dims if d != dim] + drop |= array.isnull().any(dims) + + return self.isel(**{dim: ~drop}) + def reduce(self, func, dim=None, keep_attrs=False, **kwargs): """Reduce this dataset by applying `func` along some dimension(s). 
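The implementation above builds a boolean drop mask over the labels of ``dim`` by OR-ing together ``isnull().any()`` reduced over every other dimension of each checked variable, and then keeps the complement of that mask with ``isel``. A minimal standalone sketch of the same masking idea in plain numpy (the toy array below is illustrative only, not part of the patch):

    import numpy as np

    x = np.array([[0.0, 1.0], [np.nan, 2.0], [3.0, 4.0]])  # shape (3, 2), dims ('a', 'b')

    # one flag per 'a' label: True if any value along 'b' is missing
    drop = np.isnan(x).any(axis=1)

    # keep only the labels with no missing values, analogous to ds.isel(a=~drop)
    kept = x[~drop]
    print(kept)  # [[0. 1.] [3. 4.]]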
diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py index 1531620e6fb..54c6a2419d4 100644 --- a/xray/test/test_dataset.py +++ b/xray/test/test_dataset.py @@ -1032,6 +1032,38 @@ def test_lazy_load(self): ds.isel(time=10) ds.isel(time=slice(10), dim1=[0]).isel(dim1=0, dim2=-1) + def test_dropna(self): + x = np.random.randn(4, 4) + x[::2, 0] = np.nan + y = np.random.randn(4) + y[-1] = np.nan + ds = Dataset({'foo': (('a', 'b'), x), 'bar': (('b', y))}) + + expected = ds.isel(a=slice(1, None, 2)) + actual = ds.dropna('a') + self.assertDatasetIdentical(actual, expected) + + expected = ds.isel(b=slice(1, 3)) + actual = ds.dropna('b') + self.assertDatasetIdentical(actual, expected) + + actual = ds.dropna('b', vars=['foo', 'bar']) + self.assertDatasetIdentical(actual, expected) + + expected = ds.isel(b=slice(1, None)) + actual = ds.dropna('b', vars=['foo']) + self.assertDatasetIdentical(actual, expected) + + expected = ds.isel(b=slice(3)) + actual = ds.dropna('b', vars=['bar']) + self.assertDatasetIdentical(actual, expected) + + actual = ds.dropna('a', vars=[]) + self.assertDatasetIdentical(actual, ds) + + actual = ds.dropna('a', vars=['bar']) + self.assertDatasetIdentical(actual, ds) + def test_reduce(self): data = create_test_data() From 7f1c36dc49c4561cbb3ab10115d0d61ddd1640cb Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 15 Oct 2014 18:28:44 -0700 Subject: [PATCH 02/11] Add DataArray.dropna() Fixes #58 --- xray/core/dataarray.py | 20 ++++++++++++++++++++ xray/test/test_dataarray.py | 10 ++++++++++ 2 files changed, 30 insertions(+) diff --git a/xray/core/dataarray.py b/xray/core/dataarray.py index 3db11ba49a8..a1e382d703b 100644 --- a/xray/core/dataarray.py +++ b/xray/core/dataarray.py @@ -669,6 +669,26 @@ def squeeze(self, dim=None): ds = self._dataset.squeeze(dim) return ds[self.name] + def dropna(self, dim, how='any'): + """Returns a new array with dropped labels for missing values along + the provided dimension. + + Parameters + ---------- + dim : str + Dimension along which to drop missing values. Dropping along + multiple dimensions simultaneously is not yet supported. + how : {'any',}, optional + How to choose values to drop. The only currently supported choice + is 'any'. + + Returns + ------- + DataArray + """ + ds = self._dataset.dropna(dim, how=how) + return ds[self.name] + def reduce(self, func, dim=None, axis=None, keep_attrs=False, **kwargs): """Reduce this array by applying `func` along some dimension(s). 
diff --git a/xray/test/test_dataarray.py b/xray/test/test_dataarray.py index f64920942e0..e84cf7d31a8 100644 --- a/xray/test/test_dataarray.py +++ b/xray/test/test_dataarray.py @@ -656,6 +656,16 @@ def test_transpose(self): def test_squeeze(self): self.assertVariableEqual(self.dv.variable.squeeze(), self.dv.squeeze()) + def test_dropna(self): + x = np.random.randn(4, 4) + x[::2, 0] = np.nan + arr = DataArray(x, dims=['a', 'b']) + + actual = arr.dropna('a') + expected = arr[1::2] + + self.assertDataArrayIdentical(actual, expected) + def test_reduce(self): coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'], 'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]), From 4a0979cd275fcaf4b7927eb1f4cd3137f0fe736f Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 15 Oct 2014 18:42:09 -0700 Subject: [PATCH 03/11] speedup internal creation of dataarrays from datasets --- xray/core/dataarray.py | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/xray/core/dataarray.py b/xray/core/dataarray.py index a1e382d703b..79ef46fa922 100644 --- a/xray/core/dataarray.py +++ b/xray/core/dataarray.py @@ -195,6 +195,19 @@ def _new_from_dataset(cls, dataset, name): obj._dataset._coord_names.discard(name) return obj + @classmethod + def _new_from_dataset_no_copy(cls, dataset, name): + obj = object.__new__(cls) + obj._dataset = dataset + obj._name = name + return obj + + def _with_replaced_dataset(self, dataset): + obj = object.__new__(type(self)) + obj._name = self.name + obj._dataset = dataset + return obj + @property def dataset(self): """The dataset with which this DataArray is associated. @@ -439,7 +452,7 @@ def copy(self, deep=True): array's dataset is also a variable in this array's dataset. """ ds = self._dataset.copy(deep=deep) - return ds[self.name] + return self._with_replaced_dataset(ds) def __copy__(self): return self.copy(deep=False) @@ -462,7 +475,7 @@ def isel(self, **indexers): DataArray.sel """ ds = self._dataset.isel(**indexers) - return ds[self.name] + return self._with_replaced_dataset(ds) indexed = utils.function_alias(isel, 'indexed') @@ -538,7 +551,7 @@ def reindex(self, copy=True, **indexers): align """ ds = self._dataset.reindex(copy=copy, **indexers) - return ds[self.name] + return self._with_replaced_dataset(ds) def rename(self, new_name_or_name_dict): """Returns a new DataArray with renamed coordinates and/or a new name. @@ -573,7 +586,7 @@ def select_vars(self, *names): FutureWarning, stacklevel=2) names = names + (self.name,) ds = self._dataset.select_vars(*names) - return ds[self.name] + return self._with_replaced_dataset(ds) select = utils.function_alias(select_vars, 'select') @@ -639,7 +652,7 @@ def transpose(self, *dims): """ ds = self._dataset.copy() ds[self.name] = self.variable.transpose(*dims) - return ds[self.name] + return self._with_replaced_dataset(ds) def squeeze(self, dim=None): """Return a new DataArray object with squeezed data. @@ -667,7 +680,7 @@ def squeeze(self, dim=None): numpy.squeeze """ ds = self._dataset.squeeze(dim) - return ds[self.name] + return self._with_replaced_dataset(ds) def dropna(self, dim, how='any'): """Returns a new array with dropped labels for missing values along @@ -687,7 +700,7 @@ def dropna(self, dim, how='any'): DataArray """ ds = self._dataset.dropna(dim, how=how) - return ds[self.name] + return self._with_replaced_dataset(ds) def reduce(self, func, dim=None, axis=None, keep_attrs=False, **kwargs): """Reduce this array by applying `func` along some dimension(s). 
@@ -730,7 +743,7 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=False, **kwargs): ds = self._dataset.drop_vars(*drop) ds[self.name] = var - return ds[self.name] + return self._with_replaced_dataset(ds) @classmethod def concat(cls, *args, **kwargs): @@ -760,7 +773,7 @@ def _concat(cls, arrays, dim='concat_dim', indexers=None, concat_over = set(concat_over) | set([name]) ds = Dataset._concat(datasets, dim, indexers, concat_over=concat_over) - return ds[name] + return cls._new_from_dataset_no_copy(ds, name) def to_dataframe(self): """Convert this array into a pandas.DataFrame. @@ -794,7 +807,7 @@ def from_series(cls, series): # TODO: add a 'name' parameter df = pd.DataFrame({series.name: series}) ds = Dataset.from_dataframe(df) - return ds[series.name] + return cls._new_from_dataset_no_copy(ds, series.name) def _all_compat(self, other, compat_str): """Helper function for equals and identical""" @@ -850,7 +863,7 @@ def __array_wrap__(self, obj, context=None): ds = self.coords.to_dataset() name = self._result_name() ds[name] = new_var - return ds[name] + return self._new_from_dataset_no_copy(ds, name) @staticmethod def _unary_op(f): @@ -872,7 +885,7 @@ def func(self, other): ds[name] = (f(self.variable, other_variable) if not reflexive else f(other_variable, self.variable)) - return ds[name] + return self._new_from_dataset_no_copy(ds, name) return func @staticmethod From e67a1c1e693aa0b415818b0a22241e4e8c2b2fcf Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 15 Oct 2014 18:48:09 -0700 Subject: [PATCH 04/11] Document dropna --- doc/api-hidden.rst | 6 ++++-- doc/api.rst | 2 ++ doc/computation.rst | 12 ++++++++++-- doc/whats-new.rst | 6 ++++++ 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index 961c3313d89..9491c750da8 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -19,9 +19,10 @@ Dataset.std Dataset.var - Dataset.count Dataset.isnull Dataset.notnull + Dataset.count + Dataset.dropna Dataset.argsort Dataset.clip @@ -49,9 +50,10 @@ DataArray.std DataArray.var - DataArray.count DataArray.isnull DataArray.notnull + DataArray.count + DataArray.dropna DataArray.argsort DataArray.clip diff --git a/doc/api.rst b/doc/api.rst index f44e75e621a..cbffa2b35ae 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -118,6 +118,7 @@ Computation :py:attr:`~Dataset.isnull` :py:attr:`~Dataset.notnull` :py:attr:`~Dataset.count` +:py:attr:`~Dataset.dropna` **ndarray methods**: :py:attr:`~Dataset.argsort` @@ -238,6 +239,7 @@ Computation :py:attr:`~DataArray.isnull` :py:attr:`~DataArray.notnull` :py:attr:`~DataArray.count` +:py:attr:`~DataArray.dropna` **ndarray methods**: :py:attr:`~DataArray.argsort` diff --git a/doc/computation.rst b/doc/computation.rst index 3e7176c6b36..7a551baf480 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -46,13 +46,21 @@ Data arrays also implement many :py:class:`numpy.ndarray` methods: arr.round(2) arr.T -It also has the ``count``, ``isnull`` and ``notnull`` methods from pandas: +Missing values +============== + +xray objects borrow the :py:meth:`~xray.DataArray.isnull`, +:py:meth:`~xray.DataArray.notnull`, :py:meth:`~xray.DataArray.count` and +:py:meth:`~xray.DataArray.dropna` methods for working with missing data from +pandas: .. 
ipython:: python - x = xray.DataArray([0, 1, np.nan, np.nan, 2]) + x = xray.DataArray([0, 1, np.nan, np.nan, 2], dims=['x']) x.isnull() + x.notnull() x.count() + x.dropna(dim='x') Aggregation =========== diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d031cfb4426..de90195f4a3 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -1,6 +1,12 @@ What's New ========== +v0.3.1 (not yet released) +-------------------------- + +- Added :py:math:`~xray.Dataset.count` and :py:math:`~xray.Dataset.dropna` + methods for dealing with missing values. + v0.3.0 (21 September 2014) -------------------------- From 9ba7da935a07d093e2a45f05e6fbe8eef620f451 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 15 Oct 2014 19:31:34 -0700 Subject: [PATCH 05/11] change the dropna argument vars to subset --- xray/core/dataset.py | 14 +++++++------- xray/test/test_dataset.py | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/xray/core/dataset.py b/xray/core/dataset.py index 57853860c45..fde5adb643e 100644 --- a/xray/core/dataset.py +++ b/xray/core/dataset.py @@ -1247,7 +1247,7 @@ def squeeze(self, dim=None): """ return common.squeeze(self, self.dims, dim) - def dropna(self, dim, how='any', vars=None): + def dropna(self, dim, how='any', subset=None): """Returns a new dataset with dropped labels for missing values along the provided dimension. @@ -1259,8 +1259,8 @@ def dropna(self, dim, how='any', vars=None): how : {'any',}, optional How to choose values to drop. The only currently supported choice is 'any'. - vars : sequence, optional - Which variables to check for missing values. By default, all + subset : sequence, optional + Subset of variables to check for missing values. By default, all variables in the dataset are checked. Returns @@ -1270,7 +1270,7 @@ def dropna(self, dim, how='any', vars=None): # TODO: consider supporting multiple dimensions? Or not, given that # there are some ugly edge cases, e.g., pandas's dropna differs # depending on the order of the supplied axes. - # TODO: support the thresh argument? 
+ # TODO: support how='all' and the thresh argument if dim not in self.dims: raise ValueError('%s must be a single dataset dimension' % dim) @@ -1278,11 +1278,11 @@ def dropna(self, dim, how='any', vars=None): if how != 'any': raise NotImplementedError("how only implemented for 'any'") - if vars is None: - vars = list(self.vars) + if subset is None: + subset = list(self.vars) drop = np.zeros(self.dims[dim], dtype=bool) - for k in vars: + for k in subset: array = self._arrays[k] if dim in array.dims: dims = [d for d in array.dims if d != dim] diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py index 54c6a2419d4..198b75f66a6 100644 --- a/xray/test/test_dataset.py +++ b/xray/test/test_dataset.py @@ -1047,21 +1047,21 @@ def test_dropna(self): actual = ds.dropna('b') self.assertDatasetIdentical(actual, expected) - actual = ds.dropna('b', vars=['foo', 'bar']) + actual = ds.dropna('b', subset=['foo', 'bar']) self.assertDatasetIdentical(actual, expected) expected = ds.isel(b=slice(1, None)) - actual = ds.dropna('b', vars=['foo']) + actual = ds.dropna('b', subset=['foo']) self.assertDatasetIdentical(actual, expected) expected = ds.isel(b=slice(3)) - actual = ds.dropna('b', vars=['bar']) + actual = ds.dropna('b', subset=['bar']) self.assertDatasetIdentical(actual, expected) - actual = ds.dropna('a', vars=[]) + actual = ds.dropna('a', subset=[]) self.assertDatasetIdentical(actual, ds) - actual = ds.dropna('a', vars=['bar']) + actual = ds.dropna('a', subset=['bar']) self.assertDatasetIdentical(actual, ds) def test_reduce(self): From 5b29f46b0ef2253cb45b5fd1a03f9b59686d794f Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 15 Oct 2014 21:00:22 -0700 Subject: [PATCH 06/11] work around numpy crash with size 0 arrays here's the numpy issue: https://github.com/numpy/numpy/issues/5195 --- xray/core/formatting.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/xray/core/formatting.py b/xray/core/formatting.py index a6e5737328d..96716d85799 100644 --- a/xray/core/formatting.py +++ b/xray/core/formatting.py @@ -46,6 +46,11 @@ def first_n_items(x, n_desired): # get them in a single call to __getitem__ using only slices. if n_desired < 1: raise ValueError('must request at least one item') + + if x.size == 0: + # work around for https://github.com/numpy/numpy/issues/5195 + return [] + if n_desired < x.size: indexer = _get_indexer_at_least_n_items(x.shape, n_desired) x = x[indexer] From f7340b682410ae7e30865923090895419c078eeb Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 15 Oct 2014 23:52:24 -0700 Subject: [PATCH 07/11] Copy dropna logic from pandas.DataFrame --- xray/core/dataset.py | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/xray/core/dataset.py b/xray/core/dataset.py index fde5adb643e..bae0a73f12f 100644 --- a/xray/core/dataset.py +++ b/xray/core/dataset.py @@ -1247,7 +1247,7 @@ def squeeze(self, dim=None): """ return common.squeeze(self, self.dims, dim) - def dropna(self, dim, how='any', subset=None): + def dropna(self, dim, how='any', thresh=None, subset=None): """Returns a new dataset with dropped labels for missing values along the provided dimension. @@ -1256,9 +1256,11 @@ def dropna(self, dim, how='any', subset=None): dim : str Dimension along which to drop missing values. Dropping along multiple dimensions simultaneously is not yet supported. - how : {'any',}, optional - How to choose values to drop. The only currently supported choice - is 'any'. 
+ how : {'any', 'all'}, optional + * any : if any NA values are present, drop that label + * all : if all values are NA, drop that label + thresh : int, default None + If supplied, require this many non-NA values. subset : sequence, optional Subset of variables to check for missing values. By default, all variables in the dataset are checked. @@ -1270,25 +1272,35 @@ def dropna(self, dim, how='any', subset=None): # TODO: consider supporting multiple dimensions? Or not, given that # there are some ugly edge cases, e.g., pandas's dropna differs # depending on the order of the supplied axes. - # TODO: support how='all' and the thresh argument if dim not in self.dims: raise ValueError('%s must be a single dataset dimension' % dim) - if how != 'any': - raise NotImplementedError("how only implemented for 'any'") - if subset is None: subset = list(self.vars) - drop = np.zeros(self.dims[dim], dtype=bool) + count = np.zeros(self.dims[dim], dtype=int) + size = 0 + for k in subset: array = self._arrays[k] if dim in array.dims: dims = [d for d in array.dims if d != dim] - drop |= array.isnull().any(dims) + count += array.count(dims) + size += np.prod([self.dims[d] for d in dims]) + + if thresh is not None: + mask = count >= thresh + elif how == 'any': + mask = count == size + elif how == 'all': + mask = count > 0 + elif how is not None: + raise ValueError('invalid how option: %s' % how) + else: + raise TypeError('must specify how or thresh') - return self.isel(**{dim: ~drop}) + return self.isel(**{dim: mask}) def reduce(self, func, dim=None, keep_attrs=False, **kwargs): """Reduce this dataset by applying `func` along some dimension(s). From f121356564b9e29df548ec270bc5f2bb9a4d10da Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 15 Oct 2014 23:59:51 -0700 Subject: [PATCH 08/11] Add pandas license --- LICENSES/PANDAS_LICENSE | 36 ++++++++++++++++++++++++++++++++++++ README.rst | 3 +++ 2 files changed, 39 insertions(+) create mode 100644 LICENSES/PANDAS_LICENSE diff --git a/LICENSES/PANDAS_LICENSE b/LICENSES/PANDAS_LICENSE new file mode 100644 index 00000000000..8026eb45edc --- /dev/null +++ b/LICENSES/PANDAS_LICENSE @@ -0,0 +1,36 @@ +pandas license +============== + +Copyright (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team +All rights reserved. + +Copyright (c) 2008-2011 AQR Capital Management, LLC +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the copyright holder nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.rst b/README.rst index 2725aaff402..90704e06f1b 100644 --- a/README.rst +++ b/README.rst @@ -104,3 +104,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +xray includes portions of pandas. The license for pandas is included in the +LICENSES directory. From adb1f41cf8abf76da5b65a784c5c34aa69c6f8db Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 16 Oct 2014 00:09:40 -0700 Subject: [PATCH 09/11] Add unit tests for Dataset.dropna how="all" and thresh --- xray/test/test_dataset.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py index 198b75f66a6..6de93c46939 100644 --- a/xray/test/test_dataset.py +++ b/xray/test/test_dataset.py @@ -1064,6 +1064,27 @@ def test_dropna(self): actual = ds.dropna('a', subset=['bar']) self.assertDatasetIdentical(actual, ds) + actual = ds.dropna('a', how='all') + self.assertDatasetIdentical(actual, ds) + + actual = ds.dropna('b', how='all', subset=['bar']) + expected = ds.isel(b=[0, 1, 2]) + self.assertDatasetIdentical(actual, expected) + + actual = ds.dropna('b', thresh=1, subset=['bar']) + self.assertDatasetIdentical(actual, expected) + + actual = ds.dropna('b', thresh=2) + self.assertDatasetIdentical(actual, ds) + + actual = ds.dropna('b', thresh=4) + expected = ds.isel(b=[1, 2, 3]) + self.assertDatasetIdentical(actual, expected) + + actual = ds.dropna('a', thresh=3) + expected = ds.isel(a=[1, 3]) + self.assertDatasetIdentical(actual, ds) + def test_reduce(self): data = create_test_data() From bc8d4c345b8f89103c62fb9db805fde0331f241e Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 16 Oct 2014 00:12:14 -0700 Subject: [PATCH 10/11] Add how=all and thresh options to DataArray.dropna --- xray/core/dataarray.py | 12 +++++++----- xray/test/test_dataarray.py | 9 +++++++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/xray/core/dataarray.py b/xray/core/dataarray.py index 79ef46fa922..10ceedd3509 100644 --- a/xray/core/dataarray.py +++ b/xray/core/dataarray.py @@ -682,7 +682,7 @@ def squeeze(self, dim=None): ds = self._dataset.squeeze(dim) return self._with_replaced_dataset(ds) - def dropna(self, dim, how='any'): + def dropna(self, dim, how='any', thresh=None): """Returns a new array with dropped labels for missing values along the provided dimension. @@ -691,15 +691,17 @@ def dropna(self, dim, how='any'): dim : str Dimension along which to drop missing values. Dropping along multiple dimensions simultaneously is not yet supported. - how : {'any',}, optional - How to choose values to drop. The only currently supported choice - is 'any'. 
+ how : {'any', 'all'}, optional + * any : if any NA values are present, drop that label + * all : if all values are NA, drop that label + thresh : int, default None + If supplied, require this many non-NA values. Returns ------- DataArray """ - ds = self._dataset.dropna(dim, how=how) + ds = self._dataset.dropna(dim, how=how, thresh=thresh) return self._with_replaced_dataset(ds) def reduce(self, func, dim=None, axis=None, keep_attrs=False, **kwargs): diff --git a/xray/test/test_dataarray.py b/xray/test/test_dataarray.py index e84cf7d31a8..68cb8b3b5fe 100644 --- a/xray/test/test_dataarray.py +++ b/xray/test/test_dataarray.py @@ -663,7 +663,16 @@ def test_dropna(self): actual = arr.dropna('a') expected = arr[1::2] + self.assertDataArrayIdentical(actual, expected) + + actual = arr.dropna('b', how='all') + self.assertDataArrayIdentical(actual, arr) + + actual = arr.dropna('a', thresh=1) + self.assertDataArrayIdentical(actual, arr) + actual = arr.dropna('b', thresh=3) + expected = arr[:, 1:] self.assertDataArrayIdentical(actual, expected) def test_reduce(self): From 1dd23afcebc91671711637146bf20eb3a2f9cdc7 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 16 Oct 2014 00:15:12 -0700 Subject: [PATCH 11/11] Validate exceptions in dropna --- xray/test/test_dataset.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py index 6de93c46939..b70f5d6a1bc 100644 --- a/xray/test/test_dataset.py +++ b/xray/test/test_dataset.py @@ -1085,6 +1085,13 @@ def test_dropna(self): expected = ds.isel(a=[1, 3]) self.assertDatasetIdentical(actual, ds) + with self.assertRaisesRegexp(ValueError, 'a single dataset dimension'): + ds.dropna('foo') + with self.assertRaisesRegexp(ValueError, 'invalid how'): + ds.dropna('a', how='somehow') + with self.assertRaisesRegexp(TypeError, 'must specify how or thresh'): + ds.dropna('a', how=None) + def test_reduce(self): data = create_test_data()
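Taken together, the series leaves ``Dataset.dropna(dim, how='any', thresh=None, subset=None)`` and ``DataArray.dropna(dim, how='any', thresh=None)``. A rough usage sketch of the resulting API (the small dataset below is made up for illustration):

    import numpy as np
    import xray  # the package later renamed to xarray

    x = np.random.randn(4, 3)
    x[::2, 0] = np.nan
    ds = xray.Dataset({'foo': (('a', 'b'), x)})

    ds.dropna('a')                  # drop 'a' labels containing any missing value
    ds.dropna('a', how='all')       # drop only labels whose values are all missing
    ds.dropna('a', thresh=2)        # keep labels with at least 2 non-missing values
    ds.dropna('a', subset=['foo'])  # only consider 'foo' when deciding what to drop

    da = ds['foo']
    da.dropna('b', how='any')       # DataArray.dropna forwards to Dataset.dropna

Per the final docstrings, ``how='any'`` drops a label if any NA values are present, ``how='all'`` drops it only if every value is NA, and ``thresh`` overrides ``how`` by requiring a minimum count of non-NA values.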