Add dropna method to Dataset and DataArray #252

Merged
merged 11 commits into from
Oct 17, 2014
36 changes: 36 additions & 0 deletions LICENSES/PANDAS_LICENSE
@@ -0,0 +1,36 @@
pandas license
==============

Copyright (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team
All rights reserved.

Copyright (c) 2008-2011 AQR Capital Management, LLC
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.

* Neither the name of the copyright holder nor the names of any
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3 changes: 3 additions & 0 deletions README.rst
@@ -104,3 +104,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

xray includes portions of pandas. The license for pandas is included in the
LICENSES directory.
6 changes: 4 additions & 2 deletions doc/api-hidden.rst
@@ -19,9 +19,10 @@
Dataset.std
Dataset.var

Dataset.count
Dataset.isnull
Dataset.notnull
Dataset.count
Dataset.dropna

Dataset.argsort
Dataset.clip
@@ -49,9 +50,10 @@
DataArray.std
DataArray.var

DataArray.count
DataArray.isnull
DataArray.notnull
DataArray.count
DataArray.dropna

DataArray.argsort
DataArray.clip
2 changes: 2 additions & 0 deletions doc/api.rst
@@ -118,6 +118,7 @@ Computation
:py:attr:`~Dataset.isnull`
:py:attr:`~Dataset.notnull`
:py:attr:`~Dataset.count`
:py:attr:`~Dataset.dropna`

**ndarray methods**:
:py:attr:`~Dataset.argsort`
@@ -238,6 +239,7 @@ Computation
:py:attr:`~DataArray.isnull`
:py:attr:`~DataArray.notnull`
:py:attr:`~DataArray.count`
:py:attr:`~DataArray.dropna`

**ndarray methods**:
:py:attr:`~DataArray.argsort`
12 changes: 10 additions & 2 deletions doc/computation.rst
@@ -46,13 +46,21 @@ Data arrays also implement many :py:class:`numpy.ndarray` methods:
arr.round(2)
arr.T

It also has the ``count``, ``isnull`` and ``notnull`` methods from pandas:
Missing values
==============

xray objects borrow the :py:meth:`~xray.DataArray.isnull`,
:py:meth:`~xray.DataArray.notnull`, :py:meth:`~xray.DataArray.count` and
:py:meth:`~xray.DataArray.dropna` methods for working with missing data from
pandas:

.. ipython:: python

x = xray.DataArray([0, 1, np.nan, np.nan, 2])
x = xray.DataArray([0, 1, np.nan, np.nan, 2], dims=['x'])
x.isnull()
x.notnull()
x.count()
x.dropna(dim='x')

Aggregation
===========
6 changes: 6 additions & 0 deletions doc/whats-new.rst
@@ -1,6 +1,12 @@
What's New
==========

v0.3.1 (not yet released)
--------------------------

- Added :py:meth:`~xray.Dataset.count` and :py:meth:`~xray.Dataset.dropna`
methods for dealing with missing values.

v0.3.0 (21 September 2014)
--------------------------

57 changes: 46 additions & 11 deletions xray/core/dataarray.py
@@ -195,6 +195,19 @@ def _new_from_dataset(cls, dataset, name):
obj._dataset._coord_names.discard(name)
return obj

@classmethod
def _new_from_dataset_no_copy(cls, dataset, name):
obj = object.__new__(cls)
obj._dataset = dataset
obj._name = name
return obj

def _with_replaced_dataset(self, dataset):
obj = object.__new__(type(self))
obj._name = self.name
obj._dataset = dataset
return obj

@property
def dataset(self):
"""The dataset with which this DataArray is associated.
@@ -439,7 +452,7 @@ def copy(self, deep=True):
array's dataset is also a variable in this array's dataset.
"""
ds = self._dataset.copy(deep=deep)
return ds[self.name]
return self._with_replaced_dataset(ds)

def __copy__(self):
return self.copy(deep=False)
@@ -462,7 +475,7 @@ def isel(self, **indexers):
DataArray.sel
"""
ds = self._dataset.isel(**indexers)
return ds[self.name]
return self._with_replaced_dataset(ds)

indexed = utils.function_alias(isel, 'indexed')

@@ -538,7 +551,7 @@ def reindex(self, copy=True, **indexers):
align
"""
ds = self._dataset.reindex(copy=copy, **indexers)
return ds[self.name]
return self._with_replaced_dataset(ds)

def rename(self, new_name_or_name_dict):
"""Returns a new DataArray with renamed coordinates and/or a new name.
@@ -573,7 +586,7 @@ def select_vars(self, *names):
FutureWarning, stacklevel=2)
names = names + (self.name,)
ds = self._dataset.select_vars(*names)
return ds[self.name]
return self._with_replaced_dataset(ds)

select = utils.function_alias(select_vars, 'select')

@@ -639,7 +652,7 @@ def transpose(self, *dims):
"""
ds = self._dataset.copy()
ds[self.name] = self.variable.transpose(*dims)
return ds[self.name]
return self._with_replaced_dataset(ds)

def squeeze(self, dim=None):
"""Return a new DataArray object with squeezed data.
@@ -667,7 +680,7 @@ def squeeze(self, dim=None):
numpy.squeeze
"""
ds = self._dataset.squeeze(dim)
return ds[self.name]
return self._with_replaced_dataset(ds)

def dropna(self, dim, how='any', thresh=None):
"""Returns a new array with dropped labels for missing values along
the provided dimension.

Parameters
----------
dim : str
Dimension along which to drop missing values. Dropping along
multiple dimensions simultaneously is not yet supported.
how : {'any', 'all'}, optional
* any : if any NA values are present, drop that label
* all : if all values are NA, drop that label
thresh : int, default None
If supplied, require this many non-NA values.

Returns
-------
DataArray
"""
ds = self._dataset.dropna(dim, how=how, thresh=thresh)
return self._with_replaced_dataset(ds)

def reduce(self, func, dim=None, axis=None, keep_attrs=False, **kwargs):
"""Reduce this array by applying `func` along some dimension(s).
@@ -710,7 +745,7 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=False, **kwargs):
ds = self._dataset.drop_vars(*drop)
ds[self.name] = var

return ds[self.name]
return self._with_replaced_dataset(ds)

@classmethod
def concat(cls, *args, **kwargs):
@@ -740,7 +775,7 @@ def _concat(cls, arrays, dim='concat_dim', indexers=None,
concat_over = set(concat_over) | set([name])

ds = Dataset._concat(datasets, dim, indexers, concat_over=concat_over)
return ds[name]
return cls._new_from_dataset_no_copy(ds, name)

def to_dataframe(self):
"""Convert this array into a pandas.DataFrame.
@@ -774,7 +809,7 @@ def from_series(cls, series):
# TODO: add a 'name' parameter
df = pd.DataFrame({series.name: series})
ds = Dataset.from_dataframe(df)
return ds[series.name]
return cls._new_from_dataset_no_copy(ds, series.name)

def _all_compat(self, other, compat_str):
"""Helper function for equals and identical"""
@@ -830,7 +865,7 @@ def __array_wrap__(self, obj, context=None):
ds = self.coords.to_dataset()
name = self._result_name()
ds[name] = new_var
return ds[name]
return self._new_from_dataset_no_copy(ds, name)

@staticmethod
def _unary_op(f):
@@ -852,7 +887,7 @@ def func(self, other):
ds[name] = (f(self.variable, other_variable)
if not reflexive
else f(other_variable, self.variable))
return ds[name]
return self._new_from_dataset_no_copy(ds, name)
return func

@staticmethod
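
For reference, a minimal usage sketch of the new DataArray.dropna method, pieced together from the doc example and the tests added in this PR (the package was still named xray at the time; the comments describe the expected behaviour rather than verified output):

    import numpy as np
    import xray  # the package's name at the time of this PR

    # 1-D case: drop every label along 'x' whose value is NA
    x = xray.DataArray([0, 1, np.nan, np.nan, 2], dims=['x'])
    x.dropna(dim='x')           # keeps the three non-NA entries

    # 2-D case, mirroring the new test: NaN in column 0 of every other row
    data = np.random.randn(4, 4)
    data[::2, 0] = np.nan
    arr = xray.DataArray(data, dims=['a', 'b'])

    arr.dropna('a')             # drops rows 0 and 2 (each contains a NaN)
    arr.dropna('b', how='all')  # unchanged: no column is entirely NA
    arr.dropna('b', thresh=3)   # drops column 0 (only 2 non-NA values there)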
55 changes: 55 additions & 0 deletions xray/core/dataset.py
@@ -1247,6 +1247,61 @@ def squeeze(self, dim=None):
"""
return common.squeeze(self, self.dims, dim)

def dropna(self, dim, how='any', thresh=None, subset=None):
"""Returns a new dataset with dropped labels for missing values along
the provided dimension.

Parameters
----------
dim : str
Dimension along which to drop missing values. Dropping along
multiple dimensions simultaneously is not yet supported.
how : {'any', 'all'}, optional
* any : if any NA values are present, drop that label
* all : if all values are NA, drop that label
thresh : int, default None
If supplied, require this many non-NA values.
subset : sequence, optional
Subset of variables to check for missing values. By default, all
variables in the dataset are checked.

Returns
-------
Dataset
"""
# TODO: consider supporting multiple dimensions? Or not, given that
# there are some ugly edge cases, e.g., pandas's dropna differs
# depending on the order of the supplied axes.

if dim not in self.dims:
raise ValueError('%s must be a single dataset dimension' % dim)

if subset is None:
subset = list(self.vars)

count = np.zeros(self.dims[dim], dtype=int)
size = 0

for k in subset:
array = self._arrays[k]
if dim in array.dims:
dims = [d for d in array.dims if d != dim]
count += array.count(dims)
size += np.prod([self.dims[d] for d in dims])

if thresh is not None:
mask = count >= thresh
elif how == 'any':
mask = count == size
elif how == 'all':
mask = count > 0
elif how is not None:
raise ValueError('invalid how option: %s' % how)
else:
raise TypeError('must specify how or thresh')

return self.isel(**{dim: mask})

def reduce(self, func, dim=None, keep_attrs=False, **kwargs):
"""Reduce this dataset by applying `func` along some dimension(s).

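
The label-selection logic above boils down to a per-label count of non-NA values compared against the number of values checked for that label. Here is a standalone sketch of that masking rule in plain numpy (assuming a single variable whose first axis is the dropna dimension and NA encoded as NaN; the function name is illustrative only):

    import numpy as np

    def dropna_mask(values, how='any', thresh=None):
        # non-NA count per label along the first axis (the dropna dimension)
        count = (~np.isnan(values)).reshape(values.shape[0], -1).sum(axis=1)
        size = int(np.prod(values.shape[1:]))  # values checked per label
        if thresh is not None:
            return count >= thresh   # keep labels with at least `thresh` non-NA values
        elif how == 'any':
            return count == size     # keep only labels with no missing values
        elif how == 'all':
            return count > 0         # keep labels unless every value is NA
        raise ValueError('invalid how option: %s' % how)

    x = np.array([[0., 1.], [np.nan, 2.], [np.nan, np.nan]])
    dropna_mask(x, how='any')   # array([ True, False, False])
    dropna_mask(x, how='all')   # array([ True,  True, False])
    dropna_mask(x, thresh=1)    # array([ True,  True, False])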
5 changes: 5 additions & 0 deletions xray/core/formatting.py
@@ -46,6 +46,11 @@ def first_n_items(x, n_desired):
# get them in a single call to __getitem__ using only slices.
if n_desired < 1:
raise ValueError('must request at least one item')

if x.size == 0:
# workaround for https://github.com/numpy/numpy/issues/5195
return []

if n_desired < x.size:
indexer = _get_indexer_at_least_n_items(x.shape, n_desired)
x = x[indexer]
19 changes: 19 additions & 0 deletions xray/test/test_dataarray.py
@@ -656,6 +656,25 @@ def test_transpose(self):
def test_squeeze(self):
self.assertVariableEqual(self.dv.variable.squeeze(), self.dv.squeeze())

def test_dropna(self):
x = np.random.randn(4, 4)
x[::2, 0] = np.nan
arr = DataArray(x, dims=['a', 'b'])

actual = arr.dropna('a')
expected = arr[1::2]
self.assertDataArrayIdentical(actual, expected)

actual = arr.dropna('b', how='all')
self.assertDataArrayIdentical(actual, arr)

actual = arr.dropna('a', thresh=1)
self.assertDataArrayIdentical(actual, arr)

actual = arr.dropna('b', thresh=3)
expected = arr[:, 1:]
self.assertDataArrayIdentical(actual, expected)

def test_reduce(self):
coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'],
'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]),
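
The new test covers the DataArray path; at the Dataset level the same call also accepts the subset argument to restrict which variables are checked for missing values. A hedged sketch of that use (the Dataset is constructed with the (dims, data) tuple syntax from this release's docs; the variable names are made up):

    import numpy as np
    import xray

    ds = xray.Dataset({'foo': (('a', 'b'), [[0., np.nan], [1., 2.]]),
                       'bar': (('a',), [10., 3.])})

    # by default every variable is checked: label a=0 is dropped
    # because foo has a NaN there
    ds.dropna('a')

    # only check 'bar', which has no missing values, so nothing is dropped
    ds.dropna('a', subset=['bar'])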