Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix concatenating Variables with dtype=datetime64 #134

Merged
merged 6 commits into from
May 20, 2014
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
170 changes: 143 additions & 27 deletions test/test_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
import numpy as np
import pandas as pd

from xray import Variable, Dataset, DataArray
from xray import Variable, Dataset, DataArray, indexing
from xray.variable import (Coordinate, as_variable, NumpyArrayAdapter,
PandasIndexAdapter)
PandasIndexAdapter, _as_compatible_data)
from xray.pycompat import PY3

from . import TestCase, source_ndarray

Expand Down Expand Up @@ -36,32 +37,85 @@ def test_attrs(self):
v.attrs['foo'] = 'baz'
self.assertEqual(v.attrs['foo'], 'baz')

def test_0d_data(self):
d = datetime(2000, 1, 1)
for value, dtype in [(0, int),
(np.float32(0.5), np.float32),
('foo', np.str_),
(d, None),
(np.datetime64(d), np.datetime64)]:
def assertIndexedLikeNDArray(self, variable, expected_value0,
                             expected_dtype=None):
    """Assert that a 1-dimensional variable indexes like a numpy.ndarray.

    ``variable[0]`` must look 0-dimensional, a copy of ``variable`` must
    compare equal and identical to it, and the first element must equal
    ``expected_value0`` whether it is read through ``.values[0]`` or
    through ``[0].values``. When ``expected_dtype`` is given the dtype of
    both results is compared against it; otherwise their Python type is
    compared against ``type(expected_value0)``.
    """
    # the first element must have the array properties of a scalar
    for attr, expected in (('shape', ()), ('ndim', 0), ('size', 1)):
        self.assertEqual(getattr(variable[0], attr), expected)

    # a copy compares equal under both notions of equality
    for compare in (variable.equals, variable.identical):
        self.assertTrue(compare(variable.copy()))

    # same value whether we index the ndarray or the Variable
    self.assertEqual(variable.values[0], expected_value0)
    self.assertEqual(variable[0].values, expected_value0)

    if expected_dtype is not None:
        # dtype must be consistent for both access paths
        self.assertEqual(variable.values[0].dtype, expected_dtype)
        self.assertEqual(variable[0].values.dtype, expected_dtype)
        return

    # no dtype requested: compare the output type instead
    expected_type = type(expected_value0)
    self.assertEqual(type(variable.values[0]), expected_type)
    self.assertEqual(type(variable[0].values), expected_type)

def test_index_0d_int(self):
    """Integer elements keep their value and dtype when indexed."""
    cases = [(0, np.int_),
             (np.int32(0), np.int32)]
    for scalar, expected_dtype in cases:
        var = self.cls(['x'], [scalar])
        self.assertIndexedLikeNDArray(var, scalar, expected_dtype)

def test_index_0d_float(self):
    """Float elements keep their value and dtype when indexed."""
    # np.float64 is used instead of the np.float_ alias, which was removed
    # in NumPy 2.0; the two name the same type on older releases.
    for value, dtype in [(0.5, np.float64),
                         (np.float32(0.5), np.float32)]:
        x = self.cls(['x'], [value])
        self.assertIndexedLikeNDArray(x, value, dtype)

def test_index_0d_string(self):
for value, dtype in [('foo', np.dtype('U3' if PY3 else 'S3')),
(u'foo', np.dtype('U3'))]:
x = self.cls(['x'], [value])
# check array properties
self.assertEqual(x[0].shape, ())
self.assertEqual(x[0].ndim, 0)
self.assertEqual(x[0].size, 1)
# test identity
self.assertTrue(x.equals(x.copy()))
self.assertTrue(x.identical(x.copy()))
# check value is equal for both ndarray and Variable
self.assertEqual(x.values[0], value)
self.assertEqual(x[0].values, value)
# check type or dtype is consistent for both ndarray and Variable
if dtype is None:
# check output type instead of array dtype
self.assertEqual(type(x.values[0]), type(value))
self.assertEqual(type(x[0].values), type(value))
else:
assert np.issubdtype(x.values[0].dtype, dtype), (x.values[0].dtype, dtype)
assert np.issubdtype(x[0].values.dtype, dtype), (x[0].values.dtype, dtype)
self.assertIndexedLikeNDArray(x, value, dtype)

def test_index_0d_datetime(self):
    """Datetime elements index consistently for several input flavours."""
    stamp = datetime(2000, 1, 1)

    # a plain datetime: only the output type is compared
    from_datetime = self.cls(['x'], [stamp])
    self.assertIndexedLikeNDArray(from_datetime, stamp)

    # a list holding one np.datetime64: expected dtype is datetime64[ns]
    from_datetime64 = self.cls(['x'], [np.datetime64(stamp)])
    self.assertIndexedLikeNDArray(
        from_datetime64, np.datetime64(stamp), 'datetime64[ns]')

    # a pandas DatetimeIndex: expected dtype is datetime64[ns] as well
    from_index = self.cls(['x'], pd.DatetimeIndex([stamp]))
    self.assertIndexedLikeNDArray(
        from_index, np.datetime64(stamp), 'datetime64[ns]')

def test_index_0d_object(self):
    """An arbitrary hashable Python object is returned unchanged by indexing."""

    class HashableItemWrapper(object):
        # Minimal hashable container; equality and hash defer to ``item``.
        def __init__(self, item):
            self.item = item

        def __eq__(self, other):
            return self.item == other.item

        def __hash__(self):
            return hash(self.item)

        def __repr__(self):
            fields = (type(self).__name__, self.item)
            return '%s(item=%r)' % fields

    wrapped = HashableItemWrapper((1, 2, 3))
    var = self.cls('x', [wrapped])
    self.assertIndexedLikeNDArray(var, wrapped)

def test_index_and_concat_datetime64(self):
    """Indexing out every element and concatenating restores the variable.

    Regression test for #125.
    """
    expected = self.cls('t', pd.date_range('2011-09-01', periods=10))
    scalar_pieces = [expected[i] for i in range(10)]
    length_one_pieces = [expected[[i]] for i in range(10)]
    for pieces in (scalar_pieces, length_one_pieces):
        actual = Variable.concat(pieces, 't')
        self.assertArrayEqual(expected, actual)
        self.assertEqual(expected.dtype, actual.dtype)

def test_0d_time_data(self):
# regression test for #105
Expand Down Expand Up @@ -229,6 +283,30 @@ def test_item(self):
self.assertEqual(v.item(), 0)
self.assertIs(type(v.item()), float)

def test_datetime64_precision(self):
    """datetime64 input is always converted to nanosecond precision."""
    ns_dtype = np.dtype('datetime64[ns]')

    # 0-dimensional input with hour precision
    scalar = np.datetime64('2000-01-01T00')
    v = Variable([], scalar)
    self.assertEqual(v.dtype, ns_dtype)
    self.assertEqual(v.values, scalar)
    self.assertEqual(v.values.dtype, ns_dtype)

    # 1-dimensional input with second precision
    daily = pd.date_range('2000-01-01', periods=3).values
    seconds = daily.astype('datetime64[s]')
    v = Variable(['t'], seconds)
    self.assertEqual(v.dtype, ns_dtype)
    self.assertArrayEqual(v.values, seconds)
    self.assertEqual(v.values.dtype, ns_dtype)

def test_0d_str(self):
    """0-dimensional unicode and byte strings keep their dtype and value."""
    v = Variable([], u'foo')
    self.assertEqual(v.dtype, np.dtype('U3'))
    self.assertEqual(v.values, 'foo')

    # np.bytes_ replaces the np.string_ alias, which was removed in
    # NumPy 2.0; both name the same byte-string scalar type on older
    # releases.
    v = Variable([], np.bytes_('foo'))
    self.assertEqual(v.dtype, np.dtype('S3'))
    # b'foo' is the native str on Python 2 and bytes on Python 3, so no
    # version switch is needed for the expected value.
    self.assertEqual(v.values, b'foo')

def test_equals_and_identical(self):
d = np.random.rand(10, 3)
d[0, 0] = np.nan
Expand Down Expand Up @@ -463,3 +541,41 @@ def test_data(self):
self.assertIsInstance(x._data, PandasIndexAdapter)
with self.assertRaisesRegexp(TypeError, 'cannot be modified'):
x[:] = 0


class TestAsCompatibleData(TestCase):
    """Tests for the private ``_as_compatible_data`` helper."""

    def test_unchanged_types(self):
        """Already-wrapped data must come back as the very same object."""
        types = (NumpyArrayAdapter, PandasIndexAdapter,
                 indexing.LazilyIndexedArray)
        for t in types:
            for data in [np.arange(3),
                         pd.date_range('2000-01-01', periods=3),
                         pd.date_range('2000-01-01', periods=3).values]:
                x = t(data)
                self.assertIs(x, _as_compatible_data(x))

    def test_converted_types(self):
        """Nested lists and DataFrames are wrapped in a NumpyArrayAdapter."""
        for input_array in [[[0, 1, 2]], pd.DataFrame([[0, 1, 2]])]:
            actual = _as_compatible_data(input_array)
            self.assertArrayEqual(np.asarray(input_array), actual)
            self.assertEqual(NumpyArrayAdapter, type(actual))
            self.assertEqual(np.dtype(int), actual.dtype)

    def test_datetime(self):
        """Check handling of datetime64 scalars, arrays and datetimes."""
        # a np.datetime64 scalar stays a scalar, with ns precision
        expected = np.datetime64('2000-01-01T00')
        actual = _as_compatible_data(expected)
        self.assertEqual(expected, actual)
        self.assertEqual(np.datetime64, type(actual))
        self.assertEqual(np.dtype('datetime64[ns]'), actual.dtype)

        # a datetime64 array is wrapped and has ns precision
        expected = np.array([np.datetime64('2000-01-01T00')])
        actual = _as_compatible_data(expected)
        self.assertEqual(np.asarray(expected), actual)
        self.assertEqual(NumpyArrayAdapter, type(actual))
        self.assertEqual(np.dtype('datetime64[ns]'), actual.dtype)

        # a plain datetime is expected to end up with object dtype;
        # to_pydatetime() is the supported spelling (Timestamp.to_datetime()
        # was deprecated and later removed from pandas)
        expected = pd.Timestamp('2000-01-01T00').to_pydatetime()
        actual = _as_compatible_data(expected)
        self.assertEqual(np.asarray(expected), actual)
        self.assertEqual(NumpyArrayAdapter, type(actual))
        self.assertEqual(np.dtype('O'), actual.dtype)
4 changes: 2 additions & 2 deletions xray/backends/netCDF4_.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .netcdf3 import encode_nc3_variable
import xray
from xray.conventions import encode_cf_variable
from xray.utils import FrozenOrderedDict, NDArrayMixin, as_array_or_item
from xray.utils import FrozenOrderedDict, NDArrayMixin
from xray import indexing
from xray.pycompat import iteritems, basestring

Expand All @@ -31,7 +31,7 @@ def __getitem__(self, key):
# work around for netCDF4-python's broken handling of 0-d
# arrays (slicing them always returns a 1-dimensional array):
# https://github.com/Unidata/netcdf4-python/pull/220
data = as_array_or_item(np.asscalar(self.array[key]))
data = np.asscalar(self.array[key])
else:
data = self.array[key]
return data
Expand Down
6 changes: 1 addition & 5 deletions xray/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,12 +212,8 @@ def shape(self):
shape.append(k.size)
return tuple(shape)

@property
def values(self):
return self.array[self.key]

def __array__(self, dtype=None):
return np.asarray(self.values, dtype=None)
return np.asarray(self.array[self.key], dtype=None)

def __getitem__(self, key):
return type(self)(self.array, self._updated_key(key))
Expand Down
30 changes: 0 additions & 30 deletions xray/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import operator
import warnings
from collections import OrderedDict, Mapping, MutableMapping
from datetime import datetime

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -36,35 +35,6 @@ def __new__(cls, *args, **kwargs):
return Wrapper


def as_safe_array(values, dtype=None):
    """Like np.asarray, but convert all datetime64 arrays to ns precision.

    Parameters
    ----------
    values : array-like
        Data to convert.
    dtype : optional
        Passed through to ``np.asarray``.

    Returns
    -------
    numpy.ndarray
        ``values`` as an array. datetime64 data is returned with
        ``datetime64[ns]`` dtype. No copy is made when the input is already
        an ndarray that needs no conversion.
    """
    values = np.asarray(values, dtype=dtype)
    if values.dtype.kind == 'M':
        # np.datetime64: astype allocates a new array by default, even when
        # the data already has ns precision; copy=False returns the input
        # itself in that case, matching np.asarray's no-copy behaviour.
        values = values.astype('datetime64[ns]', copy=False)
    return values


def as_array_or_item(values, dtype=None):
    """Convert ``values`` to a numpy array of the given dtype, or to a bare
    item when the result is a 0-dimensional object array or a datetime.

    datetime instances are returned untouched. Anything else goes through
    ``as_safe_array``; a 0-dimensional object-dtype result is unpacked to
    the item it holds, and a pandas Timestamp obtained that way is turned
    into a ``np.datetime64`` with ns precision.
    """
    if isinstance(values, datetime):
        # Bail out early: turning a datetime or Timestamp object into an
        # array with the proper dtype is liable to silently convert it into
        # an integer instead.
        return values

    array = as_safe_array(values, dtype=dtype)
    if array.ndim != 0 or array.dtype.kind != 'O':
        return array

    # unpack 0d object arrays to be consistent with numpy
    item = array.item()
    if isinstance(item, pd.Timestamp):
        # turn Timestamps back into datetime64 objects
        return np.datetime64(item, 'ns')
    return item


def squeeze(xray_obj, dimensions, dimension=None):
"""Squeeze the dimensions of an xray object."""
if dimension is None:
Expand Down
61 changes: 50 additions & 11 deletions xray/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,30 @@ def as_variable(obj, strict=True):


def _as_compatible_data(data):
"""If data does not have the necessary attributes to be the private _data
attribute, convert it to a np.ndarray and raise an warning
"""Prepare and wrap data to put in a Variable.

Prepare the data:
- If data does not have the necessary attributes, convert it to ndarray.
- If data has dtype=datetime64, ensure that it has ns precision.
- If data is already a pandas or xray object (other than an Index), just
use the values.

Wrap it up:
- Finally, put pandas.Index and numpy.ndarray arguments in adapter objects
to ensure they can be indexed properly.
- NumpyArrayAdapter, PandasIndexAdapter and LazilyIndexedArray should
all pass through unmodified.
"""
# don't check for __len__ or __iter__ so as not to warn if data is a numpy
# don't check for __len__ or __iter__ so as not to cast if data is a numpy
# numeric type like np.float32
required = ['dtype', 'shape', 'size', 'ndim']
if (any(not hasattr(data, attr) for attr in required)
or isinstance(data, np.string_)):
data = utils.as_safe_array(data)
elif not isinstance(data, (pd.Index, indexing.LazilyIndexedArray)):
if any(not hasattr(data, attr) for attr in required):
# data must be ndarray-like
data = np.asarray(data)
elif isinstance(data, np.datetime64):
# note: np.datetime64 is ndarray-like
data = np.datetime64(data, 'ns')
elif not isinstance(data, pd.Index):
try:
# we don't want nested self-described arrays
# use try/except instead of hasattr to only calculate values once
Expand All @@ -73,7 +87,10 @@ def _as_compatible_data(data):
# check pd.Index first since it's (currently) an ndarray subclass
data = PandasIndexAdapter(data)
elif isinstance(data, np.ndarray):
data = NumpyArrayAdapter(utils.as_safe_array(data))
if data.dtype.kind == 'M':
data = data.astype('datetime64[ns]')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this mean any non-index that holds dates will get automatically copied?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. We should be using np.asarray or setting copy=False with astype:

By default, astype always returns a newly allocated array. If this is set to false, and the dtype, order, and subok requirements are satisfied, the input array is returned instead of a copy.

I don't know why I thought astype wouldn't make a copy unless necessary.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it looks like np.asarray(data, 'datetime64[ns]') works.

data = NumpyArrayAdapter(data)

return data


Expand Down Expand Up @@ -130,8 +147,9 @@ def __getitem__(self, key):
# unpack key so it can index a pandas.Index object (pandas.Index
# objects don't like tuples)
key, = key

if isinstance(key, (int, np.integer)):
return utils.as_array_or_item(self.array[key], dtype=self.dtype)
value = np.asarray(self.array[key], dtype=self.dtype)
else:
if isinstance(key, slice) and key == slice(None):
# pandas<0.14 does dtype inference when slicing; we would like
Expand All @@ -140,13 +158,34 @@ def __getitem__(self, key):
arr = self.array
else:
arr = self.array[key]
return PandasIndexAdapter(arr, dtype=self.dtype)
value = PandasIndexAdapter(arr, dtype=self.dtype)

return value

def __repr__(self):
    """Return a debugging representation with the wrapped array and dtype."""
    fields = (type(self).__name__, self.array, self.dtype)
    return '%s(array=%r, dtype=%r)' % fields


def _as_array_or_item(data):
    """Convert ``data`` to a numpy array, unpacking two 0-dimensional cases.

    A 0-dimensional object array is replaced by the item it holds, and a
    0-dimensional datetime64 array by a ``np.datetime64`` scalar with ns
    precision. Everything else is returned as the ndarray itself.

    Importantly, an input that is already an ndarray is not copied -
    otherwise it would not be possible to update Variable values in place.
    """
    array = np.asarray(data)
    if array.ndim != 0:
        return array

    kind = array.dtype.kind
    if kind == 'O':
        # unpack 0d object arrays to be consistent with numpy
        return array.item()
    if kind == 'M':
        # convert to a np.datetime64 object, because 0-dimensional ndarrays
        # with dtype=datetime64 are broken :(
        return np.datetime64(array, 'ns')
    return array


class Variable(AbstractArray):
"""A netcdf-like variable consisting of dimensions, data and attributes
which describe a single Array. A single Variable object is not fully
Expand Down Expand Up @@ -219,7 +258,7 @@ def __getstate__(self):
@property
def values(self):
"""The variable's data as a numpy.ndarray"""
return utils.as_array_or_item(self._data_cached())
return _as_array_or_item(self._data_cached())

@values.setter
def values(self, values):
Expand Down