Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLN: Make ufunc works for Index #10638

Merged
merged 1 commit into from
Sep 6, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,8 @@ Other enhancements

- ``DataFrame.apply`` will return a Series of dicts if the passed function returns a dict and ``reduce=True`` (:issue:`8735`).

- ``PeriodIndex`` now supports arithmetic with ``np.ndarray`` (:issue:`10638`)

- ``concat`` will now use existing Series names if provided (:issue:`10698`).

.. ipython:: python
Expand All @@ -333,6 +335,7 @@ Other enhancements

pd.concat([foo, bar, baz], 1)


.. _whatsnew_0170.api:

.. _whatsnew_0170.api_breaking:
Expand Down Expand Up @@ -1005,3 +1008,5 @@ Bug Fixes
- Bug when constructing ``DataFrame`` where passing a dictionary with only scalar values and specifying columns did not raise an error (:issue:`10856`)
- Bug in ``.var()`` causing roundoff errors for highly similar values (:issue:`10242`)
- Bug in ``DataFrame.plot(subplots=True)`` with duplicated columns outputs incorrect result (:issue:`10962`)
- Bug in ``Index`` arithmetic may result in incorrect class (:issue:`10638`)

20 changes: 16 additions & 4 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,12 @@ def __array_wrap__(self, result, context=None):
"""
Gets called after a ufunc
"""
return self._shallow_copy(result)
if is_bool_dtype(result):
return result

attrs = self._get_attributes_dict()
attrs = self._maybe_update_attributes(attrs)
return Index(result, **attrs)

@cache_readonly
def dtype(self):
Expand Down Expand Up @@ -2809,6 +2814,10 @@ def invalid_op(self, other=None):
cls.__abs__ = _make_invalid_op('__abs__')
cls.__inv__ = _make_invalid_op('__inv__')

def _maybe_update_attributes(self, attrs):
""" Update Index attributes (e.g. freq) depending on op """
return attrs

@classmethod
def _add_numeric_methods(cls):
""" add in numeric methods """
Expand Down Expand Up @@ -2849,7 +2858,9 @@ def _evaluate_numeric_binop(self, other):
if reversed:
values, other = other, values

return self._shallow_copy(op(values, other))
attrs = self._get_attributes_dict()
attrs = self._maybe_update_attributes(attrs)
return Index(op(values, other), **attrs)

return _evaluate_numeric_binop

Expand All @@ -2861,8 +2872,9 @@ def _evaluate_numeric_unary(self):
if not self._is_numeric_dtype:
raise TypeError("cannot evaluate a numeric op {opstr} for type: {typ}".format(opstr=opstr,
typ=type(self)))

return self._shallow_copy(op(self.values))
attrs = self._get_attributes_dict()
attrs = self._maybe_update_attributes(attrs)
return Index(op(self.values), **attrs)

return _evaluate_numeric_unary

Expand Down
3 changes: 2 additions & 1 deletion pandas/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,8 @@ def wrapper(left, right, name=name, na_op=na_op):
else:
# scalars
if hasattr(lvalues, 'values') and not isinstance(lvalues, pd.DatetimeIndex):
lvalues = lvalues.values
lvalues = lvalues.values

return left._constructor(wrap_results(na_op(lvalues, rvalues)),
index=left.index, name=left.name,
dtype=dtype)
Expand Down
130 changes: 128 additions & 2 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,56 @@ def test_equals_op(self):
tm.assert_numpy_array_equal(index_a == item, expected3)
tm.assert_numpy_array_equal(series_a == item, expected3)

def test_numpy_ufuncs(self):
# test ufuncs of numpy 1.9.2. see:
# http://docs.scipy.org/doc/numpy/reference/ufuncs.html

# some functions are skipped because it may return different result
# for unicode input depending on numpy version

for name, idx in compat.iteritems(self.indices):
for func in [np.exp, np.exp2, np.expm1, np.log, np.log2, np.log10,
np.log1p, np.sqrt, np.sin, np.cos,
np.tan, np.arcsin, np.arccos, np.arctan,
np.sinh, np.cosh, np.tanh, np.arcsinh, np.arccosh,
np.arctanh, np.deg2rad, np.rad2deg]:
if isinstance(idx, pd.tseries.base.DatetimeIndexOpsMixin):
# raise TypeError or ValueError (PeriodIndex)
# PeriodIndex behavior should be changed in future version
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why the different exception behavior?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TypeError raised from __array_wrap__ seems to be catchrd by numpy and return unintended result.

with tm.assertRaises(Exception):
func(idx)
elif isinstance(idx, (Float64Index, Int64Index)):
# coerces to float (e.g. np.sin)
result = func(idx)
exp = Index(func(idx.values), name=idx.name)
self.assert_index_equal(result, exp)
self.assertIsInstance(result, pd.Float64Index)
else:
# raise AttributeError or TypeError
if len(idx) == 0:
continue
else:
with tm.assertRaises(Exception):
func(idx)

for func in [np.isfinite, np.isinf, np.isnan, np.signbit]:
if isinstance(idx, pd.tseries.base.DatetimeIndexOpsMixin):
# raise TypeError or ValueError (PeriodIndex)
with tm.assertRaises(Exception):
func(idx)
elif isinstance(idx, (Float64Index, Int64Index)):
# results in bool array
result = func(idx)
exp = func(idx.values)
self.assertIsInstance(result, np.ndarray)
tm.assertNotIsInstance(result, Index)
else:
if len(idx) == 0:
continue
else:
with tm.assertRaises(Exception):
func(idx)


class TestIndex(Base, tm.TestCase):
_holder = Index
Expand Down Expand Up @@ -2848,6 +2898,41 @@ def test_slice_keep_name(self):
idx = Int64Index([1, 2], name='asdf')
self.assertEqual(idx.name, idx[1:].name)

def test_ufunc_coercions(self):
idx = pd.Int64Index([1, 2, 3, 4, 5], name='x')

result = np.sqrt(idx)
tm.assertIsInstance(result, Float64Index)
exp = pd.Float64Index(np.sqrt(np.array([1, 2, 3, 4, 5])), name='x')
tm.assert_index_equal(result, exp)

result = np.divide(idx, 2.)
tm.assertIsInstance(result, Float64Index)
exp = pd.Float64Index([0.5, 1., 1.5, 2., 2.5], name='x')
tm.assert_index_equal(result, exp)

# _evaluate_numeric_binop
result = idx + 2.
tm.assertIsInstance(result, Float64Index)
exp = pd.Float64Index([3., 4., 5., 6., 7.], name='x')
tm.assert_index_equal(result, exp)

result = idx - 2.
tm.assertIsInstance(result, Float64Index)
exp = pd.Float64Index([-1., 0., 1., 2., 3.], name='x')
tm.assert_index_equal(result, exp)

result = idx * 1.
tm.assertIsInstance(result, Float64Index)
exp = pd.Float64Index([1., 2., 3., 4., 5.], name='x')
tm.assert_index_equal(result, exp)

result = idx / 2.
tm.assertIsInstance(result, Float64Index)
exp = pd.Float64Index([0.5, 1., 1.5, 2., 2.5], name='x')
tm.assert_index_equal(result, exp)


class DatetimeLike(Base):

def test_str(self):
Expand Down Expand Up @@ -3101,7 +3186,9 @@ def test_get_loc(self):
tolerance=timedelta(1)), 1)
with tm.assertRaisesRegexp(ValueError, 'must be convertible'):
idx.get_loc('2000-01-10', method='nearest', tolerance='foo')
with tm.assertRaisesRegexp(ValueError, 'different freq'):

msg = 'Input has different freq from PeriodIndex\\(freq=D\\)'
with tm.assertRaisesRegexp(ValueError, msg):
idx.get_loc('2000-01-10', method='nearest', tolerance='1 hour')
with tm.assertRaises(KeyError):
idx.get_loc('2000-01-10', method='nearest', tolerance='1 day')
Expand All @@ -3119,7 +3206,8 @@ def test_get_indexer(self):
idx.get_indexer(target, 'nearest', tolerance='1 hour'),
[0, -1, 1])

with self.assertRaisesRegexp(ValueError, 'different freq'):
msg = 'Input has different freq from PeriodIndex\\(freq=H\\)'
with self.assertRaisesRegexp(ValueError, msg):
idx.get_indexer(target, 'nearest', tolerance='1 minute')

tm.assert_numpy_array_equal(
Expand Down Expand Up @@ -3215,6 +3303,44 @@ def test_numeric_compat(self):
def test_pickle_compat_construction(self):
pass

def test_ufunc_coercions(self):
# normal ops are also tested in tseries/test_timedeltas.py
idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'],
freq='2H', name='x')

for result in [idx * 2, np.multiply(idx, 2)]:
tm.assertIsInstance(result, TimedeltaIndex)
exp = TimedeltaIndex(['4H', '8H', '12H', '16H', '20H'],
freq='4H', name='x')
tm.assert_index_equal(result, exp)
self.assertEqual(result.freq, '4H')

for result in [idx / 2, np.divide(idx, 2)]:
tm.assertIsInstance(result, TimedeltaIndex)
exp = TimedeltaIndex(['1H', '2H', '3H', '4H', '5H'],
freq='H', name='x')
tm.assert_index_equal(result, exp)
self.assertEqual(result.freq, 'H')

idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'],
freq='2H', name='x')
for result in [ - idx, np.negative(idx)]:
tm.assertIsInstance(result, TimedeltaIndex)
exp = TimedeltaIndex(['-2H', '-4H', '-6H', '-8H', '-10H'],
freq='-2H', name='x')
tm.assert_index_equal(result, exp)
self.assertEqual(result.freq, None)

idx = TimedeltaIndex(['-2H', '-1H', '0H', '1H', '2H'],
freq='H', name='x')
for result in [ abs(idx), np.absolute(idx)]:
tm.assertIsInstance(result, TimedeltaIndex)
exp = TimedeltaIndex(['2H', '1H', '0H', '1H', '2H'],
freq=None, name='x')
tm.assert_index_equal(result, exp)
self.assertEqual(result.freq, None)


class TestMultiIndex(Base, tm.TestCase):
_holder = MultiIndex
_multiprocess_can_split_ = True
Expand Down
9 changes: 0 additions & 9 deletions pandas/tseries/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1077,15 +1077,6 @@ def _fast_union(self, other):
end=max(left_end, right_end),
freq=left.offset)

def __array_finalize__(self, obj):
if self.ndim == 0: # pragma: no cover
return self.item()

self.offset = getattr(obj, 'offset', None)
self.tz = getattr(obj, 'tz', None)
self.name = getattr(obj, 'name', None)
self._reset_identity()

def __iter__(self):
"""
Return an iterator over the boxed values
Expand Down
48 changes: 38 additions & 10 deletions pandas/tseries/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
import pandas.core.common as com
from pandas.core.common import (isnull, _INT64_DTYPE, _maybe_box,
_values_from_object, ABCSeries,
is_integer, is_float, is_object_dtype)
is_integer, is_float, is_object_dtype,
is_float_dtype)
from pandas import compat
from pandas.util.decorators import cache_readonly

Expand Down Expand Up @@ -307,6 +308,30 @@ def __contains__(self, key):
return False
return key.ordinal in self._engine

def __array_wrap__(self, result, context=None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doesn't this need to go in tseries/base, e.g. ufuncs are not valid for any datetimelike?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

timedelta64 and datetime64 are real numpy dtypes, so ufuncs will already choke on them:

In [1]: import pandas as pd

In [2]: i = pd.date_range('2000-01-01', periods=5)

In [3]: import numpy as np

In [4]: np.square(i)
TypeError: ufunc 'square' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [5]: np.square(i.to_period())
Out[5]:
PeriodIndex(['330671-10-02', '330731-10-03', '330791-10-05', '330851-10-09',
             '330911-10-16'],
            dtype='int64', freq='D')

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that tests for these other index types would be a good idea, though.

Also, I think we should probably prohibit all ufuncs on PeriodIndex, not just those that return float. For example, it's not valid to square periods, even though that returns integers.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Raising TypeError in __array_wrap__ affects to arithmetic using np.array. Currently, it works:

pd.PeriodIndex(['2011', '2012'], freq='A') + np.array([1, 2])
# PeriodIndex(['2012', '2014'], dtype='int64', freq='A-DEC')

So it can't be prohibited?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PeriodIndex + integer ndarray should not be expected to work -- the array
does not have the right type!

On Mon, Aug 17, 2015 at 3:35 PM, Sinhrks notifications@github.com wrote:

In pandas/tseries/period.py
#10638 (comment):

@@ -281,6 +281,16 @@ def contains(self, key):
return False
return key.ordinal in self._engine

  • def array_wrap(self, result, context=None):

Raising TypeError in array_wrap affects to arithmetic using np.array.
Currently, it works:

pd.PeriodIndex(['2011', '2012'], freq='A') + np.array([1, 2])

PeriodIndex(['2012', '2014'], dtype='int64', freq='A-DEC')

So it can't be prohibited?


Reply to this email directly or view it on GitHub
https://github.com/pydata/pandas/pull/10638/files#r37245206.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually I disagree
add integers to period works and makes sense as its a freq op

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, because this is a valid freq shift, np.array should work as the same.

pd.PeriodIndex(['2011', '2012'], freq='A') + 1
# PeriodIndex(['2012', '2013'], dtype='int64', freq='A-DEC')

And #10744 has been done expecting arithmetic using np.array works.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reconsidered this, and defining _add_ndarray for PeriodIndex may be an alternative. I think options are:

  1. Always raise TypeError in __array_wraps__. Support ndarray ops in another methods (PeriodIndex must be lhs).
  2. Raise TypeError if __array_wraps__ gets non-integers. Some ufunc which return int outputs meaningless results (like square)

NOTE: Maybe not intentional, PeriodIndex.shift works for ndarray. Thus we can use shift for ndarray ops.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sinhrks remember that ATM PeriodIndex is an odd duck. Its a real int64 dtyped array, that happens to have boxing. So it allows any integer-like ops (as opposed to DatetimeIndex which prohibits a lot more because its M8[ns]).

So its needs handling to error on prohibited ops. So on on raising in __array_wraps__ for prohibited ops, but this might require some dispatching mechanism to the sub-class to determine what is allowed. E.g. we do it similarly like this in the _add_numeric_ops and like routines.

"""
Gets called after a ufunc. Needs additional handling as
PeriodIndex stores internal data as int dtype

Replace this to __numpy_ufunc__ in future version
"""
if isinstance(context, tuple) and len(context) > 0:
func = context[0]
if (func is np.add):
return self._add_delta(context[1][1])
elif (func is np.subtract):
return self._add_delta(-context[1][1])
elif isinstance(func, np.ufunc):
if 'M->M' not in func.types:
msg = "ufunc '{0}' not supported for the PeriodIndex"
# This should be TypeError, but TypeError cannot be raised
# from here because numpy catches.
raise ValueError(msg.format(func.__name__))

if com.is_bool_dtype(result):
return result
return PeriodIndex(result, freq=self.freq, name=self.name)

@property
def _box_func(self):
return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq)
Expand Down Expand Up @@ -522,7 +547,18 @@ def _maybe_convert_timedelta(self, other):
base = frequencies.get_base_alias(freqstr)
if base == self.freq.rule_code:
return other.n
raise ValueError("Input has different freq from PeriodIndex(freq={0})".format(self.freq))
elif isinstance(other, np.ndarray):
if com.is_integer_dtype(other):
return other
elif com.is_timedelta64_dtype(other):
offset = frequencies.to_offset(self.freq)
if isinstance(offset, offsets.Tick):
nanos = tslib._delta_to_nanoseconds(other)
offset_nanos = tslib._delta_to_nanoseconds(offset)
if (nanos % offset_nanos).all() == 0:
return nanos // offset_nanos
msg = "Input has different freq from PeriodIndex(freq={0})"
raise ValueError(msg.format(self.freqstr))

def _add_delta(self, other):
ordinal_delta = self._maybe_convert_timedelta(other)
Expand Down Expand Up @@ -775,14 +811,6 @@ def _format_native_types(self, na_rep=u('NaT'), date_format=None, **kwargs):
values[imask] = np.array([formatter(dt) for dt in values[imask]])
return values

def __array_finalize__(self, obj):
if not self.ndim: # pragma: no cover
return self.item()

self.freq = getattr(obj, 'freq', None)
self.name = getattr(obj, 'name', None)
self._reset_identity()

def take(self, indices, axis=0):
"""
Analogous to ndarray.take
Expand Down
16 changes: 8 additions & 8 deletions pandas/tseries/tdi.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,14 @@ def __setstate__(self, state):
raise Exception("invalid pickle state")
_unpickle_compat = __setstate__

def _maybe_update_attributes(self, attrs):
""" Update Index attributes (e.g. freq) depending on op """
freq = attrs.get('freq', None)
if freq is not None:
# no need to infer if freq is None
attrs['freq'] = 'infer'
return attrs

def _add_delta(self, delta):
if isinstance(delta, (Tick, timedelta, np.timedelta64)):
new_values = self._add_delta_td(delta)
Expand Down Expand Up @@ -560,14 +568,6 @@ def _fast_union(self, other):
else:
return left

def __array_finalize__(self, obj):
if self.ndim == 0: # pragma: no cover
return self.item()

self.name = getattr(obj, 'name', None)
self.freq = getattr(obj, 'freq', None)
self._reset_identity()

def _wrap_union_result(self, other, result):
name = self.name if self.name == other.name else None
return self._simple_new(result, name=name, freq=None)
Expand Down
Loading