Skip to content

Commit

Permalink
Merge pull request #10508 from sinhrks/groupby_dtcat
Browse files Browse the repository at this point in the history
BUG: Groupby(sort=False) with datetime-like Categorical raises ValueError
  • Loading branch information
sinhrks committed Jul 28, 2015
2 parents a743743 + 29f1f42 commit c06f9ce
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 12 deletions.
18 changes: 18 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
Expand Up @@ -303,7 +303,22 @@ Other API Changes
- Allow passing `kwargs` to the interpolation methods (:issue:`10378`).
- Serialize metadata properties of subclasses of pandas objects (:issue:`10553`).
- ``Categorical.name`` was removed to make `Categorical` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
- ``Categorical.unique`` now returns a new ``Categorical`` whose ``categories`` and ``codes`` are unique, rather than returning ``np.array`` (:issue:`10508`)

- unordered category: values and categories are sorted by appearance order.
- ordered category: values are sorted by appearance order, categories keep their existing order.

.. ipython :: python

cat = pd.Categorical(['C', 'A', 'B', 'C'], categories=['A', 'B', 'C'], ordered=True)
cat
cat.unique()

cat = pd.Categorical(['C', 'A', 'B', 'C'], categories=['A', 'B', 'C'])
cat
cat.unique()

- ``groupby`` using ``Categorical`` follows the same rule as ``Categorical.unique`` described above (:issue:`10508`)
- ``NaT``'s methods now either raise ``ValueError``, or return ``np.nan`` or ``NaT`` (:issue:`9513`)

=============================== ==============================================================
Expand Down Expand Up @@ -365,6 +380,9 @@ Bug Fixes
- Bug in ``DataFrame.interpolate`` with ``axis=1`` and ``inplace=True`` (:issue:`10395`)
- Bug in ``io.sql.get_schema`` when specifying multiple columns as primary
key (:issue:`10385`).

- Bug in ``groupby(sort=False)`` with datetime-like ``Categorical`` raises ``ValueError`` (:issue:`10505`)

- Bug in ``test_categorical`` on big-endian builds (:issue:`10425`)
- Bug in ``Series.shift`` and ``DataFrame.shift`` not supporting categorical data (:issue:`9416`)
- Bug in ``Series.map`` using categorical ``Series`` raises ``AttributeError`` (:issue:`10324`)
Expand Down
21 changes: 16 additions & 5 deletions pandas/core/categorical.py
Expand Up @@ -1558,19 +1558,30 @@ def mode(self):

def unique(self):
"""
Return the unique values.
Return the ``Categorical`` which ``categories`` and ``codes`` are unique.
Unused categories are NOT returned.
Unused categories are NOT returned. Unique values are returned in order
of appearance.
- unordered category: values and categories are sorted by appearance
order.
- ordered category: values are sorted by appearance order, categories
keeps existing order.
Returns
-------
unique values : array
unique values : ``Categorical``
"""

from pandas.core.nanops import unique1d
# unlike np.unique, unique1d does not sort
unique_codes = unique1d(self.codes)
return take_1d(self.categories.values, unique_codes)
cat = self.copy()
# keep nan in codes
cat._codes = unique_codes
# exclude nan from indexer for categories
take_codes = unique_codes[unique_codes != -1]
if self.ordered:
take_codes = sorted(take_codes)
return cat.set_categories(cat.categories.take(take_codes))

def equals(self, other):
"""
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/groupby.py
Expand Up @@ -1959,7 +1959,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,

# fix bug #GH8868 sort=False being ignored in categorical groupby
else:
self.grouper = self.grouper.reorder_categories(self.grouper.unique())
cat = self.grouper.unique()
self.grouper = self.grouper.reorder_categories(cat.categories)

# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
Expand Down
49 changes: 44 additions & 5 deletions pandas/tests/test_categorical.py
Expand Up @@ -958,20 +958,59 @@ def test_min_max(self):
self.assertEqual(_max, 1)

def test_unique(self):
cat = Categorical(["a","b"])
exp = np.asarray(["a","b"])
# categories are reordered based on value when ordered=False
cat = Categorical(["a", "b"])
exp = np.asarray(["a", "b"])
res = cat.unique()
self.assert_numpy_array_equal(res, exp)

cat = Categorical(["a","b","a","a"], categories=["a","b","c"])
cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"])
res = cat.unique()
self.assert_numpy_array_equal(res, exp)
tm.assert_categorical_equal(res, Categorical(exp))

# unique should not sort
cat = Categorical(["b", "b", np.nan, "a"], categories=["a","b","c"])
cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"])
exp = np.asarray(["c", "a", "b"])
res = cat.unique()
self.assert_numpy_array_equal(res, exp)
tm.assert_categorical_equal(res, Categorical(exp, categories=['c', 'a', 'b']))

# nan must be removed
cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"])
res = cat.unique()
exp = np.asarray(["b", np.nan, "a"], dtype=object)
self.assert_numpy_array_equal(res, exp)
tm.assert_categorical_equal(res, Categorical(["b", np.nan, "a"], categories=["b", "a"]))

def test_unique_ordered(self):
# Checks Categorical.unique() on ordered categoricals. In every case the
# expected values follow order of first appearance, while the expected
# categories keep their declared relative order and omit unused entries.
# Each result is compared both as a plain array (values only) and as a
# Categorical (values, categories and ordered flag).

# keep categories order when ordered=True
# case 1: every category is used; values appear in reverse category order
cat = Categorical(['b', 'a', 'b'], categories=['a', 'b'], ordered=True)
res = cat.unique()
exp = np.asarray(['b', 'a'])
exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True)
self.assert_numpy_array_equal(res, exp)
tm.assert_categorical_equal(res, exp_cat)

# case 2: three categories, all used, with a duplicated trailing value
cat = Categorical(['c', 'b', 'a', 'a'], categories=['a', 'b', 'c'], ordered=True)
res = cat.unique()
exp = np.asarray(['c', 'b', 'a'])
exp_cat = Categorical(exp, categories=['a', 'b', 'c'], ordered=True)
self.assert_numpy_array_equal(res, exp)
tm.assert_categorical_equal(res, exp_cat)

# case 3: category 'c' never occurs, so it is absent from the expected
# categories
cat = Categorical(['b', 'a', 'a'], categories=['a', 'b', 'c'], ordered=True)
res = cat.unique()
exp = np.asarray(['b', 'a'])
exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True)
self.assert_numpy_array_equal(res, exp)
tm.assert_categorical_equal(res, exp_cat)

# case 4: a NaN value is expected to stay in the unique values (at its
# position of first appearance) but not to become a category; the unused
# category 'c' is dropped as in case 3
cat = Categorical(['b', 'b', np.nan, 'a'], categories=['a', 'b', 'c'], ordered=True)
res = cat.unique()
exp = np.asarray(['b', np.nan, 'a'], dtype=object)
exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True)
self.assert_numpy_array_equal(res, exp)
tm.assert_categorical_equal(res, exp_cat)

def test_mode(self):
s = Categorical([1,1,2,4,5,5,5], categories=[5,4,3,2,1], ordered=True)
Expand Down
52 changes: 51 additions & 1 deletion pandas/tests/test_groupby.py
Expand Up @@ -3413,7 +3413,8 @@ def test_groupby_sort_categorical(self):

col = 'range'
assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
# when categories is ordered, group is ordered by category's order
assert_frame_equal(result_sort, df.groupby(col, sort=False).first())

df['range'] = Categorical(df['range'],ordered=False)
index = Index(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], dtype='object')
Expand All @@ -3431,6 +3432,55 @@ def test_groupby_sort_categorical(self):
assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())

def test_groupby_sort_categorical_datetimelike(self):
# GH10505
# Regression test: groupby(..., sort=False) on a Categorical column whose
# categories are datetimes. Expected frames hold the first 'foo'/'bar' row
# seen for each distinct date.

# Per the original note, this is the same data as
# test_groupby_sort_categorical, with each category represented by a
# datetime that differs only in its month.
df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
datetime(2011, 2, 1), datetime(2011, 5, 1),
datetime(2011, 2, 1), datetime(2011, 1, 1),
datetime(2011, 5, 1)],
'foo': [10, 8, 5, 6, 4, 1, 7],
'bar': [10, 20, 30, 40, 50, 60, 70]},
columns=['dt', 'foo', 'bar'])

# ordered=True
df['dt'] = Categorical(df['dt'], ordered=True)
# expected result with groups in chronological (category) order
index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
datetime(2011, 5, 1), datetime(2011, 7, 1)]
result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
result_sort.index = CategoricalIndex(index, name='dt', ordered=True)

# frame with groups in order of first appearance in df
# NOTE(review): this result_nosort is built but never referenced by the
# ordered=True assertions below (both compare against result_sort); it is
# reassigned in the ordered=False section.
index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
datetime(2011, 5, 1), datetime(2011, 1, 1)]
result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
columns=['foo', 'bar'])
result_nosort.index = CategoricalIndex(index, categories=index,
name='dt', ordered=True)

col = 'dt'
assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
# when categories is ordered, group is ordered by category's order
# (so sort=False is expected to give the same frame as sort=True here)
assert_frame_equal(result_sort, df.groupby(col, sort=False).first())

# ordered = False
df['dt'] = Categorical(df['dt'], ordered=False)
# expected result for sort=True: groups in chronological order
index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
datetime(2011, 5, 1), datetime(2011, 7, 1)]
result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
result_sort.index = CategoricalIndex(index, name='dt')

# expected result for sort=False: groups (and index categories) in order
# of first appearance in df
index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
datetime(2011, 5, 1), datetime(2011, 1, 1)]
result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
columns=['foo', 'bar'])
result_nosort.index = CategoricalIndex(index, categories=index, name='dt')

col = 'dt'
assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())


def test_groupby_sort_multiindex_series(self):
# series multiindex groupby sort argument was not being passed through _compress_group_index
Expand Down

0 comments on commit c06f9ce

Please sign in to comment.