Skip to content

Commit

Permalink
ENH: union_categorical supports identical categories with ordered
Browse files Browse the repository at this point in the history
  • Loading branch information
sinhrks committed Jul 25, 2016
1 parent 474fd05 commit 9cadc4e
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 15 deletions.
7 changes: 4 additions & 3 deletions doc/source/categorical.rst
Expand Up @@ -669,9 +669,10 @@ will be the union of the categories being combined.
.. note::

`union_categoricals` only works with unordered categoricals
and will raise if any are ordered.

In addition to the "easy" case of combining two categoricals that have the
same categories and order information (e.g. a case where you could also use
``append``), ``union_categoricals`` only works with unordered categoricals
and will raise if any are ordered.

Getting Data In/Out
-------------------
Expand Down
61 changes: 53 additions & 8 deletions pandas/tools/tests/test_concat.py
Expand Up @@ -870,23 +870,26 @@ def test_union_categorical(self):
# new categories ordered by appearance
s = Categorical(['x', 'y', 'z'])
s2 = Categorical(['a', 'b', 'c'])
result = union_categoricals([s, s2]).categories
expected = Index(['x', 'y', 'z', 'a', 'b', 'c'])
tm.assert_index_equal(result, expected)
result = union_categoricals([s, s2])
expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
categories=['x', 'y', 'z', 'a', 'b', 'c'])
tm.assert_categorical_equal(result, expected)

# can't be ordered
s = Categorical([0, 1.2, 2], ordered=True)
s2 = Categorical([0, 1.2, 2], ordered=True)
with tm.assertRaises(TypeError):
union_categoricals([s, s2])
result = union_categoricals([s, s2])
expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
tm.assert_categorical_equal(result, expected)

# must exactly match types
s = Categorical([0, 1.2, 2])
s2 = Categorical([2, 3, 4])
with tm.assertRaises(TypeError):
msg = 'dtype of categories must be the same'
with tm.assertRaisesRegexp(TypeError, msg):
union_categoricals([s, s2])

with tm.assertRaises(ValueError):
msg = 'No Categoricals to union'
with tm.assertRaisesRegexp(ValueError, msg):
union_categoricals([])

def test_union_categoricals_nan(self):
Expand Down Expand Up @@ -942,6 +945,48 @@ def test_union_categoricals_empty(self):
pd.Categorical([])])
tm.assert_categorical_equal(res, nanc)

def test_union_categorical_same_category(self):
    # Inputs that share identical categories exercise the fastpath.
    int_cats = [1, 2, 3, 4]
    left = Categorical([1, 2, 3, 4], categories=int_cats)
    right = Categorical([3, 2, 1, np.nan], categories=int_cats)
    expected = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
                           categories=int_cats)
    tm.assert_categorical_equal(union_categoricals([left, right]),
                                expected)

    # Same check with string categories, one of which ('y') is unused.
    str_cats = ['x', 'y', 'z']
    left = Categorical(['z', 'z', 'z'], categories=str_cats)
    right = Categorical(['x', 'x', 'x'], categories=str_cats)
    expected = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
                           categories=str_cats)
    tm.assert_categorical_equal(union_categoricals([left, right]),
                                expected)

def test_union_categoricals_ordered(self):
    ordered = Categorical([1, 2, 3], ordered=True)
    unordered = Categorical([1, 2, 3], ordered=False)

    # mixing ordered and unordered inputs must raise
    expected_msg = 'Categorical.ordered must be the same'
    with tm.assertRaisesRegexp(TypeError, expected_msg):
        union_categoricals([ordered, unordered])

    # an ordered categorical unioned with itself stays ordered
    expected = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
    tm.assert_categorical_equal(union_categoricals([ordered, ordered]),
                                expected)

    # one input holds NaN, the other passes its categories explicitly
    with_nan = Categorical([1, 2, 3, np.nan], ordered=True)
    explicit = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
    expected = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
    tm.assert_categorical_equal(union_categoricals([with_nan, explicit]),
                                expected)

    # both ordered, but the categories are listed in a different order
    ascending = Categorical([1, 2, 3], ordered=True)
    descending = Categorical([1, 2, 3], categories=[3, 2, 1],
                             ordered=True)
    expected_msg = ("to union ordered Categoricals, all categories must "
                    "be the same")
    with tm.assertRaisesRegexp(TypeError, expected_msg):
        union_categoricals([ascending, descending])

def test_concat_bug_1719(self):
ts1 = tm.makeTimeSeries()
ts2 = tm.makeTimeSeries()[::2]
Expand Down
23 changes: 19 additions & 4 deletions pandas/types/concat.py
Expand Up @@ -231,8 +231,9 @@ def union_categoricals(to_union):
Raises
------
TypeError
If any of the categoricals are ordered or all do not
have the same dtype
- not all inputs have the same dtype
- not all inputs have the same ordered property
- all inputs are ordered and their categories are not identical
ValueError
Empty list of categoricals passed
"""
Expand All @@ -242,13 +243,27 @@ def union_categoricals(to_union):
raise ValueError('No Categoricals to union')

first = to_union[0]
if any(c.ordered for c in to_union):
raise TypeError("Can only combine unordered Categoricals")

if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype)
for c in to_union):
raise TypeError("dtype of categories must be the same")

if all(first.is_dtype_equal(other) for other in to_union[1:]):
return Categorical(np.concatenate([c.codes for c in to_union]),
categories=first.categories, ordered=first.ordered,
fastpath=True)
elif all(not c.ordered for c in to_union):
# not ordered
pass
else:
# to show a proper error message
if all(c.ordered for c in to_union):
msg = ("to union ordered Categoricals, "
"all categories must be the same")
raise TypeError(msg)
else:
raise TypeError('Categorical.ordered must be the same')

cats = first.categories
unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
categories = Index(unique_cats)
Expand Down

0 comments on commit 9cadc4e

Please sign in to comment.