In [11]:
import pandas as pd
from pandas.core.dtypes.common import is_dtype_equal
from pandas.api.types import CategoricalDtype

# Compatible Categorical Columns

In [24]:
# Two unordered categoricals. No explicit categories are passed, so pandas
# infers them from the values; only the order of the values differs.
cat1 = pd.Categorical(['a', 'b', 'c'], ordered=False)
cat2 = pd.Categorical(['b', 'a', 'c'], ordered=False)

a = pd.DataFrame(dict(A=cat1))
b = pd.DataFrame(dict(A=cat2))

# Report dtype equality, the inner join, and the dtypes of the join result.
print(is_dtype_equal(cat1, cat2))
print(pd.merge(a, b, how='inner', on='A'))
pd.merge(a, b, on='A').dtypes

True
   A
0  a
1  b
2  c


A    category
dtype: object

In [25]:
# Two ordered categoricals that declare the same categories in the same
# order (a < b < c); only the order of the values differs.
cat1 = pd.Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c'], ordered=True)
cat2 = pd.Categorical(['b', 'a', 'c'], categories=['a', 'b', 'c'], ordered=True)

a = pd.DataFrame(dict(A=cat1))
b = pd.DataFrame(dict(A=cat2))

# Report dtype equality, the inner join, and the dtypes of the join result.
print(is_dtype_equal(cat1, cat2))
print(pd.merge(a, b, how='inner', on='A'))
pd.merge(a, b, on='A').dtypes

True
   A
0  a
1  b
2  c


A    category
dtype: object

In [29]:
# Both sides declare the same ordered categories ['a', 'b', 'c'], so the
# dtypes are equal. The value 'd' in cat1 is not one of the declared
# categories, so pandas stores it as a missing value (NaN).
cat1 = pd.Categorical(list('abcd'), ordered=True, categories=['a', 'b', 'c'])
cat2 = pd.Categorical(list('bac'), ordered=True, categories=['a', 'b', 'c'])

a = pd.DataFrame({'A': cat1})
b = pd.DataFrame({'A': cat2})
print(is_dtype_equal(cat1, cat2))

# Merge once and reuse the result, so that the dtypes displayed below belong
# to the same join as the frame printed above them.
merged = a.merge(b, on='A', how='right')
print(merged)
merged.dtypes

True
   A
0  a
1  b
2  c


A    category
dtype: object

# Incompatible
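What if the categorical dtypes of the two key columns are not exactly equal (different categories, or a different `ordered` setting)? The merged key column seems to be cast to `object` in most cases. Should `merge` raise instead in all such cases?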

In [34]:
# Two ordered categoricals with the same labels but opposite category order:
# cat1 declares c < b < a, cat2 declares a < b < c.
cat1 = pd.Categorical(['a', 'b', 'c'], categories=['c', 'b', 'a'], ordered=True)
cat2 = pd.Categorical(['b', 'a', 'c'], categories=['a', 'b', 'c'], ordered=True)

a = pd.DataFrame(dict(A=cat1))
b = pd.DataFrame(dict(A=cat2))

# Report dtype equality, the inner join, and the dtypes of the join result.
print(is_dtype_equal(cat1, cat2))
print(pd.merge(a, b, how='inner', on='A'))
pd.merge(a, b, on='A').dtypes

False
   A
0  a
1  b
2  c


A    object
dtype: object

In [33]:
# Same labels on both sides, but cat1 is unordered (categories inferred from
# the values) while cat2 is ordered with explicit categories a < b < c.
cat1 = pd.Categorical(list('abc'), ordered=False)
cat2 = pd.Categorical(list('bac'), ordered=True, categories=['a', 'b', 'c'])

a = pd.DataFrame({'A': cat1})
b = pd.DataFrame({'A': cat2})
print(is_dtype_equal(cat1, cat2))

# Merge once and reuse the result, so that the dtypes displayed below belong
# to the same join as the frame printed above them.
merged = a.merge(b, on='A', how='inner')
print(merged)
merged.dtypes

False
   A
0  a
1  b
2  c


A    object
dtype: object

In [35]:
# Scenario 1 - unordered, categories inferred from the values: cat1 has the
# labels a-d, cat2 only a-c, so cat2's categories are a subset of cat1's.
cat1 = pd.Categorical(list('abcd'), ordered=False)
cat2 = pd.Categorical(list('bac'), ordered=False)

a = pd.DataFrame({'A': cat1})
b = pd.DataFrame({'A': cat2})
print(is_dtype_equal(cat1, cat2))

# Merge once and reuse the result, so that the dtypes printed belong to the
# same join as the frame printed just before them.
merged = a.merge(b, on='A', how='inner')
print(merged)
print(merged.dtypes)
print()

# Scenario 2 - the same subset relationship, but with ordered categoricals
# and explicitly declared categories.
cat1 = pd.Categorical(list('abcd'), ordered=True, categories=['a', 'b', 'c', 'd'])
cat2 = pd.Categorical(list('bac'), ordered=True, categories=['a', 'b', 'c'])

a = pd.DataFrame({'A': cat1})
b = pd.DataFrame({'A': cat2})
print(is_dtype_equal(cat1, cat2))

merged = a.merge(b, on='A', how='outer')
print(merged)
merged.dtypes

False
   A
0  a
1  b
2  c
A    object
dtype: object

False
   A
0  a
1  b
2  c
3  d


A    object
dtype: object

In [40]:
# Scenario 1 - unordered categoricals whose inferred categories share no
# label at all (g, h, i versus a, b, c).
cat1 = pd.Categorical(list('ghi'), ordered=False)
cat2 = pd.Categorical(list('bac'), ordered=False)

a = pd.DataFrame({'A': cat1})
b = pd.DataFrame({'A': cat2})
print(is_dtype_equal(cat1, cat2))

# Merge once and reuse the result, so that the dtypes printed belong to the
# same join as the frame printed just before them.
merged = a.merge(b, on='A', how='outer')
print(merged)
print(merged.dtypes)
print()

# Scenario 2 - the same disjoint labels, but with ordered categoricals.
cat1 = pd.Categorical(list('ghi'), ordered=True)
cat2 = pd.Categorical(list('bac'), ordered=True)

a = pd.DataFrame({'A': cat1})
b = pd.DataFrame({'A': cat2})
print(is_dtype_equal(cat1, cat2))

# With no label in common, the inner join has no rows.
merged = a.merge(b, on='A', how='inner')
print(merged)
merged.dtypes

False
   A
0  g
1  h
2  i
3  b
4  a
5  c
A    object
dtype: object

False
Empty DataFrame
Columns: [A]
Index: []


A    object
dtype: object

In [44]:
# Unordered categoricals whose categories have different value types:
# integers on the left, strings on the right.
cat1 = pd.Categorical([0, 1, 2], ordered=False)
cat2 = pd.Categorical(['b', 'a', 'c'], ordered=False)

a = pd.DataFrame(dict(A=cat1))
b = pd.DataFrame(dict(A=cat2))

# Report dtype equality, the outer join, and the dtypes of the join result.
print(is_dtype_equal(cat1, cat2))
print(pd.merge(a, b, how='outer', on='A'))
pd.merge(a, b, how='outer', on='A').dtypes

False
   A
0  0
1  1
2  2
3  b
4  a
5  c


A    object
dtype: object

* np.nan