# Computations with Categoricals

In [38]:
import numpy as np
import pandas as pd

In [39]:
# Seed NumPy's legacy global generator so the sample below is reproducible.
np.random.seed(12345)
# Draw 1,000 standard-normal values and preview the first five.
draws = np.random.standard_normal(size=1000)
draws[:5]

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057])

In [40]:
# Bin the draws into quartiles (four equal-sized buckets); statistics are
# extracted from these bins further below.
bins = pd.qcut(draws, q=4)
bins

[(-0.684, -0.0101], (-0.0101, 0.63], (-0.684, -0.0101], (-0.684, -0.0101], (0.63, 3.928], ..., (-0.0101, 0.63], (-0.684, -0.0101], (-2.9499999999999997, -0.684], (-0.0101, 0.63], (0.63, 3.928]]
Length: 1000
Categories (4, interval[float64]): [(-2.9499999999999997, -0.684] < (-0.684, -0.0101] < (-0.0101, 0.63] < (0.63, 3.928]]

The exact sample quartile edges are informative, but in a report quartile names are
usually easier to read. We can get them by passing the `labels` argument to `qcut`:

In [44]:
# Same quartile binning, but each bin is named instead of shown as an interval.
bins = pd.qcut(
    draws,
    q=4,
    labels=['Q1', 'Q2', 'Q3', 'Q4'],
)

In [43]:
# Display the relabeled categorical: quartile names instead of interval edges.
bins

[Q2, Q3, Q2, Q2, Q4, ..., Q3, Q2, Q1, Q3, Q4]
Length: 1000
Categories (4, object): [Q1 < Q2 < Q3 < Q4]

In [46]:
# Integer category codes of the first ten draws (0 -> Q1, 1 -> Q2, 2 -> Q3, 3 -> Q4).
bins.codes[:10]

array([1, 2, 1, 1, 3, 3, 2, 2, 3, 3], dtype=int8)

The labeled `bins` categorical does not contain information about the bin edges in the
data, so we can use `groupby` to extract some summary statistics for each quartile:

In [50]:
# Wrap the labeled categorical in a named Series so that the group key, and the
# column created by reset_index(), is called 'quartile'.
bins = pd.Series(bins, name='quartile')

# Count, minimum and maximum of the draws within each quartile.
# observed=False is passed explicitly: grouping by a categorical key without it
# raises a FutureWarning on recent pandas, and stating it keeps every declared
# category in the result regardless of the library's default.
results = (
    pd.Series(draws)
    .groupby(bins, observed=False)
    .agg(['count', 'min', 'max'])
    .reset_index()
)
results

Unnamed: 0,quartile,count,min,max
0,Q1,250,-2.949343,-0.685484
1,Q2,250,-0.683066,-0.010115
2,Q3,250,-0.010032,0.628894
3,Q4,250,0.634238,3.927528


In [25]:
# The 'quartile' column in the result retains the original categorical
# information, including ordering, from bins:
results['quartile']

0    Q1
1    Q2
2    Q3
3    Q4
Name: quartile, dtype: category
Categories (4, object): [Q1 < Q2 < Q3 < Q4]

# Better performance with categoricals

In [26]:
# Build a large sample: ten million random values and an equally long Series of
# repeating string labels.
N = 10 ** 7
draws = pd.Series(np.random.standard_normal(N))
labels = pd.Series((N // 4) * ['foo', 'bar', 'baz', 'qux'])


In [27]:
# Now we convert labels to categorical:
categories = labels.astype('category')

In [28]:
# Now we note that labels uses significantly more memory than categories.
# NOTE(review): memory_usage() is called with its default deep=False, which for an
# object-dtype Series counts only the references, not the Python strings they
# point to; deep=True would report the full footprint.
labels.memory_usage()


80000128

In [29]:
# Memory used by the categorical version of the same labels, for comparison.
categories.memory_usage()

10000320

In [30]:
# The conversion to category is not free, of course, but it is a one-time cost:
%time _ = labels.astype('category')

Wall time: 514 ms


In [31]:
#GroupBy operations can be significantly faster with categoricals because the underlying
#algorithms use the integer-based codes array instead of an array of strings.

# Categorical Methods

In [32]:
# A Series holding categorical data exposes a special `cat` accessor, similar in
# spirit to the Series.str accessor for string methods. The accessor gives
# convenient access to the categories and codes. Consider the Series:
s = pd.Series(2 * ['a', 'b', 'c', 'd'])
cat_s = s.astype('category')
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): [a, b, c, d]

In [33]:
# Integer code stored for each element; each code is a position in cat_s.cat.categories.
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [34]:
# The distinct category values that the codes refer to.
cat_s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [35]:
# Suppose the real set of categories for this data is wider than the four values
# that happen to occur in it. The set_categories method replaces the categories
# with the full set:
actual_categories = ['a', 'b', 'c', 'd', 'e']
cat_s2 = cat_s.cat.set_categories(new_categories=actual_categories)
cat_s2

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (5, object): [a, b, c, d, e]

In [51]:
# Counts for the original categorical: only the four observed categories are listed.
cat_s.value_counts()

d    2
c    2
b    2
a    2
dtype: int64

In [52]:
# After set_categories, the unobserved category 'e' is included with a count of 0.
cat_s2.value_counts()

d    2
c    2
b    2
a    2
e    0
dtype: int64

In [55]:
# Keep only the rows whose value is 'a' or 'b'. The filtered Series still
# carries all four of the original categories.
cat_s3 = cat_s.loc[cat_s.isin(['a', 'b'])]
cat_s3

0    a
1    b
4    a
5    b
dtype: category
Categories (4, object): [a, b, c, d]

In [54]:
# Trim the categories that no longer occur in the data ('c' and 'd'); the result
# is displayed here but not assigned back to cat_s3.
cat_s3.cat.remove_unused_categories()

0    a
1    b
4    a
5    b
dtype: category
Categories (2, object): [a, b]

In [None]:
# Categorical methods available through the Series `cat` accessor:
# add_categories           : Append new (unused) categories at the end of the existing categories
# as_ordered               : Make categories ordered
# as_unordered             : Make categories unordered
# remove_categories        : Remove categories, setting any removed values to null
# remove_unused_categories : Remove any category values which do not appear in the data
# rename_categories        : Replace categories with the indicated set of new category names; cannot change the number of categories
# reorder_categories       : Reorder the existing categories into the given order (the new list must contain exactly the existing categories); can also change the result to have ordered categories
# set_categories           : Replace the categories with the indicated set of new categories; can add or remove categories