In [1]:
from typing import List
import numpy as np
import pandas as pd

In [2]:
# Why pandas has a categorical dtype: some columns hold only a small set of
# distinct values, repeated many times.
values = pd.Series(2 * ['apple', 'orange', 'apple', 'apple'])

In [3]:
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [4]:
pd.unique(values)

array(['apple', 'orange'], dtype=object)

In [5]:
# Use the Series.value_counts() method: the top-level pd.value_counts()
# function is deprecated in pandas 2.1.
values.value_counts()

apple     6
orange    2
dtype: int64

In [6]:
# One storage technique is a "dimension table": the table holds the distinct
# values, and the primary observations are stored as integer keys that
# reference positions in that table.
values = pd.Series(2 * [0, 1, 0, 0])

In [7]:
# Dimension table: position i holds the label that integer key i stands for.
dim = pd.Series(data=['apple', 'orange'])

In [8]:
values #the list of numeric values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [9]:
dim

0     apple
1    orange
dtype: object

In [10]:
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

In [11]:
# Storing values as integer codes is the categorical representation; the
# array of distinct values is called the categories, dictionary, or levels
# of the data.
# pandas implements this with the Categorical extension type, which is
# faster to work with than plain string data.
fruits: List[str] = 2 * ['apple', 'orange', 'apple', 'apple']
fruits

['apple', 'orange', 'apple', 'apple', 'apple', 'orange', 'apple', 'apple']

In [12]:
N = len(fruits)
N

8

In [13]:
rng = np.random.default_rng(seed=12345)
rng

Generator(PCG64) at 0x7F90E1503D80

In [14]:
# Build the demo frame with the dict keys already in display order, so no
# separate `columns=` reordering is needed. 'count' is drawn from rng before
# 'weight'; swapping those two entries would change the random values.
df = pd.DataFrame({
    'basket_id': np.arange(N),
    'fruit': fruits,
    'count': rng.integers(3, 15, size=N),
    'weight': rng.uniform(0, 4, size=N),
})
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,11,1.564438
1,1,orange,5,1.331256
2,2,apple,12,2.393235
3,3,apple,6,0.746937
4,4,apple,5,2.691024
5,5,orange,12,3.767211
6,6,apple,10,0.992983
7,7,apple,11,3.795525


In [15]:
df['fruit']

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: object

In [16]:
df['fruit'].dtype

dtype('O')

In [17]:
#convert df['fruit'] to a category
fruit_cat = df['fruit'].astype('category')
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [18]:
#values for fruit_cat are an instance of pd.Categorical
#we can access the values with the .array attribute
c = fruit_cat.array

In [19]:
type(c)

pandas.core.arrays.categorical.Categorical

In [20]:
#categorical object has categories and codes
c.categories

Index(['apple', 'orange'], dtype='object')

In [21]:
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [22]:
# The `cat` accessor (pg. 242) gives more detail on the categories and codes
# attributes.
# Trick for getting the mapping from integer code to category label:
{code: label for code, label in enumerate(c.categories)}

{0: 'apple', 1: 'orange'}

In [23]:
#convert a DataFrame column to category by assigning the converted result
df['fruit'] = df['fruit'].astype('category')

In [24]:
df['fruit']

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [25]:
# A pd.Categorical can also be created directly from a Python sequence.
my_categories = pd.Categorical(values=['foo', 'bar', 'baz', 'foo', 'bar'])

In [26]:
my_categories

['foo', 'bar', 'baz', 'foo', 'bar']
Categories (3, object): ['bar', 'baz', 'foo']

In [27]:
# If the categorical data comes already encoded from another source, use the
# alternative from_codes constructor, which takes the integer codes and the
# category labels separately.
categories: List[str] = ['foo', 'bar', 'baz']

In [28]:
codes = [0, 1, 2, 0, 0, 1]

In [29]:
my_cats_2 = pd.Categorical.from_codes(codes, categories)

In [30]:
my_cats_2

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']

In [31]:
#note: Categorical data doesn't have a specific order unless specified.
ordered_cat = pd.Categorical.from_codes(codes, categories,
                                       ordered=True)
ordered_cat

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']

In [32]:
#convert unordered categorical instances to ordered categorical instances with
#.as_ordered
my_cats_2

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']

In [33]:
my_cats_2.as_ordered()

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']

In [34]:
#Note: non-string datatypes also work with categories
#Make random categorical data and use pandas.qcut
rng = np.random.default_rng(seed=12345)

In [35]:
rng

Generator(PCG64) at 0x7F90E1658D60

In [36]:
draws = rng.standard_normal(1000)

In [37]:
draws[:5]

array([-1.42382504,  1.26372846, -0.87066174, -0.25917323, -0.07534331])

In [38]:
#compute a quartile binning and extract some statistics
bins = pd.qcut(draws, 4)

In [39]:
bins

[(-3.121, -0.675], (0.687, 3.211], (-3.121, -0.675], (-0.675, 0.0134], (-0.675, 0.0134], ..., (0.0134, 0.687], (0.0134, 0.687], (-0.675, 0.0134], (0.0134, 0.687], (-0.675, 0.0134]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.121, -0.675] < (-0.675, 0.0134] < (0.0134, 0.687] < (0.687, 3.211]]

In [40]:
# The exact sample-quartile edges are often less useful to a reader than
# quartile names; qcut's `labels` argument takes a list with one label per bin.
bins = pd.qcut(draws, q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])

In [41]:
bins

['Q1', 'Q4', 'Q1', 'Q2', 'Q2', ..., 'Q3', 'Q3', 'Q2', 'Q3', 'Q2']
Length: 1000
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [44]:
#bins doesn't contain information about the bin edges, so let's use groupby
#to extract summary statistics
bins = pd.Series(bins, name='quartile')
bins

0      Q1
1      Q4
2      Q1
3      Q2
4      Q2
       ..
995    Q3
996    Q3
997    Q2
998    Q3
999    Q2
Name: quartile, Length: 1000, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [47]:
# `bins` is categorical, so state `observed` explicitly: pandas 2.1+ emits a
# FutureWarning when it is omitted because the default is slated to change.
# observed=False keeps one row per category, matching the behaviour of the
# implicit default used so far.
results = (pd.Series(draws)
          .groupby(bins, observed=False)
          .agg(['count', 'min', 'max'])
          .reset_index())
results

Unnamed: 0,quartile,count,min,max
0,Q1,250,-3.119609,-0.678494
1,Q2,250,-0.673305,0.008009
2,Q3,250,0.018753,0.686183
3,Q4,250,0.688282,3.211418


In [48]:
#the quartile column retains the original categorical information, including ordering
results['quartile']

0    Q1
1    Q2
2    Q3
3    Q4
Name: quartile, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [49]:
#better performance with categoricals
N = 10_000_000

In [50]:
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))

In [51]:
labels

0          foo
1          bar
2          baz
3          qux
4          foo
          ... 
9999995    qux
9999996    foo
9999997    bar
9999998    baz
9999999    qux
Length: 10000000, dtype: object

In [52]:
# Convert labels to categorical.
# NOTE(review): this rebinds `categories`, which earlier held the plain list
# of category labels passed to pd.Categorical.from_codes. Re-running those
# from_codes cells out of order after this one would pick up this Series
# instead of the list; consider a distinct name here and in the
# memory_usage cell that reads it.
categories = labels.astype('category')

In [53]:
labels.memory_usage(deep=True)

600000128

In [54]:
categories.memory_usage(deep=True)

10000540