# Categorical Data

In [4]:
import numpy as np
import pandas as pd

# Eight observations that take only two distinct string values.
values = pd.Series(['apple', 'orange', 'apple', 'apple'] * 2)

In [3]:
# A bare final expression is rendered as the cell's output.
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [4]:
# Distinct values, in order of first appearance.
values.unique()

array(['apple', 'orange'], dtype=object)

In [5]:
# Count how often each distinct value occurs. The Series method is used because
# the top-level pd.value_counts function is deprecated in recent pandas releases.
values.value_counts()

apple     6
orange    2
dtype: int64

In [11]:
# The same eight observations expressed as integer codes; the codes are used
# in a later cell as positions into the `dim` Series of distinct values.
values = pd.Series(2 * [0, 1, 0, 0])
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [8]:
# Lookup table of the distinct values: position 0 is 'apple', position 1 is 'orange'.
dim = pd.Series(["apple", "orange"])

In [10]:
# Show the two-element lookup Series.
dim

0     apple
1    orange
dtype: object

In [12]:
# Rebuild the original strings from the integer codes: take() picks the
# element of `dim` at each position listed in `values`.
dim.take(indices=values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

The categorical representation can yield significant performance improvements when
you are doing analytics. You can also perform transformations on the categories while
leaving the codes unmodified. Some example transformations that can be made at
relatively low cost are:
• Renaming categories
• Appending a new category without changing the order or position of the existing
categories

# Categorical Type in pandas

pandas has a special Categorical type for holding data that uses the integer-based
categorical representation or encoding.

In [1]:
# Eight fruit labels: a four-item pattern repeated twice.
fruits = 2 * ["apple", "orange", "apple", "apple"]

In [2]:
# Number of fruit labels; reused below as the row count of the DataFrame.
N = len(fruits)

In [6]:
# Build a small demo table with one row per basket. The random columns come
# from a seeded Generator so that Restart & Run All reproduces the same
# numbers instead of drawing new ones on every run.
rng = np.random.default_rng(12345)
df = pd.DataFrame(
    {
        'fruit': fruits,
        'basket_id': np.arange(N),
        'count': rng.integers(3, 15, size=N),   # random integers in [3, 15)
        'weight': rng.uniform(0, 4, size=N),    # uniform random floats in [0, 4)
    },
    columns=['basket_id', 'fruit', 'count', 'weight'],  # fixes the column order
)

In [7]:
# Show the whole frame; it has only N rows.
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,6,2.946792
1,1,orange,3,1.859127
2,2,apple,8,0.892211
3,3,apple,5,2.425582
4,4,apple,12,3.154117
5,5,orange,14,3.654027
6,6,apple,5,0.576503
7,7,apple,9,1.65286


In [9]:
#Here, df['fruit'] is an array of Python string objects. We can convert it to categorical by calling:

In [10]:
# Convert the string column to the category dtype and show the result.
fruit_cat = df["fruit"].astype("category")
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): [apple, orange]

In [11]:
# The data backing fruit_cat is not a NumPy array but a pandas.Categorical.
# .array is used rather than .values because it is the accessor pandas
# recommends for reaching a Series' backing array.
c = fruit_cat.array
type(c)

pandas.core.arrays.categorical.Categorical

In [12]:
#The Categorical object has categories and codes attributes:

In [13]:
# The distinct category labels, held in a pandas Index.
c.categories

Index(['apple', 'orange'], dtype='object')

In [14]:
# One small integer per element, giving that element's position in c.categories.
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [16]:
# Assigning the converted Series back to the column makes the DataFrame
# column itself categorical.
df["fruit"] = df["fruit"].astype("category")
df["fruit"]

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): [apple, orange]

In [17]:
#You can also create pandas.Categorical directly from other types of Python sequences:

In [18]:
# Build a Categorical straight from a plain Python list of labels.
my_categories = pd.Categorical(["foo", "bar", "baz", "foo", "bar"])
my_categories

[foo, bar, baz, foo, bar]
Categories (3, object): [bar, baz, foo]

In [19]:
# Data that arrives already encoded (a list of labels plus integer codes
# pointing into it) can be wrapped with the alternative from_codes constructor.
categories = ["foo", "bar", "baz"]
codes = [0, 1, 2, 0, 0, 1]
my_cats_2 = pd.Categorical.from_codes(codes=codes, categories=categories)
my_cats_2

[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo, bar, baz]

In [20]:
# Categorical conversions assume no particular ordering of the categories
# unless one is explicitly specified, so the categories array may come out in
# a different order depending on the ordering of the input data. from_codes
# (like the other constructors) accepts ordered=True to declare that the
# categories have a meaningful ordering.
ordered_cat = pd.Categorical.from_codes(codes, categories, ordered=True)
ordered_cat

[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo < bar < baz]

In [21]:
# The output [foo < bar < baz] indicates that 'foo' precedes 'bar' in the ordering,
# and so on. An unordered categorical instance can be made ordered with as_ordered.
# The result is only displayed here; it is not assigned back to my_cats_2.
my_cats_2.as_ordered()

[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo < bar < baz]

In [22]:
#As a last note, categorical data need not be strings, even though I have only shown
#string examples. A categorical array can consist of any immutable value type.