## Chapter 17 - Categorical Manipulation

In [1]:
import pandas as pd

# Zipped CSV of the vehicles dataset, read directly from GitHub.
url = 'https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip'
# Parse with the pyarrow engine and keep the columns in pyarrow-backed dtypes.
df = pd.read_csv(url, dtype_backend='pyarrow', engine='pyarrow')
# Pull out the `make` column; the cells below work on this Series.
make = df['make']
make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: string[pyarrow]

In [2]:
# Tally how often each make occurs; value_counts lists the most frequent first.
make.value_counts()

make
Chevrolet                           4003
Ford                                3371
Dodge                               2583
GMC                                 2494
Toyota                              2071
                                    ... 
Grumman Allied Industries              1
Environmental Rsch and Devp Corp       1
General Motors                         1
Goldacre                               1
Isis Imports Ltd                       1
Name: count, Length: 136, dtype: int64[pyarrow]

In [3]:
# Number of rows versus number of distinct makes.
make.shape, make.nunique()

((41144,), 136)

In [8]:
# Encode the make column as a pandas categorical.
cat_make = make.astype('category')

# Deep memory footprint (in bytes) of each representation, measured once.
string_bytes = make.memory_usage(deep=True)
category_bytes = cat_make.memory_usage(deep=True)
print(string_bytes, category_bytes)

# Relative change going from the string column to the categorical;
# a negative value means the categorical is smaller.
print((category_bytes - string_bytes) / string_bytes)

425767 88701
-0.7916677431552939


In [9]:
# Copy of the column cast with astype(str); used as the baseline in the
# %%timeit comparison that follows.
# NOTE(review): presumably this yields the non-pyarrow string representation —
# confirm the resulting dtype for the pandas version in use.
old_make = make.astype(str)

In [10]:
%%timeit
# Baseline timing: upper-case the astype(str) copy of the column.
old_make.str.upper()

2.41 ms ± 22.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
%%timeit
# Same operation on the original string[pyarrow] column, for comparison.
make.str.upper()

480 µs ± 14.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [15]:
# The distinct makes, sorted, become the ordered category levels.
sorted_makes = sorted(make.unique())
make_type = pd.CategoricalDtype(categories=sorted_makes, ordered=True)

# Cast the column to the ordered categorical dtype and display it.
ordered_make = make.astype(make_type)
ordered_make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General' < 'ASC Incorporated' < 'Acura' < 'Alfa Romeo' ... 'Volvo' < 'Wallace Environmental' < 'Yugo' < 'smart']

In [52]:
n = 10
# value_counts is ordered by frequency, so the first n index labels are the
# n most common makes.
topn = list(cat_make.value_counts().index[:n])

# Rows whose make is one of the top n keep their label ...
is_top = cat_make.isin(topn)
# ... after the categories are narrowed to the top n plus a catch-all level ...
narrowed = cat_make.cat.set_categories(topn + ['Other'])
# ... and every remaining row is relabelled with the catch-all.
narrowed.where(is_top, 'Other')

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: category
Categories (11, object): ['Chevrolet', 'Ford', 'Dodge', 'GMC', ..., 'Nissan', 'Volkswagen', 'Mitsubishi', 'Other']

In [58]:
def generalize_topn(ser: pd.Series, n: int, other: str = 'Other') -> pd.Series:
    """Keep the ``n`` most frequent values of ``ser`` and lump the rest together.

    Parameters
    ----------
    ser : pd.Series
        Series to generalize. A categorical Series is used as-is; any other
        dtype is first converted with ``astype('category')``.
    n : int
        Number of most frequent values to keep (taken from the head of
        ``ser.value_counts()``).
    other : str, default 'Other'
        Label assigned to every row whose value is not among the top ``n``.

    Returns
    -------
    pd.Series
        Categorical Series whose categories are the top ``n`` values followed
        by ``other``. Rows not in the top ``n`` are set to ``other``.
    """
    # The `.cat` accessor only exists on categorical data, so convert other
    # dtypes up front instead of failing with an AttributeError.
    if not isinstance(ser.dtype, pd.CategoricalDtype):
        ser = ser.astype('category')

    topn = list(ser.value_counts().index[:n])

    # Categories must be unique: if `other` is already one of the top-n
    # labels, do not append it a second time.
    categories = topn if other in topn else [*topn, other]

    return (ser
            .cat.set_categories(categories)
            .where(ser.isin(topn), other))

# .pipe passes cat_make as the first argument (`ser`) of generalize_topn.
cat_make.pipe(generalize_topn, n=10)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: category
Categories (11, object): ['Chevrolet', 'Ford', 'Dodge', 'GMC', ..., 'Nissan', 'Volkswagen', 'Mitsubishi', 'Other']