## One-hot encoding for categorical data

In [1]:
import uuid

import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

Let's build a DataFrame with 10 columns of random 4-character strings.

In [13]:
%%time
# Number of random strings generated per column, i.e. the row count of the
# benchmark DataFrame built in this cell.
n_samples = 500_000

def make_string_data(n_samples):
    """Return a list of ``n_samples`` random 4-character strings.

    Each string consists of the first four characters of a freshly generated
    UUID4, i.e. four lowercase hexadecimal digits.
    """
    values = []
    for _ in range(n_samples):
        # The first four characters of str(UUID) and of UUID.hex are the same.
        values.append(uuid.uuid4().hex[:4])
    return values

# Ten columns (col_0 .. col_9), each filled with its own batch of random strings.
df = pd.DataFrame(
    dict((f'col_{idx}', make_string_data(n_samples)) for idx in range(10))
)

CPU times: user 20.2 s, sys: 5.45 s, total: 25.7 s
Wall time: 25.7 s


In [14]:
# Preview the first rows of the generated data.
df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
0,1d47,ca9d,e04e,0005,17d4,1844,3870,0923,2c6b,8a61
1,cf2c,b01f,b30b,8043,d6a4,5bc6,d2fe,75d5,ed05,9bec
2,e623,9870,bbe8,1391,510a,ab8d,f729,ee38,97f5,8631
3,6415,d013,ea7b,c71b,97af,d281,56cb,d997,a7eb,35a2
4,8707,bc88,cc83,f8ad,06a7,2202,1882,e99e,01dd,df34


In [108]:
# Number of distinct values per column. Four hexadecimal digits allow at most
# 16**4 = 65536 different strings.
df.nunique(axis=0)

col_0    65498
col_1    65497
col_2    65510
col_3    65509
col_4    65508
col_5    65509
col_6    65498
col_7    65506
col_8    65509
col_9    65499
dtype: int64

In [15]:
# (rows, columns) of the benchmark DataFrame.
df.shape

(500000, 10)

In [81]:
%%timeit

# Baseline: scikit-learn's OneHotEncoder applied to the string columns of `df`.
OneHotEncoder().fit_transform(df)

5.65 s ± 292 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
# Same data with every column converted to pandas' categorical dtype.
df_cat = df.astype('category')

In [72]:
%%timeit

# Baseline again, this time on the categorical-dtype copy of the data.
OneHotEncoder().fit_transform(df_cat)

5.31 s ± 218 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [127]:
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_array
import itertools

import scipy.sparse

class OneHotCatEncoder(BaseEstimator):
    """One-hot encode DataFrame columns through pandas categorical codes.

    Every column is viewed as a pandas categorical. Its integer codes are used
    directly as the column indices of a sparse CSR indicator matrix, and the
    per-column matrices are stacked horizontally.

    Parameters
    ----------
    handle_na : {'error', 'category'}, default='error'
        How to treat missing values. 'error' raises ``ValueError`` when a
        column contains any; 'category' encodes them as an additional
        category labelled ``'NA'``.
    """

    def __init__(self, handle_na='error'):
        self.handle_na = handle_na

    def fit_transform(self, X, y=None):
        """Learn the categories of ``X`` and return its one-hot encoding.

        Parameters
        ----------
        X : pandas.DataFrame
            Input data. Columns that are not already of categorical dtype are
            converted with ``astype('category')``.
        y : ignored
            Accepted for API compatibility only.

        Returns
        -------
        scipy.sparse matrix in CSR format
            One row per sample and one column per (input column, observed
            category) pair. Categories that never occur in the data are
            dropped.

        Raises
        ------
        ValueError
            If ``X`` is not a DataFrame, if ``handle_na`` is not a supported
            value, or if ``handle_na='error'`` and a column contains missing
            values.
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError(
                f'X must be a pandas DataFrame, got {type(X).__name__}')
        # Validate once up front so a bad value is reported regardless of how
        # many columns X has.
        if self.handle_na not in ('error', 'category'):
            raise ValueError(
                "handle_na must be 'error' or 'category', "
                f'got {self.handle_na!r}')

        self._categories = []
        # Initialised for every fit but not used anywhere in this class.
        self._feature_mask = []
        self._columns = []
        n_samples = X.shape[0]
        out = []
        for col in X.columns:
            ds = X[col]
            if ds.dtype.name != 'category':
                ds = ds.astype('category')

            if self.handle_na == 'error':
                if ds.isna().any():
                    raise ValueError('NA found')
            else:  # self.handle_na == 'category'
                if 'NA' not in ds.cat.categories:
                    ds = ds.cat.add_categories(['NA'])
                # Fill unconditionally: when 'NA' is already a category the
                # missing values must still be mapped to it, otherwise their
                # code of -1 would be used as a (invalid) column index below.
                ds = ds.fillna('NA')

            # Row i holds a single 1.0 in the column given by its category
            # code, hence indptr = [0, 1, ..., n_samples].
            X_res = scipy.sparse.csr_matrix((
                np.ones(n_samples),
                ds.cat.codes.to_numpy(),
                np.arange(n_samples + 1)
            ), shape=(n_samples, len(ds.cat.categories)))
            # remove categories with no data
            mask = np.asarray(X_res.sum(axis=0)).ravel() > 0
            X_res = X_res[:, mask]
            self._columns.append(col)
            self._categories.append(ds.cat.categories[mask])
            out.append(X_res)
        return scipy.sparse.hstack(out, format='csr')

    def get_feature_names(self):
        """Return output feature names as ``'<column>_<category>'`` strings.

        The names follow the column order of the matrix returned by
        ``fit_transform``, which must have been called beforehand.
        """
        res = [[f'{col}_{category}' for category in self._categories[idx]]
               for idx, col in enumerate(self._columns)]
        return list(itertools.chain.from_iterable(res))

    def fit(self, X, y=None):
        """Not implemented yet; use ``fit_transform``."""
        raise NotImplementedError('TODO')

    def transform(self, X, y=None):
        """Not implemented yet; use ``fit_transform``."""
        raise NotImplementedError('TODO')

In [115]:
%%timeit

# Custom encoder on columns that already have categorical dtype, so no dtype
# conversion is needed inside fit_transform.

OneHotCatEncoder().fit_transform(df_cat)

316 ms ± 26.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [118]:
%%timeit

# Custom encoder on the object-dtype columns: each column is converted to
# categorical inside fit_transform, and that conversion is part of the timing.

OneHotCatEncoder().fit_transform(df)

2.23 s ± 127 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [116]:
from numpy.testing import assert_allclose

# Reference result from scikit-learn.
ohe2 = OneHotEncoder()
X2 = ohe2.fit_transform(df_cat)

# Result from the custom encoder on the same input.
ohe3 = OneHotCatEncoder()
X3 = ohe3.fit_transform(df_cat)

# Both sparse matrices must agree in shape, storage format and content.
assert X2.shape == X3.shape
assert X2.format == X3.format
assert_allclose(X2.data, X3.data)
assert_allclose(X2.indices, X3.indices)
assert_allclose(X2.indptr, X3.indptr)

# scikit-learn 1.0 introduced `get_feature_names_out` and 1.2 removed
# `get_feature_names`; use whichever the installed version provides.
sk_feature_names = (
    getattr(ohe2, 'get_feature_names_out', None) or ohe2.get_feature_names
)
assert list(sk_feature_names(df_cat.columns)) == list(ohe3.get_feature_names())

## NA support

In [130]:
df2 = pd.DataFrame({'a': [1, 2, 1], "b": ['c', 'd', 'c']})

# No missing values here, so handle_na="error" encodes without raising.
ohe = OneHotCatEncoder(handle_na="error")
# Use the encoder created in this cell; the previous code referenced `est`,
# which is not defined until a later cell and fails on a fresh kernel.
pd.DataFrame(ohe.fit_transform(df2).toarray(), columns=ohe.get_feature_names())

Unnamed: 0,a_1,a_2,b_c,b_d
0,1.0,0.0,1.0,0.0
1,0.0,1.0,0.0,1.0
2,1.0,0.0,1.0,0.0


In [134]:
# Column 'a' contains a missing value; with handle_na="category" it is encoded
# as its own 'NA' category.
df3 = pd.DataFrame({
    'a': np.array([1, 2, np.nan], dtype=object),
    "b": ['c', 'd', 'c'],
})

est = OneHotCatEncoder(handle_na="category")
pd.DataFrame(
    est.fit_transform(df3).toarray(),
    columns=est.get_feature_names(),
)

Unnamed: 0,a_1,a_2,a_NA,b_c,b_d
0,1.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,1.0,1.0,0.0
