# Robustly Encoding categorical features

The categorical features in the Tabular Playground Series are stored as strings, so they must be encoded before they can be used in machine learning models. I would like to build an encoding that is robust to new categories that were not seen in the training data.

# Load data

In [None]:
import pandas as pd

In [None]:
# Read the competition's train and test tables, then preview the training data.
train, test = map(
    pd.read_csv,
    (
        '../input/tabular-playground-series-feb-2021/train.csv',
        '../input/tabular-playground-series-feb-2021/test.csv',
    ),
)
train.head()

In [None]:
# Collect the names of all columns whose name begins with 'cat'.
cat_cols = list(filter(lambda name: name.startswith('cat'), train.columns))
cat_cols

# Test sample with new categories to validate encodings

In [None]:
# Build one synthetic test row in which every categorical column holds the
# value 'Z', 'id' is 1 and all remaining columns are 0. The row is keyed by
# column name, so it depends neither on the column order of `test` nor on a
# hard-coded count of non-categorical columns.
new_row = {col: ('Z' if col in cat_cols else 0) for col in test.columns}
new_row['id'] = 1
test_new_cats = pd.DataFrame([new_row], columns=test.columns)
test_new_cats

# Using the LabelEncoder
The sklearn LabelEncoder can encode strings to some integer label.

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fit a label encoder on the values of the 'cat8' training column.
le = LabelEncoder().fit(train['cat8'])
le

In [None]:
# Encode the label 'A' with the encoder that was fitted on train['cat8'].
le.transform(['A'])

The LabelEncoder is meant to encode the target variable, not the features. It cannot handle categories it has not seen before: transforming one raises a `ValueError`, as the next cell shows.

In [None]:
# LabelEncoder raises ValueError for a label it was not fitted on. Catch the
# error so the failure is still shown without aborting a top-to-bottom run of
# the notebook.
try:
    encoded = le.transform(['Z'])
except ValueError as err:
    print(f'LabelEncoder cannot encode an unseen category: {err}')
else:
    # Only reached if 'Z' happens to be a known label after all.
    print(encoded)

# Using pandas CategoricalDtype

The CategoricalDtype from pandas can be used in many machine learning models, including LightGBM and CatBoost. I created a simple transformer that 'learns' the available categories from the training data and encodes strings to categories.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from pandas.api.types import CategoricalDtype

class CategoricalTransform(BaseEstimator, TransformerMixin):
    """Cast columns to pandas categoricals learned from the training data.

    ``fit`` records, for every column in ``cat_cols``, a ``CategoricalDtype``
    built from the distinct non-null values seen in that column.
    ``transform`` casts the same columns to those dtypes, so a value that was
    not present during ``fit`` becomes ``NaN`` instead of raising.

    Parameters
    ----------
    cat_cols : list of str
        Names of the columns to convert.
    """

    def __init__(self, cat_cols):
        self.cat_cols = cat_cols

    def _transform_column(self, col, col_name):
        """Cast the Series ``col`` to the dtype learned for ``col_name``."""
        return col.astype(self.cat_type[col_name])

    def transform(self, df, **transform_params):
        """Return a copy of ``df`` with every ``cat_cols`` column converted.

        The input frame is left untouched; ``transform_params`` is accepted
        but not used.
        """
        df_cat = df.copy()
        for col in self.cat_cols:
            df_cat[col] = self._transform_column(df_cat[col], col)
        return df_cat

    def fit(self, X, y=None, **fit_params):
        """Learn the categories of every ``cat_cols`` column of ``X``.

        ``y`` and ``fit_params`` are accepted but not used. Returns ``self``.
        """
        # Missing values must not become categories: CategoricalDtype rejects
        # null categories, and NaN already serves as the "unknown" marker
        # after the cast in transform.
        self.cat_type = {
            col: CategoricalDtype(X[col].dropna().unique())
            for col in self.cat_cols
        }
        return self

In [None]:
# Transformer that will convert every column listed in cat_cols.
ct = CategoricalTransform(cat_cols)

In [None]:
# Learn the categories from the training data and convert it in one step,
# then print the resulting column dtypes.
t = ct.fit_transform(train)
t.info()

## Categories not seen during fitting are encoded as NaN:

In [None]:
# Show the synthetic row again, before it is transformed.
test_new_cats

In [None]:
# Apply the fitted transformer to the synthetic row of new categories.
ct.transform(test_new_cats)

## The transformer can be embedded in a sklearn Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from lightgbm.sklearn import LGBMRegressor

# Chain the categorical conversion with a LightGBM regressor so that both are
# fitted and applied together.
p = Pipeline(
    steps=[
        ('cat_trans', CategoricalTransform(cat_cols)),
        ('lgbm', LGBMRegressor(n_jobs=-2)),
    ]
)

In [None]:
# Separate the target from the features; 'id' is not used as a feature.
y_train = train['target']
x_train = train.drop(columns=['target', 'id'])
p.fit(X=x_train, y=y_train)

In [None]:
# Predict on the test set; 'id' is dropped because the pipeline was fitted
# without that column.
p.predict(test.drop(columns=['id']))

In [None]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the whole pipeline, scored with negated RMSE and
# also reporting the scores on the training folds.
scores = cross_validate(
    p,
    X=x_train,
    y=y_train,
    cv=5,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
)
scores

In [None]:
# Average of the per-fold test scores.
scores['test_score'].mean()

## Prediction with new categories

In [None]:
# Predict for the synthetic row of new categories ('id' dropped as in fit).
p.predict(test_new_cats.drop(columns=['id']))

# Transform to integer value
If you need to transform to an integer value, for example to train an embedding in tensorflow, you can use the codes from the categorical feature instead. Below is a small transformer to do so.

In [None]:
class IntegerCategoricalTransform(CategoricalTransform):
    """Variant of CategoricalTransform that emits integer category codes."""

    def _transform_column(self, col, col_name):
        """Return the category codes of ``col`` after the categorical cast."""
        as_category = super()._transform_column(col, col_name)
        return as_category.values.codes

In [None]:
# Rebind ct to the integer-code variant of the transformer.
ct = IntegerCategoricalTransform(cat_cols)

In [None]:
# Fit on the training data and convert it in one step, then print the
# resulting column dtypes.
t = ct.fit_transform(train)
t.info()

## Missing values (including categories not seen during fitting) are encoded as -1

In [None]:
# Apply the fitted integer transformer to the synthetic row of new categories.
ct.transform(test_new_cats)

# Using this in a sklearn Pipeline
Here I'm using CatBoost, as it is not trivial to use integer encoded features in the sklearn API of LightGBM.

In [None]:
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor

# Integer-encode the categorical columns, then pass them to CatBoost, which is
# told through cat_features which columns are categorical.
p = Pipeline(
    steps=[
        ('cat_trans', IntegerCategoricalTransform(cat_cols)),
        ('cb', CatBoostRegressor(cat_features=cat_cols, iterations=50, thread_count=3)),
    ]
)

In [None]:
# Separate the target from the features; 'id' is not used as a feature.
y_train = train['target']
x_train = train.drop(columns=['target', 'id'])
p.fit(X=x_train, y=y_train)

In [None]:
# Predict on the test set; 'id' is dropped because the pipeline was fitted
# without that column.
p.predict(test.drop(columns=['id']))

In [None]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the whole pipeline, scored with negated RMSE and
# also reporting the scores on the training folds.
scores = cross_validate(
    p,
    X=x_train,
    y=y_train,
    cv=5,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
)
scores

In [None]:
# Average of the per-fold test scores.
scores['test_score'].mean()

In [None]:
# Predict for the synthetic row of new categories ('id' dropped as in fit).
p.predict(test_new_cats.drop(columns=['id']))

# One hot encoding
Robust one-hot encoding can be achieved by chaining the categorical transformer with the `OneHotTransform` below.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class OneHotTransform(BaseEstimator, TransformerMixin):
    """Stateless transformer that expands a frame with ``pd.get_dummies``."""

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, df, **transform_params):
        """Return the dummy-encoded version of ``df``."""
        encoded = pd.get_dummies(df)
        return encoded

In [None]:
# Convert to categoricals first, then expand them into dummy columns.
oh_pipe = Pipeline(
    steps=[
        ('cat_trans', CategoricalTransform(cat_cols)),
        ('oh_trans', OneHotTransform()),
    ]
)

In [None]:
# Fit the one-hot pipeline on the training data, transform it, and preview
# the result.
train_oh = oh_pipe.fit_transform(train)
train_oh.head()

In [None]:
# Names of the encoded columns whose name begins with 'cat5'.
cat5_cols = list(filter(lambda name: name.startswith('cat5'), train_oh.columns))
cat5_cols

In [None]:
# Show only the encoded columns selected in cat5_cols.
train_oh[cat5_cols]

## New categories are encoded as zeros for each category column.

In [None]:
# Encode the synthetic row of new categories and show the same cat5 columns.
oh_pipe.transform(test_new_cats)[cat5_cols]

# Using this in a sklearn Pipeline
The sklearn `RandomForestRegressor` does not support categorical variables, so in this example I use one-hot encoding for the categorical features. Note that the one-hot pipeline defined before can be used as an element in the new pipeline.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Reuse the one-hot pipeline as the first step in front of a random forest.
p = Pipeline(
    steps=[
        ('oh_trans', oh_pipe),
        ('rf', RandomForestRegressor(n_jobs=-2)),
    ]
)

In [None]:
# Separate the target from the features; 'id' is not used as a feature.
y_train = train['target']
x_train = train.drop(columns=['target', 'id'])
# Fit on the first 10000 rows only.
p.fit(X=x_train.head(10000), y=y_train.head(10000))

In [None]:
# Predict on the test set; 'id' is dropped because the pipeline was fitted
# without that column.
p.predict(test.drop(columns=['id']))

In [None]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation on the first 10000 rows, scored with negated RMSE
# and also reporting the scores on the training folds.
scores = cross_validate(
    p,
    X=x_train.head(10000),
    y=y_train.head(10000),
    cv=5,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
)
scores

In [None]:
# Average of the per-fold test scores.
scores['test_score'].mean()

In [None]:
# Predict for the synthetic row of new categories ('id' dropped as in fit).
p.predict(test_new_cats.drop(columns=['id']))