In [1]:
# import libraries

# data manipulation
import numpy as np
import pandas as pd

# machine learning pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import set_config
set_config(display="diagram")

In [2]:
# Create sample training data.
# Seed NumPy's global RNG so the draws below are reproducible.
np.random.seed(42)
n_train = 100
# One binary target plus two categorical features. The three columns are drawn
# from the global RNG in the order they are written (target, feature1,
# feature2), so reordering them would change the generated data.
train = pd.DataFrame({'target': np.random.choice([0, 1], size=n_train, p=[0.5, 0.5]), 
                      'feature1': np.random.choice(['a', 'b', 'b_', 'c'], size=n_train, p=[0.4, 0.2, 0.1, 0.3]),
                      'feature2': np.random.choice(['w', 'x', 'z'], size=n_train, p=[0.8, 0.15, 0.05])})
print(f"Training data: {train.shape}")
display(train.head())

Training data: (100, 3)


Unnamed: 0,target,feature1,feature2
0,0,a,w
1,1,b_,w
2,1,a,w
3,1,b,x
4,0,c,w


In [3]:
# Create sample test data.
# NOTE: the RNG is not re-seeded here; the draws continue the global stream
# seeded in the training-data cell, so the values depend on that cell having
# run immediately before this one.
n_test = 50
# The category sets differ from the training data on purpose: feature1 is drawn
# without 'c', and feature2 is drawn from 'w', 'y', 'z' ('y' never appears in
# the training choices, 'x' never appears here).
test = pd.DataFrame({'target': np.random.choice([0, 1], size=n_test, p=[0.5, 0.5]), 
                      'feature1': np.random.choice(['a', 'b', 'b_'], size=n_test, p=[0.5, 0.3, 0.2]),
                      'feature2': np.random.choice(['w', 'y', 'z'], size=n_test, p=[0.6, 0.2, 0.2])})
print(f"Test data: {test.shape}")
test.head()

Test data: (50, 3)


Unnamed: 0,target,feature1,feature2
0,0,b,w
1,1,b_,z
2,1,b,w
3,1,a,z
4,1,a,w


Partition the data and build a simple pipeline:

In [5]:
# Separate the label column from the feature columns for both splits.
y_train, y_test = train['target'], test['target']
X_train, X_test = (split.drop(columns='target') for split in (train, test))

# Baseline pipeline: one-hot encode every feature, then fit a gradient-boosted
# classifier. Categories unseen during fit are ignored by the encoder.
pipe0 = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ('model', GradientBoostingClassifier(random_state=42)),
])

pipe0.fit(X_train, y_train)

We kept the preprocessing steps simple so that we can focus on building custom transformers. Let's check the frequency of the features:

In [6]:
def find_frequency(x):
    """Tabulate how often each distinct value of ``x`` occurs.

    Parameters
    ----------
    x : pandas.Series
        Values to tabulate.

    Returns
    -------
    pandas.DataFrame
        Indexed by distinct value, with a 'count' column (absolute frequency)
        and a 'proportion' column (relative frequency).
    """
    column_names = ['count', 'proportion']
    # Absolute counts first, then the normalised version of the same tally.
    tallies = [x.value_counts(normalize=flag) for flag in (False, True)]
    return pd.concat(tallies, axis=1, keys=column_names)

# Category frequencies of both features in the training split.
print("===== Training data =====")
display(
    find_frequency(X_train['feature1']),
    find_frequency(X_train['feature2']),
)

# Same tables for the test split; the final table is shown as the cell result.
print("===== Test data =====")
display(find_frequency(X_test['feature1']))
find_frequency(X_test['feature2'])

===== Training data =====


Unnamed: 0,count,proportion
a,44,0.44
c,30,0.3
b,15,0.15
b_,11,0.11


Unnamed: 0,count,proportion
w,79,0.79
x,16,0.16
z,5,0.05


===== Test data =====


Unnamed: 0,count,proportion
a,22,0.44
b,15,0.3
b_,13,0.26


Unnamed: 0,count,proportion
w,28,0.56
z,12,0.24
y,10,0.2


### Example 1: Group Categories

In this section, we will focus on feature1 as feature2 doesn’t have a category
called b_. Let’s first preprocess the data manually to illustrate what we want
to achieve:

In [7]:
def group_categories(X, mapping):
    """Relabel values of ``X`` according to ``mapping``.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Data whose values should be relabelled.
    mapping : dict
        Passed to ``X.replace``; keys are the values to replace and values are
        their replacements.

    Returns
    -------
    The object returned by ``X.replace``; ``X`` itself is not modified.
    """
    regrouped = X.replace(to_replace=mapping)
    return regrouped

# Merge the 'b_' category into 'b' in both splits using the same mapping.
mapping = {'b_': 'b'}
X_train_grouped, X_test_grouped = (
    group_categories(split, mapping) for split in (X_train, X_test)
)

# feature1 frequencies after grouping: training first, then test as cell result.
display(find_frequency(X_train_grouped['feature1']))
find_frequency(X_test_grouped['feature1'])

Unnamed: 0,count,proportion
a,44,0.44
c,30,0.3
b,26,0.26


Unnamed: 0,count,proportion
b,28,0.56
a,22,0.44


We can see that b_ has been grouped together with b. Let’s now translate
this to a custom transformer with FunctionTransformer:

In [8]:
# Wrap the plain function in a transformer; the mapping is forwarded to
# group_categories as a keyword argument on every call.
category_grouper1 = FunctionTransformer(
    func=group_categories,
    kw_args=dict(mapping=mapping),
)

# Apply it to feature1 of each split and show the resulting frequencies.
display(find_frequency(category_grouper1.fit_transform(X_train['feature1'])))
find_frequency(category_grouper1.transform(X_test['feature1']))

Unnamed: 0,count,proportion
a,44,0.44
c,30,0.3
b,26,0.26


Unnamed: 0,count,proportion
b,28,0.56
a,22,0.44


We passed a preprocessing function and specified its arguments inside kw_args. The name FunctionTransformer is self-explanatory: it turns a function into a transformer.

Let’s look at the other approach. We will now create a CategoryGrouper class that inherits from BaseEstimator and TransformerMixin. By inheriting from these parent classes and defining the __init__(), fit() and transform() methods, we get a custom transformer:

In [9]:
class CategoryGrouper(BaseEstimator, TransformerMixin):
    """Stateless transformer that relabels category values via a fixed mapping.

    Parameters
    ----------
    mapping : dict
        Passed to ``X.replace`` in ``transform``; keys are the values to
        replace and values are their replacements.
    """

    def __init__(self, mapping):
        self.mapping = mapping

    def fit(self, X, y=None):
        """Learn nothing from the data and return the transformer itself."""
        return self

    def transform(self, X):
        """Return a relabelled copy of ``X``; the input is left untouched."""
        return X.copy().replace(self.mapping)

# Same grouping as before, this time through the class-based transformer.
category_grouper2 = CategoryGrouper(mapping=mapping)

# The transformer works on the whole frame, so pick feature1 afterwards.
display(find_frequency(category_grouper2.fit_transform(X_train)['feature1']))
find_frequency(category_grouper2.transform(X_test)['feature1'])

Unnamed: 0,count,proportion
a,44,0.44
c,30,0.3
b,26,0.26


Unnamed: 0,count,proportion
b,28,0.56
a,22,0.44


We get the same result. For this preprocessing task, both approaches work equally well. However, that’s not always the case. This particular preprocessing task is a stateless transformation, where nothing is learned during training: the fit() method in CategoryGrouper just returns self. In some preprocessing tasks, we need a stateful transformation, where information is learned during the training process. In our next example, we will see how the two approaches differ in a stateful transformation.

### Example 2: Reduce cardinality

Let’s check the frequency of the variables:

In [10]:
# Category frequencies of both features after grouping, training split first.
print("===== Training data =====")
display(
    find_frequency(X_train_grouped['feature1']),
    find_frequency(X_train_grouped['feature2']),
)

# Same tables for the test split; the final table is shown as the cell result.
print("===== Test data =====")
display(find_frequency(X_test_grouped['feature1']))
find_frequency(X_test_grouped['feature2'])

===== Training data =====


Unnamed: 0,count,proportion
a,44,0.44
c,30,0.3
b,26,0.26


Unnamed: 0,count,proportion
w,79,0.79
x,16,0.16
z,5,0.05


===== Test data =====


Unnamed: 0,count,proportion
b,28,0.56
a,22,0.44


Unnamed: 0,count,proportion
w,28,0.56
z,12,0.24
y,10,0.2


We will use an arbitrary threshold of 20% to define infrequent categories and group the minority categories together. Since all of feature1’s categories are above the threshold, we will focus on feature2 in this section. Let’s first manually transform it:

In [11]:
def find_top_categories(x, threshold=.2):
    """Return the values of ``x`` whose relative frequency is at least ``threshold``.

    Parameters
    ----------
    x : pandas.Series
        Categorical values to inspect.
    threshold : float, default .2
        Minimum proportion (inclusive) a value needs to be kept.

    Returns
    -------
    numpy.ndarray
        The qualifying values, taken from the index of the normalised
        value counts.
    """
    shares = x.value_counts(normalize=True)
    frequent = shares.loc[shares >= threshold]
    return frequent.index.values

X_train_reduced = X_train_grouped.copy()
X_test_reduced = X_test_grouped.copy()

# Fit: the frequent categories of each feature come from the training data only.
categories = {
    column: find_top_categories(X_train_grouped[column])
    for column in X_train_grouped.columns
}

# Transform: anything outside the learned categories becomes 'other', in both
# the training and the test data.
for feature in X_train_reduced.columns:
    X_train_reduced[feature] = np.where(
        X_train_reduced[feature].isin(categories[feature]),
        X_train_reduced[feature],
        'other',
    )
    X_test_reduced[feature] = np.where(
        X_test_reduced[feature].isin(categories[feature]),
        X_test_reduced[feature],
        'other',
    )

print("===== Training data =====")
display(find_frequency(X_train_reduced['feature2']))

print("===== Test data =====")
find_frequency(X_test_reduced['feature2'])

===== Training data =====


Unnamed: 0,count,proportion
w,79,0.79
other,21,0.21


===== Test data =====


Unnamed: 0,count,proportion
w,28,0.56
other,22,0.44


We will translate this into a custom transformer with FunctionTransformer:

In [12]:
def reduce_cardinality(X, threshold=.2):
    """Replace infrequent categories in every column of ``X`` with 'other'.

    The category proportions are computed from the ``X`` passed to this call;
    nothing is remembered between calls, so the function is stateless.

    Parameters
    ----------
    X : pandas.DataFrame
        Categorical features. The input frame is not modified.
    threshold : float, default .2
        Minimum proportion (inclusive) a category needs to be kept as is.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` where values below the threshold are replaced by
        'other'.
    """
    # Work on a copy so the caller's frame is not overwritten column by column.
    X = X.copy()
    for feature in X.columns:
        proportions = X[feature].value_counts(normalize=True)
        frequent = proportions[proportions >= threshold].index.values
        X[feature] = np.where(X[feature].isin(frequent), X[feature], 'other')
    return X

# Both preprocessing steps expressed as function-based transformers.
pipe1 = Pipeline(steps=[
    ('category_grouper', FunctionTransformer(func=group_categories, kw_args=dict(mapping=mapping))),
    ('cardinality_reducer', FunctionTransformer(func=reduce_cardinality)),
])

# feature2 frequencies after fitting and transforming the training data ...
print("===== Training data =====")
display(find_frequency(pipe1.fit_transform(X_train)['feature2']))

# ... and after transforming the test data with the fitted pipeline.
print("===== Test data =====")
find_frequency(pipe1.transform(X_test)['feature2'])

===== Training data =====


Unnamed: 0,count,proportion
w,79,0.79
other,21,0.21


===== Test data =====


Unnamed: 0,count,proportion
w,28,0.56
z,12,0.24
y,10,0.2


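The preprocessed test data looks different from what we expect: y and z were not grouped into other. The top categories were not learned during training. Instead, reduce_cardinality recomputes the proportions on whatever data it receives, and in the test data w, z and y all reach the 20% threshold, so nothing is grouped. This example illustrates why
FunctionTransformer shouldn’t be used for stateful transformations.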

Let’s look at the other method. In this example, we will create an additional method called find_top_categories to make the code more organised:

In [13]:
class CardinalityReducer(BaseEstimator, TransformerMixin):
    """Stateful transformer that lumps infrequent categories into 'other'.

    The frequent categories of every column are learned in ``fit`` and reused
    unchanged in ``transform``.

    Parameters
    ----------
    threshold : float, default .2
        Minimum proportion (inclusive) a category needs in the fitted data to
        be kept as is.
    """

    def __init__(self, threshold=.2):
        self.threshold = threshold

    def find_top_categories(self, feature):
        """Return the values of ``feature`` whose proportion is at least the threshold."""
        shares = feature.value_counts(normalize=True)
        return shares.loc[shares >= self.threshold].index.values

    def fit(self, X, y=None):
        """Record the columns of ``X`` and the frequent categories of each one."""
        self.columns = X.columns
        self.categories = {
            name: self.find_top_categories(X[name]) for name in self.columns
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with non-learned categories replaced by 'other'."""
        reduced = X.copy()
        for name in self.columns:
            column = reduced[name]
            reduced[name] = np.where(column.isin(self.categories[name]), column, 'other')
        return reduced

# Stateless grouping via FunctionTransformer, stateful reduction via the class.
pipe2 = Pipeline(steps=[
    ('category_grouper', FunctionTransformer(func=group_categories, kw_args=dict(mapping=mapping))),
    ('cardinality_reducer', CardinalityReducer()),
])

# feature2 frequencies after fitting and transforming the training data ...
print("===== Training data =====")
display(find_frequency(pipe2.fit_transform(X_train)['feature2']))

# ... and after transforming the test data with the categories learned above.
print("===== Test data =====")
find_frequency(pipe2.transform(X_test)['feature2'])

===== Training data =====


Unnamed: 0,count,proportion
w,79,0.79
other,21,0.21


===== Test data =====


Unnamed: 0,count,proportion
w,28,0.56
other,22,0.44


This output looks correct! This approach with BaseEstimator and TransformerMixin is more versatile and can be used for any transformation.

### Hyperparameter-tuning

By creating custom transformers, we can centralise and streamline our preprocessing steps with Scikit-learn’s Pipeline. Another benefit of creating custom transformers is that we can fine-tune their parameters alongside the model and other transformers. Let’s see an example:

In [15]:
# Full pipeline: group categories, reduce cardinality, one-hot encode, classify.
# The grouper gets a default mapping so that `pipe` can also be fitted on its
# own; without it group_categories would be called without its required
# `mapping` argument. The grid search below overrides kw_args per candidate.
pipe = Pipeline([
    ('category_grouper', FunctionTransformer(group_categories, kw_args={'mapping': mapping})),
    ('cardinality_reducer', CardinalityReducer()),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ('model', GradientBoostingClassifier(random_state=42)),
])

# Tune preprocessing and model parameters together: two candidate mappings,
# two frequency thresholds and two ensemble sizes.
param_grid = {'category_grouper__kw_args':[{'mapping': mapping}, {'mapping': {'b_': 'b', 'c': 'b'}}],
              'cardinality_reducer__threshold': [0.1, 0.2],
              'model__n_estimators':[50, 100]}

# 3-fold cross-validated search scored by accuracy.
search = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='accuracy', cv=3)
search.fit(X_train, y_train)
search.best_params_

{'cardinality_reducer__threshold': 0.2,
 'category_grouper__kw_args': {'mapping': {'b_': 'b'}},
 'model__n_estimators': 50}