# RareLabelEncoder

The RareLabelEncoder() groups labels that show a small number of observations in the dataset into a new category called 'Rare'. This helps to avoid overfitting.

The argument ' tol ' indicates the percentage of observations that the label needs to have in order not to be re-grouped into the "Rare" label.<br> The argument n_categories indicates the minimum number of distinct categories that a variable needs to have for any of the labels to be re-grouped into 'Rare'.<br><br>
#### Note
If the number of labels is smaller than n_categories, then the encoder will not group the labels for that variable.

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from feature_engine.encoding import RareLabelEncoder

In [73]:
# Load titanic dataset from file

def load_titanic(filepath='titanic.csv'):
    # data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
    data = pd.read_csv(filepath)
    data = data.replace('?', np.nan)
    data['cabin'] = data['cabin'].astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')
    data['age'] = data['age'].astype('float').fillna(data.age.median())
    data['fare'] = data['fare'].astype('float').fillna(data.fare.median())
    data['embarked'].fillna('C', inplace=True)
    # data.drop(labels=['boat', 'body', 'home.dest', 'name', 'ticket'], axis=1, inplace=True)
    return data

In [74]:
# data = load_titanic("../data/titanic.csv")
data = load_titanic("../data/titanic-2/Titanic-Dataset.csv")
data.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,n,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,n,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,n,S


In [75]:
X = data.drop(['survived', 'name', 'ticket'], axis=1)
y = data.survived

In [76]:
# we will encode the below variables, they have no missing values
X[['cabin', 'pclass', 'embarked']].isnull().sum()

cabin       0
pclass      0
embarked    0
dtype: int64

In [77]:
''' Make sure that the variables are type (object).
if not, cast it as object , otherwise the transformer will either send an error (if we pass it as argument) 
or not pick it up (if we leave variables=None). '''

X[['cabin', 'pclass', 'embarked']].dtypes

cabin       object
pclass      object
embarked    object
dtype: object

In [78]:
# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_train.shape, X_test.shape

((623, 9), (268, 9))

The RareLabelEncoder() groups rare / infrequent categories in
a new category called "Rare", or any other name entered by the user.

For example in the variable colour,<br> if the percentage of observations
for the categories magenta, cyan and burgundy 
are < 5%, all those
categories will be replaced by the new label "Rare".

Note, infrequent labels can also be grouped under a user defined name, for
example 'Other'. The name to replace infrequent categories is defined
with the parameter replace_with.
   
The encoder will encode only categorical variables (type 'object'). A list
of variables can be passed as an argument. If no variables are passed as 
argument, the encoder will find and encode all categorical variables
(object type).

In [79]:
# Rare value encoder
'''
Parameters
----------

tol: float, default=0.05
    the minimum frequency a label should have to be considered frequent.
    Categories with frequencies lower than tol will be grouped.

n_categories: int, default=10
    the minimum number of categories a variable should have for the encoder
    to find frequent labels. If the variable contains less categories, all
    of them will be considered frequent.

max_n_categories: int, default=None
    the maximum number of categories that should be considered frequent.
    If None, all categories with frequency above the tolerance (tol) will be
    considered.

variables : list, default=None
    The list of categorical variables that will be encoded. If None, the 
    encoder will find and select all object type variables.

replace_with : string, default='Rare'
    The category name that will be used to replace infrequent categories.
'''

rare_encoder = RareLabelEncoder(tol=0.05,
                                n_categories=5,
                                variables=['cabin', 'pclass', 'embarked'])
rare_encoder.fit(X_train)



In [80]:
rare_encoder.encoder_dict_

{'cabin': ['n', 'C'], 'pclass': [1, 3, 2], 'embarked': ['S', 'C', 'Q']}

In [81]:
train_t = rare_encoder.transform(X_train)
test_t = rare_encoder.transform(X_train)

test_t.head()

Unnamed: 0,passengerid,pclass,sex,age,sibsp,parch,fare,cabin,embarked
857,858,1,male,51.0,0,0,26.55,Rare,S
52,53,1,female,49.0,1,0,76.7292,Rare,C
386,387,3,male,1.0,5,2,46.9,n,S
124,125,1,male,54.0,0,1,77.2875,Rare,S
578,579,3,female,28.0,1,0,14.4583,n,C


In [82]:
test_t.cabin.value_counts()

cabin
n       471
Rare    110
C        42
Name: count, dtype: int64

#### The user can change the string from 'Rare' to something else.

In [83]:
# Rare value encoder
rare_encoder = RareLabelEncoder(tol=0.03,
                                replace_with='Other',  # replacing 'Rare' with 'Other'
                                variables=['cabin', 'pclass', 'embarked'],
                                n_categories=2
                                )

rare_encoder.fit(X_train)

train_t = rare_encoder.transform(X_train)
test_t = rare_encoder.transform(X_train)

test_t.sample(5)

Unnamed: 0,passengerid,pclass,sex,age,sibsp,parch,fare,cabin,embarked
256,257,1,female,28.0,0,0,79.2,n,C
152,153,3,male,55.5,0,0,8.05,n,S
357,358,2,female,38.0,0,0,13.0,n,S
785,786,3,male,25.0,0,0,7.25,n,S
834,835,3,male,18.0,0,0,8.3,n,S


In [84]:
rare_encoder.encoder_dict_

{'cabin': ['n', 'C', 'B', 'E', 'D'],
 'pclass': [3, 1, 2],
 'embarked': ['S', 'C', 'Q']}

In [85]:
test_t.cabin.value_counts()

cabin
n        471
C         42
B         31
E         27
D         26
Other     26
Name: count, dtype: int64

#### The user can choose to retain only the most popular categories with the argument max_n_categories.

In [86]:
# Rare value encoder

rare_encoder = RareLabelEncoder(tol=0.03,
                                variables=['cabin', 'pclass', 'embarked'],
                                n_categories=2,
                                # keeps only the most popular 3 categories in every variable.
                                max_n_categories=3
                                )

rare_encoder.fit(X_train)

train_t = rare_encoder.transform(X_train)
test_t = rare_encoder.transform(X_train)

test_t.sample(5)

Unnamed: 0,passengerid,pclass,sex,age,sibsp,parch,fare,cabin,embarked
168,169,1,male,28.0,0,0,25.925,n,S
834,835,3,male,18.0,0,0,8.3,n,S
107,108,3,male,28.0,0,0,7.775,n,S
457,458,1,female,28.0,1,0,51.8625,Rare,S
807,808,3,female,18.0,0,0,7.775,n,S


In [87]:
rare_encoder.encoder_dict_

{'cabin': ['n', 'C', 'B'], 'pclass': [3, 1, 2], 'embarked': ['S', 'C', 'Q']}

### Automatically select all categorical variables

If no variable list is passed as argument, it selects all the categorical variables.

In [88]:
len(X_train['pclass'].unique()), len(X_train['sex'].unique()), len(X_train['embarked'].unique())

(3, 2, 3)

In [89]:
# # X_train['pclass'].value_counts(dropna=False)
# pclass_encoder = RareLabelEncoder(tol=0.03, n_categories=3)
# X_train['pclass'] = pclass_encoder.fit_transform(X_train[['pclass']])

In [90]:
# # X_train['sex'].value_counts(dropna=False)
# sex_encoder = RareLabelEncoder(tol=0.03, n_categories=2)
# X_train['sex'] = sex_encoder.fit_transform(X_train[['sex']])

In [91]:
# # X_train['embarked'].value_counts(dropna=False)
# embarked_encoder = RareLabelEncoder(tol=0.03, n_categories=3)
# X_train['embarked'] = embarked_encoder.fit_transform(X_train[['embarked']])

In [92]:
## Rare value encoder
rare_encoder = RareLabelEncoder(tol = 0.03, n_categories=3)
rare_encoder.fit(X_train)
rare_encoder.encoder_dict_



{'pclass': [1, 3, 2],
 'sex': ['male', 'female'],
 'cabin': ['n', 'C', 'B', 'E', 'D'],
 'embarked': ['S', 'C', 'Q']}

In [93]:
train_t = rare_encoder.transform(X_train)
test_t = rare_encoder.transform(X_train)
test_t.sample(5)

Unnamed: 0,passengerid,pclass,sex,age,sibsp,parch,fare,cabin,embarked
322,323,2,female,30.0,0,0,12.35,n,Q
450,451,2,male,36.0,1,2,27.75,n,S
835,836,1,female,39.0,1,1,83.1583,E,C
753,754,3,male,23.0,0,0,7.8958,n,S
624,625,3,male,21.0,0,0,16.1,n,S
