# 04 - Encoding de Categóricas

## Introdução

Com as 5 técnicas abaixo conseguimos lidar com a grande maioria das necessidades de conversão de veriáveis categóricas em numéricas. São eles:

- One Hot Encoding;
- Count/Frequency
- Target Encoding ("Model" encoding);
- Embedding;
- Ordinal.

## Importação

In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import median_absolute_error
%matplotlib inline

## Carga dos Dados

In [4]:
train = pd.read_csv("data-processed/train.csv")
val = pd.read_csv("data-processed/val.csv")

train['DATE_TIME'] = pd.to_datetime(train['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')
val['DATE_TIME'] = pd.to_datetime(val['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')

In [5]:
train2 = train.copy()
train2['WEEKDAY'] = train2['DATE_TIME'].dt.weekday
train2['HOUR'] = train2['DATE_TIME'].dt.hour
train2['MINUTE'] = train2['DATE_TIME'].dt.minute

val2 = val.copy()
val2['WEEKDAY'] = val2['DATE_TIME'].dt.weekday
val2['HOUR'] = val2['DATE_TIME'].dt.hour
val2['MINUTE'] = val2['DATE_TIME'].dt.minute
cats = ['SOURCE_KEY', 'WEEKDAY','HOUR', 'MINUTE']

## Count/Frequency Encoding

Faz um proporcional entre a quantidade de vezes que um tipo de valor aparece sobre o total de valores disponíveis.

Funciona muito bem quando temos muitos valores diferentes.

In [8]:
from category_encoders import CountEncoder

for c in cats:
    cenc = CountEncoder(normalize=False)
    train2['COUNT_{}'.format(c)] = cenc.fit_transform(train2[c])
    val2['COUNT_{}'.format(c)] = cenc.transform(val2[c])

In [12]:
from sklearn.ensemble import RandomForestRegressor

mdl = RandomForestRegressor(n_jobs=-1, random_state=0, n_estimators=100)

Xtr, ytr = train2.filter(regex=r"COUNT"), train2['Y4WIN']

mdl.fit(Xtr,ytr)

p = mdl.predict(val2.filter(regex=r"COUNT"))

# Faz a validação contra o Y4 normal por não sabemos o valor Y4WIN na produção
# então não adianta ficar tratando ele aqui
median_absolute_error(val2['Y4'], p)

2.598091929386687

## Target Encoding

Grande risco de overfitting.

In [13]:
from category_encoders import TargetEncoder

for c in cats:
    tenc = TargetEncoder()
    train2['TGT_{}'.format(c)] = tenc.fit_transform(train2[c], train2['Y4WIN'])
    val2['TGT_{}'.format(c)] = tenc.transform(val2[c])

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [14]:
from sklearn.ensemble import RandomForestRegressor

mdl = RandomForestRegressor(n_jobs=-1, random_state=0, n_estimators=100)

Xtr, ytr = train2.filter(regex=r"TGT"), train2['Y4WIN']

mdl.fit(Xtr,ytr)

p = mdl.predict(val2.filter(regex=r"TGT"))

median_absolute_error(val2['Y4'], p)

2.6372917733329757

## RAPIDS Target Encoding

In [16]:
from cuml.preprocessing import TargetEncoder

for c in cats:
    tenc = TargetEncoder(n_folds=2)
    train2['TGT_{}'.format(c)] = tenc.fit_transform(train2[c], train2['Y4WIN'])
    val2['TGT_{}'.format(c)] = tenc.transform(val2[c])

ModuleNotFoundError: No module named 'cuml'

In [None]:
from sklearn.ensemble import RandomForestRegressor

mdl = RandomForestRegressor(n_jobs=-1, random_state=0, n_estimators=100)

Xtr, ytr = train2.filter(regex=r"TGT"), train2['Y4WIN']

mdl.fit(Xtr,ytr)

p = mdl.predict(val2.filter(regex=r"TGT"))

median_absolute_error(val2['Y4'], p)

## RAPIDS KNN

In [None]:
from cuml.neighbors import KNeighborsRegressor

mdl = KNeighborsRegressor(n_neighbors=1, metric='manhattan')

Xtr, ytr = train2.filter(regex=r"TGT"), train2['Y4WIN']

mdl.fit(Xtr,ytr)

p = mdl.predict(val2.filter(regex=r"TGT"))

median_absolute_error(val2['Y4'], p)

## RAPIDS OHE + KNN

In [None]:
from cuml.preprocessing import OneHotEncoder
import cudf
import cupy

cudf_train2 = cudf.from_pandas(train2[cats])
cudf_val2 = cudf.from_pandas(val2[cats])

ohenc = OneHotEncoder(handle_unknown='ignore', sparse=False)
cudf_train2 = ohenc.fit_transform(cudf_train2)
cudf_val2 = ohenc.transform(cudf_val2)

In [None]:
from cuml.neighbors import KNeighborsRegressor
mdl = KNeighborsRegressor(n_neighbors=1, metric='euclidean')

Xtr, ytr = cudf_train2, train2['Y4WIN']
mdl.fit(Xtr,ytr)

p = mdl.predict(cudf_val2)
p = cupy.asnumpy(p)
median_absolute_error(val2['Y4'], p)

## Model Likelihood

Ótima forma de reduzir esparsidade.

In [17]:
from sklearn.linear_model import Ridge
from category_encoders import OneHotEncoder
from sklearn.pipeline import make_pipeline

In [18]:
cats = ['SOURCE_KEY', 'WEEKDAY','HOUR', 'MINUTE']

new_features = set()
new_features_models = list()
for c in cats:
    
    mdl = make_pipeline(OneHotEncoder(cols=[c]), Ridge())
    mdl.fit(train2[[c]], train2['Y4WIN'])
    feature = mdl.predict(train2[[c]])
    
    train2["P_{}".format(c)] = mdl.predict(train2[[c]])
    val2["P_{}".format(c)] = mdl.predict(val2[[c]])
    
    new_features.add("P_{}".format(c))
    
    #new_features_models.append(mdl)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [19]:
features = list(new_features)

from sklearn.ensemble import RandomForestRegressor

mdl = RandomForestRegressor(n_jobs=-1, random_state=0, n_estimators=100)

Xtr, ytr = train2[features].fillna(-1), train2['Y4']

mdl.fit(Xtr,ytr)

p = mdl.predict(val2[features].fillna(-1))

median_absolute_error(val2['Y4'], p)

3.1369276674286795

## Redução da Cardinalidade

In [20]:
level_counts = train['SOURCE_KEY'].value_counts()
level_counts

bvBOhCH3iADSZry    1526
1BY6WEcLGh8j5v7    1525
VHMLBKoKgIrUVDU    1511
7JYdWkrLSPkdwr4    1511
ih0vzX44oOqAx2f    1508
ZnxXDlPa8U1GXgE    1508
z9Y9gH1T5YWrNuG    1506
pkci93gMrogZuBj    1506
iCRJl6heRkivqQ3    1506
uHbuxQJl8lW7ozc    1506
wCURE6d3bPkepu2    1506
McdE0feGgRqW7Ca    1505
sjndEbLyjtCKgGv    1505
zVJPv84UY57bAof    1505
rGa61gmuvPhdLxV    1505
ZoEaEvLYb1n2sOq    1504
zBIq5rxdHJRwDNY    1500
adLQvlD726eNBSB    1496
WRmjgnKYAwPKWDb    1496
1IF53ai7Xc0U56Y    1496
3PZuoBAID5Wc2HD    1496
YxYtjZvoooNbGkE    1485
Name: SOURCE_KEY, dtype: int64

In [21]:
low_count = set(level_counts[level_counts < 1500].index)
low_count

{'1IF53ai7Xc0U56Y',
 '3PZuoBAID5Wc2HD',
 'WRmjgnKYAwPKWDb',
 'YxYtjZvoooNbGkE',
 'adLQvlD726eNBSB'}

In [22]:
cats = ['SOURCE_KEY', 'WEEKDAY','HOUR', 'MINUTE']

new_features = set()
new_features_models = list()
for c in cats:
    
    mdl = make_pipeline(OneHotEncoder(cols=[c]), Ridge())
    mdl.fit(train2[[c]], train2['Y4WIN'])
    feature = mdl.predict(train2[[c]])
    
    train2["P_{}".format(c)] = mdl.predict(train2[[c]])
    val2["P_{}".format(c)] = mdl.predict(val2[[c]])
    
    new_features.add("P_{}".format(c))
    
    #new_features_models.append(mdl)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [23]:
features = list(new_features)

from sklearn.ensemble import RandomForestRegressor

mdl = RandomForestRegressor(n_jobs=-1, random_state=0, n_estimators=100)

Xtr, ytr = train2[features].fillna(-1), train2['Y4WIN']

mdl.fit(Xtr,ytr)

p = mdl.predict(val2[features].fillna(-1))

median_absolute_error(val2['Y4'], p)

2.6099226169563714

# PCA e Derivados

Uma combinação legal é fazer One Hot Encoding e depois aplicar PCA.

In [24]:
from sklearn.decomposition import PCA

cats = ['SOURCE_KEY', 'WEEKDAY','HOUR', 'MINUTE']

new_features = set()
new_features_models = list()
for c in cats:
    
    mdl = make_pipeline(OneHotEncoder(cols=[c]), PCA(n_components=1))
    mdl.fit(train2[[c]])
    feature = mdl.transform(train2[[c]])
    
    train2["P_{}".format(c)] = mdl.transform(train2[[c]])
    val2["P_{}".format(c)] = mdl.transform(val2[[c]])
    
    new_features.add("P_{}".format(c))
    
    #new_features_models.append(mdl)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [25]:
features = list(new_features)

from sklearn.ensemble import RandomForestRegressor

mdl = RandomForestRegressor(n_jobs=-1, random_state=0, n_estimators=100)

Xtr, ytr = train2[features].fillna(-1), train2['Y4WIN']

mdl.fit(Xtr,ytr)

p = mdl.predict(val2[features].fillna(-1))

median_absolute_error(val2['Y4'], p)

2.6454963788213064

# Fim