# 05 - Encoding de Categóricas - Embeddings Supervisionados

Um embedding é uma representação distribuída (um vetor denso e aprendido) daquilo que queremos representar — neste caso, cada categoria de uma variável categórica.

## Importação

In [4]:
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn.metrics import median_absolute_error
from sklearn.linear_model import Ridge
from category_encoders import OneHotEncoder
from sklearn.pipeline import make_pipeline

## Carga dos Dados

In [3]:
# Read the pre-processed training and validation splits.
train = pd.read_csv("data-processed/train.csv")
val = pd.read_csv("data-processed/val.csv")

# Parse the DATE_TIME text column into real timestamps on both splits.
for frame in (train, val):
    frame['DATE_TIME'] = pd.to_datetime(frame['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')

In [5]:
# Work on copies so the frames loaded from disk stay untouched.
train2, val2 = train.copy(), val.copy()

# Derive the calendar features used as categorical inputs from DATE_TIME.
for frame in (train2, val2):
    stamps = frame['DATE_TIME'].dt
    frame['WEEKDAY'] = stamps.weekday
    frame['HOUR'] = stamps.hour
    frame['MINUTE'] = stamps.minute

## Supervised Embeddings

In [None]:
from category_encoders import OrdinalEncoder

# Categorical columns fed to the embedding networks, in the column order
# the networks index them.
cats = ['SOURCE_KEY_ENCODED', 'WEEKDAY', 'HOUR', 'MINUTE']

# Fit the integer mapping on the training split only, then reuse it on validation.
encoder = OrdinalEncoder(cols=['SOURCE_KEY'])
train_codes = encoder.fit_transform(train2[['SOURCE_KEY']])
val_codes = encoder.transform(val2[['SOURCE_KEY']])

# Shift every code by one.
# NOTE(review): presumably this moves the encoder's "unknown category" code
# onto index 0, the padding_idx of the embedding tables - confirm against the
# category_encoders documentation.
train2['SOURCE_KEY_ENCODED'] = train_codes + 1
val2['SOURCE_KEY_ENCODED'] = val_codes + 1

In [None]:
import torch.nn as nn
import torch
import torch.functional as  F
import torch.optim as opt

In [None]:
# Show the distinct encoded SOURCE_KEY ids; these are the indices that get
# looked up in the embedding table further down.
train2['SOURCE_KEY_ENCODED'].unique()

In [None]:
class EmbeddingNet(nn.Module):
    """Minimal module that wraps a single embedding table for SOURCE_KEY ids.

    Args:
        num_embeddings: Number of rows in the table; every index passed to
            ``forward`` must be smaller than this. Defaults to 24.
        embedding_dim: Length of each embedding vector. Defaults to 2.
        padding_idx: Index whose vector is kept at zero and is not updated
            by training. Defaults to 0.
    """

    def __init__(self, num_embeddings=24, embedding_dim=2, padding_idx=0):
        super().__init__()
        self.e_source_key = nn.Embedding(
            num_embeddings, embedding_dim, padding_idx=padding_idx
        )

    def forward(self, x):
        """Return the embedding vectors for the integer index tensor ``x``."""
        return self.e_source_key(x)

In [None]:
# Untrained network: the embedding vectors are still at their initial values.
net = EmbeddingNet()

In [None]:
# Look up one embedding vector per distinct encoded SOURCE_KEY id.
source_key_ids = torch.from_numpy(train2['SOURCE_KEY_ENCODED'].unique()).long()
embeddings = net(source_key_ids)

In [None]:
# Display the embedding tensor computed in the previous cell.
embeddings

In [None]:
# Plot the two embedding dimensions against each other, one point per id.
embedding_frame = pd.DataFrame(embeddings.detach().numpy())
embedding_frame.plot.scatter(x=0, y=1)

## Todos os Embeddings

Um embedding para cada feature ou um único embedding para a combinação das features? Teste!

In [None]:
class EmbeddingNet2(nn.Module):
    """One embedding table per categorical feature, followed by a linear head.

    The input columns are read in the order source key, weekday, hour, minute.

    Args:
        embedding_dim: Length of each per-feature embedding vector.
            Defaults to 2.
    """

    def __init__(self, embedding_dim=2):
        super().__init__()
        # Index 0 of the source-key table is kept as a zero padding vector.
        self.e_source_key = nn.Embedding(24, embedding_dim, padding_idx=0)
        # Weekday 0, hour 0 and minute 0 are ordinary values, so these tables
        # get no padding_idx: a padding row is fixed at zero and never
        # trained, which would leave those categories without a learned vector.
        self.e_weekday = nn.Embedding(7, embedding_dim)
        self.e_hour = nn.Embedding(24, embedding_dim)
        self.e_minute = nn.Embedding(60, embedding_dim)

        # The head consumes the four embeddings concatenated side by side.
        self.out = nn.Linear(4 * embedding_dim, 1)

    def forward(self, x, return_embeddings=False):
        """Predict the target, or return the per-feature embeddings.

        Args:
            x: Integer index tensor with at least four columns
                (source key, weekday, hour, minute).
            return_embeddings: When true, skip the linear head and return
                the tuple ``(e_source_key, e_weekday, e_hour, e_minute)``.

        Returns:
            The linear head output with one value per row, or the tuple of
            four embedding tensors when ``return_embeddings`` is true.
        """
        e_source_key = self.e_source_key(x[:, 0])
        e_weekday = self.e_weekday(x[:, 1])
        e_hour = self.e_hour(x[:, 2])
        e_minute = self.e_minute(x[:, 3])

        # Return early so the head is not evaluated when it is not needed.
        if return_embeddings:
            return e_source_key, e_weekday, e_hour, e_minute

        features = torch.cat([e_source_key, e_weekday, e_hour, e_minute], dim=-1)
        return self.out(features)

In [None]:
net = EmbeddingNet2().cuda()
criterion = nn.L1Loss()
optimizer = opt.Adam(net.parameters(), lr=1e-4)

# Inputs and target do not change between steps, so build them once and keep
# them on the GPU instead of re-creating and re-uploading them every iteration.
x = torch.from_numpy(train2[cats].values).long().cuda()
# Cast the target to float32 so it matches the dtype of the network output.
y = torch.from_numpy(train2['Y4WIN'].values.reshape(-1, 1)).float()
y_cuda = y.cuda()
y_numpy = y.numpy()

# Full-batch training: every step uses the whole training set.
for e in range(5000):
    optimizer.zero_grad()
    p = net(x)
    loss = criterion(p, y_cuda)

    loss.backward()
    optimizer.step()

    # Copy predictions back to the CPU only on the steps that report progress.
    if e % 1000 == 0:
        p_numpy = p.detach().cpu().numpy()
        print(median_absolute_error(y_numpy, p_numpy))

# Does not even overfit: the model is underfitted, too little capacity.

In [None]:
# Show the four per-feature embedding tensors (source key, weekday, hour,
# minute) that the trained network produces for the training rows in `x`.
net(x, return_embeddings=True)

## Embedding to Tree (Leaky)

In [None]:
from sklearn.ensemble import RandomForestRegressor
mdl = RandomForestRegressor(n_jobs=-1, random_state=0, n_estimators=100)

# forward(..., return_embeddings=True) returns a tuple of four tensors, so the
# pieces are concatenated column-wise into one feature matrix before being
# handed to the forest. No gradients are needed for feature extraction.
with torch.no_grad():
    x = torch.from_numpy(train2[cats].values).long().cuda()
    Xtr = torch.cat(net(x, return_embeddings=True), dim=-1).cpu().numpy()

    # Missing categorical values in validation are mapped to index 0.
    x = torch.from_numpy(val2[cats].fillna(0).values).long().cuda()
    Xval = torch.cat(net(x, return_embeddings=True), dim=-1).cpu().numpy()

# Fit on the same rows and target the embeddings were trained on, which is
# why the section title calls this setup leaky.
ytr = train2['Y4WIN']
mdl.fit(Xtr, ytr)

p = mdl.predict(Xval)
median_absolute_error(val2['Y4'], p)

## Hidden to Tree (Leaky)

In [None]:
# Same experiment as the embedding-to-tree cell, but feeding the forest with
# whatever the network returns for `return_hid3=True`.
# NOTE(review): EmbeddingNet2.forward as defined in this file only accepts
# `return_embeddings`; passing `return_hid3` to it raises a TypeError. This
# cell presumably expects `net` to be a different model with hidden layers
# that is not defined here - confirm before running.
from sklearn.ensemble import RandomForestRegressor
mdl = RandomForestRegressor(n_jobs=-1, random_state=0, n_estimators=100)

# Features for the training rows.
x = torch.from_numpy(train2[cats].values).cuda()
Xtr = net(x,return_hid3=True).detach().cpu().numpy()
ytr = train2['Y4WIN']
mdl.fit(Xtr,ytr)

# Features for the validation rows; missing categorical values become index 0.
x = torch.from_numpy(val2[cats].fillna(0).values).long().cuda()
Xval =  net(x,return_hid3=True).detach().cpu().numpy()

# Score the forest on the validation target.
p = mdl.predict(Xval)
median_absolute_error(val2['Y4'], p)

# Fim