# First attempt to create a new representation method for candidates

In [1]:
# %load_ext lab_black

### Imports

In [2]:
from pathlib import Path
import yaml

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence
import pandas as pd

from model.encoder import CandidateEncoder, CandidateEncoderConfig
from model.decoder import CandidateDecoder, CandidateDecoderConfig
from config.general_config import GeneralConfig
from model.embedder import EmbedderType
from dataset.utils import pad_collate
from dataset.dataset import SellersDataset
from dataset.language import Lang

### Constants

In [3]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Read the "vae" section of the project configuration. A malformed file is
# re-raised with context instead of being printed and ignored: swallowing the
# error would leave `config` undefined and only surface on the next statement
# as an unrelated NameError.
with open("config/config.yaml", "r") as file:
    try:
        config = yaml.safe_load(file)["vae"]
    except yaml.YAMLError as exc:
        raise RuntimeError("Failed to parse config/config.yaml") from exc

# General settings from the "general" config section plus the resolved device.
GENERAL_CONFIG = GeneralConfig(**config["general"], device=DEVICE)

In [4]:
# Build the sellers dataset from the embedder name and data path given in the
# "general" config section, then run its preparation step.
dataset = SellersDataset(
    embedder_name=config["general"]["embedder_name"],
    data_path=config["general"]["data_path"],
    device=DEVICE,
)
dataset.prepare_dataset()

[2022-05-20 09:44:30,458] {dataset.py:133} INFO - Preparing dataset
[2022-05-20 09:44:30,458] {dataset.py:160} INFO - Detecting languages:


100%|██████████| 4/4 [00:00<00:00, 14.38it/s]

[2022-05-20 09:44:30,738] {dataset.py:165} INFO - Detected languages:
[2022-05-20 09:44:30,739] {dataset.py:166} INFO - lang
en    4
Name: lang, dtype: int64
[2022-05-20 09:44:30,740] {dataset.py:167} INFO - Removing rows not written in english
[2022-05-20 09:44:30,741] {dataset.py:171} INFO - Removed 0 rows



100%|██████████| 4/4 [00:00<00:00, 5173.36it/s]
100%|██████████| 4/4 [00:00<00:00, 5270.88it/s]
100%|██████████| 4/4 [00:00<00:00, 1346.70it/s]
100%|██████████| 4/4 [00:00<00:00, 11044.91it/s]

[2022-05-20 09:44:30,752] {dataset.py:347} INFO - Adding language for languages_str



100%|██████████| 4/4 [00:00<00:00, 20997.77it/s]

[2022-05-20 09:44:30,754] {dataset.py:347} INFO - Adding language for education_str



100%|██████████| 4/4 [00:00<00:00, 21760.33it/s]

[2022-05-20 09:44:30,755] {dataset.py:347} INFO - Adding language for skills_str



100%|██████████| 4/4 [00:00<00:00, 16178.61it/s]

[2022-05-20 09:44:30,757] {dataset.py:347} INFO - Adding language for description_str



100%|██████████| 4/4 [00:00<00:00, 7476.48it/s]
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Prepare data

### Prepare dataloader

In [5]:
# Batches are assembled by the project's `pad_collate` function.
loader = DataLoader(
    dataset,
    batch_size=GENERAL_CONFIG.batch_size,
    collate_fn=pad_collate,
)

In [6]:
# Encoder settings: vocabulary size and embedding width are read from the
# prepared dataset; the remaining fields are expanded from the "encoder" and
# "general" sections of the loaded config.
config_encoder = CandidateEncoderConfig(
    num_words=dataset.lang.n_words,
    embedding_size=dataset.embedder.size,
    device=DEVICE,
    **config["encoder"],
    **config["general"]
)

# Instantiate the encoder and move it to the selected device.
encoder = CandidateEncoder(config_encoder).to(DEVICE)

In [7]:
# Run the encoder on the first batch produced by the loader.
# The batch is moved with `.to(DEVICE)` rather than `.cuda()` so this cell also
# runs on CPU-only machines, consistent with how DEVICE is chosen.
# NOTE(review): `var` is later passed to `reparameterize` as `logvar` — confirm
# that the encoder really emits a log-variance.
mu, var, outputs, (hn, cn) = encoder(next(iter(loader)).to(DEVICE))
mu.shape, var.shape, outputs.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row["skills_str"] = (


(torch.Size([3, 16]), torch.Size([3, 16]), torch.Size([3, 202, 128]))

In [8]:
def reparameterize(mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
    """
    Draw one sample z = mu + sigma * eps with eps ~ N(0, I)
    (the reparameterization trick).

    Open question: is a single z enough to estimate the expectation
    in the loss?

    :param mu: (Tensor) Mean of the latent Gaussian
    :param logvar: (Tensor) Log-variance of the latent Gaussian; the standard
        deviation is recovered as exp(0.5 * logvar)
    :return: (Tensor) Sample with the same shape as `logvar`
    """
    sigma = (0.5 * logvar).exp()
    # Noise is drawn with sigma's shape, dtype and device.
    noise = torch.randn_like(sigma)
    return noise * sigma + mu

In [9]:
# Sample a latent vector from the encoder's distribution parameters.
# NOTE(review): the encoder's second output (`var`) is used here as `logvar` —
# confirm that it is a log-variance.
latent_vector = reparameterize(mu, var)
latent_vector.shape

torch.Size([3, 16])

In [10]:
# Decoder settings: vocabulary size and embedding width are read from the
# prepared dataset; the remaining fields are expanded from the "decoder" and
# "general" sections of the loaded config.
config_decoder = CandidateDecoderConfig(
    num_words=dataset.lang.n_words,
    embedding_size=dataset.embedder.size,
    device=DEVICE,
    **config["decoder"],
    **config["general"]
)

# Instantiate the decoder and move it to the selected device.
decoder = CandidateDecoder(config_decoder).to(DEVICE)

# Initial decoder state, sized by the configured batch size.
# NOTE(review): a final, smaller batch would presumably not match this size —
# confirm before iterating over the whole loader.
init_hidden = decoder.init_hidden_cell(GENERAL_CONFIG.batch_size)

In [11]:
def pad_strip_sequence(batch, max_seq_len=None):
    """
    Zero-pad or truncate a batch along its sequence axis to a fixed length.

    :param batch: (Tensor) Batch of shape [batch, seq_len, features]
    :param max_seq_len: (int, optional) Target sequence length; defaults to
        GENERAL_CONFIG.max_seq_len when omitted
    :return: (Tensor) Tensor of shape [batch, max_seq_len, features] with the
        same dtype and device as `batch`
    """
    if max_seq_len is None:
        max_seq_len = GENERAL_CONFIG.max_seq_len

    # The template is built from `batch` itself so it shares the batch's dtype
    # and device instead of being pinned to the global DEVICE / default dtype.
    template = batch.new_zeros(max_seq_len, batch.shape[-1])

    # Prepending a template of the target length forces pad_sequence to pad
    # every sequence to at least max_seq_len; the template row is then dropped
    # and anything longer than max_seq_len is cut off.
    padded = pad_sequence([template, *batch], batch_first=True)
    return padded[1:, :max_seq_len, :]

In [12]:
# Sanity check: the sequence axis of the encoder outputs should now equal
# GENERAL_CONFIG.max_seq_len.
pad_strip_sequence(outputs).shape

torch.Size([3, 256, 128])

In [13]:
def embed_output(output: torch.Tensor):
    """
    Turn decoder word indices back into word embeddings.

    output : torch.Tensor
        Tensor of shape [N, 1] holding one vocabulary index per row.

    Each index is mapped to its word through `dataset.lang.index2word`, the
    word is embedded with `dataset.embedder(..., pooled=True)`, and the
    per-word results are stacked along a new first dimension and moved to
    DEVICE.
    """
    embedded_words = []
    for word in output:
        token = dataset.lang.index2word[int(word)]
        # Drop the leading dimension of the embedder result before stacking.
        embedded_words.append(dataset.embedder(token, pooled=True).squeeze(dim=0))
    return torch.stack(embedded_words, dim=0).to(DEVICE)


In [14]:
# First decoding step: the decoder receives the sampled latent vector, the
# initial state and the encoder outputs padded/truncated to max_seq_len.
# NOTE(review): the trailing `True` presumably marks the first step — confirm
# against CandidateDecoder.forward.
output, hidden, attn_weights = decoder(latent_vector, init_hidden, pad_strip_sequence(outputs), True)
# Greedy choice: index of the largest value along dim 1, reshaped to a column.
output = torch.argmax(output, dim=1).view(-1, 1)
# For embedders other than LANG the chosen indices are converted back to
# embeddings so they can serve as the next decoder input.
if GENERAL_CONFIG.embedder_name != EmbedderType.LANG:
    output = embed_output(output)

---------
torch.Size([3, 64])
---------


In [15]:
# Next decoding step: feed the previous step's output and hidden state back
# into the decoder together with the padded encoder outputs.
# NOTE(review): the trailing `False` presumably marks a non-initial step —
# confirm against CandidateDecoder.forward.
output, hidden, attn_weights = decoder(output, hidden, pad_strip_sequence(outputs), False)
# Greedy choice: index of the largest value along dim 1, reshaped to a column.
output = torch.argmax(output, dim=1).view(-1, 1)
# For embedders other than LANG the chosen indices are converted back to
# embeddings so they can serve as the next decoder input.
if GENERAL_CONFIG.embedder_name != EmbedderType.LANG:
    output = embed_output(output)

---------
torch.Size([3, 64])
---------


In [16]:
# Another decoding step, identical in form to the previous one: the last
# output and hidden state are fed back with the padded encoder outputs.
# NOTE(review): the trailing `False` presumably marks a non-initial step —
# confirm against CandidateDecoder.forward.
output, hidden, attn_weights = decoder(output, hidden, pad_strip_sequence(outputs), False)
# Greedy choice: index of the largest value along dim 1, reshaped to a column.
output = torch.argmax(output, dim=1).view(-1, 1)
# For embedders other than LANG the chosen indices are converted back to
# embeddings so they can serve as the next decoder input.
if GENERAL_CONFIG.embedder_name != EmbedderType.LANG:
    output = embed_output(output)

---------
torch.Size([3, 64])
---------


In [17]:
# Shape of the attention weights returned by the most recent decoder call.
attn_weights.shape

torch.Size([3, 256])