# First attempt to create a new representation method for a candidates

In [1]:
%load_ext lab_black

### Imports

In [2]:
import os
import yaml

import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from model.encoder import CandidateEncoderConfig
from model.decoder import CandidateDecoderConfig
from model.candidate_vae import CandidateVAE
from trainer.trainer import BetaVaeTrainer, TrainerConfig
from config.general_config import GeneralConfig
from model.embedder import EmbedderType
from dataset.utils import pad_collate
from dataset.dataset import SellersDataset

[2022-05-23 21:40:26,541] {utils.py:147} INFO - Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2022-05-23 21:40:26,542] {utils.py:159} INFO - NumExpr defaulting to 8 threads.


### Constants

In [3]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

with open("config/config.yaml", "r") as file:
    try:
        config = yaml.safe_load(file)["vae"]
    except yaml.YAMLError as exc:
        print(exc)

general_config = GeneralConfig(**config["general"])
encoder_config = CandidateEncoderConfig(**config["encoder"], **config["general"])

decoder_config = CandidateDecoderConfig(**config["decoder"], **config["general"])

trainer_config = TrainerConfig(**config["trainer"], **config["general"])

log_dir = os.path.join(general_config.checkpoints_dir, "runs")

os.makedirs(log_dir, exist_ok=True)

writer_tensorboard = SummaryWriter(log_dir)

In [4]:
%reload_ext tensorboard
%tensorboard --logdir $log_dir --port=6008

Reusing TensorBoard on port 6008 (pid 63659), started 1 day, 2:33:05 ago. (Use '!kill 63659' to kill it.)

In [5]:
dataset = SellersDataset(
    embedder_name=config["general"]["embedder_name"],
    data_path=config["general"]["data_path"],
    device=DEVICE,
    bow_remove_stopwords=config["general"]["bow_remove_stopwords"],
    bow_remove_sentiment=config["general"]["bow_remove_sentiment"],
)
dataset.prepare_dataset()

Preparing dataset
[2022-05-23 21:40:27,376] {dataset.py:108} INFO - Preparing dataset
Detecting languages:
[2022-05-23 21:40:27,377] {dataset.py:150} INFO - Detecting languages:


100%|██████████| 4/4 [00:00<00:00, 13.36it/s]

Detected languages:
[2022-05-23 21:40:27,678] {dataset.py:155} INFO - Detected languages:
lang
de    1
en    3
Name: lang, dtype: int64
[2022-05-23 21:40:27,679] {dataset.py:156} INFO - lang
de    1
en    3
Name: lang, dtype: int64
Removing rows not written in english
[2022-05-23 21:40:27,680] {dataset.py:157} INFO - Removing rows not written in english
Removed 1 rows
[2022-05-23 21:40:27,681] {dataset.py:161} INFO - Removed 1 rows



100%|██████████| 3/3 [00:00<00:00, 6285.17it/s]
100%|██████████| 3/3 [00:00<00:00, 10727.12it/s]
100%|██████████| 3/3 [00:00<00:00, 6982.75it/s]
100%|██████████| 3/3 [00:00<00:00, 8300.07it/s]

Adding bow for languages_str
[2022-05-23 21:40:27,689] {dataset.py:408} INFO - Adding bow for languages_str
Adding bow for education_str
[2022-05-23 21:40:27,689] {dataset.py:408} INFO - Adding bow for education_str
Adding bow for skills_str
[2022-05-23 21:40:27,690] {dataset.py:408} INFO - Adding bow for skills_str
Adding bow for description_str
[2022-05-23 21:40:27,690] {dataset.py:408} INFO - Adding bow for description_str
Adding language for languages_str
[2022-05-23 21:40:27,691] {dataset.py:399} INFO - Adding language for languages_str



100%|██████████| 3/3 [00:00<00:00, 14496.44it/s]

Adding language for education_str
[2022-05-23 21:40:27,692] {dataset.py:399} INFO - Adding language for education_str



100%|██████████| 3/3 [00:00<00:00, 9845.78it/s]

Adding language for skills_str
[2022-05-23 21:40:27,694] {dataset.py:399} INFO - Adding language for skills_str



100%|██████████| 3/3 [00:00<00:00, 10736.27it/s]

Adding language for description_str
[2022-05-23 21:40:27,696] {dataset.py:399} INFO - Adding language for description_str



100%|██████████| 3/3 [00:00<00:00, 7566.39it/s]
100%|██████████| 3/3 [00:00<00:00, 4158.27it/s]
100%|██████████| 3/3 [00:00<00:00, 5186.69it/s]

Dropping missing values...
[2022-05-23 21:40:27,702] {dataset.py:132} INFO - Dropping missing values...
Dropped 1 missing values...
[2022-05-23 21:40:27,703] {dataset.py:137} INFO - Dropped 1 missing values...



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Prepare data

### Prepare dataloader

In [6]:
dataloader = DataLoader(
    dataset,
    batch_size=general_config.batch_size,
    collate_fn=pad_collate(dataset.vocab.pad_token),
)

# Load models to test them

In [7]:
candidate_vae = CandidateVAE(
    general_config, encoder_config, decoder_config, dataset.vocab, dataset.embedder
).to(DEVICE)

# Check encoder / decoder separately

In [8]:
(
    input_pad,
    input_lengths,
    target_pad,
    mask,
    max_target_length,
    target_skills,
    target_education,
    target_languages,
) = next(iter(dataloader))

input_pad = input_pad.to(DEVICE)
target_pad = target_pad.to(DEVICE)
input_lengths = input_lengths.to("cpu")

In [9]:
mu, var, outputs, (hn, cn) = candidate_vae.encoder(input_pad, input_lengths)
mu.shape, var.shape, outputs.shape

latent_vector = candidate_vae.decoder.reparameterize(mu, var)

In [10]:
init_hidden = candidate_vae.decoder.init_hidden_cell(general_config.batch_size)

In [11]:
output, hidden, attn_weights, attn_mu, attn_var = candidate_vae.decoder(
    latent_vector, init_hidden, outputs, True
)
output = torch.argmax(output, dim=1).view(-1, 1)
if general_config.embedder_name != EmbedderType.LANG:
    output = candidate_vae.embed_output(output)

RuntimeError: Expected hidden[0] size (8, 2, 64), got [8, 3, 64]

In [None]:
output, hidden, attn_weights, attn_mu, attn_var = candidate_vae.decoder(
    output, hidden, outputs, False
)
print(output.shape)
output = torch.argmax(output, dim=1).view(-1, 1)
if general_config.embedder_name != EmbedderType.LANG:
    output = candidate_vae.embed_output(output)

torch.Size([3, 236])


In [None]:
output, hidden, attn_weights, attn_mu, attn_var = candidate_vae.decoder(
    output, hidden, outputs, False
)
output = torch.argmax(output, dim=1).view(-1, 1)
if general_config.embedder_name != EmbedderType.LANG:
    output = candidate_vae.embed_output(output)

# Check all together for batched input

In [None]:
(
    input_pad,
    input_lengths,
    target_pad,
    mask,
    max_target_length,
    target_skills,
    target_education,
    target_languages,
) = next(iter(dataloader))

input_pad = input_pad.to(DEVICE)
target_pad = target_pad.to(DEVICE)
input_lengths = input_lengths.to("cpu")

outputs, attentions = candidate_vae.forward(input_pad, input_lengths)

In [None]:
dataset.__getitem__(0)[2]

tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 

# Check trainer

In [None]:
trainer = BetaVaeTrainer(
    candidate_vae,
    general_config,
    trainer_config,
    dataloader,
    writer_tensorboard,
)

Initializing BetaVaeTrainer...
[2022-05-23 21:40:06,232] {trainer.py:225} INFO - Initializing BetaVaeTrainer...
Done: BetaVaeTrainer initialized!
[2022-05-23 21:40:06,235] {trainer.py:339} INFO - Done: BetaVaeTrainer initialized!


In [None]:
trainer.fit()

Training...
Epoch 0/128
[2022-05-23 21:40:06,269] {trainer.py:661} INFO - Epoch 0/128


  0%|          | 0/1 [00:00<?, ?it/s]


AttributeError: 'tuple' object has no attribute 'to'