# First attempt to create a new representation method for candidates

In [1]:
# %load_ext lab_black

### Imports

In [2]:
import os
import yaml

import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from model.encoder import CandidateEncoderConfig
from model.decoder import CandidateDecoderConfig
from model.candidate_vae import CandidateVAE
from trainer.trainer import BetaVaeTrainer, TrainerConfig
from config.general_config import GeneralConfig
from model.embedder import EmbedderType
from dataset.utils import pad_collate
from dataset.dataset import SellersDataset

[2022-05-22 19:57:52,995] {utils.py:147} INFO - Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2022-05-22 19:57:52,996] {utils.py:159} INFO - NumExpr defaulting to 8 threads.


### Constants

In [3]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the "vae" section of the project configuration.
# A malformed file must stop the notebook here: yaml.YAMLError is allowed to
# propagate instead of being printed and swallowed, which previously left
# `config` undefined and surfaced as an unrelated NameError on the next line.
with open("config/config.yaml", "r") as file:
    config = yaml.safe_load(file)["vae"]

# Each component config is built from its own section; the encoder, decoder and
# trainer configs additionally receive every key of the "general" section.
general_config = GeneralConfig(**config["general"])
encoder_config = CandidateEncoderConfig(**config["encoder"], **config["general"])

decoder_config = CandidateDecoderConfig(**config["decoder"], **config["general"])

trainer_config = TrainerConfig(**config["trainer"], **config["general"])

# TensorBoard event files are written to <checkpoints_dir>/runs.
log_dir = os.path.join(general_config.checkpoints_dir, "runs")

os.makedirs(log_dir, exist_ok=True)

writer_tensorboard = SummaryWriter(log_dir)

In [4]:
# (Re)load the TensorBoard notebook extension so the %tensorboard magic is available.
%reload_ext tensorboard
# Open TensorBoard on port 6008, pointed at the log directory created above.
# IPython expands `$log_dir` from the Python variable of the same name.
%tensorboard --logdir $log_dir --port=6008

Reusing TensorBoard on port 6008 (pid 63659), started 0:50:32 ago. (Use '!kill 63659' to kill it.)

In [5]:
# Build the dataset from the "general" configuration section, then run its
# preparation step (the log output below shows language detection and filtering).
general_section = config["general"]
dataset = SellersDataset(
    embedder_name=general_section["embedder_name"],
    data_path=general_section["data_path"],
    device=DEVICE,
)
dataset.prepare_dataset()

Preparing dataset
[2022-05-22 19:57:53,621] {dataset.py:142} INFO - Preparing dataset
Detecting languages:
[2022-05-22 19:57:53,622] {dataset.py:169} INFO - Detecting languages:


100%|██████████| 4/4 [00:00<00:00, 12.94it/s]

Detected languages:
[2022-05-22 19:57:53,933] {dataset.py:174} INFO - Detected languages:
lang
en    4
Name: lang, dtype: int64
[2022-05-22 19:57:53,934] {dataset.py:175} INFO - lang
en    4
Name: lang, dtype: int64
Removing rows not written in english
[2022-05-22 19:57:53,935] {dataset.py:176} INFO - Removing rows not written in english
Removed 0 rows
[2022-05-22 19:57:53,935] {dataset.py:180} INFO - Removed 0 rows



100%|██████████| 4/4 [00:00<00:00, 5769.33it/s]
100%|██████████| 4/4 [00:00<00:00, 6831.11it/s]
100%|██████████| 4/4 [00:00<00:00, 1772.74it/s]
100%|██████████| 4/4 [00:00<00:00, 11715.93it/s]

Adding language for languages_str
[2022-05-22 19:57:53,947] {dataset.py:356} INFO - Adding language for languages_str



100%|██████████| 4/4 [00:00<00:00, 20410.24it/s]

Adding language for education_str
[2022-05-22 19:57:53,948] {dataset.py:356} INFO - Adding language for education_str



100%|██████████| 4/4 [00:00<00:00, 22250.95it/s]

Adding language for skills_str
[2022-05-22 19:57:53,950] {dataset.py:356} INFO - Adding language for skills_str



100%|██████████| 4/4 [00:00<00:00, 15033.35it/s]

Adding language for description_str
[2022-05-22 19:57:53,951] {dataset.py:356} INFO - Adding language for description_str



100%|██████████| 4/4 [00:00<00:00, 9737.21it/s]
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Prepare data

### Prepare dataloader

In [6]:
# The collate function is built from the vocabulary's pad token
# (presumably it pads every sequence of a batch to a common length — confirm in dataset.utils).
collate_fn = pad_collate(dataset.vocab.pad_token)

# Batch the prepared dataset using the configured batch size.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=general_config.batch_size,
    collate_fn=collate_fn,
)

# Load models to test them

In [7]:
# Assemble the VAE from the three configs plus the dataset's vocabulary and
# embedder, then move it to the selected device.
candidate_vae = CandidateVAE(
    general_config,
    encoder_config,
    decoder_config,
    dataset.vocab,
    dataset.embedder,
)
candidate_vae = candidate_vae.to(DEVICE)

# Check encoder / decoder separately

In [8]:
# Pull a single batch from the dataloader and unpack its five components.
first_batch = next(iter(dataloader))
input_pad, input_lengths, target_pad, mask, max_target_length = first_batch

# Inputs and targets go to the compute device; the lengths are kept on the CPU.
input_pad, target_pad = input_pad.to(DEVICE), target_pad.to(DEVICE)
input_lengths = input_lengths.to("cpu")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row["skills_str"] = (


In [9]:
# Run the encoder alone on the batch; it returns the latent parameters, the
# per-step outputs and the final (hidden, cell) state pair.
mu, var, outputs, (hn, cn) = candidate_vae.encoder(input_pad, input_lengths)

# Draw the latent vector from the encoder's parameters.
latent_vector = candidate_vae.decoder.reparameterize(mu, var)

# Shape sanity check. It must be the last expression of the cell for Jupyter to
# display it; placed mid-cell it was evaluated and silently discarded.
mu.shape, var.shape, outputs.shape

In [10]:
# Initial decoder state, sized by the configured batch size.
# NOTE(review): a batch smaller than general_config.batch_size (e.g. a final
# partial batch) may not match this state — confirm against the decoder.
vae_decoder = candidate_vae.decoder
init_hidden = vae_decoder.init_hidden_cell(general_config.batch_size)

In [11]:
# First decoder step: fed with the latent vector and the initial state. The last
# positional flag is True only for this call (presumably it marks the first
# step — confirm against the decoder's signature).
first_step = candidate_vae.decoder(latent_vector, init_hidden, outputs, True)
output, hidden, attn_weights, attn_mu, attn_var = first_step

# Greedy choice along dim 1, reshaped into a single column.
output = output.argmax(dim=1).view(-1, 1)
if general_config.embedder_name != EmbedderType.LANG:
    # For every embedder other than LANG the chosen indices are passed through
    # embed_output before being fed to the next step.
    output = candidate_vae.embed_output(output)

In [12]:
# Second decoder step: consumes the previous step's output and hidden state,
# with the last positional flag now False.
second_step = candidate_vae.decoder(output, hidden, outputs, False)
output, hidden, attn_weights, attn_mu, attn_var = second_step
print(output.shape)

# Greedy choice along dim 1, reshaped into a single column.
output = output.argmax(dim=1).view(-1, 1)
if general_config.embedder_name != EmbedderType.LANG:
    # For every embedder other than LANG the chosen indices are passed through
    # embed_output before being fed to the next step.
    output = candidate_vae.embed_output(output)

torch.Size([3, 220])


In [13]:
# Third decoder step: same call pattern as the previous step, chaining the
# latest output and hidden state back into the decoder.
third_step = candidate_vae.decoder(output, hidden, outputs, False)
output, hidden, attn_weights, attn_mu, attn_var = third_step

# Greedy choice along dim 1, reshaped into a single column.
output = output.argmax(dim=1).view(-1, 1)
if general_config.embedder_name != EmbedderType.LANG:
    # For every embedder other than LANG the chosen indices are passed through
    # embed_output before being fed to the next step.
    output = candidate_vae.embed_output(output)

# Check all together for batched input

In [14]:
# Fetch a batch and unpack its five components.
input_pad, input_lengths, target_pad, mask, max_target_length = next(iter(dataloader))

# Inputs and targets go to the compute device; the lengths are kept on the CPU.
input_pad = input_pad.to(DEVICE)
target_pad = target_pad.to(DEVICE)
input_lengths = input_lengths.to("cpu")

# End-to-end pass through the whole VAE. The module instance is called rather
# than `.forward` so that nn.Module.__call__ runs any registered hooks.
# Note: this rebinds `outputs`, which previously held the encoder outputs.
outputs, attentions = candidate_vae(input_pad, input_lengths)

# Check trainer

In [15]:
# The encoder and decoder get separate Adam optimizers, each with its own
# learning rate taken from the general config.
encoder_params = candidate_vae.encoder.parameters()
encoder_optimizer = torch.optim.Adam(encoder_params, lr=general_config.encoder_lr)

decoder_params = candidate_vae.decoder.parameters()
decoder_optimizer = torch.optim.Adam(decoder_params, lr=general_config.decoder_lr)

# Wire the model, both optimizers, the configs, the data and the TensorBoard
# writer into the trainer.
trainer = BetaVaeTrainer(
    candidate_vae,
    encoder_optimizer,
    decoder_optimizer,
    general_config,
    trainer_config,
    dataloader,
    writer_tensorboard,
)

Initializing BetaVaeTrainer...
[2022-05-22 19:58:20,575] {trainer.py:163} INFO - Initializing BetaVaeTrainer...
Done: BetaVaeTrainer initialized!
[2022-05-22 19:58:20,576] {trainer.py:193} INFO - Done: BetaVaeTrainer initialized!


In [16]:
# Start training. This is a long-running cell: the captured output below shows
# 128 epochs of 2 iterations each, at roughly a minute per epoch.
trainer.fit()

Training...
Epoch 0/128
[2022-05-22 19:58:23,377] {trainer.py:426} INFO - Epoch 0/128


  0%|          | 0/2 [00:00<?, ?it/s]

Iter: 0


 50%|█████     | 1/2 [00:55<00:55, 55.87s/it]

Iter: 1


 50%|█████     | 1/2 [01:04<00:55, 55.87s/it]

Iteration: 2; Percent complete: 1.2%; Average loss: 1.9522; Average recons_loss: 1.9522; Average kld_loss: 0.0000; Average kld_attn_loss: 0.1151
[2022-05-22 19:59:27,621] {trainer.py:500} INFO - Iteration: 2; Percent complete: 1.2%; Average loss: 1.9522; Average recons_loss: 1.9522; Average kld_loss: 0.0000; Average kld_attn_loss: 0.1151


100%|██████████| 2/2 [01:04<00:00, 32.19s/it]

Epoch 1/128
[2022-05-22 19:59:27,752] {trainer.py:426} INFO - Epoch 1/128



  0%|          | 0/2 [00:00<?, ?it/s]

Iter: 2


 50%|█████     | 1/2 [00:53<00:53, 53.25s/it]

Iter: 3


 50%|█████     | 1/2 [01:01<00:53, 53.25s/it]

Iteration: 4; Percent complete: 2.4%; Average loss: 1.9461; Average recons_loss: 1.9461; Average kld_loss: 0.0000; Average kld_attn_loss: 0.1152
[2022-05-22 20:00:29,423] {trainer.py:500} INFO - Iteration: 4; Percent complete: 2.4%; Average loss: 1.9461; Average recons_loss: 1.9461; Average kld_loss: 0.0000; Average kld_attn_loss: 0.1152


100%|██████████| 2/2 [01:01<00:00, 30.90s/it]

Epoch 2/128
[2022-05-22 20:00:29,550] {trainer.py:426} INFO - Epoch 2/128



  0%|          | 0/2 [00:00<?, ?it/s]

Iter: 4


 50%|█████     | 1/2 [00:53<00:53, 53.71s/it]

Iter: 5


 50%|█████     | 1/2 [01:02<00:53, 53.71s/it]

Iteration: 6; Percent complete: 3.5%; Average loss: 1.9805; Average recons_loss: 1.9611; Average kld_loss: 0.0000; Average kld_attn_loss: 0.1152
[2022-05-22 20:01:31,973] {trainer.py:500} INFO - Iteration: 6; Percent complete: 3.5%; Average loss: 1.9805; Average recons_loss: 1.9611; Average kld_loss: 0.0000; Average kld_attn_loss: 0.1152


100%|██████████| 2/2 [01:02<00:00, 31.28s/it]

Epoch 3/128
[2022-05-22 20:01:32,106] {trainer.py:426} INFO - Epoch 3/128



  0%|          | 0/2 [00:00<?, ?it/s]

Iter: 6
