# First attempt to create a new representation method for candidates

In [1]:
# %load_ext lab_black

### Imports

In [2]:
import os
import yaml

import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from model.encoder import CandidateEncoderConfig
from model.decoder import CandidateDecoderConfig
from model.candidate_vae import CandidateVAE
from trainer.trainer import BetaVaeTrainer, TrainerConfig
from config.general_config import GeneralConfig
from model.embedder import EmbedderType
from dataset.utils import pad_collate
from dataset.dataset import SellersDataset

[2022-05-22 20:06:42,841] {utils.py:147} INFO - Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2022-05-22 20:06:42,842] {utils.py:159} INFO - NumExpr defaulting to 8 threads.


### Constants

In [3]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the "vae" section of the project configuration file.
with open("config/config.yaml", "r") as file:
    try:
        config = yaml.safe_load(file)["vae"]
    except yaml.YAMLError as exc:
        # Everything below needs `config`. Only printing the parse error would
        # leave the name undefined and surface later as an unrelated NameError,
        # so report the problem and stop the cell here.
        print(exc)
        raise

# Each component config receives its own section merged with the shared
# "general" section (a key present in both sections would raise TypeError).
general_config = GeneralConfig(**config["general"])
encoder_config = CandidateEncoderConfig(**config["encoder"], **config["general"])

decoder_config = CandidateDecoderConfig(**config["decoder"], **config["general"])

trainer_config = TrainerConfig(**config["trainer"], **config["general"])

# TensorBoard event files live in a "runs" folder under the checkpoints directory.
log_dir = os.path.join(general_config.checkpoints_dir, "runs")

os.makedirs(log_dir, exist_ok=True)

writer_tensorboard = SummaryWriter(log_dir)

In [4]:
# (Re)load the TensorBoard notebook extension, then open the dashboard inline.
# It reads from `log_dir` (the directory handed to the SummaryWriter) and is
# served on port 6008.
%reload_ext tensorboard
%tensorboard --logdir $log_dir --port=6008

Reusing TensorBoard on port 6008 (pid 63659), started 0:59:22 ago. (Use '!kill 63659' to kill it.)

In [5]:
# Build the sellers dataset from the embedder name and data path stored in the
# "general" config section, placing it on DEVICE, then run its preparation step.
general_section = config["general"]
dataset = SellersDataset(
    embedder_name=general_section["embedder_name"],
    data_path=general_section["data_path"],
    device=DEVICE,
)
dataset.prepare_dataset()

Preparing dataset
[2022-05-22 20:06:43,490] {dataset.py:142} INFO - Preparing dataset
Detecting languages:
[2022-05-22 20:06:43,491] {dataset.py:169} INFO - Detecting languages:


100%|██████████| 4/4 [00:00<00:00, 13.32it/s]

Detected languages:
[2022-05-22 20:06:43,793] {dataset.py:174} INFO - Detected languages:
lang
en    4
Name: lang, dtype: int64
[2022-05-22 20:06:43,794] {dataset.py:175} INFO - lang
en    4
Name: lang, dtype: int64
Removing rows not written in english
[2022-05-22 20:06:43,795] {dataset.py:176} INFO - Removing rows not written in english
Removed 0 rows
[2022-05-22 20:06:43,796] {dataset.py:180} INFO - Removed 0 rows



100%|██████████| 4/4 [00:00<00:00, 4416.22it/s]
100%|██████████| 4/4 [00:00<00:00, 4499.12it/s]
100%|██████████| 4/4 [00:00<00:00, 1346.16it/s]
100%|██████████| 4/4 [00:00<00:00, 9034.58it/s]

Adding language for languages_str
[2022-05-22 20:06:43,809] {dataset.py:356} INFO - Adding language for languages_str



100%|██████████| 4/4 [00:00<00:00, 14563.56it/s]

Adding language for education_str
[2022-05-22 20:06:43,811] {dataset.py:356} INFO - Adding language for education_str



100%|██████████| 4/4 [00:00<00:00, 16304.39it/s]

Adding language for skills_str
[2022-05-22 20:06:43,812] {dataset.py:356} INFO - Adding language for skills_str



100%|██████████| 4/4 [00:00<00:00, 8309.67it/s]

Adding language for description_str
[2022-05-22 20:06:43,815] {dataset.py:356} INFO - Adding language for description_str



100%|██████████| 4/4 [00:00<00:00, 7250.31it/s]
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Prepare data

### Prepare dataloader

In [6]:
# Collate function built by the project's `pad_collate` from the vocabulary's
# pad token (presumably pads the sequences of a batch to a common length).
padding_collate_fn = pad_collate(dataset.vocab.pad_token)

# Batch size comes from the shared general configuration.
dataloader = DataLoader(
    dataset,
    batch_size=general_config.batch_size,
    collate_fn=padding_collate_fn,
)

# Load models to test them

In [7]:
# Assemble the VAE from the three configs plus the dataset's vocabulary and
# embedder, then move it to the selected device.
candidate_vae = CandidateVAE(
    general_config,
    encoder_config,
    decoder_config,
    dataset.vocab,
    dataset.embedder,
)
candidate_vae = candidate_vae.to(DEVICE)

# Check encoder / decoder separately

In [8]:
# Pull one batch from the dataloader; `mask` and `max_target_length` are
# unpacked here but not used in this cell.
first_batch = next(iter(dataloader))
input_pad, input_lengths, target_pad, mask, max_target_length = first_batch

# Padded inputs/targets go to the compute device; the lengths stay on the CPU
# (presumably because sequence packing expects CPU lengths -- TODO confirm).
input_pad, target_pad = input_pad.to(DEVICE), target_pad.to(DEVICE)
input_lengths = input_lengths.to("cpu")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row["skills_str"] = (


In [9]:
# Run the encoder alone on the fetched batch. It returns the latent
# distribution parameters, the per-step encoder outputs and the final
# (hidden, cell) state pair.
mu, var, outputs, (hn, cn) = candidate_vae.encoder(input_pad, input_lengths)

# Sample the latent vector that seeds the decoder
# (presumably the reparameterization trick -- TODO confirm in the decoder).
latent_vector = candidate_vae.decoder.reparameterize(mu, var)

# Shape sanity check. Jupyter only displays a cell's *last* expression, so this
# tuple has to close the cell; placed mid-cell it was evaluated and discarded.
mu.shape, var.shape, outputs.shape

In [10]:
# Initial decoder state, sized from the *configured* batch size.
# NOTE(review): this is not derived from the batch fetched above, so a shorter
# final batch would presumably mismatch -- confirm against init_hidden_cell.
init_hidden = candidate_vae.decoder.init_hidden_cell(general_config.batch_size)

In [11]:
# Decoder step 1: fed with the latent vector and the freshly initialised state.
# The trailing True is a positional flag (presumably "this is the first step"
# -- TODO confirm against the decoder signature).
first_step = candidate_vae.decoder(latent_vector, init_hidden, outputs, True)
output, hidden, attn_weights, attn_mu, attn_var = first_step

# Greedy choice: keep the arg-max index along dim 1 as a column vector.
output = output.argmax(dim=1).view(-1, 1)

# Every embedder except LANG needs the chosen indices embedded again before
# they can be fed to the next decoder step.
if general_config.embedder_name != EmbedderType.LANG:
    output = candidate_vae.embed_output(output)

In [12]:
# Decoder step 2: fed with the previous step's output and hidden state; the
# positional flag is now False.
second_step = candidate_vae.decoder(output, hidden, outputs, False)
output, hidden, attn_weights, attn_mu, attn_var = second_step

# Show the raw decoder output shape before it is reduced to indices.
print(output.shape)

# Greedy choice: keep the arg-max index along dim 1 as a column vector.
output = output.argmax(dim=1).view(-1, 1)

# Every embedder except LANG needs the chosen indices embedded again.
if general_config.embedder_name != EmbedderType.LANG:
    output = candidate_vae.embed_output(output)

torch.Size([3, 220])


In [13]:
# Decoder step 3: same recipe as the previous step, continuing from its
# output and hidden state with the positional flag set to False.
third_step = candidate_vae.decoder(output, hidden, outputs, False)
output, hidden, attn_weights, attn_mu, attn_var = third_step

# Greedy choice: keep the arg-max index along dim 1 as a column vector.
output = output.argmax(dim=1).view(-1, 1)

# Every embedder except LANG needs the chosen indices embedded again.
if general_config.embedder_name != EmbedderType.LANG:
    output = candidate_vae.embed_output(output)

# Check all together for batched input

In [14]:
# Fetch a batch and place the tensors: padded inputs/targets on the compute
# device, sequence lengths on the CPU.
input_pad, input_lengths, target_pad, mask, max_target_length = next(iter(dataloader))

input_pad = input_pad.to(DEVICE)
target_pad = target_pad.to(DEVICE)
input_lengths = input_lengths.to("cpu")

# End-to-end pass through encoder + decoder. The module instance is called
# rather than `.forward(...)` directly: `nn.Module.__call__` dispatches to
# `forward` with the same arguments but also runs any registered hooks, which
# a direct `.forward` call silently skips.
# Note: `outputs` is rebound here from the encoder outputs of the earlier
# cells to the full model outputs.
outputs, attentions = candidate_vae(input_pad, input_lengths)

# Check trainer

In [15]:
# The encoder and decoder are optimised separately, each with its own Adam
# instance and its own learning rate from the general configuration.
encoder_optimizer = torch.optim.Adam(
    params=candidate_vae.encoder.parameters(),
    lr=general_config.encoder_lr,
)
decoder_optimizer = torch.optim.Adam(
    params=candidate_vae.decoder.parameters(),
    lr=general_config.decoder_lr,
)

# Wire the model, both optimizers, the configs, the data and the TensorBoard
# writer into the trainer (arguments are positional, order matters).
trainer_args = (
    candidate_vae,
    encoder_optimizer,
    decoder_optimizer,
    general_config,
    trainer_config,
    dataloader,
    writer_tensorboard,
)
trainer = BetaVaeTrainer(*trainer_args)

Initializing BetaVaeTrainer...
[2022-05-22 20:07:10,590] {trainer.py:163} INFO - Initializing BetaVaeTrainer...
Done: BetaVaeTrainer initialized!
[2022-05-22 20:07:10,591] {trainer.py:193} INFO - Done: BetaVaeTrainer initialized!


In [16]:
# Start the training loop. In the saved run below each epoch took about a
# minute and the run was stopped by hand (KeyboardInterrupt) during epoch 10.
trainer.fit()

Training...
Epoch 0/128
[2022-05-22 20:07:13,360] {trainer.py:426} INFO - Epoch 0/128


100%|██████████| 2/2 [00:58<00:00, 29.37s/it]

Epoch 1/128
[2022-05-22 20:08:12,107] {trainer.py:426} INFO - Epoch 1/128



100%|██████████| 2/2 [00:57<00:00, 28.89s/it]

Epoch 2/128
[2022-05-22 20:09:09,884] {trainer.py:426} INFO - Epoch 2/128



100%|██████████| 2/2 [00:57<00:00, 28.55s/it]

Epoch 3/128
[2022-05-22 20:10:06,992] {trainer.py:426} INFO - Epoch 3/128



100%|██████████| 2/2 [00:57<00:00, 28.90s/it]

Epoch 4/128
[2022-05-22 20:11:04,798] {trainer.py:426} INFO - Epoch 4/128



100%|██████████| 2/2 [00:57<00:00, 28.81s/it]

Epoch 5/128
[2022-05-22 20:12:02,425] {trainer.py:426} INFO - Epoch 5/128



100%|██████████| 2/2 [00:57<00:00, 28.74s/it]

Epoch 6/128
[2022-05-22 20:12:59,915] {trainer.py:426} INFO - Epoch 6/128



100%|██████████| 2/2 [00:57<00:00, 28.81s/it]

Epoch 7/128
[2022-05-22 20:13:57,535] {trainer.py:426} INFO - Epoch 7/128



100%|██████████| 2/2 [00:58<00:00, 29.06s/it]

Epoch 8/128
[2022-05-22 20:14:55,650] {trainer.py:426} INFO - Epoch 8/128



100%|██████████| 2/2 [00:58<00:00, 29.33s/it]

Epoch 9/128
[2022-05-22 20:15:54,313] {trainer.py:426} INFO - Epoch 9/128



100%|██████████| 2/2 [00:57<00:00, 28.98s/it]

Epoch 10/128
[2022-05-22 20:16:52,270] {trainer.py:426} INFO - Epoch 10/128



  0%|          | 0/2 [00:51<?, ?it/s]


KeyboardInterrupt: 