# First attempt to create a new representation method for candidates

In [1]:
# %load_ext lab_black

### Imports

In [1]:
import os
import yaml

import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from model.encoder import CandidateEncoderConfig
from model.decoder import CandidateDecoderConfig
from model.candidate_vae import CandidateVAE
from trainer.trainer import BetaVaeTrainer, TrainerConfig
from config.general_config import GeneralConfig
from model.embedder import EmbedderType
from dataset.utils import pad_collate
from dataset.dataset import SellersDataset

In [2]:
from dataset.utils import normalizeString

# Sanity check of the text normaliser on one sample description. The input is
# written as adjacent string literals, which Python joins into a single string.
normalizeString(
    "english basic im a passionate content writer i have written research "
    "articles for about eight years and this vast experience has shaped my "
    "research and writing profession i value quality work and i am driven by "
    "the need to offer the best and meet the clients' needs some of the "
    "preferred areas of research include nursing public health psychology "
    "business marketing and economics bsc nursing nairobi university kenya "
    "graduated 2012 my skills are typewriting data entry microsoft word "
    "typing editing content writing"
)

'english basic im a passionate content writer i have written research articles for about eight years and this vast experience has shaped my research and writing profession i value quality work and i am driven by the need to offer the best and meet the clients needs some of the preferred areas of research include nursing public health psychology business marketing and economics bsc nursing nairobi university kenya graduated 2012 my skills are typewriting data entry microsoft word typing editing content writing'

In [3]:
# dataset.vocab.word2index

### Constants

In [4]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Read the "vae" section of the project configuration. A parse failure is
# deliberately left to propagate: printing it and carrying on would leave
# `config` unbound and surface as an unrelated NameError a few lines below.
with open("config/config.yaml", "r") as file:
    config = yaml.safe_load(file)["vae"]

# GeneralConfig is built from the "general" section alone; every component
# config receives its own section merged with the same "general" keys.
general_config = GeneralConfig(**config["general"])
encoder_config = CandidateEncoderConfig(**config["encoder"], **config["general"])
decoder_config = CandidateDecoderConfig(**config["decoder"], **config["general"])
trainer_config = TrainerConfig(**config["trainer"], **config["general"])

# TensorBoard event files go to <checkpoints_dir>/runs, created on demand.
log_dir = os.path.join(general_config.checkpoints_dir, "runs")
os.makedirs(log_dir, exist_ok=True)
writer_tensorboard = SummaryWriter(log_dir)

In [5]:
# %reload_ext tensorboard
# %tensorboard --logdir $log_dir --port=6008

In [6]:
dataset = SellersDataset(
    # Data locations. `datset_path` (sic) is the attribute name read from the
    # config object here; the spelling must stay in sync with that class.
    dataset_path=general_config.datset_path,
    raw_data_path=general_config.raw_data_path,
    # Embedding setup: the embedding size is taken from the encoder's LSTM
    # hidden dimension.
    embedder_name=general_config.embedder_name,
    nn_embedding_size=encoder_config.lstm_hidden_dim,
    # Bag-of-words options.
    bow_remove_stopwords=general_config.bow_remove_stopwords,
    bow_remove_sentiment=general_config.bow_remove_sentiment,
    trim_tr=general_config.trim_tr,
    device=DEVICE,
)
# NOTE(review): presumably `prepare_dataset()` builds the stored dataset that
# `load_dataset()` reads — confirm before enabling it.
# dataset.prepare_dataset()
dataset.load_dataset()

Loading dataset data/dataset/...
[2022-05-30 01:18:27,045] {dataset.py:226} INFO - Loading dataset data/dataset/...
Loaded dataset data/dataset/!
[2022-05-30 01:18:27,358] {dataset.py:245} INFO - Loaded dataset data/dataset/!




In [7]:
# Display the `n_words` attribute of the dataset's bag-of-words vocabulary
# (presumably the vocabulary size — confirm against the vocab class).
dataset.bow_vocab.n_words

14419

## Prepare data

### Prepare dataloader

In [8]:
# Batch the dataset for training. The collate function is built from the
# vocabulary's pad token; `shuffle` is not passed, so DataLoader's default
# applies.
dataloader = DataLoader(
    dataset=dataset,
    collate_fn=pad_collate(dataset.vocab.pad_token),
    batch_size=general_config.batch_size,
)

In [9]:
# Build the VAE from the three configs plus the dataset's vocabulary and
# embedder, then rebind the name to the result of moving it to DEVICE.
candidate_vae = CandidateVAE(
    general_config,
    encoder_config,
    decoder_config,
    dataset.vocab,
    dataset.embedder,
)
candidate_vae = candidate_vae.to(DEVICE)

# Check encoder / decoder separately

In [10]:
# (
#     input_pad,
#     input_lengths,
#     target_pad,
#     mask,
#     max_target_length,
#     target_skills,
#     target_education,
#     target_languages,
# ) = next(iter(dataloader))

# input_pad = input_pad.to(DEVICE)
# target_pad = target_pad.to(DEVICE)
# input_lengths = input_lengths.to("cpu")

In [11]:
# mu, var, outputs, (hn, cn) = candidate_vae.encoder(input_pad, input_lengths)
# mu.shape, var.shape, outputs.shape

# latent_vector = candidate_vae.decoder.reparameterize(mu, var)

In [12]:
# init_hidden = candidate_vae.decoder.init_hidden_cell(latent_vector.shape[0])

In [13]:
# output, hidden, attn_weights, attn_mu, attn_var = candidate_vae.decoder(
#     latent_vector, init_hidden, outputs, True
# )
# output = torch.argmax(output, dim=1).view(-1, 1)

# if general_config.embedder_name != EmbedderType.LANG:
#     output = candidate_vae.embed_output(output)

In [14]:
# output, hidden, attn_weights, attn_mu, attn_var = candidate_vae.decoder(
#     output, hidden, outputs, False
# )

# output = torch.argmax(output, dim=1).view(-1, 1)
# if general_config.embedder_name != EmbedderType.LANG:
#     output = candidate_vae.embed_output(output)

In [15]:
# output, hidden, attn_weights, attn_mu, attn_var = candidate_vae.decoder(
#     output, hidden, outputs, False
# )
# output = torch.argmax(output, dim=1).view(-1, 1)
# if general_config.embedder_name != EmbedderType.LANG:
#     output = candidate_vae.embed_output(output)

# Check all together for batched input

In [16]:
# (
#     input_pad,
#     input_lengths,
#     target_pad,
#     mask,
#     max_target_length,
#     target_skills,
#     target_education,
#     target_languages,
# ) = next(iter(dataloader))

# input_pad = input_pad.to(DEVICE)
# target_pad = target_pad.to(DEVICE)
# input_lengths = input_lengths.to("cpu")

# outputs, attentions = candidate_vae.forward(input_pad, input_lengths)

# Check trainer

In [17]:
# Wire the model, the general and trainer configs, the dataloader and the
# TensorBoard writer into the trainer. Arguments are positional, so their
# order must match BetaVaeTrainer's signature (not visible in this notebook).
trainer = BetaVaeTrainer(
    candidate_vae,
    general_config,
    trainer_config,
    dataloader,
    writer_tensorboard,
)

Initializing BetaVaeTrainer...
[2022-05-30 01:18:29,765] {trainer.py:227} INFO - Initializing BetaVaeTrainer...
Done: BetaVaeTrainer initialized!
[2022-05-30 01:18:29,790] {trainer.py:339} INFO - Done: BetaVaeTrainer initialized!


In [None]:
# Start training. NOTE(review): in the run recorded below each epoch of 4101
# batches took about 45 minutes, so expect this cell to run for many hours.
trainer.fit()

Training loop...
[2022-05-30 01:18:29,793] {trainer.py:690} INFO - Training loop...
Epoch 0/20
[2022-05-30 01:18:29,794] {trainer.py:692} INFO - Epoch 0/20


100%|███████████████████████████████████████| 4101/4101 [45:54<00:00,  1.49it/s]

Epoch 1/20
[2022-05-30 02:04:24,284] {trainer.py:692} INFO - Epoch 1/20



100%|███████████████████████████████████████| 4101/4101 [45:09<00:00,  1.51it/s]

Epoch 2/20
[2022-05-30 02:49:33,845] {trainer.py:692} INFO - Epoch 2/20



100%|███████████████████████████████████████| 4101/4101 [45:02<00:00,  1.52it/s]

Epoch 3/20
[2022-05-30 03:34:36,655] {trainer.py:692} INFO - Epoch 3/20



100%|███████████████████████████████████████| 4101/4101 [45:04<00:00,  1.52it/s]

Epoch 4/20
[2022-05-30 04:19:40,780] {trainer.py:692} INFO - Epoch 4/20



100%|███████████████████████████████████████| 4101/4101 [45:05<00:00,  1.52it/s]

Epoch 5/20
[2022-05-30 05:04:46,048] {trainer.py:692} INFO - Epoch 5/20



100%|███████████████████████████████████████| 4101/4101 [45:07<00:00,  1.51it/s]

Epoch 6/20
[2022-05-30 05:49:53,196] {trainer.py:692} INFO - Epoch 6/20



100%|███████████████████████████████████████| 4101/4101 [45:06<00:00,  1.52it/s]

Epoch 7/20
[2022-05-30 06:34:59,997] {trainer.py:692} INFO - Epoch 7/20



100%|███████████████████████████████████████| 4101/4101 [44:59<00:00,  1.52it/s]

Epoch 8/20
[2022-05-30 07:19:59,783] {trainer.py:692} INFO - Epoch 8/20



100%|███████████████████████████████████████| 4101/4101 [45:16<00:00,  1.51it/s]

Epoch 9/20
[2022-05-30 08:05:15,867] {trainer.py:692} INFO - Epoch 9/20



 23%|█████████▏                              | 944/4101 [11:05<35:32,  1.48it/s]