# First attempt to create a new representation method for a candidates

In [1]:
# %load_ext lab_black

In [2]:
import torch.nn as nn
from torch import optim
import torch
import torch.nn.functional as F
from torch.autograd import Variable


class Encoder(nn.Module):
  def __init__(self, input_size, hidden_size, bidirectional = True):
    super(Encoder, self).__init__()
    self.hidden_size = hidden_size
    self.input_size = input_size
    self.bidirectional = bidirectional
    
    self.lstm = nn.LSTM(input_size, hidden_size, bidirectional = bidirectional)
  
  def forward(self, inputs, hidden):
    
    output, hidden = self.lstm(inputs.view(1, 1, self.input_size), hidden)
    return output, hidden
    
  def init_hidden(self):
    return (torch.zeros(1 + int(self.bidirectional), 1, self.hidden_size),
      torch.zeros(1 + int(self.bidirectional), 1, self.hidden_size))

class AttentionDecoder(nn.Module):
  
  def __init__(self, hidden_size, output_size, vocab_size):
    super(AttentionDecoder, self).__init__()
    self.hidden_size = hidden_size
    self.output_size = output_size
    
    self.attn = nn.Linear(hidden_size + output_size, 1)
    self.lstm = nn.LSTM(hidden_size + vocab_size, output_size) #if we are using embedding hidden_size should be added with embedding of vocab size
    self.final = nn.Linear(output_size, vocab_size)
  
  def init_hidden(self):
    return (torch.zeros(1, 1, self.output_size),
      torch.zeros(1, 1, self.output_size))
  
  def forward(self, decoder_hidden, encoder_outputs, input):
    
    weights = []
    for i in range(len(encoder_outputs)):
      print(decoder_hidden[0][0].shape)
      print(encoder_outputs[0].shape)
      weights.append(self.attn(torch.cat((decoder_hidden[0][0], 
                                          encoder_outputs[i]), dim = 1)))
    normalized_weights = F.softmax(torch.cat(weights, 1), 1)
    
    attn_applied = torch.bmm(normalized_weights.unsqueeze(1),
                             encoder_outputs.view(1, -1, self.hidden_size))
    
    input_lstm = torch.cat((attn_applied[0], input[0]), dim = 1) #if we are using embedding, use embedding of input here instead
    
    output, hidden = self.lstm(input_lstm.unsqueeze(0), decoder_hidden)
    
    output = self.final(output[0])
    
    return output, hidden, normalized_weights
  

bidirectional = True
c = Encoder(10, 20, bidirectional)
a, b = c.forward(torch.randn(10), c.init_hidden())
print(f"Output shape: {a.shape}")
print(f"Hidden shape: {b[0].shape}")
print(f"Cell shape: {b[1].shape}")
print(f"Cat shape {torch.cat((a,a)).shape}")

x = AttentionDecoder(20 * (1 + bidirectional), 25, 30)
y, z, w = x.forward(x.init_hidden(), torch.cat((a,a)), torch.zeros(1,1,30)) #Assuming <SOS> to be all zeros
print(y.shape)
print(z[0].shape)
print(z[1].shape)
print(w)

Output shape: torch.Size([1, 1, 40])
Hidden shape: torch.Size([2, 1, 20])
Cell shape: torch.Size([2, 1, 20])
Cat shape torch.Size([2, 1, 40])
torch.Size([1, 25])
torch.Size([1, 40])
torch.Size([1, 25])
torch.Size([1, 40])
torch.Size([1, 30])
torch.Size([1, 1, 25])
torch.Size([1, 1, 25])
tensor([[0.5000, 0.5000]], grad_fn=<SoftmaxBackward0>)


In [3]:
a

tensor([[[-0.1335, -0.0468, -0.1223,  0.0402,  0.1056, -0.2290, -0.2726,
           0.0667,  0.0794,  0.0335,  0.0589, -0.0310,  0.1119,  0.0896,
           0.1737, -0.0317, -0.0732,  0.2047,  0.0522, -0.1213,  0.1520,
           0.1665,  0.1283,  0.0568,  0.2498, -0.2609, -0.1174, -0.1244,
          -0.0251,  0.0219, -0.0843, -0.0520, -0.0463, -0.0383,  0.1047,
           0.1322,  0.0649,  0.1311,  0.1106,  0.0522]]],
       grad_fn=<CatBackward0>)

In [4]:
b[0]

tensor([[[-0.1335, -0.0468, -0.1223,  0.0402,  0.1056, -0.2290, -0.2726,
           0.0667,  0.0794,  0.0335,  0.0589, -0.0310,  0.1119,  0.0896,
           0.1737, -0.0317, -0.0732,  0.2047,  0.0522, -0.1213]],

        [[ 0.1520,  0.1665,  0.1283,  0.0568,  0.2498, -0.2609, -0.1174,
          -0.1244, -0.0251,  0.0219, -0.0843, -0.0520, -0.0463, -0.0383,
           0.1047,  0.1322,  0.0649,  0.1311,  0.1106,  0.0522]]],
       grad_fn=<StackBackward0>)

### Imports

In [5]:
from pathlib import Path
import yaml

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence
import pandas as pd

from model.encoder import CandidateEncoder, CandidateEncoderConfig
from model.decoder import CandidateDecoder, CandidateDecoderConfig
from config.general_config import GeneralConfig
from model.embedder import EmbedderType
from dataset.utils import pad_collate
from dataset.dataset import SellersDataset
from dataset.language import Lang

### Constants

In [6]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

with open("config/config.yaml", "r") as file:
    try:
        config = yaml.safe_load(file)["vae"]
    except yaml.YAMLError as exc:
        print(exc)

GENERAL_CONFIG = GeneralConfig(**config["general"])

In [7]:
dataset = SellersDataset(
    embedder_name=config["general"]["embedder_name"], data_path=config["general"]["data_path"], device=DEVICE
)
dataset.prepare_dataset()

[2022-05-18 21:13:48,025] {dataset.py:133} INFO - Preparing dataset
[2022-05-18 21:13:48,025] {dataset.py:160} INFO - Detecting languages:


100%|██████████| 4/4 [00:00<00:00, 13.80it/s]

[2022-05-18 21:13:48,317] {dataset.py:165} INFO - Detected languages:
[2022-05-18 21:13:48,318] {dataset.py:166} INFO - lang
en    4
Name: lang, dtype: int64
[2022-05-18 21:13:48,319] {dataset.py:167} INFO - Removing rows not written in english
[2022-05-18 21:13:48,320] {dataset.py:171} INFO - Removed 0 rows



100%|██████████| 4/4 [00:00<00:00, 4847.51it/s]
100%|██████████| 4/4 [00:00<00:00, 5113.45it/s]
100%|██████████| 4/4 [00:00<00:00, 1175.12it/s]
100%|██████████| 4/4 [00:00<00:00, 9956.80it/s]

[2022-05-18 21:13:48,331] {dataset.py:347} INFO - Adding language for languages_str



100%|██████████| 4/4 [00:00<00:00, 13617.87it/s]

[2022-05-18 21:13:48,333] {dataset.py:347} INFO - Adding language for education_str



100%|██████████| 4/4 [00:00<00:00, 22104.37it/s]

[2022-05-18 21:13:48,334] {dataset.py:347} INFO - Adding language for skills_str



100%|██████████| 4/4 [00:00<00:00, 16024.08it/s]

[2022-05-18 21:13:48,336] {dataset.py:347} INFO - Adding language for description_str



100%|██████████| 4/4 [00:00<00:00, 9980.50it/s]


## Prepare data

### Prepare dataloader

In [8]:
loader = DataLoader(dataset, batch_size=GENERAL_CONFIG.batch_size, collate_fn=pad_collate)

In [9]:
config_encoder = CandidateEncoderConfig(
    num_words=dataset.lang.n_words,
    embedding_size=dataset.embedder.size,
    device=DEVICE,
    **config["encoder"],
    **config["general"]
)

encoder = CandidateEncoder(config_encoder).to(DEVICE)

In [10]:
mu, var, outputs = encoder(next(iter(loader)).cuda())
mu.shape, var.shape, outputs.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row["skills_str"] = (


(torch.Size([3, 16]), torch.Size([3, 16]), torch.Size([3, 134, 128]))

In [11]:
def reparameterize(mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
    """
    Will a single z be enough ti compute the expectation
    for the loss??
    :param mu: (Tensor) Mean of the latent Gaussian
    :param logvar: (Tensor) Standard deviation of the latent Gaussian
    :return:
    """
    std = torch.exp(0.5 * logvar)
    eps = torch.randn_like(std)
    return eps * std + mu

In [12]:
latent_vector = reparameterize(mu, var)
latent_vector.shape

torch.Size([3, 16])

In [13]:
config_decoder = CandidateDecoderConfig(
    num_words=dataset.lang.n_words,
    embedding_size=dataset.embedder.size,
    device=DEVICE,
    **config["decoder"],
    **config["general"]
)

decoder = CandidateDecoder(config_decoder).to(DEVICE)

init_hidden = decoder.init_hidden_cell(GENERAL_CONFIG.batch_size)

In [14]:
def pad_strip_sequence(batch):
    template = torch.zeros(GENERAL_CONFIG.max_seq_len, batch.shape[-1], device=DEVICE)
    return pad_sequence([template, *batch], batch_first=True)[1:,:GENERAL_CONFIG.max_seq_len,:]

In [15]:
pad_strip_sequence(outputs).shape

torch.Size([3, 256, 128])

In [16]:
output, hidden, attn_weights = decoder(latent_vector, init_hidden, pad_strip_sequence(outputs), True)
# output.shape