In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from lstm2 import train, LSTM_2

from config import filenames, folders

from matplotlib import pyplot as plt

device = 'cpu'

config = {
    'model_name': 'LSTM_2',
    'feature': 'testing-tf-metric',
    'max_len': 48,
    'min_freq_src': 4,
    'min_freq_trg': 4,

    'embedding_dim': 128,
    'hidden_size': 256,
    'num_layers': 3,

    'num_epochs': 10,
    'weight_decay': 1e-5,
    'label_smoothing': 0.1,

    'dropout_enc': 0.1,
    'dropout_dec': 0.1,
    'dropout_emb': 0.1,
    'dropout_attention': 0.1,

    'learning_rate': 1e-3,
    'gamma': 0.2,
    'patience': 2,
    'threshold': 5e-4,
    'batch_size': 128,

    'use_tf': False,
    'tf_start': 1e+10,
    'tf_decrease': 1e+10
}

def plot_losses(train_losses, val_losses):
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label="Training Loss")
    plt.plot(val_losses, label="Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training and Validation Loss Over Epochs")
    plt.legend()
    plt.grid()
    plt.show()

In [4]:
from dataset import Vocab
vocab_src = Vocab(filenames['train_src'], min_freq=config['min_freq_src'])
vocab_trg = Vocab(filenames['train_trg'], min_freq=config['min_freq_trg'])

In [5]:
print(len(vocab_src))
print(len(vocab_trg))
config['src_vocab_size'] = len(vocab_src)
config['trg_vocab_size'] = len(vocab_trg)

29798
21555


In [6]:
from dataset import TranslationDataset
train_dataset = TranslationDataset(vocab_src, 
                                vocab_trg, 
                                filenames['train_src'], 
                                filenames['train_trg'], 
                                max_len=config['max_len'], 
                                device=device)
val_dataset = TranslationDataset(vocab_src, 
                                vocab_trg, 
                                filenames['test_src'], 
                                filenames['test_trg'], 
                                max_len=72, 
                                device=device, 
                                sort_lengths=False)


100%|██████████| 195915/195915 [00:13<00:00, 14677.17it/s]
100%|██████████| 986/986 [00:00<00:00, 16056.59it/s]


In [7]:
model = LSTM_2(config=config)

In [8]:
model.load('../weights/lstm2/LSTM_2-more-dropouts-23.0m-15epoch.pt')

In [106]:
from submission import get_bleu
from dataset import TestDataLoader, RawDataset

val_loader = TestDataLoader(val_dataset, batch_size=2)

raw = RawDataset(filenames['test_src'])

# get_bleu(model, val_loader, vocab_trg, filenames, device=device, raw_dataset=raw)

In [349]:
src_seq = None
trg_seq = None
for a, b in val_loader:
    src_seq = a
    trg_seq = b
src_seq.size()

torch.Size([2, 35])

In [350]:
import torch
from torch.nn import functional as F

unk_idx, pad_idx, bos_idx, eos_idx, num_idx = 0, 1, 2, 3, 4

self = model
beam_width = 3
bos_idx = 2

model.eval()
batch_size = src_seq.size(0)
k = beam_width

In [None]:
# encoder forward
with torch.no_grad():
    src_embedded = self.src_embedding(src_seq)  # (batch_size, seq_len, emb_dim)
    encoder_outputs, (hidden, cell) = self.encoder(src_embedded)
    encoder_outputs = self.encoder_output_proj(encoder_outputs)  # (batch_size, seq_len, hidden_dim)
    encoder_outputs = encoder_outputs.contiguous()

    hidden = self._project_hidden(hidden, self.encoder_hidden_proj)  # (num_layers, batch_size, hidden_dim)
    cell = self._project_hidden(cell, self.encoder_cell_proj)

In [None]:
# expand encoder output for beams
encoder_outputs = encoder_outputs.unsqueeze(1).repeat(1, k, 1, 1)  # (batch_size, k, seq_len, hidden_dim)
print(encoder_outputs.size())

torch.Size([2, 3, 35, 256])


In [None]:
print(encoder_outputs.size())

torch.Size([6, 35, 256])


In [None]:
hidden = hidden.unsqueeze(2).repeat(1, 1, k, 1)  # (num_layers, batch_size, k, hidden_dim)
hidden = hidden.view(hidden.size(0), -1, hidden.size(-1))  # (num_layers, batch_size*k, hidden_dim)
cell = cell.unsqueeze(2).repeat(1, 1, k, 1).view(hidden.size(0), -1, hidden.size(-1))
print('Hidden sizes: ', hidden.size(), cell.size())

torch.Size([3, 6, 256]) torch.Size([3, 6, 256])


In [None]:
beam_scores = torch.zeros((batch_size, k), dtype=torch.float, device=device)  # (batch_size, k)
beam_scores[:, 1:] = -1e10  # force beam 0 to be top 1

tensor([[ 0.0000e+00, -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00, -1.0000e+09, -1.0000e+09]])

In [None]:
# --- Initialize Beams ---
beam_scores = torch.zeros((batch_size, k), dtype=torch.float, device=device)  # (batch_size, k)
beam_scores[:, 1:] = -1e10  # force beam 0 to be top 1
print('Beam scores:', *beam_scores)

beam_tokens = torch.full((batch_size, k, 1), bos_idx, dtype=torch.long, device=device)  # (batch_size, k, 1)
print('Beam tokens:', *beam_tokens)

Beam scores: tensor([ 0.0000e+00, -1.0000e+09, -1.0000e+09]) tensor([ 0.0000e+00, -1.0000e+09, -1.0000e+09])
Beam tokens: tensor([[2],
        [2],
        [2]]) tensor([[2],
        [2],
        [2]])


### Cycle

In [455]:
step = 2

In [456]:
flat_hidden = hidden  # (num_layers, batch_size*k, hidden_dim)
flat_cell = cell
flat_tokens = beam_tokens.view(batch_size * k, -1)  # (batch_size*k, seq_len)

In [457]:
print('Flat hidd size:\t', flat_hidden.size())
print('Flat cell size:\t', flat_cell.size())
print('Flat tokens size:\t', flat_tokens.size())
print('Flat tokens:\t', flat_tokens)

Flat hidd size:	 torch.Size([3, 6, 256])
Flat cell size:	 torch.Size([3, 6, 256])
Flat tokens size:	 torch.Size([6, 3])
Flat tokens:	 tensor([[    2,  9410, 20904],
        [    2,  9410,    15],
        [    2,   857,  9410],
        [    2, 19336, 21475],
        [    2, 19341, 20657],
        [    2, 17795, 19336]])


In [None]:
# decoder
current_trg = flat_tokens[:, -1].unsqueeze(1)  # (batch_size*k, 1)

trg_embedded = self.trg_embedding(current_trg)  # (batch_size*k, 1, emb_dim)

decoder_output, (new_hidden, new_cell) = self.decoder(trg_embedded, (flat_hidden, flat_cell))

In [459]:
print('Current trg:', current_trg)

Current trg: tensor([[20904],
        [   15],
        [ 9410],
        [21475],
        [20657],
        [19336]])


In [460]:
mask = (src_seq != pad_idx).unsqueeze(1)  # (batch_size, 1, src_len)
print(mask.size())
mask = mask.repeat(k, 1, 1)
mask.size()

torch.Size([2, 1, 35])


torch.Size([6, 1, 35])

In [None]:
# attention + logits
energy = torch.bmm(decoder_output, encoder_outputs.transpose(1, 2))  # (batch_size*k, 1, seq_len)

energy = energy.masked_fill(mask == 0, -1e10)


attention = F.softmax(energy, dim=-1)
context = torch.bmm(attention, encoder_outputs)  # (batch_size*k, 1, hidden_dim)
combined = torch.cat([decoder_output, context], dim=2)
logits = self.fc(combined).squeeze(1)  # (batch_size*k, vocab_size)
logits[:, unk_idx] = -1e10  # Block <UNK>


In [462]:
print('logits size:\t', logits.size())

logits size:	 torch.Size([6, 21555])


In [463]:
print(" ".join(vocab_src.decode(src_seq[0])))
print(" ".join(vocab_src.decode(src_seq[1])))

ich möchte jeden von ihnen heute dazu anregen , mit in das bild zu kommen und zögern sie nicht jemanden zu fragen : " würden sie ein bild von uns machen ? "
vielen dank .


In [None]:
import numpy as np
# scores (log probs)
log_probs = F.log_softmax(logits, dim=-1)  # (batch_size*k, vocab_size)
vocab_size = log_probs.size(-1)

In [465]:
log_probs.size()

torch.Size([6, 21555])

In [466]:
seq_n = 0
topk = torch.topk(log_probs, k, dim=-1)
probs = np.round(np.exp(topk.values.detach().numpy()[seq_n])*100, 2)
candidates = vocab_trg.decode(topk.indices[seq_n])
print(*zip(probs, candidates))

(np.float32(85.88), 'to') (np.float32(1.3), 'everybody') (np.float32(1.23), 'you')


In [467]:
print(beam_scores.view(-1, 1).detach().numpy())
print(np.round(np.exp(log_probs.detach().numpy()), 2))

[[-1.07818   ]
 [-2.709785  ]
 [-3.2239377 ]
 [-0.32311833]
 [-3.8356533 ]
 [-4.241947  ]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [468]:
print(beam_scores.view(-1, 1).size())
print(log_probs.size())

torch.Size([6, 1])
torch.Size([6, 21555])


In [469]:
next_scores = log_probs + beam_scores.view(-1, 1)  # (batch_size*k, vocab_size)

In [470]:
print(next_scores.size())

torch.Size([6, 21555])


In [None]:
# reshape to (batch_size, k * vocab_size)
next_scores = next_scores.view(batch_size, k * vocab_size)
next_scores.size()

torch.Size([2, 64665])

In [None]:
# topk candidates for each batch 
next_scores, next_tokens = torch.topk(next_scores, k, dim=1)  # log_probs: (batch_size, k), indices: (batch_size, k)

In [473]:
print('probs:', np.round(np.exp(next_scores.detach().numpy()), 2))
print('next tokens:', next_tokens)

probs: [[0.29 0.03 0.01]
 [0.33 0.28 0.02]]
next tokens: tensor([[19572, 64014, 27594],
        [   35, 20657, 17795]])


In [None]:
beam_indices = next_tokens // vocab_size  # what beam is token from
token_indices = next_tokens % vocab_size  # what token

In [475]:
print(beam_indices)
print(token_indices)

tensor([[0, 2, 1],
        [0, 0, 0]])
tensor([[19572, 20904,  6039],
        [   35, 20657, 17795]])


In [476]:
print('Batch source sequernces:')
print(" ".join(vocab_src.decode(src_seq[0])))
print(" ".join(vocab_src.decode(src_seq[1])))
print()
print(f'Candidates for position {step}:')
print(*map(vocab_trg.decode, token_indices.detach().numpy()), sep='\n')
print(f'Probabilities for position {step}:')
print(*map(np.exp, np.round(next_scores.detach().numpy(), 2)), sep='\n')

Batch source sequernces:
ich möchte jeden von ihnen heute dazu anregen , mit in das bild zu kommen und zögern sie nicht jemanden zu fragen : " würden sie ein bild von uns machen ? "
vielen dank .

Candidates for position 2:
['to', 'want', 'each']
['.', 'very', 'so']
Probabilities for position 2:
[0.29229257 0.02930492 0.00927902]
[0.33621648 0.28083163 0.024972  ]


#### new beam states

In [None]:
# new scores
beam_scores = next_scores

In [478]:
print(beam_scores.detach().numpy())

[[-1.2304108 -3.5302777 -4.676137 ]
 [-1.0944414 -1.2699423 -3.6939917]]


In [None]:
# new beam tokens
beam_tokens = torch.cat([
    beam_tokens[torch.arange(batch_size).unsqueeze(1), beam_indices],  # (batch_size, k) -> (batch_size, k, seq_len)
    token_indices.unsqueeze(-1)
], dim=-1)

In [480]:
print(beam_tokens)

tensor([[[    2,  9410, 20904, 19572],
         [    2,   857,  9410, 20904],
         [    2,  9410,    15,  6039]],

        [[    2, 19336, 21475,    35],
         [    2, 19336, 21475, 20657],
         [    2, 19336, 21475, 17795]]])


In [481]:
print('beams for seq0 in batch:')
print(*map(vocab_trg.decode, beam_tokens[0, :].numpy()), sep='\n')
print('beam probabilities:')
print(*map(np.exp, beam_scores[0, :].detach().numpy()), sep='\n')

print('\nbeams for seq1 in batch:')
print(*map(vocab_trg.decode, beam_tokens[1, :].numpy()), sep='\n')
print('beam probabilities:')
print(*map(np.exp, beam_scores[1, :].detach().numpy()), sep='\n')

beams for seq0 in batch:
['i', 'want', 'to']
['and', 'i', 'want']
['i', "'d", 'each']
beam probabilities:
0.29217252
0.02929678
0.009314928

beams for seq1 in batch:
['thank', 'you', '.']
['thank', 'you', 'very']
['thank', 'you', 'so']
beam probabilities:
0.33472654
0.28084785
0.024872521


In [482]:
num_layers = hidden.size(0)

In [483]:
new_hidden.view(num_layers, batch_size, k, -1).size()  # (num_layers, batch_size, k, hidden_dim)

torch.Size([3, 2, 3, 256])

In [None]:
# new hidden states
hidden = new_hidden.view(num_layers, batch_size, k, -1)  # (num_layers, batch_size, k, hidden_dim)

In [485]:
hidden.size()

torch.Size([3, 2, 3, 256])

In [486]:
print(hidden.size())

torch.Size([3, 2, 3, 256])


In [487]:
hidden = hidden.view(hidden.size(0), -1, hidden.size(-1))  # (num_layers, batch_size*k, hidden_dim)

In [449]:
cell = new_cell.view(num_layers, batch_size, k, -1)  # (num_layers, batch_size, k, hidden_dim)

In [451]:
cell = cell.view(cell.size(0), -1, cell.size(-1))

In [306]:
# 3. Update hidden/cell states

# cell = new_cell.view(cell.size(0), batch_size, k, -1)
# cell = cell[torch.arange(batch_size).unsqueeze(1), beam_indices].view(cell.size(0), -1, cell.size(-1))

In [453]:
# --- Check for EOS ---
eos_mask = (token_indices == eos_idx)
if eos_mask.all():
    # Store finished beams and adjust active beams\
    pass
    # break  # (Implement early stopping logic here)

In [454]:
eos_mask

tensor([[False, False, False],
        [False, False, False]])

In [498]:
trg_seq = beam_tokens[torch.arange(batch_size), beam_scores.argmax(dim=1)]  # (batch_size, seq_len)
trg_seq

tensor([[    2,  9410, 20904, 19572, 19048,  6743, 13293,   684,  9410, 20904,
         19572, 19048,   978, 13293, 21089,  9410, 20904, 19572, 19048,  6743,
         13293,   684,  9410, 20904, 19572, 19048,   978, 13293, 21089,  9410,
         20904, 19572, 19048,  6743, 13293,   684,  9410, 20904, 19572, 19048,
           978, 13293, 21089,  9410, 20904, 19572, 19048,   978, 13293, 20904,
         19572, 19048,   978, 13293],
        [    2, 19336, 21475,    35,     3,    35,     3,    35,     3,    35,
             3,    35,     3,    35,     3,    35,     3,    35,     3,    35,
             3,    35,     3,    35,     3,    35,     3,    35,     3,    35,
             3,    35,     3,    35,     3,    35,     3,    35,     3,    35,
             3,    35,     3,    35,     3,    35,     3,    35,     3,    35,
             3,    35,     3,    35]])