In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np
import random
import math
import time

In [2]:
SEED = 1234

# Seed every random number generator the notebook relies on (Python, NumPy,
# PyTorch CPU and CUDA) so that a fresh Restart & Run All repeats the same run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# The flag must be spelled exactly `deterministic`: assigning to a misspelt
# attribute is accepted silently and leaves cuDNN's behaviour unchanged.
torch.backends.cudnn.deterministic = True

##### Data preparation

In [3]:
# Load the small German and English spaCy pipelines; the cells below only use
# their tokenizers.
spacy_de, spacy_en = (
    spacy.load(model_name)
    for model_name in ('de_core_news_sm', 'en_core_web_sm')
)

In [14]:
def tokenize_de(text):
    """Split a German string into a list of token strings via the spaCy tokenizer."""
    german_doc = spacy_de.tokenizer(text)
    return [token.text for token in german_doc]
def tokenize_en(text):
    """Split an English string into a list of token strings via the spaCy tokenizer."""
    english_doc = spacy_en.tokenizer(text)
    return [token.text for token in english_doc]

In [15]:
# Source (German) field: spaCy tokens, lower-cased, wrapped in <sos> ... <eos>.
SRC = Field(
    tokenize=tokenize_de,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

# Target (English) field, configured the same way with the English tokenizer.
TRG = Field(
    tokenize=tokenize_en,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

In [16]:
# Load the Multi30k train / validation / test splits; the `.de` side is
# processed with SRC and the `.en` side with TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

In [17]:
# Report the number of examples in each split; every label names the split
# whose length is actually printed on that line.
print(f"Training examples length {len(train_data.examples)}")
print(f"Test examples length {len(test_data.examples)}")
print(f"Validation examples length {len(valid_data.examples)}")

Training examples length 29000
Training examples length 1000
Training examples length 1014


In [18]:
# Peek at the first training example: its attributes are the tokenised,
# lower-cased source ('src') and target ('trg') sentences.
print(vars(train_data.examples[0]))

{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


A token must appear at least twice in the training data to be added to the vocabulary (controlled by the `min_freq` parameter below).

In [19]:
# Build both vocabularies from the training split only, keeping tokens that
# occur at least `min_freq` times.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [20]:
# Vocabulary sizes; these become the encoder input and decoder output
# dimensions further down.
print(f"Source vocab length : {len(SRC.vocab)}")
print(f"Target vocab length : {len(TRG.vocab)}")

Source vocab length : 7853
Target vocab length : 5893


In [21]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [22]:
# One BucketIterator per split, all sharing the same batch size and device.
BATCH_SIZE = 128
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

In [23]:
class Encoder(nn.Module):
    """Single-layer GRU encoder that reduces a source sequence to its final hidden state.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        emb_dim: Width of each token embedding.
        hid_dim: Width of the GRU hidden state.
        dropout: Dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hid_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Embed `src`, run it through the GRU and return the final hidden state.

        Args:
            src: Integer token indices. The GRU is not batch-first, so the
                expected layout is [src_len, batch_size].

        Returns:
            The GRU's final hidden state, [1, batch_size, hid_dim]. The
            per-step outputs are discarded.
        """
        token_vectors = self.embedding(src)
        _, final_hidden = self.rnn(self.dropout(token_vectors))
        return final_hidden

In [37]:
class Decoder(nn.Module):
    """Single-step GRU decoder that sees the encoder context at every step.

    The context vector is concatenated both to the GRU input and to the
    features fed into the output projection.

    Args:
        output_dim: Target vocabulary size (embedding rows and logits per step).
        emb_dim: Width of each target-token embedding.
        hid_dim: Width of the GRU hidden state and of the context vector.
        dropout: Dropout probability applied to the embedded token.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim

        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        # GRU input = token embedding + context vector.
        self.rnn = nn.GRU(input_size=emb_dim + hid_dim, hidden_size=hid_dim)
        # Projection input = token embedding + new hidden state + context vector.
        self.fc_out = nn.Linear(in_features=emb_dim + 2 * hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, context):
        """Decode one time step.

        Args:
            input: Token indices for the current step, [batch_size].
            hidden: Previous decoder hidden state, [1, batch_size, hid_dim].
            context: Encoder context vector, [1, batch_size, hid_dim].

        Returns:
            Tuple `(prediction, hidden)`: unnormalised vocabulary scores of
            shape [batch_size, output_dim] and the updated hidden state.
        """
        # Add a length-1 sequence axis so the token can go through the GRU.
        step_tokens = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_tokens))

        rnn_input = torch.cat((embedded, context), dim=2)
        _, hidden = self.rnn(rnn_input, hidden)

        # Drop the sequence axis before concatenating along the feature axis.
        features = torch.cat(
            (embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)),
            dim=1,
        )
        prediction = self.fc_out(features)

        return prediction, hidden

In [38]:
class Seq2Seq(nn.Module):
    """Ties an encoder and a decoder together into a sequence-to-sequence model.

    Args:
        encoder: Module mapping a source batch to a context vector; must expose `hid_dim`.
        decoder: Module decoding one step at a time; must expose `hid_dim` and `output_dim`.
        device: Device on which the output buffer is placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        assert encoder.hid_dim == decoder.hid_dim, "Hidden dimensions of encoder and decoder should be the same"

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Decode `trg` step by step, conditioned on the encoding of `src`.

        Args:
            src: Source token indices, passed straight to the encoder.
            trg: Target token indices, [trg_len, batch_size]; row 0 supplies the
                first decoder input.
            teacher_forcing_ratio: Probability, drawn once per step, of feeding
                the ground-truth token instead of the model's own prediction.

        Returns:
            Tensor [trg_len, batch_size, output_dim] of per-step scores. Row 0
            is never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        # The encoder's final state is both the fixed context and the
        # decoder's initial hidden state.
        context = self.encoder(src)
        hidden = context

        step_input = trg[0, :]

        for t in range(1, trg_len):
            step_scores, hidden = self.decoder(step_input, hidden, context)
            outputs[t] = step_scores

            use_ground_truth = random.random() < teacher_forcing_ratio
            if use_ground_truth:
                step_input = trg[t]
            else:
                step_input = step_scores.argmax(1)

        return outputs

In [39]:
# Model hyper-parameters. The vocabulary sizes fix the embedding-table sizes
# and the width of the decoder's output layer.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
# The decoder takes its own embedding width, so changing DEC_EMB_DIM
# actually takes effect.
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [40]:
def init_weights(m):
    """Re-initialise every parameter of `m` in place from a normal N(0, 0.1^2) distribution."""
    for param in m.parameters():
        nn.init.normal_(param.data, mean=0, std=0.1)
# `apply` calls init_weights on every submodule and on the model itself; as the
# last expression of the cell it also displays the model's structure.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): GRU(768, 512)
    (fc_out): Linear(in_features=1280, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [41]:
def count_parameters(model):
    """Return the total number of scalar values in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the model size, with thousands separators.
print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 14,219,781 trainable parameters


In [42]:
# Adam over all model parameters; no hyper-parameters are passed, so the
# library defaults apply.
optimizer = optim.Adam(model.parameters())

In [43]:
# Index of the padding token in the target vocabulary; target positions holding
# this index are ignored when the loss is computed.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index= TRG_PAD_IDX)

In [44]:
def train(model , iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Callable as `model(src, trg)`, returning scores of shape
            [trg_len, batch_size, output_dim].
        iterator: Sized iterable of batches exposing `.src` and `.trg`.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Loss taking (scores, targets) flattened over time and batch.
        clip: Maximum gradient norm applied before each optimizer step.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()

        scores = model(source, target)
        vocab_size = scores.shape[-1]

        # Skip position 0 (never predicted) and flatten time x batch so the
        # criterion sees one row of scores per target token.
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_target = target[1:].view(-1)

        loss = criterion(flat_scores, flat_target)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [45]:
def evaluate(model , iterator, criterion):
    """Compute the mean per-batch loss without updating the model.

    The model is switched to eval mode, gradients are disabled, and the
    forward pass is called with a teacher-forcing ratio of 0.

    Args:
        model: Callable as `model(src, trg, teacher_forcing_ratio)`.
        iterator: Sized iterable of batches exposing `.src` and `.trg`.
        criterion: Loss taking (scores, targets) flattened over time and batch.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg

            scores = model(source, target, 0)
            vocab_size = scores.shape[-1]

            # Skip position 0 and flatten time x batch, as in training.
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_target = target[1:].view(-1)

            running_loss += criterion(flat_scores, flat_target).item()

    return running_loss / len(iterator)

In [46]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        Tuple `(minutes, seconds)` of ints, both truncated toward zero.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - whole_minutes * 60)
    return whole_minutes, leftover_seconds

In [47]:
# Show which accelerator is in use. The query is guarded so the cell does not
# raise on a machine without CUDA, matching the CPU fallback used for `device`.
torch.cuda.get_device_name() if torch.cuda.is_available() else 'cpu'

'GeForce GTX 1650'

In [48]:
N_EPOCHS = 10
CLIP = 1  # maximum gradient norm handed to train()

# Lowest validation loss seen so far; a checkpoint is written whenever it improves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights of the epoch with the best validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'de2en_model_tut2.pt')

    # Perplexity is reported as exp(loss).
    print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"\nTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}")
    # No leading backslash: "\V" is not a valid escape sequence and would print
    # a literal backslash in front of the label.
    print(f"Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}")

Epoch: 01 | Time: 1m 11s

Train Loss: 4.767 | Train PPL: 117.608
\Val. Loss: 4.389 | Val. PPL:  80.555
Epoch: 02 | Time: 1m 9s

Train Loss: 3.754 | Train PPL:  42.683
\Val. Loss: 4.007 | Val. PPL:  54.995
Epoch: 03 | Time: 1m 9s

Train Loss: 3.299 | Train PPL:  27.092
\Val. Loss: 3.778 | Val. PPL:  43.747
Epoch: 04 | Time: 1m 9s

Train Loss: 2.985 | Train PPL:  19.779
\Val. Loss: 3.694 | Val. PPL:  40.207
Epoch: 05 | Time: 1m 9s

Train Loss: 2.748 | Train PPL:  15.611
\Val. Loss: 3.552 | Val. PPL:  34.878
Epoch: 06 | Time: 1m 9s

Train Loss: 2.522 | Train PPL:  12.450
\Val. Loss: 3.561 | Val. PPL:  35.206
Epoch: 07 | Time: 1m 9s

Train Loss: 2.380 | Train PPL:  10.805
\Val. Loss: 3.458 | Val. PPL:  31.750
Epoch: 08 | Time: 1m 9s

Train Loss: 2.212 | Train PPL:   9.134
\Val. Loss: 3.464 | Val. PPL:  31.952
Epoch: 09 | Time: 1m 8s

Train Loss: 2.085 | Train PPL:   8.041
\Val. Loss: 3.533 | Val. PPL:  34.212
Epoch: 10 | Time: 1m 8s

Train Loss: 1.971 | Train PPL:   7.177
\Val. Loss: 3.524

In [49]:
# Restore the checkpoint with the best validation loss. `map_location` remaps
# the saved tensors onto the current `device`, so a checkpoint written on a GPU
# can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load('de2en_model_tut2.pt', map_location=device))

test_loss = evaluate(model, test_iterator, criterion)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |")

| Test Loss: 3.440 | Test PPL:  31.181 |


In [71]:
# Rebuilds the three iterators with the same arguments as the earlier iterator
# cell, replacing the existing `train_iterator` / `valid_iterator` /
# `test_iterator` objects.
BATCH_SIZE = 128 
train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, test_data),
                                                                     batch_size = BATCH_SIZE,
                                                                     device=device)

In [72]:
# Take only the first batch and stop. Despite the `*_test_data` names, the
# batch comes from `train_iterator`, not from the test split.
for i in train_iterator:
    src_test_data = i.src
    trg_test_data = i.trg
    break

In [60]:
# Forward pass in eval mode with gradients disabled. The third argument is the
# teacher-forcing ratio: 0 means the decoder is always fed its own predictions.
model.eval()
with torch.no_grad():
    output = model(src_test_data, trg_test_data, 0)

In [67]:
# Seq2Seq.forward returns a tensor laid out as [trg_len, batch_size, trg_vocab_size].
output.shape

torch.Size([14, 128, 5893])

In [55]:
# NOTE(review): `a` is not defined in any cell visible in this notebook, so this
# cell depends on leftover kernel state and will presumably fail after
# Restart & Run All — define `a` explicitly or remove the cell.
a.shape

torch.Size([10, 128])

In [57]:
# Number of source tokens stored in the first training example.
len(vars(train_data.examples[0])['src'])

13

In [73]:
# Display the tensor of token indices for the source batch grabbed above.
src_test_data

tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [   4,    4,    4,  ...,    4,    4,    4],
        [ 700,  788,  507,  ..., 2777,  941,  292],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0')