In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np
import random
import math
import time

In [2]:
SEED  = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deteministic = True

##### Data prep started from here

In [3]:
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [4]:
def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)][::-1]

In [5]:
SRC = Field(tokenize = tokenize_de,
           init_token = '<sos>',
           eos_token = '<eos>',
           lower = True)
TRG = Field(tokenize = tokenize_en,
           init_token = '<sos>',
           eos_token = '<eos>',
           lower = True)

In [6]:
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),
                                                   fields = (SRC, TRG))

tensor([ 919,    0, 1314,    0,    0])

In [7]:
print(f"Training examples length {len(train_data.examples)}")
print(f"Training examples length {len(test_data.examples)}")
print(f"Training examples length {len(valid_data.examples)}")

Training examples length 29000
Training examples length 1000
Training examples length 1014


In [8]:
print(vars(train_data.examples[0]))

{'src': ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['.', 'bushes', 'many', 'near', 'outside', 'are', 'males', 'white', ',', 'young', 'two']}


In order to be in the vocab the word should have appeared at least 2 times (min_freq used param below)

In [9]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [10]:
print(f"Source vocab length : {len(SRC.vocab)}")
print(f"Target vocab length : {len(TRG.vocab)}")

Source vocab length : 7853
Target vocab length : 5893


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
train_data.vocab

<generator object Dataset.__getattr__ at 0x000001F5EE36C510>

In [13]:
BATCH_SIZE = 128 
train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, test_data),
                                                                     batch_size = BATCH_SIZE,
                                                                     device=device)

In [14]:
for i in train_iterator:
    print(i)
    break


[torchtext.legacy.data.batch.Batch of size 128 from MULTI30K]
	[.src]:[torch.cuda.LongTensor of size 23x128 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 21x128 (GPU 0)]


In [15]:
class Encoder(nn.Module):
    
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [16]:
class Decoder(nn.Module):
    
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input, hidden, cell):
        
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        outputs, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        prediction = self.fc_out(outputs.squeeze(0))
        
        return prediction, hidden, cell

In [17]:
class Seq2Seq(nn.Module):
    
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, "Hidden dimensions of encoder and decoder should be the same"
        assert encoder.n_layers == decoder.n_layers, "Encoder and decoder layers should be same"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        hidden, cell = self.encoder(src)
        
        input = trg[0, :]
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            outputs[t] = output
            
            teacher_forcing = random.random() < teacher_forcing_ratio
            
            top1 = output.argmax(1)
            
            input = trg[t] if teacher_forcing else top1
        
        return outputs  

In [18]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [19]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [20]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 13,898,501 trainable parameters


In [21]:
optimizer = optim.Adam(model.parameters())

In [22]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index= TRG_PAD_IDX)

In [23]:
def train(model , iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        
        trg = trg[1:].view(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [24]:
def evaluate(model , iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0)
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)

            trg = trg[1:].view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [25]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - int(elapsed_mins *( 60)))
    return elapsed_mins, elapsed_secs

In [26]:
torch.cuda.get_device_name()

'GeForce GTX 1650'

In [27]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'de2en_model.pt')
    
    print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"\nTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}")
    print(f"\Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}")

Epoch: 01 | Time: 1m 42s

Train Loss: 4.981 | Train PPL: 145.578
\Val. Loss: 4.822 | Val. PPL: 124.192
Epoch: 02 | Time: 1m 34s

Train Loss: 4.477 | Train PPL:  87.991
\Val. Loss: 4.811 | Val. PPL: 122.864
Epoch: 03 | Time: 1m 37s

Train Loss: 4.299 | Train PPL:  73.641
\Val. Loss: 4.807 | Val. PPL: 122.318
Epoch: 04 | Time: 1m 33s

Train Loss: 4.133 | Train PPL:  62.373
\Val. Loss: 4.712 | Val. PPL: 111.320
Epoch: 05 | Time: 1m 36s

Train Loss: 4.016 | Train PPL:  55.496
\Val. Loss: 4.514 | Val. PPL:  91.267
Epoch: 06 | Time: 1m 45s

Train Loss: 3.881 | Train PPL:  48.480
\Val. Loss: 4.523 | Val. PPL:  92.067
Epoch: 07 | Time: 1m 41s

Train Loss: 3.751 | Train PPL:  42.555
\Val. Loss: 4.279 | Val. PPL:  72.196
Epoch: 08 | Time: 1m 40s

Train Loss: 3.561 | Train PPL:  35.185
\Val. Loss: 4.255 | Val. PPL:  70.453
Epoch: 09 | Time: 1m 38s

Train Loss: 3.379 | Train PPL:  29.348
\Val. Loss: 4.138 | Val. PPL:  62.676
Epoch: 10 | Time: 1m 38s

Train Loss: 3.217 | Train PPL:  24.964
\Val. Lo

In [34]:
model.load_state_dict(torch.load('de2en_model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |")

| Test Loss: 4.177 | Test PPL:  65.148 |


In [None]:
def evaluate(model , iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0)
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)

            trg = trg[1:].view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [51]:
inp_str = "guten Morgen!"
out_str = "good Morning!"
inp_tensor = torch.tensor([SRC.vocab.stoi[tok] for tok in SRC.tokenize(inp_str)])
out_tensor = torch.tensor([SRC.vocab.stoi[tok] for tok in SRC.tokenize(out_str)])
# model(inp_tensor, out_tensor, 0)
out_tensor.shape

torch.Size([3])

In [30]:
for i in train_iterator:
    src_test_data = i.src
    trg_test_data = i.trg
    break

In [31]:
model.eval()
with torch.no_grad():
    output = model(src_test_data, trg_test_data, 0)

In [32]:
output.shape

torch.Size([37, 128, 5893])

In [33]:
a.shape

NameError: name 'a' is not defined

In [57]:
len(vars(train_data.examples[0])['src'])

13

In [73]:
src_test_data

tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [   4,    4,    4,  ...,    4,    4,    4],
        [ 700,  788,  507,  ..., 2777,  941,  292],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0')