Importing libraries

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np

import random
import math
import time

In [29]:
# Fix every random number generator the notebook touches so runs are repeatable.
SEED = 1235

for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)

# Ask cuDNN to pick deterministic kernels.
torch.backends.cudnn.deterministic = True

In [30]:
!python -m spacy download en
!python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [31]:
# Load the spaCy models through the 'de' / 'en' shortcut names that the download
# cell above linked; tokenize_de / tokenize_en use these module-level objects.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [32]:
def tokenize_de(text):
    """Tokenize a German text string and return its tokens as a list of strings."""
    doc = spacy_de.tokenizer(text)
    return [token.text for token in doc]

def tokenize_en(text):
    """Tokenize an English text string and return its tokens as a list of strings."""
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]


Initializing Fields to define the preprocessing. Unlike RNNs, CNNs expect the batch to be the first dimension, hence we set batch_first = True. We lowercase all text and add the <sos> (init) and <eos> tokens to every sentence.

In [33]:
# Both fields share the same processing options; only the tokenizer differs.
_shared_field_options = dict(init_token = '<sos>',
                             eos_token = '<eos>',
                             lower = True,
                             batch_first = True)

# Source side: German.
SRC = Field(tokenize = tokenize_de, **_shared_field_options)

# Target side: English.
TRG = Field(tokenize = tokenize_en, **_shared_field_options)

Dividing dataset into train, valid and test

In [34]:
# Load the three Multi30k splits; the '.de' side is processed by SRC and the
# '.en' side by TRG.
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), 
                                                    fields=(SRC, TRG))

Build the src and trg vocabularies as before. Tokens which appear fewer than 2 times are converted into <unk> tokens.

In [35]:
# Vocabularies are built from the training split only; min_freq = 2 is the
# frequency cut-off passed to each field.
for _field in (SRC, TRG):
    _field.build_vocab(train_data, min_freq = 2)

define device

In [36]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Define train, valid and test iterators

In [37]:
# Number of sentence pairs per batch.
BATCH_SIZE = 128

# One iterator per split, all sharing the same batch size and target device.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
     batch_size = BATCH_SIZE,
     device = device)

# Encoder

The encoder in the convolutional sequence-to-sequence model is a little different: it produces two context vectors for each token in the input sentence. If the input sentence has n tokens, we get 2*n context vectors, two for each token.

Steps:

*  Each token is passed through a token embedding layer.
*  To give the model a sense of token order, a positional embedding layer is introduced. It takes the position of the token within the sequence, ranging over [0, n), where n is the length of the sequence.
*  We take the element-wise sum of the token and positional embeddings - call this the embedding vector.
* This embedding vector is passed through a linear layer, which outputs a vector with the required hidden dimension size - call this the hidden vector.
* This hidden vector is passed through $N$ convolutional blocks.
* The output of the convolutional blocks is passed through a linear layer that transforms it back to the embedding dimension size - call this the conved vector.
* We take the element-wise sum of the conved vector and the embedding vector via a residual connection - call this the combined vector.
* For each token, we now have a conved vector and a combined vector.

In [38]:
class Encoder(nn.Module):
    """Convolutional sequence-to-sequence encoder.

    Token and position embeddings are summed, projected to ``hid_dim``, passed
    through ``n_layers`` gated (GLU) 1-D convolution blocks with residual
    connections, and projected back to ``emb_dim``.

    Args:
        input_dim: number of rows of the token embedding table (source vocabulary size).
        emb_dim: embedding dimension.
        hid_dim: channel count used inside the convolution blocks.
        n_layers: number of convolution blocks (0 is allowed and skips them).
        kernel_size: convolution width; must be odd so that symmetric padding
            keeps the sequence length unchanged.
        dropout: dropout probability.
        device: stored on ``self.device`` for backward compatibility only;
            ``forward`` works on whatever device its input lives on.
        max_length: number of rows of the position embedding table, i.e. the
            longest sequence ``forward`` accepts.
    """

    def __init__(self, 
                 input_dim, 
                 emb_dim, 
                 hid_dim, 
                 n_layers, 
                 kernel_size, 
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()
        
        # checking kernel size to be odd
        assert kernel_size % 2 == 1, "Kernel size must be odd!"
        
        # Kept so existing code that reads `encoder.device` still works.
        self.device = device
        
        # sqrt(0.5), used to scale every residual sum. A Python float (rather
        # than a tensor pinned to `device`) follows the module through
        # .to()/.half()/.double() and adds no entry to the state dict.
        self.scale = 0.5 ** 0.5
        
        self.tok_embedding = nn.Embedding(input_dim, emb_dim)
        self.pos_embedding = nn.Embedding(max_length, emb_dim)
        
        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)
        
        # sequence of conv layers; each outputs 2 * hid_dim channels because
        # GLU halves the channel dimension again
        self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim, 
                                              out_channels = 2 * hid_dim, 
                                              kernel_size = kernel_size, 
                                              padding = (kernel_size - 1) // 2)
                                    for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        """Encode a batch of source index sequences.

        Args:
            src: LongTensor of token indices, shape [batch_size, src_len].

        Returns:
            (conved, combined), both of shape [batch_size, src_len, emb_dim].
            ``conved`` is the projected output of the convolution blocks and
            ``combined`` is ``(conved + embedded) * sqrt(0.5)``.

        Raises:
            IndexError: if ``src_len`` exceeds the size of the position
                embedding table (``max_length``).
        """
        
        #src = [batch_size,src_len]
        
        src_len = src.shape[1]
        
        # Fail early with a readable message instead of an out-of-range
        # embedding lookup.
        if src_len > self.pos_embedding.num_embeddings:
            raise IndexError(
                f"source length {src_len} exceeds max_length "
                f"{self.pos_embedding.num_embeddings} of the positional embedding")
  
        #position tensor, created on the same device as the input
        pos = torch.arange(src_len, device = src.device).unsqueeze(0)

        #pos = [1,src_len] (broadcast over the batch in the sum below)
        
        tok_embedded = self.tok_embedding(src) #embed tokens
        pos_embedded = self.pos_embedding(pos) #embed positions
        
        #tok_embedded = [batch_size,src_len,emb_dim]
        #pos_embedded = [1,src_len,emb_dim]
        #elementwise sum
        embedded = self.dropout(tok_embedded + pos_embedded)
        
        #embedded = [batch_size, src_len, emb_dim]
        
        #project to hid_dim and permute, as Conv1d requires src_len last
        hidden = self.emb2hid(embedded).permute(0, 2, 1)
        
        #hidden = [batch_size,hid_dim,src_len]
        
        for conv in self.convs:
        
            conved = conv(self.dropout(hidden))

            #conved = [batch_size,2*hid_dim,src_len] 

            #GLU activation function halves the channel dimension
            conved = F.glu(conved, dim = 1)

            #conved = [batch_size,hid_dim,src_len] 
            
            #element wise sum - residual connection; feeds the next block
            hidden = (conved + hidden) * self.scale

            #hidden = [batch_size,hid_dim,src_len] 
        
        #permute back to linear layer dim format
        conved = self.hid2emb(hidden.permute(0, 2, 1))
        
        #conved = [batch_size,src_len,emb_dim]
        
        #elementwise sum of output (conved) and input (embedded) 
        combined = (conved + embedded) * self.scale
        
        #combined = [batch_size,src_len,emb_dim]
        
        return conved, combined # return two vectors for each token

# Decoder


The decoder takes the ground-truth target sentence and tries to predict it. Its differences from the encoder are:


*  There is no residual connection that connects after the convolutional blocks and the final transformation; instead, the embeddings are fed into the convolutional blocks, where they are used inside the attention step.

* The encoder's conved and combined outputs are used as inputs to the convolutional blocks.

* The decoder output goes through a linear layer from the embedding dimension to the output dimension.



Each convolutional block works as follows:
*  Padding is applied only at the beginning of the sentence. This prevents the model from looking at the next tokens, i.e. it stops the model from cheating.
*   Attention is applied after the GLU activation and before the residual connection.



In [39]:

class Decoder(nn.Module):
    """Convolutional sequence-to-sequence decoder with attention.

    Token and position embeddings are summed, projected to ``hid_dim`` and
    passed through ``n_layers`` causal (left-padded) GLU convolution blocks.
    Every block attends over the encoder outputs before its residual
    connection. The result is projected to ``emb_dim`` and then to
    ``output_dim`` scores.

    Args:
        output_dim: number of rows of the token embedding table and size of
            the output layer (target vocabulary size).
        emb_dim: embedding dimension.
        hid_dim: channel count used inside the convolution blocks.
        n_layers: number of convolution blocks (0 is allowed and skips them).
        kernel_size: convolution width.
        dropout: dropout probability.
        trg_pad_idx: value used to fill the left padding of each block.
        device: stored on ``self.device`` for backward compatibility only;
            ``forward`` works on whatever device its inputs live on.
        max_length: number of rows of the position embedding table, i.e. the
            longest target sequence ``forward`` accepts.
    """

    def __init__(self, 
                 output_dim, 
                 emb_dim, 
                 hid_dim, 
                 n_layers, 
                 kernel_size, 
                 dropout, 
                 trg_pad_idx, 
                 device,
                 max_length = 100):
        super().__init__()
        
        self.kernel_size = kernel_size
        self.trg_pad_idx = trg_pad_idx
        # Kept so existing code that reads `decoder.device` still works.
        self.device = device
        
        # sqrt(0.5), used to scale every residual sum. A Python float (rather
        # than a tensor pinned to `device`) follows the module through
        # .to()/.half()/.double() and adds no entry to the state dict.
        self.scale = 0.5 ** 0.5
        
        self.tok_embedding = nn.Embedding(output_dim, emb_dim)
        self.pos_embedding = nn.Embedding(max_length, emb_dim)
        
        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)
        
        self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
        self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)
        
        self.fc_out = nn.Linear(emb_dim, output_dim)
        
        # No built-in padding: forward() pads on the left only.
        self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim, 
                                              out_channels = 2 * hid_dim, 
                                              kernel_size = kernel_size)
                                    for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
      
    def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
        """Attend over the encoder outputs for one convolution block.

        Args:
            embedded: decoder embeddings, [batch_size, trg_len, emb_dim].
            conved: GLU output of the block, [batch_size, hid_dim, trg_len].
            encoder_conved: encoder output used to score source positions,
                [batch_size, src_len, emb_dim].
            encoder_combined: encoder output that is averaged with the
                attention weights, [batch_size, src_len, emb_dim].

        Returns:
            (attention, attended_combined) with shapes
            [batch_size, trg_len, src_len] and [batch_size, hid_dim, trg_len].
        """
        
        #permute back to linear layer dim format
        conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))
        
        #conved_emb = [batch_size, trg_len, emb_dim]
        
        combined = (conved_emb + embedded) * self.scale
        
        #combined = [batch_size, trg_len, emb_dim]
                
        energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))
        
        #energy = [batch_size, trg_len, src_len]
        
        #normalise over the source positions
        attention = F.softmax(energy, dim=2)
        
        #attention = [batch_size, trg_len, src_len]
            
        attended_encoding = torch.matmul(attention, encoder_combined)
        
        #attended_encoding = [batch_size, trg_len, emb_dim]
        
        attended_encoding = self.attn_emb2hid(attended_encoding)
        
        #attended_encoding = [batch_size, trg_len, hid_dim]
        
        #element wise sum and residual connection
        attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale
        
        #attended_combined = [batch_size, hid_dim, trg_len]
        
        return attention, attended_combined
        
    def forward(self, trg, encoder_conved, encoder_combined):
        """Score the next token for every position of ``trg``.

        Args:
            trg: LongTensor of target token indices, [batch_size, trg_len].
            encoder_conved: [batch_size, src_len, emb_dim].
            encoder_combined: [batch_size, src_len, emb_dim].

        Returns:
            (output, attention): ``output`` is [batch_size, trg_len, output_dim];
            ``attention`` is the attention of the last block,
            [batch_size, trg_len, src_len], or ``None`` when the decoder was
            built with ``n_layers == 0``.

        Raises:
            IndexError: if ``trg_len`` exceeds the size of the position
                embedding table (``max_length``).
        """
        
        #trg = [batch_size, trg_len]
                
        trg_len = trg.shape[1]
        
        # Fail early with a readable message instead of an out-of-range
        # embedding lookup.
        if trg_len > self.pos_embedding.num_embeddings:
            raise IndexError(
                f"target length {trg_len} exceeds max_length "
                f"{self.pos_embedding.num_embeddings} of the positional embedding")
            
        #position tensor, created on the same device as the input
        pos = torch.arange(trg_len, device = trg.device).unsqueeze(0)
        
        #pos = [1, trg_len] (broadcast over the batch in the sum below)
        
        #embed tokens and positions
        tok_embedded = self.tok_embedding(trg)
        pos_embedded = self.pos_embedding(pos)
        
        #tok_embedded = [batch_size,trg_len,emb_dim]
        #pos_embedded = [1,trg_len,emb_dim]
        
        #element wise sum of tok and pos embeddings 
        embedded = self.dropout(tok_embedded + pos_embedded)
        
        #embedded = [batch_size, trg_len, emb_dim]
        
        #project to hid_dim and permute, as Conv1d requires trg_len last
        conv_input = self.emb2hid(embedded).permute(0, 2, 1)
        
        #conv_input = [batch_size, hid_dim, trg_len]
        
        # Stays None only when there are no convolution blocks.
        attention = None
        
        for conv in self.convs:
        
            #apply dropout (the residual below uses this dropped-out tensor)
            conv_input = self.dropout(conv_input)
        
            #pad at the beginning only, so position t never sees positions > t.
            #F.pad allocates directly on conv_input's device and dtype.
            #NOTE(review): the fill value is the pad *token index* applied to
            #hidden activations; kept as-is so existing checkpoints behave
            #identically. Confirm whether zeros were intended.
            padded_conv_input = F.pad(conv_input, 
                                      (self.kernel_size - 1, 0), 
                                      value = float(self.trg_pad_idx))
        
            #padded_conv_input = [batch_size, hid_dim, trg_len + kernel_size - 1]

            conved = conv(padded_conv_input)

            #conved = [batch_size, 2 * hid_dim, trg_len]
            
            #GLU activation function halves the channel dimension
            conved = F.glu(conved, dim = 1)

            #conved = [batch_size, hid_dim, trg_len]
            
            # attention
            attention, conved = self.calculate_attention(embedded, 
                                                         conved, 
                                                         encoder_conved, 
                                                         encoder_combined)
            
            #attention = [batch_size, trg_len, src_len]
            
            #element wise sum, residual connection; feeds the next block
            conv_input = (conved + conv_input) * self.scale
            
            #conv_input = [batch_size, hid_dim, trg_len]
            
        #permute back to linear layer dim format   
        conved = self.hid2emb(conv_input.permute(0, 2, 1))
         
        #conved = [batch_size, trg_len, emb_dim]
            
        output = self.fc_out(self.dropout(conved))
        
        #output = [batch_size, trg_len, output_dim]
            
        return output, attention

# Seq2Seq

In [40]:
class Seq2Seq(nn.Module):
    """Thin wrapper that runs the encoder and feeds both of its outputs to the decoder."""

    def __init__(self, encoder, decoder):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        """Encode ``src`` and decode ``trg`` against it.

        Shapes as used by the notebook:
            src = [batch_size, src_len]
            trg = [batch_size, trg_len - 1] (no <eos> token at the end)

        Returns the decoder's ``(output, attention)`` pair:
            output = [batch_size, trg_len - 1, output_dim]
            attention = [batch_size, trg_len - 1, src_len]
        """
        # Both encoder outputs are [batch_size, src_len, emb_dim].
        conved_ctx, combined_ctx = self.encoder(src)

        return self.decoder(trg, conved_ctx, combined_ctx)

# Training

In [41]:

# Sizes and indices taken from the vocabularies built above.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Dimensions shared by encoder and decoder.
EMB_DIM = 256
HID_DIM = 512 # each conv. layer has 2 * hid_dim filters

# Encoder settings.
ENC_LAYERS = 10 # number of conv. blocks in encoder
ENC_KERNEL_SIZE = 3 # must be odd!
ENC_DROPOUT = 0.25

# Decoder settings.
DEC_LAYERS = 10 # number of conv. blocks in decoder
DEC_KERNEL_SIZE = 3 # can be even or odd
DEC_DROPOUT = 0.25

enc = Encoder(input_dim = INPUT_DIM,
              emb_dim = EMB_DIM,
              hid_dim = HID_DIM,
              n_layers = ENC_LAYERS,
              kernel_size = ENC_KERNEL_SIZE,
              dropout = ENC_DROPOUT,
              device = device)

dec = Decoder(output_dim = OUTPUT_DIM,
              emb_dim = EMB_DIM,
              hid_dim = HID_DIM,
              n_layers = DEC_LAYERS,
              kernel_size = DEC_KERNEL_SIZE,
              dropout = DEC_DROPOUT,
              trg_pad_idx = TRG_PAD_IDX,
              device = device)

model = Seq2Seq(enc, dec).to(device)

In [42]:
# Adam with its default settings (nothing is overridden here).
optimizer = optim.Adam(model.parameters())
# Cross-entropy that skips target positions equal to the padding index.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [43]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Each batch must expose ``src`` and ``trg`` ([batch_size, len]). The model
    receives ``trg`` without its last token and is scored against ``trg``
    without its first token. Gradients are clipped to norm ``clip`` before
    every optimizer step.
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        src, trg = batch.src, batch.trg

        optimizer.zero_grad()

        # logits = [batch_size, trg_len - 1, output_dim]
        logits, _ = model(src, trg[:, :-1])

        # Flatten so every target position becomes one classification example:
        # flat_logits  = [batch_size * (trg_len - 1), output_dim]
        # flat_targets = [batch_size * (trg_len - 1)]
        flat_logits = logits.reshape(-1, logits.shape[-1])
        flat_targets = trg[:, 1:].reshape(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [44]:

def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss over ``iterator`` without updating the model.

    The model is put in eval mode and every forward pass runs under
    ``torch.no_grad()``. Inputs and targets are shifted exactly as in training:
    the model sees ``trg`` without its last token and is scored against
    ``trg`` without its first token.
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src, batch.trg

            # logits = [batch_size, trg_len - 1, output_dim]
            logits, _ = model(src, trg[:, :-1])

            # flat_logits  = [batch_size * (trg_len - 1), output_dim]
            # flat_targets = [batch_size * (trg_len - 1)]
            flat_logits = logits.reshape(-1, logits.shape[-1])
            flat_targets = trg[:, 1:].reshape(-1)

            running_loss += criterion(flat_logits, flat_targets).item()

    return running_loss / len(iterator)

In [45]:

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns a ``(minutes, seconds)`` tuple of ints; fractions are truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = duration - (whole_minutes * 60)
    return whole_minutes, int(leftover_seconds)

In [46]:
N_EPOCHS = 10   # passes over the training set
CLIP = 0.1      # max gradient norm handed to train()

# Lowest validation loss seen so far; any first epoch beats infinity.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # Time one full train + validation pass.
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut5-model.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 1m 6s
	Train Loss: 4.193 | Train PPL:  66.233
	 Val. Loss: 2.878 |  Val. PPL:  17.779
Epoch: 02 | Time: 1m 6s
	Train Loss: 2.993 | Train PPL:  19.951
	 Val. Loss: 2.335 |  Val. PPL:  10.329
Epoch: 03 | Time: 1m 5s
	Train Loss: 2.590 | Train PPL:  13.331
	 Val. Loss: 2.121 |  Val. PPL:   8.336
Epoch: 04 | Time: 1m 5s
	Train Loss: 2.370 | Train PPL:  10.697
	 Val. Loss: 1.993 |  Val. PPL:   7.336
Epoch: 05 | Time: 1m 5s
	Train Loss: 2.222 | Train PPL:   9.228
	 Val. Loss: 1.904 |  Val. PPL:   6.712
Epoch: 06 | Time: 1m 5s
	Train Loss: 2.111 | Train PPL:   8.260
	 Val. Loss: 1.861 |  Val. PPL:   6.433
Epoch: 07 | Time: 1m 5s
	Train Loss: 2.030 | Train PPL:   7.612
	 Val. Loss: 1.827 |  Val. PPL:   6.217
Epoch: 08 | Time: 1m 6s
	Train Loss: 1.957 | Train PPL:   7.080
	 Val. Loss: 1.806 |  Val. PPL:   6.088
Epoch: 09 | Time: 1m 5s
	Train Loss: 1.903 | Train PPL:   6.709
	 Val. Loss: 1.749 |  Val. PPL:   5.746
Epoch: 10 | Time: 1m 6s
	Train Loss: 1.856 | Train PPL:   6.399


In [49]:

# Restore the weights with the best validation loss. map_location remaps the
# stored tensors onto the current device, so a checkpoint written on a GPU
# runtime can still be loaded on a CPU-only one.
model.load_state_dict(torch.load('tut5-model.pt', map_location = device))

# Score the restored model on the held-out test split.
test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 1.807 | Test PPL:   6.095 |


# Inference Steps


*   tokenize the source sentence
*   add the <sos> and <eos> tokens
*   convert the tokens to source indices, turn them into a tensor and unsqueeze it to add the batch dimension
* pass the source sentence through the encoder
* create a list to hold the output sentence, initialized with the <sos> token
* Loop until the maximum length is reached:

  *   convert the current output sentence to a tensor and unsqueeze it to add the batch dimension
  *   pass the current output sentence and the two encoder outputs into the decoder
  *   take the decoder's prediction for the next token and append it to the current output sentence
  *   break if the prediction is the <eos> token
*   convert the output sentence from indexes to tokens
*   return the output sentence and the attention from the final layer
















In [50]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):
    """Greedily translate one sentence.

    Args:
        sentence: a raw string, or an iterable of already-tokenized strings.
        src_field: source field; provides ``tokenize``, ``init_token``,
            ``eos_token`` and ``vocab.stoi``.
        trg_field: target field; provides ``init_token``, ``eos_token``,
            ``vocab.stoi`` and ``vocab.itos``.
        model: object with ``eval()``, ``encoder`` and ``decoder``.
        device: device the index tensors are created on.
        max_len: maximum number of tokens to generate.

    Returns:
        (tokens, attention): the generated target tokens without the leading
        init token (the end-of-sentence token is included when it was
        produced), and the attention returned by the last decoder call, or
        ``None`` if no decoding step ran (``max_len <= 0``).
    """

    model.eval()
        
    if isinstance(sentence, str):
        # Use the field's own tokenizer: this avoids reloading a spaCy model
        # on every call and keeps tokenization consistent with how the
        # vocabulary was built.
        tokens = [token.lower() for token in src_field.tokenize(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
        
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    # unsqueeze adds the batch dimension: [1, src_len]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)

    # Looked up once instead of on every decoding step.
    eos_idx = trg_field.vocab.stoi[trg_field.eos_token]

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    # Stays None when the loop below never runs.
    attention = None

    with torch.no_grad():
        encoder_conved, encoder_combined = model.encoder(src_tensor)

        for _ in range(max_len):

            # The decoder is re-run on the whole prefix each step: [1, cur_len]
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

            output, attention = model.decoder(trg_tensor, encoder_conved, encoder_combined)
        
            # Greedy choice: highest-scoring token at the last position.
            pred_token = output.argmax(2)[:,-1].item()
        
            trg_indexes.append(pred_token)

            if pred_token == eos_idx:
                break
    
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:], attention

In [51]:
# Pick one test pair and compare the reference with the model's translation.
example_idx = 77

example = test_data.examples[example_idx]
src, trg = example.src, example.trg

print(f'src = {src}')
print(f'trg = {trg}')

translation, attention = translate_sentence(src, SRC, TRG, model, device)

print(f'predicted trg = {translation}')

src = ['ein', 'brauner', 'hund', 'läuft', 'durchs', 'gras', 'und', 'seine', 'zunge', 'hängt', 'heraus', '.']
trg = ['a', 'brown', 'dog', 'walks', 'in', 'the', 'grass', 'with', 'its', 'tongue', 'hanging', 'out', '.']
predicted trg = ['a', 'brown', 'dog', 'runs', 'through', 'the', 'grass', 'with', 'his', 'tongue', 'hanging', '.', '<eos>']
