In [None]:
%%bash
# The notebook loads the German ('de') and English ('en') spaCy models, so fetch those two.
python -m spacy download en
python -m spacy download de

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [None]:
# Seed every random number generator the notebook uses so that runs are repeatable.
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

Instantiate our German and English spaCy models.

In [None]:
# Load the spaCy pipelines whose tokenizers back tokenize_de / tokenize_en.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

Previously we reversed the source (German) sentence; however, the paper we are implementing does not do this, so neither will we.

In [None]:
def tokenize_de(text):
    """
    Split a German string into a list of token strings.

    Uses the tokenizer of the module-level `spacy_de` pipeline and keeps only
    the text of each token.
    """
    tokens = []
    for token in spacy_de.tokenizer(text):
        tokens.append(token.text)
    return tokens

def tokenize_en(text):
    """
    Split an English string into a list of token strings.

    Uses the tokenizer of the module-level `spacy_en` pipeline and keeps only
    the text of each token.
    """
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

Create our fields to process our data. These append the "start of sentence" and "end of sentence" tokens and convert all words to lowercase.

In [None]:
# Source (German) field: tokenize with spaCy, wrap each sentence in <sos>/<eos>, lowercase.
SRC = Field(tokenize=tokenize_de, 
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=True)

# Target (English) field: same processing with the English tokenizer.
TRG = Field(tokenize = tokenize_en, 
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=True)

Load our data.

In [None]:
# Load the Multi30k splits; '.de' files are processed by SRC and '.en' files by TRG.
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 973kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 273kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 267kB/s]


In [None]:
# Report how many sentence pairs each split contains.
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


We'll also print out an example just to double check they're not reversed.


In [None]:
# Show the tokenized 'src' and 'trg' lists of the first training example.
print(vars(train_data.examples[0]))

{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


Then create our vocabulary, converting all tokens that appear fewer than twice into `<unk>` tokens.

In [None]:
# Build both vocabularies from the training split only; min_freq = 2 is the
# frequency threshold below which a token is mapped to <unk>.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

Finally, define the device and create our iterators.


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
BATCH_SIZE = 128

# One iterator per split, each yielding batches of BATCH_SIZE examples placed on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)
# Echo which device was selected.
print(device)

cuda


In [None]:
# import pickle
# with open('source.pkl', 'wb') as src_tokens: 
#   pickle.dump(SRC.vocab.stoi, src_tokens)
# with open('target_stoi.pkl','wb') as trg_tokens:
#   pickle.dump(TRG.vocab.stoi,trg_tokens)

# with open('target_itos.pkl','wb') as trg_tokens:
#   pickle.dump(TRG.vocab.itos,trg_tokens)

## Building the Seq2Seq Model

### Encoder


In [None]:
class Encoder(nn.Module):
    """
    Bidirectional GRU encoder for the attention-based seq2seq model.

    Args:
        input_dim: number of rows in the source embedding table (source vocabulary size).
        emb_dim: dimensionality of the token embeddings.
        enc_hid_dim: hidden size of each GRU direction.
        dec_hid_dim: size the final encoder state is projected to, so it can
            serve as the decoder's initial hidden state.
        dropout: dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout ):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)

        # Single-layer bidirectional GRU (no cell state, unlike an LSTM).
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        # Projects the concatenated forward/backward final states to the decoder size.
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """
        Encode a batch of source sentences.

        Args:
            src: integer token ids, shape [src len, batch size].

        Returns:
            outputs: top-layer hidden states for every position,
                shape [src len, batch size, enc hid dim * 2].
            hidden: projected final state, shape [batch size, dec hid dim].
        """
        #src = [src len, batch size]
        embedded = self.dropout(self.embedding(src))
        #embedded = [src len, batch size, emb dim]

        outputs, hidden = self.rnn(embedded)
        #outputs = [src len, batch size, enc hid dim * 2]
        #hidden = [n layers * n directions, batch size, enc hid dim]

        # hidden[-2] is the last forward state and hidden[-1] the last backward
        # state; join them along the feature axis before projecting.
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        #hidden = [batch size, dec hid dim]

        # The per-call debug print of hidden.shape was removed: it fired on
        # every batch during training.
        return outputs, hidden

In [None]:
# Small throwaway encoder used only for the shape checks in the following cells.
enc = Encoder (input_dim = 32, 
               emb_dim = 256, 
               enc_hid_dim = 512, 
               dec_hid_dim = 512, 
               dropout = 0.1)

In [None]:
# Encoder.forward returns (outputs, hidden); name the results accordingly and
# call the module itself rather than .forward directly.
outputs, hidden = enc(torch.zeros(12,32).to(torch.int64)) # sentence of length 12,batch 32
outputs.size(), hidden.size()



******  torch.Size([32, 512])


(torch.Size([12, 32, 1024]), torch.Size([32, 512]))

In [None]:
from pdb import set_trace as bp


In [None]:
class Attention(nn.Module):
    """
    Additive attention over the encoder outputs.

    Args:
        enc_hid_dim: hidden size of one encoder direction (outputs carry twice this).
        dec_hid_dim: hidden size of the decoder state.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)

    def forward(self, hidden, encoder_outputs):
        """
        Score every source position against the current decoder state.

        Args:
            hidden: [batch size, dec hid dim]
            encoder_outputs: [src len, batch size, enc hid dim * 2]

        Returns:
            Attention weights of shape [batch size, src len]; each row sums to 1.
        """
        steps = encoder_outputs.shape[0]

        # Copy the decoder state once per source position: [batch size, src len, dec hid dim]
        tiled_hidden = hidden.unsqueeze(1).repeat(1, steps, 1)

        # Put the batch axis first: [batch size, src len, enc hid dim * 2]
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)

        features = torch.cat((tiled_hidden, batch_first_outputs), dim = 2)

        # energy = [batch size, src len, dec hid dim]
        energy = torch.tanh(self.attn(features))

        # scores = [batch size, src len]
        scores = self.v(energy).squeeze(2)

        return F.softmax(scores, dim=1)

In [None]:
# # class Attention(nn.Module):
# #     def __init__(self,  emb_dim, enc_hid_dim, dec_hid_dim, dropout ):
# #         super().__init__()
# #         self.attn =  nn.Linear ((enc_hid_dim*2)+ dec_hid_dim, dec_hid_dim ))
# #         self.v = nn.Linear(dec_hid_dim,1, bias= False)

# class Attention(nn.Module):
#     def __init__(self, enc_hid_dim, dec_hid_dim):
#         super().__init__()
        
#         self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
#         self.v = nn.Linear(dec_hid_dim, 1, bias = False)

#     def forward(self,hidden, encoder_outputs):
#       #hidden here is decoder st-1
#       src_len = encoder_outputs.shape[0]

#       #hidden = [batchsize, dec_hi_dim]
#       hidden = hidden.unsqueeze(1).repeat(1,src_len,1)
      
#       #encoder_outputs = [src len, batch size, enc hid dim * 2]
#       encoder_outputs = encoder_outputs.permute(1,0,2)

#       #hidden = [batchsize, src_len, dec_hi_dim]
#       #encoder_outputs = [batch size, src_len, enc hid dim * 2]
#       bp()
#       energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs)), dim=2))
#       # energy = [batchsize, src_len, dec_hi_dim]
#       attention = self.v(energy).squeeze(2)
#       # hidden = [batchsize, src_len]

#       return F.softmax(attention, dim=1)




Trial checks: run the toy encoder and inspect the tensor shapes that the attention layer works with.

In [None]:
# Run the toy encoder on a dummy batch (source length 12, batch size 32).
outputs, hidden = enc.forward(torch.zeros(12,32).to(torch.int64))

******  torch.Size([32, 512])


In [None]:
# Shapes produced by the toy encoder, whose settings are repeated below for reference.
outputs.size(), hidden.size()
#input_dim = 32, 
  #             emb_dim = 256, 
 #              enc_hid_dim = 512, 
  #             dec_hid_dim = 512, 
  #             dropout = 0.1

(torch.Size([12, 32, 1024]), torch.Size([32, 512]))

In [None]:
# Work on an alias of the encoder's projected final state for the steps below.
s_hidden = hidden
s_hidden.size()

torch.Size([32, 512])

In [None]:
#s_hidden = s_hidden.repeat(1,12,1)
# Insert a length-1 axis, then tile it 12 times (the dummy source length).
s_hidden = s_hidden.unsqueeze(1).repeat(1,12,1)
s_hidden.size()

torch.Size([32, 12, 512])

In [None]:
outputs.size()  # not echoed: only the cell's last expression is displayed
# Swap the first two axes so the batch axis comes first.
outputs.permute(1,0,2).size()


torch.Size([32, 12, 1024])

In [None]:
from pdb import set_trace as bp

Decoder

In [None]:
class Decoder(nn.Module):
    """
    Single-step GRU decoder that attends over the encoder outputs.

    Args:
        output_dim: number of rows in the target embedding table and size of
            the prediction vector (target vocabulary size).
        emb_dim: dimensionality of the target token embeddings.
        enc_hid_dim: hidden size of one encoder direction.
        dec_hid_dim: hidden size of the decoder GRU.
        dropout: dropout probability applied to the embeddings.
        attention: module called as attention(hidden, encoder_outputs) that
            returns weights of shape [batch size, src len].
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout,attention):
        super().__init__()
        self.attention = attention
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # The GRU consumes the token embedding joined with the attention-weighted context.
        self.rnn = nn.GRU((enc_hid_dim*2)+ emb_dim, dec_hid_dim)
        # The output layer sees the GRU output, the context and the embedding together.
        self.fc_out = nn.Linear((enc_hid_dim*2) + emb_dim + dec_hid_dim, output_dim )
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """
        Decode one target token for every sentence in the batch.

        Args:
            input: previous target token ids, shape [batch size].
            hidden: previous decoder state, shape [batch size, dec hid dim].
            encoder_outputs: shape [src len, batch size, enc hid dim * 2].

        Returns:
            prediction: scores over the target vocabulary, [batch size, output dim].
            hidden: new decoder state, [batch size, dec hid dim].
            a: attention weights, [batch size, src len].
        """
        #input = [batch size]
        input = input.unsqueeze(0)
        #input = [1, batch size]

        embedded = self.dropout(self.embedding(input))
        #embedded = [1, batch size, emb dim]

        a = self.attention(hidden, encoder_outputs)
        #a = [batch size, src len]

        a = a.unsqueeze(1)
        #a = [batch size, 1, src len]

        encoder_outputs = encoder_outputs.permute(1,0,2)
        #encoder_outputs = [batch size, src len, enc hid dim * 2]

        # Batched matrix product = attention-weighted sum of the encoder outputs.
        weighted = torch.bmm(a, encoder_outputs)
        #weighted = [batch size, 1, enc hid dim * 2]

        weighted = weighted.permute(1, 0, 2)
        #weighted = [1, batch size, enc hid dim * 2]

        rnn_input = torch.cat((embedded, weighted), dim = 2)
        #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]

        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        #seq len, n layers and n directions are all 1 in this decoder, therefore:
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]

        #this also means that output == hidden
        assert (output == hidden).all()

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)

        # dim=1 belongs to torch.cat (join along the feature axis); it was
        # previously passed to fc_out, which raised a TypeError.
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
        #prediction = [batch size, output dim]

        return prediction, hidden.squeeze(0), a.squeeze(1)
        
        

In [None]:
# class Seq2Seq(nn.Module):
#     def __init__(self, encoder, decoder, device):
#         super().__init__()
        
#         self.encoder = encoder
#         self.decoder = decoder
#         self.device = device
        
#         assert encoder.hid_dim == decoder.hid_dim, \
#             "Hidden dimensions of encoder and decoder must be equal!"

#     def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
#         #src = [src len, batch size]
#         #trg = [trg len, batch size]
#         #teacher_forcing_ratio is probability to use teacher forcing
#         #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
#         batch_size = trg.shape[1]
#         trg_len = trg.shape[0]
#         trg_vocab_size = self.decoder.output_dim
        
#         #tensor to store decoder outputs
#         outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
#         #last hidden state of the encoder is the context
#         context = self.encoder(src)
        
#         #context also used as the initial hidden state of the decoder
#         hidden = context
        
#         #first input to the decoder is the <sos> tokens
#         input = trg[0,:]
        
#         for t in range(1, trg_len):
            
#             #insert input token embedding, previous hidden state and the context state
#             #receive output tensor (predictions) and new hidden state
#             output, hidden = self.decoder(input, hidden, context)
            
#             #place predictions in a tensor holding predictions for each token
#             outputs[t] = output
            
#             #decide if we are going to use teacher forcing or not
#             teacher_force = random.random() < teacher_forcing_ratio
            
#             #get the highest predicted token from our predictions
#             top1 = output.argmax(1) 
            
#             #if teacher forcing, use actual next token as next input
#             #if not, use predicted token
#             input = trg[t] if teacher_force else top1

#         return outputs

In [None]:
class Seq2Seq(nn.Module):
    """
    Ties an encoder and an attention decoder together.

    Args:
        encoder: module called as encoder(src) -> (encoder_outputs, hidden).
        decoder: module called as decoder(input, hidden, encoder_outputs) ->
            (prediction, hidden, attention); must expose `output_dim`.
        device: device on which the tensor of collected predictions is placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """
        Decode `trg` step by step and collect the predictions.

        Args:
            src: [src len, batch size]
            trg: [trg len, batch size]
            teacher_forcing_ratio: probability of feeding the ground-truth
                token (rather than the model's own prediction) at each step.

        Returns:
            Tensor [trg len, batch size, output dim]; row 0 stays all zeros
            because decoding starts from the <sos> row of `trg`.
        """
        n_batch = src.shape[1]
        n_steps = trg.shape[0]

        # One slot of vocabulary scores per target position.
        outputs = torch.zeros(n_steps, n_batch, self.decoder.output_dim).to(self.device)

        # All encoder states plus the projected final state that seeds the decoder.
        encoder_outputs, hidden = self.encoder(src)

        # The first decoder input is the <sos> row.
        step_input = trg[0,:]

        for t in range(1, n_steps):
            step_scores, hidden, _ = self.decoder(step_input, hidden, encoder_outputs)
            outputs[t] = step_scores

            # Draw once per step to decide between ground truth and prediction.
            if random.random() < teacher_forcing_ratio:
                step_input = trg[t]
            else:
                step_input = step_scores.argmax(1)

        return outputs

In [None]:
# Vocabulary sizes determine the embedding tables and the output layer.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# The decoder owns the attention module; this rebinds `enc`, replacing the toy encoder above.
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)

In [None]:
# def init_weights(m):
#     for name, param in m.named_parameters():
#         nn.init.normal_(param.data, mean=0, std=0.01)
        
# model.apply(init_weights)

def init_weights(m):
  """
  Re-initialise the parameters of module `m` in place.

  Parameters whose name contains 'weight' are drawn from N(0, 0.01^2);
  every other parameter (the biases) is set to zero.
  """
  for name, param in m.named_parameters():
    if 'weight' not in name:
      nn.init.constant_(param.data, 0)
      continue
    nn.init.normal_(param.data, mean=0, std=0.01)

# Apply the initialiser to the model; the cell echoes the model's module tree.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(5893, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 20,518,917 trainable parameters


In [None]:
# Index of the padding token in the target vocabulary.
pad_idx = TRG.vocab.stoi[TRG.pad_token]

# Target positions equal to pad_idx are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
# Adam with its default hyperparameters over all model parameters.
optimizer = optim.Adam(model.parameters())

In [None]:
def train(model,iterator,optimizer,criterion,clip):
  """
  Run one training epoch over `iterator`.

  Args:
    model: called as model(src, trg); returns [trg len, batch size, output dim].
    iterator: yields batches exposing `.src` and `.trg`.
    optimizer: optimizer stepped once per batch.
    criterion: loss taking ([N, output dim], [N]).
    clip: maximum gradient norm passed to clip_grad_norm_.

  Returns:
    (mean loss per batch, mean BLEU per batch)
  """
  model.train()

  epoch_loss = 0
  epoch_bleu = 0
  for batch in iterator:

    src = batch.src
    trg = batch.trg

    optimizer.zero_grad()

    outputs = model(src,trg)

    #outputs : [trg_len,batch,output_dim]
    # trg :[trg_len,batch]

    output_dim = outputs.shape[-1]

    # BLEU is computed on the un-flattened tensors, before they are reshaped for the loss.
    epoch_bleu += get_blue_score(outputs,trg)

    # Drop position 0 (the <sos> row, never predicted) and flatten for the loss.
    outputs = outputs[1:].reshape(-1,output_dim)
    trg = trg[1:].reshape(-1)

    loss = criterion(outputs,trg)

    loss.backward()

    # clip_grad_norm_ is the in-place API; the un-suffixed name is deprecated.
    torch.nn.utils.clip_grad_norm_(model.parameters(),clip)

    optimizer.step()

    epoch_loss += loss.item()

  return epoch_loss/len(iterator) , epoch_bleu/len(iterator)

In [None]:
def evaluate(model,criterion,iterator):
  """
  Evaluate the model over `iterator` without updating it.

  Note the argument order: (model, criterion, iterator).

  Args:
    model: called as model(src, trg, teacher_forcing_ratio=0).
    criterion: loss taking ([N, output dim], [N]).
    iterator: yields batches exposing `.src` and `.trg`.

  Returns:
    (mean loss per batch, mean BLEU per batch)
  """
  model.eval()

  epoch_loss = 0
  epoch_bleu = 0

  # No gradients are needed for evaluation; skipping them saves memory and time.
  with torch.no_grad():
    for batch in iterator:

      src = batch.src
      trg = batch.trg

      # Teacher forcing off: the model must be scored on its own predictions,
      # and the result must not depend on random draws.
      outputs = model(src,trg,teacher_forcing_ratio=0)

      output_dim = outputs.shape[-1]

      epoch_bleu += get_blue_score(outputs,trg)

      # Drop position 0 (the <sos> row, never predicted) and flatten for the loss.
      outputs = outputs[1:].reshape(-1,output_dim)
      trg = trg[1:].reshape(-1)

      loss = criterion(outputs,trg)

      epoch_loss += loss.item()

  return epoch_loss/len(iterator) ,epoch_bleu/len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    """
    Split the duration between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds.

    Returns:
        (minutes, seconds), both truncated to int.
    """
    total = end_time - start_time
    minutes = int(total / 60)
    seconds = int(total - (minutes * 60))
    return minutes, seconds

In [None]:
import torch.nn.functional as F

N_EPOCHS =20
CLIP = 1

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss,train_bleu  = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss,val_bleu = evaluate(model, criterion, valid_iterator,)
    
    end_time = time.time()
    
    # Keep only the weights of the best epoch on the validation set.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'eng-german-seq2seq.pt')

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train BLEU : {train_bleu:.4f} | Train PPL: {math.exp(train_loss):7.3f} ')
    # The perplexity uses valid_loss; the previous `val_loss` was never defined.
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val BLEU: {val_bleu:.4f} |Val PPL: {math.exp(valid_loss):7.3f}')


******  torch.Size([128, 512])


RuntimeError: ignored

In [None]:
# Restore the checkpoint written by the training loop (best validation loss).
model.load_state_dict(torch.load('eng-german-seq2seq.pt'))

# evaluate takes (model, criterion, iterator) and returns (loss, BLEU).
test_loss, _ = evaluate(model, criterion, test_iterator)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

Packed padded sequences

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch. nn.functional as F

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np

import random
import math
import time

In [None]:
# Re-seed every random number generator for the packed-sequence variant.
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

In [None]:
# Load the spaCy pipelines whose tokenizers back tokenize_de / tokenize_en.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [None]:
def tokenize_de(text):
    """
    Split a German string into a list of token strings.

    Uses the tokenizer of the module-level `spacy_de` pipeline and keeps only
    the text of each token.
    """
    tokens = []
    for token in spacy_de.tokenizer(text):
        tokens.append(token.text)
    return tokens

def tokenize_en(text):
    """
    Split an English string into a list of token strings.

    Uses the tokenizer of the module-level `spacy_en` pipeline and keeps only
    the text of each token.
    """
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

In [None]:
# Source field as before, plus include_lengths= True so the sentence lengths
# are available alongside the token ids (Seq2Seq.forward takes a src_len argument).
SRC = Field(tokenize=tokenize_de, 
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=True, 
            include_lengths= True)

# Target field: unchanged from the first part of the notebook.
TRG = Field(tokenize = tokenize_en, 
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=True)

In [None]:
# Reload the Multi30k splits with the new fields.
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

In [None]:
# Rebuild both vocabularies from the training split with the same frequency threshold.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 128

# Sort the examples inside each batch by source length, as the encoder packs
# the padded source sequences.
# NOTE(review): pack_padded_sequence expects lengths in decreasing order by default — confirm the iterator's sort direction.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    sort_within_batch =True,
    sort_key =lambda x : len(x.src),
    device = device)


    

In [None]:
# with open('source.pkl', 'wb') as src_tokens: 
#   pickle.dump(SRC.vocab.stoi, src_tokens)
# with open('target_stoi.pkl','wb') as trg_tokens:
#   pickle.dump(TRG.vocab.stoi,trg_tokens)

# with open('target_itos.pkl','wb') as trg_tokens:
#   pickle.dump(TRG.vocab.itos,trg_tokens)

In [None]:
class Encoder(nn.Module):
    """
    Bidirectional GRU encoder that can skip padding via packed sequences.

    Args:
        input_dim: number of rows in the source embedding table (source vocabulary size).
        emb_dim: dimensionality of the token embeddings.
        enc_hid_dim: hidden size of each GRU direction.
        dec_hid_dim: size the final encoder state is projected to.
        dropout: dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout ):
        super().__init__()

        self.hid_dim = enc_hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)

        # Projects the concatenated forward/backward final states to the decoder size.
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_len=None):
        """
        Encode a batch of padded source sentences.

        Args:
            src: integer token ids, shape [src len, batch size].
            src_len: length of every sentence in the batch (tensor or list),
                in decreasing order. When omitted, the batch is processed
                without packing, so padding positions are fed to the GRU.

        Returns:
            outputs: [src len, batch size, enc hid dim * 2]; with `src_len`
                given, positions past a sentence's length are all zeros.
            hidden: [batch size, dec hid dim]
        """
        #src = [src len, batch size]
        embedded = self.dropout(self.embedding(src))
        #embedded = [src len, batch size, emb dim]

        if src_len is None:
            outputs, hidden = self.rnn(embedded)
        else:
            # pack_padded_sequence needs the lengths on the CPU.
            lengths = src_len.cpu() if torch.is_tensor(src_len) else src_len
            packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths)

            # hidden now comes from the last non-padded element of each sentence.
            packed_outputs, hidden = self.rnn(packed_embedded)

            # Unpack back to a padded tensor; total_length keeps the time axis
            # equal to src's so it lines up with a mask built from src.
            outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs,
                                                          total_length=src.shape[0])

        #outputs = [src len, batch size, enc hid dim * 2]
        #hidden = [n layers * n directions, batch size, enc hid dim]

        # hidden[-2] is the last forward state and hidden[-1] the last backward
        # state; join them along the feature axis (dim=1 belongs to torch.cat).
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)))
        #hidden = [batch size, dec hid dim]

        return outputs, hidden

In [None]:
class Attention(nn.Module):
    """
    Additive attention over the encoder outputs that ignores padded positions.

    Args:
        enc_hid_dim: hidden size of one encoder direction (outputs carry twice this).
        dec_hid_dim: hidden size of the decoder state.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)

    def forward(self, hidden, encoder_outputs, mask):
        """
        Score every source position against the previous decoder state.

        Args:
            hidden: previous decoder state, [batch size, dec hid dim].
            encoder_outputs: [src len, batch size, enc hid dim * 2].
            mask: [batch size, src len]; entries equal to 0 mark positions
                that must receive no attention.

        Returns:
            Attention weights of shape [batch size, src len]; each row sums to 1.
        """
        src_len = encoder_outputs.shape[0]

        # Copy the decoder state once per source position.
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        #hidden = [batch size, src len, dec hid dim]
        #encoder_outputs = [batch size, src len, enc hid dim * 2]

        # torch.cat takes the tensors as one tuple, and dim=2 is its argument.
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        #energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze(2)
        #attention = [batch size, src len]

        # A very negative score becomes a weight of ~0 after the softmax.
        attention = attention.masked_fill(mask == 0, -1e10)

        return F.softmax(attention, dim=1)


In [None]:
class Decoder(nn.Module):
    """
    Single-step GRU decoder with masked attention over the encoder outputs.

    Args:
        output_dim: number of rows in the target embedding table and size of
            the prediction vector.
        emb_dim: dimensionality of the target token embeddings.
        enc_hid_dim: hidden size of one encoder direction.
        dec_hid_dim: hidden size of the decoder GRU.
        dropout: dropout probability applied to the embeddings.
        attention: module called as attention(hidden, encoder_outputs, mask).
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs,mask):
        """
        Decode one target token for every sentence in the batch.

        Args:
            input: [batch size]
            hidden: [batch size, dec hid dim]
            encoder_outputs: [src len, batch size, enc hid dim * 2]
            mask: forwarded unchanged to the attention module.

        Returns:
            prediction [batch size, output dim],
            new hidden state [batch size, dec hid dim],
            attention weights [batch size, src len].
        """
        # token_emb = [1, batch size, emb dim]
        token_emb = self.dropout(self.embedding(input.unsqueeze(0)))

        # attn_weights = [batch size, 1, src len]
        attn_weights = self.attention(hidden, encoder_outputs, mask).unsqueeze(1)

        # Attention-weighted sum of the encoder outputs, moved back to
        # time-major layout: context = [1, batch size, enc hid dim * 2]
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)
        context = torch.bmm(attn_weights, batch_first_outputs).permute(1, 0, 2)

        # gru_in = [1, batch size, (enc hid dim * 2) + emb dim]
        gru_in = torch.cat((token_emb, context), dim = 2)
        output, hidden = self.rnn(gru_in, hidden.unsqueeze(0))

        # One step, one layer, one direction: the output is the hidden state.
        assert (output == hidden).all()

        features = torch.cat(
            (output.squeeze(0), context.squeeze(0), token_emb.squeeze(0)), dim = 1)

        return self.fc_out(features), hidden.squeeze(0), attn_weights.squeeze(1)
        

In [None]:
class Seq2Seq(nn.Module):
    """
    Ties the packed-sequence encoder and the masked-attention decoder together.

    Args:
        encoder: module called as encoder(src, src_len) -> (encoder_outputs, hidden).
        decoder: module called as decoder(input, hidden, encoder_outputs, mask) ->
            (prediction, hidden, attention); must expose `output_dim`.
        device: device on which the tensor of collected predictions is placed.
        src_pad_idx: id of the padding token in the source vocabulary.
    """

    def __init__(self, encoder, decoder, device,src_pad_idx):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.src_pad_idx = src_pad_idx

    def create_mask(self,src):
        """Return a [batch size, src len] mask that is True wherever `src` is not padding."""
        not_pad = src != self.src_pad_idx
        return not_pad.permute(1,0)

    def forward(self, src, src_len, trg, teacher_forcing_ratio = 0.5):
        """
        Decode `trg` step by step and collect the predictions.

        Args:
            src: [src len, batch size]
            src_len: source lengths, passed through to the encoder.
            trg: [trg len, batch size]
            teacher_forcing_ratio: probability of feeding the ground-truth
                token (rather than the model's own prediction) at each step.

        Returns:
            Tensor [trg len, batch size, output dim]; row 0 stays all zeros.
        """
        n_batch = src.shape[1]
        n_steps = trg.shape[0]

        # One slot of vocabulary scores per target position.
        outputs = torch.zeros(n_steps, n_batch, self.decoder.output_dim).to(self.device)

        # All encoder states plus the projected final state that seeds the decoder.
        encoder_outputs, hidden = self.encoder(src,src_len)

        # The first decoder input is the <sos> row.
        step_input = trg[0,:]

        # mask = [batch size, src len]
        mask = self.create_mask(src)

        for t in range(1, n_steps):
            step_scores, hidden, _ = self.decoder(step_input, hidden, encoder_outputs,mask)
            outputs[t] = step_scores

            # Draw once per step to decide between ground truth and prediction.
            if random.random() < teacher_forcing_ratio:
                step_input = trg[t]
            else:
                step_input = step_scores.argmax(1)

        return outputs

In [None]:
# Vocabulary sizes determine the embedding tables and the output layer.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
# Source padding id; Seq2Seq.create_mask compares the source tokens against it.
src_pad_idx = SRC.vocab.stoi[SRC.pad_token]

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device,src_pad_idx).to(device)

In [None]:
def init_weights(m):
  """
  Re-initialise the parameters of module `m` in place.

  Parameters whose name contains 'weight' are drawn from N(0, 0.01^2);
  every other parameter (the biases) is set to zero.
  """
  for name, param in m.named_parameters():
    if 'weight' not in name:
      nn.init.constant_(param.data, 0)
      continue
    nn.init.normal_(param.data, mean=0, std=0.01)

# Apply the initialiser to the packed-sequence model.
model.apply(init_weights)

In [None]:
# Index of the padding token in the target vocabulary.
pad_idx = TRG.vocab.stoi[TRG.pad_token]

# Target positions equal to pad_idx are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
# Fresh Adam optimizer bound to the new model's parameters.
optimizer = optim.Adam(model.parameters())

In [None]:
def count_parameters(model):
  """Return the total number of elements in the parameters of `model` that require gradients."""
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total

# Report how many parameter elements of `model` require gradients.
print('Total number of trainable params in the model = {}'.format(count_parameters(model)))

Total number of trainable params in the model = 20518917


In [None]:
import torchtext
from torchtext.data.metrics import bleu_score

def get_blue_score(translated,translation,itos_location="target_itos.pkl"):
  """
  Compute the BLEU score of one batch of decoder outputs against its references.

  Args:
    translated: decoder scores, [trg_len,batch_size,output_dim]. Position 0 is
      skipped and the argmax over the last dimension is used as the predicted id.
    translation: reference token ids, [trg_len,batch_size]. Position 0 is skipped.
    itos_location: path of a pickled index-to-token lookup for the target vocabulary.

  Returns:
    Whatever `bleu_score` returns for the batch, or 0 when it raises IndexError.
  """
  # NOTE(review): pickle.load can execute arbitrary code - only point this at a trusted file.
  # The context manager closes the handle; previously it was left open on every call.
  with open(itos_location,'rb') as trg_token_file:
    trg_itos = pickle.load(trg_token_file)

  skipped = ('<eos>','<pad>')

  def to_tokens(ids):
    # Map one row of token ids to tokens, dropping <eos> and <pad>.
    tokens = (trg_itos[idx] for idx in ids.tolist())
    return [tok for tok in tokens if tok not in skipped]

  translated = torch.argmax(translated[1:],-1).T # translated : [batch_size,trg_len-1]
  translation = translation[1:].T # translation : [batch_size,trg_len-1]

  pred_ = [to_tokens(row) for row in translated]
  # bleu_score expects a list of references per candidate, hence the extra nesting
  trg_ = [[to_tokens(row)] for row in translation]

  try:
    return bleu_score(pred_,trg_)
  except IndexError:
    # Per the original author's investigation, bleu_score raises IndexError when a
    # token contains a ' ' (the n-gram then splits into more than 4 pieces).
    # The batch is scored as 0 in that case rather than aborting the epoch.
    return 0

ImportError: ignored

In [None]:
def train(model,iterator,optimizer,criterion,clip):
  """
  Run one training epoch.

  Args:
    model: called as model(src,src_len,trg); returns scores of shape
      [trg_len,batch,output_dim].
    iterator: sized iterable of batches; batch.src unpacks to (src,src_len) and
      batch.trg is [trg_len,batch].
    optimizer: optimizer stepped once per batch.
    criterion: loss called on the flattened scores and targets.
    clip: maximum gradient norm applied before each optimizer step.

  Returns:
    (mean loss per batch, mean BLEU per batch).
  """
  model.train()

  epoch_loss = 0
  # Named epoch_bleu so it does not shadow the imported bleu_score function.
  epoch_bleu = 0
  for batch in iterator:

    src,src_len = batch.src
    trg = batch.trg

    optimizer.zero_grad()

    outputs = model(src,src_len,trg)

    #outputs : [trg_len,batch,output_dim]
    # trg :[trg_len,batch]

    output_dim = outputs.shape[-1]

    epoch_bleu += get_blue_score(outputs,trg)

    # Position 0 is skipped, then time and batch are flattened for the criterion.
    outputs = outputs[1:].view(-1,output_dim)
    trg = trg[1:].view(-1)

    loss = criterion(outputs,trg)

    loss.backward()

    # clip_grad_norm_ is the in-place replacement for the deprecated clip_grad_norm.
    torch.nn.utils.clip_grad_norm_(model.parameters(),clip)

    optimizer.step()

    epoch_loss += loss.item()

  return epoch_loss/len(iterator) , epoch_bleu/len(iterator)

In [None]:
def evaluate(model,criterion,iterator):
  """
  Evaluate `model` on every batch of `iterator` without updating it.

  Args:
    model: called as model(src,src_len,trg); returns scores whose last
      dimension is the output vocabulary size.
    criterion: loss called on the flattened scores and targets.
    iterator: sized iterable of batches; batch.src unpacks to (src,src_len).

  Returns:
    (mean loss per batch, mean BLEU per batch).
  """
  model.eval()

  epoch_loss = 0
  # Named epoch_bleu so it does not shadow the imported bleu_score function.
  epoch_bleu = 0

  # No gradients are needed here; no_grad stops autograd from recording a graph per batch.
  with torch.no_grad():
    for batch in iterator:

      src,src_len = batch.src
      trg = batch.trg

      # NOTE(review): no teacher-forcing ratio is passed. If forward() defaults to a
      # non-zero ratio, evaluation is partly teacher-forced - confirm and pass 0 if so.
      outputs = model(src,src_len,trg)

      output_dim = outputs.shape[-1]

      epoch_bleu += get_blue_score(outputs,trg)

      # Position 0 is skipped, then time and batch are flattened for the criterion.
      outputs = outputs[1:].view(-1,output_dim)
      trg = trg[1:].view(-1)

      loss = criterion(outputs,trg)

      epoch_loss += loss.item()

  return epoch_loss/len(iterator) ,epoch_bleu/len(iterator)

In [None]:
def epoch_time(start_time,end_time):
  """
  Split the interval between two timestamps (in seconds) into whole minutes
  and the whole seconds left over.

  Returns:
    (minutes, seconds) as a pair of ints.
  """
  duration = end_time - start_time
  whole_minutes = int(duration / 60)
  leftover_seconds = int(duration - 60 * whole_minutes)
  return whole_minutes, leftover_seconds

In [None]:
import torch.nn.functional as F
epochs = 20
clip = 1.

# Lowest validation loss seen so far; any first epoch beats +inf.
best_valid_loss = float('inf')

for epoch in range(epochs):

  start_time = time.time()

  train_loss,train_bleu = train(model,train_iterator,optimizer,criterion,clip)
  val_loss,val_bleu = evaluate(model,criterion,valid_iterator)

  end_time = time.time()

  # Checkpoint only when the validation loss improves.
  if best_valid_loss > val_loss:
    best_valid_loss = val_loss
    torch.save(model.state_dict(),'english-german-seq2seq.pt')

  epoch_mins,epoch_secs = epoch_time(start_time,end_time)

  # epoch_time returns whole numbers, so minutes are printed as an int (was '.3f', e.g. '2.000').
  print(f'Epoch : {epoch+1:02} | Time: {epoch_mins} minutes ,{epoch_secs} seconds')
  print(f'\t Train Loss : {train_loss:.3f} | Train BLEU : {train_bleu:.4f} | Train PPL: {math.exp(train_loss):7.3f} ')
  print(f'\t Val loss : {val_loss:.3f} | Val BLEU: {val_bleu:.4f} |Val PPL: {math.exp(val_loss):7.3f}')

In [None]:
# Testing the model
import spacy
nlp = spacy.load('en')

best_model_at = 'english-german-seq2seq.pt'
# map_location lets a checkpoint saved on one device be restored on whatever `device` is in use.
model.load_state_dict(torch.load(best_model_at,map_location=device))
model.eval()

# NOTE(review): pickle.load can execute arbitrary code - only load vocabulary files you created yourself.
# Each `with` closes its file as soon as the vocabulary has been read.

# src stoi vocab
with open("source.pkl",'rb') as src_token_file:
  src_vocab = pickle.load(src_token_file)

# target stoi vocab
with open('target_stoi.pkl','rb') as trg_tokens_file:
  trg_stoi = pickle.load(trg_tokens_file)

# target itos vocab
with open("target_itos.pkl",'rb') as trg_token_file:
  trg_itos = pickle.load(trg_token_file)


# Creating src tensor
custom_src_ = "A boy was swimming in the river."
tokenized_custom_src = [i.text.lower() for i in nlp.tokenizer(custom_src_)]
print(tokenized_custom_src)
tokenized_custom_src = ['<sos>'] + tokenized_custom_src +['<eos>']
# NOTE(review): a token missing from src_vocab raises KeyError unless the pickled mapping supplies a default - confirm.
custom_src = [src_vocab[i] for i in tokenized_custom_src]
custom_src = torch.LongTensor(custom_src).to(device).unsqueeze(1)
custom_src_len = torch.LongTensor([len(custom_src)]).to(device)

# Creating trg tensor
eval_trg_len = 15  # maximum number of decoding steps
eos_idx = trg_stoi['<eos>']
trg_seq = [trg_stoi['<sos>']]
# There are two option to proceed. Pass src and trg to model.forward() or step by step model.encoder() and model.decoder(). With the second
# you can visualize attention

attentions = []
outputs = []

with torch.no_grad():
  encoder_outputs, hidden = model.encoder(custom_src,custom_src_len)
  mask = model.create_mask(custom_src)

  # Greedy decoding: feed the last predicted token back in until <eos> or the step limit.
  for _ in range(eval_trg_len):
    trg_tensor = torch.LongTensor([trg_seq[-1]]).to(device)
    output, hidden,attention = model.decoder(trg_tensor,hidden,encoder_outputs,mask)
    attentions.append(attention)
    pred_token = output.argmax(1).item()
    trg_seq.append(pred_token)

    if pred_token == eos_idx:
      break

attentions = torch.cat(attentions).to(device)

# One attention row exists per predicted token, so <sos> is left out to keep labels aligned with rows.
predicted = trg_seq[1:]
translation = [trg_itos[i] for i in predicted]

# Strip the final token only when it really is <eos>; if decoding stopped at the step
# limit, the last token is a real word and must be kept.
if predicted[-1] == eos_idx:
  predicted = predicted[:-1]
# NOTE(review): tokens are joined in reverse order, as in the original code - confirm this is intended.
translation_joined = " ".join(trg_itos[i] for i in predicted[::-1])
print(f" German translation : {translation_joined}")

In [None]:
# Printing Attention vector
import matplotlib.pyplot as plt 
import matplotlib.ticker as ticker
def display_attention(sentence,translation,attention):
  """
  Show an attention matrix as a heat map with token labels on both axes.

  Args:
    sentence: source sentence. A list/tuple of tokens is used directly as the
      x-axis labels. A raw string is still accepted for backward compatibility;
      in that case the module-level `tokenized_custom_src` supplies the labels.
    translation: list of tokens used as the y-axis labels.
    attention: tensor of attention weights; it is squeezed on dim 1, moved to
      the CPU and converted to a numpy array before plotting.
  """
  if isinstance(sentence,(list,tuple)):
    src_tokens = list(sentence)
  else:
    # The string itself is not tokenized here; fall back to the tokens prepared by the caller.
    src_tokens = tokenized_custom_src

  fig = plt.figure(figsize=(10,10))
  ax = fig.add_subplot(111)

  attention = attention.squeeze(1).cpu().detach().numpy()

  ax.matshow(attention, cmap='bone')

  ax.tick_params(labelsize=15)
  # The leading '' is a placeholder label for the first tick.
  # NOTE(review): presumably that tick lies before the first cell - confirm with the installed matplotlib.
  ax.set_xticklabels(['']+list(src_tokens),rotation=45)
  ax.set_yticklabels(['']+list(translation))

  # One tick per row / column.
  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

  plt.show()
  plt.close(fig)

# Plot the attention collected while decoding the custom sentence.
display_attention(custom_src_,translation,attentions)