<a href="https://colab.research.google.com/github/sayarghoshroy/Neural_Machine_Translation/blob/master/attention_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
tokenized_stores = {'en_train': [], 'en_dev': [], 'en_test': [], 'bn_train': [], 'bn_dev': [], 'bn_test': []}

for key in tokenized_stores:
    location = ""
    # Add location here
    file_name = location + str(key)[3:] + "." + str(key)[0:2]
    load = open(file_name)
    sentences = load.read().split('\n')
    
    for sentence in sentences:
        token_store = sentence.split(' ')
        tokenized_stores[key].append(token_store)

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data
from torchtext import datasets

import torch.nn.functional as F
from torchtext.datasets import TranslationDataset
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

import os
import json

In [0]:
# using the json files for train, development and test set
def identity(x):
    return x

SRC = Field(tokenize = identity, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

TRG = Field(tokenize = identity, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

src = data.Field()
trg = data.Field()

fields = {'src': ("src", SRC), 'trg': ("trg", TRG)}

In [0]:
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = '/content/drive/My Drive/asn_2_data/',
                                        train = 'train.json',
                                        validation = 'dev.json',
                                        test = 'test.json',
                                        format = 'json',
                                        fields = fields)

In [0]:
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [0]:
print("Train Size:", str(len(train_data.examples)))
print("Validation Size:", str(len(valid_data.examples)))
print("Test Size:", str(len(test_data.examples)))

Train Size: 49398
Validation Size: 401
Test Size: 200


In [0]:
# Building the Vocabulary
SRC.build_vocab(train_data, min_freq = 1)
TRG.build_vocab(train_data, min_freq = 1)

# Vocabulary Sizes
print("Source Vocabulary Size:", len(SRC.vocab))
print("Target Vocabulary Size:", len(TRG.vocab))

Source Vocabulary Size: 31850
Target Vocabulary Size: 54513


In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [0]:
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device,
    sort = False)

In [0]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        #src = [src len, batch size]
        embedded = self.dropout(self.embedding(src))
        #embedded = [src len, batch size, emb dim]
        outputs, hidden = self.rnn(embedded)
        #outputs = [src len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        #outputs are always from the last layer
        #hidden [-2, :, : ] is the last of the forwards RNN 
        #hidden [-1, :, : ] is the last of the backwards RNN
        #initial decoder hidden is final hidden state of the forwards and backwards 
        #  encoder RNNs fed through a linear layer
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        #outputs = [src len, batch size, enc hid dim * 2]
        #hidden = [batch size, dec hid dim]
        return outputs, hidden

In [0]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs):
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        #repeat decoder hidden state src_len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        #hidden = [batch size, src len, dec hid dim]
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        #energy = [batch size, src len, dec hid dim]
        attention = self.v(energy).squeeze(2)
        #attention= [batch size, src len]
        return F.softmax(attention, dim=1)

In [0]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs): 
        #input = [batch size]
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        input = input.unsqueeze(0)
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        #embedded = [1, batch size, emb dim]
        a = self.attention(hidden, encoder_outputs)
        #a = [batch size, src len]
        a = a.unsqueeze(1)
        #a = [batch size, 1, src len]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        weighted = torch.bmm(a, encoder_outputs)
        #weighted = [batch size, 1, enc hid dim * 2]
        weighted = weighted.permute(1, 0, 2)
        #weighted = [1, batch size, enc hid dim * 2]
        
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
            
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        #output = [seq len, batch size, dec hid dim * n directions]
        #hidden = [n layers * n directions, batch size, dec hid dim]
        #seq len, n layers and n directions will always be 1 in this decoder, therefore:
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]
        #this also means that output == hidden
        assert (output == hidden).all()
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
        #prediction = [batch size, output dim]
        return prediction, hidden.squeeze(0)

In [0]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src)
                
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            #insert input token embedding, previous hidden state and all encoder hidden states
            #receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

In [0]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)

In [0]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(31850, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(54513, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=54513, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 126,284,017 trainable parameters


In [0]:
optimizer = optim.Adam(model.parameters())
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [0]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        src = batch.src
        trg = batch.trg
        optimizer.zero_grad()
        output = model(src, trg)
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg
            output = model(src, trg, 0) #turn off teacher forcing
            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [0]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [0]:
N_EPOCHS = 25
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '/content/drive/My Drive/asn_2_data/GRU-model.pt')

    else:
        print("Done.")
        break
        # implementing early stopping
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 11m 17s
	Train Loss: 7.130 | Train PPL: 1249.271
	 Val. Loss: 7.062 |  Val. PPL: 1166.876
Epoch: 02 | Time: 11m 17s
	Train Loss: 6.040 | Train PPL: 420.044
	 Val. Loss: 6.590 |  Val. PPL: 728.096
Epoch: 03 | Time: 11m 17s
	Train Loss: 5.119 | Train PPL: 167.226
	 Val. Loss: 6.409 |  Val. PPL: 607.027
Done.


In [0]:
# Evaluation on the Test Set
# Loading in the saved model
model.load_state_dict(torch.load('/content/drive/My Drive/asn_2_data/GRU-model.pt'))
test_loss = evaluate(model, test_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 6.015 | Test PPL: 409.660 |


In [0]:
# Inference
def inference(model, sentence):
    model.eval()
    with torch.no_grad():
        tokens = [token.lower() for token in sentence]

        tokens = [SRC.init_token] + tokens + [SRC.eos_token]
        src_indexes = [SRC.vocab.stoi[token] for token in tokens]
        src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)

        src_tensor = src_tensor.reshape(-1,1)

        # trg = batch.trg
        output = model(src_tensor, src_tensor, 0) #turn off teacher forcing
        # trg = [trg len, batch size]
        # output = [trg len, batch size, output dim]
        output_dim = output.shape[-1]
        # output = output[1:].view(-1, output_dim)
        output = output.view(-1, output_dim)
        # trg = trg[1:].view(-1)
        indices = torch.argmax(output,dim=1).tolist()

        return indices

In [0]:
def get_translation(sentence):
    return [TRG.vocab.itos[x] for x in inference(model, sentence)]

In [0]:
bn_test_outputs = []
count = 0
for sentence in tokenized_stores['en_test']:
    intermediate = get_translation(sentence)
    output = list(filter(lambda x: x != '<eos>' and x!= '<unk>', intermediate))[1:]
    if count < 20:
        print(sentence, output)
    count += 1
    bn_test_outputs.append(output)

['Fresh', 'breath', 'and', 'shining', 'teeth', 'enhance', 'your', 'personality', '.'] ['ভালো', 'করে', 'দাঁত', 'এবং', 'দাঁত', 'আপনার', 'আপনার', 'আপনার', 'আপনার', 'সৌন্দর্য']
['Your', 'self-confidence', 'also', 'increases', 'with', 'teeth', '.'] ['দাঁতের', 'সাথে', 'সাথে', 'দাঁতের', 'বৃদ্ধি', 'বৃদ্ধি', 'পায়', '৷']
['Bacteria', 'stay', 'between', 'our', 'gums', 'and', 'teeth', '.'] ['আমাদের', 'মাড়ির', 'দাঁত', 'আর', 'দাঁত', 'আর', 'আর', 'আর', '৷']
['They', 'make', 'teeth', 'dirty', 'and', 'breath', 'stinky', '.'] ['দাঁত', 'দাঁত', 'আর', 'আর', 'আর', 'আর', 'দাঁতে', 'লাগিয়ে', '৷']
['You', 'may', 'keep', 'your', 'teeth', 'clean', 'and', 'breath', 'fresh', 'by', 'the', 'help', 'of', 'some', 'easy', 'tips', 'given', 'here', '.'] ['আপনি', 'আপনার', 'পরিষ্কার', 'পরিষ্কার', 'করে', 'আর', 'আপনার', 'আরাম', 'পাবেন', '৷']
['Clean', 'your', 'teeth', 'properly', '.'] ['পরিষ্কার', 'পরিষ্কার', 'পরিষ্কার', 'পরিষ্কার', 'পরিষ্কার', 'পরিষ্কার']
['It', 'takes', 'two', 'to', 'three', 'minutes', 'to', 'clean', 'your'

In [0]:
import nltk
BLEU_scores = []

for index in range(len(tokenized_stores['bn_test'])):
    BLEU_scores.append(nltk.translate.bleu_score.sentence_bleu([tokenized_stores['bn_test'][index]], bn_test_outputs[index], smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method7))

print("Average BLEU Score:", np.mean(BLEU_scores))

Average BLEU Score: 0.28159428346110404


In [0]:
# ^_^ Thank You