<a href="https://colab.research.google.com/github/rpadaki/tamil-english-translation/blob/master/English_Tamil_Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install -q torch torchtext opt_einsum
!pip install -qU git+https://github.com/harvardnlp/namedtensor

  Building wheel for namedtensor (setup.py) ... [?25l[?25hdone


In [0]:
!curl http://ufal.mff.cuni.cz/~ramasamy/parallel/data/v2/en-ta-parallel-v2.tar.gz | tar xvz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0en-ta-parallel-v2/
en-ta-parallel-v2/corpus.bcn.dev.en
en-ta-parallel-v2/corpus.bcn.dev.ta
  0 23.7M    0  126k    0     0   183k      0  0:02:12 --:--:--  0:02:12  183ken-ta-parallel-v2/corpus.bcn.test.en
en-ta-parallel-v2/corpus.bcn.test.ta
en-ta-parallel-v2/corpus.bcn.train.en
en-ta-parallel-v2/corpus.bcn.train.ta
100 23.7M  100 23.7M    0     0  10.4M      0  0:00:02  0:00:02 --:--:-- 10.4M


In [0]:
import torch
import torchtext
from torchtext.vocab import Vectors
from torchtext import data, datasets
import torch.nn.functional as F

from namedtensor import ntorch
from namedtensor.text import NamedField

import numpy as np
import random

In [0]:
def tokenize_ta(text):
    sent = text.split()
    sent.reverse()
    return sent

def tokenize_en(text):
    return text.split()

BOS_WORD = "<s>"
EOS_WORD = "</s>"

In [0]:
TA = NamedField(names=("seqlen",), tokenize=tokenize_ta)
EN = NamedField(names=("seqlen",), tokenize=tokenize_en,
                init_token=BOS_WORD, eos_token=EOS_WORD)

In [0]:
MAX_LEN = 20
MIN_FREQ = 7

In [0]:
train, val, test = datasets.TranslationDataset.splits(
     exts=('.ta', '.en'),
     fields=(TA, EN),
     path="./en-ta-parallel-v2",
     train="corpus.bcn.train",
     test="corpus.bcn.test",
     validation="corpus.bcn.dev",
     filter_pred = lambda x: len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN
)

In [0]:
TA.build_vocab(train.src, min_freq=MIN_FREQ)
EN.build_vocab(train.trg, min_freq=MIN_FREQ)

In [0]:
BOS_ID = EN.vocab.stoi[BOS_WORD]
EOS_ID = EN.vocab.stoi[EOS_WORD]

In [0]:
device = torch.device('cuda')
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test),
    batch_size=32, 
    device=device,
    repeat=False
)

In [0]:
def inspect(b, s):
    batch = [i for i in iter(test_iter)][b]
    return " ".join([TA.vocab.itos[i] for i in batch.src[{"batch": s}].values.data]), " ".join([EN.vocab.itos[i] for i in batch.trg[{"batch": s}].values.data])

def get_tamil_tensor(b, s):
    batch = [i for i in iter(test_iter)][b]
    return batch.src[{"batch": s}]

In [0]:
class EmbeddingLM(ntorch.nn.Module):
    def __init__(self, TEXT, dropout=0.0, max_embedding_norm=None,
                 embedding_size=1000):
        super(EmbeddingLM, self).__init__()
        self.dropout_prob = dropout
        self.dropout = ntorch.nn.Dropout(self.dropout_prob)
        
        self.vocab_size = len(TEXT.vocab)
        self.embedding_dim = embedding_size
        self.embeddings = ntorch.nn.Embedding(self.vocab_size,
                                              self.embedding_dim,
                                              max_norm=max_embedding_norm)
        self.embeddings.spec("seqlen", "embedding")
#         self.embeddings._output_augment = "embedding"
#         self.embeddings._input_order = ()
#         self.embeddings._spec = True
     

In [0]:
class EncoderLSTM(EmbeddingLM):
    def __init__(self, TEXT, hidden_size=500, num_layers=2,
                 **kwargs):
        super(EncoderLSTM, self).__init__(TEXT, **kwargs)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = ntorch.nn.LSTM(input_size=self.embedding_dim,
                                   hidden_size=self.hidden_size,
                                   num_layers=self.num_layers,
                                   dropout=self.dropout_prob,
                                   batch_first=True)
        self.lstm.spec("embedding", "seqlen", "encoding")
    
    def forward(self, input):
        x = self.embeddings(input)
        x = self.dropout(x)
        output, (hidden, context) = self.lstm(x)
        return hidden, context

In [0]:
class DecoderLSTM(EncoderLSTM):
    def __init__(self, TEXT, **kwargs):
        super(DecoderLSTM, self).__init__(TEXT, **kwargs)
        self.linear = ntorch.nn.Linear(self.hidden_size,
                                       self.vocab_size)
        self.linear.spec("encoding", "vocab")
        
    def forward(self, input, hidden = None, context = None):
        x = self.embeddings(input).relu()
        output, (hidden, context) = self.lstm(x, (hidden, context))
        output = self.linear(output).log_softmax("vocab")
        return output, hidden, context

In [0]:
class Seq2Seq(ntorch.nn.Module):
    def __init__(self, SRC_TEXT, TRG_TEXT, embedding_size = 256, 
                hidden_size = 512, num_layers = 2, dropout=0.0):
        super(Seq2Seq, self).__init__()
        self.encoder = EncoderLSTM(TA, hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   embedding_size=embedding_size,
                                   dropout=dropout)
        self.decoder = DecoderLSTM(EN, hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   embedding_size=embedding_size,
                                   dropout=dropout)
    
    def forward(self, source, target=None, teacher_forcing=0.5,
                max_length=20):
        if target:
            max_length = target.shape["seqlen"]
        else:
            teacher_forcing = 0
        
        hidden, context = self.encoder(source)
        out_log_probs = ntorch.zeros(source.size("batch"),
                                 max_length,
                                 self.decoder.vocab_size,
                                 names = ("batch",
                                          "seqlen",
                                          "vocab")).cuda()
        input = ntorch.tensor([BOS_ID] * source.size("batch"), names = ("batch")).cuda()
        input_shape = dict(input.shape)
        input_shape["seqlen"] = 1
        input = input.split("batch", ("seqlen", "batch"), **input_shape)
        
        for t in range(1, max_length):
            output, hidden, context = self.decoder(input, hidden, context)
            teacher_force = random.random() < teacher_forcing
            out_log_probs[{"seqlen" : t}] = output.squeeze("seqlen")

            if teacher_force:
                input = target[{"seqlen" : t}]
                input = input.split("batch", ("seqlen", "batch"), **input_shape)
            else:
                dist = ntorch.distributions.Categorical(logits = output, dim_logit = "vocab")
                input = dist.sample()
                
        return out_log_probs         

In [0]:
model = Seq2Seq(TA, EN, dropout = 0.01, embedding_size = 128, hidden_size = 256)
model.cuda()

Seq2Seq(
  (encoder): EncoderLSTM(
    (dropout): Dropout(p=0.01)
    (embeddings): Embedding(13872, 128)
    (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.01)
  )
  (decoder): DecoderLSTM(
    (dropout): Dropout(p=0.01)
    (embeddings): Embedding(12503, 128)
    (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.01)
    (linear): Linear(in_features=256, out_features=12503, bias=True)
  )
)

In [0]:
optimizer = torch.optim.Adam(model.parameters())
PAD_ID = TA.vocab.stoi['<pad>']
criterion = ntorch.nn.NLLLoss(ignore_index = PAD_ID).spec("vocab")

In [0]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        source = batch.src
        target = batch.trg
        
        optimizer.zero_grad()
        output = model(source, target)
        
        loss = criterion(output, target)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)
    

In [0]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            source = batch.src
            target = batch.trg
            
            output = model(source, target, teacher_forcing = 0)
            
            loss = criterion(output, target)
            epoch_loss += loss.item()
        return epoch_loss / len(iterator)

In [0]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

8432471

In [0]:
import math
import time

In [0]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


In [0]:
best_valid_loss = float('inf')

for epoch in range(15):
    
    start_time = time.time()
    
    train_loss = train(model, train_iter, optimizer, criterion, 1)
    valid_loss = evaluate(model, val_iter, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')


Epoch: 01 | Time: 4m 11s
	Train Loss: 5.616 | Train PPL: 274.870
	 Val. Loss: 5.556 |  Val. PPL: 258.765
Epoch: 02 | Time: 4m 11s
	Train Loss: 5.262 | Train PPL: 192.962
	 Val. Loss: 5.440 |  Val. PPL: 230.330
Epoch: 03 | Time: 4m 12s
	Train Loss: 5.077 | Train PPL: 160.259
	 Val. Loss: 5.409 |  Val. PPL: 223.310
Epoch: 04 | Time: 4m 12s
	Train Loss: 4.941 | Train PPL: 139.877
	 Val. Loss: 5.345 |  Val. PPL: 209.554
Epoch: 05 | Time: 4m 12s
	Train Loss: 4.817 | Train PPL: 123.558
	 Val. Loss: 5.348 |  Val. PPL: 210.144
Epoch: 06 | Time: 4m 11s
	Train Loss: 4.717 | Train PPL: 111.858
	 Val. Loss: 5.283 |  Val. PPL: 196.984
Epoch: 07 | Time: 4m 12s
	Train Loss: 4.617 | Train PPL: 101.164
	 Val. Loss: 5.302 |  Val. PPL: 200.684
Epoch: 08 | Time: 4m 11s
	Train Loss: 4.526 | Train PPL:  92.417
	 Val. Loss: 5.293 |  Val. PPL: 198.901
Epoch: 09 | Time: 4m 12s
	Train Loss: 4.443 | Train PPL:  84.989
	 Val. Loss: 5.300 |  Val. PPL: 200.422
Epoch: 10 | Time: 4m 11s
	Train Loss: 4.358 | Train PPL

KeyboardInterrupt: ignored

In [0]:
evaluate(model, test_iter, criterion)

5.286605736304974