In [None]:
import torch
import torch.nn as nn
import random
import time
import math
import numpy as np
import torch.nn.functional as F
from torchtext.legacy import data
from torchtext.legacy import datasets
import spacy

In [None]:
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


In [None]:
# Load the small English and German spaCy pipelines; the tokenize_* helpers below use their tokenizers.
en = spacy.load("en_core_web_sm")
de=spacy.load("de_core_news_sm")

In [None]:
# Single source of truth for every random number generator seeded below.
seed=1234

# Seed Python, NumPy and PyTorch (CPU and current CUDA device) with the same value.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# The flag is spelled `deterministic`. The earlier misspelling only created an
# unused attribute, so cuDNN was never asked to pick deterministic algorithms.
torch.backends.cudnn.deterministic=True

In [None]:
# Use the GPU when PyTorch reports one as available, otherwise run on the CPU.
device="cuda" if torch.cuda.is_available() else "cpu"

In [None]:
device

'cuda'

In [None]:
def tokenize_de(text):
  """Split `text` into a list of token strings with the German spaCy tokenizer (`de`)."""
  spacy_tokens = de.tokenizer(text)
  return [token.text for token in spacy_tokens]

def tokenize_en(text):
  """Split `text` into a list of token strings with the English spaCy tokenizer (`en`)."""
  spacy_tokens = en.tokenizer(text)
  return [token.text for token in spacy_tokens]

In [None]:
# German (source) field: tokenized by tokenize_de, lowercased, with <sos>/<eos> as init/eos tokens.
source=data.Field(init_token="<sos>",eos_token="<eos>",tokenize=tokenize_de,lower=True)
# English (target) field: same settings, tokenized by tokenize_en.
target=data.Field(init_token="<sos>",eos_token="<eos>",tokenize=tokenize_en,lower=True)

In [None]:
# Multi30k train/validation/test splits; ".de" files are processed by `source`, ".en" files by `target`.
train_data,valid_data,test_data=datasets.Multi30k.splits(exts=(".de",".en"),fields=(source,target))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 480kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 92.2kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 86.9kB/s]


In [None]:
# Build both vocabularies from the training split only (validation/test data are not passed in).
# min_freq=2 asks torchtext to keep only tokens that occur at least twice in train_data.
source.build_vocab(train_data,min_freq=2)
target.build_vocab(train_data,min_freq=2)

In [None]:
# One BucketIterator per split, each yielding batches of 64 examples placed on `device`.
train_iterator,valid_iterator,test_iterator=data.BucketIterator.splits((train_data,valid_data,test_data),batch_size=64,device=device)

In [None]:
batch=next(iter(train_iterator))

In [None]:
batch.src.shape

torch.Size([25, 64])

In [None]:
class encoder(nn.Module):
  """Bidirectional single-layer GRU encoder.

  Args:
    input_dim: size of the source vocabulary (number of embedding rows).
    embed_dim: width of the token embeddings fed to the GRU.
    hidden_dim: GRU hidden size per direction.
    dropout: dropout probability applied to the embeddings.
  """
  def __init__(self,input_dim,embed_dim,hidden_dim,dropout):
    super().__init__()
    self.input_dim = input_dim
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    # Size the layers from the `embed_dim` argument. They used to read a
    # notebook-level `embedding_dim` global, which ignored the argument and
    # raised NameError whenever that global had not been defined yet.
    self.embedding = nn.Embedding(input_dim,embed_dim)
    self.rnn = nn.GRU(embed_dim,hidden_dim,bidirectional=True)
    self.lin1 = nn.Linear(hidden_dim*2,hidden_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self,input):
    """Encode a batch of source token ids.

    Args:
      input: LongTensor of token ids shaped [src_len, batch] (the GRU is not batch_first).

    Returns:
      output: per-position GRU outputs, [src_len, batch, hidden_dim*2].
      hidden: [batch, hidden_dim] tensor obtained by projecting the concatenated
        final forward/backward states through `lin1` and tanh.
    """
    embed = self.dropout(self.embedding(input))
    output,hidden=self.rnn(embed)
    # hidden[-2] / hidden[-1] are the final states of the two GRU directions.
    hidden=torch.tanh(self.lin1(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1)))
    return output,hidden

In [None]:
class attention(nn.Module):
  """Additive attention scoring each encoder position against the decoder state.

  Args:
    hidden_dim: decoder hidden size; encoder outputs are expected to be
      hidden_dim*2 wide, matching the input width of `self.attention`.
  """
  def __init__(self,hidden_dim):
    super().__init__()
    encoder_width = hidden_dim*2
    self.attention = nn.Linear(encoder_width+hidden_dim,hidden_dim)
    self.v = nn.Linear(hidden_dim,1,bias=False)

  def forward(self,hidden,encoder_output):
    """Return softmax-normalised attention weights shaped [batch, src_len].

    Args:
      hidden: decoder state, [batch, hidden_dim].
      encoder_output: encoder outputs, [src_len, batch, hidden_dim*2].
    """
    src_len = encoder_output.shape[0]
    # Copy the decoder state once per source position so it can be concatenated
    # with the batch-first encoder outputs.
    tiled_hidden = hidden.unsqueeze(1).repeat(1,src_len,1)
    batch_first_output = encoder_output.permute(1,0,2)
    combined = torch.cat((tiled_hidden, batch_first_output), dim = 2)
    energy = torch.tanh(self.attention(combined))
    scores = self.v(energy).squeeze(2)
    return F.softmax(scores, dim=1)

In [None]:
class decoder(nn.Module):
  """One-step GRU decoder that attends over the encoder outputs.

  Args:
    output_dim: size of the target vocabulary (number of output logits).
    embedding_dim: width of the target token embeddings.
    hidden_dim: decoder hidden size; encoder outputs are hidden_dim*2 wide.
    dropout: dropout probability applied to the embeddings.
    attn: module called as attn(hidden, encoder_output) that returns
      [batch, src_len] attention weights.
  """
  def __init__(self,output_dim,embedding_dim,hidden_dim,dropout,attn):
    super().__init__()
    self.output_dim = output_dim
    self.embedding_dim = embedding_dim
    self.attn=attn
    self.hidden_dim = hidden_dim
    context_dim = hidden_dim*2
    self.embed = nn.Embedding(output_dim,embedding_dim)
    self.rnn = nn.GRU(context_dim+embedding_dim,hidden_dim)
    self.fc=nn.Linear(context_dim+embedding_dim+hidden_dim,output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self,input,hidden,encoder_output):
    """Decode a single time step.

    Args:
      input: token ids for the current step, [batch].
      hidden: previous decoder state, [batch, hidden_dim].
      encoder_output: encoder outputs, [src_len, batch, hidden_dim*2].

    Returns:
      prediction: vocabulary logits, [batch, output_dim].
      hidden: updated decoder state, [batch, hidden_dim].
    """
    # Add a length-1 sequence axis so the GRU sees a one-step sequence.
    token_embed = self.dropout(self.embed(input.unsqueeze(0)))
    attn_weights = self.attn(hidden,encoder_output).unsqueeze(1)
    # Attention-weighted sum of encoder outputs, moved back to sequence-first layout.
    context = torch.bmm(attn_weights,encoder_output.permute(1,0,2)).permute(1,0,2)
    output,hidden = self.rnn(torch.cat((token_embed,context),dim=2),hidden.unsqueeze(0))
    # With one step, one layer and one direction the output equals the final state.
    assert (output == hidden).all()

    features = torch.cat((output.squeeze(0), context.squeeze(0), token_embed.squeeze(0)), dim = 1)
    prediction = self.fc(features)
    return prediction, hidden.squeeze(0)


In [None]:
class seq2seq(nn.Module):
  """Couples an encoder and a step-wise decoder into one translation model.

  Args:
    encoder: module returning (encoder_output, hidden) for a source batch.
    decoder: module with an `output_dim` attribute, called once per target step
      as decoder(input, hidden, encoder_output) -> (logits, hidden).
    device: device on which the output buffer is allocated.
  """
  def __init__(self,encoder,decoder,device):
    super().__init__()
    self.encoder=encoder
    self.decoder=decoder
    self.device=device

  def forward(self,src,trg,teacher_forcing_ratio=0.5):
    """Decode `trg` step by step and return logits shaped [trg_len, batch, output_dim].

    Row 0 of the result is never written and stays all zeros; decoding starts
    from trg[0] and fills rows 1..trg_len-1. After each step the next input is
    the ground-truth token with probability `teacher_forcing_ratio`, otherwise
    the decoder's own argmax prediction.
    """
    trg_len,batch_size=trg.shape[0],trg.shape[1]
    outputs=torch.zeros(trg_len,batch_size,self.decoder.output_dim).to(self.device)
    encoder_output,hidden=self.encoder(src)
    input=trg[0,:]

    for t in range(1,trg_len):
      output,hidden=self.decoder(input,hidden,encoder_output)
      outputs[t]=output
      # One random draw per step decides between teacher forcing and free running.
      use_teacher=random.random() < teacher_forcing_ratio
      input=trg[t] if use_teacher else output.argmax(1)
    return outputs
            


In [None]:
# Model hyperparameters shared by the encoder, attention and decoder constructed below.
# Vocabulary sizes come from the fields built on the training data.
input_dim = len(source.vocab)
output_dim = len(target.vocab)
# Width of the token embeddings.
embedding_dim = 256
# GRU hidden size (per direction in the bidirectional encoder).
hidden_dim =512
# Dropout probability applied to the embeddings.
dropout = 0.5

In [None]:
input_dim,output_dim

(7855, 5893)

In [None]:
enc=encoder(input_dim,embedding_dim,hidden_dim,dropout)

In [None]:
enc

encoder(
  (embedding): Embedding(7855, 256)
  (rnn): GRU(256, 512, bidirectional=True)
  (lin1): Linear(in_features=1024, out_features=512, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
attn=attention(hidden_dim)

In [None]:
attn

attention(
  (attention): Linear(in_features=1536, out_features=512, bias=True)
  (v): Linear(in_features=512, out_features=1, bias=False)
)

In [None]:
dec=decoder(output_dim,embedding_dim,hidden_dim,dropout,attn)

In [None]:
dec

decoder(
  (attn): attention(
    (attention): Linear(in_features=1536, out_features=512, bias=True)
    (v): Linear(in_features=512, out_features=1, bias=False)
  )
  (embed): Embedding(5893, 256)
  (rnn): GRU(1280, 512)
  (fc): Linear(in_features=1792, out_features=5893, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
model=seq2seq(enc,dec,device).to(device)

In [None]:
model

seq2seq(
  (encoder): encoder(
    (embedding): Embedding(7855, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (lin1): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): decoder(
    (attn): attention(
      (attention): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embed): Embedding(5893, 256)
    (rnn): GRU(1280, 512)
    (fc): Linear(in_features=1792, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
def init_weights(m):
    """Re-initialise every parameter reachable from module `m` in place.

    Parameters whose name contains 'weight' are drawn from N(0, 0.01^2);
    all others (e.g. biases) are set to zero. `named_parameters()` recurses
    into submodules, so a single call covers the whole subtree.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)
            
model.apply(init_weights)

seq2seq(
  (encoder): encoder(
    (embedding): Embedding(7855, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (lin1): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): decoder(
    (attn): attention(
      (attention): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embed): Embedding(5893, 256)
    (rnn): GRU(1280, 512)
    (fc): Linear(in_features=1792, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
def count_parameters(model):
    """Return the total element count of all parameters that have requires_grad set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 20,518,917 trainable parameters


In [None]:
# Adam over all model parameters, using the optimizer's default hyperparameters (none are overridden).
optim = torch.optim.Adam(model.parameters())

In [None]:
# Index of the target field's padding token in the target vocabulary.
TRG_PAD_IDX = target.vocab.stoi[target.pad_token]

# Cross-entropy over vocabulary logits; positions whose label is the pad index are ignored.
loss_fn = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [None]:
def train(model,iterator,optim,loss_fn,clip):
  """Run one training pass over `iterator` and return the mean per-batch loss.

  Args:
    model: callable as model(src, trg) returning logits [trg_len, batch, vocab].
    iterator: sized iterable of batches exposing `.src` and `.trg`.
    optim: optimizer stepping the model's parameters.
    loss_fn: criterion taking (flattened logits, flattened targets).
    clip: maximum gradient norm passed to clip_grad_norm_.
  """
  model.train()
  running_loss=0
  for batch in iterator:
    src,trg=batch.src,batch.trg
    optim.zero_grad()
    logits=model(src,trg)
    vocab_size=logits.shape[-1]
    # Skip position 0 on both sides, then flatten time and batch into one axis.
    flat_trg=trg[1:].view(-1)
    flat_logits=logits[1:].view(-1, vocab_size)
    loss=loss_fn(flat_logits,flat_trg)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optim.step()
    running_loss+=loss.item()
  return running_loss/len(iterator)

In [None]:
def evaluate(model,iterator,loss_fn):
  """Return the mean per-batch loss over `iterator` without updating the model.

  The model is switched to eval mode and called as model(src, trg, 0), i.e.
  with a teacher-forcing ratio of zero; gradients are disabled throughout.
  """
  model.eval()
  total_loss=0
  with torch.no_grad():
    for batch in iterator:
      logits=model(batch.src,batch.trg,0)
      vocab_size=logits.shape[-1]
      # Skip position 0 on both sides, then flatten time and batch into one axis.
      loss=loss_fn(logits[1:].view(-1, vocab_size),batch.trg[1:].view(-1))
      total_loss+=loss.item()
  return total_loss/len(iterator)

In [None]:
import time
import math

In [None]:

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole (minutes, seconds)."""
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
# Training schedule: number of passes over the training data and the gradient-norm clip value.
epochs=10
clip=1
best_valid_loss = float('inf')
for epoch in range(epochs):
  start_time = time.time()
  train_loss = train(model,train_iterator,optim,loss_fn,clip)
  valid_loss = evaluate(model,valid_iterator,loss_fn)
  end_time = time.time()
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # Keep only the checkpoint with the lowest validation loss seen so far.
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'getoen-model.pt')

  print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')


Epoch: 01 | Time: 3m 3s
	Train Loss: 4.630 | Train PPL: 102.529
	 Val. Loss: 4.383 |  Val. PPL:  80.069
Epoch: 02 | Time: 3m 3s
	Train Loss: 3.411 | Train PPL:  30.309
	 Val. Loss: 3.547 |  Val. PPL:  34.703
Epoch: 03 | Time: 3m 1s
	Train Loss: 2.723 | Train PPL:  15.229
	 Val. Loss: 3.239 |  Val. PPL:  25.506
Epoch: 04 | Time: 3m 4s
	Train Loss: 2.321 | Train PPL:  10.181
	 Val. Loss: 3.176 |  Val. PPL:  23.950
Epoch: 05 | Time: 3m 4s
	Train Loss: 1.998 | Train PPL:   7.373
	 Val. Loss: 3.128 |  Val. PPL:  22.832
Epoch: 06 | Time: 3m 1s
	Train Loss: 1.768 | Train PPL:   5.858
	 Val. Loss: 3.137 |  Val. PPL:  23.039
Epoch: 07 | Time: 3m 2s
	Train Loss: 1.585 | Train PPL:   4.881
	 Val. Loss: 3.224 |  Val. PPL:  25.132
Epoch: 08 | Time: 3m 0s
	Train Loss: 1.443 | Train PPL:   4.232
	 Val. Loss: 3.337 |  Val. PPL:  28.148
Epoch: 09 | Time: 3m 0s
	Train Loss: 1.330 | Train PPL:   3.783
	 Val. Loss: 3.379 |  Val. PPL:  29.334
Epoch: 10 | Time: 3m 0s
	Train Loss: 1.213 | Train PPL:   3.364


In [None]:
# Restore the checkpoint saved at the lowest validation loss before scoring the test split.
# map_location remaps the stored tensors onto the current `device`, so a checkpoint
# written during a GPU run can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load('getoen-model.pt', map_location=device))

test_loss = evaluate(model, test_iterator, loss_fn)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 3.186 | Test PPL:  24.183 |


In [None]:
def translate_sentence(model, sentence, german, english, device, max_length=50):
    """Greedily translate one source sentence with a trained seq2seq model.

    Args:
        model: module exposing `encoder(src) -> (encoder_output, hidden)` and
            `decoder(token, hidden, encoder_output) -> (logits, hidden)`.
        sentence: either a raw string (tokenized with `german.tokenize`) or an
            already tokenized list of strings. Tokens are lowercased either way.
        german: source field providing `tokenize`, `init_token`, `eos_token`
            and `vocab.stoi`.
        english: target field providing `init_token`, `eos_token`,
            `vocab.stoi` and `vocab.itos`.
        device: device on which the index tensors are created.
        max_length: maximum number of decoding steps.

    Returns:
        The predicted target tokens joined by single spaces. The leading
        start token is removed; the end token is kept when it was produced.
    """
    # Tokenize with the field's own tokenizer instead of reloading a spaCy
    # pipeline on every call, and use the fields passed in rather than globals.
    if isinstance(sentence, str):
        tokens = [token.lower() for token in german.tokenize(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Wrap with the start/end markers and map every token to its vocabulary index.
    tokens = [german.init_token] + tokens + [german.eos_token]
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Shape [src_len, 1]: a batch containing this single sentence.
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    sos_idx = english.vocab.stoi[english.init_token]
    eos_idx = english.vocab.stoi[english.eos_token]
    outputs = [sos_idx]

    # Decode in eval mode so dropout is disabled, then restore the caller's mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            encoder_output, hidden = model.encoder(sentence_tensor)

            for _ in range(max_length):
                previous_word = torch.LongTensor([outputs[-1]]).to(device)
                output, hidden = model.decoder(previous_word, hidden, encoder_output)
                best_guess = output.argmax(1).item()
                outputs.append(best_guess)

                # Stop as soon as the model predicts the end of the sentence.
                if best_guess == eos_idx:
                    break
    finally:
        model.train(was_training)

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # Remove the start token.
    return " ".join(translated_sentence[1:])

In [None]:
# Translate the first 20 test examples and print source, reference and model output.
for i in range(0,20):
  t=vars(test_data.examples[i])

  t2=" " .join(t["src"])
  print("german sentence:",t2)
  target1 =" ".join(t["trg"])
  print("ground truth:",target1)
  translated_sentence = translate_sentence(
        model,t2, source, target, device, max_length=50
    )
  # translate_sentence returns a string with the start token already removed.
  # Slicing that string with [1:] dropped its first character ("five" -> "ive"),
  # so print it unchanged.
  print("tranlated sentence:",translated_sentence)
  print("\n\n")


german sentence: ein mann mit einem orangefarbenen hut , der etwas anstarrt .
ground truth: a man in an orange hat starring at something .
tranlated sentence:  man in an orange hat welding something . <eos>



german sentence: ein boston terrier läuft über saftig-grünes gras vor einem weißen zaun .
ground truth: a boston terrier is running on lush green grass in front of a white fence .
tranlated sentence:  golden dog runs running on grass grass in front of a white fence . <eos>



german sentence: ein mädchen in einem karateanzug bricht ein brett mit einem tritt .
ground truth: a girl in karate uniform breaking a stick with a front kick .
tranlated sentence:  girl in a karate uniform is a a a a a . <eos>



german sentence: fünf leute in winterjacken und mit helmen stehen im schnee mit schneemobilen im hintergrund .
ground truth: five people wearing winter jackets and helmets stand in the snow , with snowmobiles in the background .
tranlated sentence: ive people wearing life jackets a