In [37]:
pip install torchtext==0.16.0

Note: you may need to restart the kernel to use updated packages.


In [38]:
pip install portalocker==2.8.2

Note: you may need to restart the kernel to use updated packages.


In [39]:
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [40]:
import torch
import torchtext

from torchtext.vocab import build_vocab_from_iterator



import spacy

en_nlp = spacy.load('en_core_web_sm')
de_nlp = spacy.load('de_core_news_sm')

from torchtext.vocab import build_vocab_from_iterator
import torchtext.transforms as T

from torch.utils.data import DataLoader

import numpy as np

In [41]:
data = torchtext.datasets.Multi30k(split = 'train')
test_data = torchtext.datasets.Multi30k(split = 'valid')

In [42]:
train_input = []
train_target = []

for data_idx,data in enumerate(data):
  target = ' '.join(['sos',data[1],'eos'])
  train_input.append(data[0])
  train_target.append(target)

In [43]:
test_input = []
test_target = []

for data_idx,data in enumerate(test_data):
  target = ' '.join(['sos',data[1],'eos'])
  test_input.append(data[0])
  test_target.append(target)

In [44]:
class Text_Tokenization:
  def __init__(self,nlp):
    self.nlp = nlp

  def _spacy_tokenizer(self,text):
    return [token.text for token in self.nlp(text)]

  def _yield_tokens(self,doc):
    for text in doc:
      text = self._spacy_tokenizer(text)
      yield text

  def _vocab(self,data):
    vocab = build_vocab_from_iterator(
        self._yield_tokens(data),
        specials = ['<pad>','<unk>']
    )
    vocab.set_default_index(vocab['<unk>'])
    self.vocab = vocab


  def tokenize(self,doc,maxlen,vocab = None):
    if vocab == None:
      self._vocab(doc)

    transforms = T.Sequential(
        T.VocabTransform(self.vocab),
        T.Truncate(max_seq_len = maxlen),
        T.ToTensor(padding_value = 0),
        T.PadTransform(max_length = maxlen,pad_value = 0)
    )

    output = np.array([transforms(self._spacy_tokenizer(text)) for text in doc])
    return output

In [45]:
def decoder_inputs_targets(sentences):
  decoder_inputs = []
  decoder_targets = [s[1:] for s in sentences]

  for sentence in sentences:
    sentence = [s for s in sentence if s != 0][:-1]
    while len(sentence) != len(decoder_targets[0]): sentence.append(0)
    decoder_inputs.append(np.array(sentence))

  return np.array(decoder_inputs),np.array(decoder_targets)

In [46]:
source_text_tokenizer = Text_Tokenization(de_nlp)
target_text_tokenizer = Text_Tokenization(en_nlp)

train_input_tokenized = source_text_tokenizer.tokenize(train_input[:1000],maxlen = 39)
train_target_tokenized = target_text_tokenizer.tokenize(train_target[:1000],maxlen = 39)

source_text_vocab = source_text_tokenizer.vocab
source_text_inverse_vocab = {value:key for key,value in source_text_vocab.get_stoi().items()}

target_text_vocab = target_text_tokenizer.vocab
target_text_inverse_vocab = {value:key for key,value in target_text_vocab.get_stoi().items()}


test_input_tokenized = source_text_tokenizer.tokenize(test_input[:3000],maxlen = 39,vocab = source_text_vocab)
test_target_tokenized = target_text_tokenizer.tokenize(test_target[:3000],maxlen = 39,vocab = target_text_vocab)




In [47]:
encoder_inputs = train_input_tokenized
decoder_inputs,decoder_targets = decoder_inputs_targets(train_target_tokenized)

encoder_inputs_test = test_input_tokenized
decoder_inputs_test,decoder_targets_test = decoder_inputs_targets(test_target_tokenized)

print(f'Encoder input: {encoder_inputs[0]}')
print(f'Decoder input: {decoder_inputs[0]}')
print(f'Decoder target: {decoder_targets[0]}')


print(f'\nEncoder input reconstruction: {" ".join([source_text_inverse_vocab[token] for token in encoder_inputs[0] if token != 0 ])} ')
print(f'Decoder input reconstruction: {" ".join([target_text_inverse_vocab[token] for token in decoder_inputs[0] if token != 0 ])} ')
print(f'Encoder reconstruction: {" ".join([target_text_inverse_vocab[token] for token in decoder_targets[0] if token != 0 ])} ')

print(f'\n\n\nTest Encoder input: {encoder_inputs_test[0]}')
print(f'Test Decoder input: {decoder_inputs_test[0]}')
print(f'Test Decoder target: {decoder_targets_test[0]}')

print(f'\nEncoder input reconstruction: {" ".join([source_text_inverse_vocab[token] for token in encoder_inputs_test[0] if token != 0 ])} ')
print(f'Decoder input reconstruction: {" ".join([target_text_inverse_vocab[token] for token in decoder_inputs_test[0] if token != 0 ])} ')
print(f'Encoder reconstruction: {" ".join([target_text_inverse_vocab[token] for token in decoder_targets_test[0] if token != 0 ])} ')



Encoder input: [  20   89  282   33   96   19   54    6   12   63 2187  958    2    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]
Decoder input: [  4  18  26  16 574 729  15  54  59 307 452   5   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]
Decoder target: [ 18  26  16 574 729  15  54  59 307 452   5   3   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]

Encoder input reconstruction: Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche . 
Decoder input reconstruction: sos Two young , White males are outside near many bushes . 
Encoder reconstruction: Two young , White males are outside near many bushes . eos 



Test Encoder input: [  15   58   29 1325    1    1    7   18 1261    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0

In [None]:
batch_size = 32
batched_encoder_inputs = DataLoader(encoder_inputs,batch_size)
batched_decoder_inputs = DataLoader(decoder_inputs,batch_size)
batched_decoder_targets = DataLoader(decoder_targets,batch_size)

batched_encoder_inputs_test = DataLoader(encoder_inputs_test,batch_size)
batched_decoder_inputs_test = DataLoader(decoder_inputs_test,batch_size)
batched_decoder_targets_test = DataLoader(decoder_targets_test,batch_size)

train_data_batched = DataLoader(list(zip(list(zip(encoder_inputs,decoder_inputs)),decoder_targets)),batch_size)
test_data_batched = DataLoader(list(zip(list(zip(encoder_inputs_test,decoder_inputs_test)),decoder_targets_test)),batch_size)

X,y = next(iter(train_data_batched))

X[0][0],X[1][0],y[0]

(tensor([  15,   58,   29, 1325,    1,    1,    7,   18, 1261,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0]),
 tensor([   4,    6,   56,   13,   40,   15, 1423, 1153, 1506,    2,  235,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0]),
 tensor([   6,   56,   13,   40,   15, 1423, 1153, 1506,    2,  235,    3,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0]))

In [49]:
class Encoder(torch.nn.Module):
  def __init__(self,vocab_size,embedding_size,hidden_size,num_layers,p):
    super().__init__()

    self.dropout = torch.nn.Dropout(p)
    self.embedding = torch.nn.Embedding(vocab_size,embedding_size,padding_idx = 0)
    self.lstm = torch.nn.LSTM(embedding_size,hidden_size,num_layers,batch_first = True,bidirectional = True,dropout = p)

  def forward(self,encoder_input):
    encoder_input = self.dropout(self.embedding(encoder_input))
    _,(hidden,cell) = self.lstm(encoder_input)

    return hidden,cell

In [50]:
class Decoder(torch.nn.Module):
  def __init__(self,vocab_size,embedding_size,hidden_size,num_layers,p):
    super().__init__()

    self.dropout = torch.nn.Dropout(p)
    self.embedding = torch.nn.Embedding(vocab_size,embedding_size,padding_idx = 0)
    self.lstm = torch.nn.LSTM(embedding_size,hidden_size,num_layers,batch_first = True,bidirectional = True,dropout = p)
    self.linear = torch.nn.Linear(2 * hidden_size,vocab_size)

  def forward(self,decoder_input,input_hidden,input_cell):
    decoder_input = self.dropout(self.embedding(decoder_input))

    output,(decoder_hidden,decoder_cell) = self.lstm(decoder_input,(input_hidden,input_cell))
    output = self.linear(output)

    return output,decoder_hidden,decoder_cell

In [51]:
class Seq2Seq(torch.nn.Module):
  def __init__(self,encoder,decoder):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self,X):
    source,target = X

    batch_size,target_seq_len = target.shape
    target_vocab_size = len(target_text_inverse_vocab)
    outputs = torch.zeros((target_seq_len,batch_size,target_vocab_size))

    hidden,cell = self.encoder(source)

    for token in range(0,target_seq_len):
      output,hidden,cell = self.decoder(target[:,token].unsqueeze(1),hidden,cell)
      outputs[token] = output.squeeze()

    return outputs.permute(1,0,2)

In [52]:
class Train_Model:
  def __init__(self,model,loss_function,optimizer,epochs):
    self.model = model
    self.loss_function = loss_function
    self.optimizer = optimizer
    self.epochs = epochs


  def _accuracy(self,target,prediction):
    from sklearn.metrics import accuracy_score as accuracy
    target = np.array(target).reshape(-1)
    prediction = np.array(prediction).reshape(-1)

    valid_idxs = np.argwhere(target != 0).reshape(-1)

    target = target[valid_idxs]
    prediction = prediction[valid_idxs]

    acc = accuracy(target,prediction)

    return acc

  def fit(self,train_data_batched):
    from tqdm.auto import tqdm

    self.model.train()

    train_batch_loss = 0
    train_batch_acc = 0

    for epoch in tqdm(range(self.epochs)):
      for batch,(X,y) in tqdm(enumerate(train_data_batched)):
        train_prediction = self.model(X)
        train_labels = train_prediction.argmax(-1)

        train_loss = self.loss_function(train_prediction.permute(0,2,1),y)
        train_acc = self._accuracy(y,train_labels)

        train_batch_loss += train_loss
        train_batch_acc += train_acc

        self.optimizer.zero_grad()
        train_loss.backward()
        self.optimizer.step()

      train_batch_loss /= len(train_data_batched)
      train_batch_acc /= len(train_data_batched)

      print(f'Epoch: {epoch} | Train Loss: {train_batch_loss} | Train Accuracy: {train_batch_acc}')

    return self.model


  def eval(self,test_data_batched):
    from tqdm.auto import tqdm

    self.model.eval()

    test_batch_loss = 0
    test_batch_acc = 0

    for batch,(X,y) in tqdm(enumerate(test_data_batched)):
      test_prediction = self.model(X)
      test_labels = test_prediction.argmax(-1)

      test_loss = self.loss_function(test_prediction.permute(0,2,1),y)
      test_acc = self._accuracy(y,test_labels)

      test_batch_loss += test_loss
      test_batch_acc += test_acc

    test_batch_loss /= len(test_data_batched)
    test_batch_acc /= len(test_data_batched)

    print(f'Test Loss: {test_batch_loss} | Test Accuracy: {test_batch_acc}')

In [53]:
sourve_vocab_size = len(source_text_inverse_vocab)
target_vocab_size = len(target_text_inverse_vocab)

embedding_size = 128
hidden_size = 256
num_layers = 2
p = 0.2

encoder = Encoder(sourve_vocab_size,embedding_size,hidden_size,num_layers,p)
decoder = Decoder(target_vocab_size,embedding_size,hidden_size,num_layers,p)
model = Seq2Seq(encoder,decoder)

In [54]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr = 0.001)
epochs = 30

Trainer = Train_Model(model,loss_function,optimizer,epochs)
model = Trainer.fit(train_data_batched)
Trainer.eval(test_data_batched)


  0%|          | 0/30 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch: 0 | Train Loss: 3.2800869941711426 | Train Accuracy: 0.06191669889744503


0it [00:00, ?it/s]

Epoch: 1 | Train Loss: 1.9530646800994873 | Train Accuracy: 0.21374264759008893


0it [00:00, ?it/s]

Epoch: 2 | Train Loss: 1.7931052446365356 | Train Accuracy: 0.26265276692912304


0it [00:00, ?it/s]

Epoch: 3 | Train Loss: 1.7097806930541992 | Train Accuracy: 0.2804017549110527


0it [00:00, ?it/s]

Epoch: 4 | Train Loss: 1.6179306507110596 | Train Accuracy: 0.30661995502498407


0it [00:00, ?it/s]

Epoch: 5 | Train Loss: 1.5281482934951782 | Train Accuracy: 0.3291968737749158


0it [00:00, ?it/s]

Epoch: 6 | Train Loss: 1.4440710544586182 | Train Accuracy: 0.35066525247090546


0it [00:00, ?it/s]

Epoch: 7 | Train Loss: 1.3652151823043823 | Train Accuracy: 0.3701266853298629


0it [00:00, ?it/s]

Epoch: 8 | Train Loss: 1.2915691137313843 | Train Accuracy: 0.386556775069133


0it [00:00, ?it/s]

Epoch: 9 | Train Loss: 1.225767731666565 | Train Accuracy: 0.3977846358468434


0it [00:00, ?it/s]

Epoch: 10 | Train Loss: 1.166143536567688 | Train Accuracy: 0.4153591455487834


0it [00:00, ?it/s]

Epoch: 11 | Train Loss: 1.08286714553833 | Train Accuracy: 0.43588392261010145


0it [00:00, ?it/s]

Epoch: 12 | Train Loss: 1.0109105110168457 | Train Accuracy: 0.45522864998618


0it [00:00, ?it/s]

Epoch: 13 | Train Loss: 0.9420673847198486 | Train Accuracy: 0.4779901650234873


0it [00:00, ?it/s]

Epoch: 14 | Train Loss: 0.8843963742256165 | Train Accuracy: 0.5026112578965313


0it [00:00, ?it/s]

Epoch: 15 | Train Loss: 0.8208913803100586 | Train Accuracy: 0.5299837168149873


0it [00:00, ?it/s]

Epoch: 16 | Train Loss: 0.7549307942390442 | Train Accuracy: 0.5592042778855623


0it [00:00, ?it/s]

Epoch: 17 | Train Loss: 0.6910964846611023 | Train Accuracy: 0.5961160325296668


0it [00:00, ?it/s]

Epoch: 18 | Train Loss: 0.6267557144165039 | Train Accuracy: 0.6421945622196025


0it [00:00, ?it/s]

Epoch: 19 | Train Loss: 0.5653209686279297 | Train Accuracy: 0.6769753477955589


0it [00:00, ?it/s]

Epoch: 20 | Train Loss: 0.5195882320404053 | Train Accuracy: 0.7088854890115419


0it [00:00, ?it/s]

Epoch: 21 | Train Loss: 0.47199663519859314 | Train Accuracy: 0.74071255927118


0it [00:00, ?it/s]

Epoch: 22 | Train Loss: 0.4336816370487213 | Train Accuracy: 0.7656311625399638


0it [00:00, ?it/s]

Epoch: 23 | Train Loss: 0.3966355621814728 | Train Accuracy: 0.79059837128836


0it [00:00, ?it/s]

Epoch: 24 | Train Loss: 0.3475376069545746 | Train Accuracy: 0.8276025560276556


0it [00:00, ?it/s]

Epoch: 25 | Train Loss: 0.3051137924194336 | Train Accuracy: 0.8631553673985932


0it [00:00, ?it/s]

Epoch: 26 | Train Loss: 0.2656541168689728 | Train Accuracy: 0.8972481197358971


0it [00:00, ?it/s]

Epoch: 27 | Train Loss: 0.22988465428352356 | Train Accuracy: 0.9211855839919622


0it [00:00, ?it/s]

Epoch: 28 | Train Loss: 0.20243416726589203 | Train Accuracy: 0.9406549099703587


0it [00:00, ?it/s]

Epoch: 29 | Train Loss: 0.17587482929229736 | Train Accuracy: 0.9574850840815399


0it [00:00, ?it/s]

Test Loss: 1.8455290794372559 | Test Accuracy: 0.3794608507068989


In [58]:
def Translate(model,text,source_text_vocab,source_text_tokenizer,target_text_inverse_vocab,max_translation_len = 30):
  encoder = model.encoder
  decoder = model.decoder

  print(f'Input sentence: {text}')
  text = torch.tensor(source_text_tokenizer.tokenize([text],39,source_text_vocab))

  x = torch.tensor([[4]])
  translation = []

  hidden,cell = encoder(text)

  for _ in range(max_translation_len):
    output,hidden,cell = decoder(x,hidden,cell)
    token = output.argmax(-1)
    if token[0][0].item() == 3: break

    translation.append(token[0][0].item())
    x = token

  translated_sentence = ' '.join([target_text_inverse_vocab[token] for token in translation])
  print(f'Translated sentence: {translated_sentence}\n')

  return translated_sentence

In [60]:
Translate(model,train_input[66],source_text_vocab,source_text_tokenizer,target_text_inverse_vocab)

Input sentence: Ein Hund spielt mit einem Schlauch.
Translated sentence: A dog is playing with a hose .



'A dog is playing with a hose .'

In [61]:
train_target[66]

'sos A dog is playing with a hose. eos'