Машинный перевод. Seq2Seq

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

import spacy

import random
import math
import time

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

In [0]:
import dill

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
seed = 0

random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

torch.backends.cudnn.deterministic = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
! python -m spacy download en
! python -m spacy download de

spacy_en = spacy.load("en")
spacy_de = spacy.load("de")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 641kB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907056 sha256=1f7b71769f0d56ec472cb72b63ab026434d5d5ef4f6a8c90f86551918301a974
  Stored in directory: /tmp/pip-ephem-wheel-cache-of_xopaq/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Inst

In [0]:
def token_de(text):
  return [token.text for token in spacy_de.tokenizer(text)][::-1]

def token_en(text):
  return [token.text for token in spacy_en.tokenizer(text)]

In [0]:
SRC = Field(tokenize = token_de, init_token = '<sos>', eos_token = '<eos>', lower = True)
TRG = Field(tokenize = token_en, init_token = '<sos>', eos_token = '<eos>', lower = True)

In [0]:
train, valid, test = Multi30k.splits(exts = ('.de', '.en'), fields = (SRC, TRG))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 570kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 173kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 165kB/s]


In [0]:
SRC.build_vocab(train, min_freq = 2)
TRG.build_vocab(train, min_freq = 2)

In [0]:
print(len(SRC.vocab), len(TRG.vocab))

7855 5893


In [0]:
de_words = []
for i in range(len(SRC.vocab)):
  de_words.append(SRC.vocab.itos[i])

In [0]:
en_words = []
for i in range(len(TRG.vocab)):
  en_words.append(TRG.vocab.itos[i])

In [0]:
class Encoder(nn.Module):
  def __init__(self, src_dim, emb_dim, hid_dim, num_lay, drop):
    super().__init__()
    
    self.src_dim = src_dim
    self.emb_dim = emb_dim
    self.hid_dim = hid_dim
    self.num_lay = num_lay
    
    self.src2emb = nn.Embedding(src_dim, emb_dim)
    self.emb2hid = nn.LSTM(emb_dim, hid_dim, num_lay, bidirectional = True)
    self.drop = nn.Dropout(drop)
  
  def forward(self, src):
    emb = self.drop(self.src2emb(src))    
    hid, (h, c) = self.emb2hid(emb)
    return h, c

In [0]:
class Decoder(nn.Module):
  def __init__(self, trg_dim, emb_dim, hid_dim, num_lay, drop):
    super().__init__()
    
    self.emb_dim = emb_dim
    self.hid_dim = hid_dim
    self.trg_dim = trg_dim
    self.num_lay = num_lay
    
    self.trg2emb = nn.Embedding(trg_dim, emb_dim)
    self.emb2hid = nn.LSTM(emb_dim, hid_dim, num_lay, bidirectional = True)
    self.hid2trg = nn.Linear(hid_dim * 2, trg_dim)
    self.drop = nn.Dropout(drop)
  
  def forward(self, inp, h, c):
    inp = inp.unsqueeze(0)
    emb = self.drop(self.trg2emb(inp))
    hid, (h, c) = self.emb2hid(emb, (h, c))
    trg = self.hid2trg(hid.squeeze(0))
    return trg, h, c

In [0]:
class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, device):
    super().__init__()
    
    self.encoder = encoder
    self.decoder = decoder
    self.device = device
      
  def forward(self, src, trg, tfr = 0.5): #tfr - teacher forcing ratio
    max_len, batch_size = trg.size()

    outs = torch.zeros(max_len, batch_size, self.decoder.trg_dim).to(self.device)
    h, c = self.encoder(src)
    
    inp = trg[0, :]
    
    for i in range(1, max_len):
      out, h, c = self.decoder(inp, h, c)
      outs[i] = out
      top1 = out.max(1)[1]
      inp = (trg[i] if random.random() < tfr else top1)
  
    return outs

In [0]:
def train_(seq2seq, it, crit, opt, clip, tfr):
  seq2seq.train()
  epoch_loss = 0
  
  for i, batch in enumerate(it):
    src = batch.src
    trg = batch.trg
    
    out = seq2seq(src, trg, tfr)
    
    out = out[1:].view(-1, out.shape[-1])
    trg = trg[1:].view(-1)
    
    loss = crit(out, trg)
    
    opt.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(seq2seq.parameters(), clip)
    opt.step()
    
    epoch_loss += loss.item()
      
  return epoch_loss / len(it)

In [0]:
def eval_(seq2seq, it, crit):
  seq2seq.eval()
  epoch_loss = 0

  with torch.no_grad():
    for i, batch in enumerate(it):
      src = batch.src
      trg = batch.trg

      out = seq2seq(src, trg, 0)
      
      out = out[1:].view(-1, out.shape[-1])
      trg = trg[1:].view(-1)

      loss = crit(out, trg)
      epoch_loss += loss.item()
    
    return epoch_loss / len(it)

In [0]:
src_dim = len(SRC.vocab)
trg_dim = len(TRG.vocab)
src_emb_dim = trg_emb_dim = 128
hid_dim = 512
num_lay = 5
drop = 0.2

batch_size = 128
PAD_IND = TRG.vocab.stoi['<pad>']

it = BucketIterator.splits((train, valid, test), batch_size = batch_size, device = device)
train_it, valid_it, test_it = it

encoder = Encoder(src_dim, src_emb_dim, hid_dim, num_lay, drop)
decoder = Decoder(trg_dim, trg_emb_dim, hid_dim, num_lay, drop)
seq2seq = Seq2Seq(encoder, decoder, device).to(device)

In [0]:
seq2seq

Seq2Seq(
  (encoder): Encoder(
    (src2emb): Embedding(7855, 128)
    (emb2hid): LSTM(128, 512, num_layers=5, bidirectional=True)
    (drop): Dropout(p=0.2, inplace=False)
  )
  (decoder): Decoder(
    (trg2emb): Embedding(5893, 128)
    (emb2hid): LSTM(128, 512, num_layers=5, bidirectional=True)
    (hid2trg): Linear(in_features=1024, out_features=5893, bias=True)
    (drop): Dropout(p=0.2, inplace=False)
  )
)

In [0]:
num_epochs = 50
CLIP = 1
TFR = 0.7

opt = optim.Adam(seq2seq.parameters(), lr = 1e-4)
crit = nn.CrossEntropyLoss(ignore_index=PAD_IND)

best = float('inf')

In [0]:
for epoch in range(num_epochs):
  train_loss = round(train_(seq2seq, train_it, crit, opt, CLIP, TFR), 5)
  valid_loss = round(eval_(seq2seq, valid_it, crit), 5)

  if valid_loss < best:
    best = valid_loss
    torch.save(seq2seq.state_dict(), '/content/gdrive/My Drive/Models/seq2seq_5_bi.pt')

  print('Epoch: {} \n Train Loss {}  Val loss {}:'.format(epoch, train_loss, valid_loss))

Epoch: 0 
 Train Loss 5.34693  Val loss 5.03553:
Epoch: 1 
 Train Loss 4.87516  Val loss 4.90424:
Epoch: 2 
 Train Loss 4.81923  Val loss 4.72618:
Epoch: 3 
 Train Loss 4.70468  Val loss 4.65255:
Epoch: 4 
 Train Loss 4.54333  Val loss 4.61009:
Epoch: 5 
 Train Loss 4.33659  Val loss 4.63377:
Epoch: 6 
 Train Loss 4.15686  Val loss 4.58876:
Epoch: 7 
 Train Loss 4.02162  Val loss 4.54608:
Epoch: 8 
 Train Loss 3.91199  Val loss 4.51854:
Epoch: 9 
 Train Loss 3.81741  Val loss 4.44543:
Epoch: 10 
 Train Loss 3.71075  Val loss 4.36034:
Epoch: 11 
 Train Loss 3.61411  Val loss 4.38712:
Epoch: 12 
 Train Loss 3.52715  Val loss 4.30879:
Epoch: 13 
 Train Loss 3.45426  Val loss 4.25266:
Epoch: 14 
 Train Loss 3.37322  Val loss 4.30223:
Epoch: 15 
 Train Loss 3.30456  Val loss 4.26951:
Epoch: 16 
 Train Loss 3.23515  Val loss 4.14975:
Epoch: 17 
 Train Loss 3.16093  Val loss 4.15013:
Epoch: 18 
 Train Loss 3.10918  Val loss 4.1484:
Epoch: 19 
 Train Loss 3.0437  Val loss 4.08016:
Epoch: 20 
 

KeyboardInterrupt: ignored

In [0]:
seq2seq.load_state_dict(torch.load('/content/gdrive/My Drive/Models/seq2seq_5_bi.pt'))
best = eval_(seq2seq, valid_it, crit)

In [0]:
best

3.4479972422122955

In [0]:
num_epochs = 10
CLIP = 1
TFR = 0.15

opt = optim.Adam(seq2seq.parameters(), lr = 1e-4)
crit = nn.CrossEntropyLoss(ignore_index=PAD_IND)

In [0]:
for epoch in range(num_epochs):
  train_loss = round(train_(seq2seq, train_it, crit, opt, CLIP, TFR), 5)
  valid_loss = round(eval_(seq2seq, valid_it, crit), 5)

  if valid_loss < best:
    best = valid_loss
    torch.save(seq2seq.state_dict(), '/content/gdrive/My Drive/Models/seq2seq_5_bi.pt')

  print('Epoch: {} \n Train Loss {}  Val loss {}:'.format(epoch, train_loss, valid_loss))

Epoch: 0 
 Train Loss 2.58476  Val loss 3.49668:
Epoch: 1 
 Train Loss 2.51347  Val loss 3.46819:
Epoch: 2 
 Train Loss 2.4571  Val loss 3.448:
Epoch: 3 
 Train Loss 2.3821  Val loss 3.48092:
Epoch: 4 
 Train Loss 2.33111  Val loss 3.52004:
Epoch: 5 
 Train Loss 2.26993  Val loss 3.49503:
Epoch: 6 
 Train Loss 2.20341  Val loss 3.51156:
Epoch: 7 
 Train Loss 2.14897  Val loss 3.58274:
Epoch: 8 
 Train Loss 2.0872  Val loss 3.55001:
Epoch: 9 
 Train Loss 2.03006  Val loss 3.58647:


In [0]:
def translate(sent):
  sent = sent.lower()
  sent_vec = [[SRC.vocab.stoi['<eos>']]]
  for token in sent.split():
    sent_vec.append([SRC.vocab.stoi[token]])
  sent_vec.append([SRC.vocab.stoi['<sos>']])

  sent_vec = sent_vec[::-1]

  sent_vec = torch.tensor(sent_vec).to(device)
  print(sent_vec)
  inp = torch.zeros((20, 1)).type(torch.LongTensor).to(device)
  inp += SRC.vocab.stoi['<sos>']
  out = seq2seq(sent_vec, inp, 0)

  for t in out:
    if t[0].max(0)[1] != SRC.vocab.stoi['<eos>']:
      print(TRG.vocab.itos[t[0].max(0)[1]], end = ' ')
    else:
      break

In [0]:
def translate_vec(sent_vec, out_size):
  inp = torch.zeros((out_size, 1)).type(torch.LongTensor).to(device)
  inp += SRC.vocab.stoi['<sos>']
  out = seq2seq(sent_vec, inp, 0)

  s = ''
  for t in out:
    s += (TRG.vocab.itos[t[0].max(0)[1]] + ' ')
  
  return s

In [0]:
def vec2sent(sent_vec, field):
  s = ''
  for t in sent_vec:
   s += (field.vocab.itos[t] + ' ')
  
  return s

In [0]:
from nltk.translate.bleu_score import corpus_bleu

In [0]:
def compute_bleu(inp_batch, out_batch):
  translations = []
  for j in range(inp_batch.shape[1]):
    sent_vec = torch.zeros((inp_batch.shape[0], 1)).type(torch.LongTensor).to(device)

    for i in range(inp_batch.shape[0]):
      sent_vec[i][0] = inp_batch[i][j]
    
    translations.append(translate_vec(sent_vec, out_batch.shape[0]))

  outputs = []
  for j in range(out_batch.shape[1]):
    sent_vec = torch.zeros((out_batch.shape[0], 1)).type(torch.LongTensor).to(device)

    for i in range(out_batch.shape[0]):
      sent_vec[i][0] = out_batch[i][j]
    
    outputs.append(vec2sent(sent_vec, TRG))
  
  return corpus_bleu(outputs, translations)

In [0]:
best_accuracy = 0
worst_accuracy = 1
mean_accuracy = 0
count = 0

In [0]:
for i, batch in enumerate(test_it):
  accuracy = compute_bleu(batch.src, batch.trg)
  mean_accuracy += accuracy
  count += 1

  print(accuracy)
  
  if accuracy > best_accuracy:
    best_accuracy = accuracy

  if accuracy < worst_accuracy:
    worst_accuracy = accuracy

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.6966093116376639
0.7306135901287637
0.7118077027972092
0.687989192339322
0.7010099117293621
0.6833126972407678
0.632492277525701
0.5880911053562271


In [0]:
mean_accuracy /= count
print(best_accuracy)
print(worst_accuracy)
print(mean_accuracy)

0.7306135901287637
0.5880911053562271
0.6789907235943772


In [0]:
compute_bleu(batch_0.src, batch_0.trg)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


60.02218420927817

In [0]:
sent_vec = torch.zeros((32, 1)).type(torch.LongTensor).to(device)
for i in range(len(batch_0.src)):
  sent_vec[i][0] = batch_0.src[i][0]
print(sent_vec)

tensor([[  2],
        [  4],
        [141],
        [ 20],
        [326],
        [337],
        [ 23],
        [220],
        [580],
        [  5],
        [  3],
        [  1],
        [  1],
        [  1],
        [  1],
        [  1],
        [  1],
        [  1],
        [  1],
        [  1],
        [  1],
        [  1],
        [  1],
        [  1],
        [  1],
        [  1],
        [  1],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0]], device='cuda:0')


In [0]:
sent_vec_trg = torch.zeros((32, 1)).type(torch.LongTensor).to(device)
for i in range(len(batch_0.trg)):
  sent_vec_trg[i][0] = batch_0.trg[i][0]
print(sent_vec_trg)

tensor([[   2],
        [   4],
        [ 674],
        [  10],
        [1151],
        [  79],
        [   6],
        [   7],
        [ 168],
        [  12],
        [   4],
        [ 164],
        [   5],
        [   3],
        [   1],
        [   1],
        [   1],
        [   1],
        [   1],
        [   1],
        [   1],
        [   1],
        [   1],
        [   1],
        [   1],
        [   0],
        [   0],
        [   0],
        [   0],
        [   0],
        [   0],
        [   0]], device='cuda:0')


In [0]:
for t in sent_vec:
   print(SRC.vocab.itos[t], end = ' ')
print()
translate_vec(sent_vec)
print()
for t in sent_vec_trg:
   print(TRG.vocab.itos[t], end = ' ')

print()
translate("männer in orangefarbenen westen heben gemeinsam einen gegenstand hoch")

<sos> . rennen im mitten gerade sich befindet läufer ein <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <unk> <unk> <unk> <unk> <unk> 
<unk> a runner is in in in in the middle of a race . 
<sos> a runner is caught running in the middle of a race . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <unk> <unk> <unk> <unk> <unk> <unk> <unk> 
tensor([[  2],
        [119],
        [456],
        [ 19],
        [614],
        [893],
        [573],
        [175],
        [  7],
        [ 30],
        [  3]], device='cuda:0')
<unk> men in orange vests <unk> to to a a . 