In [None]:
import itertools, os, time , datetime
import numpy as np
import spacy
import torch
import torch.nn as nn
use_gpu = torch.cuda.is_available()
from torchtext import data, datasets

In [None]:
def load_embeddings(SRC, TGT, np_src_file, tgt_file):
    """Load pre-trained source and target embedding matrices from ``.npy`` files.

    Args:
        SRC: Source-language field. Accepted for call-site compatibility; not used here.
        TGT: Target-language field. Accepted for call-site compatibility; not used here.
        np_src_file: Path of the ``.npy`` file holding the source embedding matrix.
        tgt_file: Path of the ``.npy`` file holding the target embedding matrix.

    Returns:
        A tuple ``(emb_tr_src, emb_tr_tgt)`` of tensors built with ``torch.from_numpy``,
        so each tensor shares memory with the array that was loaded.
    """
    emb_tr_src = torch.from_numpy(np.load(np_src_file))
    # Bug fix: the parameter is `tgt_file`; the previous body read an undefined
    # `np_tgt_file` and raised NameError on every call.
    emb_tr_tgt = torch.from_numpy(np.load(tgt_file))
    return emb_tr_src, emb_tr_tgt

In [None]:
class Attention(nn.Module):
    """Dot-product attention of decoder outputs over encoder outputs.

    All tensors are sequence-first (the batch is dimension 1). With
    ``bidirectional=True`` the last dimension of the encoder output is split
    into two equal halves that are summed before any scores are computed.
    """

    def __init__(self, pad_token=1, bidirectional=True, h_dim=250):
        super(Attention, self).__init__()
        self.pad_token = pad_token
        self.bidirectional = bidirectional
        self.h_dim = h_dim
        # Normalises over dim 1 of the (batch, src_len, tgt_len) score tensor,
        # i.e. over source positions.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, in_e, out_e, out_d):
        """Return one context vector per decoder position.

        Args:
            in_e: Encoder input; accepted but not used by this implementation.
            out_e: Encoder outputs, ``(src_len, batch, features)``.
            out_d: Decoder outputs, ``(tgt_len, batch, hidden)``.

        Returns:
            Context tensor of shape ``(tgt_len, batch, hidden)``.
        """
        if self.bidirectional:
            src_len, batch = out_e.size(0), out_e.size(1)
            # Split the feature axis into two halves and add them together.
            halves = out_e.contiguous().view(src_len, batch, 2, -1)
            out_e = halves.sum(2).view(src_len, batch, -1)

        # Switch both operands to batch-first so bmm multiplies per batch element.
        keys = out_e.transpose(0, 1)
        queries = out_d.transpose(0, 1)

        scores = keys.bmm(queries.transpose(1, 2))
        weights = self.softmax(scores).transpose(1, 2)

        # Weighted sum of encoder states, returned sequence-first again.
        return weights.bmm(keys).transpose(0, 1)

In [None]:
class EncoderLSTM(nn.Module):
    """Embedding + (optionally bidirectional) LSTM encoder.

    Args:
        embedding: 2-D tensor ``(vocab_size, embedding_size)`` copied into the
            embedding layer as its initial weights.
        h_dim: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout_p: Dropout probability used on the embedded input and between
            LSTM layers. Added with a default so existing positional callers
            keep working; previously the name was read without being defined.
    """

    def __init__(self, embedding, h_dim, num_layers, bidirectional=True, dropout_p=0.0):
        super(EncoderLSTM, self).__init__()
        self.vocab_size, self.embedding_size = embedding.size()
        self.num_layers, self.h_dim = num_layers, h_dim
        self.dropout_p, self.bidirectional = dropout_p, bidirectional

        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        self.embedding.weight.data.copy_(embedding)
        self.lstm = nn.LSTM(self.embedding_size, self.h_dim, self.num_layers,
                            dropout=self.dropout_p, bidirectional=bidirectional)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Encode a sequence-first batch of token ids.

        Args:
            x: Long tensor of token ids, ``(src_len, batch)``.

        Returns:
            ``(memory_bank, h)``: the per-position LSTM outputs and the final
            ``(h_n, c_n)`` state tuple.
        """
        x = self.dropout(self.embedding(x))
        h0 = self.init_hidden(x.size(1))
        memory_bank, h = self.lstm(x, h0)
        return memory_bank, h

    def init_hidden(self, batch_size):
        """Build the all-zero initial ``(h_0, c_0)`` state for ``batch_size`` sequences.

        Each tensor has shape ``(num_layers * num_directions, batch_size, h_dim)``
        and lives on the same device as the embedding weights, so the module works
        wherever it has been moved without consulting a global flag.
        """
        num_directions = 2 if self.bidirectional else 1
        reference = self.embedding.weight
        init = torch.zeros(self.num_layers * num_directions, batch_size, self.h_dim,
                           device=reference.device, dtype=reference.dtype)
        # nn.LSTM expects a tuple; clone so h_0 and c_0 never alias the same storage.
        return init, init.clone()

In [None]:
class DecoderLSTM(nn.Module):
    """Embedding + unidirectional LSTM decoder.

    Args:
        embedding: 2-D tensor ``(vocab_size, embedding_size)`` copied into the
            embedding layer as its initial weights.
        h_dim: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers.
        dropout_p: Dropout probability used on the embedded input and between
            LSTM layers. Added with a default so existing positional callers
            keep working; previously the name was read without being defined.
    """

    def __init__(self, embedding, h_dim, num_layers, dropout_p=0.0):
        super(DecoderLSTM, self).__init__()
        self.vocab_size, self.embedding_size = embedding.size()
        self.num_layers, self.h_dim, self.dropout_p = num_layers, h_dim, dropout_p

        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        self.embedding.weight.data.copy_(embedding)
        self.lstm = nn.LSTM(self.embedding_size, self.h_dim, self.num_layers, dropout=self.dropout_p)
        self.dropout = nn.Dropout(self.dropout_p)

    def forward(self, x, h0):
        """Run the decoder over a sequence-first batch of token ids.

        Args:
            x: Long tensor of token ids, ``(tgt_len, batch)``.
            h0: Initial ``(h_0, c_0)`` state tuple for the LSTM.

        Returns:
            ``(out, h)``: the per-position LSTM outputs and the final state tuple.
        """
        x = self.embedding(x)
        x = self.dropout(x)
        out, h = self.lstm(x, h0)
        return out, h

In [None]:
class Seq2seq(nn.Module):
    """LSTM encoder-decoder with dot-product attention and beam-search decoding.

    Args:
        embedding_src: Source embedding matrix ``(vocab_size_src, emb_dim_src)``.
        embedding_tgt: Target embedding matrix ``(vocab_size_tgt, emb_dim_tgt)``.
        h_dim: Hidden size shared by encoder and decoder.
        num_layers: Number of encoder LSTM layers.
        dropout_p: Dropout probability used in encoder, decoder and output head.
        bi: Whether the encoder is bidirectional.
        tokens_bos_eos_pad_unk: Four token ids, in the order BOS, EOS, PAD, UNK.
    """

    def __init__(self, embedding_src, embedding_tgt, h_dim, num_layers, dropout_p, bi, tokens_bos_eos_pad_unk=[0,1,2,3]):
        super(Seq2seq, self).__init__()

        self.h_dim = h_dim
        self.vocab_size_tgt, self.emb_dim_tgt = embedding_tgt.size()
        # The default list is only unpacked here, never mutated.
        self.bos_token, self.eos_token, self.pad_token, self.unk_token = tokens_bos_eos_pad_unk

        self.encoder = EncoderLSTM(embedding_src, h_dim, num_layers, dropout_p=dropout_p, bidirectional=bi)
        # The decoder is seeded with the encoder's final state, so with a bidirectional
        # encoder it is given twice as many layers to match that state's first dimension.
        self.decoder = DecoderLSTM(embedding_tgt, h_dim, num_layers * 2 if bi else num_layers, dropout_p=dropout_p)
        self.attention = Attention(pad_token=self.pad_token, bidirectional=bi, h_dim=self.h_dim)

        # Output head: [decoder output ; context] -> target embedding size -> vocabulary logits.
        self.linear1 = nn.Linear(2 * self.h_dim, self.emb_dim_tgt)
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(dropout_p)
        self.linear2 = nn.Linear(self.emb_dim_tgt, self.vocab_size_tgt)

        # Tie the output projection to the decoder embedding when the shapes agree.
        if self.decoder.embedding.weight.size() == self.linear2.weight.size():
            self.linear2.weight = self.decoder.embedding.weight

    def _to_model_device(self, tensor):
        """Move ``tensor`` to the device that holds this model's parameters."""
        return tensor.to(next(self.parameters()).device)

    def forward(self, src, tgt):
        """Compute vocabulary logits for every target position.

        Args:
            src: Source token ids, ``(src_len, batch)``.
            tgt: Target token ids fed to the decoder, ``(tgt_len, batch)``.

        Returns:
            Logits of shape ``(tgt_len, batch, vocab_size_tgt)``.
        """
        # Follow the model's own device instead of a global flag, so a CPU model
        # on a CUDA-capable machine (or the reverse) does not hit a device mismatch.
        src = self._to_model_device(src)
        tgt = self._to_model_device(tgt)

        out_e, final_e = self.encoder(src)
        out_d, final_d = self.decoder(tgt, final_e)

        context = self.attention(src, out_e, out_d)
        out_cat = torch.cat((out_d, context), dim=2)

        out = self.linear1(out_cat)
        out = self.dropout(self.tanh(out))
        out = self.linear2(out)
        return out

    def predict(self, src, beam_size=1):
        """Return the highest-scoring hypothesis as a list of int token ids.

        The list starts with the BOS id and ends with the EOS id when decoding
        finished within the length limit. ``beam_size=1`` is greedy decoding.
        """
        beam_outputs = self.beam_search(src, beam_size, max_len=30)
        # beam_search returns hypotheses best-first. The previous `[1][1]` picked the
        # second entry and raised IndexError for beam_size=1.
        best = beam_outputs[0][1]
        return best

    def beam_search(self, src, beam_size, max_len, remove_tokens=[]):
        """Decode ``src`` with beam search; supports a single source sentence.

        Args:
            src: Source token ids, ``(src_len, 1)``.
            beam_size: Number of hypotheses kept after every step.
            max_len: Maximum number of decoding steps.
            remove_tokens: Token ids whose logits are pushed down so that they are
                effectively never generated. Only iterated, never mutated.

        Returns:
            Up to ``beam_size`` tuples ``(log_prob, token_ids, decoder_state)``,
            sorted best-first. ``token_ids`` is a list of Python ints.
        """
        with torch.no_grad():  # inference only; avoids building an autograd graph
            src = self._to_model_device(src)
            outputs_e, states = self.encoder(src)
            init_sent = [self.bos_token]
            # log P(empty prefix) = 0. The offset is shared by every hypothesis, so it
            # never affects ranking; 0.0 avoids losing precision to a huge constant.
            init_lprob = 0.0
            best_candidates = [(init_lprob, init_sent, states)]

            k = beam_size
            for _ in range(max_len):
                candidates = []
                for lprob, sentence, current_state in best_candidates:
                    last_word = sentence[-1]
                    if last_word == self.eos_token:
                        # Finished hypotheses stay in the beam unchanged.
                        candidates.append((lprob, sentence, current_state))
                        continue

                    last_word_input = torch.tensor([[last_word]], dtype=torch.long, device=src.device)
                    outputs_d, new_state = self.decoder(last_word_input, current_state)
                    context = self.attention(src, outputs_e, outputs_d)
                    out_cat = torch.cat((outputs_d, context), dim=2)
                    x = self.linear1(out_cat)
                    x = self.dropout(self.tanh(x))
                    x = self.linear2(x)
                    x = x.detach().reshape(-1).clone()
                    for t in remove_tokens:
                        x[t] = -10e5
                    # Numerically stable replacement for log(exp(x) / sum(exp(x))).
                    lprobs = torch.log_softmax(x, dim=0)
                    top_lprobs, top_indices = torch.topk(lprobs, min(k, lprobs.numel()))
                    for token_lprob, index in zip(top_lprobs.tolist(), top_indices.tolist()):
                        # `index` is a plain int, so sentences compare and index cleanly.
                        candidates.append((lprob + token_lprob, sentence + [index], new_state))

                candidates.sort(key=lambda cand: cand[0], reverse=True)  # best first
                best_candidates = candidates[:k]

            # Keep the documented best-first order even when max_len is 0.
            best_candidates.sort(key=lambda cand: cand[0], reverse=True)
            return best_candidates


In [None]:
def train(train_iter, val_iter, model, criterion, optimizer, num_epochs):
    """Train ``model`` for ``num_epochs`` epochs, validating before each one.

    Args:
        train_iter: Iterable of training batches exposing ``.src`` and ``.trg``.
        val_iter: Iterable of validation batches, passed to ``validate``.
        model: Module called as ``model(src, trg)`` and returning per-position logits.
        criterion: Loss taking ``(N, vocab)`` logits and ``(N,)`` target ids.
        optimizer: Optimizer over the model's parameters.
        num_epochs: Number of passes over ``train_iter``.

    Returns:
        None. Progress is reported with ``print`` only.

    Relies on ``validate``, ``AverageMeter`` (providing ``update`` and ``avg``) and
    the ``use_gpu`` flag being defined in the notebook.
    """
    for epoch in range(num_epochs):

        # Validation runs before the epoch's training pass, so the figure printed for
        # epoch `e` describes the model after `e` completed epochs.
        with torch.no_grad():
            val_loss = validate(val_iter, model, criterion)
            print('Validating Epoch [{e}/{num_e}]\t Average loss: {l:.3f}\t Perplexity: {p:.3f}'.format(
                e=epoch, num_e=num_epochs, l=val_loss, p=torch.FloatTensor([val_loss]).exp().item()))

        model.train()
        losses = AverageMeter()
        for i, batch in enumerate(train_iter):
            src = batch.src.cuda() if use_gpu else batch.src
            trans = batch.trg.cuda() if use_gpu else batch.trg

            model.zero_grad()
            scores = model(src, trans)
            # The logits at position t predict the token at t + 1: drop the last
            # prediction and the first target token so the two line up.
            scores = scores[:-1]
            trans = trans[1:]

            # Flatten (time, batch) into a single axis for the criterion.
            scores = scores.reshape(-1, scores.size(2))
            # Bug fix: this used an undefined `tgt`; the shifted targets live in `trans`.
            trans = trans.reshape(-1)
            loss = criterion(scores, trans)
            loss.backward()
            losses.update(loss.item())
            optimizer.step()

            if i % 1000 == 10:
                print('''Epoch [{e}/{num_e}]\t Batch [{b}/{num_b}]\t Loss: {l:.3f}'''.format(e=epoch+1, num_e=num_epochs, b=i, num_b=len(train_iter), l=losses.avg))

        print('''Epoch [{e}/{num_e}] complete. Loss: {l:.3f}'''.format(e=epoch+1, num_e=num_epochs, l=losses.avg))

In [None]:
def validate(val_iter, model, criterion):
    """Return the mean per-batch loss of ``model`` over ``val_iter``.

    Args:
        val_iter: Iterable of batches exposing ``.src`` and ``.trg``.
        model: Module called as ``model(src, trg)`` and returning per-position logits.
        criterion: Loss taking ``(N, vocab)`` logits and ``(N,)`` target ids.

    Returns:
        ``losses.avg`` after one ``update`` per batch with that batch's loss.

    The model is switched to eval mode and left there. Relies on ``AverageMeter``
    and the ``use_gpu`` flag being defined in the notebook.
    """
    model.eval()
    losses = AverageMeter()
    # Evaluation never needs gradients, whether or not the caller disabled them.
    with torch.no_grad():
        for batch in val_iter:
            # Bug fix: the CPU branch used to evaluate `batch.src` without assigning
            # it, leaving `src` unbound on machines without a GPU.
            src = batch.src.cuda() if use_gpu else batch.src
            tgt = batch.trg.cuda() if use_gpu else batch.trg

            scores = model(src, tgt)
            # Align predictions with the next target token, as in training.
            scores = scores[:-1]
            tgt = tgt[1:]

            scores = scores.view(scores.size(0) * scores.size(1), scores.size(2))
            tgt = tgt.view(scores.size(0))

            loss = criterion(scores, tgt)
            losses.update(loss.item())

    return losses.avg

In [None]:
def predict_from_text(model, input_sentence, SRC, TGT):
    """Translate one space-tokenised sentence and print source and translation.

    Args:
        model: Object exposing ``predict(src, beam_size=...)`` that returns a list of
            target token ids starting with BOS, plus an ``eos_token`` attribute.
        input_sentence: Source sentence; tokens are separated by single spaces.
        SRC: Source field exposing ``vocab.stoi`` and ``vocab.itos``.
        TGT: Target field exposing ``vocab.itos``.

    Returns:
        None. Two lines are printed: ``German: ...`` and ``English: ...``.
    """
    sent_german = input_sentence.split(' ')
    unk_index = SRC.vocab.stoi['<unk>']
    sent_indices = [SRC.vocab.stoi[word] if word in SRC.vocab.stoi else unk_index for word in sent_german]
    # Column vector (src_len, 1): a batch holding a single sentence. No device move
    # is done here because the model's decoding path places the source itself.
    sent = torch.LongTensor(sent_indices).view(-1, 1)
    print('German: ' + ' '.join([SRC.vocab.itos[index] for index in sent_indices]))
    pred = [int(index) for index in model.predict(sent, beam_size=15)]
    # Drop the leading BOS. Drop the trailing token only when it really is EOS: the
    # old `pred[1:-1]` discarded a real word whenever decoding stopped at the
    # length limit without producing EOS.
    words = pred[1:]
    if words and words[-1] == model.eos_token:
        words = words[:-1]
    out = ' '.join([TGT.vocab.itos[index] for index in words])
    print('English: ' + out)

In [None]:

# Load the pre-trained source and target embedding matrices from .npy files.
# NOTE(review): SRC and TGT are not defined in any visible cell — presumably torchtext
# fields built earlier; confirm before running from a fresh kernel.
embedding_src, embedding_tgt = load_embeddings(SRC, TGT, 'emb-13353-de.npy', 'emb-11560-en.npy')

In [None]:

# Ids of the special tokens in the target vocabulary, in BOS, EOS, PAD, UNK order.
tokens = [TGT.vocab.stoi[symbol] for symbol in ['<s>', '</s>', '<pad>', '<unk>']]
model = Seq2seq(
    embedding_src,
    embedding_tgt,
    h_dim=300,
    num_layers=2,
    dropout_p=0.3,
    bi=True,
    tokens_bos_eos_pad_unk=tokens,
)
if use_gpu:
    model = model.cuda()

In [None]:

# Per-class loss weights: every target token counts equally, except the <pad> class,
# which is zeroed so that padded positions contribute nothing to the loss.
weight = torch.ones(len(TGT.vocab))
weight[TGT.vocab.stoi['<pad>']] = 0
if use_gpu:
    weight = weight.cuda()

In [None]:

# Cross-entropy over target-vocabulary logits, using the per-class `weight` tensor.
criterion = nn.CrossEntropyLoss(weight=weight)
# Adam over all model parameters with a learning rate of 2e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=2e-3) 

In [None]:
# Run 100 epochs; `train` validates before each epoch and prints progress.
# NOTE(review): `train_iter` and `val_iter` are not defined in any visible cell —
# confirm they are built earlier in the notebook.
train(train_iter, val_iter, model, criterion, optimizer, 100)

In [None]:
# Plot training vs. validation loss against the epoch number.
# NOTE(review): neither `plt` nor `history` is imported or defined in any visible cell,
# and `train` above returns nothing — this cell looks like it needs a Keras-style
# History object; confirm where these names come from before running.
# NOTE(review): the y-axis label 'Token' does not obviously describe a loss — confirm.
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.ylim([0, max(plt.ylim())])
plt.xlabel('Epoch #')
plt.ylabel('Token')
plt.legend()

In [None]:
# Plot training vs. validation masked accuracy against the epoch number.
# NOTE(review): neither `plt` nor `history` is imported or defined in any visible cell,
# and nothing visible computes 'masked_acc' / 'val_masked_acc' — confirm where these
# come from before running.
# NOTE(review): the y-axis label 'Token' does not obviously describe an accuracy — confirm.
plt.plot(history.history['masked_acc'], label='accuracy')
plt.plot(history.history['val_masked_acc'], label='val_accuracy')
plt.ylim([0, max(plt.ylim())])
plt.xlabel('Epoch #')
plt.ylabel('Token')
plt.legend()

In [None]:
# Restore trained weights from 'model.pkl'.
# `map_location` remaps the stored tensors to the device in use, so a checkpoint saved
# on a GPU still loads on a CPU-only machine (the plain torch.load call failed there).
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
# NOTE(review): no visible cell writes 'model.pkl'; confirm where it is produced.
model.load_state_dict(torch.load('model.pkl', map_location='cuda' if use_gpu else 'cpu'))
model = model.cuda() if use_gpu else model

In [None]:
# Evaluate the restored model on the validation set and report loss and perplexity
# (perplexity is exp of the average loss).
with torch.no_grad():
    val_loss = validate(val_iter, model, criterion)
print(
    'Average loss: {l:.3f}\t Perplexity: {p:.3f}'.format(
        l=val_loss,
        p=torch.FloatTensor([val_loss]).exp().item(),
    )
)

Average loss: 1.865	 Perplexity: 6.459


In [None]:
# Translate one example sentence; tokens must be separated by single spaces, so
# 'Berge,' (with the comma attached) is looked up as a single word.
# NOTE(review): the name `input` shadows the builtin `input()` for the rest of the
# kernel session — consider renaming if no other cell depends on it.
input = "Ich kenne nur Berge, ich bleibe in den Bergen und ich liebe die Berge ."
predict_from_text(model, input, SRC, TGT)

German: Ich kenne nur <unk> ich bleibe in den Bergen und ich liebe die Berge .
English: I only know I 'm staying in the hills , and I love the mountains .
