In [50]:
import re
import copy
from collections import Counter
import numpy as np
import time
import math

import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.autograd import Variable
import gc

In [19]:
device = torch.device("cuda:0")

In [20]:
PAD = 0
SOS = 1
EOS = 2
UNK = 3

In [1]:
en_path = "dataset/europarl-v7.es-en.en"
es_path = "dataset/europarl-v7.es-en.es"
with open(en_path, 'r', encoding='utf-8') as f:
    en_str = f.read().strip().lower()
with open(es_path, 'r', encoding='utf-8') as f:
    es_str = f.read().strip().lower()

In [2]:
en_lines = en_str.split('\n')
es_lines = es_str.split('\n')

In [3]:
from sklearn.model_selection import train_test_split
_, en_small, _, es_small = train_test_split(en_lines, es_lines, test_size=0.1)

In [6]:
en_small_train, en_small_test, es_small_train, es_small_test = train_test_split(en_small, es_small, test_size=0.3)

In [11]:
with open("dataset/europarl-v7small.es-en-train.en", 'w', encoding='utf-8') as f:
    for line in en_small_train:
        f.write(line + '\n')
with open("dataset/europarl-v7small.es-en-train.es", 'w', encoding='utf-8') as f:
    for line in es_small_train:
        f.write(line + '\n')
with open("dataset/europarl-v7small.es-en-test.en", 'w', encoding='utf-8') as f:
    for line in en_small_test:
        f.write(line + '\n')
with open("dataset/europarl-v7small.es-en-test.es", 'w', encoding='utf-8') as f:
    for line in es_small_test:
        f.write(line + '\n')

# Dataset

In [21]:
class EnEsTranslationDataset():
    def __init__(self, en_path, es_path, max_len=50):
        self.max_len = max_len
        self.en_lines, self.es_lines = self.preprocess_data(en_path, es_path)
        self.en_vocab = None
        self.en_word2index = None
        self.en_index2word = None
        self.es_vocab = None
        self.es_word2index = None
        self.es_index2word = None
        self.en_data = None
        self.es_data = None
        
    def preprocess_data(self, en_path, es_path):
        print("Reading from file...")
        with open(en_path, 'r', encoding='utf-8') as f:
            en_str = f.read().strip().lower()
        with open(es_path, 'r', encoding='utf-8') as f:
            es_str = f.read().strip().lower()
            
        print("Adding space between punctuations...")
        en_str = self.add_space_between_punctuation(en_str)
        es_str = self.add_space_between_punctuation(es_str)
        
        print("Replacing numbers with <NUM> token...")
        en_str = self.remove_numbers(en_str)
        es_str = self.remove_numbers(es_str)
        
        en_lines, es_lines = en_str.split('\n'), es_str.split('\n')
        
        print("Removing lines...")
        en_lines, es_lines = self.remove_lines(en_lines, es_lines)
        
        return en_lines, es_lines
        
    def add_space_between_punctuation(self, data_str):
        '''Add space between a punctuation and word to tokenize punctuations separately'''
        data_str = re.sub(r'(\w)([\.\,\!\?\:\;\'\"\)\(\¡\¿\-])', r'\1 \2', data_str)
        data_str = re.sub(r'([\.\,\!\?\:\;\'\"\)\(\¡\¿\-])(\w)', r'\1 \2', data_str)
        return data_str
    
    def remove_numbers(self, data_str):
        '''replace all numbers to a <NUM> token'''
        data_str = re.sub(r'\b\d+\b', "<NUM>", data_str)
        return data_str
        
    def remove_lines(self, l1, l2):
        '''remove lines that are too short or dont end with .!?'''
        indices = []
        for i, line in enumerate(l1):
            if len(line.split()) <= 2:
                indices.append(i)
            elif line[-1] not in set(".!?"):
                indices.append(i)
        for i, line in enumerate(l2):
            if len(line.split()) <= 2 or len(line.split()) > self.max_len:
                indices.append(i)
            elif line[-1] not in set(".!?"):
                indices.append(i)
        indices = sorted(list(set(indices)), reverse=True)
        for i in indices:
            del l1[i]
            del l2[i]
        return l1, l2

    def init_with_new_maps(self):
        print("Generating vocab...")
        self.en_vocab = self.generate_vocab(self.en_lines)
        self.es_vocab = self.generate_vocab(self.es_lines)
        print("Generating maps...")
        self.en_word2index, self.en_index2word = self.generate_maps(self.en_vocab)
        self.es_word2index, self.es_index2word = self.generate_maps(self.es_vocab)
        print("Converting lines to indices...")
        self.en_data = self.convert_lines(self.en_lines, self.en_word2index)
        self.es_data = self.convert_lines(self.es_lines, self.es_word2index)
    
    def init_using_existing_maps(self, en_vocab, en_word2index, en_index2word, es_vocab, es_word2index, es_index2word):
        self.en_vocab = en_vocab
        self.en_word2index = en_word2index
        self.en_index2word = en_index2word
        self.es_vocab = es_vocab
        self.es_word2index = es_word2index
        self.es_index2word = es_index2word
        print("Converting lines to indices...")
        self.en_data = self.convert_lines(self.en_lines, self.en_word2index)
        self.es_data = self.convert_lines(self.es_lines, self.es_word2index)
    
    def generate_vocab(self, data, min_freq=20):
        vocab_cnt = Counter()
        for line in data:
            for word in line.split():
                vocab_cnt[word] += 1
        vocab = [word for word in vocab_cnt if vocab_cnt[word] > min_freq]
        return vocab
    
    def generate_maps(self, vocab):
        word2index = {'PAD':PAD, 'SOS':SOS, 'EOS':EOS, 'UNK':UNK}
        index2word = {PAD:'PAD', SOS:'SOS', EOS:'EOS', UNK:'UNK'}
        for i, w in enumerate(vocab):
            word2index[w] = i + 4
            index2word[i+4] = w
        return word2index, index2word
    
    def sentence2index(self, s, _map):
        out = []
        for w in s.split():
            if w in _map:
                out.append(_map[w])
            else:
                out.append(_map['UNK'])
        return out
    
    def convert_lines(self, lines, _map):
        out = []
        for line in lines:
            out.append(self.sentence2index(line, _map))
        return out
    
    def __getitem__(self, idx):
        return self.en_data[idx], self.es_data[idx]
    
    def __len__(self):
        return len(self.en_data)

In [22]:
def padding_collate_fn(batch):
    batch_size = len(batch)
    # Get the max length of an input sequence (each item is an input seq and a label)
    max_length1 = max([len(item[1]) for item in batch])
    max_length2 = max([len(item[0]) for item in batch]) + 1
    # Its weird but the first item is a tuple not a tensor. 
    # print('data',[item[0][0].shape for item in batch])
    src = torch.zeros((batch_size, max_length1), dtype=torch.float)
    trg = torch.zeros((batch_size, max_length2), dtype=torch.float)
    trg_y = torch.zeros((batch_size, max_length2), dtype=torch.float)
    
    for i, seq in enumerate(batch):
        trg_seq = seq[0].copy()
        trg_seq.insert(0, SOS)
        trg_y_seq = seq[0].copy()
        trg_y_seq.append(EOS)
        src[i, :len(seq[1])] = torch.tensor(seq[1])
        trg[i, :len(seq[0])+1] = torch.tensor(trg_seq)
        trg_y[i, :len(seq[0])+1] = torch.tensor(trg_y_seq)
    
    src = src.long()
    trg = trg.long()
    trg_y = trg_y.long()
    
    src_key_padding_mask = (src == PAD)
    trg_key_padding_mask = (trg == PAD)
    
    return src, trg, trg_y, src_key_padding_mask, trg_key_padding_mask

def index2sentence(ind, _map):
    out = []
    for i in ind:
        if i == PAD: continue;
        out.append(_map[i])
    return ' '.join(out)

# Model

In [51]:
class Generator(nn.Module):
    "Define standard linear + softmax generation step."
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
    
class TransformerModel(nn.Module):
    def __init__(self, vocab1, vocab2, d_model=256, nhead=8, num_layers=6, dim_feedforward=1024, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embedding1 = nn.Embedding(vocab1, d_model)
        self.embedding2 = nn.Embedding(vocab2, d_model)
        self.pe = PositionalEncoding(d_model, dropout)
        self.transformer = nn.Transformer(
            d_model=d_model, 
            nhead=nhead, 
            num_encoder_layers=num_layers, 
            num_decoder_layers=num_layers, 
            dim_feedforward=dim_feedforward, 
            dropout=dropout
        )
        self.generator = Generator(d_model, vocab2)
    
    def forward(self, src, trg, trg_mask, skpm=None, tkpm=None):
        src_emb = self.embedding1(src).permute(1, 0, 2) * math.sqrt(self.d_model)
        src_emb = self.pe(src_emb)
        trg_emb = self.embedding2(trg).permute(1, 0, 2) * math.sqrt(self.d_model)
        trg_emb = self.pe(trg_emb)
        out = self.transformer(
            src_emb, trg_emb, 
            tgt_mask=trg_mask, 
            src_key_padding_mask=skpm, 
            tgt_key_padding_mask=tkpm, 
            memory_key_padding_mask=skpm
        )
        return self.generator(out)
    
    def greedy_decode(self, src, max_len=50):
        src = torch.LongTensor([i for i in src if i != 0]).to(device)
        src = src.unsqueeze(0)
        src_emb = self.embedding1(src).permute(1, 0, 2)
        memory = self.transformer.encoder(src_emb)
        ys = torch.ones(1, 1).fill_(SOS).type_as(src.data).to(device)
        for i in range(max_len-1):
            ys = Variable(ys)
            ys_emb = self.embedding2(ys).permute(1, 0, 2)
            out = self.transformer.decoder(ys_emb, memory)
            prob = self.generator(out[:, -1])
            _, next_word = torch.max(prob, dim = 1)
            next_word = next_word.data[0]
            ys = torch.cat([ys, 
                            torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
            if next_word.item() == EOS:
                break
        return ys[0]
        

# Optimizer

In [24]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
    
    def zero_grad(self):
        self.optimizer.zero_grad()
        
def get_std_opt(model, warmup):
    return NoamOpt(model.d_model, 2, warmup,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

# Loss

In [25]:
class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(reduction="sum")
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None
        
    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, Variable(true_dist, requires_grad=False))

def ComputeLoss(x, y, norm, criterion):
    reconstruction_loss = criterion(x.contiguous().view(-1, x.size(-1)), 
                                    y.contiguous().view(-1)) / norm
    loss = reconstruction_loss
    return loss

# Training Loop

In [52]:
def run_instance(batch, model, criterion, optimizer):
    src, trg, trg_y, skpm, tkpm = batch
    ntokens = (trg_y != PAD).data.sum()
    trg_mask = model.transformer.generate_square_subsequent_mask(trg.size(1))
    src, trg, trg_y, trg_mask, skpm, tkpm = src.to(device), trg.to(device), trg_y.to(device), trg_mask.to(device), skpm.to(device), tkpm.to(device)
    
    output = model(src, trg, trg_mask)
    loss = ComputeLoss(output, trg_y, ntokens, criterion)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    return loss

def train(dataloader, model, criterion, optimizer, index2word1, index2word2, num_epochs, print_every=100):
    history = []
    start = time.time()
    for e in range(num_epochs):
        temp_history = []
        for i, batch in enumerate(dataloader):
            loss = run_instance(batch, model, criterion, optimizer)
            temp_history.append(loss.item())
            if i % print_every == 0:                
                model.eval()
                decoded = model.greedy_decode(batch[0][0])
                model.train()
                checkpoint = time.time()
                history.append(np.mean(temp_history))
                elapsed = checkpoint - start
                remaining = (elapsed / (i+1)) * (len(dataloader) - (i+1))
                print_str1 = "Epoch: {}, Iteration: {}, loss: {:.4f}, elapsed: {:.2f}, remaining: {:.2f}"\
                             .format(e, i, np.mean(temp_history), elapsed, remaining)
                l1 = (batch[0][0] != 0).sum().item()
                print_str2 = "Input: " + index2sentence(batch[0][0][:l1].tolist(), index2word1)
                print_str3 = "Output: " + index2sentence(decoded[1:].tolist(), index2word2)
                l2 = (batch[2][0] != 0).sum().item()
                print_str4 = "Target: " + index2sentence(batch[2][0][:l2].tolist(), index2word2)
                temp_history = []
                
                print(print_str1)
                print(print_str2)
                print(print_str3)
                print(print_str4)
                print()
                
            del batch
            gc.collect()
    return model, history

In [29]:
en_train_path = "dataset/europarl-v7small.es-en-train.en"
es_train_path = "dataset/europarl-v7small.es-en-train.es"
train_dataset = EnEsTranslationDataset(en_train_path, es_train_path)
train_dataset.init_with_new_maps()

Reading from file...
Adding space between punctuations...
Replacing numbers with <NUM> token...
Removing lines...
Generating vocab...
Generating maps...
Converting lines to indices...


In [30]:
print("Number of lines:", len(train_dataset))
print("EN vocab size: ", len(train_dataset.en_vocab))
print("ES vocab size: ", len(train_dataset.es_vocab))

Number of lines: 118460
EN vocab size:  5887
ES vocab size:  7035


In [53]:
batch_size = 64
num_epochs = 10
num_layers = 6

In [54]:
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    pin_memory=True,
    collate_fn=padding_collate_fn,
    shuffle=True
)
vocab1_size = len(train_dataset.es_word2index)
vocab2_size = len(train_dataset.en_word2index)

In [59]:
model = TransformerModel(vocab1_size, vocab2_size, num_layers=num_layers).to(device)
criterion = LabelSmoothing(size=vocab2_size, padding_idx=PAD, smoothing=0.1)
optimizer = get_std_opt(model, 2000)

In [60]:
train(train_loader, model, criterion, optimizer, train_dataset.es_index2word, train_dataset.en_index2word, num_epochs)

Epoch: 0, Iteration: 0, loss: 7.6250, elapsed: 0.83, remaining: 1537.87
Input: pero la responsabilidad final de la coordinación recae en la osce .
Output: , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,
Target: but the ultimate responsibility for coordination rests with the osce . EOS

Epoch: 0, Iteration: 100, loss: 5.5542, elapsed: 41.07, remaining: 711.63
Input: ¿ no deberían UNK estas UNK específicas en un momento en el que la unión está trabajando en un sistema de financiación que se centra en el mercado interior ?
Output: the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
Target: should these specific UNK not be consolidated at a time when the union is working on a funding system focused on the internal market ? EOS

Epoch: 0, Iteration: 200, loss: 5.3002, elapsed: 81.63, remaining: 670.1

KeyboardInterrupt: 