In [None]:
! pip install OpenNMT-py
! pip install rouge

In [None]:
import os
import re
import copy
import math
import spacy
import torch
import pyonmttok
import json
from shutil import copyfile
from tqdm import tqdm
import numpy as np
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [None]:
!nvidia-smi

Tue Jun 16 12:13:54 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8    26W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

#### Data preprocess

In [None]:
# Mount Google Drive at /content/drive; the cells below read and write files under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Copy the BPE model file from Drive into /content (copyfile returns the destination path).
copyfile('/content/drive/My Drive/TransformerLSH/model-50k_with_joiner', '/content/model-50k_with_joiner')

'/content/model-50k_with_joiner'

In [None]:
# Reserved vocabulary ids
PAD_token = 0  # filler used to pad short sequences
SOS_token = 1  # marks the start of a sentence
EOS_token = 2  # marks the end of a sentence


class Voc:
    """Growing two-way mapping between tokens and integer ids.

    The three reserved tokens ("PAD", "SOS", "EOS") are registered on
    construction; every other token receives the next free id the first
    time it is added.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {"PAD": PAD_token, "SOS": SOS_token, "EOS": EOS_token}
        self.index2word = {index: token for token, index in self.word2index.items()}
        self.num_words = len(self.word2index)  # the three reserved tokens

    def addSentence(self, sentence):
        """Register every token of `sentence`, split on single spaces.

        No stripping is done: a trailing newline stays attached to the
        last token.
        """
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Assign the next free id to `word`; already-known words are left alone."""
        if word in self.word2index:
            return
        new_index = self.num_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.num_words = new_index + 1


vocab = Voc('News')

In [None]:
TRAIN_TEST_DATASET_LEN = 21000

# Feed the first TRAIN_TEST_DATASET_LEN lines of the texts file, then of the
# labels file, into the shared vocabulary (same order as reading them one by one).
for corpus_path in ('/content/drive/My Drive/texts_tokenized_val',
                    '/content/drive/My Drive/labels_tokenized_val'):
    with open(corpus_path) as corpus_file:
        lines = corpus_file.readlines()[:TRAIN_TEST_DATASET_LEN]

    for line in tqdm(lines):
        vocab.addSentence(line)

100%|██████████| 21000/21000 [00:03<00:00, 6882.98it/s]
100%|██████████| 21000/21000 [00:00<00:00, 164306.08it/s]


In [None]:
class NLPDataset(Dataset):
    """Dataset backed by a folder of JSON sample files named 1, 2, 3, ...

    Each file holds one JSON object on its first line; only its 'text' and
    'label' entries are returned (as numpy arrays).
    """

    def __init__(self, samples_dir):
        self.samples_dir = samples_dir

    def __len__(self):
        # The dataset size is the number of entries currently in the folder.
        entries = os.listdir(self.samples_dir)
        return len(entries)

    def __getitem__(self, idx):
        index = idx.tolist() if torch.is_tensor(idx) else idx

        # Files are numbered from 1 while dataset indices start at 0.
        sample_path = os.path.join(self.samples_dir, str(index + 1))
        with open(sample_path) as handle:
            record = json.loads(handle.readline())

        return {
            'text': np.array(record['text']),
            'label': np.array(record['label']),
        }

In [None]:
# max label length = 36
# 99.7% of texts' length < 4750

In [None]:
# import shutil
# shutil.rmtree('samples')

In [None]:
# Create the output folder. exist_ok keeps this cell re-runnable: os.mkdir raised
# FileExistsError on a second run. NOTE: files left over from an earlier, larger
# run are not removed and would still be counted by NLPDataset.__len__.
os.makedirs('samples', exist_ok=True)

DATASET_LEN = 20000
TEXT_MAX_LENGTH = 2048
LABEL_MAX_LENGTH = 32

with open('/content/drive/My Drive/texts_tokenized_val') as f:
    lines = f.readlines()[:DATASET_LEN]

with open('/content/drive/My Drive/labels_tokenized_val') as f:
    label_lines = f.readlines()[:DATASET_LEN]

# One JSON file per (text, label) pair, numbered from 1 as NLPDataset expects.
for name, (text, label) in enumerate(tqdm(zip(lines, label_lines)), start=1):
    # Source text: truncate or right-pad with "PAD" to exactly TEXT_MAX_LENGTH tokens.
    text = text.split(' ')[:TEXT_MAX_LENGTH]
    text.extend(["PAD"] * (TEXT_MAX_LENGTH - len(text)))

    # Label: "SOS" + tokens, cut so that "EOS" always fits, then right-padded.
    # The mask is 1 on real tokens (including SOS/EOS) and 0 on padding.
    label = (["SOS"] + label.split(' '))[:LABEL_MAX_LENGTH - 1] + ["EOS"]
    mask = [1] * len(label)
    n_pad = LABEL_MAX_LENGTH - len(label)
    label.extend(["PAD"] * n_pad)
    mask.extend([0] * n_pad)

    output = {
        'text': [vocab.word2index[token] for token in text],
        'mask': mask,
        'label': [vocab.word2index[token] for token in label],
    }

    with open('samples/{}'.format(name), 'w') as text_labels_tokens_file:
        json.dump(output, text_labels_tokens_file)

20000it [00:43, 455.48it/s]


In [None]:
# Create the output folder. exist_ok keeps this cell re-runnable: os.mkdir raised
# FileExistsError on a second run. NOTE: files left over from an earlier, larger
# run are not removed and would still be counted by NLPDataset.__len__.
os.makedirs('test_samples', exist_ok=True)

TEST_DATASET_LEN = 1000
TEXT_MAX_LENGTH = 2048
LABEL_MAX_LENGTH = 32

# The test split is the TEST_DATASET_LEN lines that follow the first DATASET_LEN lines.
with open('/content/drive/My Drive/texts_tokenized_val') as f:
    lines = f.readlines()[DATASET_LEN:DATASET_LEN + TEST_DATASET_LEN]

with open('/content/drive/My Drive/labels_tokenized_val') as f:
    label_lines = f.readlines()[DATASET_LEN:DATASET_LEN + TEST_DATASET_LEN]

# One JSON file per (text, label) pair, numbered from 1 as NLPDataset expects.
for name, (text, label) in enumerate(tqdm(zip(lines, label_lines)), start=1):
    # Source text: truncate or right-pad with "PAD" to exactly TEXT_MAX_LENGTH tokens.
    text = text.split(' ')[:TEXT_MAX_LENGTH]
    text.extend(["PAD"] * (TEXT_MAX_LENGTH - len(text)))

    # Label: "SOS" + tokens, cut so that "EOS" always fits, then right-padded.
    # The mask is 1 on real tokens (including SOS/EOS) and 0 on padding.
    label = (["SOS"] + label.split(' '))[:LABEL_MAX_LENGTH - 1] + ["EOS"]
    mask = [1] * len(label)
    n_pad = LABEL_MAX_LENGTH - len(label)
    label.extend(["PAD"] * n_pad)
    mask.extend([0] * n_pad)

    output = {
        'text': [vocab.word2index[token] for token in text],
        'mask': mask,
        'label': [vocab.word2index[token] for token in label],
    }

    with open('test_samples/{}'.format(name), 'w') as text_labels_tokens_file:
        json.dump(output, text_labels_tokens_file)

1000it [00:02, 445.20it/s]


In [None]:
# Wrap the two sample folders written above as datasets.
dataset = NLPDataset('samples')
test_dataset = NLPDataset('test_samples')
# Batches of 4 loaded by 4 worker processes; only the training loader shuffles.
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=4)
# Displayed as the cell output: number of entries in the vocabulary.
len(vocab.index2word)

10573

#### Adafactor optimizer

In [None]:
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py

import math
import torch
import torch.optim


class Adafactor(torch.optim.Optimizer):
    """Implements Adafactor algorithm.
    This implementation is based on:
    `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost`
    (see https://arxiv.org/abs/1804.04235)
    Note that this optimizer internally adjusts the learning rate
    depending on the *scale_parameter*, *relative_step* and
    *warmup_init* options. To use a manual (external) learning rate
    schedule you should set `scale_parameter=False` and
    `relative_step=False`.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): external learning rate (default: None)
        eps (tuple[float, float]): regularization constants for square gradient
            and parameter scale respectively (default: (1e-30, 1e-3))
        clip_threshold (float): threshold of root mean square of
            final gradient update (default: 1.0)
        decay_rate (float): coefficient used to compute running averages of square
            gradient (default: -0.8)
        beta1 (float): coefficient used for computing running averages of gradient
            (default: None)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        scale_parameter (bool): if True, learning rate is scaled by root mean square of
            parameter (default: True)
        relative_step (bool): if True, time-dependent learning rate is computed
            instead of external learning rate (default: True)
        warmup_init (bool): time-dependent learning rate computation depends on
            whether warm-up initialization is being used (default: False)
    """

    def __init__(self, params, lr=None, eps=(1e-30, 1e-3), clip_threshold=1.0,
                 decay_rate=-0.8, beta1=None, weight_decay=0.0, scale_parameter=True,
                 relative_step=True, warmup_init=False):
        if lr is not None and relative_step:
            raise ValueError('Cannot combine manual lr and relative_step options')
        if warmup_init and not relative_step:
            raise ValueError('warmup_init requires relative_step=True')

        defaults = dict(lr=lr, eps=eps, clip_threshold=clip_threshold, decay_rate=decay_rate,
                        beta1=beta1, weight_decay=weight_decay, scale_parameter=scale_parameter,
                        relative_step=relative_step, warmup_init=warmup_init)
        super(Adafactor, self).__init__(params, defaults)

    # The two properties below are not read anywhere in this notebook; they are
    # presumably capability flags consumed by the fairseq trainer this class was
    # copied from.
    @property
    def supports_memory_efficient_fp16(self):
        return True

    @property
    def supports_flat_params(self):
        return False

    def _get_lr(self, param_group, param_state):
        """Return the effective step size for one parameter.

        With ``relative_step`` the base step is ``min(min_step, 1/sqrt(step))``,
        where ``min_step`` is ``1e-6 * step`` under ``warmup_init`` and ``1e-2``
        otherwise; without it the base step is the group's external ``lr``.
        With ``scale_parameter`` the base step is multiplied by
        ``max(eps[1], RMS)`` of the parameter.
        """
        rel_step_sz = param_group['lr']
        if param_group['relative_step']:
            min_step = 1e-6 * param_state['step'] if param_group['warmup_init'] else 1e-2
            rel_step_sz = min(min_step, 1.0/math.sqrt(param_state['step']))
        param_scale = 1.0
        if param_group['scale_parameter']:
            param_scale = max(param_group['eps'][1], param_state['RMS'])
        return param_scale * rel_step_sz

    def _get_options(self, param_group, param_shape):
        """Return ``(factored, use_first_moment)`` for a parameter.

        The second moment is factored for parameters with at least two
        dimensions; a first-moment average is kept whenever ``beta1`` is not
        None (``beta1=0`` therefore still allocates it).
        """
        factored = len(param_shape) >= 2
        use_first_moment = param_group['beta1'] is not None
        return factored, use_first_moment

    def _rms(self, tensor):
        """Root mean square of all elements of ``tensor``."""
        return tensor.norm(2) / (tensor.numel() ** 0.5)

    def _approx_sq_grad(self, exp_avg_sq_row, exp_avg_sq_col, output):
        """Write the factored inverse-root second-moment estimate into ``output``.

        ``output`` receives the outer product of the row factor (row statistics
        normalised by their mean, then rsqrt) and the column factor (rsqrt of
        the column statistics).
        """
        r_factor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1).unsqueeze(-1)).rsqrt_().unsqueeze(-1)
        c_factor = exp_avg_sq_col.unsqueeze(-2).rsqrt()
        torch.mul(r_factor, c_factor, out=output)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Returns:
            The value returned by ``closure``, or None when no closure is given.
        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                # All arithmetic is done on fp32 copies of the gradient and parameter.
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('Adafactor does not support sparse gradients.')

                state = self.state[p]
                grad_shape = grad.shape

                factored, use_first_moment = self._get_options(group, grad_shape)
                # State Initialization
                if len(state) == 0:
                    state['step'] = 0

                    if use_first_moment:
                        # Exponential moving average of gradient values
                        state['exp_avg'] = torch.zeros_like(grad)
                    if factored:
                        # Row/column statistics, allocated directly with grad's dtype and device.
                        state['exp_avg_sq_row'] = grad.new_zeros(grad_shape[:-1])
                        state['exp_avg_sq_col'] = grad.new_zeros(grad_shape[:-2] + grad_shape[-1:])
                    else:
                        state['exp_avg_sq'] = torch.zeros_like(grad)

                    state['RMS'] = 0
                else:
                    # Existing state (e.g. restored from a checkpoint) is cast to match grad.
                    if use_first_moment:
                        state['exp_avg'] = state['exp_avg'].type_as(grad)
                    if factored:
                        state['exp_avg_sq_row'] = state['exp_avg_sq_row'].type_as(grad)
                        state['exp_avg_sq_col'] = state['exp_avg_sq_col'].type_as(grad)
                    else:
                        state['exp_avg_sq'] = state['exp_avg_sq'].type_as(grad)

                p_data_fp32 = p.data.float()

                state['step'] += 1
                state['RMS'] = self._rms(p_data_fp32)
                # Keep the effective step size in a local variable. Writing it back
                # into group['lr'] made _get_lr re-read an already scaled value as
                # the base lr for the next parameter when relative_step=False, so
                # the parameter scale compounded across parameters and steps.
                lr = self._get_lr(group, state)

                # Second-moment decay grows with the step count: 1 - step ** decay_rate.
                beta2t = 1.0 - math.pow(state['step'], group['decay_rate'])
                update = (grad**2) + group['eps'][0]
                if factored:
                    exp_avg_sq_row = state['exp_avg_sq_row']
                    exp_avg_sq_col = state['exp_avg_sq_col']

                    exp_avg_sq_row.mul_(beta2t).add_(update.mean(dim=-1), alpha=1.0 - beta2t)
                    exp_avg_sq_col.mul_(beta2t).add_(update.mean(dim=-2), alpha=1.0 - beta2t)

                    # Approximation of exponential moving average of square of gradient
                    self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col, update)
                    update.mul_(grad)
                else:
                    exp_avg_sq = state['exp_avg_sq']

                    exp_avg_sq.mul_(beta2t).add_(update, alpha=1.0 - beta2t)
                    torch.rsqrt(exp_avg_sq, out=update).mul_(grad)

                # Clip so the RMS of the update does not exceed clip_threshold, then scale.
                update.div_(max(1.0, self._rms(update) / group['clip_threshold']))
                update.mul_(lr)

                if use_first_moment:
                    exp_avg = state['exp_avg']
                    exp_avg.mul_(group['beta1']).add_(update, alpha=1 - group['beta1'])
                    update = exp_avg

                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * lr)

                p_data_fp32.add_(-update)

                # TODO: remove check once pyTorch avoids a copy for this case
                if p.data_ptr() != p_data_fp32.data_ptr():
                    p.data.copy_(p_data_fp32)

        return loss

#### Pytorch Transformer

In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence-first input, then applies dropout.

    Args:
        d_model: embedding size; both even and odd values are supported.
        dropout: dropout probability applied after the encodings are added.
        max_len: number of positions precomputed in the `pe` buffer.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns;
        # slicing avoids the shape mismatch the unsliced assignment raised.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over the batch axis.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings of the first x.size(0) positions to x and apply dropout."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
class TransformerModel(nn.Module):
    """Encoder-decoder built on nn.Transformer with separate source/target embeddings.

    Both embeddings go through the same positional encoder; decoder outputs
    are projected to target-vocabulary logits.
    """

    def __init__(self, src_vocab, trg_vocab, d_model=512, heads=8, nlayers=6, d_ff=1024, dropout=0.1):
        super().__init__()
        self.enc_embeder = nn.Embedding(num_embeddings=src_vocab, embedding_dim=d_model)
        self.dec_embeder = nn.Embedding(num_embeddings=trg_vocab, embedding_dim=d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        # The same layer count is used for the encoder and the decoder stack.
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=heads,
            num_encoder_layers=nlayers,
            num_decoder_layers=nlayers,
            dim_feedforward=d_ff,
            dropout=dropout,
        )
        self.output = nn.Linear(in_features=d_model, out_features=trg_vocab)

    def forward(self, src, tgt, src_pad_mask=None, tgt_pad_mask=None, tgt_mask=None):
        """Return target-vocabulary logits for every position of `tgt`.

        The source padding mask is used both inside the encoder and as the
        memory mask seen by the decoder.
        """
        encoder_input = self.pos_encoder(self.enc_embeder(src))
        decoder_input = self.pos_encoder(self.dec_embeder(tgt))
        hidden = self.transformer(
            encoder_input,
            decoder_input,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_pad_mask,
            tgt_key_padding_mask=tgt_pad_mask,
            memory_key_padding_mask=src_pad_mask,
        )
        return self.output(hidden)

In [None]:
def make_target_mask(targets):
    """Build the additive causal ("no-peek") mask for a sequence-first target.

    Args:
        targets: tensor whose first dimension is the target sequence length.

    Returns:
        A float32 (size, size) tensor on the same device as `targets`, holding
        -inf strictly above the diagonal and 0 elsewhere, so position i can
        only attend to positions <= i.
    """
    size = targets.size(0)
    # Built directly in torch as float32: the numpy route produced a float64
    # CPU tensor that did not match the model's float32 activations.
    blocked = torch.full((size, size), float('-inf'), dtype=torch.float32, device=targets.device)
    return torch.triu(blocked, diagonal=1)

In [None]:
import math
import time

PAD_INDEX = 0

def train(model, iterator, optimizer, criterion, clip=5):
    """Run one training epoch with teacher forcing and return the mean batch loss.

    Each batch supplies batch-first 'text' and 'label' index tensors; both are
    moved to the GPU and transposed to sequence-first before the forward pass.
    Gradients are clipped to a total norm of `clip` before every optimizer step.
    """
    model.train()
    running_loss = 0

    for _step, batch in tqdm(enumerate(iterator)):
        source = batch['text'].cuda()
        target = batch['label'].cuda()

        # Padding masks are taken while the tensors are still batch-first.
        source_padding = source == PAD_INDEX
        target_padding = target == PAD_INDEX

        source_seq_first = source.transpose(0, 1)
        target_seq_first = target.transpose(0, 1)
        causal_mask = make_target_mask(target_seq_first).cuda()

        optimizer.zero_grad()

        # Drop the last time step: position t is scored against token t + 1.
        logits = model(source_seq_first, target_seq_first,
                       source_padding, target_padding, causal_mask)[:-1]
        vocab_dim = logits.size(-1)
        logits = logits.transpose(0, 1).contiguous().view(-1, vocab_dim)
        gold = target[:, 1:].flatten()

        loss = criterion(logits, gold)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)


def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (seconds) into whole minutes and seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
# Fallback cut-off used by validate_model when a prediction contains no stop token.
LABEL_MAX_LEN = 32
# Vocabulary id treated as padding when building the padding masks.
PAD_INDEX = 0

def index_2_token(x):
    """Look up the token string for vocabulary index `x` in the global `vocab`."""
    return vocab.index2word[x]

# Element-wise version of index_2_token for numpy arrays of indices.
back_to_tokens = np.vectorize(index_2_token)

def _tokens_until_stop(indices, stop_token):
    """Map `indices` to tokens, cutting at the first `stop_token` (whole row if absent)."""
    hits = np.where(indices == stop_token)[0]
    stop_index = int(hits[0]) if hits.size else len(indices)
    # Plain list comprehension: np.vectorize raises on an empty slice, which
    # happens whenever the very first id is the stop token.
    return [index_2_token(int(index)) for index in indices[:stop_index]]


def validate_model(model, iterator, stop_token):
    """Greedy-decode every batch of `iterator` and collect predicted / reference tokens.

    Decoding starts from the first target token of each sequence and appends the
    argmax token one step at a time until the reference length is reached. The
    same causal mask as in training is passed to the model at every step, so
    earlier positions cannot attend to tokens generated after them.

    Args:
        model: callable as model(src, tgt, src_pad_mask, tgt_pad_mask, tgt_mask).
        iterator: yields dicts with batch-first 'text' and 'label' index tensors.
        stop_token: vocabulary id at which sequences are cut.

    Returns:
        (predictions, trues): two lists of token lists. In each prediction,
        newlines inside tokens are removed and a single newline is appended to
        the last token; references are left as stored in the vocabulary.
    """
    model.eval()
    predictions = []
    trues = []
    for _, batch in tqdm(enumerate(iterator)):
        with torch.no_grad():

            src = batch['text'].cuda()
            trg = batch['label'].cuda()
            src_pad_mask = src == PAD_INDEX

            src = src.transpose(0, 1)
            trg = trg.transpose(0, 1)

            trg_input = trg[0, :].view(1, -1)
            for i in range(trg.shape[0] - 1):
                trg_pad_mask = trg_input.transpose(0, 1) == PAD_INDEX
                # Same no-peek mask as in train(); without it inference did not
                # match the conditions the decoder was trained under.
                trg_mask = make_target_mask(trg_input).to(trg_input.device)
                out = model(src, trg_input, src_pad_mask, trg_pad_mask,
                            trg_mask).argmax(dim=-1)[-1].view(1, trg.shape[1])
                trg_input = torch.cat((trg_input, out), dim=0)

            # Back to batch-first numpy arrays, dropping the leading start token.
            trg_input = trg_input.transpose(0, 1).cpu().detach().numpy()[:, 1:]
            trg = trg.transpose(0, 1).cpu().numpy()[:, 1:]

            for j, sentence in enumerate(trg_input):
                predict = ' '.join(_tokens_until_stop(sentence, stop_token))
                predict = re.sub(r'\n|\r', '', predict) + '\n'
                predict = predict.split(' ')
                predictions.append(predict)

                trues.append(_tokens_until_stop(trg[j], stop_token))

    return predictions, trues

#### Pytorch transformer training

In [None]:
# Size the embeddings and the output layer from the vocabulary built above instead
# of a hard-coded 10573 (the two agree for the data used in this notebook).
VOCAB_SIZE = len(vocab.index2word)
model = TransformerModel(VOCAB_SIZE, VOCAB_SIZE, nlayers=2).cuda()
# beta1=0 is "not None", so Adafactor still keeps a first-moment buffer.
optimizer = Adafactor(model.parameters(), beta1=0, warmup_init=False)
# Padding positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_token)

In [None]:
# Copy the previous checkpoint from Drive to local disk, then deserialize it.
copyfile('/content/drive/My Drive/TransformerLSH/model_torch4.pt', '/content/model.pt')
checkpoint = torch.load('/content/model.pt')

In [None]:
# Restore model weights and optimizer state from the loaded checkpoint dict.
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# 25 epochs completed so far

In [None]:
# Train for N_EPOCHS more epochs, printing wall-clock time, mean loss and
# perplexity (exp of the loss) after each one. Epoch numbering restarts at 1.
N_EPOCHS = 5
CLIP = 5

# NOTE(review): assigned here but never read in this cell.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, dataloader, optimizer, criterion, CLIP)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):7.3f}')

5000it [24:08,  3.45it/s]

Epoch: 01 | Time: 24m 9s
	Train Loss: 3.473 | Train PPL:  32.221



5000it [24:09,  3.45it/s]

Epoch: 02 | Time: 24m 9s
	Train Loss: 3.228 | Train PPL:  25.220



5000it [24:11,  3.45it/s]

Epoch: 03 | Time: 24m 11s
	Train Loss: 3.006 | Train PPL:  20.206



5000it [24:18,  3.43it/s]

Epoch: 04 | Time: 24m 18s
	Train Loss: 2.811 | Train PPL:  16.626



5000it [24:15,  3.44it/s]

Epoch: 05 | Time: 24m 15s
	Train Loss: 2.645 | Train PPL:  14.089





In [None]:
# Train for N_EPOCHS more epochs on the same model and optimizer, printing
# wall-clock time, mean loss and perplexity (exp of the loss) after each one.
N_EPOCHS = 5
CLIP = 5

# NOTE(review): assigned here but never read in this cell.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, dataloader, optimizer, criterion, CLIP)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):7.3f}')

5000it [24:11,  3.44it/s]

Epoch: 01 | Time: 24m 11s
	Train Loss: 2.487 | Train PPL:  12.022



5000it [24:02,  3.47it/s]

Epoch: 02 | Time: 24m 3s
	Train Loss: 2.358 | Train PPL:  10.567



5000it [24:01,  3.47it/s]

Epoch: 03 | Time: 24m 2s
	Train Loss: 2.243 | Train PPL:   9.424



5000it [24:04,  3.46it/s]

Epoch: 04 | Time: 24m 4s
	Train Loss: 2.139 | Train PPL:   8.489



5000it [24:09,  3.45it/s]

Epoch: 05 | Time: 24m 9s
	Train Loss: 2.050 | Train PPL:   7.765





In [None]:
# Train for N_EPOCHS more epochs on the same model and optimizer, printing
# wall-clock time, mean loss and perplexity (exp of the loss) after each one.
N_EPOCHS = 5
CLIP = 5

# NOTE(review): assigned here but never read in this cell.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, dataloader, optimizer, criterion, CLIP)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):7.3f}')

5000it [24:07,  3.45it/s]

Epoch: 01 | Time: 24m 7s
	Train Loss: 1.966 | Train PPL:   7.143



5000it [24:06,  3.46it/s]

Epoch: 02 | Time: 24m 6s
	Train Loss: 1.888 | Train PPL:   6.604



5000it [24:16,  3.43it/s]

Epoch: 03 | Time: 24m 16s
	Train Loss: 1.815 | Train PPL:   6.139



5000it [24:18,  3.43it/s]

Epoch: 04 | Time: 24m 18s
	Train Loss: 1.755 | Train PPL:   5.782



5000it [24:19,  3.43it/s]

Epoch: 05 | Time: 24m 19s
	Train Loss: 1.701 | Train PPL:   5.481





In [None]:
# Train for N_EPOCHS more epochs on the same model and optimizer, printing
# wall-clock time, mean loss and perplexity (exp of the loss) after each one.
N_EPOCHS = 5
CLIP = 5

# NOTE(review): assigned here but never read in this cell.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, dataloader, optimizer, criterion, CLIP)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):7.3f}')

5000it [24:36,  3.39it/s]

Epoch: 01 | Time: 24m 36s
	Train Loss: 1.648 | Train PPL:   5.196



5000it [24:40,  3.38it/s]

Epoch: 02 | Time: 24m 40s
	Train Loss: 1.598 | Train PPL:   4.945



5000it [24:43,  3.37it/s]

Epoch: 03 | Time: 24m 43s
	Train Loss: 1.556 | Train PPL:   4.738



5000it [24:40,  3.38it/s]

Epoch: 04 | Time: 24m 40s
	Train Loss: 1.514 | Train PPL:   4.544



5000it [24:40,  3.38it/s]

Epoch: 05 | Time: 24m 40s
	Train Loss: 1.479 | Train PPL:   4.388





In [None]:
!nvidia-smi

Wed May 20 16:33:16 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    30W /  70W |   6949MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [None]:
# Save a checkpoint with the same 'model_state_dict' / 'optimizer_state_dict'
# keys the loading cell reads. `epoch` and `train_loss` are whatever the most
# recently executed training loop left behind (epoch is the 0-based loop index).
torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_loss,
            }, 'model_torch5.pt')

In [None]:
# Copy the freshly saved checkpoint to Drive.
copyfile('/content/model_torch5.pt', '/content/drive/My Drive/TransformerLSH/model_torch5.pt')

'/content/drive/My Drive/TransformerLSH/model_torch5.pt'

In [None]:
# Decode the test set; stop_token=2 is the id assigned to "EOS" (EOS_token).
predictions, trues = validate_model(model, test_dataloader, stop_token=2)

250it [12:58,  3.11s/it]


In [None]:
# Tokenizer in "conservative" mode with the BPE model copied earlier; used below only to detokenize.
BPE = pyonmttok.Tokenizer("conservative", bpe_model_path="model-50k_with_joiner")

In [None]:
# Turn each token list back into a plain string with the BPE tokenizer.
predictions_detok = [BPE.detokenize(tokens) for tokens in predictions]

trues_detok = [BPE.detokenize(tokens) for tokens in trues]

In [None]:
import rouge
# NOTE(review): the next line rebinds the name `rouge` from the module to a Rouge
# instance; the `import` above restores the module each time this cell is re-run.
rouge = rouge.Rouge()
# Averaged scores with detokenized predictions as hypotheses and detokenized labels as references.
scores = rouge.get_scores(predictions_detok, trues_detok, avg=True)

In [None]:
scores

{'rouge-1': {'f': 0.06887463710319201,
  'p': 0.08638927363486204,
  'r': 0.06282519425019459},
 'rouge-2': {'f': 0.0033033848732950595,
  'p': 0.004736183261183262,
  'r': 0.0030555916305916304},
 'rouge-l': {'f': 0.06847248789840453,
  'p': 0.08663711843711856,
  'r': 0.06160128760128791}}

#### FullLSHTransformer

In [None]:
class Embedder(nn.Module):
    """Thin wrapper around nn.Embedding that maps token ids to d_model-sized vectors."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        embedded = self.embed(x)
        return embedded


class PositionalEncoder(nn.Module):
    """Scales batch-first embeddings by sqrt(d_model) and adds fixed sinusoidal encodings.

    Args:
        d_model: embedding size; both even and odd values are supported.
        max_seq_len: number of positions precomputed in the `pe` buffer.
    """

    def __init__(self, d_model, max_seq_len = 80):
        super().__init__()
        self.d_model = d_model

        # Constant 'pe' matrix, computed in one vectorized pass instead of a
        # Python loop over every (pos, i) pair. For each even column i:
        #   pe[pos, i]     = sin(pos / 10000 ** (2 * i / d_model))
        #   pe[pos, i + 1] = cos(pos / 10000 ** (2 * (i + 1) / d_model))
        # Angles are evaluated in float64 and stored as float32.
        positions = torch.arange(max_seq_len, dtype=torch.float64).unsqueeze(1)
        even_cols = torch.arange(0, d_model, 2, dtype=torch.float64)
        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(positions / torch.pow(10000.0, 2 * even_cols / d_model))
        # With an odd d_model the last even column has no cosine partner.
        odd_partner = even_cols[: d_model // 2] + 1
        pe[:, 1::2] = torch.cos(positions / torch.pow(10000.0, 2 * odd_partner / d_model))

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return x * sqrt(d_model) plus the encodings of the first x.size(1) positions."""
        # make embeddings relatively larger
        x = x * np.sqrt(self.d_model)
        # add constant to embedding; the registered buffer carries no gradient
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len]
        return x


class FeedForward(nn.Module):
    """Position-wise feed-forward block: linear -> activation -> dropout -> linear."""

    def __init__(self, d_model, d_ff=2048, dropout = 0.1, activation = nn.ReLU()):
        super().__init__()
        # The hidden width d_ff defaults to 2048.
        self.activation = activation
        self.linear_1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.dropout = nn.Dropout(p=dropout)
        self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        hidden = self.linear_1(x)
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)


class Norm(nn.Module):
    """Normalisation over the last dimension with a learnable gain and bias."""

    def __init__(self, d_model, eps = 1e-6):
        super().__init__()

        self.size = d_model
        # Learnable gain (alpha) and offset (bias) that calibrate the normalised values.
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(dim=-1, keepdim=True)
        # eps is added to the standard deviation to avoid division by zero.
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias


# build an encoder layer with one multi-head attention layer and one ff layer
class EncoderLayer(nn.Module):
    """Encoder block: LSH self-attention and a feed-forward net, each followed by
    dropout, a residual connection and normalisation (post-norm)."""

    def __init__(self, heads, d_model, bucket_size,
                 n_hashes, chunk_len, random_rotations_per_head = False,
                 dropout = 0.1, d_ff = 2048, act_ff = nn.ReLU()):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        # The sixth positional argument is False here and True in DecoderLayer —
        # presumably a causal-masking flag; confirm against SelfLSHAttention.
        self.attn = SelfLSHAttention(
            heads, d_model, bucket_size, n_hashes, chunk_len,
            False, random_rotations_per_head, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout, act_ff)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask=None, exclude_self=True):
        attended = self.attn(x, pad_mask=mask, exclude_self=exclude_self)
        after_attention = self.norm_1(self.dropout_1(attended) + x)
        transformed = self.ff(after_attention)
        return self.norm_2(self.dropout_2(transformed) + after_attention)
    
# build a decoder layer with two multi-head attention layers and one ff layer
class DecoderLayer(nn.Module):
    """Decoder block: LSH self-attention, encoder-decoder LSH attention and a
    feed-forward net, each followed by dropout, a residual connection and
    normalisation (post-norm)."""

    def __init__(self, heads, d_model, trg_bucket_size, enc_dec_n_buckets,
                 n_hashes, trg_chunk_len, src_chunk_len, random_rotations_per_head,
                 dropout=0.1, d_ff=2048, act_ff = nn.ReLU()):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        # The sixth positional argument is True here and False in EncoderLayer —
        # presumably a causal-masking flag; confirm against SelfLSHAttention.
        self.attn_1 = SelfLSHAttention(
            heads, d_model, trg_bucket_size, n_hashes, trg_chunk_len,
            True, random_rotations_per_head, dropout)

        self.attn_2 = EncDecLSHAttention(
            heads, d_model, enc_dec_n_buckets, n_hashes,
            trg_chunk_len, src_chunk_len, random_rotations_per_head, dropout)

        self.ff = FeedForward(d_model, d_ff, dropout, act_ff)

    def forward(self, x, e_outputs, src_mask=None, trg_mask=None, exclude_self=True):
        # 1) self-attention over the target sequence
        self_attended = self.attn_1(x, pad_mask=trg_mask, exclude_self=exclude_self)
        after_self = self.norm_1(self.dropout_1(self_attended) + x)

        # 2) attention over the encoder outputs
        cross_attended = self.attn_2(after_self, e_outputs, trg_mask, src_mask)
        after_cross = self.norm_2(after_self + self.dropout_2(cross_attended))

        # 3) position-wise feed-forward
        transformed = self.ff(after_cross)
        return self.norm_3(self.dropout_3(transformed) + after_cross)
    
# We can then build a convenient cloning function that can generate multiple layers:
def get_clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of `module`."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones

class Encoder(nn.Module):
    """Stack of ``N_layers`` LSH encoder layers on top of token embeddings
    and positional encoding."""

    def __init__(self, vocab_size, max_src_len, d_model, N_layers, heads,
                 bucket_size, n_hashes, chunk_len, random_rotations_per_head,
                 dropout=0.1, d_ff=2048, act_ff=nn.ReLU()):
        super().__init__()
        self.N = N_layers
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, max_src_len)
        # Build one layer and deep-copy it, so all layers start from the same weights.
        layer_template = EncoderLayer(heads, d_model, bucket_size,
                                      n_hashes, chunk_len, random_rotations_per_head,
                                      dropout, d_ff, act_ff)
        self.layers = get_clones(layer_template, N_layers)

    def forward(self, src, mask=None, exclude_self=True):
        """Embed ``src``, add positional information and apply every layer in order."""
        hidden = self.pe(self.embed(src))
        for layer_idx in range(self.N):
            layer = self.layers[layer_idx]
            hidden = layer(hidden, mask, exclude_self)
        return hidden
    
class Decoder(nn.Module):
    """Stack of ``N_layers`` LSH decoder layers on top of token embeddings
    and positional encoding."""

    def __init__(self, vocab_size, max_trg_len, d_model, N_layers, heads,
                 trg_bucket_size, enc_dec_n_buckets, n_hashes, trg_chunk_len, src_chunk_len, random_rotations_per_head,
                dropout=0.1, d_ff=2048, act_ff=nn.ReLU()):
        super().__init__()
        self.N = N_layers
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, max_trg_len)
        # Build one layer and deep-copy it, so all layers start from the same weights.
        layer_template = DecoderLayer(
            heads,
            d_model,
            trg_bucket_size=trg_bucket_size,
            enc_dec_n_buckets=enc_dec_n_buckets,
            n_hashes=n_hashes,
            trg_chunk_len=trg_chunk_len,
            src_chunk_len=src_chunk_len,
            random_rotations_per_head=random_rotations_per_head,
            dropout=dropout,
            d_ff=d_ff,
            act_ff=act_ff,
        )
        self.layers = get_clones(layer_template, N_layers)

    def forward(self, trg, e_outputs, src_mask=None, trg_mask=None, exclude_self=True):
        """Embed ``trg`` and pass it through every decoder layer together with
        the encoder outputs ``e_outputs``."""
        hidden = self.pe(self.embed(trg))
        for layer_idx in range(self.N):
            layer = self.layers[layer_idx]
            hidden = layer(hidden, e_outputs, src_mask, trg_mask, exclude_self)
        return hidden


class TransformerFullLSH(nn.Module):
    """Encoder-decoder Transformer in which every attention layer is LSH-based.

    The encoder and decoder share ``d_model``, ``N_layers``, ``heads``,
    ``n_hashes`` and the feed-forward settings; bucket sizes and chunk lengths
    are configured separately for source and target. A final linear layer maps
    decoder states to ``trg_vocab`` logits.
    """

    def __init__(self, src_vocab, trg_vocab, src_bucket_size, trg_bucket_size,
                 enc_dec_n_buckets, n_hashes, src_chunk_len, trg_chunk_len,
                 max_src_len=3000, max_trg_len=100, d_model=512, N_layers=2,
                 heads=8, random_rotations_per_head=False, dropout=0.1, 
                 d_ff=2048, act_ff=nn.ReLU()):
        super().__init__()
        self.encoder = Encoder(
            vocab_size=src_vocab,
            max_src_len=max_src_len,
            d_model=d_model,
            N_layers=N_layers,
            heads=heads,
            bucket_size=src_bucket_size,
            n_hashes=n_hashes,
            chunk_len=src_chunk_len,
            random_rotations_per_head=random_rotations_per_head,
            dropout=dropout,
            d_ff=d_ff,
            act_ff=act_ff,
        )
        self.decoder = Decoder(
            vocab_size=trg_vocab,
            max_trg_len=max_trg_len,
            d_model=d_model,
            N_layers=N_layers,
            heads=heads,
            trg_bucket_size=trg_bucket_size,
            enc_dec_n_buckets=enc_dec_n_buckets,
            n_hashes=n_hashes,
            trg_chunk_len=trg_chunk_len,
            src_chunk_len=src_chunk_len,
            random_rotations_per_head=random_rotations_per_head,
            dropout=dropout,
            d_ff=d_ff,
            act_ff=act_ff,
        )
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask=None, trg_mask=None, exclude_self=True):
        """Encode ``src``, decode ``trg`` against it and return vocabulary logits
        for every target position."""
        memory = self.encoder(src, src_mask, exclude_self)
        decoded = self.decoder(trg, memory, src_mask, trg_mask, exclude_self)
        return self.out(decoded)

In [None]:
def mask_enc_dec_attention(dots, q_pad_mask_chanked=None, kv_pad_mask_chunked=None):
    """Mask padded query/key pairs in chunked encoder-decoder attention logits.

    Args:
        dots: attention logits whose last two axes are (query position,
            key position) inside a chunk. Modified in place.
        q_pad_mask_chanked: padding mask for the queries, with the query axis
            last; non-zero/True marks a real token and 0/False a padded one.
            (The parameter keeps its original spelling for caller compatibility.)
        kv_pad_mask_chunked: the same kind of mask for the keys.

    Returns:
        ``dots`` itself. Every (query, key) pair in which either side is padding
        is set to -1e9. When either mask is ``None`` nothing is masked.
    """
    # Both masks default to None, so tolerate a missing mask instead of
    # failing on the indexing below.
    if q_pad_mask_chanked is None or kv_pad_mask_chunked is None:
        return dots

    # Outer product over the last axis: True only where both tokens are real.
    mask = (q_pad_mask_chanked[..., :, None] * kv_pad_mask_chunked[..., None, :]).bool()
    # in the model notation 0 in pad_mask means token that must be masked to -1e9
    dots.masked_fill_(~mask, -1e9)

    return dots


class EncDecLSHAttention(nn.Module):
    """Multi-head encoder-decoder attention restricted to LSH buckets.

    Decoder states (queries) and encoder outputs (keys/values) are hashed with
    the same random rotations, sorted by bucket id, cut into fixed-size chunks
    and attended chunk-by-chunk. With ``n_hashes > 1`` the per-round results are
    combined with softmax weights derived from each round's log-sum-exp.

    Note: keys are produced with ``q_linear`` (the same projection as the
    queries); this module has no separate key projection.
    """
    def __init__(self,
                 heads,
                 d_model,
                 n_buckets,
                 n_hashes,
                 q_chunk_len,
                 k_chunk_len,
                 random_rotations_per_head = False,
                 dropout = 0.1):
        """Store the LSH settings and create the projection layers.

        Args:
            heads: number of attention heads; each head has ``d_model // heads`` dims.
            d_model: model width.
            n_buckets: number of hash buckets per round, shared by queries and keys.
            n_hashes: number of hashing rounds.
            q_chunk_len: chunk length used for the sorted queries.
            k_chunk_len: chunk length used for the sorted keys/values.
            random_rotations_per_head: draw separate rotations for every head
                instead of sharing one set across heads.
            dropout: dropout rate on attention weights; ``None`` disables it.
        """
        super().__init__()
        
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.n_buckets = n_buckets
        self.n_hashes = n_hashes
        self.q_chunk_len = q_chunk_len
        self.k_chunk_len = k_chunk_len
        self.random_rotations_per_head = random_rotations_per_head
        
        # q_linear projects both queries and keys (see forward).
        self.q_linear = nn.Linear(d_model, d_model)    
        self.v_linear = nn.Linear(d_model, d_model)
        if dropout is not None:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = None
        self.out = nn.Linear(d_model, d_model)
        
        
    def forward(self, x, e_outputs, q_pad_mask=None, kv_pad_mask=None):
        """Attend from ``x`` (queries) to ``e_outputs`` (keys and values).

        Args:
            x: decoder states; reshaped here to (bs, q_seq, heads, d_k).
            e_outputs: encoder outputs; reshaped here to (bs, kv_seq, heads, d_k).
            q_pad_mask: padding mask of the queries, (bs, q_seq); 0 marks padding.
            kv_pad_mask: padding mask of the keys, (bs, kv_seq); 0 marks padding.
                Masking is applied only when both masks are given.

        Returns:
            Tensor of shape (bs, q_seq, d_model).
        """
        bs = x.size(0)
        
        # perform linear operation and split into h heads
        # bs, q_seq, heads, dk
        q = self.q_linear(x).view(bs, -1, self.h, self.d_k)
        # bs, kv_seq, heads, dk
        # keys go through q_linear as well, i.e. queries and keys share weights
        k = self.q_linear(e_outputs).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(e_outputs).view(bs, -1, self.h, self.d_k)
        # transpose to get dimensions bs, heads, seq, dk
        q = q.transpose(1,2)
        k = k.transpose(1,2)
        v = v.transpose(1,2)
        # expand mask over the number of heads
        # bs, heads, seq
        if (q_pad_mask is not None) and (kv_pad_mask is not None):
            q_pad_mask = q_pad_mask.unsqueeze(1).expand(-1, self.h, -1)
            kv_pad_mask = kv_pad_mask.unsqueeze(1).expand(-1, self.h, -1)
        # calculate LSHattention
        scores = self.enc_dec_lsh_attention(q, k, v, self.n_buckets, self.n_hashes,
                                            self.q_chunk_len, self.k_chunk_len,
                                            q_pad_mask, kv_pad_mask,
                                            self.random_rotations_per_head)
        # merge heads back: bs, q_seq, d_model
        concat = scores.transpose(1,2).contiguous().view(bs, -1, self.d_model)        
        output = self.out(concat)
    
        return output
    
     
    def enc_dec_lsh_attention(self, q, k, v,
                           n_buckets,
                           n_hashes,
                           q_chunk_len,
                           k_chunk_len,
                           q_pad_mask=None,
                           kv_pad_mask=None,
                           random_rotations_per_head=False):
        """Hash, sort, chunk and attend; then restore the original query order.

        Both sequence lengths must be divisible by ``2 * n_buckets``. The chunk
        lengths must give the same number of chunks for queries and keys,
        because ``attend`` pairs chunk ``c`` of the queries with chunk ``c`` of
        the keys.

        Returns:
            Tensor of shape (bs, heads, q_seqlen, dk).
        """
        # q shape: bs, heads, q_seqlen, dk
        # k shape: bs, heads, kv_seqlen, dk
        # v shape: bs, heads, kv_seqlen, dk
        # n_buckets - number of buckets for q(trg) and kv(src)
        batch_size, heads, q_seqlen, dk = q.shape
        k_seqlen = k.size(-2)
        device = q.device

        assert q_seqlen % (n_buckets * 2) == 0
        assert k_seqlen % (n_buckets * 2) == 0

        # get buckets of shape bs, heads, seqlen*n_hashes
        q_buckets, k_buckets = self.hash_vectors(n_buckets, n_hashes, q, k, random_rotations_per_head)
        
        # Sort key = bucket id * seqlen + position: sorting by it groups items by
        # bucket and keeps the original position order inside each bucket.
        q_ticker = torch.arange(n_hashes * q_seqlen, device=device).expand_as(q_buckets)
        q_buckets_and_t = q_seqlen * q_buckets + (q_ticker % q_seqlen)
        q_buckets_and_t = q_buckets_and_t.detach()
              
        k_ticker = torch.arange(n_hashes * k_seqlen, device=device).expand_as(k_buckets)
        k_buckets_and_t = k_seqlen * k_buckets + (k_ticker % k_seqlen)
        k_buckets_and_t = k_buckets_and_t.detach()

        # hash-based sort ("s" at the start of variable names means "sorted")
        sq_buckets_and_t, sq_ticker = q_buckets_and_t.sort(dim=-1)
        # q_undo_sort is the inverse permutation of sq_ticker
        _, q_undo_sort = sq_ticker.sort(dim=-1)

        sq_buckets_and_t = sq_buckets_and_t.detach()
        sq_ticker = sq_ticker.detach()
        q_undo_sort = q_undo_sort.detach()
          
        sk_buckets_and_t, sk_ticker = k_buckets_and_t.sort(dim=-1)

        sk_buckets_and_t = sk_buckets_and_t.detach()
        sk_ticker = sk_ticker.detach()
                        
        # get vectors in the order of hash-based sort
        # (ticker % seqlen maps an index in the n_hashes-times-repeated sequence
        # back to a position in the original sequence)
        sqt = sq_ticker % q_seqlen
        skt = sk_ticker % k_seqlen
        # sq shape: bs, heads, q_seqlen*n_hashes, dk
        # sk and sv shape: bs, heads, k_seqlen*n_hashes, dk
        sq = batched_index_select(q, sqt)
        sk = batched_index_select(k, skt)
        sv = batched_index_select(v, skt)
        
        # Reorder the padding masks the same way and split them into chunks.
        if (q_pad_mask is not None) and (kv_pad_mask is not None):
            q_pad_mask_chunked = q_pad_mask.gather(-1, sqt).view(batch_size, heads, -1, q_chunk_len)
            kv_pad_mask_chunked = kv_pad_mask.gather(-1, skt).view(batch_size, heads, -1, k_chunk_len)
        else:
            q_pad_mask_chunked=None
            kv_pad_mask_chunked=None

        soutputs, sdots_logsumexp = self.attend(sq, sk, sv, q_chunk_len, k_chunk_len,
                                                q_pad_mask_chunked, kv_pad_mask_chunked)
        
        # use undo_sort to get true order of sequence
        outputs = batched_index_select(soutputs, q_undo_sort)
        _, dots_logsumexp = sort_key_val(sq_ticker, sdots_logsumexp, dim=-1)

        # Combine hashing rounds: each round is weighted by the softmax of its
        # log-sum-exp over the rounds axis.
        if n_hashes > 1:
            outputs = torch.reshape(outputs, (batch_size, heads, n_hashes, q_seqlen, dk))
            dots_logsumexp = torch.reshape(dots_logsumexp, (batch_size, heads, n_hashes, q_seqlen, 1))
            probs = F.softmax(dots_logsumexp, dim=2)
            outputs = torch.sum(outputs * probs, dim=2)

        return outputs

    
    def hash_vectors(self, n_buckets, n_hashes, q, k, random_rotations_per_head=False):
        """Assign every query and key vector to a bucket in each hashing round.

        Fresh random rotations are drawn on every call (separately for every
        batch element) and applied to both ``q`` and ``k``, so the two share one
        hash space. The bucket is the argmax over the concatenation of the
        rotated vector and its negation.

        Returns:
            ``(q_buckets, k_buckets)``, each of shape (bs, heads, seqlen * n_hashes),
            with bucket ids of round ``r`` offset by ``r * n_buckets``.
        """
        # q vecs of shape bs, heads, q_seqlen, dk
        # k vecs of shape bs, heads, kv_seqlen, dk
        # n_buckets of q == n_buckets of k
        assert n_buckets % 2 == 0

        batch_size = q.size(0)
        heads = q.size(1)
        device = q.device

        rotations_shape = (
            batch_size,
            heads if random_rotations_per_head else 1,
            q.size(-1),
            n_hashes,
            n_buckets // 2)

        # With a head axis of size 1 the expand() shares one rotation set across heads.
        random_rotations = torch.randn(rotations_shape, dtype=q.dtype,
                                      device=device).expand(batch_size, heads, -1, -1, -1)
        # rotated_q size: bs(b), heads(h), n_hashes(r), q_seqlen(s), buckets//2 (i)
        rotated_q = torch.einsum('bhsd,bhdri->bhrsi', q, random_rotations)
        rotated_q = torch.cat([rotated_q, -rotated_q], dim=-1)
        q_buckets = torch.argmax(rotated_q, dim=-1)
        
        # rotated_k size: bs(b), heads(h), n_hashes(r), k_seqlen(s), buckets//2 (i)
        rotated_k = torch.einsum('bhsd,bhdri->bhrsi', k, random_rotations)
        rotated_k = torch.cat([rotated_k, -rotated_k], dim=-1)
        k_buckets = torch.argmax(rotated_k, dim=-1)
        
        # buckets is now (bs, heads, n_hashes, seqlen). Next we add offsets so that
        # bucket numbers from different hashing rounds don't overlap.        
        offsets = torch.arange(n_hashes, device=device)
        offsets = torch.reshape(offsets * n_buckets, (1, 1, -1, 1))
        q_buckets = torch.reshape(q_buckets + offsets, (batch_size, heads, -1,))
        k_buckets = torch.reshape(k_buckets + offsets, (batch_size, heads, -1,))
        # out shape of each tensor: bs, heads, seqlen*n_hashes
        return q_buckets, k_buckets
     
        
    def attend(self, q, k, v, q_chunk_len, k_chunk_len,
               q_pad_mask_chunked=None,
               kv_pad_mask_chunked=None):
        """Softmax attention inside matching chunks of sorted queries and keys.

        Args:
            q: sorted queries, (bs, heads, n_hashes * q_seqlen, dk).
            k, v: sorted keys/values, (bs, heads, n_hashes * k_seqlen, dk).
            q_chunk_len, k_chunk_len: chunk lengths; they must produce the same
                number of chunks for queries and keys.
            q_pad_mask_chunked, kv_pad_mask_chunked: chunked padding masks;
                applied only when both are given.

        Returns:
            ``(out, dots_logsumexp)`` in sorted order: ``out`` has the shape of
            ``q`` and ``dots_logsumexp`` has shape (bs, heads, n_hashes * q_seqlen).
        """
        
        batch_size, heads, _, dk = q.size()
            
        # q shape:      bs, heads, n_chunks, q_chunk_len, dk
        # k, v shapes:  bs, heads, n_chunks, k_chunk_len, dk
        q = torch.reshape(q, (batch_size, heads, -1, q_chunk_len, dk))
        k = torch.reshape(k, (batch_size, heads, -1, k_chunk_len, dk))
        v = torch.reshape(v, (batch_size, heads, -1, k_chunk_len, dk))
        
        # keys are length-normalised and scaled by 1/sqrt(dk); queries are used as-is
        k = length_normalized(k)
        k = k / (dk**0.5)
        
        # dots shape: bs, heads, n_chunks, q_chunk_len, k_chunk_len
        dots = torch.einsum('bhcsd,bhcfd->bhcsf', q, k)
        if (q_pad_mask_chunked is not None) and (kv_pad_mask_chunked is not None):
            dots = mask_enc_dec_attention(dots, q_pad_mask_chunked, kv_pad_mask_chunked)

        # softmax
        # (written via logsumexp because the normaliser is also returned)
        dots_logsumexp = torch.logsumexp(dots, dim=-1, keepdim=True)
        dots = torch.exp(dots - dots_logsumexp)
        
        if self.dropout is not None:
            dots = self.dropout(dots)
        
        out = torch.matmul(dots, v)
        # flatten the chunk axis again
        out = torch.reshape(out, (batch_size, heads, -1, dk))
        dots_logsumexp = torch.reshape(dots_logsumexp, (batch_size, heads, -1))
        return out, dots_logsumexp

In [None]:
def length_normalized(x, epsilon=1e-6):
    """Divide ``x`` by the root-mean-square of its last axis.

    ``epsilon`` is added to the mean square before the square root so that
    all-zero vectors do not cause a division by zero.
    """
    mean_square = x.pow(2).mean(dim=-1, keepdim=True)
    rms = (mean_square + epsilon).sqrt()
    return x / rms


def look_one_back(x):
    """Append to every chunk the contents of the chunk before it.

    Chunks are indexed along axis 2 and their items along axis 3, so axis 3
    doubles in size. The first chunk is paired with the last one (wrap-around).
    Chunk boundaries may fall in the middle of a run of items from the same
    bucket; looking one chunk back raises the chance of seeing related items.
    """
    # Shifting by one along the chunk axis puts chunk i-1 at index i and the
    # last chunk at index 0.
    previous_chunks = torch.roll(x, shifts=1, dims=2)
    return torch.cat((x, previous_chunks), dim=3)


def batched_index_select(values, indices):
    """Pick rows of ``values`` along axis 2 according to ``indices``.

    ``values`` is 4-D and ``indices`` 3-D with matching leading axes; the
    result has shape ``indices.shape + (values.size(-1),)``.
    """
    feature_dim = values.size(-1)
    # Repeat every index across the feature axis so gather copies whole rows.
    row_index = indices.unsqueeze(3).expand(-1, -1, -1, feature_dim)
    return torch.gather(values, 2, row_index)


def sort_key_val(t1, t2, dim=-1):
    """Sort ``t1`` along ``dim`` and reorder ``t2`` with the same permutation.

    Returns the sorted keys and the correspondingly permuted values.
    """
    ordered = torch.sort(t1, dim=dim)
    return ordered.values, torch.gather(t2, dim, ordered.indices)


def mask_self_attention(dots, q_info, kv_info, pad_mask_chanked=None, is_decoder=False, exclude_self=False):
    """Apply causal, self-exclusion and padding masks to chunked self-attention logits.

    ``dots`` is modified in place and returned; masked entries are set to -1e9.

    Args:
        dots: logits with last two axes (query in chunk, key in chunk + previous chunk).
        q_info: original sequence positions of the queries in each chunk.
        kv_info: original sequence positions of the keys each chunk can see.
        pad_mask_chanked: optional chunked padding mask of the queries
            (0 marks padding); the key mask is derived with ``look_one_back``.
        is_decoder: forbid attending to keys at later positions than the query.
        exclude_self: forbid a token attending to itself, unless no visible key
            has a smaller position than the query.
    """
    # Broadcastable views: queries on the second-to-last axis, keys on the last.
    q_pos = q_info.unsqueeze(-1)
    kv_pos = kv_info.unsqueeze(-2)

    if is_decoder:
        # causal mask: a key positioned after the query is hidden
        dots.masked_fill_(q_pos < kv_pos, -1e9)

    if exclude_self:
        # Number of visible keys whose position is >= the query position.
        keys_not_before = (q_pos <= kv_pos).sum(dim=-1, keepdim=True)
        # True when at least one visible key sits strictly before the query;
        # the token with the least position in the chunk keeps its self link.
        has_earlier_key = (kv_info.size(-1) - keys_not_before) != 0
        is_self = q_pos == kv_pos
        dots.masked_fill_(has_earlier_key & is_self, -1e9)

    if pad_mask_chanked is not None:
        kv_pad_mask = look_one_back(pad_mask_chanked)
        # in the model notation 0 in pad_mask means token that must be masked to -1e9
        both_real = (pad_mask_chanked[..., :, None] * kv_pad_mask[..., None, :]).bool()
        dots.masked_fill_(~both_real, -1e9)

    return dots


class SelfLSHAttention(nn.Module):
    """Multi-head LSH self-attention with shared query/key projection.

    Vectors are hashed with random rotations, sorted by bucket id, cut into
    chunks of ``chunk_len`` and attended within each chunk plus the previous
    one. With ``n_hashes > 1`` the per-round results are combined with softmax
    weights derived from each round's log-sum-exp.
    """
    def __init__(self,
                 heads,
                 d_model,
                 bucket_size,
                 n_hashes,
                 chunk_len,
                 is_decoder = False,
                 random_rotations_per_head = False,
                 dropout = 0.1):
        """Store the LSH settings and create the projection layers.

        Args:
            heads: number of attention heads; each head has ``d_model // heads`` dims.
            d_model: model width.
            bucket_size: mean number of vectors per bucket; the number of buckets
                is ``seqlen // bucket_size``.
            n_hashes: number of hashing rounds.
            chunk_len: length of the chunks the sorted sequence is cut into.
            is_decoder: apply a causal mask (no attention to later positions).
            random_rotations_per_head: draw separate rotations for every head
                instead of sharing one set across heads.
            dropout: dropout rate on attention weights; ``None`` disables it.
        """
        super().__init__()
        
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.bucket_size = bucket_size
        self.n_hashes = n_hashes
        self.chunk_len = chunk_len
        self.is_decoder = is_decoder
        self.random_rotations_per_head = random_rotations_per_head
        
        # q_linear yields the shared query/key vectors; there is no key projection.
        self.q_linear = nn.Linear(d_model, d_model)    
        self.v_linear = nn.Linear(d_model, d_model)
        if dropout is not None:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = None
        self.out = nn.Linear(d_model, d_model)
        
        
    def forward(self, x, pad_mask=None, exclude_self=False):
        """Apply LSH self-attention to ``x``.

        Args:
            x: input states; reshaped here to (bs, seq, heads, d_k).
            pad_mask: optional padding mask, (bs, seq); 0 marks padding.
            exclude_self: forwarded to the masking step, which then stops a token
                from attending to itself where it has another candidate.

        Returns:
            Tensor of shape (bs, seq, d_model).
        """
        bs = x.size(0)
        
        #qk_mask = make_shared_qk_mask(q.size(1)) # 1, 1, seq_len, seq_len
        # perform linear operation and split into h heads
        # bs, seq, heads, dk
        q = self.q_linear(x).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(x).view(bs, -1, self.h, self.d_k)
        # transpose to get dimensions bs, heads, seq, dk
        q = q.transpose(1,2)
        v = v.transpose(1,2)
        # expand mask over the number of heads
        # bs, heads, seq
        if pad_mask is not None:
            pad_mask = pad_mask.unsqueeze(1).expand(-1, self.h, -1)
        # calculate LSHattention
        scores = self.self_lsh_attention(q, v, self.bucket_size, self.n_hashes,
                                         self.chunk_len, pad_mask,
                                         self.random_rotations_per_head,
                                         self.is_decoder, exclude_self)
        # merge heads back: bs, seq, d_model
        concat = scores.transpose(1,2).contiguous().view(bs, -1, self.d_model)        
        output = self.out(concat)
    
        return output
    
     
    def self_lsh_attention(self, qk, v,
                           bucket_size,
                           n_hashes,
                           chunk_len,
                           pad_mask=None,
                           random_rotations_per_head=False,
                           is_decoder=False,
                           exclude_self=False):
        """Hash, sort, chunk and attend; then restore the original order.

        The sequence length must be divisible by ``2 * bucket_size``.

        Returns:
            Tensor of shape (bs, heads, seqlen, dk).
        """
        # qk shape: bs, heads, qk_seqlen, dk
        # v shape: bs, heads, v_seqlen (same as qk_seqlen), dk
        # bucket size - mean number of vectors in one bucket
        batch_size, heads, seqlen, dk = qk.shape
        device = qk.device

        assert seqlen % (bucket_size * 2) == 0

        n_buckets = seqlen // bucket_size
        # get buckets of shape bs, heads, seqlen*n_hashes
        buckets = self.hash_vectors(n_buckets, n_hashes, qk, random_rotations_per_head)

        # Sort key = bucket id * seqlen + position: sorting by it groups items by
        # bucket and keeps the original position order inside each bucket.
        ticker = torch.arange(n_hashes * seqlen, device=qk.device).expand_as(buckets)
        buckets_and_t = seqlen * buckets + (ticker % seqlen)
        buckets_and_t = buckets_and_t.detach()

        # hash-based sort ("s" at the start of variable names means "sorted")
        # (this sort used to be done through the sort_key_val function)
        sbuckets_and_t, sticker = buckets_and_t.sort(dim=-1)
        # undo_sort is the inverse permutation of sticker
        _, undo_sort = sticker.sort(dim=-1)

        sbuckets_and_t = sbuckets_and_t.detach()
        sticker = sticker.detach()
        undo_sort = undo_sort.detach()
        # get vectors in the order of hash-based sort
        # (sticker % seqlen maps an index in the n_hashes-times-repeated sequence
        # back to a position in the original sequence)
        st = sticker % seqlen
        # sqk shape: bs, heads, seqlen*n_hashes, dk
        sqk = batched_index_select(qk, st)
        sv = batched_index_select(v, st)

        soutputs, sdots_logsumexp = self.attend(sqk, sv, chunk_len, st, pad_mask, is_decoder, exclude_self)
        # use undo_sort to get true order of sequence
        outputs = batched_index_select(soutputs, undo_sort)
        _, dots_logsumexp = sort_key_val(sticker, sdots_logsumexp, dim=-1)

        # Combine hashing rounds: each round is weighted by the softmax of its
        # log-sum-exp over the rounds axis.
        if n_hashes > 1:
            outputs = torch.reshape(outputs, (batch_size, heads, n_hashes, seqlen, dk))
            dots_logsumexp = torch.reshape(dots_logsumexp, (batch_size, heads, n_hashes, seqlen, 1))
            probs = F.softmax(dots_logsumexp, dim=2)
            outputs = torch.sum(outputs * probs, dim=2)

        return outputs

    
    def hash_vectors(self, n_buckets, n_hashes, vecs, random_rotations_per_head=False):
        """Assign every vector to a bucket in each hashing round.

        Fresh random rotations are drawn on every call (separately for every
        batch element). The bucket is the argmax over the concatenation of the
        rotated vector and its negation.

        Returns:
            Tensor of shape (bs, heads, seqlen * n_hashes) with bucket ids of
            round ``r`` offset by ``r * n_buckets``.
        """
        # input vecs of shape bs, heads, seqlen, dk
        # n_buckets = seqlen // bucketsize
        assert n_buckets % 2 == 0

        batch_size = vecs.size(0)
        heads = vecs.size(1)
        device = vecs.device

        rotations_shape = (
            batch_size,
            heads if random_rotations_per_head else 1,
            vecs.size(-1),
            n_hashes,
            n_buckets // 2)

        # With a head axis of size 1 the expand() shares one rotation set across heads.
        random_rotations = torch.randn(rotations_shape, dtype=vecs.dtype, device=device).expand(batch_size, heads, -1, -1, -1)
        # rotated_vecs size: bs(b), heads(h), n_hashes(r), seqlen(s), buckets//2 (i)
        rotated_vecs = torch.einsum('bhsd,bhdri->bhrsi', vecs, random_rotations)

        rotated_vecs = torch.cat([rotated_vecs, -rotated_vecs], dim=-1)
        buckets = torch.argmax(rotated_vecs, dim=-1)

        # buckets is now (bs, heads, n_hashes, seqlen). Next we add offsets so that
        # bucket numbers from different hashing rounds don't overlap.        
        offsets = torch.arange(n_hashes, device=device)
        offsets = torch.reshape(offsets * n_buckets, (1, 1, -1, 1))
        buckets = torch.reshape(buckets + offsets, (batch_size, heads, -1,))
        # out shape bs, heads, seqlen*n_hashes
        return buckets
     

    def attend(self, q, v, chunk_len, info, pad_mask=None, is_decoder=False, exclude_self=False):
        """Softmax attention inside each chunk and the chunk before it.

        Args:
            q: sorted shared query/key vectors, (bs, heads, n_hashes * seqlen, dk).
            v: sorted values, same shape as ``q``.
            chunk_len: chunk length.
            info: original sequence position of every sorted item,
                (bs, heads, n_hashes * seqlen); used for masking and for
                reordering ``pad_mask``.
            pad_mask: optional padding mask in original order, (bs, heads, seqlen).
            is_decoder, exclude_self: forwarded to ``mask_self_attention``.

        Returns:
            ``(out, dots_logsumexp)`` in sorted order: ``out`` has the shape of
            ``q`` and ``dots_logsumexp`` has shape (bs, heads, n_hashes * seqlen).
        """
        # attend for qk sharing
        batch_size, heads, _, dk = q.size()
        
        # Bring the padding mask into sorted order and split it into chunks.
        if pad_mask is not None:
            pad_mask_chunked = pad_mask.gather(-1, info).view(batch_size, heads, -1, chunk_len)
        else:
            pad_mask_chunked=None
            
        info = torch.reshape(info, (batch_size, heads, -1, chunk_len))
        # q, k, v shapes: bs, heads, n_hashes*seqlen/chunk_len, chunk_len, dk
        q = torch.reshape(q, (batch_size, heads, -1, chunk_len, dk))
        v = torch.reshape(v, (batch_size, heads, -1, chunk_len, dk))
        # keys are the queries, length-normalised and scaled by 1/sqrt(dk)
        k = q.clone()    
        k = length_normalized(k)
        k = k / (dk**0.5)

        # form chunks
        # k and v shape: bs, heads, n_hashes*seqlen/chunk_len, chunk_len * 2, dk
        k = look_one_back(k)
        v = look_one_back(v)
        kv_info = look_one_back(info)

        # dots shape: bs, heads, n_hashes*seqlen/chunk_len, chunk_len, chunk_len * 2
        dots = torch.einsum('bhcsd,bhcfd->bhcsf', q, k)
        dots = mask_self_attention(dots, info, kv_info,
                                   pad_mask_chunked,
                                   is_decoder,
                                   exclude_self)

        # softmax
        # (written via logsumexp because the normaliser is also returned)
        dots_logsumexp = torch.logsumexp(dots, dim=-1, keepdim=True)
        dots = torch.exp(dots - dots_logsumexp)
        
        if self.dropout is not None:
            dots = self.dropout(dots)
        
        out = torch.matmul(dots, v)
        # flatten the chunk axis again
        out = torch.reshape(out, (batch_size, heads, -1, dk))
        dots_logsumexp = torch.reshape(dots_logsumexp, (batch_size, heads, -1))
        return out, dots_logsumexp

#### FullLSHTransformer training

In [None]:
import math
import time
PAD_INDEX = 0

def train(model, iterator, optimizer, criterion, clip=5):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: network called as ``model(src, trg, src_pad_mask, trg_pad_mask)``
            and returning logits of shape (batch, trg_len, vocab).
        iterator: iterable of batches; every batch is a mapping with the
            tensors ``'text'`` (source ids) and ``'label'`` (target ids).
        optimizer: optimizer updating ``model``'s parameters.
        criterion: loss taking flattened logits and flattened target ids.
        clip: maximum gradient norm used for gradient clipping.

    Returns:
        Sum of the batch losses divided by the number of batches, or NaN when
        ``iterator`` yields no batches.
    """
    model.train()
    # Put the data wherever the model lives instead of assuming CUDA.
    device = next(model.parameters()).device
    epoch_loss = 0.0
    n_batches = 0

    # Wrapping the iterator directly lets tqdm show the total and an ETA.
    for batch in tqdm(iterator):
        src = batch['text'].to(device)
        trg = batch['label'].to(device)

        # True for real tokens, False for padding.
        src_pad_mask = src != PAD_INDEX
        trg_pad_mask = trg != PAD_INDEX

        optimizer.zero_grad()

        # Teacher forcing: the logits at position t are scored against the
        # token at position t + 1, so the last position has no target.
        output = model(src, trg, src_pad_mask, trg_pad_mask)[:, :-1]
        output = output.contiguous().view(-1, output.size(-1))
        targets = trg[:, 1:].flatten()

        loss = criterion(output, targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
        n_batches += 1

    if n_batches == 0:
        # Nothing was trained on; report that explicitly rather than dividing by zero.
        return float('nan')
    return epoch_loss / n_batches


def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes
    and the remaining whole seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
LABEL_MAX_LEN = 32
PAD_INDEX = 0

def index_2_token(x):
    """Look up the token stored under index ``x`` in the global ``vocab``."""
    token = vocab.index2word[x]
    return token

back_to_tokens = np.vectorize(index_2_token)

def _first_stop_index(token_ids, stop_token, default):
    """Return the index of the first ``stop_token`` in ``token_ids``, or ``default`` if absent."""
    hits = np.flatnonzero(token_ids == stop_token)
    return int(hits[0]) if hits.size else default


def _ids_to_tokens(token_ids):
    """Convert an array of ids to a list of tokens; an empty array gives ``[]``."""
    # np.vectorize cannot be called on size-0 input unless otypes is set.
    if len(token_ids) == 0:
        return []
    return back_to_tokens(token_ids).tolist()


def validate_model(model, iterator, stop_token):
    """Greedy-decode every batch and collect predicted and reference token lists.

    Decoding starts from the first column of ``batch['label']`` and generates
    one token per step until the label length is reached. The decoder input is
    padded with ``PAD_INDEX`` to the full label length at every step.

    Args:
        model: network called as ``model(src, trg, src_pad_mask, trg_pad_mask)``
            and returning logits of shape (batch, trg_len, vocab).
        iterator: iterable of batches; every batch is a mapping with the
            tensors ``'text'`` (source ids) and ``'label'`` (target ids).
        stop_token: id of the end-of-sequence token; sequences are cut just
            before its first occurrence.

    Returns:
        ``(predictions, trues)``: two lists with one list of tokens per example.
        A sequence without a stop token is cut at ``LABEL_MAX_LEN``.
    """
    model.eval()
    # Put the data wherever the model lives instead of assuming CUDA.
    device = next(model.parameters()).device
    predictions = []
    trues = []

    with torch.no_grad():
        for batch in tqdm(iterator):
            src = batch['text'].to(device)
            trg = batch['label'].to(device)
            src_pad_mask = src != PAD_INDEX
            batch_size, trg_len = trg.shape

            # Start from the first label column and grow by one token per step.
            generated = trg[:, 0].view(-1, 1)
            for step in range(trg_len - 1):
                padding = torch.full((batch_size, trg_len - generated.shape[1]),
                                     PAD_INDEX, dtype=torch.long, device=device)
                inputs = torch.cat((generated, padding), dim=1)
                trg_pad_mask = inputs != PAD_INDEX
                logits = model(src, inputs, src_pad_mask, trg_pad_mask)
                # Only the distribution at the current position is needed.
                next_token = logits[:, step].argmax(dim=-1, keepdim=True)
                generated = torch.cat((generated, next_token), dim=1)

            # Drop the start column from both hypothesis and reference.
            generated = generated.cpu().numpy()[:, 1:]
            reference = trg.cpu().numpy()[:, 1:]

            for predicted_row, true_row in zip(generated, reference):
                stop_index = _first_stop_index(predicted_row, stop_token, LABEL_MAX_LEN)
                predict = ' '.join(_ids_to_tokens(predicted_row[:stop_index]))
                # The trailing newline ends up in the last token; kept as in the
                # original output format.
                predict = re.sub(r'\n|\r', '', predict) + '\n'
                predictions.append(predict.split(' '))

                stop_index = _first_stop_index(true_row, stop_token, LABEL_MAX_LEN)
                trues.append(_ids_to_tokens(true_row[:stop_index]))

    return predictions, trues

##### training for article bucket size = 512

In [None]:
FullLSHmodel = TransformerFullLSH(10573, 10573, src_bucket_size=512, trg_bucket_size=8, enc_dec_n_buckets=4, n_hashes=2,
                              src_chunk_len=512, trg_chunk_len=8, max_src_len=2048, max_trg_len=32, d_model=512,
                              N_layers=2, heads=8, d_ff=1024).cuda()
optimizer = Adafactor(FullLSHmodel.parameters(), beta1=0, warmup_init=False)
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [None]:
copyfile('/content/drive/My Drive/TransformerLSH/model4.pt', '/content/model.pt')
checkpoint = torch.load('/content/model.pt')

In [None]:
FullLSHmodel.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# 25 epochs have been completed so far

In [None]:
# Train for N_EPOCHS more epochs; CLIP is the maximum gradient norm passed to train().
N_EPOCHS = 5
CLIP = 5

# NOTE(review): assigned but never read or updated in this cell.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(FullLSHmodel, dataloader, optimizer, criterion, CLIP)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # The printed epoch number restarts at 1 in every cell; it is not a cumulative count.
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    # Perplexity is reported as exp(mean training loss).
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

5000it [28:54,  2.88it/s]

Epoch: 01 | Time: 28m 55s
	Train Loss: 2.577 | Train PPL:  13.161



5000it [28:54,  2.88it/s]

Epoch: 02 | Time: 28m 54s
	Train Loss: 2.399 | Train PPL:  11.011



5000it [28:56,  2.88it/s]

Epoch: 03 | Time: 28m 56s
	Train Loss: 2.233 | Train PPL:   9.327



5000it [28:54,  2.88it/s]

Epoch: 04 | Time: 28m 54s
	Train Loss: 2.089 | Train PPL:   8.081



5000it [29:04,  2.87it/s]

Epoch: 05 | Time: 29m 4s
	Train Loss: 1.965 | Train PPL:   7.133





In [None]:
# Train for N_EPOCHS more epochs; CLIP is the maximum gradient norm passed to train().
N_EPOCHS = 3
CLIP = 5

# NOTE(review): assigned but never read or updated in this cell.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(FullLSHmodel, dataloader, optimizer, criterion, CLIP)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # The printed epoch number restarts at 1 in every cell; it is not a cumulative count.
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    # Perplexity is reported as exp(mean training loss).
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

5000it [29:15,  2.85it/s]

Epoch: 01 | Time: 29m 15s
	Train Loss: 1.851 | Train PPL:   6.367



5000it [29:10,  2.86it/s]

Epoch: 02 | Time: 29m 11s
	Train Loss: 1.754 | Train PPL:   5.780



5000it [29:12,  2.85it/s]

Epoch: 03 | Time: 29m 12s
	Train Loss: 1.670 | Train PPL:   5.311





In [None]:
# Train for N_EPOCHS more epochs; CLIP is the maximum gradient norm passed to train().
N_EPOCHS = 5
CLIP = 5

# NOTE(review): assigned but never read or updated in this cell.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(FullLSHmodel, dataloader, optimizer, criterion, CLIP)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # The printed epoch number restarts at 1 in every cell; it is not a cumulative count.
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    # Perplexity is reported as exp(mean training loss).
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

5000it [26:46,  3.11it/s]

Epoch: 01 | Time: 26m 46s
	Train Loss: 1.590 | Train PPL:   4.902



5000it [26:46,  3.11it/s]

Epoch: 02 | Time: 26m 46s
	Train Loss: 1.522 | Train PPL:   4.581



5000it [26:46,  3.11it/s]

Epoch: 03 | Time: 26m 46s
	Train Loss: 1.461 | Train PPL:   4.312



5000it [26:48,  3.11it/s]

Epoch: 04 | Time: 26m 49s
	Train Loss: 1.406 | Train PPL:   4.080



5000it [26:49,  3.11it/s]

Epoch: 05 | Time: 26m 49s
	Train Loss: 1.354 | Train PPL:   3.874





In [None]:
# Train for N_EPOCHS more epochs; CLIP is the maximum gradient norm passed to train().
N_EPOCHS = 4
CLIP = 5

# NOTE(review): assigned but never read or updated in this cell.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(FullLSHmodel, dataloader, optimizer, criterion, CLIP)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # The printed epoch number restarts at 1 in every cell; it is not a cumulative count.
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    # Perplexity is reported as exp(mean training loss).
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

5000it [26:47,  3.11it/s]

Epoch: 01 | Time: 26m 47s
	Train Loss: 1.308 | Train PPL:   3.701



5000it [26:49,  3.11it/s]

Epoch: 02 | Time: 26m 49s
	Train Loss: 1.264 | Train PPL:   3.538



5000it [26:50,  3.11it/s]

Epoch: 03 | Time: 26m 50s
	Train Loss: 1.221 | Train PPL:   3.389



5000it [26:49,  3.11it/s]

Epoch: 04 | Time: 26m 50s
	Train Loss: 1.188 | Train PPL:   3.280





In [None]:
!nvidia-smi

Tue May 19 08:00:07 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P0    34W / 250W |   7547MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [None]:
torch.save({
            'epoch': epoch,
            'model_state_dict': FullLSHmodel.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_loss,
            }, 'model6.pt')

In [None]:
copyfile('/content/model6.pt', '/content/drive/My Drive/TransformerLSH/model6.pt')

'/content/drive/My Drive/TransformerLSH/model6.pt'

In [None]:
predictions, trues = validate_model(FullLSHmodel, test_dataloader, stop_token=2)

250it [13:13,  3.18s/it]


In [None]:
BPE = pyonmttok.Tokenizer("conservative", bpe_model_path="model-50k_with_joiner")

In [None]:
# Join BPE sub-tokens back into plain strings for ROUGE scoring.
predictions_detok = [BPE.detokenize(tokens) for tokens in predictions]

trues_detok = [BPE.detokenize(tokens) for tokens in trues]

In [None]:
# Import the scorer class directly: the original bound `rouge` to the module and then
# immediately rebound the same name to an instance, shadowing the module.
from rouge import Rouge

# `rouge` stays the name of the scorer instance, exactly as before this change.
rouge = Rouge()
# Hypotheses are passed first, references second. With avg=True the result is a single
# dict of f / p / r values per metric instead of one entry per sentence pair.
scores = rouge.get_scores(predictions_detok, trues_detok, avg=True)

In [None]:
# Display the averaged ROUGE-1 / ROUGE-2 / ROUGE-L results computed in the previous cell.
scores

{'rouge-1': {'f': 0.07717138136059545,
  'p': 0.08127906203273878,
  'r': 0.07639163059163077},
 'rouge-2': {'f': 0.01357722028002533,
  'p': 0.013940523365523356,
  'r': 0.013551151626151618},
 'rouge-l': {'f': 0.07469833506246332,
  'p': 0.07879135586635615,
  'r': 0.07368989066489091}}

##### training for article bucket size = 128

In [None]:
# Copy a previously saved checkpoint from Google Drive onto the local disk ...
copyfile('/content/drive/My Drive/TransformerLSH/LSH2.pt', '/content/model.pt')
# ... and load it. The cells below read its 'model_state_dict' and
# 'optimizer_state_dict' entries.
checkpoint = torch.load('/content/model.pt')

In [None]:
# Instantiate the full-LSH Transformer with a source bucket size of 128 and move it to the GPU.
# The two leading positional arguments are presumably the source and target vocabulary
# sizes — confirm against TransformerFullLSH.
FullLSHmodel = TransformerFullLSH(
    10573,
    10573,
    src_bucket_size=128,
    trg_bucket_size=2,
    enc_dec_n_buckets=16,
    n_hashes=2,
    src_chunk_len=128,
    trg_chunk_len=2,
    max_src_len=2048,
    max_trg_len=32,
    d_model=512,
    N_layers=2,
    heads=8,
    d_ff=1024,
).cuda()

# Adafactor optimizes every model parameter, with beta1=0 and warmup_init=False.
optimizer = Adafactor(FullLSHmodel.parameters(), beta1=0, warmup_init=False)

# Targets equal to 0 (the value of PAD_token at the top of the notebook) are left out of the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [None]:
# Restore the model weights and the optimizer state from the checkpoint loaded above,
# so training resumes from that checkpoint instead of a fresh initialisation.
FullLSHmodel.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# Number of passes made by this cell, and the value handed to train() as its last argument.
N_EPOCHS = 5
CLIP = 5  # presumably a gradient-clipping threshold — confirm in train()

# NOTE(review): assigned but never read in this cell; no validation pass is run here.
best_valid_loss = float('inf')

# NOTE(review): `time` is not among the imports in the notebook's top import cell;
# confirm it is imported before this cell runs.
# `epoch` and `train_loss` stay defined after the loop and are written into the
# checkpoint by the later torch.save cell.
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(FullLSHmodel, dataloader, optimizer, criterion, CLIP)
    end_time = time.time()

    # Convert the two timestamps into the minutes / seconds pair printed below.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # `epoch` is 0-based, so it is reported 1-based; PPL is exp(train_loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

5000it [44:15,  1.88it/s]

Epoch: 01 | Time: 44m 15s
	Train Loss: 7.125 | Train PPL: 1243.114



5000it [44:13,  1.88it/s]

Epoch: 02 | Time: 44m 13s
	Train Loss: 5.302 | Train PPL: 200.735



5000it [44:10,  1.89it/s]

Epoch: 03 | Time: 44m 10s
	Train Loss: 4.637 | Train PPL: 103.195



5000it [43:43,  1.91it/s]

Epoch: 04 | Time: 43m 43s
	Train Loss: 4.136 | Train PPL:  62.545



5000it [43:41,  1.91it/s]

Epoch: 05 | Time: 43m 41s
	Train Loss: 3.755 | Train PPL:  42.734





In [None]:
# Continue training the same FullLSHmodel / optimizer objects for 10 more passes.
# The epoch counter printed below restarts at 01 in this cell; it is not cumulative.
N_EPOCHS = 10
CLIP = 5  # passed to train() as its last argument; presumably a gradient-clipping threshold — confirm in train()

# NOTE(review): assigned but never read in this cell; no validation pass is run here.
best_valid_loss = float('inf')

# NOTE(review): `time` is not among the imports in the notebook's top import cell;
# confirm it is imported before this cell runs.
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(FullLSHmodel, dataloader, optimizer, criterion, CLIP)
    end_time = time.time()

    # Convert the two timestamps into the minutes / seconds pair printed below.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # `epoch` is 0-based, so it is reported 1-based; PPL is exp(train_loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

5000it [15:03,  5.53it/s]

Epoch: 01 | Time: 15m 3s
	Train Loss: 3.447 | Train PPL:  31.403



5000it [15:03,  5.53it/s]

Epoch: 02 | Time: 15m 4s
	Train Loss: 3.186 | Train PPL:  24.191



5000it [15:04,  5.53it/s]

Epoch: 03 | Time: 15m 4s
	Train Loss: 2.972 | Train PPL:  19.534



5000it [15:07,  5.51it/s]

Epoch: 04 | Time: 15m 7s
	Train Loss: 2.786 | Train PPL:  16.217



5000it [15:17,  5.45it/s]

Epoch: 05 | Time: 15m 17s
	Train Loss: 2.622 | Train PPL:  13.768



5000it [15:17,  5.45it/s]

Epoch: 06 | Time: 15m 17s
	Train Loss: 2.478 | Train PPL:  11.915



5000it [15:22,  5.42it/s]

Epoch: 07 | Time: 15m 22s
	Train Loss: 2.338 | Train PPL:  10.358



5000it [15:18,  5.45it/s]

Epoch: 08 | Time: 15m 18s
	Train Loss: 2.213 | Train PPL:   9.141



5000it [15:16,  5.46it/s]

Epoch: 09 | Time: 15m 16s
	Train Loss: 2.120 | Train PPL:   8.330



5000it [15:16,  5.45it/s]

Epoch: 10 | Time: 15m 17s
	Train Loss: 2.040 | Train PPL:   7.687





In [None]:
# A further 10 passes over the same FullLSHmodel / optimizer objects.
# The epoch counter printed below restarts at 01 in this cell; the final values of
# `epoch` and `train_loss` are what the following torch.save cell stores.
N_EPOCHS = 10
CLIP = 5  # passed to train() as its last argument; presumably a gradient-clipping threshold — confirm in train()

# NOTE(review): assigned but never read in this cell; no validation pass is run here.
best_valid_loss = float('inf')

# NOTE(review): `time` is not among the imports in the notebook's top import cell;
# confirm it is imported before this cell runs.
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(FullLSHmodel, dataloader, optimizer, criterion, CLIP)
    end_time = time.time()

    # Convert the two timestamps into the minutes / seconds pair printed below.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # `epoch` is 0-based, so it is reported 1-based; PPL is exp(train_loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

5000it [15:13,  5.47it/s]

Epoch: 01 | Time: 15m 13s
	Train Loss: 1.952 | Train PPL:   7.041



5000it [15:13,  5.47it/s]

Epoch: 02 | Time: 15m 13s
	Train Loss: 1.874 | Train PPL:   6.518



5000it [15:11,  5.49it/s]

Epoch: 03 | Time: 15m 11s
	Train Loss: 1.799 | Train PPL:   6.044



5000it [15:10,  5.49it/s]

Epoch: 04 | Time: 15m 10s
	Train Loss: 1.746 | Train PPL:   5.731



5000it [15:06,  5.51it/s]

Epoch: 05 | Time: 15m 6s
	Train Loss: 1.698 | Train PPL:   5.463



5000it [15:05,  5.52it/s]

Epoch: 06 | Time: 15m 5s
	Train Loss: 1.657 | Train PPL:   5.242



5000it [15:05,  5.52it/s]

Epoch: 07 | Time: 15m 5s
	Train Loss: 1.606 | Train PPL:   4.983



5000it [15:08,  5.51it/s]

Epoch: 08 | Time: 15m 8s
	Train Loss: 1.567 | Train PPL:   4.793



5000it [15:07,  5.51it/s]

Epoch: 09 | Time: 15m 7s
	Train Loss: 1.530 | Train PPL:   4.620



5000it [15:07,  5.51it/s]

Epoch: 10 | Time: 15m 7s
	Train Loss: 1.490 | Train PPL:   4.437





In [None]:
# Persist a resumable checkpoint: model weights, optimizer state, and the values of
# `epoch` and `train_loss` left behind by the most recent training loop.
# `epoch` is that loop's 0-based index, not the total number of epochs trained
# across the preceding cells.
# The relative file name is resolved against the current working directory; the next
# cell reads the file back from /content.
torch.save(
    {
        'epoch': epoch,
        'model_state_dict': FullLSHmodel.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': train_loss,
    },
    'LSH2_3.pt',
)

In [None]:
# Back up the checkpoint from the local disk to the mounted Google Drive folder.
# copyfile() returns the destination path, which is what the cell displays.
copyfile('/content/LSH2_3.pt', '/content/drive/My Drive/TransformerLSH/LSH2_3.pt')

'/content/drive/My Drive/TransformerLSH/LSH2_3.pt'

In [None]:
# Run validate_model() on test_dataloader; it returns two sequences that are detokenized
# and scored with ROUGE in the following cells (predictions as hypotheses, trues as references).
# stop_token=2 has the same value as EOS_token defined at the top of the notebook;
# presumably generation halts on it — confirm against validate_model().
predictions, trues = validate_model(FullLSHmodel, test_dataloader, stop_token=2)

250it [06:34,  1.58s/it]


In [None]:
# Build the tokenizer in "conservative" mode with the BPE model file.
# The relative path assumes the working directory is /content, which is where the
# model file was copied from Google Drive earlier in the notebook.
BPE = pyonmttok.Tokenizer("conservative", bpe_model_path="model-50k_with_joiner")

In [None]:
# Turn each token sequence back into text so that ROUGE can compare the strings.
# Model outputs first ...
predictions_detok = [BPE.detokenize(tokens) for tokens in predictions]

# ... then the reference summaries, in the same order.
trues_detok = [BPE.detokenize(tokens) for tokens in trues]

In [None]:
# Import the scorer class directly: the original bound `rouge` to the module and then
# immediately rebound the same name to an instance, shadowing the module.
from rouge import Rouge

# `rouge` stays the name of the scorer instance, exactly as before this change.
rouge = Rouge()
# Hypotheses are passed first, references second. With avg=True the result is a single
# dict of f / p / r values per metric instead of one entry per sentence pair.
scores = rouge.get_scores(predictions_detok, trues_detok, avg=True)

In [None]:
# Display the averaged ROUGE-1 / ROUGE-2 / ROUGE-L results computed in the previous cell.
scores

{'rouge-1': {'f': 0.061742156144260124,
  'p': 0.06666360989801418,
  'r': 0.0602998085248088},
 'rouge-2': {'f': 0.006710968747283967,
  'p': 0.007141341991341988,
  'r': 0.006568398268398267},
 'rouge-l': {'f': 0.05933669709711019,
  'p': 0.06467209302135801,
  'r': 0.05742897935397962}}