# Data preparation

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime, which
# provides the `google.colab` package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the zipped dataset from the mounted Drive into the current directory.
# NOTE(review): the source path is relative, so this presumably relies on the
# working directory being /content (where the Drive was mounted) — confirm.
!cp drive/MyDrive/qa_data.jsonl.zip .

In [None]:
# Extract the dataset. `-o` overwrites existing files without prompting, so the
# cell does not block on an interactive question when the notebook is re-run;
# `-x` skips the macOS resource-fork entries stored in the archive.
!unzip -o qa_data.jsonl.zip -x '__MACOSX/*'

Archive:  qa_data.jsonl.zip
  inflating: qa_data.jsonl           
  inflating: __MACOSX/._qa_data.jsonl  


In [None]:
# Keep the first 300000 lines of the dataset as the working sample that the
# loading cell reads from data.jsonl.
!head -n 300000 qa_data.jsonl > data.jsonl

In [None]:
# Install the BPE tokenizer into the kernel's own environment (%pip, not !pip)
# and pin the version this notebook was run with, for reproducibility.
%pip install -q youtokentome==1.0.6

Collecting youtokentome
  Downloading youtokentome-1.0.6-cp37-cp37m-manylinux2010_x86_64.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 4.2 MB/s 
Installing collected packages: youtokentome
Successfully installed youtokentome-1.0.6


In [None]:
# Build the BPE training corpus from the first 500000 lines: drop every
# character that is not a digit, a Cyrillic letter, a space, '.', '?' or '-',
# then squeeze runs of spaces into a single space.
# - ё/Ё are listed explicitly because they lie outside the а-я / А-Я ranges.
# - '-' is placed last in the bracket expression: a backslash is literal inside
#   brackets, so the former `\-\.` kept backslashes and dropped hyphens.
!head -n 500000 qa_data.jsonl | sed 's/[^0-9а-яА-ЯёЁ .?-]//g' | sed 's/  */ /g' > forbpe.txt

# Imports

In [None]:
import json
import random

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
from torchtext.legacy.data import BucketIterator

In [None]:
import youtokentome as yttm

In [None]:
from torchtext.nn import MultiHeadAttentionContainer, InProjContainer, ScaledDotProduct

# Подготовка

In [None]:
%%time
vocab_size = 30000
model_path = 'pretrained_bpe_lm.model'
yttm.BPE.train(data='forbpe.txt', vocab_size=vocab_size, model=model_path)

CPU times: user 17.8 s, sys: 2.84 s, total: 20.6 s
Wall time: 12.5 s


In [None]:
# Load the trained BPE model; the first four vocabulary entries are bound, in
# order, to the pad / unknown / start / end token names used below.
tokenizer = yttm.BPE(model=model_path)
PAD_TOKEN, UNK_TOKEN, START_TOKEN, END_TOKEN = tokenizer.vocab()[:4]

# Maximum number of token ids kept per text.
MAX_LEN = 64

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Load the JSONL sample: one JSON object per non-empty line.
data = []
with open('data.jsonl', encoding='utf-8') as file_object:
    for line in file_object:
        line = line.strip()
        if line:  # skip blank lines instead of failing inside json.loads
            data.append(json.loads(line))

# Hold out two equally sized slices (test, then validation); everything after
# them is used for training. The former fraction of 0.5, doubled for the
# validation boundary, consumed the whole dataset and left training empty.
# NOTE(review): 0.05 is an assumed hold-out fraction — confirm the intended split.
HOLDOUT_FRACTION = 0.05
SPLIT_SEED = 42

# A dedicated, seeded generator makes the shuffle reproducible across kernel
# restarts without touching the global `random` state.
random.Random(SPLIT_SEED).shuffle(data)

test_start_idx = int(len(data) * HOLDOUT_FRACTION)  # end of the test slice
val_start_idx = test_start_idx * 2                  # end of the validation slice
test_subset = data[:test_start_idx]
valid_subset = data[test_start_idx:val_start_idx]
train_subset = data[val_start_idx:]
assert len(test_subset) + len(valid_subset) + len(train_subset) == len(data)

In [None]:
class QADataset(Dataset):
    """Question/response pairs encoded with a BPE tokenizer and padded to tensors.

    Records whose ``"responses"`` list is empty are skipped; for every other
    record only the first response is used.

    Attributes set by ``__init__``:
        questions: ``LongTensor`` of padded question token ids, one row per pair.
        responses: ``LongTensor`` of padded response token ids, one row per pair.
        length: number of pairs kept.
    """

    def __init__(self, data, _tokenizer=None, MAX_LEN=MAX_LEN):
        """Tokenize and pad every usable record of ``data``.

        Args:
            data: iterable of dicts with a ``"question"`` entry and a
                ``"responses"`` sequence (presumably of strings — both are
                passed to the tokenizer's ``encode``).
            _tokenizer: object providing ``encode(text, bos=..., eos=...)`` and
                ``subword_to_id``; defaults to the module-level ``tokenizer``.
            MAX_LEN: maximum number of token ids kept per text.
        """
        super().__init__()
        if _tokenizer is None:
            _tokenizer = tokenizer
        self._tokenizer = _tokenizer

        questions = []
        responses = []
        for line_dict in data:
            candidates = line_dict["responses"]
            if len(candidates) == 0:
                continue  # nothing to learn from a question without an answer
            questions.append(self._tokenize(line_dict["question"], MAX_LEN))
            responses.append(self._tokenize(candidates[0], MAX_LEN))

        self.length: int = len(questions)
        pad_id = self._tokenizer.subword_to_id(PAD_TOKEN)
        self.questions = self._pad(questions, pad_id)
        # Pad the responses themselves (this used to pad `questions` a second
        # time, which made every target identical to its input).
        self.responses = self._pad(responses, pad_id)

    def __len__(self):
        """Return the number of question/response pairs."""
        return self.length

    def __getitem__(self, item):
        """Return the ``(question, response)`` rows selected by ``item``."""
        return (
            self.questions[item],
            self.responses[item]
        )

    def _tokenize(self, text, max_len):
        """Encode ``text`` with BOS/EOS requested and keep the first ``max_len`` ids.

        The slice is applied to the already encoded sequence, so a text longer
        than ``max_len`` ids loses whatever ``encode`` placed at its end.
        """
        return torch.LongTensor(
            self._tokenizer.encode(text, bos=True, eos=True)[:max_len]
        )

    @staticmethod
    def _pad(sequences, pad_id):
        """Right-pad 1-D id tensors into one batch-first matrix (empty-safe)."""
        if not sequences:
            # pad_sequence raises on an empty list; return an empty matrix instead.
            return torch.empty((0, 0), dtype=torch.long)
        return torch.nn.utils.rnn.pad_sequence(
            sequences,
            batch_first=True,
            padding_value=pad_id
        )

In [None]:
# Wrap each split in a QADataset (built in train, validation, test order).
train_dataset, valid_dataset, test_dataset = (
    QADataset(subset)
    for subset in (train_subset, valid_subset, test_subset)
)

In [None]:
# Build one bucketing iterator per split. `BucketIterator.splits` is the
# factory that accepts a tuple of datasets together with `batch_sizes`; the
# plain constructor takes a single dataset and a `batch_size`.
# NOTE(review): the sort key counts non-zero ids in the question, which
# presumably assumes the pad id is 0 — confirm against the tokenizer.
# NOTE(review): legacy torchtext iterators are written for torchtext datasets
# (with `fields`); verify they batch this torch `Dataset` as intended.
(
    train_dataloader,
    valid_dataloader,
    test_dataloader
) = BucketIterator.splits(
    (train_dataset, valid_dataset, test_dataset),
    batch_sizes=(128, 128, 128),
    device=device,
    sort_key=lambda x: torch.count_nonzero(x[0]),
    sort=True,
    shuffle=True,
    sort_within_batch=False
)

# Модель

In [None]:
class PositionWiseFF(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        embed_dim: size of the last dimension of the input and of the output.
        pf_dim: size of the hidden projection.
        dropout: dropout probability applied to the hidden activation.
    """

    def __init__(self, embed_dim, pf_dim, dropout):
        # nn.Module must be initialised before submodules are assigned;
        # without this call the first assignment below raises AttributeError.
        super().__init__()
        self.hidden = nn.Linear(embed_dim, pf_dim)
        self.gate = nn.Linear(pf_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, embedding):
        """Map ``(..., embed_dim)`` to ``(..., embed_dim)`` through the hidden layer."""
        hidden = self.dropout(
            torch.relu(self.hidden(embedding))
        )
        return self.gate(hidden)

In [None]:
class EncodLayer(nn.Module):
    """One encoder layer: multi-head self-attention, then a position-wise
    feed-forward block, each followed by a residual connection and LayerNorm.

    Args:
        embed_dim: embedding size; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
        pf_dim: hidden size of the feed-forward block.
        dropout: dropout probability used inside attention and feed-forward.
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        pf_dim,
        dropout=0.3
    ):
        assert embed_dim % num_heads == 0
        super().__init__()
        self.norm_attention = nn.LayerNorm(embed_dim)
        self.norm_ff = nn.LayerNorm(embed_dim)
        # Separate query / key / value projections for the attention container.
        projection_container = InProjContainer(
            nn.Linear(embed_dim, embed_dim),
            nn.Linear(embed_dim, embed_dim),
            nn.Linear(embed_dim, embed_dim)
        )
        # NOTE(review): the container is built with its default tensor layout;
        # if callers pass batch-first tensors, `batch_first=True` is probably
        # needed here — confirm against the torchtext version in use.
        self.selfAttention = MultiHeadAttentionContainer(
            num_heads,
            projection_container,
            ScaledDotProduct(dropout=dropout),
            nn.Linear(embed_dim, embed_dim)
        )
        self.ff = PositionWiseFF(embed_dim, pf_dim, dropout)

    def forward(self, embedding, mask):
        """Apply self-attention and feed-forward with residual + LayerNorm.

        Args:
            embedding: input tensor whose last dimension is ``embed_dim``; used
                as query, key and value.
            mask: forwarded to the attention container as its attention mask
                (expected shape depends on the container — TODO confirm).

        Returns:
            Tensor of the same shape as ``embedding``.
        """
        mha_out, _ = self.selfAttention(embedding, embedding, embedding, mask)
        # Use the attribute actually defined in __init__ (`norm_attention`);
        # `self.normAttention` did not exist and raised AttributeError.
        normalized = self.norm_attention(embedding + mha_out)
        # The feed-forward sublayer consumes the attention sublayer's output,
        # not the layer's raw input.
        gated = self.ff(normalized)
        return self.norm_ff(
            normalized + gated
        )

In [None]:
class Encoder(nn.Module):
    """Transformer encoder: scaled token embeddings plus learned positional
    embeddings, followed by a stack of ``EncodLayer`` blocks.

    Args:
        sent_dim: number of rows in the token embedding table (vocabulary size).
        embed_dim: embedding size.
        num_heads: attention heads per layer.
        pf_dim: hidden size of each layer's feed-forward block.
        dropout: dropout probability.
        MAX_LEN: number of positions in the positional embedding table; inputs
            longer than this cannot be embedded.
        num_layers: number of independent encoder layers (default 12).
    """

    def __init__(
        self,
        sent_dim,
        embed_dim,
        num_heads,
        pf_dim,
        dropout=0.3,
        MAX_LEN=MAX_LEN,
        num_layers=12
    ):
        super().__init__()
        self.embedding = nn.Embedding(sent_dim, embed_dim)
        self.positional = nn.Embedding(MAX_LEN, embed_dim)
        # A plain float keeps the scale device-agnostic; the previous tensor was
        # pinned to the global `device` and did not follow `module.to(...)`.
        self.scaling = embed_dim ** 0.5
        # Build a distinct layer per position in the stack. `[layer] * n`
        # repeats one object, so all "layers" shared a single set of weights.
        self.encoder_layers = nn.ModuleList(
            [
                EncodLayer(embed_dim, num_heads, pf_dim, dropout)
                for _ in range(num_layers)
            ]
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, texts, mask):
        """Encode a batch of token-id sequences.

        Args:
            texts: integer tensor of shape ``(batch_size, text_len)``.
            mask: attention mask passed unchanged to every layer.

        Returns:
            Tensor of shape ``(batch_size, text_len, embed_dim)``.
        """
        batch_size = texts.shape[0]
        text_len = texts.shape[1]
        # Position ids 0..text_len-1 for every row, shape (batch_size, text_len),
        # created on the input's device. The former extra leading dimension
        # broadcast the sum below into a 4-D tensor.
        pos = torch.arange(
            text_len, device=texts.device
        ).unsqueeze(0).expand(batch_size, -1)
        pos_hidden = self.positional(pos)
        text_hidden = self.embedding(texts) * self.scaling
        combined = self.dropout(pos_hidden + text_hidden)
        for layer in self.encoder_layers:
            combined = layer(combined, mask)
        return combined

# Обучение