# Data preparation

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!cp drive/MyDrive/qa_data.jsonl.zip .

In [3]:
!unzip qa_data.jsonl.zip

Archive:  qa_data.jsonl.zip
replace qa_data.jsonl? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace __MACOSX/._qa_data.jsonl? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [4]:
!head -n 300000 qa_data.jsonl > data.jsonl

In [5]:
!pip install youtokentome



In [6]:
!head -n 500000 qa_data.jsonl | sed 's/[^0-9а-яА-Я \-\.\?]//g' | sed 's/  / /g' > forbpe.txt

# Imports

In [7]:
import numpy as np
import pandas as pd
import json
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [8]:
from tqdm import tqdm

In [9]:
from torchtext.legacy.data import BucketIterator
from torchtext.nn import MultiheadAttentionContainer, InProjContainer, ScaledDotProduct

In [10]:
import youtokentome as yttm

In [11]:
from collections import namedtuple

# Подготовка

In [12]:
%%time
vocab_size = 30000
model_path = 'pretrained_bpe_lm.model'
yttm.BPE.train(data='forbpe.txt', vocab_size=vocab_size, model=model_path)

CPU times: user 15.9 s, sys: 1.68 s, total: 17.6 s
Wall time: 14.6 s


In [13]:
tokenizer = yttm.BPE(model=model_path)
(PAD_TOKEN,
UNK_TOKEN,
START_TOKEN,
END_TOKEN) = tokenizer.vocab()[:4]
MAX_LEN=48
PAD_IDX = tokenizer.subword_to_id(PAD_TOKEN)

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
data = list()

with open('data.jsonl') as file_object:
    for line in file_object:
        data.append(json.loads(line.strip()))

test_start_idx = int(len(data) * 0.05)
val_start_idx = test_start_idx * 2
random.shuffle(data)
test_subset = data[:test_start_idx]
valid_subset = data[test_start_idx:val_start_idx]
train_subset = data[val_start_idx:]

In [16]:
class QADataset(Dataset):
    def __init__(self, data, _tokenizer=None, MAX_LEN=MAX_LEN):
        super().__init__()
        if _tokenizer is None:
            _tokenizer = tokenizer
        self._tokenizer = _tokenizer
        questions = []
        responses = []
        self.length: int
        for line_dict in data:
            question = line_dict["question"]
            response = line_dict["responses"]
            if len(response) == 0:
                continue
            questions.append(
                self._tokenize(question, MAX_LEN)
            )
            responses.append(
                self._tokenize(response[0], MAX_LEN)
            )
        else:
            assert len(questions) == len(responses)
            self.length = len(questions)
        self.questions = torch.nn.utils.rnn.pad_sequence(
            questions,
            batch_first=True,
            padding_value=PAD_IDX
        )
        self.responses = torch.nn.utils.rnn.pad_sequence(
            questions,
            batch_first=True,
            padding_value=PAD_IDX
        )

    def __len__(self):
        return self.length

    def __getitem__(self, item):
        return (
            self.questions[item],
            self.responses[item],
        )

    def _tokenize(self, text, max_len):
        return torch.LongTensor(
            self._tokenizer.encode(text, bos=True, eos=True)[:max_len]
        )

In [17]:
train_dataset = QADataset(train_subset)
valid_dataset = QADataset(valid_subset)
test_dataset = QADataset(test_subset)

In [18]:
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=64)
valid_dataloader = DataLoader(valid_dataset, shuffle=True, batch_size=64)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=64)

In [19]:
for i in train_dataloader:
    print(i[0].shape)
    break

torch.Size([64, 48])


# Модель

In [20]:
class PositionWiseFF(nn.Module):
    def __init__(self, embed_dim, pf_dim, dropout):
        super().__init__()
        self.hidden = nn.Linear(embed_dim, pf_dim)
        self.gate = nn.Linear(pf_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, embedding):
        hidden = self.dropout(
            torch.relu(self.hidden(embedding))
        )
        return self.gate(hidden)

In [21]:
class EncodLayer(nn.Module):
    def __init__(
        self,
        embed_dim,
        pf_dim,        
        num_heads=8,
        dropout=0.1
    ):
        super().__init__()
        self.norm_attention, self.norm_ff = (
            2 * [nn.LayerNorm(embed_dim)]
        )
        projection_container = InProjContainer(
            *(3 * [nn.Linear(embed_dim, embed_dim)])
        )
        self.selfAttention = MultiheadAttentionContainer(
            num_heads,
            projection_container,
            ScaledDotProduct(dropout=dropout, batch_first=True),
            nn.Linear(embed_dim, embed_dim),
            batch_first=True
        )
        self.ff = PositionWiseFF(embed_dim, pf_dim, dropout)

    def forward(self, embedding, mask):
        gated, _ = self.selfAttention(*(3 * [embedding]), mask)
        # apply residual connection
        normalized = self.norm_attention(embedding + gated)
        gated = self.ff(embedding)
        return self.norm_ff(
            normalized + gated # apply residual connection
        )

In [22]:
class Encoder(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_dim,
        pf_dim,
        num_heads=8,
        num_layers=6,
        dropout=0.1,
        MAX_LEN=MAX_LEN
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_IDX)
        self.positional = nn.Embedding(MAX_LEN, embed_dim)
        self.scaling = torch.sqrt(torch.FloatTensor([embed_dim])).to(device)
        self.encoder_layers = nn.ModuleList(
            num_layers * [EncodLayer(embed_dim, pf_dim, num_heads, dropout)] 
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, texts, mask):
        batch_size, text_len = texts.shape[:2]
        # position matrix for texts
        pos = torch.IntTensor(
            batch_size * [list(range(text_len))]
        ).to(device)
        pos_hidden = self.positional(pos)
        text_hidden = self.embedding(texts) * self.scaling
        combined = self.dropout(pos_hidden + text_hidden)
        for layer in self.encoder_layers:
            combined = layer(combined, mask)
        return combined

In [41]:
class DecoderLayer(nn.Module):
    def __init__(
        self,
        embed_dim,
        pf_dim,
        num_heads=8,
        dropout=0.1,
        MAX_LEN=MAX_LEN
    ):
        super().__init__()
        self.norm_attention, self.norm_encoder, self.norm_ff = (
            3 * [nn.LayerNorm(embed_dim)]
        )
        projection_containers = 2 * [
            InProjContainer(*(3 * [nn.Linear(embed_dim, embed_dim)]))
        ]
        self.encAttention, self.selfAttention = [
            MultiheadAttentionContainer(num_heads,
                                        container,
                                        ScaledDotProduct(dropout=dropout, batch_first=True),
                                        nn.Linear(embed_dim, embed_dim),
                                        batch_first=True)
            for container in projection_containers                                             
        ]
        self.ff = PositionWiseFF(embed_dim, pf_dim, dropout)

    def forward(self, target, encoded, decoder_mask, encoder_mask):
        gated, _ = self.selfAttention(*(3 * [target]), decoder_mask)
        
        target = self.norm_attention(target + gated) # apply residual connection
        # print(f"target: {target.shape}\n encoded: {encoded.shape}")
        gated, attention = self.encAttention(target,
                                             *(2 * [encoded]),
                                             encoder_mask)
        target = self.norm_encoder(target + gated) # apply residual connection
        target = self.norm_ff(target + self.ff(target)) # apply residual connection
        return target, attention
    

In [42]:
class Decoder(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_dim,
        pf_dim,
        num_heads=8,
        num_layers=6,
        dropout=0.1,
        MAX_LEN=MAX_LEN                 
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_IDX)
        self.positional = nn.Embedding(MAX_LEN, embed_dim)
        self.decoder_layers = nn.ModuleList(
            num_layers * [DecoderLayer(embed_dim, pf_dim, num_heads, dropout)]
        )
        self.scaling = torch.sqrt(torch.FloatTensor([embed_dim])).to(device)
        self.output = nn.Linear(embed_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, target, encoded, decoder_mask, encoder_mask):
        batch_size, text_len = target.shape[:2]
        pos = torch.IntTensor(
            batch_size * [list(range(text_len))]
        ).to(device)        
        pos_hidden = self.positional(pos)
        text_hidden = self.embedding(target) * self.scaling
        combined = self.dropout(pos_hidden + text_hidden)
        for layer in self.decoder_layers:
            combined, attention = layer(combined, encoded, decoder_mask, encoder_mask)
        output = self.output(combined)
        return output, attention

In [43]:
class LanguageModel(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.encoder = Encoder(**kwargs)
        self.decoder = Decoder(**kwargs)
        self.encoder.embedding.weight = self.decoder.output.weight

    @staticmethod
    def mask(batch, stage='encoding'):
        shape = batch.shape[1]
        print(f"batch shape: {batch.shape}")
        mask = (batch != PAD_IDX).unsqueeze(1) & torch.ones(
            (shape, shape), device=device
        ).bool()
        print(f"mask shape: {mask.shape}")
        return mask

    def forward(self, texts, target):
        # encoder_mask = self.mask(texts)
        encoder_mask=None
        # decoder_mask = self.mask(target, stage='decoding')
        decoder_mask=None
        print("\nencoding")
        encoded = self.encoder(texts, encoder_mask)
        print("\ndecoding")
        output, attention = self.decoder(target, encoded, decoder_mask, encoder_mask)
        return output, attention

In [35]:
model = LanguageModel(
    vocab_size=vocab_size,
    embed_dim=128,
    pf_dim=256,
    num_heads=8,
    dropout=0.1,
    MAX_LEN=MAX_LEN
).to(device)

In [36]:
optimizer = optim.Adam(model.parameters(), lr=0.005)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)
loss = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

# Обучение

In [47]:
def train(model:nn.Module,
          iterator:torch.utils.data.DataLoader,
          optimizer:torch.optim.Optimizer,
          loss_fn:nn.modules.loss._Loss,
          grad_clip:int=1,
          print_every:int=1000):
    
    epoch_loss:list = []
    model.train()
    
    for i, (texts, ys) in enumerate(iterator):
        optimizer.zero_grad()

        output, _ = model(texts.to(device), ys.to(device))
        print(f"output shape: {output.shape}")
        print(f"ys shape: {ys.shape}")
        output_dim = output.shape[-1]    
        output = output.contiguous().view(-1, output_dim)
        ys = ys.contiguous().view(-1)
        print(f"output shape: {output.shape}")
        print(f"ys shape: {ys.shape}")
        print("before loss")
        loss = loss_fn(output, ys)
        loss.backward()
        epoch_loss.append(loss.item())

        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        
        if not (i + 1) % print_every:
            print(f"loss: {np.mean(epoch_loss)}")
    return np.mean(epoch_loss)
    
def evaluate(model:nn.Module,
             iterator:torch.utils.data.DataLoader,
             loss_fn:torch.nn.modules.loss._Loss):
    
    epoch_loss:list = []
        
    model.eval()
    with torch.no_grad():
        for texts, ys in iterator:
            output, _ = model(texts.to(device), ys.to(device))
            output_dim = output.shape[-1]    
            output = output.contiguous().view(-1, output_dim)
            ys = ys.contiguous().view(-1)

            loss = loss_fn(output, ys)
            epoch_loss.append(loss.item())
    
    return np.mean(epoch_loss)

In [50]:
loss_eval = []

NUM_EPOCHS = 60  
counter = 0
best_loss = 100

for n_epoch in tqdm(range(NUM_EPOCHS)):
    print(f"\nEpoch #{str(n_epoch + 1)}:")
    train = train(model, train_dataloader, optimizer, loss)
    ev = evaluate(model, valid_dataloader, loss)
    loss_eval.append(ev)
    print("\nMean Loss: ", ev)
    # for early stopping
    if ev < best_loss:
        best_loss = ev
        counter = 0
    else:
        counter += 1
    if counter == 5:
        break

    # for saving
    if n_epoch % 5 == 0:
        torch.save(model.state_dict(), 'trans.pt')

  0%|          | 0/60 [00:00<?, ?it/s]


Epoch #1:

encoding





RuntimeError: ignored

In [49]:
torch.cuda.empty_cache()

In [None]:
def infer(sentence:str,
          model:nn.Module,
          device,
          MAX_LEN=MAX_LEN):
    tokenized = tokenizer.encode(sentence, bos=True, eos=True)[:MAX_LEN]
    padded = torch.nn.utils.rnn.pad_sequence(
        [tokenized],
        batch_first=True,
        padding_value=PAD_IDX
    )
    encoder_mask = model.mask(padded)
    start_idx, end_idx = (
        tokenizer.subword_to_id(START_TOKEN),
        tokenizer.subword_to_id(END_TOKEN),
    )
    targets = [start_idx]
    with torch.no_grad():
        encoded = model.encoder(padded, encoder_mask),

    for i in range(MAX_LEN):
        target = torch.LongTensor([targets]).to(device)
        decoder_mask = model.mask(target)
        with torch.no_grad():
            output, attention = model.decoder(target, encoded, decoder_mask, encoder_mask)
        prediction = output.argmax(2)[:,-1].item()
        targets.append(prediction)
        if prediction == end_idx:
            break
    return tokenizer.decode(targets)

In [None]:
model.load_state_dict(torch.load('trans.pt'))