In [59]:
import numpy as np
import json
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from typing import Optional
import matplotlib.pyplot as plt
import pandas as pd
from string import punctuation
from nltk import tokenize
import youtokentome as yttm
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

# Создаем датасет

In [40]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
def load_jsonl(path):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        path: location of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-empty line, in file order.

    Raises:
        json.JSONDecodeError: if a non-empty line is not valid JSON.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as reader:
        for line in reader:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no record and would otherwise make json.loads fail.
            if not line:
                continue
            records.append(json.loads(line))
    return records

In [4]:
# Raw question/answer records, one decoded JSON object per line of the file.
qa_data = load_jsonl(path='qa_data.jsonl')

In [5]:
# Flatten the records: one lower-cased (question, response) pair per response.
all_data = [
    {'question': record['question'].lower(), 'response': answer.lower()}
    for record in qa_data
    for answer in record['responses']
]

In [6]:
# Tabulate the flat pairs: one row per (question, response).
data = pd.json_normalize(data=all_data)

In [7]:
# Number of (question, response) rows, followed by a preview of the first ones.
print(data.shape[0])
data.head(n=5)

7767138


Unnamed: 0,question,response
0,долго ли идут деньги с яндексденег на карту visa?,нет. прорыв 35 ;)
1,можно ли зарегистрировать авто в другом регионе,можно на родственника из того региона.. . а п...
2,что делать если у меня очень тонкие ногти а хо...,витамины и умная эмаль (каждый день)
3,что делать если у меня очень тонкие ногти а хо...,ванночки с морской солью. с вечера мажь ногти ...
4,что делать если у меня очень тонкие ногти а хо...,"умная эмаль, витамины, йод, и поменьше крась л..."


In [8]:
# Dump every response to a plain-text file, one per line; this file is the
# training corpus for the BPE tokenizer.
# NOTE(review): only responses are written here, while questions are later
# encoded with the same tokenizer — confirm that this is intended.
with open('for_bpe.txt', mode='w', encoding='utf-8') as corpus_file:
    corpus_file.writelines(response + '\n' for response in data.response)

# BPE training parameters
vocab_size = 30_000
model_path = 'pretrained_bpe_lm.model'

In [9]:
# Fit the BPE vocabulary on the dumped corpus and store the model at model_path.
yttm.BPE.train(
    data='for_bpe.txt',
    model=model_path,
    vocab_size=vocab_size,
)

<youtokentome.youtokentome.BPE at 0x7f464f8ca730>

In [10]:
# Load the trained tokenizer from disk.
tokenizer = yttm.BPE(model_path)

# Peek at the first vocabulary entries (the special tokens come first).
tokenizer.vocab()[0:15]

['<PAD>',
 '<UNK>',
 '<BOS>',
 '<EOS>',
 '▁',
 'о',
 'е',
 'а',
 'т',
 'н',
 'и',
 'с',
 'р',
 'в',
 'л']

In [11]:
# Work on a reproducible 0.5% random sample of the full table.
small_data = data.sample(random_state=42, frac=0.005)

In [12]:
# Row count of the sampled subset.
print(small_data.shape[0])

38836


In [99]:
# Preview the sampled rows (original row labels are preserved by the sampling).
small_data.head(n=5)

Unnamed: 0,question,response
3194253,а вот у меня нет любви...я ценю свободу...а вы...,иногда хочется какой-то определенности но в ос...
5451557,подскажите название телепередачи!!!,бля кажется по дисковери было или експлоуер)
3314992,"дамы! что есть хуже порванных новых колготок, ...",ты порвал свои новые колготки прямо перед выхо...
7522285,"бухгалтера салона красоты отзовитесь, пожалуйста!",для услуг я бы использовала 26 но можно и на 2...
6000707,за сколько времени зарядится акб на 500mah. ес...,от силы тока зарядки зависит


In [100]:
# Sequential 80/10/10 split of the sampled rows into train / validation / test.
train_size = 0.8
valid_size = 0.1

train_index = int(len(small_data) * train_size)
valid_index = int(len(small_data) * valid_size)

# .iloc makes the positional slicing explicit (the index holds the original,
# non-contiguous row labels).
df_train = small_data.iloc[:train_index]
df_rem = small_data.iloc[train_index:]  # everything that is not training data

df_valid = df_rem.iloc[:valid_index]
df_test = df_rem.iloc[valid_index:]

# The three parts must partition the sample exactly.
assert len(df_train) + len(df_valid) + len(df_test) == len(small_data)

X_train, y_train = df_train.drop(columns='response').copy(), df_train['response'].copy()
X_valid, y_valid = df_valid.drop(columns='response').copy(), df_valid['response'].copy()
X_test, y_test = df_test.drop(columns='response').copy(), df_test['response'].copy()

# One print per statement: writing `print(a), print(b)` builds a tuple of the
# two return values, which the notebook then echoes as `(None, None)`.
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)
print(X_test.shape)
print(y_test.shape)

(31068, 1)
(31068,)
(3883, 1)
(3883,)
(3885, 1)
(3885,)


(None, None)

In [101]:
# Encode straight from the split frames rather than from X_* / y_*: this cell
# rebinds those names to lists of token ids, so reading from them would make a
# second run of the cell fail (a list has no `.question` column).
#
# Questions get a leading <BOS>. Responses get both <BOS> and <EOS>, so the
# target sequence contains an explicit end-of-sequence token for the decoder
# to learn.
X_train = tokenizer.encode(df_train['question'].to_list(), bos=True)
y_train = tokenizer.encode(df_train['response'].to_list(), bos=True, eos=True)
X_valid = tokenizer.encode(df_valid['question'].to_list(), bos=True)
y_valid = tokenizer.encode(df_valid['response'].to_list(), bos=True, eos=True)
X_test = tokenizer.encode(df_test['question'].to_list(), bos=True)
y_test = tokenizer.encode(df_test['response'].to_list(), bos=True, eos=True)

In [102]:
class SeqDataset(torch.utils.data.Dataset):
    """Paired token-id sequences, truncated or padded to a fixed length.

    Args:
        seq_1: indexable collection of question token-id lists.
        seq_2: indexable collection of response token-id lists.
        max_len: length every returned sequence is cut or padded to.
        pad_index: token id appended to sequences shorter than ``max_len``.
    """

    def __init__(self, seq_1, seq_2, max_len, pad_index):
        self.questions, self.responses = seq_1, seq_2
        self.max_len, self.pad_index = max_len, pad_index

    def __len__(self):
        """Number of pairs, taken from the question collection."""
        return len(self.questions)

    def _to_column(self, token_ids):
        """Clip ``token_ids`` to ``max_len``, right-pad it, return a (max_len, 1) long tensor."""
        clipped = token_ids[:self.max_len]
        filler = [self.pad_index] * (self.max_len - len(clipped))
        return torch.tensor(clipped + filler).long().view(-1, 1)

    def __getitem__(self, index):
        """Return the ``(question, response)`` tensors stored at ``index``."""
        question_ids = self.questions[index]
        response_ids = self.responses[index]
        return self._to_column(question_ids), self._to_column(response_ids)

In [103]:
# Wrap each split in a SeqDataset; every sequence is cut or padded to 30 tokens
# using token id 0 as padding.
train_dataset, valid_dataset, test_dataset = (
    SeqDataset(seq_1=list(questions), seq_2=list(responses), max_len=30, pad_index=0)
    for questions, responses in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test))
)

In [104]:
# Echo the dataset object to confirm it exists (shows only the default object repr).
train_dataset

<__main__.SeqDataset at 0x7f43bdcafd60>

In [105]:
batch_size = 64

# Shuffle the training batches so that each epoch sees the examples in a
# different order; the seeded generator keeps that order reproducible across
# notebook runs. Validation and test data keep their fixed order.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size)

# Pull a single batch to sanity-check the tensor shapes.
x, y = next(iter(train_loader))

x.shape, y.shape

(torch.Size([64, 30, 1]), torch.Size([64, 30, 1]))

# Делаем модель

Отталкиваюсь от статьи 'Convolutional Sequence to Sequence Learning': https://arxiv.org/abs/1705.03122

Пока не закончено

In [53]:
class Encoder(nn.Module):
    """Convolutional sequence encoder with GLU activations and residual connections.

    Token and learned position embeddings are summed, projected to ``hid_dim``
    channels, passed through ``n_layers`` length-preserving 1-D convolutions
    and projected back to ``emb_dim``.

    Args:
        input_dim: size of the source vocabulary (rows of the token embedding).
        emb_dim: embedding dimensionality.
        hid_dim: number of channels inside the convolutional stack.
        n_layers: number of convolutional blocks.
        kernel_size: convolution width; must be odd so that symmetric padding
            keeps the sequence length unchanged.
        dropout: dropout probability.
        device: device on which the scale constant is initially created; also
            kept as ``self.device`` for backward compatibility.
        max_length: number of rows in the position-embedding table, i.e. the
            longest sequence the encoder accepts.
    """

    def __init__(self,
                 input_dim,
                 emb_dim,
                 hid_dim,
                 n_layers,
                 kernel_size,
                 dropout,
                 device,
                 max_length = 100):
        super().__init__()

        assert kernel_size % 2 == 1, "Kernel size must be odd!"

        self.device = device

        # sqrt(0.5), applied after every residual sum. Registered as a
        # non-persistent buffer so that it follows the module through
        # .to()/.cuda() without adding a key to the state dict.
        self.register_buffer('scale',
                             torch.sqrt(torch.FloatTensor([0.5])).to(device),
                             persistent=False)

        self.tok_embedding = nn.Embedding(input_dim, emb_dim)
        self.pos_embedding = nn.Embedding(max_length, emb_dim)

        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)

        # Each conv emits 2 * hid_dim channels because the GLU halves them again.
        self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim,
                                              out_channels = 2 * hid_dim,
                                              kernel_size = kernel_size,
                                              padding = (kernel_size - 1) // 2)
                                    for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source token ids.

        Args:
            src: integer tensor indexed as [batch, src_len].

        Returns:
            Tuple ``(conved, combined)``, both [batch, src_len, emb_dim]:
            the output of the convolutional stack projected back to the
            embedding size, and its scaled sum with the input embeddings.

        Raises:
            IndexError: if ``src_len`` exceeds the position-embedding table.
        """
        batch_size = src.shape[0]
        src_len = src.shape[1]

        max_length = self.pos_embedding.num_embeddings
        if src_len > max_length:
            # Fail with a readable message instead of an opaque embedding
            # lookup error.
            raise IndexError(f'sequence length {src_len} exceeds max_length {max_length}')

        # Build the position ids directly on the input's device.
        pos = torch.arange(0, src_len, device=src.device).unsqueeze(0).repeat(batch_size, 1)

        tok_embedded = self.tok_embedding(src)
        pos_embedded = self.pos_embedding(pos)
        embedded = self.dropout(tok_embedded + pos_embedded)

        # Conv1d expects channels first: [batch, hid_dim, src_len].
        conv_input = self.emb2hid(embedded).permute(0, 2, 1)

        for conv in self.convs:
            conved = conv(self.dropout(conv_input))
            conved = F.glu(conved, dim = 1)
            # Residual connection around the block, scaled by sqrt(0.5).
            conv_input = (conved + conv_input) * self.scale

        # After the loop conv_input holds the last block's output (or the
        # projected embeddings when there are no blocks at all).
        conved = self.hid2emb(conv_input.permute(0, 2, 1))
        combined = (conved + embedded) * self.scale

        return conved, combined

In [54]:
class Decoder(nn.Module):
    """Convolutional sequence decoder with per-layer attention over the encoder.

    Target embeddings are projected to ``hid_dim`` channels and passed through
    ``n_layers`` left-padded 1-D convolutions, so a position only sees itself
    and earlier positions. Each layer uses GLU, attention over the encoder
    outputs and a residual connection.

    Args:
        output_dim: size of the target vocabulary.
        emb_dim: embedding dimensionality.
        hid_dim: number of channels inside the convolutional stack.
        n_layers: number of convolutional blocks.
        kernel_size: convolution width.
        dropout: dropout probability.
        trg_pad_idx: value used to fill the left padding of every conv input.
        device: device on which the scale constant is initially created; also
            kept as ``self.device`` for backward compatibility.
        max_length: number of rows in the position-embedding table.
    """

    def __init__(self,
                 output_dim,
                 emb_dim,
                 hid_dim,
                 n_layers,
                 kernel_size,
                 dropout,
                 trg_pad_idx,
                 device,
                 max_length = 100):
        super().__init__()

        self.kernel_size = kernel_size
        self.trg_pad_idx = trg_pad_idx
        self.device = device

        # sqrt(0.5), applied after every residual sum. Registered as a
        # non-persistent buffer so that it follows the module through
        # .to()/.cuda() without adding a key to the state dict.
        self.register_buffer('scale',
                             torch.sqrt(torch.FloatTensor([0.5])).to(device),
                             persistent=False)

        self.tok_embedding = nn.Embedding(output_dim, emb_dim)
        self.pos_embedding = nn.Embedding(max_length, emb_dim)

        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)

        self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
        self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)

        self.fc_out = nn.Linear(emb_dim, output_dim)

        # No built-in padding: forward() pads on the left only.
        self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim,
                                              out_channels = 2 * hid_dim,
                                              kernel_size = kernel_size)
                                    for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout)

    def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
        """Attend over the encoder outputs for one decoder layer.

        Args:
            embedded: decoder input embeddings, [batch, trg_len, emb_dim].
            conved: GLU output of the current layer, [batch, hid_dim, trg_len].
            encoder_conved: encoder output used as attention keys,
                [batch, src_len, emb_dim].
            encoder_combined: encoder output used as attention values,
                [batch, src_len, emb_dim].

        Returns:
            Tuple ``(attention, attended_combined)``: the attention weights
            [batch, trg_len, src_len] (softmax over the source axis) and the
            layer output with the attended encoding added,
            [batch, hid_dim, trg_len].
        """
        conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))
        combined = (conved_emb + embedded) * self.scale

        energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))

        attention = F.softmax(energy, dim=2)
        attended_encoding = torch.matmul(attention, encoder_combined)
        attended_encoding = self.attn_emb2hid(attended_encoding)
        attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale

        return attention, attended_combined

    def forward(self, trg, encoder_conved, encoder_combined):
        """Compute vocabulary scores for every target position.

        Args:
            trg: integer tensor indexed as [batch, trg_len].
            encoder_conved: first output of the encoder.
            encoder_combined: second output of the encoder.

        Returns:
            Tuple ``(output, attention)``: scores [batch, trg_len, output_dim]
            and the attention weights of the last layer (``None`` when the
            decoder has no layers).

        Raises:
            IndexError: if ``trg_len`` exceeds the position-embedding table.
        """
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]

        max_length = self.pos_embedding.num_embeddings
        if trg_len > max_length:
            # Fail with a readable message instead of an opaque embedding
            # lookup error.
            raise IndexError(f'sequence length {trg_len} exceeds max_length {max_length}')

        # Build the position ids directly on the input's device.
        pos = torch.arange(0, trg_len, device=trg.device).unsqueeze(0).repeat(batch_size, 1)

        tok_embedded = self.tok_embedding(trg)
        pos_embedded = self.pos_embedding(pos)

        embedded = self.dropout(tok_embedded + pos_embedded)

        # Conv1d expects channels first: [batch, hid_dim, trg_len].
        conv_input = self.emb2hid(embedded).permute(0, 2, 1)

        hid_dim = conv_input.shape[1]

        # Left padding of width kernel_size - 1. It is identical for every
        # layer, so it is built once, on the activations' device and dtype.
        # NOTE(review): the fill value is the pad *token id* used as an
        # activation value; it equals zero padding only when trg_pad_idx is 0.
        padding = conv_input.new_full((batch_size, hid_dim, self.kernel_size - 1),
                                      float(self.trg_pad_idx))

        attention = None

        for conv in self.convs:

            conv_input = self.dropout(conv_input)

            padded_conv_input = torch.cat((padding, conv_input), dim = 2)

            conved = conv(padded_conv_input)
            conved = F.glu(conved, dim = 1)

            attention, conved = self.calculate_attention(embedded,
                                                         conved,
                                                         encoder_conved,
                                                         encoder_combined)

            # Residual connection around the block, scaled by sqrt(0.5).
            conv_input = (conved + conv_input) * self.scale

        # After the loop conv_input holds the last block's output (or the
        # projected embeddings when there are no blocks at all).
        conved = self.hid2emb(conv_input.permute(0, 2, 1))

        output = self.fc_out(self.dropout(conved))

        return output, attention

In [55]:
class Seq2Seq(nn.Module):
    """Chains an encoder and a decoder into a single module.

    Args:
        encoder: module called as ``encoder(src)`` and returning two values.
        decoder: module called as ``decoder(trg, first, second)`` with the two
            encoder outputs and returning two values.
    """

    def __init__(self, encoder, decoder):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        """Encode ``src``, decode ``trg`` against it and return the decoder's two outputs."""
        enc_conved, enc_combined = self.encoder(src)
        scores, attn_weights = self.decoder(trg, enc_conved, enc_combined)
        return scores, attn_weights

In [56]:
# Questions and responses share one tokenizer, hence one vocabulary size.
INPUT_DIM = len(tokenizer.vocab())
OUTPUT_DIM = len(tokenizer.vocab())
EMB_DIM = 256
HID_DIM = 512  # each conv. layer has 2 * hid_dim filters
ENC_LAYERS = 10  # number of conv. blocks in encoder
DEC_LAYERS = 10  # number of conv. blocks in decoder
ENC_KERNEL_SIZE = 3  # must be odd!
DEC_KERNEL_SIZE = 3  # can be even or odd
ENC_DROPOUT = 0.25
DEC_DROPOUT = 0.25
TRG_PAD_IDX = 0

enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=ENC_LAYERS,
    kernel_size=ENC_KERNEL_SIZE,
    dropout=ENC_DROPOUT,
    device=device,
)
dec = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=DEC_LAYERS,
    kernel_size=DEC_KERNEL_SIZE,
    dropout=DEC_DROPOUT,
    trg_pad_idx=TRG_PAD_IDX,
    device=device,
)

model = Seq2Seq(encoder=enc, decoder=dec).to(device)

In [57]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report how many parameters of the assembled model require gradients.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 55,387,696 trainable parameters


In [70]:
# Adam over all model parameters, with the library's default hyperparameters.
optimizer = optim.Adam(params=model.parameters())

In [71]:
# Cross-entropy over the vocabulary; positions holding the pad id do not
# contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

In [72]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training pass over ``iterator`` and return the mean batch loss.

    Args:
        model: module called as ``model(src, trg_input)`` and returning
            ``(scores, _)``.
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: optimizer stepped once per batch.
        criterion: loss applied to flattened scores and flattened targets.
        clip: maximum gradient norm applied before each optimizer step.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()

        # Teacher forcing: feed the target without its last token and
        # predict the target shifted left by one position.
        scores, _ = model(source, target[:, :-1])

        flat_scores = scores.contiguous().view(-1, scores.shape[-1])
        flat_target = target[:, 1:].contiguous().view(-1)

        batch_loss = criterion(flat_scores, flat_target)
        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    Args:
        model: module called as ``model(src, trg_input)`` and returning
            ``(scores, _)``.
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: loss applied to flattened scores and flattened targets.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg

            # Same input/target shift as in training.
            scores, _ = model(source, target[:, :-1])

            flat_scores = scores.contiguous().view(-1, scores.shape[-1])
            flat_target = target[:, 1:].contiguous().view(-1)

            running_loss += criterion(flat_scores, flat_target).item()

    return running_loss / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        Tuple ``(minutes, seconds)``, both truncated to integers.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - whole_minutes * 60)
    return whole_minutes, leftover_seconds

In [None]:
import math  # math.exp is used for the perplexity printed below
import time
from types import SimpleNamespace


class _LoaderBatches:
    """Adapts a DataLoader of (questions, responses) pairs to ``.src`` / ``.trg`` batches.

    The datasets in this notebook return tensors with a trailing singleton
    axis ([batch, max_len, 1]); the model indexes its inputs as
    [batch, length], so that axis is dropped here and the tensors are moved
    to ``device``.
    """

    def __init__(self, loader, device):
        self.loader = loader
        self.device = device

    def __len__(self):
        return len(self.loader)

    def __iter__(self):
        for questions, responses in self.loader:
            yield SimpleNamespace(src=questions.squeeze(-1).to(self.device),
                                  trg=responses.squeeze(-1).to(self.device))


# train() and evaluate() read batch.src / batch.trg, so wrap the loaders
# built earlier in the notebook.
train_iterator = _LoaderBatches(train_loader, device)
valid_iterator = _LoaderBatches(valid_loader, device)

N_EPOCHS = 10
CLIP = 0.1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the checkpoint with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut5-model.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')