1. Написать Dataset для задачи seq2seq
2. Реализовать модель
3. Сделать цикл обучения
4. Реализовать метод генерации ответа по вопросу с помощью вашей модели


1. Сделать модель, основанную на lstm/gru 5 баллов ✓
2. Сделать модель, основанную на cnn 7 баллов
3. Сделать модель, основанную на трансформере (реализовать все слои самому) 10 баллов
4. Добавить в rnn/cnn модель attention 5 баллов ✓
5. Реализовать жадное семплирование (генерацию по самому вероятному токену, как выше в языковой модели) 3 балла ✓
6. Реализовать beam search 5 баллов
7. Реализовать nucleus sampling 5 баллов ✓
8. Добавить condition в модель 3 балла
9. Добавить layer norm/residual в cnn или rnn модель 1 балл ✓
10. Реализовать аккамуляцию градиентов 1 балл
11. Сделать телеграм бота 2 балла


In [5]:
import zipfile
import json
from tqdm import tqdm
import youtokentome as yttm
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from typing import Iterable, List, Tuple
import torch
from torch import nn
from string import punctuation, whitespace
from src.utils import train_bpe

ModuleNotFoundError: No module named 'src'

In [10]:
ZIP_PATH = 'qa_data.jsonl.zip'
FILE_NAME = 'qa_data.jsonl'
BPE_TEXT_PATH = 'bpe_raw.txt'
BPE_MODEL_PATH = 'bpe_qa.model'
VOCAB_SIZE = 7000
MAX_SOURCE_LEN = 32
MAX_TARGET_LEN = 32
QA_MODEL_PATH = 'qa_model.pth'

In [None]:
qa_data = list()
with open(ZIP_PATH) as ZipFile
with open('qa_data.jsonl') as file_object:
    for line in file_object:
        qa_data.append(json.loads(line.strip()))

In [9]:
if 1:
    train_bpe(qa_data, bpe_text_path, bpe_model_path, 7000)

NameError: name 'qa_data' is not defined

In [34]:
bpe = yttm.BPE(model=bpe_model_path)

In [35]:
bpe.vocab()[:10]

['<PAD>', '<UNK>', '<BOS>', '<EOS>', '▁']

In [36]:
class SeqToSeqDataset(Dataset):

    def __init__(self,
                 qa_data: Iterable[dict],
                 tokenizer,
                 max_length: int = 128,
                 pad_index: int = 0,
                 unk_index: int = 1,
                 bos_index: int = 2,
                 eos_index: int = 3,
                 pre_pad: bool = False):

        super().__init__()
        
        self.qa_data = self.remove_empty_answers(qa_data)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_index = pad_index
        self.unk_index = unk_index
        self.bos_index = bos_index
        self.eos_index = eos_index
        self.pre_pad = pre_pad


    def __len__(self) -> int:
        return len(self.qa_data)

    def filter_unk(self, token_indices: List[int]) -> List[int]:
        output = []
        unk_flag = False
        for token in token_indices:
            if token != self.unk_index:
                output.append(token)
            elif token == self.unk_index and not unk_flag:
                unk_flag = True
                output.append(token)
            else:
                pass
        return output

    def tokenize(self,
                 text: str,
                 bos: bool = True,
                 eos: bool = True) -> List[int]:

        tokens = self.tokenizer.encode(text,
                                       eos=eos,
                                       bos=bos)
        

        return tokens

    def padding(self, tokenized_text: List[int]) -> List[int]:

        # tokenized_text = self.filter_unk(tokenized_text)
        if tokenized_text[-1] == self.eos_index:
            tokenized_text = tokenized_text[:self.max_length]
            tokenized_text[-1] = self.eos_index
        else:
            tokenized_text = tokenized_text[:self.max_length]
        

        if self.pre_pad:
            tokenized_text = [self.pad_index] * (self.max_length - len(tokenized_text)) + tokenized_text
        else:
            tokenized_text += [self.pad_index] * (self.max_length - len(tokenized_text))

        return tokenized_text

    def __getitem__(self, index: int) -> Tuple[List[int], List[int], List[int]]:
        
        qa = self.qa_data[index]
        
        question = self.clean_text(qa['question'])
        answers = qa['responses']
        
        answer_index = np.random.randint(0, len(answers))
        answer = answers[answer_index]
        
        encoder_sequence = torch.Tensor(self.padding(self.tokenize(question,
                                                                   bos=True,
                                                                   eos=True))).long()
        decoder_sequence = torch.Tensor(self.padding(self.tokenize(answer,
                                                                   bos=True,
                                                                   eos=False))).long()
        target_sequence = torch.Tensor(self.padding(self.tokenize(answer,
                                                                  bos=False,
                                                                  eos=True))).long()
        
        return encoder_sequence, decoder_sequence, target_sequence

In [37]:
class SpatialDropout(torch.nn.Dropout2d):
    
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p
    
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T)
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x

In [38]:
class AttentionLayer(nn.Module):
    
    def __init__(self,
                 dim: int):
        
        super().__init__()
        
        self.dim = dim
        
        self.key_projection = nn.Linear(in_features=self.dim,
                                        out_features=self.dim)
        self.value_projection = nn.Linear(in_features=self.dim,
                                          out_features=self.dim)
        self.query_projection = nn.Linear(in_features=self.dim,
                                          out_features=self.dim)
        
        self.scale_factor = np.sqrt(self.dim)

    @staticmethod
    def mask_pads(weights, x_len, max_len):
        mask = torch.arange(max_len)[None, :] < x_len[:, None]
        masked_weights = weights.clone()
        masked_weights[~mask] = float('-inf')
        return masked_weights
        
    def forward(self, x, y, x_len, max_len):
        
        query = self.query_projection(y) # (batch_size, seq_len, dim)
        key = self.key_projection(x) # (batch_size, seq_len, dim)
        value = self.value_projection(x) # (batch_size, seq_len, dim)
        
        attention_weights = torch.bmm(query, key.permute(0, 2, 1)) # (batch_size, seq_len, seq_len)
        attention_weights /= self.scale_factor
        
        attention_weights = self.mask_pads(attention_weights,
                                           x_len=x_ len,
                                           max_len=max_len) 
               
        attention_weights = torch.softmax(attention_weights, dim=1) # (batch_size, seq_len, seq_len)


        
        attention = torch.bmm(attention_weights, value) # (batch_size, seq_len, dim)
        
        return y + attention

In [39]:
class MyNet(nn.Module):
    
    def __init__(self,
                 dim: int,
                 hidden_size: int,
                 vocab_size: int,
                 dropout: float,
                 max_len: int = 64,
                 pad_index: int = 0,
                 weight_tying=True):
        
        super().__init__()
        
        self.dim = dim
        self.hidden_size = hidden_size
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.pad_index = pad_index
        self.weight_tying = weight_tying
        
        self.embedding_layer = nn.Embedding(num_embeddings=self.vocab_size,
                                            embedding_dim=self.dim,
                                            padding_idx=self.pad_index)
        
        self.embedding_dropout = SpatialDropout(p=dropout)
        
        self.attention_layer = AttentionLayer(self.dim)
        
        self.lstm_1 = nn.LSTM(self.dim,
                              self.hidden_size,
                              batch_first=True,
                              bidirectional=False)
        
        self.lstm_2 = nn.LSTM(self.dim,
                              self.hidden_size,
                              batch_first=True,
                              bidirectional=False)  
        
        self.final_output = nn.Linear(in_features=self.hidden_size,
                                      out_features=self.vocab_size)
        
        if self.weight_tying and self.dim == self.hidden_size:
            self.final_output.weight = self.embedding_layer.weight
            
    @staticmethod
    def count_pads(x, axis=1):
        try:
          x = x.cpu()
        except:
          x = x
        return torch.Tensor(np.count_nonzero(x, axis=axis))
    
    def forward(self, x, y):

        x_lengths = self.count_pads(x)
        y_lengths = self.count_pads(y)


        x = self.embedding_layer(x)
        x = self.embedding_dropout(x)
        y = self.embedding_layer(y)
        y = self.embedding_dropout(y)



        x = pack_padded_sequence(x,
                                 x_lengths,
                                 batch_first=True,
                                 enforce_sorted=False)
        
        
        y = pack_padded_sequence(y,
                                 y_lengths,
                                 batch_first=True,
                                 enforce_sorted=False)
        
        x, memory = self.lstm_1(x)
        y, _ = self.lstm_2(y, memory)
        

        x = pad_packed_sequence(x,
                                batch_first=True,
                                total_length=self.max_len)[0]
        y = pad_packed_sequence(y,
                                batch_first=True,
                                total_length=self.max_len)[0]

      

        y = self.attention_layer(x,
                                 y,
                                 x_len=x_lengths,
                                 max_len=self.max_len)
        
        y = self.final_output(y)
        
        return y # softmax?


In [40]:
def train(model,
          loader,
          criterion,
          optimizer,
          gpu: bool = True,
          clip:float = 3.,
          last_n_losses: int = 500,
          verbose: bool = True):
  
    losses = []

    progress_bar = tqdm(total=len(loader), disable=not verbose, desc='Train')

    model.train()

    for encoder_sequence, decoder_sequence, target_sequence in loader:
        
        if gpu:
            encoder_sequence = encoder_sequence.to(device)
            decoder_sequence = decoder_sequence.to(device)
            target_sequence = target_sequence.to(device)

        pred = model(encoder_sequence, decoder_sequence)

        loss = criterion(pred.view(-1, pred.size(-1)), target_sequence.view(-1))

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        losses.append(loss.item())

        progress_bar.set_postfix(loss=np.mean(losses[-last_n_losses:]),
                                 perplexity=np.exp(np.mean(losses[-last_n_losses:])))

        progress_bar.update()

    progress_bar.close()
    
    return losses

In [41]:
def evaluate(model,
             loader,
             criterion,
             gpu: bool = True,
             last_n_losses: int = 500,
             verbose: bool = True):
    
    losses = []

    progress_bar = tqdm(total=len(loader), disable=not verbose, desc='Evaluate')

    model.eval()

    for encoder_sequence, decoder_sequence, target_sequence in loader:
        
        if gpu:
            encoder_sequence = encoder_sequence.to(device)
            decoder_sequence = decoder_sequence.to(device)
            target_sequence = target_sequence.to(device)

        with torch.no_grad():
            pred = model(encoder_sequence, decoder_sequence)

        loss = criterion(pred.view(-1, pred.size(-1)), target_sequence.view(-1))

        losses.append(loss.item())

        progress_bar.set_postfix(loss=np.mean(losses[-last_n_losses:]),
                                 perplexity=np.exp(np.mean(losses[-last_n_losses:])))

        progress_bar.update()

    progress_bar.close()
    
    return losses

In [42]:
train_len = int(np.floor(len(qa_data) * 0.95))
train_data = qa_data[:train_len]
valid_data = qa_data[train_len:]

In [43]:
MAX_LEN = 64

In [44]:
train_ds = SeqToSeqDataset(train_data, bpe, max_length=MAX_LEN)
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=512)

valid_ds = SeqToSeqDataset(valid_data, bpe, max_length=MAX_LEN)
valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=512)

In [53]:
model = MyNet(dim=256,
              hidden_size=256,
              vocab_size=bpe.vocab_size(),
              dropout=0.2,
              max_len=MAX_LEN,
              weight_tying=False)

In [54]:
criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [55]:
for instance in list(tqdm._instances):
    tqdm._decr_instances(instance)

In [56]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [57]:
model.to(device)

MyNet(
  (embedding_layer): Embedding(5, 256, padding_idx=0)
  (embedding_dropout): SpatialDropout(p=0.2, inplace=False)
  (attention_layer): AttentionLayer(
    (key_projection): Linear(in_features=256, out_features=256, bias=True)
    (value_projection): Linear(in_features=256, out_features=256, bias=True)
    (query_projection): Linear(in_features=256, out_features=256, bias=True)
  )
  (lstm_1): LSTM(256, 256, batch_first=True)
  (lstm_2): LSTM(256, 256, batch_first=True)
  (final_output): Linear(in_features=256, out_features=5, bias=True)
)

In [61]:
train(model, train_loader, criterion, optimizer)

Train: 0it [00:00, ?it/s]


[]

In [59]:
epochs = 5

train_losses = []
validation_losses = []

train_perplexities = []
validation_perplexities = []

best_validation_loss = 1e+6

for n_epoch in range(1, epochs + 1):
    
    epoch_train_losses = train(model, train_loader, criterion, optimizer)
    epoch_validation_losses = evaluate(model, valid_loader, criterion)
    
    mean_train_loss = np.mean(epoch_train_losses)
    mean_validation_loss = np.mean(epoch_validation_losses)
    
    train_losses.append(epoch_train_losses)
    train_perplexities.append(np.exp(mean_train_loss))
    
    validation_losses.append(epoch_validation_losses)
    validation_perplexities.append(np.exp(mean_validation_loss))
    
    message = f'Epoch: {n_epoch}\n'
    message += f'Train: loss - {mean_train_loss:.4f} | perplexity - {train_perplexities[-1]:.3f}\n'
    message += f'Validation: loss - {mean_validation_loss:.4f} | perplexity - {validation_perplexities[-1]:.3f}'
    
    print(message)
    
#     if mean_validation_loss < best_validation_loss:
        
#         best_validation_loss = mean_validation_loss
        
#         torch.save(model.state_dict(), f'best_language_model_state_dict.pth')
#         torch.save(optimizer.state_dict(), 'best_optimizer_state_dict.pth')
        
#     else:
#         break
        
    torch.save(model.state_dict(), f'last_language_model_state_dict.pth')
    torch.save(optimizer.state_dict(), 'last_optimizer_state_dict.pth')

    with open(f'info_{n_epoch}.json', 'w') as file_object:

        info = {
            'message': message,
            'train_losses': train_losses,
            'validation_losses': validation_losses,
            'train_perplexities': train_perplexities,
            'validation_perplexities': validation_perplexities
        }

        file_object.write(json.dumps(info, indent=2))

Train: 0it [00:00, ?it/s]
Evaluate: 0it [00:00, ?it/s]
Train: 0it [00:00, ?it/s]
Evaluate: 0it [00:00, ?it/s]
Train: 0it [00:00, ?it/s]
Evaluate: 0it [00:00, ?it/s]
Train: 0it [00:00, ?it/s]
Evaluate: 0it [00:00, ?it/s]
Train: 0it [00:00, ?it/s]
Evaluate: 0it [00:00, ?it/s]

Epoch: 1
Train: loss - nan | perplexity - nan
Validation: loss - nan | perplexity - nan
Epoch: 2
Train: loss - nan | perplexity - nan
Validation: loss - nan | perplexity - nan
Epoch: 3
Train: loss - nan | perplexity - nan
Validation: loss - nan | perplexity - nan
Epoch: 4
Train: loss - nan | perplexity - nan
Validation: loss - nan | perplexity - nan
Epoch: 5
Train: loss - nan | perplexity - nan
Validation: loss - nan | perplexity - nan





### Greedy Search

In [None]:
def generate(question,
             tokenizer,
             bos_index=2,
             eos_index=3,
             max_sequence=32):
  
    if max_sequence > MAX_LEN:
      raise ValueError

    tokenized = tokenizer.encode(question, eos=True, bos=True)
    # tokenized += [0] * (MAX_LEN - len(tokenized))
    
    encoder_sequence = torch.tensor([tokenized]).long().to(device)
    decoder_sequence = torch.tensor([bos_index]).long().unsqueeze(0).to(device)

    model.eval()

    with torch.no_grad():
      for timestamp in range(max_sequence):
        predictions = model(encoder_sequence, decoder_sequence)
        current_token = predictions[:, -1, :].argmax(dim=-1)
        if current_token == eos_index:
          break
        decoder_sequence = torch.cat([decoder_sequence, current_token.unsqueeze(0)], dim=-1)
    # return decoder_sequence

    answer = tokenizer.decode(decoder_sequence.squeeze(0).tolist())
    answer = answer[0].lstrip('<BOS>').rstrip('<EOS>')
    return answer

In [None]:
valid_data[160]['question']

'я из черного в шоколадный хочу покрасится,'

In [None]:
generate(valid_data[160]['question'], bpe)

### Nucleus Sampling

In [None]:
from scipy.special import softmax

def nucleus(question,
            tokenizer,
            p=0.92,
            bos_index=2,
            eos_index=3,
            max_sequence=32):
  
    if max_sequence > MAX_LEN:
      raise ValueError

    tokenized = tokenizer.encode(question, eos=True, bos=True)
    # tokenized += [0] * (MAX_LEN - len(tokenized))
    
    encoder_sequence = torch.tensor([tokenized]).long().to(device)
    decoder_sequence = torch.tensor([bos_index]).long().unsqueeze(0).to(device)

    model.eval()

    with torch.no_grad():
      for timestamp in range(max_sequence):
        # по идее это и есть условная вероятность следущего слова:
        predictions = torch.softmax(model(encoder_sequence, decoder_sequence), dim=-1)
        candidate_probs, candidate_tokens = predictions[:, -1, :].sort(dim=-1, descending=True)
        # ищем индекс, левее которого вероятности складываются в `p`
        candidate_probs = torch.cumsum(candidate_probs, dim=-1)
        # еще один softmax, чтобы выбранные вероятности снова складывались в единицу
        candidate_probs = softmax(candidate_probs[candidate_probs < p].cpu().numpy())
        candidate_tokens = candidate_tokens[:, :candidate_probs.shape[0]].squeeze(0).cpu().numpy()
        current_token = np.random.choice(candidate_tokens, p=candidate_probs)
        if current_token == eos_index:
          break
        current_token = torch.Tensor([current_token]).long().to(device)
        decoder_sequence = torch.cat([decoder_sequence, current_token.unsqueeze(0)], dim=-1)
    # return decoder_sequence

    answer = tokenizer.decode(decoder_sequence.squeeze(0).tolist())
    answer = answer[0].lstrip('<BOS>').rstrip('<EOS>')
    return answer

In [None]:
nucleus(valid_data[160]['question'], bpe, p=0.5)

' поез приблизатьрылик онлайн мальчика постоя руга поли алекса настучески приходи чтобству зая отсут женщинам бюджетколо коричне егэлик скорость фамилия пить зем профи изуля отли'