In [4]:
import pickle
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
import youtokentome as yttm
from livelossplot import PlotLosses
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
device = torch.device("cuda:0")
cpu = torch.device("cpu")

## Load data 

In [32]:
story_path = "corpus/story_data_punct_del_em.pkl"
with open(story_path, 'rb') as f:
    data = pickle.load(f)

In [33]:
STORY_VAL = 34000
data_batch = data[:STORY_VAL]

In [34]:
def samples2text(data, story_size=300):
    new_data = []
    for i in range(len(data)):
        text = " ".join(data[i]).replace(" .",".").replace(" ,", ",").replace(" ?", "?")
        
        if len(text) <= story_size:
            new_data.append(text)
    
    return new_data

In [35]:
data_batch = list(samples2text(data_batch))

In [37]:
random_story = np.random.randint(0, len(data_batch)-1)
data_batch[random_story]

'У нас в городе есть целое дерево, с верху до низа обвешанное кроссами Привет из Мытищ.'

## Split data

In [38]:
from data_tools import split_data, get_bpe_tokenizer, get_unknown_ngrams

In [39]:
train_texts, test_texts = split_data(data_batch, train_size=0.75)

Total samples: 21061

Traning size: 15795 | Validating size: 5266


## Apply BPE 

In [40]:
train_txt = 'train_bpe.txt'
bpe_model_name = "story_bpe.yttm"
BPE_VOCAB_SIZE = 1000

tokenizer = get_bpe_tokenizer(train_texts, train_txt_path=train_txt, bpe_model_name=bpe_model_name,
                              vocab_size=BPE_VOCAB_SIZE)

In [41]:
random_id = np.random.randint(1, len(train_texts)-1)

print(train_texts[random_id])
print("")
print(*tokenizer.encode(train_texts[random_id]))

Отчим рассказывал, как он воевал в Афганистане и там они жгли лазером духов блять, черножопых, как марсиане людей в Войне миров. за этого марсианина

974 746 696 28 378 20 223 234 277 7 378 144 327 38 287 179 272 192 145 388 536 224 25 177 496 22 7 568 435 207 15 753 485 158 196 165 32 5 19 28 742 223 150 217 510 6 192 489 18 225 144 227 256 192 622 532 24 176 745 150 217 510 6 179 180


In [42]:
train_token_ids = tokenizer.encode(train_texts, bos=True, eos=True)
test_token_ids = tokenizer.encode(test_texts, bos=True, eos=True)

In [43]:
get_unknown_ngrams(test_token_ids)

Unknown n-grams in validation set:  0


## Create data-loaders

In [17]:
from torch.nn.utils.rnn import pad_sequence

In [18]:
def get_loaders(data, padding_value=0, batch_size=512, shuffle=True):
    input_seq = []
    target_seq = []
    
    for story in data:
        input_seq.append(torch.tensor(story[:-1]))
        target_seq.append(torch.tensor(story[1:]))
    
    input_seq = pad_sequence(input_seq, batch_first=True, padding_value=padding_value)
    target_seq = pad_sequence(target_seq, batch_first=True, padding_value=padding_value)

    data = torch.utils.data.TensorDataset(input_seq, target_seq)
    data_loader = torch.utils.data.DataLoader(data, batch_size=batch_size, shuffle=shuffle)
    
    return data_loader

In [19]:
train_loader = get_loaders(train_token_ids)
test_loader = get_loaders(test_token_ids, shuffle=True)

## Init Language Model 

In [1]:
from model_tools import dependency_mask, positional_encoding

In [21]:
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, backbone, emb_dropout=0.0):
        super().__init__()
        self.embedding_size = embedding_size
        self.embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        self.emb_dropout = nn.Dropout(emb_dropout)
        self.backbone = backbone
        self.out = nn.Linear(embedding_size, vocab_size)
    
    def forward(self, seed_token_ids):

        batch_size, max_in_length = seed_token_ids.shape

        seed_padding_mask = seed_token_ids == 0
        dep_mask = dependency_mask(max_in_length).to(seed_token_ids.device)
        
        seed_embs = self.embeddings(seed_token_ids)  
        pos_codes = positional_encoding(max_in_length,
                                             self.embedding_size).unsqueeze(0).to(seed_embs.device)
        seed_embs = seed_embs + pos_codes
        seed_embs = self.emb_dropout(seed_embs)

        
        target_features = seed_embs
        target_features = self.backbone(seed_embs,
                                        mask=dep_mask,
                                        src_key_padding_mask=seed_padding_mask)
        
        logits = self.out(target_features)  
        return logits

In [22]:
#used for batch-first
class TransformerEncoder(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.impl = nn.TransformerEncoder(*args, **kwargs)
        self.initialize_weights()
    
    def forward(self, src, *args, **kwargs):
        src = src.transpose(0, 1).contiguous()  
        result = self.impl(src, *args, **kwargs)  
        result = result.transpose(0, 1).contiguous()  
        return result
    
    def initialize_weights(self):
        for param in self.impl.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

In [23]:
vocab_size = tokenizer.vocab_size()
embedding_size = 256

enoder = TransformerEncoder(nn.TransformerEncoderLayer(d_model=256, nhead=16, 
                                                              dim_feedforward=512, dropout=0.3), num_layers=3)


model = LanguageModel(vocab_size, embedding_size, enoder, emb_dropout=0.1)
print('Params:', sum(t.numel() for t in model.parameters()))

Params: 2094312


In [24]:
LR = 2e-3
EPOCH = 40
reg_alpha = 0

In [25]:
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=reg_alpha)
model = model.to(device)

## Train loop

In [2]:
from train_tools import train_loop

In [27]:
model = train_loop(model,device, optimizer, train_loader, test_loader, epoch_value=EPOCH, plot_loss=False)

Time: 0m 30s | Epoch: 1 / 40 | T-Loss: 6.272 | Val-Loss: 6.088
Time: 1m 0s | Epoch: 2 / 40 | T-Loss: 6.039 | Val-Loss: 5.731
Time: 1m 29s | Epoch: 3 / 40 | T-Loss: 5.366 | Val-Loss: 4.747
Time: 1m 59s | Epoch: 4 / 40 | T-Loss: 4.699 | Val-Loss: 4.413
Time: 2m 28s | Epoch: 5 / 40 | T-Loss: 4.451 | Val-Loss: 4.231
Time: 2m 58s | Epoch: 6 / 40 | T-Loss: 4.303 | Val-Loss: 4.088
Time: 3m 27s | Epoch: 7 / 40 | T-Loss: 4.184 | Val-Loss: 3.962
Time: 3m 57s | Epoch: 8 / 40 | T-Loss: 4.082 | Val-Loss: 3.859
Time: 4m 26s | Epoch: 9 / 40 | T-Loss: 3.995 | Val-Loss: 3.771
Time: 4m 56s | Epoch: 10 / 40 | T-Loss: 3.919 | Val-Loss: 3.693
Time: 5m 25s | Epoch: 11 / 40 | T-Loss: 3.852 | Val-Loss: 3.629
Time: 5m 54s | Epoch: 12 / 40 | T-Loss: 3.793 | Val-Loss: 3.564
Time: 6m 24s | Epoch: 13 / 40 | T-Loss: 3.741 | Val-Loss: 3.508
Time: 6m 53s | Epoch: 14 / 40 | T-Loss: 3.696 | Val-Loss: 3.461
Time: 7m 22s | Epoch: 15 / 40 | T-Loss: 3.657 | Val-Loss: 3.414
Time: 7m 52s | Epoch: 16 / 40 | T-Loss: 3.618 | Va

## Fit model 

In [28]:
def create_text(model, tokenizer, seq_begin):
    eos_token = 3
    max_steps_n = 40
    
    seed_tokens = tokenizer.encode([seq_begin])[0]
    
    for _ in range(max_steps_n):
        in_batch = torch.tensor(seed_tokens).unsqueeze(0).to(device)
        best_next_token = model(in_batch)[0, -1].argmax()
        if best_next_token == eos_token:
            break

        seed_tokens.append(best_next_token)

    return tokenizer.decode([seed_tokens])[0]

In [30]:
create_text(model, tokenizer, "На уроке я")

'На уроке я проснулся от того, что умерлась в туалете и услышал какую то хуйню.'

## Save PyTorch model

In [33]:
torch.save(model.state_dict(), 'story_model_trs.pth')