In [1]:
# !pip install youtokentome
# !pip install livelossplot

Collecting youtokentome
[?25l  Downloading https://files.pythonhosted.org/packages/ea/17/72cb4f7e01941e663e560d7d1882bcfa9794af917e09c9c319837d083138/youtokentome-1.0.3-cp36-cp36m-manylinux1_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 2.8MB/s eta 0:00:01
Installing collected packages: youtokentome
Successfully installed youtokentome-1.0.3


In [2]:
import pickle
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
import youtokentome as yttm

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [3]:
# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still runs without one.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
cpu = torch.device("cpu")

In [4]:
!nvidia-smi

Sun Nov 17 13:11:04 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  On   | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

## Load data 

In [5]:
story_path = "corpus2/story_data_punct_del_em.pkl"
# NOTE(review): pickle.load can execute arbitrary code; only load corpus files from a trusted source.
with open(story_path, mode='rb') as story_file:
    data = pickle.load(story_file)

In [6]:
STORY_VAL = 8500
data_batch = data[:STORY_VAL]

In [7]:
def samples2text(data, max_len=200):
    """Join tokenised stories into plain-text strings, skipping long ones.

    Args:
        data: iterable of stories, each a sequence of string tokens.
        max_len: a story is kept only if it has fewer than `max_len` tokens.

    Returns:
        List of strings, one per kept story, in input order. Tokens are joined
        with single spaces and the space before ".", "," and "?" is removed.
    """
    return [
        " ".join(story).replace(" .", ".").replace(" ,", ",").replace(" ?", "?")
        for story in data
        if len(story) < max_len
    ]

In [8]:
# Rebuild from the raw slice (not from data_batch itself) so that re-running this cell
# cannot feed already-joined strings back into samples2text.
data_batch = samples2text(data[:STORY_VAL])

In [9]:
# np.random.randint excludes the upper bound, so pass len(...) itself to make every story eligible.
random_story = np.random.randint(0, len(data_batch))
data_batch[random_story]

'п 19, встречаюсь с армянкой 21, так вот, я парень хоть куда и клялся влюбви и женитьбе, она в итоге мне дала, но я все время перся со всеми подряд, и сейчас не беру от неё трубки за левобережку меня'

## Split data

In [10]:
TRAIN_SIZE = 0.75

# Sequential (unshuffled) split: the leading TRAIN_SIZE share is used for training,
# everything after the threshold for validation.
split_threshold = int(TRAIN_SIZE * len(data_batch))
train_texts, test_texts = data_batch[:split_threshold], data_batch[split_threshold:]

print('Traning size: {} | Validating size: {}'.format(len(train_texts), len(test_texts)))
print('Total samples: {}'.format(len(test_texts) + len(train_texts)))

Traning size: 5892 | Validating size: 1965
Total samples:7857


## Apply BPE 

In [11]:
def save_text(texts, out_file):
    """Write `texts` to `out_file`, one text per line, encoded as UTF-8.

    Args:
        texts: iterable of strings.
        out_file: path of the file to create or overwrite.
    """
    # Explicit encoding: the platform default is not guaranteed to be able to
    # represent non-ASCII (e.g. Cyrillic) text.
    with open(out_file, 'w', encoding='utf-8') as outf:
        outf.write('\n'.join(texts))

In [12]:
train_txt = 'train_bpe.txt'
bpe_model_name = "story_bpe.yttm"

save_text(train_texts, train_txt)
yttm.BPE.train(data=train_txt, vocab_size=1000, model=bpe_model_name);

In [13]:
tokenizer = yttm.BPE(bpe_model_name)

In [37]:
# randint's upper bound is exclusive; sample over the full index range so that
# the first and the last training story can be picked as well.
random_id = np.random.randint(0, len(train_texts))

print(train_texts[random_id])
print("")
print(tokenizer.encode(train_texts[random_id]))

П16 Всем привет, вчера в Твери произошёл пиздец со стрельбой. Началось всё с того, что челики сцепились из за своих шкур они не поделили счёт в баре, челики пиздились и по итогу один ушёл отпизженый. Как оказалось, он ушёл за двумя хачами с обрезами, а второй челик я потом его нашёл и выпил по пивку, обсуждая данный пиздец, уже уебашил домой. Челики с обрезами походили, посмотрели и просто постреляли по шаурмечной, которая стояла рядом и тоже уебашили. Затем приехали 4 машины ментов, повтыкали минут 20 на шаурмечную и уехали. За Тверь город пиздец.

[402, 63, 241, 12, 190, 213, 875, 126, 352, 137, 126, 299, 287, 154, 951, 235, 926, 513, 215, 273, 153, 238, 23, 230, 27, 443, 26, 377, 175, 713, 124, 717, 20, 158, 492, 143, 180, 124, 592, 223, 701, 243, 156, 335, 989, 225, 187, 13, 395, 150, 130, 200, 143, 143, 124, 26, 637, 126, 272, 153, 20, 492, 143, 180, 319, 170, 701, 127, 130, 771, 431, 155, 926, 178, 617, 32, 147, 28, 539, 560, 128, 233, 377, 405, 189, 155, 926, 156, 131, 228, 17, 

In [15]:
train_token_ids = tokenizer.encode(train_texts, bos=True, eos=True)
test_token_ids = tokenizer.encode(test_texts, bos=True, eos=True)

In [16]:
# Count how many encoded validation tokens carry id 1
# (presumably the tokenizer's <UNK> id — confirm against the BPE model).
unknown_subwords_in_test = sum(text.count(1) for text in test_token_ids)
print('Unknown n-grams in validation set: ', unknown_subwords_in_test)

Unknown n-grams in validation set:  29


## Create data-loaders

In [17]:
from torch.nn.utils.rnn import pad_sequence

In [18]:
def get_loaders(data, padding_value=0, batch_size=100, shuffle=True):
    """Build a DataLoader of (input, target) pairs for next-token prediction.

    Every story contributes an input (all tokens except the last) and a target
    (all tokens except the first). Both are right-padded with `padding_value`
    to the longest sequence in `data`.

    Args:
        data: iterable of stories, each a sequence of integer token ids.
        padding_value: id used to pad shorter sequences.
        batch_size: number of stories per batch.
        shuffle: whether the loader reshuffles the stories every epoch.

    Returns:
        torch.utils.data.DataLoader yielding (input_ids, target_ids) tensors,
        each of shape (batch, longest_story_length - 1) and dtype int64.
    """
    inputs = []
    targets = []

    for story in data:
        # dtype is pinned: torch.tensor([]) would otherwise be float32, and
        # pad_sequence takes the dtype of the first sequence for the whole batch.
        tokens = torch.tensor(story, dtype=torch.long)
        inputs.append(tokens[:-1])
        targets.append(tokens[1:])

    padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=padding_value)
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=padding_value)

    dataset = torch.utils.data.TensorDataset(padded_inputs, padded_targets)
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [19]:
train_loader = get_loaders(train_token_ids)
# Evaluation gains nothing from shuffling; keep the validation batches in a fixed order.
test_loader = get_loaders(test_token_ids, shuffle=False)

## Init Language Model 

In [1]:
from model_tools import dependency_mask, positional_encoding

In [22]:
class LanguageModel(nn.Module):
    """Language model: token embeddings + positional codes -> backbone -> vocabulary logits.

    Token id 0 is treated as padding: the embedding table uses ``padding_idx=0``
    and positions whose id equals 0 are passed to the backbone as
    ``src_key_padding_mask``.

    Args:
        vocab_size: number of embedding rows and width of the output logits.
        embedding_size: embedding width and input width of the output projection.
        backbone: module called as
            ``backbone(x, mask=..., src_key_padding_mask=...)``.
        emb_dropout: dropout probability applied to the embeddings after the
            positional codes have been added.
    """

    def __init__(self, vocab_size, embedding_size, backbone, emb_dropout=0.0):
        super().__init__()
        self.embedding_size = embedding_size
        self.embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        self.emb_dropout = nn.Dropout(emb_dropout)
        self.backbone = backbone
        self.out = nn.Linear(embedding_size, vocab_size)

    def forward(self, seed_token_ids):
        """Return logits for every position of `seed_token_ids`.

        Args:
            seed_token_ids: 2-D tensor of token ids, (batch, length).

        Returns:
            The backbone output projected by the final linear layer to
            `vocab_size` logits per position.
        """
        # Unpacking keeps the implicit check that the input is exactly 2-D.
        _, max_in_length = seed_token_ids.shape

        seed_padding_mask = seed_token_ids == 0
        # dependency_mask comes from model_tools; presumably a causal mask — confirm there.
        dep_mask = dependency_mask(max_in_length).to(seed_token_ids.device)

        seed_embs = self.embeddings(seed_token_ids)
        # unsqueeze(0) adds a leading axis so the codes broadcast over the batch.
        pos_codes = positional_encoding(max_in_length,
                                        self.embedding_size).unsqueeze(0).to(seed_embs.device)
        seed_embs = self.emb_dropout(seed_embs + pos_codes)

        target_features = self.backbone(seed_embs,
                                        mask=dep_mask,
                                        src_key_padding_mask=seed_padding_mask)

        return self.out(target_features)

In [24]:
class TransformerEncoder(nn.Module):
    """Wrapper around ``nn.TransformerEncoder`` that swaps the first two axes.

    The input has its axes 0 and 1 swapped before it is handed to the wrapped
    encoder and the result is swapped back, so callers can pass tensors whose
    first axis is the batch. All constructor arguments are forwarded unchanged
    to ``nn.TransformerEncoder``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.impl = nn.TransformerEncoder(*args, **kwargs)
        self.initialize_weights()

    def forward(self, src, *args, **kwargs):
        """Run the wrapped encoder; extra arguments (masks etc.) are passed through."""
        swapped_in = src.transpose(0, 1).contiguous()
        encoded = self.impl(swapped_in, *args, **kwargs)
        return encoded.transpose(0, 1).contiguous()

    def initialize_weights(self):
        """Re-initialise every parameter with more than one dimension (Xavier uniform)."""
        multi_dim_params = (p for p in self.impl.parameters() if p.dim() > 1)
        for weight in multi_dim_params:
            nn.init.xavier_uniform_(weight)

In [25]:
vocab_size = tokenizer.vocab_size()
embedding_size = 256

# d_model is tied to embedding_size: LanguageModel feeds its embeddings straight
# into the backbone, so the two widths must always agree.
enoder = TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=embedding_size, nhead=16,
                               dim_feedforward=512, dropout=0.4),
    num_layers=3)

model = LanguageModel(vocab_size, embedding_size, enoder, emb_dropout=0.1)
print('Params:', sum(t.numel() for t in model.parameters()))

Params: 2094312


In [26]:
LR = 2e-3
EPOCH = 50
reg_alpha = 0

In [27]:
# Move the model to the target device first and only then build the optimizer
# over its parameters (the order recommended by the torch.optim documentation).
torch_transf_model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=reg_alpha)

## Train loop

In [2]:
from train_tools import train_loop

In [33]:
torch_transf_model = train_loop(model, device,optimizer, train_loader, test_loader, epoch_value = EPOCH)

Time: 0m 28s | Epoch: 1 / 50 | T-Loss: 6.290 | Val-Loss: 6.191
Time: 0m 56s | Epoch: 2 / 50 | T-Loss: 6.168 | Val-Loss: 6.073
Time: 1m 23s | Epoch: 3 / 50 | T-Loss: 5.993 | Val-Loss: 5.762
Time: 1m 51s | Epoch: 4 / 50 | T-Loss: 5.587 | Val-Loss: 5.193
Time: 2m 18s | Epoch: 5 / 50 | T-Loss: 4.908 | Val-Loss: 4.652
Time: 2m 46s | Epoch: 6 / 50 | T-Loss: 4.500 | Val-Loss: 4.540
Time: 3m 13s | Epoch: 7 / 50 | T-Loss: 4.327 | Val-Loss: 4.481
Time: 3m 41s | Epoch: 8 / 50 | T-Loss: 4.210 | Val-Loss: 4.455
Time: 4m 8s | Epoch: 9 / 50 | T-Loss: 4.109 | Val-Loss: 4.431
Time: 4m 36s | Epoch: 10 / 50 | T-Loss: 4.027 | Val-Loss: 4.417
Time: 5m 3s | Epoch: 11 / 50 | T-Loss: 3.949 | Val-Loss: 4.408
Time: 5m 31s | Epoch: 12 / 50 | T-Loss: 3.884 | Val-Loss: 4.402
Time: 5m 58s | Epoch: 13 / 50 | T-Loss: 3.825 | Val-Loss: 4.396
Time: 6m 26s | Epoch: 14 / 50 | T-Loss: 3.769 | Val-Loss: 4.397
Time: 6m 53s | Epoch: 15 / 50 | T-Loss: 3.718 | Val-Loss: 4.396
Time: 7m 21s | Epoch: 16 / 50 | T-Loss: 3.678 | Val

## Generate text

In [58]:
def create_text(model, tokenizer, seq_begin, max_steps_n=40, eos_token=3):
    """Greedily continue `seq_begin` with the language model.

    At every step the whole token sequence so far is fed to the model and the
    highest-scoring token at the last position is appended. Generation stops
    when `eos_token` is predicted or after `max_steps_n` steps.

    Args:
        model: module mapping a (1, length) tensor of token ids to logits of
            shape (1, length, vocab).
        tokenizer: object with ``encode(list_of_str)`` returning lists of ids
            and ``decode(list_of_id_lists)`` returning strings.
        seq_begin: text to start from.
        max_steps_n: maximum number of tokens to generate.
        eos_token: id that ends generation (it is not added to the output).

    Returns:
        The decoded text: the seed followed by the generated continuation.

    Raises:
        ValueError: if `seq_begin` encodes to no tokens.
    """
    # NOTE(review): the seed is encoded without bos=True, whereas the training
    # sequences in this notebook were encoded with bos=True — confirm this is intended.
    seed_tokens = tokenizer.encode([seq_begin])[0]
    if len(seed_tokens) == 0:
        raise ValueError("seq_begin must encode to at least one token")

    # Run on whatever device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()  # disable dropout while generating
    try:
        with torch.no_grad():  # inference only: do not build autograd graphs
            for _ in range(max_steps_n):
                in_batch = torch.tensor(seed_tokens, dtype=torch.long).unsqueeze(0).to(run_device)
                # int(...) keeps seed_tokens a list of plain Python ints.
                best_next_token = int(model(in_batch)[0, -1].argmax())
                if best_next_token == eos_token:
                    break

                seed_tokens.append(best_next_token)
    finally:
        # Restore the mode the caller left the model in.
        model.train(was_training)

    return tokenizer.decode([seed_tokens])[0]

In [59]:
create_text(model, tokenizer, "Школа это скучно")

'Школа это скучно. История такая история, она была в школе, она была однажды она мне ее парню и она сидит на хате. И вот она сидит на хате перед'