In [1]:
import os
import pickle
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
import math
import time
# import transformers
# from transformers import convert_bert_original_tf_checkpoint_to_pytorch
# from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertForQuestionAnswering, BertForPreTraining

In [2]:
device = torch.device("cuda:0")
cpu = torch.device("cpu")

In [3]:
!nvidia-smi

Mon Oct  7 10:01:52 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  On   | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

## Load bert

In [4]:
# BERT_MODEL_PATH = "rubert/"
# os.listdir(BERT_MODEL_PATH)

In [5]:
# convert_bert_original_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
#     BERT_MODEL_PATH + 'bert_model.ckpt',
# BERT_MODEL_PATH + 'bert_config.json','pytorch_model.bin')

In [6]:
# model_path = "rubert_pytorch/"

In [7]:
# tokenizer = BertTokenizer.from_pretrained(model_path)
# model = BertModel.from_pretrained(model_path);

## Load data 

In [8]:
story_path = "corpus/story_data_punct_del_em.pkl"
with open(story_path, 'rb') as f:
    data = pickle.load(f)

In [9]:
STORY_VAL = 7000

In [10]:
data_batch = data[:STORY_VAL]

In [11]:
data_flat = np.concatenate(data_batch, 0)

In [12]:
data_flat = list(map(str.lower, data_flat))

## Data preprocessing 

In [13]:
vocab = set(data_flat)
voc_len=len(vocab) + 1
word_to_ix = {word: i+1 for i, word in enumerate(vocab)}
word_to_ix["<pad>"] = 0 
ix_to_word = {v:k for k, v in word_to_ix.items()}

In [14]:
def get_samples(data):
    input_seq = []
    target_seq = []
    
    for story in data:
        input_seq.append(np.array(story[:-1]))
        target_seq.append(np.array(story[1:]))
    
    return np.array(input_seq), np.array(target_seq)

In [15]:
input_seq, target_seq = get_samples(data_batch)

In [16]:
for i in range(len(data_batch)):
    input_seq[i] = torch.tensor([word_to_ix[word.lower()] for word in input_seq[i]])
    target_seq[i] = torch.tensor([word_to_ix[word.lower()] for word in target_seq[i]])

In [17]:
input_seq = torch.nn.utils.rnn.pad_sequence(input_seq, batch_first=True, padding_value = word_to_ix['<pad>'])
target_seq = torch.nn.utils.rnn.pad_sequence(target_seq, batch_first=True, padding_value = word_to_ix['<pad>'])

In [18]:
BATCH_SIZE = 50
data2train = torch.utils.data.TensorDataset(input_seq, target_seq)
train_loader = torch.utils.data.DataLoader(data2train, batch_size = BATCH_SIZE, shuffle = True)

In [19]:
input_seq.size()

torch.Size([7000, 633])

## Init Embeddings

In [20]:
# def get_bert_embed_matrix(model):
    
#     bert_embeddings = list(model.children())[0]
#     bert_word_embeddings = list(bert_embeddings.children())[0]
#     mat = bert_word_embeddings.weight.data.numpy()
#     return torch.tensor(mat)

In [21]:
# matrix = get_bert_embed_matrix(model)

In [22]:
# def create_emb_layer(weights_matrix, non_trainable=False):
#     num_embeddings, embedding_dim = weights_matrix.size()
#     emb_layer = nn.Embedding(num_embeddings, embedding_dim)
#     emb_layer.load_state_dict({'weight': weights_matrix})
#     if non_trainable:
#         emb_layer.weight.requires_grad = False

#     return emb_layer, num_embeddings, embedding_dim

## Init Language Model 

In [23]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, rnn_layers=1, drop_p=0.5):
        super(Model, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.rnn_layers = rnn_layers
        self.drop_p = drop_p
        
        
        self.encoder = nn.Embedding(input_size, hidden_size)
        
        self.lstm = nn.LSTM(hidden_size, hidden_size, rnn_layers, batch_first=True) 
        
        self.drop = nn.Dropout(self.drop_p)
        
        self.fc_1 = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, prev_state):
                
        x = self.encoder(x)
        
        out, state = self.lstm(x, (prev_state[0].to(device), prev_state[1].to(device)))
        
        out = out.contiguous().view(-1, self.hidden_size)
        out = self.drop(out)
        out = self.fc_1(out)

        return out, state
    
    def init_hidden(self, batch_size):
        hidden = (torch.zeros(self.rnn_layers, batch_size, self.hidden_size),
                  torch.zeros(self.rnn_layers, batch_size, self.hidden_size))
        return hidden

## Fit model 

In [24]:
def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [25]:
n_epochs = 90
hidden_size = 256
rnn_layers = 2
lr = 0.001
callback_every = 10
drop_p = 0.5

In [26]:
model = Model(voc_len, hidden_size, voc_len, rnn_layers, drop_p)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [27]:
start = time.time()
for epoch in range(1, n_epochs + 1):
    loss_accum = 0
    zero_state = model.init_hidden(BATCH_SIZE)
    
    for i_step, (input_s, target_s) in enumerate(train_loader):
        optimizer.zero_grad()
        
        target_s = target_s.to(device)
        input_s = input_s.to(device)
    
        output, _ = model(input_s, zero_state)
        loss = criterion(output, target_s.view(-1).long())
        
        loss.backward() 
        optimizer.step() 
    
        loss_accum += loss
        del input_s
        del target_s
    
    ave_loss = loss_accum / i_step
    if epoch % callback_every == 0:
        print('Time: %s | Epoch: %d / %d | Loss: %.4f' % (time_since(start), epoch, n_epochs, ave_loss))

Time: 6m 42s | Epoch: 10 / 90 | Loss: 0.7127
Time: 13m 24s | Epoch: 20 / 90 | Loss: 0.4513
Time: 20m 5s | Epoch: 30 / 90 | Loss: 0.2808
Time: 26m 47s | Epoch: 40 / 90 | Loss: 0.1875
Time: 33m 28s | Epoch: 50 / 90 | Loss: 0.1318
Time: 40m 9s | Epoch: 60 / 90 | Loss: 0.0967
Time: 46m 51s | Epoch: 70 / 90 | Loss: 0.0740
Time: 53m 32s | Epoch: 80 / 90 | Loss: 0.0586
Time: 60m 14s | Epoch: 90 / 90 | Loss: 0.0486


## Evaluation 

In [28]:
def predict(model, hidden, words, top_k, softmax_t):
    words = np.array([[word_to_ix[c] for c in words]])

    words = torch.from_numpy(words[0])
    words = words.to(device)
    words = words.view(1, -1)
    
    out, hidden = model(words, hidden)

    prob = nn.functional.softmax(out[-1] / softmax_t, dim=0).data
    prob = prob.to(cpu)
    
    prob, top_ch = prob.topk(top_k)
    top_ch = top_ch.numpy().squeeze()
        
    prob = prob.numpy().squeeze()
    char_ind = np.random.choice(top_ch, p=prob/prob.sum())

    return ix_to_word[char_ind], hidden

In [29]:
def sample(model, out_len, start='я',top_k = 5, softmax_t=1):
    model.eval() 
    hidden = model.init_hidden(1)
    
    words_data = start.lower().split()
    
    size = out_len - len(words_data)

    for ii in range(size):
        word, h = predict(model, hidden, words_data, top_k, softmax_t)
        words_data.append(word)

    return ' '.join(words_data)

## Generated Story 

In [35]:
sample(model, 25, "я отправился в школу утром", top_k=12, softmax_t=1)

'я отправился в школу утром с ней её человек без 16 из 8 лет . по скромным отец свою ее забрали , так как на'