In [35]:
import os
import pickle
import numpy as np
import time
import math
import torch.nn as nn
import torch.nn.functional as F
import torch

In [36]:
device = torch.device("cuda:0")
cpu = torch.device("cpu")

In [3]:
!nvidia-smi

Sun Oct  6 08:09:20 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  On   | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    31W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

## Load data 

In [37]:
story_path = "corpus/story_data_origin.pkl"
with open(story_path, 'rb') as f:
    data = pickle.load(f)

In [38]:
STORY_VAL = 14000
data_batch = data[:STORY_VAL]

## Data preprocessing 

In [39]:
def token2str(data):
    full_data = [" ".join(sent).rstrip() for sent in data]
    
    return full_data

In [40]:
corpus = token2str(data_batch)

In [41]:
chars = set(''.join(corpus))

id2char = dict(enumerate(chars))
id2char[len(id2char)] = "pad"
char2id = {char: ind for ind, char in id2char.items()}
voc_len = len(char2id)

In [42]:
def get_samples(data):
    input_seq = []
    target_seq = []
    
    for story in data:
        input_seq.append(np.array(story[:-1]))
        target_seq.append(np.array(story[1:]))
    
    return np.array(input_seq), np.array(target_seq)

In [43]:
input_seq, target_seq = get_samples(corpus)

In [44]:
input_tensor = []
target_tensor = []

for i in range(len(data_batch)):
    input_tensor.append(torch.tensor([char2id[char] for char in input_seq[i]]))
    target_tensor.append(torch.tensor([char2id[char] for char in target_seq[i]]))

In [45]:
input_seq = torch.nn.utils.rnn.pad_sequence(input_tensor, batch_first=True, padding_value=char2id['pad'])
target_seq = torch.nn.utils.rnn.pad_sequence(target_tensor, batch_first=True, padding_value=char2id['pad'])

In [46]:
BATCH_SIZE = 250
data2train = torch.utils.data.TensorDataset(input_seq, target_seq)
train_loader = torch.utils.data.DataLoader(data2train, batch_size = BATCH_SIZE, shuffle = True)

In [47]:
input_seq.size()

torch.Size([8000, 2966])

## Init Language Model 

In [48]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, rnn_layers=1, drop_p=0.5):
        super(Model, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.rnn_layers = rnn_layers
        self.drop_p = 0.5
        
        
        self.encoder = nn.Embedding(input_size, hidden_size)
        
        self.lstm = nn.LSTM(hidden_size, hidden_size, rnn_layers, batch_first=True) 
        
        self.drop = nn.Dropout(0.5)
        
        self.fc_1 = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, prev_state):
                
        x = self.encoder(x)
        
        out, state = self.lstm(x, (prev_state[0].to(device), prev_state[1].to(device)))
        
        out = out.contiguous().view(-1, self.hidden_size)
        out = self.drop(out)
        out = self.fc_1(out)

        return out, state
    
    def init_hidden(self, batch_size):
        hidden = (torch.zeros(self.rnn_layers, batch_size, self.hidden_size),
                  torch.zeros(self.rnn_layers, batch_size, self.hidden_size))
        return hidden

## Init train params

In [49]:
def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [50]:
n_epochs = 150
hidden_size = 256
rnn_layers = 2
lr = 0.001
callback_every = 10

In [51]:
model = Model(voc_len, hidden_size, voc_len, rnn_layers)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## Train Loop 

In [52]:
start = time.time()
for epoch in range(1, n_epochs + 1):
    loss_accum = 0
    zero_state = model.init_hidden(BATCH_SIZE)
    
    for i_step, (input_s, target_s) in enumerate(train_loader):
        optimizer.zero_grad()
        target_s = target_s.to(device)
        input_s = input_s.to(device)
    
        output, _ = model(input_s, zero_state)
        loss = criterion(output, target_s.view(-1).long())
        loss.backward() 
        optimizer.step() 
    
        loss_accum += loss
        del input_s
        del target_s
    
    ave_loss = loss_accum / i_step
    if epoch % callback_every == 0:
        print('Time: %s | Epoch: %d / %d | Loss: %.4f' % (time_since(start), epoch, n_epochs, ave_loss))

Time: 6m 44s | Epoch: 10 / 100 | Loss: 0.3029
Time: 13m 28s | Epoch: 20 / 100 | Loss: 0.2518
Time: 20m 12s | Epoch: 30 / 100 | Loss: 0.2310
Time: 26m 57s | Epoch: 40 / 100 | Loss: 0.2181
Time: 33m 41s | Epoch: 50 / 100 | Loss: 0.2082
Time: 40m 26s | Epoch: 60 / 100 | Loss: 0.2008
Time: 47m 10s | Epoch: 70 / 100 | Loss: 0.1934
Time: 53m 54s | Epoch: 80 / 100 | Loss: 0.1863
Time: 60m 38s | Epoch: 90 / 100 | Loss: 0.1790
Time: 67m 22s | Epoch: 100 / 100 | Loss: 0.1719


## Evaluation 

In [56]:
def predict(model, words, hidden, top_k, softmax_t):
    words = np.array([[char2id[c] for c in words]])

    words = torch.from_numpy(words)
    words = words.to(device)
    words = words.view(1, -1)
    
    out, hidden = model(words, hidden)

    prob = F.softmax(out[-1] / softmax_t, dim=0).data
    char_ind = torch.max(prob, dim=0)[1].item()
    prob = prob.to(cpu)

    
    prob, top_ch = prob.topk(top_k)
    top_ch = top_ch.numpy().squeeze()
        
    prob = prob.numpy().squeeze()
    char = np.random.choice(top_ch, p=prob/prob.sum())


    return id2char[char], hidden

In [58]:
def generate_story(model, out_len, start='Я поехал', top_k=5, softmax_t=1):
    model.eval() 
    hidden = model.init_hidden(1)
    chars = [ch for ch in start]
    size = out_len - len(chars)

    for ii in range(size):
        word, hidden = predict(model, chars, hidden, top_k, softmax_t)
        chars.append(word)

    return ''.join(chars)

## Generated Story 

In [77]:
result = generate_story(model, 150, "Я ушел домой", top_k=5, softmax_t=1)
print(result)

Я ушел домой, пока привезли с палитым и собирается дома с девушкой, только на мой друг высосили в первый сенсчернее и они начинает дернуть кроми трубк
