In [105]:
import os
import pickle
import numpy as np
import time
import math
import torch.nn as nn
import torch.functional as F
import torch

In [123]:
device = torch.device("cpu")

## Load data 

In [124]:
story_path = "corpus/story_data_origin.pkl"
with open(story_path, 'rb') as f:
    data = pickle.load(f)

In [125]:
STORY_VAL = 10
data_batch = data[:STORY_VAL]

## Data preprocessing 

In [126]:
def token2str(data):
    full_data = [" ".join(sent).rstrip() for sent in data]
    
    return full_data

In [127]:
corpus = token2str(data_batch)

In [128]:
chars = set(''.join(corpus))

id2char = dict(enumerate(chars))

char2id = {char: ind for ind, char in id2char.items()}
voc_len = len(char2id)

In [129]:
def get_samples(data):
    input_seq = []
    target_seq = []
    
    for story in data:
        input_seq.append(np.array(story[:-1]))
        target_seq.append(np.array(story[1:]))
    
    return np.array(input_seq), np.array(target_seq)

In [130]:
input_seq, target_seq = get_samples(corpus)

In [131]:
input_tensor = []
target_tensor = []

for i in range(len(data_batch)):
    input_tensor.append(torch.tensor([char2id[char] for char in input_seq[i]]))
    target_tensor.append(torch.tensor([char2id[char] for char in target_seq[i]]))
    


In [132]:
input_seq = torch.nn.utils.rnn.pad_sequence(input_tensor, batch_first=True, padding_value=0)
target_seq = torch.nn.utils.rnn.pad_sequence(target_tensor, batch_first=True, padding_value=0)

In [133]:
BATCH_SIZE = 2
data2train = torch.utils.data.TensorDataset(input_seq, target_seq)
train_loader = torch.utils.data.DataLoader(data2train, batch_size = BATCH_SIZE, shuffle = True)

In [134]:
input_seq.size()

torch.Size([10, 783])

## Init Language Model 

In [135]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(Model, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        
        self.encoder = nn.Embedding(input_size, hidden_size*2)


        self.lstm = nn.LSTM(hidden_size*2, hidden_size, n_layers, batch_first=True,
                          bidirectional=False) 
        
        self.drop = nn.Dropout(0.5)
        self.fc_1 = nn.Linear(hidden_size, output_size)
    def forward(self, x, prev_state):
                
        x = self.encoder(x)
        
        out, state = self.lstm(x, prev_state)
        
        out = out.contiguous().view(-1, self.hidden_size)
        out = self.drop(out)
        out = self.fc_1(out)

        return out, state
    
    def init_hidden(self, batch_size):
        hidden = (torch.zeros(self.n_layers, batch_size, self.hidden_size), torch.zeros(self.n_layers, batch_size, self.hidden_size))
        return hidden

## Init train params

In [147]:
def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [148]:
n_epochs = 30
hidden_size = 120
n_layers = 1
lr = 0.01
print_every = 1

In [149]:
model = Model(voc_len, hidden_size, voc_len, n_layers)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## Train Loop 

In [150]:
start = time.time()
for epoch in range(1, n_epochs + 1):
    loss_accum = 0
    zero_state = model.init_hidden(BATCH_SIZE)
    for i_step, (input_s, target_s) in enumerate(train_loader):
        optimizer.zero_grad()
        target_s = target_s.to(device)
        input_s = input_s.to(device)
    
        output, _ = model(input_s, zero_state)
        loss = criterion(output, target_s.view(-1).long())
        loss.backward() 
        optimizer.step() 
    
        loss_accum += loss
        del input_s
        del target_s
    


    ave_loss = loss_accum / i_step
    if epoch % print_every == 0:
        print('Time: %s | Epoch: %d / %d | Loss: %.4f' % (time_since(start), epoch, n_epochs, ave_loss))

Time: 0m 1s | Epoch: 1 / 30 | Loss: 3.2857
Time: 0m 2s | Epoch: 2 / 30 | Loss: 1.8111
Time: 0m 3s | Epoch: 3 / 30 | Loss: 1.6368
Time: 0m 4s | Epoch: 4 / 30 | Loss: 1.5453
Time: 0m 5s | Epoch: 5 / 30 | Loss: 1.4968
Time: 0m 6s | Epoch: 6 / 30 | Loss: 1.4443
Time: 0m 8s | Epoch: 7 / 30 | Loss: 1.4102
Time: 0m 9s | Epoch: 8 / 30 | Loss: 1.3889
Time: 0m 10s | Epoch: 9 / 30 | Loss: 1.3460
Time: 0m 11s | Epoch: 10 / 30 | Loss: 1.3335
Time: 0m 12s | Epoch: 11 / 30 | Loss: 1.3141
Time: 0m 14s | Epoch: 12 / 30 | Loss: 1.2740
Time: 0m 15s | Epoch: 13 / 30 | Loss: 1.2540
Time: 0m 16s | Epoch: 14 / 30 | Loss: 1.2372
Time: 0m 17s | Epoch: 15 / 30 | Loss: 1.1970
Time: 0m 19s | Epoch: 16 / 30 | Loss: 1.1863
Time: 0m 20s | Epoch: 17 / 30 | Loss: 1.1537
Time: 0m 21s | Epoch: 18 / 30 | Loss: 1.1495
Time: 0m 22s | Epoch: 19 / 30 | Loss: 1.1344
Time: 0m 24s | Epoch: 20 / 30 | Loss: 1.1264
Time: 0m 25s | Epoch: 21 / 30 | Loss: 1.1021
Time: 0m 26s | Epoch: 22 / 30 | Loss: 1.0748
Time: 0m 27s | Epoch: 23 / 

## Evaluation 

In [151]:
def predict(model, words, hidden):
    words = np.array([[char2id[c] for c in words]])

    words = torch.from_numpy(words)
    words = words.to(device)
    words = words.view(1, -1)
    
    out, hidden = model(words, hidden)

    prob = nn.functional.softmax(out[-1], dim=0).data
    char_ind = torch.max(prob, dim=0)[1].item()

    return id2char[char_ind], hidden

In [152]:
def sample(model, out_len, start='я'):
    model.eval() 
    hidden = model.init_hidden(1)
    chars = [ch for ch in start]
    size = out_len - len(chars)

    for ii in range(size):
        word, hidden = predict(model, chars, hidden)
        chars.append(word)

    return ''.join(chars)

## Generated Story 

In [156]:
result = sample(model, 200, "рома")
print(result)

ромали, на потом старили с за празале в тусили не потом старили с за празале в тусили не потом старили с за празале в тусили не потом старили с за празале в тусили не потом старили с за празале в туси
