In [1]:
import os

import itertools
import pickle
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import math 

from torchtext import data, datasets
from pathlib import Path

import sys
sys.path.append('../')
%matplotlib inline

In [2]:
# Init cuda
device = "cuda:6" if torch.cuda.is_available() else "cpu"
idevice = 6 if torch.cuda.is_available() else -1
torch.cuda.set_device(idevice)
device, idevice

('cuda:6', 6)

In [3]:
TEXT = data.Field()
PATH = Path('./wikitext/')

In [4]:
batch_size = 256
sequence_length = 30
grad_clip = 0.1
lr = 4.
best_val_loss = None
log_interval = 10_000

In [5]:
from torchtext.datasets.language_modeling import LanguageModelingDataset

class WikiTextRu(LanguageModelingDataset):

    urls = ['http://files.deeppavlov.ai/datasets/wikitext_ru.zip']
    name = 'wikitext_ru'
    dirname = 'wikitext_ru'

    @classmethod
    def splits(cls, text_field, root='.data', 
               train='ru.wiki.train.txt', validation='ru.wiki.valid.txt', test='ru.wiki.test.txt',  **kwargs):

        return super().splits(text_field=text_field, root=root, 
                              train=train, validation=validation, test=test, **kwargs)

    @classmethod
    def iters(cls, batch_size=32, bptt_len=35, device=0, root='.data', vectors=None, **kwargs):
       
        TEXT = data.Field()
        train, val, test = cls.splits(TEXT, root=root, **kwargs)
        TEXT.build_vocab(train, vectors=vectors)

        return data.BPTTIterator.splits((train, val, test), 
                                        batch_size=batch_size, bptt_len=bptt_len, device=device)

In [6]:
from torchtext.vocab import Vectors

class RuFastText(Vectors):

    url_base = 'http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_word_tokenize/ft_native_300_ru_wiki_lenta_nltk_word_tokenize.vec'

    def __init__(self, **kwargs):
        url = self.url_base
        name = os.path.basename(url)
        super().__init__(name, url=url, **kwargs)


In [7]:
%%time
train_ds, valid_ds, test_ds = WikiTextRu.splits(TEXT, root=PATH)

CPU times: user 1min 6s, sys: 17.2 s, total: 1min 23s
Wall time: 1min 23s


In [8]:
ru_vectors = RuFastText()

In [9]:
%%time
TEXT.build_vocab(train_ds, min_freq=30, max_size=10_000, vectors=ru_vectors)

CPU times: user 2min 15s, sys: 15 s, total: 2min 30s
Wall time: 2min 29s


In [10]:
ntokens = len(TEXT.vocab)
ntokens

10002

In [11]:
%%time
train_loader, val_loader, test_loader = data.BPTTIterator.splits((train_ds, valid_ds, test_ds), 
                                        batch_sizes=(batch_size, batch_size, batch_size), 
                                        bptt_len=sequence_length, repeat=False, device=idevice) 

CPU times: user 159 µs, sys: 30 µs, total: 189 µs
Wall time: 213 µs


In [12]:
class RNNModel(nn.Module):

    def __init__(self, rnn_type, emb_vectors, nhid, nlayers, dropout=0.5):
        super().__init__()
        ntoken = emb_vectors.shape[0]
        ninp = emb_vectors.shape[1]
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.encoder.weight.data.copy_(emb_vectors);
        self.encoder.weight.requires_grad = False
        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, x, hidden=None):
        emb = self.drop(self.encoder(x))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
            return (weight.new(self.nlayers, bsz, self.nhid).zero_(),
                    weight.new(self.nlayers, bsz, self.nhid).zero_())
        else:
            return weight.new(self.nlayers, bsz, self.nhid).zero_()

In [13]:
def evaluate(data_loader):
    model.eval()
    total_loss = 0
    ntokens = len(TEXT.vocab)
    hidden = model.init_hidden(batch_size)
    for i, b in enumerate(data_loader):
        data, targets = b.text, b.target
        output, hidden = model(data)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets.view(-1)).item()
    return total_loss / len(data_loader)

In [14]:
def train():
    model.train()
    total_loss = 0
    ntokens = len(TEXT.vocab) #len(corpus.dictionary)
    for batch, b in enumerate(train_loader):
        data, targets = b.text, b.target
        model.zero_grad()
        output, hidden = model(data)
        loss = criterion(output.view(-1, ntokens), targets.view(-1))
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, model.parameters()), grad_clip)
        for p in filter(lambda p: p.requires_grad, model.parameters()):
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_loader), lr, cur_loss, math.exp(cur_loss)))
            total_loss = 0

In [15]:
def generate(n=50, temp=1.):
    model.eval()
    x = torch.rand(1, 1).mul(ntokens).long().to(device)
    hidden = None
    out = []
    for i in range(n):
        output, hidden = model(x, hidden)
        s_weights = output.squeeze().data.div(temp).exp()
        s_idx = torch.multinomial(s_weights, 1)[0]
        x.data.fill_(s_idx)
        s = TEXT.vocab.itos[s_idx]
        out.append(s)
    return ' '.join(out)

In [16]:
emb_vectors = TEXT.vocab.vectors
model = RNNModel('LSTM', emb_vectors, 128, 1, 0.0).to(device)
criterion = nn.CrossEntropyLoss()

In [17]:
%%time
with torch.no_grad():
    print('sample:\n', generate(15), '\n')

sample:
 тюрьмы Алан галерее собственных финансовых полгода оставить принимали птица военная производству полк. Так кровь здания 

CPU times: user 5.39 ms, sys: 7.5 ms, total: 12.9 ms
Wall time: 11.6 ms


In [18]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
%%time
for epoch in range(1, 51):
    train()
    val_loss = evaluate(val_loader)
    print('-' * 89)
    print('| end of epoch {:3d} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
        epoch, val_loss, math.exp(val_loss)))
    print('-' * 89)
    if not best_val_loss or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        lr /= 4.0
    with torch.no_grad():
        print('sample:\n', generate(50), '\n')


| epoch   1 | 10000/49916 batches | lr 4.00 | loss  4.92 | ppl   136.85
| epoch   1 | 20000/49916 batches | lr 4.00 | loss  4.48 | ppl    88.02
| epoch   1 | 30000/49916 batches | lr 4.00 | loss  4.29 | ppl    72.75
| epoch   1 | 40000/49916 batches | lr 4.00 | loss  4.18 | ppl    65.26
-----------------------------------------------------------------------------------------
| end of epoch   1 | valid loss 122.43 | valid ppl 147687316997835539700604251822616840547712309802303488.00
-----------------------------------------------------------------------------------------
sample:
 , <unk> собой <unk> сборную национальной на четыре <unk> радио <unk> ( <unk> <unk> носителей <unk> <unk> , включая <unk> , <unk> <unk> морской <unk> на машине <unk> <unk> ) и <unk> <unk> <unk> <unk> <unk> , и Роберт им <unk> и <unk> на <unk> в <unk> . <eos> <eos> 

| epoch   2 | 10000/49916 batches | lr 4.00 | loss  4.06 | ppl    57.75
| epoch   2 | 20000/49916 batches | lr 4.00 | loss  4.01 | ppl    55.29
| ep

| epoch  11 | 10000/49916 batches | lr 4.00 | loss  3.66 | ppl    39.00
| epoch  11 | 20000/49916 batches | lr 4.00 | loss  3.66 | ppl    38.78
| epoch  11 | 30000/49916 batches | lr 4.00 | loss  3.66 | ppl    38.84
| epoch  11 | 40000/49916 batches | lr 4.00 | loss  3.66 | ppl    38.73
-----------------------------------------------------------------------------------------
| end of epoch  11 | valid loss 109.76 | valid ppl 467122626905963211447072252094134935169064239104.00
-----------------------------------------------------------------------------------------
sample:
 <unk> армии и младших — 14 ноября 1788 года <unk> становится деревней <unk> при <unk> 3 воздушной тела , приходилось <unk> <unk> , чтобы атаку с <unk> войсками ветра , — в Гран-при это возможность время нескольких отрасли <unk> <unk> его как с <unk> для <unk> <unk> <unk> и библиотеку 

| epoch  12 | 10000/49916 batches | lr 4.00 | loss  3.65 | ppl    38.55
| epoch  12 | 20000/49916 batches | lr 4.00 | loss  3.65 | pp

| epoch  21 | 10000/49916 batches | lr 4.00 | loss  3.59 | ppl    36.23
| epoch  21 | 20000/49916 batches | lr 4.00 | loss  3.59 | ppl    36.09
| epoch  21 | 30000/49916 batches | lr 4.00 | loss  3.59 | ppl    36.21
| epoch  21 | 40000/49916 batches | lr 4.00 | loss  3.59 | ppl    36.15
-----------------------------------------------------------------------------------------
| end of epoch  21 | valid loss 107.77 | valid ppl 63710408183649785083231364540568976851759792128.00
-----------------------------------------------------------------------------------------
sample:
 на <unk> основе борьбы ( за отличие , мужество и героизм <unk> Григория <unk> армия , лейтенант иностранные <unk> выполнение заданий командования крестьян и <unk> <unk> ) <unk> <unk> <unk> и армии , не имея в себя <unk> вперёд , а карьеру сопротивления , <unk> <unk> <unk> его <unk> участвовать 

| epoch  22 | 10000/49916 batches | lr 4.00 | loss  3.59 | ppl    36.08
| epoch  22 | 20000/49916 batches | lr 4.00 | loss  

| epoch  31 | 10000/49916 batches | lr 4.00 | loss  3.56 | ppl    35.05
| epoch  31 | 20000/49916 batches | lr 4.00 | loss  3.55 | ppl    34.93
| epoch  31 | 30000/49916 batches | lr 4.00 | loss  3.56 | ppl    35.06
| epoch  31 | 40000/49916 batches | lr 4.00 | loss  3.56 | ppl    35.02
-----------------------------------------------------------------------------------------
| end of epoch  31 | valid loss 106.84 | valid ppl 25122954064470043124948056042779551615980404736.00
-----------------------------------------------------------------------------------------
sample:
 политике <unk> команды открыто в том , что он — тренер сборной <unk> . <eos> <eos> Он принимал участие в соревнованиях Армении , в том числе ещё три матча и в группы в Кубке <unk> чемпионате мира , а также в полуфинале победа состоялся 7 — 33 <unk> В связи 

| epoch  32 | 10000/49916 batches | lr 4.00 | loss  3.55 | ppl    34.97
| epoch  32 | 20000/49916 batches | lr 4.00 | loss  3.55 | ppl    34.85
| epoch  32 | 3000