In [None]:
# Install the exact library versions this notebook pins (quiet mode).
!pip3 -qq install torch==0.4.1
!pip -qq install torchtext==0.3.1
!pip -qq install gensim==3.6.0
!pip -qq install pyldavis==2.1.2
!pip -qq install attrs==18.2.0
# Download the two verse corpora from Google Drive into perashki.txt / poroshki.txt.
# NOTE(review): --no-check-certificate turns off TLS certificate verification for these downloads.
!wget -qq --no-check-certificate 'https://drive.google.com/uc?export=download&id=1OIU9ICMebvZXJ0Grc2SLlMep3x9EkZtz' -O perashki.txt
!wget -qq --no-check-certificate 'https://drive.google.com/uc?export=download&id=1v66uAEKL3KunyylYitNKggdl2gCeYgZZ' -O poroshki.txt
# Fetch the UD Russian SynTagRus treebank and the corpus_iterator.py helper imported in the multi-task section.
!git clone https://github.com/UniversalDependencies/UD_Russian-SynTagRus.git
!wget -qq https://raw.githubusercontent.com/DanAnastasyev/neuromorphy/master/neuromorphy/train/corpus_iterator.py

In [None]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


# Pick the compute device once; later cells pass DEVICE to the iterators and models.
if torch.cuda.is_available():
    # Legacy typed-tensor constructors bound to the GPU.
    from torch.cuda import FloatTensor, LongTensor
    DEVICE = torch.device('cuda')
else:
    from torch import FloatTensor, LongTensor
    DEVICE = torch.device('cpu')

# Seed every random number generator the notebook may touch, not only NumPy:
# torch drives weight initialisation, and Python's `random` is presumably what
# torchtext uses for dataset splitting and shuffling (TODO confirm).
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Word-Level Text Generation

Today we will mostly be generating *pies* and *powders* (two forms of short Russian verse).

*(The data was downloaded, without asking permission, from http://poetory.ru.)*

The pies are here:

In [None]:
!head perashki.txt

Порошки вот:

In [None]:
!head poroshki.txt

Do not confuse!

In general, a pie is a quatrain written in iambic tetrameter with the syllable scheme 9-8-9-8. A powder follows the scheme 9-8-9-2.

In [None]:
# Lowercase Russian vowel letters.
# NOTE(review): presumably meant for counting syllables; nothing visible in this notebook uses it yet.
vowels = 'ёуеыаоэяию'

# Metric templates for 9- and 8-syllable lines (one symbol per syllable).
# Presumably '-' marks an unstressed and '+' a stressed syllable (iambic) — TODO confirm.
odd_pattern = '-+-+-+-+-'
even_pattern = '-+-+-+-+'

Считываем данные:

In [None]:
def read_poem(path):
    """Yield the poems stored in a UTF-8 text file, one token list per poem.

    Poems are separated by one or more blank (whitespace-only) lines. Every
    non-blank line is split on whitespace and followed by an explicit
    line-break marker token (a literal backslash followed by ``n``).

    Args:
        path: path to the text file.

    Yields:
        list of str: the tokens of one poem, including line-break markers.
        Empty poems are never yielded, and the last poem is yielded even
        when the file does not end with a blank line.
    """
    line_break = '\\n'
    poem = []
    with open(path, encoding='utf8') as f:
        for line in f:
            words = line.split()
            if words:
                poem.extend(words)
                poem.append(line_break)
            elif poem:
                # A blank line closes the current poem; repeated blank lines are ignored.
                yield poem
                poem = []

    # Flush the trailing poem when the file has no terminating blank line.
    if poem:
        yield poem
            
# Materialise both corpora in memory as lists of token lists (one inner list per poem).
perashki = list(read_poem('perashki.txt'))
poroshki = list(read_poem('poroshki.txt'))

Построим датасет для порошков:

In [None]:
from torchtext.data import Field, Example, Dataset, BucketIterator

# Every poem is wrapped in <s> ... </s> markers by the field.
text_field = Field(init_token='<s>', eos_token='</s>')
        
# One example per powder poem; the single column is named 'text'.
fields = [('text', text_field)]
examples = [Example.fromlist([poem], fields) for poem in poroshki]
dataset = Dataset(examples, fields)

# NOTE(review): the vocabulary is built on the full dataset, i.e. before the
# train/test split below, so held-out poems also contribute to token counts.
# min_freq=7 presumably maps rarer tokens to <unk> — TODO confirm against torchtext docs.
text_field.build_vocab(dataset, min_freq=7)

print('Vocab size =', len(text_field.vocab))
# 90% of the poems go to training, the rest to the held-out set.
train_dataset, test_dataset = dataset.split(split_ratio=0.9)

# Batch size is 32 for training and 128 for evaluation; batches are created on DEVICE.
train_iter, test_iter = BucketIterator.splits(datasets=(train_dataset, test_dataset), batch_sizes=(32, 128), 
                                              shuffle=True, device=DEVICE, sort=False)

**Задание** Напишите класс языковой модели.

In [None]:
class LMModel(nn.Module):
    """Word-level LSTM language model: embedding -> LSTM -> linear projection.

    Args:
        vocab_size: number of tokens in the vocabulary (input and output size).
        emb_dim: dimensionality of the token embeddings.
        lstm_hidden_dim: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, emb_dim=256, lstm_hidden_dim=256, num_layers=1):
        super().__init__()

        self._emb = nn.Embedding(vocab_size, emb_dim)
        # num_layers used to be accepted but silently ignored; it is now forwarded.
        self._rnn = nn.LSTM(input_size=emb_dim, hidden_size=lstm_hidden_dim, num_layers=num_layers)

        self._out_layer = nn.Linear(lstm_hidden_dim, vocab_size)

        self._init_weights()

    def _init_weights(self, init_range=0.1):
        """Initialise embedding and output weights uniformly in [-init_range, init_range]; zero the output bias."""
        self._emb.weight.data.uniform_(-init_range, init_range)
        self._out_layer.bias.data.zero_()
        self._out_layer.weight.data.uniform_(-init_range, init_range)

    def forward(self, inputs, hidden=None):
        """Compute next-token scores for every position of ``inputs``.

        Args:
            inputs: integer tensor of token indices, time-major
                ``(seq_len, batch)`` because the LSTM is built without
                ``batch_first``.
            hidden: optional ``(h, c)`` LSTM state to continue from; ``None``
                starts from a zero state.

        Returns:
            tuple: ``(logits, hidden)`` where ``logits`` has shape
            ``(seq_len, batch, vocab_size)`` and ``hidden`` is the final
            ``(h, c)`` LSTM state.
        """
        embedded = self._emb(inputs)
        outputs, hidden = self._rnn(embedded, hidden)
        return self._out_layer(outputs), hidden

In [None]:
batch = next(iter(train_iter))

In [None]:
# Smoke test: build an untrained model sized to the vocabulary and push the sampled batch through it.
model = LMModel(vocab_size=len(train_iter.dataset.fields['text'].vocab)).to(DEVICE)

model(batch.text)

**Задание** Добавьте подсчет потерей с маскингом паддингов.

In [None]:
import math
from tqdm import tqdm
tqdm.get_lock().locks = []


def _perplexity(loss_value):
    """Return exp(loss_value), or +inf when the exponent overflows a float."""
    try:
        return math.exp(loss_value)
    except OverflowError:
        return float('inf')


def _masked_lm_loss(criterion, logits, tokens, unk_idx, pad_idx):
    """Next-token loss over positions whose target is neither <pad> nor <unk>.

    Assumes time-major tensors: ``logits`` is (seq_len, batch, vocab) and
    ``tokens`` is (seq_len, batch). Returns ``None`` when no target survives
    the mask.
    """
    # The output at position t predicts token t + 1, so drop the last
    # prediction and the first token.
    predictions = logits[:-1].contiguous().view(-1, logits.shape[-1])
    targets = tokens[1:].contiguous().view(-1)

    keep = (targets != pad_idx) & (targets != unk_idx)
    if keep.sum().item() == 0:
        return None

    loss = criterion(predictions[keep], targets[keep])
    # Support criteria created with reduction='none' as well as the default mean.
    if loss.dim() > 0:
        loss = loss.mean()
    return loss


def do_epoch(model, criterion, data_iter, unk_idx, pad_idx, optimizer=None, name=None):
    """Run one pass over ``data_iter`` and return the mean per-batch loss.

    Args:
        model: callable returning ``(logits, hidden)`` for ``batch.text``.
        criterion: cross-entropy style loss taking ``(scores, targets)``.
        data_iter: sized iterable of batches exposing a ``text`` tensor.
        unk_idx: index of the <unk> token; such targets are excluded from the loss.
        pad_idx: index of the <pad> token; such targets are excluded from the loss.
        optimizer: when given, the model is trained (gradients clipped to
            norm 1); when ``None``, the model is only evaluated.
        name: prefix shown in the progress bar.

    Returns:
        float: sum of batch losses divided by the number of batches
        (0 when ``data_iter`` is empty).
    """
    epoch_loss = 0.

    is_train = optimizer is not None
    name = name or ''
    model.train(is_train)

    batches_count = len(data_iter)
    description = '{:>5s} Loss = {:.5f}, PPX = {:.2f}'

    with torch.autograd.set_grad_enabled(is_train):
        with tqdm(total=batches_count) as progress_bar:
            for batch in data_iter:
                logits, _ = model(batch.text)

                loss = _masked_lm_loss(criterion, logits, batch.text, unk_idx, pad_idx)
                if loss is None:
                    # Nothing to score in this batch (all targets masked).
                    progress_bar.update()
                    continue

                batch_loss = loss.item()
                epoch_loss += batch_loss

                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), 1.)
                    optimizer.step()

                progress_bar.update()
                progress_bar.set_description(description.format(name, batch_loss, _perplexity(batch_loss)))

            # Guard against an empty iterator instead of dividing by zero.
            mean_loss = epoch_loss / max(batches_count, 1)
            progress_bar.set_description(description.format(name, mean_loss, _perplexity(mean_loss)))
            progress_bar.refresh()

    return mean_loss


def fit(model, criterion, optimizer, train_iter, epochs_count=1, unk_idx=0, pad_idx=1, val_iter=None):
    """Train ``model`` for ``epochs_count`` epochs, printing a generated sample after each one.

    Args:
        model: the language model to train.
        criterion: loss passed through to ``do_epoch``.
        optimizer: optimizer stepping the model parameters; the learning rate
            of its first parameter group is divided by 4 whenever the
            validation loss is worse than the value tracked in
            ``best_val_loss``.
        train_iter: iterator over training batches.
        epochs_count: number of passes over ``train_iter``.
        unk_idx: index of the <unk> token, forwarded to ``do_epoch``.
        pad_idx: index of the <pad> token, forwarded to ``do_epoch``.
        val_iter: optional iterator over validation batches.
    """
    best_val_loss = None
    for epoch in range(epochs_count):
        name_prefix = '[{} / {}] '.format(epoch + 1, epochs_count)
        do_epoch(model, criterion, train_iter, unk_idx, pad_idx, optimizer, name_prefix + 'Train:')

        if val_iter is not None:
            val_loss = do_epoch(model, criterion, val_iter, unk_idx, pad_idx, None, name_prefix + '  Val:')

            # Compare against None explicitly: a tracked loss of 0.0 is falsy
            # and used to be mistaken for "nothing tracked yet".
            if best_val_loss is not None and val_loss > best_val_loss:
                optimizer.param_groups[0]['lr'] /= 4.
                print('Optimizer lr = {:g}'.format(optimizer.param_groups[0]['lr']))
            else:
                best_val_loss = val_loss
        print()
        # Uses whichever module-level `generate` is defined at call time.
        generate(model)
        print()

**Задание** Напишите функцию-генератор для модели.

In [None]:
def sample(probs, temp):
    """Draw one token index from temperature-scaled scores.

    Args:
        probs: tensor of unnormalised scores (logits) for a single position;
            despite the name these are not probabilities. Any shape is
            accepted and flattened to a vector.
        temp: softmax temperature; values below 1 sharpen the distribution,
            values above 1 flatten it.

    Returns:
        The sampled index, as returned by ``np.random.choice``.
    """
    logits = probs.detach().reshape(-1)
    # softmax(logits / temp) equals the renormalised softmax(logits) ** (1 / temp),
    # but does not underflow to all zeros (and then NaN) at low temperatures.
    weights = F.softmax(logits / temp, dim=0).cpu().numpy().astype(np.float64)
    weights /= weights.sum()

    return np.random.choice(np.arange(len(weights)), p=weights)


def generate(model, temp=0.6):
    """Sample one poem from ``model`` token by token and print it.

    Generation starts from ``<s>`` and stops at ``</s>`` or after 150 tokens.
    The line-break marker token is printed as a real line break; every other
    token is followed by a space.

    Args:
        model: language model returning ``(logits, hidden)`` for a
            ``(1, 1)`` tensor of token indices and an optional hidden state.
        temp: sampling temperature forwarded to ``sample``.
    """
    model.eval()
    # Reads the vocabulary of the module-level `train_iter`.
    vocab = train_iter.dataset.fields['text'].vocab
    device = next(model.parameters()).device

    with torch.no_grad():
        prev_token = vocab.stoi['<s>']
        end_token = vocab.stoi['</s>']

        hidden = None
        for _ in range(150):
            # Feed one token at a time and carry the LSTM state forward.
            inputs = torch.tensor([[prev_token]], dtype=torch.long, device=device)
            logits, hidden = model(inputs, hidden)

            prev_token = int(sample(logits, temp))
            if prev_token == end_token:
                break

            word = vocab.itos[prev_token]
            if word == '\\n':
                print()
            else:
                print(word, end=' ')
    print()
                
generate(model)

In [None]:
# Fresh model for the actual training run (the earlier one was only a smoke test).
model = LMModel(vocab_size=len(train_iter.dataset.fields['text'].vocab)).to(DEVICE)

pad_idx = train_iter.dataset.fields['text'].vocab.stoi['<pad>']
unk_idx = train_iter.dataset.fields['text'].vocab.stoi['<unk>']
# `CrossEntropyLoss(...)` passed the Ellipsis object as the class-weight argument.
# ignore_index makes the criterion itself skip padded target positions.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx).to(DEVICE)

optimizer = optim.SGD(model.parameters(), lr=20., weight_decay=1e-6)

fit(model, criterion, optimizer, train_iter, epochs_count=300, unk_idx=unk_idx, pad_idx=pad_idx, val_iter=test_iter)

**Задание** Добавьте маскинг `<unk>` токенов при тренировке модели.

## Improving the model

### Tying input and output embeddings

The model contains two embedding matrices: the input one and the output one. An elegant and practically useful idea is to learn a single matrix shared between them: [Using the Output Embedding to Improve Language Models](http://www.aclweb.org/anthology/E17-2025).

The idea has clear advantages: the model has far fewer trainable parameters and, at the same time, noticeably better quality.

**Assignment** Implement it. It is enough to write something like this in the constructor (the two matrices must have the same shape, so this requires `emb_dim == lstm_hidden_dim`):

`self._out_layer.weight = self._emb.weight`

### Add information to the sample

Right now every word is represented by a single index, so it is very hard for the model to know how many syllables a word has — and therefore hard to generate a metrically correct poem.

In fact, each word can be attributed to a piece of the metric pattern:

<img src="https://hsto.org/web/59a/b39/bd0/59ab39bd020c49a78a12cbab62c80181.png" width="50%">


**Task** Update the function `read_poem` so that it produces two lists: a list of words and a list of the corresponding pieces of the metric pattern. Add a second input to the model — the pattern sequence — and concatenate its embeddings with the word embeddings.
An additional idea is to make the model also predict which pattern piece should come next (roughly half of them will fit, the rest will not), and to add an auxiliary loss for this prediction.

### Enlarging the training set

We have a much larger dataset of pies.

**Task** Train the model on it.

### Transfer learning

A simple and pleasant way to improve the model is transfer learning: take a model trained on a large corpus and transfer it to a smaller dataset.

This method is more popular in computer vision ([Transfer learning, cs231n](http://cs231n.github.io/transfer-learning/)): a model is trained on the huge ImageNet dataset, then its lower layers are frozen and its output layers are replaced. As a result, the model uses universal data representations learned on a large corpus to predict entirely different labels — and the quality improves substantially.

We do not need anything that elaborate yet (although the keywords will come in handy later: ULMFiT, ELMo and company). Simply take the model trained on the larger corpus and continue training it on the smaller one. It only needs to learn the new metric pattern of the last line.

**Assignment** Fine-tune the model trained in the previous section on the powders.

### Conditional language model

Even better is to learn from both corpora at once. Combine the pies and the powders, and for each poem store a 0/1 index indicating whether it is a pie or a powder. Add this index as an extra input and concatenate it either to every word embedding or to every LSTM output.

**Assignment** Train a single model that can be asked to generate either a pie or a powder.

### Variational & word dropout

**Assignment** The previous lesson showed dropout variants that are better suited to RNNs. Add them.

**Task** In addition, try increasing the size of the model or the number of its layers to improve the quality.

## Multi-task learning

Another important way to improve the model is multi-task learning. This is when one model learns to make predictions for several tasks at once.

In our case, this can mean predicting the lemma of a word and its grammatical value separately:

<img src = "https://hsto.org/web/e97/8a8/6e8/e978a86e8a874d8d946bb15e6a49a713.png" width = "50%">

As a result, the model learns both the language model according to the lemmas and the POS tagging model. At the same time!

Take the corpus from Universal Dependencies — it is already annotated the way we need.

We read it:

In [None]:
from corpus_iterator import Token, CorpusIterator

# Three parallel columns per sentence: surface word, lemma, and grammatical value.
# NOTE(review): `fields` and `examples` rebind the names used earlier for the poem dataset.
fields = [('word', Field()), ('lemma', Field()), ('gram_val', Field())]
examples = []

with CorpusIterator('UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu') as corpus_iter:
    for sent in corpus_iter:
        # Sentence boundary markers are added by hand to every column (the Fields above
        # are created without init/eos tokens); words and lemmas are lowercased.
        words = ['<s>'] + [tok.token.lower() for tok in sent] + ['</s>']
        lemmas = ['<s>'] + [tok.lemma.lower() for tok in sent] + ['</s>']
        gr_vals = ['<s>'] + [tok.grammar_value for tok in sent] + ['</s>']
        examples.append(Example.fromlist([words, lemmas, gr_vals], fields))

In [None]:
print('Words:', examples[1].word)
print('Lemmas:', examples[1].lemma)
print('Grammar vals:', examples[1].gram_val)

Таким образом, размер словаря может быть существенно сокращен - лемм меньше, чем слов, а предсказание грамматики вынуждает модель быть более осведомленной о согласовании слов.

In [None]:
# NOTE(review): `dataset`, `train_dataset`, `test_dataset`, `train_iter` and `test_iter`
# rebind the names of the poem dataset; functions that read the global `train_iter`
# will see the UD iterator after this cell has run.
dataset = Dataset(examples, fields)

# Separate vocabularies per column; words and lemmas use min_freq=3,
# grammatical values use the build_vocab default.
dataset.fields['word'].build_vocab(dataset, min_freq=3)
print('Word vocab size =', len(dataset.fields['word'].vocab))
dataset.fields['lemma'].build_vocab(dataset, min_freq=3)
print('Lemma vocab size =', len(dataset.fields['lemma'].vocab))
dataset.fields['gram_val'].build_vocab(dataset)
print('Grammar val vocab size =', len(dataset.fields['gram_val'].vocab))

# 75% of the sentences go to training, the rest to the held-out set.
train_dataset, test_dataset = dataset.split(split_ratio=0.75)

# Batch size is 32 for training and 128 for evaluation; batches are created on DEVICE.
train_iter, test_iter = BucketIterator.splits(datasets=(train_dataset, test_dataset), batch_sizes=(32, 128), 
                                              shuffle=True, device=DEVICE, sort=False)

Построим маппинг из пары (лемма, грамматическое значение) в слово - если бы у нас под рукой был морфологический словарь, маппинг можно было бы пополнить, добавить слова для лемм из корпуса, которые не встретились в обучении.

In [None]:
# Map every (lemma, grammatical value) pair seen in the training sentences to a
# surface word; when a pair occurs more than once, the last occurrence wins.
dictionary = {}
for sentence in train_iter.dataset.examples:
    analyses = zip(sentence.lemma, sentence.gram_val)
    dictionary.update(zip(analyses, sentence.word))

**Задание**  Обновите генератор - например, можно сэмплировать лемму и находить самое вероятное грамматическое значение, которое встречается  в паре с этой леммой в `dictionary`.

In [None]:
# Exercise stub for the multi-task generator: the body is only `...`, so calling it does nothing.
# NOTE(review): this redefines the earlier `generate`, which `fit` calls by name;
# once this cell has run, `fit` will call this version.
def generate(model, temp=0.7):
    ...

**Task** Update the model and the training function.

The model should take pairs `lemma, gr_val`, concatenate their embeddings and predict the next `lemma, gr_val` pair from the LSTM output.

The function `do_epoch` should sum the loss for lemma prediction (masking `<unk>` and `<pad>`) and the loss for grammatical-value prediction (masking `<pad>`).

## Controlled generation

I want to make the generation more controlled - ideally, to set the topic.

A simple way is to do topic modeling: find some topics in the texts and then pass the topic vector to the model together with the embeddings, so that the model learns to generate thematically consistent text.

In [None]:
from gensim import corpora, models

# One document per pie poem, with the line-break marker tokens removed.
docs = [[word for word in poem if word != '\\n'] for poem in perashki]

# NOTE(review): this rebinds `dictionary`, which earlier held the (lemma, gr_val) -> word mapping.
dictionary = corpora.Dictionary(docs)
# Remove the 100 most frequent tokens from the dictionary (presumably mostly function words).
dictionary.filter_n_most_frequent(100)

# Bag-of-words representation of every document.
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

# LDA topic model with 5 topics, trained with 5 passes over the corpus.
lda_model = models.LdaModel(bow_corpus, num_topics=5, id2word=dictionary, passes=5)

Посмотреть, что выучилось, можно так:

In [None]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)

Предсказывает распределение модель как-то так:

In [None]:
# Pretty-print poem #10: the '\\n' marker token ends a verse line,
# every other token is followed by a single space.
for token in perashki[10]:
    print(end='\n' if token == '\\n' else token + ' ')

In [None]:
lda_model.get_document_topics(bow_corpus[10])

**Task** Compute the topic vector for every text and pass it to the model along with the words (concatenating it to the embeddings). See what happens.

# References

Regularizing and Optimizing LSTM Language Models, 2017 [[arxiv]](https://arxiv.org/abs/1708.02182), [[github]](https://github.com/salesforce/awd-lstm-lm) - одна из самых полезных статей про языковые модели + репозиторий, в котором реализовано много полезного, стоит заглянуть

Exploring the Limits of Language Modeling, 2016 [[arxiv]](https://arxiv.org/abs/1602.02410)

Using the Output Embedding to Improve Language Models, 2017 [[pdf]](http://www.aclweb.org/anthology/E17-2025)

[Transfer learning, cs231n](http://cs231n.github.io/transfer-learning/)  
[Transfer learning, Ruder](http://ruder.io/transfer-learning/) - очень подробная статья от чувака из NLP
[An Overview of Multi-Task Learning in Deep Neural Networks, Ruder](http://ruder.io/multi-task/)  
[Multi-Task Learning Objectives for Natural Language Processing, Ruder](http://ruder.io/multi-task-learning-nlp/)