# Лекция №6

## RNN-сети

Помимо продвижения сигнала _вглубь_ сети, образуются межэлементные (рекуррентные) связи внутри одного слоя.

### Обучение

- Граф связей (развёрнутая во времени модель с раскрытыми связями) обучаем, как и обычную сеть.

### RNN-модуль

- Конкатенируем вход с предыдущим результатом и применяем к ним внутреннюю функцию активации `tanh`.

__Проблема__:
- Забывчивость (исправляют LSTM):
  - _residual_-блоки для сохранения сигналов;
  - первое слагаемое — совокупность входа и предыдущего состояния (forget gate);
  - используем состояние и вход для определения текущего состояния (input gate);
  - результат ячейки — совокупность input gate и forget gate (cell update);
  - выходной gate формирует результат и состояние, учитывая вход и предыдущее состояние.

GRU схожи с LSTM, но исключают второй параметр выхода ячейки (отдельное состояние ячейки).

In [13]:
from torch import nn
import torch

In [14]:
# Single-layer, unidirectional LSTM mapping 3 input features to 3 hidden units.
lstm = nn.LSTM(
    input_size=3,
    hidden_size=3,
    num_layers=1,
    bias=True,
    batch_first=False,
    dropout=0,
    bidirectional=False,
)
# Random initial state pair, each tensor of shape (1, 1, 3).
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))
# Five random time steps, each a (1, 3) tensor.
inputs = [torch.randn(1, 3) for _ in range(5)]
# Feed the sequence one step at a time, carrying the state into the next call.
for step in inputs:
    step_as_sequence = step.view(1, 1, -1)
    out, hidden = lstm(step_as_sequence, hidden)
    print(out.data, [state.data for state in hidden])

tensor([[[-0.1376, -0.0534,  0.0858]]]) [tensor([[[-0.1376, -0.0534,  0.0858]]]), tensor([[[-0.5819, -0.1293,  0.1176]]])]
tensor([[[-0.2166,  0.2641,  0.1926]]]) [tensor([[[-0.2166,  0.2641,  0.1926]]]), tensor([[[-0.5162,  0.4397,  0.4905]]])]
tensor([[[-0.0089,  0.0961,  0.0392]]]) [tensor([[[-0.0089,  0.0961,  0.0392]]]), tensor([[[-0.0238,  0.1715,  0.1055]]])]
tensor([[[0.0523, 0.0938, 0.0164]]]) [tensor([[[0.0523, 0.0938, 0.0164]]]), tensor([[[0.1161, 0.1477, 0.0610]]])]
tensor([[[-0.1366,  0.1296, -0.0530]]]) [tensor([[[-0.1366,  0.1296, -0.0530]]]), tensor([[[-0.6740,  0.3636, -0.0680]]])]


In [15]:
# Stack the per-step (1, 3) tensors into one (len(inputs), 1, 3) sequence tensor.
inputs = torch.stack(inputs)
# Draw a fresh random state pair so this run does not continue from earlier calls.
hidden = tuple(torch.randn(1, 1, 3) for _ in range(2))
# Process the entire sequence in a single LSTM call.
out, hidden = lstm(inputs, hidden)

In [16]:
# Show the input sequence tensor that was fed to the LSTM.
print(inputs)

tensor([[[-0.6211, -2.6352,  1.1328]],

        [[ 2.4052,  0.5578,  1.6831]],

        [[-0.3897, -1.0508, -0.5149]],

        [[-0.1309,  0.2160, -0.0988]],

        [[ 0.6567, -2.1673,  2.0208]]])


In [17]:
# Show the LSTM output for every step of the sequence.
print(out)

tensor([[[ 0.0483, -0.1339, -0.1824]],

        [[-0.0020,  0.2097,  0.0826]],

        [[ 0.0455,  0.0950, -0.0318]],

        [[ 0.0810,  0.1086, -0.0248]],

        [[-0.1339,  0.1334, -0.0901]]], grad_fn=<StackBackward0>)


In [18]:
# Show the state tuple returned by the LSTM call.
print(hidden)

(tensor([[[-0.1339,  0.1334, -0.0901]]], grad_fn=<StackBackward0>), tensor([[[-0.6609,  0.3774, -0.1158]]], grad_fn=<StackBackward0>))


In [19]:
class Model(nn.Module):
    """Four stacked bidirectional LSTMs followed by a two-layer head.

    The LSTM hidden sizes are 400, 300, 200 and 100. Every LSTM is
    bidirectional and batch-first, so each layer receives ``2 * hidden`` of
    the previous layer as its input size. The head is
    ``Linear -> SELU -> Linear`` and emits one value per time step.

    Args:
        input_size: Number of features per time step of the input.
    """

    def __init__(self, input_size):
        hidden = [400, 300, 200, 100]
        super().__init__()
        self.lstm1 = nn.LSTM(input_size, hidden[0],
                             batch_first=True, bidirectional=True)
        # Bidirectional outputs concatenate both directions: 2 * hidden.
        self.lstm2 = nn.LSTM(2 * hidden[0], hidden[1],
                             batch_first=True, bidirectional=True)
        self.lstm3 = nn.LSTM(2 * hidden[1], hidden[2],
                             batch_first=True, bidirectional=True)
        self.lstm4 = nn.LSTM(2 * hidden[2], hidden[3],
                             batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(2 * hidden[3], 50)
        self.selu = nn.SELU()
        self.fc2 = nn.Linear(50, 1)
        self._reinitialize()

    def _reinitialize(self):
        """Apply a Tensorflow/Keras-like initialization to all parameters.

        LSTM input weights get Xavier-uniform, recurrent weights get
        orthogonal init, biases are zeroed except the forget-gate slice of
        ``bias_ih`` which is set to 1. Linear weights get Xavier-uniform and
        linear biases are zeroed.
        """
        for name, p in self.named_parameters():
            if 'lstm' in name:
                if 'weight_ih' in name:
                    nn.init.xavier_uniform_(p.data)
                elif 'weight_hh' in name:
                    nn.init.orthogonal_(p.data)
                elif 'bias_ih' in name:
                    p.data.fill_(0)
                    # PyTorch packs LSTM gates as (input, forget, cell, output),
                    # so the second quarter of the bias is the forget gate.
                    n = p.size(0)
                    p.data[(n // 4):(n // 2)].fill_(1)
                elif 'bias_hh' in name:
                    p.data.fill_(0)
            elif 'fc' in name:
                if 'weight' in name:
                    nn.init.xavier_uniform_(p.data)
                elif 'bias' in name:
                    p.data.fill_(0)

    def forward(self, x):
        """Run ``x`` through the LSTM stack and the head.

        Args:
            x: Batch-first input whose last dimension is ``input_size``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of 1.
        """
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x, _ = self.lstm3(x)
        x, _ = self.lstm4(x)
        x = self.fc2(self.selu(self.fc1(x)))
        # The original dropped the result, so calling the model returned None.
        return x

- Teacher forcing:
  - подаем каждый раз истинную метку:
    - быстрее обучается;
    - поведение на тестировании отличается от обучения (на тесте истинных меток нет).
- Professor forcing:
  - стараемся делать поведение сети при обучении и при свободной генерации похожим;
- Scheduled sampling:
  - смешиваем значение выборки с генерированным.

In [12]:
# Probability of feeding the ground-truth token (instead of the model's own
# prediction) as the next decoder input during a training call.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder,
          encoder_optimizer, decoder_optimizer, criterion, max_length=15):
    """Run one encoder-decoder training step on a single sequence pair.

    With probability ``teacher_forcing_ratio`` the decoder is fed the target
    token at every step (teacher forcing); otherwise it is fed its own top-1
    prediction and decoding stops early once ``EOS_token`` is produced.

    Relies on ``device``, ``SOS_token`` and ``EOS_token`` being defined
    elsewhere in the notebook.

    Args:
        input_tensor: Source sequence, indexed along dimension 0.
        target_tensor: Target sequence, indexed along dimension 0.
        encoder: Module exposing ``initHidden()`` and ``hidden_size``.
        decoder: Module called as ``decoder(input, hidden, encoder_outputs)``
            and returning ``(output, hidden, attention)``.
        encoder_optimizer: Optimizer for the encoder parameters.
        decoder_optimizer: Optimizer for the decoder parameters.
        criterion: Loss called as ``criterion(decoder_output, target_step)``.
        max_length: Number of rows allocated for the encoder outputs.

    Returns:
        The accumulated loss divided by the target length, as a float.
    """
    import random  # local import: the surrounding cells only import torch

    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    loss = 0

    # NOTE(review): assumes encoder(step, hidden) -> (output, hidden) with
    # output of shape (1, 1, hidden_size), as in the PyTorch seq2seq
    # tutorial this code follows — confirm against the encoder definition.
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    # The original used the string "a" here, which cannot form a tensor;
    # the decoder is started from the start-of-sequence token id instead.
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        # Teacher forcing: feed the target as the next input.
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]
    else:
        # Without teacher forcing: use its own predictions as the next input.
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()
    # Both optimizers must step before returning; the original returned
    # first and never stepped the encoder optimizer.
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item() / target_length

- RNN глубины не больше 4 (обычно).

Для обучения (чтобы побороть gradient vanishing):
- ограничение шагов обратного распространения;
- оптимизация с гессианом;
- специальная регуляризация.

### Эхо-сети

- берем случайные паттерны (связи модулей), а обучаем все модули, кроме полученных паттернов.

In [21]:
def prepare_sequence(seq, to_ix):
    """Look up every item of ``seq`` in ``to_ix`` and return the ids as a long tensor.

    Raises:
        KeyError: If an item of ``seq`` is missing from ``to_ix``.
    """
    indices = []
    for item in seq:
        indices.append(to_ix[item])
    return torch.tensor(indices, dtype=torch.long)

# Training set: (tokenized sentence, per-token tag list) pairs.
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]
# Tag -> id mapping.
tag_to_ix = {tag: idx for idx, tag in enumerate(("DET", "NN", "V"))}
# Token -> id mapping; ids are handed out in order of first appearance.
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        # setdefault leaves already-seen words untouched.
        word_to_ix.setdefault(word, len(word_to_ix))

print(word_to_ix)

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [26]:
EMBEDDING_DIM = 6 # should be made substantially larger in practice!
HIDDEN_DIM = 6

class LSTMTagger(nn.Module):
    """Embedding -> LSTM -> linear tagger returning per-token log-probabilities.

    Args:
        embedding_dim: Size of each word embedding.
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of rows in the embedding table.
        tagset_size: Number of output tags.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The LSTM consumes embeddings and produces hidden states.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # Projects each hidden state onto the tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return log-softmax tag scores, one row per element of ``sentence``."""
        seq_len = len(sentence)
        # Reshape to (seq_len, 1, -1): the sentence is treated as a batch of one.
        embedded = self.word_embeddings(sentence).view(seq_len, 1, -1)
        hidden_states = self.lstm(embedded)[0]
        logits = self.hidden2tag(hidden_states.view(seq_len, -1))
        return logits.log_softmax(dim=1)

In [27]:
# Tagger sized from the vocabulary and tag set built above in the notebook.
model = LSTMTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    tagset_size=len(tag_to_ix),
)
# Negative log-likelihood loss, optimized with plain SGD.
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [28]:
# Tag scores for the first training sentence; gradients are not needed here.
first_sentence = training_data[0][0]
with torch.no_grad():
    inputs = prepare_sequence(first_sentence, word_to_ix)
    tag_scores = model(inputs)
print(tag_scores)

tensor([[-1.0274, -1.3471, -0.9621],
        [-0.9032, -1.3646, -1.0811],
        [-0.9205, -1.3213, -1.0940],
        [-0.8779, -1.3206, -1.1477],
        [-0.8887, -1.3120, -1.1409]])


In [29]:
# Train for 3 passes over the data, one optimizer step per sentence.
for epoch in range(3):
    for sentence, tags in training_data:
        # Clear gradients accumulated by the previous step.
        model.zero_grad()
        # Convert words and tags to id tensors.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        # Forward pass, loss, backward pass, parameter update.
        tag_scores = model(sentence_in)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

In [30]:
# Re-score the first training sentence; gradients are not needed here.
first_sentence = training_data[0][0]
with torch.no_grad():
    inputs = prepare_sequence(first_sentence, word_to_ix)
    print(inputs)
    tag_scores = model(inputs)
print(tag_scores)

tensor([0, 1, 2, 3, 4])
tensor([[-1.0498, -1.2391, -1.0207],
        [-0.9311, -1.2415, -1.1491],
        [-0.9565, -1.2035, -1.1532],
        [-0.9014, -1.2005, -1.2278],
        [-0.9188, -1.1974, -1.2073]])
