# Семинар по рекуррентным нейронным сетям
На этом семинаре мы обучим несколько рекуррентных архитектур для решения задачи сентимент-анализа, то есть предсказания метки тональности предложения.

В общем случае рекуррентная нейронная сеть предназначена для обработки последовательности произвольной длины. Однако при реализации метода оказывается проще зафиксировать длину последовательности (даже в pytorch с их динамическими графами :) Поэтому мы пока поступим так, но вернемся к этому вопросу ниже.

Сначала мы разберемся с RNN в pytorch, а затем сами реализуем наиболее популярную архитектуру.

Задание сделано так, чтобы его можно было выполнять на CPU, однако RNN - это ресурсоемкая вещь, поэтому на GPU с ними работать приятнее. Можете попробовать использовать [https://colab.research.google.com](https://colab.research.google.com) - бесплатное облако с GPU.

### Гиперпараметры

In [0]:
vocab_size = 20000 
index_from = 3
n_hidden = 128
n_emb = 300
seq_len = 32

batch_size = 128
learning_rate = 0.001
num_epochs = 50

use_gpu = True

### Загрузка данных
Функция load_matrix_imdb скачивает матричные данные, перемешивает и загружает их в numpy-массивы.

Если у вас не установлен wget, скачайте [архив imdb.npz](https://s3.amazonaws.com/text-datasets/imdb.npz)

In [0]:
# http://pytorch.org/
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if cuda_output and exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
import torch

In [0]:
# !pip install Pillow==4.0.0
# !pip install PIL
# !pip install image

In [0]:
import numpy as np
import os
import re
from collections import defaultdict
import operator

def load_matrix_imdb(path='imdb.npz', num_words=None, skip_top=0,
              maxlen=None, seed=113,
              start_char=1, oov_char=2, index_from=3, **kwargs):
    """
    Modified code from Keras
    Loads data matrixes from npz file, crops and pads seqs and returns
    shuffled (x_train, y_train), (x_test, y_test)
    """
    if not os.path.exists(path):
        print("Downloading matrix data into current folder")
        os.system("wget https://s3.amazonaws.com/text-datasets/imdb.npz")
        
    with np.load(path) as f:
        x_train, labels_train = f['x_train'], f['y_train']
        x_test, labels_test = f['x_test'], f['y_test']

    np.random.seed(seed)
    indices = np.arange(len(x_train))
    np.random.shuffle(indices)
    x_train = x_train[indices]
    labels_train = labels_train[indices]

    indices = np.arange(len(x_test))
    np.random.shuffle(indices)
    x_test = x_test[indices]
    labels_test = labels_test[indices]

    xs = np.concatenate([x_train, x_test])
    labels = np.concatenate([labels_train, labels_test])

    if start_char is not None:
        xs = [[start_char] + [w + index_from for w in x] for x in xs]
    elif index_from:
        xs = [[w + index_from for w in x] for x in xs]

    if not num_words:
        num_words = max([max(x) for x in xs])
    if not maxlen:
        maxlen = max([len(x) for x in xs])

    # by convention, use 2 as OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    xs_new = []
    for x in xs:
        x = x[:maxlen] # crop long sequences
        if oov_char is not None: # replace rare or frequent symbols 
            x = [w if (skip_top <= w < num_words) else oov_char for w in x]
        else: # or filter rare and frequent symbols
            x = [w for w in x if skip_top <= w < num_words]
        x_padded = np.zeros(maxlen)#, dtype = 'int32')
        x_padded[-len(x):] = x
        xs_new.append(x_padded)    
            
    idx = len(x_train)
    x_train, y_train = np.array(xs_new[:idx]), np.array(labels[:idx])
    x_test, y_test = np.array(xs_new[idx:]), np.array(labels[idx:])

    return (x_train, y_train), (x_test, y_test)

In [0]:
import numpy as np
import torch
import torch.utils.data

In [0]:
np.random.seed(0)
(X_train, y_train), (X_test, y_test) = load_matrix_imdb(num_words=vocab_size,
                                                        maxlen=seq_len)

In [0]:
set(y_train) # binary classification

{0, 1}

In [0]:
X_train.shape, X_test.shape

((25000, 32), (25000, 32))

In [0]:
X_train[0] # sequence of coded words

array([1.000e+00, 1.400e+01, 2.200e+01, 1.600e+01, 4.300e+01, 5.300e+02,
       9.730e+02, 1.622e+03, 1.385e+03, 6.500e+01, 4.580e+02, 4.468e+03,
       6.600e+01, 3.941e+03, 4.000e+00, 1.730e+02, 3.600e+01, 2.560e+02,
       5.000e+00, 2.500e+01, 1.000e+02, 4.300e+01, 8.380e+02, 1.120e+02,
       5.000e+01, 6.700e+02, 2.000e+00, 9.000e+00, 3.500e+01, 4.800e+02,
       2.840e+02, 5.000e+00])

In [0]:
train_dset = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_dset = torch.utils.data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
                                           
train_loader = torch.utils.data.DataLoader(
    train_dset,
    batch_size=batch_size,
    shuffle=True,
)

test_loader = torch.utils.data.DataLoader(
    test_dset,
    batch_size=batch_size,
    shuffle=True,
)

### Сборка и обучение RNN в pytorch

In [0]:
import os
import torch.optim as optim
import torch.nn as nn


Наша нейросеть будет обрабатывать входную последовательность по словам (word level). Мы будем использовать простую и стандарную рекуррентную архитектуру для сентимент-анализа: слой представлений, слой LSTM и полносвязный слой, предсказывающий выход по последнему скрытому состоянию.

Ниже даны шаблоны реализации нейросети и ее обучения. Допишите класс и функции обучения так, чтобы класс реализовывал описанную архитектуру, а вызов функции train не выдавал ошибок :)

In [0]:
class LSTMClassifier(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, \
                 batch_size, use_gpu, rec_layer, dropout):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.use_gpu = use_gpu

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = rec_layer(embedding_dim, hidden_dim, dropout=dropout)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
    ### your code here
        self.hidden = self.init_hidden()
      
    def init_hidden(self):
      h_0 = torch.zeros(1, self.batch_size, self.hidden_dim)
      c_0 = torch.zeros(1, self.batch_size, self.hidden_dim)
      if self.use_gpu:
        h_0 = h_0.cuda()
        c_0 = c_0.cuda()
        
      return (h_0, c_0)

    def forward(self, sentence):
      lstm_output, self.hidden = self.lstm(self.word_embeddings(sentence), self.hidden)
      return torch.sigmoid(self.hidden2label(lstm_output[-1]))

In [0]:
# raise Exception("pause plz")

Exception: ignored

In [0]:
model = LSTMClassifier(embedding_dim=n_emb,
                             hidden_dim=n_hidden,
                              vocab_size=vocab_size,
                              label_size=1,
                             batch_size=batch_size, 
                             use_gpu=use_gpu,
                             rec_layer = nn.LSTM)
if use_gpu:
    model = model.cuda()

[Исходный код LSTM](http://pytorch.org/docs/master/_modules/torch/nn/modules/rnn.html#LSTM)

In [0]:
#?model.lstm.forward

In [0]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
lossfun = nn.BCELoss(reduction='sum')
#lossfun = nn.CrossEntropyLoss()

In [0]:
def train_epoch(loader, model, lossfun, optimizer, use_gpu):
    model.train()
    for train_inputs, train_labels in loader:
        train_labels = torch.squeeze(train_labels)

        if use_gpu:
            train_inputs, train_labels = train_inputs.cuda(), train_labels.cuda()

        model.zero_grad()
        model.batch_size = len(train_labels)
        model.hidden = model.init_hidden()
     
        output = model(train_inputs.long().t())
#         print(output.view(-1).shape)
#         print(train_labels.shape)
        
        loss = lossfun(output.view(-1), train_labels.float())
        loss.backward()
        optimizer.step()

        del train_inputs, train_labels
        if use_gpu:
          torch.cuda.empty_cache()

def evaluate(loader, model, lossfun, use_gpu):
    model.eval()
    total_acc = 0.0
    total_loss = 0.0
    total = 0.0
    
    for train_inputs, train_labels in loader:
      with torch.no_grad():
        train_labels = torch.squeeze(train_labels)
        if use_gpu:
          train_inputs, train_labels = train_inputs.cuda(), train_labels.cuda()

        ### your code here
        model.batch_size = len(train_labels)
        model.hidden = model.init_hidden()
        output = model(train_inputs.long().t())

        loss = lossfun(output.view(-1), train_labels.float())
        total_loss = total_loss + loss.data.item()

        # calc testing acc
        ### your code here
        if not use_gpu:
          total_acc = total_acc + (train_labels.numpy() == (output > 0.5).data.long().view(-1).numpy()).sum()
        else:
          total_acc = total_acc + (train_labels.cpu().numpy() == (output > 0.5).data.long().view(-1).cpu().numpy()).sum()
        
        total = total + len(train_labels)
        
        del train_inputs, train_labels
        if use_gpu:
          torch.cuda.empty_cache()
        
    return total_loss / total, total_acc / total

def train(train_loader, test_loader, model, lossfun, optimizer, use_gpu, num_epochs):
    train_loss_ = []
    test_loss_ = []
    train_acc_ = []
    test_acc_ = []
    for epoch in range(num_epochs):
        train_epoch(train_loader, model, lossfun, optimizer, use_gpu)
        train_loss, train_acc = evaluate(train_loader, model, lossfun, use_gpu)
        train_loss_.append(train_loss)
        train_acc_.append(train_acc)
        test_loss, test_acc = evaluate(test_loader, model, lossfun, use_gpu)
        test_loss_.append(test_loss)
        test_acc_.append(test_acc)

        print('[Epoch: %3d/%3d] Training Loss: %.3f, Testing Loss: %.3f, Training Acc: %.3f, Testing Acc: %.3f'
              % (epoch, num_epochs, train_loss_[epoch], test_loss_[epoch], train_acc_[epoch], test_acc_[epoch]))
    return train_loss_, train_acc_, test_loss_, test_acc_

In [0]:
a, b, c, d = train(train_loader, test_loader, model, lossfun, optimizer, use_gpu, 15)

[Epoch:   0/ 15] Training Loss: 0.575, Testing Loss: 0.610, Training Acc: 0.725, Testing Acc: 0.671
[Epoch:   1/ 15] Training Loss: 0.424, Testing Loss: 0.560, Training Acc: 0.821, Testing Acc: 0.708
[Epoch:   2/ 15] Training Loss: 0.317, Testing Loss: 0.579, Training Acc: 0.882, Testing Acc: 0.710
[Epoch:   3/ 15] Training Loss: 0.184, Testing Loss: 0.658, Training Acc: 0.941, Testing Acc: 0.715
[Epoch:   4/ 15] Training Loss: 0.108, Testing Loss: 0.785, Training Acc: 0.971, Testing Acc: 0.712
[Epoch:   5/ 15] Training Loss: 0.083, Testing Loss: 0.928, Training Acc: 0.975, Testing Acc: 0.710
[Epoch:   6/ 15] Training Loss: 0.052, Testing Loss: 1.156, Training Acc: 0.985, Testing Acc: 0.712
[Epoch:   7/ 15] Training Loss: 0.035, Testing Loss: 1.237, Training Acc: 0.991, Testing Acc: 0.701
[Epoch:   8/ 15] Training Loss: 0.023, Testing Loss: 1.400, Training Acc: 0.994, Testing Acc: 0.708
[Epoch:   9/ 15] Training Loss: 0.011, Testing Loss: 1.514, Training Acc: 0.998, Testing Acc: 0.714


Нерегуляризованные LSTM часто быстро переобучаются. Чтобы с этим бороться, часто используют L2-регуляризацию и дропаут.
Однако способов накладывать дропаут на рекуррентный слой достаточно много, и далеко не все хорошо работают. Мы реализуем дропаут, описанный в [статье Гала и Гарамани](https://arxiv.org/abs/1512.05287).
Для этого нам потребуется самостоятельно реализовать LSTM.

### Самостоятельная реализация LSTM

Для начала реализуйте LSTM, не обращая внимание на параметр dropout, и протестируйте модель. На каждом шаге ячейка LSTM обновляет скрытое состояние и память по следующим формулам:
$$
i = \sigma(h_{t-1}W^i + x_t U^i+b_i) \quad
o = \sigma(h_{t-1}W^o + x_t U^o+b_o) 
$$
$$
f = \sigma(h_{t-1}W^f + x_t U^f+b_f) \quad 
g = tanh(h_{t-1} W^g + x_t U^g+b_g) 
$$
$$
c_t = f \odot c_{t-1} +  i \odot  g \quad
h_t =  o \odot tanh(c_t) \nonumber
$$

Теперь реализуйте дропаут для рекуррентного слоя. Как и в сетях прямого распространения, дропаут можно накладывать на вход и скрытое состояние ($x_t$ и $h_t$). Ключевая идея дропаута Гала состоит в том, что бинарная маска должна быть одинаковая для всех моментов времени (но своя для разных объектов). Кроме того, статья утверждает, что одновременно с бинарным дропаутом нужно использовать L$_2$-регуляризацию. Ее тоже можно включить (параметр weight_decay в оптимизаторе).

Формулы ячейки LSTM с бинарным дропаутом ($b_x$ и $b_h$ - бинарные маски):

$$
i = \sigma((h_{t-1} \odot b_h) W^i + (x_t \odot b_x) U^i+b_i) \quad
o = \sigma((h_{t-1} \odot b_h)W^o + (x_t \odot b_x) U^o+b_o) 
$$
$$
f = \sigma((h_{t-1} \odot b_h)W^f + (x_t \odot b_x) U^f+b_f) \quad 
g = tanh((h_{t-1} \odot b_h) W^g + (x_t \odot b_x) U^g+b_g) 
$$
$$
c_t = f \odot c_{t-1} +  i \odot  g \quad
h_t =  o \odot tanh(c_t)
$$

In [0]:
from torch.nn.parameter import Parameter

In [0]:
class MyLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, dropout=0):
        super(MyLSTM, self).__init__()
        ### your code here
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dropout = dropout
          
        if use_gpu:
          self.u_input_weights = nn.Linear(input_size, 4 * hidden_size).cuda()
          self.w_hidden_weights = nn.Linear(hidden_size, 4 * hidden_size).cuda()
        else:
          self.u_input_weights = nn.Linear(input_size, 4 * hidden_size)
          self.w_hidden_weights = nn.Linear(hidden_size, 4 * hidden_size)
        self.reset_params()
        
        
    def reset_params(self):
        """
        initialization as in Pytorch
        do not forget to call this method!
        """
        stdv = 1.0 / np.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)
            
    def forward(self, input, hidden):
        ### your code here
        h = hidden[0]
        c = hidden[1]
        
        if self.training and self.dropout > 0.0:
          dropout = nn.Dropout(p=self.dropout)
          if use_gpu:
            b_x = dropout(torch.ones(self.input_size)).cuda()
            b_h = dropout(torch.ones_like(h)).cuda()
          else:
            b_x = dropout(torch.ones(self.input_size))
            b_h = dropout(torch.ones_like(h))
          
        for ix in range(len(input)):
          x = input[ix]
          
          if self.training and self.dropout > 0.0:
            lstm_gates = (self.u_input_weights(x*b_x) + self.w_hidden_weights(h*b_h)).chunk(chunks=4, dim=2)
          else:
            lstm_gates = (self.u_input_weights(x) + self.w_hidden_weights(h)).chunk(chunks=4, dim=2)
            
          #https://developer.nvidia.com/sites/default/files/pictures/2018/lstm.png
          lstm_gates = (self.u_input_weights(x) + self.w_hidden_weights(h)).chunk(chunks=4, dim=2)
          input_gate, forget_gate, cell_gate, output_gate = lstm_gates
          
          input_gate = torch.sigmoid(input_gate)
          forget_gate = torch.sigmoid(forget_gate)
          cell_gate = torch.tanh(cell_gate)
          output_gate = torch.sigmoid(output_gate)
        
          c = (forget_gate * c) + (input_gate * cell_gate)
          h = output_gate * torch.tanh(c)
          
          if ix == 0:
            h_output = h
            c_output = c
          else:
            h_output = torch.cat([h_output, h], 0)

        return h_output, c_output


Протестируйте полученную модель, сравните итоговое качество на тестовой выборке с нерегуляризованной моделью.

In [0]:
DROP_OUT_VALUE = [0, 0.25, 0.5, 0.75, 0.9]
WEIGHT_DECAY = [0, 0.01, 0.001, 0.0001]

In [142]:
for dropout_val in DROP_OUT_VALUE:
  for weight_decay_val in WEIGHT_DECAY:
    model = LSTMClassifier(embedding_dim=n_emb,
                         hidden_dim=n_hidden,
                          vocab_size=vocab_size,
                          label_size=1,
                         batch_size=batch_size, 
                         use_gpu=use_gpu,
                         rec_layer=MyLSTM,
                         dropout=dropout_val
                          )
    if use_gpu:
        model = model.cuda()
        
    if weight_decay_val:
      optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay_val)
    else:
      optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    loss_function = nn.BCELoss(size_average=False)
    print("Experiment run for dropout = {}, weight_decay = {}".format(dropout_val, weight_decay_val))
    a, b, c, d = train(train_loader, test_loader, model, lossfun, optimizer, use_gpu, 10)
    print()
    



Experiment run for dropout = 0, weight_decay = 0
[Epoch:   0/ 10] Training Loss: 0.568, Testing Loss: 0.612, Training Acc: 0.726, Testing Acc: 0.669
[Epoch:   1/ 10] Training Loss: 0.407, Testing Loss: 0.562, Training Acc: 0.827, Testing Acc: 0.710
[Epoch:   2/ 10] Training Loss: 0.311, Testing Loss: 0.612, Training Acc: 0.872, Testing Acc: 0.719
[Epoch:   3/ 10] Training Loss: 0.187, Testing Loss: 0.663, Training Acc: 0.935, Testing Acc: 0.717
[Epoch:   4/ 10] Training Loss: 0.115, Testing Loss: 0.779, Training Acc: 0.966, Testing Acc: 0.713
[Epoch:   5/ 10] Training Loss: 0.057, Testing Loss: 1.037, Training Acc: 0.984, Testing Acc: 0.714
[Epoch:   6/ 10] Training Loss: 0.041, Testing Loss: 1.061, Training Acc: 0.991, Testing Acc: 0.708
[Epoch:   7/ 10] Training Loss: 0.024, Testing Loss: 1.340, Training Acc: 0.994, Testing Acc: 0.707
[Epoch:   8/ 10] Training Loss: 0.017, Testing Loss: 1.407, Training Acc: 0.997, Testing Acc: 0.707
[Epoch:   9/ 10] Training Loss: 0.012, Testing Loss

L$_2$-регуляризация (параметр weight_decay в оптимизаторе) больше всего повлияла на loss\acc результат экспериментов
Лучший результат при weight_decay=0.01, однако случайный поиск даст более точный результат
Параметр dropout почти не повлиял на производительность сети