In [2]:
import torch
import torch.nn as nn
from torch.optim import SGD 
import numpy as np

# Упражнение, для реализации "Ванильной" RNN
* Попробуем обучить сеть восстанавливать слово hello по первой букве. т.е. построим charecter-level модель

In [3]:
a = torch.ones((3,3))*3
b = torch.ones((3,3))*5

In [4]:
a @ b

tensor([[45., 45., 45.],
        [45., 45., 45.],
        [45., 45., 45.]])

In [5]:
a * b

tensor([[15., 15., 15.],
        [15., 15., 15.],
        [15., 15., 15.]])

In [6]:
word = 'ololoasdasddqweqw123456789'
#word = 'hello'

## Датасет. 
Позволяет:
* Закодировать символ при помощи one-hot
* Делать итератор по слову, которыей возвращает текущий символ и следующий как таргет

In [7]:
class WordDataSet:
    
    def __init__(self, word):
        self.chars2idx = {}
        self.indexs  = []
        for c in word: 
            if c not in self.chars2idx:
                self.chars2idx[c] = len(self.chars2idx)
                
            self.indexs.append(self.chars2idx[c])
            
        self.vec_size = len(self.chars2idx)
        self.seq_len  = len(word)
        
    def get_one_hot(self, idx):
        x = torch.zeros(self.vec_size)
        x[idx] = 1
        return x
    
    def __iter__(self):
        return zip(self.indexs[:-1], self.indexs[1:])
    
    def __len__(self):
        return self.seq_len
    
    def get_char_by_id(self, id):
        for c, i in self.chars2idx.items():
            if id == i: return c
        return None

## Реализация базовой RNN
<br/>
Скрытый элемент
$$ h_t= tanh⁡ (W_{ℎℎ} h_{t−1}+W_{xh} x_t) $$
Выход сети

$$ y_t = W_{hy} h_t $$

In [8]:
class VanillaRNN(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(VanillaRNN, self).__init__()        
        self.x2hidden    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden      = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.activation  = nn.Tanh()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        hidden = self.activation(self.x2hidden(x) + self.hidden(prev_hidden))
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden

## Инициализация переменных 

In [9]:
ds = WordDataSet(word=word)
rnn = VanillaRNN(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 100
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)

# Обучение

In [10]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

74.68247985839844
Clip gradient :  tensor(7.9369)
64.46907806396484
Clip gradient :  tensor(5.1973)
39.70587158203125
Clip gradient :  tensor(6.3590)
28.357858657836914
Clip gradient :  tensor(4.8708)
23.596736907958984
Clip gradient :  tensor(6.6768)
24.970483779907227
Clip gradient :  tensor(9.3834)
23.081567764282227
Clip gradient :  tensor(7.0319)
20.046682357788086
Clip gradient :  tensor(4.9698)
19.071449279785156
Clip gradient :  tensor(5.1603)
18.277280807495117
Clip gradient :  tensor(4.1550)


# Тестирование

In [11]:
rnn.eval()
hh = torch.zeros(rnn.hidden.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 oasddddddddddddddddddddddd
Original:	 ololoasdasddqweqw123456789


AssertionError: ignored

# Практика
Реализовать LSTM и GRU модули, обучить их предсказывать тестовое слово

In [12]:
#тестовое слово
word = 'ololoasdasddqweqw123456789'

## Реализовать LSTM

In [65]:
#Написать реализацию LSTM и обучить предсказывать слово
class LSTM(nn.Module):
  def __init__(self, in_size=5, hidden_size=3, out_size=3):
    super().__init__()
    self.x_i = nn.Linear(in_features=in_size, out_features=hidden_size)
    self.hidden_i = nn.Linear(in_features=hidden_size, out_features=hidden_size)
    self.b_i = nn.Parameter(torch.zeros(hidden_size))

    self.x_f = nn.Linear(in_features=in_size, out_features=hidden_size)
    self.hidden_f = nn.Linear(in_features=hidden_size, out_features=hidden_size)
    self.b_f = nn.Parameter(torch.zeros(hidden_size))

    self.x_o = nn.Linear(in_features=in_size, out_features=hidden_size)
    self.hidden_o = nn.Linear(in_features=hidden_size, out_features=hidden_size)
    self.b_o = nn.Parameter(torch.zeros(hidden_size))

    self.x_g = nn.Linear(in_features=in_size, out_features=hidden_size)
    self.hidden_g = nn.Linear(in_features=hidden_size, out_features=hidden_size)
    self.b_g = nn.Parameter(torch.zeros(hidden_size))

    self.tanh = nn.Tanh()
    self.sigmoid = nn.Sigmoid()
    self.outweight = nn.Linear(in_features=hidden_size, out_features=out_size)

  def forward(self, x, prev_hidden, prev_c):
    # print(x.shape, prev_hidden.shape, prev_c.shape)
    # print(self.x_i(x).shape, self.hidden_i(prev_hidden).shape, self.b_i.shape)
    i = self.sigmoid(self.x_i(x) + self.hidden_i(prev_hidden) + self.b_i)
    # print(f"i.shape = {i.shape}")
    f = self.sigmoid(self.x_f(x) + self.hidden_f(prev_hidden) + self.b_f)
    o = self.sigmoid(self.x_o(x) + self.hidden_o(prev_hidden) + self.b_o)
    g = self.tanh(self.x_g(x) + self.hidden_g(prev_hidden) + self.b_g)
    c_new = f * prev_c + i * g
    hidden = self.tanh(c_new) * o
    output = self.outweight(hidden)
    return output, hidden, c_new

In [66]:
ds = WordDataSet(word=word)
hidden_size = 10
rnn = LSTM(in_size=ds.vec_size, hidden_size=hidden_size, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
print(f"in_size={ds.vec_size}, hidden_size={hidden_size}, out_size={ds.vec_size}")
e_cnt     = 100
optim     = SGD(rnn.parameters(), lr = 0.01, momentum=0.9)

in_size=17, hidden_size=10, out_size=17


In [74]:
CLIP_GRAD = True

for epoch in range(e_cnt):
  h = torch.zeros(hidden_size)
  c = torch.zeros(hidden_size)
  loss = 0
  optim.zero_grad()
  for sample, next_sample in ds:
      x = ds.get_one_hot(sample).unsqueeze(0)
      target =  torch.LongTensor([next_sample])
      
      y, h, c = rnn(x, h, c)
      loss += criterion(y, target)
    

  loss.backward()
  
  if epoch % 10 == 0:
      print (loss.data.item())
      if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
  else: 
      if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
          
  # print("Params : ")
  # num_params = 0
  # for item in rnn.parameters():
  #     num_params += 1
  #     print(item.grad)
  # print("NumParams :", num_params)
  # print("Optimize")
  
  optim.step()

9.27072811126709
Clip gradient :  tensor(10.2192)
8.737988471984863
Clip gradient :  tensor(9.6936)
8.270630836486816
Clip gradient :  tensor(9.2256)
7.806108474731445
Clip gradient :  tensor(8.8226)
7.365233421325684
Clip gradient :  tensor(8.5298)
6.929915904998779
Clip gradient :  tensor(8.0776)
6.506451606750488
Clip gradient :  tensor(8.1888)
6.0715155601501465
Clip gradient :  tensor(7.2475)
5.6868157386779785
Clip gradient :  tensor(8.0842)
5.249034881591797
Clip gradient :  tensor(6.4048)


In [79]:
rnn.eval()
h = torch.zeros(hidden_size)
c = torch.zeros(hidden_size)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for _c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, h, c = rnn(x, h, c)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


## Реализовать GRU

In [82]:
#Написать реализацию GRU и обучить предсказывать слово
class GRU(nn.Module):
  def __init__(self, in_size=5, hidden_size=3, out_size=3):
    super().__init__()
    self.x_u = nn.Linear(in_features=in_size, out_features=hidden_size)
    self.hidden_u = nn.Linear(in_features=hidden_size, out_features=hidden_size)
    self.b_u = nn.Parameter(torch.zeros(hidden_size))

    self.x_r = nn.Linear(in_features=in_size, out_features=hidden_size)
    self.hidden_r = nn.Linear(in_features=hidden_size, out_features=hidden_size)
    self.b_r = nn.Parameter(torch.zeros(hidden_size))

    self.x_h = nn.Linear(in_features=in_size, out_features=hidden_size)
    self.hidden_h = nn.Linear(in_features=hidden_size, out_features=hidden_size)
    self.b_h = nn.Parameter(torch.zeros(hidden_size))

    self.tanh = nn.Tanh()
    self.sigmoid = nn.Sigmoid()
    self.outweight = nn.Linear(in_features=hidden_size, out_features=out_size)

  def forward(self, x, prev_hidden, prev_c):
    u = self.sigmoid(self.x_u(x) + self.hidden_u(prev_hidden) + self.b_u)
    r = self.sigmoid(self.x_r(x) + self.hidden_r(prev_hidden) + self.b_r)
    __h = r * prev_hidden
    _h = self.tanh(self.x_h(x) + self.hidden_h(__h))
    h = (1 - u) * _h + u * prev_hidden
    output = self.outweight(h)
    return output, h

In [83]:
ds = WordDataSet(word=word)
hidden_size = 10
rnn = GRU(in_size=ds.vec_size, hidden_size=hidden_size, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
print(f"in_size={ds.vec_size}, hidden_size={hidden_size}, out_size={ds.vec_size}")
e_cnt     = 100
optim     = SGD(rnn.parameters(), lr = 0.01, momentum=0.9)

in_size=17, hidden_size=10, out_size=17


In [84]:
CLIP_GRAD = True

for epoch in range(e_cnt):
  h = torch.zeros(hidden_size)
  
  loss = 0
  optim.zero_grad()
  for sample, next_sample in ds:
      x = ds.get_one_hot(sample).unsqueeze(0)
      target =  torch.LongTensor([next_sample])
      c = torch.zeros(hidden_size)
      y, h = rnn(x, h, c)
      loss += criterion(y, target)
    

  loss.backward()
  
  if epoch % 10 == 0:
      print (loss.data.item())
      if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
  else: 
      if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
          
  # print("Params : ")
  # num_params = 0
  # for item in rnn.parameters():
  #     num_params += 1
  #     print(item.grad)
  # print("NumParams :", num_params)
  # print("Optimize")
  
  optim.step()

72.42302703857422
Clip gradient :  tensor(4.6419)
69.8535385131836
Clip gradient :  tensor(3.5886)
66.83639526367188
Clip gradient :  tensor(3.2351)
63.54423522949219
Clip gradient :  tensor(3.7836)
58.882816314697266
Clip gradient :  tensor(4.7286)
52.84427261352539
Clip gradient :  tensor(5.1046)
46.27897262573242
Clip gradient :  tensor(5.2396)
39.647979736328125
Clip gradient :  tensor(5.0810)
33.375
Clip gradient :  tensor(5.0499)
27.126129150390625
Clip gradient :  tensor(4.9527)


In [86]:
rnn.eval()
h = torch.zeros(hidden_size)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for _c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    c = torch.zeros(hidden_size)
    y, h = rnn(x, h, c)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789
