In [1]:
import torch
import torch.nn as nn
from torch.optim import SGD 
import numpy as np

# Упражнение, для реализации "Ванильной" RNN
* Попробуем обучить сеть восстанавливать слово hello по первой букве. т.е. построим charecter-level модель

In [2]:
a = torch.ones((3,3))*3
b = torch.ones((3,3))*5

In [3]:
a @ b

tensor([[45., 45., 45.],
        [45., 45., 45.],
        [45., 45., 45.]])

In [4]:
a * b

tensor([[15., 15., 15.],
        [15., 15., 15.],
        [15., 15., 15.]])

In [5]:
word = 'ololoasdasddqweqw123456789'
# word = 'hello'
device = torch.device('cuda')

## Датасет. 
Позволяет:
* Закодировать символ при помощи one-hot
* Делать итератор по слову, которыей возвращает текущий символ и следующий как таргет

In [6]:
class WordDataSet:
    
    def __init__(self, word):
        self.chars2idx = {}
        self.indexs  = []
        for c in word: 
            if c not in self.chars2idx:
                self.chars2idx[c] = len(self.chars2idx)
                
            self.indexs.append(self.chars2idx[c])
            
        self.vec_size = len(self.chars2idx)
        self.seq_len  = len(word)
        
    def get_one_hot(self, idx):
        x = torch.zeros(self.vec_size)
        x[idx] = 1
        return x
    
    def __iter__(self):
        return zip(self.indexs[:-1], self.indexs[1:])
    
    def __len__(self):
        return self.seq_len
    
    def get_char_by_id(self, id):
        for c, i in self.chars2idx.items():
            if id == i: return c
        return None

## Реализация базовой RNN
<br/>
Скрытый элемент
$$ h_t= tanh⁡ (W_{ℎℎ} h_{t−1}+W_{xh} x_t) $$
Выход сети

$$ y_t = W_{hy} h_t $$

In [7]:
class VanillaRNN(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(VanillaRNN, self).__init__()        
        self.x2hidden    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden      = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.activation  = nn.Tanh()                                                         #убиря тангенс градиент взрывается
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        hidden = self.activation(self.x2hidden(x) + self.hidden(prev_hidden))
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden # в LSTM озвращается 3 параметра hidden cell output

## Инициализация переменных RNN

In [8]:
ds = WordDataSet(word=word)
rnn = VanillaRNN(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 1000
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)

# Обучение RNN

In [9]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

70.97218322753906
Clip gradient :  4.910309349844152
58.63840866088867
Clip gradient :  5.2989042753281135
38.026004791259766
Clip gradient :  5.72162757872091
31.51056480407715
Clip gradient :  14.014392104300347
27.997692108154297
Clip gradient :  13.146466516119984
26.130306243896484
Clip gradient :  9.400126814924173
26.135711669921875
Clip gradient :  10.56450363266952
24.23821449279785
Clip gradient :  10.967867018223615
22.482589721679688
Clip gradient :  5.786121449205299
23.444833755493164
Clip gradient :  7.530576060682781
20.45547866821289
Clip gradient :  6.532083539401054
18.708005905151367
Clip gradient :  7.622614572350123
17.33254623413086
Clip gradient :  8.71955669443517
20.5743408203125
Clip gradient :  8.365423557616502
17.582252502441406
Clip gradient :  10.179772646895092
18.536964416503906
Clip gradient :  4.939761390765118
35.410709381103516
Clip gradient :  28.829133198330254
19.89492416381836
Clip gradient :  7.89358002045628
26.975528717041016
Clip gradient :

# Тестирование RNN

In [10]:
rnn.eval()
hh = torch.zeros(rnn.hidden.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololololololololololololol
Original:	 ololoasdasddqweqw123456789


AssertionError: 

# ДЗ


## Реализация LSTM

In [11]:
class LSTM (nn.Module):
        
    def __init__(self, in_size = 5, hidden_size = 3, out_size = 5):
        super(LSTM, self).__init__()

        self.nIter = 0
        print (in_size)
        print (hidden_size)
        print (out_size)
        self.prev_state = torch.zeros(hidden_size)
        print ('size of prev_state = ', self.prev_state.size())

        self.Wxc = nn.Linear (in_features=in_size, out_features=hidden_size)
        self.Wxi = nn.Linear (in_features=in_size, out_features=hidden_size)
        self.Wxf = nn.Linear (in_features=in_size, out_features=hidden_size)
        self.Wxo = nn.Linear (in_features=in_size, out_features=hidden_size)

        self.Whc = nn.Linear (in_features=hidden_size, out_features=hidden_size)
        self.Whi = nn.Linear (in_features=hidden_size, out_features=hidden_size)
        self.Whf = nn.Linear (in_features=hidden_size, out_features=hidden_size)
        self.Who = nn.Linear (in_features=hidden_size, out_features=hidden_size)

        self.outweight = nn.Linear (in_features=hidden_size, out_features=out_size)

        self.activation  = nn.Tanh()                                                         #убиря тангенс градиент взрывается

    def forward(self, x, prev_hidden):
        #print ("iter no ", self.nIter)
        self.nIter += 1

        c = self.activation(self.Wxc(x) + self.Whc(prev_hidden))
        #print ('sizeof c = ', c.size())
        i = torch.sigmoid  (self.Wxi(x) + self.Whi(prev_hidden))
        #print ('sizeof i = ', i.size())
        f = torch.sigmoid  (self.Wxf(x) + self.Whf(prev_hidden))
        #print ('sizeof f = ', f.size())
        o = torch.sigmoid  (self.Wxo(x) + self.Who(prev_hidden))
        #print ('sizeof o = ', o.size())

        state  = f * self.prev_state + i * c
        hidden = o * self.activation(state)
        output = self.outweight(hidden)

        self.prev_state = state
        return output, hidden
            
            
            

## Инициализация переменных LSTM

In [13]:
ds = WordDataSet(word=word)
#rnn = VanillaRNN(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
rnn = LSTM(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 800
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)  # больше эпох, меньше lr

17
3
17
size of prev_state =  torch.Size([3])


# Обучение LSTM

In [14]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    #hh = torch.zeros(rnn.hidden.in_features)
    hh = torch.zeros(rnn.Whc.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward(retain_graph=True)
    
    if epoch % 10 == 0:
        print ('epoch', epoch)
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))  # норму тоже можно менять и это поможет
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

epoch 0
71.80549621582031
Clip gradient :  3.7464316588676825
epoch 10
67.80767822265625
Clip gradient :  3.4368091136620103
epoch 20
56.145877838134766
Clip gradient :  4.783557551385455
epoch 30
44.098384857177734
Clip gradient :  5.024869207477741
epoch 40
31.787113189697266
Clip gradient :  5.535859668079092
epoch 50
26.748558044433594
Clip gradient :  12.348403501906601
epoch 60
27.062150955200195
Clip gradient :  13.983449571832734
epoch 70
42.506107330322266
Clip gradient :  27.683577520867107
epoch 80
45.2901725769043
Clip gradient :  24.918214968591613
epoch 90
27.93473243713379
Clip gradient :  18.43181627011852
epoch 100
22.330181121826172
Clip gradient :  4.20402390080865
epoch 110
20.634708404541016
Clip gradient :  3.64113523948088
epoch 120
17.24968719482422
Clip gradient :  3.593724392539985
epoch 130
15.352540969848633
Clip gradient :  2.9593505918249483
epoch 140
13.699014663696289
Clip gradient :  2.189987752050617
epoch 150
11.159832954406738
Clip gradient :  1.6444

# Тестирование LSTM

In [15]:
rnn.eval()
#hh = torch.zeros(rnn.hidden.in_features)
hh = torch.zeros(3)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


## Реализация GRU

In [20]:
class GRU (nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(GRU, self).__init__()
        
        self.Wxu = nn.Linear (in_features=in_size, out_features=hidden_size)
        self.Wxr = nn.Linear (in_features=in_size, out_features=hidden_size)
        self.Wxh = nn.Linear (in_features=in_size, out_features=hidden_size)
        
        self.Whu = nn.Linear (in_features=hidden_size, out_features=hidden_size)
        self.Whr = nn.Linear (in_features=hidden_size, out_features=hidden_size)
        self.Whh = nn.Linear (in_features=hidden_size, out_features=hidden_size)
        
        self.outweight  = nn.Linear (in_features=hidden_size, out_features=out_size)
        self.activasion = nn.Tanh()
        
    def forward(self, x, prev_hidden):
        
        u      = torch.sigmoid  (self.Wxu(x) + self.Whu(prev_hidden))
        r      = torch.sigmoid  (self.Wxr(x) + self.Whr(prev_hidden))
        h      = self.activasion(self.Wxh(x) + self.Whh(r * prev_hidden))
        hidden = (torch.ones(u.size()) - u) * h + u * prev_hidden

        output = self.outweight(hidden)
        
        return output, hidden
           
            

## Инициализация переменных GRU

In [28]:
ds = WordDataSet(word=word)
rnn = GRU(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 6000
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)

## Обучение GRU

In [29]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.Whu.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print ('epoch', epoch)
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

epoch 0
73.16291046142578
Clip gradient :  5.179620979719791
epoch 10
62.900001525878906
Clip gradient :  5.386808831805271
epoch 20
43.390869140625
Clip gradient :  7.222646102914481
epoch 30
34.89240264892578
Clip gradient :  11.7192219288621
epoch 40
27.109901428222656
Clip gradient :  3.578579801335792
epoch 50
21.25787353515625
Clip gradient :  2.806711358943242
epoch 60
18.032333374023438
Clip gradient :  9.024785464703136
epoch 70
22.390649795532227
Clip gradient :  25.683223203106216
epoch 80
19.023849487304688
Clip gradient :  12.635288078834575
epoch 90
18.017215728759766
Clip gradient :  14.610677053124359
epoch 100
16.243391036987305
Clip gradient :  11.368504481191625
epoch 110
19.127246856689453
Clip gradient :  15.44414523652144
epoch 120
14.9714937210083
Clip gradient :  7.690299552022672
epoch 130
21.888504028320312
Clip gradient :  19.62923149812076
epoch 140
16.281597137451172
Clip gradient :  13.573173432614048
epoch 150
16.424697875976562
Clip gradient :  8.5088257

## Тестирование GRU

In [30]:
rnn.eval()
hh = torch.zeros(3)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789
