# 2 Практическая часть


In [209]:
import pandas as pd
import torch
import numpy as np
import scipy 
from collections import defaultdict, Counter
import re
import os
from tqdm.notebook import trange, tqdm

## 2.1 Загрузка датасета.

### 1. Составьте таблицу, в которой указано число токенов, уникальных токенов, предложений для каждой из трех частей датасета.

In [210]:
# Directory that holds the PTB corpus files; file names are built below as "ptb.{split}.txt".
folder_name = './filimdb_evaluation/PTB/'
# Names of the dataset splits that are substituted into the "ptb.{split}.txt" pattern.
filenames = ['train', 'valid', 'test']

In [211]:
def get_file_info(filename):
    '''
    Counts tokens, unique tokens and lines in one PTB split file.
    Args:
        filename: Split name; the file read is folder_name + "ptb.{filename}.txt"
    Returns:
        Tuple (filename, total_tokens, unique_tokens, cnt_lines, tokens_cnt) where
        tokens_cnt is a defaultdict(int) mapping every whitespace-separated token
        to its number of occurrences in the file
    '''
    tokens_cnt = defaultdict(int)
    cnt_lines = 0
    # folder_name is only read here, so no `global` statement is required.
    # The encoding is pinned so the result does not depend on the platform default.
    with open(folder_name + f"ptb.{filename}.txt", 'r', encoding='utf-8') as inp:
        for line in inp:
            cnt_lines += 1
            for token in line.split():
                tokens_cnt[token] += 1
    total_tokens = sum(tokens_cnt.values())
    unique_tokens = len(tokens_cnt)
    return filename, total_tokens, unique_tokens, cnt_lines, tokens_cnt

# Column-oriented storage for the summary table: one list per DataFrame column.
data = {
    'file':[],
    'token_cnt':[],
    'unique_tokens':[],
    'sentences_cnt': []
}

# Per-split token -> count dictionaries, kept to compute the corpus-wide vocabulary below.
file_dicts = []

# One table row per split; get_file_info returns
# (filename, total_tokens, unique_tokens, cnt_lines, tokens_cnt).
for f in filenames:
    f_data = get_file_info(f)
    data['file'].append(f_data[0])
    data['token_cnt'].append(f_data[1])
    data['unique_tokens'].append(f_data[2])
    data['sentences_cnt'].append(f_data[3])
    file_dicts.append(f_data[4])
    
# Merge the per-split counts so that unique tokens are counted once across all files.
all_tokens = defaultdict(int)
for d in file_dicts:
    for k, v in d.items():
        all_tokens[k] += v

# Final "all files" row: the sums are evaluated before the append, so they cover only the split rows.
data['file'].append('all files')
data['token_cnt'].append(sum(data['token_cnt']))
data['unique_tokens'].append(len(all_tokens.keys()))
data['sentences_cnt'].append(sum(data['sentences_cnt']))
    
df = pd.DataFrame(data=data)
df

Unnamed: 0,file,token_cnt,unique_tokens,sentences_cnt
0,train,887521,9999,42068
1,valid,70390,6021,3370
2,test,78669,6048,3761
3,all files,1036580,9999,49199


### 2. Приведите 10 самых частотных и 10 самых редких токенов с их частотами.
(тут, видимо, для всех файлов)

In [212]:
# Count token occurrences over all three splits together.
tokens_cnt = defaultdict(int)
for f in filenames:
     with open(folder_name + f"ptb.{f}.txt", 'r') as inp:
        for line in inp:
            for token in line.strip().split():
                tokens_cnt[token] += 1
# (token, count) pairs sorted by ascending count: rarest first, most frequent last.
cnt_list = list(tokens_cnt.items())
cnt_list.sort(key=lambda x: x[1])
# The last ten pairs, reversed, give the ten most frequent tokens in descending order.
most_frequent_data = {'word':[], 'cnt':[]}
for w, c in cnt_list[-10:][::-1]:
    most_frequent_data['word'].append(w)
    most_frequent_data['cnt'].append(c)
# The first ten pairs are the ten rarest tokens.
least_frequent_data = {'word':[], 'cnt':[]}
for w, c in cnt_list[:10]:
    least_frequent_data['word'].append(w)
    least_frequent_data['cnt'].append(c)

In [213]:
# Display the ten most frequent tokens as a table.
pd.DataFrame(data=most_frequent_data)

Unnamed: 0,word,cnt
0,the,59421
1,<unk>,53299
2,N,37607
3,of,28427
4,to,27430
5,a,24755
6,in,21032
7,and,20404
8,'s,11555
9,for,10436


In [214]:
# Display the ten rarest tokens as a table.
pd.DataFrame(data=least_frequent_data)

Unnamed: 0,word,cnt
0,buffet,5
1,lancaster,5
2,barnett,5
3,rewrite,5
4,downgrading,5
5,backgrounds,5
6,stanza,5
7,vessel,5
8,unstable,5
9,peat,5


### 3. Какие специальные токены уже есть в выборке, что они означают?


Вроде как, токены выглядят как текст в треугольных кавычках. Поищем такие фрагменты.

In [215]:
# Collect every distinct substring that looks like an angle-bracketed token, across all splits.
spec_tokens = set()
for f in filenames:
     with open(folder_name + f"ptb.{f}.txt", 'r') as inp:
        for line in inp:
            # The pattern matches "<", zero or more lowercase ASCII letters, then ">".
            cur_spec = set(re.findall(r'<[a-z]*>', line))
            spec_tokens = spec_tokens.union(cur_spec)
print(spec_tokens)

{'<unk>'}


Этим токеном заменяются слова, не вошедшие в 10000 самых популярных в корпусе.

Также есть специальные токены вида:
<br>
1. N - все отдельно стоящие числа заменяются на этот токен.
2. \$ - на этот токен заменяются все знаки валют.

## 2.2 Генерация батчей.

Тут написана версия разбиения на батчи для слов в обычном виде, чтобы проще было проверить правильность построения. Сильно ниже будет версия генератора для слов, уже приведенных к индексам.

In [216]:
def print_batch(ind, X_b, Y_b):
    '''
    Prints a batch header followed by one line per row: the input row and its target row.
    Args:
        ind: Batch number shown in the header
        X_b: Sequence of input rows
        Y_b: Sequence of target rows, indexed in parallel with X_b
    '''
    print(f"Batch # {ind}")
    for row_idx, x_row in enumerate(X_b):
        print(x_row, ' ', Y_b[row_idx])

In [217]:
def batch_generator_text(data_path, batch_size, num_steps, debug=False):
    '''
    Builds (input, target) batches of plain-text tokens to check the batching logic by eye.
    The function prints the lengths of the input and target streams, optionally prints
    the first three batches, and returns None (nothing is yielded).
    Args:
        data_path: Path to a text file with one sentence per line
        batch_size: Number of rows in every batch
        num_steps: Number of tokens in every row of a batch
        debug: When True, the first three batches are printed with print_batch
    '''
    eos_token = '<eos>'
    stream = []
    with open(data_path, 'r', encoding='utf-8') as inp:
        for raw_line in inp:
            stream += [tok.lower() for tok in raw_line.strip().split()]
            stream.append(eos_token)

    # Targets are the inputs shifted one token to the left.
    inputs, targets = stream[:-1], stream[1:]
    print(len(inputs), len(targets))

    # The stream is cut into batch_size consecutive rows of equal length; the tail is dropped.
    slice_len = len(inputs) // batch_size

    def split_rows(seq):
        return [seq[r * slice_len : (r + 1) * slice_len] for r in range(batch_size)]

    X_lists, Y_lists = split_rows(inputs), split_rows(targets)

    for i in range(slice_len // num_steps):
        window = slice(i * num_steps, (i + 1) * num_steps)
        X_batch = [row[window] for row in X_lists]
        Y_batch = [row[window] for row in Y_lists]
        if debug and i < 3:
            print_batch(i, X_batch, Y_batch)
    # NOTE: yielding tensors is intentionally left out of this text version;
    # batch_generator_inds is the variant that yields index tensors.

# Debug run on the train split: prints the stream lengths and the first three batches.
# batch_generator_text has no return or yield statement, so `res` is None.
res = batch_generator_text(folder_name + "ptb.train.txt", batch_size = 2, num_steps = 3, debug=True)

929588 929588
Batch # 0
['aer', 'banknote', 'berlitz']   ['banknote', 'berlitz', 'calloway']
['guarantee', 'the', 'government']   ['the', 'government', 'can']
Batch # 1
['calloway', 'centrust', 'cluett']   ['centrust', 'cluett', 'fromstein']
['can', 'ensure', 'the']   ['ensure', 'the', 'same']
Batch # 2
['fromstein', 'gitano', 'guterman']   ['gitano', 'guterman', 'hydro-quebec']
['same', 'flow', 'of']   ['flow', 'of', 'resources']


На **train** датасете функция создаёт батчи, похожие на правду (выше выведены первые три батча).

## 2.3 Реализация LSTM LM.

### 2.3.1 Класс LSTMCell
Для реализации LSTM ячейки будем отталкиваться от реализации обычной RNN ячейки из семинара.

In [218]:
class LSTMCell(torch.nn.Module):
    '''
    Single LSTM cell. The pre-activations of all four gates are computed at once
    with one fused weight matrix for the input and one for the previous hidden state.
    '''
    def __init__(self, input_size, hidden_size):
        '''
        Args:
            input_size: Size of token embedding
            hidden_size: Size of hidden state of LSTM cell
        '''
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Four gates are stored side by side, hence the factor of 4.
        gates_width = 4 * hidden_size

        # Trainable projection of the cell input (token embedding) and its bias
        self.U_input = torch.nn.Parameter(torch.empty(input_size, gates_width))
        self.BU_input = torch.nn.Parameter(torch.empty(gates_width))

        # Trainable projection of the previous hidden state and its bias
        self.W_hidden = torch.nn.Parameter(torch.empty(hidden_size, gates_width))
        self.BW_hidden = torch.nn.Parameter(torch.empty(gates_width))

        # The tensors above are uninitialized until this call
        self.reset_parameters()

    def forward(self, inp: torch.Tensor, cell_state: torch.Tensor, hidden_state: torch.Tensor) -> (torch.Tensor, torch.Tensor):
        '''
        Performs one step of the recurrent cell
        Args:
            inp: Output from Embedding layer at the current timestep
                Tensor shape is (batch_size, emb_size)
            cell_state: Cell state from the previous step or zero state
                Tensor shape is (batch_size, hidden_size)
            hidden_state: Hidden state from the previous step or zero state
                Tensor shape is (batch_size, hidden_size)
        Returns:
            Tuple (new cell state, new hidden state)
        '''
        from_hidden = hidden_state @ self.W_hidden + self.BW_hidden
        from_input = inp @ self.U_input + self.BU_input

        # Column blocks, in order: forget gate, input gate, candidate cell values, output gate
        forget_pre, input_pre, candidate_pre, output_pre = torch.chunk(from_input + from_hidden, 4, dim=1)
        forget_gate = torch.sigmoid(forget_pre)
        input_gate = torch.sigmoid(input_pre)
        candidate = torch.tanh(candidate_pre)
        output_gate = torch.sigmoid(output_pre)

        cell_state_new = cell_state * forget_gate + input_gate * candidate
        hidden_state_new = output_gate * torch.tanh(cell_state_new)

        return cell_state_new, hidden_state_new

    def reset_parameters(self):
        '''
        Fills every parameter with values drawn uniformly from
        [-1/sqrt(hidden_size), 1/sqrt(hidden_size)]
        '''
        bound = 1.0 / np.sqrt(self.hidden_size)
        for param in self.parameters():
            torch.nn.init.uniform_(param, -bound, bound)

8 матриц и векторов смещений заменили на 2 каждого вида.<br>
Всё перемножили и сложили по формулам, применили функции активации к каждой из 4 частей большой матрицы. <br>
Дальше осталось просто всё правильно поэлементно перемножить и получить новое состояние ячейки и новое скрытое состояние.

### 2.3.2 Класс LSTMLayer

In [219]:
class LSTMLayer(torch.nn.Module):
    '''
    One recurrent layer: applies a single LSTMCell to every timestep of the input in order.
    '''
    def __init__(self, emb_size, hidden_size):
        '''
        Args:
            emb_size: Size of the input vector at every timestep
            hidden_size: Size of the hidden and cell state of the cell
        '''
        super().__init__()
        self.input_size = emb_size
        self.hidden_size = hidden_size
        self.LSTMCell = LSTMCell(emb_size, hidden_size)

    def forward(self, X_batch, initial_states):
        '''
        Args:
            X_batch: Input tensor; dimension 0 is iterated as the time axis
            initial_states: Tuple (cell_state, hidden_state) used at the first timestep
        Returns:
            Tuple (hidden states of all timesteps stacked along dimension 0,
                   (last cell_state, last hidden_state))
        '''
        cell_state, hidden_state = initial_states
        hidden_per_step = []
        for step_input in X_batch.unbind(dim=0):
            cell_state, hidden_state = self.LSTMCell(step_input, cell_state, hidden_state)
            hidden_per_step.append(hidden_state)
        stacked_hidden = torch.stack(hidden_per_step)
        return stacked_hidden, (cell_state, hidden_state)

### 2.3.3 Класс LSTM

In [220]:
class LSTM(torch.nn.Module):
    '''
    Stack of LSTMLayer modules with dropout applied to the input of every layer
    and to the output of the last one.
    Modules are stored in self.layers in the order
    Dropout, LSTMLayer, Dropout, LSTMLayer, ..., Dropout.
    '''
    def __init__(self, emb_size, hidden_size, num_layers, dropout_rate):
        '''
        Args:
            emb_size: Size of the vectors fed to the first layer
            hidden_size: Size of the hidden and cell state of every layer
            num_layers: Number of stacked LSTMLayer modules
            dropout_rate: Probability passed to every torch.nn.Dropout
        '''
        super(LSTM, self).__init__()
        self.input_size = emb_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout_rate = dropout_rate

        layers = []
        for i in range(num_layers):
            layers.append(torch.nn.Dropout(p=self.dropout_rate))
            # Only the first layer sees embeddings; deeper layers see hidden states.
            layers.append(LSTMLayer(emb_size if i == 0 else hidden_size, hidden_size))
        layers.append(torch.nn.Dropout(p=self.dropout_rate))
        self.layers = torch.nn.ModuleList(layers)

    def forward(self, X_batch, initial_states):
        '''
        Args:
            X_batch: Input tensor; dimension 0 is the time axis iterated by LSTMLayer
            initial_states: Tuple (cell_state, hidden_state). Each tensor is either
                2-D (batch_size, hidden_size), in which case the same initial state
                is given to every layer, or 3-D (num_layers, batch_size, hidden_size)
                with one state per layer
        Returns:
            Tuple (output of the last dropout, (cell_state, hidden_state)).
            With a single layer the returned states are 2-D, as before; with several
            layers they are stacked to (num_layers, batch_size, hidden_size) so that
            every layer gets its own state back on the next call
        '''
        cell_states, hidden_states = initial_states
        # Previously every layer was started from the same tensors and only the last
        # layer's state was returned, so lower layers resumed from the wrong state.
        has_layer_axis = cell_states.dim() == 3

        new_cells, new_hiddens = [], []
        for layer in self.layers:
            if isinstance(layer, LSTMLayer):
                layer_idx = len(new_cells)
                if has_layer_axis:
                    layer_state = (cell_states[layer_idx], hidden_states[layer_idx])
                else:
                    layer_state = (cell_states, hidden_states)
                X_batch, (cell_state, hidden_state) = layer(X_batch, layer_state)
                new_cells.append(cell_state)
                new_hiddens.append(hidden_state)
            else:
                X_batch = layer(X_batch)

        if not new_cells:
            # No recurrent layers: there is no new state to report.
            return X_batch, initial_states
        if len(new_cells) == 1:
            return X_batch, (new_cells[0], new_hiddens[0])
        return X_batch, (torch.stack(new_cells), torch.stack(new_hiddens))

### 2.3.4 Класс PTBLM

In [221]:
class PTBLM(torch.nn.Module):
    '''
    Language model: token embedding -> LSTM stack -> projection to vocabulary logits.
    With tie_emb=True the output projection reuses the embedding matrix
    (plus the separate bias tie_b); otherwise the decoder linear layer is used.
    '''
    def __init__(self, num_layers, emb_size, hidden_size, vocab_size, dropout_rate, tie_emb=True):
        '''
        Args:
            num_layers: Number of stacked LSTM layers
            emb_size: Size of token embedding
            hidden_size: Size of the LSTM hidden state
            vocab_size: Number of tokens in the vocabulary
            dropout_rate: Dropout probability used inside the LSTM stack
            tie_emb: Reuse the embedding matrix as the output projection
        Raises:
            ValueError: If tie_emb is True and emb_size != hidden_size
        '''
        super(PTBLM, self).__init__()
        if tie_emb and emb_size != hidden_size:
            # The tied projection multiplies hidden states by the transposed embedding
            # matrix, so the two sizes must match; fail here instead of inside forward.
            raise ValueError(
                f"tie_emb=True requires emb_size == hidden_size, "
                f"got emb_size={emb_size} and hidden_size={hidden_size}"
            )
        self.num_layers = num_layers
        self.input_size = emb_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.dropout_rate = dropout_rate
        self.tie = tie_emb

        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.LSTM = LSTM(emb_size, hidden_size, num_layers, dropout_rate)
        # Both output heads are always created, so the set of parameters does not
        # depend on tie_emb; forward uses only one of them.
        self.decoder = torch.nn.Linear(in_features=hidden_size, out_features=vocab_size)
        self.tie_b = torch.nn.Parameter(torch.zeros(vocab_size))

        self.init_weights()

    def forward(self, model_input, initial_states):
        '''
        Args:
            model_input: Token indices; embedded and then transposed so that
                dimension 1 of the input becomes the time axis of the LSTM
            initial_states: Initial (cell_state, hidden_state) passed to the LSTM stack
        Returns:
            Tuple (logits with dimensions 0 and 1 swapped back to the input order, states)
        '''
        embs = self.embedding(model_input).transpose(0, 1).contiguous()

        outputs, states = self.LSTM(embs, initial_states)

        if self.tie:
            ns, bs = outputs.shape[0], outputs.shape[1]
            flat_outputs = outputs.reshape(-1, self.hidden_size)
            logits = flat_outputs.mm(self.embedding.weight.t()) + self.tie_b
            logits = logits.view(ns, bs, self.vocab_size)
        else:
            logits = self.decoder(outputs)

        logits = logits.transpose(0, 1).contiguous()

        return logits, states

    def init_weights(self):
        '''
        Initializes embedding, decoder weight and tied bias uniformly in [-0.1, 0.1]
        '''
        self.embedding.weight.data.uniform_(-0.1, 0.1)
        self.decoder.weight.data.uniform_(-0.1, 0.1)
        torch.nn.init.uniform_(self.tie_b, -0.1, 0.1)

    def init_hidden(self, batch_size, device):
        '''
        Returns:
            Tuple of two zero tensors of shape (batch_size, hidden_size) on `device`
        '''
        # Allocate directly on the target device instead of creating on CPU and copying.
        first_state = torch.zeros(batch_size, self.hidden_size, device=device)
        second_state = torch.zeros(batch_size, self.hidden_size, device=device)
        return first_state, second_state

## 2.4 Обучение языковой модели.

Ниже функции для подготовки ptb датасета и словарей.

In [222]:
# Token that _read_sentences puts in front of every sentence.
START_TOKEN =  '<start>'
# Token that batch_generator_inds appends after every sentence.
EOS_TOKEN =  '<eos>'

In [223]:
def _read_words(path):
    with open(path, 'r') as inp:
        names = inp.read().lower().split()
        return names
# Sanity check on a small sample file.
print(_read_words(folder_name + 'small.txt'))

['pierre', '<unk>', 'n', 'years', 'old', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', 'n', 'mr.', '<unk>', 'is', 'chairman', 'of', '<unk>', 'n.v.', 'the', 'dutch', 'publishing', 'group', 'rudolph', '<unk>', 'n', 'years', 'old', 'and', 'former', 'chairman', 'of', 'consolidated', 'gold', 'fields', 'plc', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'british', 'industrial', 'conglomerate']


In [224]:
def _read_sentences(path):
    '''
    Reads a text file with one sentence per line.
    Args:
        path: Path to the text file
    Returns:
        List of sentences; every sentence is a list of lowercased tokens
        with START_TOKEN in front
    '''
    sentences = []
    with open(path, 'r', encoding='utf-8') as inp:
        for line in inp:
            words = line.lower().split()
            # Lines without tokens are skipped. Splitting the whole text on newlines
            # turned a trailing newline at the end of the file into an extra
            # sentence that contained only START_TOKEN.
            if words:
                sentences.append([START_TOKEN] + words)
    return sentences
    
# Sanity check: print every sentence of the small sample file.
sents = _read_sentences(folder_name + 'small.txt')
for sent in sents:
    print(sent)

['<start>', 'pierre', '<unk>', 'n', 'years', 'old', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', 'n']
['<start>', 'mr.', '<unk>', 'is', 'chairman', 'of', '<unk>', 'n.v.', 'the', 'dutch', 'publishing', 'group']
['<start>', 'rudolph', '<unk>', 'n', 'years', 'old', 'and', 'former', 'chairman', 'of', 'consolidated', 'gold', 'fields', 'plc', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'british', 'industrial', 'conglomerate']


In [225]:
def _build_vocab(path):
    '''
    Builds the vocabulary from the words of a file plus the two special tokens.
    Ids are assigned in order of decreasing frequency; words with equal frequency
    keep the order of their first appearance.
    Args:
        path: Path to the text file the vocabulary is built from
    Returns:
        Tuple (word_to_id, id_to_word) of mutually inverse dictionaries
    '''
    tokens = _read_words(path)
    # Each special token is added once so that it gets an id.
    tokens.extend([START_TOKEN, EOS_TOKEN])

    ranked = Counter(tokens).most_common()

    word_to_id = {word: idx for idx, (word, _) in enumerate(ranked)}
    id_to_word = {idx: word for word, idx in word_to_id.items()}

    return word_to_id, id_to_word

# Sanity check: build and print the vocabulary of the small sample file.
word_to_id, id_to_word = _build_vocab(folder_name + 'small.txt')
print('Vocab size = ', len(word_to_id))
print(list(word_to_id.items()))

Vocab size =  37
[('<unk>', 0), ('n', 1), ('of', 2), ('years', 3), ('old', 4), ('the', 5), ('a', 6), ('nonexecutive', 7), ('director', 8), ('chairman', 9), ('pierre', 10), ('will', 11), ('join', 12), ('board', 13), ('as', 14), ('nov.', 15), ('mr.', 16), ('is', 17), ('n.v.', 18), ('dutch', 19), ('publishing', 20), ('group', 21), ('rudolph', 22), ('and', 23), ('former', 24), ('consolidated', 25), ('gold', 26), ('fields', 27), ('plc', 28), ('was', 29), ('named', 30), ('this', 31), ('british', 32), ('industrial', 33), ('conglomerate', 34), ('<start>', 35), ('<eos>', 36)]


In [226]:
def _sentences_to_word_ids(path, word_to_id):
    '''
    Reads sentences from a file and replaces every token with its vocabulary id.
    Args:
        path: Path to the text file with one sentence per line
        word_to_id: Mapping of each word into its index in the vocabulary
    Returns:
        List of sentences, every sentence is a list of ids
    Raises:
        KeyError: If a word is missing from word_to_id and the vocabulary
            has no '<unk>' entry to fall back to
    '''
    sentences = _read_sentences(path)
    unk_id = word_to_id.get('<unk>')
    if unk_id is None:
        # No fallback available: keep the strict lookup.
        return [[word_to_id[word] for word in sent] for sent in sentences]
    # Words unseen when the vocabulary was built are mapped to '<unk>'
    # instead of aborting the whole conversion.
    return [[word_to_id.get(word, unk_id) for word in sent] for sent in sentences]

# Sanity check: convert the small sample file to ids with its own vocabulary.
word_to_id, id_to_word = _build_vocab(folder_name + 'small.txt')
res = _sentences_to_word_ids(folder_name + 'small.txt', word_to_id)
for sent in res:
    print(sent)

[35, 10, 0, 1, 3, 4, 11, 12, 5, 13, 14, 6, 7, 8, 15, 1]
[35, 16, 0, 17, 9, 2, 0, 18, 5, 19, 20, 21]
[35, 22, 0, 1, 3, 4, 23, 24, 9, 2, 25, 26, 27, 28, 29, 30, 6, 7, 8, 2, 31, 32, 33, 34]


In [227]:
def ptb_raw_data(data_path, debug=False):
    '''
    Loads the three PTB splits as lists of id sentences.
    The vocabulary is built from the train split only.
    Args:
        data_path: Directory with ptb.train.txt, ptb.valid.txt and ptb.test.txt
        debug: Accepted but not used by this function
    Returns:
        Tuple (train_data, dev_data, test_data, word_to_id, id_to_word)
    '''
    split_files = ('ptb.train.txt', 'ptb.valid.txt', 'ptb.test.txt')
    train_path, dev_path, test_path = (os.path.join(data_path, name) for name in split_files)

    word_to_id, id_to_word = _build_vocab(train_path)
    train_data, dev_data, test_data = (
        _sentences_to_word_ids(split_path, word_to_id)
        for split_path in (train_path, dev_path, test_path)
    )

    return train_data, dev_data, test_data, word_to_id, id_to_word

# Load the whole dataset and show the first five train sentences as ids.
train_data, dev_data, test_data, word_to_ind, ind_to_word = ptb_raw_data(folder_name)
print('Vocab size = ', len(word_to_ind))
for sent in train_data[:5]:
    print(sent)

Vocab size =  10001
[9999, 9969, 9970, 9971, 9972, 9973, 9974, 9975, 9976, 9977, 9978, 9979, 9980, 9981, 9982, 9983, 9984, 9985, 9986, 9987, 9988, 9989, 9990, 9991, 9992]
[9999, 8568, 1, 2, 71, 392, 32, 2115, 0, 145, 18, 5, 8569, 274, 406, 2]
[9999, 22, 1, 12, 140, 3, 1, 5277, 0, 3054, 1580, 95]
[9999, 7231, 1, 2, 71, 392, 7, 336, 140, 3, 2467, 656, 2157, 948, 23, 520, 5, 8569, 274, 3, 38, 302, 436, 3660]
[9999, 5, 940, 3, 3142, 494, 261, 4, 136, 5881, 4218, 5882, 29, 985, 5, 239, 754, 3, 1012, 2764, 210, 5, 95, 3, 426, 4059, 4, 13, 44, 54, 2, 71, 194, 1232, 219]


In [228]:
def batch_generator_inds(data, word_to_id, batch_size, num_steps, debug=False):
    '''
    Generator of (input, target) batches built from sentences of token ids.
    All sentences are joined into one stream with the EOS_TOKEN id after each of them;
    targets are the inputs shifted one position to the left.
    Args:
        data: List of sentences, every sentence is a list of ids
        word_to_id: Mapping of each word into its index in the vocabulary
        batch_size: Number of rows in every batch
        num_steps: Number of ids in every row of a batch
        debug: When True, batches are printed with print_batch and nothing is yielded
    Yields:
        Tuple (X, Y) of tensors built from batch_size rows of num_steps ids each
    '''
    stream = []
    for sentence in data:
        stream += sentence + [word_to_id[EOS_TOKEN]]
    inputs, targets = stream[:-1], stream[1:]

    # The stream is cut into batch_size consecutive rows of equal length; the tail is dropped.
    slice_len = len(inputs) // batch_size

    def split_rows(seq):
        return [seq[r * slice_len : (r + 1) * slice_len] for r in range(batch_size)]

    X_lists, Y_lists = split_rows(inputs), split_rows(targets)

    for i in range(slice_len // num_steps):
        window = slice(i * num_steps, (i + 1) * num_steps)
        X_batch = [row[window] for row in X_lists]
        Y_batch = [row[window] for row in Y_lists]
        if debug:
            print_batch(i, X_batch, Y_batch)
            continue
        if not X_batch:
            continue
        yield torch.tensor(X_batch, requires_grad=False), torch.tensor(Y_batch, requires_grad=False)

# Sanity check: take the first batch of the train split.
# Note: ptb_raw_data accepts `debug` but does not use it.
train_data, dev_data, test_data, word_to_ind, ind_to_word = ptb_raw_data(folder_name, debug=True)
res = batch_generator_inds(train_data, word_to_ind, batch_size = 2, num_steps = 3)
next(res)

(tensor([[9999, 9969, 9970],
         [   4, 2818,  507]]),
 tensor([[9969, 9970, 9971],
         [2818,  507,   35]]))

Теперь перейдем к функциям для обучения сети.

In [229]:
def update_lr(optimizer, lr):
    '''
    Sets the learning rate of every parameter group of the optimizer.
    Args:
        optimizer: Object with a param_groups list of dictionaries
        lr: New value stored under the 'lr' key of every group
    '''
    for param_group in optimizer.param_groups:
        param_group.update(lr=lr)

def run_epoch(
    lr,
    model,
    data, 
    word_to_id, 
    loss_fn, 
    batch_size,
    num_steps,
    optimizer = None, 
    device = None,
    max_grad_norm = None
) -> float:
    '''
    Performs one training epoch or inference epoch
    Args:
        lr: Learning rate for this epoch (used only when optimizer is given)
        model: Language model object; must provide init_hidden, vocab_size and
            return (logits, states) when called
        data: List of id sentences that will be passed through the language model
        word_to_id: Mapping of each word into its index in the vocabulary
        loss_fn: Torch loss function; it must return one loss value per target
            (no reduction), because the values are counted and summed here
        batch_size: Number of rows in every batch
        num_steps: Number of tokens in every row of a batch
        optimizer: Torch optimizer; when None no parameter update is performed
        device: Input tensors should be sent to this device
        max_grad_norm: When not None, gradients are clipped to this norm
            before every optimizer step
    Returns: 
        Perplexity
    Raises:
        ZeroDivisionError: If data is too short to produce a single batch
    '''

    total_loss, total_examples = 0.0, 0
    generator = batch_generator_inds(data, word_to_id=word_to_id, batch_size=batch_size, num_steps=num_steps)

    if optimizer is not None:
        # The learning rate is constant within an epoch, so it is set once.
        update_lr(optimizer, lr)

    initial_state = model.init_hidden(batch_size=batch_size, device=device)
    for step, (X, Y) in enumerate(generator):
        X = X.to(device)
        Y = Y.to(device)

        logits, new_state = model(X, initial_state)
        # The state is carried over to the next batch, but detached so that
        # backpropagation stops at the batch boundary.
        initial_state = (new_state[0].detach(), new_state[1].detach())

        loss = loss_fn(logits.reshape(-1, model.vocab_size), Y.reshape(-1))
        total_examples += loss.size(0)
        total_loss += loss.sum().item()
        loss = loss.mean()

        if optimizer is not None:
            # Start from clean gradients even if something was accumulated before this step
            optimizer.zero_grad()

            # Gradients computation
            loss.backward()

            # Optional gradient clipping by the given norm value
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            # Applying gradients - one gradient descent step
            optimizer.step()

    return np.exp(total_loss / total_examples)

In [230]:
# Hyperparameters of the model and of the training loop.
# 'vocab_size' is a placeholder here and is replaced once the vocabulary is built.
# 'learning_rate' is the initial rate; it is multiplied by 'lr_decay' once per epoch
# after the first 'epoch_decay' epochs.
# NOTE(review): 'num_layers' is 1 here, while the model printed further below shows
# two LSTMLayer modules - confirm which value the recorded results were produced with.
config = { 'batch_size': 64, 'num_steps': 40, 
           'num_layers': 1, 'emb_size': 256,
           'hidden_size': 256, 'vocab_size': -1,
           'dropout_rate': 0.2, 'num_epochs': 13,
           'learning_rate': 0.01, 'lr_decay' : 0.8,
           'epoch_decay' : 6
         }

In [231]:
# Load all splits; the vocabulary size is known only now, so the config is completed here.
raw_data = ptb_raw_data(folder_name)
train_data, dev_data, test_data, word_to_id, id_to_word = raw_data
config['vocab_size'] = len(word_to_id)
config['vocab_size']

10001

In [232]:
# tie_emb is not passed, so the model uses its default (True): tied input/output embeddings.
model = PTBLM(num_layers=config['num_layers'], emb_size=config['emb_size'],
              hidden_size=config['hidden_size'], vocab_size = config['vocab_size'],
              dropout_rate=config['dropout_rate']
             )
# Use the first GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# reduction='none' keeps one loss value per target; run_epoch counts and sums them itself.
loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
model

PTBLM(
  (embedding): Embedding(10001, 256)
  (LSTM): LSTM(
    (layers): ModuleList(
      (0): Dropout(p=0.2, inplace=False)
      (1): LSTMLayer(
        (LSTMCell): LSTMCell()
      )
      (2): Dropout(p=0.2, inplace=False)
      (3): LSTMLayer(
        (LSTMCell): LSTMCell()
      )
      (4): Dropout(p=0.2, inplace=False)
    )
  )
  (decoder): Linear(in_features=256, out_features=10001, bias=True)
)

In [233]:
# Show which device was selected.
device

device(type='cuda', index=0)

Наконец-то цикл обучения.

In [234]:
# Per-epoch history: (epoch index, train perplexity, dev perplexity, learning rate).
plot_data = []
for i in trange(config['num_epochs']):
    # The exponent is 0 for the first 'epoch_decay' epochs, so the learning rate stays
    # at its initial value; afterwards it is multiplied by 'lr_decay' once per epoch.
    lr_decay = config['lr_decay'] ** max(i + 1 - config['epoch_decay'], 0.0)
    decayed_lr = config['learning_rate'] * lr_decay
    
    # Training pass: the optimizer is given, so parameters are updated.
    model.train()
    train_perplexity = run_epoch(decayed_lr, model, train_data, 
                                 word_to_id, loss_fn,
                                 config['batch_size'], config['num_steps'],
                                 optimizer=optimizer, 
                                 device=device)
    
    # Evaluation pass: no optimizer is given, so run_epoch only measures perplexity.
    model.eval()

    # Disabling gradient calculation.
    # It will reduce memory consumption for computations.
    # The result of every computation will have requires_grad=False.
    with torch.no_grad():
        dev_perplexity = run_epoch(decayed_lr, model, dev_data, 
                                   word_to_id, loss_fn, config['batch_size'], config['num_steps'],
                                   device=device)
    
    plot_data.append((i, train_perplexity, dev_perplexity, decayed_lr))
    print(f'Epoch: {i+1}. Learning rate: {decayed_lr:.3f}. '
          f'Train Perplexity: {train_perplexity:.3f}. '
          f'Dev Perplexity: {dev_perplexity:.3f}. ' 
         )

# Final evaluation on the test split. decayed_lr is the value left over from the last
# epoch; run_epoch does not use it because no optimizer is passed.
model.eval()
with torch.no_grad():
    test_perplexity = run_epoch(
        decayed_lr, model, test_data, 
        word_to_id, loss_fn, config['batch_size'], config['num_steps'],
        device=device)
    print(f"Test Perplexity: {test_perplexity:.3f}")

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch: 1. Learning rate: 0.010. Train Perplexity: 804.518. Dev Perplexity: 461.887. 
Epoch: 2. Learning rate: 0.010. Train Perplexity: 336.382. Dev Perplexity: 263.073. 
Epoch: 3. Learning rate: 0.010. Train Perplexity: 240.855. Dev Perplexity: 215.174. 
Epoch: 4. Learning rate: 0.010. Train Perplexity: 206.755. Dev Perplexity: 195.311. 
Epoch: 5. Learning rate: 0.010. Train Perplexity: 188.087. Dev Perplexity: 183.748. 
Epoch: 6. Learning rate: 0.010. Train Perplexity: 174.807. Dev Perplexity: 176.936. 
Epoch: 7. Learning rate: 0.008. Train Perplexity: 154.425. Dev Perplexity: 161.207. 
Epoch: 8. Learning rate: 0.006. Train Perplexity: 139.557. Dev Perplexity: 150.261. 
Epoch: 9. Learning rate: 0.005. Train Perplexity: 128.528. Dev Perplexity: 140.302. 
Epoch: 10. Learning rate: 0.004. Train Perplexity: 121.077. Dev Perplexity: 134.560. 
Epoch: 11. Learning rate: 0.003. Train Perplexity: 115.517. Dev Perplexity: 130.042. 
Epoch: 12. Learning rate: 0.003. Train Perplexity: 111.148. Dev