In [1]:
!pip install torchtext==0.9.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install -q datasets

In [4]:

import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.legacy.data import Field, BucketIterator, Iterator
from torchtext.legacy import data



import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np
import pandas as pd

import random
import math
import time

import datasets
from datasets import load_dataset
from datasets import Dataset

In [5]:
train_dataset = load_dataset('json', data_files = "./conala-train.json")  
test_dataset = load_dataset('json', data_files = "./conala-test.json")  
print("Train dataset - ",train_dataset)
print("Test dataset - ",test_dataset)



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

Train dataset -  DatasetDict({
    train: Dataset({
        features: ['intent', 'rewritten_intent', 'snippet', 'question_id'],
        num_rows: 2379
    })
})
Test dataset -  DatasetDict({
    train: Dataset({
        features: ['intent', 'rewritten_intent', 'snippet', 'question_id'],
        num_rows: 500
    })
})


In [6]:
def creating_features(raw_data):
    # manually curated data
    input_texts = raw_data['train']['rewritten_intent']
    code_snippets = raw_data['train']["snippet"]
    for i in range(len(input_texts)):
        if(raw_data['train']['rewritten_intent'][i] == None):
            input_texts[i] = raw_data['train']['intent'][i]
    return  input_texts,code_snippets

In [7]:
model_inputs,outputs=creating_features(train_dataset)
df=pd.concat([pd.DataFrame(model_inputs),pd.DataFrame(outputs)],axis=1)
df.columns=['Text','Code']

import numpy as np

np.random.seed(0)
msk = np.random.rand(len(df)) < 0.85 

train_df = df[msk]
val_df = df[~msk]

train_df=train_df.reset_index(drop=True)
val_df=val_df.reset_index(drop=True)

In [8]:


train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

print(train_dataset)
print(val_dataset)

Dataset({
    features: ['Text', 'Code'],
    num_rows: 1989
})
Dataset({
    features: ['Text', 'Code'],
    num_rows: 390
})


In [9]:
from tokenize import tokenize, untokenize
import io


def tokenize_python_code(python_code_str):
    python_tokens = list(tokenize(io.BytesIO(python_code_str.encode('utf-8')).readline))
    tokenized_output = []
    for i in range(0, len(python_tokens)):
        tokenized_output.append((python_tokens[i].type, python_tokens[i].string))
    return tokenized_output

In [10]:
tokenize_python_code(df.iloc[497,1])

[(62, 'utf-8'),
 (1, 'sys'),
 (54, '.'),
 (1, 'stdout'),
 (54, '.'),
 (1, 'flush'),
 (54, '('),
 (54, ')'),
 (4, ''),
 (0, '')]

In [11]:
import keyword

print(keyword.kwlist)

['False', 'None', 'True', 'and', 'as', 'assert', 'async', 'await', 'break', 'class', 'continue', 'def', 'del', 'elif', 'else', 'except', 'finally', 'for', 'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'nonlocal', 'not', 'or', 'pass', 'raise', 'return', 'try', 'while', 'with', 'yield']


In [12]:
def augment_tokenize_python_code(python_code_str, mask_factor=0.3):


    var_dict = {} # Dictionary that stores masked variables

    # certain reserved words that should not be treated as normal variables and
    # hence need to be skipped from our variable mask augmentations
    skip_list = ['range', 'enumerate', 'print', 'ord', 'int', 'float', 'zip'
                 'char', 'list', 'dict', 'tuple', 'set', 'len', 'sum', 'min', 'max']
    skip_list.extend(keyword.kwlist)

    var_counter = 1
    python_tokens = list(tokenize(io.BytesIO(python_code_str.encode('utf-8')).readline))
    tokenized_output = []

    for i in range(0, len(python_tokens)):
      if python_tokens[i].type == 1 and python_tokens[i].string not in skip_list:
        
        if i>0 and python_tokens[i-1].string in ['def', '.', 'import', 'raise', 'except', 'class']: # avoid masking modules, functions and error literals
          skip_list.append(python_tokens[i].string)
          tokenized_output.append((python_tokens[i].type, python_tokens[i].string))
        elif python_tokens[i].string in var_dict:  # if variable is already masked
          tokenized_output.append((python_tokens[i].type, var_dict[python_tokens[i].string]))
        elif random.uniform(0, 1) > 1-mask_factor: # randomly mask variables
          var_dict[python_tokens[i].string] = 'var_' + str(var_counter)
          var_counter+=1
          tokenized_output.append((python_tokens[i].type, var_dict[python_tokens[i].string]))
        else:
          skip_list.append(python_tokens[i].string)
          tokenized_output.append((python_tokens[i].type, python_tokens[i].string))
      
      else:
        tokenized_output.append((python_tokens[i].type, python_tokens[i].string))
    
    return tokenized_output

In [13]:
SEED = 1234

random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [14]:

Input = data.Field(tokenize = 'spacy',
            init_token='', 
            eos_token='', 
            lower=True)

Output = data.Field(tokenize = augment_tokenize_python_code,
                    init_token='', 
                    eos_token='', 
                    lower=False)



In [15]:
fields = [('Input', Input),('Output', Output)]

In [16]:
train_example = []
val_example = []

train_expansion_factor = 10
for j in range(train_expansion_factor):
  for i in range(train_df.shape[0]):
    try:
      ex = data.Example.fromlist([train_df.Text[i], train_df.Code[i]], fields)
      train_example.append(ex)
    except:
      pass

for i in range(val_df.shape[0]):
    try:
        ex = data.Example.fromlist([val_df.Text[i], val_df.Code[i]], fields)
        val_example.append(ex)
    except:
        pass       

In [17]:
train_data = data.Dataset(train_example, fields)
valid_data =  data.Dataset(val_example, fields)

Input.build_vocab(train_data, min_freq = 0)
Output.build_vocab(train_data, min_freq = 0)



In [18]:
Output.vocab

<torchtext.vocab.Vocab at 0x7fb4dd5099d0>

In [19]:
def save_vocab(vocab, path):
    import pickle
    output = open(path, 'wb')
    pickle.dump(vocab, output)
    output.close()

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [21]:
class Encoder(nn.Module):
    def __init__(self, 
                 input_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim,
                 dropout, 
                 device,
                 max_length = 1000):
        super().__init__()

        self.device = device
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len]
        #src_mask = [batch size, 1, 1, src len]
        
        batch_size = src.shape[0]
        src_len = src.shape[1]

        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        #pos = [batch size, src len]
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        
        #src = [batch size, src len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        #src = [batch size, src len, hid dim]
            
        return src

In [22]:
class EncoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len, hid dim]
        #src_mask = [batch size, 1, 1, src len] 
                
        #self attention
        _src, _ = self.self_attention(src, src, src, src_mask)
        
        #dropout, residual connection and layer norm
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        #positionwise feedforward
        _src = self.positionwise_feedforward(src)
        
        #dropout, residual and layer norm
        src = self.ff_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        return src

In [23]:
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [batch size, seq len, hid dim]
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        
        #x = [batch size, seq len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, seq len, hid dim]
        
        return x

In [24]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    def forward(self, query, key, value, mask = None):
        
        batch_size = query.shape[0]
        
        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, value len, hid dim]
                
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        #Q = [batch size, query len, hid dim]
        #K = [batch size, key len, hid dim]
        #V = [batch size, value len, hid dim]
                
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        #energy = [batch size, n heads, query len, key len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
                
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)
        
        #x = [batch size, n heads, query len, head dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        
        #x = [batch size, query len, n heads, head dim]
        
        x = x.view(batch_size, -1, self.hid_dim)
        
        #x = [batch size, query len, hid dim]
        
        x = self.fc_o(x)
        
        #x = [batch size, query len, hid dim]
        
        return x, attention


In [25]:
class Decoder(nn.Module):
    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = 10000):
        super().__init__()
        
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
                
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
                            
        #pos = [batch size, trg len]

        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
                
        #trg = [batch size, trg len, hid dim]
        
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        output = self.fc_out(trg)
        
        #output = [batch size, trg len, output dim]
            
        return output, attention


In [26]:
class DecoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
        
        #self attention
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        #dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
            
        #trg = [batch size, trg len, hid dim]
            
        #encoder attention
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        # query, key, value
        
        #dropout, residual connection and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
                    
        #trg = [batch size, trg len, hid dim]
        
        #positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        #dropout, residual and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return trg, attention


In [27]:

class Seq2Seq(nn.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 src_pad_idx, 
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        
        #src = [batch size, src len]
        
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        #src_mask = [batch size, 1, 1, src len]

        return src_mask
    
    def make_trg_mask(self, trg):
        
        #trg = [batch size, trg len]
        
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        #trg_pad_mask = [batch size, 1, 1, trg len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()
        
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        return trg_mask

    def forward(self, src, trg):
        
        #src = [batch size, src len]
        #trg = [batch size, trg len]
                
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        
        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        #output = [batch size, trg len, output dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return output, attention

In [28]:
INPUT_DIM = len(Input.vocab)
OUTPUT_DIM = len(Output.vocab)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 16
DEC_HEADS = 16
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [29]:
len(Output.vocab.__dict__['freqs'])

2534

In [30]:
SRC_PAD_IDX = Input.vocab.stoi[Input.pad_token]
TRG_PAD_IDX = Output.vocab.stoi[Output.pad_token]

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [31]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [32]:
model.apply(initialize_weights);

In [33]:
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [34]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F

class CrossEntropyLoss(nn.CrossEntropyLoss):
    """CrossEntropyLoss - with ability to recieve distrbution as targets, and optional label smoothing"""

    def __init__(self, weight=None, ignore_index=-100, reduction='mean', smooth_eps=None, smooth_dist=None, from_logits=True):
        super(CrossEntropyLoss, self).__init__(weight=weight,
                                               ignore_index=ignore_index, reduction=reduction)
        self.smooth_eps = smooth_eps
        self.smooth_dist = smooth_dist
        self.from_logits = from_logits

    def forward(self, input, target, smooth_dist=None):
        if smooth_dist is None:
            smooth_dist = self.smooth_dist
        return cross_entropy(input, target, weight=self.weight, ignore_index=self.ignore_index,
                             reduction=self.reduction, smooth_eps=self.smooth_eps,
                             smooth_dist=smooth_dist, from_logits=self.from_logits)


def cross_entropy(inputs, target, weight=None, ignore_index=-100, reduction='mean',
                  smooth_eps=None, smooth_dist=None, from_logits=True):
    """cross entropy loss, with support for target distributions and label smoothing https://arxiv.org/abs/1512.00567"""
    smooth_eps = smooth_eps or 0

    # ordinary log-liklihood - use cross_entropy from nn
    if _is_long(target) and smooth_eps == 0:
        if from_logits:
            return F.cross_entropy(inputs, target, weight, ignore_index=ignore_index, reduction=reduction)
        else:
            return F.nll_loss(inputs, target, weight, ignore_index=ignore_index, reduction=reduction)

    if from_logits:
        # log-softmax of inputs
        lsm = F.log_softmax(inputs, dim=-1)
    else:
        lsm = inputs

    masked_indices = None
    num_classes = inputs.size(-1)

    if _is_long(target) and ignore_index >= 0:
        masked_indices = target.eq(ignore_index)

    if smooth_eps > 0 and smooth_dist is not None:
        if _is_long(target):
            target = onehot(target, num_classes).type_as(inputs)
        if smooth_dist.dim() < target.dim():
            smooth_dist = smooth_dist.unsqueeze(0)
        target.lerp_(smooth_dist, smooth_eps)

    if weight is not None:
        lsm = lsm * weight.unsqueeze(0)

    if _is_long(target):
        eps_sum = smooth_eps / num_classes
        eps_nll = 1. - eps_sum - smooth_eps
        likelihood = lsm.gather(dim=-1, index=target.unsqueeze(-1)).squeeze(-1)
        loss = -(eps_nll * likelihood + eps_sum * lsm.sum(-1))
    else:
        loss = -(target * lsm).sum(-1)

    if masked_indices is not None:
        loss.masked_fill_(masked_indices, 0)

    if reduction == 'sum':
        loss = loss.sum()
    elif reduction == 'mean':
        if masked_indices is None:
            loss = loss.mean()
        else:
            loss = loss.sum() / float(loss.size(0) - masked_indices.sum())

    return loss


def onehot(indexes, N=None, ignore_index=None):
    """
    Creates a one-representation of indexes with N possible entries
    if N is not specified, it will suit the maximum index appearing.
    indexes is a long-tensor of indexes
    ignore_index will be zero in onehot representation
    """
    if N is None:
        N = indexes.max() + 1
    sz = list(indexes.size())
    output = indexes.new().byte().resize_(*sz, N).zero_()
    output.scatter_(-1, indexes.unsqueeze(-1), 1)
    if ignore_index is not None and ignore_index >= 0:
        output.masked_fill_(indexes.eq(ignore_index).unsqueeze(-1), 0)
    return output

def _is_long(x):
    if hasattr(x, 'data'):
        x = x.data
    return isinstance(x, torch.LongTensor) or isinstance(x, torch.cuda.LongTensor)

In [35]:
def maskNLLLoss(inp, target, mask):
    # print(inp.shape, target.shape, mask.sum())
    nTotal = mask.sum()
    crossEntropy = CrossEntropyLoss(ignore_index = TRG_PAD_IDX, smooth_eps=0.20)
    loss = crossEntropy(inp, target)
    loss = loss.to(device)
    return loss, nTotal.item()

In [36]:
criterion = maskNLLLoss

In [37]:
from tqdm import tqdm

def make_trg_mask(trg):
        
        #trg = [batch size, trg len]
        
        trg_pad_mask = (trg != TRG_PAD_IDX).unsqueeze(1).unsqueeze(2)
        
        #trg_pad_mask = [batch size, 1, 1, trg len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = device)).bool()
        
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        return trg_mask

def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    print("here")
    n_totals = 0
    print_losses = []
    print("iterator")
    print(iterator)
    for i, batch in tqdm(enumerate(iterator), total=len(iterator)):
        # print(batch)
        loss = 0
        src = batch.Input.permute(1, 0)
        trg = batch.Output.permute(1, 0)
        trg_mask = make_trg_mask(trg)
        optimizer.zero_grad()
        
        output, _ = model(src, trg[:,:-1])
                
        #output = [batch size, trg len - 1, output dim]
        #trg = [batch size, trg len]
            
        output_dim = output.shape[-1]
            
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)
                
        #output = [batch size * trg len - 1, output dim]
        #trg = [batch size * trg len - 1]
            
        mask_loss, nTotal = criterion(output, trg, trg_mask)
        
        mask_loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal
        print(i)


    print("printloss",print_losses)
    print("n_totals",n_totals)
    return sum(print_losses) / n_totals


In [38]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    print("there")
    n_totals = 0
    print_losses = []
    
    with torch.no_grad():
    
        for i, batch in tqdm(enumerate(iterator), total=len(iterator)):

            src = batch.Input.permute(1, 0)
            trg = batch.Output.permute(1, 0)
            trg_mask = make_trg_mask(trg)

            output, _ = model(src, trg[:,:-1])
            
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
            
            mask_loss, nTotal = criterion(output, trg, trg_mask)

            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

        
    return sum(print_losses) / n_totals

In [39]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [40]:
N_EPOCHS = 5
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_example = []
    val_example = []

    for i in range(train_df.shape[0]):
        try:
            ex = data.Example.fromlist([train_df.Text[i], train_df.Code[i]], fields)
            train_example.append(ex)
        except:
            pass

    for i in range(val_df.shape[0]):
        try:
            ex = data.Example.fromlist([val_df.Text[i], val_df.Code[i]], fields)
            val_example.append(ex)
        except:
            pass       

    train_data = data.Dataset(train_example, fields)
    valid_data =  data.Dataset(val_example, fields)

    BATCH_SIZE = 16
    train_iterator, valid_iterator = BucketIterator.splits((train_data, valid_data), batch_size = BATCH_SIZE, 
                                                                sort_key = lambda x: len(x.Input),
                                                                sort_within_batch=True, device = device)

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '/content/model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7fb4dcd394f0>


  1%|          | 1/125 [00:00<01:28,  1.41it/s]

0


  2%|▏         | 2/125 [00:01<01:18,  1.56it/s]

1


  2%|▏         | 3/125 [00:01<01:05,  1.85it/s]

2


  3%|▎         | 4/125 [00:02<01:05,  1.86it/s]

3


  4%|▍         | 5/125 [00:03<01:38,  1.22it/s]

4


  5%|▍         | 6/125 [00:04<01:30,  1.32it/s]

5


  6%|▌         | 7/125 [00:04<01:28,  1.34it/s]

6


  6%|▋         | 8/125 [00:05<01:27,  1.33it/s]

7


  7%|▋         | 9/125 [00:06<01:25,  1.35it/s]

8


  8%|▊         | 10/125 [00:07<01:19,  1.44it/s]

9


  9%|▉         | 11/125 [00:07<01:13,  1.55it/s]

10


 10%|▉         | 12/125 [00:08<01:13,  1.55it/s]

11


 10%|█         | 13/125 [00:08<01:10,  1.59it/s]

12


 11%|█         | 14/125 [00:09<01:06,  1.68it/s]

13


 12%|█▏        | 15/125 [00:09<01:05,  1.67it/s]

14


 13%|█▎        | 16/125 [00:10<01:08,  1.60it/s]

15


 14%|█▎        | 17/125 [00:11<01:13,  1.47it/s]

16


 14%|█▍        | 18/125 [00:12<01:15,  1.42it/s]

17


 15%|█▌        | 19/125 [00:12<01:11,  1.49it/s]

18


 16%|█▌        | 20/125 [00:13<01:12,  1.45it/s]

19


 17%|█▋        | 21/125 [00:14<01:11,  1.45it/s]

20


 18%|█▊        | 22/125 [00:14<01:08,  1.50it/s]

21


 18%|█▊        | 23/125 [00:15<01:11,  1.43it/s]

22


 19%|█▉        | 24/125 [00:16<01:07,  1.50it/s]

23


 20%|██        | 25/125 [00:17<01:12,  1.39it/s]

24


 21%|██        | 26/125 [00:17<01:11,  1.39it/s]

25


 22%|██▏       | 27/125 [00:18<01:06,  1.48it/s]

26


 22%|██▏       | 28/125 [00:18<01:02,  1.54it/s]

27


 23%|██▎       | 29/125 [00:19<00:59,  1.62it/s]

28


 24%|██▍       | 30/125 [00:19<00:57,  1.66it/s]

29


 25%|██▍       | 31/125 [00:20<01:01,  1.53it/s]

30


 26%|██▌       | 32/125 [00:21<00:58,  1.60it/s]

31


 26%|██▋       | 33/125 [00:22<01:00,  1.52it/s]

32


 27%|██▋       | 34/125 [00:22<00:55,  1.65it/s]

33


 28%|██▊       | 35/125 [00:23<00:51,  1.75it/s]

34


 29%|██▉       | 36/125 [00:23<00:46,  1.92it/s]

35


 30%|██▉       | 37/125 [00:24<00:48,  1.81it/s]

36


 30%|███       | 38/125 [00:24<00:47,  1.82it/s]

37


 31%|███       | 39/125 [00:24<00:42,  2.01it/s]

38


 32%|███▏      | 40/125 [00:25<00:46,  1.82it/s]

39


 33%|███▎      | 41/125 [00:26<00:54,  1.54it/s]

40


 34%|███▎      | 42/125 [00:27<00:51,  1.61it/s]

41


 34%|███▍      | 43/125 [00:27<00:52,  1.57it/s]

42


 35%|███▌      | 44/125 [00:28<00:49,  1.63it/s]

43


 36%|███▌      | 45/125 [00:29<00:51,  1.56it/s]

44


 37%|███▋      | 46/125 [00:29<00:52,  1.49it/s]

45


 38%|███▊      | 47/125 [00:30<00:50,  1.54it/s]

46


 38%|███▊      | 48/125 [00:30<00:48,  1.59it/s]

47


 39%|███▉      | 49/125 [00:31<00:43,  1.75it/s]

48


 40%|████      | 50/125 [00:32<00:44,  1.69it/s]

49


 41%|████      | 51/125 [00:32<00:47,  1.56it/s]

50


 42%|████▏     | 52/125 [00:33<00:46,  1.56it/s]

51


 42%|████▏     | 53/125 [00:34<00:52,  1.36it/s]

52


 43%|████▎     | 54/125 [00:35<01:07,  1.06it/s]

53


 44%|████▍     | 55/125 [00:36<00:57,  1.21it/s]

54


 45%|████▍     | 56/125 [00:37<00:54,  1.26it/s]

55


 46%|████▌     | 57/125 [00:37<00:49,  1.37it/s]

56


 46%|████▋     | 58/125 [00:38<00:48,  1.39it/s]

57


 47%|████▋     | 59/125 [00:38<00:40,  1.65it/s]

58


 48%|████▊     | 60/125 [00:39<00:40,  1.59it/s]

59


 49%|████▉     | 61/125 [00:40<00:40,  1.57it/s]

60


 50%|████▉     | 62/125 [00:40<00:38,  1.64it/s]

61


 50%|█████     | 63/125 [00:41<00:41,  1.48it/s]

62


 51%|█████     | 64/125 [00:42<00:42,  1.45it/s]

63


 52%|█████▏    | 65/125 [00:42<00:40,  1.48it/s]

64


 53%|█████▎    | 66/125 [00:43<00:35,  1.67it/s]

65


 54%|█████▎    | 67/125 [00:43<00:34,  1.68it/s]

66


 54%|█████▍    | 68/125 [00:44<00:35,  1.59it/s]

67


 55%|█████▌    | 69/125 [00:45<00:38,  1.47it/s]

68


 56%|█████▌    | 70/125 [00:45<00:35,  1.57it/s]

69


 57%|█████▋    | 71/125 [00:46<00:36,  1.47it/s]

70


 58%|█████▊    | 72/125 [00:47<00:32,  1.65it/s]

71


 58%|█████▊    | 73/125 [00:47<00:31,  1.63it/s]

72


 59%|█████▉    | 74/125 [00:48<00:38,  1.32it/s]

73


 60%|██████    | 75/125 [00:49<00:37,  1.33it/s]

74


 61%|██████    | 76/125 [00:50<00:33,  1.48it/s]

75


 62%|██████▏   | 77/125 [00:50<00:31,  1.52it/s]

76


 62%|██████▏   | 78/125 [00:51<00:29,  1.58it/s]

77


 63%|██████▎   | 79/125 [00:51<00:26,  1.73it/s]

78


 64%|██████▍   | 80/125 [00:52<00:25,  1.78it/s]

79


 65%|██████▍   | 81/125 [00:52<00:23,  1.85it/s]

80


 66%|██████▌   | 82/125 [00:53<00:24,  1.74it/s]

81


 66%|██████▋   | 83/125 [00:53<00:24,  1.70it/s]

82


 67%|██████▋   | 84/125 [00:54<00:24,  1.68it/s]

83


 68%|██████▊   | 85/125 [00:55<00:24,  1.67it/s]

84


 69%|██████▉   | 86/125 [00:56<00:26,  1.46it/s]

85


 70%|██████▉   | 87/125 [00:56<00:27,  1.38it/s]

86


 70%|███████   | 88/125 [00:57<00:26,  1.41it/s]

87


 71%|███████   | 89/125 [00:58<00:24,  1.48it/s]

88


 72%|███████▏  | 90/125 [00:58<00:24,  1.42it/s]

89


 73%|███████▎  | 91/125 [00:59<00:25,  1.32it/s]

90


 74%|███████▎  | 92/125 [01:00<00:22,  1.46it/s]

91


 74%|███████▍  | 93/125 [01:00<00:21,  1.50it/s]

92


 75%|███████▌  | 94/125 [01:01<00:19,  1.58it/s]

93


 76%|███████▌  | 95/125 [01:02<00:19,  1.53it/s]

94


 77%|███████▋  | 96/125 [01:02<00:19,  1.52it/s]

95


 78%|███████▊  | 97/125 [01:03<00:18,  1.53it/s]

96


 78%|███████▊  | 98/125 [01:04<00:16,  1.61it/s]

97


 79%|███████▉  | 99/125 [01:04<00:16,  1.53it/s]

98


 80%|████████  | 100/125 [01:05<00:15,  1.57it/s]

99


 81%|████████  | 101/125 [01:05<00:14,  1.64it/s]

100


 82%|████████▏ | 102/125 [01:06<00:15,  1.48it/s]

101


 82%|████████▏ | 103/125 [01:07<00:15,  1.44it/s]

102


 83%|████████▎ | 104/125 [01:07<00:13,  1.61it/s]

103


 84%|████████▍ | 105/125 [01:08<00:13,  1.47it/s]

104


 85%|████████▍ | 106/125 [01:09<00:12,  1.55it/s]

105


 86%|████████▌ | 107/125 [01:09<00:11,  1.54it/s]

106


 86%|████████▋ | 108/125 [01:10<00:10,  1.61it/s]

107


 87%|████████▋ | 109/125 [01:11<00:10,  1.54it/s]

108


 88%|████████▊ | 110/125 [01:11<00:09,  1.65it/s]

109


 89%|████████▉ | 111/125 [01:12<00:07,  1.93it/s]

110


 90%|████████▉ | 112/125 [01:12<00:06,  1.87it/s]

111


 90%|█████████ | 113/125 [01:13<00:06,  1.93it/s]

112


 91%|█████████ | 114/125 [01:14<00:07,  1.55it/s]

113


 92%|█████████▏| 115/125 [01:14<00:06,  1.55it/s]

114


 93%|█████████▎| 116/125 [01:15<00:06,  1.46it/s]

115


 94%|█████████▎| 117/125 [01:16<00:06,  1.32it/s]

116


 94%|█████████▍| 118/125 [01:16<00:04,  1.45it/s]

117


 95%|█████████▌| 119/125 [01:17<00:04,  1.36it/s]

118


 96%|█████████▌| 120/125 [01:18<00:03,  1.48it/s]

119


 97%|█████████▋| 121/125 [01:18<00:02,  1.55it/s]

120


 98%|█████████▊| 122/125 [01:19<00:01,  1.54it/s]

121


 98%|█████████▊| 123/125 [01:20<00:01,  1.53it/s]

122


 99%|█████████▉| 124/125 [01:20<00:00,  1.47it/s]

123


100%|██████████| 125/125 [01:21<00:00,  1.53it/s]


124
printloss [40853.15597820282, 42068.50950241089, 22083.150102615356, 34838.09683799744, 112904.83822345734, 45899.65773773193, 47872.74584770203, 61127.334568977356, 54264.37144756317, 35215.95320701599, 32966.04341983795, 42920.79768419266, 35054.76151180267, 29832.370344638824, 37335.99400281906, 41765.881751060486, 49227.00759887695, 44525.8397731781, 37693.56318473816, 46611.65822601318, 40812.45216989517, 35267.88043022156, 39173.569370269775, 29167.628797531128, 58536.94775104523, 38127.40503168106, 28283.151160240173, 30127.49542236328, 21232.295342445374, 26081.413086891174, 39714.317145347595, 19954.466095924377, 37881.642221450806, 22778.479253292084, 19541.29251909256, 15956.359231948853, 32532.6311044693, 23485.509711265564, 11574.761375427246, 34977.337171554565, 61516.626040935516, 26019.480551719666, 34368.30081176758, 24984.17253303528, 45255.69404554367, 44092.49167442322, 27280.35537147522, 25514.91478729248, 15366.443464279175, 26064.985370635986, 40128.624471664

100%|██████████| 25/25 [00:04<00:00,  5.34it/s]


Epoch: 01 | Time: 1m 26s
	Train Loss: 4.695 | Train PPL: 109.408
	 Val. Loss: 4.041 |  Val. PPL:  56.861
here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7fb4de47a490>


  1%|          | 1/125 [00:00<01:19,  1.56it/s]

0


  2%|▏         | 2/125 [00:01<01:18,  1.57it/s]

1


  2%|▏         | 3/125 [00:01<01:22,  1.49it/s]

2


  3%|▎         | 4/125 [00:02<01:08,  1.78it/s]

3


  4%|▍         | 5/125 [00:03<01:14,  1.61it/s]

4


  5%|▍         | 6/125 [00:03<01:11,  1.67it/s]

5


  6%|▌         | 7/125 [00:04<01:13,  1.60it/s]

6


  6%|▋         | 8/125 [00:04<01:10,  1.66it/s]

7


  7%|▋         | 9/125 [00:05<01:15,  1.53it/s]

8


  8%|▊         | 10/125 [00:06<01:06,  1.72it/s]

9


  9%|▉         | 11/125 [00:06<01:03,  1.81it/s]

10


 10%|▉         | 12/125 [00:07<01:08,  1.66it/s]

11


 10%|█         | 13/125 [00:07<01:06,  1.68it/s]

12


 11%|█         | 14/125 [00:08<01:14,  1.48it/s]

13


 12%|█▏        | 15/125 [00:09<01:09,  1.59it/s]

14


 13%|█▎        | 16/125 [00:09<01:06,  1.63it/s]

15


 14%|█▎        | 17/125 [00:10<01:06,  1.62it/s]

16


 14%|█▍        | 18/125 [00:11<01:09,  1.55it/s]

17


 15%|█▌        | 19/125 [00:11<01:08,  1.54it/s]

18


 16%|█▌        | 20/125 [00:12<01:02,  1.67it/s]

19


 17%|█▋        | 21/125 [00:12<01:00,  1.73it/s]

20


 18%|█▊        | 22/125 [00:13<00:59,  1.73it/s]

21


 18%|█▊        | 23/125 [00:13<00:59,  1.72it/s]

22


 19%|█▉        | 24/125 [00:15<01:14,  1.36it/s]

23


 20%|██        | 25/125 [00:15<01:09,  1.45it/s]

24


 21%|██        | 26/125 [00:16<01:01,  1.61it/s]

25


 22%|██▏       | 27/125 [00:16<01:00,  1.63it/s]

26


 22%|██▏       | 28/125 [00:17<01:09,  1.40it/s]

27


 23%|██▎       | 29/125 [00:18<01:07,  1.42it/s]

28


 24%|██▍       | 30/125 [00:18<00:58,  1.63it/s]

29


 25%|██▍       | 31/125 [00:19<01:01,  1.53it/s]

30


 26%|██▌       | 32/125 [00:19<00:54,  1.72it/s]

31


 26%|██▋       | 33/125 [00:20<00:54,  1.69it/s]

32


 27%|██▋       | 34/125 [00:21<01:15,  1.20it/s]

33


 28%|██▊       | 35/125 [00:22<01:07,  1.33it/s]

34


 29%|██▉       | 36/125 [00:22<00:59,  1.50it/s]

35


 30%|██▉       | 37/125 [00:23<00:59,  1.48it/s]

36


 30%|███       | 38/125 [00:24<00:54,  1.60it/s]

37


 31%|███       | 39/125 [00:24<00:54,  1.57it/s]

38


 32%|███▏      | 40/125 [00:25<00:57,  1.48it/s]

39


 33%|███▎      | 41/125 [00:26<00:58,  1.44it/s]

40


 34%|███▎      | 42/125 [00:26<00:54,  1.53it/s]

41


 34%|███▍      | 43/125 [00:27<00:55,  1.49it/s]

42


 35%|███▌      | 44/125 [00:28<00:52,  1.54it/s]

43


 36%|███▌      | 45/125 [00:28<00:52,  1.54it/s]

44


 37%|███▋      | 46/125 [00:29<00:53,  1.48it/s]

45


 38%|███▊      | 47/125 [00:30<00:50,  1.54it/s]

46


 38%|███▊      | 48/125 [00:30<00:50,  1.52it/s]

47


 39%|███▉      | 49/125 [00:31<00:52,  1.45it/s]

48


 40%|████      | 50/125 [00:32<00:45,  1.65it/s]

49


 41%|████      | 51/125 [00:32<00:49,  1.48it/s]

50


 42%|████▏     | 52/125 [00:33<00:46,  1.56it/s]

51


 42%|████▏     | 53/125 [00:34<00:48,  1.47it/s]

52


 43%|████▎     | 54/125 [00:34<00:48,  1.45it/s]

53


 44%|████▍     | 55/125 [00:35<00:49,  1.40it/s]

54


 45%|████▍     | 56/125 [00:36<00:44,  1.55it/s]

55


 46%|████▌     | 57/125 [00:36<00:41,  1.65it/s]

56


 46%|████▋     | 58/125 [00:37<00:42,  1.57it/s]

57


 47%|████▋     | 59/125 [00:38<00:42,  1.54it/s]

58


 48%|████▊     | 60/125 [00:38<00:42,  1.52it/s]

59


 49%|████▉     | 61/125 [00:39<00:39,  1.62it/s]

60


 50%|████▉     | 62/125 [00:40<00:42,  1.49it/s]

61


 50%|█████     | 63/125 [00:40<00:40,  1.55it/s]

62


 51%|█████     | 64/125 [00:41<00:40,  1.52it/s]

63


 52%|█████▏    | 65/125 [00:42<00:39,  1.50it/s]

64


 53%|█████▎    | 66/125 [00:42<00:37,  1.56it/s]

65


 54%|█████▎    | 67/125 [00:43<00:36,  1.58it/s]

66


 54%|█████▍    | 68/125 [00:43<00:32,  1.73it/s]

67


 55%|█████▌    | 69/125 [00:44<00:34,  1.60it/s]

68


 56%|█████▌    | 70/125 [00:45<00:40,  1.36it/s]

69


 57%|█████▋    | 71/125 [00:45<00:35,  1.53it/s]

70


 58%|█████▊    | 72/125 [00:46<00:33,  1.58it/s]

71


 58%|█████▊    | 73/125 [00:47<00:32,  1.62it/s]

72


 59%|█████▉    | 74/125 [00:47<00:31,  1.60it/s]

73


 60%|██████    | 75/125 [00:48<00:32,  1.54it/s]

74


 61%|██████    | 76/125 [00:49<00:35,  1.36it/s]

75


 62%|██████▏   | 77/125 [00:49<00:34,  1.41it/s]

76


 62%|██████▏   | 78/125 [00:50<00:32,  1.43it/s]

77


 63%|██████▎   | 79/125 [00:51<00:30,  1.52it/s]

78


 64%|██████▍   | 80/125 [00:51<00:27,  1.65it/s]

79


 65%|██████▍   | 81/125 [00:52<00:26,  1.68it/s]

80


 66%|██████▌   | 82/125 [00:52<00:25,  1.66it/s]

81


 66%|██████▋   | 83/125 [00:53<00:27,  1.52it/s]

82


 67%|██████▋   | 84/125 [00:54<00:26,  1.58it/s]

83


 68%|██████▊   | 85/125 [00:55<00:27,  1.45it/s]

84


 69%|██████▉   | 86/125 [00:55<00:26,  1.45it/s]

85


 70%|██████▉   | 87/125 [00:56<00:25,  1.48it/s]

86


 70%|███████   | 88/125 [00:57<00:25,  1.43it/s]

87


 71%|███████   | 89/125 [00:57<00:22,  1.62it/s]

88


 72%|███████▏  | 90/125 [00:58<00:21,  1.61it/s]

89


 73%|███████▎  | 91/125 [00:58<00:20,  1.70it/s]

90


 74%|███████▎  | 92/125 [00:59<00:21,  1.51it/s]

91


 74%|███████▍  | 93/125 [01:00<00:19,  1.64it/s]

92


 75%|███████▌  | 94/125 [01:00<00:18,  1.66it/s]

93


 76%|███████▌  | 95/125 [01:01<00:18,  1.63it/s]

94


 77%|███████▋  | 96/125 [01:01<00:17,  1.69it/s]

95


 78%|███████▊  | 97/125 [01:02<00:16,  1.66it/s]

96


 78%|███████▊  | 98/125 [01:03<00:16,  1.65it/s]

97


 79%|███████▉  | 99/125 [01:03<00:17,  1.45it/s]

98


 80%|████████  | 100/125 [01:04<00:16,  1.55it/s]

99


 81%|████████  | 101/125 [01:04<00:14,  1.65it/s]

100


 82%|████████▏ | 102/125 [01:05<00:12,  1.79it/s]

101


 82%|████████▏ | 103/125 [01:05<00:11,  1.89it/s]

102


 83%|████████▎ | 104/125 [01:06<00:10,  1.96it/s]

103


 84%|████████▍ | 105/125 [01:07<00:11,  1.80it/s]

104


 85%|████████▍ | 106/125 [01:07<00:10,  1.84it/s]

105


 86%|████████▌ | 107/125 [01:08<00:11,  1.63it/s]

106


 86%|████████▋ | 108/125 [01:09<00:11,  1.47it/s]

107


 87%|████████▋ | 109/125 [01:10<00:12,  1.27it/s]

108


 88%|████████▊ | 110/125 [01:11<00:12,  1.22it/s]

109


 89%|████████▉ | 111/125 [01:11<00:11,  1.20it/s]

110


 90%|████████▉ | 112/125 [01:12<00:10,  1.25it/s]

111


 90%|█████████ | 113/125 [01:13<00:09,  1.32it/s]

112


 91%|█████████ | 114/125 [01:13<00:07,  1.44it/s]

113


 92%|█████████▏| 115/125 [01:14<00:06,  1.45it/s]

114


 93%|█████████▎| 116/125 [01:15<00:06,  1.49it/s]

115


 94%|█████████▎| 117/125 [01:15<00:05,  1.55it/s]

116


 94%|█████████▍| 118/125 [01:16<00:03,  1.77it/s]

117


 95%|█████████▌| 119/125 [01:16<00:03,  1.77it/s]

118


 96%|█████████▌| 120/125 [01:17<00:02,  1.72it/s]

119


 97%|█████████▋| 121/125 [01:18<00:02,  1.46it/s]

120


 98%|█████████▊| 122/125 [01:19<00:02,  1.35it/s]

121


 98%|█████████▊| 123/125 [01:19<00:01,  1.37it/s]

122


 99%|█████████▉| 124/125 [01:20<00:00,  1.48it/s]

123


100%|██████████| 125/125 [01:21<00:00,  1.54it/s]


124
printloss [26621.209884166718, 27713.784690856934, 27728.849945545197, 9798.985545158386, 38208.08399105072, 19754.663459062576, 35497.344245910645, 16205.055884122849, 43297.96298503876, 11882.752265453339, 16980.902626991272, 34944.16148543358, 18811.37148141861, 50756.89009284973, 18457.774674892426, 21646.233333349228, 28636.12506198883, 32680.888855218887, 27563.415001392365, 16059.440056800842, 19018.80849838257, 21831.593126535416, 18875.764991044998, 73311.83030319214, 24343.242415189743, 14630.151590824127, 26709.340797901154, 61918.897569179535, 31132.983278274536, 14106.504382610321, 34203.43675994873, 10218.041723012924, 21531.254819869995, 76544.43233060837, 21401.320295333862, 14725.120360851288, 34211.35924458504, 18850.353850364685, 25381.465316295624, 37847.36614608765, 35015.47887802124, 23434.59082508087, 38759.013473033905, 24908.71728491783, 28909.775562286377, 27091.5051612854, 22581.362498283386, 28044.81493449211, 41778.1248486042, 13928.230385303497, 42209.

100%|██████████| 25/25 [00:04<00:00,  5.33it/s]


Epoch: 02 | Time: 1m 26s
	Train Loss: 3.897 | Train PPL:  49.278
	 Val. Loss: 3.859 |  Val. PPL:  47.423
here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7fb4e04b7430>


  1%|          | 1/125 [00:00<01:01,  2.02it/s]

0


  2%|▏         | 2/125 [00:01<01:05,  1.87it/s]

1


  2%|▏         | 3/125 [00:01<01:05,  1.86it/s]

2


  3%|▎         | 4/125 [00:02<01:02,  1.93it/s]

3


  4%|▍         | 5/125 [00:02<01:07,  1.77it/s]

4


  5%|▍         | 6/125 [00:03<01:14,  1.60it/s]

5


  6%|▌         | 7/125 [00:03<01:05,  1.79it/s]

6


  6%|▋         | 8/125 [00:04<01:05,  1.80it/s]

7


  7%|▋         | 9/125 [00:04<00:59,  1.96it/s]

8


  8%|▊         | 10/125 [00:05<01:00,  1.90it/s]

9


  9%|▉         | 11/125 [00:06<01:28,  1.28it/s]

10


 10%|▉         | 12/125 [00:07<01:31,  1.24it/s]

11


 10%|█         | 13/125 [00:08<01:20,  1.39it/s]

12


 11%|█         | 14/125 [00:08<01:23,  1.33it/s]

13


 12%|█▏        | 15/125 [00:09<01:15,  1.45it/s]

14


 13%|█▎        | 16/125 [00:10<01:16,  1.42it/s]

15


 14%|█▎        | 17/125 [00:10<01:16,  1.42it/s]

16


 14%|█▍        | 18/125 [00:11<01:08,  1.57it/s]

17


 15%|█▌        | 19/125 [00:11<00:59,  1.77it/s]

18


 16%|█▌        | 20/125 [00:12<00:59,  1.77it/s]

19


 17%|█▋        | 21/125 [00:13<01:03,  1.63it/s]

20


 18%|█▊        | 22/125 [00:13<01:05,  1.58it/s]

21


 18%|█▊        | 23/125 [00:14<01:05,  1.57it/s]

22


 19%|█▉        | 24/125 [00:15<01:02,  1.61it/s]

23


 20%|██        | 25/125 [00:15<01:01,  1.62it/s]

24


 21%|██        | 26/125 [00:16<01:02,  1.59it/s]

25


 22%|██▏       | 27/125 [00:16<00:55,  1.76it/s]

26


 22%|██▏       | 28/125 [00:17<01:01,  1.57it/s]

27


 23%|██▎       | 29/125 [00:18<01:02,  1.52it/s]

28


 24%|██▍       | 30/125 [00:18<01:02,  1.52it/s]

29


 25%|██▍       | 31/125 [00:19<00:59,  1.59it/s]

30


 26%|██▌       | 32/125 [00:20<01:03,  1.45it/s]

31


 26%|██▋       | 33/125 [00:20<01:01,  1.49it/s]

32


 27%|██▋       | 34/125 [00:21<01:03,  1.44it/s]

33


 28%|██▊       | 35/125 [00:22<01:02,  1.44it/s]

34


 29%|██▉       | 36/125 [00:23<01:02,  1.43it/s]

35


 30%|██▉       | 37/125 [00:23<00:59,  1.49it/s]

36


 30%|███       | 38/125 [00:24<00:57,  1.52it/s]

37


 31%|███       | 39/125 [00:25<01:04,  1.33it/s]

38


 32%|███▏      | 40/125 [00:25<00:57,  1.47it/s]

39


 33%|███▎      | 41/125 [00:26<00:54,  1.54it/s]

40


 34%|███▎      | 42/125 [00:26<00:51,  1.62it/s]

41


 34%|███▍      | 43/125 [00:27<00:48,  1.70it/s]

42


 35%|███▌      | 44/125 [00:27<00:44,  1.80it/s]

43


 36%|███▌      | 45/125 [00:28<00:50,  1.60it/s]

44


 37%|███▋      | 46/125 [00:29<00:49,  1.59it/s]

45


 38%|███▊      | 47/125 [00:30<00:50,  1.55it/s]

46


 38%|███▊      | 48/125 [00:30<00:45,  1.71it/s]

47


 39%|███▉      | 49/125 [00:31<00:46,  1.63it/s]

48


 40%|████      | 50/125 [00:31<00:44,  1.70it/s]

49


 41%|████      | 51/125 [00:32<00:41,  1.80it/s]

50


 42%|████▏     | 52/125 [00:32<00:44,  1.64it/s]

51


 42%|████▏     | 53/125 [00:33<00:40,  1.79it/s]

52


 43%|████▎     | 54/125 [00:34<00:44,  1.61it/s]

53


 44%|████▍     | 55/125 [00:34<00:43,  1.62it/s]

54


 45%|████▍     | 56/125 [00:35<00:41,  1.65it/s]

55


 46%|████▌     | 57/125 [00:35<00:38,  1.78it/s]

56


 46%|████▋     | 58/125 [00:36<00:39,  1.70it/s]

57


 47%|████▋     | 59/125 [00:36<00:37,  1.76it/s]

58


 48%|████▊     | 60/125 [00:37<00:35,  1.82it/s]

59


 49%|████▉     | 61/125 [00:38<00:37,  1.71it/s]

60


 50%|████▉     | 62/125 [00:38<00:36,  1.72it/s]

61


 50%|█████     | 63/125 [00:39<00:34,  1.81it/s]

62


 51%|█████     | 64/125 [00:39<00:37,  1.65it/s]

63


 52%|█████▏    | 65/125 [00:40<00:34,  1.73it/s]

64


 53%|█████▎    | 66/125 [00:41<00:37,  1.57it/s]

65


 54%|█████▎    | 67/125 [00:41<00:36,  1.58it/s]

66


 54%|█████▍    | 68/125 [00:42<00:44,  1.30it/s]

67


 55%|█████▌    | 69/125 [00:43<00:39,  1.42it/s]

68


 56%|█████▌    | 70/125 [00:44<00:43,  1.28it/s]

69


 57%|█████▋    | 71/125 [00:45<00:40,  1.33it/s]

70


 58%|█████▊    | 72/125 [00:45<00:39,  1.34it/s]

71


 58%|█████▊    | 73/125 [00:46<00:37,  1.39it/s]

72


 59%|█████▉    | 74/125 [00:47<00:35,  1.44it/s]

73


 60%|██████    | 75/125 [00:47<00:37,  1.34it/s]

74


 61%|██████    | 76/125 [00:48<00:34,  1.40it/s]

75


 62%|██████▏   | 77/125 [00:49<00:34,  1.38it/s]

76


 62%|██████▏   | 78/125 [00:50<00:33,  1.40it/s]

77


 63%|██████▎   | 79/125 [00:50<00:31,  1.45it/s]

78


 64%|██████▍   | 80/125 [00:51<00:28,  1.57it/s]

79


 65%|██████▍   | 81/125 [00:51<00:29,  1.50it/s]

80


 66%|██████▌   | 82/125 [00:52<00:29,  1.45it/s]

81


 66%|██████▋   | 83/125 [00:53<00:31,  1.33it/s]

82


 67%|██████▋   | 84/125 [00:54<00:30,  1.36it/s]

83


 68%|██████▊   | 85/125 [00:54<00:28,  1.38it/s]

84


 69%|██████▉   | 86/125 [00:55<00:26,  1.48it/s]

85


 70%|██████▉   | 87/125 [00:56<00:25,  1.50it/s]

86


 70%|███████   | 88/125 [00:56<00:25,  1.46it/s]

87


 71%|███████   | 89/125 [00:57<00:24,  1.46it/s]

88


 72%|███████▏  | 90/125 [00:58<00:23,  1.51it/s]

89


 73%|███████▎  | 91/125 [00:58<00:20,  1.70it/s]

90


 74%|███████▎  | 92/125 [00:59<00:19,  1.65it/s]

91


 74%|███████▍  | 93/125 [00:59<00:19,  1.62it/s]

92


 75%|███████▌  | 94/125 [01:00<00:20,  1.53it/s]

93


 76%|███████▌  | 95/125 [01:01<00:18,  1.61it/s]

94


 77%|███████▋  | 96/125 [01:01<00:17,  1.67it/s]

95


 78%|███████▊  | 97/125 [01:02<00:16,  1.69it/s]

96


 78%|███████▊  | 98/125 [01:02<00:15,  1.73it/s]

97


 79%|███████▉  | 99/125 [01:03<00:14,  1.77it/s]

98


 80%|████████  | 100/125 [01:04<00:15,  1.66it/s]

99


 81%|████████  | 101/125 [01:04<00:15,  1.58it/s]

100


 82%|████████▏ | 102/125 [01:05<00:15,  1.51it/s]

101


 82%|████████▏ | 103/125 [01:06<00:14,  1.57it/s]

102


 83%|████████▎ | 104/125 [01:06<00:14,  1.48it/s]

103


 84%|████████▍ | 105/125 [01:07<00:12,  1.59it/s]

104


 85%|████████▍ | 106/125 [01:07<00:11,  1.71it/s]

105


 86%|████████▌ | 107/125 [01:08<00:12,  1.48it/s]

106


 86%|████████▋ | 108/125 [01:09<00:10,  1.58it/s]

107


 87%|████████▋ | 109/125 [01:09<00:09,  1.71it/s]

108


 88%|████████▊ | 110/125 [01:10<00:10,  1.44it/s]

109


 89%|████████▉ | 111/125 [01:11<00:09,  1.41it/s]

110


 90%|████████▉ | 112/125 [01:12<00:08,  1.45it/s]

111


 90%|█████████ | 113/125 [01:12<00:07,  1.62it/s]

112


 91%|█████████ | 114/125 [01:13<00:06,  1.68it/s]

113


 92%|█████████▏| 115/125 [01:13<00:06,  1.59it/s]

114


 93%|█████████▎| 116/125 [01:14<00:05,  1.58it/s]

115


 94%|█████████▎| 117/125 [01:15<00:05,  1.58it/s]

116


 94%|█████████▍| 118/125 [01:15<00:04,  1.56it/s]

117


 95%|█████████▌| 119/125 [01:16<00:03,  1.59it/s]

118


 96%|█████████▌| 120/125 [01:16<00:02,  1.80it/s]

119


 97%|█████████▋| 121/125 [01:17<00:02,  1.64it/s]

120


 98%|█████████▊| 122/125 [01:18<00:01,  1.58it/s]

121


 98%|█████████▊| 123/125 [01:18<00:01,  1.63it/s]

122


 99%|█████████▉| 124/125 [01:19<00:00,  1.46it/s]

123


100%|██████████| 125/125 [01:20<00:00,  1.56it/s]


124
printloss [17594.780970096588, 20830.600905418396, 16502.853118896484, 15795.788297653198, 27762.362520694733, 34450.73856282234, 11347.126567363739, 19501.88314819336, 12905.382142066956, 20085.123107910156, 75202.18294429779, 41653.55878829956, 18498.379719734192, 42728.34926891327, 18496.03715801239, 35065.69567513466, 23569.024832487106, 14773.70850276947, 12375.608457565308, 24631.07328414917, 33237.32642698288, 31081.919758081436, 25282.792464971542, 22205.814956903458, 21919.245956897736, 33567.34668326378, 10636.341339826584, 34819.09330511093, 30805.842918395996, 29186.28002524376, 21174.99986886978, 39278.46195888519, 28314.776209115982, 42421.34223461151, 30574.733778953552, 32689.048801660538, 25377.25751876831, 22666.30662918091, 43802.84279823303, 19655.076895713806, 23815.839424610138, 21501.571880340576, 20607.304174661636, 14855.005770206451, 30600.457084178925, 21946.91837501526, 30394.278429985046, 12750.073655843735, 30581.74222755432, 15749.00365638733, 16000.0

100%|██████████| 25/25 [00:04<00:00,  5.28it/s]


Epoch: 03 | Time: 1m 25s
	Train Loss: 3.692 | Train PPL:  40.133
	 Val. Loss: 3.778 |  Val. PPL:  43.738
here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7fb4dd60b430>


  1%|          | 1/125 [00:00<01:15,  1.64it/s]

0


  2%|▏         | 2/125 [00:01<01:28,  1.38it/s]

1


  2%|▏         | 3/125 [00:02<01:24,  1.45it/s]

2


  3%|▎         | 4/125 [00:02<01:22,  1.46it/s]

3


  4%|▍         | 5/125 [00:03<01:16,  1.56it/s]

4


  5%|▍         | 6/125 [00:03<01:11,  1.66it/s]

5


  6%|▌         | 7/125 [00:04<01:14,  1.59it/s]

6


  6%|▋         | 8/125 [00:05<01:11,  1.63it/s]

7


  7%|▋         | 9/125 [00:05<01:10,  1.65it/s]

8


  8%|▊         | 10/125 [00:06<01:09,  1.65it/s]

9


  9%|▉         | 11/125 [00:06<01:10,  1.62it/s]

10


 10%|▉         | 12/125 [00:07<01:11,  1.58it/s]

11


 10%|█         | 13/125 [00:08<01:12,  1.54it/s]

12


 11%|█         | 14/125 [00:08<01:03,  1.75it/s]

13


 12%|█▏        | 15/125 [00:09<01:06,  1.66it/s]

14


 13%|█▎        | 16/125 [00:09<01:01,  1.79it/s]

15


 14%|█▎        | 17/125 [00:10<00:57,  1.87it/s]

16


 14%|█▍        | 18/125 [00:11<01:03,  1.68it/s]

17


 15%|█▌        | 19/125 [00:11<01:05,  1.63it/s]

18


 16%|█▌        | 20/125 [00:12<01:01,  1.72it/s]

19


 17%|█▋        | 21/125 [00:12<01:03,  1.64it/s]

20


 18%|█▊        | 22/125 [00:13<01:05,  1.58it/s]

21


 18%|█▊        | 23/125 [00:14<01:02,  1.63it/s]

22


 19%|█▉        | 24/125 [00:14<01:00,  1.66it/s]

23


 20%|██        | 25/125 [00:15<01:03,  1.57it/s]

24


 21%|██        | 26/125 [00:16<01:02,  1.59it/s]

25


 22%|██▏       | 27/125 [00:16<01:09,  1.41it/s]

26


 22%|██▏       | 28/125 [00:17<01:04,  1.51it/s]

27


 23%|██▎       | 29/125 [00:17<00:59,  1.63it/s]

28


 24%|██▍       | 30/125 [00:18<01:00,  1.56it/s]

29


 25%|██▍       | 31/125 [00:19<00:54,  1.72it/s]

30


 26%|██▌       | 32/125 [00:19<00:57,  1.63it/s]

31


 26%|██▋       | 33/125 [00:20<01:00,  1.53it/s]

32


 27%|██▋       | 34/125 [00:21<01:02,  1.47it/s]

33


 28%|██▊       | 35/125 [00:21<00:57,  1.55it/s]

34


 29%|██▉       | 36/125 [00:22<00:57,  1.55it/s]

35


 30%|██▉       | 37/125 [00:23<00:56,  1.56it/s]

36


 30%|███       | 38/125 [00:23<00:55,  1.58it/s]

37


 31%|███       | 39/125 [00:24<00:57,  1.50it/s]

38


 32%|███▏      | 40/125 [00:25<00:53,  1.59it/s]

39


 33%|███▎      | 41/125 [00:25<00:49,  1.71it/s]

40


 34%|███▎      | 42/125 [00:26<00:46,  1.77it/s]

41


 34%|███▍      | 43/125 [00:27<01:06,  1.23it/s]

42


 35%|███▌      | 44/125 [00:28<01:04,  1.25it/s]

43


 36%|███▌      | 45/125 [00:28<01:02,  1.28it/s]

44


 37%|███▋      | 46/125 [00:29<01:03,  1.25it/s]

45


 38%|███▊      | 47/125 [00:30<01:04,  1.21it/s]

46


 38%|███▊      | 48/125 [00:31<00:57,  1.34it/s]

47


 39%|███▉      | 49/125 [00:31<00:55,  1.37it/s]

48


 40%|████      | 50/125 [00:32<00:54,  1.38it/s]

49


 41%|████      | 51/125 [00:33<00:50,  1.45it/s]

50


 42%|████▏     | 52/125 [00:34<00:59,  1.24it/s]

51


 42%|████▏     | 53/125 [00:34<00:53,  1.35it/s]

52


 43%|████▎     | 54/125 [00:35<00:48,  1.47it/s]

53


 44%|████▍     | 55/125 [00:35<00:43,  1.62it/s]

54


 45%|████▍     | 56/125 [00:36<00:42,  1.61it/s]

55


 46%|████▌     | 57/125 [00:37<00:43,  1.57it/s]

56


 46%|████▋     | 58/125 [00:37<00:44,  1.49it/s]

57


 47%|████▋     | 59/125 [00:38<00:42,  1.57it/s]

58


 48%|████▊     | 60/125 [00:39<00:43,  1.49it/s]

59


 49%|████▉     | 61/125 [00:39<00:42,  1.50it/s]

60


 50%|████▉     | 62/125 [00:40<00:43,  1.45it/s]

61


 50%|█████     | 63/125 [00:41<00:46,  1.34it/s]

62


 51%|█████     | 64/125 [00:42<00:45,  1.35it/s]

63


 52%|█████▏    | 65/125 [00:42<00:38,  1.54it/s]

64


 53%|█████▎    | 66/125 [00:43<00:34,  1.70it/s]

65


 54%|█████▎    | 67/125 [00:44<00:38,  1.50it/s]

66


 54%|█████▍    | 68/125 [00:44<00:38,  1.49it/s]

67


 55%|█████▌    | 69/125 [00:45<00:40,  1.40it/s]

68


 56%|█████▌    | 70/125 [00:46<00:37,  1.46it/s]

69


 57%|█████▋    | 71/125 [00:46<00:35,  1.53it/s]

70


 58%|█████▊    | 72/125 [00:47<00:32,  1.65it/s]

71


 58%|█████▊    | 73/125 [00:47<00:29,  1.76it/s]

72


 59%|█████▉    | 74/125 [00:48<00:34,  1.47it/s]

73


 60%|██████    | 75/125 [00:49<00:32,  1.53it/s]

74


 61%|██████    | 76/125 [00:49<00:28,  1.71it/s]

75


 62%|██████▏   | 77/125 [00:50<00:28,  1.69it/s]

76


 62%|██████▏   | 78/125 [00:51<00:30,  1.55it/s]

77


 63%|██████▎   | 79/125 [00:51<00:27,  1.66it/s]

78


 64%|██████▍   | 80/125 [00:52<00:27,  1.64it/s]

79


 65%|██████▍   | 81/125 [00:53<00:34,  1.29it/s]

80


 66%|██████▌   | 82/125 [00:54<00:33,  1.29it/s]

81


 66%|██████▋   | 83/125 [00:54<00:30,  1.40it/s]

82


 67%|██████▋   | 84/125 [00:55<00:28,  1.45it/s]

83


 68%|██████▊   | 85/125 [00:55<00:25,  1.55it/s]

84


 69%|██████▉   | 86/125 [00:56<00:22,  1.72it/s]

85


 70%|██████▉   | 87/125 [00:56<00:22,  1.68it/s]

86


 70%|███████   | 88/125 [00:57<00:22,  1.67it/s]

87


 71%|███████   | 89/125 [00:58<00:20,  1.72it/s]

88


 72%|███████▏  | 90/125 [00:58<00:21,  1.59it/s]

89


 73%|███████▎  | 91/125 [00:59<00:22,  1.52it/s]

90


 74%|███████▎  | 92/125 [01:00<00:22,  1.44it/s]

91


 74%|███████▍  | 93/125 [01:00<00:20,  1.54it/s]

92


 75%|███████▌  | 94/125 [01:01<00:20,  1.48it/s]

93


 76%|███████▌  | 95/125 [01:01<00:17,  1.69it/s]

94


 77%|███████▋  | 96/125 [01:02<00:17,  1.68it/s]

95


 78%|███████▊  | 97/125 [01:03<00:17,  1.64it/s]

96


 78%|███████▊  | 98/125 [01:03<00:16,  1.62it/s]

97


 79%|███████▉  | 99/125 [01:04<00:15,  1.71it/s]

98


 80%|████████  | 100/125 [01:04<00:15,  1.66it/s]

99


 81%|████████  | 101/125 [01:05<00:12,  1.87it/s]

100


 82%|████████▏ | 102/125 [01:05<00:12,  1.84it/s]

101


 82%|████████▏ | 103/125 [01:06<00:12,  1.78it/s]

102


 83%|████████▎ | 104/125 [01:07<00:12,  1.69it/s]

103


 84%|████████▍ | 105/125 [01:07<00:12,  1.60it/s]

104


 85%|████████▍ | 106/125 [01:08<00:11,  1.66it/s]

105


 86%|████████▌ | 107/125 [01:09<00:10,  1.64it/s]

106


 86%|████████▋ | 108/125 [01:09<00:10,  1.65it/s]

107


 87%|████████▋ | 109/125 [01:10<00:09,  1.74it/s]

108


 88%|████████▊ | 110/125 [01:10<00:09,  1.57it/s]

109


 89%|████████▉ | 111/125 [01:11<00:08,  1.63it/s]

110


 90%|████████▉ | 112/125 [01:12<00:08,  1.61it/s]

111


 90%|█████████ | 113/125 [01:12<00:07,  1.69it/s]

112


 91%|█████████ | 114/125 [01:13<00:06,  1.75it/s]

113


 92%|█████████▏| 115/125 [01:13<00:06,  1.60it/s]

114


 93%|█████████▎| 116/125 [01:14<00:04,  1.85it/s]

115


 94%|█████████▎| 117/125 [01:14<00:04,  1.75it/s]

116


 94%|█████████▍| 118/125 [01:15<00:04,  1.46it/s]

117


 95%|█████████▌| 119/125 [01:16<00:03,  1.52it/s]

118


 96%|█████████▌| 120/125 [01:17<00:03,  1.45it/s]

119


 97%|█████████▋| 121/125 [01:17<00:02,  1.54it/s]

120


 98%|█████████▊| 122/125 [01:18<00:02,  1.41it/s]

121


 98%|█████████▊| 123/125 [01:19<00:01,  1.43it/s]

122


 99%|█████████▉| 124/125 [01:19<00:00,  1.53it/s]

123


100%|██████████| 125/125 [01:20<00:00,  1.55it/s]


124
printloss [23268.4716463089, 29849.133665800095, 24487.450137138367, 25200.17261838913, 18917.796416282654, 14248.325180053711, 28544.079097509384, 22397.902176856995, 20602.136241674423, 20107.796179771423, 23174.838623046875, 27025.27761554718, 31602.73352622986, 8349.544537067413, 27612.11292743683, 14539.954407691956, 14779.303486347198, 32580.76719236374, 26922.47474384308, 16745.461913347244, 25341.05985069275, 30942.283973693848, 20987.9465610981, 20886.333273887634, 29926.48409843445, 22663.777354002, 38643.47444701195, 15731.299448013306, 16079.31438446045, 28865.571425437927, 12405.78912973404, 27874.38885498047, 30733.596210479736, 35504.85349082947, 18247.69165611267, 25525.316840171814, 24269.905262947083, 26521.846503019333, 25635.43940138817, 17293.434638500214, 14028.866018772125, 15424.332375526428, 69472.72592258453, 30181.744169950485, 23003.130935907364, 40010.25966644287, 53450.00101232529, 19520.27446603775, 31707.69779777527, 30467.28938817978, 17724.22979307

100%|██████████| 25/25 [00:05<00:00,  4.33it/s]


Epoch: 04 | Time: 1m 26s
	Train Loss: 3.538 | Train PPL:  34.404
	 Val. Loss: 3.707 |  Val. PPL:  40.719
here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7fb4dfe943a0>


  1%|          | 1/125 [00:00<01:15,  1.65it/s]

0


  2%|▏         | 2/125 [00:01<01:23,  1.48it/s]

1


  2%|▏         | 3/125 [00:02<01:26,  1.41it/s]

2


  3%|▎         | 4/125 [00:02<01:16,  1.59it/s]

3


  4%|▍         | 5/125 [00:03<01:12,  1.67it/s]

4


  5%|▍         | 6/125 [00:03<01:06,  1.80it/s]

5


  6%|▌         | 7/125 [00:03<00:58,  2.00it/s]

6


  6%|▋         | 8/125 [00:04<01:00,  1.94it/s]

7


  7%|▋         | 9/125 [00:05<00:59,  1.97it/s]

8


  8%|▊         | 10/125 [00:05<00:57,  1.99it/s]

9


  9%|▉         | 11/125 [00:06<01:06,  1.71it/s]

10


 10%|▉         | 12/125 [00:07<01:12,  1.55it/s]

11


 10%|█         | 13/125 [00:07<01:10,  1.59it/s]

12


 11%|█         | 14/125 [00:08<01:07,  1.64it/s]

13


 12%|█▏        | 15/125 [00:08<01:10,  1.55it/s]

14


 13%|█▎        | 16/125 [00:09<01:11,  1.52it/s]

15


 14%|█▎        | 17/125 [00:10<01:13,  1.47it/s]

16


 14%|█▍        | 18/125 [00:10<01:08,  1.57it/s]

17


 15%|█▌        | 19/125 [00:11<01:10,  1.50it/s]

18


 16%|█▌        | 20/125 [00:12<01:16,  1.37it/s]

19


 17%|█▋        | 21/125 [00:13<01:15,  1.39it/s]

20


 18%|█▊        | 22/125 [00:13<01:14,  1.38it/s]

21


 18%|█▊        | 23/125 [00:14<01:11,  1.43it/s]

22


 19%|█▉        | 24/125 [00:15<01:09,  1.44it/s]

23


 20%|██        | 25/125 [00:15<01:05,  1.52it/s]

24


 21%|██        | 26/125 [00:16<01:00,  1.62it/s]

25


 22%|██▏       | 27/125 [00:17<01:05,  1.50it/s]

26


 22%|██▏       | 28/125 [00:17<01:08,  1.42it/s]

27


 23%|██▎       | 29/125 [00:18<01:09,  1.38it/s]

28


 24%|██▍       | 30/125 [00:19<01:15,  1.26it/s]

29


 25%|██▍       | 31/125 [00:20<01:13,  1.27it/s]

30


 26%|██▌       | 32/125 [00:21<01:10,  1.32it/s]

31


 26%|██▋       | 33/125 [00:21<01:06,  1.39it/s]

32


 27%|██▋       | 34/125 [00:22<01:08,  1.33it/s]

33


 28%|██▊       | 35/125 [00:23<01:01,  1.47it/s]

34


 29%|██▉       | 36/125 [00:23<00:58,  1.53it/s]

35


 30%|██▉       | 37/125 [00:24<00:54,  1.61it/s]

36


 30%|███       | 38/125 [00:24<00:49,  1.76it/s]

37


 31%|███       | 39/125 [00:25<00:50,  1.71it/s]

38


 32%|███▏      | 40/125 [00:25<00:51,  1.64it/s]

39


 33%|███▎      | 41/125 [00:26<00:52,  1.59it/s]

40


 34%|███▎      | 42/125 [00:27<00:50,  1.65it/s]

41


 34%|███▍      | 43/125 [00:27<00:47,  1.74it/s]

42


 35%|███▌      | 44/125 [00:28<00:49,  1.63it/s]

43


 36%|███▌      | 45/125 [00:29<00:50,  1.60it/s]

44


 37%|███▋      | 46/125 [00:29<00:46,  1.72it/s]

45


 38%|███▊      | 47/125 [00:30<00:48,  1.61it/s]

46


 38%|███▊      | 48/125 [00:30<00:48,  1.59it/s]

47


 39%|███▉      | 49/125 [00:31<00:50,  1.49it/s]

48


 40%|████      | 50/125 [00:32<00:51,  1.44it/s]

49


 41%|████      | 51/125 [00:32<00:46,  1.59it/s]

50


 42%|████▏     | 52/125 [00:33<00:45,  1.59it/s]

51


 42%|████▏     | 53/125 [00:34<00:42,  1.69it/s]

52


 43%|████▎     | 54/125 [00:34<00:42,  1.67it/s]

53


 44%|████▍     | 55/125 [00:35<00:42,  1.65it/s]

54


 45%|████▍     | 56/125 [00:35<00:43,  1.59it/s]

55


 46%|████▌     | 57/125 [00:36<00:45,  1.50it/s]

56


 46%|████▋     | 58/125 [00:37<00:41,  1.60it/s]

57


 47%|████▋     | 59/125 [00:38<00:44,  1.48it/s]

58


 48%|████▊     | 60/125 [00:38<00:43,  1.48it/s]

59


 49%|████▉     | 61/125 [00:39<00:40,  1.57it/s]

60


 50%|████▉     | 62/125 [00:39<00:40,  1.56it/s]

61


 50%|█████     | 63/125 [00:40<00:38,  1.61it/s]

62


 51%|█████     | 64/125 [00:41<00:39,  1.56it/s]

63


 52%|█████▏    | 65/125 [00:41<00:39,  1.51it/s]

64


 53%|█████▎    | 66/125 [00:42<00:37,  1.59it/s]

65


 54%|█████▎    | 67/125 [00:43<00:36,  1.57it/s]

66


 54%|█████▍    | 68/125 [00:43<00:39,  1.44it/s]

67


 55%|█████▌    | 69/125 [00:44<00:38,  1.47it/s]

68


 56%|█████▌    | 70/125 [00:45<00:37,  1.48it/s]

69


 57%|█████▋    | 71/125 [00:45<00:35,  1.52it/s]

70


 58%|█████▊    | 72/125 [00:46<00:34,  1.54it/s]

71


 58%|█████▊    | 73/125 [00:46<00:30,  1.70it/s]

72


 59%|█████▉    | 74/125 [00:47<00:30,  1.67it/s]

73


 60%|██████    | 75/125 [00:48<00:29,  1.70it/s]

74


 61%|██████    | 76/125 [00:48<00:27,  1.79it/s]

75


 62%|██████▏   | 77/125 [00:49<00:27,  1.76it/s]

76


 62%|██████▏   | 78/125 [00:50<00:31,  1.50it/s]

77


 63%|██████▎   | 79/125 [00:50<00:27,  1.67it/s]

78


 64%|██████▍   | 80/125 [00:51<00:30,  1.50it/s]

79


 65%|██████▍   | 81/125 [00:52<00:30,  1.45it/s]

80


 66%|██████▌   | 82/125 [00:52<00:25,  1.65it/s]

81


 66%|██████▋   | 83/125 [00:53<00:27,  1.50it/s]

82


 67%|██████▋   | 84/125 [00:53<00:27,  1.50it/s]

83


 68%|██████▊   | 85/125 [00:54<00:23,  1.69it/s]

84


 69%|██████▉   | 86/125 [00:54<00:23,  1.69it/s]

85


 70%|██████▉   | 87/125 [00:55<00:20,  1.82it/s]

86


 70%|███████   | 88/125 [00:55<00:20,  1.84it/s]

87


 71%|███████   | 89/125 [00:56<00:21,  1.70it/s]

88


 72%|███████▏  | 90/125 [00:57<00:19,  1.77it/s]

89


 73%|███████▎  | 91/125 [00:57<00:21,  1.61it/s]

90


 74%|███████▎  | 92/125 [00:58<00:20,  1.58it/s]

91


 74%|███████▍  | 93/125 [00:59<00:21,  1.51it/s]

92


 75%|███████▌  | 94/125 [01:00<00:25,  1.23it/s]

93


 76%|███████▌  | 95/125 [01:01<00:23,  1.29it/s]

94


 77%|███████▋  | 96/125 [01:01<00:20,  1.43it/s]

95


 78%|███████▊  | 97/125 [01:02<00:17,  1.62it/s]

96


 78%|███████▊  | 98/125 [01:02<00:17,  1.53it/s]

97


 79%|███████▉  | 99/125 [01:04<00:22,  1.13it/s]

98


 80%|████████  | 100/125 [01:04<00:20,  1.24it/s]

99


 81%|████████  | 101/125 [01:05<00:18,  1.31it/s]

100


 82%|████████▏ | 102/125 [01:06<00:18,  1.22it/s]

101


 82%|████████▏ | 103/125 [01:07<00:18,  1.17it/s]

102


 83%|████████▎ | 104/125 [01:07<00:15,  1.34it/s]

103


 84%|████████▍ | 105/125 [01:08<00:14,  1.43it/s]

104


 85%|████████▍ | 106/125 [01:09<00:12,  1.51it/s]

105


 86%|████████▌ | 107/125 [01:09<00:11,  1.56it/s]

106


 86%|████████▋ | 108/125 [01:10<00:11,  1.52it/s]

107


 87%|████████▋ | 109/125 [01:11<00:10,  1.50it/s]

108


 88%|████████▊ | 110/125 [01:11<00:09,  1.63it/s]

109


 89%|████████▉ | 111/125 [01:12<00:09,  1.44it/s]

110


 90%|████████▉ | 112/125 [01:12<00:08,  1.61it/s]

111


 90%|█████████ | 113/125 [01:13<00:07,  1.53it/s]

112


 91%|█████████ | 114/125 [01:14<00:08,  1.27it/s]

113


 92%|█████████▏| 115/125 [01:15<00:07,  1.38it/s]

114


 93%|█████████▎| 116/125 [01:15<00:06,  1.43it/s]

115


 94%|█████████▎| 117/125 [01:16<00:05,  1.42it/s]

116


 94%|█████████▍| 118/125 [01:17<00:04,  1.52it/s]

117


 95%|█████████▌| 119/125 [01:17<00:03,  1.63it/s]

118


 96%|█████████▌| 120/125 [01:18<00:03,  1.58it/s]

119


 97%|█████████▋| 121/125 [01:18<00:02,  1.61it/s]

120


 98%|█████████▊| 122/125 [01:19<00:01,  1.67it/s]

121


 98%|█████████▊| 123/125 [01:20<00:01,  1.71it/s]

122


 99%|█████████▉| 124/125 [01:20<00:00,  1.99it/s]

123


100%|██████████| 125/125 [01:20<00:00,  1.55it/s]


124
printloss [23788.62841129303, 28248.923825740814, 30956.919103860855, 16757.7135014534, 18853.819309473038, 13518.328838825226, 8938.06909108162, 15601.60611486435, 13627.214086055756, 14940.046795845032, 26184.194147109985, 41275.64899635315, 23147.819109916687, 19404.631784677505, 29007.90492916107, 26977.79771232605, 26743.908370018005, 19662.07617044449, 30443.494139671326, 39313.13066697121, 26116.44446372986, 33952.773711919785, 16999.158495903015, 24732.40920829773, 19953.424008369446, 14182.375874757767, 32562.84561896324, 39338.97993206978, 27810.568403720856, 53488.05390405655, 29104.91699552536, 27799.37864756584, 22696.347076892853, 39046.43373012543, 14305.665040969849, 22343.241196870804, 17410.919321537018, 10024.856100082397, 24000.828186035156, 23107.525552749634, 25788.67202758789, 19376.444657564163, 14506.094547510147, 27035.163308143616, 25964.62042093277, 14174.829013824463, 27908.78815126419, 21891.25297164917, 28738.923717975616, 30568.060326576233, 13431.37

100%|██████████| 25/25 [00:04<00:00,  5.24it/s]


Epoch: 05 | Time: 1m 25s
	Train Loss: 3.422 | Train PPL:  30.635
	 Val. Loss: 3.664 |  Val. PPL:  39.015


In [41]:
SRC = Input
TRG = Output

In [50]:
model.load_state_dict(torch.load('./model_16heads.pt'))

<All keys matched successfully>

In [44]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 500):
    
    model.eval()
    # print(sentence)
        
    if isinstance(sentence, str):
        nlp = spacy.load('en')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
        
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    
    src_mask = model.make_src_mask(src_tensor)
    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    # print("here")
    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        trg_mask = model.make_trg_mask(trg_tensor)
        
        # print("done1")
        with torch.no_grad():
            # print("done2")
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
            # print("done3")
        
        # print("here2")
        pred_token = output.argmax(2)[:,-1].item()
        
        trg_indexes.append(pred_token)

        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break
    
    # print("here1")
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:], attention


In [46]:
import re
from nltk.translate.bleu_score import sentence_bleu
import warnings
warnings.filterwarnings("ignore")

""" The tokenizer that we use for code submissions, from Wang Ling et al., Latent Predictor Networks for Code Generation (2016)
    @param code: string containing a code snippet
    @return: list of code tokens
"""
def tokenize_for_bleu_eval(code):
    code = re.sub(r'([^A-Za-z0-9_])', r' \1 ', code)
    code = re.sub(r'([a-z])([A-Z])', r'\1 \2', code)
    code = re.sub(r'\s+', ' ', code)
    code = code.replace('"', '`')
    code = code.replace('\'', '`')
    tokens = [t for t in code.split(' ') if t]
    return tokens

""" This scores hypotheses against references using BLEU.
    @param reference_list:  list of ground truth samples
    @param hypothesis_list: list of predictions that a model generates.
    @return: average bleu_score of all the data samples
"""
def evaluate_bleu(reference_list, hypothesis_list):
  bleu_score = 0
  number_of_samples = len(reference_list)
  for index in range(number_of_samples):
    reference_tokens = tokenize_for_bleu_eval(reference_list[index])
    hypothesis_tokens = tokenize_for_bleu_eval(hypothesis_list[index])
    bleu_score += compute_bleu([reference_tokens], hypothesis_tokens)
  return (bleu_score/number_of_samples)*100

def compute_bleu(reference, candidate):
  return sentence_bleu(references=reference, hypothesis=candidate, weights=(1.0, 0.0, 0.0, 0.0))

In [None]:
test_inputs,reference_output=creating_features(test_dataset)
# inputs=[test_inputs[497]]
# print(inputs)
hypothesis_list=[]
for i,input in enumerate(test_inputs):
  
  input=input.split(" ")
  print(i,input)
  translation, attention = translate_sentence(input, SRC, TRG, model, device)
  # print(translation)
  hypothesis_list.append(untokenize(translation[:-1]).decode('utf-8'))
  # print(hypothesis_list)

bleu = evaluate_bleu(reference_output, hypothesis_list)

In [53]:
print(bleu)

31.166087542542304


In [59]:
df=pd.concat([pd.DataFrame(test_inputs),pd.DataFrame(reference_output),pd.DataFrame(hypothesis_list)],axis=1)
df.columns=['intent','Ground truth','Code output by Transformer']

In [60]:
df.head(10)

Unnamed: 0,intent,Ground truth,Code output by Transformer
0,send a signal `signal.SIGUSR1` to the current ...,"os.kill(os.getpid(), signal.SIGUSR1)",os .system ('some_command with args')
1,decode a hex string '4a4b4c' to UTF-8.,bytes.fromhex('4a4b4c').decode('utf-8'),int (var_1 (16 ))
2,check if all elements in list `myList` are ide...,all(x == myList[0] for x in myList),"(isinstance (var_1 ,var_2 )for var_1 in var_2 )"
3,format number of spaces between strings `Pytho...,"print('%*s : %*s' % (20, 'Python', 20, 'Very G...","int (var_1 .findall ('\\d+',var_2 ))"
4,How to convert a string from CP-1251 to UTF-8?,d.decode('cp1251').encode('utf8'),int (''.join (str (str (str (str (str ))))for ...
5,get rid of None values in dictionary `kwargs`,"res = {k: v for k, v in list(kwargs.items()) i...",[(x [0 ]for x in list (var_1 .items ()))if x ]
6,get rid of None values in dictionary `kwargs`,"res = dict((k, v) for k, v in kwargs.items() i...",[(x [0 ]for x in list (var_1 .items ()))if x ]
7,capture final output of a chain of system comm...,subprocess.check_output('ps -ef | grep somethi...,"subprocess .call (['shutdown','/s'])"
8,"concatenate a list of strings `['a', 'b', 'c']`",""""""""""""".join(['a', 'b', 'c'])",[int (var_1 )for var_1 in var_2 ]
9,find intersection data between series `s1` and...,pd.Series(list(set(s1).intersection(set(s2)))),var_1 .groupby (var_2 .columns ).size ()


In [None]:
df.tail(20)

Unnamed: 0,0,0.1,0.2
480,"write line ""hi there"" to file `myfile`","f = open('myfile', 'w')\nf.write('hi there\n')...",os .write ('\n'.join (line ))
481,"write line ""Hello"" to file `somefile.txt`","with open('somefile.txt', 'a') as the_file:\n ...",os .write ('\n'.join (line ))
482,convert unicode string `s` to ascii,s.encode('iso-8859-15'),int (var_1 .encode ('hex'))
483,Django get maximum value associated with field...,AuthorizedEmail.objects.filter(group=group).or...,var_1 .objects .filter (var_1 =lambda x :x [0 ])
484,Find all numbers and dots from a string `text`...,"re.findall('Test([0-9.]*[0-9]+)', text)","re .findall ('#(\\w+)',re .findall ('\\d+',re ..."
485,python regex to find all numbers and dots fro...,"re.findall('Test([\\d.]*\\d+)', text)","re .findall ('[bcdfghjklmnpqrstvwxyz]+',re .fi..."
486,execute script 'script.ps1' using 'powershell....,"os.system('powershell.exe', 'script.ps1')","subprocess .call (['shutdown','/s'])"
487,Sort a list of tuples `b` by third item in the...,b.sort(key=lambda x: x[1][2]),"sorted (list (var_1 .items ()),key =lambda x :..."
488,get a list of all keys in Cassandra database `...,list(cf.get_range().get_keys()),[var_1 for var_1 in list (var_2 .items ())for ...
489,create a datetime with the current date & time,datetime.datetime.now(),datetime .datetime .datetime .now ().time ()


***Running the model for 8 heads instead of 16***

In [None]:
INPUT_DIM = len(Input.vocab)
OUTPUT_DIM = len(Output.vocab)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

SRC_PAD_IDX = Input.vocab.stoi[Input.pad_token]
TRG_PAD_IDX = Output.vocab.stoi[Output.pad_token]

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

model.apply(initialize_weights);

LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [None]:
N_EPOCHS = 5
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_example = []
    val_example = []

    for i in range(train_df.shape[0]):
        try:
            ex = data.Example.fromlist([train_df.Text[i], train_df.Code[i]], fields)
            train_example.append(ex)
        except:
            pass

    for i in range(val_df.shape[0]):
        try:
            ex = data.Example.fromlist([val_df.Text[i], val_df.Code[i]], fields)
            val_example.append(ex)
        except:
            pass       

    train_data = data.Dataset(train_example, fields)
    valid_data =  data.Dataset(val_example, fields)

    BATCH_SIZE = 16
    train_iterator, valid_iterator = BucketIterator.splits((train_data, valid_data), batch_size = BATCH_SIZE, 
                                                                sort_key = lambda x: len(x.Input),
                                                                sort_within_batch=True, device = device)

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '/content/model_8heads.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7efe0c63ce10>


  1%|          | 1/125 [00:00<01:28,  1.40it/s]

0


  2%|▏         | 2/125 [00:01<01:26,  1.43it/s]

1


  2%|▏         | 3/125 [00:01<01:15,  1.63it/s]

2


  3%|▎         | 4/125 [00:02<01:16,  1.58it/s]

3


  4%|▍         | 5/125 [00:04<01:50,  1.09it/s]

4


  5%|▍         | 6/125 [00:04<01:44,  1.14it/s]

5


  6%|▌         | 7/125 [00:05<01:43,  1.14it/s]

6


  6%|▋         | 8/125 [00:06<01:44,  1.12it/s]

7


  7%|▋         | 9/125 [00:07<01:43,  1.13it/s]

8


  8%|▊         | 10/125 [00:08<01:36,  1.19it/s]

9


  9%|▉         | 11/125 [00:08<01:29,  1.27it/s]

10


 10%|▉         | 12/125 [00:09<01:30,  1.25it/s]

11


 10%|█         | 13/125 [00:10<01:27,  1.28it/s]

12


 11%|█         | 14/125 [00:11<01:22,  1.34it/s]

13


 12%|█▏        | 15/125 [00:11<01:21,  1.35it/s]

14


 13%|█▎        | 16/125 [00:12<01:23,  1.30it/s]

15


 14%|█▎        | 17/125 [00:13<01:30,  1.19it/s]

16


 14%|█▍        | 18/125 [00:14<01:40,  1.06it/s]

17


 15%|█▌        | 19/125 [00:15<01:42,  1.03it/s]

18


 16%|█▌        | 20/125 [00:16<01:38,  1.06it/s]

19


 17%|█▋        | 21/125 [00:17<01:34,  1.10it/s]

20


 18%|█▊        | 22/125 [00:18<01:34,  1.09it/s]

21


 18%|█▊        | 23/125 [00:20<02:07,  1.25s/it]

22


 19%|█▉        | 24/125 [00:22<02:16,  1.35s/it]

23


 20%|██        | 25/125 [00:23<02:14,  1.35s/it]

24


 21%|██        | 26/125 [00:24<02:01,  1.23s/it]

25


 22%|██▏       | 27/125 [00:25<01:46,  1.08s/it]

26


 22%|██▏       | 28/125 [00:26<01:40,  1.03s/it]

27


 23%|██▎       | 29/125 [00:27<01:51,  1.16s/it]

28


 24%|██▍       | 30/125 [00:28<01:45,  1.11s/it]

29


 25%|██▍       | 31/125 [00:29<01:51,  1.18s/it]

30


 26%|██▌       | 32/125 [00:30<01:39,  1.07s/it]

31


 26%|██▋       | 33/125 [00:31<01:32,  1.01s/it]

32


 27%|██▋       | 34/125 [00:32<01:20,  1.13it/s]

33


 28%|██▊       | 35/125 [00:32<01:11,  1.26it/s]

34


 29%|██▉       | 36/125 [00:33<01:02,  1.43it/s]

35


 30%|██▉       | 37/125 [00:33<01:03,  1.39it/s]

36


 30%|███       | 38/125 [00:34<01:00,  1.43it/s]

37


 31%|███       | 39/125 [00:35<00:54,  1.59it/s]

38


 32%|███▏      | 40/125 [00:35<00:58,  1.46it/s]

39


 33%|███▎      | 41/125 [00:36<01:06,  1.26it/s]

40


 34%|███▎      | 42/125 [00:37<01:02,  1.33it/s]

41


 34%|███▍      | 43/125 [00:38<01:03,  1.29it/s]

42


 35%|███▌      | 44/125 [00:39<01:00,  1.34it/s]

43


 36%|███▌      | 45/125 [00:40<01:02,  1.28it/s]

44


 37%|███▋      | 46/125 [00:40<01:04,  1.23it/s]

45


 38%|███▊      | 47/125 [00:41<01:01,  1.26it/s]

46


 38%|███▊      | 48/125 [00:42<00:59,  1.30it/s]

47


 39%|███▉      | 49/125 [00:42<00:53,  1.42it/s]

48


 40%|████      | 50/125 [00:43<00:54,  1.38it/s]

49


 41%|████      | 51/125 [00:44<00:58,  1.27it/s]

50


 42%|████▏     | 52/125 [00:45<00:56,  1.28it/s]

51


 42%|████▏     | 53/125 [00:46<01:03,  1.13it/s]

52


 43%|████▎     | 54/125 [00:48<01:18,  1.11s/it]

53


 44%|████▍     | 55/125 [00:48<01:08,  1.03it/s]

54


 45%|████▍     | 56/125 [00:49<01:05,  1.06it/s]

55


 46%|████▌     | 57/125 [00:50<00:59,  1.14it/s]

56


 46%|████▋     | 58/125 [00:51<00:57,  1.17it/s]

57


 47%|████▋     | 59/125 [00:51<00:47,  1.38it/s]

58


 48%|████▊     | 60/125 [00:52<00:49,  1.32it/s]

59


 49%|████▉     | 61/125 [00:53<00:49,  1.31it/s]

60


 50%|████▉     | 62/125 [00:53<00:46,  1.37it/s]

61


 50%|█████     | 63/125 [00:54<00:50,  1.22it/s]

62


 51%|█████     | 64/125 [00:55<00:51,  1.18it/s]

63


 52%|█████▏    | 65/125 [00:56<00:49,  1.22it/s]

64


 53%|█████▎    | 66/125 [00:57<00:43,  1.37it/s]

65


 54%|█████▎    | 67/125 [00:57<00:41,  1.39it/s]

66


 54%|█████▍    | 68/125 [00:58<00:40,  1.40it/s]

67


 55%|█████▌    | 69/125 [00:59<00:38,  1.46it/s]

68


 56%|█████▌    | 70/125 [00:59<00:35,  1.54it/s]

69


 57%|█████▋    | 71/125 [01:00<00:39,  1.37it/s]

70


 58%|█████▊    | 72/125 [01:01<00:35,  1.50it/s]

71


 58%|█████▊    | 73/125 [01:01<00:36,  1.44it/s]

72


 59%|█████▉    | 74/125 [01:03<00:44,  1.15it/s]

73


 60%|██████    | 75/125 [01:04<00:43,  1.14it/s]

74


 61%|██████    | 76/125 [01:04<00:39,  1.24it/s]

75


 62%|██████▏   | 77/125 [01:05<00:37,  1.27it/s]

76


 62%|██████▏   | 78/125 [01:06<00:35,  1.31it/s]

77


 63%|██████▎   | 79/125 [01:06<00:32,  1.43it/s]

78


 64%|██████▍   | 80/125 [01:07<00:30,  1.48it/s]

79


 65%|██████▍   | 81/125 [01:07<00:28,  1.54it/s]

80


 66%|██████▌   | 82/125 [01:08<00:29,  1.45it/s]

81


 66%|██████▋   | 83/125 [01:09<00:30,  1.40it/s]

82


 67%|██████▋   | 84/125 [01:10<00:29,  1.39it/s]

83


 68%|██████▊   | 85/125 [01:10<00:29,  1.36it/s]

84


 69%|██████▉   | 86/125 [01:12<00:32,  1.21it/s]

85


 70%|██████▉   | 87/125 [01:13<00:33,  1.13it/s]

86


 70%|███████   | 88/125 [01:13<00:32,  1.15it/s]

87


 71%|███████   | 89/125 [01:14<00:29,  1.21it/s]

88


 72%|███████▏  | 90/125 [01:15<00:30,  1.16it/s]

89


 73%|███████▎  | 91/125 [01:16<00:31,  1.08it/s]

90


 74%|███████▎  | 92/125 [01:17<00:27,  1.19it/s]

91


 74%|███████▍  | 93/125 [01:18<00:26,  1.21it/s]

92


 75%|███████▌  | 94/125 [01:18<00:24,  1.27it/s]

93


 76%|███████▌  | 95/125 [01:20<00:29,  1.03it/s]

94


 77%|███████▋  | 96/125 [01:21<00:30,  1.06s/it]

95


 78%|███████▊  | 97/125 [01:22<00:28,  1.03s/it]

96


 78%|███████▊  | 98/125 [01:23<00:25,  1.08it/s]

97


 79%|███████▉  | 99/125 [01:23<00:23,  1.10it/s]

98


 80%|████████  | 100/125 [01:24<00:21,  1.16it/s]

99


 81%|████████  | 101/125 [01:25<00:19,  1.23it/s]

100


 82%|████████▏ | 102/125 [01:26<00:20,  1.14it/s]

101


 82%|████████▏ | 103/125 [01:27<00:19,  1.13it/s]

102


 83%|████████▎ | 104/125 [01:27<00:16,  1.26it/s]

103


 84%|████████▍ | 105/125 [01:28<00:17,  1.17it/s]

104


 85%|████████▍ | 106/125 [01:29<00:15,  1.23it/s]

105


 86%|████████▌ | 107/125 [01:30<00:14,  1.26it/s]

106


 86%|████████▋ | 108/125 [01:30<00:12,  1.34it/s]

107


 87%|████████▋ | 109/125 [01:31<00:12,  1.28it/s]

108


 88%|████████▊ | 110/125 [01:32<00:10,  1.37it/s]

109


 89%|████████▉ | 111/125 [01:32<00:08,  1.59it/s]

110


 90%|████████▉ | 112/125 [01:33<00:08,  1.53it/s]

111


 90%|█████████ | 113/125 [01:34<00:07,  1.58it/s]

112


 91%|█████████ | 114/125 [01:35<00:08,  1.28it/s]

113


 92%|█████████▏| 115/125 [01:36<00:07,  1.26it/s]

114


 93%|█████████▎| 116/125 [01:36<00:06,  1.32it/s]

115


 94%|█████████▎| 117/125 [01:37<00:06,  1.28it/s]

116


 94%|█████████▍| 118/125 [01:38<00:04,  1.40it/s]

117


 95%|█████████▌| 119/125 [01:39<00:04,  1.28it/s]

118


 96%|█████████▌| 120/125 [01:39<00:03,  1.35it/s]

119


 97%|█████████▋| 121/125 [01:40<00:02,  1.37it/s]

120


 98%|█████████▊| 122/125 [01:41<00:02,  1.35it/s]

121


 98%|█████████▊| 123/125 [01:42<00:01,  1.31it/s]

122


 99%|█████████▉| 124/125 [01:42<00:00,  1.25it/s]

123


100%|██████████| 125/125 [01:43<00:00,  1.20it/s]


124
printloss [41142.96779537201, 41984.213609695435, 22093.746968269348, 34836.8096780777, 113242.60317325592, 45754.129165649414, 47834.287759780884, 60714.65199279785, 53961.632471084595, 34945.3553314209, 32899.693811416626, 42723.72448396683, 34628.11057329178, 29424.88609457016, 36751.27447652817, 41336.27294635773, 48538.99501037598, 43655.981135845184, 37176.0125541687, 46138.81242084503, 40176.4088973999, 34462.97608423233, 38620.25667190552, 28620.96267414093, 58155.72531223297, 37631.375910282135, 27933.80735015869, 29963.24363899231, 20974.382632255554, 25971.21270418167, 39324.61159706116, 19823.058108329773, 38138.5813369751, 22762.403274536133, 19242.00794649124, 15951.034763336182, 31971.999941825867, 23267.180646896362, 11459.707176208496, 35102.05698776245, 61194.94540500641, 25652.801921367645, 34221.78516292572, 24854.712929725647, 44712.624958992004, 43213.10729312897, 26943.948654174805, 25352.676246643066, 15382.49346446991, 25987.21332550049, 39998.44970655441, 

100%|██████████| 25/25 [00:05<00:00,  4.41it/s]


Epoch: 01 | Time: 1m 49s
	Train Loss: 4.687 | Train PPL: 108.486
	 Val. Loss: 4.076 |  Val. PPL:  58.926
here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7efe18448750>


  1%|          | 1/125 [00:00<01:32,  1.33it/s]

0


  2%|▏         | 2/125 [00:01<01:33,  1.32it/s]

1


  2%|▏         | 3/125 [00:02<01:38,  1.24it/s]

2


  3%|▎         | 4/125 [00:02<01:21,  1.49it/s]

3


  4%|▍         | 5/125 [00:03<01:29,  1.34it/s]

4


  5%|▍         | 6/125 [00:04<01:24,  1.40it/s]

5


  6%|▌         | 7/125 [00:05<01:30,  1.31it/s]

6


  6%|▋         | 8/125 [00:05<01:26,  1.35it/s]

7


  7%|▋         | 9/125 [00:06<01:32,  1.25it/s]

8


  8%|▊         | 10/125 [00:07<01:21,  1.40it/s]

9


  9%|▉         | 11/125 [00:07<01:16,  1.49it/s]

10


 10%|▉         | 12/125 [00:08<01:23,  1.36it/s]

11


 10%|█         | 13/125 [00:09<01:22,  1.36it/s]

12


 11%|█         | 14/125 [00:10<01:31,  1.21it/s]

13


 12%|█▏        | 15/125 [00:11<01:25,  1.29it/s]

14


 13%|█▎        | 16/125 [00:11<01:22,  1.33it/s]

15


 14%|█▎        | 17/125 [00:12<01:21,  1.33it/s]

16


 14%|█▍        | 18/125 [00:13<01:23,  1.28it/s]

17


 15%|█▌        | 19/125 [00:14<01:24,  1.25it/s]

18


 16%|█▌        | 20/125 [00:15<01:17,  1.35it/s]

19


 17%|█▋        | 21/125 [00:15<01:14,  1.40it/s]

20


 18%|█▊        | 22/125 [00:16<01:13,  1.40it/s]

21


 18%|█▊        | 23/125 [00:17<01:13,  1.38it/s]

22


 19%|█▉        | 24/125 [00:18<01:30,  1.11it/s]

23


 20%|██        | 25/125 [00:19<01:23,  1.20it/s]

24


 21%|██        | 26/125 [00:19<01:15,  1.31it/s]

25


 22%|██▏       | 27/125 [00:20<01:13,  1.33it/s]

26


 22%|██▏       | 28/125 [00:21<01:24,  1.15it/s]

27


 23%|██▎       | 29/125 [00:22<01:22,  1.17it/s]

28


 24%|██▍       | 30/125 [00:22<01:11,  1.33it/s]

29


 25%|██▍       | 31/125 [00:23<01:14,  1.26it/s]

30


 26%|██▌       | 32/125 [00:24<01:05,  1.42it/s]

31


 26%|██▋       | 33/125 [00:25<01:06,  1.39it/s]

32


 27%|██▋       | 34/125 [00:26<01:30,  1.00it/s]

33


 28%|██▊       | 35/125 [00:27<01:20,  1.12it/s]

34


 29%|██▉       | 36/125 [00:27<01:11,  1.25it/s]

35


 30%|██▉       | 37/125 [00:28<01:11,  1.23it/s]

36


 30%|███       | 38/125 [00:29<01:05,  1.32it/s]

37


 31%|███       | 39/125 [00:30<01:06,  1.29it/s]

38


 32%|███▏      | 40/125 [00:31<01:09,  1.23it/s]

39


 33%|███▎      | 41/125 [00:32<01:09,  1.21it/s]

40


 34%|███▎      | 42/125 [00:32<01:05,  1.28it/s]

41


 34%|███▍      | 43/125 [00:33<01:06,  1.24it/s]

42


 35%|███▌      | 44/125 [00:34<01:02,  1.29it/s]

43


 36%|███▌      | 45/125 [00:35<01:02,  1.28it/s]

44


 37%|███▋      | 46/125 [00:35<01:04,  1.22it/s]

45


 38%|███▊      | 47/125 [00:36<01:02,  1.25it/s]

46


 38%|███▊      | 48/125 [00:37<01:02,  1.23it/s]

47


 39%|███▉      | 49/125 [00:38<01:04,  1.18it/s]

48


 40%|████      | 50/125 [00:39<00:56,  1.33it/s]

49


 41%|████      | 51/125 [00:40<01:01,  1.21it/s]

50


 42%|████▏     | 52/125 [00:40<00:57,  1.27it/s]

51


 42%|████▏     | 53/125 [00:41<00:59,  1.21it/s]

52


 43%|████▎     | 54/125 [00:42<00:59,  1.20it/s]

53


 44%|████▍     | 55/125 [00:43<01:00,  1.16it/s]

54


 45%|████▍     | 56/125 [00:44<01:01,  1.13it/s]

55


 46%|████▌     | 57/125 [00:45<01:04,  1.05it/s]

56


 46%|████▋     | 58/125 [00:46<01:07,  1.01s/it]

57


 47%|████▋     | 59/125 [00:47<01:03,  1.04it/s]

58


 48%|████▊     | 60/125 [00:48<00:59,  1.09it/s]

59


 49%|████▉     | 61/125 [00:48<00:53,  1.19it/s]

60


 50%|████▉     | 62/125 [00:49<00:55,  1.14it/s]

61


 50%|█████     | 63/125 [00:50<00:51,  1.21it/s]

62


 51%|█████     | 64/125 [00:51<00:50,  1.21it/s]

63


 52%|█████▏    | 65/125 [00:52<00:49,  1.21it/s]

64


 53%|█████▎    | 66/125 [00:52<00:47,  1.25it/s]

65


 54%|█████▎    | 67/125 [00:53<00:45,  1.28it/s]

66


 54%|█████▍    | 68/125 [00:54<00:40,  1.40it/s]

67


 55%|█████▌    | 69/125 [00:55<00:42,  1.32it/s]

68


 56%|█████▌    | 70/125 [00:56<00:48,  1.14it/s]

69


 57%|█████▋    | 71/125 [00:56<00:42,  1.29it/s]

70


 58%|█████▊    | 72/125 [00:57<00:40,  1.31it/s]

71


 58%|█████▊    | 73/125 [00:58<00:38,  1.34it/s]

72


 59%|█████▉    | 74/125 [00:59<00:38,  1.32it/s]

73


 60%|██████    | 75/125 [00:59<00:39,  1.28it/s]

74


 61%|██████    | 76/125 [01:00<00:42,  1.14it/s]

75


 62%|██████▏   | 77/125 [01:01<00:40,  1.19it/s]

76


 62%|██████▏   | 78/125 [01:02<00:38,  1.21it/s]

77


 63%|██████▎   | 79/125 [01:03<00:35,  1.28it/s]

78


 64%|██████▍   | 80/125 [01:03<00:32,  1.39it/s]

79


 65%|██████▍   | 81/125 [01:04<00:31,  1.40it/s]

80


 66%|██████▌   | 82/125 [01:05<00:31,  1.38it/s]

81


 66%|██████▋   | 83/125 [01:06<00:33,  1.26it/s]

82


 67%|██████▋   | 84/125 [01:06<00:31,  1.31it/s]

83


 68%|██████▊   | 85/125 [01:07<00:33,  1.21it/s]

84


 69%|██████▉   | 86/125 [01:08<00:32,  1.20it/s]

85


 70%|██████▉   | 87/125 [01:09<00:30,  1.24it/s]

86


 70%|███████   | 88/125 [01:10<00:30,  1.21it/s]

87


 71%|███████   | 89/125 [01:10<00:26,  1.37it/s]

88


 72%|███████▏  | 90/125 [01:11<00:25,  1.35it/s]

89


 73%|███████▎  | 91/125 [01:12<00:23,  1.42it/s]

90


 74%|███████▎  | 92/125 [01:13<00:25,  1.29it/s]

91


 74%|███████▍  | 93/125 [01:13<00:23,  1.37it/s]

92


 75%|███████▌  | 94/125 [01:14<00:22,  1.38it/s]

93


 76%|███████▌  | 95/125 [01:15<00:22,  1.36it/s]

94


 77%|███████▋  | 96/125 [01:15<00:20,  1.40it/s]

95


 78%|███████▊  | 97/125 [01:16<00:20,  1.36it/s]

96


 78%|███████▊  | 98/125 [01:17<00:19,  1.35it/s]

97


 79%|███████▉  | 99/125 [01:18<00:21,  1.20it/s]

98


 80%|████████  | 100/125 [01:19<00:19,  1.27it/s]

99


 81%|████████  | 101/125 [01:19<00:17,  1.35it/s]

100


 82%|████████▏ | 102/125 [01:20<00:15,  1.45it/s]

101


 82%|████████▏ | 103/125 [01:20<00:14,  1.53it/s]

102


 83%|████████▎ | 104/125 [01:21<00:13,  1.58it/s]

103


 84%|████████▍ | 105/125 [01:22<00:13,  1.49it/s]

104


 85%|████████▍ | 106/125 [01:22<00:12,  1.52it/s]

105


 86%|████████▌ | 107/125 [01:24<00:15,  1.18it/s]

106


 86%|████████▋ | 108/125 [01:25<00:14,  1.19it/s]

107


 87%|████████▋ | 109/125 [01:26<00:14,  1.07it/s]

108


 88%|████████▊ | 110/125 [01:27<00:14,  1.05it/s]

109


 89%|████████▉ | 111/125 [01:28<00:14,  1.01s/it]

110


 90%|████████▉ | 112/125 [01:29<00:12,  1.04it/s]

111


 90%|█████████ | 113/125 [01:30<00:11,  1.09it/s]

112


 91%|█████████ | 114/125 [01:30<00:09,  1.18it/s]

113


 92%|█████████▏| 115/125 [01:31<00:08,  1.18it/s]

114


 93%|█████████▎| 116/125 [01:32<00:07,  1.21it/s]

115


 94%|█████████▎| 117/125 [01:33<00:06,  1.25it/s]

116


 94%|█████████▍| 118/125 [01:33<00:04,  1.42it/s]

117


 95%|█████████▌| 119/125 [01:34<00:04,  1.37it/s]

118


 96%|█████████▌| 120/125 [01:35<00:03,  1.32it/s]

119


 97%|█████████▋| 121/125 [01:36<00:03,  1.11it/s]

120


 98%|█████████▊| 122/125 [01:37<00:02,  1.03it/s]

121


 98%|█████████▊| 123/125 [01:38<00:01,  1.05it/s]

122


 99%|█████████▉| 124/125 [01:39<00:00,  1.15it/s]

123


100%|██████████| 125/125 [01:40<00:00,  1.25it/s]


124
printloss [26872.851572990417, 27711.42611002922, 27834.278780460358, 9838.132905006409, 38522.56776666641, 19975.60108280182, 35831.60266971588, 16041.851155757904, 43563.746153354645, 11960.199247598648, 16924.124273777008, 35054.558190107346, 18847.707711696625, 50762.305101156235, 18245.19318175316, 21933.68731069565, 28927.885234117508, 32944.86471700668, 27743.976318836212, 16111.575236320496, 18916.911158561707, 22035.9241437912, 19021.210918426514, 74201.89554595947, 24554.263556718826, 14687.475017309189, 26653.922578811646, 61853.246784210205, 31393.67800140381, 14108.891275882721, 34371.75814247131, 10225.291305541992, 21921.922837257385, 76009.75993585587, 21712.430161476135, 14842.582921028137, 34383.196413517, 19165.08798313141, 25712.073816776276, 38100.416845321655, 34579.65414714813, 23161.732696533203, 39230.69972229004, 24990.72423028946, 29163.27367067337, 27302.96464920044, 22740.483663082123, 28451.568358898163, 42408.962223529816, 13819.50732755661, 42433.714

100%|██████████| 25/25 [00:05<00:00,  4.34it/s]


Epoch: 02 | Time: 1m 46s
	Train Loss: 3.911 | Train PPL:  49.955
	 Val. Loss: 3.876 |  Val. PPL:  48.224
here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7efe1b569150>


  1%|          | 1/125 [00:00<01:20,  1.54it/s]

0


  2%|▏         | 2/125 [00:01<01:24,  1.45it/s]

1


  2%|▏         | 3/125 [00:02<01:22,  1.48it/s]

2


  3%|▎         | 4/125 [00:02<01:18,  1.54it/s]

3


  4%|▍         | 5/125 [00:03<01:26,  1.39it/s]

4


  5%|▍         | 6/125 [00:04<01:32,  1.28it/s]

5


  6%|▌         | 7/125 [00:04<01:23,  1.42it/s]

6


  6%|▋         | 8/125 [00:05<01:21,  1.43it/s]

7


  7%|▋         | 9/125 [00:06<01:14,  1.56it/s]

8


  8%|▊         | 10/125 [00:06<01:16,  1.51it/s]

9


  9%|▉         | 11/125 [00:08<01:51,  1.02it/s]

10


 10%|▉         | 12/125 [00:09<01:53,  1.00s/it]

11


 10%|█         | 13/125 [00:10<01:40,  1.11it/s]

12


 11%|█         | 14/125 [00:11<01:43,  1.08it/s]

13


 12%|█▏        | 15/125 [00:11<01:34,  1.16it/s]

14


 13%|█▎        | 16/125 [00:12<01:36,  1.13it/s]

15


 14%|█▎        | 17/125 [00:13<01:36,  1.12it/s]

16


 14%|█▍        | 18/125 [00:14<01:26,  1.24it/s]

17


 15%|█▌        | 19/125 [00:14<01:16,  1.39it/s]

18


 16%|█▌        | 20/125 [00:15<01:16,  1.37it/s]

19


 17%|█▋        | 21/125 [00:16<01:20,  1.28it/s]

20


 18%|█▊        | 22/125 [00:17<01:22,  1.25it/s]

21


 18%|█▊        | 23/125 [00:18<01:22,  1.24it/s]

22


 19%|█▉        | 24/125 [00:18<01:19,  1.26it/s]

23


 20%|██        | 25/125 [00:19<01:18,  1.28it/s]

24


 21%|██        | 26/125 [00:20<01:18,  1.26it/s]

25


 22%|██▏       | 27/125 [00:21<01:10,  1.40it/s]

26


 22%|██▏       | 28/125 [00:22<01:16,  1.26it/s]

27


 23%|██▎       | 29/125 [00:22<01:18,  1.23it/s]

28


 24%|██▍       | 30/125 [00:23<01:18,  1.22it/s]

29


 25%|██▍       | 31/125 [00:24<01:13,  1.28it/s]

30


 26%|██▌       | 32/125 [00:25<01:19,  1.18it/s]

31


 26%|██▋       | 33/125 [00:26<01:15,  1.22it/s]

32


 27%|██▋       | 34/125 [00:27<01:16,  1.19it/s]

33


 28%|██▊       | 35/125 [00:27<01:15,  1.19it/s]

34


 29%|██▉       | 36/125 [00:28<01:15,  1.17it/s]

35


 30%|██▉       | 37/125 [00:29<01:12,  1.22it/s]

36


 30%|███       | 38/125 [00:30<01:09,  1.24it/s]

37


 31%|███       | 39/125 [00:31<01:18,  1.09it/s]

38


 32%|███▏      | 40/125 [00:32<01:10,  1.20it/s]

39


 33%|███▎      | 41/125 [00:32<01:06,  1.26it/s]

40


 34%|███▎      | 42/125 [00:33<01:03,  1.31it/s]

41


 34%|███▍      | 43/125 [00:34<01:00,  1.35it/s]

42


 35%|███▌      | 44/125 [00:34<00:56,  1.42it/s]

43


 36%|███▌      | 45/125 [00:35<01:01,  1.29it/s]

44


 37%|███▋      | 46/125 [00:36<01:01,  1.29it/s]

45


 38%|███▊      | 47/125 [00:37<01:01,  1.26it/s]

46


 38%|███▊      | 48/125 [00:37<00:54,  1.40it/s]

47


 39%|███▉      | 49/125 [00:38<00:56,  1.34it/s]

48


 40%|████      | 50/125 [00:39<00:53,  1.40it/s]

49


 41%|████      | 51/125 [00:40<00:50,  1.46it/s]

50


 42%|████▏     | 52/125 [00:40<00:54,  1.34it/s]

51


 42%|████▏     | 53/125 [00:41<00:49,  1.45it/s]

52


 43%|████▎     | 54/125 [00:42<00:54,  1.31it/s]

53


 44%|████▍     | 55/125 [00:43<00:53,  1.32it/s]

54


 45%|████▍     | 56/125 [00:43<00:51,  1.34it/s]

55


 46%|████▌     | 57/125 [00:44<00:46,  1.45it/s]

56


 46%|████▋     | 58/125 [00:45<00:48,  1.39it/s]

57


 47%|████▋     | 59/125 [00:45<00:45,  1.44it/s]

58


 48%|████▊     | 60/125 [00:46<00:43,  1.49it/s]

59


 49%|████▉     | 61/125 [00:47<00:45,  1.40it/s]

60


 50%|████▉     | 62/125 [00:47<00:44,  1.42it/s]

61


 50%|█████     | 63/125 [00:48<00:41,  1.48it/s]

62


 51%|█████     | 64/125 [00:49<00:45,  1.35it/s]

63


 52%|█████▏    | 65/125 [00:50<00:42,  1.41it/s]

64


 53%|█████▎    | 66/125 [00:51<00:46,  1.27it/s]

65


 54%|█████▎    | 67/125 [00:51<00:45,  1.28it/s]

66


 54%|█████▍    | 68/125 [00:53<00:53,  1.06it/s]

67


 55%|█████▌    | 69/125 [00:53<00:48,  1.15it/s]

68


 56%|█████▌    | 70/125 [00:55<00:52,  1.05it/s]

69


 57%|█████▋    | 71/125 [00:55<00:49,  1.09it/s]

70


 58%|█████▊    | 72/125 [00:56<00:47,  1.10it/s]

71


 58%|█████▊    | 73/125 [00:57<00:45,  1.13it/s]

72


 59%|█████▉    | 74/125 [00:58<00:43,  1.17it/s]

73


 60%|██████    | 75/125 [00:59<00:45,  1.09it/s]

74


 61%|██████    | 76/125 [01:00<00:42,  1.15it/s]

75


 62%|██████▏   | 77/125 [01:01<00:42,  1.13it/s]

76


 62%|██████▏   | 78/125 [01:01<00:40,  1.15it/s]

77


 63%|██████▎   | 79/125 [01:02<00:38,  1.19it/s]

78


 64%|██████▍   | 80/125 [01:03<00:34,  1.29it/s]

79


 65%|██████▍   | 81/125 [01:04<00:35,  1.22it/s]

80


 66%|██████▌   | 82/125 [01:05<00:36,  1.18it/s]

81


 66%|██████▋   | 83/125 [01:06<00:38,  1.09it/s]

82


 67%|██████▋   | 84/125 [01:07<00:36,  1.12it/s]

83


 68%|██████▊   | 85/125 [01:07<00:35,  1.14it/s]

84


 69%|██████▉   | 86/125 [01:08<00:31,  1.23it/s]

85


 70%|██████▉   | 87/125 [01:09<00:30,  1.25it/s]

86


 70%|███████   | 88/125 [01:10<00:30,  1.21it/s]

87


 71%|███████   | 89/125 [01:11<00:29,  1.21it/s]

88


 72%|███████▏  | 90/125 [01:11<00:27,  1.25it/s]

89


 73%|███████▎  | 91/125 [01:12<00:24,  1.41it/s]

90


 74%|███████▎  | 92/125 [01:13<00:24,  1.36it/s]

91


 74%|███████▍  | 93/125 [01:13<00:23,  1.34it/s]

92


 75%|███████▌  | 94/125 [01:14<00:24,  1.25it/s]

93


 76%|███████▌  | 95/125 [01:15<00:22,  1.32it/s]

94


 77%|███████▋  | 96/125 [01:16<00:21,  1.37it/s]

95


 78%|███████▊  | 97/125 [01:16<00:20,  1.39it/s]

96


 78%|███████▊  | 98/125 [01:17<00:19,  1.42it/s]

97


 79%|███████▉  | 99/125 [01:18<00:18,  1.43it/s]

98


 80%|████████  | 100/125 [01:19<00:18,  1.34it/s]

99


 81%|████████  | 101/125 [01:19<00:18,  1.27it/s]

100


 82%|████████▏ | 102/125 [01:21<00:22,  1.03it/s]

101


 82%|████████▏ | 103/125 [01:22<00:22,  1.04s/it]

102


 83%|████████▎ | 104/125 [01:23<00:22,  1.09s/it]

103


 84%|████████▍ | 105/125 [01:24<00:19,  1.04it/s]

104


 85%|████████▍ | 106/125 [01:25<00:16,  1.16it/s]

105


 86%|████████▌ | 107/125 [01:26<00:16,  1.07it/s]

106


 86%|████████▋ | 108/125 [01:26<00:14,  1.18it/s]

107


 87%|████████▋ | 109/125 [01:27<00:12,  1.29it/s]

108


 88%|████████▊ | 110/125 [01:28<00:13,  1.11it/s]

109


 89%|████████▉ | 111/125 [01:29<00:12,  1.10it/s]

110


 90%|████████▉ | 112/125 [01:30<00:11,  1.14it/s]

111


 90%|█████████ | 113/125 [01:30<00:09,  1.26it/s]

112


 91%|█████████ | 114/125 [01:31<00:08,  1.32it/s]

113


 92%|█████████▏| 115/125 [01:32<00:07,  1.28it/s]

114


 93%|█████████▎| 116/125 [01:33<00:07,  1.28it/s]

115


 94%|█████████▎| 117/125 [01:33<00:06,  1.29it/s]

116


 94%|█████████▍| 118/125 [01:34<00:05,  1.27it/s]

117


 95%|█████████▌| 119/125 [01:35<00:04,  1.29it/s]

118


 96%|█████████▌| 120/125 [01:35<00:03,  1.47it/s]

119


 97%|█████████▋| 121/125 [01:36<00:02,  1.36it/s]

120


 98%|█████████▊| 122/125 [01:37<00:02,  1.31it/s]

121


 98%|█████████▊| 123/125 [01:38<00:01,  1.36it/s]

122


 99%|█████████▉| 124/125 [01:39<00:00,  1.23it/s]

123


100%|██████████| 125/125 [01:40<00:00,  1.25it/s]


124
printloss [17688.479641914368, 21011.350917816162, 16571.46849822998, 15995.268626213074, 27544.150743484497, 34734.48980641365, 11379.657373428345, 19848.491912841797, 13013.985792160034, 20131.924713134766, 74679.72909736633, 41485.70058822632, 18525.157804250717, 43240.56510233879, 18562.973683834076, 35294.13793802261, 23599.590982198715, 14878.639165401459, 12304.429461479187, 24757.756090164185, 33621.68057012558, 31227.11349749565, 25191.58658838272, 22526.37480711937, 22196.561812877655, 33317.84412622452, 10605.10766005516, 34810.15041041374, 30859.22566318512, 28968.491734981537, 21286.288347244263, 39262.84196329117, 28314.443795681, 42270.448961257935, 30593.026000976562, 33026.185182094574, 25443.869068145752, 22734.790906906128, 43093.56635808945, 19829.543261528015, 24095.769151210785, 21436.778837919235, 20725.415511131287, 14743.256524562836, 30620.88307619095, 21898.795553207397, 30647.538189888, 12695.22859454155, 30552.585474014282, 15852.471244812012, 15819.158

100%|██████████| 25/25 [00:05<00:00,  4.36it/s]


Epoch: 03 | Time: 1m 46s
	Train Loss: 3.705 | Train PPL:  40.653
	 Val. Loss: 3.750 |  Val. PPL:  42.531
here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7efe1bf77f90>


  1%|          | 1/125 [00:00<01:29,  1.39it/s]

0


  2%|▏         | 2/125 [00:01<01:49,  1.13it/s]

1


  2%|▏         | 3/125 [00:02<01:44,  1.17it/s]

2


  3%|▎         | 4/125 [00:03<01:44,  1.16it/s]

3


  4%|▍         | 5/125 [00:04<01:37,  1.23it/s]

4


  5%|▍         | 6/125 [00:04<01:30,  1.31it/s]

5


  6%|▌         | 7/125 [00:05<01:32,  1.27it/s]

6


  6%|▋         | 8/125 [00:06<01:29,  1.31it/s]

7


  7%|▋         | 9/125 [00:07<01:26,  1.33it/s]

8


  8%|▊         | 10/125 [00:07<01:24,  1.36it/s]

9


  9%|▉         | 11/125 [00:08<01:24,  1.35it/s]

10


 10%|▉         | 12/125 [00:09<01:26,  1.31it/s]

11


 10%|█         | 13/125 [00:10<01:28,  1.26it/s]

12


 11%|█         | 14/125 [00:10<01:18,  1.42it/s]

13


 12%|█▏        | 15/125 [00:11<01:21,  1.35it/s]

14


 13%|█▎        | 16/125 [00:12<01:15,  1.44it/s]

15


 14%|█▎        | 17/125 [00:12<01:12,  1.49it/s]

16


 14%|█▍        | 18/125 [00:13<01:19,  1.34it/s]

17


 15%|█▌        | 19/125 [00:14<01:21,  1.30it/s]

18


 16%|█▌        | 20/125 [00:15<01:16,  1.38it/s]

19


 17%|█▋        | 21/125 [00:15<01:17,  1.34it/s]

20


 18%|█▊        | 22/125 [00:16<01:19,  1.30it/s]

21


 18%|█▊        | 23/125 [00:17<01:16,  1.34it/s]

22


 19%|█▉        | 24/125 [00:18<01:14,  1.35it/s]

23


 20%|██        | 25/125 [00:19<01:18,  1.28it/s]

24


 21%|██        | 26/125 [00:19<01:17,  1.28it/s]

25


 22%|██▏       | 27/125 [00:20<01:26,  1.13it/s]

26


 22%|██▏       | 28/125 [00:21<01:19,  1.22it/s]

27


 23%|██▎       | 29/125 [00:22<01:12,  1.32it/s]

28


 24%|██▍       | 30/125 [00:23<01:14,  1.27it/s]

29


 25%|██▍       | 31/125 [00:23<01:07,  1.39it/s]

30


 26%|██▌       | 32/125 [00:24<01:11,  1.31it/s]

31


 26%|██▋       | 33/125 [00:25<01:14,  1.24it/s]

32


 27%|██▋       | 34/125 [00:26<01:16,  1.19it/s]

33


 28%|██▊       | 35/125 [00:27<01:11,  1.26it/s]

34


 29%|██▉       | 36/125 [00:27<01:10,  1.25it/s]

35


 30%|██▉       | 37/125 [00:28<01:09,  1.26it/s]

36


 30%|███       | 38/125 [00:29<01:08,  1.28it/s]

37


 31%|███       | 39/125 [00:30<01:10,  1.22it/s]

38


 32%|███▏      | 40/125 [00:30<01:05,  1.30it/s]

39


 33%|███▎      | 41/125 [00:31<01:00,  1.39it/s]

40


 34%|███▎      | 42/125 [00:32<00:58,  1.42it/s]

41


 34%|███▍      | 43/125 [00:33<01:21,  1.00it/s]

42


 35%|███▌      | 44/125 [00:34<01:19,  1.02it/s]

43


 36%|███▌      | 45/125 [00:35<01:16,  1.05it/s]

44


 37%|███▋      | 46/125 [00:36<01:18,  1.00it/s]

45


 38%|███▊      | 47/125 [00:37<01:19,  1.03s/it]

46


 38%|███▊      | 48/125 [00:38<01:11,  1.07it/s]

47


 39%|███▉      | 49/125 [00:39<01:08,  1.10it/s]

48


 40%|████      | 50/125 [00:40<01:07,  1.12it/s]

49


 41%|████      | 51/125 [00:41<01:03,  1.17it/s]

50


 42%|████▏     | 52/125 [00:42<01:12,  1.00it/s]

51


 42%|████▏     | 53/125 [00:43<01:05,  1.10it/s]

52


 43%|████▎     | 54/125 [00:43<00:59,  1.19it/s]

53


 44%|████▍     | 55/125 [00:44<00:54,  1.29it/s]

54


 45%|████▍     | 56/125 [00:45<00:53,  1.30it/s]

55


 46%|████▌     | 57/125 [00:45<00:53,  1.28it/s]

56


 46%|████▋     | 58/125 [00:46<00:55,  1.21it/s]

57


 47%|████▋     | 59/125 [00:47<00:52,  1.26it/s]

58


 48%|████▊     | 60/125 [00:48<00:57,  1.13it/s]

59


 49%|████▉     | 61/125 [00:50<01:05,  1.03s/it]

60


 50%|████▉     | 62/125 [00:51<01:09,  1.11s/it]

61


 50%|█████     | 63/125 [00:52<01:07,  1.09s/it]

62


 51%|█████     | 64/125 [00:53<01:03,  1.04s/it]

63


 52%|█████▏    | 65/125 [00:53<00:54,  1.11it/s]

64


 53%|█████▎    | 66/125 [00:54<00:47,  1.25it/s]

65


 54%|█████▎    | 67/125 [00:55<00:50,  1.16it/s]

66


 54%|█████▍    | 68/125 [00:56<00:49,  1.16it/s]

67


 55%|█████▌    | 69/125 [00:57<00:50,  1.11it/s]

68


 56%|█████▌    | 70/125 [00:58<00:46,  1.18it/s]

69


 57%|█████▋    | 71/125 [00:58<00:43,  1.25it/s]

70


 58%|█████▊    | 72/125 [00:59<00:39,  1.35it/s]

71


 58%|█████▊    | 73/125 [00:59<00:36,  1.44it/s]

72


 59%|█████▉    | 74/125 [01:01<00:41,  1.22it/s]

73


 60%|██████    | 75/125 [01:01<00:39,  1.26it/s]

74


 61%|██████    | 76/125 [01:02<00:35,  1.40it/s]

75


 62%|██████▏   | 77/125 [01:03<00:34,  1.38it/s]

76


 62%|██████▏   | 78/125 [01:04<00:37,  1.24it/s]

77


 63%|██████▎   | 79/125 [01:04<00:34,  1.33it/s]

78


 64%|██████▍   | 80/125 [01:05<00:34,  1.31it/s]

79


 65%|██████▍   | 81/125 [01:06<00:41,  1.06it/s]

80


 66%|██████▌   | 82/125 [01:07<00:40,  1.07it/s]

81


 66%|██████▋   | 83/125 [01:08<00:36,  1.16it/s]

82


 67%|██████▋   | 84/125 [01:09<00:34,  1.18it/s]

83


 68%|██████▊   | 85/125 [01:09<00:31,  1.26it/s]

84


 69%|██████▉   | 86/125 [01:10<00:27,  1.39it/s]

85


 70%|██████▉   | 87/125 [01:11<00:27,  1.37it/s]

86


 70%|███████   | 88/125 [01:11<00:27,  1.36it/s]

87


 71%|███████   | 89/125 [01:12<00:25,  1.39it/s]

88


 72%|███████▏  | 90/125 [01:13<00:27,  1.30it/s]

89


 73%|███████▎  | 91/125 [01:14<00:27,  1.23it/s]

90


 74%|███████▎  | 92/125 [01:15<00:28,  1.16it/s]

91


 74%|███████▍  | 93/125 [01:16<00:25,  1.24it/s]

92


 75%|███████▌  | 94/125 [01:17<00:25,  1.20it/s]

93


 76%|███████▌  | 95/125 [01:17<00:22,  1.36it/s]

94


 77%|███████▋  | 96/125 [01:18<00:21,  1.35it/s]

95


 78%|███████▊  | 97/125 [01:19<00:21,  1.31it/s]

96


 78%|███████▊  | 98/125 [01:19<00:20,  1.30it/s]

97


 79%|███████▉  | 99/125 [01:20<00:18,  1.38it/s]

98


 80%|████████  | 100/125 [01:21<00:18,  1.35it/s]

99


 81%|████████  | 101/125 [01:21<00:15,  1.50it/s]

100


 82%|████████▏ | 102/125 [01:22<00:15,  1.48it/s]

101


 82%|████████▏ | 103/125 [01:23<00:15,  1.43it/s]

102


 83%|████████▎ | 104/125 [01:24<00:15,  1.35it/s]

103


 84%|████████▍ | 105/125 [01:24<00:15,  1.28it/s]

104


 85%|████████▍ | 106/125 [01:25<00:14,  1.34it/s]

105


 86%|████████▌ | 107/125 [01:26<00:13,  1.30it/s]

106


 86%|████████▋ | 108/125 [01:27<00:12,  1.31it/s]

107


 87%|████████▋ | 109/125 [01:27<00:11,  1.40it/s]

108


 88%|████████▊ | 110/125 [01:28<00:11,  1.26it/s]

109


 89%|████████▉ | 111/125 [01:29<00:10,  1.32it/s]

110


 90%|████████▉ | 112/125 [01:30<00:09,  1.31it/s]

111


 90%|█████████ | 113/125 [01:30<00:08,  1.38it/s]

112


 91%|█████████ | 114/125 [01:31<00:07,  1.38it/s]

113


 92%|█████████▏| 115/125 [01:32<00:07,  1.26it/s]

114


 93%|█████████▎| 116/125 [01:32<00:06,  1.48it/s]

115


 94%|█████████▎| 117/125 [01:33<00:05,  1.40it/s]

116


 94%|█████████▍| 118/125 [01:34<00:05,  1.19it/s]

117


 95%|█████████▌| 119/125 [01:35<00:04,  1.24it/s]

118


 96%|█████████▌| 120/125 [01:36<00:04,  1.19it/s]

119


 97%|█████████▋| 121/125 [01:37<00:03,  1.26it/s]

120


 98%|█████████▊| 122/125 [01:38<00:02,  1.17it/s]

121


 98%|█████████▊| 123/125 [01:39<00:01,  1.13it/s]

122


 99%|█████████▉| 124/125 [01:39<00:00,  1.19it/s]

123


100%|██████████| 125/125 [01:40<00:00,  1.24it/s]


124
printloss [23283.235551834106, 29589.752685785294, 24268.27842092514, 25520.35659456253, 18970.85496902466, 14346.785705566406, 28628.834005594254, 22595.988379478455, 20664.97207760811, 20360.440554618835, 23390.562085151672, 27231.693410396576, 31612.98562192917, 8401.602431058884, 27761.405530929565, 14633.807760715485, 14691.119477272034, 32949.3405046463, 27176.261095046997, 16802.527482509613, 25355.9680519104, 30319.81501865387, 20965.36376810074, 21216.141412734985, 29762.72616147995, 22590.785357952118, 38984.82245993614, 16209.017012119293, 16243.98591041565, 28954.81311893463, 12517.419047355652, 27891.262058258057, 30666.38350391388, 35576.77228832245, 18389.028715133667, 25693.904851913452, 24540.95094680786, 26805.65954566002, 25930.336293697357, 17507.460465431213, 14183.106224060059, 15458.279896736145, 68871.49399423599, 30547.10561823845, 23244.451585531235, 40317.18982219696, 54138.466817379, 19711.49135684967, 32019.6473236084, 30877.680629491806, 17868.44228887

100%|██████████| 25/25 [00:05<00:00,  4.25it/s]


Epoch: 04 | Time: 1m 47s
	Train Loss: 3.561 | Train PPL:  35.190
	 Val. Loss: 3.677 |  Val. PPL:  39.524
here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7efe1b5394d0>


  1%|          | 1/125 [00:00<01:38,  1.26it/s]

0


  2%|▏         | 2/125 [00:01<01:44,  1.17it/s]

1


  2%|▏         | 3/125 [00:02<01:49,  1.12it/s]

2


  3%|▎         | 4/125 [00:03<01:36,  1.26it/s]

3


  4%|▍         | 5/125 [00:03<01:30,  1.33it/s]

4


  5%|▍         | 6/125 [00:04<01:22,  1.44it/s]

5


  6%|▌         | 7/125 [00:05<01:13,  1.61it/s]

6


  6%|▋         | 8/125 [00:05<01:14,  1.57it/s]

7


  7%|▋         | 9/125 [00:06<01:13,  1.58it/s]

8


  8%|▊         | 10/125 [00:06<01:12,  1.59it/s]

9


  9%|▉         | 11/125 [00:07<01:21,  1.39it/s]

10


 10%|▉         | 12/125 [00:08<01:27,  1.29it/s]

11


 10%|█         | 13/125 [00:09<01:25,  1.31it/s]

12


 11%|█         | 14/125 [00:10<01:22,  1.34it/s]

13


 12%|█▏        | 15/125 [00:11<01:27,  1.26it/s]

14


 13%|█▎        | 16/125 [00:11<01:27,  1.24it/s]

15


 14%|█▎        | 17/125 [00:12<01:29,  1.20it/s]

16


 14%|█▍        | 18/125 [00:13<01:23,  1.28it/s]

17


 15%|█▌        | 19/125 [00:14<01:26,  1.23it/s]

18


 16%|█▌        | 20/125 [00:15<01:34,  1.11it/s]

19


 17%|█▋        | 21/125 [00:16<01:32,  1.13it/s]

20


 18%|█▊        | 22/125 [00:17<01:32,  1.12it/s]

21


 18%|█▊        | 23/125 [00:18<01:28,  1.16it/s]

22


 19%|█▉        | 24/125 [00:18<01:26,  1.17it/s]

23


 20%|██        | 25/125 [00:19<01:21,  1.23it/s]

24


 21%|██        | 26/125 [00:20<01:15,  1.32it/s]

25


 22%|██▏       | 27/125 [00:21<01:21,  1.20it/s]

26


 22%|██▏       | 28/125 [00:22<01:24,  1.14it/s]

27


 23%|██▎       | 29/125 [00:23<01:26,  1.12it/s]

28


 24%|██▍       | 30/125 [00:24<01:32,  1.03it/s]

29


 25%|██▍       | 31/125 [00:25<01:31,  1.03it/s]

30


 26%|██▌       | 32/125 [00:26<01:27,  1.06it/s]

31


 26%|██▋       | 33/125 [00:26<01:22,  1.12it/s]

32


 27%|██▋       | 34/125 [00:27<01:24,  1.08it/s]

33


 28%|██▊       | 35/125 [00:28<01:16,  1.18it/s]

34


 29%|██▉       | 36/125 [00:29<01:12,  1.23it/s]

35


 30%|██▉       | 37/125 [00:29<01:07,  1.30it/s]

36


 30%|███       | 38/125 [00:30<01:00,  1.45it/s]

37


 31%|███       | 39/125 [00:31<01:01,  1.40it/s]

38


 32%|███▏      | 40/125 [00:32<01:02,  1.35it/s]

39


 33%|███▎      | 41/125 [00:32<01:04,  1.31it/s]

40


 34%|███▎      | 42/125 [00:33<01:01,  1.36it/s]

41


 34%|███▍      | 43/125 [00:34<00:57,  1.44it/s]

42


 35%|███▌      | 44/125 [00:34<00:59,  1.36it/s]

43


 36%|███▌      | 45/125 [00:35<00:59,  1.33it/s]

44


 37%|███▋      | 46/125 [00:36<00:55,  1.43it/s]

45


 38%|███▊      | 47/125 [00:37<00:57,  1.35it/s]

46


 38%|███▊      | 48/125 [00:37<00:57,  1.33it/s]

47


 39%|███▉      | 49/125 [00:38<01:01,  1.24it/s]

48


 40%|████      | 50/125 [00:39<01:02,  1.20it/s]

49


 41%|████      | 51/125 [00:40<00:56,  1.31it/s]

50


 42%|████▏     | 52/125 [00:41<00:55,  1.31it/s]

51


 42%|████▏     | 53/125 [00:41<00:51,  1.39it/s]

52


 43%|████▎     | 54/125 [00:42<00:51,  1.38it/s]

53


 44%|████▍     | 55/125 [00:43<00:51,  1.36it/s]

54


 45%|████▍     | 56/125 [00:44<00:53,  1.29it/s]

55


 46%|████▌     | 57/125 [00:45<00:55,  1.22it/s]

56


 46%|████▋     | 58/125 [00:45<00:52,  1.29it/s]

57


 47%|████▋     | 59/125 [00:46<00:55,  1.19it/s]

58


 48%|████▊     | 60/125 [00:47<00:55,  1.18it/s]

59


 49%|████▉     | 61/125 [00:48<00:51,  1.24it/s]

60


 50%|████▉     | 62/125 [00:49<00:50,  1.24it/s]

61


 50%|█████     | 63/125 [00:49<00:47,  1.30it/s]

62


 51%|█████     | 64/125 [00:50<00:47,  1.28it/s]

63


 52%|█████▏    | 65/125 [00:51<00:48,  1.24it/s]

64


 53%|█████▎    | 66/125 [00:52<00:45,  1.30it/s]

65


 54%|█████▎    | 67/125 [00:52<00:44,  1.29it/s]

66


 54%|█████▍    | 68/125 [00:53<00:48,  1.17it/s]

67


 55%|█████▌    | 69/125 [00:54<00:47,  1.19it/s]

68


 56%|█████▌    | 70/125 [00:55<00:46,  1.18it/s]

69


 57%|█████▋    | 71/125 [00:56<00:44,  1.21it/s]

70


 58%|█████▊    | 72/125 [00:57<00:43,  1.23it/s]

71


 58%|█████▊    | 73/125 [00:57<00:38,  1.36it/s]

72


 59%|█████▉    | 74/125 [00:58<00:38,  1.34it/s]

73


 60%|██████    | 75/125 [00:59<00:36,  1.36it/s]

74


 61%|██████    | 76/125 [00:59<00:34,  1.42it/s]

75


 62%|██████▏   | 77/125 [01:00<00:34,  1.41it/s]

76


 62%|██████▏   | 78/125 [01:01<00:38,  1.21it/s]

77


 63%|██████▎   | 79/125 [01:02<00:34,  1.34it/s]

78


 64%|██████▍   | 80/125 [01:03<00:37,  1.22it/s]

79


 65%|██████▍   | 81/125 [01:04<00:37,  1.19it/s]

80


 66%|██████▌   | 82/125 [01:04<00:31,  1.35it/s]

81


 66%|██████▋   | 83/125 [01:05<00:34,  1.23it/s]

82


 67%|██████▋   | 84/125 [01:06<00:33,  1.22it/s]

83


 68%|██████▊   | 85/125 [01:07<00:29,  1.37it/s]

84


 69%|██████▉   | 86/125 [01:07<00:28,  1.36it/s]

85


 70%|██████▉   | 87/125 [01:08<00:26,  1.45it/s]

86


 70%|███████   | 88/125 [01:08<00:24,  1.48it/s]

87


 71%|███████   | 89/125 [01:09<00:26,  1.37it/s]

88


 72%|███████▏  | 90/125 [01:10<00:24,  1.44it/s]

89


 73%|███████▎  | 91/125 [01:11<00:25,  1.31it/s]

90


 74%|███████▎  | 92/125 [01:12<00:25,  1.28it/s]

91


 74%|███████▍  | 93/125 [01:13<00:26,  1.20it/s]

92


 75%|███████▌  | 94/125 [01:14<00:32,  1.04s/it]

93


 76%|███████▌  | 95/125 [01:15<00:29,  1.01it/s]

94


 77%|███████▋  | 96/125 [01:16<00:26,  1.11it/s]

95


 78%|███████▊  | 97/125 [01:16<00:22,  1.26it/s]

96


 78%|███████▊  | 98/125 [01:17<00:22,  1.20it/s]

97


 79%|███████▉  | 99/125 [01:19<00:28,  1.09s/it]

98


 80%|████████  | 100/125 [01:20<00:24,  1.01it/s]

99


 81%|████████  | 101/125 [01:20<00:22,  1.06it/s]

100


 82%|████████▏ | 102/125 [01:22<00:23,  1.01s/it]

101


 82%|████████▏ | 103/125 [01:23<00:25,  1.14s/it]

102


 83%|████████▎ | 104/125 [01:24<00:23,  1.11s/it]

103


 84%|████████▍ | 105/125 [01:25<00:22,  1.11s/it]

104


 85%|████████▍ | 106/125 [01:26<00:19,  1.01s/it]

105


 86%|████████▌ | 107/125 [01:27<00:16,  1.07it/s]

106


 86%|████████▋ | 108/125 [01:28<00:15,  1.10it/s]

107


 87%|████████▋ | 109/125 [01:29<00:14,  1.10it/s]

108


 88%|████████▊ | 110/125 [01:29<00:12,  1.21it/s]

109


 89%|████████▉ | 111/125 [01:30<00:12,  1.11it/s]

110


 90%|████████▉ | 112/125 [01:31<00:10,  1.23it/s]

111


 90%|█████████ | 113/125 [01:32<00:10,  1.19it/s]

112


 91%|█████████ | 114/125 [01:33<00:10,  1.01it/s]

113


 92%|█████████▏| 115/125 [01:34<00:09,  1.10it/s]

114


 93%|█████████▎| 116/125 [01:35<00:07,  1.14it/s]

115


 94%|█████████▎| 117/125 [01:36<00:07,  1.14it/s]

116


 94%|█████████▍| 118/125 [01:36<00:05,  1.22it/s]

117


 95%|█████████▌| 119/125 [01:37<00:04,  1.31it/s]

118


 96%|█████████▌| 120/125 [01:38<00:03,  1.28it/s]

119


 97%|█████████▋| 121/125 [01:38<00:03,  1.29it/s]

120


 98%|█████████▊| 122/125 [01:39<00:02,  1.33it/s]

121


 98%|█████████▊| 123/125 [01:40<00:01,  1.35it/s]

122


 99%|█████████▉| 124/125 [01:40<00:00,  1.58it/s]

123


100%|██████████| 125/125 [01:41<00:00,  1.23it/s]


124
printloss [23946.64365005493, 28099.210402965546, 31465.705114603043, 16899.59093928337, 19035.439305067062, 13895.85758972168, 9185.494194746017, 15657.79808974266, 13717.982485294342, 15147.957355499268, 26424.46057653427, 41431.839322566986, 23512.457335472107, 19549.225243330002, 29400.120200157166, 27244.857516288757, 26762.803444862366, 19799.682600975037, 30595.940391540527, 39132.33960413933, 26263.920011520386, 34365.95945549011, 17150.864629745483, 24547.04140472412, 19961.4720287323, 14277.099900245667, 32537.135642766953, 40011.457797288895, 28171.14630174637, 53382.29087471962, 29191.114277601242, 28399.969077587128, 22793.821383953094, 39636.9980134964, 14222.458661556244, 22629.633893013, 17421.059277057648, 10162.505598068237, 24160.81296157837, 23486.16611480713, 25701.629943847656, 19416.51820707321, 14681.343266963959, 27314.369387626648, 25678.318648338318, 14148.354683876038, 27832.855525493622, 22138.27694129944, 28810.14363837242, 30426.61593389511, 13463.179

100%|██████████| 25/25 [00:06<00:00,  4.14it/s]


Epoch: 05 | Time: 1m 47s
	Train Loss: 3.441 | Train PPL:  31.234
	 Val. Loss: 3.665 |  Val. PPL:  39.050


In [None]:
SRC = Input
TRG = Output

In [None]:
model.load_state_dict(torch.load('./model_8heads.pt'))

<All keys matched successfully>

In [None]:
test_dataset

DatasetDict({
    train: Dataset({
        features: ['intent', 'rewritten_intent', 'snippet', 'question_id'],
        num_rows: 500
    })
})

In [None]:
test_inputs,reference_output=creating_features(test_dataset)
hypothesis_list=[]
for i,input in enumerate(test_inputs):
  input=input.split(" ")
  translation, attention = translate_sentence(input, SRC, TRG, model, device)
  hypothesis_list.append(untokenize(translation[:-1]).decode('utf-8'))
  print(i)

bleu = evaluate_bleu(reference_output, hypothesis_list)
print(bleu)

['send', 'a', 'signal', '`signal.SIGUSR1`', 'to', 'the', 'current', 'process']
0
['decode', 'a', 'hex', 'string', "'4a4b4c'", 'to', 'UTF-8.']
1
['check', 'if', 'all', 'elements', 'in', 'list', '`myList`', 'are', 'identical']
2
['format', 'number', 'of', 'spaces', 'between', 'strings', '`Python`,', '`:`', 'and', '`Very', 'Good`', 'to', 'be', '`20`']
3
['How', 'to', 'convert', 'a', 'string', 'from', 'CP-1251', 'to', 'UTF-8?']
4
['get', 'rid', 'of', 'None', 'values', 'in', 'dictionary', '`kwargs`']
5
['get', 'rid', 'of', 'None', 'values', 'in', 'dictionary', '`kwargs`']
6
['capture', 'final', 'output', 'of', 'a', 'chain', 'of', 'system', 'commands', '`ps', '-ef', '|', 'grep', 'something', '|', 'wc', '-l`']
7
['concatenate', 'a', 'list', 'of', 'strings', "`['a',", "'b',", "'c']`"]
8
['find', 'intersection', 'data', 'between', 'series', '`s1`', 'and', 'series', '`s2`']
9
['sending', 'http', 'headers', 'to', '`client`']
10
['Format', 'a', 'datetime', 'string', '`when`', 'to', 'extract', 'dat

***Implementation with 6 layers and 16 heads***

In [None]:
INPUT_DIM = len(Input.vocab)
OUTPUT_DIM = len(Output.vocab)
HID_DIM = 256
ENC_LAYERS = 6
DEC_LAYERS = 6
ENC_HEADS = 16
DEC_HEADS = 16
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [None]:
N_EPOCHS = 5
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_example = []
    val_example = []

    for i in range(train_df.shape[0]):
        try:
            ex = data.Example.fromlist([train_df.Text[i], train_df.Code[i]], fields)
            train_example.append(ex)
        except:
            pass

    for i in range(val_df.shape[0]):
        try:
            ex = data.Example.fromlist([val_df.Text[i], val_df.Code[i]], fields)
            val_example.append(ex)
        except:
            pass       

    train_data = data.Dataset(train_example, fields)
    valid_data =  data.Dataset(val_example, fields)

    BATCH_SIZE = 16
    train_iterator, valid_iterator = BucketIterator.splits((train_data, valid_data), batch_size = BATCH_SIZE, 
                                                                sort_key = lambda x: len(x.Input),
                                                                sort_within_batch=True, device = device)

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '/content/model_16layers.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7f872beb7fd0>


  1%|          | 1/125 [00:00<01:42,  1.21it/s]

0


  2%|▏         | 2/125 [00:01<01:35,  1.29it/s]

1


  2%|▏         | 3/125 [00:02<01:22,  1.47it/s]

2


  3%|▎         | 4/125 [00:02<01:22,  1.46it/s]

3


  4%|▍         | 5/125 [00:04<01:59,  1.01it/s]

4


  5%|▍         | 6/125 [00:05<01:50,  1.07it/s]

5


  6%|▌         | 7/125 [00:06<01:50,  1.07it/s]

6


  6%|▋         | 8/125 [00:07<01:49,  1.07it/s]

7


  7%|▋         | 9/125 [00:07<01:47,  1.08it/s]

8


  8%|▊         | 10/125 [00:08<01:38,  1.17it/s]

9


  9%|▉         | 11/125 [00:09<01:31,  1.25it/s]

10


 10%|▉         | 12/125 [00:10<01:30,  1.24it/s]

11


 10%|█         | 13/125 [00:10<01:28,  1.27it/s]

12


 11%|█         | 14/125 [00:11<01:22,  1.34it/s]

13


 12%|█▏        | 15/125 [00:12<01:21,  1.36it/s]

14


 13%|█▎        | 16/125 [00:13<01:23,  1.30it/s]

15


 14%|█▎        | 17/125 [00:14<01:30,  1.19it/s]

16


 14%|█▍        | 18/125 [00:15<01:31,  1.17it/s]

17


 15%|█▌        | 19/125 [00:15<01:26,  1.22it/s]

18


 16%|█▌        | 20/125 [00:16<01:28,  1.19it/s]

19


 17%|█▋        | 21/125 [00:17<01:27,  1.19it/s]

20


 18%|█▊        | 22/125 [00:18<01:24,  1.22it/s]

21


 18%|█▊        | 23/125 [00:19<01:27,  1.16it/s]

22


 19%|█▉        | 24/125 [00:19<01:21,  1.23it/s]

23


 20%|██        | 25/125 [00:20<01:27,  1.14it/s]

24


 21%|██        | 26/125 [00:21<01:26,  1.14it/s]

25


 22%|██▏       | 27/125 [00:22<01:20,  1.21it/s]

26


 22%|██▏       | 28/125 [00:23<01:16,  1.27it/s]

27


 23%|██▎       | 29/125 [00:23<01:10,  1.35it/s]

28


 24%|██▍       | 30/125 [00:24<01:08,  1.39it/s]

29


 25%|██▍       | 31/125 [00:25<01:13,  1.27it/s]

30


 26%|██▌       | 32/125 [00:26<01:11,  1.31it/s]

31


 26%|██▋       | 33/125 [00:27<01:14,  1.23it/s]

32


 27%|██▋       | 34/125 [00:27<01:08,  1.32it/s]

33


 28%|██▊       | 35/125 [00:28<01:04,  1.40it/s]

34


 29%|██▉       | 36/125 [00:28<00:58,  1.52it/s]

35


 30%|██▉       | 37/125 [00:29<00:59,  1.48it/s]

36


 30%|███       | 38/125 [00:30<00:58,  1.49it/s]

37


 31%|███       | 39/125 [00:30<00:53,  1.60it/s]

38


 32%|███▏      | 40/125 [00:31<00:57,  1.48it/s]

39


 33%|███▎      | 41/125 [00:32<01:06,  1.27it/s]

40


 34%|███▎      | 42/125 [00:33<01:02,  1.33it/s]

41


 34%|███▍      | 43/125 [00:34<01:02,  1.30it/s]

42


 35%|███▌      | 44/125 [00:34<00:59,  1.36it/s]

43


 36%|███▌      | 45/125 [00:35<01:01,  1.29it/s]

44


 37%|███▋      | 46/125 [00:36<01:03,  1.25it/s]

45


 38%|███▊      | 47/125 [00:37<01:00,  1.28it/s]

46


 38%|███▊      | 48/125 [00:37<00:58,  1.31it/s]

47


 39%|███▉      | 49/125 [00:38<00:53,  1.43it/s]

48


 40%|████      | 50/125 [00:39<00:54,  1.37it/s]

49


 41%|████      | 51/125 [00:40<00:58,  1.27it/s]

50


 42%|████▏     | 52/125 [00:40<00:57,  1.27it/s]

51


 42%|████▏     | 53/125 [00:42<01:05,  1.11it/s]

52


 43%|████▎     | 54/125 [00:43<01:22,  1.16s/it]

53


 44%|████▍     | 55/125 [00:44<01:10,  1.01s/it]

54


 45%|████▍     | 56/125 [00:45<01:07,  1.03it/s]

55


 46%|████▌     | 57/125 [00:46<01:01,  1.11it/s]

56


 46%|████▋     | 58/125 [00:47<00:58,  1.14it/s]

57


 47%|████▋     | 59/125 [00:47<00:49,  1.34it/s]

58


 48%|████▊     | 60/125 [00:48<00:50,  1.30it/s]

59


 49%|████▉     | 61/125 [00:49<00:50,  1.27it/s]

60


 50%|████▉     | 62/125 [00:49<00:47,  1.31it/s]

61


 50%|█████     | 63/125 [00:50<00:52,  1.19it/s]

62


 51%|█████     | 64/125 [00:51<00:52,  1.17it/s]

63


 52%|█████▏    | 65/125 [00:52<00:49,  1.21it/s]

64


 53%|█████▎    | 66/125 [00:53<00:43,  1.35it/s]

65


 54%|█████▎    | 67/125 [00:53<00:42,  1.37it/s]

66


 54%|█████▍    | 68/125 [00:54<00:41,  1.38it/s]

67


 55%|█████▌    | 69/125 [00:55<00:38,  1.47it/s]

68


 56%|█████▌    | 70/125 [00:55<00:35,  1.54it/s]

69


 57%|█████▋    | 71/125 [00:56<00:40,  1.35it/s]

70


 58%|█████▊    | 72/125 [00:57<00:36,  1.46it/s]

71


 58%|█████▊    | 73/125 [00:57<00:37,  1.39it/s]

72


 59%|█████▉    | 74/125 [00:59<00:46,  1.10it/s]

73


 60%|██████    | 75/125 [01:00<00:45,  1.10it/s]

74


 61%|██████    | 76/125 [01:00<00:40,  1.20it/s]

75


 62%|██████▏   | 77/125 [01:01<00:38,  1.24it/s]

76


 62%|██████▏   | 78/125 [01:02<00:36,  1.28it/s]

77


 63%|██████▎   | 79/125 [01:02<00:32,  1.40it/s]

78


 64%|██████▍   | 80/125 [01:03<00:31,  1.43it/s]

79


 65%|██████▍   | 81/125 [01:04<00:29,  1.48it/s]

80


 66%|██████▌   | 82/125 [01:04<00:31,  1.38it/s]

81


 66%|██████▋   | 83/125 [01:05<00:31,  1.34it/s]

82


 67%|██████▋   | 84/125 [01:06<00:34,  1.18it/s]

83


 68%|██████▊   | 85/125 [01:07<00:36,  1.09it/s]

84


 69%|██████▉   | 86/125 [01:09<00:41,  1.07s/it]

85


 70%|██████▉   | 87/125 [01:10<00:39,  1.05s/it]

86


 70%|███████   | 88/125 [01:11<00:36,  1.01it/s]

87


 71%|███████   | 89/125 [01:11<00:33,  1.08it/s]

88


 72%|███████▏  | 90/125 [01:12<00:32,  1.08it/s]

89


 73%|███████▎  | 91/125 [01:14<00:33,  1.02it/s]

90


 74%|███████▎  | 92/125 [01:14<00:29,  1.13it/s]

91


 74%|███████▍  | 93/125 [01:15<00:27,  1.16it/s]

92


 75%|███████▌  | 94/125 [01:16<00:25,  1.24it/s]

93


 76%|███████▌  | 95/125 [01:17<00:24,  1.21it/s]

94


 77%|███████▋  | 96/125 [01:17<00:23,  1.23it/s]

95


 78%|███████▊  | 97/125 [01:18<00:22,  1.25it/s]

96


 78%|███████▊  | 98/125 [01:19<00:20,  1.30it/s]

97


 79%|███████▉  | 99/125 [01:20<00:20,  1.24it/s]

98


 80%|████████  | 100/125 [01:20<00:19,  1.27it/s]

99


 81%|████████  | 101/125 [01:21<00:17,  1.34it/s]

100


 82%|████████▏ | 102/125 [01:22<00:19,  1.18it/s]

101


 82%|████████▏ | 103/125 [01:23<00:19,  1.15it/s]

102


 83%|████████▎ | 104/125 [01:24<00:16,  1.27it/s]

103


 84%|████████▍ | 105/125 [01:25<00:17,  1.17it/s]

104


 85%|████████▍ | 106/125 [01:25<00:15,  1.24it/s]

105


 86%|████████▌ | 107/125 [01:26<00:14,  1.27it/s]

106


 86%|████████▋ | 108/125 [01:27<00:12,  1.34it/s]

107


 87%|████████▋ | 109/125 [01:28<00:12,  1.27it/s]

108


 88%|████████▊ | 110/125 [01:28<00:10,  1.36it/s]

109


 89%|████████▉ | 111/125 [01:29<00:09,  1.54it/s]

110


 90%|████████▉ | 112/125 [01:29<00:08,  1.50it/s]

111


 90%|█████████ | 113/125 [01:30<00:08,  1.48it/s]

112


 91%|█████████ | 114/125 [01:32<00:10,  1.03it/s]

113


 92%|█████████▏| 115/125 [01:33<00:10,  1.04s/it]

114


 93%|█████████▎| 116/125 [01:34<00:09,  1.02s/it]

115


 94%|█████████▎| 117/125 [01:35<00:08,  1.03s/it]

116


 94%|█████████▍| 118/125 [01:36<00:06,  1.13it/s]

117


 95%|█████████▌| 119/125 [01:36<00:05,  1.10it/s]

118


 96%|█████████▌| 120/125 [01:37<00:04,  1.19it/s]

119


 97%|█████████▋| 121/125 [01:38<00:03,  1.26it/s]

120


 98%|█████████▊| 122/125 [01:39<00:02,  1.26it/s]

121


 98%|█████████▊| 123/125 [01:40<00:01,  1.24it/s]

122


 99%|█████████▉| 124/125 [01:40<00:00,  1.19it/s]

123


100%|██████████| 125/125 [01:41<00:00,  1.23it/s]


124
printloss [40904.578899383545, 42347.01530456543, 22137.107243061066, 35087.17915534973, 114304.84547615051, 45904.47960281372, 48062.55200386047, 61273.031081199646, 54273.55026721954, 35678.57342147827, 33544.7134103775, 43458.71329879761, 35148.55373764038, 30265.33984708786, 38213.74514055252, 42221.143005371094, 49625.31734466553, 45003.4561419487, 38841.10206604004, 48152.059885025024, 41977.49567985535, 37207.24394607544, 40996.95420837402, 31466.850737571716, 61558.68852138519, 41268.52576160431, 31327.07746744156, 33629.408895492554, 23483.076559066772, 29324.390127182007, 43713.0504693985, 22811.706840515137, 42535.88144302368, 26531.75216293335, 22611.48914051056, 19090.439474105835, 36871.22318172455, 27624.11672925949, 14612.41312789917, 40668.15919017792, 70889.13860177994, 30997.19482898712, 41156.582329273224, 29310.31342124939, 53177.30997991562, 51940.88317298889, 32932.48589515686, 31337.528601646423, 19081.147810935974, 31023.737382888794, 48195.31374883652, 345

100%|██████████| 25/25 [00:05<00:00,  4.17it/s]


Epoch: 01 | Time: 1m 48s
	Train Loss: 5.566 | Train PPL: 261.345
	 Val. Loss: 5.268 |  Val. PPL: 193.971
here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7f872c887f90>


  1%|          | 1/125 [00:00<01:39,  1.25it/s]

0


  2%|▏         | 2/125 [00:01<01:38,  1.25it/s]

1


  2%|▏         | 3/125 [00:02<01:43,  1.17it/s]

2


  3%|▎         | 4/125 [00:03<01:28,  1.37it/s]

3


  4%|▍         | 5/125 [00:03<01:34,  1.27it/s]

4


  5%|▍         | 6/125 [00:04<01:30,  1.31it/s]

5


  6%|▌         | 7/125 [00:05<01:33,  1.27it/s]

6


  6%|▋         | 8/125 [00:06<01:29,  1.31it/s]

7


  7%|▋         | 9/125 [00:07<01:33,  1.25it/s]

8


  8%|▊         | 10/125 [00:07<01:24,  1.37it/s]

9


  9%|▉         | 11/125 [00:08<01:19,  1.44it/s]

10


 10%|▉         | 12/125 [00:09<01:25,  1.32it/s]

11


 10%|█         | 13/125 [00:09<01:25,  1.32it/s]

12


 11%|█         | 14/125 [00:11<01:34,  1.18it/s]

13


 12%|█▏        | 15/125 [00:11<01:27,  1.25it/s]

14


 13%|█▎        | 16/125 [00:12<01:24,  1.29it/s]

15


 14%|█▎        | 17/125 [00:13<01:23,  1.29it/s]

16


 14%|█▍        | 18/125 [00:14<01:26,  1.24it/s]

17


 15%|█▌        | 19/125 [00:14<01:26,  1.23it/s]

18


 16%|█▌        | 20/125 [00:15<01:19,  1.32it/s]

19


 17%|█▋        | 21/125 [00:16<01:15,  1.37it/s]

20


 18%|█▊        | 22/125 [00:16<01:14,  1.39it/s]

21


 18%|█▊        | 23/125 [00:17<01:14,  1.37it/s]

22


 19%|█▉        | 24/125 [00:19<01:33,  1.09it/s]

23


 20%|██        | 25/125 [00:19<01:26,  1.16it/s]

24


 21%|██        | 26/125 [00:20<01:17,  1.28it/s]

25


 22%|██▏       | 27/125 [00:21<01:15,  1.30it/s]

26


 22%|██▏       | 28/125 [00:22<01:26,  1.12it/s]

27


 23%|██▎       | 29/125 [00:23<01:24,  1.14it/s]

28


 24%|██▍       | 30/125 [00:23<01:13,  1.29it/s]

29


 25%|██▍       | 31/125 [00:24<01:17,  1.21it/s]

30


 26%|██▌       | 32/125 [00:25<01:09,  1.34it/s]

31


 26%|██▋       | 33/125 [00:25<01:10,  1.31it/s]

32


 27%|██▋       | 34/125 [00:27<01:39,  1.09s/it]

33


 28%|██▊       | 35/125 [00:28<01:27,  1.03it/s]

34


 29%|██▉       | 36/125 [00:29<01:17,  1.16it/s]

35


 30%|██▉       | 37/125 [00:29<01:15,  1.16it/s]

36


 30%|███       | 38/125 [00:30<01:09,  1.25it/s]

37


 31%|███       | 39/125 [00:31<01:09,  1.24it/s]

38


 32%|███▏      | 40/125 [00:32<01:11,  1.19it/s]

39


 33%|███▎      | 41/125 [00:33<01:12,  1.16it/s]

40


 34%|███▎      | 42/125 [00:33<01:07,  1.23it/s]

41


 34%|███▍      | 43/125 [00:34<01:09,  1.19it/s]

42


 35%|███▌      | 44/125 [00:35<01:05,  1.24it/s]

43


 36%|███▌      | 45/125 [00:36<01:03,  1.25it/s]

44


 37%|███▋      | 46/125 [00:37<01:06,  1.18it/s]

45


 38%|███▊      | 47/125 [00:38<01:03,  1.24it/s]

46


 38%|███▊      | 48/125 [00:38<01:03,  1.22it/s]

47


 39%|███▉      | 49/125 [00:39<01:04,  1.18it/s]

48


 40%|████      | 50/125 [00:40<00:56,  1.32it/s]

49


 41%|████      | 51/125 [00:41<01:07,  1.10it/s]

50


 42%|████▏     | 52/125 [00:42<01:10,  1.04it/s]

51


 42%|████▏     | 53/125 [00:44<01:19,  1.10s/it]

52


 43%|████▎     | 54/125 [00:45<01:18,  1.11s/it]

53


 44%|████▍     | 55/125 [00:46<01:13,  1.05s/it]

54


 45%|████▍     | 56/125 [00:46<01:03,  1.08it/s]

55


 46%|████▌     | 57/125 [00:47<00:57,  1.18it/s]

56


 46%|████▋     | 58/125 [00:48<00:55,  1.20it/s]

57


 47%|████▋     | 59/125 [00:49<00:54,  1.20it/s]

58


 48%|████▊     | 60/125 [00:49<00:53,  1.20it/s]

59


 49%|████▉     | 61/125 [00:50<00:49,  1.28it/s]

60


 50%|████▉     | 62/125 [00:51<00:52,  1.19it/s]

61


 50%|█████     | 63/125 [00:52<00:49,  1.25it/s]

62


 51%|█████     | 64/125 [00:53<00:49,  1.22it/s]

63


 52%|█████▏    | 65/125 [00:53<00:49,  1.22it/s]

64


 53%|█████▎    | 66/125 [00:54<00:47,  1.25it/s]

65


 54%|█████▎    | 67/125 [00:55<00:45,  1.28it/s]

66


 54%|█████▍    | 68/125 [00:56<00:41,  1.38it/s]

67


 55%|█████▌    | 69/125 [00:56<00:43,  1.27it/s]

68


 56%|█████▌    | 70/125 [00:58<00:50,  1.09it/s]

69


 57%|█████▋    | 71/125 [00:58<00:44,  1.23it/s]

70


 58%|█████▊    | 72/125 [00:59<00:41,  1.27it/s]

71


 58%|█████▊    | 73/125 [01:00<00:39,  1.31it/s]

72


 59%|█████▉    | 74/125 [01:00<00:39,  1.29it/s]

73


 60%|██████    | 75/125 [01:01<00:40,  1.25it/s]

74


 61%|██████    | 76/125 [01:02<00:43,  1.12it/s]

75


 62%|██████▏   | 77/125 [01:03<00:42,  1.14it/s]

76


 62%|██████▏   | 78/125 [01:04<00:40,  1.17it/s]

77


 63%|██████▎   | 79/125 [01:05<00:37,  1.24it/s]

78


 64%|██████▍   | 80/125 [01:05<00:33,  1.33it/s]

79


 65%|██████▍   | 81/125 [01:06<00:32,  1.36it/s]

80


 66%|██████▌   | 82/125 [01:07<00:32,  1.33it/s]

81


 66%|██████▋   | 83/125 [01:08<00:34,  1.21it/s]

82


 67%|██████▋   | 84/125 [01:09<00:32,  1.26it/s]

83


 68%|██████▊   | 85/125 [01:10<00:34,  1.16it/s]

84


 69%|██████▉   | 86/125 [01:11<00:33,  1.15it/s]

85


 70%|██████▉   | 87/125 [01:11<00:31,  1.19it/s]

86


 70%|███████   | 88/125 [01:12<00:31,  1.16it/s]

87


 71%|███████   | 89/125 [01:13<00:27,  1.31it/s]

88


 72%|███████▏  | 90/125 [01:14<00:26,  1.30it/s]

89


 73%|███████▎  | 91/125 [01:14<00:26,  1.30it/s]

90


 74%|███████▎  | 92/125 [01:16<00:34,  1.04s/it]

91


 74%|███████▍  | 93/125 [01:17<00:35,  1.09s/it]

92


 75%|███████▌  | 94/125 [01:19<00:36,  1.17s/it]

93


 76%|███████▌  | 95/125 [01:20<00:39,  1.30s/it]

94


 77%|███████▋  | 96/125 [01:22<00:38,  1.32s/it]

95


 78%|███████▊  | 97/125 [01:23<00:37,  1.35s/it]

96


 78%|███████▊  | 98/125 [01:24<00:35,  1.30s/it]

97


 79%|███████▉  | 99/125 [01:25<00:32,  1.24s/it]

98


 80%|████████  | 100/125 [01:26<00:27,  1.09s/it]

99


 81%|████████  | 101/125 [01:27<00:23,  1.04it/s]

100


 82%|████████▏ | 102/125 [01:27<00:19,  1.17it/s]

101


 82%|████████▏ | 103/125 [01:28<00:17,  1.29it/s]

102


 83%|████████▎ | 104/125 [01:28<00:15,  1.39it/s]

103


 84%|████████▍ | 105/125 [01:29<00:14,  1.34it/s]

104


 85%|████████▍ | 106/125 [01:30<00:13,  1.40it/s]

105


 86%|████████▌ | 107/125 [01:31<00:14,  1.27it/s]

106


 86%|████████▋ | 108/125 [01:32<00:12,  1.32it/s]

107


 87%|████████▋ | 109/125 [01:32<00:12,  1.32it/s]

108


 88%|████████▊ | 110/125 [01:33<00:11,  1.32it/s]

109


 89%|████████▉ | 111/125 [01:34<00:11,  1.17it/s]

110


 90%|████████▉ | 112/125 [01:35<00:11,  1.17it/s]

111


 90%|█████████ | 113/125 [01:36<00:10,  1.19it/s]

112


 91%|█████████ | 114/125 [01:36<00:08,  1.26it/s]

113


 92%|█████████▏| 115/125 [01:37<00:08,  1.23it/s]

114


 93%|█████████▎| 116/125 [01:38<00:07,  1.25it/s]

115


 94%|█████████▎| 117/125 [01:39<00:06,  1.29it/s]

116


 94%|█████████▍| 118/125 [01:39<00:04,  1.44it/s]

117


 95%|█████████▌| 119/125 [01:40<00:04,  1.44it/s]

118


 96%|█████████▌| 120/125 [01:41<00:03,  1.39it/s]

119


 97%|█████████▋| 121/125 [01:42<00:03,  1.14it/s]

120


 98%|█████████▊| 122/125 [01:43<00:02,  1.07it/s]

121


 98%|█████████▊| 123/125 [01:44<00:01,  1.08it/s]

122


 99%|█████████▉| 124/125 [01:45<00:00,  1.17it/s]

123


100%|██████████| 125/125 [01:46<00:00,  1.18it/s]


124
printloss [35816.40170764923, 45117.98260116577, 38175.57375621796, 13380.215286254883, 52956.46844244003, 26908.743901729584, 46087.33010387421, 21737.31118440628, 57191.474401474, 15960.385152339935, 22554.54785346985, 46503.9043879509, 24801.333486557007, 68271.26152706146, 24859.189256191254, 29122.877604961395, 37455.54295825958, 43307.665241241455, 35868.48919391632, 21378.378690719604, 25050.071380615234, 29281.101654052734, 25494.991998672485, 99306.43200683594, 32425.27639389038, 19738.868980407715, 35384.90096282959, 81649.44494962692, 41088.61566925049, 18351.82976913452, 44921.07500696182, 13903.3833360672, 28420.950857162476, 96273.948843956, 29319.22979450226, 20127.074313640594, 45202.348828315735, 25367.87509727478, 33551.736850738525, 48766.30334472656, 45140.47791624069, 30746.620245933533, 50404.47748184204, 32488.304379463196, 38240.494627952576, 34914.711181640625, 30871.041778564453, 37443.76403617859, 55494.9164686203, 18194.80389022827, 56471.76707267761, 29

100%|██████████| 25/25 [00:05<00:00,  4.22it/s]


Epoch: 02 | Time: 1m 52s
	Train Loss: 5.176 | Train PPL: 177.040
	 Val. Loss: 5.111 |  Val. PPL: 165.846
here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7f872f5e8c90>


  1%|          | 1/125 [00:00<01:20,  1.55it/s]

0


  2%|▏         | 2/125 [00:01<01:24,  1.46it/s]

1


  2%|▏         | 3/125 [00:02<01:22,  1.47it/s]

2


  3%|▎         | 4/125 [00:02<01:19,  1.53it/s]

3


  4%|▍         | 5/125 [00:03<01:25,  1.41it/s]

4


  5%|▍         | 6/125 [00:04<01:32,  1.28it/s]

5


  6%|▌         | 7/125 [00:04<01:23,  1.41it/s]

6


  6%|▋         | 8/125 [00:05<01:21,  1.43it/s]

7


  7%|▋         | 9/125 [00:06<01:15,  1.53it/s]

8


  8%|▊         | 10/125 [00:06<01:15,  1.52it/s]

9


  9%|▉         | 11/125 [00:08<01:54,  1.00s/it]

10


 10%|▉         | 12/125 [00:09<01:55,  1.03s/it]

11


 10%|█         | 13/125 [00:10<01:41,  1.10it/s]

12


 11%|█         | 14/125 [00:11<01:44,  1.06it/s]

13


 12%|█▏        | 15/125 [00:12<01:35,  1.15it/s]

14


 13%|█▎        | 16/125 [00:12<01:35,  1.14it/s]

15


 14%|█▎        | 17/125 [00:13<01:35,  1.13it/s]

16


 14%|█▍        | 18/125 [00:14<01:26,  1.24it/s]

17


 15%|█▌        | 19/125 [00:15<01:17,  1.37it/s]

18


 16%|█▌        | 20/125 [00:15<01:15,  1.39it/s]

19


 17%|█▋        | 21/125 [00:16<01:20,  1.29it/s]

20


 18%|█▊        | 22/125 [00:17<01:20,  1.27it/s]

21


 18%|█▊        | 23/125 [00:18<01:22,  1.24it/s]

22


 19%|█▉        | 24/125 [00:19<01:19,  1.27it/s]

23


 20%|██        | 25/125 [00:19<01:18,  1.27it/s]

24


 21%|██        | 26/125 [00:20<01:17,  1.27it/s]

25


 22%|██▏       | 27/125 [00:21<01:10,  1.40it/s]

26


 22%|██▏       | 28/125 [00:22<01:18,  1.24it/s]

27


 23%|██▎       | 29/125 [00:23<01:18,  1.22it/s]

28


 24%|██▍       | 30/125 [00:23<01:18,  1.22it/s]

29


 25%|██▍       | 31/125 [00:24<01:12,  1.29it/s]

30


 26%|██▌       | 32/125 [00:25<01:20,  1.16it/s]

31


 26%|██▋       | 33/125 [00:26<01:16,  1.20it/s]

32


 27%|██▋       | 34/125 [00:27<01:17,  1.18it/s]

33


 28%|██▊       | 35/125 [00:28<01:15,  1.19it/s]

34


 29%|██▉       | 36/125 [00:28<01:15,  1.18it/s]

35


 30%|██▉       | 37/125 [00:29<01:12,  1.22it/s]

36


 30%|███       | 38/125 [00:30<01:09,  1.25it/s]

37


 31%|███       | 39/125 [00:31<01:19,  1.08it/s]

38


 32%|███▏      | 40/125 [00:32<01:12,  1.18it/s]

39


 33%|███▎      | 41/125 [00:33<01:07,  1.24it/s]

40


 34%|███▎      | 42/125 [00:33<01:03,  1.31it/s]

41


 34%|███▍      | 43/125 [00:34<01:00,  1.35it/s]

42


 35%|███▌      | 44/125 [00:35<00:57,  1.42it/s]

43


 36%|███▌      | 45/125 [00:36<01:03,  1.25it/s]

44


 37%|███▋      | 46/125 [00:37<01:09,  1.13it/s]

45


 38%|███▊      | 47/125 [00:38<01:20,  1.03s/it]

46


 38%|███▊      | 48/125 [00:39<01:08,  1.12it/s]

47


 39%|███▉      | 49/125 [00:39<01:06,  1.14it/s]

48


 40%|████      | 50/125 [00:40<01:00,  1.24it/s]

49


 41%|████      | 51/125 [00:41<00:55,  1.34it/s]

50


 42%|████▏     | 52/125 [00:42<00:58,  1.25it/s]

51


 42%|████▏     | 53/125 [00:42<00:52,  1.38it/s]

52


 43%|████▎     | 54/125 [00:43<00:57,  1.23it/s]

53


 44%|████▍     | 55/125 [00:44<00:55,  1.27it/s]

54


 45%|████▍     | 56/125 [00:45<00:52,  1.30it/s]

55


 46%|████▌     | 57/125 [00:45<00:48,  1.41it/s]

56


 46%|████▋     | 58/125 [00:46<00:48,  1.37it/s]

57


 47%|████▋     | 59/125 [00:47<00:47,  1.40it/s]

58


 48%|████▊     | 60/125 [00:47<00:44,  1.45it/s]

59


 49%|████▉     | 61/125 [00:48<00:46,  1.37it/s]

60


 50%|████▉     | 62/125 [00:49<00:45,  1.38it/s]

61


 50%|█████     | 63/125 [00:49<00:43,  1.44it/s]

62


 51%|█████     | 64/125 [00:50<00:46,  1.32it/s]

63


 52%|█████▏    | 65/125 [00:51<00:42,  1.40it/s]

64


 53%|█████▎    | 66/125 [00:52<00:45,  1.29it/s]

65


 54%|█████▎    | 67/125 [00:53<00:44,  1.30it/s]

66


 54%|█████▍    | 68/125 [00:54<00:53,  1.06it/s]

67


 55%|█████▌    | 69/125 [00:55<00:48,  1.15it/s]

68


 56%|█████▌    | 70/125 [00:56<00:52,  1.04it/s]

69


 57%|█████▋    | 71/125 [00:57<00:49,  1.09it/s]

70


 58%|█████▊    | 72/125 [00:57<00:47,  1.11it/s]

71


 58%|█████▊    | 73/125 [00:58<00:45,  1.15it/s]

72


 59%|█████▉    | 74/125 [00:59<00:42,  1.20it/s]

73


 60%|██████    | 75/125 [01:00<00:45,  1.10it/s]

74


 61%|██████    | 76/125 [01:01<00:42,  1.15it/s]

75


 62%|██████▏   | 77/125 [01:02<00:42,  1.13it/s]

76


 62%|██████▏   | 78/125 [01:03<00:41,  1.13it/s]

77


 63%|██████▎   | 79/125 [01:04<00:41,  1.11it/s]

78


 64%|██████▍   | 80/125 [01:05<00:41,  1.09it/s]

79


 65%|██████▍   | 81/125 [01:06<00:46,  1.07s/it]

80


 66%|██████▌   | 82/125 [01:07<00:49,  1.14s/it]

81


 66%|██████▋   | 83/125 [01:09<00:48,  1.16s/it]

82


 67%|██████▋   | 84/125 [01:09<00:44,  1.08s/it]

83


 68%|██████▊   | 85/125 [01:10<00:41,  1.03s/it]

84


 69%|██████▉   | 86/125 [01:11<00:36,  1.08it/s]

85


 70%|██████▉   | 87/125 [01:12<00:33,  1.13it/s]

86


 70%|███████   | 88/125 [01:13<00:33,  1.11it/s]

87


 71%|███████   | 89/125 [01:14<00:31,  1.14it/s]

88


 72%|███████▏  | 90/125 [01:14<00:29,  1.19it/s]

89


 73%|███████▎  | 91/125 [01:15<00:25,  1.31it/s]

90


 74%|███████▎  | 92/125 [01:16<00:25,  1.29it/s]

91


 74%|███████▍  | 93/125 [01:16<00:24,  1.29it/s]

92


 75%|███████▌  | 94/125 [01:17<00:25,  1.21it/s]

93


 76%|███████▌  | 95/125 [01:18<00:23,  1.28it/s]

94


 77%|███████▋  | 96/125 [01:19<00:21,  1.32it/s]

95


 78%|███████▊  | 97/125 [01:20<00:20,  1.35it/s]

96


 78%|███████▊  | 98/125 [01:20<00:19,  1.36it/s]

97


 79%|███████▉  | 99/125 [01:21<00:18,  1.41it/s]

98


 80%|████████  | 100/125 [01:22<00:18,  1.32it/s]

99


 81%|████████  | 101/125 [01:23<00:18,  1.27it/s]

100


 82%|████████▏ | 102/125 [01:24<00:18,  1.22it/s]

101


 82%|████████▏ | 103/125 [01:24<00:17,  1.26it/s]

102


 83%|████████▎ | 104/125 [01:25<00:17,  1.17it/s]

103


 84%|████████▍ | 105/125 [01:26<00:15,  1.25it/s]

104


 85%|████████▍ | 106/125 [01:27<00:14,  1.33it/s]

105


 86%|████████▌ | 107/125 [01:28<00:15,  1.17it/s]

106


 86%|████████▋ | 108/125 [01:28<00:13,  1.25it/s]

107


 87%|████████▋ | 109/125 [01:29<00:12,  1.31it/s]

108


 88%|████████▊ | 110/125 [01:30<00:13,  1.12it/s]

109


 89%|████████▉ | 111/125 [01:31<00:12,  1.08it/s]

110


 90%|████████▉ | 112/125 [01:32<00:11,  1.13it/s]

111


 90%|█████████ | 113/125 [01:33<00:09,  1.26it/s]

112


 91%|█████████ | 114/125 [01:33<00:08,  1.32it/s]

113


 92%|█████████▏| 115/125 [01:34<00:07,  1.25it/s]

114


 93%|█████████▎| 116/125 [01:35<00:07,  1.22it/s]

115


 94%|█████████▎| 117/125 [01:37<00:08,  1.11s/it]

116


 94%|█████████▍| 118/125 [01:38<00:07,  1.03s/it]

117


 95%|█████████▌| 119/125 [01:38<00:05,  1.06it/s]

118


 96%|█████████▌| 120/125 [01:39<00:04,  1.23it/s]

119


 97%|█████████▋| 121/125 [01:40<00:03,  1.21it/s]

120


 98%|█████████▊| 122/125 [01:41<00:02,  1.19it/s]

121


 98%|█████████▊| 123/125 [01:41<00:01,  1.24it/s]

122


 99%|█████████▉| 124/125 [01:42<00:00,  1.15it/s]

123


100%|██████████| 125/125 [01:43<00:00,  1.20it/s]


124
printloss [24599.46437072754, 29559.187030792236, 22626.67222881317, 22470.379676818848, 37700.29811000824, 47760.18538856506, 15896.663061618805, 27331.065399169922, 18048.172536849976, 27149.009521484375, 96283.83013343811, 56284.204902648926, 25987.761471748352, 59940.31032562256, 25211.620287895203, 48267.687435626984, 31305.517313957214, 20936.735454559326, 17313.544229507446, 33437.26787567139, 46178.114336013794, 42212.571936130524, 34418.01354265213, 30587.59686899185, 29710.32816696167, 44096.6520113945, 14657.218335151672, 48053.14535999298, 42882.39312171936, 40236.27919006348, 29259.633464813232, 54614.53749322891, 39158.927063941956, 57764.99437332153, 43047.44737625122, 45686.97158718109, 35666.74964141846, 30360.1509475708, 56802.37328052521, 27361.0763835907, 33617.729978084564, 28932.449763298035, 28788.95134162903, 20387.1295003891, 40949.1078710556, 29007.796726226807, 41515.55409383774, 17462.05965280533, 42039.08386230469, 21873.63054227829, 21879.528999328613,

100%|██████████| 25/25 [00:05<00:00,  4.22it/s]


Epoch: 03 | Time: 1m 50s
	Train Loss: 5.080 | Train PPL: 160.724
	 Val. Loss: 5.048 |  Val. PPL: 155.644
here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7f87300f0790>


  1%|          | 1/125 [00:00<01:35,  1.31it/s]

0


  2%|▏         | 2/125 [00:01<01:53,  1.09it/s]

1


  2%|▏         | 3/125 [00:02<01:47,  1.14it/s]

2


  3%|▎         | 4/125 [00:03<01:45,  1.15it/s]

3


  4%|▍         | 5/125 [00:04<01:36,  1.24it/s]

4


  5%|▍         | 6/125 [00:04<01:30,  1.32it/s]

5


  6%|▌         | 7/125 [00:06<01:51,  1.06it/s]

6


  6%|▋         | 8/125 [00:07<02:02,  1.05s/it]

7


  7%|▋         | 9/125 [00:08<01:59,  1.03s/it]

8


  8%|▊         | 10/125 [00:09<01:56,  1.01s/it]

9


  9%|▉         | 11/125 [00:10<01:48,  1.05it/s]

10


 10%|▉         | 12/125 [00:11<01:44,  1.08it/s]

11


 10%|█         | 13/125 [00:12<01:52,  1.01s/it]

12


 11%|█         | 14/125 [00:13<01:44,  1.06it/s]

13


 12%|█▏        | 15/125 [00:14<01:53,  1.03s/it]

14


 13%|█▎        | 16/125 [00:15<01:46,  1.02it/s]

15


 14%|█▎        | 17/125 [00:16<01:41,  1.06it/s]

16


 14%|█▍        | 18/125 [00:16<01:40,  1.07it/s]

17


 15%|█▌        | 19/125 [00:17<01:36,  1.10it/s]

18


 16%|█▌        | 20/125 [00:18<01:27,  1.20it/s]

19


 17%|█▋        | 21/125 [00:19<01:37,  1.07it/s]

20


 18%|█▊        | 22/125 [00:20<01:41,  1.01it/s]

21


 18%|█▊        | 23/125 [00:21<01:40,  1.02it/s]

22


 19%|█▉        | 24/125 [00:22<01:31,  1.10it/s]

23


 20%|██        | 25/125 [00:23<01:30,  1.11it/s]

24


 21%|██        | 26/125 [00:24<01:24,  1.17it/s]

25


 22%|██▏       | 27/125 [00:25<01:34,  1.03it/s]

26


 22%|██▏       | 28/125 [00:26<01:26,  1.12it/s]

27


 23%|██▎       | 29/125 [00:26<01:18,  1.23it/s]

28


 24%|██▍       | 30/125 [00:27<01:19,  1.20it/s]

29


 25%|██▍       | 31/125 [00:28<01:11,  1.32it/s]

30


 26%|██▌       | 32/125 [00:28<01:12,  1.28it/s]

31


 26%|██▋       | 33/125 [00:29<01:16,  1.21it/s]

32


 27%|██▋       | 34/125 [00:30<01:17,  1.18it/s]

33


 28%|██▊       | 35/125 [00:31<01:13,  1.22it/s]

34


 29%|██▉       | 36/125 [00:32<01:12,  1.22it/s]

35


 30%|██▉       | 37/125 [00:33<01:11,  1.24it/s]

36


 30%|███       | 38/125 [00:33<01:08,  1.26it/s]

37


 31%|███       | 39/125 [00:34<01:12,  1.18it/s]

38


 32%|███▏      | 40/125 [00:35<01:08,  1.24it/s]

39


 33%|███▎      | 41/125 [00:36<01:02,  1.35it/s]

40


 34%|███▎      | 42/125 [00:36<00:59,  1.39it/s]

41


 34%|███▍      | 43/125 [00:38<01:28,  1.08s/it]

42


 35%|███▌      | 44/125 [00:39<01:24,  1.04s/it]

43


 36%|███▌      | 45/125 [00:40<01:19,  1.00it/s]

44


 37%|███▋      | 46/125 [00:41<01:21,  1.03s/it]

45


 38%|███▊      | 47/125 [00:42<01:21,  1.05s/it]

46


 38%|███▊      | 48/125 [00:43<01:13,  1.04it/s]

47


 39%|███▉      | 49/125 [00:44<01:10,  1.08it/s]

48


 40%|████      | 50/125 [00:45<01:08,  1.09it/s]

49


 41%|████      | 51/125 [00:46<01:03,  1.16it/s]

50


 42%|████▏     | 52/125 [00:47<01:14,  1.02s/it]

51


 42%|████▏     | 53/125 [00:48<01:06,  1.08it/s]

52


 43%|████▎     | 54/125 [00:48<01:00,  1.16it/s]

53


 44%|████▍     | 55/125 [00:49<00:55,  1.27it/s]

54


 45%|████▍     | 56/125 [00:50<00:54,  1.27it/s]

55


 46%|████▌     | 57/125 [00:51<00:54,  1.26it/s]

56


 46%|████▋     | 58/125 [00:52<00:56,  1.18it/s]

57


 47%|████▋     | 59/125 [00:52<00:53,  1.23it/s]

58


 48%|████▊     | 60/125 [00:53<00:55,  1.18it/s]

59


 49%|████▉     | 61/125 [00:54<00:54,  1.18it/s]

60


 50%|████▉     | 62/125 [00:55<00:55,  1.15it/s]

61


 50%|█████     | 63/125 [00:56<00:58,  1.07it/s]

62


 51%|█████     | 64/125 [00:57<00:56,  1.08it/s]

63


 52%|█████▏    | 65/125 [00:58<00:49,  1.21it/s]

64


 53%|█████▎    | 66/125 [00:58<00:44,  1.32it/s]

65


 54%|█████▎    | 67/125 [00:59<00:49,  1.18it/s]

66


 54%|█████▍    | 68/125 [01:00<00:48,  1.18it/s]

67


 55%|█████▌    | 69/125 [01:01<00:50,  1.12it/s]

68


 56%|█████▌    | 70/125 [01:02<00:47,  1.16it/s]

69


 57%|█████▋    | 71/125 [01:03<00:44,  1.22it/s]

70


 58%|█████▊    | 72/125 [01:03<00:40,  1.31it/s]

71


 58%|█████▊    | 73/125 [01:04<00:37,  1.40it/s]

72


 59%|█████▉    | 74/125 [01:05<00:42,  1.19it/s]

73


 60%|██████    | 75/125 [01:06<00:39,  1.25it/s]

74


 61%|██████    | 76/125 [01:06<00:35,  1.38it/s]

75


 62%|██████▏   | 77/125 [01:07<00:34,  1.37it/s]

76


 62%|██████▏   | 78/125 [01:08<00:37,  1.26it/s]

77


 63%|██████▎   | 79/125 [01:08<00:34,  1.35it/s]

78


 64%|██████▍   | 80/125 [01:09<00:33,  1.33it/s]

79


 65%|██████▍   | 81/125 [01:11<00:42,  1.03it/s]

80


 66%|██████▌   | 82/125 [01:12<00:42,  1.02it/s]

81


 66%|██████▋   | 83/125 [01:12<00:37,  1.12it/s]

82


 67%|██████▋   | 84/125 [01:13<00:35,  1.15it/s]

83


 68%|██████▊   | 85/125 [01:14<00:32,  1.23it/s]

84


 69%|██████▉   | 86/125 [01:14<00:28,  1.36it/s]

85


 70%|██████▉   | 87/125 [01:15<00:28,  1.33it/s]

86


 70%|███████   | 88/125 [01:16<00:27,  1.34it/s]

87


 71%|███████   | 89/125 [01:17<00:26,  1.38it/s]

88


 72%|███████▏  | 90/125 [01:18<00:27,  1.29it/s]

89


 73%|███████▎  | 91/125 [01:18<00:27,  1.24it/s]

90


 74%|███████▎  | 92/125 [01:19<00:28,  1.16it/s]

91


 74%|███████▍  | 93/125 [01:20<00:26,  1.23it/s]

92


 75%|███████▌  | 94/125 [01:21<00:26,  1.19it/s]

93


 76%|███████▌  | 95/125 [01:22<00:22,  1.33it/s]

94


 77%|███████▋  | 96/125 [01:22<00:21,  1.32it/s]

95


 78%|███████▊  | 97/125 [01:23<00:21,  1.28it/s]

96


 78%|███████▊  | 98/125 [01:24<00:21,  1.28it/s]

97


 79%|███████▉  | 99/125 [01:25<00:19,  1.35it/s]

98


 80%|████████  | 100/125 [01:25<00:18,  1.32it/s]

99


 81%|████████  | 101/125 [01:26<00:16,  1.46it/s]

100


 82%|████████▏ | 102/125 [01:27<00:15,  1.45it/s]

101


 82%|████████▏ | 103/125 [01:27<00:15,  1.42it/s]

102


 83%|████████▎ | 104/125 [01:28<00:15,  1.34it/s]

103


 84%|████████▍ | 105/125 [01:29<00:15,  1.30it/s]

104


 85%|████████▍ | 106/125 [01:30<00:14,  1.34it/s]

105


 86%|████████▌ | 107/125 [01:31<00:13,  1.33it/s]

106


 86%|████████▋ | 108/125 [01:31<00:12,  1.33it/s]

107


 87%|████████▋ | 109/125 [01:32<00:11,  1.39it/s]

108


 88%|████████▊ | 110/125 [01:33<00:12,  1.24it/s]

109


 89%|████████▉ | 111/125 [01:34<00:10,  1.30it/s]

110


 90%|████████▉ | 112/125 [01:34<00:10,  1.29it/s]

111


 90%|█████████ | 113/125 [01:35<00:08,  1.35it/s]

112


 91%|█████████ | 114/125 [01:36<00:07,  1.39it/s]

113


 92%|█████████▏| 115/125 [01:37<00:07,  1.26it/s]

114


 93%|█████████▎| 116/125 [01:37<00:06,  1.43it/s]

115


 94%|█████████▎| 117/125 [01:38<00:05,  1.38it/s]

116


 94%|█████████▍| 118/125 [01:39<00:06,  1.15it/s]

117


 95%|█████████▌| 119/125 [01:40<00:05,  1.20it/s]

118


 96%|█████████▌| 120/125 [01:41<00:04,  1.15it/s]

119


 97%|█████████▋| 121/125 [01:42<00:03,  1.23it/s]

120


 98%|█████████▊| 122/125 [01:43<00:02,  1.14it/s]

121


 98%|█████████▊| 123/125 [01:43<00:01,  1.14it/s]

122


 99%|█████████▉| 124/125 [01:44<00:00,  1.22it/s]

123


100%|██████████| 125/125 [01:45<00:00,  1.18it/s]


124
printloss [32905.48434972763, 41082.51218175888, 35573.41900539398, 35602.91292142868, 26709.710083007812, 19816.34555053711, 40310.02608060837, 32806.98008584976, 29212.471183776855, 29096.942536830902, 33369.05337715149, 39039.80411911011, 45497.020674705505, 13095.361102581024, 39484.912525177, 21134.108783721924, 21868.28387737274, 47898.03309869766, 39991.693170547485, 24669.71913099289, 35661.57612609863, 42202.91742324829, 29772.50511789322, 29954.369206428528, 43005.87827682495, 32365.44119310379, 55033.682674884796, 22287.672567367554, 23332.75182914734, 40171.711486816406, 17683.024456501007, 40898.778327941895, 43489.085807323456, 49625.23447227478, 27212.172023773193, 36920.8609752655, 35356.44046783447, 38426.83050394058, 35835.87824869156, 24105.372467041016, 20168.17333984375, 21743.930614471436, 92968.0303273201, 43921.43843507767, 33363.819466114044, 57020.00038146973, 76471.50205421448, 28472.154487609863, 45436.66341876984, 42795.90308666229, 25558.249526023865, 

100%|██████████| 25/25 [00:06<00:00,  4.16it/s]


Epoch: 04 | Time: 1m 52s
	Train Loss: 5.064 | Train PPL: 158.186
	 Val. Loss: 5.053 |  Val. PPL: 156.457
here
iterator
<torchtext.legacy.data.iterator.BucketIterator object at 0x7f872cfa8c10>


  1%|          | 1/125 [00:00<01:35,  1.30it/s]

0


  2%|▏         | 2/125 [00:01<01:42,  1.20it/s]

1


  2%|▏         | 3/125 [00:02<01:47,  1.14it/s]

2


  3%|▎         | 4/125 [00:03<01:35,  1.26it/s]

3


  4%|▍         | 5/125 [00:03<01:30,  1.33it/s]

4


  5%|▍         | 6/125 [00:04<01:22,  1.44it/s]

5


  6%|▌         | 7/125 [00:05<01:15,  1.57it/s]

6


  6%|▋         | 8/125 [00:05<01:16,  1.54it/s]

7


  7%|▋         | 9/125 [00:06<01:15,  1.54it/s]

8


  8%|▊         | 10/125 [00:06<01:14,  1.54it/s]

9


  9%|▉         | 11/125 [00:07<01:24,  1.35it/s]

10


 10%|▉         | 12/125 [00:08<01:28,  1.27it/s]

11


 10%|█         | 13/125 [00:09<01:26,  1.30it/s]

12


 11%|█         | 14/125 [00:10<01:23,  1.32it/s]

13


 12%|█▏        | 15/125 [00:11<01:29,  1.23it/s]

14


 13%|█▎        | 16/125 [00:12<01:29,  1.21it/s]

15


 14%|█▎        | 17/125 [00:13<01:32,  1.17it/s]

16


 14%|█▍        | 18/125 [00:13<01:26,  1.24it/s]

17


 15%|█▌        | 19/125 [00:14<01:28,  1.19it/s]

18


 16%|█▌        | 20/125 [00:15<01:35,  1.10it/s]

19


 17%|█▋        | 21/125 [00:16<01:33,  1.11it/s]

20


 18%|█▊        | 22/125 [00:17<01:32,  1.11it/s]

21


 18%|█▊        | 23/125 [00:18<01:29,  1.14it/s]

22


 19%|█▉        | 24/125 [00:19<01:26,  1.16it/s]

23


 20%|██        | 25/125 [00:19<01:21,  1.22it/s]

24


 21%|██        | 26/125 [00:20<01:15,  1.31it/s]

25


 22%|██▏       | 27/125 [00:21<01:23,  1.18it/s]

26


 22%|██▏       | 28/125 [00:22<01:24,  1.14it/s]

27


 23%|██▎       | 29/125 [00:23<01:25,  1.12it/s]

28


 24%|██▍       | 30/125 [00:24<01:32,  1.02it/s]

29


 25%|██▍       | 31/125 [00:25<01:31,  1.03it/s]

30


 26%|██▌       | 32/125 [00:26<01:27,  1.07it/s]

31


 26%|██▋       | 33/125 [00:27<01:21,  1.13it/s]

32


 27%|██▋       | 34/125 [00:28<01:23,  1.09it/s]

33


 28%|██▊       | 35/125 [00:28<01:17,  1.16it/s]

34


 29%|██▉       | 36/125 [00:29<01:13,  1.21it/s]

35


 30%|██▉       | 37/125 [00:30<01:09,  1.27it/s]

36


 30%|███       | 38/125 [00:30<01:02,  1.39it/s]

37


 31%|███       | 39/125 [00:31<01:04,  1.34it/s]

38


 32%|███▏      | 40/125 [00:32<01:04,  1.31it/s]

39


 33%|███▎      | 41/125 [00:33<01:05,  1.28it/s]

40


 34%|███▎      | 42/125 [00:33<01:02,  1.33it/s]

41


 34%|███▍      | 43/125 [00:34<00:58,  1.40it/s]

42


 35%|███▌      | 44/125 [00:35<01:00,  1.34it/s]

43


 36%|███▌      | 45/125 [00:36<01:04,  1.24it/s]

44


 37%|███▋      | 46/125 [00:37<01:06,  1.20it/s]

45


 38%|███▊      | 47/125 [00:38<01:15,  1.03it/s]

46


 38%|███▊      | 48/125 [00:39<01:18,  1.02s/it]

47


 39%|███▉      | 49/125 [00:40<01:21,  1.07s/it]

48


 40%|████      | 50/125 [00:41<01:17,  1.03s/it]

49


 41%|████      | 51/125 [00:42<01:07,  1.10it/s]

50


 42%|████▏     | 52/125 [00:43<01:03,  1.15it/s]

51


 42%|████▏     | 53/125 [00:43<00:58,  1.24it/s]

52


 43%|████▎     | 54/125 [00:44<00:56,  1.26it/s]

53


 44%|████▍     | 55/125 [00:45<00:54,  1.27it/s]

54


 45%|████▍     | 56/125 [00:46<00:55,  1.24it/s]

55


 46%|████▌     | 57/125 [00:47<00:57,  1.19it/s]

56


 46%|████▋     | 58/125 [00:47<00:53,  1.25it/s]

57


 47%|████▋     | 59/125 [00:48<00:57,  1.14it/s]

58


 48%|████▊     | 60/125 [00:49<00:58,  1.12it/s]

59


 49%|████▉     | 61/125 [00:50<00:54,  1.17it/s]

60


 50%|████▉     | 62/125 [00:51<00:54,  1.15it/s]

61


 50%|█████     | 63/125 [00:52<00:51,  1.20it/s]

62


 51%|█████     | 64/125 [00:53<00:52,  1.16it/s]

63


 52%|█████▏    | 65/125 [00:54<00:52,  1.15it/s]

64


 53%|█████▎    | 66/125 [00:54<00:49,  1.19it/s]

65


 54%|█████▎    | 67/125 [00:55<00:48,  1.19it/s]

66


 54%|█████▍    | 68/125 [00:56<00:51,  1.12it/s]

67


 55%|█████▌    | 69/125 [00:57<00:48,  1.15it/s]

68


 56%|█████▌    | 70/125 [00:58<00:47,  1.16it/s]

69


 57%|█████▋    | 71/125 [00:59<00:45,  1.19it/s]

70


 58%|█████▊    | 72/125 [01:00<00:44,  1.20it/s]

71


 58%|█████▊    | 73/125 [01:00<00:39,  1.32it/s]

72


 59%|█████▉    | 74/125 [01:01<00:39,  1.30it/s]

73


 60%|██████    | 75/125 [01:02<00:37,  1.33it/s]

74


 61%|██████    | 76/125 [01:02<00:35,  1.38it/s]

75


 62%|██████▏   | 77/125 [01:03<00:35,  1.37it/s]

76


 62%|██████▏   | 78/125 [01:04<00:40,  1.15it/s]

77


 63%|██████▎   | 79/125 [01:05<00:35,  1.28it/s]

78


 64%|██████▍   | 80/125 [01:06<00:38,  1.16it/s]

79


 65%|██████▍   | 81/125 [01:07<00:38,  1.14it/s]

80


 66%|██████▌   | 82/125 [01:07<00:33,  1.27it/s]

81


 66%|██████▋   | 83/125 [01:08<00:36,  1.15it/s]

82


 67%|██████▋   | 84/125 [01:09<00:35,  1.16it/s]

83


 68%|██████▊   | 85/125 [01:10<00:30,  1.31it/s]

84


 69%|██████▉   | 86/125 [01:11<00:29,  1.32it/s]

85


 70%|██████▉   | 87/125 [01:11<00:26,  1.41it/s]

86


 70%|███████   | 88/125 [01:12<00:25,  1.44it/s]

87


 71%|███████   | 89/125 [01:13<00:26,  1.34it/s]

88


 72%|███████▏  | 90/125 [01:13<00:25,  1.37it/s]

89


 73%|███████▎  | 91/125 [01:14<00:27,  1.26it/s]

90


 74%|███████▎  | 92/125 [01:15<00:26,  1.23it/s]

91


 74%|███████▍  | 93/125 [01:16<00:26,  1.20it/s]

92


 75%|███████▌  | 94/125 [01:18<00:32,  1.04s/it]

93


 76%|███████▌  | 95/125 [01:18<00:29,  1.00it/s]

94


 77%|███████▋  | 96/125 [01:19<00:26,  1.11it/s]

95


 78%|███████▊  | 97/125 [01:20<00:22,  1.25it/s]

96


 78%|███████▊  | 98/125 [01:21<00:22,  1.18it/s]

97


 79%|███████▉  | 99/125 [01:22<00:29,  1.14s/it]

98


 80%|████████  | 100/125 [01:23<00:26,  1.04s/it]

99


 81%|████████  | 101/125 [01:24<00:23,  1.02it/s]

100


 82%|████████▏ | 102/125 [01:25<00:23,  1.04s/it]

101


 82%|████████▏ | 103/125 [01:26<00:23,  1.08s/it]

102


 83%|████████▎ | 104/125 [01:27<00:19,  1.06it/s]

103


 84%|████████▍ | 105/125 [01:28<00:17,  1.13it/s]

104


 85%|████████▍ | 106/125 [01:29<00:15,  1.20it/s]

105


 86%|████████▌ | 107/125 [01:29<00:14,  1.23it/s]

106


 86%|████████▋ | 108/125 [01:30<00:14,  1.21it/s]

107


 87%|████████▋ | 109/125 [01:31<00:13,  1.20it/s]

108


 88%|████████▊ | 110/125 [01:32<00:12,  1.17it/s]

109


 89%|████████▉ | 111/125 [01:34<00:15,  1.08s/it]

110


 90%|████████▉ | 112/125 [01:34<00:13,  1.01s/it]

111


 90%|█████████ | 113/125 [01:36<00:13,  1.08s/it]

112


 91%|█████████ | 114/125 [01:37<00:12,  1.17s/it]

113


 92%|█████████▏| 115/125 [01:38<00:10,  1.04s/it]

114


 93%|█████████▎| 116/125 [01:39<00:08,  1.03it/s]

115


 94%|█████████▎| 117/125 [01:40<00:07,  1.05it/s]

116


 94%|█████████▍| 118/125 [01:40<00:06,  1.13it/s]

117


 95%|█████████▌| 119/125 [01:41<00:04,  1.24it/s]

118


 96%|█████████▌| 120/125 [01:42<00:04,  1.23it/s]

119


 97%|█████████▋| 121/125 [01:42<00:03,  1.26it/s]

120


 98%|█████████▊| 122/125 [01:43<00:02,  1.31it/s]

121


 98%|█████████▊| 123/125 [01:44<00:01,  1.34it/s]

122


 99%|█████████▉| 124/125 [01:44<00:00,  1.52it/s]

123


100%|██████████| 125/125 [01:45<00:00,  1.19it/s]


124
printloss [34875.44212245941, 40901.449310302734, 46365.159487724304, 24263.46447944641, 26915.055956840515, 20187.06628704071, 13148.040266513824, 23315.73695898056, 21019.434699058533, 22138.69783782959, 39417.38041782379, 59059.6878657341, 34352.81895637512, 28249.447415828705, 42346.36480522156, 40031.79313659668, 39781.64600276947, 29262.917463302612, 45956.904870033264, 57681.26901912689, 38480.28057098389, 52340.90189409256, 24977.000675201416, 38559.37342596054, 29371.72689628601, 21426.742323875427, 47668.839344501495, 58450.43257713318, 39256.29832458496, 78062.33053159714, 41692.90338563919, 41145.817175388336, 33558.75666761398, 57734.71664714813, 20610.07026386261, 33765.04499864578, 25128.139088630676, 15432.102952003479, 35303.49424171448, 34564.11417579651, 36666.76954650879, 27803.096847057343, 22319.89004135132, 38745.41991233826, 37781.70889377594, 20905.446155548096, 40559.69719791412, 31174.780990600586, 43241.80304479599, 44412.63985347748, 19891.964626312256,

100%|██████████| 25/25 [00:06<00:00,  4.08it/s]


Epoch: 05 | Time: 1m 52s
	Train Loss: 5.027 | Train PPL: 152.506
	 Val. Loss: 5.036 |  Val. PPL: 153.826


In [None]:
SRC = Input
TRG = Output

In [None]:
test_inputs,reference_output=creating_features(test_dataset)
hypothesis_list=[]
for input in test_inputs:
  input=input.split(" ")
  translation, attention = translate_sentence(input, SRC, TRG, model, device)
  hypothesis_list.append(untokenize(translation[:-1]).decode('utf-8'))

bleu = evaluate_bleu(reference_output, hypothesis_list)

In [None]:
print(bleu)

13.839435571817358


Reference - https://towardsdatascience.com/building-a-python-code-generator-4b476eec5804