In [1]:
'''
Mount Google Drive, copy data to runtime, and unzip folders

Make sure to put a link to "EC 523 Project" in your main google drive!
'''

# from google.colab import drive
# drive.mount('/content/drive')

# ! cp /content/drive/MyDrive/'Deep Learning Proj'/train.zip /content
# ! cp /content/drive/MyDrive/'Deep Learning Proj'/test.zip /content
# ! cp /content/drive/MyDrive/'Deep Learning Proj'/val.zip /content
# ! cp /content/drive/'My Drive'/'Deep Learning Proj'/math.txt /content

# from path will differ depending on where you saved the zip file in Google Drive
# ! unzip -DD -q  ./train.zip -d  .
# ! unzip -DD -q  ./test.zip -d  .
# ! unzip -DD -q  ./val.zip -d  .



'\nMount Google Drive, copy data to runtime, and unzip folders\n\nMake sure to put a link to "EC 523 Project" in your main google drive!\n'

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
'''
Length of datasets
'''
num_train_str = !ls train | wc -l
num_test_str = !ls test | wc -l
num_val_str = !ls val | wc -l
num_train = int(num_train_str[0])
num_test = int(num_test_str[0])
num_val = int(num_val_str[0])

print(f'Number of train images: {num_train}\nNumber of test images: {num_test}\nNumber of validation images: {num_val}\nTotal images: {num_train+num_test+num_val}')

Number of train images: 158480
Number of test images: 30637
Number of validation images: 6765
Total images: 195882


In [4]:
train_root = "./train/"
test_root = "./test/"
val_root = "./val/"
label_file = "./math.txt"
freq_vocabs = "./Corpus.txt"

Creating dataset class for images and labels

In [5]:
import os
import cv2
import torch.utils.data
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np


class LatexDataset(torch.utils.data.Dataset):
  def __init__(self, transform=None, dataroot=val_root, max_seq_length = 700): # can change dataroot to be either train_root, test_root, val_root
        '''Initialize the dataset.'''
        self.transform = transform
        self.dataroot = dataroot
        self.labels_txt = label_file
        self.max_seq_length = max_seq_length
        self._parse()

  def _parse(self):
        '''
        Parse the math.txt file.
        Populates the following private variables:
        - self.im_paths: A list of strings storing the associated image paths
        - self.labels: A list of strings, where each string is the latex code for an image
        '''
        def getImPath(idx):
            # Find image in either train, test, or validation folder
            imname = str(idx - 1).zfill(7) + '.png'
            if os.path.exists(f'{self.dataroot}{imname}'):
              impath = f'{self.dataroot}{imname}'
            else:
              return None

            try:
                Image.open(impath).verify()
            except Exception as e:
                # Some images can't be opened
                # print(f"Image at path {impath} is corrupted. Error: {e}")
                return None

            return impath

        self.im_paths = []
        self.labels = []

        with open(self.labels_txt) as f:
            for idx, line in enumerate(f):
                impath = getImPath(idx+1)

                if impath is not None:
                    labels = line.strip('\n')
                    if len(labels) < self.max_seq_length-1:   # Loading images with certain latex length
                      self.im_paths.append(impath)            # Image name
                      self.labels.append(labels)    # String of latex code


  def __len__(self):
        '''Return length of the dataset.'''
        assert len(self.labels) == len(self.im_paths)
        return len(self.labels)

  def __getitem__(self, index):
        '''
        Return the (image, attributes) tuple.
        This function gets called when you index the dataset.
        '''
        def img_load(index):
            imraw = Image.open(self.im_paths[index])
            imgray = imraw.convert('L')                         # Convert image to greyscale
            imthresh = imgray.point(lambda p: p > 240 and 255)  # Threshold image to remove background (white)
            if self.transform is not None:
              im = self.transform(imthresh)
            else:
              im = imthresh
            return im

        target = self.labels[index]
        return img_load(index), target

Generating most frequent characters used in math.txt file and outputing it to output.txt

In [6]:
from collections import Counter

def create_dictionary(file_path, n, output_file_path):
    word_count = Counter()

    with open(file_path, 'r') as file:
        # Read the file and count word occurrences
        for line in file:
            words = line.split()
            word_count.update(words)

    # Get the n most common words as a dictionary
    most_common_words = dict(word_count.most_common(n))

    # Save the dictionary to a text file
    ID = 1
    with open(output_file_path, 'w') as output_file:
        for word, count in most_common_words.items():
          if (word != ',') and (word != '\\,'):
            output_file.write(f"{word},{ID} \n")
            ID += 1
        output_file.write(f"{'PAD'},{ID} \n")
        ID += 1
        output_file.write(f"{'UNK'},{ID} \n")

# Example usage
n_most_common = 200  # Change this value to get different numbers of most common words
!touch ./output.txt
output_file_path = "./output.txt" # Replace 'output.txt' with the desired output file path

create_dictionary(label_file, n_most_common, output_file_path)
print(f"Data saved to {output_file_path}")

Data saved to ./output.txt


Creating dictionary for vocab and tokens

In [7]:
'''
Dictionary block: converts a LaTeX string to a dictionary of latex tokens, where
each unique token has its own entry and integer value assigned to it

'''
class LatexDict():
    def __init__(self, max_seq_length=256):
        self.labels_txt = label_file
        self.max_seq_length = max_seq_length
        self.latex_dict = {'<UKN>':0, '<PAD>':1} # Initialize with token for unknown and for padding
        self.latex_dict_inverse = {0:'<UKN>', 1:'<PAD>'} # Initialize inverse dict for quicker reverse lookups
        self.create_dict()

    def create_dict(self):
        # Go through entire label file and populate dictionary
        with open(self.labels_txt) as f:
            for line in f:
                tokens = line.split()
                for token in tokens:
                    if token not in self.latex_dict:
                        # Assign a new ID for the unseen token
                        new_id = len(self.latex_dict)
                        self.latex_dict[token] = new_id
                        self.latex_dict_inverse[new_id] = token

    def map_tokens(self, tex_str_list, batch_size):
        ids_tensor = torch.full((batch_size, self.max_seq_length), self.latex_dict['<PAD>'], dtype=torch.float32)

        for row, tex_str in enumerate(tex_str_list):
            tex_str = r'{ ' + tex_str + ' }'
            tokens = tex_str.split()
            for col, token in enumerate(tokens):
                ids_tensor[row, col] = self.latex_dict[token]

        return ids_tensor

    def tokens_to_tex(self, token_vec):
        tex_str = ' '
        for token_id in token_vec.tolist():
            if token_id in self.latex_dict_inverse:
                if self.latex_dict_inverse[token_id] != '<PAD>' and self.latex_dict_inverse[token_id] != '<UKN>':
                    tex_str += self.latex_dict_inverse[token_id] + ' '

        return tex_str

    def __dict__(self):
        return self.latex_dict

    def __len__(self):
        return len(self.latex_dict)

latex_dict = LatexDict()

LatexDictModified() is debugged version of LatexDict(). The problems with loading labels and dealing with escape characters (e.g \n, \t) have been resolved. It also puts UNK for each unknown character

In [8]:
'''
Dictionary block: converts a LaTeX string to a dictionary of latex tokens, where
each unique token has its own entry and integer value assigned to it

'''
class LatexDictModified():
    def __init__(self, max_seq_length=256):
        self.labels_txt = label_file
        self.max_seq_length = max_seq_length
        self.latex_dict = {} # Initialize with token for unknown and for padding
        self.latex_dict_inverse = {} # Initialize inverse dict for quicker reverse lookups
        self.create_dict()

    def create_dict(self):
        # Go through entire label file and populate dictionary

      Freq_labeled_file = open("./output.txt", "r")

      for line in Freq_labeled_file: #For each line in data file
          token = line.split(',')[1].split(" ")[0]
          latexSymbol = line.split(",")[0]
          self.latex_dict[latexSymbol] = int(token)
          self.latex_dict_inverse[token] = latexSymbol
      Freq_labeled_file.close()

    def map_tokens(self, tex_str_list, batch_size):
        ids_tensor = torch.full((batch_size, self.max_seq_length), self.latex_dict['PAD'], dtype=torch.float32)

        for row, tex_str in enumerate(tex_str_list):
            tex_str = r'{ ' + tex_str + ' }'
 
            tokens = tex_str.split()

            for col, token in enumerate(tokens):
              if token in self.latex_dict:
                ids_tensor[row, col] = self.latex_dict[token]
              else:
               ids_tensor[row, col] = self.latex_dict['UNK']

        return ids_tensor

    def tokens_to_tex(self, token_vec):
        tex_str = ' '
        for token_id in token_vec.tolist():
            if token_id in self.latex_dict_inverse:
                if self.latex_dict_inverse[token_id] != 'PAD' and self.latex_dict_inverse[token_id] != 'UNK':
                    tex_str += self.latex_dict_inverse[token_id] + ' '

        return tex_str

    def __dict__(self):
        return self.latex_dict

    def __len__(self):
        return len(self.latex_dict)

latex_dict = LatexDictModified()

CNN Block with 512 channel output

In [76]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.optim as optim
import math

class CNN_Block(nn.Module):
    def __init__(self):
        super(CNN_Block, self).__init__()
        
        self.leaky = nn.LeakyReLU(negative_slope=0.1)
        # self.conv1 = nn.Conv2d(3, 64, 3)
        self.conv1 = nn.Conv2d(1, 64, 3, padding=1, bias=False)    # Images are originally one channel, added padding as well
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(64, 128, 3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(128)
        # self.pool2 = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(128, 256, 3, padding=1, bias=False)
        # self.pool3 = nn.MaxPool2d(2, 2)
        self.conv4 = nn.Conv2d(256, 256, 3, padding=1, bias=False)
        self.conv5 = nn.Conv2d(256, 512, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(512)
        # self.pool4 = nn.MaxPool2d(2, 2)

        #self.fc1 = nn.Linear(256 * 2 * 8, 1024)  # Adjusted input size based on the output size of the convolutional layers
        #self.fc2 = nn.Linear(1024, 512)


    def forward(self, x):
#         x = self.pool(F.relu(self.conv1(x)))
#         x = self.pool(F.relu(self.conv2(x)))
#         x = self.pool(F.relu(self.conv3(x)))
#         x = (F.relu(self.conv4(x)))
        # x = F.relu(self.conv5(x))
    
        ''' 
        Replacing Leaky ReLU instead of ReLU
        '''
        
        x = self.pool(self.leaky(self.conv1(x)))
        x = self.pool(self.bn1(self.leaky(self.conv2(x))))
        x = self.pool(self.leaky(self.conv3(x)))
        x = self.pool(self.leaky(self.conv4(x)))
        x = self.bn2(self.leaky(self.conv5(x)))
    
        #print('x: shape:', x.shape)
        #x = x.view(64,-1)   # Flatten so this can be used in linear layers

        #x = F.relu(self.fc1(x))
        #x = self.fc2(x)

        return x


Defining decoding class: \\
  hidden_size: dimensionality of the hidden layer of the LSTM \\
  batch_size: Size of the batch \\
  seq_length: length of the input sequence \\
  vocab_size: size of our dictionary \\
  emd_size: Number of rows in embedding vector to represent \\
  enc_out: Number of output channels in encoder \\
  

In [84]:
class decoder(nn.Module):
  def __init__(self, hidden_size, batch_size, seq_length, vocab_size, embedding_size, enc_out = 512):
    super(decoder, self).__init__()

    self.hidden_size = hidden_size
    self.batch_size = batch_size
    self.seq_length = seq_length
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.enc_out = enc_out


    self.cnn_encoder = CNN_Block()
    self.lstm = nn.LSTMCell(hidden_size + embedding_size, hidden_size)
    self.emb = nn.Embedding(vocab_size,embedding_size )

    self.wh_in = nn.Linear(enc_out, hidden_size, bias = False)
    self.wc_in = nn.Linear(enc_out, hidden_size, bias = False)
    self.wo_in = nn.Linear(enc_out, hidden_size, bias = False)

    self.dropout = nn.Dropout(p=0.2)
    self.soft = nn.Softmax(dim = 2)
    self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

 # Attention mechanism
    self.beta = nn.Parameter(torch.Tensor(enc_out))
    nn.init.uniform_(self.beta, -1e-2, 1e-2)
    self.W_1 = nn.Linear(enc_out, enc_out, bias=False)
    self.W_2 = nn.Linear(hidden_size, enc_out, bias=False)

    self.W_3 = nn.Linear(hidden_size + enc_out, hidden_size, bias=False)
    self.W_out = nn.Linear(hidden_size, vocab_size, bias=False)


  def forward(self, im, label):
    enc_im = self.encode(im.to(self.device))
    dec_states, o_t = self.init_decoder(enc_im.to(self.device))
    # Suppose we are taking labels with format [Batch, Max Len, Vocab_size]
    max_len = label.shape[1]
    outputs = []
    flag = False
    for i in range(max_len):
      lll = label[:, i, :]
      target_label = torch.argmax(lll.to(self.device), dim = 1, keepdim = True)
      if flag:
        target_label = torch.argmax(outputs[-1].to(self.device), dim = 1, keepdim = True) # Finding the result of the previous output and feed it to the next input

      dec_states, output = self.step_decode(dec_states, o_t.to(self.device), enc_im.to(self.device), target_label.to(self.device))
      outputs.append(output.to(self.device))
      flag = True
    outputs = torch.stack(outputs, dim=1) # Changing the dimensions to [Batch, Max Len, Vocab_size]
    outputs = self.soft(outputs)

    return outputs




  def encode(self, im):
    cnn_encode = self.cnn_encoder(im.to(self.device))
    
    pe = self.PositionalEmbedding2D(self.enc_out, cnn_encode.shape[2], cnn_encode.shape[3]).repeat(cnn_encode.shape[0], 1, 1, 1).to(self.device)

    cnn_encode = cnn_encode.permute(0, 2, 3, 1)   # [Batch, Width, Height, Channel]
    batch, height, width, channel = cnn_encode.shape


    cnn_encode = cnn_encode.contiguous().view(batch, height * width, -1)  # [Batch, Width*Height, Channel]

    return cnn_encode


  def PositionalEmbedding2D(self, D_model,height,width):
    if D_model % 4 != 0:
        raise ValueError("Cannot use sin/cos positional encoding with "
                         "odd dimension (got dim={:d})".format(D_model))
    pe = torch.zeros(D_model,height,width)
    d_model = int(D_model / 2)
    div_term = torch.exp(torch.arange(0., d_model, 2) * -(math.log(10000.0) / d_model))
    pos_w = torch.arange(0., width).unsqueeze(1)
    pos_h = torch.arange(0., height).unsqueeze(1)
    pe[0:d_model:2, :, :] = torch.sin(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1)
    pe[1:d_model:2, :, :] = torch.cos(pos_w * div_term).transpose(0, 1).unsqueeze(1).repeat(1, height, 1)
    pe[d_model::2, :, :] = torch.sin(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width)
    pe[d_model + 1::2, :, :] = torch.cos(pos_h * div_term).transpose(0, 1).unsqueeze(2).repeat(1, 1, width)

    return pe.permute(0, 1, 2)


  def init_decoder(self, enc_im):
    '''
    Here we are trying to estimate if we had a LSTM module for encoder, what would be the result of H, C, O
    '''
    enc_im_mean = enc_im.mean(dim=1)
    h = torch.tanh(self.wh_in(enc_im_mean))
    c = torch.tanh(self.wc_in(enc_im_mean))
    o_init = torch.tanh(self.wo_in(enc_im_mean))
    return (h, c), o_init



  def step_decode(self, dec_states, o_t, enc_im, target_label ):
    y_emb = self.emb(target_label).squeeze(1) 
    inp = torch.cat([y_emb, o_t], dim = 1)
    h, c = self.lstm(inp, dec_states)
    h = self.dropout(h)
    c = self.dropout(c)

    contex = self.attentionMechanism(enc_im, h)

    o_t = self.W_3(torch.cat([h, contex], dim=1)).tanh()
    o_t = self.dropout(o_t)
    output = self.W_out(o_t)

    return (h, c), output


  def attentionMechanism(self, enc_im, h):
    alpha = torch.tanh(self.W_1(enc_im) + self.W_2(h).unsqueeze(1))
    alpha = torch.sum(self.beta * alpha, dim=-1)  # [B, L]
    alpha = F.softmax(alpha, dim=-1)  # [B, L]

    # cal context: [B, C]
    contex = torch.bmm(alpha.unsqueeze(1), enc_im)
    contex = contex.squeeze(1)
    return contex



Initializing the database and code output

In [18]:
'''
Initialize dataset and image preprocessing

NOTE:
    Some of the images in the dataset are corrupted. To deal with this,
    there is a check for each image to ensure that it can be loaded.
'''
import torchvision.transforms as transforms

reduced_imsize = (32, 128)  # Images are reduced to this size

# Define the transform pipeline - add normalization?
transform = transforms.Compose([
    transforms.Resize(reduced_imsize),
    transforms.ToTensor(),
])


seq_length = 150        # Maximum number of sequence length i

train_dataset = LatexDataset(transform=transform, dataroot=train_root, max_seq_length=seq_length)
test_dataset = LatexDataset(transform=transform, dataroot=test_root)
val_dataset = LatexDataset(transform=transform, dataroot=val_root)

# Device settings
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')

Device: cuda:0


In [88]:
'''
Initialize hyperperameters, trainloader, and dictionary of LaTeX token mappings
'''

# Hyperparameters
batch_size = 16
learning_rate = 0.001
weight_decay = 0  # (L2 penalty)

latex_dict = LatexDictModified(max_seq_length=seq_length)

trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=8, drop_last=True)


In [89]:
'''
Initialize Model

Initialize Loss Functions:
1. Normal Cross-Entropy Loss between prediction and label
2. LaTeX compile test:
    - Custom function, returns True if code can compile into LaTeX, False if not

Initialize Optimizer:
1. Adam Optimizer
'''

    # Hyperparameters


hidden_size = 200
batch_size = 16
vocab_size = latex_dict.__len__()
embedding_size = 80
model = decoder(hidden_size, batch_size, seq_length, vocab_size, embedding_size).to(device)

# model = Model(embedding_size=embedding_size, hidden_size=hidden_size, batch_size=batch_size, sequence_length=sequence_length, vocab_size=vocab_size, o_layer_size = o_layer_size).to(device)
criterion = nn.CrossEntropyLoss()   ## May need to change this
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

Latex Checker

Install Dependencies

In [None]:
!pip install folium==0.2.1
!pip install pdflatex
!sudo apt-get install texlive-latex-recommended 
!sudo apt install texlive-latex-extra
!sudo apt install dvipng

Function: 

In [None]:
import subprocess
import pdflatex as ptex
def check_latex_syntax(tex_code):
    try:
        # Create a .tex file with the provided LaTeX code
        with open('temp.tex', 'w') as file:
            file.write(tex_code)

        # Run pdflatex to compile the temporary .tex file
        subprocess.run(['pdflatex', '-interaction=batchmode', 'temp.tex'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        # Check the return code to determine if compilation was successful
        return_code = subprocess.call(['pdflatex', '-interaction=batchmode', 'temp.tex'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        if return_code == 0:
            return True  # Compilation successful, syntax is valid
        else:
            return False  # Compilation failed, syntax is invalid

    except subprocess.CalledProcessError:
        return False

    finally:
        # Clean up temporary files
        subprocess.run(['rm', 'temp.tex', 'temp.log', 'temp.aux'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


#Counting the failed in compiling
def count_latex_errors(tex_code):
  Error_count = 0
  for i in range(len(tex_code)):
    temp_text = tex_code[i]
    result = check_latex_syntax(temp_text)
    if result:
      pass
    else:
      Error_count += 1
  return Error_count
def making_input_syntax_check(tex_str_list):
  Latex_code = []
  Base_start = r'''
\documentclass{article}
\begin{document}
'''
  Base_end = r'''
\end{document}
'''
  for i in range(len(tex_str_list)):
    temp_tex_str = Base_start + '$' + tex_str_list[i] + '$' + Base_end
    Latex_code.append(temp_tex_str)
  return Latex_code

Usage Example

In [None]:
latex_code = r'''\beta'''
latex = r'''\frac{a}{b}'''

tex_code =[]
tex_code.append(latex_code)
tex_code.append(latex)
tex_code_correct = making_input_syntax_check(tex_code)
# Check the LaTeX syntax
print(count_latex_errors(tex_code_correct))

Training Loop

In [87]:
from tqdm import tqdm

'''
Training Loop
'''




num_epoch = 100
model.train()
counter = 0
for epoch in range(num_epoch):
    print('epoch:', epoch)
    pbar = tqdm(trainloader)
    for images, y in pbar:

        images = images.to(device)              # Send to gpu
  
        y_vec = latex_dict.map_tokens(list(y), batch_size=batch_size)
        y_vec = torch.tensor(y_vec, dtype=torch.long)
        y_vec = torch.LongTensor(y_vec)
  
        one_hot = torch.zeros(y_vec.shape[0], y_vec.shape[1], vocab_size)
        for i in range(y_vec.shape[0]):     # Making one-hot representation of the labels [Batch, Max Len, Vocab-Size]
            for j in range(y_vec.shape[1]):
                one_hot[i,j,y_vec[i,j]-1] = 1
        one_hot = one_hot.to(device)                # Send to gpu
      
        predictions = model.forward(images,one_hot)        # Get predictions


        one_hot = torch.tensor(one_hot, dtype=torch.float)


        counter += 1
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        loss1 = criterion(predictions, one_hot)   # Calculate first loss function
        print(loss1)
        if (counter % 50 == 0):
            
            counter = 1
            print("===================Checking gradient===============")
            for name, param in model.named_parameters():
                print(name, param.grad.norm())
            print("===================    Finished   =================")
#             with open('./loss_report.txt', 'a') as output_file:
#                     output_file.write(f"{epoch},{loss1} \n")
        # loss2 = can_compile(precitions)         # Check if output can compile

        optimizer.zero_grad()

        loss1.backward()
        optimizer.step()
#     torch.save(model.state_dict(), f'checkpoint_{epoch}.pt')

epoch: 0


  y_vec = torch.tensor(y_vec, dtype=torch.long)
  one_hot = torch.tensor(one_hot, dtype=torch.float)
  0%|          | 1/1529 [00:00<18:05,  1.41it/s]

tensor(3.7580, device='cuda:0', grad_fn=<DivBackward1>)



  0%|          | 2/1529 [00:01<13:33,  1.88it/s]

tensor(3.7580, device='cuda:0', grad_fn=<DivBackward1>)



  0%|          | 3/1529 [00:01<12:02,  2.11it/s]

tensor(3.7580, device='cuda:0', grad_fn=<DivBackward1>)



  0%|          | 4/1529 [00:01<11:16,  2.25it/s]

tensor(3.7579, device='cuda:0', grad_fn=<DivBackward1>)



  0%|          | 5/1529 [00:02<10:48,  2.35it/s]

tensor(3.7576, device='cuda:0', grad_fn=<DivBackward1>)



  0%|          | 6/1529 [00:02<10:32,  2.41it/s]

tensor(3.7572, device='cuda:0', grad_fn=<DivBackward1>)



  0%|          | 7/1529 [00:03<10:21,  2.45it/s]

tensor(3.7563, device='cuda:0', grad_fn=<DivBackward1>)



  1%|          | 8/1529 [00:03<10:15,  2.47it/s]

tensor(3.7549, device='cuda:0', grad_fn=<DivBackward1>)



  1%|          | 9/1529 [00:03<10:10,  2.49it/s]

tensor(3.7526, device='cuda:0', grad_fn=<DivBackward1>)



  1%|          | 10/1529 [00:04<10:07,  2.50it/s]

tensor(3.7509, device='cuda:0', grad_fn=<DivBackward1>)



  1%|          | 11/1529 [00:04<10:05,  2.51it/s]

tensor(3.7486, device='cuda:0', grad_fn=<DivBackward1>)



  1%|          | 12/1529 [00:05<10:03,  2.51it/s]

tensor(3.7500, device='cuda:0', grad_fn=<DivBackward1>)



  1%|          | 13/1529 [00:05<10:02,  2.52it/s]

tensor(3.7580, device='cuda:0', grad_fn=<DivBackward1>)



  1%|          | 14/1529 [00:05<10:00,  2.52it/s]

tensor(3.7580, device='cuda:0', grad_fn=<DivBackward1>)



  1%|          | 15/1529 [00:06<10:00,  2.52it/s]

tensor(3.7580, device='cuda:0', grad_fn=<DivBackward1>)



  1%|          | 16/1529 [00:06<10:03,  2.51it/s]

tensor(3.7580, device='cuda:0', grad_fn=<DivBackward1>)



  1%|          | 17/1529 [00:07<10:00,  2.52it/s]

tensor(3.7580, device='cuda:0', grad_fn=<DivBackward1>)



  1%|          | 18/1529 [00:07<09:58,  2.52it/s]

tensor(3.7579, device='cuda:0', grad_fn=<DivBackward1>)



  1%|          | 19/1529 [00:07<09:57,  2.53it/s]

tensor(3.7581, device='cuda:0', grad_fn=<DivBackward1>)



  1%|▏         | 20/1529 [00:08<09:57,  2.53it/s]

tensor(3.7570, device='cuda:0', grad_fn=<DivBackward1>)



  1%|▏         | 21/1529 [00:08<09:56,  2.53it/s]

tensor(3.7538, device='cuda:0', grad_fn=<DivBackward1>)



  1%|▏         | 22/1529 [00:09<09:56,  2.53it/s]

tensor(3.7563, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 23/1529 [00:09<09:56,  2.53it/s]

tensor(3.7541, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 24/1529 [00:09<09:55,  2.53it/s]

tensor(3.7582, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 25/1529 [00:10<09:57,  2.52it/s]

tensor(3.7541, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 26/1529 [00:10<09:57,  2.51it/s]

tensor(3.7519, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 27/1529 [00:11<09:55,  2.52it/s]

tensor(3.7449, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 28/1529 [00:11<09:56,  2.52it/s]

tensor(3.7391, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 29/1529 [00:11<09:55,  2.52it/s]

tensor(3.7212, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 30/1529 [00:12<09:56,  2.51it/s]

tensor(3.7546, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 31/1529 [00:12<09:56,  2.51it/s]

tensor(3.7513, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 32/1529 [00:13<09:55,  2.52it/s]

tensor(3.7449, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 33/1529 [00:13<09:52,  2.52it/s]

tensor(3.7514, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 34/1529 [00:13<09:52,  2.52it/s]

tensor(3.7484, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 35/1529 [00:14<09:51,  2.53it/s]

tensor(3.7489, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 36/1529 [00:14<09:51,  2.52it/s]

tensor(3.7478, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 37/1529 [00:14<09:50,  2.53it/s]

tensor(3.7463, device='cuda:0', grad_fn=<DivBackward1>)



  2%|▏         | 38/1529 [00:15<09:50,  2.53it/s]

tensor(3.7477, device='cuda:0', grad_fn=<DivBackward1>)



  3%|▎         | 39/1529 [00:15<09:49,  2.53it/s]

tensor(3.7418, device='cuda:0', grad_fn=<DivBackward1>)



  3%|▎         | 40/1529 [00:16<09:48,  2.53it/s]

tensor(3.7412, device='cuda:0', grad_fn=<DivBackward1>)



  3%|▎         | 41/1529 [00:16<09:48,  2.53it/s]

tensor(3.7461, device='cuda:0', grad_fn=<DivBackward1>)



  3%|▎         | 42/1529 [00:16<09:49,  2.52it/s]

tensor(3.7439, device='cuda:0', grad_fn=<DivBackward1>)



  3%|▎         | 43/1529 [00:17<09:49,  2.52it/s]

tensor(3.7382, device='cuda:0', grad_fn=<DivBackward1>)



  3%|▎         | 44/1529 [00:17<09:49,  2.52it/s]

tensor(3.7410, device='cuda:0', grad_fn=<DivBackward1>)



  3%|▎         | 45/1529 [00:18<09:49,  2.52it/s]

tensor(3.7448, device='cuda:0', grad_fn=<DivBackward1>)



  3%|▎         | 46/1529 [00:18<09:48,  2.52it/s]

tensor(3.7459, device='cuda:0', grad_fn=<DivBackward1>)



  3%|▎         | 47/1529 [00:18<09:48,  2.52it/s]

tensor(3.7407, device='cuda:0', grad_fn=<DivBackward1>)



  3%|▎         | 48/1529 [00:19<09:47,  2.52it/s]

tensor(3.7375, device='cuda:0', grad_fn=<DivBackward1>)



  3%|▎         | 49/1529 [00:19<09:47,  2.52it/s]

tensor(3.7405, device='cuda:0', grad_fn=<DivBackward1>)



  3%|▎         | 50/1529 [00:20<09:55,  2.49it/s]

tensor(3.7356, device='cuda:0', grad_fn=<DivBackward1>)
beta tensor(0.0124, device='cuda:0')
cnn_encoder.conv1.weight tensor(0.0133, device='cuda:0')
cnn_encoder.conv2.weight tensor(0.0104, device='cuda:0')
cnn_encoder.bn1.weight tensor(0.0002, device='cuda:0')
cnn_encoder.bn1.bias tensor(0.0003, device='cuda:0')
cnn_encoder.conv3.weight tensor(0.0032, device='cuda:0')
cnn_encoder.conv4.weight tensor(0.0076, device='cuda:0')
cnn_encoder.conv5.weight tensor(0.0100, device='cuda:0')
cnn_encoder.conv5.bias tensor(6.7728e-06, device='cuda:0')
cnn_encoder.bn2.weight tensor(0.0005, device='cuda:0')
cnn_encoder.bn2.bias tensor(0.0011, device='cuda:0')
lstm.weight_ih tensor(0.0172, device='cuda:0')
lstm.weight_hh tensor(0.0076, device='cuda:0')
lstm.bias_ih tensor(0.0017, device='cuda:0')
lstm.bias_hh tensor(0.0017, device='cuda:0')
emb.weight tensor(0.0014, device='cuda:0')
wh_in.weight tensor(0.0002, device='cuda:0')
wc_in.weight tensor(0.0002, device='cuda:0')
wo_in.weight tensor(0.0080, de


  3%|▎         | 51/1529 [00:20<09:52,  2.49it/s]

tensor(3.7370, device='cuda:0', grad_fn=<DivBackward1>)



  3%|▎         | 52/1529 [00:20<09:51,  2.50it/s]

tensor(3.7484, device='cuda:0', grad_fn=<DivBackward1>)



  3%|▎         | 53/1529 [00:21<09:49,  2.50it/s]

tensor(3.7512, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▎         | 54/1529 [00:21<09:48,  2.51it/s]

tensor(3.7285, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▎         | 55/1529 [00:22<09:46,  2.51it/s]

tensor(3.7253, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▎         | 56/1529 [00:22<09:45,  2.51it/s]

tensor(3.7471, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▎         | 57/1529 [00:22<09:42,  2.53it/s]

tensor(3.7299, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▍         | 58/1529 [00:23<09:43,  2.52it/s]

tensor(3.7302, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▍         | 59/1529 [00:23<09:41,  2.53it/s]

tensor(3.7276, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▍         | 60/1529 [00:24<09:42,  2.52it/s]

tensor(3.7207, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▍         | 61/1529 [00:24<09:42,  2.52it/s]

tensor(3.7261, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▍         | 62/1529 [00:24<09:41,  2.52it/s]

tensor(3.7186, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▍         | 63/1529 [00:25<09:41,  2.52it/s]

tensor(3.7194, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▍         | 64/1529 [00:25<09:41,  2.52it/s]

tensor(3.7115, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▍         | 65/1529 [00:26<09:39,  2.53it/s]

tensor(3.7059, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▍         | 66/1529 [00:26<09:39,  2.52it/s]

tensor(3.7310, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▍         | 67/1529 [00:26<09:38,  2.53it/s]

tensor(3.7000, device='cuda:0', grad_fn=<DivBackward1>)



  4%|▍         | 68/1529 [00:27<09:39,  2.52it/s]

tensor(3.6994, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▍         | 69/1529 [00:27<09:39,  2.52it/s]

tensor(3.6932, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▍         | 70/1529 [00:28<09:37,  2.53it/s]

tensor(3.7309, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▍         | 71/1529 [00:28<09:39,  2.52it/s]

tensor(3.7252, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▍         | 72/1529 [00:28<09:38,  2.52it/s]

tensor(3.7319, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▍         | 73/1529 [00:29<09:37,  2.52it/s]

tensor(3.7264, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▍         | 74/1529 [00:29<09:37,  2.52it/s]

tensor(3.7086, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▍         | 75/1529 [00:30<09:36,  2.52it/s]

tensor(3.7090, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▍         | 76/1529 [00:30<09:36,  2.52it/s]

tensor(3.7095, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▌         | 77/1529 [00:30<09:35,  2.52it/s]

tensor(3.7161, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▌         | 78/1529 [00:31<09:35,  2.52it/s]

tensor(3.7230, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▌         | 79/1529 [00:31<09:36,  2.52it/s]

tensor(3.7152, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▌         | 80/1529 [00:32<09:35,  2.52it/s]

tensor(3.7034, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▌         | 81/1529 [00:32<09:34,  2.52it/s]

tensor(3.7017, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▌         | 82/1529 [00:32<09:33,  2.52it/s]

tensor(3.7068, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▌         | 83/1529 [00:33<09:33,  2.52it/s]

tensor(3.7038, device='cuda:0', grad_fn=<DivBackward1>)



  5%|▌         | 84/1529 [00:33<09:33,  2.52it/s]

tensor(3.7011, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▌         | 85/1529 [00:34<09:33,  2.52it/s]

tensor(3.7000, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▌         | 86/1529 [00:34<09:33,  2.52it/s]

tensor(3.6965, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▌         | 87/1529 [00:34<09:32,  2.52it/s]

tensor(3.6886, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▌         | 88/1529 [00:35<09:31,  2.52it/s]

tensor(3.6925, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▌         | 89/1529 [00:35<09:29,  2.53it/s]

tensor(3.6914, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▌         | 90/1529 [00:36<09:29,  2.53it/s]

tensor(3.6939, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▌         | 91/1529 [00:36<09:29,  2.53it/s]

tensor(3.6989, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▌         | 92/1529 [00:36<09:30,  2.52it/s]

tensor(3.6981, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▌         | 93/1529 [00:37<09:30,  2.52it/s]

tensor(3.6841, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▌         | 94/1529 [00:37<09:29,  2.52it/s]

tensor(3.7113, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▌         | 95/1529 [00:38<09:28,  2.52it/s]

tensor(3.6955, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▋         | 96/1529 [00:38<09:28,  2.52it/s]

tensor(3.6932, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▋         | 97/1529 [00:38<09:28,  2.52it/s]

tensor(3.6836, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▋         | 98/1529 [00:39<09:27,  2.52it/s]

tensor(3.6934, device='cuda:0', grad_fn=<DivBackward1>)



  6%|▋         | 99/1529 [00:39<09:31,  2.50it/s]

tensor(3.6907, device='cuda:0', grad_fn=<DivBackward1>)
beta tensor(0.0076, device='cuda:0')
cnn_encoder.conv1.weight tensor(0.0035, device='cuda:0')
cnn_encoder.conv2.weight tensor(0.0059, device='cuda:0')
cnn_encoder.bn1.weight tensor(0.0002, device='cuda:0')
cnn_encoder.bn1.bias tensor(5.1411e-05, device='cuda:0')
cnn_encoder.conv3.weight tensor(0.0012, device='cuda:0')
cnn_encoder.conv4.weight tensor(0.0017, device='cuda:0')
cnn_encoder.conv5.weight tensor(0.0044, device='cuda:0')
cnn_encoder.conv5.bias tensor(1.8720e-06, device='cuda:0')
cnn_encoder.bn2.weight tensor(0.0008, device='cuda:0')
cnn_encoder.bn2.bias tensor(0.0020, device='cuda:0')
lstm.weight_ih tensor(0.0509, device='cuda:0')
lstm.weight_hh tensor(0.0336, device='cuda:0')
lstm.bias_ih tensor(0.0050, device='cuda:0')
lstm.bias_hh tensor(0.0050, device='cuda:0')
emb.weight tensor(0.0044, device='cuda:0')
wh_in.weight tensor(0.0002, device='cuda:0')
wc_in.weight tensor(0.0002, device='cuda:0')
wo_in.weight tensor(0.0107


  7%|▋         | 100/1529 [00:40<09:29,  2.51it/s]

tensor(3.6842, device='cuda:0', grad_fn=<DivBackward1>)



  7%|▋         | 101/1529 [00:40<09:29,  2.51it/s]

tensor(3.6814, device='cuda:0', grad_fn=<DivBackward1>)



  7%|▋         | 102/1529 [00:40<09:28,  2.51it/s]

tensor(3.6806, device='cuda:0', grad_fn=<DivBackward1>)



  7%|▋         | 103/1529 [00:41<09:28,  2.51it/s]

tensor(3.6735, device='cuda:0', grad_fn=<DivBackward1>)



  7%|▋         | 104/1529 [00:41<09:27,  2.51it/s]

tensor(3.6896, device='cuda:0', grad_fn=<DivBackward1>)



  7%|▋         | 105/1529 [00:42<09:27,  2.51it/s]

tensor(3.6764, device='cuda:0', grad_fn=<DivBackward1>)



  7%|▋         | 106/1529 [00:42<09:26,  2.51it/s]

tensor(3.6840, device='cuda:0', grad_fn=<DivBackward1>)



  7%|▋         | 107/1529 [00:42<09:24,  2.52it/s]

tensor(3.6865, device='cuda:0', grad_fn=<DivBackward1>)



  7%|▋         | 108/1529 [00:43<09:23,  2.52it/s]

tensor(3.6827, device='cuda:0', grad_fn=<DivBackward1>)



  7%|▋         | 109/1529 [00:43<09:23,  2.52it/s]

tensor(3.6770, device='cuda:0', grad_fn=<DivBackward1>)



  7%|▋         | 110/1529 [00:43<09:22,  2.52it/s]

tensor(3.6810, device='cuda:0', grad_fn=<DivBackward1>)



  7%|▋         | 111/1529 [00:44<08:42,  2.72it/s]

tensor(3.6779, device='cuda:0', grad_fn=<DivBackward1>)



  7%|▋         | 112/1529 [00:44<08:13,  2.87it/s]

tensor(3.6837, device='cuda:0', grad_fn=<DivBackward1>)



  7%|▋         | 113/1529 [00:44<07:52,  3.00it/s]

tensor(3.6843, device='cuda:0', grad_fn=<DivBackward1>)



  7%|▋         | 114/1529 [00:45<07:38,  3.09it/s]

tensor(3.6842, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 115/1529 [00:45<07:28,  3.15it/s]

tensor(3.6819, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 116/1529 [00:45<07:20,  3.20it/s]

tensor(3.6844, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 117/1529 [00:46<07:15,  3.24it/s]

tensor(3.6790, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 118/1529 [00:46<07:11,  3.27it/s]

tensor(3.6738, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 119/1529 [00:46<07:09,  3.28it/s]

tensor(3.6764, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 120/1529 [00:46<07:07,  3.29it/s]

tensor(3.6752, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 121/1529 [00:47<07:05,  3.31it/s]

tensor(3.6784, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 122/1529 [00:47<07:05,  3.31it/s]

tensor(3.6796, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 123/1529 [00:47<07:05,  3.30it/s]

tensor(3.6766, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 124/1529 [00:48<07:05,  3.30it/s]

tensor(3.6769, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 125/1529 [00:48<07:04,  3.31it/s]

tensor(3.6840, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 126/1529 [00:48<07:02,  3.32it/s]

tensor(3.6747, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 127/1529 [00:49<07:03,  3.31it/s]

tensor(3.6723, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 128/1529 [00:49<07:02,  3.31it/s]

tensor(3.6867, device='cuda:0', grad_fn=<DivBackward1>)



  8%|▊         | 129/1529 [00:49<07:02,  3.32it/s]

tensor(3.6778, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▊         | 130/1529 [00:50<07:02,  3.31it/s]

tensor(3.6838, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▊         | 131/1529 [00:50<07:01,  3.32it/s]

tensor(3.6756, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▊         | 132/1529 [00:50<07:02,  3.31it/s]

tensor(3.6770, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▊         | 133/1529 [00:50<07:03,  3.30it/s]

tensor(3.6797, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▉         | 134/1529 [00:51<07:02,  3.30it/s]

tensor(3.6753, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▉         | 135/1529 [00:51<07:02,  3.30it/s]

tensor(3.6759, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▉         | 136/1529 [00:51<07:01,  3.31it/s]

tensor(3.6849, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▉         | 137/1529 [00:52<07:00,  3.31it/s]

tensor(3.6811, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▉         | 138/1529 [00:52<07:00,  3.31it/s]

tensor(3.6818, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▉         | 139/1529 [00:52<07:00,  3.31it/s]

tensor(3.6823, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▉         | 140/1529 [00:53<06:59,  3.31it/s]

tensor(3.6779, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▉         | 141/1529 [00:53<06:58,  3.32it/s]

tensor(3.6835, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▉         | 142/1529 [00:53<06:58,  3.32it/s]

tensor(3.6813, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▉         | 143/1529 [00:53<06:57,  3.32it/s]

tensor(3.6783, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▉         | 144/1529 [00:54<06:57,  3.31it/s]

tensor(3.6792, device='cuda:0', grad_fn=<DivBackward1>)



  9%|▉         | 145/1529 [00:54<06:57,  3.31it/s]

tensor(3.6801, device='cuda:0', grad_fn=<DivBackward1>)



 10%|▉         | 146/1529 [00:54<06:57,  3.31it/s]

tensor(3.6690, device='cuda:0', grad_fn=<DivBackward1>)



 10%|▉         | 147/1529 [00:55<06:58,  3.30it/s]

tensor(3.6736, device='cuda:0', grad_fn=<DivBackward1>)



 10%|▉         | 148/1529 [00:55<07:02,  3.27it/s]

tensor(3.6708, device='cuda:0', grad_fn=<DivBackward1>)
beta tensor(0.0019, device='cuda:0')
cnn_encoder.conv1.weight tensor(0.0027, device='cuda:0')
cnn_encoder.conv2.weight tensor(0.0033, device='cuda:0')
cnn_encoder.bn1.weight tensor(0.0002, device='cuda:0')
cnn_encoder.bn1.bias tensor(8.8523e-05, device='cuda:0')
cnn_encoder.conv3.weight tensor(0.0017, device='cuda:0')
cnn_encoder.conv4.weight tensor(0.0019, device='cuda:0')
cnn_encoder.conv5.weight tensor(0.0062, device='cuda:0')
cnn_encoder.conv5.bias tensor(2.5977e-06, device='cuda:0')
cnn_encoder.bn2.weight tensor(0.0005, device='cuda:0')
cnn_encoder.bn2.bias tensor(0.0025, device='cuda:0')
lstm.weight_ih tensor(0.0782, device='cuda:0')
lstm.weight_hh tensor(0.0362, device='cuda:0')
lstm.bias_ih tensor(0.0070, device='cuda:0')
lstm.bias_hh tensor(0.0070, device='cuda:0')
emb.weight tensor(0.0038, device='cuda:0')
wh_in.weight tensor(0.0002, device='cuda:0')
wc_in.weight tensor(0.0003, device='cuda:0')
wo_in.weight tensor(0.0173


 10%|▉         | 149/1529 [00:55<07:00,  3.28it/s]

tensor(3.6813, device='cuda:0', grad_fn=<DivBackward1>)



 10%|▉         | 150/1529 [00:56<06:57,  3.30it/s]

tensor(3.6747, device='cuda:0', grad_fn=<DivBackward1>)



 10%|▉         | 151/1529 [00:56<06:58,  3.30it/s]

tensor(3.6785, device='cuda:0', grad_fn=<DivBackward1>)



 10%|▉         | 152/1529 [00:56<07:00,  3.27it/s]

tensor(3.6754, device='cuda:0', grad_fn=<DivBackward1>)



 10%|█         | 153/1529 [00:56<06:58,  3.29it/s]

tensor(3.6826, device='cuda:0', grad_fn=<DivBackward1>)



 10%|█         | 154/1529 [00:57<06:58,  3.29it/s]

tensor(3.6774, device='cuda:0', grad_fn=<DivBackward1>)



 10%|█         | 155/1529 [00:57<06:58,  3.29it/s]

tensor(3.6789, device='cuda:0', grad_fn=<DivBackward1>)



 10%|█         | 156/1529 [00:57<06:56,  3.30it/s]

tensor(3.6755, device='cuda:0', grad_fn=<DivBackward1>)



 10%|█         | 157/1529 [00:58<06:54,  3.31it/s]

tensor(3.6798, device='cuda:0', grad_fn=<DivBackward1>)



 10%|█         | 158/1529 [00:58<06:55,  3.30it/s]

tensor(3.6701, device='cuda:0', grad_fn=<DivBackward1>)



 10%|█         | 159/1529 [00:58<06:56,  3.29it/s]

tensor(3.6760, device='cuda:0', grad_fn=<DivBackward1>)



 10%|█         | 160/1529 [00:59<06:55,  3.29it/s]

tensor(3.6715, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█         | 161/1529 [00:59<06:54,  3.30it/s]

tensor(3.6775, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█         | 162/1529 [00:59<06:53,  3.30it/s]

tensor(3.6709, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█         | 163/1529 [01:00<06:53,  3.31it/s]

tensor(3.6780, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█         | 164/1529 [01:00<06:51,  3.31it/s]

tensor(3.6818, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█         | 165/1529 [01:00<06:51,  3.32it/s]

tensor(3.6744, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█         | 166/1529 [01:01<07:44,  2.94it/s]

tensor(3.6765, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█         | 167/1529 [01:01<07:27,  3.04it/s]

tensor(3.6760, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█         | 168/1529 [01:01<07:16,  3.12it/s]

tensor(3.6856, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█         | 169/1529 [01:01<07:08,  3.17it/s]

tensor(3.6720, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█         | 170/1529 [01:02<07:02,  3.22it/s]

tensor(3.6699, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█         | 171/1529 [01:02<06:59,  3.24it/s]

tensor(3.6803, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█         | 172/1529 [01:02<06:56,  3.26it/s]

tensor(3.6738, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█▏        | 173/1529 [01:03<06:53,  3.28it/s]

tensor(3.6752, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█▏        | 174/1529 [01:03<06:53,  3.28it/s]

tensor(3.6745, device='cuda:0', grad_fn=<DivBackward1>)



 11%|█▏        | 175/1529 [01:03<06:51,  3.29it/s]

tensor(3.6703, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 176/1529 [01:04<06:50,  3.30it/s]

tensor(3.6720, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 177/1529 [01:04<06:49,  3.30it/s]

tensor(3.6726, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 178/1529 [01:04<06:49,  3.30it/s]

tensor(3.6680, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 179/1529 [01:04<06:49,  3.30it/s]

tensor(3.6759, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 180/1529 [01:05<06:47,  3.31it/s]

tensor(3.6791, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 181/1529 [01:05<06:47,  3.31it/s]

tensor(3.6743, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 182/1529 [01:05<06:47,  3.31it/s]

tensor(3.6746, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 183/1529 [01:06<06:46,  3.31it/s]

tensor(3.6884, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 184/1529 [01:06<06:46,  3.31it/s]

tensor(3.6782, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 185/1529 [01:06<06:46,  3.31it/s]

tensor(3.6755, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 186/1529 [01:07<06:48,  3.29it/s]

tensor(3.6780, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 187/1529 [01:07<06:47,  3.29it/s]

tensor(3.6844, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 188/1529 [01:07<06:47,  3.29it/s]

tensor(3.6784, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 189/1529 [01:07<06:46,  3.30it/s]

tensor(3.6724, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 190/1529 [01:08<06:45,  3.30it/s]

tensor(3.6733, device='cuda:0', grad_fn=<DivBackward1>)



 12%|█▏        | 191/1529 [01:08<06:45,  3.30it/s]

tensor(3.6796, device='cuda:0', grad_fn=<DivBackward1>)



 13%|█▎        | 192/1529 [01:08<06:44,  3.30it/s]

tensor(3.6791, device='cuda:0', grad_fn=<DivBackward1>)



 13%|█▎        | 193/1529 [01:09<06:43,  3.31it/s]

tensor(3.6790, device='cuda:0', grad_fn=<DivBackward1>)



 13%|█▎        | 194/1529 [01:09<06:43,  3.31it/s]

tensor(3.6756, device='cuda:0', grad_fn=<DivBackward1>)



 13%|█▎        | 195/1529 [01:09<06:42,  3.31it/s]

tensor(3.6855, device='cuda:0', grad_fn=<DivBackward1>)



 13%|█▎        | 196/1529 [01:10<06:44,  3.30it/s]

tensor(3.6798, device='cuda:0', grad_fn=<DivBackward1>)



 13%|█▎        | 197/1529 [01:10<06:46,  3.27it/s]

tensor(3.6712, device='cuda:0', grad_fn=<DivBackward1>)
beta tensor(0.0008, device='cuda:0')
cnn_encoder.conv1.weight tensor(0.0078, device='cuda:0')
cnn_encoder.conv2.weight tensor(0.0065, device='cuda:0')
cnn_encoder.bn1.weight tensor(0.0005, device='cuda:0')
cnn_encoder.bn1.bias tensor(0.0008, device='cuda:0')
cnn_encoder.conv3.weight tensor(0.0022, device='cuda:0')
cnn_encoder.conv4.weight tensor(0.0019, device='cuda:0')
cnn_encoder.conv5.weight tensor(0.0027, device='cuda:0')
cnn_encoder.conv5.bias tensor(8.9721e-07, device='cuda:0')
cnn_encoder.bn2.weight tensor(0.0004, device='cuda:0')
cnn_encoder.bn2.bias tensor(0.0008, device='cuda:0')
lstm.weight_ih tensor(0.0196, device='cuda:0')
lstm.weight_hh tensor(0.0142, device='cuda:0')
lstm.bias_ih tensor(0.0019, device='cuda:0')
lstm.bias_hh tensor(0.0019, device='cuda:0')
emb.weight tensor(0.0012, device='cuda:0')
wh_in.weight tensor(0.0002, device='cuda:0')
wc_in.weight tensor(0.0002, device='cuda:0')
wo_in.weight tensor(0.0045, de


 13%|█▎        | 198/1529 [01:10<06:45,  3.28it/s]

tensor(3.6783, device='cuda:0', grad_fn=<DivBackward1>)



 13%|█▎        | 199/1529 [01:11<06:44,  3.29it/s]

tensor(3.6797, device='cuda:0', grad_fn=<DivBackward1>)



 13%|█▎        | 200/1529 [01:11<06:43,  3.29it/s]

tensor(3.6746, device='cuda:0', grad_fn=<DivBackward1>)



 13%|█▎        | 201/1529 [01:11<06:42,  3.30it/s]

tensor(3.6772, device='cuda:0', grad_fn=<DivBackward1>)



 13%|█▎        | 202/1529 [01:11<06:42,  3.30it/s]

tensor(3.6745, device='cuda:0', grad_fn=<DivBackward1>)



 13%|█▎        | 203/1529 [01:12<06:44,  3.27it/s]

tensor(3.6726, device='cuda:0', grad_fn=<DivBackward1>)



 13%|█▎        | 204/1529 [01:12<06:43,  3.28it/s]

tensor(3.6693, device='cuda:0', grad_fn=<DivBackward1>)



 13%|█▎        | 205/1529 [01:12<06:43,  3.28it/s]

tensor(3.6694, device='cuda:0', grad_fn=<DivBackward1>)



 13%|█▎        | 206/1529 [01:13<06:41,  3.30it/s]

tensor(3.6889, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▎        | 207/1529 [01:13<06:40,  3.30it/s]

tensor(3.6675, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▎        | 208/1529 [01:13<06:40,  3.30it/s]

tensor(3.6786, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▎        | 209/1529 [01:14<06:39,  3.30it/s]

tensor(3.6752, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▎        | 210/1529 [01:14<06:40,  3.30it/s]

tensor(3.6839, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▍        | 211/1529 [01:14<06:39,  3.30it/s]

tensor(3.6818, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▍        | 212/1529 [01:14<06:38,  3.30it/s]

tensor(3.6725, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▍        | 213/1529 [01:15<06:37,  3.31it/s]

tensor(3.6753, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▍        | 214/1529 [01:15<06:37,  3.31it/s]

tensor(3.6796, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▍        | 215/1529 [01:15<06:37,  3.31it/s]

tensor(3.6836, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▍        | 216/1529 [01:16<06:36,  3.31it/s]

tensor(3.6804, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▍        | 217/1529 [01:16<06:37,  3.30it/s]

tensor(3.6811, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▍        | 218/1529 [01:16<06:36,  3.30it/s]

tensor(3.6777, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▍        | 219/1529 [01:17<06:36,  3.30it/s]

tensor(3.6719, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▍        | 220/1529 [01:17<06:37,  3.30it/s]

tensor(3.6699, device='cuda:0', grad_fn=<DivBackward1>)



 14%|█▍        | 221/1529 [01:17<06:36,  3.30it/s]

tensor(3.6734, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▍        | 222/1529 [01:18<06:35,  3.30it/s]

tensor(3.6698, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▍        | 223/1529 [01:18<06:35,  3.30it/s]

tensor(3.6728, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▍        | 224/1529 [01:18<06:35,  3.30it/s]

tensor(3.6760, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▍        | 225/1529 [01:18<06:35,  3.29it/s]

tensor(3.6716, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▍        | 226/1529 [01:19<06:35,  3.30it/s]

tensor(3.6748, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▍        | 227/1529 [01:19<06:34,  3.30it/s]

tensor(3.6808, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▍        | 228/1529 [01:19<06:33,  3.30it/s]

tensor(3.6745, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▍        | 229/1529 [01:20<06:32,  3.31it/s]

tensor(3.6782, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▌        | 230/1529 [01:20<06:32,  3.31it/s]

tensor(3.6761, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▌        | 231/1529 [01:20<06:32,  3.31it/s]

tensor(3.6731, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▌        | 232/1529 [01:21<06:32,  3.31it/s]

tensor(3.6851, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▌        | 233/1529 [01:21<06:31,  3.31it/s]

tensor(3.6725, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▌        | 234/1529 [01:21<06:32,  3.30it/s]

tensor(3.6688, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▌        | 235/1529 [01:21<06:30,  3.31it/s]

tensor(3.6744, device='cuda:0', grad_fn=<DivBackward1>)



 15%|█▌        | 236/1529 [01:22<06:30,  3.31it/s]

tensor(3.6856, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▌        | 237/1529 [01:22<06:30,  3.31it/s]

tensor(3.6778, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▌        | 238/1529 [01:22<06:29,  3.31it/s]

tensor(3.6817, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▌        | 239/1529 [01:23<06:30,  3.31it/s]

tensor(3.6772, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▌        | 240/1529 [01:23<06:29,  3.31it/s]

tensor(3.6722, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▌        | 241/1529 [01:23<06:29,  3.30it/s]

tensor(3.6815, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▌        | 242/1529 [01:24<06:29,  3.31it/s]

tensor(3.6804, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▌        | 243/1529 [01:24<06:29,  3.30it/s]

tensor(3.6716, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▌        | 244/1529 [01:24<06:29,  3.30it/s]

tensor(3.6720, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▌        | 245/1529 [01:24<06:28,  3.31it/s]

tensor(3.6715, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▌        | 246/1529 [01:25<06:31,  3.28it/s]

tensor(3.6737, device='cuda:0', grad_fn=<DivBackward1>)
beta tensor(0.0038, device='cuda:0')
cnn_encoder.conv1.weight tensor(0.0040, device='cuda:0')
cnn_encoder.conv2.weight tensor(0.0051, device='cuda:0')
cnn_encoder.bn1.weight tensor(0.0004, device='cuda:0')
cnn_encoder.bn1.bias tensor(0.0002, device='cuda:0')
cnn_encoder.conv3.weight tensor(0.0018, device='cuda:0')
cnn_encoder.conv4.weight tensor(0.0011, device='cuda:0')
cnn_encoder.conv5.weight tensor(0.0023, device='cuda:0')
cnn_encoder.conv5.bias tensor(7.6721e-07, device='cuda:0')
cnn_encoder.bn2.weight tensor(0.0004, device='cuda:0')
cnn_encoder.bn2.bias tensor(0.0013, device='cuda:0')
lstm.weight_ih tensor(0.0196, device='cuda:0')
lstm.weight_hh tensor(0.0139, device='cuda:0')
lstm.bias_ih tensor(0.0019, device='cuda:0')
lstm.bias_hh tensor(0.0019, device='cuda:0')
emb.weight tensor(0.0017, device='cuda:0')
wh_in.weight tensor(5.4718e-05, device='cuda:0')
wc_in.weight tensor(8.4931e-05, device='cuda:0')
wo_in.weight tensor(0.


 16%|█▌        | 247/1529 [01:25<06:30,  3.29it/s]

tensor(3.6724, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▌        | 248/1529 [01:25<06:28,  3.30it/s]

tensor(3.6818, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▋        | 249/1529 [01:26<06:27,  3.30it/s]

tensor(3.6784, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▋        | 250/1529 [01:26<06:27,  3.30it/s]

tensor(3.6734, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▋        | 251/1529 [01:26<06:28,  3.29it/s]

tensor(3.6733, device='cuda:0', grad_fn=<DivBackward1>)



 16%|█▋        | 252/1529 [01:27<06:27,  3.29it/s]

tensor(3.6671, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 253/1529 [01:27<06:26,  3.30it/s]

tensor(3.6710, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 254/1529 [01:27<06:25,  3.31it/s]

tensor(3.6849, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 255/1529 [01:27<06:25,  3.31it/s]

tensor(3.6723, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 256/1529 [01:28<06:25,  3.30it/s]

tensor(3.6728, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 257/1529 [01:28<06:24,  3.31it/s]

tensor(3.6798, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 258/1529 [01:28<06:27,  3.28it/s]

tensor(3.6812, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 259/1529 [01:29<06:26,  3.29it/s]

tensor(3.6768, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 260/1529 [01:29<06:25,  3.29it/s]

tensor(3.6786, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 261/1529 [01:29<06:23,  3.30it/s]

tensor(3.6750, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 262/1529 [01:30<06:23,  3.31it/s]

tensor(3.6762, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 263/1529 [01:30<06:22,  3.31it/s]

tensor(3.6849, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 264/1529 [01:30<06:22,  3.30it/s]

tensor(3.6766, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 265/1529 [01:31<06:22,  3.30it/s]

tensor(3.6712, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 266/1529 [01:31<06:22,  3.30it/s]

tensor(3.6726, device='cuda:0', grad_fn=<DivBackward1>)



 17%|█▋        | 267/1529 [01:31<06:22,  3.30it/s]

tensor(3.6720, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 268/1529 [01:31<06:21,  3.31it/s]

tensor(3.6783, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 269/1529 [01:32<06:20,  3.31it/s]

tensor(3.6693, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 270/1529 [01:32<06:21,  3.30it/s]

tensor(3.6750, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 271/1529 [01:32<06:20,  3.31it/s]

tensor(3.6685, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 272/1529 [01:33<06:22,  3.29it/s]

tensor(3.6717, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 273/1529 [01:33<06:21,  3.29it/s]

tensor(3.6814, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 274/1529 [01:33<06:20,  3.30it/s]

tensor(3.6770, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 275/1529 [01:34<06:19,  3.30it/s]

tensor(3.6681, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 276/1529 [01:34<06:19,  3.31it/s]

tensor(3.6658, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 277/1529 [01:34<06:18,  3.31it/s]

tensor(3.6748, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 278/1529 [01:34<06:18,  3.31it/s]

tensor(3.6796, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 279/1529 [01:35<06:17,  3.31it/s]

tensor(3.6737, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 280/1529 [01:35<06:18,  3.30it/s]

tensor(3.6659, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 281/1529 [01:35<06:17,  3.31it/s]

tensor(3.6773, device='cuda:0', grad_fn=<DivBackward1>)



 18%|█▊        | 282/1529 [01:36<06:17,  3.30it/s]

tensor(3.6736, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▊        | 283/1529 [01:36<06:17,  3.30it/s]

tensor(3.6711, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▊        | 284/1529 [01:36<06:16,  3.30it/s]

tensor(3.6684, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▊        | 285/1529 [01:37<06:16,  3.31it/s]

tensor(3.6761, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▊        | 286/1529 [01:37<06:18,  3.29it/s]

tensor(3.6782, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▉        | 287/1529 [01:37<06:17,  3.29it/s]

tensor(3.6759, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▉        | 288/1529 [01:37<06:16,  3.30it/s]

tensor(3.6659, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▉        | 289/1529 [01:38<06:15,  3.30it/s]

tensor(3.6703, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▉        | 290/1529 [01:38<06:15,  3.30it/s]

tensor(3.6776, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▉        | 291/1529 [01:38<06:15,  3.30it/s]

tensor(3.6764, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▉        | 292/1529 [01:39<06:14,  3.31it/s]

tensor(3.6852, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▉        | 293/1529 [01:39<06:13,  3.31it/s]

tensor(3.6745, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▉        | 294/1529 [01:39<06:12,  3.31it/s]

tensor(3.6835, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▉        | 295/1529 [01:40<06:15,  3.29it/s]

tensor(3.6769, device='cuda:0', grad_fn=<DivBackward1>)
beta tensor(0.0023, device='cuda:0')
cnn_encoder.conv1.weight tensor(0.0042, device='cuda:0')
cnn_encoder.conv2.weight tensor(0.0039, device='cuda:0')
cnn_encoder.bn1.weight tensor(0.0004, device='cuda:0')
cnn_encoder.bn1.bias tensor(0.0003, device='cuda:0')
cnn_encoder.conv3.weight tensor(0.0016, device='cuda:0')
cnn_encoder.conv4.weight tensor(0.0014, device='cuda:0')
cnn_encoder.conv5.weight tensor(0.0016, device='cuda:0')
cnn_encoder.conv5.bias tensor(4.3174e-07, device='cuda:0')
cnn_encoder.bn2.weight tensor(0.0003, device='cuda:0')
cnn_encoder.bn2.bias tensor(0.0023, device='cuda:0')
lstm.weight_ih tensor(0.0194, device='cuda:0')
lstm.weight_hh tensor(0.0112, device='cuda:0')
lstm.bias_ih tensor(0.0023, device='cuda:0')
lstm.bias_hh tensor(0.0023, device='cuda:0')
emb.weight tensor(0.0018, device='cuda:0')
wh_in.weight tensor(0.0001, device='cuda:0')
wc_in.weight tensor(0.0002, device='cuda:0')
wo_in.weight tensor(0.0030, de


 19%|█▉        | 296/1529 [01:40<06:14,  3.29it/s]

tensor(3.6768, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▉        | 297/1529 [01:40<06:14,  3.29it/s]

tensor(3.6784, device='cuda:0', grad_fn=<DivBackward1>)



 19%|█▉        | 298/1529 [01:41<06:16,  3.27it/s]

tensor(3.6730, device='cuda:0', grad_fn=<DivBackward1>)



 20%|█▉        | 299/1529 [01:41<06:20,  3.23it/s]

tensor(3.6684, device='cuda:0', grad_fn=<DivBackward1>)



 20%|█▉        | 300/1529 [01:41<06:25,  3.19it/s]

tensor(3.6636, device='cuda:0', grad_fn=<DivBackward1>)



 20%|█▉        | 301/1529 [01:42<06:30,  3.15it/s]

tensor(3.6752, device='cuda:0', grad_fn=<DivBackward1>)



 20%|█▉        | 302/1529 [01:42<06:29,  3.15it/s]

tensor(3.6725, device='cuda:0', grad_fn=<DivBackward1>)



 20%|█▉        | 303/1529 [01:42<06:29,  3.15it/s]

tensor(3.6806, device='cuda:0', grad_fn=<DivBackward1>)



 20%|█▉        | 304/1529 [01:42<06:23,  3.19it/s]

tensor(3.6769, device='cuda:0', grad_fn=<DivBackward1>)



 20%|█▉        | 305/1529 [01:43<06:19,  3.23it/s]

tensor(3.6769, device='cuda:0', grad_fn=<DivBackward1>)



 20%|██        | 306/1529 [01:43<06:16,  3.25it/s]

tensor(3.6784, device='cuda:0', grad_fn=<DivBackward1>)



 20%|██        | 307/1529 [01:43<06:14,  3.27it/s]

tensor(3.6776, device='cuda:0', grad_fn=<DivBackward1>)



 20%|██        | 308/1529 [01:44<06:12,  3.27it/s]

tensor(3.6743, device='cuda:0', grad_fn=<DivBackward1>)



 20%|██        | 309/1529 [01:44<06:10,  3.29it/s]

tensor(3.6728, device='cuda:0', grad_fn=<DivBackward1>)



 20%|██        | 310/1529 [01:44<06:09,  3.30it/s]

tensor(3.6724, device='cuda:0', grad_fn=<DivBackward1>)



 20%|██        | 311/1529 [01:45<06:08,  3.31it/s]

tensor(3.6737, device='cuda:0', grad_fn=<DivBackward1>)



 20%|██        | 312/1529 [01:45<06:08,  3.30it/s]

tensor(3.6711, device='cuda:0', grad_fn=<DivBackward1>)



 20%|██        | 313/1529 [01:45<06:08,  3.30it/s]

tensor(3.6693, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██        | 314/1529 [01:45<06:07,  3.31it/s]

tensor(3.6709, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██        | 315/1529 [01:46<06:06,  3.31it/s]

tensor(3.6732, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██        | 316/1529 [01:46<06:07,  3.30it/s]

tensor(3.6705, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██        | 317/1529 [01:46<06:05,  3.31it/s]

tensor(3.6734, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██        | 318/1529 [01:47<06:05,  3.32it/s]

tensor(3.6753, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██        | 319/1529 [01:47<06:04,  3.32it/s]

tensor(3.6691, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██        | 320/1529 [01:47<06:06,  3.30it/s]

tensor(3.6800, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██        | 321/1529 [01:48<06:06,  3.30it/s]

tensor(3.6825, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██        | 322/1529 [01:48<06:05,  3.30it/s]

tensor(3.6660, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██        | 323/1529 [01:48<06:05,  3.30it/s]

tensor(3.6724, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██        | 324/1529 [01:48<06:04,  3.31it/s]

tensor(3.6780, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██▏       | 325/1529 [01:49<06:02,  3.32it/s]

tensor(3.6765, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██▏       | 326/1529 [01:49<06:03,  3.31it/s]

tensor(3.6762, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██▏       | 327/1529 [01:49<06:02,  3.32it/s]

tensor(3.6806, device='cuda:0', grad_fn=<DivBackward1>)



 21%|██▏       | 328/1529 [01:50<06:02,  3.31it/s]

tensor(3.6828, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 329/1529 [01:50<06:02,  3.31it/s]

tensor(3.6733, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 330/1529 [01:50<06:02,  3.31it/s]

tensor(3.6788, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 331/1529 [01:51<06:01,  3.31it/s]

tensor(3.6776, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 332/1529 [01:51<06:01,  3.31it/s]

tensor(3.6745, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 333/1529 [01:51<06:00,  3.32it/s]

tensor(3.6708, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 334/1529 [01:52<06:00,  3.31it/s]

tensor(3.6666, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 335/1529 [01:52<06:00,  3.32it/s]

tensor(3.6740, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 336/1529 [01:52<06:01,  3.30it/s]

tensor(3.6671, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 337/1529 [01:52<06:01,  3.30it/s]

tensor(3.6722, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 338/1529 [01:53<06:01,  3.30it/s]

tensor(3.6730, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 339/1529 [01:53<06:00,  3.30it/s]

tensor(3.6724, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 340/1529 [01:53<06:00,  3.30it/s]

tensor(3.6761, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 341/1529 [01:54<05:58,  3.31it/s]

tensor(3.6771, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 342/1529 [01:54<05:58,  3.31it/s]

tensor(3.6629, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 343/1529 [01:54<05:59,  3.30it/s]

tensor(3.6734, device='cuda:0', grad_fn=<DivBackward1>)



 22%|██▏       | 344/1529 [01:55<06:01,  3.28it/s]

tensor(3.6713, device='cuda:0', grad_fn=<DivBackward1>)
beta tensor(0.0008, device='cuda:0')
cnn_encoder.conv1.weight tensor(0.0038, device='cuda:0')
cnn_encoder.conv2.weight tensor(0.0054, device='cuda:0')
cnn_encoder.bn1.weight tensor(0.0004, device='cuda:0')
cnn_encoder.bn1.bias tensor(0.0002, device='cuda:0')
cnn_encoder.conv3.weight tensor(0.0019, device='cuda:0')
cnn_encoder.conv4.weight tensor(0.0014, device='cuda:0')
cnn_encoder.conv5.weight tensor(0.0014, device='cuda:0')
cnn_encoder.conv5.bias tensor(4.4189e-07, device='cuda:0')
cnn_encoder.bn2.weight tensor(0.0003, device='cuda:0')
cnn_encoder.bn2.bias tensor(0.0008, device='cuda:0')
lstm.weight_ih tensor(0.0123, device='cuda:0')
lstm.weight_hh tensor(0.0085, device='cuda:0')
lstm.bias_ih tensor(0.0014, device='cuda:0')
lstm.bias_hh tensor(0.0014, device='cuda:0')
emb.weight tensor(0.0008, device='cuda:0')
wh_in.weight tensor(9.2709e-05, device='cuda:0')
wc_in.weight tensor(0.0002, device='cuda:0')
wo_in.weight tensor(0.0026


 23%|██▎       | 345/1529 [01:55<05:59,  3.29it/s]

tensor(3.6825, device='cuda:0', grad_fn=<DivBackward1>)



 23%|██▎       | 346/1529 [01:55<05:59,  3.29it/s]

tensor(3.6692, device='cuda:0', grad_fn=<DivBackward1>)



 23%|██▎       | 347/1529 [01:55<06:00,  3.28it/s]

tensor(3.6679, device='cuda:0', grad_fn=<DivBackward1>)



 23%|██▎       | 348/1529 [01:56<05:59,  3.28it/s]

tensor(3.6824, device='cuda:0', grad_fn=<DivBackward1>)



 23%|██▎       | 349/1529 [01:56<05:58,  3.29it/s]

tensor(3.6700, device='cuda:0', grad_fn=<DivBackward1>)



 23%|██▎       | 350/1529 [01:56<05:58,  3.29it/s]

tensor(3.6682, device='cuda:0', grad_fn=<DivBackward1>)



 23%|██▎       | 351/1529 [01:57<05:57,  3.30it/s]

tensor(3.6800, device='cuda:0', grad_fn=<DivBackward1>)



 23%|██▎       | 352/1529 [01:57<05:57,  3.29it/s]

tensor(3.6685, device='cuda:0', grad_fn=<DivBackward1>)



 23%|██▎       | 353/1529 [01:57<05:57,  3.29it/s]

tensor(3.6793, device='cuda:0', grad_fn=<DivBackward1>)



 23%|██▎       | 354/1529 [01:58<05:56,  3.29it/s]

tensor(3.6766, device='cuda:0', grad_fn=<DivBackward1>)



 23%|██▎       | 355/1529 [01:58<05:56,  3.30it/s]

tensor(3.6725, device='cuda:0', grad_fn=<DivBackward1>)



 23%|██▎       | 356/1529 [01:58<05:56,  3.29it/s]

tensor(3.6721, device='cuda:0', grad_fn=<DivBackward1>)



 23%|██▎       | 357/1529 [01:58<05:55,  3.29it/s]

tensor(3.6686, device='cuda:0', grad_fn=<DivBackward1>)



 23%|██▎       | 358/1529 [01:59<05:56,  3.28it/s]

tensor(3.6690, device='cuda:0', grad_fn=<DivBackward1>)



 23%|██▎       | 359/1529 [01:59<05:55,  3.29it/s]

tensor(3.6666, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▎       | 360/1529 [01:59<05:54,  3.29it/s]

tensor(3.6838, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▎       | 361/1529 [02:00<05:54,  3.30it/s]

tensor(3.6689, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▎       | 362/1529 [02:00<05:53,  3.30it/s]

tensor(3.6707, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▎       | 363/1529 [02:00<05:53,  3.29it/s]

tensor(3.6710, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▍       | 364/1529 [02:01<05:52,  3.30it/s]

tensor(3.6710, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▍       | 365/1529 [02:01<05:51,  3.31it/s]

tensor(3.6741, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▍       | 366/1529 [02:01<05:51,  3.31it/s]

tensor(3.6740, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▍       | 367/1529 [02:02<05:50,  3.31it/s]

tensor(3.6854, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▍       | 368/1529 [02:02<05:50,  3.32it/s]

tensor(3.6832, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▍       | 369/1529 [02:02<05:50,  3.31it/s]

tensor(3.6701, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▍       | 370/1529 [02:02<05:50,  3.31it/s]

tensor(3.6765, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▍       | 371/1529 [02:03<05:49,  3.31it/s]

tensor(3.6732, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▍       | 372/1529 [02:03<05:50,  3.30it/s]

tensor(3.6704, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▍       | 373/1529 [02:03<05:48,  3.32it/s]

tensor(3.6696, device='cuda:0', grad_fn=<DivBackward1>)



 24%|██▍       | 374/1529 [02:04<05:48,  3.32it/s]

tensor(3.6811, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▍       | 375/1529 [02:04<05:48,  3.31it/s]

tensor(3.6717, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▍       | 376/1529 [02:04<05:48,  3.31it/s]

tensor(3.6742, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▍       | 377/1529 [02:05<05:47,  3.31it/s]

tensor(3.6747, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▍       | 378/1529 [02:05<05:47,  3.31it/s]

tensor(3.6753, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▍       | 379/1529 [02:05<05:47,  3.31it/s]

tensor(3.6812, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▍       | 380/1529 [02:05<05:46,  3.31it/s]

tensor(3.6733, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▍       | 381/1529 [02:06<05:45,  3.32it/s]

tensor(3.6769, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▍       | 382/1529 [02:06<05:46,  3.31it/s]

tensor(3.6694, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▌       | 383/1529 [02:06<05:46,  3.31it/s]

tensor(3.6695, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▌       | 384/1529 [02:07<05:46,  3.30it/s]

tensor(3.6704, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▌       | 385/1529 [02:07<05:46,  3.31it/s]

tensor(3.6718, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▌       | 386/1529 [02:07<05:48,  3.28it/s]

tensor(3.6662, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▌       | 387/1529 [02:08<05:47,  3.28it/s]

tensor(3.6719, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▌       | 388/1529 [02:08<05:46,  3.30it/s]

tensor(3.6835, device='cuda:0', grad_fn=<DivBackward1>)



 25%|██▌       | 389/1529 [02:08<05:45,  3.30it/s]

tensor(3.6716, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▌       | 390/1529 [02:08<05:44,  3.31it/s]

tensor(3.6777, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▌       | 391/1529 [02:09<05:43,  3.31it/s]

tensor(3.6696, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▌       | 392/1529 [02:09<05:44,  3.30it/s]

tensor(3.6657, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▌       | 393/1529 [02:09<05:46,  3.28it/s]

tensor(3.6842, device='cuda:0', grad_fn=<DivBackward1>)
beta tensor(0.0030, device='cuda:0')
cnn_encoder.conv1.weight tensor(0.0074, device='cuda:0')
cnn_encoder.conv2.weight tensor(0.0071, device='cuda:0')
cnn_encoder.bn1.weight tensor(0.0008, device='cuda:0')
cnn_encoder.bn1.bias tensor(0.0003, device='cuda:0')
cnn_encoder.conv3.weight tensor(0.0032, device='cuda:0')
cnn_encoder.conv4.weight tensor(0.0021, device='cuda:0')
cnn_encoder.conv5.weight tensor(0.0026, device='cuda:0')
cnn_encoder.conv5.bias tensor(8.7586e-07, device='cuda:0')
cnn_encoder.bn2.weight tensor(0.0004, device='cuda:0')
cnn_encoder.bn2.bias tensor(0.0021, device='cuda:0')
lstm.weight_ih tensor(0.0266, device='cuda:0')
lstm.weight_hh tensor(0.0163, device='cuda:0')
lstm.bias_ih tensor(0.0029, device='cuda:0')
lstm.bias_hh tensor(0.0029, device='cuda:0')
emb.weight tensor(0.0024, device='cuda:0')
wh_in.weight tensor(8.9506e-05, device='cuda:0')
wc_in.weight tensor(0.0002, device='cuda:0')
wo_in.weight tensor(0.0050


 26%|██▌       | 394/1529 [02:10<05:45,  3.29it/s]

tensor(3.6729, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▌       | 395/1529 [02:10<05:44,  3.29it/s]

tensor(3.6713, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▌       | 396/1529 [02:10<05:43,  3.30it/s]

tensor(3.6771, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▌       | 397/1529 [02:11<05:41,  3.31it/s]

tensor(3.6807, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▌       | 398/1529 [02:11<05:41,  3.32it/s]

tensor(3.6780, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▌       | 399/1529 [02:11<05:41,  3.31it/s]

tensor(3.6753, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▌       | 400/1529 [02:12<05:41,  3.30it/s]

tensor(3.6702, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▌       | 401/1529 [02:12<05:41,  3.31it/s]

tensor(3.6739, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▋       | 402/1529 [02:12<05:40,  3.31it/s]

tensor(3.6683, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▋       | 403/1529 [02:12<05:40,  3.31it/s]

tensor(3.6739, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▋       | 404/1529 [02:13<05:39,  3.31it/s]

tensor(3.6866, device='cuda:0', grad_fn=<DivBackward1>)



 26%|██▋       | 405/1529 [02:13<05:38,  3.33it/s]

tensor(3.6780, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 406/1529 [02:13<05:38,  3.32it/s]

tensor(3.6660, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 407/1529 [02:14<05:38,  3.31it/s]

tensor(3.6759, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 408/1529 [02:14<05:39,  3.30it/s]

tensor(3.6738, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 409/1529 [02:14<05:39,  3.30it/s]

tensor(3.6754, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 410/1529 [02:15<05:39,  3.30it/s]

tensor(3.6710, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 411/1529 [02:15<05:38,  3.30it/s]

tensor(3.6730, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 412/1529 [02:15<05:40,  3.28it/s]

tensor(3.6681, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 413/1529 [02:15<05:37,  3.30it/s]

tensor(3.6730, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 414/1529 [02:16<05:36,  3.31it/s]

tensor(3.6690, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 415/1529 [02:16<05:36,  3.31it/s]

tensor(3.6719, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 416/1529 [02:16<05:35,  3.31it/s]

tensor(3.6777, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 417/1529 [02:17<05:35,  3.31it/s]

tensor(3.6711, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 418/1529 [02:17<05:36,  3.30it/s]

tensor(3.6732, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 419/1529 [02:17<05:36,  3.29it/s]

tensor(3.6698, device='cuda:0', grad_fn=<DivBackward1>)



 27%|██▋       | 420/1529 [02:18<05:36,  3.30it/s]

tensor(3.6787, device='cuda:0', grad_fn=<DivBackward1>)



 28%|██▊       | 421/1529 [02:18<05:34,  3.32it/s]

tensor(3.6744, device='cuda:0', grad_fn=<DivBackward1>)



 28%|██▊       | 422/1529 [02:18<05:34,  3.31it/s]

tensor(3.6673, device='cuda:0', grad_fn=<DivBackward1>)



 28%|██▊       | 423/1529 [02:18<05:33,  3.31it/s]

tensor(3.6816, device='cuda:0', grad_fn=<DivBackward1>)


 28%|██▊       | 423/1529 [02:19<06:03,  3.04it/s]


KeyboardInterrupt: 