In [2]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
import torch
from torch.utils.data import Dataset, DataLoader

class CharacterLevelShakespeareDataset(Dataset):
    """Sliding-window, character-level next-character dataset built from a text file.

    Every distinct character of the file is one vocabulary entry; ids follow the
    sorted character order. Item ``i`` is a pair of ``torch.long`` tensors of
    length ``context_length``: the ids at positions ``[i, i + context_length)``
    and the same window shifted right by one, so ``target[t]`` is the character
    that follows ``input[t]``.

    Attributes:
        chars: sorted list of the distinct characters.
        char_to_index: character -> id.
        index_to_char: id -> character.
        vocab_size: number of distinct characters.
        encoded_text: the whole text as a list of ids.
        context_length: window length used by ``__getitem__``.
    """

    def __init__(self, file_path, context_length):
        """Read ``file_path`` as UTF-8, build the vocabulary and encode the text."""
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()
        self.chars = sorted(set(raw_text))
        self.char_to_index = {ch: i for i, ch in enumerate(self.chars)}
        self.index_to_char = {i: ch for ch, i in self.char_to_index.items()}
        self.vocab_size = len(self.chars)
        self.encoded_text = [self.char_to_index[ch] for ch in raw_text]
        self.context_length = context_length

    def __len__(self):
        """Number of complete (input, target) windows; 0 when the text is too short."""
        # Clamp at 0: len() raises ValueError if __len__ returns a negative number.
        return max(0, len(self.encoded_text) - self.context_length)

    def __getitem__(self, index):
        """Return the ``(input, target)`` id tensors for window ``index``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if index < 0:
            index += size
        # List slicing never fails, so without this check an out-of-range index
        # would yield truncated/empty tensors and plain iteration over the
        # dataset (which stops on IndexError) would never terminate.
        if not 0 <= index < size:
            raise IndexError(f"index {index} is out of range for dataset of size {size}")
        end = index + self.context_length
        input_sequence = self.encoded_text[index:end]
        target_sequence = self.encoded_text[index + 1:end + 1]
        return (
            torch.tensor(input_sequence, dtype=torch.long),
            torch.tensor(target_sequence, dtype=torch.long),
        )

def build_shakespeare_dataloader(path='input.txt', context_length=100, batch_size=64):
    """Create a shuffled ``DataLoader`` over the character-level text dataset.

    Args:
        path: text file handed to ``CharacterLevelShakespeareDataset``.
        context_length: window length of every input/target sequence.
        batch_size: number of windows per batch.

    Returns:
        ``(dataloader, vocab_size, char_to_index, index_to_char)`` where the last
        three values are taken from the dataset instance.
    """
    corpus = CharacterLevelShakespeareDataset(path, context_length)
    loader = DataLoader(
        corpus,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,  # discard the final incomplete batch
    )
    return loader, corpus.vocab_size, corpus.char_to_index, corpus.index_to_char

# Build the loader with the default arguments and keep the vocabulary size and the two
# lookup tables as notebook globals for the cells below.
dataloader, vocab_size, char_to_index, index_to_char = build_shakespeare_dataloader()


--2025-04-15 16:32:13--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-04-15 16:32:13 (14.6 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [13]:
from torch import nn
class Rnn(nn.Module):
  """Vanilla recurrent network over token ids that predicts from the final hidden state.

  Per step: ``h_t = tanh(input_trans(embed(x_t)) + hidden_trans(h_{t-1}))`` with
  ``h_0 = 0``. Only the last hidden state is projected by ``output_trans``.
  """
  def __init__(self, vocab_size, embedding_dim=1, hidden_dim=5, output_dim=1):
    """Create the embedding table and the three linear maps.

    Args:
      vocab_size: number of rows of the embedding table (token ids must be < vocab_size).
      embedding_dim: size of each token embedding.
      hidden_dim: size of the recurrent hidden state.
      output_dim: size of the output vector.
    """
    super().__init__()
    self.hidden_trans = nn.Linear(hidden_dim, hidden_dim)
    self.input_trans = nn.Linear(embedding_dim, hidden_dim)
    self.output_trans = nn.Linear(hidden_dim, output_dim)
    self.hidden_activation = nn.Tanh()
    self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)
  def forward(self, x):
    """Run the recurrence over ``x`` and return the output for the last step.

    Args:
      x: integer token ids of shape ``(batch_size, sequence_length)``.

    Returns:
      Tensor of shape ``(batch_size, output_dim)``.
    """
    batch_size, sequence_length = x.shape
    # Create h_0 on the input's device and in the parameters' dtype; a plain
    # torch.zeros(...) is always CPU/float32 and breaks once the model is moved
    # to another device or cast with .double()/.half().
    hidden_state = torch.zeros(
      (batch_size, self.hidden_trans.in_features),
      device=x.device,
      dtype=self.hidden_trans.weight.dtype,
    )
    for i in range(sequence_length):
      tokens = x[:, i]
      embedding = self.embedding_layer(tokens)
      hidden_state = self.hidden_activation(self.input_trans(embedding) + self.hidden_trans(hidden_state))
    return self.output_trans(hidden_state)

In [19]:
import torch
from torch import nn

class Rnn(nn.Module):
    """Vanilla recurrent network over token ids that emits an output at every step.

    Per step: ``h_t = tanh(input_trans(embed(x_t)) + hidden_trans(h_{t-1}))`` with
    ``h_0 = 0``, followed by ``output_trans(h_t)``.
    """

    def __init__(self, vocab_size, embedding_dim=1, hidden_dim=5, output_dim=1):
        """Create the embedding table and the three linear maps.

        Args:
            vocab_size: number of rows of the embedding table (token ids must be < vocab_size).
            embedding_dim: size of each token embedding.
            hidden_dim: size of the recurrent hidden state.
            output_dim: size of the per-step output vector.
        """
        super().__init__()
        self.hidden_trans = nn.Linear(hidden_dim, hidden_dim)
        self.input_trans = nn.Linear(embedding_dim, hidden_dim)
        self.output_trans = nn.Linear(hidden_dim, output_dim)
        self.hidden_activation = nn.Tanh()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, x):
        """Run the recurrence over ``x`` and return the outputs of all steps.

        Args:
            x: integer token ids of shape ``(batch_size, sequence_length)``.

        Returns:
            Tensor of shape ``(batch_size, sequence_length, output_dim)``.
        """
        batch_size, sequence_length = x.shape
        param_dtype = self.hidden_trans.weight.dtype
        if sequence_length == 0:
            # Nothing to stack: return an empty result instead of failing on an empty list.
            return torch.zeros(
                (batch_size, 0, self.output_trans.out_features),
                device=x.device,
                dtype=param_dtype,
            )
        # h_0 follows the parameters' dtype so the model also works after .double()/.half().
        hidden_state = torch.zeros(
            (batch_size, self.hidden_trans.in_features),
            device=x.device,
            dtype=param_dtype,
        )
        outputs = []
        for i in range(sequence_length):
            tokens = x[:, i]
            embedding = self.embedding_layer(tokens)
            hidden_state = self.hidden_activation(
                self.input_trans(embedding) + self.hidden_trans(hidden_state)
            )
            outputs.append(self.output_trans(hidden_state))
        # Stack the per-step (batch, output_dim) tensors along a new time axis.
        return torch.stack(outputs, dim=1)

In [17]:
from torch import nn
# Train with next-character cross-entropy.
EPOCHS = 1
loss = nn.CrossEntropyLoss()
model = Rnn(vocab_size, output_dim=vocab_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for epoch in range(EPOCHS):
  for input_sequence, output_sequence in dataloader:
    output = model(input_sequence)
    if output.dim() == 3:
      # Per-step logits (batch, seq, vocab): CrossEntropyLoss expects (N, C) logits with
      # (N,) class ids, so fold the batch and time axes together.
      logits = output.reshape(-1, output.shape[-1])
      targets = output_sequence.reshape(-1)
    else:
      # Final-step logits (batch, vocab): they predict the character that follows the
      # whole context, which is the last entry of each target sequence.
      logits = output
      targets = output_sequence[:, -1]
    batch_loss = loss(logits, targets)
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()

torch.Size([64, 65]) torch.Size([64, 100])


RuntimeError: 0D or 1D target tensor expected, multi-target not supported

In [20]:
EPOCHS = 1
# Size the model from the lookup table built together with the dataset instead of
# trusting whatever `vocab_size` a previously run cell left behind: an embedding table
# smaller than the character set fails with "IndexError: index out of range in self".
vocab_size = len(char_to_index)
model = Rnn(vocab_size, output_dim=vocab_size)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(EPOCHS):
    for input_sequence, output_sequence in dataloader:
        output_logits = model(input_sequence)                          # [batch_size, seq_len, vocab_size]
        predicted = output_logits.reshape(-1, output_logits.shape[-1]) # [batch_size*seq_len, vocab_size]
        target = output_sequence.reshape(-1)                           # [batch_size*seq_len]

        loss = loss_function(predicted, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

IndexError: index out of range in self

In [18]:
from torch import nn
import torch

class Rnn(nn.Module):
  """Vanilla recurrent network over token ids that predicts from the final hidden state.

  Per step: ``h_t = tanh(input_trans(embed(x_t)) + hidden_trans(h_{t-1}))`` with
  ``h_0 = 0``. Only the last hidden state is projected by ``output_trans``.
  """
  def __init__(self, vocab_size, embedding_dim=1, hidden_dim=5, output_dim=1):
    """Create the embedding table and the three linear maps.

    Args:
      vocab_size: number of rows of the embedding table (token ids must be < vocab_size).
      embedding_dim: size of each token embedding.
      hidden_dim: size of the recurrent hidden state.
      output_dim: size of the output vector.
    """
    super().__init__()
    self.hidden_trans = nn.Linear(hidden_dim, hidden_dim)
    self.input_trans = nn.Linear(embedding_dim, hidden_dim)
    self.output_trans = nn.Linear(hidden_dim, output_dim)
    self.hidden_activation = nn.Tanh()
    self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)

  def forward(self, x):
    """Run the recurrence over ``x`` and return the output for the last step.

    Args:
      x: integer token ids of shape ``(batch_size, sequence_length)``.

    Returns:
      Tensor of shape ``(batch_size, output_dim)``.
    """
    batch_size, sequence_length = x.shape
    # h_0 follows the parameters' dtype (not the float32 default) so the model keeps
    # working after it is cast with .double()/.half().
    hidden_state = torch.zeros(
      (batch_size, self.hidden_trans.in_features),
      device=x.device,
      dtype=self.hidden_trans.weight.dtype,
    )
    for i in range(sequence_length):
      tokens = x[:, i]
      embedding = self.embedding_layer(tokens)
      hidden_state = self.hidden_activation(self.input_trans(embedding) + self.hidden_trans(hidden_state))
    return self.output_trans(hidden_state)

# Training
EPOCHS = 1
# One embedding row / output logit per distinct character. A hard-coded value smaller
# than the real vocabulary makes the embedding lookup fail with
# "IndexError: index out of range in self".
vocab_size = len(char_to_index)
model = Rnn(vocab_size, embedding_dim=8, hidden_dim=32, output_dim=vocab_size)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = model.output_trans.weight.device  # batches are moved to wherever the model lives

for epoch in range(EPOCHS):
  for input_sequence, output_sequence in dataloader:
    input_sequence = input_sequence.to(device)
    output_sequence = output_sequence.to(device)

    output = model(input_sequence)  # (batch_size, vocab_size): logits after the last input character
    # The dataloader yields a full shifted sequence per sample; this model only predicts
    # the character that follows the whole context, i.e. the last target of each row.
    target = output_sequence[:, -1]  # (batch_size,)
    loss = loss_function(output, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


IndexError: index out of range in self

In [35]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

class CharacterLevelShakespeareDataset(Dataset):
    """Sliding-window, character-level next-character dataset built from a text file.

    Every distinct character of the file is one vocabulary entry; ids follow the
    sorted character order. Item ``i`` is a pair of ``torch.long`` tensors of
    length ``context_length``: the ids at positions ``[i, i + context_length)``
    and the same window shifted right by one, so ``target[t]`` is the character
    that follows ``input[t]``.

    Attributes:
        chars: sorted list of the distinct characters.
        char_to_index: character -> id.
        index_to_char: id -> character.
        vocab_size: number of distinct characters.
        encoded_text: the whole text as a list of ids.
        context_length: window length used by ``__getitem__``.
    """

    def __init__(self, file_path, context_length):
        """Read ``file_path`` as UTF-8, build the vocabulary and encode the text."""
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()
        self.chars = sorted(set(raw_text))
        self.char_to_index = {ch: i for i, ch in enumerate(self.chars)}
        self.index_to_char = {i: ch for ch, i in self.char_to_index.items()}
        self.vocab_size = len(self.chars)
        self.encoded_text = [self.char_to_index[ch] for ch in raw_text]
        self.context_length = context_length

    def __len__(self):
        """Number of complete (input, target) windows; 0 when the text is too short."""
        # Clamp at 0: len() raises ValueError if __len__ returns a negative number.
        return max(0, len(self.encoded_text) - self.context_length)

    def __getitem__(self, index):
        """Return the ``(input, target)`` id tensors for window ``index``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if index < 0:
            index += size
        # List slicing never fails, so without this check an out-of-range index
        # would yield truncated/empty tensors and plain iteration over the
        # dataset (which stops on IndexError) would never terminate.
        if not 0 <= index < size:
            raise IndexError(f"index {index} is out of range for dataset of size {size}")
        end = index + self.context_length
        input_sequence = self.encoded_text[index:end]
        target_sequence = self.encoded_text[index + 1:end + 1]
        return (
            torch.tensor(input_sequence, dtype=torch.long),
            torch.tensor(target_sequence, dtype=torch.long),
        )

def build_shakespeare_dataloader(path='input.txt', context_length=100, batch_size=64):
    """Create a shuffled ``DataLoader`` over the character-level text dataset.

    Args:
        path: text file handed to ``CharacterLevelShakespeareDataset``.
        context_length: window length of every input/target sequence.
        batch_size: number of windows per batch.

    Returns:
        ``(dataloader, vocab_size, char_to_index, index_to_char)`` where the last
        three values are taken from the dataset instance.
    """
    corpus = CharacterLevelShakespeareDataset(path, context_length)
    loader = DataLoader(
        corpus,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,  # discard the final incomplete batch
    )
    return loader, corpus.vocab_size, corpus.char_to_index, corpus.index_to_char

# Build the loader with the default arguments and keep the vocabulary size and the two
# lookup tables as notebook globals for the model and training code below.
dataloader, vocab_size, char_to_index, index_to_char = build_shakespeare_dataloader()


import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
class Rnn(nn.Module):
    """Vanilla recurrent network over token ids that emits logits at every step.

    Per step: ``h_t = tanh(input_trans(embed(x_t)) + hidden_trans(h_{t-1}))`` with
    ``h_0 = 0``, followed by ``output_trans(h_t)``.
    """

    def __init__(self, vocab_size, embedding_dim=64, hidden_dim=128, output_dim=None):
        """Create the embedding table and the three linear maps.

        Args:
            vocab_size: number of rows of the embedding table (token ids must be < vocab_size).
            embedding_dim: size of each token embedding.
            hidden_dim: size of the recurrent hidden state.
            output_dim: size of the per-step output vector; ``None`` means ``vocab_size``.
        """
        super().__init__()
        # Honour an explicit output_dim instead of silently ignoring the argument.
        if output_dim is None:
            output_dim = vocab_size
        self.hidden_trans = nn.Linear(hidden_dim, hidden_dim)
        self.input_trans = nn.Linear(embedding_dim, hidden_dim)
        self.output_trans = nn.Linear(hidden_dim, output_dim)
        self.hidden_activation = nn.Tanh()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, x):
        """Run the recurrence over ``x`` and return the outputs of all steps.

        Args:
            x: integer token ids of shape ``(batch_size, sequence_length)``.

        Returns:
            Tensor of shape ``(batch_size, sequence_length, output_dim)``.
        """
        batch_size, sequence_length = x.shape
        param_dtype = self.hidden_trans.weight.dtype
        if sequence_length == 0:
            # Nothing to stack: return an empty result instead of failing on an empty list.
            return torch.zeros(
                (batch_size, 0, self.output_trans.out_features),
                device=x.device,
                dtype=param_dtype,
            )
        # h_0 follows the parameters' dtype so the model also works after .double()/.half().
        hidden_state = torch.zeros(
            (batch_size, self.hidden_trans.in_features),
            device=x.device,
            dtype=param_dtype,
        )
        outputs = []
        for i in range(sequence_length):
            tokens = x[:, i]
            embedding = self.embedding_layer(tokens)
            hidden_state = self.hidden_activation(
                self.input_trans(embedding) + self.hidden_trans(hidden_state)
            )
            outputs.append(self.output_trans(hidden_state))
        # Stack the per-step (batch, output_dim) tensors along a new time axis.
        return torch.stack(outputs, dim=1)

EPOCHS = 1
embedding_dim = 64
hidden_dim = 128
model = Rnn(vocab_size, embedding_dim=embedding_dim, hidden_dim=hidden_dim, output_dim=vocab_size)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
from tqdm import tqdm
for epoch in range(EPOCHS):
    progress_bar = tqdm(dataloader, desc=f"epoch {epoch + 1}/{EPOCHS}")
    for input_sequence, output_sequence in progress_bar:
        output_logits = model(input_sequence)                           # [batch_size, seq_len, vocab_size]
        predicted = output_logits.reshape(-1, output_logits.shape[-1])  # [batch_size*seq_len, vocab_size]
        target = output_sequence.reshape(-1)                            # [batch_size*seq_len]
        loss = loss_function(predicted, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Report the loss on the progress bar instead of printing one tensor repr per
        # step, which floods the cell output; refresh=False leaves the redraw rate to tqdm.
        progress_bar.set_postfix(loss=f"{loss.item():.4f}", refresh=False)

--2025-04-15 17:57:00--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2025-04-15 17:57:00 (18.7 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



  0%|          | 2/17426 [00:00<48:45,  5.96it/s]  

tensor(4.2253, grad_fn=<NllLossBackward0>)
tensor(4.1804, grad_fn=<NllLossBackward0>)


  0%|          | 5/17426 [00:00<31:36,  9.19it/s]

tensor(4.1234, grad_fn=<NllLossBackward0>)
tensor(4.0607, grad_fn=<NllLossBackward0>)
tensor(4.0055, grad_fn=<NllLossBackward0>)


  0%|          | 7/17426 [00:00<27:39, 10.50it/s]

tensor(3.9642, grad_fn=<NllLossBackward0>)
tensor(3.8960, grad_fn=<NllLossBackward0>)
tensor(3.8377, grad_fn=<NllLossBackward0>)


  0%|          | 11/17426 [00:01<25:26, 11.41it/s]

tensor(3.7632, grad_fn=<NllLossBackward0>)
tensor(3.7185, grad_fn=<NllLossBackward0>)
tensor(3.6162, grad_fn=<NllLossBackward0>)


  0%|          | 13/17426 [00:01<24:40, 11.76it/s]

tensor(3.5553, grad_fn=<NllLossBackward0>)
tensor(3.4683, grad_fn=<NllLossBackward0>)
tensor(3.4147, grad_fn=<NllLossBackward0>)


  0%|          | 17/17426 [00:01<23:26, 12.38it/s]

tensor(3.3153, grad_fn=<NllLossBackward0>)
tensor(3.2414, grad_fn=<NllLossBackward0>)
tensor(3.2192, grad_fn=<NllLossBackward0>)


  0%|          | 19/17426 [00:01<23:57, 12.11it/s]

tensor(3.2210, grad_fn=<NllLossBackward0>)
tensor(3.1282, grad_fn=<NllLossBackward0>)
tensor(3.1093, grad_fn=<NllLossBackward0>)


  0%|          | 23/17426 [00:02<23:55, 12.12it/s]

tensor(3.0705, grad_fn=<NllLossBackward0>)
tensor(3.0383, grad_fn=<NllLossBackward0>)
tensor(3.0619, grad_fn=<NllLossBackward0>)


  0%|          | 25/17426 [00:02<23:48, 12.18it/s]

tensor(3.0243, grad_fn=<NllLossBackward0>)
tensor(3.0019, grad_fn=<NllLossBackward0>)
tensor(2.9427, grad_fn=<NllLossBackward0>)


  0%|          | 29/17426 [00:02<23:23, 12.39it/s]

tensor(2.9628, grad_fn=<NllLossBackward0>)
tensor(2.9272, grad_fn=<NllLossBackward0>)
tensor(2.9430, grad_fn=<NllLossBackward0>)


  0%|          | 31/17426 [00:02<23:24, 12.39it/s]

tensor(2.8938, grad_fn=<NllLossBackward0>)
tensor(2.9250, grad_fn=<NllLossBackward0>)
tensor(2.8848, grad_fn=<NllLossBackward0>)


  0%|          | 35/17426 [00:03<23:38, 12.26it/s]

tensor(2.8766, grad_fn=<NllLossBackward0>)
tensor(2.8450, grad_fn=<NllLossBackward0>)
tensor(2.8611, grad_fn=<NllLossBackward0>)


  0%|          | 37/17426 [00:03<23:39, 12.25it/s]

tensor(2.8223, grad_fn=<NllLossBackward0>)
tensor(2.8101, grad_fn=<NllLossBackward0>)
tensor(2.8225, grad_fn=<NllLossBackward0>)


  0%|          | 41/17426 [00:03<23:06, 12.54it/s]

tensor(2.8046, grad_fn=<NllLossBackward0>)
tensor(2.7683, grad_fn=<NllLossBackward0>)
tensor(2.7579, grad_fn=<NllLossBackward0>)


  0%|          | 43/17426 [00:03<23:16, 12.44it/s]

tensor(2.7783, grad_fn=<NllLossBackward0>)
tensor(2.7475, grad_fn=<NllLossBackward0>)
tensor(2.7591, grad_fn=<NllLossBackward0>)


  0%|          | 47/17426 [00:04<23:32, 12.30it/s]

tensor(2.7340, grad_fn=<NllLossBackward0>)
tensor(2.7276, grad_fn=<NllLossBackward0>)
tensor(2.7300, grad_fn=<NllLossBackward0>)


  0%|          | 49/17426 [00:04<23:24, 12.37it/s]

tensor(2.6939, grad_fn=<NllLossBackward0>)
tensor(2.6823, grad_fn=<NllLossBackward0>)
tensor(2.7188, grad_fn=<NllLossBackward0>)


  0%|          | 53/17426 [00:04<23:05, 12.54it/s]

tensor(2.6759, grad_fn=<NllLossBackward0>)
tensor(2.6819, grad_fn=<NllLossBackward0>)
tensor(2.6462, grad_fn=<NllLossBackward0>)


  0%|          | 55/17426 [00:04<23:01, 12.57it/s]

tensor(2.6414, grad_fn=<NllLossBackward0>)
tensor(2.6352, grad_fn=<NllLossBackward0>)
tensor(2.6208, grad_fn=<NllLossBackward0>)


  0%|          | 57/17426 [00:04<23:22, 12.39it/s]

tensor(2.6175, grad_fn=<NllLossBackward0>)
tensor(2.6063, grad_fn=<NllLossBackward0>)


  0%|          | 59/17426 [00:05<25:36, 11.31it/s]

tensor(2.6316, grad_fn=<NllLossBackward0>)
tensor(2.6254, grad_fn=<NllLossBackward0>)


  0%|          | 61/17426 [00:05<29:22,  9.85it/s]

tensor(2.5869, grad_fn=<NllLossBackward0>)
tensor(2.5669, grad_fn=<NllLossBackward0>)


  0%|          | 63/17426 [00:05<30:52,  9.37it/s]

tensor(2.5217, grad_fn=<NllLossBackward0>)


  0%|          | 65/17426 [00:06<44:16,  6.54it/s]

tensor(2.6153, grad_fn=<NllLossBackward0>)
tensor(2.6023, grad_fn=<NllLossBackward0>)


  0%|          | 67/17426 [00:06<40:36,  7.12it/s]

tensor(2.5612, grad_fn=<NllLossBackward0>)
tensor(2.5714, grad_fn=<NllLossBackward0>)


  0%|          | 68/17426 [00:06<38:34,  7.50it/s]

tensor(2.5769, grad_fn=<NllLossBackward0>)


  0%|          | 70/17426 [00:07<55:08,  5.25it/s]  

tensor(2.5257, grad_fn=<NllLossBackward0>)
tensor(2.5218, grad_fn=<NllLossBackward0>)


  0%|          | 72/17426 [00:07<47:47,  6.05it/s]

tensor(2.5369, grad_fn=<NllLossBackward0>)
tensor(2.5356, grad_fn=<NllLossBackward0>)


  0%|          | 74/17426 [00:07<43:36,  6.63it/s]

tensor(2.5541, grad_fn=<NllLossBackward0>)
tensor(2.5069, grad_fn=<NllLossBackward0>)


  0%|          | 77/17426 [00:07<35:14,  8.20it/s]

tensor(2.5140, grad_fn=<NllLossBackward0>)
tensor(2.4774, grad_fn=<NllLossBackward0>)
tensor(2.5016, grad_fn=<NllLossBackward0>)


  0%|          | 79/17426 [00:08<30:31,  9.47it/s]

tensor(2.4789, grad_fn=<NllLossBackward0>)
tensor(2.4800, grad_fn=<NllLossBackward0>)
tensor(2.4910, grad_fn=<NllLossBackward0>)


  0%|          | 83/17426 [00:08<26:45, 10.80it/s]

tensor(2.4758, grad_fn=<NllLossBackward0>)
tensor(2.4712, grad_fn=<NllLossBackward0>)
tensor(2.4699, grad_fn=<NllLossBackward0>)


  0%|          | 85/17426 [00:08<25:38, 11.27it/s]

tensor(2.4720, grad_fn=<NllLossBackward0>)
tensor(2.4710, grad_fn=<NllLossBackward0>)
tensor(2.4447, grad_fn=<NllLossBackward0>)


  1%|          | 89/17426 [00:08<24:11, 11.94it/s]

tensor(2.4666, grad_fn=<NllLossBackward0>)
tensor(2.4521, grad_fn=<NllLossBackward0>)
tensor(2.4359, grad_fn=<NllLossBackward0>)


  1%|          | 91/17426 [00:09<23:53, 12.09it/s]

tensor(2.4492, grad_fn=<NllLossBackward0>)
tensor(2.4354, grad_fn=<NllLossBackward0>)
tensor(2.4372, grad_fn=<NllLossBackward0>)


  1%|          | 95/17426 [00:09<24:02, 12.02it/s]

tensor(2.3916, grad_fn=<NllLossBackward0>)
tensor(2.4474, grad_fn=<NllLossBackward0>)
tensor(2.4002, grad_fn=<NllLossBackward0>)


  1%|          | 97/17426 [00:09<23:49, 12.12it/s]

tensor(2.4157, grad_fn=<NllLossBackward0>)
tensor(2.4176, grad_fn=<NllLossBackward0>)
tensor(2.3978, grad_fn=<NllLossBackward0>)


  1%|          | 101/17426 [00:09<23:43, 12.17it/s]

tensor(2.4058, grad_fn=<NllLossBackward0>)
tensor(2.3677, grad_fn=<NllLossBackward0>)
tensor(2.3782, grad_fn=<NllLossBackward0>)


  1%|          | 103/17426 [00:09<23:23, 12.34it/s]

tensor(2.4088, grad_fn=<NllLossBackward0>)
tensor(2.3703, grad_fn=<NllLossBackward0>)
tensor(2.3831, grad_fn=<NllLossBackward0>)


  1%|          | 107/17426 [00:10<23:50, 12.11it/s]

tensor(2.3365, grad_fn=<NllLossBackward0>)
tensor(2.3777, grad_fn=<NllLossBackward0>)
tensor(2.3747, grad_fn=<NllLossBackward0>)


  1%|          | 109/17426 [00:10<23:30, 12.28it/s]

tensor(2.3428, grad_fn=<NllLossBackward0>)
tensor(2.3640, grad_fn=<NllLossBackward0>)
tensor(2.3694, grad_fn=<NllLossBackward0>)


  1%|          | 113/17426 [00:10<23:53, 12.08it/s]

tensor(2.4227, grad_fn=<NllLossBackward0>)
tensor(2.3760, grad_fn=<NllLossBackward0>)
tensor(2.3781, grad_fn=<NllLossBackward0>)


  1%|          | 115/17426 [00:10<23:47, 12.13it/s]

tensor(2.3899, grad_fn=<NllLossBackward0>)
tensor(2.3373, grad_fn=<NllLossBackward0>)
tensor(2.3466, grad_fn=<NllLossBackward0>)


  1%|          | 119/17426 [00:11<23:57, 12.04it/s]

tensor(2.3274, grad_fn=<NllLossBackward0>)
tensor(2.3356, grad_fn=<NllLossBackward0>)
tensor(2.3106, grad_fn=<NllLossBackward0>)


  1%|          | 121/17426 [00:11<23:48, 12.11it/s]

tensor(2.3233, grad_fn=<NllLossBackward0>)
tensor(2.3447, grad_fn=<NllLossBackward0>)
tensor(2.3415, grad_fn=<NllLossBackward0>)


  1%|          | 125/17426 [00:11<23:39, 12.19it/s]

tensor(2.3388, grad_fn=<NllLossBackward0>)
tensor(2.3092, grad_fn=<NllLossBackward0>)
tensor(2.3172, grad_fn=<NllLossBackward0>)


  1%|          | 127/17426 [00:11<23:23, 12.33it/s]

tensor(2.3079, grad_fn=<NllLossBackward0>)
tensor(2.3243, grad_fn=<NllLossBackward0>)
tensor(2.3062, grad_fn=<NllLossBackward0>)


  1%|          | 131/17426 [00:12<23:17, 12.38it/s]

tensor(2.2802, grad_fn=<NllLossBackward0>)
tensor(2.3023, grad_fn=<NllLossBackward0>)
tensor(2.3108, grad_fn=<NllLossBackward0>)


  1%|          | 133/17426 [00:12<23:53, 12.06it/s]

tensor(2.3109, grad_fn=<NllLossBackward0>)
tensor(2.3017, grad_fn=<NllLossBackward0>)
tensor(2.3469, grad_fn=<NllLossBackward0>)


  1%|          | 137/17426 [00:12<23:32, 12.24it/s]

tensor(2.3243, grad_fn=<NllLossBackward0>)
tensor(2.2870, grad_fn=<NllLossBackward0>)
tensor(2.2814, grad_fn=<NllLossBackward0>)


  1%|          | 139/17426 [00:12<23:28, 12.28it/s]

tensor(2.3081, grad_fn=<NllLossBackward0>)
tensor(2.2728, grad_fn=<NllLossBackward0>)
tensor(2.2696, grad_fn=<NllLossBackward0>)


  1%|          | 143/17426 [00:13<23:18, 12.36it/s]

tensor(2.3071, grad_fn=<NllLossBackward0>)
tensor(2.2851, grad_fn=<NllLossBackward0>)
tensor(2.2825, grad_fn=<NllLossBackward0>)


  1%|          | 145/17426 [00:13<23:58, 12.01it/s]

tensor(2.2662, grad_fn=<NllLossBackward0>)
tensor(2.2839, grad_fn=<NllLossBackward0>)
tensor(2.2773, grad_fn=<NllLossBackward0>)


  1%|          | 149/17426 [00:13<23:43, 12.14it/s]

tensor(2.2794, grad_fn=<NllLossBackward0>)
tensor(2.3052, grad_fn=<NllLossBackward0>)
tensor(2.2525, grad_fn=<NllLossBackward0>)


  1%|          | 151/17426 [00:13<23:42, 12.14it/s]

tensor(2.2744, grad_fn=<NllLossBackward0>)
tensor(2.2506, grad_fn=<NllLossBackward0>)
tensor(2.2400, grad_fn=<NllLossBackward0>)


  1%|          | 155/17426 [00:14<23:19, 12.34it/s]

tensor(2.2465, grad_fn=<NllLossBackward0>)
tensor(2.2367, grad_fn=<NllLossBackward0>)
tensor(2.2875, grad_fn=<NllLossBackward0>)


  1%|          | 157/17426 [00:14<24:10, 11.90it/s]

tensor(2.2537, grad_fn=<NllLossBackward0>)
tensor(2.2329, grad_fn=<NllLossBackward0>)
tensor(2.2678, grad_fn=<NllLossBackward0>)


  1%|          | 161/17426 [00:14<23:20, 12.33it/s]

tensor(2.2403, grad_fn=<NllLossBackward0>)
tensor(2.2104, grad_fn=<NllLossBackward0>)
tensor(2.2166, grad_fn=<NllLossBackward0>)


  1%|          | 163/17426 [00:14<23:11, 12.40it/s]

tensor(2.2234, grad_fn=<NllLossBackward0>)
tensor(2.2378, grad_fn=<NllLossBackward0>)
tensor(2.2378, grad_fn=<NllLossBackward0>)


  1%|          | 167/17426 [00:15<23:14, 12.38it/s]

tensor(2.2147, grad_fn=<NllLossBackward0>)
tensor(2.2589, grad_fn=<NllLossBackward0>)
tensor(2.2099, grad_fn=<NllLossBackward0>)


  1%|          | 169/17426 [00:15<23:48, 12.08it/s]

tensor(2.2311, grad_fn=<NllLossBackward0>)
tensor(2.2227, grad_fn=<NllLossBackward0>)
tensor(2.1887, grad_fn=<NllLossBackward0>)


  1%|          | 173/17426 [00:15<23:10, 12.41it/s]

tensor(2.2218, grad_fn=<NllLossBackward0>)
tensor(2.2227, grad_fn=<NllLossBackward0>)
tensor(2.1871, grad_fn=<NllLossBackward0>)


  1%|          | 175/17426 [00:15<23:19, 12.32it/s]

tensor(2.2104, grad_fn=<NllLossBackward0>)
tensor(2.2136, grad_fn=<NllLossBackward0>)
tensor(2.2087, grad_fn=<NllLossBackward0>)


  1%|          | 179/17426 [00:16<23:02, 12.48it/s]

tensor(2.2127, grad_fn=<NllLossBackward0>)
tensor(2.2022, grad_fn=<NllLossBackward0>)
tensor(2.2111, grad_fn=<NllLossBackward0>)


  1%|          | 181/17426 [00:16<23:20, 12.32it/s]

tensor(2.1949, grad_fn=<NllLossBackward0>)
tensor(2.2126, grad_fn=<NllLossBackward0>)
tensor(2.2033, grad_fn=<NllLossBackward0>)


  1%|          | 185/17426 [00:16<23:42, 12.12it/s]

tensor(2.1734, grad_fn=<NllLossBackward0>)
tensor(2.1702, grad_fn=<NllLossBackward0>)
tensor(2.2106, grad_fn=<NllLossBackward0>)


  1%|          | 187/17426 [00:16<23:28, 12.24it/s]

tensor(2.2013, grad_fn=<NllLossBackward0>)
tensor(2.1926, grad_fn=<NllLossBackward0>)
tensor(2.2035, grad_fn=<NllLossBackward0>)


  1%|          | 191/17426 [00:17<23:06, 12.43it/s]

tensor(2.2059, grad_fn=<NllLossBackward0>)
tensor(2.1591, grad_fn=<NllLossBackward0>)
tensor(2.1968, grad_fn=<NllLossBackward0>)


  1%|          | 193/17426 [00:17<23:08, 12.41it/s]

tensor(2.1764, grad_fn=<NllLossBackward0>)
tensor(2.1727, grad_fn=<NllLossBackward0>)
tensor(2.1931, grad_fn=<NllLossBackward0>)


  1%|          | 197/17426 [00:17<23:21, 12.29it/s]

tensor(2.1852, grad_fn=<NllLossBackward0>)
tensor(2.1842, grad_fn=<NllLossBackward0>)
tensor(2.1612, grad_fn=<NllLossBackward0>)


  1%|          | 199/17426 [00:17<27:09, 10.57it/s]

tensor(2.1701, grad_fn=<NllLossBackward0>)
tensor(2.1497, grad_fn=<NllLossBackward0>)


  1%|          | 201/17426 [00:18<29:27,  9.74it/s]

tensor(2.1799, grad_fn=<NllLossBackward0>)
tensor(2.1540, grad_fn=<NllLossBackward0>)


  1%|          | 203/17426 [00:18<30:46,  9.33it/s]

tensor(2.1749, grad_fn=<NllLossBackward0>)
tensor(2.1471, grad_fn=<NllLossBackward0>)


  1%|          | 205/17426 [00:18<32:14,  8.90it/s]

tensor(2.1578, grad_fn=<NllLossBackward0>)
tensor(2.2245, grad_fn=<NllLossBackward0>)


  1%|          | 207/17426 [00:18<33:09,  8.66it/s]

tensor(2.1650, grad_fn=<NllLossBackward0>)
tensor(2.1374, grad_fn=<NllLossBackward0>)


  1%|          | 209/17426 [00:19<33:51,  8.48it/s]

tensor(2.1756, grad_fn=<NllLossBackward0>)
tensor(2.1526, grad_fn=<NllLossBackward0>)


  1%|          | 211/17426 [00:19<35:37,  8.05it/s]

tensor(2.1554, grad_fn=<NllLossBackward0>)
tensor(2.1311, grad_fn=<NllLossBackward0>)


  1%|          | 213/17426 [00:19<36:46,  7.80it/s]

tensor(2.1459, grad_fn=<NllLossBackward0>)
tensor(2.1594, grad_fn=<NllLossBackward0>)


  1%|          | 215/17426 [00:19<37:20,  7.68it/s]

tensor(2.1333, grad_fn=<NllLossBackward0>)
tensor(2.1441, grad_fn=<NllLossBackward0>)


  1%|          | 217/17426 [00:20<37:58,  7.55it/s]

tensor(2.1214, grad_fn=<NllLossBackward0>)
tensor(2.0910, grad_fn=<NllLossBackward0>)


  1%|▏         | 220/17426 [00:20<29:30,  9.72it/s]

tensor(2.1700, grad_fn=<NllLossBackward0>)
tensor(2.1397, grad_fn=<NllLossBackward0>)
tensor(2.1566, grad_fn=<NllLossBackward0>)


  1%|▏         | 222/17426 [00:20<27:37, 10.38it/s]

tensor(2.1230, grad_fn=<NllLossBackward0>)
tensor(2.1610, grad_fn=<NllLossBackward0>)
tensor(2.1499, grad_fn=<NllLossBackward0>)


  1%|▏         | 226/17426 [00:20<24:39, 11.62it/s]

tensor(2.1270, grad_fn=<NllLossBackward0>)
tensor(2.1378, grad_fn=<NllLossBackward0>)
tensor(2.1421, grad_fn=<NllLossBackward0>)


  1%|▏         | 228/17426 [00:21<24:15, 11.81it/s]

tensor(2.1175, grad_fn=<NllLossBackward0>)
tensor(2.1184, grad_fn=<NllLossBackward0>)
tensor(2.1281, grad_fn=<NllLossBackward0>)


  1%|▏         | 232/17426 [00:21<23:27, 12.21it/s]

tensor(2.1865, grad_fn=<NllLossBackward0>)
tensor(2.1409, grad_fn=<NllLossBackward0>)
tensor(2.1093, grad_fn=<NllLossBackward0>)


  1%|▏         | 234/17426 [00:21<23:13, 12.34it/s]

tensor(2.1227, grad_fn=<NllLossBackward0>)
tensor(2.1334, grad_fn=<NllLossBackward0>)
tensor(2.1197, grad_fn=<NllLossBackward0>)


  1%|▏         | 238/17426 [00:21<23:22, 12.25it/s]

tensor(2.1425, grad_fn=<NllLossBackward0>)
tensor(2.1088, grad_fn=<NllLossBackward0>)
tensor(2.1018, grad_fn=<NllLossBackward0>)


  1%|▏         | 240/17426 [00:22<23:32, 12.17it/s]

tensor(2.1050, grad_fn=<NllLossBackward0>)
tensor(2.1046, grad_fn=<NllLossBackward0>)
tensor(2.1324, grad_fn=<NllLossBackward0>)


  1%|▏         | 244/17426 [00:22<23:24, 12.24it/s]

tensor(2.0965, grad_fn=<NllLossBackward0>)
tensor(2.1372, grad_fn=<NllLossBackward0>)
tensor(2.1277, grad_fn=<NllLossBackward0>)


  1%|▏         | 246/17426 [00:22<23:15, 12.31it/s]

tensor(2.0818, grad_fn=<NllLossBackward0>)
tensor(2.1398, grad_fn=<NllLossBackward0>)
tensor(2.1113, grad_fn=<NllLossBackward0>)


  1%|▏         | 250/17426 [00:22<23:22, 12.25it/s]

tensor(2.1048, grad_fn=<NllLossBackward0>)
tensor(2.0885, grad_fn=<NllLossBackward0>)
tensor(2.1017, grad_fn=<NllLossBackward0>)


  1%|▏         | 252/17426 [00:23<23:13, 12.32it/s]

tensor(2.1225, grad_fn=<NllLossBackward0>)
tensor(2.0917, grad_fn=<NllLossBackward0>)
tensor(2.1111, grad_fn=<NllLossBackward0>)


  1%|▏         | 256/17426 [00:23<23:05, 12.40it/s]

tensor(2.1091, grad_fn=<NllLossBackward0>)
tensor(2.0893, grad_fn=<NllLossBackward0>)
tensor(2.0936, grad_fn=<NllLossBackward0>)


  1%|▏         | 258/17426 [00:23<23:06, 12.38it/s]

tensor(2.1090, grad_fn=<NllLossBackward0>)
tensor(2.0833, grad_fn=<NllLossBackward0>)
tensor(2.0923, grad_fn=<NllLossBackward0>)


  2%|▏         | 262/17426 [00:23<23:17, 12.29it/s]

tensor(2.0895, grad_fn=<NllLossBackward0>)
tensor(2.1152, grad_fn=<NllLossBackward0>)
tensor(2.0846, grad_fn=<NllLossBackward0>)


  2%|▏         | 264/17426 [00:24<23:18, 12.27it/s]

tensor(2.0903, grad_fn=<NllLossBackward0>)
tensor(2.0818, grad_fn=<NllLossBackward0>)
tensor(2.1089, grad_fn=<NllLossBackward0>)


  2%|▏         | 268/17426 [00:24<23:10, 12.34it/s]

tensor(2.0772, grad_fn=<NllLossBackward0>)
tensor(2.0872, grad_fn=<NllLossBackward0>)
tensor(2.0727, grad_fn=<NllLossBackward0>)


  2%|▏         | 270/17426 [00:24<23:18, 12.27it/s]

tensor(2.1189, grad_fn=<NllLossBackward0>)
tensor(2.0680, grad_fn=<NllLossBackward0>)
tensor(2.0480, grad_fn=<NllLossBackward0>)


  2%|▏         | 274/17426 [00:24<23:04, 12.39it/s]

tensor(2.1022, grad_fn=<NllLossBackward0>)
tensor(2.0876, grad_fn=<NllLossBackward0>)
tensor(2.0633, grad_fn=<NllLossBackward0>)


  2%|▏         | 276/17426 [00:25<23:15, 12.29it/s]

tensor(2.0654, grad_fn=<NllLossBackward0>)
tensor(2.0677, grad_fn=<NllLossBackward0>)
tensor(2.0691, grad_fn=<NllLossBackward0>)


  2%|▏         | 280/17426 [00:25<23:23, 12.22it/s]

tensor(2.0484, grad_fn=<NllLossBackward0>)
tensor(2.0594, grad_fn=<NllLossBackward0>)
tensor(2.0517, grad_fn=<NllLossBackward0>)


  2%|▏         | 282/17426 [00:25<23:17, 12.27it/s]

tensor(2.0542, grad_fn=<NllLossBackward0>)
tensor(2.0666, grad_fn=<NllLossBackward0>)
tensor(2.0742, grad_fn=<NllLossBackward0>)


  2%|▏         | 286/17426 [00:25<23:26, 12.18it/s]

tensor(2.0413, grad_fn=<NllLossBackward0>)
tensor(2.0914, grad_fn=<NllLossBackward0>)
tensor(2.0392, grad_fn=<NllLossBackward0>)


  2%|▏         | 288/17426 [00:26<23:23, 12.21it/s]

tensor(2.0675, grad_fn=<NllLossBackward0>)
tensor(2.0667, grad_fn=<NllLossBackward0>)
tensor(2.0504, grad_fn=<NllLossBackward0>)


  2%|▏         | 292/17426 [00:26<22:55, 12.45it/s]

tensor(2.0596, grad_fn=<NllLossBackward0>)
tensor(2.0393, grad_fn=<NllLossBackward0>)
tensor(2.0715, grad_fn=<NllLossBackward0>)


  2%|▏         | 294/17426 [00:26<23:21, 12.22it/s]

tensor(2.0475, grad_fn=<NllLossBackward0>)
tensor(2.0708, grad_fn=<NllLossBackward0>)
tensor(2.0585, grad_fn=<NllLossBackward0>)


  2%|▏         | 298/17426 [00:26<23:54, 11.94it/s]

tensor(2.0428, grad_fn=<NllLossBackward0>)
tensor(2.0306, grad_fn=<NllLossBackward0>)
tensor(2.0460, grad_fn=<NllLossBackward0>)


  2%|▏         | 300/17426 [00:27<23:45, 12.02it/s]

tensor(2.0470, grad_fn=<NllLossBackward0>)
tensor(2.0682, grad_fn=<NllLossBackward0>)
tensor(2.0557, grad_fn=<NllLossBackward0>)


  2%|▏         | 304/17426 [00:27<23:09, 12.32it/s]

tensor(2.0237, grad_fn=<NllLossBackward0>)
tensor(2.0504, grad_fn=<NllLossBackward0>)
tensor(2.0433, grad_fn=<NllLossBackward0>)


  2%|▏         | 306/17426 [00:27<23:12, 12.30it/s]

tensor(2.0514, grad_fn=<NllLossBackward0>)
tensor(2.0755, grad_fn=<NllLossBackward0>)
tensor(2.0189, grad_fn=<NllLossBackward0>)


  2%|▏         | 310/17426 [00:27<23:30, 12.13it/s]

tensor(2.0535, grad_fn=<NllLossBackward0>)
tensor(2.0424, grad_fn=<NllLossBackward0>)
tensor(2.0372, grad_fn=<NllLossBackward0>)


  2%|▏         | 312/17426 [00:27<23:34, 12.10it/s]

tensor(2.0310, grad_fn=<NllLossBackward0>)
tensor(2.0503, grad_fn=<NllLossBackward0>)
tensor(2.0375, grad_fn=<NllLossBackward0>)


  2%|▏         | 316/17426 [00:28<23:04, 12.36it/s]

tensor(2.0305, grad_fn=<NllLossBackward0>)
tensor(2.0481, grad_fn=<NllLossBackward0>)
tensor(2.0249, grad_fn=<NllLossBackward0>)


  2%|▏         | 318/17426 [00:28<23:13, 12.27it/s]

tensor(2.0508, grad_fn=<NllLossBackward0>)
tensor(2.0582, grad_fn=<NllLossBackward0>)
tensor(2.0477, grad_fn=<NllLossBackward0>)


  2%|▏         | 322/17426 [00:28<22:52, 12.46it/s]

tensor(2.0098, grad_fn=<NllLossBackward0>)
tensor(2.0317, grad_fn=<NllLossBackward0>)
tensor(2.0499, grad_fn=<NllLossBackward0>)


  2%|▏         | 324/17426 [00:28<23:17, 12.23it/s]

tensor(2.0279, grad_fn=<NllLossBackward0>)
tensor(2.0185, grad_fn=<NllLossBackward0>)
tensor(2.0069, grad_fn=<NllLossBackward0>)


  2%|▏         | 328/17426 [00:29<22:58, 12.41it/s]

tensor(2.0035, grad_fn=<NllLossBackward0>)
tensor(2.0581, grad_fn=<NllLossBackward0>)
tensor(2.0178, grad_fn=<NllLossBackward0>)


  2%|▏         | 330/17426 [00:29<23:16, 12.24it/s]

tensor(2.0393, grad_fn=<NllLossBackward0>)
tensor(1.9943, grad_fn=<NllLossBackward0>)
tensor(2.0211, grad_fn=<NllLossBackward0>)


  2%|▏         | 334/17426 [00:29<22:49, 12.48it/s]

tensor(2.0075, grad_fn=<NllLossBackward0>)
tensor(2.0258, grad_fn=<NllLossBackward0>)
tensor(2.0152, grad_fn=<NllLossBackward0>)


  2%|▏         | 336/17426 [00:29<23:34, 12.08it/s]

tensor(2.0299, grad_fn=<NllLossBackward0>)
tensor(2.0211, grad_fn=<NllLossBackward0>)
tensor(2.0080, grad_fn=<NllLossBackward0>)


  2%|▏         | 340/17426 [00:30<23:19, 12.21it/s]

tensor(2.0147, grad_fn=<NllLossBackward0>)
tensor(2.0389, grad_fn=<NllLossBackward0>)
tensor(2.0146, grad_fn=<NllLossBackward0>)


  2%|▏         | 342/17426 [00:30<27:00, 10.54it/s]

tensor(1.9762, grad_fn=<NllLossBackward0>)
tensor(1.9907, grad_fn=<NllLossBackward0>)


  2%|▏         | 344/17426 [00:30<29:10,  9.76it/s]

tensor(1.9723, grad_fn=<NllLossBackward0>)
tensor(2.0114, grad_fn=<NllLossBackward0>)


  2%|▏         | 346/17426 [00:30<30:40,  9.28it/s]

tensor(2.0006, grad_fn=<NllLossBackward0>)
tensor(2.0358, grad_fn=<NllLossBackward0>)


  2%|▏         | 348/17426 [00:31<31:35,  9.01it/s]

tensor(2.0464, grad_fn=<NllLossBackward0>)
tensor(2.0644, grad_fn=<NllLossBackward0>)


  2%|▏         | 350/17426 [00:31<33:12,  8.57it/s]

tensor(2.0215, grad_fn=<NllLossBackward0>)
tensor(2.0028, grad_fn=<NllLossBackward0>)


  2%|▏         | 352/17426 [00:31<33:04,  8.60it/s]

tensor(2.0299, grad_fn=<NllLossBackward0>)
tensor(2.0178, grad_fn=<NllLossBackward0>)


  2%|▏         | 354/17426 [00:31<34:43,  8.19it/s]

tensor(2.0049, grad_fn=<NllLossBackward0>)
tensor(2.0062, grad_fn=<NllLossBackward0>)


  2%|▏         | 356/17426 [00:32<36:23,  7.82it/s]

tensor(2.0267, grad_fn=<NllLossBackward0>)
tensor(2.0042, grad_fn=<NllLossBackward0>)


  2%|▏         | 358/17426 [00:32<36:44,  7.74it/s]

tensor(1.9926, grad_fn=<NllLossBackward0>)
tensor(2.0008, grad_fn=<NllLossBackward0>)


  2%|▏         | 360/17426 [00:32<36:49,  7.73it/s]

tensor(2.0163, grad_fn=<NllLossBackward0>)
tensor(2.0269, grad_fn=<NllLossBackward0>)


  2%|▏         | 363/17426 [00:33<29:33,  9.62it/s]

tensor(1.9884, grad_fn=<NllLossBackward0>)
tensor(2.0026, grad_fn=<NllLossBackward0>)
tensor(1.9777, grad_fn=<NllLossBackward0>)


  2%|▏         | 365/17426 [00:33<27:21, 10.40it/s]

tensor(1.9910, grad_fn=<NllLossBackward0>)
tensor(1.9691, grad_fn=<NllLossBackward0>)
tensor(1.9949, grad_fn=<NllLossBackward0>)


  2%|▏         | 369/17426 [00:33<24:43, 11.50it/s]

tensor(1.9732, grad_fn=<NllLossBackward0>)
tensor(1.9608, grad_fn=<NllLossBackward0>)
tensor(1.9804, grad_fn=<NllLossBackward0>)


  2%|▏         | 371/17426 [00:33<24:16, 11.71it/s]

tensor(1.9879, grad_fn=<NllLossBackward0>)
tensor(1.9847, grad_fn=<NllLossBackward0>)
tensor(1.9988, grad_fn=<NllLossBackward0>)


  2%|▏         | 375/17426 [00:34<23:08, 12.28it/s]

tensor(2.0071, grad_fn=<NllLossBackward0>)
tensor(2.0048, grad_fn=<NllLossBackward0>)
tensor(1.9915, grad_fn=<NllLossBackward0>)


  2%|▏         | 377/17426 [00:34<23:47, 11.94it/s]

tensor(1.9778, grad_fn=<NllLossBackward0>)
tensor(1.9527, grad_fn=<NllLossBackward0>)
tensor(1.9920, grad_fn=<NllLossBackward0>)


  2%|▏         | 381/17426 [00:34<23:05, 12.30it/s]

tensor(1.9585, grad_fn=<NllLossBackward0>)
tensor(2.0329, grad_fn=<NllLossBackward0>)
tensor(2.0150, grad_fn=<NllLossBackward0>)


  2%|▏         | 383/17426 [00:34<23:22, 12.15it/s]

tensor(1.9865, grad_fn=<NllLossBackward0>)
tensor(1.9831, grad_fn=<NllLossBackward0>)
tensor(1.9798, grad_fn=<NllLossBackward0>)


  2%|▏         | 387/17426 [00:34<22:44, 12.49it/s]

tensor(1.9905, grad_fn=<NllLossBackward0>)
tensor(1.9842, grad_fn=<NllLossBackward0>)
tensor(1.9584, grad_fn=<NllLossBackward0>)


  2%|▏         | 389/17426 [00:35<23:31, 12.07it/s]

tensor(1.9923, grad_fn=<NllLossBackward0>)
tensor(1.9898, grad_fn=<NllLossBackward0>)
tensor(1.9777, grad_fn=<NllLossBackward0>)


  2%|▏         | 393/17426 [00:35<23:29, 12.08it/s]

tensor(1.9894, grad_fn=<NllLossBackward0>)
tensor(1.9839, grad_fn=<NllLossBackward0>)
tensor(1.9811, grad_fn=<NllLossBackward0>)


  2%|▏         | 395/17426 [00:35<23:25, 12.11it/s]

tensor(1.9577, grad_fn=<NllLossBackward0>)
tensor(1.9683, grad_fn=<NllLossBackward0>)
tensor(1.9828, grad_fn=<NllLossBackward0>)


  2%|▏         | 399/17426 [00:35<22:44, 12.48it/s]

tensor(1.9680, grad_fn=<NllLossBackward0>)
tensor(2.0131, grad_fn=<NllLossBackward0>)
tensor(1.9801, grad_fn=<NllLossBackward0>)


  2%|▏         | 401/17426 [00:36<23:00, 12.33it/s]

tensor(1.9879, grad_fn=<NllLossBackward0>)
tensor(1.9571, grad_fn=<NllLossBackward0>)
tensor(1.9935, grad_fn=<NllLossBackward0>)


  2%|▏         | 405/17426 [00:36<23:16, 12.19it/s]

tensor(1.9495, grad_fn=<NllLossBackward0>)
tensor(1.9701, grad_fn=<NllLossBackward0>)
tensor(1.9568, grad_fn=<NllLossBackward0>)


  2%|▏         | 407/17426 [00:36<23:14, 12.20it/s]

tensor(1.9749, grad_fn=<NllLossBackward0>)
tensor(2.0180, grad_fn=<NllLossBackward0>)
tensor(1.9125, grad_fn=<NllLossBackward0>)


  2%|▏         | 411/17426 [00:36<22:56, 12.36it/s]

tensor(1.9715, grad_fn=<NllLossBackward0>)
tensor(1.9682, grad_fn=<NllLossBackward0>)
tensor(1.9639, grad_fn=<NllLossBackward0>)


  2%|▏         | 413/17426 [00:37<23:02, 12.31it/s]

tensor(1.9884, grad_fn=<NllLossBackward0>)
tensor(1.9699, grad_fn=<NllLossBackward0>)
tensor(1.9862, grad_fn=<NllLossBackward0>)


  2%|▏         | 417/17426 [00:37<23:18, 12.16it/s]

tensor(1.9540, grad_fn=<NllLossBackward0>)
tensor(1.9742, grad_fn=<NllLossBackward0>)
tensor(1.9402, grad_fn=<NllLossBackward0>)


  2%|▏         | 419/17426 [00:37<23:19, 12.15it/s]

tensor(1.9679, grad_fn=<NllLossBackward0>)
tensor(1.9986, grad_fn=<NllLossBackward0>)
tensor(1.9561, grad_fn=<NllLossBackward0>)


  2%|▏         | 423/17426 [00:37<24:08, 11.74it/s]

tensor(1.9973, grad_fn=<NllLossBackward0>)
tensor(1.9706, grad_fn=<NllLossBackward0>)
tensor(1.9974, grad_fn=<NllLossBackward0>)


  2%|▏         | 425/17426 [00:38<24:23, 11.62it/s]

tensor(1.9863, grad_fn=<NllLossBackward0>)
tensor(1.9234, grad_fn=<NllLossBackward0>)
tensor(1.9282, grad_fn=<NllLossBackward0>)


  2%|▏         | 429/17426 [00:38<23:57, 11.82it/s]

tensor(1.9434, grad_fn=<NllLossBackward0>)
tensor(1.9653, grad_fn=<NllLossBackward0>)
tensor(1.9501, grad_fn=<NllLossBackward0>)


  2%|▏         | 431/17426 [00:38<23:33, 12.03it/s]

tensor(1.9603, grad_fn=<NllLossBackward0>)
tensor(1.9571, grad_fn=<NllLossBackward0>)
tensor(1.9164, grad_fn=<NllLossBackward0>)


  2%|▏         | 435/17426 [00:38<23:02, 12.29it/s]

tensor(1.9755, grad_fn=<NllLossBackward0>)
tensor(1.9346, grad_fn=<NllLossBackward0>)
tensor(1.9570, grad_fn=<NllLossBackward0>)


  3%|▎         | 437/17426 [00:39<23:11, 12.21it/s]

tensor(1.9209, grad_fn=<NllLossBackward0>)
tensor(1.9336, grad_fn=<NllLossBackward0>)
tensor(1.9516, grad_fn=<NllLossBackward0>)


  3%|▎         | 441/17426 [00:39<23:17, 12.16it/s]

tensor(1.9632, grad_fn=<NllLossBackward0>)
tensor(1.9676, grad_fn=<NllLossBackward0>)
tensor(1.9309, grad_fn=<NllLossBackward0>)


  3%|▎         | 443/17426 [00:39<23:24, 12.09it/s]

tensor(1.9421, grad_fn=<NllLossBackward0>)
tensor(1.9390, grad_fn=<NllLossBackward0>)
tensor(1.9240, grad_fn=<NllLossBackward0>)


  3%|▎         | 447/17426 [00:39<23:08, 12.23it/s]

tensor(1.9360, grad_fn=<NllLossBackward0>)
tensor(1.9699, grad_fn=<NllLossBackward0>)
tensor(1.9449, grad_fn=<NllLossBackward0>)


  3%|▎         | 449/17426 [00:40<22:52, 12.36it/s]

tensor(1.9209, grad_fn=<NllLossBackward0>)
tensor(1.9385, grad_fn=<NllLossBackward0>)
tensor(1.9489, grad_fn=<NllLossBackward0>)


  3%|▎         | 453/17426 [00:40<23:29, 12.04it/s]

tensor(1.8858, grad_fn=<NllLossBackward0>)
tensor(1.9841, grad_fn=<NllLossBackward0>)
tensor(1.9074, grad_fn=<NllLossBackward0>)


  3%|▎         | 455/17426 [00:40<23:28, 12.05it/s]

tensor(1.9798, grad_fn=<NllLossBackward0>)
tensor(1.9442, grad_fn=<NllLossBackward0>)
tensor(1.9418, grad_fn=<NllLossBackward0>)


  3%|▎         | 459/17426 [00:40<23:04, 12.25it/s]

tensor(1.9424, grad_fn=<NllLossBackward0>)
tensor(1.9505, grad_fn=<NllLossBackward0>)
tensor(1.9319, grad_fn=<NllLossBackward0>)


  3%|▎         | 461/17426 [00:41<23:18, 12.13it/s]

tensor(1.9118, grad_fn=<NllLossBackward0>)
tensor(1.9589, grad_fn=<NllLossBackward0>)
tensor(1.9210, grad_fn=<NllLossBackward0>)


  3%|▎         | 465/17426 [00:41<23:31, 12.01it/s]

tensor(1.9457, grad_fn=<NllLossBackward0>)
tensor(1.9370, grad_fn=<NllLossBackward0>)
tensor(1.9234, grad_fn=<NllLossBackward0>)


  3%|▎         | 467/17426 [00:41<23:27, 12.05it/s]

tensor(1.9417, grad_fn=<NllLossBackward0>)
tensor(1.9050, grad_fn=<NllLossBackward0>)
tensor(1.9119, grad_fn=<NllLossBackward0>)


  3%|▎         | 471/17426 [00:41<23:14, 12.16it/s]

tensor(1.9286, grad_fn=<NllLossBackward0>)
tensor(1.9401, grad_fn=<NllLossBackward0>)
tensor(1.9319, grad_fn=<NllLossBackward0>)


  3%|▎         | 473/17426 [00:42<23:21, 12.10it/s]

tensor(1.9150, grad_fn=<NllLossBackward0>)
tensor(1.9228, grad_fn=<NllLossBackward0>)
tensor(1.9402, grad_fn=<NllLossBackward0>)


  3%|▎         | 477/17426 [00:42<23:42, 11.92it/s]

tensor(1.9113, grad_fn=<NllLossBackward0>)
tensor(1.9261, grad_fn=<NllLossBackward0>)
tensor(1.9013, grad_fn=<NllLossBackward0>)


  3%|▎         | 479/17426 [00:42<23:39, 11.94it/s]

tensor(1.9288, grad_fn=<NllLossBackward0>)
tensor(1.9744, grad_fn=<NllLossBackward0>)
tensor(1.9272, grad_fn=<NllLossBackward0>)


  3%|▎         | 481/17426 [00:42<23:41, 11.92it/s]

tensor(1.9361, grad_fn=<NllLossBackward0>)
tensor(1.9103, grad_fn=<NllLossBackward0>)


  3%|▎         | 483/17426 [00:43<26:17, 10.74it/s]

tensor(1.8896, grad_fn=<NllLossBackward0>)
tensor(1.9025, grad_fn=<NllLossBackward0>)


  3%|▎         | 485/17426 [00:43<28:57,  9.75it/s]

tensor(1.9100, grad_fn=<NllLossBackward0>)
tensor(1.9149, grad_fn=<NllLossBackward0>)


  3%|▎         | 488/17426 [00:43<31:28,  8.97it/s]

tensor(1.9062, grad_fn=<NllLossBackward0>)
tensor(1.9406, grad_fn=<NllLossBackward0>)


  3%|▎         | 490/17426 [00:43<31:55,  8.84it/s]

tensor(1.9395, grad_fn=<NllLossBackward0>)
tensor(1.8988, grad_fn=<NllLossBackward0>)


  3%|▎         | 492/17426 [00:44<31:58,  8.83it/s]

tensor(1.9249, grad_fn=<NllLossBackward0>)
tensor(1.9185, grad_fn=<NllLossBackward0>)


  3%|▎         | 494/17426 [00:44<32:06,  8.79it/s]

tensor(1.9139, grad_fn=<NllLossBackward0>)
tensor(1.9169, grad_fn=<NllLossBackward0>)


  3%|▎         | 496/17426 [00:44<34:32,  8.17it/s]

tensor(1.9098, grad_fn=<NllLossBackward0>)
tensor(1.8935, grad_fn=<NllLossBackward0>)


  3%|▎         | 498/17426 [00:44<35:11,  8.02it/s]

tensor(1.9390, grad_fn=<NllLossBackward0>)
tensor(1.9326, grad_fn=<NllLossBackward0>)


  3%|▎         | 500/17426 [00:45<35:35,  7.93it/s]

tensor(1.8852, grad_fn=<NllLossBackward0>)
tensor(1.9008, grad_fn=<NllLossBackward0>)


  3%|▎         | 502/17426 [00:45<36:20,  7.76it/s]

tensor(1.9415, grad_fn=<NllLossBackward0>)
tensor(1.9293, grad_fn=<NllLossBackward0>)


  3%|▎         | 503/17426 [00:45<37:43,  7.48it/s]

tensor(1.9330, grad_fn=<NllLossBackward0>)
tensor(1.9076, grad_fn=<NllLossBackward0>)


  3%|▎         | 506/17426 [00:45<40:18,  7.00it/s]

tensor(1.8973, grad_fn=<NllLossBackward0>)
tensor(1.8896, grad_fn=<NllLossBackward0>)


  3%|▎         | 508/17426 [00:46<33:04,  8.53it/s]

tensor(1.9114, grad_fn=<NllLossBackward0>)
tensor(1.9362, grad_fn=<NllLossBackward0>)
tensor(1.9198, grad_fn=<NllLossBackward0>)


  3%|▎         | 512/17426 [00:46<27:11, 10.37it/s]

tensor(1.9061, grad_fn=<NllLossBackward0>)
tensor(1.8851, grad_fn=<NllLossBackward0>)
tensor(1.9295, grad_fn=<NllLossBackward0>)


  3%|▎         | 514/17426 [00:46<26:25, 10.67it/s]

tensor(1.9220, grad_fn=<NllLossBackward0>)
tensor(1.9164, grad_fn=<NllLossBackward0>)
tensor(1.9261, grad_fn=<NllLossBackward0>)


  3%|▎         | 518/17426 [00:46<24:35, 11.46it/s]

tensor(1.9308, grad_fn=<NllLossBackward0>)
tensor(1.9105, grad_fn=<NllLossBackward0>)
tensor(1.8801, grad_fn=<NllLossBackward0>)


  3%|▎         | 520/17426 [00:47<24:04, 11.70it/s]

tensor(1.9061, grad_fn=<NllLossBackward0>)
tensor(1.9347, grad_fn=<NllLossBackward0>)
tensor(1.9203, grad_fn=<NllLossBackward0>)


  3%|▎         | 524/17426 [00:47<23:30, 11.98it/s]

tensor(1.9189, grad_fn=<NllLossBackward0>)
tensor(1.8762, grad_fn=<NllLossBackward0>)
tensor(1.9139, grad_fn=<NllLossBackward0>)


  3%|▎         | 526/17426 [00:47<24:09, 11.66it/s]

tensor(1.8783, grad_fn=<NllLossBackward0>)
tensor(1.9119, grad_fn=<NllLossBackward0>)
tensor(1.8970, grad_fn=<NllLossBackward0>)


  3%|▎         | 530/17426 [00:47<23:11, 12.14it/s]

tensor(1.9365, grad_fn=<NllLossBackward0>)
tensor(1.8676, grad_fn=<NllLossBackward0>)
tensor(1.8999, grad_fn=<NllLossBackward0>)


  3%|▎         | 532/17426 [00:48<23:13, 12.13it/s]

tensor(1.8621, grad_fn=<NllLossBackward0>)
tensor(1.9199, grad_fn=<NllLossBackward0>)
tensor(1.9063, grad_fn=<NllLossBackward0>)


  3%|▎         | 536/17426 [00:48<23:21, 12.05it/s]

tensor(1.8991, grad_fn=<NllLossBackward0>)
tensor(1.8923, grad_fn=<NllLossBackward0>)
tensor(1.8743, grad_fn=<NllLossBackward0>)


  3%|▎         | 538/17426 [00:48<23:47, 11.83it/s]

tensor(1.8702, grad_fn=<NllLossBackward0>)
tensor(1.8995, grad_fn=<NllLossBackward0>)
tensor(1.8866, grad_fn=<NllLossBackward0>)


  3%|▎         | 542/17426 [00:48<23:14, 12.10it/s]

tensor(1.8906, grad_fn=<NllLossBackward0>)
tensor(1.9235, grad_fn=<NllLossBackward0>)
tensor(1.8850, grad_fn=<NllLossBackward0>)


  3%|▎         | 544/17426 [00:49<22:57, 12.25it/s]

tensor(1.9228, grad_fn=<NllLossBackward0>)
tensor(1.8962, grad_fn=<NllLossBackward0>)
tensor(1.8965, grad_fn=<NllLossBackward0>)


  3%|▎         | 548/17426 [00:49<22:32, 12.48it/s]

tensor(1.8732, grad_fn=<NllLossBackward0>)
tensor(1.9101, grad_fn=<NllLossBackward0>)
tensor(1.8856, grad_fn=<NllLossBackward0>)


  3%|▎         | 550/17426 [00:49<22:55, 12.27it/s]

tensor(1.8750, grad_fn=<NllLossBackward0>)
tensor(1.8716, grad_fn=<NllLossBackward0>)
tensor(1.8753, grad_fn=<NllLossBackward0>)


  3%|▎         | 554/17426 [00:49<23:07, 12.16it/s]

tensor(1.8666, grad_fn=<NllLossBackward0>)
tensor(1.9025, grad_fn=<NllLossBackward0>)
tensor(1.9092, grad_fn=<NllLossBackward0>)


  3%|▎         | 556/17426 [00:50<23:11, 12.12it/s]

tensor(1.9213, grad_fn=<NllLossBackward0>)
tensor(1.8979, grad_fn=<NllLossBackward0>)
tensor(1.8808, grad_fn=<NllLossBackward0>)


  3%|▎         | 560/17426 [00:50<22:46, 12.34it/s]

tensor(1.8836, grad_fn=<NllLossBackward0>)
tensor(1.8606, grad_fn=<NllLossBackward0>)
tensor(1.8331, grad_fn=<NllLossBackward0>)


  3%|▎         | 562/17426 [00:50<22:58, 12.24it/s]

tensor(1.8806, grad_fn=<NllLossBackward0>)
tensor(1.8625, grad_fn=<NllLossBackward0>)
tensor(1.8713, grad_fn=<NllLossBackward0>)


  3%|▎         | 566/17426 [00:50<23:24, 12.00it/s]

tensor(1.8830, grad_fn=<NllLossBackward0>)
tensor(1.8838, grad_fn=<NllLossBackward0>)
tensor(1.8555, grad_fn=<NllLossBackward0>)


  3%|▎         | 568/17426 [00:51<23:09, 12.14it/s]

tensor(1.8497, grad_fn=<NllLossBackward0>)
tensor(1.8754, grad_fn=<NllLossBackward0>)
tensor(1.8671, grad_fn=<NllLossBackward0>)


  3%|▎         | 572/17426 [00:51<22:38, 12.41it/s]

tensor(1.9032, grad_fn=<NllLossBackward0>)
tensor(1.9024, grad_fn=<NllLossBackward0>)
tensor(1.9245, grad_fn=<NllLossBackward0>)


  3%|▎         | 574/17426 [00:51<22:49, 12.31it/s]

tensor(1.8805, grad_fn=<NllLossBackward0>)
tensor(1.8652, grad_fn=<NllLossBackward0>)
tensor(1.8651, grad_fn=<NllLossBackward0>)


  3%|▎         | 578/17426 [00:51<23:07, 12.14it/s]

tensor(1.9148, grad_fn=<NllLossBackward0>)
tensor(1.8856, grad_fn=<NllLossBackward0>)
tensor(1.9129, grad_fn=<NllLossBackward0>)


  3%|▎         | 580/17426 [00:52<22:54, 12.25it/s]

tensor(1.8603, grad_fn=<NllLossBackward0>)
tensor(1.9185, grad_fn=<NllLossBackward0>)
tensor(1.8640, grad_fn=<NllLossBackward0>)


  3%|▎         | 584/17426 [00:52<22:36, 12.42it/s]

tensor(1.8598, grad_fn=<NllLossBackward0>)
tensor(1.8772, grad_fn=<NllLossBackward0>)
tensor(1.8873, grad_fn=<NllLossBackward0>)


  3%|▎         | 586/17426 [00:52<22:37, 12.40it/s]

tensor(1.8663, grad_fn=<NllLossBackward0>)
tensor(1.8876, grad_fn=<NllLossBackward0>)
tensor(1.8649, grad_fn=<NllLossBackward0>)


  3%|▎         | 590/17426 [00:52<23:00, 12.20it/s]

tensor(1.8633, grad_fn=<NllLossBackward0>)
tensor(1.8597, grad_fn=<NllLossBackward0>)
tensor(1.8451, grad_fn=<NllLossBackward0>)


  3%|▎         | 592/17426 [00:53<22:56, 12.23it/s]

tensor(1.8602, grad_fn=<NllLossBackward0>)
tensor(1.8609, grad_fn=<NllLossBackward0>)
tensor(1.8565, grad_fn=<NllLossBackward0>)


  3%|▎         | 596/17426 [00:53<22:40, 12.37it/s]

tensor(1.8572, grad_fn=<NllLossBackward0>)
tensor(1.8440, grad_fn=<NllLossBackward0>)
tensor(1.8908, grad_fn=<NllLossBackward0>)


  3%|▎         | 598/17426 [00:53<22:51, 12.27it/s]

tensor(1.8744, grad_fn=<NllLossBackward0>)
tensor(1.8718, grad_fn=<NllLossBackward0>)
tensor(1.8766, grad_fn=<NllLossBackward0>)


  3%|▎         | 602/17426 [00:53<23:04, 12.15it/s]

tensor(1.8389, grad_fn=<NllLossBackward0>)
tensor(1.8537, grad_fn=<NllLossBackward0>)
tensor(1.8304, grad_fn=<NllLossBackward0>)


  3%|▎         | 604/17426 [00:54<23:08, 12.12it/s]

tensor(1.8662, grad_fn=<NllLossBackward0>)
tensor(1.8821, grad_fn=<NllLossBackward0>)
tensor(1.8447, grad_fn=<NllLossBackward0>)


  3%|▎         | 608/17426 [00:54<23:03, 12.16it/s]

tensor(1.8652, grad_fn=<NllLossBackward0>)
tensor(1.8527, grad_fn=<NllLossBackward0>)
tensor(1.8638, grad_fn=<NllLossBackward0>)


  4%|▎         | 610/17426 [00:54<23:00, 12.18it/s]

tensor(1.8655, grad_fn=<NllLossBackward0>)
tensor(1.8585, grad_fn=<NllLossBackward0>)
tensor(1.8250, grad_fn=<NllLossBackward0>)


  4%|▎         | 614/17426 [00:55<28:34,  9.81it/s]

tensor(1.9093, grad_fn=<NllLossBackward0>)
tensor(1.8387, grad_fn=<NllLossBackward0>)
tensor(1.8691, grad_fn=<NllLossBackward0>)


  4%|▎         | 616/17426 [00:55<27:05, 10.34it/s]

tensor(1.8763, grad_fn=<NllLossBackward0>)
tensor(1.8457, grad_fn=<NllLossBackward0>)
tensor(1.8605, grad_fn=<NllLossBackward0>)


  4%|▎         | 618/17426 [00:55<26:00, 10.77it/s]

tensor(1.8468, grad_fn=<NllLossBackward0>)
tensor(1.8682, grad_fn=<NllLossBackward0>)


  4%|▎         | 621/17426 [00:55<37:57,  7.38it/s]

tensor(1.8763, grad_fn=<NllLossBackward0>)
tensor(1.8271, grad_fn=<NllLossBackward0>)


  4%|▎         | 623/17426 [00:56<36:32,  7.66it/s]

tensor(1.8317, grad_fn=<NllLossBackward0>)
tensor(1.8346, grad_fn=<NllLossBackward0>)


  4%|▎         | 625/17426 [00:56<35:08,  7.97it/s]

tensor(1.8725, grad_fn=<NllLossBackward0>)
tensor(1.8501, grad_fn=<NllLossBackward0>)


  4%|▎         | 627/17426 [00:56<34:15,  8.17it/s]

tensor(1.8379, grad_fn=<NllLossBackward0>)
tensor(1.9033, grad_fn=<NllLossBackward0>)


  4%|▎         | 629/17426 [00:56<33:31,  8.35it/s]

tensor(1.8608, grad_fn=<NllLossBackward0>)
tensor(1.8400, grad_fn=<NllLossBackward0>)


  4%|▎         | 631/17426 [00:57<33:27,  8.37it/s]

tensor(1.8322, grad_fn=<NllLossBackward0>)
tensor(1.8642, grad_fn=<NllLossBackward0>)


  4%|▎         | 633/17426 [00:57<34:10,  8.19it/s]

tensor(1.8356, grad_fn=<NllLossBackward0>)
tensor(1.8486, grad_fn=<NllLossBackward0>)


  4%|▎         | 635/17426 [00:57<34:49,  8.03it/s]

tensor(1.8619, grad_fn=<NllLossBackward0>)
tensor(1.8373, grad_fn=<NllLossBackward0>)


  4%|▎         | 637/17426 [00:57<36:54,  7.58it/s]

tensor(1.8242, grad_fn=<NllLossBackward0>)
tensor(1.8149, grad_fn=<NllLossBackward0>)


  4%|▎         | 639/17426 [00:58<37:43,  7.42it/s]

tensor(1.8884, grad_fn=<NllLossBackward0>)
tensor(1.8613, grad_fn=<NllLossBackward0>)


  4%|▎         | 642/17426 [00:58<29:11,  9.58it/s]

tensor(1.8725, grad_fn=<NllLossBackward0>)
tensor(1.8563, grad_fn=<NllLossBackward0>)
tensor(1.8470, grad_fn=<NllLossBackward0>)


  4%|▎         | 644/17426 [00:58<27:01, 10.35it/s]

tensor(1.8685, grad_fn=<NllLossBackward0>)
tensor(1.8767, grad_fn=<NllLossBackward0>)
tensor(1.8208, grad_fn=<NllLossBackward0>)


  4%|▎         | 648/17426 [00:58<24:30, 11.41it/s]

tensor(1.8272, grad_fn=<NllLossBackward0>)
tensor(1.8647, grad_fn=<NllLossBackward0>)
tensor(1.8115, grad_fn=<NllLossBackward0>)


  4%|▎         | 650/17426 [00:59<24:27, 11.43it/s]

tensor(1.8420, grad_fn=<NllLossBackward0>)
tensor(1.8412, grad_fn=<NllLossBackward0>)
tensor(1.8727, grad_fn=<NllLossBackward0>)


  4%|▍         | 654/17426 [00:59<23:21, 11.97it/s]

tensor(1.8404, grad_fn=<NllLossBackward0>)
tensor(1.8585, grad_fn=<NllLossBackward0>)
tensor(1.7993, grad_fn=<NllLossBackward0>)


  4%|▍         | 656/17426 [00:59<23:25, 11.93it/s]

tensor(1.8555, grad_fn=<NllLossBackward0>)
tensor(1.8084, grad_fn=<NllLossBackward0>)
tensor(1.8103, grad_fn=<NllLossBackward0>)


  4%|▍         | 660/17426 [00:59<22:52, 12.22it/s]

tensor(1.8495, grad_fn=<NllLossBackward0>)
tensor(1.8368, grad_fn=<NllLossBackward0>)
tensor(1.7916, grad_fn=<NllLossBackward0>)


  4%|▍         | 662/17426 [01:00<23:35, 11.84it/s]

tensor(1.8159, grad_fn=<NllLossBackward0>)
tensor(1.8396, grad_fn=<NllLossBackward0>)
tensor(1.8430, grad_fn=<NllLossBackward0>)


  4%|▍         | 666/17426 [01:00<22:58, 12.15it/s]

tensor(1.8737, grad_fn=<NllLossBackward0>)
tensor(1.8252, grad_fn=<NllLossBackward0>)
tensor(1.8551, grad_fn=<NllLossBackward0>)


  4%|▍         | 668/17426 [01:00<22:57, 12.16it/s]

tensor(1.8511, grad_fn=<NllLossBackward0>)
tensor(1.8308, grad_fn=<NllLossBackward0>)
tensor(1.8560, grad_fn=<NllLossBackward0>)


  4%|▍         | 672/17426 [01:00<22:46, 12.26it/s]

tensor(1.8461, grad_fn=<NllLossBackward0>)
tensor(1.8348, grad_fn=<NllLossBackward0>)
tensor(1.8141, grad_fn=<NllLossBackward0>)


  4%|▍         | 674/17426 [01:01<23:24, 11.93it/s]

tensor(1.8221, grad_fn=<NllLossBackward0>)
tensor(1.8193, grad_fn=<NllLossBackward0>)
tensor(1.8634, grad_fn=<NllLossBackward0>)


  4%|▍         | 678/17426 [01:01<23:03, 12.10it/s]

tensor(1.8314, grad_fn=<NllLossBackward0>)
tensor(1.8474, grad_fn=<NllLossBackward0>)
tensor(1.8320, grad_fn=<NllLossBackward0>)


  4%|▍         | 680/17426 [01:01<23:04, 12.10it/s]

tensor(1.8200, grad_fn=<NllLossBackward0>)
tensor(1.8091, grad_fn=<NllLossBackward0>)
tensor(1.8225, grad_fn=<NllLossBackward0>)


  4%|▍         | 684/17426 [01:01<23:11, 12.03it/s]

tensor(1.7874, grad_fn=<NllLossBackward0>)
tensor(1.8146, grad_fn=<NllLossBackward0>)
tensor(1.8363, grad_fn=<NllLossBackward0>)


  4%|▍         | 686/17426 [01:02<24:10, 11.54it/s]

tensor(1.8523, grad_fn=<NllLossBackward0>)
tensor(1.8488, grad_fn=<NllLossBackward0>)
tensor(1.8332, grad_fn=<NllLossBackward0>)


  4%|▍         | 690/17426 [01:02<23:14, 12.00it/s]

tensor(1.8561, grad_fn=<NllLossBackward0>)
tensor(1.8242, grad_fn=<NllLossBackward0>)
tensor(1.8297, grad_fn=<NllLossBackward0>)


  4%|▍         | 692/17426 [01:02<23:06, 12.07it/s]

tensor(1.8034, grad_fn=<NllLossBackward0>)
tensor(1.8285, grad_fn=<NllLossBackward0>)
tensor(1.8348, grad_fn=<NllLossBackward0>)


  4%|▍         | 696/17426 [01:02<22:40, 12.30it/s]

tensor(1.8733, grad_fn=<NllLossBackward0>)
tensor(1.8375, grad_fn=<NllLossBackward0>)
tensor(1.8496, grad_fn=<NllLossBackward0>)


  4%|▍         | 698/17426 [01:03<22:50, 12.20it/s]

tensor(1.7906, grad_fn=<NllLossBackward0>)
tensor(1.7978, grad_fn=<NllLossBackward0>)
tensor(1.8506, grad_fn=<NllLossBackward0>)


  4%|▍         | 702/17426 [01:03<22:54, 12.17it/s]

tensor(1.8106, grad_fn=<NllLossBackward0>)
tensor(1.7959, grad_fn=<NllLossBackward0>)
tensor(1.8010, grad_fn=<NllLossBackward0>)


  4%|▍         | 704/17426 [01:03<22:46, 12.24it/s]

tensor(1.8391, grad_fn=<NllLossBackward0>)
tensor(1.8222, grad_fn=<NllLossBackward0>)
tensor(1.7652, grad_fn=<NllLossBackward0>)


  4%|▍         | 708/17426 [01:03<22:27, 12.41it/s]

tensor(1.7885, grad_fn=<NllLossBackward0>)
tensor(1.8084, grad_fn=<NllLossBackward0>)
tensor(1.8108, grad_fn=<NllLossBackward0>)


  4%|▍         | 710/17426 [01:04<22:30, 12.38it/s]

tensor(1.7994, grad_fn=<NllLossBackward0>)
tensor(1.8245, grad_fn=<NllLossBackward0>)
tensor(1.8389, grad_fn=<NllLossBackward0>)


  4%|▍         | 714/17426 [01:04<22:50, 12.20it/s]

tensor(1.8110, grad_fn=<NllLossBackward0>)
tensor(1.8579, grad_fn=<NllLossBackward0>)
tensor(1.8456, grad_fn=<NllLossBackward0>)


  4%|▍         | 716/17426 [01:04<22:44, 12.25it/s]

tensor(1.8266, grad_fn=<NllLossBackward0>)
tensor(1.8112, grad_fn=<NllLossBackward0>)
tensor(1.7944, grad_fn=<NllLossBackward0>)


  4%|▍         | 720/17426 [01:04<22:35, 12.33it/s]

tensor(1.8015, grad_fn=<NllLossBackward0>)
tensor(1.8145, grad_fn=<NllLossBackward0>)
tensor(1.7937, grad_fn=<NllLossBackward0>)


  4%|▍         | 722/17426 [01:05<22:38, 12.30it/s]

tensor(1.8546, grad_fn=<NllLossBackward0>)
tensor(1.8318, grad_fn=<NllLossBackward0>)
tensor(1.7794, grad_fn=<NllLossBackward0>)


  4%|▍         | 726/17426 [01:05<23:13, 11.98it/s]

tensor(1.8010, grad_fn=<NllLossBackward0>)
tensor(1.8140, grad_fn=<NllLossBackward0>)
tensor(1.8075, grad_fn=<NllLossBackward0>)


  4%|▍         | 728/17426 [01:05<22:52, 12.17it/s]

tensor(1.8478, grad_fn=<NllLossBackward0>)
tensor(1.8486, grad_fn=<NllLossBackward0>)
tensor(1.8361, grad_fn=<NllLossBackward0>)


  4%|▍         | 732/17426 [01:05<22:47, 12.21it/s]

tensor(1.8306, grad_fn=<NllLossBackward0>)
tensor(1.8289, grad_fn=<NllLossBackward0>)
tensor(1.8366, grad_fn=<NllLossBackward0>)


  4%|▍         | 734/17426 [01:06<22:56, 12.13it/s]

tensor(1.7880, grad_fn=<NllLossBackward0>)
tensor(1.8513, grad_fn=<NllLossBackward0>)
tensor(1.8287, grad_fn=<NllLossBackward0>)


  4%|▍         | 738/17426 [01:06<23:09, 12.01it/s]

tensor(1.8155, grad_fn=<NllLossBackward0>)
tensor(1.7829, grad_fn=<NllLossBackward0>)
tensor(1.8253, grad_fn=<NllLossBackward0>)


  4%|▍         | 740/17426 [01:06<22:50, 12.18it/s]

tensor(1.8132, grad_fn=<NllLossBackward0>)
tensor(1.8424, grad_fn=<NllLossBackward0>)
tensor(1.7530, grad_fn=<NllLossBackward0>)


  4%|▍         | 744/17426 [01:06<22:50, 12.18it/s]

tensor(1.8482, grad_fn=<NllLossBackward0>)
tensor(1.7709, grad_fn=<NllLossBackward0>)
tensor(1.7862, grad_fn=<NllLossBackward0>)


  4%|▍         | 746/17426 [01:07<22:41, 12.25it/s]

tensor(1.8309, grad_fn=<NllLossBackward0>)
tensor(1.7948, grad_fn=<NllLossBackward0>)
tensor(1.7922, grad_fn=<NllLossBackward0>)


  4%|▍         | 750/17426 [01:07<23:36, 11.77it/s]

tensor(1.8081, grad_fn=<NllLossBackward0>)
tensor(1.7907, grad_fn=<NllLossBackward0>)
tensor(1.8582, grad_fn=<NllLossBackward0>)


  4%|▍         | 752/17426 [01:07<23:16, 11.94it/s]

tensor(1.7834, grad_fn=<NllLossBackward0>)
tensor(1.8029, grad_fn=<NllLossBackward0>)
tensor(1.8208, grad_fn=<NllLossBackward0>)


  4%|▍         | 756/17426 [01:07<22:36, 12.29it/s]

tensor(1.7729, grad_fn=<NllLossBackward0>)
tensor(1.8310, grad_fn=<NllLossBackward0>)
tensor(1.7892, grad_fn=<NllLossBackward0>)


  4%|▍         | 758/17426 [01:08<23:27, 11.84it/s]

tensor(1.8368, grad_fn=<NllLossBackward0>)
tensor(1.7900, grad_fn=<NllLossBackward0>)
tensor(1.8155, grad_fn=<NllLossBackward0>)


  4%|▍         | 760/17426 [01:08<23:26, 11.85it/s]

tensor(1.8700, grad_fn=<NllLossBackward0>)
tensor(1.7936, grad_fn=<NllLossBackward0>)


  4%|▍         | 762/17426 [01:08<27:55,  9.95it/s]

tensor(1.7964, grad_fn=<NllLossBackward0>)
tensor(1.8056, grad_fn=<NllLossBackward0>)


  4%|▍         | 765/17426 [01:08<30:49,  9.01it/s]

tensor(1.8162, grad_fn=<NllLossBackward0>)
tensor(1.7776, grad_fn=<NllLossBackward0>)


  4%|▍         | 767/17426 [01:09<32:00,  8.67it/s]

tensor(1.7978, grad_fn=<NllLossBackward0>)
tensor(1.8105, grad_fn=<NllLossBackward0>)


  4%|▍         | 769/17426 [01:09<33:00,  8.41it/s]

tensor(1.8032, grad_fn=<NllLossBackward0>)
tensor(1.8248, grad_fn=<NllLossBackward0>)


  4%|▍         | 771/17426 [01:09<33:51,  8.20it/s]

tensor(1.7962, grad_fn=<NllLossBackward0>)
tensor(1.7776, grad_fn=<NllLossBackward0>)


  4%|▍         | 773/17426 [01:09<34:01,  8.16it/s]

tensor(1.8227, grad_fn=<NllLossBackward0>)
tensor(1.8073, grad_fn=<NllLossBackward0>)


  4%|▍         | 775/17426 [01:10<35:31,  7.81it/s]

tensor(1.8109, grad_fn=<NllLossBackward0>)
tensor(1.8186, grad_fn=<NllLossBackward0>)


  4%|▍         | 777/17426 [01:10<37:16,  7.45it/s]

tensor(1.7785, grad_fn=<NllLossBackward0>)
tensor(1.7875, grad_fn=<NllLossBackward0>)


  4%|▍         | 779/17426 [01:10<37:19,  7.43it/s]

tensor(1.7972, grad_fn=<NllLossBackward0>)
tensor(1.7877, grad_fn=<NllLossBackward0>)


  4%|▍         | 781/17426 [01:10<35:14,  7.87it/s]

tensor(1.7627, grad_fn=<NllLossBackward0>)
tensor(1.8436, grad_fn=<NllLossBackward0>)
tensor(1.7719, grad_fn=<NllLossBackward0>)


  5%|▍         | 785/17426 [01:11<26:54, 10.31it/s]

tensor(1.8036, grad_fn=<NllLossBackward0>)
tensor(1.7673, grad_fn=<NllLossBackward0>)
tensor(1.8095, grad_fn=<NllLossBackward0>)


  5%|▍         | 787/17426 [01:11<25:29, 10.88it/s]

tensor(1.8037, grad_fn=<NllLossBackward0>)
tensor(1.8203, grad_fn=<NllLossBackward0>)
tensor(1.8007, grad_fn=<NllLossBackward0>)


  5%|▍         | 791/17426 [01:11<24:22, 11.37it/s]

tensor(1.7967, grad_fn=<NllLossBackward0>)
tensor(1.8171, grad_fn=<NllLossBackward0>)
tensor(1.7874, grad_fn=<NllLossBackward0>)


  5%|▍         | 793/17426 [01:11<24:16, 11.42it/s]

tensor(1.7727, grad_fn=<NllLossBackward0>)
tensor(1.8315, grad_fn=<NllLossBackward0>)
tensor(1.8177, grad_fn=<NllLossBackward0>)


  5%|▍         | 797/17426 [01:12<23:25, 11.83it/s]

tensor(1.8374, grad_fn=<NllLossBackward0>)
tensor(1.8019, grad_fn=<NllLossBackward0>)
tensor(1.7875, grad_fn=<NllLossBackward0>)


  5%|▍         | 799/17426 [01:12<23:15, 11.92it/s]

tensor(1.7759, grad_fn=<NllLossBackward0>)
tensor(1.8067, grad_fn=<NllLossBackward0>)
tensor(1.8139, grad_fn=<NllLossBackward0>)


  5%|▍         | 803/17426 [01:12<23:00, 12.04it/s]

tensor(1.7732, grad_fn=<NllLossBackward0>)
tensor(1.7606, grad_fn=<NllLossBackward0>)
tensor(1.7595, grad_fn=<NllLossBackward0>)


  5%|▍         | 805/17426 [01:12<23:00, 12.04it/s]

tensor(1.7944, grad_fn=<NllLossBackward0>)
tensor(1.7637, grad_fn=<NllLossBackward0>)
tensor(1.7594, grad_fn=<NllLossBackward0>)


  5%|▍         | 809/17426 [01:13<22:40, 12.21it/s]

tensor(1.7833, grad_fn=<NllLossBackward0>)
tensor(1.7832, grad_fn=<NllLossBackward0>)
tensor(1.7679, grad_fn=<NllLossBackward0>)


  5%|▍         | 811/17426 [01:13<22:45, 12.17it/s]

tensor(1.7346, grad_fn=<NllLossBackward0>)
tensor(1.7989, grad_fn=<NllLossBackward0>)
tensor(1.7797, grad_fn=<NllLossBackward0>)


  5%|▍         | 815/17426 [01:13<22:51, 12.11it/s]

tensor(1.8002, grad_fn=<NllLossBackward0>)
tensor(1.7791, grad_fn=<NllLossBackward0>)
tensor(1.8012, grad_fn=<NllLossBackward0>)


  5%|▍         | 817/17426 [01:13<22:59, 12.04it/s]

tensor(1.7982, grad_fn=<NllLossBackward0>)
tensor(1.7769, grad_fn=<NllLossBackward0>)
tensor(1.7758, grad_fn=<NllLossBackward0>)


  5%|▍         | 821/17426 [01:14<22:37, 12.23it/s]

tensor(1.8058, grad_fn=<NllLossBackward0>)
tensor(1.8090, grad_fn=<NllLossBackward0>)
tensor(1.8132, grad_fn=<NllLossBackward0>)


  5%|▍         | 823/17426 [01:14<22:42, 12.18it/s]

tensor(1.8010, grad_fn=<NllLossBackward0>)
tensor(1.7695, grad_fn=<NllLossBackward0>)
tensor(1.7762, grad_fn=<NllLossBackward0>)


  5%|▍         | 827/17426 [01:14<22:43, 12.18it/s]

tensor(1.7782, grad_fn=<NllLossBackward0>)
tensor(1.7803, grad_fn=<NllLossBackward0>)
tensor(1.7851, grad_fn=<NllLossBackward0>)


  5%|▍         | 829/17426 [01:14<22:47, 12.14it/s]

tensor(1.7951, grad_fn=<NllLossBackward0>)
tensor(1.7681, grad_fn=<NllLossBackward0>)
tensor(1.7846, grad_fn=<NllLossBackward0>)


  5%|▍         | 833/17426 [01:15<22:33, 12.26it/s]

tensor(1.7922, grad_fn=<NllLossBackward0>)
tensor(1.8190, grad_fn=<NllLossBackward0>)
tensor(1.7594, grad_fn=<NllLossBackward0>)


  5%|▍         | 835/17426 [01:15<22:35, 12.24it/s]

tensor(1.7484, grad_fn=<NllLossBackward0>)
tensor(1.7966, grad_fn=<NllLossBackward0>)
tensor(1.8093, grad_fn=<NllLossBackward0>)


  5%|▍         | 839/17426 [01:15<23:08, 11.95it/s]

tensor(1.7917, grad_fn=<NllLossBackward0>)
tensor(1.8202, grad_fn=<NllLossBackward0>)
tensor(1.8189, grad_fn=<NllLossBackward0>)


  5%|▍         | 841/17426 [01:15<23:00, 12.01it/s]

tensor(1.7405, grad_fn=<NllLossBackward0>)
tensor(1.7970, grad_fn=<NllLossBackward0>)
tensor(1.7784, grad_fn=<NllLossBackward0>)


  5%|▍         | 845/17426 [01:16<22:43, 12.16it/s]

tensor(1.7847, grad_fn=<NllLossBackward0>)
tensor(1.7890, grad_fn=<NllLossBackward0>)
tensor(1.7877, grad_fn=<NllLossBackward0>)


  5%|▍         | 847/17426 [01:16<22:49, 12.10it/s]

tensor(1.7336, grad_fn=<NllLossBackward0>)
tensor(1.7880, grad_fn=<NllLossBackward0>)
tensor(1.7907, grad_fn=<NllLossBackward0>)


  5%|▍         | 851/17426 [01:16<23:11, 11.91it/s]

tensor(1.7834, grad_fn=<NllLossBackward0>)
tensor(1.7852, grad_fn=<NllLossBackward0>)
tensor(1.7793, grad_fn=<NllLossBackward0>)


  5%|▍         | 853/17426 [01:16<23:07, 11.95it/s]

tensor(1.7985, grad_fn=<NllLossBackward0>)
tensor(1.7905, grad_fn=<NllLossBackward0>)
tensor(1.7532, grad_fn=<NllLossBackward0>)


  5%|▍         | 857/17426 [01:17<22:42, 12.17it/s]

tensor(1.7251, grad_fn=<NllLossBackward0>)
tensor(1.8095, grad_fn=<NllLossBackward0>)
tensor(1.7589, grad_fn=<NllLossBackward0>)


  5%|▍         | 859/17426 [01:17<22:30, 12.27it/s]

tensor(1.7997, grad_fn=<NllLossBackward0>)
tensor(1.7711, grad_fn=<NllLossBackward0>)
tensor(1.7368, grad_fn=<NllLossBackward0>)


  5%|▍         | 863/17426 [01:17<22:52, 12.07it/s]

tensor(1.7827, grad_fn=<NllLossBackward0>)
tensor(1.7987, grad_fn=<NllLossBackward0>)
tensor(1.7924, grad_fn=<NllLossBackward0>)


  5%|▍         | 865/17426 [01:17<22:44, 12.14it/s]

tensor(1.8257, grad_fn=<NllLossBackward0>)
tensor(1.7606, grad_fn=<NllLossBackward0>)
tensor(1.8214, grad_fn=<NllLossBackward0>)


  5%|▍         | 869/17426 [01:18<22:38, 12.19it/s]

tensor(1.7605, grad_fn=<NllLossBackward0>)
tensor(1.7752, grad_fn=<NllLossBackward0>)
tensor(1.7580, grad_fn=<NllLossBackward0>)


  5%|▍         | 871/17426 [01:18<22:42, 12.15it/s]

tensor(1.7527, grad_fn=<NllLossBackward0>)
tensor(1.7721, grad_fn=<NllLossBackward0>)
tensor(1.8260, grad_fn=<NllLossBackward0>)


  5%|▌         | 875/17426 [01:18<22:51, 12.07it/s]

tensor(1.7930, grad_fn=<NllLossBackward0>)
tensor(1.7809, grad_fn=<NllLossBackward0>)
tensor(1.8009, grad_fn=<NllLossBackward0>)


  5%|▌         | 877/17426 [01:18<23:01, 11.97it/s]

tensor(1.7917, grad_fn=<NllLossBackward0>)
tensor(1.7978, grad_fn=<NllLossBackward0>)
tensor(1.7781, grad_fn=<NllLossBackward0>)


  5%|▌         | 881/17426 [01:19<22:22, 12.33it/s]

tensor(1.7593, grad_fn=<NllLossBackward0>)
tensor(1.8274, grad_fn=<NllLossBackward0>)
tensor(1.7286, grad_fn=<NllLossBackward0>)


  5%|▌         | 883/17426 [01:19<22:39, 12.17it/s]

tensor(1.7957, grad_fn=<NllLossBackward0>)
tensor(1.7831, grad_fn=<NllLossBackward0>)
tensor(1.7745, grad_fn=<NllLossBackward0>)


  5%|▌         | 887/17426 [01:19<22:12, 12.41it/s]

tensor(1.7547, grad_fn=<NllLossBackward0>)
tensor(1.8070, grad_fn=<NllLossBackward0>)
tensor(1.7579, grad_fn=<NllLossBackward0>)


  5%|▌         | 889/17426 [01:19<22:45, 12.11it/s]

tensor(1.7913, grad_fn=<NllLossBackward0>)
tensor(1.7564, grad_fn=<NllLossBackward0>)
tensor(1.7358, grad_fn=<NllLossBackward0>)


  5%|▌         | 893/17426 [01:20<22:35, 12.19it/s]

tensor(1.7822, grad_fn=<NllLossBackward0>)
tensor(1.7741, grad_fn=<NllLossBackward0>)
tensor(1.7659, grad_fn=<NllLossBackward0>)


  5%|▌         | 895/17426 [01:20<22:35, 12.19it/s]

tensor(1.7969, grad_fn=<NllLossBackward0>)
tensor(1.7415, grad_fn=<NllLossBackward0>)
tensor(1.7523, grad_fn=<NllLossBackward0>)


  5%|▌         | 899/17426 [01:20<22:19, 12.34it/s]

tensor(1.7648, grad_fn=<NllLossBackward0>)
tensor(1.7834, grad_fn=<NllLossBackward0>)
tensor(1.7575, grad_fn=<NllLossBackward0>)


  5%|▌         | 901/17426 [01:20<23:07, 11.91it/s]

tensor(1.7261, grad_fn=<NllLossBackward0>)
tensor(1.7545, grad_fn=<NllLossBackward0>)
tensor(1.8024, grad_fn=<NllLossBackward0>)


  5%|▌         | 903/17426 [01:21<26:20, 10.45it/s]

tensor(1.7736, grad_fn=<NllLossBackward0>)
tensor(1.7468, grad_fn=<NllLossBackward0>)


  5%|▌         | 906/17426 [01:21<29:30,  9.33it/s]

tensor(1.7298, grad_fn=<NllLossBackward0>)
tensor(1.8070, grad_fn=<NllLossBackward0>)


  5%|▌         | 908/17426 [01:21<31:29,  8.74it/s]

tensor(1.7540, grad_fn=<NllLossBackward0>)
tensor(1.7733, grad_fn=<NllLossBackward0>)


  5%|▌         | 910/17426 [01:22<32:41,  8.42it/s]

tensor(1.7813, grad_fn=<NllLossBackward0>)
tensor(1.7658, grad_fn=<NllLossBackward0>)


  5%|▌         | 912/17426 [01:22<32:07,  8.57it/s]

tensor(1.7803, grad_fn=<NllLossBackward0>)
tensor(1.7565, grad_fn=<NllLossBackward0>)


  5%|▌         | 914/17426 [01:22<32:25,  8.49it/s]

tensor(1.7754, grad_fn=<NllLossBackward0>)
tensor(1.7413, grad_fn=<NllLossBackward0>)


  5%|▌         | 916/17426 [01:22<33:42,  8.16it/s]

tensor(1.7534, grad_fn=<NllLossBackward0>)
tensor(1.7463, grad_fn=<NllLossBackward0>)


  5%|▌         | 918/17426 [01:23<35:12,  7.81it/s]

tensor(1.7893, grad_fn=<NllLossBackward0>)
tensor(1.7680, grad_fn=<NllLossBackward0>)


  5%|▌         | 920/17426 [01:23<37:04,  7.42it/s]

tensor(1.7651, grad_fn=<NllLossBackward0>)
tensor(1.7552, grad_fn=<NllLossBackward0>)


  5%|▌         | 922/17426 [01:23<37:27,  7.34it/s]

tensor(1.7767, grad_fn=<NllLossBackward0>)
tensor(1.7643, grad_fn=<NllLossBackward0>)


  5%|▌         | 925/17426 [01:23<29:05,  9.45it/s]

tensor(1.7842, grad_fn=<NllLossBackward0>)
tensor(1.7698, grad_fn=<NllLossBackward0>)
tensor(1.7219, grad_fn=<NllLossBackward0>)


  5%|▌         | 927/17426 [01:24<27:06, 10.14it/s]

tensor(1.7733, grad_fn=<NllLossBackward0>)
tensor(1.7493, grad_fn=<NllLossBackward0>)
tensor(1.7759, grad_fn=<NllLossBackward0>)


  5%|▌         | 931/17426 [01:24<24:28, 11.24it/s]

tensor(1.7791, grad_fn=<NllLossBackward0>)
tensor(1.8220, grad_fn=<NllLossBackward0>)
tensor(1.7702, grad_fn=<NllLossBackward0>)


  5%|▌         | 933/17426 [01:24<23:43, 11.58it/s]

tensor(1.7374, grad_fn=<NllLossBackward0>)
tensor(1.7496, grad_fn=<NllLossBackward0>)
tensor(1.7682, grad_fn=<NllLossBackward0>)


  5%|▌         | 937/17426 [01:24<23:05, 11.90it/s]

tensor(1.7991, grad_fn=<NllLossBackward0>)
tensor(1.8186, grad_fn=<NllLossBackward0>)
tensor(1.7302, grad_fn=<NllLossBackward0>)


  5%|▌         | 939/17426 [01:25<23:22, 11.76it/s]

tensor(1.7442, grad_fn=<NllLossBackward0>)
tensor(1.7328, grad_fn=<NllLossBackward0>)
tensor(1.7686, grad_fn=<NllLossBackward0>)


  5%|▌         | 943/17426 [01:25<23:03, 11.91it/s]

tensor(1.7892, grad_fn=<NllLossBackward0>)
tensor(1.7521, grad_fn=<NllLossBackward0>)
tensor(1.7236, grad_fn=<NllLossBackward0>)


  5%|▌         | 945/17426 [01:25<23:08, 11.87it/s]

tensor(1.7672, grad_fn=<NllLossBackward0>)
tensor(1.7674, grad_fn=<NllLossBackward0>)
tensor(1.7306, grad_fn=<NllLossBackward0>)


  5%|▌         | 949/17426 [01:25<22:40, 12.11it/s]

tensor(1.7593, grad_fn=<NllLossBackward0>)
tensor(1.7763, grad_fn=<NllLossBackward0>)
tensor(1.7537, grad_fn=<NllLossBackward0>)


  5%|▌         | 951/17426 [01:26<23:04, 11.90it/s]

tensor(1.7874, grad_fn=<NllLossBackward0>)
tensor(1.7660, grad_fn=<NllLossBackward0>)
tensor(1.7802, grad_fn=<NllLossBackward0>)


  5%|▌         | 955/17426 [01:26<22:43, 12.08it/s]

tensor(1.7508, grad_fn=<NllLossBackward0>)
tensor(1.7520, grad_fn=<NllLossBackward0>)
tensor(1.7414, grad_fn=<NllLossBackward0>)


  5%|▌         | 957/17426 [01:26<22:40, 12.10it/s]

tensor(1.7255, grad_fn=<NllLossBackward0>)
tensor(1.7350, grad_fn=<NllLossBackward0>)
tensor(1.7343, grad_fn=<NllLossBackward0>)


  6%|▌         | 961/17426 [01:26<22:38, 12.12it/s]

tensor(1.7322, grad_fn=<NllLossBackward0>)
tensor(1.7175, grad_fn=<NllLossBackward0>)
tensor(1.7397, grad_fn=<NllLossBackward0>)


  6%|▌         | 963/17426 [01:27<22:35, 12.14it/s]

tensor(1.7498, grad_fn=<NllLossBackward0>)
tensor(1.7381, grad_fn=<NllLossBackward0>)
tensor(1.7986, grad_fn=<NllLossBackward0>)


  6%|▌         | 967/17426 [01:27<22:50, 12.01it/s]

tensor(1.7423, grad_fn=<NllLossBackward0>)
tensor(1.7914, grad_fn=<NllLossBackward0>)
tensor(1.7828, grad_fn=<NllLossBackward0>)


  6%|▌         | 969/17426 [01:27<22:46, 12.05it/s]

tensor(1.7671, grad_fn=<NllLossBackward0>)
tensor(1.7294, grad_fn=<NllLossBackward0>)
tensor(1.7635, grad_fn=<NllLossBackward0>)


  6%|▌         | 973/17426 [01:27<22:31, 12.18it/s]

tensor(1.7945, grad_fn=<NllLossBackward0>)
tensor(1.7619, grad_fn=<NllLossBackward0>)
tensor(1.7703, grad_fn=<NllLossBackward0>)


  6%|▌         | 975/17426 [01:28<22:33, 12.16it/s]

tensor(1.7834, grad_fn=<NllLossBackward0>)
tensor(1.7265, grad_fn=<NllLossBackward0>)
tensor(1.7424, grad_fn=<NllLossBackward0>)


  6%|▌         | 979/17426 [01:28<22:37, 12.12it/s]

tensor(1.7392, grad_fn=<NllLossBackward0>)
tensor(1.7227, grad_fn=<NllLossBackward0>)
tensor(1.7888, grad_fn=<NllLossBackward0>)


  6%|▌         | 981/17426 [01:28<22:40, 12.09it/s]

tensor(1.7543, grad_fn=<NllLossBackward0>)
tensor(1.7833, grad_fn=<NllLossBackward0>)
tensor(1.7871, grad_fn=<NllLossBackward0>)


  6%|▌         | 985/17426 [01:28<22:16, 12.31it/s]

tensor(1.7547, grad_fn=<NllLossBackward0>)
tensor(1.7507, grad_fn=<NllLossBackward0>)
tensor(1.7783, grad_fn=<NllLossBackward0>)


  6%|▌         | 987/17426 [01:28<22:32, 12.15it/s]

tensor(1.7716, grad_fn=<NllLossBackward0>)
tensor(1.7632, grad_fn=<NllLossBackward0>)
tensor(1.7601, grad_fn=<NllLossBackward0>)


  6%|▌         | 991/17426 [01:29<22:59, 11.91it/s]

tensor(1.7724, grad_fn=<NllLossBackward0>)
tensor(1.7538, grad_fn=<NllLossBackward0>)
tensor(1.8086, grad_fn=<NllLossBackward0>)


  6%|▌         | 993/17426 [01:29<22:50, 11.99it/s]

tensor(1.7709, grad_fn=<NllLossBackward0>)
tensor(1.7792, grad_fn=<NllLossBackward0>)
tensor(1.7766, grad_fn=<NllLossBackward0>)


  6%|▌         | 997/17426 [01:29<22:29, 12.17it/s]

tensor(1.7598, grad_fn=<NllLossBackward0>)
tensor(1.7769, grad_fn=<NllLossBackward0>)
tensor(1.7767, grad_fn=<NllLossBackward0>)


  6%|▌         | 999/17426 [01:29<22:34, 12.13it/s]

tensor(1.7516, grad_fn=<NllLossBackward0>)
tensor(1.7278, grad_fn=<NllLossBackward0>)
tensor(1.7608, grad_fn=<NllLossBackward0>)


  6%|▌         | 1003/17426 [01:30<22:38, 12.09it/s]

tensor(1.7476, grad_fn=<NllLossBackward0>)
tensor(1.7728, grad_fn=<NllLossBackward0>)
tensor(1.7168, grad_fn=<NllLossBackward0>)


  6%|▌         | 1005/17426 [01:30<22:22, 12.23it/s]

tensor(1.7459, grad_fn=<NllLossBackward0>)
tensor(1.7418, grad_fn=<NllLossBackward0>)
tensor(1.7778, grad_fn=<NllLossBackward0>)


  6%|▌         | 1009/17426 [01:30<22:06, 12.38it/s]

tensor(1.7659, grad_fn=<NllLossBackward0>)
tensor(1.7864, grad_fn=<NllLossBackward0>)
tensor(1.7122, grad_fn=<NllLossBackward0>)


  6%|▌         | 1011/17426 [01:30<22:13, 12.31it/s]

tensor(1.7287, grad_fn=<NllLossBackward0>)
tensor(1.7282, grad_fn=<NllLossBackward0>)
tensor(1.7213, grad_fn=<NllLossBackward0>)


  6%|▌         | 1015/17426 [01:31<22:36, 12.10it/s]

tensor(1.7608, grad_fn=<NllLossBackward0>)
tensor(1.7466, grad_fn=<NllLossBackward0>)
tensor(1.7469, grad_fn=<NllLossBackward0>)


  6%|▌         | 1017/17426 [01:31<22:51, 11.96it/s]

tensor(1.7994, grad_fn=<NllLossBackward0>)
tensor(1.7434, grad_fn=<NllLossBackward0>)
tensor(1.7414, grad_fn=<NllLossBackward0>)


  6%|▌         | 1021/17426 [01:31<22:33, 12.12it/s]

tensor(1.7278, grad_fn=<NllLossBackward0>)
tensor(1.7053, grad_fn=<NllLossBackward0>)
tensor(1.7812, grad_fn=<NllLossBackward0>)


  6%|▌         | 1023/17426 [01:31<22:44, 12.02it/s]

tensor(1.7870, grad_fn=<NllLossBackward0>)
tensor(1.7068, grad_fn=<NllLossBackward0>)
tensor(1.7478, grad_fn=<NllLossBackward0>)


  6%|▌         | 1027/17426 [01:32<22:51, 11.96it/s]

tensor(1.7433, grad_fn=<NllLossBackward0>)
tensor(1.7099, grad_fn=<NllLossBackward0>)
tensor(1.7315, grad_fn=<NllLossBackward0>)


  6%|▌         | 1029/17426 [01:32<22:48, 11.98it/s]

tensor(1.7410, grad_fn=<NllLossBackward0>)
tensor(1.7144, grad_fn=<NllLossBackward0>)
tensor(1.7578, grad_fn=<NllLossBackward0>)


  6%|▌         | 1033/17426 [01:32<22:20, 12.23it/s]

tensor(1.7523, grad_fn=<NllLossBackward0>)
tensor(1.7130, grad_fn=<NllLossBackward0>)
tensor(1.7141, grad_fn=<NllLossBackward0>)


  6%|▌         | 1035/17426 [01:32<22:42, 12.03it/s]

tensor(1.7728, grad_fn=<NllLossBackward0>)
tensor(1.7139, grad_fn=<NllLossBackward0>)
tensor(1.7783, grad_fn=<NllLossBackward0>)


  6%|▌         | 1039/17426 [01:33<23:05, 11.83it/s]

tensor(1.7352, grad_fn=<NllLossBackward0>)
tensor(1.6906, grad_fn=<NllLossBackward0>)
tensor(1.7203, grad_fn=<NllLossBackward0>)


  6%|▌         | 1041/17426 [01:33<22:59, 11.88it/s]

tensor(1.6876, grad_fn=<NllLossBackward0>)
tensor(1.7446, grad_fn=<NllLossBackward0>)
tensor(1.7311, grad_fn=<NllLossBackward0>)


  6%|▌         | 1043/17426 [01:33<23:30, 11.61it/s]

tensor(1.7528, grad_fn=<NllLossBackward0>)
tensor(1.7098, grad_fn=<NllLossBackward0>)


  6%|▌         | 1045/17426 [01:33<26:44, 10.21it/s]

tensor(1.7039, grad_fn=<NllLossBackward0>)
tensor(1.7288, grad_fn=<NllLossBackward0>)


  6%|▌         | 1048/17426 [01:34<30:07,  9.06it/s]

tensor(1.7178, grad_fn=<NllLossBackward0>)
tensor(1.7251, grad_fn=<NllLossBackward0>)


  6%|▌         | 1050/17426 [01:34<31:59,  8.53it/s]

tensor(1.7313, grad_fn=<NllLossBackward0>)
tensor(1.7431, grad_fn=<NllLossBackward0>)


  6%|▌         | 1052/17426 [01:34<33:05,  8.25it/s]

tensor(1.6928, grad_fn=<NllLossBackward0>)
tensor(1.7310, grad_fn=<NllLossBackward0>)


  6%|▌         | 1054/17426 [01:35<32:24,  8.42it/s]

tensor(1.7571, grad_fn=<NllLossBackward0>)
tensor(1.7185, grad_fn=<NllLossBackward0>)


  6%|▌         | 1056/17426 [01:35<32:52,  8.30it/s]

tensor(1.7099, grad_fn=<NllLossBackward0>)
tensor(1.6975, grad_fn=<NllLossBackward0>)


  6%|▌         | 1058/17426 [01:35<35:12,  7.75it/s]

tensor(1.7270, grad_fn=<NllLossBackward0>)
tensor(1.7463, grad_fn=<NllLossBackward0>)


  6%|▌         | 1060/17426 [01:35<36:09,  7.54it/s]

tensor(1.7358, grad_fn=<NllLossBackward0>)
tensor(1.7311, grad_fn=<NllLossBackward0>)


  6%|▌         | 1062/17426 [01:36<35:24,  7.70it/s]

tensor(1.7575, grad_fn=<NllLossBackward0>)
tensor(1.7287, grad_fn=<NllLossBackward0>)


  6%|▌         | 1065/17426 [01:36<30:52,  8.83it/s]

tensor(1.7603, grad_fn=<NllLossBackward0>)
tensor(1.7382, grad_fn=<NllLossBackward0>)
tensor(1.7407, grad_fn=<NllLossBackward0>)


  6%|▌         | 1067/17426 [01:36<27:33,  9.89it/s]

tensor(1.7400, grad_fn=<NllLossBackward0>)
tensor(1.7470, grad_fn=<NllLossBackward0>)
tensor(1.7115, grad_fn=<NllLossBackward0>)


  6%|▌         | 1071/17426 [01:36<24:07, 11.30it/s]

tensor(1.7288, grad_fn=<NllLossBackward0>)
tensor(1.7627, grad_fn=<NllLossBackward0>)
tensor(1.6758, grad_fn=<NllLossBackward0>)


  6%|▌         | 1073/17426 [01:37<23:51, 11.42it/s]

tensor(1.7280, grad_fn=<NllLossBackward0>)
tensor(1.7385, grad_fn=<NllLossBackward0>)
tensor(1.7432, grad_fn=<NllLossBackward0>)


  6%|▌         | 1077/17426 [01:37<23:38, 11.53it/s]

tensor(1.7744, grad_fn=<NllLossBackward0>)
tensor(1.7418, grad_fn=<NllLossBackward0>)
tensor(1.7275, grad_fn=<NllLossBackward0>)


  6%|▌         | 1079/17426 [01:37<23:13, 11.73it/s]

tensor(1.6937, grad_fn=<NllLossBackward0>)
tensor(1.7178, grad_fn=<NllLossBackward0>)
tensor(1.7344, grad_fn=<NllLossBackward0>)


  6%|▌         | 1083/17426 [01:37<22:34, 12.07it/s]

tensor(1.7162, grad_fn=<NllLossBackward0>)
tensor(1.7573, grad_fn=<NllLossBackward0>)
tensor(1.7074, grad_fn=<NllLossBackward0>)


  6%|▌         | 1085/17426 [01:38<23:39, 11.51it/s]

tensor(1.7454, grad_fn=<NllLossBackward0>)
tensor(1.7018, grad_fn=<NllLossBackward0>)
tensor(1.7039, grad_fn=<NllLossBackward0>)


  6%|▌         | 1089/17426 [01:38<23:45, 11.46it/s]

tensor(1.6933, grad_fn=<NllLossBackward0>)
tensor(1.7280, grad_fn=<NllLossBackward0>)
tensor(1.7203, grad_fn=<NllLossBackward0>)


  6%|▋         | 1091/17426 [01:38<23:16, 11.70it/s]

tensor(1.7278, grad_fn=<NllLossBackward0>)
tensor(1.7668, grad_fn=<NllLossBackward0>)
tensor(1.7713, grad_fn=<NllLossBackward0>)


  6%|▋         | 1095/17426 [01:38<22:25, 12.14it/s]

tensor(1.7222, grad_fn=<NllLossBackward0>)
tensor(1.7226, grad_fn=<NllLossBackward0>)
tensor(1.7148, grad_fn=<NllLossBackward0>)


  6%|▋         | 1097/17426 [01:39<22:27, 12.12it/s]

tensor(1.7551, grad_fn=<NllLossBackward0>)
tensor(1.7607, grad_fn=<NllLossBackward0>)
tensor(1.6905, grad_fn=<NllLossBackward0>)


  6%|▋         | 1101/17426 [01:39<22:17, 12.21it/s]

tensor(1.6895, grad_fn=<NllLossBackward0>)
tensor(1.7265, grad_fn=<NllLossBackward0>)
tensor(1.7181, grad_fn=<NllLossBackward0>)


  6%|▋         | 1103/17426 [01:39<22:50, 11.91it/s]

tensor(1.7136, grad_fn=<NllLossBackward0>)
tensor(1.7052, grad_fn=<NllLossBackward0>)
tensor(1.7256, grad_fn=<NllLossBackward0>)


  6%|▋         | 1107/17426 [01:39<22:37, 12.02it/s]

tensor(1.7279, grad_fn=<NllLossBackward0>)
tensor(1.7563, grad_fn=<NllLossBackward0>)
tensor(1.7469, grad_fn=<NllLossBackward0>)


  6%|▋         | 1109/17426 [01:40<22:55, 11.87it/s]

tensor(1.6889, grad_fn=<NllLossBackward0>)
tensor(1.7368, grad_fn=<NllLossBackward0>)
tensor(1.7281, grad_fn=<NllLossBackward0>)


  6%|▋         | 1113/17426 [01:40<22:53, 11.88it/s]

tensor(1.7108, grad_fn=<NllLossBackward0>)
tensor(1.6830, grad_fn=<NllLossBackward0>)
tensor(1.7277, grad_fn=<NllLossBackward0>)


  6%|▋         | 1115/17426 [01:40<23:08, 11.75it/s]

tensor(1.7093, grad_fn=<NllLossBackward0>)
tensor(1.7338, grad_fn=<NllLossBackward0>)
tensor(1.7207, grad_fn=<NllLossBackward0>)


  6%|▋         | 1119/17426 [01:40<22:32, 12.06it/s]

tensor(1.7755, grad_fn=<NllLossBackward0>)
tensor(1.7058, grad_fn=<NllLossBackward0>)
tensor(1.7249, grad_fn=<NllLossBackward0>)


  6%|▋         | 1121/17426 [01:41<22:41, 11.97it/s]

tensor(1.7381, grad_fn=<NllLossBackward0>)
tensor(1.7663, grad_fn=<NllLossBackward0>)
tensor(1.7229, grad_fn=<NllLossBackward0>)


  6%|▋         | 1125/17426 [01:41<22:20, 12.16it/s]

tensor(1.7281, grad_fn=<NllLossBackward0>)
tensor(1.7400, grad_fn=<NllLossBackward0>)
tensor(1.6935, grad_fn=<NllLossBackward0>)


  6%|▋         | 1127/17426 [01:41<22:59, 11.81it/s]

tensor(1.7661, grad_fn=<NllLossBackward0>)
tensor(1.7478, grad_fn=<NllLossBackward0>)
tensor(1.7505, grad_fn=<NllLossBackward0>)


  6%|▋         | 1131/17426 [01:41<22:53, 11.86it/s]

tensor(1.6957, grad_fn=<NllLossBackward0>)
tensor(1.7173, grad_fn=<NllLossBackward0>)
tensor(1.7525, grad_fn=<NllLossBackward0>)


  7%|▋         | 1133/17426 [01:42<22:48, 11.90it/s]

tensor(1.7339, grad_fn=<NllLossBackward0>)
tensor(1.7477, grad_fn=<NllLossBackward0>)
tensor(1.7380, grad_fn=<NllLossBackward0>)


  7%|▋         | 1137/17426 [01:42<22:21, 12.14it/s]

tensor(1.6914, grad_fn=<NllLossBackward0>)
tensor(1.7255, grad_fn=<NllLossBackward0>)
tensor(1.7759, grad_fn=<NllLossBackward0>)


  7%|▋         | 1139/17426 [01:42<22:59, 11.81it/s]

tensor(1.7070, grad_fn=<NllLossBackward0>)
tensor(1.7292, grad_fn=<NllLossBackward0>)
tensor(1.6885, grad_fn=<NllLossBackward0>)


  7%|▋         | 1143/17426 [01:42<22:20, 12.14it/s]

tensor(1.7489, grad_fn=<NllLossBackward0>)
tensor(1.7111, grad_fn=<NllLossBackward0>)
tensor(1.7213, grad_fn=<NllLossBackward0>)


  7%|▋         | 1145/17426 [01:43<22:42, 11.95it/s]

tensor(1.7376, grad_fn=<NllLossBackward0>)
tensor(1.6686, grad_fn=<NllLossBackward0>)
tensor(1.6765, grad_fn=<NllLossBackward0>)


  7%|▋         | 1149/17426 [01:43<22:17, 12.17it/s]

tensor(1.6980, grad_fn=<NllLossBackward0>)
tensor(1.7049, grad_fn=<NllLossBackward0>)
tensor(1.7127, grad_fn=<NllLossBackward0>)


  7%|▋         | 1151/17426 [01:43<22:40, 11.96it/s]

tensor(1.7433, grad_fn=<NllLossBackward0>)
tensor(1.7660, grad_fn=<NllLossBackward0>)
tensor(1.7768, grad_fn=<NllLossBackward0>)


  7%|▋         | 1155/17426 [01:43<22:16, 12.18it/s]

tensor(1.7107, grad_fn=<NllLossBackward0>)
tensor(1.7787, grad_fn=<NllLossBackward0>)
tensor(1.7382, grad_fn=<NllLossBackward0>)


  7%|▋         | 1157/17426 [01:44<22:25, 12.09it/s]

tensor(1.7244, grad_fn=<NllLossBackward0>)
tensor(1.7612, grad_fn=<NllLossBackward0>)
tensor(1.7301, grad_fn=<NllLossBackward0>)


  7%|▋         | 1161/17426 [01:44<22:22, 12.12it/s]

tensor(1.7401, grad_fn=<NllLossBackward0>)
tensor(1.7410, grad_fn=<NllLossBackward0>)
tensor(1.7491, grad_fn=<NllLossBackward0>)


  7%|▋         | 1163/17426 [01:44<22:46, 11.90it/s]

tensor(1.6993, grad_fn=<NllLossBackward0>)
tensor(1.7856, grad_fn=<NllLossBackward0>)
tensor(1.7147, grad_fn=<NllLossBackward0>)


  7%|▋         | 1167/17426 [01:44<22:29, 12.05it/s]

tensor(1.7110, grad_fn=<NllLossBackward0>)
tensor(1.6825, grad_fn=<NllLossBackward0>)
tensor(1.6948, grad_fn=<NllLossBackward0>)


  7%|▋         | 1169/17426 [01:45<22:26, 12.07it/s]

tensor(1.7382, grad_fn=<NllLossBackward0>)
tensor(1.6666, grad_fn=<NllLossBackward0>)
tensor(1.7428, grad_fn=<NllLossBackward0>)


  7%|▋         | 1173/17426 [01:45<22:20, 12.12it/s]

tensor(1.7051, grad_fn=<NllLossBackward0>)
tensor(1.7080, grad_fn=<NllLossBackward0>)
tensor(1.7052, grad_fn=<NllLossBackward0>)


  7%|▋         | 1175/17426 [01:45<22:18, 12.14it/s]

tensor(1.7366, grad_fn=<NllLossBackward0>)
tensor(1.7128, grad_fn=<NllLossBackward0>)
tensor(1.7265, grad_fn=<NllLossBackward0>)


  7%|▋         | 1179/17426 [01:45<22:39, 11.95it/s]

tensor(1.7404, grad_fn=<NllLossBackward0>)
tensor(1.7392, grad_fn=<NllLossBackward0>)
tensor(1.7087, grad_fn=<NllLossBackward0>)


  7%|▋         | 1181/17426 [01:46<22:35, 11.99it/s]

tensor(1.7920, grad_fn=<NllLossBackward0>)
tensor(1.6711, grad_fn=<NllLossBackward0>)
tensor(1.6895, grad_fn=<NllLossBackward0>)


  7%|▋         | 1183/17426 [01:46<22:36, 11.97it/s]

tensor(1.7348, grad_fn=<NllLossBackward0>)
tensor(1.7259, grad_fn=<NllLossBackward0>)


  7%|▋         | 1185/17426 [01:46<26:27, 10.23it/s]

tensor(1.7287, grad_fn=<NllLossBackward0>)
tensor(1.7418, grad_fn=<NllLossBackward0>)


  7%|▋         | 1188/17426 [01:46<30:00,  9.02it/s]

tensor(1.7314, grad_fn=<NllLossBackward0>)
tensor(1.6978, grad_fn=<NllLossBackward0>)


  7%|▋         | 1190/17426 [01:47<31:03,  8.71it/s]

tensor(1.6861, grad_fn=<NllLossBackward0>)
tensor(1.7617, grad_fn=<NllLossBackward0>)


  7%|▋         | 1192/17426 [01:47<31:31,  8.58it/s]

tensor(1.6579, grad_fn=<NllLossBackward0>)
tensor(1.7188, grad_fn=<NllLossBackward0>)


  7%|▋         | 1194/17426 [01:47<31:52,  8.49it/s]

tensor(1.7171, grad_fn=<NllLossBackward0>)
tensor(1.7053, grad_fn=<NllLossBackward0>)


  7%|▋         | 1196/17426 [01:47<33:06,  8.17it/s]

tensor(1.6942, grad_fn=<NllLossBackward0>)
tensor(1.7367, grad_fn=<NllLossBackward0>)


  7%|▋         | 1198/17426 [01:48<35:59,  7.51it/s]

tensor(1.6993, grad_fn=<NllLossBackward0>)
tensor(1.6982, grad_fn=<NllLossBackward0>)


  7%|▋         | 1200/17426 [01:48<35:32,  7.61it/s]

tensor(1.6900, grad_fn=<NllLossBackward0>)
tensor(1.7419, grad_fn=<NllLossBackward0>)


  7%|▋         | 1202/17426 [01:48<36:10,  7.48it/s]

tensor(1.7178, grad_fn=<NllLossBackward0>)
tensor(1.7456, grad_fn=<NllLossBackward0>)


  7%|▋         | 1204/17426 [01:48<34:20,  7.87it/s]

tensor(1.7282, grad_fn=<NllLossBackward0>)
tensor(1.7455, grad_fn=<NllLossBackward0>)
tensor(1.6943, grad_fn=<NllLossBackward0>)


  7%|▋         | 1208/17426 [01:49<25:52, 10.45it/s]

tensor(1.6807, grad_fn=<NllLossBackward0>)
tensor(1.7258, grad_fn=<NllLossBackward0>)
tensor(1.7134, grad_fn=<NllLossBackward0>)


  7%|▋         | 1210/17426 [01:49<24:36, 10.98it/s]

tensor(1.7169, grad_fn=<NllLossBackward0>)
tensor(1.7390, grad_fn=<NllLossBackward0>)
tensor(1.7206, grad_fn=<NllLossBackward0>)


  7%|▋         | 1214/17426 [01:49<22:55, 11.79it/s]

tensor(1.7424, grad_fn=<NllLossBackward0>)
tensor(1.7190, grad_fn=<NllLossBackward0>)
tensor(1.7430, grad_fn=<NllLossBackward0>)


  7%|▋         | 1216/17426 [01:49<23:28, 11.51it/s]

tensor(1.6617, grad_fn=<NllLossBackward0>)
tensor(1.6892, grad_fn=<NllLossBackward0>)
tensor(1.7062, grad_fn=<NllLossBackward0>)


  7%|▋         | 1220/17426 [01:50<22:23, 12.06it/s]

tensor(1.7234, grad_fn=<NllLossBackward0>)
tensor(1.7135, grad_fn=<NllLossBackward0>)
tensor(1.7044, grad_fn=<NllLossBackward0>)


  7%|▋         | 1222/17426 [01:50<22:32, 11.98it/s]

tensor(1.7246, grad_fn=<NllLossBackward0>)
tensor(1.7297, grad_fn=<NllLossBackward0>)
tensor(1.7374, grad_fn=<NllLossBackward0>)


  7%|▋         | 1226/17426 [01:50<22:03, 12.24it/s]

tensor(1.6913, grad_fn=<NllLossBackward0>)
tensor(1.7178, grad_fn=<NllLossBackward0>)
tensor(1.6810, grad_fn=<NllLossBackward0>)


  7%|▋         | 1228/17426 [01:50<22:08, 12.19it/s]

tensor(1.6592, grad_fn=<NllLossBackward0>)
tensor(1.7486, grad_fn=<NllLossBackward0>)
tensor(1.7102, grad_fn=<NllLossBackward0>)


  7%|▋         | 1232/17426 [01:51<22:30, 11.99it/s]

tensor(1.7010, grad_fn=<NllLossBackward0>)
tensor(1.7543, grad_fn=<NllLossBackward0>)
tensor(1.7100, grad_fn=<NllLossBackward0>)


  7%|▋         | 1234/17426 [01:51<22:27, 12.02it/s]

tensor(1.7085, grad_fn=<NllLossBackward0>)
tensor(1.7171, grad_fn=<NllLossBackward0>)
tensor(1.6885, grad_fn=<NllLossBackward0>)


  7%|▋         | 1238/17426 [01:51<22:02, 12.24it/s]

tensor(1.6808, grad_fn=<NllLossBackward0>)
tensor(1.6541, grad_fn=<NllLossBackward0>)
tensor(1.7091, grad_fn=<NllLossBackward0>)


  7%|▋         | 1240/17426 [01:51<21:54, 12.32it/s]

tensor(1.6849, grad_fn=<NllLossBackward0>)
tensor(1.6775, grad_fn=<NllLossBackward0>)
tensor(1.6936, grad_fn=<NllLossBackward0>)


  7%|▋         | 1244/17426 [01:52<22:17, 12.10it/s]

tensor(1.6952, grad_fn=<NllLossBackward0>)
tensor(1.7380, grad_fn=<NllLossBackward0>)
tensor(1.7253, grad_fn=<NllLossBackward0>)


  7%|▋         | 1246/17426 [01:52<22:24, 12.03it/s]

tensor(1.6889, grad_fn=<NllLossBackward0>)
tensor(1.6798, grad_fn=<NllLossBackward0>)
tensor(1.7207, grad_fn=<NllLossBackward0>)


  7%|▋         | 1250/17426 [01:52<22:10, 12.16it/s]

tensor(1.7170, grad_fn=<NllLossBackward0>)
tensor(1.7161, grad_fn=<NllLossBackward0>)
tensor(1.6794, grad_fn=<NllLossBackward0>)


  7%|▋         | 1252/17426 [01:52<22:16, 12.10it/s]

tensor(1.7216, grad_fn=<NllLossBackward0>)
tensor(1.7136, grad_fn=<NllLossBackward0>)
tensor(1.7159, grad_fn=<NllLossBackward0>)


  7%|▋         | 1256/17426 [01:53<22:22, 12.04it/s]

tensor(1.7503, grad_fn=<NllLossBackward0>)
tensor(1.7082, grad_fn=<NllLossBackward0>)
tensor(1.7247, grad_fn=<NllLossBackward0>)


  7%|▋         | 1258/17426 [01:53<22:26, 12.01it/s]

tensor(1.7166, grad_fn=<NllLossBackward0>)
tensor(1.7454, grad_fn=<NllLossBackward0>)
tensor(1.7079, grad_fn=<NllLossBackward0>)


  7%|▋         | 1262/17426 [01:53<22:04, 12.20it/s]

tensor(1.6856, grad_fn=<NllLossBackward0>)
tensor(1.7189, grad_fn=<NllLossBackward0>)
tensor(1.7159, grad_fn=<NllLossBackward0>)


  7%|▋         | 1264/17426 [01:53<22:16, 12.10it/s]

tensor(1.7164, grad_fn=<NllLossBackward0>)
tensor(1.7162, grad_fn=<NllLossBackward0>)
tensor(1.6957, grad_fn=<NllLossBackward0>)


  7%|▋         | 1268/17426 [01:54<22:26, 12.00it/s]

tensor(1.7273, grad_fn=<NllLossBackward0>)
tensor(1.7165, grad_fn=<NllLossBackward0>)
tensor(1.6856, grad_fn=<NllLossBackward0>)


  7%|▋         | 1270/17426 [01:54<22:26, 11.99it/s]

tensor(1.7184, grad_fn=<NllLossBackward0>)
tensor(1.7072, grad_fn=<NllLossBackward0>)
tensor(1.7032, grad_fn=<NllLossBackward0>)


  7%|▋         | 1274/17426 [01:54<21:43, 12.39it/s]

tensor(1.7396, grad_fn=<NllLossBackward0>)
tensor(1.7285, grad_fn=<NllLossBackward0>)
tensor(1.6729, grad_fn=<NllLossBackward0>)


  7%|▋         | 1276/17426 [01:54<21:54, 12.29it/s]

tensor(1.6984, grad_fn=<NllLossBackward0>)
tensor(1.7039, grad_fn=<NllLossBackward0>)
tensor(1.7307, grad_fn=<NllLossBackward0>)


  7%|▋         | 1280/17426 [01:55<22:30, 11.95it/s]

tensor(1.7089, grad_fn=<NllLossBackward0>)
tensor(1.6935, grad_fn=<NllLossBackward0>)
tensor(1.7568, grad_fn=<NllLossBackward0>)


  7%|▋         | 1282/17426 [01:55<22:37, 11.89it/s]

tensor(1.6741, grad_fn=<NllLossBackward0>)
tensor(1.7351, grad_fn=<NllLossBackward0>)
tensor(1.7463, grad_fn=<NllLossBackward0>)


  7%|▋         | 1286/17426 [01:55<22:11, 12.12it/s]

tensor(1.7100, grad_fn=<NllLossBackward0>)
tensor(1.6711, grad_fn=<NllLossBackward0>)
tensor(1.7090, grad_fn=<NllLossBackward0>)


  7%|▋         | 1288/17426 [01:55<22:09, 12.14it/s]

tensor(1.6969, grad_fn=<NllLossBackward0>)
tensor(1.7186, grad_fn=<NllLossBackward0>)
tensor(1.7123, grad_fn=<NllLossBackward0>)


  7%|▋         | 1292/17426 [01:56<22:51, 11.76it/s]

tensor(1.7288, grad_fn=<NllLossBackward0>)
tensor(1.7011, grad_fn=<NllLossBackward0>)
tensor(1.7154, grad_fn=<NllLossBackward0>)


  7%|▋         | 1294/17426 [01:56<23:02, 11.67it/s]

tensor(1.6977, grad_fn=<NllLossBackward0>)
tensor(1.7163, grad_fn=<NllLossBackward0>)
tensor(1.7175, grad_fn=<NllLossBackward0>)


  7%|▋         | 1298/17426 [01:56<22:31, 11.93it/s]

tensor(1.6465, grad_fn=<NllLossBackward0>)
tensor(1.6380, grad_fn=<NllLossBackward0>)
tensor(1.7319, grad_fn=<NllLossBackward0>)


  7%|▋         | 1300/17426 [01:56<22:31, 11.93it/s]

tensor(1.7193, grad_fn=<NllLossBackward0>)
tensor(1.6783, grad_fn=<NllLossBackward0>)
tensor(1.6991, grad_fn=<NllLossBackward0>)


  7%|▋         | 1304/17426 [01:57<23:19, 11.52it/s]

tensor(1.7065, grad_fn=<NllLossBackward0>)
tensor(1.7058, grad_fn=<NllLossBackward0>)
tensor(1.6974, grad_fn=<NllLossBackward0>)


  7%|▋         | 1306/17426 [01:57<23:12, 11.58it/s]

tensor(1.6697, grad_fn=<NllLossBackward0>)
tensor(1.7188, grad_fn=<NllLossBackward0>)
tensor(1.6798, grad_fn=<NllLossBackward0>)


  8%|▊         | 1310/17426 [01:57<22:19, 12.03it/s]

tensor(1.6779, grad_fn=<NllLossBackward0>)
tensor(1.6882, grad_fn=<NllLossBackward0>)
tensor(1.6699, grad_fn=<NllLossBackward0>)


  8%|▊         | 1312/17426 [01:57<22:20, 12.02it/s]

tensor(1.7114, grad_fn=<NllLossBackward0>)
tensor(1.6976, grad_fn=<NllLossBackward0>)
tensor(1.7017, grad_fn=<NllLossBackward0>)


  8%|▊         | 1316/17426 [01:58<22:39, 11.85it/s]

tensor(1.6935, grad_fn=<NllLossBackward0>)
tensor(1.6703, grad_fn=<NllLossBackward0>)
tensor(1.6884, grad_fn=<NllLossBackward0>)


  8%|▊         | 1318/17426 [01:58<22:35, 11.88it/s]

tensor(1.6665, grad_fn=<NllLossBackward0>)
tensor(1.7053, grad_fn=<NllLossBackward0>)
tensor(1.7436, grad_fn=<NllLossBackward0>)


  8%|▊         | 1322/17426 [01:58<22:21, 12.01it/s]

tensor(1.7169, grad_fn=<NllLossBackward0>)
tensor(1.7048, grad_fn=<NllLossBackward0>)
tensor(1.7422, grad_fn=<NllLossBackward0>)


  8%|▊         | 1324/17426 [01:58<23:12, 11.56it/s]

tensor(1.7240, grad_fn=<NllLossBackward0>)
tensor(1.6671, grad_fn=<NllLossBackward0>)


  8%|▊         | 1326/17426 [01:59<26:55,  9.97it/s]

tensor(1.7071, grad_fn=<NllLossBackward0>)
tensor(1.6764, grad_fn=<NllLossBackward0>)


  8%|▊         | 1328/17426 [01:59<28:41,  9.35it/s]

tensor(1.6572, grad_fn=<NllLossBackward0>)
tensor(1.6761, grad_fn=<NllLossBackward0>)


  8%|▊         | 1330/17426 [01:59<30:20,  8.84it/s]

tensor(1.7043, grad_fn=<NllLossBackward0>)
tensor(1.6969, grad_fn=<NllLossBackward0>)


  8%|▊         | 1332/17426 [02:00<31:41,  8.46it/s]

tensor(1.6653, grad_fn=<NllLossBackward0>)
tensor(1.6835, grad_fn=<NllLossBackward0>)


  8%|▊         | 1334/17426 [02:00<32:00,  8.38it/s]

tensor(1.6820, grad_fn=<NllLossBackward0>)
tensor(1.6597, grad_fn=<NllLossBackward0>)


  8%|▊         | 1336/17426 [02:00<32:10,  8.33it/s]

tensor(1.7242, grad_fn=<NllLossBackward0>)
tensor(1.6474, grad_fn=<NllLossBackward0>)


  8%|▊         | 1338/17426 [02:00<33:23,  8.03it/s]

tensor(1.7272, grad_fn=<NllLossBackward0>)
tensor(1.6726, grad_fn=<NllLossBackward0>)


  8%|▊         | 1340/17426 [02:01<33:20,  8.04it/s]

tensor(1.6816, grad_fn=<NllLossBackward0>)
tensor(1.6755, grad_fn=<NllLossBackward0>)


  8%|▊         | 1342/17426 [02:01<34:33,  7.76it/s]

tensor(1.6769, grad_fn=<NllLossBackward0>)
tensor(1.7158, grad_fn=<NllLossBackward0>)


  8%|▊         | 1344/17426 [02:01<35:57,  7.45it/s]

tensor(1.7015, grad_fn=<NllLossBackward0>)
tensor(1.6999, grad_fn=<NllLossBackward0>)


  8%|▊         | 1347/17426 [02:01<28:59,  9.24it/s]

tensor(1.6523, grad_fn=<NllLossBackward0>)
tensor(1.7083, grad_fn=<NllLossBackward0>)
tensor(1.7254, grad_fn=<NllLossBackward0>)


  8%|▊         | 1349/17426 [02:02<26:17, 10.19it/s]

tensor(1.6926, grad_fn=<NllLossBackward0>)
tensor(1.6598, grad_fn=<NllLossBackward0>)
tensor(1.7161, grad_fn=<NllLossBackward0>)


  8%|▊         | 1353/17426 [02:02<23:44, 11.28it/s]

tensor(1.7313, grad_fn=<NllLossBackward0>)
tensor(1.6656, grad_fn=<NllLossBackward0>)
tensor(1.7137, grad_fn=<NllLossBackward0>)


  8%|▊         | 1355/17426 [02:02<23:47, 11.26it/s]

tensor(1.6806, grad_fn=<NllLossBackward0>)
tensor(1.7371, grad_fn=<NllLossBackward0>)
tensor(1.6751, grad_fn=<NllLossBackward0>)


  8%|▊         | 1359/17426 [02:02<22:19, 11.99it/s]

tensor(1.7348, grad_fn=<NllLossBackward0>)
tensor(1.6667, grad_fn=<NllLossBackward0>)
tensor(1.6660, grad_fn=<NllLossBackward0>)


  8%|▊         | 1361/17426 [02:02<22:23, 11.96it/s]

tensor(1.6753, grad_fn=<NllLossBackward0>)
tensor(1.6625, grad_fn=<NllLossBackward0>)
tensor(1.7103, grad_fn=<NllLossBackward0>)


  8%|▊         | 1365/17426 [02:03<22:14, 12.03it/s]

tensor(1.6758, grad_fn=<NllLossBackward0>)
tensor(1.6675, grad_fn=<NllLossBackward0>)
tensor(1.6655, grad_fn=<NllLossBackward0>)


  8%|▊         | 1367/17426 [02:03<22:41, 11.79it/s]

tensor(1.6757, grad_fn=<NllLossBackward0>)
tensor(1.6483, grad_fn=<NllLossBackward0>)
tensor(1.6944, grad_fn=<NllLossBackward0>)


  8%|▊         | 1371/17426 [02:03<21:56, 12.20it/s]

tensor(1.7124, grad_fn=<NllLossBackward0>)
tensor(1.7484, grad_fn=<NllLossBackward0>)
tensor(1.6720, grad_fn=<NllLossBackward0>)


  8%|▊         | 1373/17426 [02:03<22:09, 12.07it/s]

tensor(1.7085, grad_fn=<NllLossBackward0>)
tensor(1.7013, grad_fn=<NllLossBackward0>)
tensor(1.7142, grad_fn=<NllLossBackward0>)


  8%|▊         | 1377/17426 [02:04<21:55, 12.20it/s]

tensor(1.7038, grad_fn=<NllLossBackward0>)
tensor(1.6787, grad_fn=<NllLossBackward0>)
tensor(1.6647, grad_fn=<NllLossBackward0>)


  8%|▊         | 1379/17426 [02:04<22:24, 11.93it/s]

tensor(1.7044, grad_fn=<NllLossBackward0>)
tensor(1.6940, grad_fn=<NllLossBackward0>)
tensor(1.6780, grad_fn=<NllLossBackward0>)


  8%|▊         | 1383/17426 [02:04<21:52, 12.22it/s]

tensor(1.6794, grad_fn=<NllLossBackward0>)
tensor(1.6465, grad_fn=<NllLossBackward0>)
tensor(1.7208, grad_fn=<NllLossBackward0>)


  8%|▊         | 1385/17426 [02:04<22:07, 12.08it/s]

tensor(1.6693, grad_fn=<NllLossBackward0>)
tensor(1.7077, grad_fn=<NllLossBackward0>)
tensor(1.6714, grad_fn=<NllLossBackward0>)


  8%|▊         | 1389/17426 [02:05<22:11, 12.05it/s]

tensor(1.7195, grad_fn=<NllLossBackward0>)
tensor(1.6996, grad_fn=<NllLossBackward0>)
tensor(1.6790, grad_fn=<NllLossBackward0>)


  8%|▊         | 1391/17426 [02:05<22:09, 12.06it/s]

tensor(1.7044, grad_fn=<NllLossBackward0>)
tensor(1.7237, grad_fn=<NllLossBackward0>)
tensor(1.6917, grad_fn=<NllLossBackward0>)


  8%|▊         | 1395/17426 [02:05<22:12, 12.03it/s]

tensor(1.6849, grad_fn=<NllLossBackward0>)
tensor(1.6722, grad_fn=<NllLossBackward0>)
tensor(1.6603, grad_fn=<NllLossBackward0>)


  8%|▊         | 1397/17426 [02:05<22:07, 12.08it/s]

tensor(1.6807, grad_fn=<NllLossBackward0>)
tensor(1.6829, grad_fn=<NllLossBackward0>)
tensor(1.6766, grad_fn=<NllLossBackward0>)


  8%|▊         | 1401/17426 [02:06<22:06, 12.08it/s]

tensor(1.7070, grad_fn=<NllLossBackward0>)
tensor(1.6835, grad_fn=<NllLossBackward0>)
tensor(1.7038, grad_fn=<NllLossBackward0>)


  8%|▊         | 1403/17426 [02:06<22:06, 12.08it/s]

tensor(1.6930, grad_fn=<NllLossBackward0>)
tensor(1.6892, grad_fn=<NllLossBackward0>)
tensor(1.6710, grad_fn=<NllLossBackward0>)


  8%|▊         | 1407/17426 [02:06<22:00, 12.13it/s]

tensor(1.6861, grad_fn=<NllLossBackward0>)
tensor(1.6702, grad_fn=<NllLossBackward0>)
tensor(1.7151, grad_fn=<NllLossBackward0>)


  8%|▊         | 1409/17426 [02:06<22:04, 12.10it/s]

tensor(1.6946, grad_fn=<NllLossBackward0>)
tensor(1.6759, grad_fn=<NllLossBackward0>)
tensor(1.6769, grad_fn=<NllLossBackward0>)


  8%|▊         | 1413/17426 [02:07<22:06, 12.07it/s]

tensor(1.6966, grad_fn=<NllLossBackward0>)
tensor(1.6897, grad_fn=<NllLossBackward0>)
tensor(1.6351, grad_fn=<NllLossBackward0>)


  8%|▊         | 1415/17426 [02:07<21:57, 12.16it/s]

tensor(1.7163, grad_fn=<NllLossBackward0>)
tensor(1.6299, grad_fn=<NllLossBackward0>)
tensor(1.6731, grad_fn=<NllLossBackward0>)


  8%|▊         | 1419/17426 [02:07<22:04, 12.09it/s]

tensor(1.6704, grad_fn=<NllLossBackward0>)
tensor(1.6364, grad_fn=<NllLossBackward0>)
tensor(1.7042, grad_fn=<NllLossBackward0>)


  8%|▊         | 1421/17426 [02:07<22:16, 11.98it/s]

tensor(1.6687, grad_fn=<NllLossBackward0>)
tensor(1.7098, grad_fn=<NllLossBackward0>)
tensor(1.7074, grad_fn=<NllLossBackward0>)


  8%|▊         | 1425/17426 [02:08<22:43, 11.74it/s]

tensor(1.6647, grad_fn=<NllLossBackward0>)
tensor(1.6969, grad_fn=<NllLossBackward0>)
tensor(1.7036, grad_fn=<NllLossBackward0>)


  8%|▊         | 1427/17426 [02:08<22:40, 11.76it/s]

tensor(1.7068, grad_fn=<NllLossBackward0>)
tensor(1.6970, grad_fn=<NllLossBackward0>)
tensor(1.6910, grad_fn=<NllLossBackward0>)


  8%|▊         | 1431/17426 [02:08<22:15, 11.98it/s]

tensor(1.7090, grad_fn=<NllLossBackward0>)
tensor(1.7296, grad_fn=<NllLossBackward0>)
tensor(1.6989, grad_fn=<NllLossBackward0>)


  8%|▊         | 1433/17426 [02:09<22:12, 12.01it/s]

tensor(1.6572, grad_fn=<NllLossBackward0>)
tensor(1.6745, grad_fn=<NllLossBackward0>)
tensor(1.6835, grad_fn=<NllLossBackward0>)


  8%|▊         | 1437/17426 [02:09<21:59, 12.12it/s]

tensor(1.6920, grad_fn=<NllLossBackward0>)
tensor(1.6651, grad_fn=<NllLossBackward0>)
tensor(1.7100, grad_fn=<NllLossBackward0>)


  8%|▊         | 1439/17426 [02:09<21:59, 12.11it/s]

tensor(1.6723, grad_fn=<NllLossBackward0>)
tensor(1.6717, grad_fn=<NllLossBackward0>)
tensor(1.6365, grad_fn=<NllLossBackward0>)


  8%|▊         | 1443/17426 [02:09<22:14, 11.98it/s]

tensor(1.6882, grad_fn=<NllLossBackward0>)
tensor(1.6923, grad_fn=<NllLossBackward0>)
tensor(1.6608, grad_fn=<NllLossBackward0>)


  8%|▊         | 1445/17426 [02:10<22:09, 12.02it/s]

tensor(1.6667, grad_fn=<NllLossBackward0>)
tensor(1.6795, grad_fn=<NllLossBackward0>)
tensor(1.6785, grad_fn=<NllLossBackward0>)


  8%|▊         | 1449/17426 [02:10<22:02, 12.09it/s]

tensor(1.6891, grad_fn=<NllLossBackward0>)
tensor(1.6659, grad_fn=<NllLossBackward0>)
tensor(1.7121, grad_fn=<NllLossBackward0>)


  8%|▊         | 1451/17426 [02:10<22:17, 11.94it/s]

tensor(1.6518, grad_fn=<NllLossBackward0>)
tensor(1.6845, grad_fn=<NllLossBackward0>)
tensor(1.6879, grad_fn=<NllLossBackward0>)


  8%|▊         | 1455/17426 [02:10<22:32, 11.81it/s]

tensor(1.6918, grad_fn=<NllLossBackward0>)
tensor(1.7299, grad_fn=<NllLossBackward0>)
tensor(1.7174, grad_fn=<NllLossBackward0>)


  8%|▊         | 1457/17426 [02:11<22:17, 11.94it/s]

tensor(1.6573, grad_fn=<NllLossBackward0>)
tensor(1.6903, grad_fn=<NllLossBackward0>)
tensor(1.6665, grad_fn=<NllLossBackward0>)


  8%|▊         | 1461/17426 [02:11<22:02, 12.07it/s]

tensor(1.6731, grad_fn=<NllLossBackward0>)
tensor(1.6679, grad_fn=<NllLossBackward0>)
tensor(1.6527, grad_fn=<NllLossBackward0>)


  8%|▊         | 1463/17426 [02:11<21:57, 12.12it/s]

tensor(1.7179, grad_fn=<NllLossBackward0>)
tensor(1.6270, grad_fn=<NllLossBackward0>)
tensor(1.6795, grad_fn=<NllLossBackward0>)


  8%|▊         | 1465/17426 [02:11<22:29, 11.82it/s]

tensor(1.6636, grad_fn=<NllLossBackward0>)
tensor(1.6798, grad_fn=<NllLossBackward0>)


  8%|▊         | 1467/17426 [02:11<26:30, 10.03it/s]

tensor(1.7091, grad_fn=<NllLossBackward0>)
tensor(1.6076, grad_fn=<NllLossBackward0>)


  8%|▊         | 1470/17426 [02:12<29:44,  8.94it/s]

tensor(1.6156, grad_fn=<NllLossBackward0>)
tensor(1.6704, grad_fn=<NllLossBackward0>)


  8%|▊         | 1472/17426 [02:12<31:29,  8.44it/s]

tensor(1.6770, grad_fn=<NllLossBackward0>)
tensor(1.7146, grad_fn=<NllLossBackward0>)


  8%|▊         | 1474/17426 [02:12<32:52,  8.09it/s]

tensor(1.6704, grad_fn=<NllLossBackward0>)
tensor(1.6745, grad_fn=<NllLossBackward0>)


  8%|▊         | 1476/17426 [02:13<31:16,  8.50it/s]

tensor(1.6686, grad_fn=<NllLossBackward0>)
tensor(1.6431, grad_fn=<NllLossBackward0>)


  8%|▊         | 1478/17426 [02:13<30:48,  8.63it/s]

tensor(1.6709, grad_fn=<NllLossBackward0>)
tensor(1.6689, grad_fn=<NllLossBackward0>)


  8%|▊         | 1480/17426 [02:13<32:07,  8.27it/s]

tensor(1.7268, grad_fn=<NllLossBackward0>)
tensor(1.6825, grad_fn=<NllLossBackward0>)


  9%|▊         | 1482/17426 [02:13<32:50,  8.09it/s]

tensor(1.7522, grad_fn=<NllLossBackward0>)
tensor(1.7110, grad_fn=<NllLossBackward0>)


  9%|▊         | 1484/17426 [02:14<34:11,  7.77it/s]

tensor(1.6640, grad_fn=<NllLossBackward0>)
tensor(1.6734, grad_fn=<NllLossBackward0>)


  9%|▊         | 1486/17426 [02:14<34:36,  7.68it/s]

tensor(1.7151, grad_fn=<NllLossBackward0>)
tensor(1.7021, grad_fn=<NllLossBackward0>)


  9%|▊         | 1489/17426 [02:14<28:00,  9.48it/s]

tensor(1.6850, grad_fn=<NllLossBackward0>)
tensor(1.6836, grad_fn=<NllLossBackward0>)
tensor(1.7005, grad_fn=<NllLossBackward0>)


  9%|▊         | 1491/17426 [02:14<25:40, 10.35it/s]

tensor(1.7212, grad_fn=<NllLossBackward0>)
tensor(1.6692, grad_fn=<NllLossBackward0>)
tensor(1.7003, grad_fn=<NllLossBackward0>)


  9%|▊         | 1495/17426 [02:15<23:44, 11.19it/s]

tensor(1.6673, grad_fn=<NllLossBackward0>)
tensor(1.6894, grad_fn=<NllLossBackward0>)
tensor(1.6476, grad_fn=<NllLossBackward0>)


  9%|▊         | 1497/17426 [02:15<23:43, 11.19it/s]

tensor(1.6654, grad_fn=<NllLossBackward0>)
tensor(1.6725, grad_fn=<NllLossBackward0>)
tensor(1.6697, grad_fn=<NllLossBackward0>)


  9%|▊         | 1501/17426 [02:15<22:54, 11.59it/s]

tensor(1.6654, grad_fn=<NllLossBackward0>)
tensor(1.6165, grad_fn=<NllLossBackward0>)
tensor(1.6672, grad_fn=<NllLossBackward0>)


  9%|▊         | 1503/17426 [02:15<22:29, 11.80it/s]

tensor(1.6591, grad_fn=<NllLossBackward0>)
tensor(1.6859, grad_fn=<NllLossBackward0>)
tensor(1.6641, grad_fn=<NllLossBackward0>)


  9%|▊         | 1507/17426 [02:16<22:06, 12.00it/s]

tensor(1.6553, grad_fn=<NllLossBackward0>)
tensor(1.6754, grad_fn=<NllLossBackward0>)
tensor(1.6924, grad_fn=<NllLossBackward0>)


  9%|▊         | 1509/17426 [02:16<22:05, 12.01it/s]

tensor(1.6954, grad_fn=<NllLossBackward0>)
tensor(1.6669, grad_fn=<NllLossBackward0>)
tensor(1.7097, grad_fn=<NllLossBackward0>)


  9%|▊         | 1513/17426 [02:16<22:06, 12.00it/s]

tensor(1.6795, grad_fn=<NllLossBackward0>)
tensor(1.6244, grad_fn=<NllLossBackward0>)
tensor(1.6634, grad_fn=<NllLossBackward0>)


  9%|▊         | 1515/17426 [02:16<21:44, 12.19it/s]

tensor(1.7152, grad_fn=<NllLossBackward0>)
tensor(1.6487, grad_fn=<NllLossBackward0>)
tensor(1.6739, grad_fn=<NllLossBackward0>)


  9%|▊         | 1519/17426 [02:17<22:01, 12.04it/s]

tensor(1.6576, grad_fn=<NllLossBackward0>)
tensor(1.6846, grad_fn=<NllLossBackward0>)
tensor(1.6523, grad_fn=<NllLossBackward0>)


  9%|▊         | 1521/17426 [02:17<22:03, 12.02it/s]

tensor(1.6882, grad_fn=<NllLossBackward0>)
tensor(1.6575, grad_fn=<NllLossBackward0>)
tensor(1.6481, grad_fn=<NllLossBackward0>)


  9%|▉         | 1525/17426 [02:17<22:14, 11.92it/s]

tensor(1.6835, grad_fn=<NllLossBackward0>)
tensor(1.7520, grad_fn=<NllLossBackward0>)
tensor(1.6738, grad_fn=<NllLossBackward0>)


  9%|▉         | 1527/17426 [02:17<22:12, 11.93it/s]

tensor(1.6915, grad_fn=<NllLossBackward0>)
tensor(1.6566, grad_fn=<NllLossBackward0>)
tensor(1.6498, grad_fn=<NllLossBackward0>)


  9%|▉         | 1531/17426 [02:18<22:08, 11.96it/s]

tensor(1.6715, grad_fn=<NllLossBackward0>)
tensor(1.6581, grad_fn=<NllLossBackward0>)
tensor(1.6264, grad_fn=<NllLossBackward0>)


  9%|▉         | 1533/17426 [02:18<22:08, 11.96it/s]

tensor(1.6947, grad_fn=<NllLossBackward0>)
tensor(1.6353, grad_fn=<NllLossBackward0>)
tensor(1.6839, grad_fn=<NllLossBackward0>)


  9%|▉         | 1537/17426 [02:18<22:02, 12.01it/s]

tensor(1.6994, grad_fn=<NllLossBackward0>)
tensor(1.6945, grad_fn=<NllLossBackward0>)
tensor(1.7028, grad_fn=<NllLossBackward0>)


  9%|▉         | 1539/17426 [02:18<22:01, 12.02it/s]

tensor(1.6694, grad_fn=<NllLossBackward0>)
tensor(1.6417, grad_fn=<NllLossBackward0>)
tensor(1.6686, grad_fn=<NllLossBackward0>)


  9%|▉         | 1543/17426 [02:19<21:55, 12.08it/s]

tensor(1.6869, grad_fn=<NllLossBackward0>)
tensor(1.6784, grad_fn=<NllLossBackward0>)
tensor(1.6537, grad_fn=<NllLossBackward0>)


  9%|▉         | 1545/17426 [02:19<21:55, 12.07it/s]

tensor(1.6742, grad_fn=<NllLossBackward0>)
tensor(1.6691, grad_fn=<NllLossBackward0>)
tensor(1.6750, grad_fn=<NllLossBackward0>)


  9%|▉         | 1549/17426 [02:19<21:45, 12.16it/s]

tensor(1.6002, grad_fn=<NllLossBackward0>)
tensor(1.6435, grad_fn=<NllLossBackward0>)
tensor(1.6571, grad_fn=<NllLossBackward0>)


  9%|▉         | 1551/17426 [02:19<21:52, 12.09it/s]

tensor(1.6696, grad_fn=<NllLossBackward0>)
tensor(1.6830, grad_fn=<NllLossBackward0>)
tensor(1.6712, grad_fn=<NllLossBackward0>)


  9%|▉         | 1555/17426 [02:20<22:25, 11.79it/s]

tensor(1.6742, grad_fn=<NllLossBackward0>)
tensor(1.6814, grad_fn=<NllLossBackward0>)
tensor(1.7059, grad_fn=<NllLossBackward0>)


  9%|▉         | 1557/17426 [02:20<22:25, 11.79it/s]

tensor(1.6824, grad_fn=<NllLossBackward0>)
tensor(1.6496, grad_fn=<NllLossBackward0>)
tensor(1.6068, grad_fn=<NllLossBackward0>)


  9%|▉         | 1561/17426 [02:20<22:14, 11.89it/s]

tensor(1.6353, grad_fn=<NllLossBackward0>)
tensor(1.6141, grad_fn=<NllLossBackward0>)
tensor(1.6720, grad_fn=<NllLossBackward0>)


  9%|▉         | 1563/17426 [02:20<22:05, 11.97it/s]

tensor(1.6712, grad_fn=<NllLossBackward0>)
tensor(1.6278, grad_fn=<NllLossBackward0>)
tensor(1.6982, grad_fn=<NllLossBackward0>)


  9%|▉         | 1567/17426 [02:21<22:38, 11.67it/s]

tensor(1.6918, grad_fn=<NllLossBackward0>)
tensor(1.6927, grad_fn=<NllLossBackward0>)
tensor(1.6727, grad_fn=<NllLossBackward0>)


  9%|▉         | 1569/17426 [02:21<22:32, 11.72it/s]

tensor(1.6948, grad_fn=<NllLossBackward0>)
tensor(1.7010, grad_fn=<NllLossBackward0>)
tensor(1.6925, grad_fn=<NllLossBackward0>)


  9%|▉         | 1573/17426 [02:21<21:55, 12.05it/s]

tensor(1.6624, grad_fn=<NllLossBackward0>)
tensor(1.7480, grad_fn=<NllLossBackward0>)
tensor(1.6771, grad_fn=<NllLossBackward0>)


  9%|▉         | 1575/17426 [02:21<21:39, 12.20it/s]

tensor(1.6179, grad_fn=<NllLossBackward0>)
tensor(1.6531, grad_fn=<NllLossBackward0>)
tensor(1.6379, grad_fn=<NllLossBackward0>)


  9%|▉         | 1579/17426 [02:22<22:09, 11.92it/s]

tensor(1.6528, grad_fn=<NllLossBackward0>)
tensor(1.6646, grad_fn=<NllLossBackward0>)
tensor(1.6577, grad_fn=<NllLossBackward0>)


  9%|▉         | 1581/17426 [02:22<22:15, 11.86it/s]

tensor(1.6613, grad_fn=<NllLossBackward0>)
tensor(1.7120, grad_fn=<NllLossBackward0>)
tensor(1.6921, grad_fn=<NllLossBackward0>)


  9%|▉         | 1585/17426 [02:22<21:54, 12.05it/s]

tensor(1.6592, grad_fn=<NllLossBackward0>)
tensor(1.6802, grad_fn=<NllLossBackward0>)
tensor(1.6570, grad_fn=<NllLossBackward0>)


  9%|▉         | 1587/17426 [02:22<22:01, 11.98it/s]

tensor(1.6901, grad_fn=<NllLossBackward0>)
tensor(1.6288, grad_fn=<NllLossBackward0>)
tensor(1.6499, grad_fn=<NllLossBackward0>)


  9%|▉         | 1591/17426 [02:23<22:34, 11.69it/s]

tensor(1.6463, grad_fn=<NllLossBackward0>)
tensor(1.6655, grad_fn=<NllLossBackward0>)
tensor(1.6659, grad_fn=<NllLossBackward0>)


  9%|▉         | 1593/17426 [02:23<22:27, 11.75it/s]

tensor(1.7005, grad_fn=<NllLossBackward0>)
tensor(1.6623, grad_fn=<NllLossBackward0>)
tensor(1.6264, grad_fn=<NllLossBackward0>)


  9%|▉         | 1597/17426 [02:23<22:01, 11.98it/s]

tensor(1.6630, grad_fn=<NllLossBackward0>)
tensor(1.6300, grad_fn=<NllLossBackward0>)
tensor(1.6386, grad_fn=<NllLossBackward0>)


  9%|▉         | 1599/17426 [02:23<22:11, 11.89it/s]

tensor(1.6916, grad_fn=<NllLossBackward0>)
tensor(1.6839, grad_fn=<NllLossBackward0>)
tensor(1.6768, grad_fn=<NllLossBackward0>)


  9%|▉         | 1603/17426 [02:24<22:20, 11.80it/s]

tensor(1.6331, grad_fn=<NllLossBackward0>)
tensor(1.6778, grad_fn=<NllLossBackward0>)
tensor(1.6738, grad_fn=<NllLossBackward0>)


  9%|▉         | 1605/17426 [02:24<22:16, 11.83it/s]

tensor(1.6241, grad_fn=<NllLossBackward0>)
tensor(1.6391, grad_fn=<NllLossBackward0>)
tensor(1.7248, grad_fn=<NllLossBackward0>)


  9%|▉         | 1607/17426 [02:24<25:11, 10.47it/s]

tensor(1.6593, grad_fn=<NllLossBackward0>)
tensor(1.7018, grad_fn=<NllLossBackward0>)


  9%|▉         | 1609/17426 [02:24<27:14,  9.68it/s]

tensor(1.6743, grad_fn=<NllLossBackward0>)
tensor(1.6486, grad_fn=<NllLossBackward0>)


  9%|▉         | 1612/17426 [02:25<29:34,  8.91it/s]

tensor(1.6455, grad_fn=<NllLossBackward0>)
tensor(1.6746, grad_fn=<NllLossBackward0>)


  9%|▉         | 1614/17426 [02:25<30:06,  8.75it/s]

tensor(1.6721, grad_fn=<NllLossBackward0>)
tensor(1.6697, grad_fn=<NllLossBackward0>)


  9%|▉         | 1616/17426 [02:25<30:47,  8.56it/s]

tensor(1.6382, grad_fn=<NllLossBackward0>)
tensor(1.6610, grad_fn=<NllLossBackward0>)


  9%|▉         | 1618/17426 [02:25<31:20,  8.40it/s]

tensor(1.6446, grad_fn=<NllLossBackward0>)
tensor(1.6381, grad_fn=<NllLossBackward0>)


  9%|▉         | 1620/17426 [02:26<32:55,  8.00it/s]

tensor(1.6500, grad_fn=<NllLossBackward0>)
tensor(1.6770, grad_fn=<NllLossBackward0>)


  9%|▉         | 1622/17426 [02:26<34:42,  7.59it/s]

tensor(1.6717, grad_fn=<NllLossBackward0>)
tensor(1.6641, grad_fn=<NllLossBackward0>)


  9%|▉         | 1624/17426 [02:26<35:22,  7.45it/s]

tensor(1.6658, grad_fn=<NllLossBackward0>)
tensor(1.6834, grad_fn=<NllLossBackward0>)


  9%|▉         | 1626/17426 [02:27<35:12,  7.48it/s]

tensor(1.6991, grad_fn=<NllLossBackward0>)
tensor(1.6526, grad_fn=<NllLossBackward0>)


  9%|▉         | 1629/17426 [02:27<29:26,  8.94it/s]

tensor(1.6328, grad_fn=<NllLossBackward0>)
tensor(1.6930, grad_fn=<NllLossBackward0>)
tensor(1.6686, grad_fn=<NllLossBackward0>)


  9%|▉         | 1631/17426 [02:27<26:14, 10.03it/s]

tensor(1.6447, grad_fn=<NllLossBackward0>)
tensor(1.6543, grad_fn=<NllLossBackward0>)
tensor(1.6515, grad_fn=<NllLossBackward0>)


  9%|▉         | 1635/17426 [02:27<23:21, 11.27it/s]

tensor(1.6987, grad_fn=<NllLossBackward0>)
tensor(1.6554, grad_fn=<NllLossBackward0>)
tensor(1.6355, grad_fn=<NllLossBackward0>)


  9%|▉         | 1637/17426 [02:28<22:42, 11.59it/s]

tensor(1.6340, grad_fn=<NllLossBackward0>)
tensor(1.6411, grad_fn=<NllLossBackward0>)
tensor(1.6761, grad_fn=<NllLossBackward0>)


  9%|▉         | 1641/17426 [02:28<22:32, 11.67it/s]

tensor(1.6656, grad_fn=<NllLossBackward0>)
tensor(1.6536, grad_fn=<NllLossBackward0>)
tensor(1.6389, grad_fn=<NllLossBackward0>)


  9%|▉         | 1643/17426 [02:28<22:34, 11.65it/s]

tensor(1.6986, grad_fn=<NllLossBackward0>)
tensor(1.6514, grad_fn=<NllLossBackward0>)
tensor(1.6415, grad_fn=<NllLossBackward0>)


  9%|▉         | 1647/17426 [02:28<21:59, 11.96it/s]

tensor(1.6786, grad_fn=<NllLossBackward0>)
tensor(1.7026, grad_fn=<NllLossBackward0>)
tensor(1.6378, grad_fn=<NllLossBackward0>)


  9%|▉         | 1649/17426 [02:29<21:53, 12.01it/s]

tensor(1.6665, grad_fn=<NllLossBackward0>)
tensor(1.6745, grad_fn=<NllLossBackward0>)
tensor(1.6271, grad_fn=<NllLossBackward0>)


  9%|▉         | 1653/17426 [02:29<22:08, 11.87it/s]

tensor(1.6606, grad_fn=<NllLossBackward0>)
tensor(1.6476, grad_fn=<NllLossBackward0>)
tensor(1.7023, grad_fn=<NllLossBackward0>)


  9%|▉         | 1655/17426 [02:29<22:26, 11.71it/s]

tensor(1.6642, grad_fn=<NllLossBackward0>)
tensor(1.6702, grad_fn=<NllLossBackward0>)
tensor(1.6178, grad_fn=<NllLossBackward0>)


 10%|▉         | 1659/17426 [02:29<21:59, 11.95it/s]

tensor(1.6748, grad_fn=<NllLossBackward0>)
tensor(1.6837, grad_fn=<NllLossBackward0>)
tensor(1.6267, grad_fn=<NllLossBackward0>)


 10%|▉         | 1661/17426 [02:30<21:51, 12.02it/s]

tensor(1.6365, grad_fn=<NllLossBackward0>)
tensor(1.6357, grad_fn=<NllLossBackward0>)
tensor(1.6704, grad_fn=<NllLossBackward0>)


 10%|▉         | 1665/17426 [02:30<21:38, 12.14it/s]

tensor(1.6998, grad_fn=<NllLossBackward0>)
tensor(1.6689, grad_fn=<NllLossBackward0>)
tensor(1.6397, grad_fn=<NllLossBackward0>)


 10%|▉         | 1667/17426 [02:30<22:10, 11.84it/s]

tensor(1.6662, grad_fn=<NllLossBackward0>)
tensor(1.6492, grad_fn=<NllLossBackward0>)
tensor(1.6420, grad_fn=<NllLossBackward0>)


 10%|▉         | 1671/17426 [02:30<21:36, 12.15it/s]

tensor(1.6716, grad_fn=<NllLossBackward0>)
tensor(1.6250, grad_fn=<NllLossBackward0>)
tensor(1.6423, grad_fn=<NllLossBackward0>)


 10%|▉         | 1673/17426 [02:31<21:51, 12.01it/s]

tensor(1.6693, grad_fn=<NllLossBackward0>)
tensor(1.6252, grad_fn=<NllLossBackward0>)
tensor(1.6603, grad_fn=<NllLossBackward0>)


 10%|▉         | 1677/17426 [02:31<21:45, 12.06it/s]

tensor(1.6832, grad_fn=<NllLossBackward0>)
tensor(1.6896, grad_fn=<NllLossBackward0>)
tensor(1.6571, grad_fn=<NllLossBackward0>)


 10%|▉         | 1679/17426 [02:31<22:18, 11.76it/s]

tensor(1.6476, grad_fn=<NllLossBackward0>)
tensor(1.6457, grad_fn=<NllLossBackward0>)
tensor(1.6291, grad_fn=<NllLossBackward0>)


 10%|▉         | 1683/17426 [02:31<21:43, 12.07it/s]

tensor(1.6323, grad_fn=<NllLossBackward0>)
tensor(1.6621, grad_fn=<NllLossBackward0>)
tensor(1.7053, grad_fn=<NllLossBackward0>)


 10%|▉         | 1685/17426 [02:32<21:46, 12.05it/s]

tensor(1.6860, grad_fn=<NllLossBackward0>)
tensor(1.6453, grad_fn=<NllLossBackward0>)
tensor(1.7068, grad_fn=<NllLossBackward0>)


 10%|▉         | 1689/17426 [02:32<21:32, 12.17it/s]

tensor(1.7099, grad_fn=<NllLossBackward0>)
tensor(1.6716, grad_fn=<NllLossBackward0>)
tensor(1.6600, grad_fn=<NllLossBackward0>)


 10%|▉         | 1691/17426 [02:32<22:20, 11.74it/s]

tensor(1.6502, grad_fn=<NllLossBackward0>)
tensor(1.6312, grad_fn=<NllLossBackward0>)
tensor(1.6309, grad_fn=<NllLossBackward0>)


 10%|▉         | 1695/17426 [02:32<21:46, 12.04it/s]

tensor(1.6880, grad_fn=<NllLossBackward0>)
tensor(1.6162, grad_fn=<NllLossBackward0>)
tensor(1.6451, grad_fn=<NllLossBackward0>)


 10%|▉         | 1697/17426 [02:33<21:54, 11.97it/s]

tensor(1.6603, grad_fn=<NllLossBackward0>)
tensor(1.6586, grad_fn=<NllLossBackward0>)
tensor(1.6693, grad_fn=<NllLossBackward0>)


 10%|▉         | 1701/17426 [02:33<21:45, 12.05it/s]

tensor(1.6412, grad_fn=<NllLossBackward0>)
tensor(1.6550, grad_fn=<NllLossBackward0>)
tensor(1.6615, grad_fn=<NllLossBackward0>)


 10%|▉         | 1703/17426 [02:33<22:23, 11.70it/s]

tensor(1.6096, grad_fn=<NllLossBackward0>)
tensor(1.6471, grad_fn=<NllLossBackward0>)
tensor(1.6493, grad_fn=<NllLossBackward0>)


 10%|▉         | 1707/17426 [02:33<22:04, 11.87it/s]

tensor(1.6867, grad_fn=<NllLossBackward0>)
tensor(1.6541, grad_fn=<NllLossBackward0>)
tensor(1.6479, grad_fn=<NllLossBackward0>)


 10%|▉         | 1709/17426 [02:34<21:52, 11.97it/s]

tensor(1.6337, grad_fn=<NllLossBackward0>)
tensor(1.7003, grad_fn=<NllLossBackward0>)
tensor(1.6509, grad_fn=<NllLossBackward0>)


 10%|▉         | 1713/17426 [02:34<21:28, 12.19it/s]

tensor(1.6227, grad_fn=<NllLossBackward0>)
tensor(1.6894, grad_fn=<NllLossBackward0>)
tensor(1.6514, grad_fn=<NllLossBackward0>)


 10%|▉         | 1715/17426 [02:34<22:11, 11.80it/s]

tensor(1.6341, grad_fn=<NllLossBackward0>)
tensor(1.6478, grad_fn=<NllLossBackward0>)
tensor(1.6675, grad_fn=<NllLossBackward0>)


 10%|▉         | 1719/17426 [02:34<21:35, 12.13it/s]

tensor(1.6526, grad_fn=<NllLossBackward0>)
tensor(1.6786, grad_fn=<NllLossBackward0>)
tensor(1.6836, grad_fn=<NllLossBackward0>)


 10%|▉         | 1721/17426 [02:35<21:26, 12.21it/s]

tensor(1.6701, grad_fn=<NllLossBackward0>)
tensor(1.6418, grad_fn=<NllLossBackward0>)
tensor(1.6824, grad_fn=<NllLossBackward0>)


 10%|▉         | 1725/17426 [02:35<21:22, 12.24it/s]

tensor(1.6555, grad_fn=<NllLossBackward0>)
tensor(1.6391, grad_fn=<NllLossBackward0>)
tensor(1.6379, grad_fn=<NllLossBackward0>)


 10%|▉         | 1727/17426 [02:35<21:41, 12.07it/s]

tensor(1.5998, grad_fn=<NllLossBackward0>)
tensor(1.7082, grad_fn=<NllLossBackward0>)
tensor(1.6536, grad_fn=<NllLossBackward0>)


 10%|▉         | 1731/17426 [02:35<22:03, 11.86it/s]

tensor(1.6784, grad_fn=<NllLossBackward0>)
tensor(1.6570, grad_fn=<NllLossBackward0>)
tensor(1.7116, grad_fn=<NllLossBackward0>)


 10%|▉         | 1733/17426 [02:36<21:39, 12.07it/s]

tensor(1.6912, grad_fn=<NllLossBackward0>)
tensor(1.6561, grad_fn=<NllLossBackward0>)
tensor(1.6945, grad_fn=<NllLossBackward0>)


 10%|▉         | 1737/17426 [02:36<21:40, 12.07it/s]

tensor(1.6544, grad_fn=<NllLossBackward0>)
tensor(1.6654, grad_fn=<NllLossBackward0>)
tensor(1.6586, grad_fn=<NllLossBackward0>)


 10%|▉         | 1739/17426 [02:36<21:50, 11.97it/s]

tensor(1.6698, grad_fn=<NllLossBackward0>)
tensor(1.6720, grad_fn=<NllLossBackward0>)
tensor(1.6574, grad_fn=<NllLossBackward0>)


 10%|█         | 1743/17426 [02:36<21:54, 11.93it/s]

tensor(1.6754, grad_fn=<NllLossBackward0>)
tensor(1.7074, grad_fn=<NllLossBackward0>)
tensor(1.6505, grad_fn=<NllLossBackward0>)


 10%|█         | 1745/17426 [02:37<21:48, 11.98it/s]

tensor(1.6565, grad_fn=<NllLossBackward0>)
tensor(1.6566, grad_fn=<NllLossBackward0>)
tensor(1.6686, grad_fn=<NllLossBackward0>)


 10%|█         | 1747/17426 [02:37<23:22, 11.18it/s]

tensor(1.6334, grad_fn=<NllLossBackward0>)
tensor(1.6633, grad_fn=<NllLossBackward0>)


 10%|█         | 1749/17426 [02:37<26:12,  9.97it/s]

tensor(1.6685, grad_fn=<NllLossBackward0>)
tensor(1.6323, grad_fn=<NllLossBackward0>)


 10%|█         | 1752/17426 [02:37<29:07,  8.97it/s]

tensor(1.6456, grad_fn=<NllLossBackward0>)
tensor(1.6322, grad_fn=<NllLossBackward0>)


 10%|█         | 1754/17426 [02:38<29:44,  8.78it/s]

tensor(1.6328, grad_fn=<NllLossBackward0>)
tensor(1.6591, grad_fn=<NllLossBackward0>)


 10%|█         | 1756/17426 [02:38<30:31,  8.56it/s]

tensor(1.6458, grad_fn=<NllLossBackward0>)
tensor(1.6687, grad_fn=<NllLossBackward0>)


 10%|█         | 1758/17426 [02:38<31:45,  8.22it/s]

tensor(1.6744, grad_fn=<NllLossBackward0>)
tensor(1.6090, grad_fn=<NllLossBackward0>)


 10%|█         | 1760/17426 [02:38<31:49,  8.20it/s]

tensor(1.6210, grad_fn=<NllLossBackward0>)
tensor(1.6480, grad_fn=<NllLossBackward0>)


 10%|█         | 1762/17426 [02:39<33:37,  7.76it/s]

tensor(1.6861, grad_fn=<NllLossBackward0>)
tensor(1.6145, grad_fn=<NllLossBackward0>)


 10%|█         | 1764/17426 [02:39<33:20,  7.83it/s]

tensor(1.6226, grad_fn=<NllLossBackward0>)
tensor(1.6322, grad_fn=<NllLossBackward0>)


 10%|█         | 1766/17426 [02:39<33:59,  7.68it/s]

tensor(1.6380, grad_fn=<NllLossBackward0>)
tensor(1.6304, grad_fn=<NllLossBackward0>)


 10%|█         | 1768/17426 [02:39<36:15,  7.20it/s]

tensor(1.6058, grad_fn=<NllLossBackward0>)
tensor(1.6849, grad_fn=<NllLossBackward0>)


 10%|█         | 1771/17426 [02:40<28:02,  9.30it/s]

tensor(1.7057, grad_fn=<NllLossBackward0>)
tensor(1.6722, grad_fn=<NllLossBackward0>)
tensor(1.6463, grad_fn=<NllLossBackward0>)


 10%|█         | 1774/17426 [02:40<25:00, 10.43it/s]

tensor(1.6825, grad_fn=<NllLossBackward0>)
tensor(1.6675, grad_fn=<NllLossBackward0>)
tensor(1.6265, grad_fn=<NllLossBackward0>)


 10%|█         | 1776/17426 [02:40<23:52, 10.93it/s]

tensor(1.6873, grad_fn=<NllLossBackward0>)
tensor(1.6516, grad_fn=<NllLossBackward0>)
tensor(1.5908, grad_fn=<NllLossBackward0>)


 10%|█         | 1780/17426 [02:41<22:56, 11.37it/s]

tensor(1.6671, grad_fn=<NllLossBackward0>)
tensor(1.6503, grad_fn=<NllLossBackward0>)
tensor(1.6347, grad_fn=<NllLossBackward0>)


 10%|█         | 1782/17426 [02:41<22:42, 11.48it/s]

tensor(1.6696, grad_fn=<NllLossBackward0>)
tensor(1.6617, grad_fn=<NllLossBackward0>)
tensor(1.6249, grad_fn=<NllLossBackward0>)


 10%|█         | 1786/17426 [02:41<22:21, 11.66it/s]

tensor(1.6884, grad_fn=<NllLossBackward0>)
tensor(1.6331, grad_fn=<NllLossBackward0>)
tensor(1.6482, grad_fn=<NllLossBackward0>)


 10%|█         | 1788/17426 [02:41<22:20, 11.67it/s]

tensor(1.6742, grad_fn=<NllLossBackward0>)
tensor(1.6911, grad_fn=<NllLossBackward0>)
tensor(1.6868, grad_fn=<NllLossBackward0>)


 10%|█         | 1792/17426 [02:42<22:14, 11.71it/s]

tensor(1.6521, grad_fn=<NllLossBackward0>)
tensor(1.6614, grad_fn=<NllLossBackward0>)
tensor(1.6345, grad_fn=<NllLossBackward0>)


 10%|█         | 1794/17426 [02:42<23:21, 11.15it/s]

tensor(1.6662, grad_fn=<NllLossBackward0>)
tensor(1.6682, grad_fn=<NllLossBackward0>)


 10%|█         | 1796/17426 [02:42<27:41,  9.41it/s]

tensor(1.6696, grad_fn=<NllLossBackward0>)
tensor(1.6077, grad_fn=<NllLossBackward0>)


 10%|█         | 1798/17426 [02:42<27:28,  9.48it/s]

tensor(1.6264, grad_fn=<NllLossBackward0>)
tensor(1.6538, grad_fn=<NllLossBackward0>)
tensor(1.6598, grad_fn=<NllLossBackward0>)


 10%|█         | 1802/17426 [02:43<25:54, 10.05it/s]

tensor(1.6713, grad_fn=<NllLossBackward0>)
tensor(1.6200, grad_fn=<NllLossBackward0>)
tensor(1.6168, grad_fn=<NllLossBackward0>)


 10%|█         | 1804/17426 [02:43<25:21, 10.27it/s]

tensor(1.5799, grad_fn=<NllLossBackward0>)
tensor(1.6371, grad_fn=<NllLossBackward0>)
tensor(1.6734, grad_fn=<NllLossBackward0>)


 10%|█         | 1808/17426 [02:43<23:19, 11.16it/s]

tensor(1.6718, grad_fn=<NllLossBackward0>)
tensor(1.6555, grad_fn=<NllLossBackward0>)
tensor(1.6666, grad_fn=<NllLossBackward0>)


 10%|█         | 1810/17426 [02:43<22:59, 11.32it/s]

tensor(1.6554, grad_fn=<NllLossBackward0>)
tensor(1.6473, grad_fn=<NllLossBackward0>)
tensor(1.6779, grad_fn=<NllLossBackward0>)


 10%|█         | 1814/17426 [02:44<22:20, 11.64it/s]

tensor(1.6740, grad_fn=<NllLossBackward0>)
tensor(1.6213, grad_fn=<NllLossBackward0>)
tensor(1.6700, grad_fn=<NllLossBackward0>)


 10%|█         | 1816/17426 [02:44<22:08, 11.75it/s]

tensor(1.6510, grad_fn=<NllLossBackward0>)
tensor(1.6791, grad_fn=<NllLossBackward0>)
tensor(1.6453, grad_fn=<NllLossBackward0>)


 10%|█         | 1820/17426 [02:44<21:53, 11.88it/s]

tensor(1.6458, grad_fn=<NllLossBackward0>)
tensor(1.6227, grad_fn=<NllLossBackward0>)
tensor(1.6615, grad_fn=<NllLossBackward0>)


 10%|█         | 1822/17426 [02:44<21:47, 11.93it/s]

tensor(1.6564, grad_fn=<NllLossBackward0>)
tensor(1.5785, grad_fn=<NllLossBackward0>)
tensor(1.6328, grad_fn=<NllLossBackward0>)


 10%|█         | 1826/17426 [02:45<22:06, 11.76it/s]

tensor(1.6664, grad_fn=<NllLossBackward0>)
tensor(1.6034, grad_fn=<NllLossBackward0>)
tensor(1.6657, grad_fn=<NllLossBackward0>)


 10%|█         | 1828/17426 [02:45<21:52, 11.89it/s]

tensor(1.5894, grad_fn=<NllLossBackward0>)
tensor(1.6879, grad_fn=<NllLossBackward0>)
tensor(1.6111, grad_fn=<NllLossBackward0>)


 11%|█         | 1832/17426 [02:45<21:36, 12.03it/s]

tensor(1.6537, grad_fn=<NllLossBackward0>)
tensor(1.6389, grad_fn=<NllLossBackward0>)
tensor(1.6578, grad_fn=<NllLossBackward0>)


 11%|█         | 1834/17426 [02:45<21:55, 11.85it/s]

tensor(1.6122, grad_fn=<NllLossBackward0>)
tensor(1.6237, grad_fn=<NllLossBackward0>)
tensor(1.5917, grad_fn=<NllLossBackward0>)


 11%|█         | 1838/17426 [02:46<21:58, 11.82it/s]

tensor(1.6886, grad_fn=<NllLossBackward0>)
tensor(1.6576, grad_fn=<NllLossBackward0>)
tensor(1.6456, grad_fn=<NllLossBackward0>)


 11%|█         | 1840/17426 [02:46<21:53, 11.87it/s]

tensor(1.6164, grad_fn=<NllLossBackward0>)
tensor(1.6282, grad_fn=<NllLossBackward0>)
tensor(1.6602, grad_fn=<NllLossBackward0>)


 11%|█         | 1844/17426 [02:46<21:38, 12.00it/s]

tensor(1.6481, grad_fn=<NllLossBackward0>)
tensor(1.6791, grad_fn=<NllLossBackward0>)
tensor(1.6433, grad_fn=<NllLossBackward0>)


 11%|█         | 1846/17426 [02:46<21:35, 12.03it/s]

tensor(1.6334, grad_fn=<NllLossBackward0>)
tensor(1.6345, grad_fn=<NllLossBackward0>)
tensor(1.6469, grad_fn=<NllLossBackward0>)


 11%|█         | 1850/17426 [02:47<21:59, 11.81it/s]

tensor(1.5953, grad_fn=<NllLossBackward0>)
tensor(1.6640, grad_fn=<NllLossBackward0>)
tensor(1.6342, grad_fn=<NllLossBackward0>)


 11%|█         | 1852/17426 [02:47<21:53, 11.85it/s]

tensor(1.6614, grad_fn=<NllLossBackward0>)
tensor(1.6396, grad_fn=<NllLossBackward0>)
tensor(1.6748, grad_fn=<NllLossBackward0>)


 11%|█         | 1856/17426 [02:47<21:46, 11.92it/s]

tensor(1.6085, grad_fn=<NllLossBackward0>)
tensor(1.6498, grad_fn=<NllLossBackward0>)
tensor(1.6112, grad_fn=<NllLossBackward0>)


 11%|█         | 1858/17426 [02:47<21:43, 11.94it/s]

tensor(1.6419, grad_fn=<NllLossBackward0>)
tensor(1.6167, grad_fn=<NllLossBackward0>)
tensor(1.6508, grad_fn=<NllLossBackward0>)


 11%|█         | 1862/17426 [02:48<21:54, 11.84it/s]

tensor(1.6656, grad_fn=<NllLossBackward0>)
tensor(1.6095, grad_fn=<NllLossBackward0>)
tensor(1.6783, grad_fn=<NllLossBackward0>)


 11%|█         | 1864/17426 [02:48<21:38, 11.99it/s]

tensor(1.6296, grad_fn=<NllLossBackward0>)
tensor(1.6291, grad_fn=<NllLossBackward0>)
tensor(1.7130, grad_fn=<NllLossBackward0>)


 11%|█         | 1868/17426 [02:48<21:15, 12.20it/s]

tensor(1.6200, grad_fn=<NllLossBackward0>)
tensor(1.6810, grad_fn=<NllLossBackward0>)
tensor(1.6322, grad_fn=<NllLossBackward0>)


 11%|█         | 1870/17426 [02:48<21:08, 12.27it/s]

tensor(1.6478, grad_fn=<NllLossBackward0>)
tensor(1.6470, grad_fn=<NllLossBackward0>)
tensor(1.6059, grad_fn=<NllLossBackward0>)


 11%|█         | 1874/17426 [02:49<21:32, 12.03it/s]

tensor(1.6560, grad_fn=<NllLossBackward0>)
tensor(1.6252, grad_fn=<NllLossBackward0>)
tensor(1.5984, grad_fn=<NllLossBackward0>)


 11%|█         | 1876/17426 [02:49<21:48, 11.88it/s]

tensor(1.5916, grad_fn=<NllLossBackward0>)
tensor(1.6342, grad_fn=<NllLossBackward0>)
tensor(1.6460, grad_fn=<NllLossBackward0>)


 11%|█         | 1880/17426 [02:49<21:30, 12.05it/s]

tensor(1.6349, grad_fn=<NllLossBackward0>)
tensor(1.6961, grad_fn=<NllLossBackward0>)
tensor(1.6064, grad_fn=<NllLossBackward0>)


 11%|█         | 1882/17426 [02:49<21:29, 12.05it/s]

tensor(1.6337, grad_fn=<NllLossBackward0>)
tensor(1.6838, grad_fn=<NllLossBackward0>)
tensor(1.6738, grad_fn=<NllLossBackward0>)


 11%|█         | 1884/17426 [02:50<21:39, 11.96it/s]

tensor(1.6544, grad_fn=<NllLossBackward0>)
tensor(1.6489, grad_fn=<NllLossBackward0>)


 11%|█         | 1886/17426 [02:50<25:20, 10.22it/s]

tensor(1.6797, grad_fn=<NllLossBackward0>)
tensor(1.6463, grad_fn=<NllLossBackward0>)


 11%|█         | 1889/17426 [02:50<28:07,  9.21it/s]

tensor(1.6838, grad_fn=<NllLossBackward0>)
tensor(1.6497, grad_fn=<NllLossBackward0>)


 11%|█         | 1891/17426 [02:50<30:16,  8.55it/s]

tensor(1.6370, grad_fn=<NllLossBackward0>)
tensor(1.6575, grad_fn=<NllLossBackward0>)


 11%|█         | 1893/17426 [02:51<32:54,  7.87it/s]

tensor(1.5992, grad_fn=<NllLossBackward0>)
tensor(1.6543, grad_fn=<NllLossBackward0>)


 11%|█         | 1895/17426 [02:51<31:25,  8.24it/s]

tensor(1.6142, grad_fn=<NllLossBackward0>)
tensor(1.5972, grad_fn=<NllLossBackward0>)


 11%|█         | 1897/17426 [02:51<31:49,  8.13it/s]

tensor(1.6338, grad_fn=<NllLossBackward0>)
tensor(1.6158, grad_fn=<NllLossBackward0>)


 11%|█         | 1899/17426 [02:51<32:34,  7.95it/s]

tensor(1.5895, grad_fn=<NllLossBackward0>)
tensor(1.6408, grad_fn=<NllLossBackward0>)


 11%|█         | 1901/17426 [02:52<32:23,  7.99it/s]

tensor(1.6218, grad_fn=<NllLossBackward0>)
tensor(1.6654, grad_fn=<NllLossBackward0>)


 11%|█         | 1903/17426 [02:52<33:32,  7.72it/s]

tensor(1.6193, grad_fn=<NllLossBackward0>)
tensor(1.6414, grad_fn=<NllLossBackward0>)


 11%|█         | 1905/17426 [02:52<34:51,  7.42it/s]

tensor(1.6285, grad_fn=<NllLossBackward0>)
tensor(1.6274, grad_fn=<NllLossBackward0>)


 11%|█         | 1908/17426 [02:53<28:01,  9.23it/s]

tensor(1.6655, grad_fn=<NllLossBackward0>)
tensor(1.6601, grad_fn=<NllLossBackward0>)
tensor(1.6630, grad_fn=<NllLossBackward0>)


 11%|█         | 1910/17426 [02:53<25:33, 10.12it/s]

tensor(1.6223, grad_fn=<NllLossBackward0>)
tensor(1.6757, grad_fn=<NllLossBackward0>)
tensor(1.6469, grad_fn=<NllLossBackward0>)


 11%|█         | 1914/17426 [02:53<23:23, 11.05it/s]

tensor(1.6058, grad_fn=<NllLossBackward0>)
tensor(1.6782, grad_fn=<NllLossBackward0>)
tensor(1.6564, grad_fn=<NllLossBackward0>)


 11%|█         | 1916/17426 [02:53<23:01, 11.22it/s]

tensor(1.6535, grad_fn=<NllLossBackward0>)
tensor(1.6523, grad_fn=<NllLossBackward0>)
tensor(1.5689, grad_fn=<NllLossBackward0>)


 11%|█         | 1920/17426 [02:54<21:58, 11.76it/s]

tensor(1.6473, grad_fn=<NllLossBackward0>)
tensor(1.6270, grad_fn=<NllLossBackward0>)
tensor(1.6320, grad_fn=<NllLossBackward0>)


 11%|█         | 1922/17426 [02:54<21:58, 11.76it/s]

tensor(1.6777, grad_fn=<NllLossBackward0>)
tensor(1.6696, grad_fn=<NllLossBackward0>)
tensor(1.6200, grad_fn=<NllLossBackward0>)


 11%|█         | 1926/17426 [02:54<21:51, 11.82it/s]

tensor(1.6385, grad_fn=<NllLossBackward0>)
tensor(1.6294, grad_fn=<NllLossBackward0>)
tensor(1.6573, grad_fn=<NllLossBackward0>)


 11%|█         | 1928/17426 [02:54<22:04, 11.70it/s]

tensor(1.6699, grad_fn=<NllLossBackward0>)
tensor(1.6284, grad_fn=<NllLossBackward0>)
tensor(1.6477, grad_fn=<NllLossBackward0>)


 11%|█         | 1932/17426 [02:55<21:23, 12.07it/s]

tensor(1.6540, grad_fn=<NllLossBackward0>)
tensor(1.6329, grad_fn=<NllLossBackward0>)
tensor(1.6310, grad_fn=<NllLossBackward0>)


 11%|█         | 1934/17426 [02:55<21:40, 11.91it/s]

tensor(1.6133, grad_fn=<NllLossBackward0>)
tensor(1.6227, grad_fn=<NllLossBackward0>)
tensor(1.6806, grad_fn=<NllLossBackward0>)


 11%|█         | 1938/17426 [02:55<21:53, 11.79it/s]

tensor(1.6697, grad_fn=<NllLossBackward0>)
tensor(1.6729, grad_fn=<NllLossBackward0>)
tensor(1.6308, grad_fn=<NllLossBackward0>)


 11%|█         | 1940/17426 [02:55<22:07, 11.67it/s]

tensor(1.6456, grad_fn=<NllLossBackward0>)
tensor(1.6074, grad_fn=<NllLossBackward0>)
tensor(1.6779, grad_fn=<NllLossBackward0>)


 11%|█         | 1944/17426 [02:56<21:39, 11.91it/s]

tensor(1.5607, grad_fn=<NllLossBackward0>)
tensor(1.5872, grad_fn=<NllLossBackward0>)
tensor(1.6497, grad_fn=<NllLossBackward0>)


 11%|█         | 1946/17426 [02:56<21:47, 11.84it/s]

tensor(1.6474, grad_fn=<NllLossBackward0>)
tensor(1.6156, grad_fn=<NllLossBackward0>)
tensor(1.6987, grad_fn=<NllLossBackward0>)


 11%|█         | 1950/17426 [02:56<21:53, 11.78it/s]

tensor(1.6846, grad_fn=<NllLossBackward0>)
tensor(1.6214, grad_fn=<NllLossBackward0>)
tensor(1.6520, grad_fn=<NllLossBackward0>)


 11%|█         | 1952/17426 [02:56<22:10, 11.63it/s]

tensor(1.6001, grad_fn=<NllLossBackward0>)
tensor(1.6450, grad_fn=<NllLossBackward0>)
tensor(1.7016, grad_fn=<NllLossBackward0>)


 11%|█         | 1956/17426 [02:57<21:43, 11.87it/s]

tensor(1.6694, grad_fn=<NllLossBackward0>)
tensor(1.6924, grad_fn=<NllLossBackward0>)
tensor(1.6216, grad_fn=<NllLossBackward0>)


 11%|█         | 1958/17426 [02:57<21:38, 11.91it/s]

tensor(1.6485, grad_fn=<NllLossBackward0>)
tensor(1.6345, grad_fn=<NllLossBackward0>)
tensor(1.6287, grad_fn=<NllLossBackward0>)


 11%|█▏        | 1962/17426 [02:57<21:38, 11.91it/s]

tensor(1.6285, grad_fn=<NllLossBackward0>)
tensor(1.6591, grad_fn=<NllLossBackward0>)
tensor(1.6033, grad_fn=<NllLossBackward0>)


 11%|█▏        | 1964/17426 [02:57<21:59, 11.72it/s]

tensor(1.6189, grad_fn=<NllLossBackward0>)
tensor(1.5817, grad_fn=<NllLossBackward0>)
tensor(1.5793, grad_fn=<NllLossBackward0>)


 11%|█▏        | 1968/17426 [02:58<21:37, 11.91it/s]

tensor(1.6393, grad_fn=<NllLossBackward0>)
tensor(1.6388, grad_fn=<NllLossBackward0>)
tensor(1.6431, grad_fn=<NllLossBackward0>)


 11%|█▏        | 1970/17426 [02:58<21:38, 11.90it/s]

tensor(1.5881, grad_fn=<NllLossBackward0>)
tensor(1.6365, grad_fn=<NllLossBackward0>)
tensor(1.6332, grad_fn=<NllLossBackward0>)


 11%|█▏        | 1974/17426 [02:58<21:47, 11.82it/s]

tensor(1.6193, grad_fn=<NllLossBackward0>)
tensor(1.6190, grad_fn=<NllLossBackward0>)
tensor(1.6831, grad_fn=<NllLossBackward0>)


 11%|█▏        | 1976/17426 [02:58<21:48, 11.81it/s]

tensor(1.6552, grad_fn=<NllLossBackward0>)
tensor(1.6739, grad_fn=<NllLossBackward0>)
tensor(1.6361, grad_fn=<NllLossBackward0>)


 11%|█▏        | 1980/17426 [02:59<21:21, 12.05it/s]

tensor(1.6475, grad_fn=<NllLossBackward0>)
tensor(1.6521, grad_fn=<NllLossBackward0>)
tensor(1.6415, grad_fn=<NllLossBackward0>)


 11%|█▏        | 1982/17426 [02:59<21:22, 12.05it/s]

tensor(1.6320, grad_fn=<NllLossBackward0>)
tensor(1.6215, grad_fn=<NllLossBackward0>)
tensor(1.6256, grad_fn=<NllLossBackward0>)


 11%|█▏        | 1986/17426 [02:59<21:44, 11.84it/s]

tensor(1.6602, grad_fn=<NllLossBackward0>)
tensor(1.6101, grad_fn=<NllLossBackward0>)
tensor(1.6526, grad_fn=<NllLossBackward0>)


 11%|█▏        | 1988/17426 [02:59<21:50, 11.78it/s]

tensor(1.6429, grad_fn=<NllLossBackward0>)
tensor(1.6493, grad_fn=<NllLossBackward0>)
tensor(1.6430, grad_fn=<NllLossBackward0>)


 11%|█▏        | 1992/17426 [03:00<21:31, 11.95it/s]

tensor(1.6032, grad_fn=<NllLossBackward0>)
tensor(1.6324, grad_fn=<NllLossBackward0>)
tensor(1.6371, grad_fn=<NllLossBackward0>)


 11%|█▏        | 1994/17426 [03:00<21:26, 11.99it/s]

tensor(1.6021, grad_fn=<NllLossBackward0>)
tensor(1.6397, grad_fn=<NllLossBackward0>)
tensor(1.6343, grad_fn=<NllLossBackward0>)


 11%|█▏        | 1998/17426 [03:00<21:58, 11.70it/s]

tensor(1.6538, grad_fn=<NllLossBackward0>)
tensor(1.6282, grad_fn=<NllLossBackward0>)
tensor(1.6284, grad_fn=<NllLossBackward0>)


 11%|█▏        | 2000/17426 [03:00<22:02, 11.66it/s]

tensor(1.5747, grad_fn=<NllLossBackward0>)
tensor(1.6178, grad_fn=<NllLossBackward0>)
tensor(1.6693, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2004/17426 [03:01<21:52, 11.75it/s]

tensor(1.6600, grad_fn=<NllLossBackward0>)
tensor(1.6478, grad_fn=<NllLossBackward0>)
tensor(1.6253, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2006/17426 [03:01<22:04, 11.64it/s]

tensor(1.6803, grad_fn=<NllLossBackward0>)
tensor(1.6017, grad_fn=<NllLossBackward0>)
tensor(1.6518, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2010/17426 [03:01<22:27, 11.44it/s]

tensor(1.6424, grad_fn=<NllLossBackward0>)
tensor(1.6710, grad_fn=<NllLossBackward0>)
tensor(1.6640, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2012/17426 [03:01<22:13, 11.56it/s]

tensor(1.6355, grad_fn=<NllLossBackward0>)
tensor(1.6675, grad_fn=<NllLossBackward0>)
tensor(1.6181, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2016/17426 [03:02<21:53, 11.73it/s]

tensor(1.6166, grad_fn=<NllLossBackward0>)
tensor(1.6018, grad_fn=<NllLossBackward0>)
tensor(1.6797, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2018/17426 [03:02<21:45, 11.80it/s]

tensor(1.6160, grad_fn=<NllLossBackward0>)
tensor(1.5943, grad_fn=<NllLossBackward0>)
tensor(1.6566, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2022/17426 [03:02<22:12, 11.56it/s]

tensor(1.6109, grad_fn=<NllLossBackward0>)
tensor(1.5988, grad_fn=<NllLossBackward0>)
tensor(1.6975, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2024/17426 [03:02<23:23, 10.98it/s]

tensor(1.6039, grad_fn=<NllLossBackward0>)
tensor(1.6361, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2026/17426 [03:03<26:03,  9.85it/s]

tensor(1.6254, grad_fn=<NllLossBackward0>)
tensor(1.6544, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2028/17426 [03:03<27:58,  9.17it/s]

tensor(1.6183, grad_fn=<NllLossBackward0>)
tensor(1.5961, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2030/17426 [03:03<30:33,  8.40it/s]

tensor(1.6070, grad_fn=<NllLossBackward0>)
tensor(1.6302, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2032/17426 [03:03<31:02,  8.27it/s]

tensor(1.6854, grad_fn=<NllLossBackward0>)
tensor(1.5759, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2034/17426 [03:04<32:32,  7.88it/s]

tensor(1.6722, grad_fn=<NllLossBackward0>)
tensor(1.6740, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2036/17426 [03:04<32:02,  8.01it/s]

tensor(1.6350, grad_fn=<NllLossBackward0>)
tensor(1.5863, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2038/17426 [03:04<34:53,  7.35it/s]

tensor(1.6404, grad_fn=<NllLossBackward0>)
tensor(1.6518, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2040/17426 [03:05<34:57,  7.34it/s]

tensor(1.6704, grad_fn=<NllLossBackward0>)
tensor(1.6151, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2042/17426 [03:05<36:41,  6.99it/s]

tensor(1.5978, grad_fn=<NllLossBackward0>)
tensor(1.6407, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2045/17426 [03:05<30:13,  8.48it/s]

tensor(1.6351, grad_fn=<NllLossBackward0>)
tensor(1.6245, grad_fn=<NllLossBackward0>)
tensor(1.6736, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2048/17426 [03:05<25:46,  9.94it/s]

tensor(1.6536, grad_fn=<NllLossBackward0>)
tensor(1.6582, grad_fn=<NllLossBackward0>)
tensor(1.6451, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2050/17426 [03:06<24:18, 10.54it/s]

tensor(1.6274, grad_fn=<NllLossBackward0>)
tensor(1.6260, grad_fn=<NllLossBackward0>)
tensor(1.6681, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2054/17426 [03:06<22:52, 11.20it/s]

tensor(1.6213, grad_fn=<NllLossBackward0>)
tensor(1.6011, grad_fn=<NllLossBackward0>)
tensor(1.6315, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2056/17426 [03:06<22:23, 11.44it/s]

tensor(1.5788, grad_fn=<NllLossBackward0>)
tensor(1.6567, grad_fn=<NllLossBackward0>)
tensor(1.6387, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2060/17426 [03:06<22:04, 11.60it/s]

tensor(1.6028, grad_fn=<NllLossBackward0>)
tensor(1.6523, grad_fn=<NllLossBackward0>)
tensor(1.6351, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2062/17426 [03:07<21:56, 11.67it/s]

tensor(1.6458, grad_fn=<NllLossBackward0>)
tensor(1.6902, grad_fn=<NllLossBackward0>)
tensor(1.6079, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2066/17426 [03:07<21:40, 11.81it/s]

tensor(1.6634, grad_fn=<NllLossBackward0>)
tensor(1.6338, grad_fn=<NllLossBackward0>)
tensor(1.6430, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2068/17426 [03:07<21:45, 11.77it/s]

tensor(1.6518, grad_fn=<NllLossBackward0>)
tensor(1.6451, grad_fn=<NllLossBackward0>)
tensor(1.6130, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2072/17426 [03:07<21:49, 11.73it/s]

tensor(1.6479, grad_fn=<NllLossBackward0>)
tensor(1.6108, grad_fn=<NllLossBackward0>)
tensor(1.6076, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2074/17426 [03:08<21:38, 11.83it/s]

tensor(1.6025, grad_fn=<NllLossBackward0>)
tensor(1.6023, grad_fn=<NllLossBackward0>)
tensor(1.6599, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2078/17426 [03:08<22:17, 11.48it/s]

tensor(1.5921, grad_fn=<NllLossBackward0>)
tensor(1.6094, grad_fn=<NllLossBackward0>)
tensor(1.6590, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2080/17426 [03:08<22:25, 11.40it/s]

tensor(1.6247, grad_fn=<NllLossBackward0>)
tensor(1.6357, grad_fn=<NllLossBackward0>)
tensor(1.6417, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2084/17426 [03:09<21:53, 11.68it/s]

tensor(1.6410, grad_fn=<NllLossBackward0>)
tensor(1.6942, grad_fn=<NllLossBackward0>)
tensor(1.6350, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2086/17426 [03:09<21:59, 11.62it/s]

tensor(1.5912, grad_fn=<NllLossBackward0>)
tensor(1.6209, grad_fn=<NllLossBackward0>)
tensor(1.6334, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2090/17426 [03:09<21:38, 11.81it/s]

tensor(1.6776, grad_fn=<NllLossBackward0>)
tensor(1.6054, grad_fn=<NllLossBackward0>)
tensor(1.6424, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2092/17426 [03:09<21:47, 11.73it/s]

tensor(1.5978, grad_fn=<NllLossBackward0>)
tensor(1.6651, grad_fn=<NllLossBackward0>)
tensor(1.6364, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2096/17426 [03:10<21:41, 11.78it/s]

tensor(1.6398, grad_fn=<NllLossBackward0>)
tensor(1.6484, grad_fn=<NllLossBackward0>)
tensor(1.6542, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2098/17426 [03:10<21:37, 11.81it/s]

tensor(1.6484, grad_fn=<NllLossBackward0>)
tensor(1.6110, grad_fn=<NllLossBackward0>)
tensor(1.6268, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2102/17426 [03:10<21:32, 11.86it/s]

tensor(1.6406, grad_fn=<NllLossBackward0>)
tensor(1.6606, grad_fn=<NllLossBackward0>)
tensor(1.6507, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2104/17426 [03:10<21:58, 11.62it/s]

tensor(1.5962, grad_fn=<NllLossBackward0>)
tensor(1.6152, grad_fn=<NllLossBackward0>)
tensor(1.6263, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2108/17426 [03:11<22:09, 11.52it/s]

tensor(1.6787, grad_fn=<NllLossBackward0>)
tensor(1.6312, grad_fn=<NllLossBackward0>)
tensor(1.6238, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2110/17426 [03:11<21:56, 11.64it/s]

tensor(1.6687, grad_fn=<NllLossBackward0>)
tensor(1.6265, grad_fn=<NllLossBackward0>)
tensor(1.6013, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2114/17426 [03:11<21:29, 11.88it/s]

tensor(1.6153, grad_fn=<NllLossBackward0>)
tensor(1.5940, grad_fn=<NllLossBackward0>)
tensor(1.6629, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2116/17426 [03:11<21:46, 11.71it/s]

tensor(1.6495, grad_fn=<NllLossBackward0>)
tensor(1.6496, grad_fn=<NllLossBackward0>)
tensor(1.5965, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2120/17426 [03:12<21:58, 11.61it/s]

tensor(1.5903, grad_fn=<NllLossBackward0>)
tensor(1.6342, grad_fn=<NllLossBackward0>)
tensor(1.6823, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2122/17426 [03:12<21:45, 11.73it/s]

tensor(1.6146, grad_fn=<NllLossBackward0>)
tensor(1.6334, grad_fn=<NllLossBackward0>)
tensor(1.6339, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2126/17426 [03:12<21:25, 11.90it/s]

tensor(1.6345, grad_fn=<NllLossBackward0>)
tensor(1.6202, grad_fn=<NllLossBackward0>)
tensor(1.6154, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2128/17426 [03:12<21:42, 11.75it/s]

tensor(1.6351, grad_fn=<NllLossBackward0>)
tensor(1.6060, grad_fn=<NllLossBackward0>)
tensor(1.6134, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2132/17426 [03:13<21:48, 11.69it/s]

tensor(1.6646, grad_fn=<NllLossBackward0>)
tensor(1.6105, grad_fn=<NllLossBackward0>)
tensor(1.6101, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2134/17426 [03:13<22:01, 11.58it/s]

tensor(1.5588, grad_fn=<NllLossBackward0>)
tensor(1.6064, grad_fn=<NllLossBackward0>)
tensor(1.6110, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2138/17426 [03:13<21:30, 11.85it/s]

tensor(1.6556, grad_fn=<NllLossBackward0>)
tensor(1.5697, grad_fn=<NllLossBackward0>)
tensor(1.6281, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2140/17426 [03:13<21:53, 11.64it/s]

tensor(1.6230, grad_fn=<NllLossBackward0>)
tensor(1.6017, grad_fn=<NllLossBackward0>)
tensor(1.6274, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2144/17426 [03:14<21:59, 11.59it/s]

tensor(1.6174, grad_fn=<NllLossBackward0>)
tensor(1.5782, grad_fn=<NllLossBackward0>)
tensor(1.6103, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2146/17426 [03:14<21:52, 11.64it/s]

tensor(1.6412, grad_fn=<NllLossBackward0>)
tensor(1.6383, grad_fn=<NllLossBackward0>)
tensor(1.6308, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2150/17426 [03:14<21:28, 11.86it/s]

tensor(1.6277, grad_fn=<NllLossBackward0>)
tensor(1.6490, grad_fn=<NllLossBackward0>)
tensor(1.6585, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2152/17426 [03:14<21:31, 11.83it/s]

tensor(1.6380, grad_fn=<NllLossBackward0>)
tensor(1.6069, grad_fn=<NllLossBackward0>)
tensor(1.6469, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2156/17426 [03:15<21:46, 11.69it/s]

tensor(1.6164, grad_fn=<NllLossBackward0>)
tensor(1.6311, grad_fn=<NllLossBackward0>)
tensor(1.5952, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2158/17426 [03:15<21:36, 11.77it/s]

tensor(1.6563, grad_fn=<NllLossBackward0>)
tensor(1.6086, grad_fn=<NllLossBackward0>)
tensor(1.6329, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2160/17426 [03:15<21:55, 11.60it/s]

tensor(1.6071, grad_fn=<NllLossBackward0>)
tensor(1.5633, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2162/17426 [03:15<25:50,  9.84it/s]

tensor(1.6112, grad_fn=<NllLossBackward0>)
tensor(1.6373, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2165/17426 [03:16<28:50,  8.82it/s]

tensor(1.6232, grad_fn=<NllLossBackward0>)
tensor(1.5891, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2167/17426 [03:16<30:45,  8.27it/s]

tensor(1.6418, grad_fn=<NllLossBackward0>)
tensor(1.5947, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2169/17426 [03:16<32:17,  7.87it/s]

tensor(1.6513, grad_fn=<NllLossBackward0>)
tensor(1.6306, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2171/17426 [03:17<32:23,  7.85it/s]

tensor(1.6102, grad_fn=<NllLossBackward0>)
tensor(1.5731, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2173/17426 [03:17<31:59,  7.94it/s]

tensor(1.5949, grad_fn=<NllLossBackward0>)
tensor(1.6103, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2175/17426 [03:17<33:50,  7.51it/s]

tensor(1.6123, grad_fn=<NllLossBackward0>)
tensor(1.6198, grad_fn=<NllLossBackward0>)


 12%|█▏        | 2177/17426 [03:17<34:57,  7.27it/s]

tensor(1.6174, grad_fn=<NllLossBackward0>)
tensor(1.6256, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2179/17426 [03:18<35:22,  7.18it/s]

tensor(1.6075, grad_fn=<NllLossBackward0>)
tensor(1.6802, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2182/17426 [03:18<29:58,  8.48it/s]

tensor(1.6182, grad_fn=<NllLossBackward0>)
tensor(1.6202, grad_fn=<NllLossBackward0>)
tensor(1.6278, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2184/17426 [03:18<26:43,  9.50it/s]

tensor(1.6676, grad_fn=<NllLossBackward0>)
tensor(1.6185, grad_fn=<NllLossBackward0>)
tensor(1.6649, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2188/17426 [03:18<23:17, 10.90it/s]

tensor(1.6192, grad_fn=<NllLossBackward0>)
tensor(1.6010, grad_fn=<NllLossBackward0>)
tensor(1.6168, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2190/17426 [03:19<22:49, 11.13it/s]

tensor(1.5887, grad_fn=<NllLossBackward0>)
tensor(1.6202, grad_fn=<NllLossBackward0>)
tensor(1.6272, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2194/17426 [03:19<22:03, 11.51it/s]

tensor(1.6728, grad_fn=<NllLossBackward0>)
tensor(1.6272, grad_fn=<NllLossBackward0>)
tensor(1.6266, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2196/17426 [03:19<21:53, 11.60it/s]

tensor(1.5823, grad_fn=<NllLossBackward0>)
tensor(1.5801, grad_fn=<NllLossBackward0>)
tensor(1.6484, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2200/17426 [03:19<21:32, 11.78it/s]

tensor(1.6616, grad_fn=<NllLossBackward0>)
tensor(1.6212, grad_fn=<NllLossBackward0>)
tensor(1.6172, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2202/17426 [03:20<21:34, 11.76it/s]

tensor(1.6020, grad_fn=<NllLossBackward0>)
tensor(1.6133, grad_fn=<NllLossBackward0>)
tensor(1.6463, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2206/17426 [03:20<21:31, 11.78it/s]

tensor(1.6432, grad_fn=<NllLossBackward0>)
tensor(1.6292, grad_fn=<NllLossBackward0>)
tensor(1.6204, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2208/17426 [03:20<21:28, 11.81it/s]

tensor(1.6496, grad_fn=<NllLossBackward0>)
tensor(1.6405, grad_fn=<NllLossBackward0>)
tensor(1.6483, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2212/17426 [03:20<21:31, 11.78it/s]

tensor(1.6090, grad_fn=<NllLossBackward0>)
tensor(1.5990, grad_fn=<NllLossBackward0>)
tensor(1.6355, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2214/17426 [03:21<21:29, 11.80it/s]

tensor(1.6417, grad_fn=<NllLossBackward0>)
tensor(1.6464, grad_fn=<NllLossBackward0>)
tensor(1.6157, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2218/17426 [03:21<21:32, 11.77it/s]

tensor(1.5934, grad_fn=<NllLossBackward0>)
tensor(1.6494, grad_fn=<NllLossBackward0>)
tensor(1.6370, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2220/17426 [03:21<21:29, 11.79it/s]

tensor(1.6008, grad_fn=<NllLossBackward0>)
tensor(1.5966, grad_fn=<NllLossBackward0>)
tensor(1.5906, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2224/17426 [03:21<21:10, 11.96it/s]

tensor(1.6366, grad_fn=<NllLossBackward0>)
tensor(1.6381, grad_fn=<NllLossBackward0>)
tensor(1.6104, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2226/17426 [03:22<21:20, 11.87it/s]

tensor(1.6430, grad_fn=<NllLossBackward0>)
tensor(1.5889, grad_fn=<NllLossBackward0>)
tensor(1.6414, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2230/17426 [03:22<21:35, 11.73it/s]

tensor(1.6210, grad_fn=<NllLossBackward0>)
tensor(1.5985, grad_fn=<NllLossBackward0>)
tensor(1.6427, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2232/17426 [03:22<21:38, 11.70it/s]

tensor(1.6341, grad_fn=<NllLossBackward0>)
tensor(1.6349, grad_fn=<NllLossBackward0>)
tensor(1.6170, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2236/17426 [03:23<21:05, 12.01it/s]

tensor(1.6018, grad_fn=<NllLossBackward0>)
tensor(1.6097, grad_fn=<NllLossBackward0>)
tensor(1.6061, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2238/17426 [03:23<21:15, 11.91it/s]

tensor(1.5999, grad_fn=<NllLossBackward0>)
tensor(1.6240, grad_fn=<NllLossBackward0>)
tensor(1.6069, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2242/17426 [03:23<21:21, 11.85it/s]

tensor(1.6035, grad_fn=<NllLossBackward0>)
tensor(1.5970, grad_fn=<NllLossBackward0>)
tensor(1.5805, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2244/17426 [03:23<21:45, 11.63it/s]

tensor(1.6988, grad_fn=<NllLossBackward0>)
tensor(1.6375, grad_fn=<NllLossBackward0>)
tensor(1.5860, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2248/17426 [03:24<21:37, 11.70it/s]

tensor(1.5886, grad_fn=<NllLossBackward0>)
tensor(1.6278, grad_fn=<NllLossBackward0>)
tensor(1.5883, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2250/17426 [03:24<21:55, 11.54it/s]

tensor(1.6229, grad_fn=<NllLossBackward0>)
tensor(1.6553, grad_fn=<NllLossBackward0>)
tensor(1.6285, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2254/17426 [03:24<21:57, 11.51it/s]

tensor(1.6097, grad_fn=<NllLossBackward0>)
tensor(1.5787, grad_fn=<NllLossBackward0>)
tensor(1.6141, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2256/17426 [03:24<22:11, 11.39it/s]

tensor(1.6315, grad_fn=<NllLossBackward0>)
tensor(1.6231, grad_fn=<NllLossBackward0>)
tensor(1.6586, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2260/17426 [03:25<21:41, 11.65it/s]

tensor(1.6159, grad_fn=<NllLossBackward0>)
tensor(1.6173, grad_fn=<NllLossBackward0>)
tensor(1.6134, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2262/17426 [03:25<21:42, 11.64it/s]

tensor(1.6168, grad_fn=<NllLossBackward0>)
tensor(1.5941, grad_fn=<NllLossBackward0>)
tensor(1.6017, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2266/17426 [03:25<21:47, 11.59it/s]

tensor(1.6910, grad_fn=<NllLossBackward0>)
tensor(1.6187, grad_fn=<NllLossBackward0>)
tensor(1.5771, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2268/17426 [03:25<21:50, 11.57it/s]

tensor(1.6341, grad_fn=<NllLossBackward0>)
tensor(1.6018, grad_fn=<NllLossBackward0>)
tensor(1.6198, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2272/17426 [03:26<21:19, 11.84it/s]

tensor(1.5984, grad_fn=<NllLossBackward0>)
tensor(1.5851, grad_fn=<NllLossBackward0>)
tensor(1.5949, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2274/17426 [03:26<21:38, 11.67it/s]

tensor(1.5980, grad_fn=<NllLossBackward0>)
tensor(1.6329, grad_fn=<NllLossBackward0>)
tensor(1.6392, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2278/17426 [03:26<21:43, 11.62it/s]

tensor(1.6033, grad_fn=<NllLossBackward0>)
tensor(1.6310, grad_fn=<NllLossBackward0>)
tensor(1.6083, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2280/17426 [03:26<21:41, 11.64it/s]

tensor(1.6320, grad_fn=<NllLossBackward0>)
tensor(1.6329, grad_fn=<NllLossBackward0>)
tensor(1.5664, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2284/17426 [03:27<21:29, 11.75it/s]

tensor(1.6303, grad_fn=<NllLossBackward0>)
tensor(1.5771, grad_fn=<NllLossBackward0>)
tensor(1.6215, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2286/17426 [03:27<21:34, 11.70it/s]

tensor(1.6015, grad_fn=<NllLossBackward0>)
tensor(1.6554, grad_fn=<NllLossBackward0>)
tensor(1.6631, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2290/17426 [03:27<21:47, 11.58it/s]

tensor(1.5951, grad_fn=<NllLossBackward0>)
tensor(1.5815, grad_fn=<NllLossBackward0>)
tensor(1.6473, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2292/17426 [03:27<21:33, 11.70it/s]

tensor(1.6213, grad_fn=<NllLossBackward0>)
tensor(1.6409, grad_fn=<NllLossBackward0>)
tensor(1.5888, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2296/17426 [03:28<21:22, 11.80it/s]

tensor(1.6151, grad_fn=<NllLossBackward0>)
tensor(1.6058, grad_fn=<NllLossBackward0>)
tensor(1.5913, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2298/17426 [03:28<22:22, 11.27it/s]

tensor(1.5930, grad_fn=<NllLossBackward0>)
tensor(1.5485, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2300/17426 [03:28<25:49,  9.76it/s]

tensor(1.6011, grad_fn=<NllLossBackward0>)
tensor(1.6065, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2302/17426 [03:28<27:13,  9.26it/s]

tensor(1.6035, grad_fn=<NllLossBackward0>)
tensor(1.6353, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2304/17426 [03:29<28:28,  8.85it/s]

tensor(1.6313, grad_fn=<NllLossBackward0>)
tensor(1.6281, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2306/17426 [03:29<28:59,  8.69it/s]

tensor(1.6036, grad_fn=<NllLossBackward0>)
tensor(1.5878, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2308/17426 [03:29<29:33,  8.53it/s]

tensor(1.5922, grad_fn=<NllLossBackward0>)
tensor(1.6220, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2310/17426 [03:29<30:52,  8.16it/s]

tensor(1.5809, grad_fn=<NllLossBackward0>)
tensor(1.6130, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2312/17426 [03:30<29:53,  8.43it/s]

tensor(1.6057, grad_fn=<NllLossBackward0>)
tensor(1.6241, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2314/17426 [03:30<31:43,  7.94it/s]

tensor(1.6334, grad_fn=<NllLossBackward0>)
tensor(1.6364, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2316/17426 [03:30<31:45,  7.93it/s]

tensor(1.6369, grad_fn=<NllLossBackward0>)
tensor(1.5935, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2318/17426 [03:30<33:38,  7.48it/s]

tensor(1.6156, grad_fn=<NllLossBackward0>)
tensor(1.6310, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2320/17426 [03:31<34:22,  7.32it/s]

tensor(1.5756, grad_fn=<NllLossBackward0>)
tensor(1.5938, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2323/17426 [03:31<27:14,  9.24it/s]

tensor(1.6051, grad_fn=<NllLossBackward0>)
tensor(1.6343, grad_fn=<NllLossBackward0>)
tensor(1.6575, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2325/17426 [03:31<24:56, 10.09it/s]

tensor(1.6209, grad_fn=<NllLossBackward0>)
tensor(1.6059, grad_fn=<NllLossBackward0>)
tensor(1.5925, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2329/17426 [03:31<22:58, 10.96it/s]

tensor(1.5830, grad_fn=<NllLossBackward0>)
tensor(1.6729, grad_fn=<NllLossBackward0>)
tensor(1.6024, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2331/17426 [03:32<22:50, 11.02it/s]

tensor(1.6149, grad_fn=<NllLossBackward0>)
tensor(1.6119, grad_fn=<NllLossBackward0>)
tensor(1.6024, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2335/17426 [03:32<21:43, 11.58it/s]

tensor(1.6122, grad_fn=<NllLossBackward0>)
tensor(1.5888, grad_fn=<NllLossBackward0>)
tensor(1.5899, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2337/17426 [03:32<21:59, 11.44it/s]

tensor(1.6241, grad_fn=<NllLossBackward0>)
tensor(1.6077, grad_fn=<NllLossBackward0>)
tensor(1.6311, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2341/17426 [03:32<21:33, 11.67it/s]

tensor(1.6300, grad_fn=<NllLossBackward0>)
tensor(1.6231, grad_fn=<NllLossBackward0>)
tensor(1.5856, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2343/17426 [03:33<21:39, 11.61it/s]

tensor(1.6077, grad_fn=<NllLossBackward0>)
tensor(1.6461, grad_fn=<NllLossBackward0>)
tensor(1.6033, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2347/17426 [03:33<21:17, 11.80it/s]

tensor(1.5979, grad_fn=<NllLossBackward0>)
tensor(1.6022, grad_fn=<NllLossBackward0>)
tensor(1.5783, grad_fn=<NllLossBackward0>)


 13%|█▎        | 2349/17426 [03:33<21:15, 11.82it/s]

tensor(1.6292, grad_fn=<NllLossBackward0>)
tensor(1.6424, grad_fn=<NllLossBackward0>)
tensor(1.5885, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2353/17426 [03:34<21:24, 11.73it/s]

tensor(1.5965, grad_fn=<NllLossBackward0>)
tensor(1.5849, grad_fn=<NllLossBackward0>)
tensor(1.5914, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2355/17426 [03:34<21:31, 11.67it/s]

tensor(1.6094, grad_fn=<NllLossBackward0>)
tensor(1.6512, grad_fn=<NllLossBackward0>)
tensor(1.6002, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2359/17426 [03:34<21:06, 11.90it/s]

tensor(1.5632, grad_fn=<NllLossBackward0>)
tensor(1.5928, grad_fn=<NllLossBackward0>)
tensor(1.6332, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2361/17426 [03:34<21:13, 11.83it/s]

tensor(1.6451, grad_fn=<NllLossBackward0>)
tensor(1.5952, grad_fn=<NllLossBackward0>)
tensor(1.5836, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2365/17426 [03:35<21:30, 11.67it/s]

tensor(1.6341, grad_fn=<NllLossBackward0>)
tensor(1.6468, grad_fn=<NllLossBackward0>)
tensor(1.6019, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2367/17426 [03:35<21:30, 11.67it/s]

tensor(1.6625, grad_fn=<NllLossBackward0>)
tensor(1.6329, grad_fn=<NllLossBackward0>)
tensor(1.6499, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2371/17426 [03:35<21:07, 11.88it/s]

tensor(1.6132, grad_fn=<NllLossBackward0>)
tensor(1.6432, grad_fn=<NllLossBackward0>)
tensor(1.6109, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2373/17426 [03:35<21:26, 11.70it/s]

tensor(1.6014, grad_fn=<NllLossBackward0>)
tensor(1.6154, grad_fn=<NllLossBackward0>)
tensor(1.5854, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2377/17426 [03:36<21:51, 11.48it/s]

tensor(1.5647, grad_fn=<NllLossBackward0>)
tensor(1.5547, grad_fn=<NllLossBackward0>)
tensor(1.6061, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2379/17426 [03:36<21:49, 11.49it/s]

tensor(1.5675, grad_fn=<NllLossBackward0>)
tensor(1.6048, grad_fn=<NllLossBackward0>)
tensor(1.6016, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2383/17426 [03:36<21:21, 11.74it/s]

tensor(1.6286, grad_fn=<NllLossBackward0>)
tensor(1.5765, grad_fn=<NllLossBackward0>)
tensor(1.6347, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2385/17426 [03:36<21:12, 11.82it/s]

tensor(1.6106, grad_fn=<NllLossBackward0>)
tensor(1.6182, grad_fn=<NllLossBackward0>)
tensor(1.5718, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2389/17426 [03:37<21:38, 11.58it/s]

tensor(1.5705, grad_fn=<NllLossBackward0>)
tensor(1.6334, grad_fn=<NllLossBackward0>)
tensor(1.6248, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2391/17426 [03:37<21:38, 11.58it/s]

tensor(1.6441, grad_fn=<NllLossBackward0>)
tensor(1.6014, grad_fn=<NllLossBackward0>)
tensor(1.6190, grad_fn=<NllLossBackward0>)


 14%|█▎        | 2395/17426 [03:37<21:11, 11.82it/s]

tensor(1.6041, grad_fn=<NllLossBackward0>)
tensor(1.6141, grad_fn=<NllLossBackward0>)
tensor(1.6004, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2397/17426 [03:37<21:26, 11.68it/s]

tensor(1.5820, grad_fn=<NllLossBackward0>)
tensor(1.6169, grad_fn=<NllLossBackward0>)
tensor(1.6073, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2401/17426 [03:38<21:28, 11.66it/s]

tensor(1.6420, grad_fn=<NllLossBackward0>)
tensor(1.6335, grad_fn=<NllLossBackward0>)
tensor(1.6001, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2403/17426 [03:38<21:14, 11.79it/s]

tensor(1.6145, grad_fn=<NllLossBackward0>)
tensor(1.6043, grad_fn=<NllLossBackward0>)
tensor(1.5897, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2407/17426 [03:38<21:44, 11.51it/s]

tensor(1.5917, grad_fn=<NllLossBackward0>)
tensor(1.6473, grad_fn=<NllLossBackward0>)
tensor(1.6290, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2409/17426 [03:38<21:36, 11.58it/s]

tensor(1.5756, grad_fn=<NllLossBackward0>)
tensor(1.6109, grad_fn=<NllLossBackward0>)
tensor(1.6054, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2413/17426 [03:39<21:33, 11.60it/s]

tensor(1.6006, grad_fn=<NllLossBackward0>)
tensor(1.6115, grad_fn=<NllLossBackward0>)
tensor(1.6068, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2415/17426 [03:39<21:43, 11.51it/s]

tensor(1.6316, grad_fn=<NllLossBackward0>)
tensor(1.6324, grad_fn=<NllLossBackward0>)
tensor(1.6065, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2419/17426 [03:39<21:01, 11.89it/s]

tensor(1.5924, grad_fn=<NllLossBackward0>)
tensor(1.6429, grad_fn=<NllLossBackward0>)
tensor(1.6132, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2421/17426 [03:39<21:10, 11.81it/s]

tensor(1.5691, grad_fn=<NllLossBackward0>)
tensor(1.6147, grad_fn=<NllLossBackward0>)
tensor(1.5648, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2425/17426 [03:40<21:33, 11.60it/s]

tensor(1.6242, grad_fn=<NllLossBackward0>)
tensor(1.6473, grad_fn=<NllLossBackward0>)
tensor(1.5943, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2427/17426 [03:40<21:34, 11.58it/s]

tensor(1.6114, grad_fn=<NllLossBackward0>)
tensor(1.6078, grad_fn=<NllLossBackward0>)
tensor(1.5933, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2431/17426 [03:40<21:01, 11.89it/s]

tensor(1.6382, grad_fn=<NllLossBackward0>)
tensor(1.5582, grad_fn=<NllLossBackward0>)
tensor(1.5728, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2433/17426 [03:40<21:20, 11.71it/s]

tensor(1.5797, grad_fn=<NllLossBackward0>)
tensor(1.6291, grad_fn=<NllLossBackward0>)
tensor(1.6039, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2437/17426 [03:41<21:42, 11.51it/s]

tensor(1.6039, grad_fn=<NllLossBackward0>)
tensor(1.5903, grad_fn=<NllLossBackward0>)
tensor(1.6042, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2439/17426 [03:41<24:44, 10.09it/s]

tensor(1.6071, grad_fn=<NllLossBackward0>)
tensor(1.5982, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2441/17426 [03:41<26:41,  9.36it/s]

tensor(1.6020, grad_fn=<NllLossBackward0>)
tensor(1.6137, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2443/17426 [03:42<27:47,  8.99it/s]

tensor(1.5724, grad_fn=<NllLossBackward0>)
tensor(1.6207, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2445/17426 [03:42<29:53,  8.35it/s]

tensor(1.5987, grad_fn=<NllLossBackward0>)
tensor(1.6141, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2447/17426 [03:42<29:48,  8.38it/s]

tensor(1.6286, grad_fn=<NllLossBackward0>)
tensor(1.6123, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2449/17426 [03:42<30:30,  8.18it/s]

tensor(1.6030, grad_fn=<NllLossBackward0>)
tensor(1.6573, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2451/17426 [03:43<30:28,  8.19it/s]

tensor(1.5834, grad_fn=<NllLossBackward0>)
tensor(1.6167, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2453/17426 [03:43<32:03,  7.78it/s]

tensor(1.6429, grad_fn=<NllLossBackward0>)
tensor(1.6397, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2455/17426 [03:43<31:53,  7.82it/s]

tensor(1.6254, grad_fn=<NllLossBackward0>)
tensor(1.6280, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2457/17426 [03:43<33:32,  7.44it/s]

tensor(1.5966, grad_fn=<NllLossBackward0>)
tensor(1.5929, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2459/17426 [03:44<34:24,  7.25it/s]

tensor(1.5967, grad_fn=<NllLossBackward0>)
tensor(1.6367, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2462/17426 [03:44<27:34,  9.04it/s]

tensor(1.6098, grad_fn=<NllLossBackward0>)
tensor(1.6165, grad_fn=<NllLossBackward0>)
tensor(1.5859, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2464/17426 [03:44<25:10,  9.91it/s]

tensor(1.6157, grad_fn=<NllLossBackward0>)
tensor(1.5942, grad_fn=<NllLossBackward0>)
tensor(1.5902, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2468/17426 [03:44<22:43, 10.97it/s]

tensor(1.6168, grad_fn=<NllLossBackward0>)
tensor(1.6619, grad_fn=<NllLossBackward0>)
tensor(1.6625, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2470/17426 [03:45<22:06, 11.28it/s]

tensor(1.5800, grad_fn=<NllLossBackward0>)
tensor(1.5729, grad_fn=<NllLossBackward0>)
tensor(1.5827, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2474/17426 [03:45<22:06, 11.27it/s]

tensor(1.5971, grad_fn=<NllLossBackward0>)
tensor(1.6212, grad_fn=<NllLossBackward0>)
tensor(1.6308, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2476/17426 [03:45<21:33, 11.56it/s]

tensor(1.5976, grad_fn=<NllLossBackward0>)
tensor(1.5861, grad_fn=<NllLossBackward0>)
tensor(1.5795, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2480/17426 [03:45<21:09, 11.77it/s]

tensor(1.5948, grad_fn=<NllLossBackward0>)
tensor(1.5815, grad_fn=<NllLossBackward0>)
tensor(1.6191, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2482/17426 [03:46<21:01, 11.84it/s]

tensor(1.5291, grad_fn=<NllLossBackward0>)
tensor(1.6180, grad_fn=<NllLossBackward0>)
tensor(1.6010, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2486/17426 [03:46<21:30, 11.58it/s]

tensor(1.6322, grad_fn=<NllLossBackward0>)
tensor(1.5890, grad_fn=<NllLossBackward0>)
tensor(1.5620, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2488/17426 [03:46<21:21, 11.65it/s]

tensor(1.6071, grad_fn=<NllLossBackward0>)
tensor(1.5856, grad_fn=<NllLossBackward0>)
tensor(1.5925, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2492/17426 [03:46<20:52, 11.92it/s]

tensor(1.5521, grad_fn=<NllLossBackward0>)
tensor(1.6326, grad_fn=<NllLossBackward0>)
tensor(1.6140, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2494/17426 [03:47<21:04, 11.81it/s]

tensor(1.6310, grad_fn=<NllLossBackward0>)
tensor(1.6067, grad_fn=<NllLossBackward0>)
tensor(1.5924, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2498/17426 [03:47<21:22, 11.64it/s]

tensor(1.5956, grad_fn=<NllLossBackward0>)
tensor(1.6232, grad_fn=<NllLossBackward0>)
tensor(1.6104, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2500/17426 [03:47<21:34, 11.53it/s]

tensor(1.6312, grad_fn=<NllLossBackward0>)
tensor(1.5738, grad_fn=<NllLossBackward0>)
tensor(1.6407, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2504/17426 [03:47<21:17, 11.68it/s]

tensor(1.6140, grad_fn=<NllLossBackward0>)
tensor(1.5971, grad_fn=<NllLossBackward0>)
tensor(1.5638, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2506/17426 [03:48<21:13, 11.71it/s]

tensor(1.6247, grad_fn=<NllLossBackward0>)
tensor(1.6428, grad_fn=<NllLossBackward0>)
tensor(1.5991, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2510/17426 [03:48<21:14, 11.70it/s]

tensor(1.5819, grad_fn=<NllLossBackward0>)
tensor(1.5694, grad_fn=<NllLossBackward0>)
tensor(1.6114, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2512/17426 [03:48<21:07, 11.77it/s]

tensor(1.6241, grad_fn=<NllLossBackward0>)
tensor(1.6404, grad_fn=<NllLossBackward0>)
tensor(1.5595, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2516/17426 [03:48<20:53, 11.89it/s]

tensor(1.5918, grad_fn=<NllLossBackward0>)
tensor(1.6272, grad_fn=<NllLossBackward0>)
tensor(1.5881, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2518/17426 [03:49<21:00, 11.83it/s]

tensor(1.6006, grad_fn=<NllLossBackward0>)
tensor(1.6200, grad_fn=<NllLossBackward0>)
tensor(1.5896, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2522/17426 [03:49<21:13, 11.70it/s]

tensor(1.5507, grad_fn=<NllLossBackward0>)
tensor(1.6231, grad_fn=<NllLossBackward0>)
tensor(1.6331, grad_fn=<NllLossBackward0>)


 14%|█▍        | 2524/17426 [03:49<21:14, 11.70it/s]

tensor(1.6477, grad_fn=<NllLossBackward0>)
tensor(1.6122, grad_fn=<NllLossBackward0>)
tensor(1.6354, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2528/17426 [03:50<20:56, 11.85it/s]

tensor(1.5781, grad_fn=<NllLossBackward0>)
tensor(1.6097, grad_fn=<NllLossBackward0>)
tensor(1.5945, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2530/17426 [03:50<21:17, 11.66it/s]

tensor(1.5913, grad_fn=<NllLossBackward0>)
tensor(1.5973, grad_fn=<NllLossBackward0>)
tensor(1.6050, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2534/17426 [03:50<21:31, 11.53it/s]

tensor(1.6177, grad_fn=<NllLossBackward0>)
tensor(1.5686, grad_fn=<NllLossBackward0>)
tensor(1.5882, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2536/17426 [03:50<21:23, 11.60it/s]

tensor(1.5747, grad_fn=<NllLossBackward0>)
tensor(1.5990, grad_fn=<NllLossBackward0>)
tensor(1.6004, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2540/17426 [03:51<20:42, 11.98it/s]

tensor(1.6008, grad_fn=<NllLossBackward0>)
tensor(1.5873, grad_fn=<NllLossBackward0>)
tensor(1.5964, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2542/17426 [03:51<21:02, 11.79it/s]

tensor(1.5895, grad_fn=<NllLossBackward0>)
tensor(1.5686, grad_fn=<NllLossBackward0>)
tensor(1.6643, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2546/17426 [03:51<21:22, 11.60it/s]

tensor(1.5824, grad_fn=<NllLossBackward0>)
tensor(1.6055, grad_fn=<NllLossBackward0>)
tensor(1.5827, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2548/17426 [03:51<21:15, 11.66it/s]

tensor(1.6492, grad_fn=<NllLossBackward0>)
tensor(1.5784, grad_fn=<NllLossBackward0>)
tensor(1.6075, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2552/17426 [03:52<20:49, 11.91it/s]

tensor(1.6190, grad_fn=<NllLossBackward0>)
tensor(1.6174, grad_fn=<NllLossBackward0>)
tensor(1.6417, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2554/17426 [03:52<21:14, 11.67it/s]

tensor(1.5992, grad_fn=<NllLossBackward0>)
tensor(1.6117, grad_fn=<NllLossBackward0>)
tensor(1.6090, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2558/17426 [03:52<21:38, 11.45it/s]

tensor(1.6248, grad_fn=<NllLossBackward0>)
tensor(1.6062, grad_fn=<NllLossBackward0>)
tensor(1.5721, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2560/17426 [03:52<21:26, 11.55it/s]

tensor(1.5897, grad_fn=<NllLossBackward0>)
tensor(1.6162, grad_fn=<NllLossBackward0>)
tensor(1.5744, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2564/17426 [03:53<21:10, 11.70it/s]

tensor(1.6184, grad_fn=<NllLossBackward0>)
tensor(1.5919, grad_fn=<NllLossBackward0>)
tensor(1.6060, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2566/17426 [03:53<21:08, 11.71it/s]

tensor(1.6147, grad_fn=<NllLossBackward0>)
tensor(1.5913, grad_fn=<NllLossBackward0>)
tensor(1.6279, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2570/17426 [03:53<21:39, 11.44it/s]

tensor(1.5717, grad_fn=<NllLossBackward0>)
tensor(1.5756, grad_fn=<NllLossBackward0>)
tensor(1.6377, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2572/17426 [03:53<21:28, 11.53it/s]

tensor(1.6188, grad_fn=<NllLossBackward0>)
tensor(1.6364, grad_fn=<NllLossBackward0>)
tensor(1.5952, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2576/17426 [03:54<20:55, 11.83it/s]

tensor(1.5869, grad_fn=<NllLossBackward0>)
tensor(1.6194, grad_fn=<NllLossBackward0>)
tensor(1.5762, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2578/17426 [03:54<24:33, 10.08it/s]

tensor(1.6406, grad_fn=<NllLossBackward0>)
tensor(1.6266, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2580/17426 [03:54<27:18,  9.06it/s]

tensor(1.6364, grad_fn=<NllLossBackward0>)
tensor(1.5559, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2582/17426 [03:54<28:13,  8.76it/s]

tensor(1.6025, grad_fn=<NllLossBackward0>)
tensor(1.5844, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2584/17426 [03:55<29:44,  8.32it/s]

tensor(1.5725, grad_fn=<NllLossBackward0>)
tensor(1.6109, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2586/17426 [03:55<30:18,  8.16it/s]

tensor(1.6429, grad_fn=<NllLossBackward0>)
tensor(1.5926, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2588/17426 [03:55<31:34,  7.83it/s]

tensor(1.5547, grad_fn=<NllLossBackward0>)
tensor(1.5954, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2590/17426 [03:55<30:43,  8.05it/s]

tensor(1.6353, grad_fn=<NllLossBackward0>)
tensor(1.5662, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2592/17426 [03:56<32:00,  7.73it/s]

tensor(1.6144, grad_fn=<NllLossBackward0>)
tensor(1.6575, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2594/17426 [03:56<31:28,  7.85it/s]

tensor(1.6112, grad_fn=<NllLossBackward0>)
tensor(1.5918, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2596/17426 [03:56<32:11,  7.68it/s]

tensor(1.6040, grad_fn=<NllLossBackward0>)
tensor(1.6134, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2598/17426 [03:57<32:25,  7.62it/s]

tensor(1.6460, grad_fn=<NllLossBackward0>)
tensor(1.5804, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2601/17426 [03:57<26:53,  9.19it/s]

tensor(1.6107, grad_fn=<NllLossBackward0>)
tensor(1.5689, grad_fn=<NllLossBackward0>)
tensor(1.5875, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2603/17426 [03:57<24:34, 10.05it/s]

tensor(1.6215, grad_fn=<NllLossBackward0>)
tensor(1.6154, grad_fn=<NllLossBackward0>)
tensor(1.5713, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2607/17426 [03:57<22:33, 10.94it/s]

tensor(1.6559, grad_fn=<NllLossBackward0>)
tensor(1.6077, grad_fn=<NllLossBackward0>)
tensor(1.6074, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2609/17426 [03:57<22:04, 11.18it/s]

tensor(1.5709, grad_fn=<NllLossBackward0>)
tensor(1.6042, grad_fn=<NllLossBackward0>)
tensor(1.6167, grad_fn=<NllLossBackward0>)


 15%|█▍        | 2613/17426 [03:58<21:03, 11.72it/s]

tensor(1.5743, grad_fn=<NllLossBackward0>)
tensor(1.5722, grad_fn=<NllLossBackward0>)
tensor(1.6465, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2615/17426 [03:58<21:06, 11.70it/s]

tensor(1.6040, grad_fn=<NllLossBackward0>)
tensor(1.5628, grad_fn=<NllLossBackward0>)
tensor(1.5933, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2619/17426 [03:58<21:14, 11.62it/s]

tensor(1.6215, grad_fn=<NllLossBackward0>)
tensor(1.6134, grad_fn=<NllLossBackward0>)
tensor(1.6077, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2621/17426 [03:59<21:22, 11.54it/s]

tensor(1.6233, grad_fn=<NllLossBackward0>)
tensor(1.5948, grad_fn=<NllLossBackward0>)
tensor(1.6196, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2625/17426 [03:59<21:03, 11.71it/s]

tensor(1.6093, grad_fn=<NllLossBackward0>)
tensor(1.6265, grad_fn=<NllLossBackward0>)
tensor(1.6472, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2627/17426 [03:59<20:50, 11.83it/s]

tensor(1.5839, grad_fn=<NllLossBackward0>)
tensor(1.5628, grad_fn=<NllLossBackward0>)
tensor(1.5863, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2631/17426 [03:59<21:13, 11.62it/s]

tensor(1.6046, grad_fn=<NllLossBackward0>)
tensor(1.5494, grad_fn=<NllLossBackward0>)
tensor(1.6177, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2633/17426 [04:00<21:11, 11.64it/s]

tensor(1.6536, grad_fn=<NllLossBackward0>)
tensor(1.5942, grad_fn=<NllLossBackward0>)
tensor(1.5811, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2637/17426 [04:00<20:57, 11.76it/s]

tensor(1.5970, grad_fn=<NllLossBackward0>)
tensor(1.6468, grad_fn=<NllLossBackward0>)
tensor(1.5879, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2639/17426 [04:00<21:00, 11.73it/s]

tensor(1.6182, grad_fn=<NllLossBackward0>)
tensor(1.6010, grad_fn=<NllLossBackward0>)
tensor(1.5992, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2643/17426 [04:00<21:14, 11.60it/s]

tensor(1.6329, grad_fn=<NllLossBackward0>)
tensor(1.5749, grad_fn=<NllLossBackward0>)
tensor(1.6011, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2645/17426 [04:01<21:03, 11.70it/s]

tensor(1.5819, grad_fn=<NllLossBackward0>)
tensor(1.5962, grad_fn=<NllLossBackward0>)
tensor(1.6053, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2649/17426 [04:01<20:36, 11.95it/s]

tensor(1.6014, grad_fn=<NllLossBackward0>)
tensor(1.6322, grad_fn=<NllLossBackward0>)
tensor(1.5548, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2651/17426 [04:01<20:38, 11.93it/s]

tensor(1.5818, grad_fn=<NllLossBackward0>)
tensor(1.6137, grad_fn=<NllLossBackward0>)
tensor(1.6028, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2655/17426 [04:01<21:01, 11.71it/s]

tensor(1.5667, grad_fn=<NllLossBackward0>)
tensor(1.6147, grad_fn=<NllLossBackward0>)
tensor(1.6011, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2657/17426 [04:02<20:47, 11.84it/s]

tensor(1.6222, grad_fn=<NllLossBackward0>)
tensor(1.5777, grad_fn=<NllLossBackward0>)
tensor(1.6499, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2661/17426 [04:02<20:48, 11.83it/s]

tensor(1.5991, grad_fn=<NllLossBackward0>)
tensor(1.6336, grad_fn=<NllLossBackward0>)
tensor(1.5414, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2663/17426 [04:02<20:55, 11.76it/s]

tensor(1.6119, grad_fn=<NllLossBackward0>)
tensor(1.6091, grad_fn=<NllLossBackward0>)
tensor(1.6054, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2667/17426 [04:02<21:09, 11.63it/s]

tensor(1.5519, grad_fn=<NllLossBackward0>)
tensor(1.6144, grad_fn=<NllLossBackward0>)
tensor(1.5780, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2669/17426 [04:03<21:04, 11.67it/s]

tensor(1.6086, grad_fn=<NllLossBackward0>)
tensor(1.6001, grad_fn=<NllLossBackward0>)
tensor(1.5512, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2673/17426 [04:03<20:41, 11.88it/s]

tensor(1.6243, grad_fn=<NllLossBackward0>)
tensor(1.5819, grad_fn=<NllLossBackward0>)
tensor(1.5704, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2675/17426 [04:03<20:44, 11.85it/s]

tensor(1.5783, grad_fn=<NllLossBackward0>)
tensor(1.5730, grad_fn=<NllLossBackward0>)
tensor(1.5873, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2679/17426 [04:03<21:02, 11.68it/s]

tensor(1.5790, grad_fn=<NllLossBackward0>)
tensor(1.5303, grad_fn=<NllLossBackward0>)
tensor(1.6408, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2681/17426 [04:04<20:52, 11.77it/s]

tensor(1.5659, grad_fn=<NllLossBackward0>)
tensor(1.5846, grad_fn=<NllLossBackward0>)
tensor(1.5805, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2685/17426 [04:04<20:35, 11.93it/s]

tensor(1.6431, grad_fn=<NllLossBackward0>)
tensor(1.6000, grad_fn=<NllLossBackward0>)
tensor(1.6120, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2687/17426 [04:04<20:39, 11.89it/s]

tensor(1.5904, grad_fn=<NllLossBackward0>)
tensor(1.5730, grad_fn=<NllLossBackward0>)
tensor(1.5740, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2691/17426 [04:04<21:05, 11.65it/s]

tensor(1.5758, grad_fn=<NllLossBackward0>)
tensor(1.6048, grad_fn=<NllLossBackward0>)
tensor(1.6147, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2693/17426 [04:05<21:13, 11.57it/s]

tensor(1.5818, grad_fn=<NllLossBackward0>)
tensor(1.5378, grad_fn=<NllLossBackward0>)
tensor(1.5696, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2697/17426 [04:05<20:39, 11.88it/s]

tensor(1.5618, grad_fn=<NllLossBackward0>)
tensor(1.6090, grad_fn=<NllLossBackward0>)
tensor(1.5864, grad_fn=<NllLossBackward0>)


 15%|█▌        | 2699/17426 [04:05<20:57, 11.71it/s]

tensor(1.5633, grad_fn=<NllLossBackward0>)
tensor(1.5857, grad_fn=<NllLossBackward0>)
tensor(1.6293, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2703/17426 [04:05<20:48, 11.79it/s]

tensor(1.5702, grad_fn=<NllLossBackward0>)
tensor(1.6064, grad_fn=<NllLossBackward0>)
tensor(1.5722, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2705/17426 [04:06<20:57, 11.71it/s]

tensor(1.6058, grad_fn=<NllLossBackward0>)
tensor(1.6442, grad_fn=<NllLossBackward0>)
tensor(1.6230, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2709/17426 [04:06<20:22, 12.03it/s]

tensor(1.5994, grad_fn=<NllLossBackward0>)
tensor(1.5833, grad_fn=<NllLossBackward0>)
tensor(1.6163, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2711/17426 [04:06<20:46, 11.80it/s]

tensor(1.5463, grad_fn=<NllLossBackward0>)
tensor(1.5808, grad_fn=<NllLossBackward0>)
tensor(1.6158, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2715/17426 [04:06<20:33, 11.92it/s]

tensor(1.6418, grad_fn=<NllLossBackward0>)
tensor(1.5627, grad_fn=<NllLossBackward0>)
tensor(1.6129, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2717/17426 [04:07<23:00, 10.65it/s]

tensor(1.5675, grad_fn=<NllLossBackward0>)
tensor(1.6002, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2719/17426 [04:07<25:38,  9.56it/s]

tensor(1.5793, grad_fn=<NllLossBackward0>)
tensor(1.5315, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2721/17426 [04:07<26:35,  9.22it/s]

tensor(1.5705, grad_fn=<NllLossBackward0>)
tensor(1.5606, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2723/17426 [04:07<27:49,  8.81it/s]

tensor(1.5595, grad_fn=<NllLossBackward0>)
tensor(1.6094, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2725/17426 [04:08<29:01,  8.44it/s]

tensor(1.6299, grad_fn=<NllLossBackward0>)
tensor(1.5447, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2727/17426 [04:08<29:42,  8.25it/s]

tensor(1.5887, grad_fn=<NllLossBackward0>)
tensor(1.5526, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2729/17426 [04:08<31:03,  7.89it/s]

tensor(1.5859, grad_fn=<NllLossBackward0>)
tensor(1.5480, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2731/17426 [04:08<30:45,  7.96it/s]

tensor(1.5715, grad_fn=<NllLossBackward0>)
tensor(1.6063, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2733/17426 [04:09<31:54,  7.68it/s]

tensor(1.6130, grad_fn=<NllLossBackward0>)
tensor(1.6198, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2735/17426 [04:09<30:58,  7.90it/s]

tensor(1.5635, grad_fn=<NllLossBackward0>)
tensor(1.6005, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2737/17426 [04:09<32:18,  7.58it/s]

tensor(1.6310, grad_fn=<NllLossBackward0>)
tensor(1.5859, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2739/17426 [04:10<33:07,  7.39it/s]

tensor(1.5921, grad_fn=<NllLossBackward0>)
tensor(1.6379, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2742/17426 [04:10<26:32,  9.22it/s]

tensor(1.5948, grad_fn=<NllLossBackward0>)
tensor(1.5762, grad_fn=<NllLossBackward0>)
tensor(1.5868, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2745/17426 [04:10<23:30, 10.41it/s]

tensor(1.6188, grad_fn=<NllLossBackward0>)
tensor(1.6276, grad_fn=<NllLossBackward0>)
tensor(1.6256, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2747/17426 [04:10<23:10, 10.55it/s]

tensor(1.6313, grad_fn=<NllLossBackward0>)
tensor(1.5811, grad_fn=<NllLossBackward0>)
tensor(1.5756, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2751/17426 [04:11<21:31, 11.36it/s]

tensor(1.5962, grad_fn=<NllLossBackward0>)
tensor(1.6198, grad_fn=<NllLossBackward0>)
tensor(1.6074, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2753/17426 [04:11<21:52, 11.18it/s]

tensor(1.6044, grad_fn=<NllLossBackward0>)
tensor(1.6414, grad_fn=<NllLossBackward0>)
tensor(1.6348, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2757/17426 [04:11<21:16, 11.49it/s]

tensor(1.5685, grad_fn=<NllLossBackward0>)
tensor(1.5820, grad_fn=<NllLossBackward0>)
tensor(1.5884, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2759/17426 [04:11<21:06, 11.58it/s]

tensor(1.5554, grad_fn=<NllLossBackward0>)
tensor(1.6235, grad_fn=<NllLossBackward0>)
tensor(1.6130, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2763/17426 [04:12<20:38, 11.84it/s]

tensor(1.5615, grad_fn=<NllLossBackward0>)
tensor(1.5821, grad_fn=<NllLossBackward0>)
tensor(1.6377, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2765/17426 [04:12<21:03, 11.60it/s]

tensor(1.5753, grad_fn=<NllLossBackward0>)
tensor(1.6230, grad_fn=<NllLossBackward0>)
tensor(1.6037, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2769/17426 [04:12<20:53, 11.69it/s]

tensor(1.5961, grad_fn=<NllLossBackward0>)
tensor(1.5848, grad_fn=<NllLossBackward0>)
tensor(1.5464, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2771/17426 [04:12<20:45, 11.77it/s]

tensor(1.6117, grad_fn=<NllLossBackward0>)
tensor(1.5453, grad_fn=<NllLossBackward0>)
tensor(1.5992, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2775/17426 [04:13<20:20, 12.01it/s]

tensor(1.5793, grad_fn=<NllLossBackward0>)
tensor(1.5496, grad_fn=<NllLossBackward0>)
tensor(1.5944, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2777/17426 [04:13<20:53, 11.69it/s]

tensor(1.6042, grad_fn=<NllLossBackward0>)
tensor(1.5843, grad_fn=<NllLossBackward0>)
tensor(1.5805, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2781/17426 [04:13<20:34, 11.87it/s]

tensor(1.5891, grad_fn=<NllLossBackward0>)
tensor(1.6314, grad_fn=<NllLossBackward0>)
tensor(1.5839, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2783/17426 [04:13<20:54, 11.67it/s]

tensor(1.5946, grad_fn=<NllLossBackward0>)
tensor(1.5936, grad_fn=<NllLossBackward0>)
tensor(1.5930, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2787/17426 [04:14<20:35, 11.85it/s]

tensor(1.6094, grad_fn=<NllLossBackward0>)
tensor(1.5365, grad_fn=<NllLossBackward0>)
tensor(1.5720, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2789/17426 [04:14<20:54, 11.66it/s]

tensor(1.6254, grad_fn=<NllLossBackward0>)
tensor(1.5980, grad_fn=<NllLossBackward0>)
tensor(1.5860, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2793/17426 [04:14<20:30, 11.89it/s]

tensor(1.5887, grad_fn=<NllLossBackward0>)
tensor(1.5785, grad_fn=<NllLossBackward0>)
tensor(1.6180, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2795/17426 [04:14<20:38, 11.82it/s]

tensor(1.6328, grad_fn=<NllLossBackward0>)
tensor(1.5924, grad_fn=<NllLossBackward0>)
tensor(1.6015, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2799/17426 [04:15<20:34, 11.84it/s]

tensor(1.5586, grad_fn=<NllLossBackward0>)
tensor(1.5631, grad_fn=<NllLossBackward0>)
tensor(1.5801, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2801/17426 [04:15<21:07, 11.54it/s]

tensor(1.6019, grad_fn=<NllLossBackward0>)
tensor(1.6016, grad_fn=<NllLossBackward0>)
tensor(1.5842, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2805/17426 [04:15<21:00, 11.60it/s]

tensor(1.6225, grad_fn=<NllLossBackward0>)
tensor(1.5728, grad_fn=<NllLossBackward0>)
tensor(1.6005, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2807/17426 [04:15<21:05, 11.55it/s]

tensor(1.5764, grad_fn=<NllLossBackward0>)
tensor(1.5884, grad_fn=<NllLossBackward0>)
tensor(1.5979, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2811/17426 [04:16<20:33, 11.85it/s]

tensor(1.5926, grad_fn=<NllLossBackward0>)
tensor(1.6600, grad_fn=<NllLossBackward0>)
tensor(1.6190, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2813/17426 [04:16<21:04, 11.56it/s]

tensor(1.5822, grad_fn=<NllLossBackward0>)
tensor(1.6030, grad_fn=<NllLossBackward0>)
tensor(1.5969, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2817/17426 [04:16<20:31, 11.87it/s]

tensor(1.5630, grad_fn=<NllLossBackward0>)
tensor(1.5555, grad_fn=<NllLossBackward0>)
tensor(1.5756, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2819/17426 [04:16<20:44, 11.74it/s]

tensor(1.6155, grad_fn=<NllLossBackward0>)
tensor(1.6330, grad_fn=<NllLossBackward0>)
tensor(1.6090, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2823/17426 [04:17<20:21, 11.96it/s]

tensor(1.6069, grad_fn=<NllLossBackward0>)
tensor(1.5639, grad_fn=<NllLossBackward0>)
tensor(1.6474, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2825/17426 [04:17<20:54, 11.64it/s]

tensor(1.5666, grad_fn=<NllLossBackward0>)
tensor(1.5472, grad_fn=<NllLossBackward0>)
tensor(1.6080, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2829/17426 [04:17<20:33, 11.83it/s]

tensor(1.6032, grad_fn=<NllLossBackward0>)
tensor(1.5989, grad_fn=<NllLossBackward0>)
tensor(1.6391, grad_fn=<NllLossBackward0>)


 16%|█▌        | 2831/17426 [04:17<20:53, 11.65it/s]

tensor(1.6135, grad_fn=<NllLossBackward0>)
tensor(1.6259, grad_fn=<NllLossBackward0>)
tensor(1.6162, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2835/17426 [04:18<20:32, 11.84it/s]

tensor(1.5933, grad_fn=<NllLossBackward0>)
tensor(1.6493, grad_fn=<NllLossBackward0>)
tensor(1.6356, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2837/17426 [04:18<21:19, 11.40it/s]

tensor(1.6244, grad_fn=<NllLossBackward0>)
tensor(1.5817, grad_fn=<NllLossBackward0>)
tensor(1.6365, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2841/17426 [04:18<20:27, 11.88it/s]

tensor(1.5348, grad_fn=<NllLossBackward0>)
tensor(1.6156, grad_fn=<NllLossBackward0>)
tensor(1.5714, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2843/17426 [04:18<20:45, 11.71it/s]

tensor(1.5875, grad_fn=<NllLossBackward0>)
tensor(1.6391, grad_fn=<NllLossBackward0>)
tensor(1.5909, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2847/17426 [04:19<20:23, 11.92it/s]

tensor(1.5304, grad_fn=<NllLossBackward0>)
tensor(1.6254, grad_fn=<NllLossBackward0>)
tensor(1.5788, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2849/17426 [04:19<20:58, 11.58it/s]

tensor(1.6210, grad_fn=<NllLossBackward0>)
tensor(1.6027, grad_fn=<NllLossBackward0>)
tensor(1.5898, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2853/17426 [04:19<20:31, 11.83it/s]

tensor(1.5655, grad_fn=<NllLossBackward0>)
tensor(1.5793, grad_fn=<NllLossBackward0>)
tensor(1.6332, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2855/17426 [04:20<20:45, 11.70it/s]

tensor(1.6269, grad_fn=<NllLossBackward0>)
tensor(1.6552, grad_fn=<NllLossBackward0>)
tensor(1.5536, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2857/17426 [04:20<23:02, 10.54it/s]

tensor(1.5777, grad_fn=<NllLossBackward0>)
tensor(1.5668, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2860/17426 [04:20<26:30,  9.16it/s]

tensor(1.6016, grad_fn=<NllLossBackward0>)
tensor(1.5635, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2862/17426 [04:20<28:53,  8.40it/s]

tensor(1.5612, grad_fn=<NllLossBackward0>)
tensor(1.5806, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2864/17426 [04:21<28:43,  8.45it/s]

tensor(1.6158, grad_fn=<NllLossBackward0>)
tensor(1.5745, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2866/17426 [04:21<28:30,  8.51it/s]

tensor(1.5747, grad_fn=<NllLossBackward0>)
tensor(1.5890, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2868/17426 [04:21<29:49,  8.14it/s]

tensor(1.5788, grad_fn=<NllLossBackward0>)
tensor(1.5808, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2870/17426 [04:21<28:51,  8.41it/s]

tensor(1.5873, grad_fn=<NllLossBackward0>)
tensor(1.5586, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2872/17426 [04:22<31:15,  7.76it/s]

tensor(1.5577, grad_fn=<NllLossBackward0>)
tensor(1.5830, grad_fn=<NllLossBackward0>)


 16%|█▋        | 2874/17426 [04:22<30:42,  7.90it/s]

tensor(1.6025, grad_fn=<NllLossBackward0>)
tensor(1.6049, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2876/17426 [04:22<32:29,  7.47it/s]

tensor(1.5972, grad_fn=<NllLossBackward0>)
tensor(1.5799, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2878/17426 [04:22<32:28,  7.47it/s]

tensor(1.5918, grad_fn=<NllLossBackward0>)
tensor(1.6006, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2880/17426 [04:23<31:04,  7.80it/s]

tensor(1.5731, grad_fn=<NllLossBackward0>)
tensor(1.6232, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2882/17426 [04:23<26:33,  9.13it/s]

tensor(1.6130, grad_fn=<NllLossBackward0>)
tensor(1.5964, grad_fn=<NllLossBackward0>)
tensor(1.5979, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2886/17426 [04:23<23:18, 10.40it/s]

tensor(1.6174, grad_fn=<NllLossBackward0>)
tensor(1.5711, grad_fn=<NllLossBackward0>)
tensor(1.5840, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2888/17426 [04:23<22:27, 10.79it/s]

tensor(1.5585, grad_fn=<NllLossBackward0>)
tensor(1.5469, grad_fn=<NllLossBackward0>)
tensor(1.6074, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2892/17426 [04:24<21:17, 11.38it/s]

tensor(1.5689, grad_fn=<NllLossBackward0>)
tensor(1.5766, grad_fn=<NllLossBackward0>)
tensor(1.5971, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2894/17426 [04:24<21:04, 11.49it/s]

tensor(1.5932, grad_fn=<NllLossBackward0>)
tensor(1.5960, grad_fn=<NllLossBackward0>)
tensor(1.6216, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2898/17426 [04:24<21:17, 11.37it/s]

tensor(1.6084, grad_fn=<NllLossBackward0>)
tensor(1.5544, grad_fn=<NllLossBackward0>)
tensor(1.5507, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2900/17426 [04:24<21:16, 11.38it/s]

tensor(1.5721, grad_fn=<NllLossBackward0>)
tensor(1.5719, grad_fn=<NllLossBackward0>)
tensor(1.5682, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2904/17426 [04:25<20:42, 11.69it/s]

tensor(1.5750, grad_fn=<NllLossBackward0>)
tensor(1.5615, grad_fn=<NllLossBackward0>)
tensor(1.5925, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2906/17426 [04:25<20:44, 11.67it/s]

tensor(1.5703, grad_fn=<NllLossBackward0>)
tensor(1.6180, grad_fn=<NllLossBackward0>)
tensor(1.5616, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2910/17426 [04:25<21:21, 11.33it/s]

tensor(1.5751, grad_fn=<NllLossBackward0>)
tensor(1.5525, grad_fn=<NllLossBackward0>)
tensor(1.6142, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2912/17426 [04:25<21:11, 11.41it/s]

tensor(1.5993, grad_fn=<NllLossBackward0>)
tensor(1.6407, grad_fn=<NllLossBackward0>)
tensor(1.5940, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2916/17426 [04:26<20:41, 11.69it/s]

tensor(1.5757, grad_fn=<NllLossBackward0>)
tensor(1.5563, grad_fn=<NllLossBackward0>)
tensor(1.5886, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2918/17426 [04:26<20:49, 11.61it/s]

tensor(1.6301, grad_fn=<NllLossBackward0>)
tensor(1.5698, grad_fn=<NllLossBackward0>)
tensor(1.5978, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2922/17426 [04:26<20:53, 11.57it/s]

tensor(1.5930, grad_fn=<NllLossBackward0>)
tensor(1.5470, grad_fn=<NllLossBackward0>)
tensor(1.5854, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2924/17426 [04:27<21:00, 11.50it/s]

tensor(1.5316, grad_fn=<NllLossBackward0>)
tensor(1.5631, grad_fn=<NllLossBackward0>)
tensor(1.6195, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2928/17426 [04:27<20:18, 11.90it/s]

tensor(1.5337, grad_fn=<NllLossBackward0>)
tensor(1.5838, grad_fn=<NllLossBackward0>)
tensor(1.6046, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2930/17426 [04:27<20:17, 11.91it/s]

tensor(1.5855, grad_fn=<NllLossBackward0>)
tensor(1.6048, grad_fn=<NllLossBackward0>)
tensor(1.5967, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2934/17426 [04:27<20:52, 11.57it/s]

tensor(1.6336, grad_fn=<NllLossBackward0>)
tensor(1.5587, grad_fn=<NllLossBackward0>)
tensor(1.5928, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2936/17426 [04:28<20:54, 11.55it/s]

tensor(1.6089, grad_fn=<NllLossBackward0>)
tensor(1.5803, grad_fn=<NllLossBackward0>)
tensor(1.5892, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2940/17426 [04:28<20:19, 11.88it/s]

tensor(1.5973, grad_fn=<NllLossBackward0>)
tensor(1.6214, grad_fn=<NllLossBackward0>)
tensor(1.5556, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2942/17426 [04:28<20:29, 11.78it/s]

tensor(1.5942, grad_fn=<NllLossBackward0>)
tensor(1.5609, grad_fn=<NllLossBackward0>)
tensor(1.6122, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2946/17426 [04:28<20:45, 11.63it/s]

tensor(1.5467, grad_fn=<NllLossBackward0>)
tensor(1.5964, grad_fn=<NllLossBackward0>)
tensor(1.6028, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2948/17426 [04:29<20:52, 11.56it/s]

tensor(1.6075, grad_fn=<NllLossBackward0>)
tensor(1.5439, grad_fn=<NllLossBackward0>)
tensor(1.5407, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2952/17426 [04:29<20:23, 11.83it/s]

tensor(1.5779, grad_fn=<NllLossBackward0>)
tensor(1.5741, grad_fn=<NllLossBackward0>)
tensor(1.5507, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2954/17426 [04:29<20:34, 11.73it/s]

tensor(1.6048, grad_fn=<NllLossBackward0>)
tensor(1.6281, grad_fn=<NllLossBackward0>)
tensor(1.5753, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2958/17426 [04:29<20:58, 11.49it/s]

tensor(1.6033, grad_fn=<NllLossBackward0>)
tensor(1.5606, grad_fn=<NllLossBackward0>)
tensor(1.6254, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2960/17426 [04:30<20:56, 11.51it/s]

tensor(1.5827, grad_fn=<NllLossBackward0>)
tensor(1.6040, grad_fn=<NllLossBackward0>)
tensor(1.5483, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2964/17426 [04:30<20:20, 11.84it/s]

tensor(1.5640, grad_fn=<NllLossBackward0>)
tensor(1.6055, grad_fn=<NllLossBackward0>)
tensor(1.6230, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2966/17426 [04:30<20:28, 11.77it/s]

tensor(1.5610, grad_fn=<NllLossBackward0>)
tensor(1.5860, grad_fn=<NllLossBackward0>)
tensor(1.6116, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2970/17426 [04:30<20:49, 11.57it/s]

tensor(1.5854, grad_fn=<NllLossBackward0>)
tensor(1.5565, grad_fn=<NllLossBackward0>)
tensor(1.5830, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2972/17426 [04:31<20:50, 11.55it/s]

tensor(1.5626, grad_fn=<NllLossBackward0>)
tensor(1.6231, grad_fn=<NllLossBackward0>)
tensor(1.5951, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2976/17426 [04:31<20:16, 11.88it/s]

tensor(1.5919, grad_fn=<NllLossBackward0>)
tensor(1.6062, grad_fn=<NllLossBackward0>)
tensor(1.5948, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2978/17426 [04:31<20:23, 11.81it/s]

tensor(1.5723, grad_fn=<NllLossBackward0>)
tensor(1.6040, grad_fn=<NllLossBackward0>)
tensor(1.5816, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2982/17426 [04:31<20:54, 11.51it/s]

tensor(1.5607, grad_fn=<NllLossBackward0>)
tensor(1.6185, grad_fn=<NllLossBackward0>)
tensor(1.6027, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2984/17426 [04:32<20:46, 11.59it/s]

tensor(1.5549, grad_fn=<NllLossBackward0>)
tensor(1.5644, grad_fn=<NllLossBackward0>)
tensor(1.5517, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2988/17426 [04:32<20:31, 11.73it/s]

tensor(1.5710, grad_fn=<NllLossBackward0>)
tensor(1.6141, grad_fn=<NllLossBackward0>)
tensor(1.5629, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2990/17426 [04:32<20:50, 11.54it/s]

tensor(1.6076, grad_fn=<NllLossBackward0>)
tensor(1.5178, grad_fn=<NllLossBackward0>)
tensor(1.5889, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2994/17426 [04:33<20:44, 11.60it/s]

tensor(1.5656, grad_fn=<NllLossBackward0>)
tensor(1.5776, grad_fn=<NllLossBackward0>)
tensor(1.6086, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2996/17426 [04:33<22:09, 10.86it/s]

tensor(1.5828, grad_fn=<NllLossBackward0>)
tensor(1.5821, grad_fn=<NllLossBackward0>)


 17%|█▋        | 2998/17426 [04:33<24:25,  9.84it/s]

tensor(1.5734, grad_fn=<NllLossBackward0>)
tensor(1.6389, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3000/17426 [04:33<27:03,  8.88it/s]

tensor(1.6158, grad_fn=<NllLossBackward0>)
tensor(1.5678, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3002/17426 [04:33<27:17,  8.81it/s]

tensor(1.5757, grad_fn=<NllLossBackward0>)
tensor(1.5986, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3004/17426 [04:34<28:42,  8.37it/s]

tensor(1.6052, grad_fn=<NllLossBackward0>)
tensor(1.5357, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3006/17426 [04:34<30:41,  7.83it/s]

tensor(1.5886, grad_fn=<NllLossBackward0>)
tensor(1.5925, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3008/17426 [04:34<31:13,  7.70it/s]

tensor(1.5949, grad_fn=<NllLossBackward0>)
tensor(1.6399, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3010/17426 [04:35<30:03,  8.00it/s]

tensor(1.6055, grad_fn=<NllLossBackward0>)
tensor(1.5426, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3012/17426 [04:35<31:26,  7.64it/s]

tensor(1.5917, grad_fn=<NllLossBackward0>)
tensor(1.5864, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3014/17426 [04:35<33:00,  7.28it/s]

tensor(1.5561, grad_fn=<NllLossBackward0>)
tensor(1.5660, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3016/17426 [04:35<33:16,  7.22it/s]

tensor(1.5826, grad_fn=<NllLossBackward0>)
tensor(1.5689, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3019/17426 [04:36<28:10,  8.52it/s]

tensor(1.5387, grad_fn=<NllLossBackward0>)
tensor(1.6003, grad_fn=<NllLossBackward0>)
tensor(1.5670, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3021/17426 [04:36<24:55,  9.64it/s]

tensor(1.5629, grad_fn=<NllLossBackward0>)
tensor(1.6667, grad_fn=<NllLossBackward0>)
tensor(1.5941, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3025/17426 [04:36<22:38, 10.60it/s]

tensor(1.6133, grad_fn=<NllLossBackward0>)
tensor(1.5945, grad_fn=<NllLossBackward0>)
tensor(1.5833, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3027/17426 [04:36<21:47, 11.01it/s]

tensor(1.6100, grad_fn=<NllLossBackward0>)
tensor(1.6060, grad_fn=<NllLossBackward0>)
tensor(1.5610, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3031/17426 [04:37<21:19, 11.25it/s]

tensor(1.5726, grad_fn=<NllLossBackward0>)
tensor(1.5933, grad_fn=<NllLossBackward0>)
tensor(1.5354, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3033/17426 [04:37<21:16, 11.27it/s]

tensor(1.5619, grad_fn=<NllLossBackward0>)
tensor(1.5388, grad_fn=<NllLossBackward0>)
tensor(1.5975, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3037/17426 [04:37<20:44, 11.56it/s]

tensor(1.6012, grad_fn=<NllLossBackward0>)
tensor(1.6044, grad_fn=<NllLossBackward0>)
tensor(1.5985, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3039/17426 [04:37<20:50, 11.51it/s]

tensor(1.6280, grad_fn=<NllLossBackward0>)
tensor(1.6465, grad_fn=<NllLossBackward0>)
tensor(1.5444, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3043/17426 [04:38<20:52, 11.48it/s]

tensor(1.6025, grad_fn=<NllLossBackward0>)
tensor(1.5797, grad_fn=<NllLossBackward0>)
tensor(1.5924, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3045/17426 [04:38<20:47, 11.52it/s]

tensor(1.5503, grad_fn=<NllLossBackward0>)
tensor(1.6434, grad_fn=<NllLossBackward0>)
tensor(1.6135, grad_fn=<NllLossBackward0>)


 17%|█▋        | 3049/17426 [04:38<21:20, 11.23it/s]

tensor(1.6069, grad_fn=<NllLossBackward0>)
tensor(1.6110, grad_fn=<NllLossBackward0>)
tensor(1.5971, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3051/17426 [04:38<21:00, 11.40it/s]

tensor(1.5659, grad_fn=<NllLossBackward0>)
tensor(1.5839, grad_fn=<NllLossBackward0>)
tensor(1.5923, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3055/17426 [04:39<21:08, 11.32it/s]

tensor(1.5665, grad_fn=<NllLossBackward0>)
tensor(1.5663, grad_fn=<NllLossBackward0>)
tensor(1.5666, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3057/17426 [04:39<20:59, 11.41it/s]

tensor(1.5799, grad_fn=<NllLossBackward0>)
tensor(1.6220, grad_fn=<NllLossBackward0>)
tensor(1.5732, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3061/17426 [04:39<20:36, 11.61it/s]

tensor(1.5749, grad_fn=<NllLossBackward0>)
tensor(1.5776, grad_fn=<NllLossBackward0>)
tensor(1.5914, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3063/17426 [04:40<20:49, 11.49it/s]

tensor(1.6216, grad_fn=<NllLossBackward0>)
tensor(1.5553, grad_fn=<NllLossBackward0>)
tensor(1.5716, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3067/17426 [04:40<21:01, 11.39it/s]

tensor(1.5881, grad_fn=<NllLossBackward0>)
tensor(1.6199, grad_fn=<NllLossBackward0>)
tensor(1.5690, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3069/17426 [04:40<20:52, 11.47it/s]

tensor(1.6318, grad_fn=<NllLossBackward0>)
tensor(1.5730, grad_fn=<NllLossBackward0>)
tensor(1.5526, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3073/17426 [04:40<20:16, 11.80it/s]

tensor(1.5630, grad_fn=<NllLossBackward0>)
tensor(1.5605, grad_fn=<NllLossBackward0>)
tensor(1.5616, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3075/17426 [04:41<20:21, 11.75it/s]

tensor(1.5686, grad_fn=<NllLossBackward0>)
tensor(1.5902, grad_fn=<NllLossBackward0>)
tensor(1.5766, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3079/17426 [04:41<20:51, 11.46it/s]

tensor(1.6601, grad_fn=<NllLossBackward0>)
tensor(1.5668, grad_fn=<NllLossBackward0>)
tensor(1.5813, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3081/17426 [04:41<20:52, 11.45it/s]

tensor(1.6279, grad_fn=<NllLossBackward0>)
tensor(1.6511, grad_fn=<NllLossBackward0>)
tensor(1.5547, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3085/17426 [04:41<20:24, 11.71it/s]

tensor(1.5658, grad_fn=<NllLossBackward0>)
tensor(1.5790, grad_fn=<NllLossBackward0>)
tensor(1.5757, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3087/17426 [04:42<20:41, 11.55it/s]

tensor(1.5938, grad_fn=<NllLossBackward0>)
tensor(1.5625, grad_fn=<NllLossBackward0>)
tensor(1.5456, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3091/17426 [04:42<20:33, 11.63it/s]

tensor(1.5769, grad_fn=<NllLossBackward0>)
tensor(1.6220, grad_fn=<NllLossBackward0>)
tensor(1.5427, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3093/17426 [04:42<20:30, 11.64it/s]

tensor(1.5920, grad_fn=<NllLossBackward0>)
tensor(1.5993, grad_fn=<NllLossBackward0>)
tensor(1.5889, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3097/17426 [04:42<20:11, 11.83it/s]

tensor(1.5873, grad_fn=<NllLossBackward0>)
tensor(1.5762, grad_fn=<NllLossBackward0>)
tensor(1.5891, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3099/17426 [04:43<20:17, 11.77it/s]

tensor(1.6130, grad_fn=<NllLossBackward0>)
tensor(1.5942, grad_fn=<NllLossBackward0>)
tensor(1.5718, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3103/17426 [04:43<20:26, 11.68it/s]

tensor(1.5722, grad_fn=<NllLossBackward0>)
tensor(1.5851, grad_fn=<NllLossBackward0>)
tensor(1.5965, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3105/17426 [04:43<20:37, 11.57it/s]

tensor(1.5874, grad_fn=<NllLossBackward0>)
tensor(1.6247, grad_fn=<NllLossBackward0>)
tensor(1.5932, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3109/17426 [04:43<20:09, 11.84it/s]

tensor(1.5744, grad_fn=<NllLossBackward0>)
tensor(1.5359, grad_fn=<NllLossBackward0>)
tensor(1.6122, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3111/17426 [04:44<20:08, 11.84it/s]

tensor(1.5746, grad_fn=<NllLossBackward0>)
tensor(1.6288, grad_fn=<NllLossBackward0>)
tensor(1.5963, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3115/17426 [04:44<20:25, 11.67it/s]

tensor(1.5788, grad_fn=<NllLossBackward0>)
tensor(1.6044, grad_fn=<NllLossBackward0>)
tensor(1.5933, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3117/17426 [04:44<20:39, 11.54it/s]

tensor(1.6009, grad_fn=<NllLossBackward0>)
tensor(1.6268, grad_fn=<NllLossBackward0>)
tensor(1.5443, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3121/17426 [04:44<20:03, 11.89it/s]

tensor(1.5713, grad_fn=<NllLossBackward0>)
tensor(1.6182, grad_fn=<NllLossBackward0>)
tensor(1.5405, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3123/17426 [04:45<20:23, 11.69it/s]

tensor(1.6097, grad_fn=<NllLossBackward0>)
tensor(1.6064, grad_fn=<NllLossBackward0>)
tensor(1.5725, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3127/17426 [04:45<20:26, 11.66it/s]

tensor(1.5298, grad_fn=<NllLossBackward0>)
tensor(1.5983, grad_fn=<NllLossBackward0>)
tensor(1.5857, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3129/17426 [04:45<20:25, 11.66it/s]

tensor(1.5969, grad_fn=<NllLossBackward0>)
tensor(1.5754, grad_fn=<NllLossBackward0>)
tensor(1.5892, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3133/17426 [04:46<20:13, 11.78it/s]

tensor(1.5823, grad_fn=<NllLossBackward0>)
tensor(1.5756, grad_fn=<NllLossBackward0>)
tensor(1.5719, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3135/17426 [04:46<23:07, 10.30it/s]

tensor(1.5840, grad_fn=<NllLossBackward0>)
tensor(1.5802, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3137/17426 [04:46<26:23,  9.02it/s]

tensor(1.6052, grad_fn=<NllLossBackward0>)
tensor(1.5661, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3139/17426 [04:46<28:02,  8.49it/s]

tensor(1.5769, grad_fn=<NllLossBackward0>)
tensor(1.6245, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3141/17426 [04:47<28:01,  8.50it/s]

tensor(1.5558, grad_fn=<NllLossBackward0>)
tensor(1.5788, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3143/17426 [04:47<28:22,  8.39it/s]

tensor(1.5923, grad_fn=<NllLossBackward0>)
tensor(1.6075, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3145/17426 [04:47<29:18,  8.12it/s]

tensor(1.5580, grad_fn=<NllLossBackward0>)
tensor(1.5817, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3147/17426 [04:47<28:27,  8.36it/s]

tensor(1.5950, grad_fn=<NllLossBackward0>)
tensor(1.5746, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3149/17426 [04:48<28:46,  8.27it/s]

tensor(1.6214, grad_fn=<NllLossBackward0>)
tensor(1.5950, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3151/17426 [04:48<29:34,  8.04it/s]

tensor(1.5999, grad_fn=<NllLossBackward0>)
tensor(1.5913, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3153/17426 [04:48<28:24,  8.37it/s]

tensor(1.6228, grad_fn=<NllLossBackward0>)
tensor(1.5486, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3155/17426 [04:48<30:42,  7.75it/s]

tensor(1.5816, grad_fn=<NllLossBackward0>)
tensor(1.5994, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3157/17426 [04:49<32:23,  7.34it/s]

tensor(1.5855, grad_fn=<NllLossBackward0>)
tensor(1.6095, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3160/17426 [04:49<25:14,  9.42it/s]

tensor(1.6032, grad_fn=<NllLossBackward0>)
tensor(1.5611, grad_fn=<NllLossBackward0>)
tensor(1.5628, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3162/17426 [04:49<23:09, 10.26it/s]

tensor(1.5925, grad_fn=<NllLossBackward0>)
tensor(1.6131, grad_fn=<NllLossBackward0>)
tensor(1.6086, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3166/17426 [04:49<21:49, 10.89it/s]

tensor(1.6407, grad_fn=<NllLossBackward0>)
tensor(1.5708, grad_fn=<NllLossBackward0>)
tensor(1.6097, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3168/17426 [04:50<21:36, 10.99it/s]

tensor(1.5741, grad_fn=<NllLossBackward0>)
tensor(1.6086, grad_fn=<NllLossBackward0>)
tensor(1.5456, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3172/17426 [04:50<20:23, 11.65it/s]

tensor(1.6384, grad_fn=<NllLossBackward0>)
tensor(1.5437, grad_fn=<NllLossBackward0>)
tensor(1.5766, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3174/17426 [04:50<20:29, 11.59it/s]

tensor(1.5765, grad_fn=<NllLossBackward0>)
tensor(1.5714, grad_fn=<NllLossBackward0>)
tensor(1.5669, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3178/17426 [04:50<20:18, 11.69it/s]

tensor(1.5768, grad_fn=<NllLossBackward0>)
tensor(1.5505, grad_fn=<NllLossBackward0>)
tensor(1.5986, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3180/17426 [04:51<20:23, 11.64it/s]

tensor(1.6225, grad_fn=<NllLossBackward0>)
tensor(1.5742, grad_fn=<NllLossBackward0>)
tensor(1.5915, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3184/17426 [04:51<19:52, 11.94it/s]

tensor(1.6002, grad_fn=<NllLossBackward0>)
tensor(1.6007, grad_fn=<NllLossBackward0>)
tensor(1.6118, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3186/17426 [04:51<19:58, 11.88it/s]

tensor(1.5830, grad_fn=<NllLossBackward0>)
tensor(1.5888, grad_fn=<NllLossBackward0>)
tensor(1.5678, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3190/17426 [04:51<19:57, 11.89it/s]

tensor(1.5585, grad_fn=<NllLossBackward0>)
tensor(1.5833, grad_fn=<NllLossBackward0>)
tensor(1.5729, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3192/17426 [04:52<20:11, 11.75it/s]

tensor(1.5908, grad_fn=<NllLossBackward0>)
tensor(1.5685, grad_fn=<NllLossBackward0>)
tensor(1.5629, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3196/17426 [04:52<19:55, 11.91it/s]

tensor(1.6117, grad_fn=<NllLossBackward0>)
tensor(1.5411, grad_fn=<NllLossBackward0>)
tensor(1.5784, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3198/17426 [04:52<20:03, 11.82it/s]

tensor(1.5649, grad_fn=<NllLossBackward0>)
tensor(1.5502, grad_fn=<NllLossBackward0>)
tensor(1.5810, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3202/17426 [04:52<20:24, 11.61it/s]

tensor(1.5697, grad_fn=<NllLossBackward0>)
tensor(1.5629, grad_fn=<NllLossBackward0>)
tensor(1.5777, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3204/17426 [04:53<20:23, 11.62it/s]

tensor(1.5387, grad_fn=<NllLossBackward0>)
tensor(1.5425, grad_fn=<NllLossBackward0>)
tensor(1.6295, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3208/17426 [04:53<20:00, 11.84it/s]

tensor(1.5089, grad_fn=<NllLossBackward0>)
tensor(1.5643, grad_fn=<NllLossBackward0>)
tensor(1.5766, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3210/17426 [04:53<20:08, 11.76it/s]

tensor(1.5845, grad_fn=<NllLossBackward0>)
tensor(1.5968, grad_fn=<NllLossBackward0>)
tensor(1.5965, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3214/17426 [04:53<20:20, 11.65it/s]

tensor(1.5942, grad_fn=<NllLossBackward0>)
tensor(1.5769, grad_fn=<NllLossBackward0>)
tensor(1.5954, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3216/17426 [04:54<20:26, 11.59it/s]

tensor(1.5868, grad_fn=<NllLossBackward0>)
tensor(1.5498, grad_fn=<NllLossBackward0>)
tensor(1.5873, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3220/17426 [04:54<20:13, 11.70it/s]

tensor(1.5453, grad_fn=<NllLossBackward0>)
tensor(1.5864, grad_fn=<NllLossBackward0>)
tensor(1.5713, grad_fn=<NllLossBackward0>)


 18%|█▊        | 3222/17426 [04:54<20:26, 11.58it/s]

tensor(1.5581, grad_fn=<NllLossBackward0>)
tensor(1.5823, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)


 19%|█▊        | 3226/17426 [04:55<20:09, 11.74it/s]

tensor(1.5685, grad_fn=<NllLossBackward0>)
tensor(1.5925, grad_fn=<NllLossBackward0>)
tensor(1.5519, grad_fn=<NllLossBackward0>)


 19%|█▊        | 3228/17426 [04:55<20:27, 11.57it/s]

tensor(1.5644, grad_fn=<NllLossBackward0>)
tensor(1.5877, grad_fn=<NllLossBackward0>)
tensor(1.5723, grad_fn=<NllLossBackward0>)


 19%|█▊        | 3232/17426 [04:55<20:17, 11.66it/s]

tensor(1.5835, grad_fn=<NllLossBackward0>)
tensor(1.5659, grad_fn=<NllLossBackward0>)
tensor(1.5535, grad_fn=<NllLossBackward0>)


 19%|█▊        | 3234/17426 [04:55<20:39, 11.45it/s]

tensor(1.5998, grad_fn=<NllLossBackward0>)
tensor(1.5892, grad_fn=<NllLossBackward0>)
tensor(1.6108, grad_fn=<NllLossBackward0>)


 19%|█▊        | 3238/17426 [04:56<20:24, 11.59it/s]

tensor(1.5648, grad_fn=<NllLossBackward0>)
tensor(1.5393, grad_fn=<NllLossBackward0>)
tensor(1.5121, grad_fn=<NllLossBackward0>)


 19%|█▊        | 3240/17426 [04:56<20:22, 11.61it/s]

tensor(1.5687, grad_fn=<NllLossBackward0>)
tensor(1.6109, grad_fn=<NllLossBackward0>)
tensor(1.6001, grad_fn=<NllLossBackward0>)


 19%|█▊        | 3244/17426 [04:56<20:06, 11.75it/s]

tensor(1.6124, grad_fn=<NllLossBackward0>)
tensor(1.5859, grad_fn=<NllLossBackward0>)
tensor(1.5822, grad_fn=<NllLossBackward0>)


 19%|█▊        | 3246/17426 [04:56<20:18, 11.64it/s]

tensor(1.6160, grad_fn=<NllLossBackward0>)
tensor(1.5605, grad_fn=<NllLossBackward0>)
tensor(1.5918, grad_fn=<NllLossBackward0>)


 19%|█▊        | 3250/17426 [04:57<20:18, 11.64it/s]

tensor(1.6267, grad_fn=<NllLossBackward0>)
tensor(1.5938, grad_fn=<NllLossBackward0>)
tensor(1.5860, grad_fn=<NllLossBackward0>)


 19%|█▊        | 3252/17426 [04:57<20:26, 11.56it/s]

tensor(1.5627, grad_fn=<NllLossBackward0>)
tensor(1.5725, grad_fn=<NllLossBackward0>)
tensor(1.5986, grad_fn=<NllLossBackward0>)


 19%|█▊        | 3256/17426 [04:57<20:18, 11.63it/s]

tensor(1.5973, grad_fn=<NllLossBackward0>)
tensor(1.5872, grad_fn=<NllLossBackward0>)
tensor(1.5783, grad_fn=<NllLossBackward0>)


 19%|█▊        | 3258/17426 [04:57<20:14, 11.67it/s]

tensor(1.5414, grad_fn=<NllLossBackward0>)
tensor(1.5745, grad_fn=<NllLossBackward0>)
tensor(1.6035, grad_fn=<NllLossBackward0>)


 19%|█▊        | 3262/17426 [04:58<20:43, 11.39it/s]

tensor(1.5315, grad_fn=<NllLossBackward0>)
tensor(1.5617, grad_fn=<NllLossBackward0>)
tensor(1.5598, grad_fn=<NllLossBackward0>)


 19%|█▊        | 3264/17426 [04:58<20:30, 11.51it/s]

tensor(1.5616, grad_fn=<NllLossBackward0>)
tensor(1.5748, grad_fn=<NllLossBackward0>)
tensor(1.5463, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3268/17426 [04:58<20:15, 11.65it/s]

tensor(1.5441, grad_fn=<NllLossBackward0>)
tensor(1.5902, grad_fn=<NllLossBackward0>)
tensor(1.5438, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3270/17426 [04:58<20:05, 11.75it/s]

tensor(1.5594, grad_fn=<NllLossBackward0>)
tensor(1.5789, grad_fn=<NllLossBackward0>)
tensor(1.5950, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3274/17426 [04:59<21:33, 10.94it/s]

tensor(1.6047, grad_fn=<NllLossBackward0>)
tensor(1.5862, grad_fn=<NllLossBackward0>)
tensor(1.5647, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3276/17426 [04:59<23:58,  9.84it/s]

tensor(1.5879, grad_fn=<NllLossBackward0>)
tensor(1.6070, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3278/17426 [04:59<26:01,  9.06it/s]

tensor(1.5072, grad_fn=<NllLossBackward0>)
tensor(1.6009, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3280/17426 [05:00<28:34,  8.25it/s]

tensor(1.6293, grad_fn=<NllLossBackward0>)
tensor(1.5697, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3282/17426 [05:00<28:51,  8.17it/s]

tensor(1.5517, grad_fn=<NllLossBackward0>)
tensor(1.5888, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3284/17426 [05:00<30:28,  7.74it/s]

tensor(1.5624, grad_fn=<NllLossBackward0>)
tensor(1.5439, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3286/17426 [05:00<30:07,  7.83it/s]

tensor(1.6237, grad_fn=<NllLossBackward0>)
tensor(1.6096, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3288/17426 [05:01<28:56,  8.14it/s]

tensor(1.5621, grad_fn=<NllLossBackward0>)
tensor(1.6238, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3290/17426 [05:01<30:13,  7.80it/s]

tensor(1.5767, grad_fn=<NllLossBackward0>)
tensor(1.5448, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3292/17426 [05:01<29:15,  8.05it/s]

tensor(1.5753, grad_fn=<NllLossBackward0>)
tensor(1.5790, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3294/17426 [05:01<30:42,  7.67it/s]

tensor(1.6326, grad_fn=<NllLossBackward0>)
tensor(1.5637, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3296/17426 [05:02<30:43,  7.66it/s]

tensor(1.5911, grad_fn=<NllLossBackward0>)
tensor(1.5901, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3299/17426 [05:02<25:41,  9.17it/s]

tensor(1.5537, grad_fn=<NllLossBackward0>)
tensor(1.6063, grad_fn=<NllLossBackward0>)
tensor(1.5840, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3301/17426 [05:02<23:42,  9.93it/s]

tensor(1.6380, grad_fn=<NllLossBackward0>)
tensor(1.6155, grad_fn=<NllLossBackward0>)
tensor(1.6005, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3305/17426 [05:02<21:15, 11.07it/s]

tensor(1.5661, grad_fn=<NllLossBackward0>)
tensor(1.5831, grad_fn=<NllLossBackward0>)
tensor(1.5528, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3307/17426 [05:03<20:51, 11.28it/s]

tensor(1.5735, grad_fn=<NllLossBackward0>)
tensor(1.5827, grad_fn=<NllLossBackward0>)
tensor(1.5421, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3311/17426 [05:03<20:48, 11.30it/s]

tensor(1.5835, grad_fn=<NllLossBackward0>)
tensor(1.6006, grad_fn=<NllLossBackward0>)
tensor(1.5504, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3313/17426 [05:03<20:41, 11.36it/s]

tensor(1.5267, grad_fn=<NllLossBackward0>)
tensor(1.5956, grad_fn=<NllLossBackward0>)
tensor(1.5781, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3317/17426 [05:03<20:03, 11.72it/s]

tensor(1.5529, grad_fn=<NllLossBackward0>)
tensor(1.6126, grad_fn=<NllLossBackward0>)
tensor(1.5633, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3319/17426 [05:04<20:15, 11.60it/s]

tensor(1.5680, grad_fn=<NllLossBackward0>)
tensor(1.5604, grad_fn=<NllLossBackward0>)
tensor(1.5519, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3323/17426 [05:04<20:07, 11.68it/s]

tensor(1.5760, grad_fn=<NllLossBackward0>)
tensor(1.5942, grad_fn=<NllLossBackward0>)
tensor(1.5867, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3325/17426 [05:04<20:05, 11.70it/s]

tensor(1.5809, grad_fn=<NllLossBackward0>)
tensor(1.5735, grad_fn=<NllLossBackward0>)
tensor(1.6003, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3329/17426 [05:04<19:36, 11.98it/s]

tensor(1.5699, grad_fn=<NllLossBackward0>)
tensor(1.5840, grad_fn=<NllLossBackward0>)
tensor(1.5590, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3331/17426 [05:05<19:56, 11.78it/s]

tensor(1.5424, grad_fn=<NllLossBackward0>)
tensor(1.5611, grad_fn=<NllLossBackward0>)
tensor(1.5417, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3335/17426 [05:05<19:55, 11.78it/s]

tensor(1.5366, grad_fn=<NllLossBackward0>)
tensor(1.5655, grad_fn=<NllLossBackward0>)
tensor(1.6277, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3337/17426 [05:05<20:02, 11.71it/s]

tensor(1.6069, grad_fn=<NllLossBackward0>)
tensor(1.5662, grad_fn=<NllLossBackward0>)
tensor(1.5793, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3341/17426 [05:05<19:30, 12.03it/s]

tensor(1.6147, grad_fn=<NllLossBackward0>)
tensor(1.5579, grad_fn=<NllLossBackward0>)
tensor(1.5935, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3343/17426 [05:06<19:55, 11.78it/s]

tensor(1.5574, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)
tensor(1.6213, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3347/17426 [05:06<19:55, 11.78it/s]

tensor(1.6153, grad_fn=<NllLossBackward0>)
tensor(1.5698, grad_fn=<NllLossBackward0>)
tensor(1.5736, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3349/17426 [05:06<20:07, 11.66it/s]

tensor(1.5738, grad_fn=<NllLossBackward0>)
tensor(1.5463, grad_fn=<NllLossBackward0>)
tensor(1.5452, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3353/17426 [05:07<19:37, 11.95it/s]

tensor(1.5669, grad_fn=<NllLossBackward0>)
tensor(1.5874, grad_fn=<NllLossBackward0>)
tensor(1.5694, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3355/17426 [05:07<20:13, 11.59it/s]

tensor(1.5921, grad_fn=<NllLossBackward0>)
tensor(1.5772, grad_fn=<NllLossBackward0>)
tensor(1.5846, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3359/17426 [05:07<20:07, 11.65it/s]

tensor(1.5983, grad_fn=<NllLossBackward0>)
tensor(1.5380, grad_fn=<NllLossBackward0>)
tensor(1.5377, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3361/17426 [05:07<20:11, 11.61it/s]

tensor(1.5572, grad_fn=<NllLossBackward0>)
tensor(1.5364, grad_fn=<NllLossBackward0>)
tensor(1.5784, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3365/17426 [05:08<19:56, 11.75it/s]

tensor(1.5704, grad_fn=<NllLossBackward0>)
tensor(1.5697, grad_fn=<NllLossBackward0>)
tensor(1.5554, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3367/17426 [05:08<20:14, 11.57it/s]

tensor(1.5345, grad_fn=<NllLossBackward0>)
tensor(1.5797, grad_fn=<NllLossBackward0>)
tensor(1.6175, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3371/17426 [05:08<20:06, 11.65it/s]

tensor(1.6021, grad_fn=<NllLossBackward0>)
tensor(1.5819, grad_fn=<NllLossBackward0>)
tensor(1.5601, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3373/17426 [05:08<20:16, 11.55it/s]

tensor(1.5642, grad_fn=<NllLossBackward0>)
tensor(1.5761, grad_fn=<NllLossBackward0>)
tensor(1.5932, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3377/17426 [05:09<20:18, 11.53it/s]

tensor(1.5816, grad_fn=<NllLossBackward0>)
tensor(1.5896, grad_fn=<NllLossBackward0>)
tensor(1.5793, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3379/17426 [05:09<20:29, 11.42it/s]

tensor(1.6122, grad_fn=<NllLossBackward0>)
tensor(1.5513, grad_fn=<NllLossBackward0>)
tensor(1.5243, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3383/17426 [05:09<20:04, 11.66it/s]

tensor(1.6141, grad_fn=<NllLossBackward0>)
tensor(1.6041, grad_fn=<NllLossBackward0>)
tensor(1.5954, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3385/17426 [05:09<20:17, 11.54it/s]

tensor(1.6260, grad_fn=<NllLossBackward0>)
tensor(1.5483, grad_fn=<NllLossBackward0>)
tensor(1.5960, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3389/17426 [05:10<20:04, 11.65it/s]

tensor(1.6147, grad_fn=<NllLossBackward0>)
tensor(1.5591, grad_fn=<NllLossBackward0>)
tensor(1.5934, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3391/17426 [05:10<20:22, 11.48it/s]

tensor(1.5354, grad_fn=<NllLossBackward0>)
tensor(1.5728, grad_fn=<NllLossBackward0>)
tensor(1.5474, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3395/17426 [05:10<20:29, 11.41it/s]

tensor(1.5697, grad_fn=<NllLossBackward0>)
tensor(1.5967, grad_fn=<NllLossBackward0>)
tensor(1.6005, grad_fn=<NllLossBackward0>)


 19%|█▉        | 3397/17426 [05:10<20:25, 11.44it/s]

tensor(1.6160, grad_fn=<NllLossBackward0>)
tensor(1.5403, grad_fn=<NllLossBackward0>)
tensor(1.5787, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3401/17426 [05:11<20:12, 11.57it/s]

tensor(1.5862, grad_fn=<NllLossBackward0>)
tensor(1.5995, grad_fn=<NllLossBackward0>)
tensor(1.5877, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3403/17426 [05:11<20:19, 11.50it/s]

tensor(1.5707, grad_fn=<NllLossBackward0>)
tensor(1.5915, grad_fn=<NllLossBackward0>)
tensor(1.5702, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3407/17426 [05:11<20:27, 11.43it/s]

tensor(1.5784, grad_fn=<NllLossBackward0>)
tensor(1.6000, grad_fn=<NllLossBackward0>)
tensor(1.6202, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3409/17426 [05:11<20:22, 11.47it/s]

tensor(1.5693, grad_fn=<NllLossBackward0>)
tensor(1.5523, grad_fn=<NllLossBackward0>)
tensor(1.5669, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3411/17426 [05:12<20:16, 11.52it/s]

tensor(1.6049, grad_fn=<NllLossBackward0>)
tensor(1.5842, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3413/17426 [05:12<21:21, 10.93it/s]

tensor(1.5979, grad_fn=<NllLossBackward0>)
tensor(1.5835, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3415/17426 [05:12<23:43,  9.84it/s]

tensor(1.5571, grad_fn=<NllLossBackward0>)
tensor(1.5557, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3418/17426 [05:12<25:54,  9.01it/s]

tensor(1.5564, grad_fn=<NllLossBackward0>)
tensor(1.5198, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3420/17426 [05:13<27:45,  8.41it/s]

tensor(1.6094, grad_fn=<NllLossBackward0>)
tensor(1.5739, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3422/17426 [05:13<27:34,  8.47it/s]

tensor(1.5516, grad_fn=<NllLossBackward0>)
tensor(1.5739, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3424/17426 [05:13<29:26,  7.93it/s]

tensor(1.5436, grad_fn=<NllLossBackward0>)
tensor(1.5380, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3426/17426 [05:13<27:36,  8.45it/s]

tensor(1.6122, grad_fn=<NllLossBackward0>)
tensor(1.5675, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3428/17426 [05:14<27:07,  8.60it/s]

tensor(1.5888, grad_fn=<NllLossBackward0>)
tensor(1.6047, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3430/17426 [05:14<29:58,  7.78it/s]

tensor(1.5897, grad_fn=<NllLossBackward0>)
tensor(1.5676, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3432/17426 [05:14<30:36,  7.62it/s]

tensor(1.5693, grad_fn=<NllLossBackward0>)
tensor(1.5505, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3434/17426 [05:14<31:25,  7.42it/s]

tensor(1.5550, grad_fn=<NllLossBackward0>)
tensor(1.5179, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3436/17426 [05:15<32:27,  7.18it/s]

tensor(1.6245, grad_fn=<NllLossBackward0>)
tensor(1.5816, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3438/17426 [05:15<26:50,  8.69it/s]

tensor(1.5227, grad_fn=<NllLossBackward0>)
tensor(1.5682, grad_fn=<NllLossBackward0>)
tensor(1.5653, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3442/17426 [05:15<22:44, 10.25it/s]

tensor(1.5515, grad_fn=<NllLossBackward0>)
tensor(1.5286, grad_fn=<NllLossBackward0>)
tensor(1.5736, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3444/17426 [05:15<21:42, 10.74it/s]

tensor(1.5543, grad_fn=<NllLossBackward0>)
tensor(1.5638, grad_fn=<NllLossBackward0>)
tensor(1.5510, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3448/17426 [05:16<20:43, 11.24it/s]

tensor(1.5087, grad_fn=<NllLossBackward0>)
tensor(1.6032, grad_fn=<NllLossBackward0>)
tensor(1.5888, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3450/17426 [05:16<20:31, 11.35it/s]

tensor(1.5405, grad_fn=<NllLossBackward0>)
tensor(1.5743, grad_fn=<NllLossBackward0>)
tensor(1.5328, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3454/17426 [05:16<20:34, 11.32it/s]

tensor(1.5633, grad_fn=<NllLossBackward0>)
tensor(1.5457, grad_fn=<NllLossBackward0>)
tensor(1.5803, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3456/17426 [05:16<20:38, 11.28it/s]

tensor(1.5938, grad_fn=<NllLossBackward0>)
tensor(1.6098, grad_fn=<NllLossBackward0>)
tensor(1.5727, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3460/17426 [05:17<20:12, 11.52it/s]

tensor(1.5698, grad_fn=<NllLossBackward0>)
tensor(1.5873, grad_fn=<NllLossBackward0>)
tensor(1.6149, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3462/17426 [05:17<20:12, 11.52it/s]

tensor(1.6008, grad_fn=<NllLossBackward0>)
tensor(1.5982, grad_fn=<NllLossBackward0>)
tensor(1.5734, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3466/17426 [05:17<20:17, 11.46it/s]

tensor(1.5814, grad_fn=<NllLossBackward0>)
tensor(1.5449, grad_fn=<NllLossBackward0>)
tensor(1.5682, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3468/17426 [05:18<20:13, 11.50it/s]

tensor(1.5895, grad_fn=<NllLossBackward0>)
tensor(1.5708, grad_fn=<NllLossBackward0>)
tensor(1.5447, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3472/17426 [05:18<19:56, 11.66it/s]

tensor(1.6103, grad_fn=<NllLossBackward0>)
tensor(1.5560, grad_fn=<NllLossBackward0>)
tensor(1.6174, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3474/17426 [05:18<19:58, 11.64it/s]

tensor(1.5241, grad_fn=<NllLossBackward0>)
tensor(1.5779, grad_fn=<NllLossBackward0>)
tensor(1.5097, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3478/17426 [05:18<20:01, 11.61it/s]

tensor(1.5588, grad_fn=<NllLossBackward0>)
tensor(1.5913, grad_fn=<NllLossBackward0>)
tensor(1.5528, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3480/17426 [05:19<20:06, 11.56it/s]

tensor(1.5773, grad_fn=<NllLossBackward0>)
tensor(1.5787, grad_fn=<NllLossBackward0>)
tensor(1.6151, grad_fn=<NllLossBackward0>)


 20%|█▉        | 3484/17426 [05:19<19:55, 11.67it/s]

tensor(1.4939, grad_fn=<NllLossBackward0>)
tensor(1.5838, grad_fn=<NllLossBackward0>)
tensor(1.5717, grad_fn=<NllLossBackward0>)


 20%|██        | 3486/17426 [05:19<19:50, 11.71it/s]

tensor(1.5359, grad_fn=<NllLossBackward0>)
tensor(1.5599, grad_fn=<NllLossBackward0>)
tensor(1.5537, grad_fn=<NllLossBackward0>)


 20%|██        | 3490/17426 [05:19<19:45, 11.75it/s]

tensor(1.5868, grad_fn=<NllLossBackward0>)
tensor(1.5466, grad_fn=<NllLossBackward0>)
tensor(1.5888, grad_fn=<NllLossBackward0>)


 20%|██        | 3492/17426 [05:20<19:50, 11.71it/s]

tensor(1.6194, grad_fn=<NllLossBackward0>)
tensor(1.5683, grad_fn=<NllLossBackward0>)
tensor(1.5912, grad_fn=<NllLossBackward0>)


 20%|██        | 3496/17426 [05:20<19:52, 11.68it/s]

tensor(1.5584, grad_fn=<NllLossBackward0>)
tensor(1.5914, grad_fn=<NllLossBackward0>)
tensor(1.5431, grad_fn=<NllLossBackward0>)


 20%|██        | 3498/17426 [05:20<20:07, 11.53it/s]

tensor(1.5867, grad_fn=<NllLossBackward0>)
tensor(1.5516, grad_fn=<NllLossBackward0>)
tensor(1.5716, grad_fn=<NllLossBackward0>)


 20%|██        | 3502/17426 [05:20<19:52, 11.67it/s]

tensor(1.5369, grad_fn=<NllLossBackward0>)
tensor(1.5684, grad_fn=<NllLossBackward0>)
tensor(1.5592, grad_fn=<NllLossBackward0>)


 20%|██        | 3504/17426 [05:21<19:47, 11.73it/s]

tensor(1.6277, grad_fn=<NllLossBackward0>)
tensor(1.5506, grad_fn=<NllLossBackward0>)
tensor(1.5415, grad_fn=<NllLossBackward0>)


 20%|██        | 3508/17426 [05:21<19:44, 11.75it/s]

tensor(1.5349, grad_fn=<NllLossBackward0>)
tensor(1.5801, grad_fn=<NllLossBackward0>)
tensor(1.5660, grad_fn=<NllLossBackward0>)


 20%|██        | 3510/17426 [05:21<19:55, 11.64it/s]

tensor(1.5862, grad_fn=<NllLossBackward0>)
tensor(1.5955, grad_fn=<NllLossBackward0>)
tensor(1.5272, grad_fn=<NllLossBackward0>)


 20%|██        | 3514/17426 [05:21<19:49, 11.70it/s]

tensor(1.6020, grad_fn=<NllLossBackward0>)
tensor(1.5972, grad_fn=<NllLossBackward0>)
tensor(1.5396, grad_fn=<NllLossBackward0>)


 20%|██        | 3516/17426 [05:22<19:45, 11.73it/s]

tensor(1.5394, grad_fn=<NllLossBackward0>)
tensor(1.5584, grad_fn=<NllLossBackward0>)
tensor(1.5718, grad_fn=<NllLossBackward0>)


 20%|██        | 3520/17426 [05:22<19:48, 11.70it/s]

tensor(1.5759, grad_fn=<NllLossBackward0>)
tensor(1.5792, grad_fn=<NllLossBackward0>)
tensor(1.5758, grad_fn=<NllLossBackward0>)


 20%|██        | 3522/17426 [05:22<19:56, 11.62it/s]

tensor(1.6043, grad_fn=<NllLossBackward0>)
tensor(1.5487, grad_fn=<NllLossBackward0>)
tensor(1.5682, grad_fn=<NllLossBackward0>)


 20%|██        | 3526/17426 [05:22<19:54, 11.64it/s]

tensor(1.5458, grad_fn=<NllLossBackward0>)
tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.5639, grad_fn=<NllLossBackward0>)


 20%|██        | 3528/17426 [05:23<20:02, 11.55it/s]

tensor(1.6226, grad_fn=<NllLossBackward0>)
tensor(1.5525, grad_fn=<NllLossBackward0>)
tensor(1.5825, grad_fn=<NllLossBackward0>)


 20%|██        | 3532/17426 [05:23<19:43, 11.74it/s]

tensor(1.5589, grad_fn=<NllLossBackward0>)
tensor(1.5515, grad_fn=<NllLossBackward0>)
tensor(1.5889, grad_fn=<NllLossBackward0>)


 20%|██        | 3534/17426 [05:23<19:46, 11.71it/s]

tensor(1.5654, grad_fn=<NllLossBackward0>)
tensor(1.6089, grad_fn=<NllLossBackward0>)
tensor(1.5967, grad_fn=<NllLossBackward0>)


 20%|██        | 3538/17426 [05:24<20:06, 11.51it/s]

tensor(1.5634, grad_fn=<NllLossBackward0>)
tensor(1.5735, grad_fn=<NllLossBackward0>)
tensor(1.5742, grad_fn=<NllLossBackward0>)


 20%|██        | 3540/17426 [05:24<20:05, 11.52it/s]

tensor(1.5758, grad_fn=<NllLossBackward0>)
tensor(1.5700, grad_fn=<NllLossBackward0>)
tensor(1.5377, grad_fn=<NllLossBackward0>)


 20%|██        | 3544/17426 [05:24<19:50, 11.66it/s]

tensor(1.5706, grad_fn=<NllLossBackward0>)
tensor(1.5760, grad_fn=<NllLossBackward0>)
tensor(1.5754, grad_fn=<NllLossBackward0>)


 20%|██        | 3546/17426 [05:24<19:51, 11.65it/s]

tensor(1.5648, grad_fn=<NllLossBackward0>)
tensor(1.5601, grad_fn=<NllLossBackward0>)
tensor(1.6138, grad_fn=<NllLossBackward0>)


 20%|██        | 3550/17426 [05:25<20:04, 11.52it/s]

tensor(1.5622, grad_fn=<NllLossBackward0>)
tensor(1.5657, grad_fn=<NllLossBackward0>)
tensor(1.5383, grad_fn=<NllLossBackward0>)


 20%|██        | 3552/17426 [05:25<20:43, 11.16it/s]

tensor(1.5993, grad_fn=<NllLossBackward0>)
tensor(1.5616, grad_fn=<NllLossBackward0>)


 20%|██        | 3554/17426 [05:25<23:34,  9.80it/s]

tensor(1.5628, grad_fn=<NllLossBackward0>)
tensor(1.5768, grad_fn=<NllLossBackward0>)


 20%|██        | 3556/17426 [05:25<25:14,  9.16it/s]

tensor(1.5955, grad_fn=<NllLossBackward0>)
tensor(1.5577, grad_fn=<NllLossBackward0>)


 20%|██        | 3558/17426 [05:26<27:51,  8.30it/s]

tensor(1.5653, grad_fn=<NllLossBackward0>)
tensor(1.6084, grad_fn=<NllLossBackward0>)


 20%|██        | 3560/17426 [05:26<28:08,  8.21it/s]

tensor(1.6305, grad_fn=<NllLossBackward0>)
tensor(1.5537, grad_fn=<NllLossBackward0>)


 20%|██        | 3562/17426 [05:26<28:34,  8.09it/s]

tensor(1.5671, grad_fn=<NllLossBackward0>)
tensor(1.5653, grad_fn=<NllLossBackward0>)


 20%|██        | 3564/17426 [05:26<28:44,  8.04it/s]

tensor(1.5210, grad_fn=<NllLossBackward0>)
tensor(1.5770, grad_fn=<NllLossBackward0>)


 20%|██        | 3566/17426 [05:27<27:26,  8.42it/s]

tensor(1.5753, grad_fn=<NllLossBackward0>)
tensor(1.5711, grad_fn=<NllLossBackward0>)


 20%|██        | 3568/17426 [05:27<29:54,  7.72it/s]

tensor(1.5560, grad_fn=<NllLossBackward0>)
tensor(1.5790, grad_fn=<NllLossBackward0>)


 20%|██        | 3570/17426 [05:27<29:32,  7.82it/s]

tensor(1.5832, grad_fn=<NllLossBackward0>)
tensor(1.5647, grad_fn=<NllLossBackward0>)


 20%|██        | 3572/17426 [05:27<29:56,  7.71it/s]

tensor(1.5357, grad_fn=<NllLossBackward0>)
tensor(1.5511, grad_fn=<NllLossBackward0>)


 21%|██        | 3574/17426 [05:28<29:59,  7.70it/s]

tensor(1.5318, grad_fn=<NllLossBackward0>)
tensor(1.5374, grad_fn=<NllLossBackward0>)


 21%|██        | 3576/17426 [05:28<31:14,  7.39it/s]

tensor(1.5654, grad_fn=<NllLossBackward0>)
tensor(1.5882, grad_fn=<NllLossBackward0>)


 21%|██        | 3579/17426 [05:28<24:13,  9.53it/s]

tensor(1.5181, grad_fn=<NllLossBackward0>)
tensor(1.6040, grad_fn=<NllLossBackward0>)
tensor(1.5675, grad_fn=<NllLossBackward0>)


 21%|██        | 3582/17426 [05:28<21:38, 10.66it/s]

tensor(1.5563, grad_fn=<NllLossBackward0>)
tensor(1.5478, grad_fn=<NllLossBackward0>)
tensor(1.5772, grad_fn=<NllLossBackward0>)


 21%|██        | 3584/17426 [05:29<21:01, 10.97it/s]

tensor(1.5217, grad_fn=<NllLossBackward0>)
tensor(1.6375, grad_fn=<NllLossBackward0>)
tensor(1.5509, grad_fn=<NllLossBackward0>)


 21%|██        | 3588/17426 [05:29<20:33, 11.22it/s]

tensor(1.5505, grad_fn=<NllLossBackward0>)
tensor(1.5512, grad_fn=<NllLossBackward0>)
tensor(1.5580, grad_fn=<NllLossBackward0>)


 21%|██        | 3590/17426 [05:29<20:25, 11.29it/s]

tensor(1.5457, grad_fn=<NllLossBackward0>)
tensor(1.5770, grad_fn=<NllLossBackward0>)
tensor(1.5443, grad_fn=<NllLossBackward0>)


 21%|██        | 3594/17426 [05:29<19:56, 11.56it/s]

tensor(1.5721, grad_fn=<NllLossBackward0>)
tensor(1.6286, grad_fn=<NllLossBackward0>)
tensor(1.5935, grad_fn=<NllLossBackward0>)


 21%|██        | 3596/17426 [05:30<19:56, 11.56it/s]

tensor(1.5795, grad_fn=<NllLossBackward0>)
tensor(1.5346, grad_fn=<NllLossBackward0>)
tensor(1.5484, grad_fn=<NllLossBackward0>)


 21%|██        | 3600/17426 [05:30<20:06, 11.46it/s]

tensor(1.5278, grad_fn=<NllLossBackward0>)
tensor(1.5843, grad_fn=<NllLossBackward0>)
tensor(1.5574, grad_fn=<NllLossBackward0>)


 21%|██        | 3602/17426 [05:30<20:08, 11.44it/s]

tensor(1.6099, grad_fn=<NllLossBackward0>)
tensor(1.5941, grad_fn=<NllLossBackward0>)
tensor(1.5783, grad_fn=<NllLossBackward0>)


 21%|██        | 3606/17426 [05:30<19:48, 11.63it/s]

tensor(1.6064, grad_fn=<NllLossBackward0>)
tensor(1.6201, grad_fn=<NllLossBackward0>)
tensor(1.5511, grad_fn=<NllLossBackward0>)


 21%|██        | 3608/17426 [05:31<19:56, 11.55it/s]

tensor(1.5873, grad_fn=<NllLossBackward0>)
tensor(1.6245, grad_fn=<NllLossBackward0>)
tensor(1.5737, grad_fn=<NllLossBackward0>)


 21%|██        | 3612/17426 [05:31<20:09, 11.42it/s]

tensor(1.5701, grad_fn=<NllLossBackward0>)
tensor(1.4988, grad_fn=<NllLossBackward0>)
tensor(1.6054, grad_fn=<NllLossBackward0>)


 21%|██        | 3614/17426 [05:31<20:15, 11.36it/s]

tensor(1.5783, grad_fn=<NllLossBackward0>)
tensor(1.5417, grad_fn=<NllLossBackward0>)
tensor(1.5285, grad_fn=<NllLossBackward0>)


 21%|██        | 3618/17426 [05:32<19:40, 11.70it/s]

tensor(1.5867, grad_fn=<NllLossBackward0>)
tensor(1.5790, grad_fn=<NllLossBackward0>)
tensor(1.5126, grad_fn=<NllLossBackward0>)


 21%|██        | 3620/17426 [05:32<20:00, 11.50it/s]

tensor(1.4911, grad_fn=<NllLossBackward0>)
tensor(1.5594, grad_fn=<NllLossBackward0>)
tensor(1.5619, grad_fn=<NllLossBackward0>)


 21%|██        | 3624/17426 [05:32<20:10, 11.40it/s]

tensor(1.5624, grad_fn=<NllLossBackward0>)
tensor(1.5668, grad_fn=<NllLossBackward0>)
tensor(1.5782, grad_fn=<NllLossBackward0>)


 21%|██        | 3626/17426 [05:32<20:10, 11.40it/s]

tensor(1.5445, grad_fn=<NllLossBackward0>)
tensor(1.5661, grad_fn=<NllLossBackward0>)
tensor(1.5220, grad_fn=<NllLossBackward0>)


 21%|██        | 3630/17426 [05:33<19:36, 11.72it/s]

tensor(1.5759, grad_fn=<NllLossBackward0>)
tensor(1.5925, grad_fn=<NllLossBackward0>)
tensor(1.5574, grad_fn=<NllLossBackward0>)


 21%|██        | 3632/17426 [05:33<19:58, 11.51it/s]

tensor(1.5511, grad_fn=<NllLossBackward0>)
tensor(1.5568, grad_fn=<NllLossBackward0>)
tensor(1.5501, grad_fn=<NllLossBackward0>)


 21%|██        | 3636/17426 [05:33<20:13, 11.36it/s]

tensor(1.5318, grad_fn=<NllLossBackward0>)
tensor(1.5541, grad_fn=<NllLossBackward0>)
tensor(1.5781, grad_fn=<NllLossBackward0>)


 21%|██        | 3638/17426 [05:33<20:29, 11.21it/s]

tensor(1.5946, grad_fn=<NllLossBackward0>)
tensor(1.5812, grad_fn=<NllLossBackward0>)
tensor(1.5669, grad_fn=<NllLossBackward0>)


 21%|██        | 3642/17426 [05:34<19:58, 11.50it/s]

tensor(1.5140, grad_fn=<NllLossBackward0>)
tensor(1.5730, grad_fn=<NllLossBackward0>)
tensor(1.5522, grad_fn=<NllLossBackward0>)


 21%|██        | 3644/17426 [05:34<20:01, 11.47it/s]

tensor(1.5323, grad_fn=<NllLossBackward0>)
tensor(1.5820, grad_fn=<NllLossBackward0>)
tensor(1.5878, grad_fn=<NllLossBackward0>)


 21%|██        | 3648/17426 [05:34<20:09, 11.39it/s]

tensor(1.6017, grad_fn=<NllLossBackward0>)
tensor(1.6089, grad_fn=<NllLossBackward0>)
tensor(1.5987, grad_fn=<NllLossBackward0>)


 21%|██        | 3650/17426 [05:34<20:01, 11.46it/s]

tensor(1.6002, grad_fn=<NllLossBackward0>)
tensor(1.5578, grad_fn=<NllLossBackward0>)
tensor(1.5604, grad_fn=<NllLossBackward0>)


 21%|██        | 3654/17426 [05:35<19:42, 11.65it/s]

tensor(1.5704, grad_fn=<NllLossBackward0>)
tensor(1.5390, grad_fn=<NllLossBackward0>)
tensor(1.6024, grad_fn=<NllLossBackward0>)


 21%|██        | 3656/17426 [05:35<19:59, 11.48it/s]

tensor(1.5535, grad_fn=<NllLossBackward0>)
tensor(1.5937, grad_fn=<NllLossBackward0>)
tensor(1.5269, grad_fn=<NllLossBackward0>)


 21%|██        | 3660/17426 [05:35<19:59, 11.47it/s]

tensor(1.5838, grad_fn=<NllLossBackward0>)
tensor(1.5343, grad_fn=<NllLossBackward0>)
tensor(1.5277, grad_fn=<NllLossBackward0>)


 21%|██        | 3662/17426 [05:35<20:00, 11.47it/s]

tensor(1.5797, grad_fn=<NllLossBackward0>)
tensor(1.5580, grad_fn=<NllLossBackward0>)
tensor(1.5352, grad_fn=<NllLossBackward0>)


 21%|██        | 3666/17426 [05:36<19:41, 11.65it/s]

tensor(1.5376, grad_fn=<NllLossBackward0>)
tensor(1.5272, grad_fn=<NllLossBackward0>)
tensor(1.6060, grad_fn=<NllLossBackward0>)


 21%|██        | 3668/17426 [05:36<19:51, 11.55it/s]

tensor(1.5153, grad_fn=<NllLossBackward0>)
tensor(1.5499, grad_fn=<NllLossBackward0>)
tensor(1.5603, grad_fn=<NllLossBackward0>)


 21%|██        | 3672/17426 [05:36<19:46, 11.59it/s]

tensor(1.6149, grad_fn=<NllLossBackward0>)
tensor(1.6123, grad_fn=<NllLossBackward0>)
tensor(1.5400, grad_fn=<NllLossBackward0>)


 21%|██        | 3674/17426 [05:36<19:45, 11.60it/s]

tensor(1.5844, grad_fn=<NllLossBackward0>)
tensor(1.5045, grad_fn=<NllLossBackward0>)
tensor(1.5616, grad_fn=<NllLossBackward0>)


 21%|██        | 3678/17426 [05:37<19:40, 11.65it/s]

tensor(1.5683, grad_fn=<NllLossBackward0>)
tensor(1.5605, grad_fn=<NllLossBackward0>)
tensor(1.6382, grad_fn=<NllLossBackward0>)


 21%|██        | 3680/17426 [05:37<19:55, 11.49it/s]

tensor(1.5349, grad_fn=<NllLossBackward0>)
tensor(1.5379, grad_fn=<NllLossBackward0>)
tensor(1.5609, grad_fn=<NllLossBackward0>)


 21%|██        | 3684/17426 [05:37<19:44, 11.60it/s]

tensor(1.5413, grad_fn=<NllLossBackward0>)
tensor(1.5498, grad_fn=<NllLossBackward0>)
tensor(1.5438, grad_fn=<NllLossBackward0>)


 21%|██        | 3686/17426 [05:37<19:59, 11.45it/s]

tensor(1.6048, grad_fn=<NllLossBackward0>)
tensor(1.6202, grad_fn=<NllLossBackward0>)
tensor(1.5576, grad_fn=<NllLossBackward0>)


 21%|██        | 3690/17426 [05:38<19:39, 11.64it/s]

tensor(1.5875, grad_fn=<NllLossBackward0>)
tensor(1.5621, grad_fn=<NllLossBackward0>)
tensor(1.5171, grad_fn=<NllLossBackward0>)


 21%|██        | 3692/17426 [05:38<22:50, 10.02it/s]

tensor(1.5355, grad_fn=<NllLossBackward0>)
tensor(1.5648, grad_fn=<NllLossBackward0>)


 21%|██        | 3694/17426 [05:38<25:20,  9.03it/s]

tensor(1.5719, grad_fn=<NllLossBackward0>)
tensor(1.5755, grad_fn=<NllLossBackward0>)


 21%|██        | 3696/17426 [05:39<27:34,  8.30it/s]

tensor(1.6405, grad_fn=<NllLossBackward0>)
tensor(1.5911, grad_fn=<NllLossBackward0>)


 21%|██        | 3698/17426 [05:39<28:14,  8.10it/s]

tensor(1.5895, grad_fn=<NllLossBackward0>)
tensor(1.6079, grad_fn=<NllLossBackward0>)


 21%|██        | 3700/17426 [05:39<28:09,  8.13it/s]

tensor(1.5340, grad_fn=<NllLossBackward0>)
tensor(1.5581, grad_fn=<NllLossBackward0>)


 21%|██        | 3702/17426 [05:39<29:11,  7.84it/s]

tensor(1.5172, grad_fn=<NllLossBackward0>)
tensor(1.6060, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3704/17426 [05:40<27:57,  8.18it/s]

tensor(1.5447, grad_fn=<NllLossBackward0>)
tensor(1.5560, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3706/17426 [05:40<29:10,  7.84it/s]

tensor(1.5846, grad_fn=<NllLossBackward0>)
tensor(1.5588, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3708/17426 [05:40<30:38,  7.46it/s]

tensor(1.5613, grad_fn=<NllLossBackward0>)
tensor(1.5837, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3710/17426 [05:40<31:21,  7.29it/s]

tensor(1.6006, grad_fn=<NllLossBackward0>)
tensor(1.6130, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3712/17426 [05:41<32:58,  6.93it/s]

tensor(1.5943, grad_fn=<NllLossBackward0>)
tensor(1.5487, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3714/17426 [05:41<31:20,  7.29it/s]

tensor(1.5215, grad_fn=<NllLossBackward0>)
tensor(1.5693, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3717/17426 [05:41<25:03,  9.12it/s]

tensor(1.5678, grad_fn=<NllLossBackward0>)
tensor(1.5436, grad_fn=<NllLossBackward0>)
tensor(1.5876, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3719/17426 [05:41<22:43, 10.05it/s]

tensor(1.5697, grad_fn=<NllLossBackward0>)
tensor(1.5882, grad_fn=<NllLossBackward0>)
tensor(1.5958, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3723/17426 [05:42<20:51, 10.95it/s]

tensor(1.6038, grad_fn=<NllLossBackward0>)
tensor(1.5766, grad_fn=<NllLossBackward0>)
tensor(1.5918, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3725/17426 [05:42<20:28, 11.15it/s]

tensor(1.5813, grad_fn=<NllLossBackward0>)
tensor(1.5703, grad_fn=<NllLossBackward0>)
tensor(1.5580, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3729/17426 [05:42<20:04, 11.37it/s]

tensor(1.5756, grad_fn=<NllLossBackward0>)
tensor(1.5240, grad_fn=<NllLossBackward0>)
tensor(1.5811, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3731/17426 [05:43<19:44, 11.56it/s]

tensor(1.5821, grad_fn=<NllLossBackward0>)
tensor(1.5880, grad_fn=<NllLossBackward0>)
tensor(1.5329, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3735/17426 [05:43<19:35, 11.65it/s]

tensor(1.5232, grad_fn=<NllLossBackward0>)
tensor(1.5243, grad_fn=<NllLossBackward0>)
tensor(1.5398, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3737/17426 [05:43<19:50, 11.50it/s]

tensor(1.5588, grad_fn=<NllLossBackward0>)
tensor(1.5166, grad_fn=<NllLossBackward0>)
tensor(1.5568, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3741/17426 [05:43<20:04, 11.36it/s]

tensor(1.5578, grad_fn=<NllLossBackward0>)
tensor(1.5670, grad_fn=<NllLossBackward0>)
tensor(1.5519, grad_fn=<NllLossBackward0>)


 21%|██▏       | 3743/17426 [05:44<20:07, 11.33it/s]

tensor(1.5392, grad_fn=<NllLossBackward0>)
tensor(1.5395, grad_fn=<NllLossBackward0>)
tensor(1.5623, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3747/17426 [05:44<19:37, 11.61it/s]

tensor(1.5497, grad_fn=<NllLossBackward0>)
tensor(1.6130, grad_fn=<NllLossBackward0>)
tensor(1.5533, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3749/17426 [05:44<19:54, 11.45it/s]

tensor(1.5687, grad_fn=<NllLossBackward0>)
tensor(1.5656, grad_fn=<NllLossBackward0>)
tensor(1.5430, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3753/17426 [05:44<19:44, 11.55it/s]

tensor(1.5761, grad_fn=<NllLossBackward0>)
tensor(1.5502, grad_fn=<NllLossBackward0>)
tensor(1.5693, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3755/17426 [05:45<19:53, 11.46it/s]

tensor(1.5628, grad_fn=<NllLossBackward0>)
tensor(1.5610, grad_fn=<NllLossBackward0>)
tensor(1.5526, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3759/17426 [05:45<19:41, 11.57it/s]

tensor(1.5611, grad_fn=<NllLossBackward0>)
tensor(1.5433, grad_fn=<NllLossBackward0>)
tensor(1.6231, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3761/17426 [05:45<19:40, 11.58it/s]

tensor(1.5458, grad_fn=<NllLossBackward0>)
tensor(1.5951, grad_fn=<NllLossBackward0>)
tensor(1.5639, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3765/17426 [05:45<19:49, 11.48it/s]

tensor(1.5459, grad_fn=<NllLossBackward0>)
tensor(1.6010, grad_fn=<NllLossBackward0>)
tensor(1.5245, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3767/17426 [05:46<19:53, 11.44it/s]

tensor(1.5814, grad_fn=<NllLossBackward0>)
tensor(1.5560, grad_fn=<NllLossBackward0>)
tensor(1.5021, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3771/17426 [05:46<19:45, 11.52it/s]

tensor(1.5544, grad_fn=<NllLossBackward0>)
tensor(1.5260, grad_fn=<NllLossBackward0>)
tensor(1.5593, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3773/17426 [05:46<19:59, 11.38it/s]

tensor(1.6306, grad_fn=<NllLossBackward0>)
tensor(1.6212, grad_fn=<NllLossBackward0>)
tensor(1.5787, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3777/17426 [05:47<19:46, 11.51it/s]

tensor(1.5896, grad_fn=<NllLossBackward0>)
tensor(1.6182, grad_fn=<NllLossBackward0>)
tensor(1.5162, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3779/17426 [05:47<19:54, 11.43it/s]

tensor(1.5640, grad_fn=<NllLossBackward0>)
tensor(1.5593, grad_fn=<NllLossBackward0>)
tensor(1.5505, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3783/17426 [05:47<19:36, 11.60it/s]

tensor(1.5206, grad_fn=<NllLossBackward0>)
tensor(1.5601, grad_fn=<NllLossBackward0>)
tensor(1.5385, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3785/17426 [05:47<19:41, 11.54it/s]

tensor(1.5887, grad_fn=<NllLossBackward0>)
tensor(1.5149, grad_fn=<NllLossBackward0>)
tensor(1.5652, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3789/17426 [05:48<19:25, 11.70it/s]

tensor(1.5798, grad_fn=<NllLossBackward0>)
tensor(1.5191, grad_fn=<NllLossBackward0>)
tensor(1.5329, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3791/17426 [05:48<19:32, 11.63it/s]

tensor(1.5328, grad_fn=<NllLossBackward0>)
tensor(1.5986, grad_fn=<NllLossBackward0>)
tensor(1.5361, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3795/17426 [05:48<19:42, 11.52it/s]

tensor(1.5700, grad_fn=<NllLossBackward0>)
tensor(1.5681, grad_fn=<NllLossBackward0>)
tensor(1.5518, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3797/17426 [05:48<19:48, 11.47it/s]

tensor(1.5686, grad_fn=<NllLossBackward0>)
tensor(1.5548, grad_fn=<NllLossBackward0>)
tensor(1.6021, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3801/17426 [05:49<19:55, 11.40it/s]

tensor(1.5584, grad_fn=<NllLossBackward0>)
tensor(1.5777, grad_fn=<NllLossBackward0>)
tensor(1.5417, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3803/17426 [05:49<19:59, 11.36it/s]

tensor(1.5557, grad_fn=<NllLossBackward0>)
tensor(1.5735, grad_fn=<NllLossBackward0>)
tensor(1.5226, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3807/17426 [05:49<19:36, 11.58it/s]

tensor(1.5601, grad_fn=<NllLossBackward0>)
tensor(1.5580, grad_fn=<NllLossBackward0>)
tensor(1.5726, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3809/17426 [05:49<19:38, 11.55it/s]

tensor(1.5631, grad_fn=<NllLossBackward0>)
tensor(1.5547, grad_fn=<NllLossBackward0>)
tensor(1.5799, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3813/17426 [05:50<19:59, 11.35it/s]

tensor(1.5754, grad_fn=<NllLossBackward0>)
tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.5698, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3815/17426 [05:50<19:54, 11.39it/s]

tensor(1.5231, grad_fn=<NllLossBackward0>)
tensor(1.5608, grad_fn=<NllLossBackward0>)
tensor(1.5752, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3819/17426 [05:50<19:30, 11.63it/s]

tensor(1.5221, grad_fn=<NllLossBackward0>)
tensor(1.5523, grad_fn=<NllLossBackward0>)
tensor(1.5511, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3821/17426 [05:50<19:40, 11.52it/s]

tensor(1.5629, grad_fn=<NllLossBackward0>)
tensor(1.5348, grad_fn=<NllLossBackward0>)
tensor(1.5395, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3825/17426 [05:51<19:44, 11.48it/s]

tensor(1.5702, grad_fn=<NllLossBackward0>)
tensor(1.5703, grad_fn=<NllLossBackward0>)
tensor(1.5632, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3827/17426 [05:51<19:54, 11.38it/s]

tensor(1.5452, grad_fn=<NllLossBackward0>)
tensor(1.6107, grad_fn=<NllLossBackward0>)
tensor(1.5740, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3829/17426 [05:51<20:56, 10.82it/s]

tensor(1.5839, grad_fn=<NllLossBackward0>)
tensor(1.5225, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3832/17426 [05:51<24:42,  9.17it/s]

tensor(1.5424, grad_fn=<NllLossBackward0>)
tensor(1.5548, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3834/17426 [05:52<26:49,  8.44it/s]

tensor(1.5607, grad_fn=<NllLossBackward0>)
tensor(1.5608, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3836/17426 [05:52<27:50,  8.13it/s]

tensor(1.5467, grad_fn=<NllLossBackward0>)
tensor(1.5565, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3838/17426 [05:52<28:55,  7.83it/s]

tensor(1.5632, grad_fn=<NllLossBackward0>)
tensor(1.5804, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3840/17426 [05:53<28:04,  8.07it/s]

tensor(1.5455, grad_fn=<NllLossBackward0>)
tensor(1.5867, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3842/17426 [05:53<28:42,  7.89it/s]

tensor(1.5423, grad_fn=<NllLossBackward0>)
tensor(1.5614, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3844/17426 [05:53<26:56,  8.40it/s]

tensor(1.6211, grad_fn=<NllLossBackward0>)
tensor(1.5481, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3846/17426 [05:53<28:39,  7.90it/s]

tensor(1.5845, grad_fn=<NllLossBackward0>)
tensor(1.5966, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3848/17426 [05:54<28:41,  7.89it/s]

tensor(1.5653, grad_fn=<NllLossBackward0>)
tensor(1.5595, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3850/17426 [05:54<30:55,  7.32it/s]

tensor(1.5587, grad_fn=<NllLossBackward0>)
tensor(1.5431, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3852/17426 [05:54<30:47,  7.35it/s]

tensor(1.5907, grad_fn=<NllLossBackward0>)
tensor(1.5726, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3854/17426 [05:54<28:54,  7.82it/s]

tensor(1.5942, grad_fn=<NllLossBackward0>)
tensor(1.5659, grad_fn=<NllLossBackward0>)
tensor(1.5510, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3858/17426 [05:55<22:19, 10.13it/s]

tensor(1.5707, grad_fn=<NllLossBackward0>)
tensor(1.5544, grad_fn=<NllLossBackward0>)
tensor(1.5809, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3860/17426 [05:55<22:41,  9.96it/s]

tensor(1.5350, grad_fn=<NllLossBackward0>)
tensor(1.5401, grad_fn=<NllLossBackward0>)
tensor(1.5599, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3864/17426 [05:55<20:43, 10.91it/s]

tensor(1.5396, grad_fn=<NllLossBackward0>)
tensor(1.5039, grad_fn=<NllLossBackward0>)
tensor(1.5788, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3866/17426 [05:55<20:21, 11.10it/s]

tensor(1.5800, grad_fn=<NllLossBackward0>)
tensor(1.5496, grad_fn=<NllLossBackward0>)
tensor(1.5784, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3870/17426 [05:56<19:45, 11.44it/s]

tensor(1.5491, grad_fn=<NllLossBackward0>)
tensor(1.5565, grad_fn=<NllLossBackward0>)
tensor(1.5358, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3872/17426 [05:56<20:15, 11.16it/s]

tensor(1.6190, grad_fn=<NllLossBackward0>)
tensor(1.5245, grad_fn=<NllLossBackward0>)
tensor(1.6087, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3876/17426 [05:56<19:43, 11.45it/s]

tensor(1.5593, grad_fn=<NllLossBackward0>)
tensor(1.6241, grad_fn=<NllLossBackward0>)
tensor(1.5609, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3878/17426 [05:56<19:36, 11.52it/s]

tensor(1.5317, grad_fn=<NllLossBackward0>)
tensor(1.5532, grad_fn=<NllLossBackward0>)
tensor(1.5346, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3882/17426 [05:57<19:36, 11.51it/s]

tensor(1.5517, grad_fn=<NllLossBackward0>)
tensor(1.5173, grad_fn=<NllLossBackward0>)
tensor(1.5894, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3884/17426 [05:57<20:07, 11.21it/s]

tensor(1.5226, grad_fn=<NllLossBackward0>)
tensor(1.6023, grad_fn=<NllLossBackward0>)
tensor(1.5790, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3888/17426 [05:57<19:40, 11.46it/s]

tensor(1.5665, grad_fn=<NllLossBackward0>)
tensor(1.5654, grad_fn=<NllLossBackward0>)
tensor(1.5507, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3890/17426 [05:58<19:44, 11.43it/s]

tensor(1.5542, grad_fn=<NllLossBackward0>)
tensor(1.5603, grad_fn=<NllLossBackward0>)
tensor(1.6046, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3894/17426 [05:58<19:27, 11.59it/s]

tensor(1.5283, grad_fn=<NllLossBackward0>)
tensor(1.5467, grad_fn=<NllLossBackward0>)
tensor(1.5750, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3896/17426 [05:58<20:05, 11.22it/s]

tensor(1.5326, grad_fn=<NllLossBackward0>)
tensor(1.5470, grad_fn=<NllLossBackward0>)
tensor(1.5662, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3900/17426 [05:58<19:35, 11.50it/s]

tensor(1.5492, grad_fn=<NllLossBackward0>)
tensor(1.5801, grad_fn=<NllLossBackward0>)
tensor(1.5294, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3902/17426 [05:59<19:31, 11.54it/s]

tensor(1.5813, grad_fn=<NllLossBackward0>)
tensor(1.5493, grad_fn=<NllLossBackward0>)
tensor(1.5075, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3906/17426 [05:59<19:32, 11.53it/s]

tensor(1.5763, grad_fn=<NllLossBackward0>)
tensor(1.5522, grad_fn=<NllLossBackward0>)
tensor(1.5922, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3908/17426 [05:59<19:58, 11.27it/s]

tensor(1.5174, grad_fn=<NllLossBackward0>)
tensor(1.5921, grad_fn=<NllLossBackward0>)
tensor(1.5691, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3912/17426 [05:59<19:43, 11.42it/s]

tensor(1.5870, grad_fn=<NllLossBackward0>)
tensor(1.5995, grad_fn=<NllLossBackward0>)
tensor(1.5779, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3914/17426 [06:00<19:48, 11.37it/s]

tensor(1.5429, grad_fn=<NllLossBackward0>)
tensor(1.5627, grad_fn=<NllLossBackward0>)
tensor(1.5482, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3918/17426 [06:00<19:19, 11.65it/s]

tensor(1.5996, grad_fn=<NllLossBackward0>)
tensor(1.5699, grad_fn=<NllLossBackward0>)
tensor(1.5345, grad_fn=<NllLossBackward0>)


 22%|██▏       | 3920/17426 [06:00<20:02, 11.23it/s]

tensor(1.5456, grad_fn=<NllLossBackward0>)
tensor(1.5520, grad_fn=<NllLossBackward0>)
tensor(1.5583, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3924/17426 [06:01<19:33, 11.50it/s]

tensor(1.5771, grad_fn=<NllLossBackward0>)
tensor(1.6198, grad_fn=<NllLossBackward0>)
tensor(1.5606, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3926/17426 [06:01<19:41, 11.43it/s]

tensor(1.5546, grad_fn=<NllLossBackward0>)
tensor(1.5925, grad_fn=<NllLossBackward0>)
tensor(1.5838, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3930/17426 [06:01<19:22, 11.61it/s]

tensor(1.5688, grad_fn=<NllLossBackward0>)
tensor(1.5605, grad_fn=<NllLossBackward0>)
tensor(1.5955, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3932/17426 [06:01<19:56, 11.28it/s]

tensor(1.5184, grad_fn=<NllLossBackward0>)
tensor(1.5791, grad_fn=<NllLossBackward0>)
tensor(1.5521, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3936/17426 [06:02<19:35, 11.48it/s]

tensor(1.5403, grad_fn=<NllLossBackward0>)
tensor(1.5457, grad_fn=<NllLossBackward0>)
tensor(1.5732, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3938/17426 [06:02<19:35, 11.47it/s]

tensor(1.5377, grad_fn=<NllLossBackward0>)
tensor(1.5218, grad_fn=<NllLossBackward0>)
tensor(1.5340, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3942/17426 [06:02<19:19, 11.63it/s]

tensor(1.6408, grad_fn=<NllLossBackward0>)
tensor(1.5764, grad_fn=<NllLossBackward0>)
tensor(1.5631, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3944/17426 [06:02<20:07, 11.16it/s]

tensor(1.5432, grad_fn=<NllLossBackward0>)
tensor(1.5890, grad_fn=<NllLossBackward0>)
tensor(1.5928, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3948/17426 [06:03<19:26, 11.55it/s]

tensor(1.5116, grad_fn=<NllLossBackward0>)
tensor(1.5781, grad_fn=<NllLossBackward0>)
tensor(1.5464, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3950/17426 [06:03<19:28, 11.53it/s]

tensor(1.6021, grad_fn=<NllLossBackward0>)
tensor(1.5488, grad_fn=<NllLossBackward0>)
tensor(1.5344, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3954/17426 [06:03<19:14, 11.67it/s]

tensor(1.5554, grad_fn=<NllLossBackward0>)
tensor(1.5483, grad_fn=<NllLossBackward0>)
tensor(1.6099, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3956/17426 [06:03<19:53, 11.29it/s]

tensor(1.5202, grad_fn=<NllLossBackward0>)
tensor(1.6023, grad_fn=<NllLossBackward0>)
tensor(1.5426, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3960/17426 [06:04<19:24, 11.57it/s]

tensor(1.5674, grad_fn=<NllLossBackward0>)
tensor(1.4553, grad_fn=<NllLossBackward0>)
tensor(1.5638, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3962/17426 [06:04<19:36, 11.45it/s]

tensor(1.5455, grad_fn=<NllLossBackward0>)
tensor(1.6082, grad_fn=<NllLossBackward0>)
tensor(1.5558, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3966/17426 [06:04<19:58, 11.23it/s]

tensor(1.5084, grad_fn=<NllLossBackward0>)
tensor(1.5509, grad_fn=<NllLossBackward0>)
tensor(1.5875, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3968/17426 [06:04<21:41, 10.34it/s]

tensor(1.5505, grad_fn=<NllLossBackward0>)
tensor(1.5447, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3970/17426 [06:05<23:59,  9.35it/s]

tensor(1.6073, grad_fn=<NllLossBackward0>)
tensor(1.5611, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3972/17426 [06:05<24:58,  8.98it/s]

tensor(1.5699, grad_fn=<NllLossBackward0>)
tensor(1.5345, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3974/17426 [06:05<25:54,  8.65it/s]

tensor(1.5140, grad_fn=<NllLossBackward0>)
tensor(1.5761, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3976/17426 [06:05<26:53,  8.33it/s]

tensor(1.5758, grad_fn=<NllLossBackward0>)
tensor(1.5938, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3978/17426 [06:06<26:48,  8.36it/s]

tensor(1.5950, grad_fn=<NllLossBackward0>)
tensor(1.5987, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3980/17426 [06:06<28:13,  7.94it/s]

tensor(1.5536, grad_fn=<NllLossBackward0>)
tensor(1.6103, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3982/17426 [06:06<28:51,  7.76it/s]

tensor(1.5342, grad_fn=<NllLossBackward0>)
tensor(1.5553, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3984/17426 [06:06<30:36,  7.32it/s]

tensor(1.5625, grad_fn=<NllLossBackward0>)
tensor(1.6083, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3986/17426 [06:07<30:49,  7.26it/s]

tensor(1.5615, grad_fn=<NllLossBackward0>)
tensor(1.5446, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3988/17426 [06:07<32:07,  6.97it/s]

tensor(1.6056, grad_fn=<NllLossBackward0>)
tensor(1.5761, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3990/17426 [06:07<32:11,  6.96it/s]

tensor(1.5757, grad_fn=<NllLossBackward0>)
tensor(1.5992, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3993/17426 [06:08<25:20,  8.84it/s]

tensor(1.5481, grad_fn=<NllLossBackward0>)
tensor(1.5613, grad_fn=<NllLossBackward0>)
tensor(1.5737, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3996/17426 [06:08<21:58, 10.19it/s]

tensor(1.5823, grad_fn=<NllLossBackward0>)
tensor(1.5202, grad_fn=<NllLossBackward0>)
tensor(1.5728, grad_fn=<NllLossBackward0>)


 23%|██▎       | 3998/17426 [06:08<21:13, 10.54it/s]

tensor(1.5766, grad_fn=<NllLossBackward0>)
tensor(1.5504, grad_fn=<NllLossBackward0>)
tensor(1.6220, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4002/17426 [06:08<19:59, 11.19it/s]

tensor(1.5771, grad_fn=<NllLossBackward0>)
tensor(1.5297, grad_fn=<NllLossBackward0>)
tensor(1.5816, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4004/17426 [06:09<21:19, 10.49it/s]

tensor(1.5668, grad_fn=<NllLossBackward0>)
tensor(1.5395, grad_fn=<NllLossBackward0>)
tensor(1.5851, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4008/17426 [06:09<20:03, 11.15it/s]

tensor(1.5417, grad_fn=<NllLossBackward0>)
tensor(1.5563, grad_fn=<NllLossBackward0>)
tensor(1.5777, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4010/17426 [06:09<20:06, 11.12it/s]

tensor(1.5548, grad_fn=<NllLossBackward0>)
tensor(1.5668, grad_fn=<NllLossBackward0>)
tensor(1.5755, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4014/17426 [06:10<20:05, 11.12it/s]

tensor(1.5551, grad_fn=<NllLossBackward0>)
tensor(1.5721, grad_fn=<NllLossBackward0>)
tensor(1.5959, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4016/17426 [06:10<20:00, 11.17it/s]

tensor(1.5594, grad_fn=<NllLossBackward0>)
tensor(1.5704, grad_fn=<NllLossBackward0>)
tensor(1.5847, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4020/17426 [06:10<19:46, 11.30it/s]

tensor(1.5467, grad_fn=<NllLossBackward0>)
tensor(1.5334, grad_fn=<NllLossBackward0>)
tensor(1.5505, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4022/17426 [06:10<19:47, 11.29it/s]

tensor(1.5763, grad_fn=<NllLossBackward0>)
tensor(1.5503, grad_fn=<NllLossBackward0>)
tensor(1.5829, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4026/17426 [06:11<19:37, 11.38it/s]

tensor(1.5584, grad_fn=<NllLossBackward0>)
tensor(1.5656, grad_fn=<NllLossBackward0>)
tensor(1.5263, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4028/17426 [06:11<19:50, 11.25it/s]

tensor(1.4948, grad_fn=<NllLossBackward0>)
tensor(1.5514, grad_fn=<NllLossBackward0>)
tensor(1.5547, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4032/17426 [06:11<19:40, 11.35it/s]

tensor(1.5558, grad_fn=<NllLossBackward0>)
tensor(1.6412, grad_fn=<NllLossBackward0>)
tensor(1.5677, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4034/17426 [06:11<19:56, 11.20it/s]

tensor(1.5588, grad_fn=<NllLossBackward0>)
tensor(1.5587, grad_fn=<NllLossBackward0>)
tensor(1.6034, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4038/17426 [06:12<19:57, 11.18it/s]

tensor(1.5678, grad_fn=<NllLossBackward0>)
tensor(1.5353, grad_fn=<NllLossBackward0>)
tensor(1.5775, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4040/17426 [06:12<19:39, 11.35it/s]

tensor(1.5097, grad_fn=<NllLossBackward0>)
tensor(1.5802, grad_fn=<NllLossBackward0>)
tensor(1.5495, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4044/17426 [06:12<19:16, 11.57it/s]

tensor(1.5202, grad_fn=<NllLossBackward0>)
tensor(1.5570, grad_fn=<NllLossBackward0>)
tensor(1.5365, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4046/17426 [06:12<19:20, 11.53it/s]

tensor(1.4963, grad_fn=<NllLossBackward0>)
tensor(1.5412, grad_fn=<NllLossBackward0>)
tensor(1.5366, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4050/17426 [06:13<19:39, 11.34it/s]

tensor(1.5455, grad_fn=<NllLossBackward0>)
tensor(1.5239, grad_fn=<NllLossBackward0>)
tensor(1.5442, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4052/17426 [06:13<19:49, 11.24it/s]

tensor(1.5682, grad_fn=<NllLossBackward0>)
tensor(1.5418, grad_fn=<NllLossBackward0>)
tensor(1.5149, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4056/17426 [06:13<19:20, 11.52it/s]

tensor(1.5504, grad_fn=<NllLossBackward0>)
tensor(1.5499, grad_fn=<NllLossBackward0>)
tensor(1.5619, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4058/17426 [06:13<19:39, 11.33it/s]

tensor(1.5301, grad_fn=<NllLossBackward0>)
tensor(1.5282, grad_fn=<NllLossBackward0>)
tensor(1.5941, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4062/17426 [06:14<19:48, 11.25it/s]

tensor(1.5496, grad_fn=<NllLossBackward0>)
tensor(1.5821, grad_fn=<NllLossBackward0>)
tensor(1.5612, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4064/17426 [06:14<19:41, 11.31it/s]

tensor(1.5767, grad_fn=<NllLossBackward0>)
tensor(1.5720, grad_fn=<NllLossBackward0>)
tensor(1.5404, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4068/17426 [06:14<18:58, 11.73it/s]

tensor(1.6021, grad_fn=<NllLossBackward0>)
tensor(1.5636, grad_fn=<NllLossBackward0>)
tensor(1.5442, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4070/17426 [06:14<19:15, 11.56it/s]

tensor(1.5731, grad_fn=<NllLossBackward0>)
tensor(1.5363, grad_fn=<NllLossBackward0>)
tensor(1.5829, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4074/17426 [06:15<19:29, 11.41it/s]

tensor(1.5740, grad_fn=<NllLossBackward0>)
tensor(1.5426, grad_fn=<NllLossBackward0>)
tensor(1.5246, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4076/17426 [06:15<19:29, 11.41it/s]

tensor(1.5770, grad_fn=<NllLossBackward0>)
tensor(1.5121, grad_fn=<NllLossBackward0>)
tensor(1.5917, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4080/17426 [06:15<19:15, 11.55it/s]

tensor(1.5409, grad_fn=<NllLossBackward0>)
tensor(1.5651, grad_fn=<NllLossBackward0>)
tensor(1.5978, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4082/17426 [06:16<19:14, 11.55it/s]

tensor(1.5216, grad_fn=<NllLossBackward0>)
tensor(1.5723, grad_fn=<NllLossBackward0>)
tensor(1.5824, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4086/17426 [06:16<19:28, 11.41it/s]

tensor(1.5372, grad_fn=<NllLossBackward0>)
tensor(1.5852, grad_fn=<NllLossBackward0>)
tensor(1.5554, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4088/17426 [06:16<19:41, 11.29it/s]

tensor(1.5369, grad_fn=<NllLossBackward0>)
tensor(1.5811, grad_fn=<NllLossBackward0>)
tensor(1.5641, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4092/17426 [06:16<19:22, 11.47it/s]

tensor(1.5815, grad_fn=<NllLossBackward0>)
tensor(1.5636, grad_fn=<NllLossBackward0>)
tensor(1.5224, grad_fn=<NllLossBackward0>)


 23%|██▎       | 4094/17426 [06:17<19:23, 11.46it/s]

tensor(1.5628, grad_fn=<NllLossBackward0>)
tensor(1.5562, grad_fn=<NllLossBackward0>)
tensor(1.5319, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4098/17426 [06:17<19:27, 11.42it/s]

tensor(1.5363, grad_fn=<NllLossBackward0>)
tensor(1.5683, grad_fn=<NllLossBackward0>)
tensor(1.5692, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4100/17426 [06:17<19:19, 11.49it/s]

tensor(1.5708, grad_fn=<NllLossBackward0>)
tensor(1.5754, grad_fn=<NllLossBackward0>)
tensor(1.5580, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4104/17426 [06:17<19:23, 11.45it/s]

tensor(1.5232, grad_fn=<NllLossBackward0>)
tensor(1.5640, grad_fn=<NllLossBackward0>)
tensor(1.5800, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4106/17426 [06:18<22:52,  9.71it/s]

tensor(1.6111, grad_fn=<NllLossBackward0>)
tensor(1.5770, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4108/17426 [06:18<24:31,  9.05it/s]

tensor(1.5946, grad_fn=<NllLossBackward0>)
tensor(1.5634, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4110/17426 [06:18<25:33,  8.68it/s]

tensor(1.5521, grad_fn=<NllLossBackward0>)
tensor(1.5861, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4112/17426 [06:18<26:42,  8.31it/s]

tensor(1.5553, grad_fn=<NllLossBackward0>)
tensor(1.5338, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4114/17426 [06:19<27:56,  7.94it/s]

tensor(1.6023, grad_fn=<NllLossBackward0>)
tensor(1.5321, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4116/17426 [06:19<29:13,  7.59it/s]

tensor(1.5144, grad_fn=<NllLossBackward0>)
tensor(1.5667, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4118/17426 [06:19<27:13,  8.15it/s]

tensor(1.5187, grad_fn=<NllLossBackward0>)
tensor(1.5632, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4120/17426 [06:20<27:30,  8.06it/s]

tensor(1.5422, grad_fn=<NllLossBackward0>)
tensor(1.5603, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4122/17426 [06:20<29:05,  7.62it/s]

tensor(1.5467, grad_fn=<NllLossBackward0>)
tensor(1.5568, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4124/17426 [06:20<29:47,  7.44it/s]

tensor(1.5985, grad_fn=<NllLossBackward0>)
tensor(1.5250, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4126/17426 [06:20<30:13,  7.33it/s]

tensor(1.5818, grad_fn=<NllLossBackward0>)
tensor(1.5668, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4128/17426 [06:21<29:34,  7.49it/s]

tensor(1.5617, grad_fn=<NllLossBackward0>)
tensor(1.5433, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4131/17426 [06:21<23:52,  9.28it/s]

tensor(1.5769, grad_fn=<NllLossBackward0>)
tensor(1.5693, grad_fn=<NllLossBackward0>)
tensor(1.5360, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4134/17426 [06:21<21:50, 10.14it/s]

tensor(1.4898, grad_fn=<NllLossBackward0>)
tensor(1.5768, grad_fn=<NllLossBackward0>)
tensor(1.5732, grad_fn=<NllLossBackward0>)


 24%|██▎       | 4136/17426 [06:21<20:53, 10.60it/s]

tensor(1.5499, grad_fn=<NllLossBackward0>)
tensor(1.5879, grad_fn=<NllLossBackward0>)
tensor(1.5396, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4140/17426 [06:22<19:38, 11.27it/s]

tensor(1.5213, grad_fn=<NllLossBackward0>)
tensor(1.5429, grad_fn=<NllLossBackward0>)
tensor(1.5620, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4142/17426 [06:22<19:25, 11.39it/s]

tensor(1.5937, grad_fn=<NllLossBackward0>)
tensor(1.5325, grad_fn=<NllLossBackward0>)
tensor(1.5778, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4146/17426 [06:22<19:20, 11.44it/s]

tensor(1.5941, grad_fn=<NllLossBackward0>)
tensor(1.5509, grad_fn=<NllLossBackward0>)
tensor(1.5128, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4148/17426 [06:22<19:22, 11.43it/s]

tensor(1.5962, grad_fn=<NllLossBackward0>)
tensor(1.5562, grad_fn=<NllLossBackward0>)
tensor(1.5438, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4152/17426 [06:23<19:11, 11.53it/s]

tensor(1.5361, grad_fn=<NllLossBackward0>)
tensor(1.5586, grad_fn=<NllLossBackward0>)
tensor(1.6178, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4154/17426 [06:23<19:26, 11.38it/s]

tensor(1.5745, grad_fn=<NllLossBackward0>)
tensor(1.5331, grad_fn=<NllLossBackward0>)
tensor(1.5979, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4158/17426 [06:23<19:28, 11.36it/s]

tensor(1.5601, grad_fn=<NllLossBackward0>)
tensor(1.5918, grad_fn=<NllLossBackward0>)
tensor(1.5192, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4160/17426 [06:23<19:25, 11.38it/s]

tensor(1.5289, grad_fn=<NllLossBackward0>)
tensor(1.5924, grad_fn=<NllLossBackward0>)
tensor(1.5566, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4164/17426 [06:24<19:08, 11.54it/s]

tensor(1.5226, grad_fn=<NllLossBackward0>)
tensor(1.5731, grad_fn=<NllLossBackward0>)
tensor(1.5496, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4166/17426 [06:24<19:13, 11.50it/s]

tensor(1.5839, grad_fn=<NllLossBackward0>)
tensor(1.5623, grad_fn=<NllLossBackward0>)
tensor(1.5469, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4170/17426 [06:24<19:00, 11.62it/s]

tensor(1.5740, grad_fn=<NllLossBackward0>)
tensor(1.5550, grad_fn=<NllLossBackward0>)
tensor(1.5541, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4172/17426 [06:25<19:14, 11.48it/s]

tensor(1.5380, grad_fn=<NllLossBackward0>)
tensor(1.6285, grad_fn=<NllLossBackward0>)
tensor(1.5761, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4176/17426 [06:25<19:10, 11.52it/s]

tensor(1.5923, grad_fn=<NllLossBackward0>)
tensor(1.5320, grad_fn=<NllLossBackward0>)
tensor(1.5487, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4178/17426 [06:25<19:33, 11.28it/s]

tensor(1.5727, grad_fn=<NllLossBackward0>)
tensor(1.5439, grad_fn=<NllLossBackward0>)
tensor(1.5521, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4182/17426 [06:25<19:16, 11.45it/s]

tensor(1.5244, grad_fn=<NllLossBackward0>)
tensor(1.5980, grad_fn=<NllLossBackward0>)
tensor(1.5970, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4184/17426 [06:26<19:27, 11.34it/s]

tensor(1.4964, grad_fn=<NllLossBackward0>)
tensor(1.5618, grad_fn=<NllLossBackward0>)
tensor(1.5585, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4188/17426 [06:26<19:08, 11.53it/s]

tensor(1.5560, grad_fn=<NllLossBackward0>)
tensor(1.5789, grad_fn=<NllLossBackward0>)
tensor(1.5405, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4190/17426 [06:26<19:16, 11.44it/s]

tensor(1.5948, grad_fn=<NllLossBackward0>)
tensor(1.4978, grad_fn=<NllLossBackward0>)
tensor(1.5261, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4194/17426 [06:26<19:21, 11.39it/s]

tensor(1.5411, grad_fn=<NllLossBackward0>)
tensor(1.5727, grad_fn=<NllLossBackward0>)
tensor(1.5030, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4196/17426 [06:27<19:30, 11.30it/s]

tensor(1.5275, grad_fn=<NllLossBackward0>)
tensor(1.5650, grad_fn=<NllLossBackward0>)
tensor(1.5953, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4200/17426 [06:27<19:08, 11.52it/s]

tensor(1.6052, grad_fn=<NllLossBackward0>)
tensor(1.5465, grad_fn=<NllLossBackward0>)
tensor(1.5571, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4202/17426 [06:27<19:18, 11.41it/s]

tensor(1.5205, grad_fn=<NllLossBackward0>)
tensor(1.5247, grad_fn=<NllLossBackward0>)
tensor(1.5277, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4206/17426 [06:28<19:32, 11.27it/s]

tensor(1.5721, grad_fn=<NllLossBackward0>)
tensor(1.5588, grad_fn=<NllLossBackward0>)
tensor(1.5803, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4208/17426 [06:28<19:31, 11.29it/s]

tensor(1.5425, grad_fn=<NllLossBackward0>)
tensor(1.5633, grad_fn=<NllLossBackward0>)
tensor(1.5848, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4212/17426 [06:28<19:13, 11.46it/s]

tensor(1.5368, grad_fn=<NllLossBackward0>)
tensor(1.5801, grad_fn=<NllLossBackward0>)
tensor(1.5450, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4214/17426 [06:28<19:17, 11.42it/s]

tensor(1.5881, grad_fn=<NllLossBackward0>)
tensor(1.5288, grad_fn=<NllLossBackward0>)
tensor(1.5699, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4218/17426 [06:29<19:07, 11.51it/s]

tensor(1.5776, grad_fn=<NllLossBackward0>)
tensor(1.5574, grad_fn=<NllLossBackward0>)
tensor(1.5565, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4220/17426 [06:29<19:18, 11.40it/s]

tensor(1.5425, grad_fn=<NllLossBackward0>)
tensor(1.5456, grad_fn=<NllLossBackward0>)
tensor(1.5824, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4224/17426 [06:29<19:10, 11.48it/s]

tensor(1.5206, grad_fn=<NllLossBackward0>)
tensor(1.5335, grad_fn=<NllLossBackward0>)
tensor(1.5722, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4226/17426 [06:29<19:09, 11.48it/s]

tensor(1.5678, grad_fn=<NllLossBackward0>)
tensor(1.5558, grad_fn=<NllLossBackward0>)
tensor(1.5992, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4230/17426 [06:30<19:05, 11.52it/s]

tensor(1.5375, grad_fn=<NllLossBackward0>)
tensor(1.5560, grad_fn=<NllLossBackward0>)
tensor(1.5608, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4232/17426 [06:30<19:13, 11.44it/s]

tensor(1.5212, grad_fn=<NllLossBackward0>)
tensor(1.5857, grad_fn=<NllLossBackward0>)
tensor(1.5784, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4236/17426 [06:30<18:55, 11.61it/s]

tensor(1.5321, grad_fn=<NllLossBackward0>)
tensor(1.6077, grad_fn=<NllLossBackward0>)
tensor(1.5394, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4238/17426 [06:30<19:02, 11.54it/s]

tensor(1.5784, grad_fn=<NllLossBackward0>)
tensor(1.5241, grad_fn=<NllLossBackward0>)
tensor(1.5472, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4242/17426 [06:31<19:16, 11.40it/s]

tensor(1.5521, grad_fn=<NllLossBackward0>)
tensor(1.5258, grad_fn=<NllLossBackward0>)
tensor(1.5530, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4244/17426 [06:31<22:32,  9.75it/s]

tensor(1.5401, grad_fn=<NllLossBackward0>)
tensor(1.5564, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4246/17426 [06:31<23:52,  9.20it/s]

tensor(1.5891, grad_fn=<NllLossBackward0>)
tensor(1.5448, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4248/17426 [06:31<25:45,  8.53it/s]

tensor(1.5588, grad_fn=<NllLossBackward0>)
tensor(1.5135, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4250/17426 [06:32<25:52,  8.49it/s]

tensor(1.5470, grad_fn=<NllLossBackward0>)
tensor(1.5617, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4252/17426 [06:32<25:46,  8.52it/s]

tensor(1.5164, grad_fn=<NllLossBackward0>)
tensor(1.5388, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4254/17426 [06:32<25:54,  8.47it/s]

tensor(1.5283, grad_fn=<NllLossBackward0>)
tensor(1.5728, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4256/17426 [06:32<26:08,  8.40it/s]

tensor(1.5653, grad_fn=<NllLossBackward0>)
tensor(1.5586, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4258/17426 [06:33<26:23,  8.31it/s]

tensor(1.5741, grad_fn=<NllLossBackward0>)
tensor(1.5164, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4260/17426 [06:33<29:14,  7.50it/s]

tensor(1.5341, grad_fn=<NllLossBackward0>)
tensor(1.5608, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4262/17426 [06:33<28:44,  7.63it/s]

tensor(1.5410, grad_fn=<NllLossBackward0>)
tensor(1.5468, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4264/17426 [06:33<28:12,  7.77it/s]

tensor(1.5611, grad_fn=<NllLossBackward0>)
tensor(1.5465, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4266/17426 [06:34<28:42,  7.64it/s]

tensor(1.5214, grad_fn=<NllLossBackward0>)
tensor(1.5270, grad_fn=<NllLossBackward0>)


 24%|██▍       | 4268/17426 [06:34<29:57,  7.32it/s]

tensor(1.5902, grad_fn=<NllLossBackward0>)
tensor(1.5310, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4271/17426 [06:34<23:17,  9.41it/s]

tensor(1.5482, grad_fn=<NllLossBackward0>)
tensor(1.5138, grad_fn=<NllLossBackward0>)
tensor(1.5480, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4274/17426 [06:35<20:58, 10.45it/s]

tensor(1.5557, grad_fn=<NllLossBackward0>)
tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.5796, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4276/17426 [06:35<21:00, 10.44it/s]

tensor(1.5528, grad_fn=<NllLossBackward0>)
tensor(1.5566, grad_fn=<NllLossBackward0>)
tensor(1.5126, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4280/17426 [06:35<19:30, 11.23it/s]

tensor(1.5685, grad_fn=<NllLossBackward0>)
tensor(1.5198, grad_fn=<NllLossBackward0>)
tensor(1.5853, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4282/17426 [06:35<19:26, 11.27it/s]

tensor(1.5399, grad_fn=<NllLossBackward0>)
tensor(1.5605, grad_fn=<NllLossBackward0>)
tensor(1.5075, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4286/17426 [06:36<18:57, 11.56it/s]

tensor(1.4992, grad_fn=<NllLossBackward0>)
tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.5358, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4288/17426 [06:36<19:51, 11.03it/s]

tensor(1.5592, grad_fn=<NllLossBackward0>)
tensor(1.5371, grad_fn=<NllLossBackward0>)
tensor(1.5260, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4292/17426 [06:36<19:01, 11.50it/s]

tensor(1.5218, grad_fn=<NllLossBackward0>)
tensor(1.5727, grad_fn=<NllLossBackward0>)
tensor(1.5058, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4294/17426 [06:36<19:03, 11.49it/s]

tensor(1.5977, grad_fn=<NllLossBackward0>)
tensor(1.5579, grad_fn=<NllLossBackward0>)
tensor(1.5906, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4298/17426 [06:37<19:01, 11.50it/s]

tensor(1.5404, grad_fn=<NllLossBackward0>)
tensor(1.5596, grad_fn=<NllLossBackward0>)
tensor(1.5447, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4300/17426 [06:37<19:38, 11.14it/s]

tensor(1.5656, grad_fn=<NllLossBackward0>)
tensor(1.5282, grad_fn=<NllLossBackward0>)
tensor(1.5149, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4304/17426 [06:37<19:13, 11.37it/s]

tensor(1.5581, grad_fn=<NllLossBackward0>)
tensor(1.5514, grad_fn=<NllLossBackward0>)
tensor(1.5266, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4306/17426 [06:37<19:23, 11.28it/s]

tensor(1.5737, grad_fn=<NllLossBackward0>)
tensor(1.5871, grad_fn=<NllLossBackward0>)
tensor(1.5364, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4310/17426 [06:38<19:09, 11.41it/s]

tensor(1.5734, grad_fn=<NllLossBackward0>)
tensor(1.5341, grad_fn=<NllLossBackward0>)
tensor(1.5507, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4312/17426 [06:38<19:33, 11.18it/s]

tensor(1.5324, grad_fn=<NllLossBackward0>)
tensor(1.5029, grad_fn=<NllLossBackward0>)
tensor(1.5161, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4316/17426 [06:38<18:55, 11.55it/s]

tensor(1.5811, grad_fn=<NllLossBackward0>)
tensor(1.5474, grad_fn=<NllLossBackward0>)
tensor(1.5511, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4318/17426 [06:38<19:03, 11.46it/s]

tensor(1.5445, grad_fn=<NllLossBackward0>)
tensor(1.5718, grad_fn=<NllLossBackward0>)
tensor(1.5263, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4320/17426 [06:39<19:11, 11.38it/s]

tensor(1.5469, grad_fn=<NllLossBackward0>)
tensor(1.5395, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4324/17426 [06:39<19:39, 11.10it/s]

tensor(1.4978, grad_fn=<NllLossBackward0>)
tensor(1.5565, grad_fn=<NllLossBackward0>)
tensor(1.5437, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4326/17426 [06:39<19:33, 11.16it/s]

tensor(1.5345, grad_fn=<NllLossBackward0>)
tensor(1.4659, grad_fn=<NllLossBackward0>)
tensor(1.5869, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4330/17426 [06:39<19:04, 11.45it/s]

tensor(1.5567, grad_fn=<NllLossBackward0>)
tensor(1.5584, grad_fn=<NllLossBackward0>)
tensor(1.5942, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4332/17426 [06:40<19:15, 11.33it/s]

tensor(1.5121, grad_fn=<NllLossBackward0>)
tensor(1.5739, grad_fn=<NllLossBackward0>)
tensor(1.5686, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4336/17426 [06:40<19:14, 11.34it/s]

tensor(1.5369, grad_fn=<NllLossBackward0>)
tensor(1.5738, grad_fn=<NllLossBackward0>)
tensor(1.6132, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4338/17426 [06:40<19:24, 11.24it/s]

tensor(1.5606, grad_fn=<NllLossBackward0>)
tensor(1.5430, grad_fn=<NllLossBackward0>)
tensor(1.5867, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4342/17426 [06:41<18:43, 11.65it/s]

tensor(1.5185, grad_fn=<NllLossBackward0>)
tensor(1.5311, grad_fn=<NllLossBackward0>)
tensor(1.5812, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4344/17426 [06:41<19:13, 11.34it/s]

tensor(1.5042, grad_fn=<NllLossBackward0>)
tensor(1.5498, grad_fn=<NllLossBackward0>)
tensor(1.5555, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4348/17426 [06:41<19:16, 11.31it/s]

tensor(1.5678, grad_fn=<NllLossBackward0>)
tensor(1.5680, grad_fn=<NllLossBackward0>)
tensor(1.5858, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4350/17426 [06:41<19:10, 11.37it/s]

tensor(1.5680, grad_fn=<NllLossBackward0>)
tensor(1.5875, grad_fn=<NllLossBackward0>)
tensor(1.5070, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4354/17426 [06:42<19:01, 11.45it/s]

tensor(1.5579, grad_fn=<NllLossBackward0>)
tensor(1.5846, grad_fn=<NllLossBackward0>)
tensor(1.5720, grad_fn=<NllLossBackward0>)


 25%|██▍       | 4356/17426 [06:42<19:11, 11.35it/s]

tensor(1.5190, grad_fn=<NllLossBackward0>)
tensor(1.5841, grad_fn=<NllLossBackward0>)
tensor(1.5530, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4360/17426 [06:42<19:18, 11.28it/s]

tensor(1.5435, grad_fn=<NllLossBackward0>)
tensor(1.5101, grad_fn=<NllLossBackward0>)
tensor(1.5678, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4362/17426 [06:42<19:18, 11.28it/s]

tensor(1.5990, grad_fn=<NllLossBackward0>)
tensor(1.5770, grad_fn=<NllLossBackward0>)
tensor(1.5568, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4366/17426 [06:43<19:10, 11.36it/s]

tensor(1.5240, grad_fn=<NllLossBackward0>)
tensor(1.5421, grad_fn=<NllLossBackward0>)
tensor(1.5539, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4368/17426 [06:43<19:13, 11.32it/s]

tensor(1.5249, grad_fn=<NllLossBackward0>)
tensor(1.5586, grad_fn=<NllLossBackward0>)
tensor(1.5510, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4372/17426 [06:43<19:12, 11.33it/s]

tensor(1.5460, grad_fn=<NllLossBackward0>)
tensor(1.5709, grad_fn=<NllLossBackward0>)
tensor(1.5056, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4374/17426 [06:43<19:05, 11.39it/s]

tensor(1.5485, grad_fn=<NllLossBackward0>)
tensor(1.5263, grad_fn=<NllLossBackward0>)
tensor(1.5991, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4378/17426 [06:44<18:51, 11.53it/s]

tensor(1.5167, grad_fn=<NllLossBackward0>)
tensor(1.5652, grad_fn=<NllLossBackward0>)
tensor(1.5618, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4380/17426 [06:44<19:01, 11.43it/s]

tensor(1.5615, grad_fn=<NllLossBackward0>)
tensor(1.5422, grad_fn=<NllLossBackward0>)
tensor(1.5402, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4382/17426 [06:44<21:25, 10.14it/s]

tensor(1.5145, grad_fn=<NllLossBackward0>)
tensor(1.5463, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4385/17426 [06:45<24:36,  8.83it/s]

tensor(1.4930, grad_fn=<NllLossBackward0>)
tensor(1.5531, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4387/17426 [06:45<25:38,  8.48it/s]

tensor(1.5391, grad_fn=<NllLossBackward0>)
tensor(1.5195, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4389/17426 [06:45<27:13,  7.98it/s]

tensor(1.5795, grad_fn=<NllLossBackward0>)
tensor(1.4921, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4391/17426 [06:45<28:02,  7.75it/s]

tensor(1.5620, grad_fn=<NllLossBackward0>)
tensor(1.5409, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4393/17426 [06:46<27:44,  7.83it/s]

tensor(1.5116, grad_fn=<NllLossBackward0>)
tensor(1.5620, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4395/17426 [06:46<28:51,  7.53it/s]

tensor(1.5291, grad_fn=<NllLossBackward0>)
tensor(1.5536, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4397/17426 [06:46<28:51,  7.52it/s]

tensor(1.5244, grad_fn=<NllLossBackward0>)
tensor(1.5629, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4399/17426 [06:46<30:04,  7.22it/s]

tensor(1.5685, grad_fn=<NllLossBackward0>)
tensor(1.5529, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4401/17426 [06:47<31:01,  7.00it/s]

tensor(1.5442, grad_fn=<NllLossBackward0>)
tensor(1.5203, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4403/17426 [06:47<29:49,  7.28it/s]

tensor(1.5453, grad_fn=<NllLossBackward0>)
tensor(1.5472, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4406/17426 [06:47<26:04,  8.32it/s]

tensor(1.5293, grad_fn=<NllLossBackward0>)
tensor(1.5184, grad_fn=<NllLossBackward0>)
tensor(1.5817, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4408/17426 [06:48<23:11,  9.36it/s]

tensor(1.5526, grad_fn=<NllLossBackward0>)
tensor(1.5075, grad_fn=<NllLossBackward0>)
tensor(1.5711, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4412/17426 [06:48<20:13, 10.73it/s]

tensor(1.5294, grad_fn=<NllLossBackward0>)
tensor(1.5330, grad_fn=<NllLossBackward0>)
tensor(1.5919, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4414/17426 [06:48<20:21, 10.65it/s]

tensor(1.5793, grad_fn=<NllLossBackward0>)
tensor(1.5419, grad_fn=<NllLossBackward0>)
tensor(1.5685, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4418/17426 [06:48<19:32, 11.09it/s]

tensor(1.5512, grad_fn=<NllLossBackward0>)
tensor(1.5718, grad_fn=<NllLossBackward0>)
tensor(1.5660, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4420/17426 [06:49<19:30, 11.11it/s]

tensor(1.5660, grad_fn=<NllLossBackward0>)
tensor(1.6171, grad_fn=<NllLossBackward0>)
tensor(1.6081, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4424/17426 [06:49<19:20, 11.20it/s]

tensor(1.5075, grad_fn=<NllLossBackward0>)
tensor(1.5743, grad_fn=<NllLossBackward0>)
tensor(1.5035, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4426/17426 [06:49<19:23, 11.18it/s]

tensor(1.5730, grad_fn=<NllLossBackward0>)
tensor(1.5668, grad_fn=<NllLossBackward0>)
tensor(1.5811, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4430/17426 [06:49<19:03, 11.37it/s]

tensor(1.5342, grad_fn=<NllLossBackward0>)
tensor(1.5714, grad_fn=<NllLossBackward0>)
tensor(1.4977, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4432/17426 [06:50<19:19, 11.21it/s]

tensor(1.5646, grad_fn=<NllLossBackward0>)
tensor(1.5231, grad_fn=<NllLossBackward0>)
tensor(1.5732, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4436/17426 [06:50<19:09, 11.30it/s]

tensor(1.5687, grad_fn=<NllLossBackward0>)
tensor(1.5617, grad_fn=<NllLossBackward0>)
tensor(1.5151, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4438/17426 [06:50<19:21, 11.18it/s]

tensor(1.5488, grad_fn=<NllLossBackward0>)
tensor(1.5210, grad_fn=<NllLossBackward0>)
tensor(1.4774, grad_fn=<NllLossBackward0>)


 25%|██▌       | 4442/17426 [06:51<19:02, 11.36it/s]

tensor(1.5512, grad_fn=<NllLossBackward0>)
tensor(1.5203, grad_fn=<NllLossBackward0>)
tensor(1.5454, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4444/17426 [06:51<19:13, 11.26it/s]

tensor(1.5768, grad_fn=<NllLossBackward0>)
tensor(1.5177, grad_fn=<NllLossBackward0>)
tensor(1.5595, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4448/17426 [06:51<18:56, 11.42it/s]

tensor(1.6156, grad_fn=<NllLossBackward0>)
tensor(1.5612, grad_fn=<NllLossBackward0>)
tensor(1.5575, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4450/17426 [06:51<19:32, 11.07it/s]

tensor(1.5260, grad_fn=<NllLossBackward0>)
tensor(1.5752, grad_fn=<NllLossBackward0>)
tensor(1.5362, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4454/17426 [06:52<18:55, 11.43it/s]

tensor(1.5571, grad_fn=<NllLossBackward0>)
tensor(1.4965, grad_fn=<NllLossBackward0>)
tensor(1.5779, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4456/17426 [06:52<19:04, 11.33it/s]

tensor(1.5518, grad_fn=<NllLossBackward0>)
tensor(1.5619, grad_fn=<NllLossBackward0>)
tensor(1.5361, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4460/17426 [06:52<18:56, 11.40it/s]

tensor(1.5516, grad_fn=<NllLossBackward0>)
tensor(1.5265, grad_fn=<NllLossBackward0>)
tensor(1.5671, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4462/17426 [06:52<19:01, 11.35it/s]

tensor(1.5667, grad_fn=<NllLossBackward0>)
tensor(1.5416, grad_fn=<NllLossBackward0>)
tensor(1.5046, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4466/17426 [06:53<18:39, 11.57it/s]

tensor(1.6012, grad_fn=<NllLossBackward0>)
tensor(1.5278, grad_fn=<NllLossBackward0>)
tensor(1.5507, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4468/17426 [06:53<18:51, 11.45it/s]

tensor(1.5214, grad_fn=<NllLossBackward0>)
tensor(1.5394, grad_fn=<NllLossBackward0>)
tensor(1.5246, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4472/17426 [06:53<18:43, 11.53it/s]

tensor(1.5087, grad_fn=<NllLossBackward0>)
tensor(1.5600, grad_fn=<NllLossBackward0>)
tensor(1.5473, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4474/17426 [06:53<19:12, 11.24it/s]

tensor(1.5512, grad_fn=<NllLossBackward0>)
tensor(1.5775, grad_fn=<NllLossBackward0>)
tensor(1.5379, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4478/17426 [06:54<18:47, 11.48it/s]

tensor(1.5419, grad_fn=<NllLossBackward0>)
tensor(1.5158, grad_fn=<NllLossBackward0>)
tensor(1.5563, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4480/17426 [06:54<18:49, 11.46it/s]

tensor(1.5555, grad_fn=<NllLossBackward0>)
tensor(1.6037, grad_fn=<NllLossBackward0>)
tensor(1.5331, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4484/17426 [06:54<18:39, 11.56it/s]

tensor(1.5820, grad_fn=<NllLossBackward0>)
tensor(1.5516, grad_fn=<NllLossBackward0>)
tensor(1.5242, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4486/17426 [06:54<19:17, 11.18it/s]

tensor(1.5120, grad_fn=<NllLossBackward0>)
tensor(1.5719, grad_fn=<NllLossBackward0>)
tensor(1.5329, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4490/17426 [06:55<18:47, 11.47it/s]

tensor(1.5744, grad_fn=<NllLossBackward0>)
tensor(1.5675, grad_fn=<NllLossBackward0>)
tensor(1.5315, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4492/17426 [06:55<18:52, 11.42it/s]

tensor(1.5269, grad_fn=<NllLossBackward0>)
tensor(1.5994, grad_fn=<NllLossBackward0>)
tensor(1.5055, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4496/17426 [06:55<18:41, 11.53it/s]

tensor(1.5531, grad_fn=<NllLossBackward0>)
tensor(1.5396, grad_fn=<NllLossBackward0>)
tensor(1.5547, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4498/17426 [06:55<19:13, 11.21it/s]

tensor(1.5669, grad_fn=<NllLossBackward0>)
tensor(1.5611, grad_fn=<NllLossBackward0>)
tensor(1.5403, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4502/17426 [06:56<18:59, 11.34it/s]

tensor(1.5715, grad_fn=<NllLossBackward0>)
tensor(1.5363, grad_fn=<NllLossBackward0>)
tensor(1.5391, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4504/17426 [06:56<19:09, 11.24it/s]

tensor(1.5654, grad_fn=<NllLossBackward0>)
tensor(1.5721, grad_fn=<NllLossBackward0>)
tensor(1.5141, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4508/17426 [06:56<18:33, 11.61it/s]

tensor(1.5796, grad_fn=<NllLossBackward0>)
tensor(1.5218, grad_fn=<NllLossBackward0>)
tensor(1.5508, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4510/17426 [06:57<19:13, 11.20it/s]

tensor(1.5801, grad_fn=<NllLossBackward0>)
tensor(1.5315, grad_fn=<NllLossBackward0>)
tensor(1.5053, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4514/17426 [06:57<18:42, 11.50it/s]

tensor(1.5773, grad_fn=<NllLossBackward0>)
tensor(1.5304, grad_fn=<NllLossBackward0>)
tensor(1.5699, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4516/17426 [06:57<19:11, 11.21it/s]

tensor(1.6120, grad_fn=<NllLossBackward0>)
tensor(1.5916, grad_fn=<NllLossBackward0>)
tensor(1.5035, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4518/17426 [06:57<20:00, 10.75it/s]

tensor(1.5471, grad_fn=<NllLossBackward0>)
tensor(1.5432, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4521/17426 [06:58<23:54,  9.00it/s]

tensor(1.5591, grad_fn=<NllLossBackward0>)
tensor(1.6048, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4523/17426 [06:58<24:55,  8.63it/s]

tensor(1.5481, grad_fn=<NllLossBackward0>)
tensor(1.5276, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4525/17426 [06:58<26:37,  8.08it/s]

tensor(1.5692, grad_fn=<NllLossBackward0>)
tensor(1.5664, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4527/17426 [06:58<27:51,  7.72it/s]

tensor(1.5959, grad_fn=<NllLossBackward0>)
tensor(1.5417, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4529/17426 [06:59<28:05,  7.65it/s]

tensor(1.5164, grad_fn=<NllLossBackward0>)
tensor(1.5720, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4531/17426 [06:59<27:40,  7.77it/s]

tensor(1.5316, grad_fn=<NllLossBackward0>)
tensor(1.5581, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4533/17426 [06:59<27:29,  7.82it/s]

tensor(1.5877, grad_fn=<NllLossBackward0>)
tensor(1.5899, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4535/17426 [07:00<28:03,  7.66it/s]

tensor(1.5213, grad_fn=<NllLossBackward0>)
tensor(1.5401, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4537/17426 [07:00<27:49,  7.72it/s]

tensor(1.5472, grad_fn=<NllLossBackward0>)
tensor(1.5223, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4539/17426 [07:00<30:18,  7.09it/s]

tensor(1.5504, grad_fn=<NllLossBackward0>)
tensor(1.5008, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4541/17426 [07:00<30:17,  7.09it/s]

tensor(1.5321, grad_fn=<NllLossBackward0>)
tensor(1.5087, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4544/17426 [07:01<24:52,  8.63it/s]

tensor(1.5289, grad_fn=<NllLossBackward0>)
tensor(1.5239, grad_fn=<NllLossBackward0>)
tensor(1.5805, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4546/17426 [07:01<22:12,  9.66it/s]

tensor(1.5976, grad_fn=<NllLossBackward0>)
tensor(1.5463, grad_fn=<NllLossBackward0>)
tensor(1.5928, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4550/17426 [07:01<20:14, 10.60it/s]

tensor(1.5576, grad_fn=<NllLossBackward0>)
tensor(1.5372, grad_fn=<NllLossBackward0>)
tensor(1.5681, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4552/17426 [07:01<19:49, 10.83it/s]

tensor(1.5332, grad_fn=<NllLossBackward0>)
tensor(1.5663, grad_fn=<NllLossBackward0>)
tensor(1.5597, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4556/17426 [07:02<19:34, 10.96it/s]

tensor(1.5576, grad_fn=<NllLossBackward0>)
tensor(1.5393, grad_fn=<NllLossBackward0>)
tensor(1.5722, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4558/17426 [07:02<19:25, 11.04it/s]

tensor(1.5693, grad_fn=<NllLossBackward0>)
tensor(1.5791, grad_fn=<NllLossBackward0>)
tensor(1.5978, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4562/17426 [07:02<19:02, 11.26it/s]

tensor(1.5767, grad_fn=<NllLossBackward0>)
tensor(1.5952, grad_fn=<NllLossBackward0>)
tensor(1.5517, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4564/17426 [07:02<19:10, 11.18it/s]

tensor(1.6031, grad_fn=<NllLossBackward0>)
tensor(1.5670, grad_fn=<NllLossBackward0>)
tensor(1.5701, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4568/17426 [07:03<19:08, 11.19it/s]

tensor(1.6184, grad_fn=<NllLossBackward0>)
tensor(1.5654, grad_fn=<NllLossBackward0>)
tensor(1.5748, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4570/17426 [07:03<19:11, 11.17it/s]

tensor(1.5408, grad_fn=<NllLossBackward0>)
tensor(1.5320, grad_fn=<NllLossBackward0>)
tensor(1.5432, grad_fn=<NllLossBackward0>)


 26%|██▌       | 4574/17426 [07:03<18:44, 11.42it/s]

tensor(1.5631, grad_fn=<NllLossBackward0>)
tensor(1.5465, grad_fn=<NllLossBackward0>)
tensor(1.5803, grad_fn=<NllLossBackward0>)


 26%|██▋       | 4576/17426 [07:04<18:50, 11.37it/s]

tensor(1.5585, grad_fn=<NllLossBackward0>)
tensor(1.5252, grad_fn=<NllLossBackward0>)
tensor(1.5644, grad_fn=<NllLossBackward0>)


 26%|██▋       | 4580/17426 [07:04<18:59, 11.28it/s]

tensor(1.5460, grad_fn=<NllLossBackward0>)
tensor(1.5689, grad_fn=<NllLossBackward0>)
tensor(1.5617, grad_fn=<NllLossBackward0>)


 26%|██▋       | 4582/17426 [07:04<19:01, 11.25it/s]

tensor(1.5803, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)
tensor(1.5164, grad_fn=<NllLossBackward0>)


 26%|██▋       | 4586/17426 [07:04<18:45, 11.40it/s]

tensor(1.5810, grad_fn=<NllLossBackward0>)
tensor(1.5795, grad_fn=<NllLossBackward0>)
tensor(1.5396, grad_fn=<NllLossBackward0>)


 26%|██▋       | 4588/17426 [07:05<18:53, 11.33it/s]

tensor(1.5477, grad_fn=<NllLossBackward0>)
tensor(1.5598, grad_fn=<NllLossBackward0>)
tensor(1.5292, grad_fn=<NllLossBackward0>)


 26%|██▋       | 4592/17426 [07:05<19:02, 11.23it/s]

tensor(1.5527, grad_fn=<NllLossBackward0>)
tensor(1.5803, grad_fn=<NllLossBackward0>)
tensor(1.5273, grad_fn=<NllLossBackward0>)


 26%|██▋       | 4594/17426 [07:05<19:04, 11.21it/s]

tensor(1.5145, grad_fn=<NllLossBackward0>)
tensor(1.5689, grad_fn=<NllLossBackward0>)
tensor(1.5260, grad_fn=<NllLossBackward0>)


 26%|██▋       | 4598/17426 [07:05<18:26, 11.60it/s]

tensor(1.5909, grad_fn=<NllLossBackward0>)
tensor(1.5437, grad_fn=<NllLossBackward0>)
tensor(1.5446, grad_fn=<NllLossBackward0>)


 26%|██▋       | 4600/17426 [07:06<18:35, 11.50it/s]

tensor(1.5643, grad_fn=<NllLossBackward0>)
tensor(1.5385, grad_fn=<NllLossBackward0>)
tensor(1.4981, grad_fn=<NllLossBackward0>)


 26%|██▋       | 4604/17426 [07:06<18:52, 11.32it/s]

tensor(1.5602, grad_fn=<NllLossBackward0>)
tensor(1.5548, grad_fn=<NllLossBackward0>)
tensor(1.5537, grad_fn=<NllLossBackward0>)


 26%|██▋       | 4606/17426 [07:06<19:05, 11.19it/s]

tensor(1.5598, grad_fn=<NllLossBackward0>)
tensor(1.5492, grad_fn=<NllLossBackward0>)
tensor(1.5405, grad_fn=<NllLossBackward0>)


 26%|██▋       | 4610/17426 [07:07<18:48, 11.35it/s]

tensor(1.5540, grad_fn=<NllLossBackward0>)
tensor(1.5257, grad_fn=<NllLossBackward0>)
tensor(1.5727, grad_fn=<NllLossBackward0>)


 26%|██▋       | 4612/17426 [07:07<18:58, 11.25it/s]

tensor(1.6059, grad_fn=<NllLossBackward0>)
tensor(1.5446, grad_fn=<NllLossBackward0>)
tensor(1.5517, grad_fn=<NllLossBackward0>)


 26%|██▋       | 4616/17426 [07:07<18:44, 11.40it/s]

tensor(1.5941, grad_fn=<NllLossBackward0>)
tensor(1.5583, grad_fn=<NllLossBackward0>)
tensor(1.5380, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4618/17426 [07:07<18:52, 11.31it/s]

tensor(1.5310, grad_fn=<NllLossBackward0>)
tensor(1.5515, grad_fn=<NllLossBackward0>)
tensor(1.4943, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4622/17426 [07:08<18:37, 11.45it/s]

tensor(1.5432, grad_fn=<NllLossBackward0>)
tensor(1.5443, grad_fn=<NllLossBackward0>)
tensor(1.5366, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4624/17426 [07:08<18:45, 11.38it/s]

tensor(1.5393, grad_fn=<NllLossBackward0>)
tensor(1.5296, grad_fn=<NllLossBackward0>)
tensor(1.5300, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4628/17426 [07:08<19:01, 11.21it/s]

tensor(1.5384, grad_fn=<NllLossBackward0>)
tensor(1.5509, grad_fn=<NllLossBackward0>)
tensor(1.5731, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4630/17426 [07:08<18:58, 11.24it/s]

tensor(1.5908, grad_fn=<NllLossBackward0>)
tensor(1.5286, grad_fn=<NllLossBackward0>)
tensor(1.5446, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4634/17426 [07:09<18:35, 11.46it/s]

tensor(1.5637, grad_fn=<NllLossBackward0>)
tensor(1.5567, grad_fn=<NllLossBackward0>)
tensor(1.5461, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4636/17426 [07:09<19:13, 11.09it/s]

tensor(1.5690, grad_fn=<NllLossBackward0>)
tensor(1.5822, grad_fn=<NllLossBackward0>)
tensor(1.5292, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4640/17426 [07:09<19:05, 11.16it/s]

tensor(1.5502, grad_fn=<NllLossBackward0>)
tensor(1.5503, grad_fn=<NllLossBackward0>)
tensor(1.5610, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4642/17426 [07:09<19:09, 11.12it/s]

tensor(1.5655, grad_fn=<NllLossBackward0>)
tensor(1.5147, grad_fn=<NllLossBackward0>)
tensor(1.5299, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4646/17426 [07:10<18:44, 11.36it/s]

tensor(1.5322, grad_fn=<NllLossBackward0>)
tensor(1.4978, grad_fn=<NllLossBackward0>)
tensor(1.5419, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4648/17426 [07:10<18:43, 11.37it/s]

tensor(1.5334, grad_fn=<NllLossBackward0>)
tensor(1.5562, grad_fn=<NllLossBackward0>)
tensor(1.5560, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4652/17426 [07:10<18:55, 11.25it/s]

tensor(1.5639, grad_fn=<NllLossBackward0>)
tensor(1.5477, grad_fn=<NllLossBackward0>)
tensor(1.5613, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4654/17426 [07:10<19:04, 11.16it/s]

tensor(1.5437, grad_fn=<NllLossBackward0>)
tensor(1.5477, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4656/17426 [07:11<22:11,  9.59it/s]

tensor(1.5668, grad_fn=<NllLossBackward0>)
tensor(1.5240, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4658/17426 [07:11<25:27,  8.36it/s]

tensor(1.5489, grad_fn=<NllLossBackward0>)
tensor(1.5690, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4660/17426 [07:11<27:14,  7.81it/s]

tensor(1.5579, grad_fn=<NllLossBackward0>)
tensor(1.5725, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4662/17426 [07:12<27:13,  7.81it/s]

tensor(1.5477, grad_fn=<NllLossBackward0>)
tensor(1.5427, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4664/17426 [07:12<27:17,  7.79it/s]

tensor(1.5381, grad_fn=<NllLossBackward0>)
tensor(1.5717, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4666/17426 [07:12<27:17,  7.79it/s]

tensor(1.5191, grad_fn=<NllLossBackward0>)
tensor(1.5440, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4668/17426 [07:12<27:53,  7.62it/s]

tensor(1.5445, grad_fn=<NllLossBackward0>)
tensor(1.4843, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4670/17426 [07:13<26:49,  7.93it/s]

tensor(1.5211, grad_fn=<NllLossBackward0>)
tensor(1.5392, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4672/17426 [07:13<27:15,  7.80it/s]

tensor(1.5744, grad_fn=<NllLossBackward0>)
tensor(1.5495, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4674/17426 [07:13<27:26,  7.74it/s]

tensor(1.5383, grad_fn=<NllLossBackward0>)
tensor(1.5316, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4676/17426 [07:13<28:53,  7.36it/s]

tensor(1.5664, grad_fn=<NllLossBackward0>)
tensor(1.5949, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4678/17426 [07:14<29:06,  7.30it/s]

tensor(1.4851, grad_fn=<NllLossBackward0>)
tensor(1.5035, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4681/17426 [07:14<23:37,  8.99it/s]

tensor(1.5400, grad_fn=<NllLossBackward0>)
tensor(1.5831, grad_fn=<NllLossBackward0>)
tensor(1.5267, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4684/17426 [07:14<21:25,  9.91it/s]

tensor(1.5623, grad_fn=<NllLossBackward0>)
tensor(1.5395, grad_fn=<NllLossBackward0>)
tensor(1.5532, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4687/17426 [07:15<20:02, 10.59it/s]

tensor(1.5591, grad_fn=<NllLossBackward0>)
tensor(1.4976, grad_fn=<NllLossBackward0>)
tensor(1.5206, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4689/17426 [07:15<19:32, 10.86it/s]

tensor(1.5250, grad_fn=<NllLossBackward0>)
tensor(1.5877, grad_fn=<NllLossBackward0>)
tensor(1.5634, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4693/17426 [07:15<19:01, 11.15it/s]

tensor(1.5562, grad_fn=<NllLossBackward0>)
tensor(1.5813, grad_fn=<NllLossBackward0>)
tensor(1.5109, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4695/17426 [07:15<19:01, 11.15it/s]

tensor(1.5423, grad_fn=<NllLossBackward0>)
tensor(1.5097, grad_fn=<NllLossBackward0>)
tensor(1.5487, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4699/17426 [07:16<18:39, 11.36it/s]

tensor(1.5601, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)
tensor(1.5372, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4701/17426 [07:16<18:36, 11.39it/s]

tensor(1.5026, grad_fn=<NllLossBackward0>)
tensor(1.5642, grad_fn=<NllLossBackward0>)
tensor(1.4794, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4705/17426 [07:16<18:32, 11.44it/s]

tensor(1.5795, grad_fn=<NllLossBackward0>)
tensor(1.5099, grad_fn=<NllLossBackward0>)
tensor(1.5574, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4707/17426 [07:16<18:36, 11.39it/s]

tensor(1.4926, grad_fn=<NllLossBackward0>)
tensor(1.5416, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4711/17426 [07:17<18:52, 11.23it/s]

tensor(1.5241, grad_fn=<NllLossBackward0>)
tensor(1.5783, grad_fn=<NllLossBackward0>)
tensor(1.5359, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4713/17426 [07:17<18:47, 11.28it/s]

tensor(1.5724, grad_fn=<NllLossBackward0>)
tensor(1.5646, grad_fn=<NllLossBackward0>)
tensor(1.5540, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4717/17426 [07:17<18:14, 11.62it/s]

tensor(1.5607, grad_fn=<NllLossBackward0>)
tensor(1.5736, grad_fn=<NllLossBackward0>)
tensor(1.5136, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4719/17426 [07:17<18:48, 11.26it/s]

tensor(1.5572, grad_fn=<NllLossBackward0>)
tensor(1.5161, grad_fn=<NllLossBackward0>)
tensor(1.5056, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4723/17426 [07:18<18:31, 11.43it/s]

tensor(1.5198, grad_fn=<NllLossBackward0>)
tensor(1.5581, grad_fn=<NllLossBackward0>)
tensor(1.5476, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4725/17426 [07:18<18:39, 11.35it/s]

tensor(1.5445, grad_fn=<NllLossBackward0>)
tensor(1.5402, grad_fn=<NllLossBackward0>)
tensor(1.5434, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4729/17426 [07:18<18:27, 11.46it/s]

tensor(1.5375, grad_fn=<NllLossBackward0>)
tensor(1.5409, grad_fn=<NllLossBackward0>)
tensor(1.5605, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4731/17426 [07:18<18:57, 11.16it/s]

tensor(1.5413, grad_fn=<NllLossBackward0>)
tensor(1.5757, grad_fn=<NllLossBackward0>)
tensor(1.5403, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4735/17426 [07:19<18:35, 11.38it/s]

tensor(1.5768, grad_fn=<NllLossBackward0>)
tensor(1.5562, grad_fn=<NllLossBackward0>)
tensor(1.4863, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4737/17426 [07:19<18:47, 11.25it/s]

tensor(1.5442, grad_fn=<NllLossBackward0>)
tensor(1.5963, grad_fn=<NllLossBackward0>)
tensor(1.4895, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4741/17426 [07:19<18:38, 11.35it/s]

tensor(1.5426, grad_fn=<NllLossBackward0>)
tensor(1.5373, grad_fn=<NllLossBackward0>)
tensor(1.5159, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4743/17426 [07:20<19:01, 11.11it/s]

tensor(1.4864, grad_fn=<NllLossBackward0>)
tensor(1.5611, grad_fn=<NllLossBackward0>)
tensor(1.5431, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4747/17426 [07:20<18:26, 11.46it/s]

tensor(1.5526, grad_fn=<NllLossBackward0>)
tensor(1.5583, grad_fn=<NllLossBackward0>)
tensor(1.5219, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4749/17426 [07:20<18:36, 11.36it/s]

tensor(1.5872, grad_fn=<NllLossBackward0>)
tensor(1.5240, grad_fn=<NllLossBackward0>)
tensor(1.5379, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4753/17426 [07:20<18:24, 11.48it/s]

tensor(1.5205, grad_fn=<NllLossBackward0>)
tensor(1.5674, grad_fn=<NllLossBackward0>)
tensor(1.5619, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4755/17426 [07:21<18:46, 11.25it/s]

tensor(1.5232, grad_fn=<NllLossBackward0>)
tensor(1.4970, grad_fn=<NllLossBackward0>)
tensor(1.5567, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4759/17426 [07:21<18:38, 11.33it/s]

tensor(1.5249, grad_fn=<NllLossBackward0>)
tensor(1.5525, grad_fn=<NllLossBackward0>)
tensor(1.5393, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4761/17426 [07:21<18:35, 11.35it/s]

tensor(1.5496, grad_fn=<NllLossBackward0>)
tensor(1.5824, grad_fn=<NllLossBackward0>)
tensor(1.5090, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4765/17426 [07:21<18:15, 11.55it/s]

tensor(1.5584, grad_fn=<NllLossBackward0>)
tensor(1.5253, grad_fn=<NllLossBackward0>)
tensor(1.5691, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4767/17426 [07:22<18:57, 11.12it/s]

tensor(1.5072, grad_fn=<NllLossBackward0>)
tensor(1.5281, grad_fn=<NllLossBackward0>)
tensor(1.5369, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4771/17426 [07:22<18:32, 11.38it/s]

tensor(1.5293, grad_fn=<NllLossBackward0>)
tensor(1.5293, grad_fn=<NllLossBackward0>)
tensor(1.5249, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4773/17426 [07:22<18:40, 11.29it/s]

tensor(1.5153, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)
tensor(1.5875, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4777/17426 [07:23<18:21, 11.49it/s]

tensor(1.5329, grad_fn=<NllLossBackward0>)
tensor(1.5784, grad_fn=<NllLossBackward0>)
tensor(1.5613, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4779/17426 [07:23<18:52, 11.17it/s]

tensor(1.4879, grad_fn=<NllLossBackward0>)
tensor(1.5149, grad_fn=<NllLossBackward0>)
tensor(1.5223, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4783/17426 [07:23<18:16, 11.53it/s]

tensor(1.5310, grad_fn=<NllLossBackward0>)
tensor(1.5033, grad_fn=<NllLossBackward0>)
tensor(1.5417, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4785/17426 [07:23<18:37, 11.31it/s]

tensor(1.5674, grad_fn=<NllLossBackward0>)
tensor(1.5847, grad_fn=<NllLossBackward0>)
tensor(1.5205, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4789/17426 [07:24<18:15, 11.54it/s]

tensor(1.5152, grad_fn=<NllLossBackward0>)
tensor(1.5474, grad_fn=<NllLossBackward0>)
tensor(1.5293, grad_fn=<NllLossBackward0>)


 27%|██▋       | 4791/17426 [07:24<18:51, 11.17it/s]

tensor(1.5720, grad_fn=<NllLossBackward0>)
tensor(1.5488, grad_fn=<NllLossBackward0>)
tensor(1.6052, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4793/17426 [07:24<20:24, 10.32it/s]

tensor(1.5178, grad_fn=<NllLossBackward0>)
tensor(1.5676, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4796/17426 [07:24<23:13,  9.07it/s]

tensor(1.5527, grad_fn=<NllLossBackward0>)
tensor(1.5799, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4798/17426 [07:25<24:45,  8.50it/s]

tensor(1.4987, grad_fn=<NllLossBackward0>)
tensor(1.5114, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4800/17426 [07:25<25:51,  8.14it/s]

tensor(1.5297, grad_fn=<NllLossBackward0>)
tensor(1.5299, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4802/17426 [07:25<27:43,  7.59it/s]

tensor(1.5430, grad_fn=<NllLossBackward0>)
tensor(1.5618, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4804/17426 [07:25<26:55,  7.81it/s]

tensor(1.5222, grad_fn=<NllLossBackward0>)
tensor(1.5441, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4806/17426 [07:26<26:06,  8.05it/s]

tensor(1.5129, grad_fn=<NllLossBackward0>)
tensor(1.5337, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4808/17426 [07:26<26:12,  8.03it/s]

tensor(1.5358, grad_fn=<NllLossBackward0>)
tensor(1.4793, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4810/17426 [07:26<28:39,  7.34it/s]

tensor(1.5616, grad_fn=<NllLossBackward0>)
tensor(1.5123, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4812/17426 [07:26<27:31,  7.64it/s]

tensor(1.5625, grad_fn=<NllLossBackward0>)
tensor(1.5695, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4814/17426 [07:27<29:25,  7.14it/s]

tensor(1.5781, grad_fn=<NllLossBackward0>)
tensor(1.5261, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4816/17426 [07:27<30:13,  6.95it/s]

tensor(1.5678, grad_fn=<NllLossBackward0>)
tensor(1.5848, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4819/17426 [07:27<22:59,  9.14it/s]

tensor(1.5374, grad_fn=<NllLossBackward0>)
tensor(1.5620, grad_fn=<NllLossBackward0>)
tensor(1.5421, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4822/17426 [07:28<20:23, 10.30it/s]

tensor(1.5385, grad_fn=<NllLossBackward0>)
tensor(1.5332, grad_fn=<NllLossBackward0>)
tensor(1.5450, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4824/17426 [07:28<19:45, 10.63it/s]

tensor(1.5371, grad_fn=<NllLossBackward0>)
tensor(1.5411, grad_fn=<NllLossBackward0>)
tensor(1.4966, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4828/17426 [07:28<18:52, 11.13it/s]

tensor(1.5989, grad_fn=<NllLossBackward0>)
tensor(1.5558, grad_fn=<NllLossBackward0>)
tensor(1.5401, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4830/17426 [07:28<18:48, 11.16it/s]

tensor(1.5879, grad_fn=<NllLossBackward0>)
tensor(1.5286, grad_fn=<NllLossBackward0>)
tensor(1.5609, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4834/17426 [07:29<18:30, 11.34it/s]

tensor(1.5351, grad_fn=<NllLossBackward0>)
tensor(1.5687, grad_fn=<NllLossBackward0>)
tensor(1.5480, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4836/17426 [07:29<18:35, 11.28it/s]

tensor(1.5678, grad_fn=<NllLossBackward0>)
tensor(1.5197, grad_fn=<NllLossBackward0>)
tensor(1.5669, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4840/17426 [07:29<18:45, 11.19it/s]

tensor(1.5413, grad_fn=<NllLossBackward0>)
tensor(1.4999, grad_fn=<NllLossBackward0>)
tensor(1.5553, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4842/17426 [07:29<18:54, 11.09it/s]

tensor(1.5546, grad_fn=<NllLossBackward0>)
tensor(1.5545, grad_fn=<NllLossBackward0>)
tensor(1.5544, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4846/17426 [07:30<18:12, 11.51it/s]

tensor(1.5400, grad_fn=<NllLossBackward0>)
tensor(1.5612, grad_fn=<NllLossBackward0>)
tensor(1.5104, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4848/17426 [07:30<18:40, 11.22it/s]

tensor(1.5547, grad_fn=<NllLossBackward0>)
tensor(1.5617, grad_fn=<NllLossBackward0>)
tensor(1.5411, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4852/17426 [07:30<18:10, 11.53it/s]

tensor(1.4884, grad_fn=<NllLossBackward0>)
tensor(1.5508, grad_fn=<NllLossBackward0>)
tensor(1.5473, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4854/17426 [07:30<18:22, 11.40it/s]

tensor(1.5572, grad_fn=<NllLossBackward0>)
tensor(1.5711, grad_fn=<NllLossBackward0>)
tensor(1.5237, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4858/17426 [07:31<18:10, 11.53it/s]

tensor(1.5309, grad_fn=<NllLossBackward0>)
tensor(1.5326, grad_fn=<NllLossBackward0>)
tensor(1.5551, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4860/17426 [07:31<18:34, 11.27it/s]

tensor(1.5386, grad_fn=<NllLossBackward0>)
tensor(1.5379, grad_fn=<NllLossBackward0>)
tensor(1.5633, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4864/17426 [07:31<18:22, 11.39it/s]

tensor(1.5328, grad_fn=<NllLossBackward0>)
tensor(1.5377, grad_fn=<NllLossBackward0>)
tensor(1.5339, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4866/17426 [07:32<18:35, 11.26it/s]

tensor(1.5307, grad_fn=<NllLossBackward0>)
tensor(1.5576, grad_fn=<NllLossBackward0>)
tensor(1.4948, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4870/17426 [07:32<18:20, 11.41it/s]

tensor(1.5088, grad_fn=<NllLossBackward0>)
tensor(1.5407, grad_fn=<NllLossBackward0>)
tensor(1.5347, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4872/17426 [07:32<18:55, 11.06it/s]

tensor(1.5663, grad_fn=<NllLossBackward0>)
tensor(1.5497, grad_fn=<NllLossBackward0>)
tensor(1.5401, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4876/17426 [07:32<18:16, 11.45it/s]

tensor(1.5084, grad_fn=<NllLossBackward0>)
tensor(1.5625, grad_fn=<NllLossBackward0>)
tensor(1.5689, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4878/17426 [07:33<18:21, 11.39it/s]

tensor(1.5070, grad_fn=<NllLossBackward0>)
tensor(1.5294, grad_fn=<NllLossBackward0>)
tensor(1.5629, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4882/17426 [07:33<17:58, 11.63it/s]

tensor(1.5893, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)
tensor(1.5429, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4884/17426 [07:33<18:37, 11.22it/s]

tensor(1.5456, grad_fn=<NllLossBackward0>)
tensor(1.5122, grad_fn=<NllLossBackward0>)
tensor(1.5262, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4888/17426 [07:33<18:28, 11.31it/s]

tensor(1.5441, grad_fn=<NllLossBackward0>)
tensor(1.5275, grad_fn=<NllLossBackward0>)
tensor(1.5627, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4890/17426 [07:34<18:24, 11.35it/s]

tensor(1.5527, grad_fn=<NllLossBackward0>)
tensor(1.5690, grad_fn=<NllLossBackward0>)
tensor(1.5385, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4894/17426 [07:34<18:07, 11.53it/s]

tensor(1.5511, grad_fn=<NllLossBackward0>)
tensor(1.5738, grad_fn=<NllLossBackward0>)
tensor(1.5230, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4896/17426 [07:34<18:45, 11.13it/s]

tensor(1.5466, grad_fn=<NllLossBackward0>)
tensor(1.5456, grad_fn=<NllLossBackward0>)
tensor(1.5819, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4900/17426 [07:35<18:25, 11.33it/s]

tensor(1.5213, grad_fn=<NllLossBackward0>)
tensor(1.5239, grad_fn=<NllLossBackward0>)
tensor(1.5910, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4902/17426 [07:35<18:26, 11.31it/s]

tensor(1.5511, grad_fn=<NllLossBackward0>)
tensor(1.5318, grad_fn=<NllLossBackward0>)
tensor(1.5274, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4906/17426 [07:35<18:04, 11.55it/s]

tensor(1.5593, grad_fn=<NllLossBackward0>)
tensor(1.5721, grad_fn=<NllLossBackward0>)
tensor(1.5145, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4908/17426 [07:35<18:45, 11.12it/s]

tensor(1.5043, grad_fn=<NllLossBackward0>)
tensor(1.5313, grad_fn=<NllLossBackward0>)
tensor(1.5129, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4912/17426 [07:36<18:19, 11.38it/s]

tensor(1.5676, grad_fn=<NllLossBackward0>)
tensor(1.5545, grad_fn=<NllLossBackward0>)
tensor(1.5644, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4914/17426 [07:36<18:35, 11.22it/s]

tensor(1.5445, grad_fn=<NllLossBackward0>)
tensor(1.5272, grad_fn=<NllLossBackward0>)
tensor(1.5802, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4918/17426 [07:36<18:39, 11.17it/s]

tensor(1.5489, grad_fn=<NllLossBackward0>)
tensor(1.5540, grad_fn=<NllLossBackward0>)
tensor(1.5442, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4920/17426 [07:36<18:43, 11.13it/s]

tensor(1.5245, grad_fn=<NllLossBackward0>)
tensor(1.5301, grad_fn=<NllLossBackward0>)
tensor(1.5428, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4924/17426 [07:37<18:10, 11.46it/s]

tensor(1.5155, grad_fn=<NllLossBackward0>)
tensor(1.5317, grad_fn=<NllLossBackward0>)
tensor(1.5344, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4926/17426 [07:37<18:21, 11.35it/s]

tensor(1.5269, grad_fn=<NllLossBackward0>)
tensor(1.4936, grad_fn=<NllLossBackward0>)
tensor(1.5480, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4928/17426 [07:37<18:22, 11.34it/s]

tensor(1.5869, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4930/17426 [07:37<21:18,  9.77it/s]

tensor(1.4973, grad_fn=<NllLossBackward0>)
tensor(1.5235, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4933/17426 [07:38<23:43,  8.77it/s]

tensor(1.5513, grad_fn=<NllLossBackward0>)
tensor(1.5472, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4935/17426 [07:38<26:06,  7.97it/s]

tensor(1.5433, grad_fn=<NllLossBackward0>)
tensor(1.5215, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4937/17426 [07:38<25:34,  8.14it/s]

tensor(1.5067, grad_fn=<NllLossBackward0>)
tensor(1.4994, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4939/17426 [07:38<26:25,  7.88it/s]

tensor(1.5380, grad_fn=<NllLossBackward0>)
tensor(1.5143, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4941/17426 [07:39<26:47,  7.77it/s]

tensor(1.5634, grad_fn=<NllLossBackward0>)
tensor(1.5722, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4943/17426 [07:39<26:05,  7.97it/s]

tensor(1.5470, grad_fn=<NllLossBackward0>)
tensor(1.5993, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4945/17426 [07:39<26:42,  7.79it/s]

tensor(1.5140, grad_fn=<NllLossBackward0>)
tensor(1.5466, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4947/17426 [07:40<28:15,  7.36it/s]

tensor(1.5285, grad_fn=<NllLossBackward0>)
tensor(1.5504, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4949/17426 [07:40<27:07,  7.67it/s]

tensor(1.5603, grad_fn=<NllLossBackward0>)
tensor(1.5433, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4951/17426 [07:40<27:57,  7.44it/s]

tensor(1.5750, grad_fn=<NllLossBackward0>)
tensor(1.5104, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4953/17426 [07:40<28:33,  7.28it/s]

tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.5260, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4955/17426 [07:41<27:09,  7.65it/s]

tensor(1.5298, grad_fn=<NllLossBackward0>)
tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.5696, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4959/17426 [07:41<21:02,  9.87it/s]

tensor(1.5284, grad_fn=<NllLossBackward0>)
tensor(1.5481, grad_fn=<NllLossBackward0>)
tensor(1.5636, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4962/17426 [07:41<19:23, 10.72it/s]

tensor(1.5436, grad_fn=<NllLossBackward0>)
tensor(1.5546, grad_fn=<NllLossBackward0>)
tensor(1.5647, grad_fn=<NllLossBackward0>)


 28%|██▊       | 4964/17426 [07:41<19:06, 10.87it/s]

tensor(1.5533, grad_fn=<NllLossBackward0>)
tensor(1.4970, grad_fn=<NllLossBackward0>)
tensor(1.5588, grad_fn=<NllLossBackward0>)


 29%|██▊       | 4968/17426 [07:42<18:43, 11.09it/s]

tensor(1.5349, grad_fn=<NllLossBackward0>)
tensor(1.5231, grad_fn=<NllLossBackward0>)
tensor(1.5381, grad_fn=<NllLossBackward0>)


 29%|██▊       | 4970/17426 [07:42<18:32, 11.19it/s]

tensor(1.5330, grad_fn=<NllLossBackward0>)
tensor(1.5291, grad_fn=<NllLossBackward0>)
tensor(1.5019, grad_fn=<NllLossBackward0>)


 29%|██▊       | 4974/17426 [07:42<17:53, 11.60it/s]

tensor(1.5007, grad_fn=<NllLossBackward0>)
tensor(1.5486, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)


 29%|██▊       | 4976/17426 [07:42<18:03, 11.49it/s]

tensor(1.6043, grad_fn=<NllLossBackward0>)
tensor(1.5728, grad_fn=<NllLossBackward0>)
tensor(1.5469, grad_fn=<NllLossBackward0>)


 29%|██▊       | 4980/17426 [07:43<18:16, 11.35it/s]

tensor(1.5322, grad_fn=<NllLossBackward0>)
tensor(1.5394, grad_fn=<NllLossBackward0>)
tensor(1.5791, grad_fn=<NllLossBackward0>)


 29%|██▊       | 4982/17426 [07:43<18:15, 11.36it/s]

tensor(1.5421, grad_fn=<NllLossBackward0>)
tensor(1.5795, grad_fn=<NllLossBackward0>)
tensor(1.5320, grad_fn=<NllLossBackward0>)


 29%|██▊       | 4986/17426 [07:43<17:54, 11.58it/s]

tensor(1.5199, grad_fn=<NllLossBackward0>)
tensor(1.5352, grad_fn=<NllLossBackward0>)
tensor(1.5376, grad_fn=<NllLossBackward0>)


 29%|██▊       | 4988/17426 [07:43<17:56, 11.56it/s]

tensor(1.5284, grad_fn=<NllLossBackward0>)
tensor(1.5099, grad_fn=<NllLossBackward0>)
tensor(1.5291, grad_fn=<NllLossBackward0>)


 29%|██▊       | 4992/17426 [07:44<18:02, 11.48it/s]

tensor(1.5471, grad_fn=<NllLossBackward0>)
tensor(1.5364, grad_fn=<NllLossBackward0>)
tensor(1.5252, grad_fn=<NllLossBackward0>)


 29%|██▊       | 4994/17426 [07:44<18:06, 11.44it/s]

tensor(1.5270, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)
tensor(1.5746, grad_fn=<NllLossBackward0>)


 29%|██▊       | 4998/17426 [07:44<18:04, 11.46it/s]

tensor(1.5500, grad_fn=<NllLossBackward0>)
tensor(1.5523, grad_fn=<NllLossBackward0>)
tensor(1.5191, grad_fn=<NllLossBackward0>)


 29%|██▊       | 5000/17426 [07:45<18:08, 11.42it/s]

tensor(1.5610, grad_fn=<NllLossBackward0>)
tensor(1.5178, grad_fn=<NllLossBackward0>)
tensor(1.5070, grad_fn=<NllLossBackward0>)


 29%|██▊       | 5004/17426 [07:45<18:15, 11.34it/s]

tensor(1.5404, grad_fn=<NllLossBackward0>)
tensor(1.5142, grad_fn=<NllLossBackward0>)
tensor(1.5632, grad_fn=<NllLossBackward0>)


 29%|██▊       | 5006/17426 [07:45<18:26, 11.23it/s]

tensor(1.5700, grad_fn=<NllLossBackward0>)
tensor(1.5096, grad_fn=<NllLossBackward0>)
tensor(1.5177, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5010/17426 [07:45<18:09, 11.40it/s]

tensor(1.5259, grad_fn=<NllLossBackward0>)
tensor(1.5158, grad_fn=<NllLossBackward0>)
tensor(1.5689, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5012/17426 [07:46<18:53, 10.95it/s]

tensor(1.5166, grad_fn=<NllLossBackward0>)
tensor(1.5270, grad_fn=<NllLossBackward0>)
tensor(1.5322, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5016/17426 [07:46<18:16, 11.31it/s]

tensor(1.5244, grad_fn=<NllLossBackward0>)
tensor(1.5304, grad_fn=<NllLossBackward0>)
tensor(1.5898, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5018/17426 [07:46<18:15, 11.33it/s]

tensor(1.5101, grad_fn=<NllLossBackward0>)
tensor(1.5561, grad_fn=<NllLossBackward0>)
tensor(1.5731, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5022/17426 [07:47<18:02, 11.45it/s]

tensor(1.5780, grad_fn=<NllLossBackward0>)
tensor(1.5203, grad_fn=<NllLossBackward0>)
tensor(1.5092, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5024/17426 [07:47<18:44, 11.03it/s]

tensor(1.5763, grad_fn=<NllLossBackward0>)
tensor(1.5476, grad_fn=<NllLossBackward0>)
tensor(1.5755, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5028/17426 [07:47<18:10, 11.37it/s]

tensor(1.5360, grad_fn=<NllLossBackward0>)
tensor(1.5415, grad_fn=<NllLossBackward0>)
tensor(1.5399, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5030/17426 [07:47<18:22, 11.24it/s]

tensor(1.5351, grad_fn=<NllLossBackward0>)
tensor(1.5328, grad_fn=<NllLossBackward0>)
tensor(1.5410, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5034/17426 [07:48<17:57, 11.50it/s]

tensor(1.6133, grad_fn=<NllLossBackward0>)
tensor(1.5484, grad_fn=<NllLossBackward0>)
tensor(1.5737, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5036/17426 [07:48<18:34, 11.12it/s]

tensor(1.5304, grad_fn=<NllLossBackward0>)
tensor(1.5158, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5040/17426 [07:48<18:20, 11.26it/s]

tensor(1.5473, grad_fn=<NllLossBackward0>)
tensor(1.5476, grad_fn=<NllLossBackward0>)
tensor(1.5234, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5042/17426 [07:48<18:35, 11.10it/s]

tensor(1.5524, grad_fn=<NllLossBackward0>)
tensor(1.5169, grad_fn=<NllLossBackward0>)
tensor(1.5112, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5046/17426 [07:49<18:19, 11.26it/s]

tensor(1.5505, grad_fn=<NllLossBackward0>)
tensor(1.5100, grad_fn=<NllLossBackward0>)
tensor(1.5407, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5048/17426 [07:49<18:47, 10.98it/s]

tensor(1.5826, grad_fn=<NllLossBackward0>)
tensor(1.5047, grad_fn=<NllLossBackward0>)
tensor(1.5467, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5052/17426 [07:49<18:14, 11.31it/s]

tensor(1.5484, grad_fn=<NllLossBackward0>)
tensor(1.5267, grad_fn=<NllLossBackward0>)
tensor(1.5407, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5054/17426 [07:49<18:36, 11.09it/s]

tensor(1.6371, grad_fn=<NllLossBackward0>)
tensor(1.5747, grad_fn=<NllLossBackward0>)
tensor(1.5515, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5058/17426 [07:50<18:38, 11.06it/s]

tensor(1.5476, grad_fn=<NllLossBackward0>)
tensor(1.5642, grad_fn=<NllLossBackward0>)
tensor(1.5533, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5060/17426 [07:50<18:33, 11.10it/s]

tensor(1.5125, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)
tensor(1.5581, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5064/17426 [07:50<18:16, 11.27it/s]

tensor(1.5343, grad_fn=<NllLossBackward0>)
tensor(1.5341, grad_fn=<NllLossBackward0>)
tensor(1.5526, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5066/17426 [07:50<18:16, 11.28it/s]

tensor(1.5567, grad_fn=<NllLossBackward0>)
tensor(1.5127, grad_fn=<NllLossBackward0>)
tensor(1.5418, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5068/17426 [07:51<19:36, 10.50it/s]

tensor(1.5209, grad_fn=<NllLossBackward0>)
tensor(1.5498, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5071/17426 [07:51<22:16,  9.25it/s]

tensor(1.5542, grad_fn=<NllLossBackward0>)
tensor(1.5429, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5073/17426 [07:51<23:51,  8.63it/s]

tensor(1.4682, grad_fn=<NllLossBackward0>)
tensor(1.5771, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5075/17426 [07:52<24:09,  8.52it/s]

tensor(1.5494, grad_fn=<NllLossBackward0>)
tensor(1.5857, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5077/17426 [07:52<25:42,  8.01it/s]

tensor(1.5666, grad_fn=<NllLossBackward0>)
tensor(1.4825, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5079/17426 [07:52<26:40,  7.71it/s]

tensor(1.5303, grad_fn=<NllLossBackward0>)
tensor(1.5208, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5081/17426 [07:52<27:09,  7.58it/s]

tensor(1.5875, grad_fn=<NllLossBackward0>)
tensor(1.5296, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5083/17426 [07:53<26:26,  7.78it/s]

tensor(1.5440, grad_fn=<NllLossBackward0>)
tensor(1.5885, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5085/17426 [07:53<26:53,  7.65it/s]

tensor(1.5456, grad_fn=<NllLossBackward0>)
tensor(1.5248, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5087/17426 [07:53<28:18,  7.27it/s]

tensor(1.5384, grad_fn=<NllLossBackward0>)
tensor(1.5641, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5089/17426 [07:53<29:22,  7.00it/s]

tensor(1.5356, grad_fn=<NllLossBackward0>)
tensor(1.5007, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5091/17426 [07:54<29:19,  7.01it/s]

tensor(1.5151, grad_fn=<NllLossBackward0>)
tensor(1.5395, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5093/17426 [07:54<29:51,  6.88it/s]

tensor(1.5002, grad_fn=<NllLossBackward0>)
tensor(1.5448, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5096/17426 [07:54<22:48,  9.01it/s]

tensor(1.4643, grad_fn=<NllLossBackward0>)
tensor(1.5334, grad_fn=<NllLossBackward0>)
tensor(1.4935, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5099/17426 [07:55<20:17, 10.12it/s]

tensor(1.5013, grad_fn=<NllLossBackward0>)
tensor(1.5799, grad_fn=<NllLossBackward0>)
tensor(1.5681, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5102/17426 [07:55<19:46, 10.38it/s]

tensor(1.5579, grad_fn=<NllLossBackward0>)
tensor(1.5434, grad_fn=<NllLossBackward0>)
tensor(1.5568, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5104/17426 [07:55<19:37, 10.46it/s]

tensor(1.4893, grad_fn=<NllLossBackward0>)
tensor(1.5142, grad_fn=<NllLossBackward0>)
tensor(1.5655, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5108/17426 [07:55<18:47, 10.93it/s]

tensor(1.5441, grad_fn=<NllLossBackward0>)
tensor(1.5110, grad_fn=<NllLossBackward0>)
tensor(1.5324, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5110/17426 [07:56<18:37, 11.02it/s]

tensor(1.5315, grad_fn=<NllLossBackward0>)
tensor(1.5466, grad_fn=<NllLossBackward0>)
tensor(1.5395, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5114/17426 [07:56<18:22, 11.17it/s]

tensor(1.5665, grad_fn=<NllLossBackward0>)
tensor(1.5522, grad_fn=<NllLossBackward0>)
tensor(1.5709, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5116/17426 [07:56<18:48, 10.91it/s]

tensor(1.5558, grad_fn=<NllLossBackward0>)
tensor(1.5172, grad_fn=<NllLossBackward0>)
tensor(1.5497, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5120/17426 [07:57<18:09, 11.29it/s]

tensor(1.5549, grad_fn=<NllLossBackward0>)
tensor(1.5362, grad_fn=<NllLossBackward0>)
tensor(1.4929, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5122/17426 [07:57<18:24, 11.14it/s]

tensor(1.5491, grad_fn=<NllLossBackward0>)
tensor(1.5554, grad_fn=<NllLossBackward0>)
tensor(1.5139, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5126/17426 [07:57<18:10, 11.28it/s]

tensor(1.5556, grad_fn=<NllLossBackward0>)
tensor(1.5633, grad_fn=<NllLossBackward0>)
tensor(1.5780, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5128/17426 [07:57<18:46, 10.91it/s]

tensor(1.5149, grad_fn=<NllLossBackward0>)
tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5132/17426 [07:58<18:14, 11.23it/s]

tensor(1.5025, grad_fn=<NllLossBackward0>)
tensor(1.5546, grad_fn=<NllLossBackward0>)
tensor(1.5274, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5134/17426 [07:58<18:15, 11.22it/s]

tensor(1.5125, grad_fn=<NllLossBackward0>)
tensor(1.4915, grad_fn=<NllLossBackward0>)
tensor(1.5496, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5138/17426 [07:58<18:37, 10.99it/s]

tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.5590, grad_fn=<NllLossBackward0>)
tensor(1.5464, grad_fn=<NllLossBackward0>)


 29%|██▉       | 5140/17426 [07:58<18:54, 10.83it/s]

tensor(1.5211, grad_fn=<NllLossBackward0>)
tensor(1.5638, grad_fn=<NllLossBackward0>)
tensor(1.6034, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5144/17426 [07:59<18:16, 11.20it/s]

tensor(1.5284, grad_fn=<NllLossBackward0>)
tensor(1.5715, grad_fn=<NllLossBackward0>)
tensor(1.5390, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5146/17426 [07:59<18:32, 11.04it/s]

tensor(1.5630, grad_fn=<NllLossBackward0>)
tensor(1.5533, grad_fn=<NllLossBackward0>)
tensor(1.5997, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5150/17426 [07:59<18:22, 11.14it/s]

tensor(1.5489, grad_fn=<NllLossBackward0>)
tensor(1.5556, grad_fn=<NllLossBackward0>)
tensor(1.5768, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5152/17426 [07:59<18:29, 11.06it/s]

tensor(1.5832, grad_fn=<NllLossBackward0>)
tensor(1.5686, grad_fn=<NllLossBackward0>)
tensor(1.5628, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5156/17426 [08:00<18:15, 11.20it/s]

tensor(1.5488, grad_fn=<NllLossBackward0>)
tensor(1.5482, grad_fn=<NllLossBackward0>)
tensor(1.5245, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5158/17426 [08:00<18:22, 11.13it/s]

tensor(1.5661, grad_fn=<NllLossBackward0>)
tensor(1.5310, grad_fn=<NllLossBackward0>)
tensor(1.4917, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5162/17426 [08:00<18:11, 11.24it/s]

tensor(1.5114, grad_fn=<NllLossBackward0>)
tensor(1.5532, grad_fn=<NllLossBackward0>)
tensor(1.5209, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5164/17426 [08:00<18:11, 11.24it/s]

tensor(1.5296, grad_fn=<NllLossBackward0>)
tensor(1.5261, grad_fn=<NllLossBackward0>)
tensor(1.5539, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5168/17426 [08:01<18:01, 11.33it/s]

tensor(1.5445, grad_fn=<NllLossBackward0>)
tensor(1.5923, grad_fn=<NllLossBackward0>)
tensor(1.5196, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5170/17426 [08:01<18:09, 11.25it/s]

tensor(1.5345, grad_fn=<NllLossBackward0>)
tensor(1.5212, grad_fn=<NllLossBackward0>)
tensor(1.5209, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5174/17426 [08:01<18:27, 11.06it/s]

tensor(1.5482, grad_fn=<NllLossBackward0>)
tensor(1.5703, grad_fn=<NllLossBackward0>)
tensor(1.5242, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5176/17426 [08:02<18:24, 11.09it/s]

tensor(1.5272, grad_fn=<NllLossBackward0>)
tensor(1.6090, grad_fn=<NllLossBackward0>)
tensor(1.5651, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5180/17426 [08:02<18:00, 11.34it/s]

tensor(1.5052, grad_fn=<NllLossBackward0>)
tensor(1.5574, grad_fn=<NllLossBackward0>)
tensor(1.5414, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5182/17426 [08:02<18:01, 11.32it/s]

tensor(1.5663, grad_fn=<NllLossBackward0>)
tensor(1.5217, grad_fn=<NllLossBackward0>)
tensor(1.5526, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5186/17426 [08:02<18:09, 11.23it/s]

tensor(1.5099, grad_fn=<NllLossBackward0>)
tensor(1.5884, grad_fn=<NllLossBackward0>)
tensor(1.5305, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5188/17426 [08:03<18:17, 11.16it/s]

tensor(1.5697, grad_fn=<NllLossBackward0>)
tensor(1.5057, grad_fn=<NllLossBackward0>)
tensor(1.5163, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5192/17426 [08:03<17:59, 11.34it/s]

tensor(1.5303, grad_fn=<NllLossBackward0>)
tensor(1.4619, grad_fn=<NllLossBackward0>)
tensor(1.5505, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5194/17426 [08:03<18:03, 11.29it/s]

tensor(1.5285, grad_fn=<NllLossBackward0>)
tensor(1.5212, grad_fn=<NllLossBackward0>)
tensor(1.5573, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5198/17426 [08:04<18:12, 11.19it/s]

tensor(1.5039, grad_fn=<NllLossBackward0>)
tensor(1.5162, grad_fn=<NllLossBackward0>)
tensor(1.5131, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5200/17426 [08:04<18:10, 11.21it/s]

tensor(1.5502, grad_fn=<NllLossBackward0>)
tensor(1.5525, grad_fn=<NllLossBackward0>)
tensor(1.5334, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5204/17426 [08:04<18:42, 10.89it/s]

tensor(1.5735, grad_fn=<NllLossBackward0>)
tensor(1.5623, grad_fn=<NllLossBackward0>)
tensor(1.5416, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5206/17426 [08:04<21:09,  9.63it/s]

tensor(1.6130, grad_fn=<NllLossBackward0>)
tensor(1.5641, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5208/17426 [08:05<23:20,  8.72it/s]

tensor(1.5405, grad_fn=<NllLossBackward0>)
tensor(1.5771, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5210/17426 [08:05<24:02,  8.47it/s]

tensor(1.5559, grad_fn=<NllLossBackward0>)
tensor(1.5532, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5212/17426 [08:05<25:28,  7.99it/s]

tensor(1.5185, grad_fn=<NllLossBackward0>)
tensor(1.5385, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5214/17426 [08:05<25:15,  8.06it/s]

tensor(1.5570, grad_fn=<NllLossBackward0>)
tensor(1.5451, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5216/17426 [08:06<26:17,  7.74it/s]

tensor(1.5394, grad_fn=<NllLossBackward0>)
tensor(1.5258, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5218/17426 [08:06<24:52,  8.18it/s]

tensor(1.5979, grad_fn=<NllLossBackward0>)
tensor(1.5491, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5220/17426 [08:06<24:54,  8.17it/s]

tensor(1.5704, grad_fn=<NllLossBackward0>)
tensor(1.5461, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5222/17426 [08:06<26:12,  7.76it/s]

tensor(1.4985, grad_fn=<NllLossBackward0>)
tensor(1.5414, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5224/17426 [08:07<27:47,  7.32it/s]

tensor(1.5185, grad_fn=<NllLossBackward0>)
tensor(1.5048, grad_fn=<NllLossBackward0>)


 30%|██▉       | 5226/17426 [08:07<27:37,  7.36it/s]

tensor(1.5181, grad_fn=<NllLossBackward0>)
tensor(1.5298, grad_fn=<NllLossBackward0>)


 30%|███       | 5228/17426 [08:07<29:27,  6.90it/s]

tensor(1.4951, grad_fn=<NllLossBackward0>)
tensor(1.5027, grad_fn=<NllLossBackward0>)


 30%|███       | 5230/17426 [08:08<28:41,  7.08it/s]

tensor(1.4946, grad_fn=<NllLossBackward0>)
tensor(1.5603, grad_fn=<NllLossBackward0>)


 30%|███       | 5233/17426 [08:08<22:16,  9.12it/s]

tensor(1.5188, grad_fn=<NllLossBackward0>)
tensor(1.5274, grad_fn=<NllLossBackward0>)
tensor(1.5482, grad_fn=<NllLossBackward0>)


 30%|███       | 5236/17426 [08:08<20:16, 10.02it/s]

tensor(1.5576, grad_fn=<NllLossBackward0>)
tensor(1.5235, grad_fn=<NllLossBackward0>)
tensor(1.5588, grad_fn=<NllLossBackward0>)


 30%|███       | 5239/17426 [08:08<19:24, 10.47it/s]

tensor(1.4933, grad_fn=<NllLossBackward0>)
tensor(1.5476, grad_fn=<NllLossBackward0>)
tensor(1.5749, grad_fn=<NllLossBackward0>)


 30%|███       | 5241/17426 [08:09<19:15, 10.54it/s]

tensor(1.5388, grad_fn=<NllLossBackward0>)
tensor(1.5452, grad_fn=<NllLossBackward0>)
tensor(1.5414, grad_fn=<NllLossBackward0>)


 30%|███       | 5245/17426 [08:09<18:26, 11.01it/s]

tensor(1.5360, grad_fn=<NllLossBackward0>)
tensor(1.5448, grad_fn=<NllLossBackward0>)
tensor(1.5501, grad_fn=<NllLossBackward0>)


 30%|███       | 5247/17426 [08:09<19:11, 10.58it/s]

tensor(1.5369, grad_fn=<NllLossBackward0>)
tensor(1.5190, grad_fn=<NllLossBackward0>)
tensor(1.5108, grad_fn=<NllLossBackward0>)


 30%|███       | 5251/17426 [08:09<18:32, 10.95it/s]

tensor(1.5320, grad_fn=<NllLossBackward0>)
tensor(1.5646, grad_fn=<NllLossBackward0>)
tensor(1.5458, grad_fn=<NllLossBackward0>)


 30%|███       | 5253/17426 [08:10<18:57, 10.70it/s]

tensor(1.5602, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)
tensor(1.5366, grad_fn=<NllLossBackward0>)


 30%|███       | 5257/17426 [08:10<18:37, 10.89it/s]

tensor(1.5493, grad_fn=<NllLossBackward0>)
tensor(1.5555, grad_fn=<NllLossBackward0>)
tensor(1.5451, grad_fn=<NllLossBackward0>)


 30%|███       | 5259/17426 [08:10<18:33, 10.93it/s]

tensor(1.4987, grad_fn=<NllLossBackward0>)
tensor(1.5534, grad_fn=<NllLossBackward0>)
tensor(1.5555, grad_fn=<NllLossBackward0>)


 30%|███       | 5263/17426 [08:11<17:57, 11.28it/s]

tensor(1.5475, grad_fn=<NllLossBackward0>)
tensor(1.5229, grad_fn=<NllLossBackward0>)
tensor(1.5299, grad_fn=<NllLossBackward0>)


 30%|███       | 5265/17426 [08:11<18:31, 10.94it/s]

tensor(1.5319, grad_fn=<NllLossBackward0>)
tensor(1.5902, grad_fn=<NllLossBackward0>)
tensor(1.5020, grad_fn=<NllLossBackward0>)


 30%|███       | 5269/17426 [08:11<18:01, 11.24it/s]

tensor(1.5627, grad_fn=<NllLossBackward0>)
tensor(1.5121, grad_fn=<NllLossBackward0>)
tensor(1.5194, grad_fn=<NllLossBackward0>)


 30%|███       | 5271/17426 [08:11<18:10, 11.14it/s]

tensor(1.4958, grad_fn=<NllLossBackward0>)
tensor(1.5146, grad_fn=<NllLossBackward0>)
tensor(1.5336, grad_fn=<NllLossBackward0>)


 30%|███       | 5275/17426 [08:12<17:51, 11.34it/s]

tensor(1.5516, grad_fn=<NllLossBackward0>)
tensor(1.4855, grad_fn=<NllLossBackward0>)
tensor(1.5175, grad_fn=<NllLossBackward0>)


 30%|███       | 5277/17426 [08:12<18:25, 10.99it/s]

tensor(1.5351, grad_fn=<NllLossBackward0>)
tensor(1.5594, grad_fn=<NllLossBackward0>)
tensor(1.5098, grad_fn=<NllLossBackward0>)


 30%|███       | 5281/17426 [08:12<17:44, 11.41it/s]

tensor(1.4983, grad_fn=<NllLossBackward0>)
tensor(1.5374, grad_fn=<NllLossBackward0>)
tensor(1.5210, grad_fn=<NllLossBackward0>)


 30%|███       | 5283/17426 [08:12<17:53, 11.31it/s]

tensor(1.5137, grad_fn=<NllLossBackward0>)
tensor(1.5782, grad_fn=<NllLossBackward0>)
tensor(1.4922, grad_fn=<NllLossBackward0>)


 30%|███       | 5287/17426 [08:13<17:56, 11.27it/s]

tensor(1.5256, grad_fn=<NllLossBackward0>)
tensor(1.5846, grad_fn=<NllLossBackward0>)
tensor(1.5311, grad_fn=<NllLossBackward0>)


 30%|███       | 5289/17426 [08:13<18:11, 11.12it/s]

tensor(1.5257, grad_fn=<NllLossBackward0>)
tensor(1.6110, grad_fn=<NllLossBackward0>)
tensor(1.5294, grad_fn=<NllLossBackward0>)


 30%|███       | 5293/17426 [08:13<17:44, 11.40it/s]

tensor(1.5034, grad_fn=<NllLossBackward0>)
tensor(1.5345, grad_fn=<NllLossBackward0>)
tensor(1.5143, grad_fn=<NllLossBackward0>)


 30%|███       | 5295/17426 [08:13<18:17, 11.05it/s]

tensor(1.5485, grad_fn=<NllLossBackward0>)
tensor(1.5209, grad_fn=<NllLossBackward0>)
tensor(1.5868, grad_fn=<NllLossBackward0>)


 30%|███       | 5299/17426 [08:14<18:11, 11.11it/s]

tensor(1.5430, grad_fn=<NllLossBackward0>)
tensor(1.5275, grad_fn=<NllLossBackward0>)
tensor(1.5461, grad_fn=<NllLossBackward0>)


 30%|███       | 5301/17426 [08:14<18:12, 11.10it/s]

tensor(1.5678, grad_fn=<NllLossBackward0>)
tensor(1.4916, grad_fn=<NllLossBackward0>)
tensor(1.5238, grad_fn=<NllLossBackward0>)


 30%|███       | 5305/17426 [08:14<17:51, 11.31it/s]

tensor(1.5466, grad_fn=<NllLossBackward0>)
tensor(1.5237, grad_fn=<NllLossBackward0>)
tensor(1.5411, grad_fn=<NllLossBackward0>)


 30%|███       | 5307/17426 [08:15<17:55, 11.26it/s]

tensor(1.5720, grad_fn=<NllLossBackward0>)
tensor(1.5505, grad_fn=<NllLossBackward0>)
tensor(1.5275, grad_fn=<NllLossBackward0>)


 30%|███       | 5311/17426 [08:15<17:47, 11.35it/s]

tensor(1.5503, grad_fn=<NllLossBackward0>)
tensor(1.5545, grad_fn=<NllLossBackward0>)
tensor(1.5476, grad_fn=<NllLossBackward0>)


 30%|███       | 5313/17426 [08:15<17:55, 11.26it/s]

tensor(1.5827, grad_fn=<NllLossBackward0>)
tensor(1.5132, grad_fn=<NllLossBackward0>)
tensor(1.5756, grad_fn=<NllLossBackward0>)


 31%|███       | 5317/17426 [08:15<17:39, 11.43it/s]

tensor(1.5506, grad_fn=<NllLossBackward0>)
tensor(1.5336, grad_fn=<NllLossBackward0>)
tensor(1.5408, grad_fn=<NllLossBackward0>)


 31%|███       | 5319/17426 [08:16<17:55, 11.26it/s]

tensor(1.5274, grad_fn=<NllLossBackward0>)
tensor(1.5194, grad_fn=<NllLossBackward0>)
tensor(1.5270, grad_fn=<NllLossBackward0>)


 31%|███       | 5323/17426 [08:16<18:02, 11.18it/s]

tensor(1.5216, grad_fn=<NllLossBackward0>)
tensor(1.5200, grad_fn=<NllLossBackward0>)
tensor(1.5353, grad_fn=<NllLossBackward0>)


 31%|███       | 5325/17426 [08:16<18:07, 11.13it/s]

tensor(1.5146, grad_fn=<NllLossBackward0>)
tensor(1.6006, grad_fn=<NllLossBackward0>)
tensor(1.5735, grad_fn=<NllLossBackward0>)


 31%|███       | 5329/17426 [08:16<17:44, 11.37it/s]

tensor(1.5289, grad_fn=<NllLossBackward0>)
tensor(1.5732, grad_fn=<NllLossBackward0>)
tensor(1.5354, grad_fn=<NllLossBackward0>)


 31%|███       | 5331/17426 [08:17<17:53, 11.27it/s]

tensor(1.5157, grad_fn=<NllLossBackward0>)
tensor(1.5503, grad_fn=<NllLossBackward0>)
tensor(1.5902, grad_fn=<NllLossBackward0>)


 31%|███       | 5335/17426 [08:17<18:04, 11.15it/s]

tensor(1.5039, grad_fn=<NllLossBackward0>)
tensor(1.5492, grad_fn=<NllLossBackward0>)
tensor(1.5626, grad_fn=<NllLossBackward0>)


 31%|███       | 5337/17426 [08:17<18:11, 11.07it/s]

tensor(1.5463, grad_fn=<NllLossBackward0>)
tensor(1.5128, grad_fn=<NllLossBackward0>)
tensor(1.4929, grad_fn=<NllLossBackward0>)


 31%|███       | 5341/17426 [08:18<18:18, 11.01it/s]

tensor(1.5380, grad_fn=<NllLossBackward0>)
tensor(1.5588, grad_fn=<NllLossBackward0>)
tensor(1.5198, grad_fn=<NllLossBackward0>)


 31%|███       | 5343/17426 [08:18<20:35,  9.78it/s]

tensor(1.5056, grad_fn=<NllLossBackward0>)
tensor(1.5610, grad_fn=<NllLossBackward0>)


 31%|███       | 5345/17426 [08:18<22:51,  8.81it/s]

tensor(1.5142, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)


 31%|███       | 5347/17426 [08:18<23:54,  8.42it/s]

tensor(1.5690, grad_fn=<NllLossBackward0>)
tensor(1.5452, grad_fn=<NllLossBackward0>)


 31%|███       | 5349/17426 [08:19<24:09,  8.33it/s]

tensor(1.5653, grad_fn=<NllLossBackward0>)
tensor(1.4805, grad_fn=<NllLossBackward0>)


 31%|███       | 5351/17426 [08:19<23:43,  8.49it/s]

tensor(1.5497, grad_fn=<NllLossBackward0>)
tensor(1.5409, grad_fn=<NllLossBackward0>)


 31%|███       | 5353/17426 [08:19<24:52,  8.09it/s]

tensor(1.5274, grad_fn=<NllLossBackward0>)
tensor(1.4900, grad_fn=<NllLossBackward0>)


 31%|███       | 5355/17426 [08:19<23:53,  8.42it/s]

tensor(1.5166, grad_fn=<NllLossBackward0>)
tensor(1.5214, grad_fn=<NllLossBackward0>)


 31%|███       | 5357/17426 [08:20<23:53,  8.42it/s]

tensor(1.5462, grad_fn=<NllLossBackward0>)
tensor(1.5252, grad_fn=<NllLossBackward0>)


 31%|███       | 5359/17426 [08:20<23:49,  8.44it/s]

tensor(1.5928, grad_fn=<NllLossBackward0>)
tensor(1.5171, grad_fn=<NllLossBackward0>)


 31%|███       | 5361/17426 [08:20<23:43,  8.48it/s]

tensor(1.5737, grad_fn=<NllLossBackward0>)
tensor(1.5212, grad_fn=<NllLossBackward0>)


 31%|███       | 5363/17426 [08:20<26:29,  7.59it/s]

tensor(1.5093, grad_fn=<NllLossBackward0>)
tensor(1.5774, grad_fn=<NllLossBackward0>)


 31%|███       | 5365/17426 [08:21<26:47,  7.50it/s]

tensor(1.5460, grad_fn=<NllLossBackward0>)
tensor(1.4951, grad_fn=<NllLossBackward0>)


 31%|███       | 5367/17426 [08:21<27:57,  7.19it/s]

tensor(1.5222, grad_fn=<NllLossBackward0>)
tensor(1.5094, grad_fn=<NllLossBackward0>)


 31%|███       | 5369/17426 [08:21<27:55,  7.20it/s]

tensor(1.5491, grad_fn=<NllLossBackward0>)
tensor(1.5883, grad_fn=<NllLossBackward0>)


 31%|███       | 5372/17426 [08:21<22:41,  8.86it/s]

tensor(1.5706, grad_fn=<NllLossBackward0>)
tensor(1.5384, grad_fn=<NllLossBackward0>)
tensor(1.5068, grad_fn=<NllLossBackward0>)


 31%|███       | 5375/17426 [08:22<19:57, 10.06it/s]

tensor(1.5592, grad_fn=<NllLossBackward0>)
tensor(1.5417, grad_fn=<NllLossBackward0>)
tensor(1.5244, grad_fn=<NllLossBackward0>)


 31%|███       | 5378/17426 [08:22<19:00, 10.57it/s]

tensor(1.5044, grad_fn=<NllLossBackward0>)
tensor(1.5282, grad_fn=<NllLossBackward0>)
tensor(1.5213, grad_fn=<NllLossBackward0>)


 31%|███       | 5380/17426 [08:22<19:14, 10.44it/s]

tensor(1.5226, grad_fn=<NllLossBackward0>)
tensor(1.5534, grad_fn=<NllLossBackward0>)
tensor(1.4994, grad_fn=<NllLossBackward0>)


 31%|███       | 5384/17426 [08:23<18:06, 11.08it/s]

tensor(1.5501, grad_fn=<NllLossBackward0>)
tensor(1.5850, grad_fn=<NllLossBackward0>)
tensor(1.4681, grad_fn=<NllLossBackward0>)


 31%|███       | 5386/17426 [08:23<18:11, 11.03it/s]

tensor(1.5387, grad_fn=<NllLossBackward0>)
tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.5438, grad_fn=<NllLossBackward0>)


 31%|███       | 5390/17426 [08:23<17:44, 11.31it/s]

tensor(1.5613, grad_fn=<NllLossBackward0>)
tensor(1.5595, grad_fn=<NllLossBackward0>)
tensor(1.5527, grad_fn=<NllLossBackward0>)


 31%|███       | 5392/17426 [08:23<18:32, 10.82it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.5325, grad_fn=<NllLossBackward0>)
tensor(1.5365, grad_fn=<NllLossBackward0>)


 31%|███       | 5396/17426 [08:24<17:58, 11.15it/s]

tensor(1.5701, grad_fn=<NllLossBackward0>)
tensor(1.5432, grad_fn=<NllLossBackward0>)
tensor(1.5394, grad_fn=<NllLossBackward0>)


 31%|███       | 5398/17426 [08:24<17:54, 11.20it/s]

tensor(1.5282, grad_fn=<NllLossBackward0>)
tensor(1.5115, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)


 31%|███       | 5402/17426 [08:24<17:32, 11.42it/s]

tensor(1.5691, grad_fn=<NllLossBackward0>)
tensor(1.5413, grad_fn=<NllLossBackward0>)
tensor(1.5069, grad_fn=<NllLossBackward0>)


 31%|███       | 5404/17426 [08:24<18:02, 11.11it/s]

tensor(1.5240, grad_fn=<NllLossBackward0>)
tensor(1.5685, grad_fn=<NllLossBackward0>)
tensor(1.4840, grad_fn=<NllLossBackward0>)


 31%|███       | 5408/17426 [08:25<17:35, 11.39it/s]

tensor(1.5327, grad_fn=<NllLossBackward0>)
tensor(1.5603, grad_fn=<NllLossBackward0>)
tensor(1.5222, grad_fn=<NllLossBackward0>)


 31%|███       | 5410/17426 [08:25<17:44, 11.29it/s]

tensor(1.5358, grad_fn=<NllLossBackward0>)
tensor(1.5434, grad_fn=<NllLossBackward0>)
tensor(1.5497, grad_fn=<NllLossBackward0>)


 31%|███       | 5414/17426 [08:25<17:30, 11.43it/s]

tensor(1.5542, grad_fn=<NllLossBackward0>)
tensor(1.4630, grad_fn=<NllLossBackward0>)
tensor(1.5415, grad_fn=<NllLossBackward0>)


 31%|███       | 5416/17426 [08:25<18:13, 10.98it/s]

tensor(1.5272, grad_fn=<NllLossBackward0>)
tensor(1.5151, grad_fn=<NllLossBackward0>)
tensor(1.5459, grad_fn=<NllLossBackward0>)


 31%|███       | 5420/17426 [08:26<17:28, 11.45it/s]

tensor(1.5412, grad_fn=<NllLossBackward0>)
tensor(1.5121, grad_fn=<NllLossBackward0>)
tensor(1.5285, grad_fn=<NllLossBackward0>)


 31%|███       | 5422/17426 [08:26<17:34, 11.38it/s]

tensor(1.5558, grad_fn=<NllLossBackward0>)
tensor(1.5216, grad_fn=<NllLossBackward0>)
tensor(1.5200, grad_fn=<NllLossBackward0>)


 31%|███       | 5426/17426 [08:26<17:22, 11.51it/s]

tensor(1.5223, grad_fn=<NllLossBackward0>)
tensor(1.5254, grad_fn=<NllLossBackward0>)
tensor(1.5377, grad_fn=<NllLossBackward0>)


 31%|███       | 5428/17426 [08:27<18:07, 11.04it/s]

tensor(1.5483, grad_fn=<NllLossBackward0>)
tensor(1.5190, grad_fn=<NllLossBackward0>)
tensor(1.5482, grad_fn=<NllLossBackward0>)


 31%|███       | 5432/17426 [08:27<17:42, 11.29it/s]

tensor(1.5410, grad_fn=<NllLossBackward0>)
tensor(1.5283, grad_fn=<NllLossBackward0>)
tensor(1.5704, grad_fn=<NllLossBackward0>)


 31%|███       | 5434/17426 [08:27<17:38, 11.32it/s]

tensor(1.5313, grad_fn=<NllLossBackward0>)
tensor(1.5529, grad_fn=<NllLossBackward0>)
tensor(1.5052, grad_fn=<NllLossBackward0>)


 31%|███       | 5438/17426 [08:27<17:21, 11.51it/s]

tensor(1.4786, grad_fn=<NllLossBackward0>)
tensor(1.5291, grad_fn=<NllLossBackward0>)
tensor(1.5178, grad_fn=<NllLossBackward0>)


 31%|███       | 5440/17426 [08:28<17:52, 11.18it/s]

tensor(1.5432, grad_fn=<NllLossBackward0>)
tensor(1.5405, grad_fn=<NllLossBackward0>)
tensor(1.5351, grad_fn=<NllLossBackward0>)


 31%|███       | 5444/17426 [08:28<17:38, 11.32it/s]

tensor(1.5092, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)
tensor(1.5696, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5446/17426 [08:28<17:36, 11.33it/s]

tensor(1.5390, grad_fn=<NllLossBackward0>)
tensor(1.5710, grad_fn=<NllLossBackward0>)
tensor(1.5078, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5450/17426 [08:28<17:25, 11.45it/s]

tensor(1.5458, grad_fn=<NllLossBackward0>)
tensor(1.5311, grad_fn=<NllLossBackward0>)
tensor(1.5075, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5452/17426 [08:29<18:03, 11.05it/s]

tensor(1.5563, grad_fn=<NllLossBackward0>)
tensor(1.5739, grad_fn=<NllLossBackward0>)
tensor(1.5632, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5456/17426 [08:29<17:31, 11.39it/s]

tensor(1.4847, grad_fn=<NllLossBackward0>)
tensor(1.4858, grad_fn=<NllLossBackward0>)
tensor(1.5690, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5458/17426 [08:29<17:42, 11.26it/s]

tensor(1.5039, grad_fn=<NllLossBackward0>)
tensor(1.5914, grad_fn=<NllLossBackward0>)
tensor(1.5247, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5462/17426 [08:30<18:08, 10.99it/s]

tensor(1.5867, grad_fn=<NllLossBackward0>)
tensor(1.5405, grad_fn=<NllLossBackward0>)
tensor(1.5420, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5464/17426 [08:30<18:03, 11.04it/s]

tensor(1.5304, grad_fn=<NllLossBackward0>)
tensor(1.5719, grad_fn=<NllLossBackward0>)
tensor(1.5395, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5468/17426 [08:30<17:27, 11.41it/s]

tensor(1.5117, grad_fn=<NllLossBackward0>)
tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.4980, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5470/17426 [08:30<17:48, 11.19it/s]

tensor(1.5376, grad_fn=<NllLossBackward0>)
tensor(1.5343, grad_fn=<NllLossBackward0>)
tensor(1.4882, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5474/17426 [08:31<17:54, 11.12it/s]

tensor(1.5395, grad_fn=<NllLossBackward0>)
tensor(1.5166, grad_fn=<NllLossBackward0>)
tensor(1.5316, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5476/17426 [08:31<18:00, 11.06it/s]

tensor(1.5429, grad_fn=<NllLossBackward0>)
tensor(1.4935, grad_fn=<NllLossBackward0>)
tensor(1.5218, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5480/17426 [08:31<17:26, 11.41it/s]

tensor(1.5486, grad_fn=<NllLossBackward0>)
tensor(1.5355, grad_fn=<NllLossBackward0>)
tensor(1.5806, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5482/17426 [08:31<17:32, 11.34it/s]

tensor(1.6054, grad_fn=<NllLossBackward0>)
tensor(1.5072, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5484/17426 [08:32<20:11,  9.86it/s]

tensor(1.5493, grad_fn=<NllLossBackward0>)
tensor(1.5321, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5486/17426 [08:32<22:23,  8.88it/s]

tensor(1.5701, grad_fn=<NllLossBackward0>)
tensor(1.5205, grad_fn=<NllLossBackward0>)


 31%|███▏      | 5488/17426 [08:32<23:53,  8.33it/s]

tensor(1.5607, grad_fn=<NllLossBackward0>)
tensor(1.5317, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5490/17426 [08:32<24:34,  8.10it/s]

tensor(1.5640, grad_fn=<NllLossBackward0>)
tensor(1.5354, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5492/17426 [08:33<24:53,  7.99it/s]

tensor(1.5033, grad_fn=<NllLossBackward0>)
tensor(1.5013, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5494/17426 [08:33<25:21,  7.84it/s]

tensor(1.5355, grad_fn=<NllLossBackward0>)
tensor(1.5258, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5496/17426 [08:33<24:16,  8.19it/s]

tensor(1.5553, grad_fn=<NllLossBackward0>)
tensor(1.5337, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5498/17426 [08:33<24:41,  8.05it/s]

tensor(1.4832, grad_fn=<NllLossBackward0>)
tensor(1.5134, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5500/17426 [08:34<26:12,  7.59it/s]

tensor(1.5535, grad_fn=<NllLossBackward0>)
tensor(1.4978, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5502/17426 [08:34<26:28,  7.51it/s]

tensor(1.5077, grad_fn=<NllLossBackward0>)
tensor(1.5787, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5504/17426 [08:34<26:14,  7.57it/s]

tensor(1.5529, grad_fn=<NllLossBackward0>)
tensor(1.5481, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5506/17426 [08:34<27:10,  7.31it/s]

tensor(1.4684, grad_fn=<NllLossBackward0>)
tensor(1.5381, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5508/17426 [08:35<27:17,  7.28it/s]

tensor(1.5095, grad_fn=<NllLossBackward0>)
tensor(1.5058, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5511/17426 [08:35<21:52,  9.08it/s]

tensor(1.5214, grad_fn=<NllLossBackward0>)
tensor(1.5013, grad_fn=<NllLossBackward0>)
tensor(1.5432, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5514/17426 [08:35<19:39, 10.10it/s]

tensor(1.5458, grad_fn=<NllLossBackward0>)
tensor(1.5457, grad_fn=<NllLossBackward0>)
tensor(1.5448, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5517/17426 [08:36<18:39, 10.64it/s]

tensor(1.4967, grad_fn=<NllLossBackward0>)
tensor(1.4993, grad_fn=<NllLossBackward0>)
tensor(1.5331, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5519/17426 [08:36<18:31, 10.72it/s]

tensor(1.4658, grad_fn=<NllLossBackward0>)
tensor(1.5249, grad_fn=<NllLossBackward0>)
tensor(1.5514, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5523/17426 [08:36<18:01, 11.01it/s]

tensor(1.5355, grad_fn=<NllLossBackward0>)
tensor(1.5341, grad_fn=<NllLossBackward0>)
tensor(1.5268, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5525/17426 [08:36<18:17, 10.85it/s]

tensor(1.4961, grad_fn=<NllLossBackward0>)
tensor(1.5396, grad_fn=<NllLossBackward0>)
tensor(1.5612, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5529/17426 [08:37<17:40, 11.22it/s]

tensor(1.5505, grad_fn=<NllLossBackward0>)
tensor(1.5728, grad_fn=<NllLossBackward0>)
tensor(1.5371, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5531/17426 [08:37<17:39, 11.23it/s]

tensor(1.4818, grad_fn=<NllLossBackward0>)
tensor(1.4989, grad_fn=<NllLossBackward0>)
tensor(1.5390, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5535/17426 [08:37<17:38, 11.23it/s]

tensor(1.5114, grad_fn=<NllLossBackward0>)
tensor(1.5229, grad_fn=<NllLossBackward0>)
tensor(1.5178, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5537/17426 [08:37<17:58, 11.02it/s]

tensor(1.5151, grad_fn=<NllLossBackward0>)
tensor(1.5728, grad_fn=<NllLossBackward0>)
tensor(1.5620, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5541/17426 [08:38<17:21, 11.41it/s]

tensor(1.5453, grad_fn=<NllLossBackward0>)
tensor(1.5514, grad_fn=<NllLossBackward0>)
tensor(1.5515, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5543/17426 [08:38<17:33, 11.28it/s]

tensor(1.5016, grad_fn=<NllLossBackward0>)
tensor(1.5350, grad_fn=<NllLossBackward0>)
tensor(1.5172, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5547/17426 [08:38<17:45, 11.15it/s]

tensor(1.5081, grad_fn=<NllLossBackward0>)
tensor(1.5843, grad_fn=<NllLossBackward0>)
tensor(1.5723, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5549/17426 [08:38<17:43, 11.17it/s]

tensor(1.5434, grad_fn=<NllLossBackward0>)
tensor(1.5252, grad_fn=<NllLossBackward0>)
tensor(1.4887, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5553/17426 [08:39<17:20, 11.42it/s]

tensor(1.5290, grad_fn=<NllLossBackward0>)
tensor(1.5611, grad_fn=<NllLossBackward0>)
tensor(1.5359, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5555/17426 [08:39<17:57, 11.02it/s]

tensor(1.5243, grad_fn=<NllLossBackward0>)
tensor(1.6050, grad_fn=<NllLossBackward0>)
tensor(1.5125, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5559/17426 [08:39<18:14, 10.85it/s]

tensor(1.5398, grad_fn=<NllLossBackward0>)
tensor(1.5286, grad_fn=<NllLossBackward0>)
tensor(1.5410, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5561/17426 [08:40<18:10, 10.88it/s]

tensor(1.4858, grad_fn=<NllLossBackward0>)
tensor(1.5451, grad_fn=<NllLossBackward0>)
tensor(1.6237, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5565/17426 [08:40<18:04, 10.94it/s]

tensor(1.5216, grad_fn=<NllLossBackward0>)
tensor(1.5218, grad_fn=<NllLossBackward0>)
tensor(1.5389, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5567/17426 [08:40<18:25, 10.72it/s]

tensor(1.5477, grad_fn=<NllLossBackward0>)
tensor(1.5363, grad_fn=<NllLossBackward0>)
tensor(1.5569, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5571/17426 [08:41<18:01, 10.96it/s]

tensor(1.5783, grad_fn=<NllLossBackward0>)
tensor(1.5742, grad_fn=<NllLossBackward0>)
tensor(1.5268, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5573/17426 [08:41<18:07, 10.90it/s]

tensor(1.5321, grad_fn=<NllLossBackward0>)
tensor(1.5278, grad_fn=<NllLossBackward0>)
tensor(1.5433, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5577/17426 [08:41<17:48, 11.09it/s]

tensor(1.4827, grad_fn=<NllLossBackward0>)
tensor(1.5805, grad_fn=<NllLossBackward0>)
tensor(1.5336, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5579/17426 [08:41<18:16, 10.80it/s]

tensor(1.5482, grad_fn=<NllLossBackward0>)
tensor(1.5445, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5583/17426 [08:42<17:36, 11.21it/s]

tensor(1.5659, grad_fn=<NllLossBackward0>)
tensor(1.5630, grad_fn=<NllLossBackward0>)
tensor(1.5062, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5585/17426 [08:42<17:56, 11.00it/s]

tensor(1.5109, grad_fn=<NllLossBackward0>)
tensor(1.5409, grad_fn=<NllLossBackward0>)
tensor(1.5127, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5589/17426 [08:42<17:47, 11.09it/s]

tensor(1.5568, grad_fn=<NllLossBackward0>)
tensor(1.5575, grad_fn=<NllLossBackward0>)
tensor(1.5657, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5591/17426 [08:42<17:52, 11.03it/s]

tensor(1.5458, grad_fn=<NllLossBackward0>)
tensor(1.5038, grad_fn=<NllLossBackward0>)
tensor(1.5462, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5595/17426 [08:43<17:36, 11.20it/s]

tensor(1.5118, grad_fn=<NllLossBackward0>)
tensor(1.5003, grad_fn=<NllLossBackward0>)
tensor(1.5451, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5597/17426 [08:43<17:38, 11.18it/s]

tensor(1.4958, grad_fn=<NllLossBackward0>)
tensor(1.5570, grad_fn=<NllLossBackward0>)
tensor(1.5407, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5601/17426 [08:43<17:34, 11.22it/s]

tensor(1.5058, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)
tensor(1.5104, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5603/17426 [08:43<17:33, 11.22it/s]

tensor(1.5564, grad_fn=<NllLossBackward0>)
tensor(1.5055, grad_fn=<NllLossBackward0>)
tensor(1.5699, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5607/17426 [08:44<17:28, 11.28it/s]

tensor(1.5388, grad_fn=<NllLossBackward0>)
tensor(1.5753, grad_fn=<NllLossBackward0>)
tensor(1.5375, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5609/17426 [08:44<17:33, 11.21it/s]

tensor(1.5776, grad_fn=<NllLossBackward0>)
tensor(1.5519, grad_fn=<NllLossBackward0>)
tensor(1.5733, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5613/17426 [08:44<17:38, 11.16it/s]

tensor(1.5085, grad_fn=<NllLossBackward0>)
tensor(1.4937, grad_fn=<NllLossBackward0>)
tensor(1.4885, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5615/17426 [08:44<17:54, 10.99it/s]

tensor(1.5193, grad_fn=<NllLossBackward0>)
tensor(1.5577, grad_fn=<NllLossBackward0>)
tensor(1.5307, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5619/17426 [08:45<18:12, 10.80it/s]

tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.5431, grad_fn=<NllLossBackward0>)
tensor(1.5312, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5621/17426 [08:45<20:27,  9.61it/s]

tensor(1.5098, grad_fn=<NllLossBackward0>)
tensor(1.5247, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5623/17426 [08:45<22:43,  8.66it/s]

tensor(1.4778, grad_fn=<NllLossBackward0>)
tensor(1.5700, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5625/17426 [08:46<23:59,  8.20it/s]

tensor(1.5600, grad_fn=<NllLossBackward0>)
tensor(1.5422, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5627/17426 [08:46<23:57,  8.21it/s]

tensor(1.5393, grad_fn=<NllLossBackward0>)
tensor(1.5274, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5629/17426 [08:46<24:17,  8.09it/s]

tensor(1.5432, grad_fn=<NllLossBackward0>)
tensor(1.5396, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5631/17426 [08:46<25:10,  7.81it/s]

tensor(1.5334, grad_fn=<NllLossBackward0>)
tensor(1.5480, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5633/17426 [08:47<24:10,  8.13it/s]

tensor(1.4998, grad_fn=<NllLossBackward0>)
tensor(1.5060, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5635/17426 [08:47<23:59,  8.19it/s]

tensor(1.4829, grad_fn=<NllLossBackward0>)
tensor(1.5320, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5637/17426 [08:47<23:09,  8.48it/s]

tensor(1.5247, grad_fn=<NllLossBackward0>)
tensor(1.5097, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5639/17426 [08:47<24:20,  8.07it/s]

tensor(1.5485, grad_fn=<NllLossBackward0>)
tensor(1.5410, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5641/17426 [08:48<25:48,  7.61it/s]

tensor(1.5432, grad_fn=<NllLossBackward0>)
tensor(1.4799, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5643/17426 [08:48<27:15,  7.21it/s]

tensor(1.5078, grad_fn=<NllLossBackward0>)
tensor(1.5287, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5645/17426 [08:48<27:01,  7.27it/s]

tensor(1.5208, grad_fn=<NllLossBackward0>)
tensor(1.5129, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5647/17426 [08:49<26:52,  7.31it/s]

tensor(1.5210, grad_fn=<NllLossBackward0>)
tensor(1.5724, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5650/17426 [08:49<20:54,  9.39it/s]

tensor(1.4975, grad_fn=<NllLossBackward0>)
tensor(1.5250, grad_fn=<NllLossBackward0>)
tensor(1.5463, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5653/17426 [08:49<18:55, 10.37it/s]

tensor(1.5465, grad_fn=<NllLossBackward0>)
tensor(1.5317, grad_fn=<NllLossBackward0>)
tensor(1.5898, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5655/17426 [08:49<18:24, 10.66it/s]

tensor(1.5358, grad_fn=<NllLossBackward0>)
tensor(1.5308, grad_fn=<NllLossBackward0>)
tensor(1.5724, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5659/17426 [08:50<18:09, 10.80it/s]

tensor(1.5230, grad_fn=<NllLossBackward0>)
tensor(1.4954, grad_fn=<NllLossBackward0>)
tensor(1.5444, grad_fn=<NllLossBackward0>)


 32%|███▏      | 5661/17426 [08:50<17:59, 10.89it/s]

tensor(1.5172, grad_fn=<NllLossBackward0>)
tensor(1.5759, grad_fn=<NllLossBackward0>)
tensor(1.5172, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5665/17426 [08:50<17:25, 11.24it/s]

tensor(1.5535, grad_fn=<NllLossBackward0>)
tensor(1.4992, grad_fn=<NllLossBackward0>)
tensor(1.5604, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5667/17426 [08:50<17:17, 11.34it/s]

tensor(1.5475, grad_fn=<NllLossBackward0>)
tensor(1.5660, grad_fn=<NllLossBackward0>)
tensor(1.5202, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5671/17426 [08:51<17:41, 11.08it/s]

tensor(1.5304, grad_fn=<NllLossBackward0>)
tensor(1.5313, grad_fn=<NllLossBackward0>)
tensor(1.5233, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5673/17426 [08:51<17:37, 11.11it/s]

tensor(1.5448, grad_fn=<NllLossBackward0>)
tensor(1.5192, grad_fn=<NllLossBackward0>)
tensor(1.5312, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5677/17426 [08:51<17:10, 11.40it/s]

tensor(1.5248, grad_fn=<NllLossBackward0>)
tensor(1.4812, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5679/17426 [08:51<17:11, 11.39it/s]

tensor(1.5569, grad_fn=<NllLossBackward0>)
tensor(1.5705, grad_fn=<NllLossBackward0>)
tensor(1.5700, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5683/17426 [08:52<17:19, 11.30it/s]

tensor(1.5151, grad_fn=<NllLossBackward0>)
tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.5189, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5685/17426 [08:52<17:16, 11.33it/s]

tensor(1.6064, grad_fn=<NllLossBackward0>)
tensor(1.5349, grad_fn=<NllLossBackward0>)
tensor(1.5055, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5689/17426 [08:52<16:51, 11.60it/s]

tensor(1.5679, grad_fn=<NllLossBackward0>)
tensor(1.5197, grad_fn=<NllLossBackward0>)
tensor(1.4927, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5691/17426 [08:52<17:05, 11.45it/s]

tensor(1.5433, grad_fn=<NllLossBackward0>)
tensor(1.5615, grad_fn=<NllLossBackward0>)
tensor(1.4663, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5695/17426 [08:53<17:09, 11.40it/s]

tensor(1.5186, grad_fn=<NllLossBackward0>)
tensor(1.5552, grad_fn=<NllLossBackward0>)
tensor(1.5412, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5697/17426 [08:53<17:20, 11.27it/s]

tensor(1.5445, grad_fn=<NllLossBackward0>)
tensor(1.5156, grad_fn=<NllLossBackward0>)
tensor(1.5637, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5701/17426 [08:53<17:05, 11.43it/s]

tensor(1.4995, grad_fn=<NllLossBackward0>)
tensor(1.5338, grad_fn=<NllLossBackward0>)
tensor(1.5325, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5703/17426 [08:54<17:20, 11.27it/s]

tensor(1.5190, grad_fn=<NllLossBackward0>)
tensor(1.5568, grad_fn=<NllLossBackward0>)
tensor(1.4910, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5707/17426 [08:54<17:29, 11.16it/s]

tensor(1.5462, grad_fn=<NllLossBackward0>)
tensor(1.5838, grad_fn=<NllLossBackward0>)
tensor(1.5897, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5709/17426 [08:54<17:34, 11.11it/s]

tensor(1.5082, grad_fn=<NllLossBackward0>)
tensor(1.5432, grad_fn=<NllLossBackward0>)
tensor(1.5194, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5713/17426 [08:54<17:03, 11.44it/s]

tensor(1.5527, grad_fn=<NllLossBackward0>)
tensor(1.5582, grad_fn=<NllLossBackward0>)
tensor(1.5368, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5715/17426 [08:55<17:18, 11.27it/s]

tensor(1.5454, grad_fn=<NllLossBackward0>)
tensor(1.5350, grad_fn=<NllLossBackward0>)
tensor(1.5684, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5719/17426 [08:55<17:12, 11.34it/s]

tensor(1.5191, grad_fn=<NllLossBackward0>)
tensor(1.5518, grad_fn=<NllLossBackward0>)
tensor(1.5404, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5721/17426 [08:55<17:16, 11.30it/s]

tensor(1.5035, grad_fn=<NllLossBackward0>)
tensor(1.5665, grad_fn=<NllLossBackward0>)
tensor(1.5189, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5725/17426 [08:55<16:57, 11.50it/s]

tensor(1.4960, grad_fn=<NllLossBackward0>)
tensor(1.4836, grad_fn=<NllLossBackward0>)
tensor(1.5487, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5727/17426 [08:56<17:14, 11.30it/s]

tensor(1.5223, grad_fn=<NllLossBackward0>)
tensor(1.5642, grad_fn=<NllLossBackward0>)
tensor(1.5447, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5731/17426 [08:56<17:26, 11.18it/s]

tensor(1.5249, grad_fn=<NllLossBackward0>)
tensor(1.4996, grad_fn=<NllLossBackward0>)
tensor(1.5421, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5733/17426 [08:56<17:27, 11.16it/s]

tensor(1.4925, grad_fn=<NllLossBackward0>)
tensor(1.4729, grad_fn=<NllLossBackward0>)
tensor(1.5108, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5737/17426 [08:57<17:11, 11.33it/s]

tensor(1.5745, grad_fn=<NllLossBackward0>)
tensor(1.4977, grad_fn=<NllLossBackward0>)
tensor(1.5670, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5739/17426 [08:57<17:17, 11.26it/s]

tensor(1.5508, grad_fn=<NllLossBackward0>)
tensor(1.5646, grad_fn=<NllLossBackward0>)
tensor(1.5389, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5743/17426 [08:57<17:14, 11.29it/s]

tensor(1.5868, grad_fn=<NllLossBackward0>)
tensor(1.5294, grad_fn=<NllLossBackward0>)
tensor(1.5534, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5745/17426 [08:57<17:28, 11.14it/s]

tensor(1.5597, grad_fn=<NllLossBackward0>)
tensor(1.4907, grad_fn=<NllLossBackward0>)
tensor(1.5394, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5749/17426 [08:58<16:58, 11.47it/s]

tensor(1.5299, grad_fn=<NllLossBackward0>)
tensor(1.5156, grad_fn=<NllLossBackward0>)
tensor(1.5023, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5751/17426 [08:58<17:02, 11.41it/s]

tensor(1.5316, grad_fn=<NllLossBackward0>)
tensor(1.5331, grad_fn=<NllLossBackward0>)
tensor(1.5198, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5755/17426 [08:58<17:13, 11.29it/s]

tensor(1.5401, grad_fn=<NllLossBackward0>)
tensor(1.5148, grad_fn=<NllLossBackward0>)
tensor(1.5580, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5757/17426 [08:58<17:13, 11.29it/s]

tensor(1.5184, grad_fn=<NllLossBackward0>)
tensor(1.5338, grad_fn=<NllLossBackward0>)
tensor(1.5430, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5759/17426 [08:59<17:32, 11.09it/s]

tensor(1.4873, grad_fn=<NllLossBackward0>)
tensor(1.5144, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5761/17426 [08:59<20:00,  9.71it/s]

tensor(1.5347, grad_fn=<NllLossBackward0>)
tensor(1.4976, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5764/17426 [08:59<21:53,  8.88it/s]

tensor(1.5393, grad_fn=<NllLossBackward0>)
tensor(1.4876, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5766/17426 [08:59<22:46,  8.54it/s]

tensor(1.5560, grad_fn=<NllLossBackward0>)
tensor(1.5472, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5768/17426 [09:00<23:57,  8.11it/s]

tensor(1.5252, grad_fn=<NllLossBackward0>)
tensor(1.5413, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5770/17426 [09:00<24:59,  7.78it/s]

tensor(1.5240, grad_fn=<NllLossBackward0>)
tensor(1.5840, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5772/17426 [09:00<24:16,  8.00it/s]

tensor(1.5118, grad_fn=<NllLossBackward0>)
tensor(1.5393, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5774/17426 [09:00<23:42,  8.19it/s]

tensor(1.5394, grad_fn=<NllLossBackward0>)
tensor(1.5263, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5776/17426 [09:01<22:52,  8.49it/s]

tensor(1.5772, grad_fn=<NllLossBackward0>)
tensor(1.5090, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5778/17426 [09:01<23:05,  8.41it/s]

tensor(1.5382, grad_fn=<NllLossBackward0>)
tensor(1.5057, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5780/17426 [09:01<25:21,  7.65it/s]

tensor(1.5879, grad_fn=<NllLossBackward0>)
tensor(1.5174, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5782/17426 [09:01<26:54,  7.21it/s]

tensor(1.5192, grad_fn=<NllLossBackward0>)
tensor(1.5135, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5784/17426 [09:02<26:16,  7.38it/s]

tensor(1.5293, grad_fn=<NllLossBackward0>)
tensor(1.5608, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5786/17426 [09:02<28:03,  6.91it/s]

tensor(1.5263, grad_fn=<NllLossBackward0>)
tensor(1.5092, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5789/17426 [09:02<22:08,  8.76it/s]

tensor(1.5214, grad_fn=<NllLossBackward0>)
tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.4982, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5792/17426 [09:03<19:11, 10.10it/s]

tensor(1.5352, grad_fn=<NllLossBackward0>)
tensor(1.5096, grad_fn=<NllLossBackward0>)
tensor(1.5680, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5795/17426 [09:03<18:01, 10.75it/s]

tensor(1.4890, grad_fn=<NllLossBackward0>)
tensor(1.4780, grad_fn=<NllLossBackward0>)
tensor(1.5667, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5797/17426 [09:03<17:55, 10.81it/s]

tensor(1.5378, grad_fn=<NllLossBackward0>)
tensor(1.5044, grad_fn=<NllLossBackward0>)
tensor(1.5176, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5801/17426 [09:03<17:23, 11.14it/s]

tensor(1.5409, grad_fn=<NllLossBackward0>)
tensor(1.5252, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5803/17426 [09:04<17:24, 11.13it/s]

tensor(1.5004, grad_fn=<NllLossBackward0>)
tensor(1.5426, grad_fn=<NllLossBackward0>)
tensor(1.5417, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5807/17426 [09:04<17:12, 11.26it/s]

tensor(1.5419, grad_fn=<NllLossBackward0>)
tensor(1.5626, grad_fn=<NllLossBackward0>)
tensor(1.5330, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5809/17426 [09:04<17:14, 11.23it/s]

tensor(1.5202, grad_fn=<NllLossBackward0>)
tensor(1.4900, grad_fn=<NllLossBackward0>)
tensor(1.5202, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5813/17426 [09:05<17:08, 11.30it/s]

tensor(1.4948, grad_fn=<NllLossBackward0>)
tensor(1.5476, grad_fn=<NllLossBackward0>)
tensor(1.5563, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5815/17426 [09:05<17:14, 11.22it/s]

tensor(1.5385, grad_fn=<NllLossBackward0>)
tensor(1.5401, grad_fn=<NllLossBackward0>)
tensor(1.5186, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5819/17426 [09:05<16:45, 11.54it/s]

tensor(1.5446, grad_fn=<NllLossBackward0>)
tensor(1.5368, grad_fn=<NllLossBackward0>)
tensor(1.5541, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5821/17426 [09:05<16:59, 11.38it/s]

tensor(1.5188, grad_fn=<NllLossBackward0>)
tensor(1.5605, grad_fn=<NllLossBackward0>)
tensor(1.5349, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5825/17426 [09:06<17:01, 11.36it/s]

tensor(1.5123, grad_fn=<NllLossBackward0>)
tensor(1.5260, grad_fn=<NllLossBackward0>)
tensor(1.5323, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5827/17426 [09:06<17:02, 11.34it/s]

tensor(1.5151, grad_fn=<NllLossBackward0>)
tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.5251, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5831/17426 [09:06<17:01, 11.35it/s]

tensor(1.5662, grad_fn=<NllLossBackward0>)
tensor(1.5530, grad_fn=<NllLossBackward0>)
tensor(1.5096, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5833/17426 [09:06<17:23, 11.11it/s]

tensor(1.5073, grad_fn=<NllLossBackward0>)
tensor(1.5545, grad_fn=<NllLossBackward0>)
tensor(1.5692, grad_fn=<NllLossBackward0>)


 33%|███▎      | 5837/17426 [09:07<16:52, 11.45it/s]

tensor(1.5315, grad_fn=<NllLossBackward0>)
tensor(1.4920, grad_fn=<NllLossBackward0>)
tensor(1.5169, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5839/17426 [09:07<17:04, 11.31it/s]

tensor(1.5332, grad_fn=<NllLossBackward0>)
tensor(1.4948, grad_fn=<NllLossBackward0>)
tensor(1.5201, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5843/17426 [09:07<16:38, 11.60it/s]

tensor(1.5064, grad_fn=<NllLossBackward0>)
tensor(1.5498, grad_fn=<NllLossBackward0>)
tensor(1.5144, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5845/17426 [09:07<17:19, 11.14it/s]

tensor(1.5199, grad_fn=<NllLossBackward0>)
tensor(1.5486, grad_fn=<NllLossBackward0>)
tensor(1.5589, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5849/17426 [09:08<17:15, 11.18it/s]

tensor(1.5696, grad_fn=<NllLossBackward0>)
tensor(1.5254, grad_fn=<NllLossBackward0>)
tensor(1.5247, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5851/17426 [09:08<17:05, 11.29it/s]

tensor(1.4918, grad_fn=<NllLossBackward0>)
tensor(1.5129, grad_fn=<NllLossBackward0>)
tensor(1.5026, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5855/17426 [09:08<16:44, 11.52it/s]

tensor(1.5271, grad_fn=<NllLossBackward0>)
tensor(1.5060, grad_fn=<NllLossBackward0>)
tensor(1.5649, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5857/17426 [09:08<17:19, 11.13it/s]

tensor(1.5145, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)
tensor(1.5105, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5861/17426 [09:09<16:56, 11.38it/s]

tensor(1.5164, grad_fn=<NllLossBackward0>)
tensor(1.5232, grad_fn=<NllLossBackward0>)
tensor(1.5725, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5863/17426 [09:09<17:06, 11.26it/s]

tensor(1.5121, grad_fn=<NllLossBackward0>)
tensor(1.5240, grad_fn=<NllLossBackward0>)
tensor(1.5256, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5867/17426 [09:09<17:36, 10.94it/s]

tensor(1.5204, grad_fn=<NllLossBackward0>)
tensor(1.5100, grad_fn=<NllLossBackward0>)
tensor(1.5854, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5869/17426 [09:09<17:45, 10.85it/s]

tensor(1.5459, grad_fn=<NllLossBackward0>)
tensor(1.5304, grad_fn=<NllLossBackward0>)
tensor(1.5308, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5873/17426 [09:10<17:20, 11.10it/s]

tensor(1.5313, grad_fn=<NllLossBackward0>)
tensor(1.5607, grad_fn=<NllLossBackward0>)
tensor(1.5079, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5875/17426 [09:10<17:40, 10.90it/s]

tensor(1.5545, grad_fn=<NllLossBackward0>)
tensor(1.5807, grad_fn=<NllLossBackward0>)
tensor(1.5305, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5879/17426 [09:10<16:57, 11.35it/s]

tensor(1.5675, grad_fn=<NllLossBackward0>)
tensor(1.5570, grad_fn=<NllLossBackward0>)
tensor(1.5712, grad_fn=<NllLossBackward0>)


 34%|███▎      | 5881/17426 [09:11<17:31, 10.98it/s]

tensor(1.5291, grad_fn=<NllLossBackward0>)
tensor(1.4992, grad_fn=<NllLossBackward0>)
tensor(1.4997, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5885/17426 [09:11<17:13, 11.16it/s]

tensor(1.5128, grad_fn=<NllLossBackward0>)
tensor(1.5425, grad_fn=<NllLossBackward0>)
tensor(1.5609, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5887/17426 [09:11<17:14, 11.15it/s]

tensor(1.5275, grad_fn=<NllLossBackward0>)
tensor(1.5479, grad_fn=<NllLossBackward0>)
tensor(1.5046, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5891/17426 [09:11<17:12, 11.18it/s]

tensor(1.5504, grad_fn=<NllLossBackward0>)
tensor(1.4560, grad_fn=<NllLossBackward0>)
tensor(1.5386, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5893/17426 [09:12<17:13, 11.16it/s]

tensor(1.5112, grad_fn=<NllLossBackward0>)
tensor(1.4917, grad_fn=<NllLossBackward0>)
tensor(1.5452, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5897/17426 [09:12<16:54, 11.36it/s]

tensor(1.5792, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)
tensor(1.4866, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5899/17426 [09:12<18:10, 10.57it/s]

tensor(1.5022, grad_fn=<NllLossBackward0>)
tensor(1.5170, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5901/17426 [09:12<20:20,  9.44it/s]

tensor(1.4951, grad_fn=<NllLossBackward0>)
tensor(1.4926, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5903/17426 [09:13<22:12,  8.65it/s]

tensor(1.5066, grad_fn=<NllLossBackward0>)
tensor(1.5603, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5905/17426 [09:13<23:22,  8.22it/s]

tensor(1.5040, grad_fn=<NllLossBackward0>)
tensor(1.5008, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5907/17426 [09:13<23:47,  8.07it/s]

tensor(1.5441, grad_fn=<NllLossBackward0>)
tensor(1.5829, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5909/17426 [09:14<23:20,  8.23it/s]

tensor(1.5250, grad_fn=<NllLossBackward0>)
tensor(1.5483, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5911/17426 [09:14<24:48,  7.74it/s]

tensor(1.5324, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5913/17426 [09:14<23:30,  8.16it/s]

tensor(1.5030, grad_fn=<NllLossBackward0>)
tensor(1.4932, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5915/17426 [09:14<23:16,  8.24it/s]

tensor(1.5229, grad_fn=<NllLossBackward0>)
tensor(1.5249, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5917/17426 [09:15<24:13,  7.92it/s]

tensor(1.5290, grad_fn=<NllLossBackward0>)
tensor(1.5781, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5919/17426 [09:15<25:33,  7.50it/s]

tensor(1.5225, grad_fn=<NllLossBackward0>)
tensor(1.5118, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5921/17426 [09:15<24:14,  7.91it/s]

tensor(1.5072, grad_fn=<NllLossBackward0>)
tensor(1.5483, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5923/17426 [09:15<26:47,  7.16it/s]

tensor(1.5011, grad_fn=<NllLossBackward0>)
tensor(1.5137, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5925/17426 [09:16<26:52,  7.13it/s]

tensor(1.4920, grad_fn=<NllLossBackward0>)
tensor(1.4855, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5928/17426 [09:16<21:57,  8.73it/s]

tensor(1.5580, grad_fn=<NllLossBackward0>)
tensor(1.5177, grad_fn=<NllLossBackward0>)
tensor(1.5073, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5931/17426 [09:16<19:12,  9.97it/s]

tensor(1.4783, grad_fn=<NllLossBackward0>)
tensor(1.4762, grad_fn=<NllLossBackward0>)
tensor(1.5119, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5934/17426 [09:16<17:45, 10.78it/s]

tensor(1.5620, grad_fn=<NllLossBackward0>)
tensor(1.5059, grad_fn=<NllLossBackward0>)
tensor(1.5277, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5936/17426 [09:17<17:51, 10.72it/s]

tensor(1.5411, grad_fn=<NllLossBackward0>)
tensor(1.5393, grad_fn=<NllLossBackward0>)
tensor(1.5355, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5940/17426 [09:17<17:19, 11.05it/s]

tensor(1.5080, grad_fn=<NllLossBackward0>)
tensor(1.5293, grad_fn=<NllLossBackward0>)
tensor(1.5330, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5942/17426 [09:17<17:16, 11.08it/s]

tensor(1.5500, grad_fn=<NllLossBackward0>)
tensor(1.5176, grad_fn=<NllLossBackward0>)
tensor(1.5184, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5946/17426 [09:18<16:57, 11.28it/s]

tensor(1.5200, grad_fn=<NllLossBackward0>)
tensor(1.5275, grad_fn=<NllLossBackward0>)
tensor(1.5698, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5948/17426 [09:18<17:04, 11.20it/s]

tensor(1.5521, grad_fn=<NllLossBackward0>)
tensor(1.5721, grad_fn=<NllLossBackward0>)
tensor(1.5373, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5952/17426 [09:18<17:03, 11.21it/s]

tensor(1.5461, grad_fn=<NllLossBackward0>)
tensor(1.5146, grad_fn=<NllLossBackward0>)
tensor(1.5383, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5954/17426 [09:18<17:20, 11.03it/s]

tensor(1.5692, grad_fn=<NllLossBackward0>)
tensor(1.5451, grad_fn=<NllLossBackward0>)
tensor(1.4922, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5958/17426 [09:19<17:00, 11.24it/s]

tensor(1.5250, grad_fn=<NllLossBackward0>)
tensor(1.5091, grad_fn=<NllLossBackward0>)
tensor(1.5519, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5960/17426 [09:19<17:27, 10.94it/s]

tensor(1.5449, grad_fn=<NllLossBackward0>)
tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5659, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5964/17426 [09:19<17:07, 11.15it/s]

tensor(1.5521, grad_fn=<NllLossBackward0>)
tensor(1.4945, grad_fn=<NllLossBackward0>)
tensor(1.5274, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5966/17426 [09:19<17:12, 11.09it/s]

tensor(1.5656, grad_fn=<NllLossBackward0>)
tensor(1.5068, grad_fn=<NllLossBackward0>)
tensor(1.5404, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5970/17426 [09:20<16:57, 11.26it/s]

tensor(1.5122, grad_fn=<NllLossBackward0>)
tensor(1.4757, grad_fn=<NllLossBackward0>)
tensor(1.4972, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5972/17426 [09:20<17:20, 11.01it/s]

tensor(1.5649, grad_fn=<NllLossBackward0>)
tensor(1.4681, grad_fn=<NllLossBackward0>)
tensor(1.5249, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5976/17426 [09:20<17:04, 11.18it/s]

tensor(1.5318, grad_fn=<NllLossBackward0>)
tensor(1.5715, grad_fn=<NllLossBackward0>)
tensor(1.5235, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5978/17426 [09:20<17:10, 11.11it/s]

tensor(1.5311, grad_fn=<NllLossBackward0>)
tensor(1.5413, grad_fn=<NllLossBackward0>)
tensor(1.4777, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5982/17426 [09:21<17:00, 11.21it/s]

tensor(1.5632, grad_fn=<NllLossBackward0>)
tensor(1.5242, grad_fn=<NllLossBackward0>)
tensor(1.5166, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5984/17426 [09:21<17:27, 10.93it/s]

tensor(1.5451, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)
tensor(1.5559, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5988/17426 [09:21<17:02, 11.19it/s]

tensor(1.5536, grad_fn=<NllLossBackward0>)
tensor(1.5717, grad_fn=<NllLossBackward0>)
tensor(1.5382, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5990/17426 [09:22<17:01, 11.20it/s]

tensor(1.5220, grad_fn=<NllLossBackward0>)
tensor(1.5516, grad_fn=<NllLossBackward0>)
tensor(1.5069, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5994/17426 [09:22<16:50, 11.31it/s]

tensor(1.5244, grad_fn=<NllLossBackward0>)
tensor(1.5423, grad_fn=<NllLossBackward0>)
tensor(1.5351, grad_fn=<NllLossBackward0>)


 34%|███▍      | 5996/17426 [09:22<17:26, 10.92it/s]

tensor(1.5534, grad_fn=<NllLossBackward0>)
tensor(1.5388, grad_fn=<NllLossBackward0>)
tensor(1.5310, grad_fn=<NllLossBackward0>)


 34%|███▍      | 6000/17426 [09:22<17:00, 11.20it/s]

tensor(1.5121, grad_fn=<NllLossBackward0>)
tensor(1.5085, grad_fn=<NllLossBackward0>)
tensor(1.5437, grad_fn=<NllLossBackward0>)


 34%|███▍      | 6002/17426 [09:23<17:13, 11.06it/s]

tensor(1.5570, grad_fn=<NllLossBackward0>)
tensor(1.5239, grad_fn=<NllLossBackward0>)
tensor(1.5296, grad_fn=<NllLossBackward0>)


 34%|███▍      | 6006/17426 [09:23<17:02, 11.16it/s]

tensor(1.5614, grad_fn=<NllLossBackward0>)
tensor(1.5324, grad_fn=<NllLossBackward0>)
tensor(1.5132, grad_fn=<NllLossBackward0>)


 34%|███▍      | 6008/17426 [09:23<17:19, 10.98it/s]

tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.5614, grad_fn=<NllLossBackward0>)
tensor(1.5059, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6012/17426 [09:24<17:00, 11.18it/s]

tensor(1.5384, grad_fn=<NllLossBackward0>)
tensor(1.4707, grad_fn=<NllLossBackward0>)
tensor(1.5226, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6014/17426 [09:24<17:15, 11.02it/s]

tensor(1.5177, grad_fn=<NllLossBackward0>)
tensor(1.5532, grad_fn=<NllLossBackward0>)
tensor(1.5313, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6018/17426 [09:24<17:31, 10.85it/s]

tensor(1.5498, grad_fn=<NllLossBackward0>)
tensor(1.5346, grad_fn=<NllLossBackward0>)
tensor(1.5700, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6020/17426 [09:24<17:37, 10.78it/s]

tensor(1.5621, grad_fn=<NllLossBackward0>)
tensor(1.4969, grad_fn=<NllLossBackward0>)
tensor(1.5443, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6024/17426 [09:25<17:06, 11.10it/s]

tensor(1.5735, grad_fn=<NllLossBackward0>)
tensor(1.5224, grad_fn=<NllLossBackward0>)
tensor(1.5532, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6026/17426 [09:25<17:16, 11.00it/s]

tensor(1.5366, grad_fn=<NllLossBackward0>)
tensor(1.5689, grad_fn=<NllLossBackward0>)
tensor(1.5146, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6030/17426 [09:25<17:10, 11.06it/s]

tensor(1.4905, grad_fn=<NllLossBackward0>)
tensor(1.5218, grad_fn=<NllLossBackward0>)
tensor(1.5040, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6032/17426 [09:25<17:19, 10.96it/s]

tensor(1.5672, grad_fn=<NllLossBackward0>)
tensor(1.5193, grad_fn=<NllLossBackward0>)
tensor(1.5375, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6036/17426 [09:26<17:05, 11.11it/s]

tensor(1.5534, grad_fn=<NllLossBackward0>)
tensor(1.5331, grad_fn=<NllLossBackward0>)
tensor(1.4755, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6038/17426 [09:26<19:22,  9.80it/s]

tensor(1.5622, grad_fn=<NllLossBackward0>)
tensor(1.5410, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6040/17426 [09:26<21:19,  8.90it/s]

tensor(1.5162, grad_fn=<NllLossBackward0>)
tensor(1.5520, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6042/17426 [09:27<22:59,  8.25it/s]

tensor(1.5319, grad_fn=<NllLossBackward0>)
tensor(1.4810, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6044/17426 [09:27<23:00,  8.25it/s]

tensor(1.5625, grad_fn=<NllLossBackward0>)
tensor(1.5178, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6046/17426 [09:27<22:28,  8.44it/s]

tensor(1.5305, grad_fn=<NllLossBackward0>)
tensor(1.5303, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6048/17426 [09:27<24:14,  7.82it/s]

tensor(1.5131, grad_fn=<NllLossBackward0>)
tensor(1.5473, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6050/17426 [09:28<24:36,  7.71it/s]

tensor(1.5363, grad_fn=<NllLossBackward0>)
tensor(1.5056, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6052/17426 [09:28<23:30,  8.07it/s]

tensor(1.5047, grad_fn=<NllLossBackward0>)
tensor(1.5601, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6054/17426 [09:28<23:22,  8.11it/s]

tensor(1.5313, grad_fn=<NllLossBackward0>)
tensor(1.5600, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6056/17426 [09:28<25:35,  7.41it/s]

tensor(1.5067, grad_fn=<NllLossBackward0>)
tensor(1.5347, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6058/17426 [09:29<24:49,  7.63it/s]

tensor(1.5018, grad_fn=<NllLossBackward0>)
tensor(1.5270, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6060/17426 [09:29<25:34,  7.41it/s]

tensor(1.5064, grad_fn=<NllLossBackward0>)
tensor(1.5104, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6062/17426 [09:29<25:38,  7.39it/s]

tensor(1.5309, grad_fn=<NllLossBackward0>)
tensor(1.5415, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6064/17426 [09:29<26:46,  7.07it/s]

tensor(1.4803, grad_fn=<NllLossBackward0>)
tensor(1.5265, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6067/17426 [09:30<21:36,  8.76it/s]

tensor(1.5274, grad_fn=<NllLossBackward0>)
tensor(1.5843, grad_fn=<NllLossBackward0>)
tensor(1.5591, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6070/17426 [09:30<19:09,  9.88it/s]

tensor(1.5274, grad_fn=<NllLossBackward0>)
tensor(1.5167, grad_fn=<NllLossBackward0>)
tensor(1.5241, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6073/17426 [09:30<18:14, 10.37it/s]

tensor(1.4799, grad_fn=<NllLossBackward0>)
tensor(1.5348, grad_fn=<NllLossBackward0>)
tensor(1.4907, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6075/17426 [09:31<18:31, 10.21it/s]

tensor(1.5147, grad_fn=<NllLossBackward0>)
tensor(1.4584, grad_fn=<NllLossBackward0>)
tensor(1.5256, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6079/17426 [09:31<17:31, 10.79it/s]

tensor(1.5324, grad_fn=<NllLossBackward0>)
tensor(1.4855, grad_fn=<NllLossBackward0>)
tensor(1.4998, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6081/17426 [09:31<17:34, 10.76it/s]

tensor(1.4950, grad_fn=<NllLossBackward0>)
tensor(1.5108, grad_fn=<NllLossBackward0>)
tensor(1.5618, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6085/17426 [09:31<16:58, 11.13it/s]

tensor(1.4762, grad_fn=<NllLossBackward0>)
tensor(1.5132, grad_fn=<NllLossBackward0>)
tensor(1.4537, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6087/17426 [09:32<17:29, 10.80it/s]

tensor(1.5318, grad_fn=<NllLossBackward0>)
tensor(1.5422, grad_fn=<NllLossBackward0>)
tensor(1.5161, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6091/17426 [09:32<17:04, 11.06it/s]

tensor(1.5409, grad_fn=<NllLossBackward0>)
tensor(1.4931, grad_fn=<NllLossBackward0>)
tensor(1.5474, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6093/17426 [09:32<17:04, 11.07it/s]

tensor(1.5239, grad_fn=<NllLossBackward0>)
tensor(1.5558, grad_fn=<NllLossBackward0>)
tensor(1.5273, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6097/17426 [09:33<17:02, 11.08it/s]

tensor(1.5420, grad_fn=<NllLossBackward0>)
tensor(1.5039, grad_fn=<NllLossBackward0>)
tensor(1.5213, grad_fn=<NllLossBackward0>)


 35%|███▍      | 6099/17426 [09:33<17:13, 10.96it/s]

tensor(1.5245, grad_fn=<NllLossBackward0>)
tensor(1.5071, grad_fn=<NllLossBackward0>)
tensor(1.5128, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6103/17426 [09:33<16:50, 11.20it/s]

tensor(1.5386, grad_fn=<NllLossBackward0>)
tensor(1.5520, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6105/17426 [09:33<16:54, 11.15it/s]

tensor(1.5055, grad_fn=<NllLossBackward0>)
tensor(1.5188, grad_fn=<NllLossBackward0>)
tensor(1.5392, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6109/17426 [09:34<16:57, 11.13it/s]

tensor(1.5262, grad_fn=<NllLossBackward0>)
tensor(1.5423, grad_fn=<NllLossBackward0>)
tensor(1.5106, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6111/17426 [09:34<17:14, 10.94it/s]

tensor(1.5135, grad_fn=<NllLossBackward0>)
tensor(1.5266, grad_fn=<NllLossBackward0>)
tensor(1.5444, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6115/17426 [09:34<17:08, 10.99it/s]

tensor(1.5054, grad_fn=<NllLossBackward0>)
tensor(1.5328, grad_fn=<NllLossBackward0>)
tensor(1.5394, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6117/17426 [09:34<17:06, 11.02it/s]

tensor(1.5603, grad_fn=<NllLossBackward0>)
tensor(1.4894, grad_fn=<NllLossBackward0>)
tensor(1.5836, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6121/17426 [09:35<17:17, 10.90it/s]

tensor(1.5078, grad_fn=<NllLossBackward0>)
tensor(1.5295, grad_fn=<NllLossBackward0>)
tensor(1.5243, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6123/17426 [09:35<17:09, 10.98it/s]

tensor(1.5325, grad_fn=<NllLossBackward0>)
tensor(1.5153, grad_fn=<NllLossBackward0>)
tensor(1.5272, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6127/17426 [09:35<16:52, 11.16it/s]

tensor(1.4982, grad_fn=<NllLossBackward0>)
tensor(1.5425, grad_fn=<NllLossBackward0>)
tensor(1.5371, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6129/17426 [09:35<17:08, 10.98it/s]

tensor(1.5503, grad_fn=<NllLossBackward0>)
tensor(1.5299, grad_fn=<NllLossBackward0>)
tensor(1.5355, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6133/17426 [09:36<17:06, 11.00it/s]

tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.5440, grad_fn=<NllLossBackward0>)
tensor(1.5116, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6135/17426 [09:36<17:00, 11.07it/s]

tensor(1.5336, grad_fn=<NllLossBackward0>)
tensor(1.5331, grad_fn=<NllLossBackward0>)
tensor(1.5074, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6139/17426 [09:36<16:44, 11.24it/s]

tensor(1.5319, grad_fn=<NllLossBackward0>)
tensor(1.5530, grad_fn=<NllLossBackward0>)
tensor(1.5104, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6141/17426 [09:37<16:48, 11.19it/s]

tensor(1.5097, grad_fn=<NllLossBackward0>)
tensor(1.5599, grad_fn=<NllLossBackward0>)
tensor(1.5157, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6145/17426 [09:37<16:44, 11.23it/s]

tensor(1.5241, grad_fn=<NllLossBackward0>)
tensor(1.5106, grad_fn=<NllLossBackward0>)
tensor(1.5433, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6147/17426 [09:37<16:52, 11.14it/s]

tensor(1.5673, grad_fn=<NllLossBackward0>)
tensor(1.5686, grad_fn=<NllLossBackward0>)
tensor(1.5150, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6151/17426 [09:37<16:46, 11.20it/s]

tensor(1.5194, grad_fn=<NllLossBackward0>)
tensor(1.5382, grad_fn=<NllLossBackward0>)
tensor(1.4978, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6153/17426 [09:38<16:55, 11.10it/s]

tensor(1.5408, grad_fn=<NllLossBackward0>)
tensor(1.5349, grad_fn=<NllLossBackward0>)
tensor(1.5456, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6157/17426 [09:38<16:48, 11.18it/s]

tensor(1.5669, grad_fn=<NllLossBackward0>)
tensor(1.5160, grad_fn=<NllLossBackward0>)
tensor(1.5071, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6159/17426 [09:38<16:52, 11.13it/s]

tensor(1.5455, grad_fn=<NllLossBackward0>)
tensor(1.5235, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6163/17426 [09:38<16:37, 11.30it/s]

tensor(1.4879, grad_fn=<NllLossBackward0>)
tensor(1.5281, grad_fn=<NllLossBackward0>)
tensor(1.5304, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6165/17426 [09:39<17:15, 10.87it/s]

tensor(1.5104, grad_fn=<NllLossBackward0>)
tensor(1.5144, grad_fn=<NllLossBackward0>)
tensor(1.5415, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6169/17426 [09:39<16:59, 11.04it/s]

tensor(1.5118, grad_fn=<NllLossBackward0>)
tensor(1.5464, grad_fn=<NllLossBackward0>)
tensor(1.5328, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6171/17426 [09:39<17:02, 11.00it/s]

tensor(1.5685, grad_fn=<NllLossBackward0>)
tensor(1.4958, grad_fn=<NllLossBackward0>)
tensor(1.5200, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6173/17426 [09:39<17:50, 10.52it/s]

tensor(1.5566, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6176/17426 [09:40<20:47,  9.02it/s]

tensor(1.5242, grad_fn=<NllLossBackward0>)
tensor(1.5148, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6178/17426 [09:40<22:12,  8.44it/s]

tensor(1.5112, grad_fn=<NllLossBackward0>)
tensor(1.5256, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6180/17426 [09:40<23:37,  7.94it/s]

tensor(1.5385, grad_fn=<NllLossBackward0>)
tensor(1.4528, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6182/17426 [09:41<22:32,  8.32it/s]

tensor(1.5157, grad_fn=<NllLossBackward0>)
tensor(1.5235, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6184/17426 [09:41<23:44,  7.89it/s]

tensor(1.5298, grad_fn=<NllLossBackward0>)
tensor(1.5425, grad_fn=<NllLossBackward0>)


 35%|███▌      | 6186/17426 [09:41<23:11,  8.08it/s]

tensor(1.5271, grad_fn=<NllLossBackward0>)
tensor(1.5512, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6188/17426 [09:41<22:46,  8.23it/s]

tensor(1.5384, grad_fn=<NllLossBackward0>)
tensor(1.5495, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6190/17426 [09:42<22:04,  8.48it/s]

tensor(1.5293, grad_fn=<NllLossBackward0>)
tensor(1.5115, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6192/17426 [09:42<22:20,  8.38it/s]

tensor(1.5016, grad_fn=<NllLossBackward0>)
tensor(1.5503, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6194/17426 [09:42<23:12,  8.07it/s]

tensor(1.5418, grad_fn=<NllLossBackward0>)
tensor(1.5490, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6196/17426 [09:42<24:24,  7.67it/s]

tensor(1.5044, grad_fn=<NllLossBackward0>)
tensor(1.5682, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6198/17426 [09:43<24:59,  7.49it/s]

tensor(1.5467, grad_fn=<NllLossBackward0>)
tensor(1.5373, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6200/17426 [09:43<26:55,  6.95it/s]

tensor(1.4964, grad_fn=<NllLossBackward0>)
tensor(1.5137, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6202/17426 [09:43<27:47,  6.73it/s]

tensor(1.5267, grad_fn=<NllLossBackward0>)
tensor(1.5271, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6205/17426 [09:44<22:34,  8.28it/s]

tensor(1.5044, grad_fn=<NllLossBackward0>)
tensor(1.4994, grad_fn=<NllLossBackward0>)
tensor(1.5674, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6208/17426 [09:44<19:32,  9.57it/s]

tensor(1.5549, grad_fn=<NllLossBackward0>)
tensor(1.5563, grad_fn=<NllLossBackward0>)
tensor(1.5196, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6210/17426 [09:44<19:55,  9.38it/s]

tensor(1.4706, grad_fn=<NllLossBackward0>)
tensor(1.5285, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6212/17426 [09:44<18:42,  9.99it/s]

tensor(1.5011, grad_fn=<NllLossBackward0>)
tensor(1.5207, grad_fn=<NllLossBackward0>)
tensor(1.5274, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6216/17426 [09:45<17:14, 10.83it/s]

tensor(1.5221, grad_fn=<NllLossBackward0>)
tensor(1.5185, grad_fn=<NllLossBackward0>)
tensor(1.5290, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6218/17426 [09:45<17:19, 10.79it/s]

tensor(1.5520, grad_fn=<NllLossBackward0>)
tensor(1.5153, grad_fn=<NllLossBackward0>)
tensor(1.5349, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6222/17426 [09:45<17:07, 10.91it/s]

tensor(1.5608, grad_fn=<NllLossBackward0>)
tensor(1.4748, grad_fn=<NllLossBackward0>)
tensor(1.5087, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6224/17426 [09:45<16:56, 11.02it/s]

tensor(1.5274, grad_fn=<NllLossBackward0>)
tensor(1.5164, grad_fn=<NllLossBackward0>)
tensor(1.5660, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6228/17426 [09:46<16:43, 11.16it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5875, grad_fn=<NllLossBackward0>)
tensor(1.5616, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6230/17426 [09:46<16:53, 11.05it/s]

tensor(1.4983, grad_fn=<NllLossBackward0>)
tensor(1.5125, grad_fn=<NllLossBackward0>)
tensor(1.5237, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6234/17426 [09:46<16:42, 11.17it/s]

tensor(1.5736, grad_fn=<NllLossBackward0>)
tensor(1.5138, grad_fn=<NllLossBackward0>)
tensor(1.5048, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6236/17426 [09:46<16:53, 11.04it/s]

tensor(1.5151, grad_fn=<NllLossBackward0>)
tensor(1.4712, grad_fn=<NllLossBackward0>)
tensor(1.5551, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6240/17426 [09:47<16:43, 11.15it/s]

tensor(1.4946, grad_fn=<NllLossBackward0>)
tensor(1.5999, grad_fn=<NllLossBackward0>)
tensor(1.5459, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6242/17426 [09:47<16:41, 11.17it/s]

tensor(1.5530, grad_fn=<NllLossBackward0>)
tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.5479, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6246/17426 [09:47<16:37, 11.21it/s]

tensor(1.5249, grad_fn=<NllLossBackward0>)
tensor(1.4955, grad_fn=<NllLossBackward0>)
tensor(1.5445, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6248/17426 [09:48<16:55, 11.01it/s]

tensor(1.5413, grad_fn=<NllLossBackward0>)
tensor(1.5262, grad_fn=<NllLossBackward0>)
tensor(1.4821, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6252/17426 [09:48<16:32, 11.26it/s]

tensor(1.5529, grad_fn=<NllLossBackward0>)
tensor(1.4781, grad_fn=<NllLossBackward0>)
tensor(1.5355, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6254/17426 [09:48<16:42, 11.14it/s]

tensor(1.5308, grad_fn=<NllLossBackward0>)
tensor(1.4934, grad_fn=<NllLossBackward0>)
tensor(1.5381, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6258/17426 [09:48<16:41, 11.15it/s]

tensor(1.4764, grad_fn=<NllLossBackward0>)
tensor(1.5563, grad_fn=<NllLossBackward0>)
tensor(1.5159, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6260/17426 [09:49<16:56, 10.99it/s]

tensor(1.5177, grad_fn=<NllLossBackward0>)
tensor(1.5520, grad_fn=<NllLossBackward0>)
tensor(1.5181, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6264/17426 [09:49<16:29, 11.28it/s]

tensor(1.5239, grad_fn=<NllLossBackward0>)
tensor(1.5661, grad_fn=<NllLossBackward0>)
tensor(1.5349, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6266/17426 [09:49<16:33, 11.23it/s]

tensor(1.4929, grad_fn=<NllLossBackward0>)
tensor(1.4848, grad_fn=<NllLossBackward0>)
tensor(1.5550, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6270/17426 [09:49<16:32, 11.24it/s]

tensor(1.5645, grad_fn=<NllLossBackward0>)
tensor(1.5240, grad_fn=<NllLossBackward0>)
tensor(1.5566, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6272/17426 [09:50<16:40, 11.14it/s]

tensor(1.5162, grad_fn=<NllLossBackward0>)
tensor(1.5045, grad_fn=<NllLossBackward0>)
tensor(1.5332, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6276/17426 [09:50<16:15, 11.43it/s]

tensor(1.5425, grad_fn=<NllLossBackward0>)
tensor(1.5312, grad_fn=<NllLossBackward0>)
tensor(1.5701, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6278/17426 [09:50<16:36, 11.18it/s]

tensor(1.4875, grad_fn=<NllLossBackward0>)
tensor(1.5100, grad_fn=<NllLossBackward0>)
tensor(1.5577, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6282/17426 [09:51<16:30, 11.25it/s]

tensor(1.5747, grad_fn=<NllLossBackward0>)
tensor(1.4795, grad_fn=<NllLossBackward0>)
tensor(1.4779, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6284/17426 [09:51<16:47, 11.06it/s]

tensor(1.5263, grad_fn=<NllLossBackward0>)
tensor(1.5387, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6288/17426 [09:51<16:25, 11.30it/s]

tensor(1.5439, grad_fn=<NllLossBackward0>)
tensor(1.5342, grad_fn=<NllLossBackward0>)
tensor(1.5571, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6290/17426 [09:51<16:31, 11.23it/s]

tensor(1.5265, grad_fn=<NllLossBackward0>)
tensor(1.4996, grad_fn=<NllLossBackward0>)
tensor(1.5243, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6294/17426 [09:52<16:35, 11.18it/s]

tensor(1.5644, grad_fn=<NllLossBackward0>)
tensor(1.5074, grad_fn=<NllLossBackward0>)
tensor(1.5421, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6296/17426 [09:52<16:40, 11.12it/s]

tensor(1.5021, grad_fn=<NllLossBackward0>)
tensor(1.5079, grad_fn=<NllLossBackward0>)
tensor(1.4751, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6300/17426 [09:52<16:26, 11.28it/s]

tensor(1.5361, grad_fn=<NllLossBackward0>)
tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.5276, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6302/17426 [09:52<17:02, 10.87it/s]

tensor(1.5237, grad_fn=<NllLossBackward0>)
tensor(1.5286, grad_fn=<NllLossBackward0>)
tensor(1.5108, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6306/17426 [09:53<16:50, 11.00it/s]

tensor(1.5459, grad_fn=<NllLossBackward0>)
tensor(1.5734, grad_fn=<NllLossBackward0>)
tensor(1.5029, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6308/17426 [09:53<16:45, 11.06it/s]

tensor(1.5335, grad_fn=<NllLossBackward0>)
tensor(1.4985, grad_fn=<NllLossBackward0>)
tensor(1.4881, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6312/17426 [09:53<16:17, 11.37it/s]

tensor(1.5622, grad_fn=<NllLossBackward0>)
tensor(1.4908, grad_fn=<NllLossBackward0>)
tensor(1.4634, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6314/17426 [09:53<17:10, 10.79it/s]

tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.5338, grad_fn=<NllLossBackward0>)


 36%|███▌      | 6316/17426 [09:54<19:07,  9.68it/s]

tensor(1.5599, grad_fn=<NllLossBackward0>)
tensor(1.5084, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6318/17426 [09:54<20:50,  8.89it/s]

tensor(1.5122, grad_fn=<NllLossBackward0>)
tensor(1.5245, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6320/17426 [09:54<23:17,  7.95it/s]

tensor(1.4811, grad_fn=<NllLossBackward0>)
tensor(1.5484, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6322/17426 [09:55<24:08,  7.67it/s]

tensor(1.5286, grad_fn=<NllLossBackward0>)
tensor(1.5150, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6324/17426 [09:55<23:29,  7.88it/s]

tensor(1.5365, grad_fn=<NllLossBackward0>)
tensor(1.5203, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6326/17426 [09:55<23:27,  7.89it/s]

tensor(1.5545, grad_fn=<NllLossBackward0>)
tensor(1.4998, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6328/17426 [09:55<23:56,  7.73it/s]

tensor(1.5594, grad_fn=<NllLossBackward0>)
tensor(1.5283, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6330/17426 [09:56<24:24,  7.58it/s]

tensor(1.5415, grad_fn=<NllLossBackward0>)
tensor(1.5140, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6332/17426 [09:56<24:16,  7.62it/s]

tensor(1.5160, grad_fn=<NllLossBackward0>)
tensor(1.5484, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6334/17426 [09:56<25:12,  7.33it/s]

tensor(1.5284, grad_fn=<NllLossBackward0>)
tensor(1.5110, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6336/17426 [09:56<24:50,  7.44it/s]

tensor(1.5236, grad_fn=<NllLossBackward0>)
tensor(1.5610, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6338/17426 [09:57<26:23,  7.00it/s]

tensor(1.5572, grad_fn=<NllLossBackward0>)
tensor(1.5053, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6340/17426 [09:57<25:14,  7.32it/s]

tensor(1.5479, grad_fn=<NllLossBackward0>)
tensor(1.5043, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6342/17426 [09:57<26:03,  7.09it/s]

tensor(1.5585, grad_fn=<NllLossBackward0>)
tensor(1.5233, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6345/17426 [09:58<20:21,  9.07it/s]

tensor(1.5215, grad_fn=<NllLossBackward0>)
tensor(1.5267, grad_fn=<NllLossBackward0>)
tensor(1.5257, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6347/17426 [09:58<20:07,  9.18it/s]

tensor(1.5570, grad_fn=<NllLossBackward0>)
tensor(1.4892, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6349/17426 [09:58<19:05,  9.67it/s]

tensor(1.5503, grad_fn=<NllLossBackward0>)
tensor(1.5848, grad_fn=<NllLossBackward0>)
tensor(1.5580, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6353/17426 [09:58<17:35, 10.49it/s]

tensor(1.5382, grad_fn=<NllLossBackward0>)
tensor(1.5035, grad_fn=<NllLossBackward0>)
tensor(1.5235, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6355/17426 [09:58<17:05, 10.80it/s]

tensor(1.5273, grad_fn=<NllLossBackward0>)
tensor(1.5231, grad_fn=<NllLossBackward0>)
tensor(1.4772, grad_fn=<NllLossBackward0>)


 36%|███▋      | 6359/17426 [09:59<16:54, 10.91it/s]

tensor(1.4929, grad_fn=<NllLossBackward0>)
tensor(1.5220, grad_fn=<NllLossBackward0>)
tensor(1.5150, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6361/17426 [09:59<17:01, 10.83it/s]

tensor(1.5033, grad_fn=<NllLossBackward0>)
tensor(1.5381, grad_fn=<NllLossBackward0>)
tensor(1.5575, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6365/17426 [09:59<16:32, 11.15it/s]

tensor(1.5009, grad_fn=<NllLossBackward0>)
tensor(1.5414, grad_fn=<NllLossBackward0>)
tensor(1.5463, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6367/17426 [10:00<16:36, 11.10it/s]

tensor(1.4841, grad_fn=<NllLossBackward0>)
tensor(1.5266, grad_fn=<NllLossBackward0>)
tensor(1.5498, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6371/17426 [10:00<16:41, 11.04it/s]

tensor(1.5131, grad_fn=<NllLossBackward0>)
tensor(1.5071, grad_fn=<NllLossBackward0>)
tensor(1.5152, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6373/17426 [10:00<16:48, 10.96it/s]

tensor(1.5049, grad_fn=<NllLossBackward0>)
tensor(1.5128, grad_fn=<NllLossBackward0>)
tensor(1.5191, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6377/17426 [10:00<16:27, 11.19it/s]

tensor(1.5913, grad_fn=<NllLossBackward0>)
tensor(1.5425, grad_fn=<NllLossBackward0>)
tensor(1.5062, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6379/17426 [10:01<16:37, 11.08it/s]

tensor(1.5089, grad_fn=<NllLossBackward0>)
tensor(1.5253, grad_fn=<NllLossBackward0>)
tensor(1.5768, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6383/17426 [10:01<16:28, 11.17it/s]

tensor(1.5372, grad_fn=<NllLossBackward0>)
tensor(1.5453, grad_fn=<NllLossBackward0>)
tensor(1.5227, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6385/17426 [10:01<16:42, 11.01it/s]

tensor(1.4796, grad_fn=<NllLossBackward0>)
tensor(1.5348, grad_fn=<NllLossBackward0>)
tensor(1.5421, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6389/17426 [10:02<16:24, 11.21it/s]

tensor(1.5347, grad_fn=<NllLossBackward0>)
tensor(1.5254, grad_fn=<NllLossBackward0>)
tensor(1.5450, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6391/17426 [10:02<16:50, 10.92it/s]

tensor(1.5365, grad_fn=<NllLossBackward0>)
tensor(1.5453, grad_fn=<NllLossBackward0>)
tensor(1.5153, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6395/17426 [10:02<16:48, 10.94it/s]

tensor(1.5014, grad_fn=<NllLossBackward0>)
tensor(1.5391, grad_fn=<NllLossBackward0>)
tensor(1.5318, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6397/17426 [10:02<16:51, 10.91it/s]

tensor(1.5858, grad_fn=<NllLossBackward0>)
tensor(1.5314, grad_fn=<NllLossBackward0>)
tensor(1.5762, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6401/17426 [10:03<16:40, 11.02it/s]

tensor(1.5294, grad_fn=<NllLossBackward0>)
tensor(1.5417, grad_fn=<NllLossBackward0>)
tensor(1.4998, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6403/17426 [10:03<17:12, 10.68it/s]

tensor(1.5370, grad_fn=<NllLossBackward0>)
tensor(1.5282, grad_fn=<NllLossBackward0>)
tensor(1.5488, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6407/17426 [10:03<16:42, 10.99it/s]

tensor(1.5215, grad_fn=<NllLossBackward0>)
tensor(1.5604, grad_fn=<NllLossBackward0>)
tensor(1.4995, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6409/17426 [10:03<16:47, 10.94it/s]

tensor(1.5139, grad_fn=<NllLossBackward0>)
tensor(1.4830, grad_fn=<NllLossBackward0>)
tensor(1.5271, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6413/17426 [10:04<16:33, 11.09it/s]

tensor(1.5236, grad_fn=<NllLossBackward0>)
tensor(1.5101, grad_fn=<NllLossBackward0>)
tensor(1.5891, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6415/17426 [10:04<17:03, 10.75it/s]

tensor(1.5422, grad_fn=<NllLossBackward0>)
tensor(1.5229, grad_fn=<NllLossBackward0>)
tensor(1.5255, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6419/17426 [10:04<16:34, 11.07it/s]

tensor(1.5136, grad_fn=<NllLossBackward0>)
tensor(1.5488, grad_fn=<NllLossBackward0>)
tensor(1.5456, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6421/17426 [10:05<16:34, 11.06it/s]

tensor(1.5371, grad_fn=<NllLossBackward0>)
tensor(1.5103, grad_fn=<NllLossBackward0>)
tensor(1.5421, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6425/17426 [10:05<16:29, 11.12it/s]

tensor(1.5341, grad_fn=<NllLossBackward0>)
tensor(1.4846, grad_fn=<NllLossBackward0>)
tensor(1.5105, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6427/17426 [10:05<16:50, 10.89it/s]

tensor(1.4686, grad_fn=<NllLossBackward0>)
tensor(1.5028, grad_fn=<NllLossBackward0>)
tensor(1.5378, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6431/17426 [10:05<16:10, 11.33it/s]

tensor(1.5163, grad_fn=<NllLossBackward0>)
tensor(1.4575, grad_fn=<NllLossBackward0>)
tensor(1.5611, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6433/17426 [10:06<16:24, 11.17it/s]

tensor(1.5506, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)
tensor(1.5430, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6437/17426 [10:06<16:12, 11.30it/s]

tensor(1.5725, grad_fn=<NllLossBackward0>)
tensor(1.5511, grad_fn=<NllLossBackward0>)
tensor(1.5311, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6439/17426 [10:06<16:39, 10.99it/s]

tensor(1.5154, grad_fn=<NllLossBackward0>)
tensor(1.5575, grad_fn=<NllLossBackward0>)
tensor(1.4983, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6443/17426 [10:06<16:14, 11.27it/s]

tensor(1.5258, grad_fn=<NllLossBackward0>)
tensor(1.4965, grad_fn=<NllLossBackward0>)
tensor(1.5022, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6445/17426 [10:07<16:18, 11.22it/s]

tensor(1.5634, grad_fn=<NllLossBackward0>)
tensor(1.5289, grad_fn=<NllLossBackward0>)
tensor(1.4872, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6449/17426 [10:07<16:35, 11.03it/s]

tensor(1.5184, grad_fn=<NllLossBackward0>)
tensor(1.4986, grad_fn=<NllLossBackward0>)
tensor(1.4840, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6451/17426 [10:07<16:46, 10.90it/s]

tensor(1.5578, grad_fn=<NllLossBackward0>)
tensor(1.5217, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6453/17426 [10:08<19:14,  9.50it/s]

tensor(1.5536, grad_fn=<NllLossBackward0>)
tensor(1.4933, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6455/17426 [10:08<21:15,  8.60it/s]

tensor(1.5154, grad_fn=<NllLossBackward0>)
tensor(1.4973, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6457/17426 [10:08<21:18,  8.58it/s]

tensor(1.5319, grad_fn=<NllLossBackward0>)
tensor(1.5154, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6459/17426 [10:08<23:22,  7.82it/s]

tensor(1.5175, grad_fn=<NllLossBackward0>)
tensor(1.5317, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6461/17426 [10:09<23:01,  7.94it/s]

tensor(1.4830, grad_fn=<NllLossBackward0>)
tensor(1.5089, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6463/17426 [10:09<22:30,  8.12it/s]

tensor(1.5018, grad_fn=<NllLossBackward0>)
tensor(1.4556, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6465/17426 [10:09<22:17,  8.19it/s]

tensor(1.5529, grad_fn=<NllLossBackward0>)
tensor(1.4966, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6467/17426 [10:09<22:56,  7.96it/s]

tensor(1.5066, grad_fn=<NllLossBackward0>)
tensor(1.5514, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6469/17426 [10:10<22:23,  8.16it/s]

tensor(1.5487, grad_fn=<NllLossBackward0>)
tensor(1.5346, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6471/17426 [10:10<22:33,  8.09it/s]

tensor(1.4972, grad_fn=<NllLossBackward0>)
tensor(1.5229, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6473/17426 [10:10<21:54,  8.33it/s]

tensor(1.5135, grad_fn=<NllLossBackward0>)
tensor(1.4844, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6475/17426 [10:10<24:35,  7.42it/s]

tensor(1.5084, grad_fn=<NllLossBackward0>)
tensor(1.4865, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6477/17426 [10:11<25:22,  7.19it/s]

tensor(1.5442, grad_fn=<NllLossBackward0>)
tensor(1.5443, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6479/17426 [10:11<26:02,  7.01it/s]

tensor(1.5124, grad_fn=<NllLossBackward0>)
tensor(1.5302, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6481/17426 [10:11<25:52,  7.05it/s]

tensor(1.5195, grad_fn=<NllLossBackward0>)
tensor(1.5424, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6483/17426 [10:11<25:51,  7.05it/s]

tensor(1.5157, grad_fn=<NllLossBackward0>)
tensor(1.4739, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6486/17426 [10:12<20:15,  9.00it/s]

tensor(1.5329, grad_fn=<NllLossBackward0>)
tensor(1.4861, grad_fn=<NllLossBackward0>)
tensor(1.5014, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6489/17426 [10:12<18:22,  9.92it/s]

tensor(1.5174, grad_fn=<NllLossBackward0>)
tensor(1.5164, grad_fn=<NllLossBackward0>)
tensor(1.5041, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6492/17426 [10:12<17:20, 10.51it/s]

tensor(1.5470, grad_fn=<NllLossBackward0>)
tensor(1.5326, grad_fn=<NllLossBackward0>)
tensor(1.5687, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6494/17426 [10:13<17:40, 10.31it/s]

tensor(1.5449, grad_fn=<NllLossBackward0>)
tensor(1.5366, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6498/17426 [10:13<16:47, 10.84it/s]

tensor(1.5397, grad_fn=<NllLossBackward0>)
tensor(1.5150, grad_fn=<NllLossBackward0>)
tensor(1.5278, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6500/17426 [10:13<16:51, 10.80it/s]

tensor(1.5191, grad_fn=<NllLossBackward0>)
tensor(1.4809, grad_fn=<NllLossBackward0>)
tensor(1.5295, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6504/17426 [10:13<16:32, 11.00it/s]

tensor(1.5556, grad_fn=<NllLossBackward0>)
tensor(1.5032, grad_fn=<NllLossBackward0>)
tensor(1.5510, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6506/17426 [10:14<16:53, 10.77it/s]

tensor(1.4952, grad_fn=<NllLossBackward0>)
tensor(1.5383, grad_fn=<NllLossBackward0>)
tensor(1.5359, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6510/17426 [10:14<16:34, 10.98it/s]

tensor(1.5246, grad_fn=<NllLossBackward0>)
tensor(1.5451, grad_fn=<NllLossBackward0>)
tensor(1.5364, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6512/17426 [10:14<16:31, 11.00it/s]

tensor(1.5280, grad_fn=<NllLossBackward0>)
tensor(1.4750, grad_fn=<NllLossBackward0>)
tensor(1.4879, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6516/17426 [10:14<16:04, 11.31it/s]

tensor(1.5218, grad_fn=<NllLossBackward0>)
tensor(1.5524, grad_fn=<NllLossBackward0>)
tensor(1.5184, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6518/17426 [10:15<16:46, 10.83it/s]

tensor(1.4878, grad_fn=<NllLossBackward0>)
tensor(1.5369, grad_fn=<NllLossBackward0>)
tensor(1.5546, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6522/17426 [10:15<16:29, 11.02it/s]

tensor(1.5120, grad_fn=<NllLossBackward0>)
tensor(1.5100, grad_fn=<NllLossBackward0>)
tensor(1.5233, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6524/17426 [10:15<16:42, 10.87it/s]

tensor(1.5861, grad_fn=<NllLossBackward0>)
tensor(1.5026, grad_fn=<NllLossBackward0>)
tensor(1.4762, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6528/17426 [10:16<16:35, 10.95it/s]

tensor(1.4913, grad_fn=<NllLossBackward0>)
tensor(1.4912, grad_fn=<NllLossBackward0>)
tensor(1.5419, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6530/17426 [10:16<16:34, 10.96it/s]

tensor(1.4832, grad_fn=<NllLossBackward0>)
tensor(1.5502, grad_fn=<NllLossBackward0>)
tensor(1.5006, grad_fn=<NllLossBackward0>)


 37%|███▋      | 6534/17426 [10:16<16:18, 11.13it/s]

tensor(1.5451, grad_fn=<NllLossBackward0>)
tensor(1.5150, grad_fn=<NllLossBackward0>)
tensor(1.5355, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6536/17426 [10:16<16:34, 10.95it/s]

tensor(1.5722, grad_fn=<NllLossBackward0>)
tensor(1.5409, grad_fn=<NllLossBackward0>)
tensor(1.5439, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6540/17426 [10:17<16:38, 10.90it/s]

tensor(1.4857, grad_fn=<NllLossBackward0>)
tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.4925, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6542/17426 [10:17<16:46, 10.81it/s]

tensor(1.5529, grad_fn=<NllLossBackward0>)
tensor(1.5106, grad_fn=<NllLossBackward0>)
tensor(1.5650, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6546/17426 [10:17<16:24, 11.05it/s]

tensor(1.5511, grad_fn=<NllLossBackward0>)
tensor(1.5482, grad_fn=<NllLossBackward0>)
tensor(1.5403, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6548/17426 [10:17<16:34, 10.94it/s]

tensor(1.5338, grad_fn=<NllLossBackward0>)
tensor(1.5303, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6552/17426 [10:18<16:21, 11.08it/s]

tensor(1.4997, grad_fn=<NllLossBackward0>)
tensor(1.5085, grad_fn=<NllLossBackward0>)
tensor(1.5247, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6554/17426 [10:18<16:27, 11.00it/s]

tensor(1.4691, grad_fn=<NllLossBackward0>)
tensor(1.5067, grad_fn=<NllLossBackward0>)
tensor(1.5488, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6558/17426 [10:18<16:05, 11.26it/s]

tensor(1.5324, grad_fn=<NllLossBackward0>)
tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.5500, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6560/17426 [10:19<16:09, 11.20it/s]

tensor(1.5579, grad_fn=<NllLossBackward0>)
tensor(1.5043, grad_fn=<NllLossBackward0>)
tensor(1.4820, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6564/17426 [10:19<16:28, 10.99it/s]

tensor(1.5470, grad_fn=<NllLossBackward0>)
tensor(1.5546, grad_fn=<NllLossBackward0>)
tensor(1.5516, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6566/17426 [10:19<16:40, 10.85it/s]

tensor(1.5244, grad_fn=<NllLossBackward0>)
tensor(1.5296, grad_fn=<NllLossBackward0>)
tensor(1.5859, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6570/17426 [10:19<16:20, 11.07it/s]

tensor(1.4992, grad_fn=<NllLossBackward0>)
tensor(1.5588, grad_fn=<NllLossBackward0>)
tensor(1.5171, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6572/17426 [10:20<16:29, 10.97it/s]

tensor(1.5895, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)
tensor(1.5186, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6576/17426 [10:20<16:29, 10.96it/s]

tensor(1.5403, grad_fn=<NllLossBackward0>)
tensor(1.4977, grad_fn=<NllLossBackward0>)
tensor(1.4887, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6578/17426 [10:20<16:29, 10.97it/s]

tensor(1.5568, grad_fn=<NllLossBackward0>)
tensor(1.5472, grad_fn=<NllLossBackward0>)
tensor(1.5498, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6582/17426 [10:21<16:15, 11.12it/s]

tensor(1.5421, grad_fn=<NllLossBackward0>)
tensor(1.5144, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6584/17426 [10:21<16:33, 10.91it/s]

tensor(1.5473, grad_fn=<NllLossBackward0>)
tensor(1.5087, grad_fn=<NllLossBackward0>)
tensor(1.4929, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6588/17426 [10:21<16:29, 10.95it/s]

tensor(1.5368, grad_fn=<NllLossBackward0>)
tensor(1.5398, grad_fn=<NllLossBackward0>)
tensor(1.5281, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6590/17426 [10:21<16:40, 10.83it/s]

tensor(1.5581, grad_fn=<NllLossBackward0>)
tensor(1.5256, grad_fn=<NllLossBackward0>)
tensor(1.5201, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6592/17426 [10:21<16:58, 10.64it/s]

tensor(1.5276, grad_fn=<NllLossBackward0>)
tensor(1.5333, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6595/17426 [10:22<20:28,  8.81it/s]

tensor(1.5121, grad_fn=<NllLossBackward0>)
tensor(1.5281, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6597/17426 [10:22<22:02,  8.19it/s]

tensor(1.4991, grad_fn=<NllLossBackward0>)
tensor(1.5623, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6599/17426 [10:22<22:48,  7.91it/s]

tensor(1.4990, grad_fn=<NllLossBackward0>)
tensor(1.5104, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6601/17426 [10:23<22:47,  7.92it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.5263, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6603/17426 [10:23<24:24,  7.39it/s]

tensor(1.5138, grad_fn=<NllLossBackward0>)
tensor(1.5187, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6605/17426 [10:23<23:39,  7.63it/s]

tensor(1.5000, grad_fn=<NllLossBackward0>)
tensor(1.5515, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6607/17426 [10:24<23:32,  7.66it/s]

tensor(1.5213, grad_fn=<NllLossBackward0>)
tensor(1.4943, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6609/17426 [10:24<22:32,  8.00it/s]

tensor(1.5375, grad_fn=<NllLossBackward0>)
tensor(1.5052, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6611/17426 [10:24<23:59,  7.51it/s]

tensor(1.4891, grad_fn=<NllLossBackward0>)
tensor(1.5728, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6613/17426 [10:24<24:31,  7.35it/s]

tensor(1.5130, grad_fn=<NllLossBackward0>)
tensor(1.5325, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6615/17426 [10:25<24:35,  7.32it/s]

tensor(1.5264, grad_fn=<NllLossBackward0>)
tensor(1.5081, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6617/17426 [10:25<24:49,  7.26it/s]

tensor(1.5753, grad_fn=<NllLossBackward0>)
tensor(1.5090, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6619/17426 [10:25<26:15,  6.86it/s]

tensor(1.5356, grad_fn=<NllLossBackward0>)
tensor(1.5159, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6622/17426 [10:25<20:26,  8.81it/s]

tensor(1.5215, grad_fn=<NllLossBackward0>)
tensor(1.5404, grad_fn=<NllLossBackward0>)
tensor(1.5294, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6625/17426 [10:26<18:17,  9.84it/s]

tensor(1.5248, grad_fn=<NllLossBackward0>)
tensor(1.5027, grad_fn=<NllLossBackward0>)
tensor(1.5274, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6628/17426 [10:26<17:32, 10.26it/s]

tensor(1.4661, grad_fn=<NllLossBackward0>)
tensor(1.5014, grad_fn=<NllLossBackward0>)
tensor(1.5375, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6630/17426 [10:26<17:45, 10.13it/s]

tensor(1.5029, grad_fn=<NllLossBackward0>)
tensor(1.5296, grad_fn=<NllLossBackward0>)
tensor(1.5762, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6634/17426 [10:27<16:36, 10.83it/s]

tensor(1.5236, grad_fn=<NllLossBackward0>)
tensor(1.5114, grad_fn=<NllLossBackward0>)
tensor(1.5764, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6636/17426 [10:27<16:46, 10.72it/s]

tensor(1.5255, grad_fn=<NllLossBackward0>)
tensor(1.5178, grad_fn=<NllLossBackward0>)
tensor(1.5066, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6640/17426 [10:27<16:28, 10.91it/s]

tensor(1.4882, grad_fn=<NllLossBackward0>)
tensor(1.5343, grad_fn=<NllLossBackward0>)
tensor(1.5322, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6642/17426 [10:27<16:50, 10.67it/s]

tensor(1.5227, grad_fn=<NllLossBackward0>)
tensor(1.4990, grad_fn=<NllLossBackward0>)
tensor(1.4978, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6646/17426 [10:28<16:19, 11.00it/s]

tensor(1.5617, grad_fn=<NllLossBackward0>)
tensor(1.5361, grad_fn=<NllLossBackward0>)
tensor(1.4775, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6648/17426 [10:28<16:24, 10.94it/s]

tensor(1.5307, grad_fn=<NllLossBackward0>)
tensor(1.5161, grad_fn=<NllLossBackward0>)
tensor(1.4878, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6652/17426 [10:28<16:34, 10.84it/s]

tensor(1.4987, grad_fn=<NllLossBackward0>)
tensor(1.5378, grad_fn=<NllLossBackward0>)
tensor(1.4933, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6654/17426 [10:28<16:45, 10.71it/s]

tensor(1.5511, grad_fn=<NllLossBackward0>)
tensor(1.5096, grad_fn=<NllLossBackward0>)
tensor(1.5218, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6658/17426 [10:29<16:19, 10.99it/s]

tensor(1.5572, grad_fn=<NllLossBackward0>)
tensor(1.5269, grad_fn=<NllLossBackward0>)
tensor(1.5136, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6660/17426 [10:29<16:26, 10.92it/s]

tensor(1.5668, grad_fn=<NllLossBackward0>)
tensor(1.4989, grad_fn=<NllLossBackward0>)
tensor(1.5366, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6662/17426 [10:29<16:42, 10.74it/s]

tensor(1.5536, grad_fn=<NllLossBackward0>)
tensor(1.4921, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6666/17426 [10:30<17:01, 10.53it/s]

tensor(1.5090, grad_fn=<NllLossBackward0>)
tensor(1.5075, grad_fn=<NllLossBackward0>)
tensor(1.4891, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6668/17426 [10:30<17:12, 10.42it/s]

tensor(1.5312, grad_fn=<NllLossBackward0>)
tensor(1.5698, grad_fn=<NllLossBackward0>)
tensor(1.5244, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6672/17426 [10:30<16:40, 10.75it/s]

tensor(1.5642, grad_fn=<NllLossBackward0>)
tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.5247, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6674/17426 [10:30<17:04, 10.49it/s]

tensor(1.5261, grad_fn=<NllLossBackward0>)
tensor(1.4813, grad_fn=<NllLossBackward0>)
tensor(1.5421, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6678/17426 [10:31<16:39, 10.75it/s]

tensor(1.5221, grad_fn=<NllLossBackward0>)
tensor(1.5291, grad_fn=<NllLossBackward0>)
tensor(1.5027, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6680/17426 [10:31<16:42, 10.72it/s]

tensor(1.5935, grad_fn=<NllLossBackward0>)
tensor(1.5404, grad_fn=<NllLossBackward0>)
tensor(1.5169, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6684/17426 [10:31<16:09, 11.08it/s]

tensor(1.4774, grad_fn=<NllLossBackward0>)
tensor(1.5713, grad_fn=<NllLossBackward0>)
tensor(1.5558, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6686/17426 [10:31<16:45, 10.69it/s]

tensor(1.5107, grad_fn=<NllLossBackward0>)
tensor(1.5282, grad_fn=<NllLossBackward0>)
tensor(1.5321, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6690/17426 [10:32<16:19, 10.96it/s]

tensor(1.5232, grad_fn=<NllLossBackward0>)
tensor(1.5435, grad_fn=<NllLossBackward0>)
tensor(1.4811, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6692/17426 [10:32<16:34, 10.79it/s]

tensor(1.5043, grad_fn=<NllLossBackward0>)
tensor(1.5194, grad_fn=<NllLossBackward0>)
tensor(1.4925, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6696/17426 [10:32<16:20, 10.94it/s]

tensor(1.4858, grad_fn=<NllLossBackward0>)
tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.5428, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6698/17426 [10:33<16:32, 10.81it/s]

tensor(1.5410, grad_fn=<NllLossBackward0>)
tensor(1.5436, grad_fn=<NllLossBackward0>)
tensor(1.5128, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6702/17426 [10:33<16:21, 10.93it/s]

tensor(1.5766, grad_fn=<NllLossBackward0>)
tensor(1.5191, grad_fn=<NllLossBackward0>)
tensor(1.5210, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6704/17426 [10:33<16:26, 10.87it/s]

tensor(1.5365, grad_fn=<NllLossBackward0>)
tensor(1.5160, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)


 38%|███▊      | 6708/17426 [10:33<16:36, 10.75it/s]

tensor(1.5487, grad_fn=<NllLossBackward0>)
tensor(1.5670, grad_fn=<NllLossBackward0>)
tensor(1.5131, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6710/17426 [10:34<16:38, 10.73it/s]

tensor(1.4979, grad_fn=<NllLossBackward0>)
tensor(1.5351, grad_fn=<NllLossBackward0>)
tensor(1.5244, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6714/17426 [10:34<16:14, 11.00it/s]

tensor(1.5161, grad_fn=<NllLossBackward0>)
tensor(1.4961, grad_fn=<NllLossBackward0>)
tensor(1.5167, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6716/17426 [10:34<16:20, 10.92it/s]

tensor(1.5311, grad_fn=<NllLossBackward0>)
tensor(1.4327, grad_fn=<NllLossBackward0>)
tensor(1.5288, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6720/17426 [10:35<16:20, 10.92it/s]

tensor(1.5069, grad_fn=<NllLossBackward0>)
tensor(1.4558, grad_fn=<NllLossBackward0>)
tensor(1.4767, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6722/17426 [10:35<16:18, 10.94it/s]

tensor(1.4909, grad_fn=<NllLossBackward0>)
tensor(1.5236, grad_fn=<NllLossBackward0>)
tensor(1.5255, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6726/17426 [10:35<16:19, 10.92it/s]

tensor(1.5442, grad_fn=<NllLossBackward0>)
tensor(1.5008, grad_fn=<NllLossBackward0>)
tensor(1.5554, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6728/17426 [10:35<18:17,  9.74it/s]

tensor(1.5501, grad_fn=<NllLossBackward0>)
tensor(1.5359, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6730/17426 [10:36<21:22,  8.34it/s]

tensor(1.5144, grad_fn=<NllLossBackward0>)
tensor(1.4983, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6732/17426 [10:36<23:12,  7.68it/s]

tensor(1.4846, grad_fn=<NllLossBackward0>)
tensor(1.5066, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6734/17426 [10:36<24:12,  7.36it/s]

tensor(1.5028, grad_fn=<NllLossBackward0>)
tensor(1.5152, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6736/17426 [10:37<23:06,  7.71it/s]

tensor(1.5420, grad_fn=<NllLossBackward0>)
tensor(1.5393, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6738/17426 [10:37<23:38,  7.53it/s]

tensor(1.5713, grad_fn=<NllLossBackward0>)
tensor(1.5127, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6740/17426 [10:37<22:52,  7.79it/s]

tensor(1.5411, grad_fn=<NllLossBackward0>)
tensor(1.5497, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6742/17426 [10:37<23:10,  7.68it/s]

tensor(1.5346, grad_fn=<NllLossBackward0>)
tensor(1.5213, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6744/17426 [10:38<22:22,  7.95it/s]

tensor(1.4982, grad_fn=<NllLossBackward0>)
tensor(1.5065, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6746/17426 [10:38<23:47,  7.48it/s]

tensor(1.5285, grad_fn=<NllLossBackward0>)
tensor(1.5119, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6748/17426 [10:38<24:10,  7.36it/s]

tensor(1.5068, grad_fn=<NllLossBackward0>)
tensor(1.4703, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6750/17426 [10:38<26:41,  6.67it/s]

tensor(1.4985, grad_fn=<NllLossBackward0>)
tensor(1.5318, grad_fn=<NllLossBackward0>)


 39%|███▊      | 6752/17426 [10:39<27:42,  6.42it/s]

tensor(1.5303, grad_fn=<NllLossBackward0>)
tensor(1.5168, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6755/17426 [10:39<22:09,  8.03it/s]

tensor(1.5950, grad_fn=<NllLossBackward0>)
tensor(1.5485, grad_fn=<NllLossBackward0>)
tensor(1.5572, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6758/17426 [10:39<18:52,  9.42it/s]

tensor(1.5230, grad_fn=<NllLossBackward0>)
tensor(1.4960, grad_fn=<NllLossBackward0>)
tensor(1.4898, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6759/17426 [10:39<19:04,  9.32it/s]

tensor(1.5105, grad_fn=<NllLossBackward0>)
tensor(1.5371, grad_fn=<NllLossBackward0>)
tensor(1.5669, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6764/17426 [10:40<17:48,  9.98it/s]

tensor(1.5473, grad_fn=<NllLossBackward0>)
tensor(1.5092, grad_fn=<NllLossBackward0>)
tensor(1.5154, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6767/17426 [10:40<17:09, 10.35it/s]

tensor(1.4893, grad_fn=<NllLossBackward0>)
tensor(1.4979, grad_fn=<NllLossBackward0>)
tensor(1.5463, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6769/17426 [10:40<16:52, 10.53it/s]

tensor(1.4904, grad_fn=<NllLossBackward0>)
tensor(1.5392, grad_fn=<NllLossBackward0>)
tensor(1.5445, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6773/17426 [10:41<16:54, 10.50it/s]

tensor(1.5209, grad_fn=<NllLossBackward0>)
tensor(1.5416, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6775/17426 [10:41<16:54, 10.50it/s]

tensor(1.5508, grad_fn=<NllLossBackward0>)
tensor(1.4843, grad_fn=<NllLossBackward0>)
tensor(1.4836, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6779/17426 [10:41<16:23, 10.82it/s]

tensor(1.5502, grad_fn=<NllLossBackward0>)
tensor(1.5285, grad_fn=<NllLossBackward0>)
tensor(1.4983, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6781/17426 [10:42<16:32, 10.73it/s]

tensor(1.5592, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)
tensor(1.5201, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6785/17426 [10:42<16:31, 10.73it/s]

tensor(1.5667, grad_fn=<NllLossBackward0>)
tensor(1.5154, grad_fn=<NllLossBackward0>)
tensor(1.4935, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6787/17426 [10:42<16:31, 10.73it/s]

tensor(1.5138, grad_fn=<NllLossBackward0>)
tensor(1.4900, grad_fn=<NllLossBackward0>)
tensor(1.4855, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6791/17426 [10:43<16:12, 10.93it/s]

tensor(1.5143, grad_fn=<NllLossBackward0>)
tensor(1.5543, grad_fn=<NllLossBackward0>)
tensor(1.4904, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6793/17426 [10:43<16:17, 10.88it/s]

tensor(1.5271, grad_fn=<NllLossBackward0>)
tensor(1.5338, grad_fn=<NllLossBackward0>)
tensor(1.5367, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6797/17426 [10:43<16:20, 10.84it/s]

tensor(1.4677, grad_fn=<NllLossBackward0>)
tensor(1.5456, grad_fn=<NllLossBackward0>)
tensor(1.5116, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6799/17426 [10:43<16:29, 10.74it/s]

tensor(1.5019, grad_fn=<NllLossBackward0>)
tensor(1.5302, grad_fn=<NllLossBackward0>)
tensor(1.5399, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6803/17426 [10:44<16:12, 10.93it/s]

tensor(1.5211, grad_fn=<NllLossBackward0>)
tensor(1.5142, grad_fn=<NllLossBackward0>)
tensor(1.4938, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6805/17426 [10:44<16:22, 10.81it/s]

tensor(1.5144, grad_fn=<NllLossBackward0>)
tensor(1.5295, grad_fn=<NllLossBackward0>)
tensor(1.5193, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6809/17426 [10:44<16:19, 10.83it/s]

tensor(1.5127, grad_fn=<NllLossBackward0>)
tensor(1.5147, grad_fn=<NllLossBackward0>)
tensor(1.4914, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6811/17426 [10:44<16:23, 10.79it/s]

tensor(1.4754, grad_fn=<NllLossBackward0>)
tensor(1.5096, grad_fn=<NllLossBackward0>)
tensor(1.4762, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6815/17426 [10:45<16:09, 10.95it/s]

tensor(1.4802, grad_fn=<NllLossBackward0>)
tensor(1.5195, grad_fn=<NllLossBackward0>)
tensor(1.4807, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6817/17426 [10:45<16:18, 10.84it/s]

tensor(1.4996, grad_fn=<NllLossBackward0>)
tensor(1.5447, grad_fn=<NllLossBackward0>)
tensor(1.5496, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6821/17426 [10:45<16:05, 10.99it/s]

tensor(1.5407, grad_fn=<NllLossBackward0>)
tensor(1.5141, grad_fn=<NllLossBackward0>)
tensor(1.5467, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6823/17426 [10:45<16:14, 10.88it/s]

tensor(1.5242, grad_fn=<NllLossBackward0>)
tensor(1.5367, grad_fn=<NllLossBackward0>)
tensor(1.5249, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6827/17426 [10:46<16:09, 10.93it/s]

tensor(1.5263, grad_fn=<NllLossBackward0>)
tensor(1.4892, grad_fn=<NllLossBackward0>)
tensor(1.5042, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6829/17426 [10:46<16:44, 10.55it/s]

tensor(1.5283, grad_fn=<NllLossBackward0>)
tensor(1.5535, grad_fn=<NllLossBackward0>)
tensor(1.5598, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6833/17426 [10:46<16:04, 10.98it/s]

tensor(1.5149, grad_fn=<NllLossBackward0>)
tensor(1.5312, grad_fn=<NllLossBackward0>)
tensor(1.5203, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6835/17426 [10:47<16:23, 10.77it/s]

tensor(1.5254, grad_fn=<NllLossBackward0>)
tensor(1.5407, grad_fn=<NllLossBackward0>)
tensor(1.5647, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6839/17426 [10:47<15:54, 11.09it/s]

tensor(1.5084, grad_fn=<NllLossBackward0>)
tensor(1.5209, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6841/17426 [10:47<16:28, 10.70it/s]

tensor(1.5410, grad_fn=<NllLossBackward0>)
tensor(1.5504, grad_fn=<NllLossBackward0>)
tensor(1.5011, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6845/17426 [10:48<16:01, 11.00it/s]

tensor(1.4739, grad_fn=<NllLossBackward0>)
tensor(1.5293, grad_fn=<NllLossBackward0>)
tensor(1.5778, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6847/17426 [10:48<16:17, 10.82it/s]

tensor(1.5446, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)
tensor(1.5499, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6851/17426 [10:48<16:22, 10.76it/s]

tensor(1.5202, grad_fn=<NllLossBackward0>)
tensor(1.4973, grad_fn=<NllLossBackward0>)
tensor(1.5347, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6853/17426 [10:48<16:26, 10.72it/s]

tensor(1.5646, grad_fn=<NllLossBackward0>)
tensor(1.4972, grad_fn=<NllLossBackward0>)
tensor(1.5278, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6857/17426 [10:49<15:56, 11.05it/s]

tensor(1.5458, grad_fn=<NllLossBackward0>)
tensor(1.5401, grad_fn=<NllLossBackward0>)
tensor(1.4809, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6859/17426 [10:49<16:23, 10.74it/s]

tensor(1.5348, grad_fn=<NllLossBackward0>)
tensor(1.5090, grad_fn=<NllLossBackward0>)
tensor(1.4902, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6861/17426 [10:49<16:57, 10.38it/s]

tensor(1.4994, grad_fn=<NllLossBackward0>)
tensor(1.5271, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6864/17426 [10:49<20:02,  8.78it/s]

tensor(1.5530, grad_fn=<NllLossBackward0>)
tensor(1.5460, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6866/17426 [10:50<21:58,  8.01it/s]

tensor(1.4951, grad_fn=<NllLossBackward0>)
tensor(1.5100, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6868/17426 [10:50<22:47,  7.72it/s]

tensor(1.5151, grad_fn=<NllLossBackward0>)
tensor(1.5213, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6870/17426 [10:50<24:08,  7.29it/s]

tensor(1.4734, grad_fn=<NllLossBackward0>)
tensor(1.5144, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6872/17426 [10:51<24:06,  7.30it/s]

tensor(1.4873, grad_fn=<NllLossBackward0>)
tensor(1.5149, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6874/17426 [10:51<24:25,  7.20it/s]

tensor(1.5349, grad_fn=<NllLossBackward0>)
tensor(1.5623, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6876/17426 [10:51<22:41,  7.75it/s]

tensor(1.4976, grad_fn=<NllLossBackward0>)
tensor(1.5579, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6878/17426 [10:51<23:47,  7.39it/s]

tensor(1.4897, grad_fn=<NllLossBackward0>)
tensor(1.5211, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6880/17426 [10:52<24:08,  7.28it/s]

tensor(1.5268, grad_fn=<NllLossBackward0>)
tensor(1.4957, grad_fn=<NllLossBackward0>)


 39%|███▉      | 6882/17426 [10:52<24:20,  7.22it/s]

tensor(1.5030, grad_fn=<NllLossBackward0>)
tensor(1.5728, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6884/17426 [10:52<25:00,  7.03it/s]

tensor(1.5060, grad_fn=<NllLossBackward0>)
tensor(1.4999, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6886/17426 [10:53<25:19,  6.93it/s]

tensor(1.5011, grad_fn=<NllLossBackward0>)
tensor(1.4836, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6888/17426 [10:53<24:36,  7.14it/s]

tensor(1.5531, grad_fn=<NllLossBackward0>)
tensor(1.5261, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6890/17426 [10:53<21:15,  8.26it/s]

tensor(1.5174, grad_fn=<NllLossBackward0>)
tensor(1.5701, grad_fn=<NllLossBackward0>)
tensor(1.4955, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6894/17426 [10:53<17:13, 10.19it/s]

tensor(1.5221, grad_fn=<NllLossBackward0>)
tensor(1.5449, grad_fn=<NllLossBackward0>)
tensor(1.5064, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6897/17426 [10:54<17:41,  9.91it/s]

tensor(1.5508, grad_fn=<NllLossBackward0>)
tensor(1.5150, grad_fn=<NllLossBackward0>)
tensor(1.5291, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6900/17426 [10:54<17:01, 10.31it/s]

tensor(1.5360, grad_fn=<NllLossBackward0>)
tensor(1.4667, grad_fn=<NllLossBackward0>)
tensor(1.5335, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6902/17426 [10:54<16:58, 10.33it/s]

tensor(1.5327, grad_fn=<NllLossBackward0>)
tensor(1.5414, grad_fn=<NllLossBackward0>)
tensor(1.4922, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6906/17426 [10:55<16:28, 10.64it/s]

tensor(1.5062, grad_fn=<NllLossBackward0>)
tensor(1.4694, grad_fn=<NllLossBackward0>)
tensor(1.5087, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6908/17426 [10:55<16:38, 10.54it/s]

tensor(1.5776, grad_fn=<NllLossBackward0>)
tensor(1.5106, grad_fn=<NllLossBackward0>)
tensor(1.5326, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6912/17426 [10:55<16:11, 10.82it/s]

tensor(1.5148, grad_fn=<NllLossBackward0>)
tensor(1.4928, grad_fn=<NllLossBackward0>)
tensor(1.4884, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6914/17426 [10:55<16:06, 10.88it/s]

tensor(1.5291, grad_fn=<NllLossBackward0>)
tensor(1.4986, grad_fn=<NllLossBackward0>)
tensor(1.4872, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6918/17426 [10:56<16:09, 10.84it/s]

tensor(1.5330, grad_fn=<NllLossBackward0>)
tensor(1.5064, grad_fn=<NllLossBackward0>)
tensor(1.5155, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6920/17426 [10:56<16:11, 10.82it/s]

tensor(1.5047, grad_fn=<NllLossBackward0>)
tensor(1.5824, grad_fn=<NllLossBackward0>)
tensor(1.5232, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6924/17426 [10:56<16:15, 10.77it/s]

tensor(1.5735, grad_fn=<NllLossBackward0>)
tensor(1.5573, grad_fn=<NllLossBackward0>)
tensor(1.5408, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6926/17426 [10:56<16:21, 10.69it/s]

tensor(1.5593, grad_fn=<NllLossBackward0>)
tensor(1.5112, grad_fn=<NllLossBackward0>)
tensor(1.5558, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6930/17426 [10:57<16:10, 10.82it/s]

tensor(1.5152, grad_fn=<NllLossBackward0>)
tensor(1.5231, grad_fn=<NllLossBackward0>)
tensor(1.5230, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6932/17426 [10:57<16:17, 10.73it/s]

tensor(1.5444, grad_fn=<NllLossBackward0>)
tensor(1.5471, grad_fn=<NllLossBackward0>)
tensor(1.5302, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6936/17426 [10:57<15:55, 10.97it/s]

tensor(1.5115, grad_fn=<NllLossBackward0>)
tensor(1.5288, grad_fn=<NllLossBackward0>)
tensor(1.4893, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6938/17426 [10:57<16:01, 10.91it/s]

tensor(1.5209, grad_fn=<NllLossBackward0>)
tensor(1.5031, grad_fn=<NllLossBackward0>)
tensor(1.4929, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6942/17426 [10:58<16:15, 10.75it/s]

tensor(1.5535, grad_fn=<NllLossBackward0>)
tensor(1.5367, grad_fn=<NllLossBackward0>)
tensor(1.5112, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6944/17426 [10:58<16:24, 10.64it/s]

tensor(1.5247, grad_fn=<NllLossBackward0>)
tensor(1.4731, grad_fn=<NllLossBackward0>)
tensor(1.5842, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6948/17426 [10:58<15:51, 11.01it/s]

tensor(1.5038, grad_fn=<NllLossBackward0>)
tensor(1.5647, grad_fn=<NllLossBackward0>)
tensor(1.5698, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6950/17426 [10:59<16:07, 10.82it/s]

tensor(1.5275, grad_fn=<NllLossBackward0>)
tensor(1.4784, grad_fn=<NllLossBackward0>)
tensor(1.5330, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6954/17426 [10:59<16:02, 10.88it/s]

tensor(1.4868, grad_fn=<NllLossBackward0>)
tensor(1.5266, grad_fn=<NllLossBackward0>)
tensor(1.5361, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6956/17426 [10:59<16:02, 10.88it/s]

tensor(1.5174, grad_fn=<NllLossBackward0>)
tensor(1.4942, grad_fn=<NllLossBackward0>)
tensor(1.5606, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6960/17426 [11:00<15:54, 10.96it/s]

tensor(1.5306, grad_fn=<NllLossBackward0>)
tensor(1.5108, grad_fn=<NllLossBackward0>)
tensor(1.5286, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6962/17426 [11:00<16:19, 10.68it/s]

tensor(1.5993, grad_fn=<NllLossBackward0>)
tensor(1.5301, grad_fn=<NllLossBackward0>)
tensor(1.5331, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6966/17426 [11:00<16:00, 10.89it/s]

tensor(1.5685, grad_fn=<NllLossBackward0>)
tensor(1.4874, grad_fn=<NllLossBackward0>)
tensor(1.4973, grad_fn=<NllLossBackward0>)


 40%|███▉      | 6968/17426 [11:00<16:00, 10.88it/s]

tensor(1.4682, grad_fn=<NllLossBackward0>)
tensor(1.5377, grad_fn=<NllLossBackward0>)
tensor(1.5348, grad_fn=<NllLossBackward0>)


 40%|████      | 6972/17426 [11:01<15:53, 10.97it/s]

tensor(1.5331, grad_fn=<NllLossBackward0>)
tensor(1.5023, grad_fn=<NllLossBackward0>)
tensor(1.4800, grad_fn=<NllLossBackward0>)


 40%|████      | 6974/17426 [11:01<16:25, 10.61it/s]

tensor(1.5310, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)
tensor(1.5439, grad_fn=<NllLossBackward0>)


 40%|████      | 6978/17426 [11:01<15:57, 10.91it/s]

tensor(1.5383, grad_fn=<NllLossBackward0>)
tensor(1.5071, grad_fn=<NllLossBackward0>)
tensor(1.5858, grad_fn=<NllLossBackward0>)


 40%|████      | 6980/17426 [11:01<16:00, 10.88it/s]

tensor(1.4994, grad_fn=<NllLossBackward0>)
tensor(1.5833, grad_fn=<NllLossBackward0>)
tensor(1.5050, grad_fn=<NllLossBackward0>)


 40%|████      | 6984/17426 [11:02<16:04, 10.82it/s]

tensor(1.5109, grad_fn=<NllLossBackward0>)
tensor(1.5045, grad_fn=<NllLossBackward0>)
tensor(1.5461, grad_fn=<NllLossBackward0>)


 40%|████      | 6986/17426 [11:02<16:04, 10.83it/s]

tensor(1.5657, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)
tensor(1.5382, grad_fn=<NllLossBackward0>)


 40%|████      | 6990/17426 [11:02<15:44, 11.05it/s]

tensor(1.5142, grad_fn=<NllLossBackward0>)
tensor(1.4884, grad_fn=<NllLossBackward0>)
tensor(1.5255, grad_fn=<NllLossBackward0>)


 40%|████      | 6992/17426 [11:02<15:54, 10.93it/s]

tensor(1.5631, grad_fn=<NllLossBackward0>)
tensor(1.5586, grad_fn=<NllLossBackward0>)
tensor(1.5388, grad_fn=<NllLossBackward0>)


 40%|████      | 6994/17426 [11:03<16:05, 10.80it/s]

tensor(1.5002, grad_fn=<NllLossBackward0>)
tensor(1.5079, grad_fn=<NllLossBackward0>)


 40%|████      | 6997/17426 [11:03<19:20,  8.99it/s]

tensor(1.5473, grad_fn=<NllLossBackward0>)
tensor(1.5453, grad_fn=<NllLossBackward0>)


 40%|████      | 6999/17426 [11:03<21:51,  7.95it/s]

tensor(1.5186, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)


 40%|████      | 7001/17426 [11:04<22:29,  7.73it/s]

tensor(1.4847, grad_fn=<NllLossBackward0>)
tensor(1.4761, grad_fn=<NllLossBackward0>)


 40%|████      | 7003/17426 [11:04<23:13,  7.48it/s]

tensor(1.5492, grad_fn=<NllLossBackward0>)
tensor(1.5622, grad_fn=<NllLossBackward0>)


 40%|████      | 7005/17426 [11:04<22:03,  7.87it/s]

tensor(1.5245, grad_fn=<NllLossBackward0>)
tensor(1.5697, grad_fn=<NllLossBackward0>)


 40%|████      | 7007/17426 [11:04<21:33,  8.05it/s]

tensor(1.5595, grad_fn=<NllLossBackward0>)
tensor(1.5155, grad_fn=<NllLossBackward0>)


 40%|████      | 7009/17426 [11:05<21:51,  7.94it/s]

tensor(1.5358, grad_fn=<NllLossBackward0>)
tensor(1.5543, grad_fn=<NllLossBackward0>)


 40%|████      | 7011/17426 [11:05<21:00,  8.26it/s]

tensor(1.5180, grad_fn=<NllLossBackward0>)
tensor(1.5214, grad_fn=<NllLossBackward0>)


 40%|████      | 7013/17426 [11:05<21:40,  8.01it/s]

tensor(1.5162, grad_fn=<NllLossBackward0>)
tensor(1.5229, grad_fn=<NllLossBackward0>)


 40%|████      | 7015/17426 [11:05<23:22,  7.42it/s]

tensor(1.5076, grad_fn=<NllLossBackward0>)
tensor(1.5281, grad_fn=<NllLossBackward0>)


 40%|████      | 7017/17426 [11:06<24:14,  7.16it/s]

tensor(1.4902, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)


 40%|████      | 7019/17426 [11:06<25:56,  6.69it/s]

tensor(1.5279, grad_fn=<NllLossBackward0>)
tensor(1.4994, grad_fn=<NllLossBackward0>)


 40%|████      | 7021/17426 [11:06<25:21,  6.84it/s]

tensor(1.5323, grad_fn=<NllLossBackward0>)
tensor(1.4993, grad_fn=<NllLossBackward0>)


 40%|████      | 7023/17426 [11:07<25:17,  6.85it/s]

tensor(1.4953, grad_fn=<NllLossBackward0>)
tensor(1.5338, grad_fn=<NllLossBackward0>)


 40%|████      | 7026/17426 [11:07<19:34,  8.86it/s]

tensor(1.5213, grad_fn=<NllLossBackward0>)
tensor(1.5600, grad_fn=<NllLossBackward0>)
tensor(1.5198, grad_fn=<NllLossBackward0>)


 40%|████      | 7029/17426 [11:07<17:59,  9.63it/s]

tensor(1.5066, grad_fn=<NllLossBackward0>)
tensor(1.5195, grad_fn=<NllLossBackward0>)
tensor(1.5350, grad_fn=<NllLossBackward0>)


 40%|████      | 7032/17426 [11:08<16:54, 10.25it/s]

tensor(1.5139, grad_fn=<NllLossBackward0>)
tensor(1.4903, grad_fn=<NllLossBackward0>)
tensor(1.5293, grad_fn=<NllLossBackward0>)


 40%|████      | 7034/17426 [11:08<16:49, 10.29it/s]

tensor(1.5109, grad_fn=<NllLossBackward0>)
tensor(1.4896, grad_fn=<NllLossBackward0>)
tensor(1.5601, grad_fn=<NllLossBackward0>)


 40%|████      | 7038/17426 [11:08<16:00, 10.81it/s]

tensor(1.4957, grad_fn=<NllLossBackward0>)
tensor(1.5193, grad_fn=<NllLossBackward0>)
tensor(1.5078, grad_fn=<NllLossBackward0>)


 40%|████      | 7040/17426 [11:08<16:35, 10.44it/s]

tensor(1.5339, grad_fn=<NllLossBackward0>)
tensor(1.4936, grad_fn=<NllLossBackward0>)
tensor(1.4939, grad_fn=<NllLossBackward0>)


 40%|████      | 7044/17426 [11:09<16:03, 10.78it/s]

tensor(1.5656, grad_fn=<NllLossBackward0>)
tensor(1.5136, grad_fn=<NllLossBackward0>)
tensor(1.4951, grad_fn=<NllLossBackward0>)


 40%|████      | 7046/17426 [11:09<16:09, 10.71it/s]

tensor(1.4932, grad_fn=<NllLossBackward0>)
tensor(1.5716, grad_fn=<NllLossBackward0>)
tensor(1.5379, grad_fn=<NllLossBackward0>)


 40%|████      | 7050/17426 [11:09<16:10, 10.69it/s]

tensor(1.5523, grad_fn=<NllLossBackward0>)
tensor(1.5231, grad_fn=<NllLossBackward0>)
tensor(1.5286, grad_fn=<NllLossBackward0>)


 40%|████      | 7052/17426 [11:09<16:16, 10.62it/s]

tensor(1.4950, grad_fn=<NllLossBackward0>)
tensor(1.5353, grad_fn=<NllLossBackward0>)
tensor(1.5173, grad_fn=<NllLossBackward0>)


 40%|████      | 7056/17426 [11:10<16:19, 10.58it/s]

tensor(1.5574, grad_fn=<NllLossBackward0>)
tensor(1.5586, grad_fn=<NllLossBackward0>)
tensor(1.5356, grad_fn=<NllLossBackward0>)


 41%|████      | 7058/17426 [11:10<16:26, 10.51it/s]

tensor(1.5480, grad_fn=<NllLossBackward0>)
tensor(1.5065, grad_fn=<NllLossBackward0>)
tensor(1.5239, grad_fn=<NllLossBackward0>)


 41%|████      | 7062/17426 [11:10<16:22, 10.55it/s]

tensor(1.5550, grad_fn=<NllLossBackward0>)
tensor(1.4873, grad_fn=<NllLossBackward0>)
tensor(1.5536, grad_fn=<NllLossBackward0>)


 41%|████      | 7064/17426 [11:11<16:19, 10.58it/s]

tensor(1.5402, grad_fn=<NllLossBackward0>)
tensor(1.4815, grad_fn=<NllLossBackward0>)
tensor(1.5416, grad_fn=<NllLossBackward0>)


 41%|████      | 7068/17426 [11:11<16:08, 10.69it/s]

tensor(1.5196, grad_fn=<NllLossBackward0>)
tensor(1.5187, grad_fn=<NllLossBackward0>)
tensor(1.4574, grad_fn=<NllLossBackward0>)


 41%|████      | 7070/17426 [11:11<16:16, 10.60it/s]

tensor(1.5119, grad_fn=<NllLossBackward0>)
tensor(1.5302, grad_fn=<NllLossBackward0>)
tensor(1.4920, grad_fn=<NllLossBackward0>)


 41%|████      | 7074/17426 [11:11<16:14, 10.63it/s]

tensor(1.5052, grad_fn=<NllLossBackward0>)
tensor(1.4674, grad_fn=<NllLossBackward0>)
tensor(1.5511, grad_fn=<NllLossBackward0>)


 41%|████      | 7076/17426 [11:12<16:24, 10.51it/s]

tensor(1.5277, grad_fn=<NllLossBackward0>)
tensor(1.5069, grad_fn=<NllLossBackward0>)
tensor(1.5470, grad_fn=<NllLossBackward0>)


 41%|████      | 7080/17426 [11:12<16:01, 10.76it/s]

tensor(1.5419, grad_fn=<NllLossBackward0>)
tensor(1.5457, grad_fn=<NllLossBackward0>)
tensor(1.4855, grad_fn=<NllLossBackward0>)


 41%|████      | 7082/17426 [11:12<16:14, 10.61it/s]

tensor(1.4812, grad_fn=<NllLossBackward0>)
tensor(1.5272, grad_fn=<NllLossBackward0>)
tensor(1.4990, grad_fn=<NllLossBackward0>)


 41%|████      | 7086/17426 [11:13<15:59, 10.78it/s]

tensor(1.4724, grad_fn=<NllLossBackward0>)
tensor(1.4947, grad_fn=<NllLossBackward0>)
tensor(1.5305, grad_fn=<NllLossBackward0>)


 41%|████      | 7088/17426 [11:13<16:12, 10.64it/s]

tensor(1.4669, grad_fn=<NllLossBackward0>)
tensor(1.5418, grad_fn=<NllLossBackward0>)
tensor(1.4967, grad_fn=<NllLossBackward0>)


 41%|████      | 7092/17426 [11:13<15:55, 10.82it/s]

tensor(1.4945, grad_fn=<NllLossBackward0>)
tensor(1.4765, grad_fn=<NllLossBackward0>)
tensor(1.5316, grad_fn=<NllLossBackward0>)


 41%|████      | 7094/17426 [11:13<16:16, 10.58it/s]

tensor(1.4976, grad_fn=<NllLossBackward0>)
tensor(1.5538, grad_fn=<NllLossBackward0>)
tensor(1.5002, grad_fn=<NllLossBackward0>)


 41%|████      | 7098/17426 [11:14<15:48, 10.88it/s]

tensor(1.4987, grad_fn=<NllLossBackward0>)
tensor(1.5296, grad_fn=<NllLossBackward0>)
tensor(1.5176, grad_fn=<NllLossBackward0>)


 41%|████      | 7100/17426 [11:14<15:53, 10.83it/s]

tensor(1.5360, grad_fn=<NllLossBackward0>)
tensor(1.5144, grad_fn=<NllLossBackward0>)
tensor(1.4863, grad_fn=<NllLossBackward0>)


 41%|████      | 7104/17426 [11:14<15:33, 11.05it/s]

tensor(1.5252, grad_fn=<NllLossBackward0>)
tensor(1.5027, grad_fn=<NllLossBackward0>)
tensor(1.4843, grad_fn=<NllLossBackward0>)


 41%|████      | 7106/17426 [11:14<16:01, 10.74it/s]

tensor(1.5064, grad_fn=<NllLossBackward0>)
tensor(1.5016, grad_fn=<NllLossBackward0>)
tensor(1.4896, grad_fn=<NllLossBackward0>)


 41%|████      | 7110/17426 [11:15<15:43, 10.93it/s]

tensor(1.5894, grad_fn=<NllLossBackward0>)
tensor(1.5469, grad_fn=<NllLossBackward0>)
tensor(1.5324, grad_fn=<NllLossBackward0>)


 41%|████      | 7112/17426 [11:15<15:47, 10.88it/s]

tensor(1.5355, grad_fn=<NllLossBackward0>)
tensor(1.4508, grad_fn=<NllLossBackward0>)
tensor(1.5530, grad_fn=<NllLossBackward0>)


 41%|████      | 7116/17426 [11:15<15:37, 11.00it/s]

tensor(1.5029, grad_fn=<NllLossBackward0>)
tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.5580, grad_fn=<NllLossBackward0>)


 41%|████      | 7118/17426 [11:16<16:00, 10.73it/s]

tensor(1.5410, grad_fn=<NllLossBackward0>)
tensor(1.4822, grad_fn=<NllLossBackward0>)
tensor(1.5268, grad_fn=<NllLossBackward0>)


 41%|████      | 7122/17426 [11:16<15:32, 11.05it/s]

tensor(1.5277, grad_fn=<NllLossBackward0>)
tensor(1.5367, grad_fn=<NllLossBackward0>)
tensor(1.5387, grad_fn=<NllLossBackward0>)


 41%|████      | 7124/17426 [11:16<15:54, 10.79it/s]

tensor(1.4784, grad_fn=<NllLossBackward0>)
tensor(1.5609, grad_fn=<NllLossBackward0>)
tensor(1.5611, grad_fn=<NllLossBackward0>)


 41%|████      | 7128/17426 [11:17<15:45, 10.90it/s]

tensor(1.5387, grad_fn=<NllLossBackward0>)
tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.5142, grad_fn=<NllLossBackward0>)


 41%|████      | 7130/17426 [11:17<16:58, 10.11it/s]

tensor(1.4944, grad_fn=<NllLossBackward0>)
tensor(1.5397, grad_fn=<NllLossBackward0>)


 41%|████      | 7132/17426 [11:17<18:20,  9.36it/s]

tensor(1.4501, grad_fn=<NllLossBackward0>)
tensor(1.4812, grad_fn=<NllLossBackward0>)


 41%|████      | 7134/17426 [11:17<20:09,  8.51it/s]

tensor(1.5536, grad_fn=<NllLossBackward0>)
tensor(1.5097, grad_fn=<NllLossBackward0>)


 41%|████      | 7136/17426 [11:18<21:08,  8.11it/s]

tensor(1.5588, grad_fn=<NllLossBackward0>)
tensor(1.5209, grad_fn=<NllLossBackward0>)


 41%|████      | 7138/17426 [11:18<20:41,  8.29it/s]

tensor(1.4998, grad_fn=<NllLossBackward0>)
tensor(1.5585, grad_fn=<NllLossBackward0>)


 41%|████      | 7140/17426 [11:18<20:03,  8.54it/s]

tensor(1.5588, grad_fn=<NllLossBackward0>)
tensor(1.4913, grad_fn=<NllLossBackward0>)


 41%|████      | 7142/17426 [11:18<21:09,  8.10it/s]

tensor(1.5365, grad_fn=<NllLossBackward0>)
tensor(1.4940, grad_fn=<NllLossBackward0>)


 41%|████      | 7144/17426 [11:19<20:37,  8.31it/s]

tensor(1.5645, grad_fn=<NllLossBackward0>)
tensor(1.5372, grad_fn=<NllLossBackward0>)


 41%|████      | 7146/17426 [11:19<21:14,  8.07it/s]

tensor(1.4558, grad_fn=<NllLossBackward0>)
tensor(1.5780, grad_fn=<NllLossBackward0>)


 41%|████      | 7148/17426 [11:19<20:17,  8.44it/s]

tensor(1.5067, grad_fn=<NllLossBackward0>)
tensor(1.4871, grad_fn=<NllLossBackward0>)


 41%|████      | 7150/17426 [11:19<22:42,  7.54it/s]

tensor(1.5016, grad_fn=<NllLossBackward0>)
tensor(1.4786, grad_fn=<NllLossBackward0>)


 41%|████      | 7152/17426 [11:20<21:52,  7.83it/s]

tensor(1.4893, grad_fn=<NllLossBackward0>)
tensor(1.5139, grad_fn=<NllLossBackward0>)


 41%|████      | 7154/17426 [11:20<23:49,  7.19it/s]

tensor(1.4898, grad_fn=<NllLossBackward0>)
tensor(1.4979, grad_fn=<NllLossBackward0>)


 41%|████      | 7156/17426 [11:20<23:05,  7.41it/s]

tensor(1.5039, grad_fn=<NllLossBackward0>)
tensor(1.4946, grad_fn=<NllLossBackward0>)


 41%|████      | 7158/17426 [11:20<23:38,  7.24it/s]

tensor(1.4851, grad_fn=<NllLossBackward0>)
tensor(1.5334, grad_fn=<NllLossBackward0>)


 41%|████      | 7160/17426 [11:21<24:34,  6.96it/s]

tensor(1.5111, grad_fn=<NllLossBackward0>)
tensor(1.4967, grad_fn=<NllLossBackward0>)


 41%|████      | 7163/17426 [11:21<20:46,  8.24it/s]

tensor(1.5046, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)
tensor(1.5158, grad_fn=<NllLossBackward0>)


 41%|████      | 7166/17426 [11:21<18:03,  9.47it/s]

tensor(1.4995, grad_fn=<NllLossBackward0>)
tensor(1.5290, grad_fn=<NllLossBackward0>)
tensor(1.5245, grad_fn=<NllLossBackward0>)


 41%|████      | 7169/17426 [11:22<17:03, 10.02it/s]

tensor(1.5187, grad_fn=<NllLossBackward0>)
tensor(1.5269, grad_fn=<NllLossBackward0>)
tensor(1.5389, grad_fn=<NllLossBackward0>)


 41%|████      | 7171/17426 [11:22<17:26,  9.80it/s]

tensor(1.5534, grad_fn=<NllLossBackward0>)
tensor(1.4809, grad_fn=<NllLossBackward0>)


 41%|████      | 7173/17426 [11:22<17:57,  9.52it/s]

tensor(1.5044, grad_fn=<NllLossBackward0>)
tensor(1.4775, grad_fn=<NllLossBackward0>)
tensor(1.5534, grad_fn=<NllLossBackward0>)


 41%|████      | 7176/17426 [11:22<17:02, 10.03it/s]

tensor(1.5588, grad_fn=<NllLossBackward0>)
tensor(1.5186, grad_fn=<NllLossBackward0>)
tensor(1.5236, grad_fn=<NllLossBackward0>)


 41%|████      | 7180/17426 [11:23<15:49, 10.79it/s]

tensor(1.5125, grad_fn=<NllLossBackward0>)
tensor(1.4628, grad_fn=<NllLossBackward0>)
tensor(1.5124, grad_fn=<NllLossBackward0>)


 41%|████      | 7182/17426 [11:23<16:03, 10.63it/s]

tensor(1.4798, grad_fn=<NllLossBackward0>)
tensor(1.5066, grad_fn=<NllLossBackward0>)
tensor(1.5006, grad_fn=<NllLossBackward0>)


 41%|████      | 7186/17426 [11:23<16:04, 10.62it/s]

tensor(1.5099, grad_fn=<NllLossBackward0>)
tensor(1.4618, grad_fn=<NllLossBackward0>)
tensor(1.5432, grad_fn=<NllLossBackward0>)


 41%|████      | 7188/17426 [11:23<16:05, 10.60it/s]

tensor(1.5266, grad_fn=<NllLossBackward0>)
tensor(1.5256, grad_fn=<NllLossBackward0>)
tensor(1.5135, grad_fn=<NllLossBackward0>)


 41%|████▏     | 7192/17426 [11:24<15:41, 10.87it/s]

tensor(1.5061, grad_fn=<NllLossBackward0>)
tensor(1.4691, grad_fn=<NllLossBackward0>)
tensor(1.5292, grad_fn=<NllLossBackward0>)


 41%|████▏     | 7194/17426 [11:24<16:04, 10.61it/s]

tensor(1.5167, grad_fn=<NllLossBackward0>)
tensor(1.5495, grad_fn=<NllLossBackward0>)
tensor(1.5327, grad_fn=<NllLossBackward0>)


 41%|████▏     | 7198/17426 [11:24<15:31, 10.99it/s]

tensor(1.4848, grad_fn=<NllLossBackward0>)
tensor(1.5091, grad_fn=<NllLossBackward0>)
tensor(1.5566, grad_fn=<NllLossBackward0>)


 41%|████▏     | 7200/17426 [11:25<15:32, 10.96it/s]

tensor(1.5084, grad_fn=<NllLossBackward0>)
tensor(1.4545, grad_fn=<NllLossBackward0>)
tensor(1.4935, grad_fn=<NllLossBackward0>)


 41%|████▏     | 7204/17426 [11:25<15:19, 11.11it/s]

tensor(1.5341, grad_fn=<NllLossBackward0>)
tensor(1.5148, grad_fn=<NllLossBackward0>)
tensor(1.5564, grad_fn=<NllLossBackward0>)


 41%|████▏     | 7206/17426 [11:25<16:00, 10.64it/s]

tensor(1.4787, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)
tensor(1.5357, grad_fn=<NllLossBackward0>)


 41%|████▏     | 7210/17426 [11:25<15:33, 10.94it/s]

tensor(1.5318, grad_fn=<NllLossBackward0>)
tensor(1.5268, grad_fn=<NllLossBackward0>)
tensor(1.5014, grad_fn=<NllLossBackward0>)


 41%|████▏     | 7212/17426 [11:26<15:50, 10.75it/s]

tensor(1.5381, grad_fn=<NllLossBackward0>)
tensor(1.4992, grad_fn=<NllLossBackward0>)
tensor(1.5449, grad_fn=<NllLossBackward0>)


 41%|████▏     | 7216/17426 [11:26<15:47, 10.77it/s]

tensor(1.5244, grad_fn=<NllLossBackward0>)
tensor(1.5542, grad_fn=<NllLossBackward0>)
tensor(1.5186, grad_fn=<NllLossBackward0>)


 41%|████▏     | 7218/17426 [11:26<15:49, 10.75it/s]

tensor(1.5451, grad_fn=<NllLossBackward0>)
tensor(1.5099, grad_fn=<NllLossBackward0>)
tensor(1.4985, grad_fn=<NllLossBackward0>)


 41%|████▏     | 7222/17426 [11:27<15:28, 10.99it/s]

tensor(1.5000, grad_fn=<NllLossBackward0>)
tensor(1.4419, grad_fn=<NllLossBackward0>)
tensor(1.5210, grad_fn=<NllLossBackward0>)


 41%|████▏     | 7224/17426 [11:27<15:41, 10.84it/s]

tensor(1.4440, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)
tensor(1.5134, grad_fn=<NllLossBackward0>)


 41%|████▏     | 7228/17426 [11:27<15:40, 10.84it/s]

tensor(1.5635, grad_fn=<NllLossBackward0>)
tensor(1.4964, grad_fn=<NllLossBackward0>)
tensor(1.4692, grad_fn=<NllLossBackward0>)


 41%|████▏     | 7230/17426 [11:27<15:43, 10.81it/s]

tensor(1.5039, grad_fn=<NllLossBackward0>)
tensor(1.4970, grad_fn=<NllLossBackward0>)
tensor(1.5126, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7234/17426 [11:28<15:20, 11.07it/s]

tensor(1.5027, grad_fn=<NllLossBackward0>)
tensor(1.5093, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7236/17426 [11:28<15:35, 10.90it/s]

tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.5111, grad_fn=<NllLossBackward0>)
tensor(1.5056, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7240/17426 [11:28<15:32, 10.92it/s]

tensor(1.4772, grad_fn=<NllLossBackward0>)
tensor(1.5412, grad_fn=<NllLossBackward0>)
tensor(1.4980, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7242/17426 [11:28<15:41, 10.82it/s]

tensor(1.5099, grad_fn=<NllLossBackward0>)
tensor(1.4949, grad_fn=<NllLossBackward0>)
tensor(1.4724, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7246/17426 [11:29<15:19, 11.08it/s]

tensor(1.5786, grad_fn=<NllLossBackward0>)
tensor(1.5534, grad_fn=<NllLossBackward0>)
tensor(1.4898, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7248/17426 [11:29<15:29, 10.95it/s]

tensor(1.4848, grad_fn=<NllLossBackward0>)
tensor(1.5162, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7252/17426 [11:29<15:32, 10.92it/s]

tensor(1.5572, grad_fn=<NllLossBackward0>)
tensor(1.5651, grad_fn=<NllLossBackward0>)
tensor(1.5226, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7254/17426 [11:30<15:43, 10.78it/s]

tensor(1.5250, grad_fn=<NllLossBackward0>)
tensor(1.5182, grad_fn=<NllLossBackward0>)
tensor(1.5206, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7258/17426 [11:30<15:24, 11.00it/s]

tensor(1.5056, grad_fn=<NllLossBackward0>)
tensor(1.4711, grad_fn=<NllLossBackward0>)
tensor(1.5323, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7260/17426 [11:30<15:50, 10.70it/s]

tensor(1.4885, grad_fn=<NllLossBackward0>)
tensor(1.4957, grad_fn=<NllLossBackward0>)
tensor(1.5031, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7264/17426 [11:30<15:43, 10.77it/s]

tensor(1.5003, grad_fn=<NllLossBackward0>)
tensor(1.5066, grad_fn=<NllLossBackward0>)
tensor(1.5459, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7266/17426 [11:31<15:51, 10.68it/s]

tensor(1.4883, grad_fn=<NllLossBackward0>)
tensor(1.5156, grad_fn=<NllLossBackward0>)
tensor(1.4874, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7268/17426 [11:31<15:57, 10.61it/s]

tensor(1.5384, grad_fn=<NllLossBackward0>)
tensor(1.5194, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7271/17426 [11:31<19:13,  8.81it/s]

tensor(1.4661, grad_fn=<NllLossBackward0>)
tensor(1.4956, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7273/17426 [11:32<20:20,  8.32it/s]

tensor(1.5010, grad_fn=<NllLossBackward0>)
tensor(1.5112, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7275/17426 [11:32<20:50,  8.11it/s]

tensor(1.5324, grad_fn=<NllLossBackward0>)
tensor(1.4962, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7277/17426 [11:32<21:04,  8.03it/s]

tensor(1.5329, grad_fn=<NllLossBackward0>)
tensor(1.5268, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7279/17426 [11:32<22:29,  7.52it/s]

tensor(1.5671, grad_fn=<NllLossBackward0>)
tensor(1.5369, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7281/17426 [11:33<22:22,  7.55it/s]

tensor(1.4930, grad_fn=<NllLossBackward0>)
tensor(1.5215, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7283/17426 [11:33<21:43,  7.78it/s]

tensor(1.4773, grad_fn=<NllLossBackward0>)
tensor(1.5168, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7285/17426 [11:33<22:39,  7.46it/s]

tensor(1.5365, grad_fn=<NllLossBackward0>)
tensor(1.5243, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7287/17426 [11:33<21:55,  7.71it/s]

tensor(1.4961, grad_fn=<NllLossBackward0>)
tensor(1.5109, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7289/17426 [11:34<22:56,  7.37it/s]

tensor(1.5146, grad_fn=<NllLossBackward0>)
tensor(1.4999, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7291/17426 [11:34<23:55,  7.06it/s]

tensor(1.4838, grad_fn=<NllLossBackward0>)
tensor(1.5033, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7293/17426 [11:34<24:03,  7.02it/s]

tensor(1.5290, grad_fn=<NllLossBackward0>)
tensor(1.4857, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7295/17426 [11:35<24:49,  6.80it/s]

tensor(1.5137, grad_fn=<NllLossBackward0>)
tensor(1.5252, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7297/17426 [11:35<25:22,  6.65it/s]

tensor(1.5383, grad_fn=<NllLossBackward0>)
tensor(1.4961, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7300/17426 [11:35<19:27,  8.67it/s]

tensor(1.5618, grad_fn=<NllLossBackward0>)
tensor(1.5158, grad_fn=<NllLossBackward0>)
tensor(1.5200, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7303/17426 [11:35<17:13,  9.80it/s]

tensor(1.5209, grad_fn=<NllLossBackward0>)
tensor(1.5524, grad_fn=<NllLossBackward0>)
tensor(1.5106, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7305/17426 [11:36<17:41,  9.53it/s]

tensor(1.5069, grad_fn=<NllLossBackward0>)
tensor(1.5736, grad_fn=<NllLossBackward0>)
tensor(1.4956, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7309/17426 [11:36<15:54, 10.60it/s]

tensor(1.4648, grad_fn=<NllLossBackward0>)
tensor(1.5480, grad_fn=<NllLossBackward0>)
tensor(1.4853, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7311/17426 [11:36<15:51, 10.63it/s]

tensor(1.5150, grad_fn=<NllLossBackward0>)
tensor(1.5274, grad_fn=<NllLossBackward0>)
tensor(1.5356, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7315/17426 [11:37<15:41, 10.74it/s]

tensor(1.5642, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)
tensor(1.5382, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7317/17426 [11:37<16:00, 10.53it/s]

tensor(1.5224, grad_fn=<NllLossBackward0>)
tensor(1.4633, grad_fn=<NllLossBackward0>)
tensor(1.5159, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7321/17426 [11:37<15:41, 10.73it/s]

tensor(1.5228, grad_fn=<NllLossBackward0>)
tensor(1.5411, grad_fn=<NllLossBackward0>)
tensor(1.5512, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7323/17426 [11:37<15:44, 10.70it/s]

tensor(1.5440, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)
tensor(1.5167, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7327/17426 [11:38<15:43, 10.70it/s]

tensor(1.4712, grad_fn=<NllLossBackward0>)
tensor(1.5365, grad_fn=<NllLossBackward0>)
tensor(1.4736, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7329/17426 [11:38<15:47, 10.66it/s]

tensor(1.5199, grad_fn=<NllLossBackward0>)
tensor(1.4958, grad_fn=<NllLossBackward0>)
tensor(1.5109, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7333/17426 [11:38<15:19, 10.98it/s]

tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.5126, grad_fn=<NllLossBackward0>)
tensor(1.5488, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7335/17426 [11:38<15:29, 10.86it/s]

tensor(1.4862, grad_fn=<NllLossBackward0>)
tensor(1.5296, grad_fn=<NllLossBackward0>)
tensor(1.5796, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7339/17426 [11:39<15:22, 10.94it/s]

tensor(1.5564, grad_fn=<NllLossBackward0>)
tensor(1.5248, grad_fn=<NllLossBackward0>)
tensor(1.4998, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7341/17426 [11:39<15:35, 10.78it/s]

tensor(1.5481, grad_fn=<NllLossBackward0>)
tensor(1.5044, grad_fn=<NllLossBackward0>)
tensor(1.5100, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7345/17426 [11:39<15:26, 10.88it/s]

tensor(1.5533, grad_fn=<NllLossBackward0>)
tensor(1.4860, grad_fn=<NllLossBackward0>)
tensor(1.5406, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7347/17426 [11:40<15:29, 10.84it/s]

tensor(1.4929, grad_fn=<NllLossBackward0>)
tensor(1.5330, grad_fn=<NllLossBackward0>)
tensor(1.5012, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7349/17426 [11:40<15:56, 10.53it/s]

tensor(1.5576, grad_fn=<NllLossBackward0>)
tensor(1.5329, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7353/17426 [11:40<15:59, 10.50it/s]

tensor(1.5531, grad_fn=<NllLossBackward0>)
tensor(1.5188, grad_fn=<NllLossBackward0>)
tensor(1.5302, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7355/17426 [11:40<15:46, 10.64it/s]

tensor(1.5206, grad_fn=<NllLossBackward0>)
tensor(1.5107, grad_fn=<NllLossBackward0>)
tensor(1.4656, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7359/17426 [11:41<15:37, 10.74it/s]

tensor(1.4755, grad_fn=<NllLossBackward0>)
tensor(1.5474, grad_fn=<NllLossBackward0>)
tensor(1.5347, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7361/17426 [11:41<15:44, 10.65it/s]

tensor(1.5011, grad_fn=<NllLossBackward0>)
tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.5389, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7365/17426 [11:41<15:14, 11.00it/s]

tensor(1.5446, grad_fn=<NllLossBackward0>)
tensor(1.5481, grad_fn=<NllLossBackward0>)
tensor(1.5113, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7367/17426 [11:41<15:30, 10.81it/s]

tensor(1.5366, grad_fn=<NllLossBackward0>)
tensor(1.4823, grad_fn=<NllLossBackward0>)
tensor(1.4849, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7371/17426 [11:42<15:36, 10.74it/s]

tensor(1.5423, grad_fn=<NllLossBackward0>)
tensor(1.5131, grad_fn=<NllLossBackward0>)
tensor(1.5181, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7373/17426 [11:42<15:26, 10.85it/s]

tensor(1.5165, grad_fn=<NllLossBackward0>)
tensor(1.5334, grad_fn=<NllLossBackward0>)
tensor(1.4924, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7377/17426 [11:42<14:58, 11.18it/s]

tensor(1.5242, grad_fn=<NllLossBackward0>)
tensor(1.5041, grad_fn=<NllLossBackward0>)
tensor(1.5236, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7379/17426 [11:43<15:12, 11.01it/s]

tensor(1.5243, grad_fn=<NllLossBackward0>)
tensor(1.5168, grad_fn=<NllLossBackward0>)
tensor(1.5634, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7383/17426 [11:43<15:29, 10.81it/s]

tensor(1.4791, grad_fn=<NllLossBackward0>)
tensor(1.5234, grad_fn=<NllLossBackward0>)
tensor(1.5631, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7385/17426 [11:43<15:36, 10.73it/s]

tensor(1.5085, grad_fn=<NllLossBackward0>)
tensor(1.5217, grad_fn=<NllLossBackward0>)
tensor(1.5316, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7389/17426 [11:43<15:18, 10.92it/s]

tensor(1.5303, grad_fn=<NllLossBackward0>)
tensor(1.5048, grad_fn=<NllLossBackward0>)
tensor(1.4787, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7391/17426 [11:44<15:24, 10.86it/s]

tensor(1.5495, grad_fn=<NllLossBackward0>)
tensor(1.5154, grad_fn=<NllLossBackward0>)
tensor(1.5236, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7395/17426 [11:44<15:24, 10.85it/s]

tensor(1.4985, grad_fn=<NllLossBackward0>)
tensor(1.5536, grad_fn=<NllLossBackward0>)
tensor(1.5271, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7397/17426 [11:44<15:21, 10.88it/s]

tensor(1.5250, grad_fn=<NllLossBackward0>)
tensor(1.5410, grad_fn=<NllLossBackward0>)
tensor(1.5400, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7401/17426 [11:45<15:03, 11.10it/s]

tensor(1.5379, grad_fn=<NllLossBackward0>)
tensor(1.5154, grad_fn=<NllLossBackward0>)
tensor(1.5250, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7403/17426 [11:45<15:28, 10.79it/s]

tensor(1.4791, grad_fn=<NllLossBackward0>)
tensor(1.5327, grad_fn=<NllLossBackward0>)
tensor(1.4890, grad_fn=<NllLossBackward0>)


 42%|████▏     | 7405/17426 [11:45<16:58,  9.84it/s]

tensor(1.4977, grad_fn=<NllLossBackward0>)
tensor(1.5217, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7408/17426 [11:45<20:09,  8.28it/s]

tensor(1.5138, grad_fn=<NllLossBackward0>)
tensor(1.5446, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7410/17426 [11:46<20:46,  8.04it/s]

tensor(1.5026, grad_fn=<NllLossBackward0>)
tensor(1.5210, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7412/17426 [11:46<21:10,  7.88it/s]

tensor(1.4895, grad_fn=<NllLossBackward0>)
tensor(1.4975, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7414/17426 [11:46<20:45,  8.04it/s]

tensor(1.5396, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7416/17426 [11:46<20:19,  8.21it/s]

tensor(1.5020, grad_fn=<NllLossBackward0>)
tensor(1.5136, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7418/17426 [11:47<20:05,  8.30it/s]

tensor(1.5527, grad_fn=<NllLossBackward0>)
tensor(1.4741, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7420/17426 [11:47<19:55,  8.37it/s]

tensor(1.5078, grad_fn=<NllLossBackward0>)
tensor(1.5486, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7422/17426 [11:47<22:32,  7.39it/s]

tensor(1.5518, grad_fn=<NllLossBackward0>)
tensor(1.5195, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7424/17426 [11:48<23:32,  7.08it/s]

tensor(1.4930, grad_fn=<NllLossBackward0>)
tensor(1.5216, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7426/17426 [11:48<23:53,  6.98it/s]

tensor(1.4705, grad_fn=<NllLossBackward0>)
tensor(1.5307, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7428/17426 [11:48<23:37,  7.05it/s]

tensor(1.5900, grad_fn=<NllLossBackward0>)
tensor(1.5078, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7430/17426 [11:48<24:50,  6.71it/s]

tensor(1.4709, grad_fn=<NllLossBackward0>)
tensor(1.5189, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7432/17426 [11:49<24:34,  6.78it/s]

tensor(1.4773, grad_fn=<NllLossBackward0>)
tensor(1.4816, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7435/17426 [11:49<20:03,  8.30it/s]

tensor(1.5279, grad_fn=<NllLossBackward0>)
tensor(1.4957, grad_fn=<NllLossBackward0>)
tensor(1.5581, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7438/17426 [11:49<17:42,  9.40it/s]

tensor(1.5144, grad_fn=<NllLossBackward0>)
tensor(1.4980, grad_fn=<NllLossBackward0>)
tensor(1.5569, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7441/17426 [11:50<16:30, 10.08it/s]

tensor(1.4932, grad_fn=<NllLossBackward0>)
tensor(1.4995, grad_fn=<NllLossBackward0>)
tensor(1.4985, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7444/17426 [11:50<16:05, 10.33it/s]

tensor(1.4857, grad_fn=<NllLossBackward0>)
tensor(1.5022, grad_fn=<NllLossBackward0>)
tensor(1.4638, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7446/17426 [11:50<15:58, 10.42it/s]

tensor(1.5351, grad_fn=<NllLossBackward0>)
tensor(1.5371, grad_fn=<NllLossBackward0>)
tensor(1.4704, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7450/17426 [11:50<15:32, 10.70it/s]

tensor(1.5557, grad_fn=<NllLossBackward0>)
tensor(1.5071, grad_fn=<NllLossBackward0>)
tensor(1.4986, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7452/17426 [11:51<15:38, 10.63it/s]

tensor(1.5000, grad_fn=<NllLossBackward0>)
tensor(1.4906, grad_fn=<NllLossBackward0>)
tensor(1.5432, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7456/17426 [11:51<15:06, 11.00it/s]

tensor(1.5570, grad_fn=<NllLossBackward0>)
tensor(1.5078, grad_fn=<NllLossBackward0>)
tensor(1.5067, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7458/17426 [11:51<15:31, 10.70it/s]

tensor(1.5121, grad_fn=<NllLossBackward0>)
tensor(1.4838, grad_fn=<NllLossBackward0>)
tensor(1.5306, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7462/17426 [11:52<15:10, 10.95it/s]

tensor(1.5031, grad_fn=<NllLossBackward0>)
tensor(1.4791, grad_fn=<NllLossBackward0>)
tensor(1.4946, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7464/17426 [11:52<15:16, 10.87it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.5082, grad_fn=<NllLossBackward0>)
tensor(1.4822, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7468/17426 [11:52<15:05, 10.99it/s]

tensor(1.4943, grad_fn=<NllLossBackward0>)
tensor(1.4872, grad_fn=<NllLossBackward0>)
tensor(1.5262, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7470/17426 [11:52<15:41, 10.58it/s]

tensor(1.5030, grad_fn=<NllLossBackward0>)
tensor(1.4947, grad_fn=<NllLossBackward0>)
tensor(1.4899, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7474/17426 [11:53<15:09, 10.94it/s]

tensor(1.5151, grad_fn=<NllLossBackward0>)
tensor(1.4967, grad_fn=<NllLossBackward0>)
tensor(1.4948, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7476/17426 [11:53<15:15, 10.87it/s]

tensor(1.4967, grad_fn=<NllLossBackward0>)
tensor(1.4805, grad_fn=<NllLossBackward0>)
tensor(1.5182, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7480/17426 [11:53<14:55, 11.11it/s]

tensor(1.5208, grad_fn=<NllLossBackward0>)
tensor(1.5094, grad_fn=<NllLossBackward0>)
tensor(1.4991, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7482/17426 [11:53<15:34, 10.64it/s]

tensor(1.5380, grad_fn=<NllLossBackward0>)
tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.5050, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7486/17426 [11:54<15:10, 10.91it/s]

tensor(1.4868, grad_fn=<NllLossBackward0>)
tensor(1.5397, grad_fn=<NllLossBackward0>)
tensor(1.5060, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7488/17426 [11:54<15:14, 10.86it/s]

tensor(1.4804, grad_fn=<NllLossBackward0>)
tensor(1.4676, grad_fn=<NllLossBackward0>)
tensor(1.5385, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7492/17426 [11:54<15:21, 10.78it/s]

tensor(1.4933, grad_fn=<NllLossBackward0>)
tensor(1.5195, grad_fn=<NllLossBackward0>)
tensor(1.4836, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7494/17426 [11:55<15:31, 10.66it/s]

tensor(1.5055, grad_fn=<NllLossBackward0>)
tensor(1.5275, grad_fn=<NllLossBackward0>)
tensor(1.5596, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7498/17426 [11:55<15:13, 10.87it/s]

tensor(1.5242, grad_fn=<NllLossBackward0>)
tensor(1.5120, grad_fn=<NllLossBackward0>)
tensor(1.4659, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7500/17426 [11:55<15:11, 10.89it/s]

tensor(1.4630, grad_fn=<NllLossBackward0>)
tensor(1.5067, grad_fn=<NllLossBackward0>)
tensor(1.5112, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7504/17426 [11:55<15:26, 10.70it/s]

tensor(1.5727, grad_fn=<NllLossBackward0>)
tensor(1.5014, grad_fn=<NllLossBackward0>)
tensor(1.4843, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7506/17426 [11:56<15:30, 10.66it/s]

tensor(1.5119, grad_fn=<NllLossBackward0>)
tensor(1.5386, grad_fn=<NllLossBackward0>)
tensor(1.5541, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7510/17426 [11:56<15:12, 10.86it/s]

tensor(1.4975, grad_fn=<NllLossBackward0>)
tensor(1.4933, grad_fn=<NllLossBackward0>)
tensor(1.5392, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7512/17426 [11:56<15:23, 10.73it/s]

tensor(1.5300, grad_fn=<NllLossBackward0>)
tensor(1.4900, grad_fn=<NllLossBackward0>)
tensor(1.5155, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7516/17426 [11:57<15:12, 10.86it/s]

tensor(1.5529, grad_fn=<NllLossBackward0>)
tensor(1.5346, grad_fn=<NllLossBackward0>)
tensor(1.4865, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7518/17426 [11:57<15:25, 10.70it/s]

tensor(1.4857, grad_fn=<NllLossBackward0>)
tensor(1.4717, grad_fn=<NllLossBackward0>)
tensor(1.5418, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7522/17426 [11:57<15:13, 10.84it/s]

tensor(1.5263, grad_fn=<NllLossBackward0>)
tensor(1.5012, grad_fn=<NllLossBackward0>)
tensor(1.5319, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7524/17426 [11:57<15:28, 10.66it/s]

tensor(1.4773, grad_fn=<NllLossBackward0>)
tensor(1.5030, grad_fn=<NllLossBackward0>)
tensor(1.5030, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7528/17426 [11:58<15:12, 10.85it/s]

tensor(1.5522, grad_fn=<NllLossBackward0>)
tensor(1.5433, grad_fn=<NllLossBackward0>)
tensor(1.4884, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7530/17426 [11:58<15:14, 10.82it/s]

tensor(1.5285, grad_fn=<NllLossBackward0>)
tensor(1.5126, grad_fn=<NllLossBackward0>)
tensor(1.5069, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7534/17426 [11:58<14:59, 10.99it/s]

tensor(1.4938, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)
tensor(1.5649, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7536/17426 [11:58<15:17, 10.78it/s]

tensor(1.4955, grad_fn=<NllLossBackward0>)
tensor(1.4719, grad_fn=<NllLossBackward0>)
tensor(1.5123, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7540/17426 [11:59<15:04, 10.94it/s]

tensor(1.5176, grad_fn=<NllLossBackward0>)
tensor(1.5201, grad_fn=<NllLossBackward0>)
tensor(1.5089, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7542/17426 [11:59<16:56,  9.72it/s]

tensor(1.4879, grad_fn=<NllLossBackward0>)
tensor(1.5003, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7544/17426 [11:59<18:14,  9.03it/s]

tensor(1.5153, grad_fn=<NllLossBackward0>)
tensor(1.5339, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7546/17426 [12:00<20:05,  8.20it/s]

tensor(1.4429, grad_fn=<NllLossBackward0>)
tensor(1.5172, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7548/17426 [12:00<20:12,  8.15it/s]

tensor(1.4940, grad_fn=<NllLossBackward0>)
tensor(1.4929, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7550/17426 [12:00<20:11,  8.15it/s]

tensor(1.4762, grad_fn=<NllLossBackward0>)
tensor(1.4897, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7552/17426 [12:00<20:14,  8.13it/s]

tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.4918, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7554/17426 [12:01<21:31,  7.65it/s]

tensor(1.4950, grad_fn=<NllLossBackward0>)
tensor(1.5289, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7556/17426 [12:01<20:05,  8.19it/s]

tensor(1.5164, grad_fn=<NllLossBackward0>)
tensor(1.5027, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7558/17426 [12:01<19:46,  8.32it/s]

tensor(1.5072, grad_fn=<NllLossBackward0>)
tensor(1.5349, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7560/17426 [12:01<19:34,  8.40it/s]

tensor(1.5323, grad_fn=<NllLossBackward0>)
tensor(1.5718, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7562/17426 [12:02<21:35,  7.61it/s]

tensor(1.5226, grad_fn=<NllLossBackward0>)
tensor(1.4597, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7564/17426 [12:02<23:50,  6.89it/s]

tensor(1.4689, grad_fn=<NllLossBackward0>)
tensor(1.5292, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7566/17426 [12:02<23:33,  6.97it/s]

tensor(1.5254, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7568/17426 [12:03<23:27,  7.00it/s]

tensor(1.4695, grad_fn=<NllLossBackward0>)
tensor(1.5423, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7570/17426 [12:03<24:35,  6.68it/s]

tensor(1.5751, grad_fn=<NllLossBackward0>)
tensor(1.5494, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7572/17426 [12:03<23:52,  6.88it/s]

tensor(1.5276, grad_fn=<NllLossBackward0>)
tensor(1.5198, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7575/17426 [12:03<18:40,  8.79it/s]

tensor(1.5149, grad_fn=<NllLossBackward0>)
tensor(1.5122, grad_fn=<NllLossBackward0>)
tensor(1.5317, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7578/17426 [12:04<16:47,  9.77it/s]

tensor(1.5248, grad_fn=<NllLossBackward0>)
tensor(1.5504, grad_fn=<NllLossBackward0>)
tensor(1.5309, grad_fn=<NllLossBackward0>)


 43%|████▎     | 7580/17426 [12:04<17:09,  9.57it/s]

tensor(1.5304, grad_fn=<NllLossBackward0>)
tensor(1.5369, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7582/17426 [12:04<17:25,  9.42it/s]

tensor(1.5215, grad_fn=<NllLossBackward0>)
tensor(1.5699, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7585/17426 [12:04<16:00, 10.24it/s]

tensor(1.4704, grad_fn=<NllLossBackward0>)
tensor(1.5596, grad_fn=<NllLossBackward0>)
tensor(1.5417, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7587/17426 [12:05<15:44, 10.41it/s]

tensor(1.4792, grad_fn=<NllLossBackward0>)
tensor(1.5485, grad_fn=<NllLossBackward0>)
tensor(1.5381, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7591/17426 [12:05<15:27, 10.61it/s]

tensor(1.5346, grad_fn=<NllLossBackward0>)
tensor(1.5796, grad_fn=<NllLossBackward0>)
tensor(1.5403, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7593/17426 [12:05<15:28, 10.59it/s]

tensor(1.5368, grad_fn=<NllLossBackward0>)
tensor(1.5042, grad_fn=<NllLossBackward0>)
tensor(1.5282, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7597/17426 [12:06<15:05, 10.85it/s]

tensor(1.4902, grad_fn=<NllLossBackward0>)
tensor(1.5324, grad_fn=<NllLossBackward0>)
tensor(1.5685, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7599/17426 [12:06<15:20, 10.67it/s]

tensor(1.5160, grad_fn=<NllLossBackward0>)
tensor(1.5211, grad_fn=<NllLossBackward0>)
tensor(1.5146, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7603/17426 [12:06<15:16, 10.71it/s]

tensor(1.5234, grad_fn=<NllLossBackward0>)
tensor(1.5376, grad_fn=<NllLossBackward0>)
tensor(1.5179, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7605/17426 [12:06<15:36, 10.49it/s]

tensor(1.4663, grad_fn=<NllLossBackward0>)
tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5050, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7609/17426 [12:07<15:15, 10.73it/s]

tensor(1.5142, grad_fn=<NllLossBackward0>)
tensor(1.4907, grad_fn=<NllLossBackward0>)
tensor(1.5163, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7611/17426 [12:07<15:21, 10.65it/s]

tensor(1.5503, grad_fn=<NllLossBackward0>)
tensor(1.5345, grad_fn=<NllLossBackward0>)
tensor(1.5384, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7615/17426 [12:07<15:07, 10.81it/s]

tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.4796, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7617/17426 [12:07<15:20, 10.66it/s]

tensor(1.5262, grad_fn=<NllLossBackward0>)
tensor(1.5178, grad_fn=<NllLossBackward0>)
tensor(1.4855, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7621/17426 [12:08<14:58, 10.92it/s]

tensor(1.5047, grad_fn=<NllLossBackward0>)
tensor(1.5456, grad_fn=<NllLossBackward0>)
tensor(1.4884, grad_fn=<NllLossBackward0>)


 44%|████▎     | 7623/17426 [12:08<15:15, 10.71it/s]

tensor(1.5473, grad_fn=<NllLossBackward0>)
tensor(1.5001, grad_fn=<NllLossBackward0>)
tensor(1.5250, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7627/17426 [12:08<15:17, 10.68it/s]

tensor(1.4609, grad_fn=<NllLossBackward0>)
tensor(1.5120, grad_fn=<NllLossBackward0>)
tensor(1.5051, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7629/17426 [12:09<15:21, 10.64it/s]

tensor(1.5539, grad_fn=<NllLossBackward0>)
tensor(1.5229, grad_fn=<NllLossBackward0>)
tensor(1.5421, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7633/17426 [12:09<15:00, 10.88it/s]

tensor(1.4680, grad_fn=<NllLossBackward0>)
tensor(1.5364, grad_fn=<NllLossBackward0>)
tensor(1.4928, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7635/17426 [12:09<15:30, 10.52it/s]

tensor(1.4988, grad_fn=<NllLossBackward0>)
tensor(1.4499, grad_fn=<NllLossBackward0>)
tensor(1.4789, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7639/17426 [12:09<15:11, 10.74it/s]

tensor(1.5280, grad_fn=<NllLossBackward0>)
tensor(1.5112, grad_fn=<NllLossBackward0>)
tensor(1.4995, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7641/17426 [12:10<15:17, 10.67it/s]

tensor(1.4823, grad_fn=<NllLossBackward0>)
tensor(1.4775, grad_fn=<NllLossBackward0>)
tensor(1.5256, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7643/17426 [12:10<15:25, 10.57it/s]

tensor(1.4970, grad_fn=<NllLossBackward0>)
tensor(1.4597, grad_fn=<NllLossBackward0>)
tensor(1.5121, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7647/17426 [12:10<16:07, 10.10it/s]

tensor(1.5075, grad_fn=<NllLossBackward0>)
tensor(1.5254, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7651/17426 [12:11<15:22, 10.60it/s]

tensor(1.5161, grad_fn=<NllLossBackward0>)
tensor(1.5467, grad_fn=<NllLossBackward0>)
tensor(1.5244, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7653/17426 [12:11<15:24, 10.57it/s]

tensor(1.4615, grad_fn=<NllLossBackward0>)
tensor(1.5008, grad_fn=<NllLossBackward0>)
tensor(1.5627, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7657/17426 [12:11<15:25, 10.55it/s]

tensor(1.4868, grad_fn=<NllLossBackward0>)
tensor(1.4918, grad_fn=<NllLossBackward0>)
tensor(1.5229, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7659/17426 [12:11<15:26, 10.54it/s]

tensor(1.5291, grad_fn=<NllLossBackward0>)
tensor(1.4954, grad_fn=<NllLossBackward0>)
tensor(1.5017, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7663/17426 [12:12<15:00, 10.84it/s]

tensor(1.5416, grad_fn=<NllLossBackward0>)
tensor(1.5007, grad_fn=<NllLossBackward0>)
tensor(1.5266, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7665/17426 [12:12<15:05, 10.78it/s]

tensor(1.5002, grad_fn=<NllLossBackward0>)
tensor(1.4948, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7669/17426 [12:12<15:15, 10.66it/s]

tensor(1.5445, grad_fn=<NllLossBackward0>)
tensor(1.5206, grad_fn=<NllLossBackward0>)
tensor(1.4985, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7671/17426 [12:13<15:17, 10.63it/s]

tensor(1.4730, grad_fn=<NllLossBackward0>)
tensor(1.5140, grad_fn=<NllLossBackward0>)
tensor(1.4932, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7675/17426 [12:13<14:46, 11.00it/s]

tensor(1.5168, grad_fn=<NllLossBackward0>)
tensor(1.4952, grad_fn=<NllLossBackward0>)
tensor(1.4964, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7677/17426 [12:13<15:04, 10.78it/s]

tensor(1.4815, grad_fn=<NllLossBackward0>)
tensor(1.4913, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7679/17426 [12:13<17:43,  9.16it/s]

tensor(1.5630, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7681/17426 [12:14<19:17,  8.42it/s]

tensor(1.5332, grad_fn=<NllLossBackward0>)
tensor(1.5530, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7683/17426 [12:14<20:23,  7.97it/s]

tensor(1.5384, grad_fn=<NllLossBackward0>)
tensor(1.5251, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7685/17426 [12:14<21:19,  7.61it/s]

tensor(1.5141, grad_fn=<NllLossBackward0>)
tensor(1.5385, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7687/17426 [12:14<21:59,  7.38it/s]

tensor(1.5337, grad_fn=<NllLossBackward0>)
tensor(1.4673, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7689/17426 [12:15<22:51,  7.10it/s]

tensor(1.5081, grad_fn=<NllLossBackward0>)
tensor(1.4900, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7691/17426 [12:15<22:52,  7.09it/s]

tensor(1.4877, grad_fn=<NllLossBackward0>)
tensor(1.5256, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7693/17426 [12:15<20:47,  7.80it/s]

tensor(1.5234, grad_fn=<NllLossBackward0>)
tensor(1.5529, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7695/17426 [12:16<21:47,  7.45it/s]

tensor(1.5237, grad_fn=<NllLossBackward0>)
tensor(1.4676, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7697/17426 [12:16<22:50,  7.10it/s]

tensor(1.5195, grad_fn=<NllLossBackward0>)
tensor(1.5153, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7699/17426 [12:16<22:57,  7.06it/s]

tensor(1.5005, grad_fn=<NllLossBackward0>)
tensor(1.5576, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7701/17426 [12:16<23:29,  6.90it/s]

tensor(1.5097, grad_fn=<NllLossBackward0>)
tensor(1.5691, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7703/17426 [12:17<22:43,  7.13it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5176, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7705/17426 [12:17<23:28,  6.90it/s]

tensor(1.4754, grad_fn=<NllLossBackward0>)
tensor(1.4755, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7708/17426 [12:17<18:36,  8.70it/s]

tensor(1.5152, grad_fn=<NllLossBackward0>)
tensor(1.5301, grad_fn=<NllLossBackward0>)
tensor(1.5220, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7711/17426 [12:18<17:12,  9.41it/s]

tensor(1.5400, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)
tensor(1.5513, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7714/17426 [12:18<16:14,  9.97it/s]

tensor(1.5087, grad_fn=<NllLossBackward0>)
tensor(1.4970, grad_fn=<NllLossBackward0>)
tensor(1.4706, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7717/17426 [12:18<15:50, 10.21it/s]

tensor(1.4712, grad_fn=<NllLossBackward0>)
tensor(1.5255, grad_fn=<NllLossBackward0>)
tensor(1.5259, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7720/17426 [12:18<15:21, 10.53it/s]

tensor(1.4830, grad_fn=<NllLossBackward0>)
tensor(1.5487, grad_fn=<NllLossBackward0>)
tensor(1.4766, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7722/17426 [12:19<15:48, 10.23it/s]

tensor(1.5800, grad_fn=<NllLossBackward0>)
tensor(1.5293, grad_fn=<NllLossBackward0>)
tensor(1.5686, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7726/17426 [12:19<15:05, 10.71it/s]

tensor(1.5073, grad_fn=<NllLossBackward0>)
tensor(1.4938, grad_fn=<NllLossBackward0>)
tensor(1.5033, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7728/17426 [12:19<15:06, 10.70it/s]

tensor(1.5110, grad_fn=<NllLossBackward0>)
tensor(1.5798, grad_fn=<NllLossBackward0>)
tensor(1.4922, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7732/17426 [12:20<14:58, 10.79it/s]

tensor(1.5515, grad_fn=<NllLossBackward0>)
tensor(1.5001, grad_fn=<NllLossBackward0>)
tensor(1.5465, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7734/17426 [12:20<15:33, 10.38it/s]

tensor(1.5192, grad_fn=<NllLossBackward0>)
tensor(1.5182, grad_fn=<NllLossBackward0>)
tensor(1.5135, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7738/17426 [12:20<14:56, 10.80it/s]

tensor(1.5052, grad_fn=<NllLossBackward0>)
tensor(1.5371, grad_fn=<NllLossBackward0>)
tensor(1.5206, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7740/17426 [12:20<15:04, 10.71it/s]

tensor(1.4987, grad_fn=<NllLossBackward0>)
tensor(1.5271, grad_fn=<NllLossBackward0>)
tensor(1.4761, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7744/17426 [12:21<15:04, 10.70it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.4753, grad_fn=<NllLossBackward0>)
tensor(1.4629, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7746/17426 [12:21<15:14, 10.59it/s]

tensor(1.4730, grad_fn=<NllLossBackward0>)
tensor(1.5356, grad_fn=<NllLossBackward0>)
tensor(1.5001, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7750/17426 [12:21<14:49, 10.88it/s]

tensor(1.5194, grad_fn=<NllLossBackward0>)
tensor(1.5265, grad_fn=<NllLossBackward0>)
tensor(1.5023, grad_fn=<NllLossBackward0>)


 44%|████▍     | 7752/17426 [12:21<14:49, 10.88it/s]

tensor(1.5442, grad_fn=<NllLossBackward0>)
tensor(1.5193, grad_fn=<NllLossBackward0>)
tensor(1.5291, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7756/17426 [12:22<14:59, 10.75it/s]

tensor(1.5128, grad_fn=<NllLossBackward0>)
tensor(1.4850, grad_fn=<NllLossBackward0>)
tensor(1.5635, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7758/17426 [12:22<15:05, 10.68it/s]

tensor(1.5319, grad_fn=<NllLossBackward0>)
tensor(1.5132, grad_fn=<NllLossBackward0>)
tensor(1.5159, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7762/17426 [12:22<14:57, 10.77it/s]

tensor(1.5264, grad_fn=<NllLossBackward0>)
tensor(1.5157, grad_fn=<NllLossBackward0>)
tensor(1.4935, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7764/17426 [12:23<14:57, 10.76it/s]

tensor(1.4874, grad_fn=<NllLossBackward0>)
tensor(1.4810, grad_fn=<NllLossBackward0>)
tensor(1.4837, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7768/17426 [12:23<14:51, 10.84it/s]

tensor(1.5400, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)
tensor(1.5171, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7770/17426 [12:23<15:02, 10.70it/s]

tensor(1.5638, grad_fn=<NllLossBackward0>)
tensor(1.5048, grad_fn=<NllLossBackward0>)
tensor(1.4615, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7774/17426 [12:24<14:46, 10.89it/s]

tensor(1.4965, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)
tensor(1.5192, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7776/17426 [12:24<14:51, 10.82it/s]

tensor(1.5240, grad_fn=<NllLossBackward0>)
tensor(1.4957, grad_fn=<NllLossBackward0>)
tensor(1.5092, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7780/17426 [12:24<14:52, 10.81it/s]

tensor(1.4860, grad_fn=<NllLossBackward0>)
tensor(1.4846, grad_fn=<NllLossBackward0>)
tensor(1.4612, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7782/17426 [12:24<15:08, 10.62it/s]

tensor(1.5082, grad_fn=<NllLossBackward0>)
tensor(1.5007, grad_fn=<NllLossBackward0>)
tensor(1.5193, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7786/17426 [12:25<14:44, 10.90it/s]

tensor(1.5073, grad_fn=<NllLossBackward0>)
tensor(1.5125, grad_fn=<NllLossBackward0>)
tensor(1.4845, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7788/17426 [12:25<15:03, 10.66it/s]

tensor(1.5033, grad_fn=<NllLossBackward0>)
tensor(1.5260, grad_fn=<NllLossBackward0>)
tensor(1.5387, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7792/17426 [12:25<14:34, 11.02it/s]

tensor(1.5495, grad_fn=<NllLossBackward0>)
tensor(1.5547, grad_fn=<NllLossBackward0>)
tensor(1.5135, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7794/17426 [12:25<14:53, 10.78it/s]

tensor(1.4517, grad_fn=<NllLossBackward0>)
tensor(1.5007, grad_fn=<NllLossBackward0>)
tensor(1.4716, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7798/17426 [12:26<14:39, 10.95it/s]

tensor(1.5279, grad_fn=<NllLossBackward0>)
tensor(1.5116, grad_fn=<NllLossBackward0>)
tensor(1.5242, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7800/17426 [12:26<15:18, 10.47it/s]

tensor(1.5681, grad_fn=<NllLossBackward0>)
tensor(1.5329, grad_fn=<NllLossBackward0>)
tensor(1.5231, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7804/17426 [12:26<14:50, 10.81it/s]

tensor(1.5655, grad_fn=<NllLossBackward0>)
tensor(1.5287, grad_fn=<NllLossBackward0>)
tensor(1.5306, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7806/17426 [12:27<14:56, 10.73it/s]

tensor(1.5378, grad_fn=<NllLossBackward0>)
tensor(1.5063, grad_fn=<NllLossBackward0>)
tensor(1.5297, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7810/17426 [12:27<14:50, 10.80it/s]

tensor(1.4842, grad_fn=<NllLossBackward0>)
tensor(1.4998, grad_fn=<NllLossBackward0>)
tensor(1.5452, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7812/17426 [12:27<16:00, 10.01it/s]

tensor(1.5408, grad_fn=<NllLossBackward0>)
tensor(1.4725, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7814/17426 [12:27<17:28,  9.17it/s]

tensor(1.5505, grad_fn=<NllLossBackward0>)
tensor(1.5733, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7816/17426 [12:28<19:18,  8.29it/s]

tensor(1.5259, grad_fn=<NllLossBackward0>)
tensor(1.4775, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7818/17426 [12:28<19:12,  8.34it/s]

tensor(1.4966, grad_fn=<NllLossBackward0>)
tensor(1.5297, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7820/17426 [12:28<21:02,  7.61it/s]

tensor(1.5485, grad_fn=<NllLossBackward0>)
tensor(1.5126, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7822/17426 [12:28<20:27,  7.82it/s]

tensor(1.5410, grad_fn=<NllLossBackward0>)
tensor(1.5224, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7824/17426 [12:29<21:27,  7.46it/s]

tensor(1.5550, grad_fn=<NllLossBackward0>)
tensor(1.5492, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7826/17426 [12:29<20:52,  7.67it/s]

tensor(1.5107, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7828/17426 [12:29<20:42,  7.73it/s]

tensor(1.5052, grad_fn=<NllLossBackward0>)
tensor(1.4940, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7830/17426 [12:29<20:00,  7.99it/s]

tensor(1.5506, grad_fn=<NllLossBackward0>)
tensor(1.4995, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7832/17426 [12:30<20:37,  7.75it/s]

tensor(1.4969, grad_fn=<NllLossBackward0>)
tensor(1.5182, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7834/17426 [12:30<21:49,  7.33it/s]

tensor(1.5194, grad_fn=<NllLossBackward0>)
tensor(1.5442, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7836/17426 [12:30<23:30,  6.80it/s]

tensor(1.5400, grad_fn=<NllLossBackward0>)
tensor(1.5171, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7838/17426 [12:31<23:45,  6.73it/s]

tensor(1.5132, grad_fn=<NllLossBackward0>)
tensor(1.5132, grad_fn=<NllLossBackward0>)


 45%|████▍     | 7840/17426 [12:31<23:37,  6.76it/s]

tensor(1.4555, grad_fn=<NllLossBackward0>)
tensor(1.4985, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7842/17426 [12:31<22:55,  6.97it/s]

tensor(1.5107, grad_fn=<NllLossBackward0>)
tensor(1.4927, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7845/17426 [12:32<18:14,  8.76it/s]

tensor(1.5106, grad_fn=<NllLossBackward0>)
tensor(1.5465, grad_fn=<NllLossBackward0>)
tensor(1.4964, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7848/17426 [12:32<16:29,  9.68it/s]

tensor(1.4834, grad_fn=<NllLossBackward0>)
tensor(1.5576, grad_fn=<NllLossBackward0>)
tensor(1.5598, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7851/17426 [12:32<15:31, 10.28it/s]

tensor(1.5116, grad_fn=<NllLossBackward0>)
tensor(1.4511, grad_fn=<NllLossBackward0>)
tensor(1.4911, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7853/17426 [12:32<15:56, 10.01it/s]

tensor(1.5436, grad_fn=<NllLossBackward0>)
tensor(1.4775, grad_fn=<NllLossBackward0>)
tensor(1.5150, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7857/17426 [12:33<14:57, 10.67it/s]

tensor(1.5489, grad_fn=<NllLossBackward0>)
tensor(1.5231, grad_fn=<NllLossBackward0>)
tensor(1.5615, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7859/17426 [12:33<15:04, 10.58it/s]

tensor(1.5277, grad_fn=<NllLossBackward0>)
tensor(1.5931, grad_fn=<NllLossBackward0>)
tensor(1.5367, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7863/17426 [12:33<14:38, 10.88it/s]

tensor(1.5117, grad_fn=<NllLossBackward0>)
tensor(1.4603, grad_fn=<NllLossBackward0>)
tensor(1.4789, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7865/17426 [12:33<15:09, 10.51it/s]

tensor(1.5486, grad_fn=<NllLossBackward0>)
tensor(1.5129, grad_fn=<NllLossBackward0>)
tensor(1.5505, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7869/17426 [12:34<14:57, 10.65it/s]

tensor(1.5028, grad_fn=<NllLossBackward0>)
tensor(1.5321, grad_fn=<NllLossBackward0>)
tensor(1.5353, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7871/17426 [12:34<14:55, 10.67it/s]

tensor(1.5223, grad_fn=<NllLossBackward0>)
tensor(1.5012, grad_fn=<NllLossBackward0>)
tensor(1.4983, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7875/17426 [12:34<14:55, 10.66it/s]

tensor(1.4787, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)
tensor(1.5560, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7877/17426 [12:35<14:58, 10.63it/s]

tensor(1.5050, grad_fn=<NllLossBackward0>)
tensor(1.5410, grad_fn=<NllLossBackward0>)
tensor(1.5710, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7881/17426 [12:35<14:38, 10.87it/s]

tensor(1.6017, grad_fn=<NllLossBackward0>)
tensor(1.5111, grad_fn=<NllLossBackward0>)
tensor(1.5199, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7883/17426 [12:35<14:40, 10.84it/s]

tensor(1.5176, grad_fn=<NllLossBackward0>)
tensor(1.5146, grad_fn=<NllLossBackward0>)
tensor(1.5418, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7887/17426 [12:36<14:53, 10.67it/s]

tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.5370, grad_fn=<NllLossBackward0>)
tensor(1.4991, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7889/17426 [12:36<15:01, 10.58it/s]

tensor(1.5061, grad_fn=<NllLossBackward0>)
tensor(1.5360, grad_fn=<NllLossBackward0>)
tensor(1.5247, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7893/17426 [12:36<14:42, 10.80it/s]

tensor(1.5311, grad_fn=<NllLossBackward0>)
tensor(1.5273, grad_fn=<NllLossBackward0>)
tensor(1.4876, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7895/17426 [12:36<14:52, 10.68it/s]

tensor(1.5566, grad_fn=<NllLossBackward0>)
tensor(1.5007, grad_fn=<NllLossBackward0>)
tensor(1.4715, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7899/17426 [12:37<14:44, 10.77it/s]

tensor(1.5268, grad_fn=<NllLossBackward0>)
tensor(1.4940, grad_fn=<NllLossBackward0>)
tensor(1.5245, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7901/17426 [12:37<14:56, 10.62it/s]

tensor(1.5105, grad_fn=<NllLossBackward0>)
tensor(1.5512, grad_fn=<NllLossBackward0>)
tensor(1.5168, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7905/17426 [12:37<14:33, 10.90it/s]

tensor(1.5292, grad_fn=<NllLossBackward0>)
tensor(1.4842, grad_fn=<NllLossBackward0>)
tensor(1.5242, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7907/17426 [12:37<14:40, 10.81it/s]

tensor(1.5347, grad_fn=<NllLossBackward0>)
tensor(1.5547, grad_fn=<NllLossBackward0>)
tensor(1.5265, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7911/17426 [12:38<14:46, 10.73it/s]

tensor(1.4976, grad_fn=<NllLossBackward0>)
tensor(1.5034, grad_fn=<NllLossBackward0>)
tensor(1.5104, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7913/17426 [12:38<14:46, 10.73it/s]

tensor(1.5094, grad_fn=<NllLossBackward0>)
tensor(1.5157, grad_fn=<NllLossBackward0>)
tensor(1.5153, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7917/17426 [12:38<14:26, 10.97it/s]

tensor(1.4819, grad_fn=<NllLossBackward0>)
tensor(1.5892, grad_fn=<NllLossBackward0>)
tensor(1.5244, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7919/17426 [12:38<14:38, 10.82it/s]

tensor(1.5335, grad_fn=<NllLossBackward0>)
tensor(1.5118, grad_fn=<NllLossBackward0>)
tensor(1.4921, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7923/17426 [12:39<14:41, 10.78it/s]

tensor(1.5591, grad_fn=<NllLossBackward0>)
tensor(1.4792, grad_fn=<NllLossBackward0>)
tensor(1.5568, grad_fn=<NllLossBackward0>)


 45%|████▌     | 7925/17426 [12:39<14:54, 10.62it/s]

tensor(1.5409, grad_fn=<NllLossBackward0>)
tensor(1.5426, grad_fn=<NllLossBackward0>)
tensor(1.5086, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7929/17426 [12:39<14:44, 10.73it/s]

tensor(1.4515, grad_fn=<NllLossBackward0>)
tensor(1.4844, grad_fn=<NllLossBackward0>)
tensor(1.5140, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7931/17426 [12:40<15:17, 10.35it/s]

tensor(1.5155, grad_fn=<NllLossBackward0>)
tensor(1.5063, grad_fn=<NllLossBackward0>)
tensor(1.5137, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7935/17426 [12:40<15:32, 10.18it/s]

tensor(1.4507, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7937/17426 [12:40<15:28, 10.21it/s]

tensor(1.5057, grad_fn=<NllLossBackward0>)
tensor(1.4966, grad_fn=<NllLossBackward0>)
tensor(1.4924, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7941/17426 [12:41<15:19, 10.32it/s]

tensor(1.5175, grad_fn=<NllLossBackward0>)
tensor(1.5176, grad_fn=<NllLossBackward0>)
tensor(1.5162, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7943/17426 [12:41<15:15, 10.36it/s]

tensor(1.5387, grad_fn=<NllLossBackward0>)
tensor(1.5199, grad_fn=<NllLossBackward0>)
tensor(1.5073, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7947/17426 [12:41<14:35, 10.82it/s]

tensor(1.5166, grad_fn=<NllLossBackward0>)
tensor(1.5519, grad_fn=<NllLossBackward0>)
tensor(1.4712, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7949/17426 [12:41<16:12,  9.74it/s]

tensor(1.5181, grad_fn=<NllLossBackward0>)
tensor(1.5005, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7951/17426 [12:42<18:17,  8.63it/s]

tensor(1.5493, grad_fn=<NllLossBackward0>)
tensor(1.5496, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7953/17426 [12:42<19:31,  8.08it/s]

tensor(1.4729, grad_fn=<NllLossBackward0>)
tensor(1.5185, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7955/17426 [12:42<21:12,  7.44it/s]

tensor(1.4931, grad_fn=<NllLossBackward0>)
tensor(1.5450, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7957/17426 [12:43<20:29,  7.70it/s]

tensor(1.5755, grad_fn=<NllLossBackward0>)
tensor(1.5564, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7959/17426 [12:43<21:52,  7.22it/s]

tensor(1.4965, grad_fn=<NllLossBackward0>)
tensor(1.4739, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7961/17426 [12:43<20:28,  7.71it/s]

tensor(1.4695, grad_fn=<NllLossBackward0>)
tensor(1.5299, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7963/17426 [12:43<20:33,  7.67it/s]

tensor(1.5161, grad_fn=<NllLossBackward0>)
tensor(1.5185, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7965/17426 [12:44<19:16,  8.18it/s]

tensor(1.5460, grad_fn=<NllLossBackward0>)
tensor(1.5046, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7967/17426 [12:44<19:02,  8.28it/s]

tensor(1.4860, grad_fn=<NllLossBackward0>)
tensor(1.4529, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7969/17426 [12:44<20:47,  7.58it/s]

tensor(1.4973, grad_fn=<NllLossBackward0>)
tensor(1.4911, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7971/17426 [12:44<22:10,  7.11it/s]

tensor(1.5202, grad_fn=<NllLossBackward0>)
tensor(1.5421, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7973/17426 [12:45<22:37,  6.96it/s]

tensor(1.4736, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7975/17426 [12:45<23:54,  6.59it/s]

tensor(1.5132, grad_fn=<NllLossBackward0>)
tensor(1.4729, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7977/17426 [12:45<23:46,  6.62it/s]

tensor(1.5298, grad_fn=<NllLossBackward0>)
tensor(1.5082, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7979/17426 [12:46<22:58,  6.86it/s]

tensor(1.5081, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7982/17426 [12:46<17:59,  8.75it/s]

tensor(1.5518, grad_fn=<NllLossBackward0>)
tensor(1.5580, grad_fn=<NllLossBackward0>)
tensor(1.5149, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7985/17426 [12:46<16:22,  9.61it/s]

tensor(1.4985, grad_fn=<NllLossBackward0>)
tensor(1.5484, grad_fn=<NllLossBackward0>)
tensor(1.4427, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7987/17426 [12:46<16:36,  9.48it/s]

tensor(1.5471, grad_fn=<NllLossBackward0>)
tensor(1.4996, grad_fn=<NllLossBackward0>)
tensor(1.5105, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7990/17426 [12:47<15:51,  9.92it/s]

tensor(1.5027, grad_fn=<NllLossBackward0>)
tensor(1.5180, grad_fn=<NllLossBackward0>)
tensor(1.4773, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7994/17426 [12:47<14:59, 10.48it/s]

tensor(1.5115, grad_fn=<NllLossBackward0>)
tensor(1.5033, grad_fn=<NllLossBackward0>)
tensor(1.4593, grad_fn=<NllLossBackward0>)


 46%|████▌     | 7996/17426 [12:47<14:56, 10.52it/s]

tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.5037, grad_fn=<NllLossBackward0>)
tensor(1.5323, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8000/17426 [12:48<14:26, 10.88it/s]

tensor(1.5232, grad_fn=<NllLossBackward0>)
tensor(1.5709, grad_fn=<NllLossBackward0>)
tensor(1.5178, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8002/17426 [12:48<14:52, 10.56it/s]

tensor(1.5473, grad_fn=<NllLossBackward0>)
tensor(1.5858, grad_fn=<NllLossBackward0>)
tensor(1.5088, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8006/17426 [12:48<14:40, 10.70it/s]

tensor(1.5245, grad_fn=<NllLossBackward0>)
tensor(1.5280, grad_fn=<NllLossBackward0>)
tensor(1.5395, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8008/17426 [12:48<14:38, 10.72it/s]

tensor(1.5315, grad_fn=<NllLossBackward0>)
tensor(1.5054, grad_fn=<NllLossBackward0>)
tensor(1.5455, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8012/17426 [12:49<14:31, 10.80it/s]

tensor(1.4707, grad_fn=<NllLossBackward0>)
tensor(1.4796, grad_fn=<NllLossBackward0>)
tensor(1.4997, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8014/17426 [12:49<14:38, 10.71it/s]

tensor(1.5371, grad_fn=<NllLossBackward0>)
tensor(1.5434, grad_fn=<NllLossBackward0>)
tensor(1.5051, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8018/17426 [12:49<14:27, 10.84it/s]

tensor(1.5462, grad_fn=<NllLossBackward0>)
tensor(1.5289, grad_fn=<NllLossBackward0>)
tensor(1.5168, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8020/17426 [12:49<14:45, 10.62it/s]

tensor(1.5464, grad_fn=<NllLossBackward0>)
tensor(1.4892, grad_fn=<NllLossBackward0>)
tensor(1.5160, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8024/17426 [12:50<14:22, 10.90it/s]

tensor(1.5131, grad_fn=<NllLossBackward0>)
tensor(1.5034, grad_fn=<NllLossBackward0>)
tensor(1.5344, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8026/17426 [12:50<14:31, 10.78it/s]

tensor(1.5422, grad_fn=<NllLossBackward0>)
tensor(1.5255, grad_fn=<NllLossBackward0>)
tensor(1.5331, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8030/17426 [12:50<14:27, 10.83it/s]

tensor(1.5289, grad_fn=<NllLossBackward0>)
tensor(1.5205, grad_fn=<NllLossBackward0>)
tensor(1.4912, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8032/17426 [12:51<14:33, 10.75it/s]

tensor(1.5462, grad_fn=<NllLossBackward0>)
tensor(1.4952, grad_fn=<NllLossBackward0>)
tensor(1.5246, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8036/17426 [12:51<14:19, 10.93it/s]

tensor(1.4964, grad_fn=<NllLossBackward0>)
tensor(1.4890, grad_fn=<NllLossBackward0>)
tensor(1.5431, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8038/17426 [12:51<14:52, 10.52it/s]

tensor(1.5289, grad_fn=<NllLossBackward0>)
tensor(1.4889, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8042/17426 [12:52<14:24, 10.86it/s]

tensor(1.4845, grad_fn=<NllLossBackward0>)
tensor(1.5046, grad_fn=<NllLossBackward0>)
tensor(1.5467, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8044/17426 [12:52<14:40, 10.65it/s]

tensor(1.5303, grad_fn=<NllLossBackward0>)
tensor(1.5426, grad_fn=<NllLossBackward0>)
tensor(1.4938, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8048/17426 [12:52<14:14, 10.97it/s]

tensor(1.4953, grad_fn=<NllLossBackward0>)
tensor(1.4851, grad_fn=<NllLossBackward0>)
tensor(1.5472, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8050/17426 [12:52<14:37, 10.68it/s]

tensor(1.5403, grad_fn=<NllLossBackward0>)
tensor(1.5234, grad_fn=<NllLossBackward0>)
tensor(1.5296, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8054/17426 [12:53<14:18, 10.92it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5374, grad_fn=<NllLossBackward0>)
tensor(1.5331, grad_fn=<NllLossBackward0>)


 46%|████▌     | 8056/17426 [12:53<14:25, 10.83it/s]

tensor(1.5158, grad_fn=<NllLossBackward0>)
tensor(1.5231, grad_fn=<NllLossBackward0>)
tensor(1.5430, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8060/17426 [12:53<14:04, 11.09it/s]

tensor(1.4804, grad_fn=<NllLossBackward0>)
tensor(1.5244, grad_fn=<NllLossBackward0>)
tensor(1.5382, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8062/17426 [12:53<14:45, 10.57it/s]

tensor(1.5548, grad_fn=<NllLossBackward0>)
tensor(1.5198, grad_fn=<NllLossBackward0>)
tensor(1.5437, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8066/17426 [12:54<14:18, 10.91it/s]

tensor(1.5093, grad_fn=<NllLossBackward0>)
tensor(1.5424, grad_fn=<NllLossBackward0>)
tensor(1.4991, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8068/17426 [12:54<14:29, 10.77it/s]

tensor(1.5408, grad_fn=<NllLossBackward0>)
tensor(1.5350, grad_fn=<NllLossBackward0>)
tensor(1.5197, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8072/17426 [12:54<14:23, 10.84it/s]

tensor(1.5485, grad_fn=<NllLossBackward0>)
tensor(1.5167, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8074/17426 [12:55<14:35, 10.68it/s]

tensor(1.5060, grad_fn=<NllLossBackward0>)
tensor(1.4875, grad_fn=<NllLossBackward0>)
tensor(1.5325, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8078/17426 [12:55<14:13, 10.95it/s]

tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.5301, grad_fn=<NllLossBackward0>)
tensor(1.4888, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8080/17426 [12:55<14:28, 10.77it/s]

tensor(1.5278, grad_fn=<NllLossBackward0>)
tensor(1.5173, grad_fn=<NllLossBackward0>)
tensor(1.4914, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8084/17426 [12:55<14:47, 10.53it/s]

tensor(1.5539, grad_fn=<NllLossBackward0>)
tensor(1.5368, grad_fn=<NllLossBackward0>)
tensor(1.4873, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8086/17426 [12:56<16:35,  9.38it/s]

tensor(1.4996, grad_fn=<NllLossBackward0>)
tensor(1.4834, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8088/17426 [12:56<17:41,  8.80it/s]

tensor(1.4985, grad_fn=<NllLossBackward0>)
tensor(1.5230, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8090/17426 [12:56<19:09,  8.12it/s]

tensor(1.5064, grad_fn=<NllLossBackward0>)
tensor(1.5321, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8092/17426 [12:57<19:57,  7.80it/s]

tensor(1.5269, grad_fn=<NllLossBackward0>)
tensor(1.5093, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8094/17426 [12:57<19:44,  7.88it/s]

tensor(1.5435, grad_fn=<NllLossBackward0>)
tensor(1.5871, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8096/17426 [12:57<20:00,  7.77it/s]

tensor(1.5660, grad_fn=<NllLossBackward0>)
tensor(1.5123, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8098/17426 [12:57<20:44,  7.50it/s]

tensor(1.5570, grad_fn=<NllLossBackward0>)
tensor(1.5171, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8100/17426 [12:58<21:45,  7.14it/s]

tensor(1.5198, grad_fn=<NllLossBackward0>)
tensor(1.5228, grad_fn=<NllLossBackward0>)


 46%|████▋     | 8102/17426 [12:58<19:43,  7.88it/s]

tensor(1.5111, grad_fn=<NllLossBackward0>)
tensor(1.4790, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8104/17426 [12:58<20:20,  7.64it/s]

tensor(1.5462, grad_fn=<NllLossBackward0>)
tensor(1.5501, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8106/17426 [12:58<21:57,  7.07it/s]

tensor(1.4264, grad_fn=<NllLossBackward0>)
tensor(1.4737, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8108/17426 [12:59<22:06,  7.02it/s]

tensor(1.4737, grad_fn=<NllLossBackward0>)
tensor(1.5143, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8110/17426 [12:59<22:12,  6.99it/s]

tensor(1.5716, grad_fn=<NllLossBackward0>)
tensor(1.5349, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8112/17426 [12:59<22:36,  6.86it/s]

tensor(1.5157, grad_fn=<NllLossBackward0>)
tensor(1.5220, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8114/17426 [13:00<22:55,  6.77it/s]

tensor(1.5080, grad_fn=<NllLossBackward0>)
tensor(1.5029, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8116/17426 [13:00<22:09,  7.00it/s]

tensor(1.4958, grad_fn=<NllLossBackward0>)
tensor(1.5722, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8119/17426 [13:00<17:25,  8.90it/s]

tensor(1.5296, grad_fn=<NllLossBackward0>)
tensor(1.4818, grad_fn=<NllLossBackward0>)
tensor(1.5329, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8122/17426 [13:00<15:49,  9.79it/s]

tensor(1.5159, grad_fn=<NllLossBackward0>)
tensor(1.5302, grad_fn=<NllLossBackward0>)
tensor(1.5286, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8124/17426 [13:01<16:41,  9.29it/s]

tensor(1.5197, grad_fn=<NllLossBackward0>)
tensor(1.5257, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8126/17426 [13:01<16:21,  9.48it/s]

tensor(1.4909, grad_fn=<NllLossBackward0>)
tensor(1.5349, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8128/17426 [13:01<15:41,  9.88it/s]

tensor(1.5451, grad_fn=<NllLossBackward0>)
tensor(1.5228, grad_fn=<NllLossBackward0>)
tensor(1.5189, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8132/17426 [13:01<14:47, 10.47it/s]

tensor(1.4732, grad_fn=<NllLossBackward0>)
tensor(1.5122, grad_fn=<NllLossBackward0>)
tensor(1.5737, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8134/17426 [13:02<14:51, 10.42it/s]

tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.5068, grad_fn=<NllLossBackward0>)
tensor(1.5315, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8138/17426 [13:02<14:32, 10.65it/s]

tensor(1.5032, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)
tensor(1.5212, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8140/17426 [13:02<14:44, 10.50it/s]

tensor(1.5158, grad_fn=<NllLossBackward0>)
tensor(1.4977, grad_fn=<NllLossBackward0>)
tensor(1.5248, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8144/17426 [13:03<14:20, 10.79it/s]

tensor(1.5127, grad_fn=<NllLossBackward0>)
tensor(1.4917, grad_fn=<NllLossBackward0>)
tensor(1.5672, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8146/17426 [13:03<14:52, 10.40it/s]

tensor(1.5066, grad_fn=<NllLossBackward0>)
tensor(1.4882, grad_fn=<NllLossBackward0>)
tensor(1.5161, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8150/17426 [13:03<14:17, 10.82it/s]

tensor(1.4707, grad_fn=<NllLossBackward0>)
tensor(1.5169, grad_fn=<NllLossBackward0>)
tensor(1.5044, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8152/17426 [13:03<14:30, 10.66it/s]

tensor(1.4948, grad_fn=<NllLossBackward0>)
tensor(1.5047, grad_fn=<NllLossBackward0>)
tensor(1.5068, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8156/17426 [13:04<14:17, 10.81it/s]

tensor(1.4874, grad_fn=<NllLossBackward0>)
tensor(1.5290, grad_fn=<NllLossBackward0>)
tensor(1.5141, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8158/17426 [13:04<14:51, 10.40it/s]

tensor(1.4527, grad_fn=<NllLossBackward0>)
tensor(1.5167, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8162/17426 [13:04<14:15, 10.83it/s]

tensor(1.4933, grad_fn=<NllLossBackward0>)
tensor(1.4953, grad_fn=<NllLossBackward0>)
tensor(1.4840, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8164/17426 [13:04<14:30, 10.64it/s]

tensor(1.5540, grad_fn=<NllLossBackward0>)
tensor(1.4677, grad_fn=<NllLossBackward0>)
tensor(1.4617, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8168/17426 [13:05<14:24, 10.71it/s]

tensor(1.5670, grad_fn=<NllLossBackward0>)
tensor(1.4966, grad_fn=<NllLossBackward0>)
tensor(1.4941, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8170/17426 [13:05<14:25, 10.69it/s]

tensor(1.4846, grad_fn=<NllLossBackward0>)
tensor(1.5123, grad_fn=<NllLossBackward0>)
tensor(1.5182, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8174/17426 [13:05<14:05, 10.94it/s]

tensor(1.4962, grad_fn=<NllLossBackward0>)
tensor(1.5027, grad_fn=<NllLossBackward0>)
tensor(1.5043, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8176/17426 [13:06<14:33, 10.59it/s]

tensor(1.5905, grad_fn=<NllLossBackward0>)
tensor(1.5472, grad_fn=<NllLossBackward0>)
tensor(1.4995, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8180/17426 [13:06<14:32, 10.59it/s]

tensor(1.5159, grad_fn=<NllLossBackward0>)
tensor(1.4888, grad_fn=<NllLossBackward0>)
tensor(1.5136, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8182/17426 [13:06<14:36, 10.55it/s]

tensor(1.4775, grad_fn=<NllLossBackward0>)
tensor(1.4381, grad_fn=<NllLossBackward0>)
tensor(1.5567, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8186/17426 [13:07<14:15, 10.80it/s]

tensor(1.5081, grad_fn=<NllLossBackward0>)
tensor(1.4974, grad_fn=<NllLossBackward0>)
tensor(1.4892, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8188/17426 [13:07<14:24, 10.69it/s]

tensor(1.5236, grad_fn=<NllLossBackward0>)
tensor(1.4970, grad_fn=<NllLossBackward0>)
tensor(1.5720, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8192/17426 [13:07<14:12, 10.83it/s]

tensor(1.5060, grad_fn=<NllLossBackward0>)
tensor(1.4960, grad_fn=<NllLossBackward0>)
tensor(1.5230, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8194/17426 [13:07<14:20, 10.73it/s]

tensor(1.5137, grad_fn=<NllLossBackward0>)
tensor(1.5353, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8198/17426 [13:08<13:59, 10.99it/s]

tensor(1.4849, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)
tensor(1.5027, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8200/17426 [13:08<14:15, 10.79it/s]

tensor(1.5031, grad_fn=<NllLossBackward0>)
tensor(1.5411, grad_fn=<NllLossBackward0>)
tensor(1.4820, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8204/17426 [13:08<14:06, 10.90it/s]

tensor(1.5568, grad_fn=<NllLossBackward0>)
tensor(1.4516, grad_fn=<NllLossBackward0>)
tensor(1.5076, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8206/17426 [13:08<14:27, 10.63it/s]

tensor(1.5019, grad_fn=<NllLossBackward0>)
tensor(1.5202, grad_fn=<NllLossBackward0>)
tensor(1.5529, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8210/17426 [13:09<14:12, 10.82it/s]

tensor(1.5395, grad_fn=<NllLossBackward0>)
tensor(1.4873, grad_fn=<NllLossBackward0>)
tensor(1.5214, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8212/17426 [13:09<14:50, 10.35it/s]

tensor(1.5173, grad_fn=<NllLossBackward0>)
tensor(1.5205, grad_fn=<NllLossBackward0>)
tensor(1.5372, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8216/17426 [13:09<14:28, 10.60it/s]

tensor(1.5262, grad_fn=<NllLossBackward0>)
tensor(1.5683, grad_fn=<NllLossBackward0>)
tensor(1.5375, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8218/17426 [13:10<14:38, 10.48it/s]

tensor(1.4913, grad_fn=<NllLossBackward0>)
tensor(1.4814, grad_fn=<NllLossBackward0>)
tensor(1.5330, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8220/17426 [13:10<14:41, 10.45it/s]

tensor(1.4881, grad_fn=<NllLossBackward0>)
tensor(1.4614, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8223/17426 [13:10<16:49,  9.12it/s]

tensor(1.4680, grad_fn=<NllLossBackward0>)
tensor(1.4798, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8225/17426 [13:10<18:32,  8.27it/s]

tensor(1.5213, grad_fn=<NllLossBackward0>)
tensor(1.5122, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8227/17426 [13:11<20:15,  7.57it/s]

tensor(1.5619, grad_fn=<NllLossBackward0>)
tensor(1.5188, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8229/17426 [13:11<19:24,  7.90it/s]

tensor(1.4712, grad_fn=<NllLossBackward0>)
tensor(1.5463, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8231/17426 [13:11<20:25,  7.50it/s]

tensor(1.5565, grad_fn=<NllLossBackward0>)
tensor(1.4939, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8233/17426 [13:12<20:44,  7.39it/s]

tensor(1.5010, grad_fn=<NllLossBackward0>)
tensor(1.5378, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8235/17426 [13:12<20:59,  7.30it/s]

tensor(1.5393, grad_fn=<NllLossBackward0>)
tensor(1.5041, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8237/17426 [13:12<20:56,  7.31it/s]

tensor(1.4997, grad_fn=<NllLossBackward0>)
tensor(1.4544, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8239/17426 [13:12<20:53,  7.33it/s]

tensor(1.5250, grad_fn=<NllLossBackward0>)
tensor(1.4905, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8241/17426 [13:13<19:43,  7.76it/s]

tensor(1.5261, grad_fn=<NllLossBackward0>)
tensor(1.5629, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8243/17426 [13:13<19:55,  7.68it/s]

tensor(1.5202, grad_fn=<NllLossBackward0>)
tensor(1.4489, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8245/17426 [13:13<21:17,  7.19it/s]

tensor(1.5343, grad_fn=<NllLossBackward0>)
tensor(1.5012, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8247/17426 [13:13<22:10,  6.90it/s]

tensor(1.5157, grad_fn=<NllLossBackward0>)
tensor(1.5349, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8249/17426 [13:14<22:16,  6.87it/s]

tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.4964, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8251/17426 [13:14<22:39,  6.75it/s]

tensor(1.4973, grad_fn=<NllLossBackward0>)
tensor(1.4830, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8253/17426 [13:14<22:03,  6.93it/s]

tensor(1.5172, grad_fn=<NllLossBackward0>)
tensor(1.4890, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8256/17426 [13:15<18:34,  8.23it/s]

tensor(1.5063, grad_fn=<NllLossBackward0>)
tensor(1.5197, grad_fn=<NllLossBackward0>)
tensor(1.5309, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8259/17426 [13:15<15:54,  9.61it/s]

tensor(1.5290, grad_fn=<NllLossBackward0>)
tensor(1.5267, grad_fn=<NllLossBackward0>)
tensor(1.5328, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8262/17426 [13:15<15:05, 10.12it/s]

tensor(1.5028, grad_fn=<NllLossBackward0>)
tensor(1.5018, grad_fn=<NllLossBackward0>)
tensor(1.4944, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8264/17426 [13:15<15:50,  9.63it/s]

tensor(1.5126, grad_fn=<NllLossBackward0>)
tensor(1.5338, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8266/17426 [13:16<16:06,  9.47it/s]

tensor(1.5135, grad_fn=<NllLossBackward0>)
tensor(1.5289, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8268/17426 [13:16<16:03,  9.51it/s]

tensor(1.4827, grad_fn=<NllLossBackward0>)
tensor(1.4869, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8270/17426 [13:16<15:28,  9.87it/s]

tensor(1.4714, grad_fn=<NllLossBackward0>)
tensor(1.5098, grad_fn=<NllLossBackward0>)
tensor(1.5040, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8274/17426 [13:16<14:17, 10.67it/s]

tensor(1.4856, grad_fn=<NllLossBackward0>)
tensor(1.4844, grad_fn=<NllLossBackward0>)
tensor(1.4964, grad_fn=<NllLossBackward0>)


 47%|████▋     | 8276/17426 [13:17<14:51, 10.27it/s]

tensor(1.5474, grad_fn=<NllLossBackward0>)
tensor(1.5232, grad_fn=<NllLossBackward0>)
tensor(1.4830, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8280/17426 [13:17<14:06, 10.81it/s]

tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.5019, grad_fn=<NllLossBackward0>)
tensor(1.4603, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8282/17426 [13:17<14:18, 10.65it/s]

tensor(1.4815, grad_fn=<NllLossBackward0>)
tensor(1.5182, grad_fn=<NllLossBackward0>)
tensor(1.5050, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8286/17426 [13:18<14:27, 10.54it/s]

tensor(1.5247, grad_fn=<NllLossBackward0>)
tensor(1.5131, grad_fn=<NllLossBackward0>)
tensor(1.5019, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8288/17426 [13:18<14:29, 10.51it/s]

tensor(1.5046, grad_fn=<NllLossBackward0>)
tensor(1.5020, grad_fn=<NllLossBackward0>)
tensor(1.4967, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8292/17426 [13:18<14:09, 10.76it/s]

tensor(1.5465, grad_fn=<NllLossBackward0>)
tensor(1.5360, grad_fn=<NllLossBackward0>)
tensor(1.4898, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8294/17426 [13:18<14:18, 10.64it/s]

tensor(1.5143, grad_fn=<NllLossBackward0>)
tensor(1.4780, grad_fn=<NllLossBackward0>)
tensor(1.5557, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8298/17426 [13:19<14:12, 10.71it/s]

tensor(1.5080, grad_fn=<NllLossBackward0>)
tensor(1.5496, grad_fn=<NllLossBackward0>)
tensor(1.5143, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8300/17426 [13:19<14:16, 10.65it/s]

tensor(1.5263, grad_fn=<NllLossBackward0>)
tensor(1.5203, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8304/17426 [13:19<14:04, 10.81it/s]

tensor(1.5100, grad_fn=<NllLossBackward0>)
tensor(1.5107, grad_fn=<NllLossBackward0>)
tensor(1.5535, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8306/17426 [13:19<14:20, 10.60it/s]

tensor(1.5316, grad_fn=<NllLossBackward0>)
tensor(1.4840, grad_fn=<NllLossBackward0>)
tensor(1.5176, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8310/17426 [13:20<14:21, 10.58it/s]

tensor(1.5117, grad_fn=<NllLossBackward0>)
tensor(1.5467, grad_fn=<NllLossBackward0>)
tensor(1.5178, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8312/17426 [13:20<14:18, 10.61it/s]

tensor(1.4915, grad_fn=<NllLossBackward0>)
tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.5414, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8316/17426 [13:20<13:57, 10.88it/s]

tensor(1.4654, grad_fn=<NllLossBackward0>)
tensor(1.5187, grad_fn=<NllLossBackward0>)
tensor(1.5937, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8318/17426 [13:21<14:07, 10.74it/s]

tensor(1.4962, grad_fn=<NllLossBackward0>)
tensor(1.5361, grad_fn=<NllLossBackward0>)
tensor(1.5048, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8322/17426 [13:21<14:07, 10.74it/s]

tensor(1.4856, grad_fn=<NllLossBackward0>)
tensor(1.4950, grad_fn=<NllLossBackward0>)
tensor(1.5002, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8324/17426 [13:21<14:16, 10.63it/s]

tensor(1.4791, grad_fn=<NllLossBackward0>)
tensor(1.5030, grad_fn=<NllLossBackward0>)
tensor(1.5000, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8328/17426 [13:22<14:05, 10.76it/s]

tensor(1.5297, grad_fn=<NllLossBackward0>)
tensor(1.5374, grad_fn=<NllLossBackward0>)
tensor(1.4967, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8330/17426 [13:22<14:35, 10.39it/s]

tensor(1.4850, grad_fn=<NllLossBackward0>)
tensor(1.5142, grad_fn=<NllLossBackward0>)
tensor(1.5367, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8334/17426 [13:22<14:09, 10.70it/s]

tensor(1.5281, grad_fn=<NllLossBackward0>)
tensor(1.4846, grad_fn=<NllLossBackward0>)
tensor(1.4899, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8336/17426 [13:22<14:23, 10.53it/s]

tensor(1.5084, grad_fn=<NllLossBackward0>)
tensor(1.5316, grad_fn=<NllLossBackward0>)
tensor(1.5628, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8340/17426 [13:23<14:02, 10.78it/s]

tensor(1.5224, grad_fn=<NllLossBackward0>)
tensor(1.5163, grad_fn=<NllLossBackward0>)
tensor(1.4811, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8342/17426 [13:23<14:31, 10.43it/s]

tensor(1.5129, grad_fn=<NllLossBackward0>)
tensor(1.5131, grad_fn=<NllLossBackward0>)
tensor(1.5163, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8346/17426 [13:23<14:02, 10.77it/s]

tensor(1.4805, grad_fn=<NllLossBackward0>)
tensor(1.4672, grad_fn=<NllLossBackward0>)
tensor(1.5170, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8348/17426 [13:23<14:10, 10.68it/s]

tensor(1.5349, grad_fn=<NllLossBackward0>)
tensor(1.5410, grad_fn=<NllLossBackward0>)
tensor(1.4802, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8352/17426 [13:24<14:19, 10.56it/s]

tensor(1.5055, grad_fn=<NllLossBackward0>)
tensor(1.4842, grad_fn=<NllLossBackward0>)
tensor(1.5219, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8354/17426 [13:24<14:15, 10.60it/s]

tensor(1.5212, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)
tensor(1.5107, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8358/17426 [13:24<13:53, 10.88it/s]

tensor(1.4691, grad_fn=<NllLossBackward0>)
tensor(1.5076, grad_fn=<NllLossBackward0>)
tensor(1.4657, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8360/17426 [13:25<15:39,  9.65it/s]

tensor(1.5635, grad_fn=<NllLossBackward0>)
tensor(1.4713, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8362/17426 [13:25<17:30,  8.63it/s]

tensor(1.4387, grad_fn=<NllLossBackward0>)
tensor(1.5074, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8364/17426 [13:25<19:02,  7.93it/s]

tensor(1.5138, grad_fn=<NllLossBackward0>)
tensor(1.4810, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8366/17426 [13:25<19:39,  7.68it/s]

tensor(1.5236, grad_fn=<NllLossBackward0>)
tensor(1.5235, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8368/17426 [13:26<19:11,  7.86it/s]

tensor(1.5004, grad_fn=<NllLossBackward0>)
tensor(1.4617, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8370/17426 [13:26<18:56,  7.97it/s]

tensor(1.4866, grad_fn=<NllLossBackward0>)
tensor(1.4936, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8372/17426 [13:26<18:44,  8.05it/s]

tensor(1.5261, grad_fn=<NllLossBackward0>)
tensor(1.5562, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8374/17426 [13:26<19:25,  7.77it/s]

tensor(1.4829, grad_fn=<NllLossBackward0>)
tensor(1.5716, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8376/17426 [13:27<19:40,  7.66it/s]

tensor(1.4844, grad_fn=<NllLossBackward0>)
tensor(1.5692, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8378/17426 [13:27<19:51,  7.59it/s]

tensor(1.5366, grad_fn=<NllLossBackward0>)
tensor(1.5005, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8380/17426 [13:27<20:47,  7.25it/s]

tensor(1.5097, grad_fn=<NllLossBackward0>)
tensor(1.4631, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8382/17426 [13:28<21:52,  6.89it/s]

tensor(1.4953, grad_fn=<NllLossBackward0>)
tensor(1.4827, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8384/17426 [13:28<21:01,  7.17it/s]

tensor(1.4802, grad_fn=<NllLossBackward0>)
tensor(1.5744, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8386/17426 [13:28<22:08,  6.80it/s]

tensor(1.4850, grad_fn=<NllLossBackward0>)
tensor(1.5046, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8388/17426 [13:28<21:22,  7.05it/s]

tensor(1.5057, grad_fn=<NllLossBackward0>)
tensor(1.5016, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8390/17426 [13:29<22:11,  6.79it/s]

tensor(1.5085, grad_fn=<NllLossBackward0>)
tensor(1.4780, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8392/17426 [13:29<20:07,  7.48it/s]

tensor(1.5016, grad_fn=<NllLossBackward0>)
tensor(1.4665, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8394/17426 [13:29<19:02,  7.90it/s]

tensor(1.4806, grad_fn=<NllLossBackward0>)
tensor(1.5028, grad_fn=<NllLossBackward0>)
tensor(1.5353, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8397/17426 [13:30<16:28,  9.14it/s]

tensor(1.5394, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)
tensor(1.4943, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8401/17426 [13:30<14:46, 10.18it/s]

tensor(1.4573, grad_fn=<NllLossBackward0>)
tensor(1.5359, grad_fn=<NllLossBackward0>)
tensor(1.5151, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8403/17426 [13:30<14:42, 10.22it/s]

tensor(1.4665, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)
tensor(1.5157, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8407/17426 [13:30<14:14, 10.56it/s]

tensor(1.5415, grad_fn=<NllLossBackward0>)
tensor(1.5113, grad_fn=<NllLossBackward0>)
tensor(1.4816, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8409/17426 [13:31<14:12, 10.57it/s]

tensor(1.4730, grad_fn=<NllLossBackward0>)
tensor(1.5652, grad_fn=<NllLossBackward0>)
tensor(1.4850, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8413/17426 [13:31<13:53, 10.82it/s]

tensor(1.5081, grad_fn=<NllLossBackward0>)
tensor(1.5152, grad_fn=<NllLossBackward0>)
tensor(1.5084, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8415/17426 [13:31<14:21, 10.46it/s]

tensor(1.4884, grad_fn=<NllLossBackward0>)
tensor(1.4810, grad_fn=<NllLossBackward0>)
tensor(1.4682, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8419/17426 [13:32<14:06, 10.64it/s]

tensor(1.4699, grad_fn=<NllLossBackward0>)
tensor(1.5327, grad_fn=<NllLossBackward0>)
tensor(1.4813, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8421/17426 [13:32<14:21, 10.46it/s]

tensor(1.5113, grad_fn=<NllLossBackward0>)
tensor(1.4764, grad_fn=<NllLossBackward0>)
tensor(1.4928, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8425/17426 [13:32<13:57, 10.75it/s]

tensor(1.5329, grad_fn=<NllLossBackward0>)
tensor(1.4776, grad_fn=<NllLossBackward0>)
tensor(1.4850, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8427/17426 [13:32<14:34, 10.29it/s]

tensor(1.5231, grad_fn=<NllLossBackward0>)
tensor(1.4734, grad_fn=<NllLossBackward0>)
tensor(1.4483, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8431/17426 [13:33<14:08, 10.60it/s]

tensor(1.5035, grad_fn=<NllLossBackward0>)
tensor(1.5231, grad_fn=<NllLossBackward0>)
tensor(1.5221, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8433/17426 [13:33<14:11, 10.56it/s]

tensor(1.4814, grad_fn=<NllLossBackward0>)
tensor(1.5473, grad_fn=<NllLossBackward0>)
tensor(1.5124, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8437/17426 [13:33<14:13, 10.53it/s]

tensor(1.5551, grad_fn=<NllLossBackward0>)
tensor(1.4911, grad_fn=<NllLossBackward0>)
tensor(1.4462, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8439/17426 [13:34<14:20, 10.44it/s]

tensor(1.5104, grad_fn=<NllLossBackward0>)
tensor(1.5012, grad_fn=<NllLossBackward0>)
tensor(1.5498, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8443/17426 [13:34<14:06, 10.61it/s]

tensor(1.5533, grad_fn=<NllLossBackward0>)
tensor(1.4783, grad_fn=<NllLossBackward0>)
tensor(1.4899, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8445/17426 [13:34<14:11, 10.55it/s]

tensor(1.4618, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)
tensor(1.4969, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8449/17426 [13:34<14:12, 10.53it/s]

tensor(1.4818, grad_fn=<NllLossBackward0>)
tensor(1.4729, grad_fn=<NllLossBackward0>)
tensor(1.4663, grad_fn=<NllLossBackward0>)


 48%|████▊     | 8451/17426 [13:35<14:14, 10.51it/s]

tensor(1.5007, grad_fn=<NllLossBackward0>)
tensor(1.5309, grad_fn=<NllLossBackward0>)
tensor(1.5115, grad_fn=<NllLossBackward0>)


 49%|████▊     | 8455/17426 [13:35<13:59, 10.68it/s]

tensor(1.5486, grad_fn=<NllLossBackward0>)
tensor(1.5258, grad_fn=<NllLossBackward0>)
tensor(1.5103, grad_fn=<NllLossBackward0>)


 49%|████▊     | 8457/17426 [13:35<14:05, 10.61it/s]

tensor(1.5116, grad_fn=<NllLossBackward0>)
tensor(1.5258, grad_fn=<NllLossBackward0>)
tensor(1.4950, grad_fn=<NllLossBackward0>)


 49%|████▊     | 8461/17426 [13:36<14:03, 10.62it/s]

tensor(1.4901, grad_fn=<NllLossBackward0>)
tensor(1.4994, grad_fn=<NllLossBackward0>)
tensor(1.5432, grad_fn=<NllLossBackward0>)


 49%|████▊     | 8463/17426 [13:36<14:13, 10.51it/s]

tensor(1.5522, grad_fn=<NllLossBackward0>)
tensor(1.4906, grad_fn=<NllLossBackward0>)
tensor(1.5249, grad_fn=<NllLossBackward0>)


 49%|████▊     | 8467/17426 [13:36<13:42, 10.89it/s]

tensor(1.5325, grad_fn=<NllLossBackward0>)
tensor(1.4959, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)


 49%|████▊     | 8469/17426 [13:36<13:51, 10.77it/s]

tensor(1.4943, grad_fn=<NllLossBackward0>)
tensor(1.5082, grad_fn=<NllLossBackward0>)
tensor(1.5084, grad_fn=<NllLossBackward0>)


 49%|████▊     | 8473/17426 [13:37<13:53, 10.74it/s]

tensor(1.5390, grad_fn=<NllLossBackward0>)
tensor(1.5223, grad_fn=<NllLossBackward0>)
tensor(1.4568, grad_fn=<NllLossBackward0>)


 49%|████▊     | 8475/17426 [13:37<14:05, 10.59it/s]

tensor(1.5013, grad_fn=<NllLossBackward0>)
tensor(1.4601, grad_fn=<NllLossBackward0>)
tensor(1.5175, grad_fn=<NllLossBackward0>)


 49%|████▊     | 8479/17426 [13:37<13:40, 10.91it/s]

tensor(1.5530, grad_fn=<NllLossBackward0>)
tensor(1.5084, grad_fn=<NllLossBackward0>)
tensor(1.4914, grad_fn=<NllLossBackward0>)


 49%|████▊     | 8481/17426 [13:37<14:17, 10.43it/s]

tensor(1.5551, grad_fn=<NllLossBackward0>)
tensor(1.5111, grad_fn=<NllLossBackward0>)
tensor(1.4883, grad_fn=<NllLossBackward0>)


 49%|████▊     | 8485/17426 [13:38<13:58, 10.67it/s]

tensor(1.5121, grad_fn=<NllLossBackward0>)
tensor(1.4821, grad_fn=<NllLossBackward0>)
tensor(1.5205, grad_fn=<NllLossBackward0>)


 49%|████▊     | 8487/17426 [13:38<14:02, 10.61it/s]

tensor(1.5064, grad_fn=<NllLossBackward0>)
tensor(1.5106, grad_fn=<NllLossBackward0>)
tensor(1.5166, grad_fn=<NllLossBackward0>)


 49%|████▊     | 8491/17426 [13:38<13:36, 10.95it/s]

tensor(1.4873, grad_fn=<NllLossBackward0>)
tensor(1.4872, grad_fn=<NllLossBackward0>)
tensor(1.5366, grad_fn=<NllLossBackward0>)


 49%|████▊     | 8493/17426 [13:39<14:16, 10.43it/s]

tensor(1.5501, grad_fn=<NllLossBackward0>)
tensor(1.5176, grad_fn=<NllLossBackward0>)
tensor(1.5218, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8496/17426 [13:39<16:06,  9.24it/s]

tensor(1.4913, grad_fn=<NllLossBackward0>)
tensor(1.4647, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8498/17426 [13:39<18:10,  8.19it/s]

tensor(1.5471, grad_fn=<NllLossBackward0>)
tensor(1.4878, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8500/17426 [13:40<20:01,  7.43it/s]

tensor(1.4464, grad_fn=<NllLossBackward0>)
tensor(1.5104, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8502/17426 [13:40<19:16,  7.71it/s]

tensor(1.4833, grad_fn=<NllLossBackward0>)
tensor(1.4994, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8504/17426 [13:40<18:36,  7.99it/s]

tensor(1.5848, grad_fn=<NllLossBackward0>)
tensor(1.4929, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8506/17426 [13:40<18:21,  8.09it/s]

tensor(1.5065, grad_fn=<NllLossBackward0>)
tensor(1.5373, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8508/17426 [13:41<19:26,  7.65it/s]

tensor(1.5306, grad_fn=<NllLossBackward0>)
tensor(1.4928, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8510/17426 [13:41<19:27,  7.64it/s]

tensor(1.5201, grad_fn=<NllLossBackward0>)
tensor(1.4721, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8512/17426 [13:41<19:38,  7.57it/s]

tensor(1.5348, grad_fn=<NllLossBackward0>)
tensor(1.5126, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8514/17426 [13:41<20:48,  7.14it/s]

tensor(1.5018, grad_fn=<NllLossBackward0>)
tensor(1.5179, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8516/17426 [13:42<20:36,  7.20it/s]

tensor(1.5355, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8518/17426 [13:42<22:09,  6.70it/s]

tensor(1.5048, grad_fn=<NllLossBackward0>)
tensor(1.4975, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8520/17426 [13:42<20:38,  7.19it/s]

tensor(1.5097, grad_fn=<NllLossBackward0>)
tensor(1.4941, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8522/17426 [13:43<21:20,  6.95it/s]

tensor(1.4859, grad_fn=<NllLossBackward0>)
tensor(1.5418, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8524/17426 [13:43<21:04,  7.04it/s]

tensor(1.5510, grad_fn=<NllLossBackward0>)
tensor(1.5380, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8526/17426 [13:43<22:22,  6.63it/s]

tensor(1.4903, grad_fn=<NllLossBackward0>)
tensor(1.5081, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8528/17426 [13:43<20:23,  7.27it/s]

tensor(1.4626, grad_fn=<NllLossBackward0>)
tensor(1.5428, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8530/17426 [13:44<18:19,  8.09it/s]

tensor(1.4959, grad_fn=<NllLossBackward0>)
tensor(1.4838, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8532/17426 [13:44<17:08,  8.65it/s]

tensor(1.5403, grad_fn=<NllLossBackward0>)
tensor(1.5399, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8535/17426 [13:44<15:43,  9.43it/s]

tensor(1.4769, grad_fn=<NllLossBackward0>)
tensor(1.4971, grad_fn=<NllLossBackward0>)
tensor(1.5039, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8538/17426 [13:44<14:37, 10.13it/s]

tensor(1.4664, grad_fn=<NllLossBackward0>)
tensor(1.4927, grad_fn=<NllLossBackward0>)
tensor(1.5089, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8540/17426 [13:45<15:11,  9.75it/s]

tensor(1.5235, grad_fn=<NllLossBackward0>)
tensor(1.5159, grad_fn=<NllLossBackward0>)
tensor(1.4761, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8543/17426 [13:45<15:01,  9.85it/s]

tensor(1.5365, grad_fn=<NllLossBackward0>)
tensor(1.5234, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8545/17426 [13:45<15:24,  9.60it/s]

tensor(1.4840, grad_fn=<NllLossBackward0>)
tensor(1.5223, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8547/17426 [13:45<15:41,  9.43it/s]

tensor(1.4771, grad_fn=<NllLossBackward0>)
tensor(1.4909, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8549/17426 [13:46<15:38,  9.46it/s]

tensor(1.5337, grad_fn=<NllLossBackward0>)
tensor(1.5151, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8551/17426 [13:46<15:30,  9.54it/s]

tensor(1.4984, grad_fn=<NllLossBackward0>)
tensor(1.4829, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8553/17426 [13:46<16:21,  9.04it/s]

tensor(1.5345, grad_fn=<NllLossBackward0>)
tensor(1.5091, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8555/17426 [13:46<16:02,  9.22it/s]

tensor(1.4847, grad_fn=<NllLossBackward0>)
tensor(1.5147, grad_fn=<NllLossBackward0>)
tensor(1.4616, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8558/17426 [13:47<15:08,  9.76it/s]

tensor(1.5168, grad_fn=<NllLossBackward0>)
tensor(1.5098, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8560/17426 [13:47<14:41, 10.05it/s]

tensor(1.4540, grad_fn=<NllLossBackward0>)
tensor(1.5017, grad_fn=<NllLossBackward0>)
tensor(1.4986, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8564/17426 [13:47<14:15, 10.36it/s]

tensor(1.5267, grad_fn=<NllLossBackward0>)
tensor(1.4846, grad_fn=<NllLossBackward0>)
tensor(1.5077, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8566/17426 [13:47<14:02, 10.51it/s]

tensor(1.5120, grad_fn=<NllLossBackward0>)
tensor(1.5244, grad_fn=<NllLossBackward0>)
tensor(1.4968, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8570/17426 [13:48<13:40, 10.79it/s]

tensor(1.4855, grad_fn=<NllLossBackward0>)
tensor(1.5358, grad_fn=<NllLossBackward0>)
tensor(1.5280, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8572/17426 [13:48<13:51, 10.64it/s]

tensor(1.4800, grad_fn=<NllLossBackward0>)
tensor(1.5231, grad_fn=<NllLossBackward0>)
tensor(1.4799, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8576/17426 [13:48<13:45, 10.72it/s]

tensor(1.4704, grad_fn=<NllLossBackward0>)
tensor(1.4968, grad_fn=<NllLossBackward0>)
tensor(1.5346, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8578/17426 [13:48<13:56, 10.58it/s]

tensor(1.4963, grad_fn=<NllLossBackward0>)
tensor(1.5175, grad_fn=<NllLossBackward0>)
tensor(1.4862, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8582/17426 [13:49<13:38, 10.81it/s]

tensor(1.5315, grad_fn=<NllLossBackward0>)
tensor(1.5324, grad_fn=<NllLossBackward0>)
tensor(1.5062, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8584/17426 [13:49<13:46, 10.70it/s]

tensor(1.4974, grad_fn=<NllLossBackward0>)
tensor(1.5136, grad_fn=<NllLossBackward0>)
tensor(1.4950, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8588/17426 [13:49<13:45, 10.70it/s]

tensor(1.5074, grad_fn=<NllLossBackward0>)
tensor(1.4868, grad_fn=<NllLossBackward0>)
tensor(1.5233, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8590/17426 [13:50<13:43, 10.73it/s]

tensor(1.4853, grad_fn=<NllLossBackward0>)
tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.5145, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8594/17426 [13:50<13:31, 10.89it/s]

tensor(1.4953, grad_fn=<NllLossBackward0>)
tensor(1.5252, grad_fn=<NllLossBackward0>)
tensor(1.4906, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8596/17426 [13:50<13:50, 10.63it/s]

tensor(1.4630, grad_fn=<NllLossBackward0>)
tensor(1.5230, grad_fn=<NllLossBackward0>)
tensor(1.5375, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8600/17426 [13:51<13:51, 10.62it/s]

tensor(1.5148, grad_fn=<NllLossBackward0>)
tensor(1.5058, grad_fn=<NllLossBackward0>)
tensor(1.4942, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8602/17426 [13:51<14:07, 10.42it/s]

tensor(1.5451, grad_fn=<NllLossBackward0>)
tensor(1.4861, grad_fn=<NllLossBackward0>)
tensor(1.4857, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8606/17426 [13:51<13:46, 10.67it/s]

tensor(1.4881, grad_fn=<NllLossBackward0>)
tensor(1.5334, grad_fn=<NllLossBackward0>)
tensor(1.5043, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8608/17426 [13:51<14:24, 10.20it/s]

tensor(1.4871, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)
tensor(1.5078, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8612/17426 [13:52<13:55, 10.55it/s]

tensor(1.5364, grad_fn=<NllLossBackward0>)
tensor(1.5561, grad_fn=<NllLossBackward0>)
tensor(1.5426, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8614/17426 [13:52<13:56, 10.53it/s]

tensor(1.5041, grad_fn=<NllLossBackward0>)
tensor(1.4554, grad_fn=<NllLossBackward0>)
tensor(1.4951, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8618/17426 [13:52<13:56, 10.53it/s]

tensor(1.5064, grad_fn=<NllLossBackward0>)
tensor(1.4960, grad_fn=<NllLossBackward0>)
tensor(1.4976, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8620/17426 [13:52<14:06, 10.40it/s]

tensor(1.5019, grad_fn=<NllLossBackward0>)
tensor(1.5222, grad_fn=<NllLossBackward0>)
tensor(1.4936, grad_fn=<NllLossBackward0>)


 49%|████▉     | 8624/17426 [13:53<13:36, 10.78it/s]

tensor(1.5225, grad_fn=<NllLossBackward0>)
tensor(1.5090, grad_fn=<NllLossBackward0>)
tensor(1.5246, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8626/17426 [13:53<13:34, 10.81it/s]

tensor(1.5076, grad_fn=<NllLossBackward0>)
tensor(1.5403, grad_fn=<NllLossBackward0>)
tensor(1.5282, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8628/17426 [13:53<13:55, 10.53it/s]

tensor(1.5145, grad_fn=<NllLossBackward0>)
tensor(1.4822, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8630/17426 [13:53<14:26, 10.15it/s]

tensor(1.4822, grad_fn=<NllLossBackward0>)
tensor(1.5374, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8633/17426 [13:54<16:36,  8.83it/s]

tensor(1.5023, grad_fn=<NllLossBackward0>)
tensor(1.4657, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8635/17426 [13:54<18:12,  8.05it/s]

tensor(1.4522, grad_fn=<NllLossBackward0>)
tensor(1.4751, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8637/17426 [13:54<19:11,  7.63it/s]

tensor(1.4928, grad_fn=<NllLossBackward0>)
tensor(1.4930, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8639/17426 [13:55<20:33,  7.13it/s]

tensor(1.4975, grad_fn=<NllLossBackward0>)
tensor(1.5250, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8641/17426 [13:55<19:03,  7.68it/s]

tensor(1.5496, grad_fn=<NllLossBackward0>)
tensor(1.4837, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8643/17426 [13:55<18:32,  7.89it/s]

tensor(1.5138, grad_fn=<NllLossBackward0>)
tensor(1.5082, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8645/17426 [13:55<18:02,  8.12it/s]

tensor(1.4898, grad_fn=<NllLossBackward0>)
tensor(1.5141, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8647/17426 [13:56<18:01,  8.12it/s]

tensor(1.4781, grad_fn=<NllLossBackward0>)
tensor(1.5092, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8649/17426 [13:56<17:27,  8.38it/s]

tensor(1.4783, grad_fn=<NllLossBackward0>)
tensor(1.5474, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8651/17426 [13:56<18:18,  7.99it/s]

tensor(1.5260, grad_fn=<NllLossBackward0>)
tensor(1.5123, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8653/17426 [13:56<20:45,  7.05it/s]

tensor(1.5145, grad_fn=<NllLossBackward0>)
tensor(1.5341, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8655/17426 [13:57<20:38,  7.08it/s]

tensor(1.5383, grad_fn=<NllLossBackward0>)
tensor(1.5168, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8657/17426 [13:57<20:21,  7.18it/s]

tensor(1.5347, grad_fn=<NllLossBackward0>)
tensor(1.5126, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8659/17426 [13:57<21:14,  6.88it/s]

tensor(1.4347, grad_fn=<NllLossBackward0>)
tensor(1.5330, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8661/17426 [13:58<21:00,  6.95it/s]

tensor(1.4961, grad_fn=<NllLossBackward0>)
tensor(1.4948, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8663/17426 [13:58<22:21,  6.53it/s]

tensor(1.4971, grad_fn=<NllLossBackward0>)
tensor(1.5272, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8665/17426 [13:58<19:55,  7.33it/s]

tensor(1.4978, grad_fn=<NllLossBackward0>)
tensor(1.4698, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8667/17426 [13:58<17:53,  8.16it/s]

tensor(1.5234, grad_fn=<NllLossBackward0>)
tensor(1.5077, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8669/17426 [13:59<16:36,  8.79it/s]

tensor(1.5025, grad_fn=<NllLossBackward0>)
tensor(1.5227, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8672/17426 [13:59<15:15,  9.56it/s]

tensor(1.5534, grad_fn=<NllLossBackward0>)
tensor(1.4606, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8675/17426 [13:59<14:13, 10.25it/s]

tensor(1.4904, grad_fn=<NllLossBackward0>)
tensor(1.5075, grad_fn=<NllLossBackward0>)
tensor(1.5129, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8677/17426 [13:59<14:14, 10.24it/s]

tensor(1.5348, grad_fn=<NllLossBackward0>)
tensor(1.4884, grad_fn=<NllLossBackward0>)
tensor(1.5271, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8681/17426 [14:00<13:55, 10.46it/s]

tensor(1.5505, grad_fn=<NllLossBackward0>)
tensor(1.4919, grad_fn=<NllLossBackward0>)
tensor(1.5079, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8683/17426 [14:00<13:53, 10.48it/s]

tensor(1.5025, grad_fn=<NllLossBackward0>)
tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.5143, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8687/17426 [14:00<13:29, 10.79it/s]

tensor(1.5104, grad_fn=<NllLossBackward0>)
tensor(1.5038, grad_fn=<NllLossBackward0>)
tensor(1.5535, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8689/17426 [14:01<13:40, 10.65it/s]

tensor(1.5252, grad_fn=<NllLossBackward0>)
tensor(1.5442, grad_fn=<NllLossBackward0>)
tensor(1.4969, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8693/17426 [14:01<13:49, 10.53it/s]

tensor(1.4901, grad_fn=<NllLossBackward0>)
tensor(1.4740, grad_fn=<NllLossBackward0>)
tensor(1.5330, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8695/17426 [14:01<13:53, 10.48it/s]

tensor(1.5045, grad_fn=<NllLossBackward0>)
tensor(1.5274, grad_fn=<NllLossBackward0>)
tensor(1.5121, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8699/17426 [14:01<13:24, 10.85it/s]

tensor(1.5291, grad_fn=<NllLossBackward0>)
tensor(1.5348, grad_fn=<NllLossBackward0>)
tensor(1.4865, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8701/17426 [14:02<13:40, 10.64it/s]

tensor(1.5078, grad_fn=<NllLossBackward0>)
tensor(1.5005, grad_fn=<NllLossBackward0>)
tensor(1.4924, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8705/17426 [14:02<13:52, 10.47it/s]

tensor(1.5283, grad_fn=<NllLossBackward0>)
tensor(1.4514, grad_fn=<NllLossBackward0>)
tensor(1.4700, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8707/17426 [14:02<13:57, 10.41it/s]

tensor(1.5018, grad_fn=<NllLossBackward0>)
tensor(1.5168, grad_fn=<NllLossBackward0>)
tensor(1.5033, grad_fn=<NllLossBackward0>)


 50%|████▉     | 8711/17426 [14:03<13:36, 10.67it/s]

tensor(1.4959, grad_fn=<NllLossBackward0>)
tensor(1.5321, grad_fn=<NllLossBackward0>)
tensor(1.4619, grad_fn=<NllLossBackward0>)


 50%|█████     | 8713/17426 [14:03<13:49, 10.50it/s]

tensor(1.5331, grad_fn=<NllLossBackward0>)
tensor(1.5324, grad_fn=<NllLossBackward0>)
tensor(1.4883, grad_fn=<NllLossBackward0>)


 50%|█████     | 8717/17426 [14:03<13:36, 10.67it/s]

tensor(1.5122, grad_fn=<NllLossBackward0>)
tensor(1.5160, grad_fn=<NllLossBackward0>)
tensor(1.4724, grad_fn=<NllLossBackward0>)


 50%|█████     | 8719/17426 [14:03<13:54, 10.44it/s]

tensor(1.4955, grad_fn=<NllLossBackward0>)
tensor(1.4751, grad_fn=<NllLossBackward0>)
tensor(1.5233, grad_fn=<NllLossBackward0>)


 50%|█████     | 8723/17426 [14:04<13:48, 10.51it/s]

tensor(1.5162, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)
tensor(1.5265, grad_fn=<NllLossBackward0>)


 50%|█████     | 8725/17426 [14:04<14:08, 10.26it/s]

tensor(1.4815, grad_fn=<NllLossBackward0>)
tensor(1.5349, grad_fn=<NllLossBackward0>)
tensor(1.5522, grad_fn=<NllLossBackward0>)


 50%|█████     | 8729/17426 [14:04<13:32, 10.70it/s]

tensor(1.5155, grad_fn=<NllLossBackward0>)
tensor(1.5464, grad_fn=<NllLossBackward0>)
tensor(1.5156, grad_fn=<NllLossBackward0>)


 50%|█████     | 8731/17426 [14:05<13:47, 10.50it/s]

tensor(1.5033, grad_fn=<NllLossBackward0>)
tensor(1.5273, grad_fn=<NllLossBackward0>)
tensor(1.5074, grad_fn=<NllLossBackward0>)


 50%|█████     | 8735/17426 [14:05<13:30, 10.72it/s]

tensor(1.5048, grad_fn=<NllLossBackward0>)
tensor(1.4655, grad_fn=<NllLossBackward0>)
tensor(1.5001, grad_fn=<NllLossBackward0>)


 50%|█████     | 8737/17426 [14:05<14:06, 10.26it/s]

tensor(1.5281, grad_fn=<NllLossBackward0>)
tensor(1.4801, grad_fn=<NllLossBackward0>)
tensor(1.5301, grad_fn=<NllLossBackward0>)


 50%|█████     | 8741/17426 [14:05<13:38, 10.61it/s]

tensor(1.4686, grad_fn=<NllLossBackward0>)
tensor(1.5502, grad_fn=<NllLossBackward0>)
tensor(1.4706, grad_fn=<NllLossBackward0>)


 50%|█████     | 8743/17426 [14:06<13:50, 10.46it/s]

tensor(1.4710, grad_fn=<NllLossBackward0>)
tensor(1.4799, grad_fn=<NllLossBackward0>)
tensor(1.4607, grad_fn=<NllLossBackward0>)


 50%|█████     | 8747/17426 [14:06<13:43, 10.54it/s]

tensor(1.4999, grad_fn=<NllLossBackward0>)
tensor(1.5407, grad_fn=<NllLossBackward0>)
tensor(1.5112, grad_fn=<NllLossBackward0>)


 50%|█████     | 8749/17426 [14:06<13:43, 10.54it/s]

tensor(1.5273, grad_fn=<NllLossBackward0>)
tensor(1.5264, grad_fn=<NllLossBackward0>)
tensor(1.5371, grad_fn=<NllLossBackward0>)


 50%|█████     | 8753/17426 [14:07<13:20, 10.84it/s]

tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.4411, grad_fn=<NllLossBackward0>)
tensor(1.4779, grad_fn=<NllLossBackward0>)


 50%|█████     | 8755/17426 [14:07<13:45, 10.50it/s]

tensor(1.5537, grad_fn=<NllLossBackward0>)
tensor(1.4931, grad_fn=<NllLossBackward0>)
tensor(1.4740, grad_fn=<NllLossBackward0>)


 50%|█████     | 8759/17426 [14:07<13:40, 10.56it/s]

tensor(1.4571, grad_fn=<NllLossBackward0>)
tensor(1.4835, grad_fn=<NllLossBackward0>)
tensor(1.4694, grad_fn=<NllLossBackward0>)


 50%|█████     | 8761/17426 [14:07<13:56, 10.36it/s]

tensor(1.5034, grad_fn=<NllLossBackward0>)
tensor(1.5078, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)


 50%|█████     | 8765/17426 [14:08<13:34, 10.63it/s]

tensor(1.4388, grad_fn=<NllLossBackward0>)
tensor(1.4865, grad_fn=<NllLossBackward0>)
tensor(1.5091, grad_fn=<NllLossBackward0>)


 50%|█████     | 8767/17426 [14:08<13:56, 10.35it/s]

tensor(1.4806, grad_fn=<NllLossBackward0>)
tensor(1.4903, grad_fn=<NllLossBackward0>)


 50%|█████     | 8769/17426 [14:08<15:59,  9.03it/s]

tensor(1.5011, grad_fn=<NllLossBackward0>)
tensor(1.5138, grad_fn=<NllLossBackward0>)


 50%|█████     | 8771/17426 [14:09<17:35,  8.20it/s]

tensor(1.4773, grad_fn=<NllLossBackward0>)
tensor(1.5043, grad_fn=<NllLossBackward0>)


 50%|█████     | 8773/17426 [14:09<18:40,  7.72it/s]

tensor(1.5347, grad_fn=<NllLossBackward0>)
tensor(1.5322, grad_fn=<NllLossBackward0>)


 50%|█████     | 8775/17426 [14:09<19:08,  7.53it/s]

tensor(1.5046, grad_fn=<NllLossBackward0>)
tensor(1.5167, grad_fn=<NllLossBackward0>)


 50%|█████     | 8777/17426 [14:09<18:49,  7.66it/s]

tensor(1.5385, grad_fn=<NllLossBackward0>)
tensor(1.5152, grad_fn=<NllLossBackward0>)


 50%|█████     | 8779/17426 [14:10<18:18,  7.87it/s]

tensor(1.5069, grad_fn=<NllLossBackward0>)
tensor(1.5017, grad_fn=<NllLossBackward0>)


 50%|█████     | 8781/17426 [14:10<19:08,  7.52it/s]

tensor(1.5422, grad_fn=<NllLossBackward0>)
tensor(1.5223, grad_fn=<NllLossBackward0>)


 50%|█████     | 8783/17426 [14:10<18:38,  7.72it/s]

tensor(1.5219, grad_fn=<NllLossBackward0>)
tensor(1.5261, grad_fn=<NllLossBackward0>)


 50%|█████     | 8785/17426 [14:10<18:34,  7.76it/s]

tensor(1.4905, grad_fn=<NllLossBackward0>)
tensor(1.5026, grad_fn=<NllLossBackward0>)


 50%|█████     | 8787/17426 [14:11<19:08,  7.52it/s]

tensor(1.5152, grad_fn=<NllLossBackward0>)
tensor(1.5195, grad_fn=<NllLossBackward0>)


 50%|█████     | 8789/17426 [14:11<20:55,  6.88it/s]

tensor(1.4875, grad_fn=<NllLossBackward0>)
tensor(1.4861, grad_fn=<NllLossBackward0>)


 50%|█████     | 8791/17426 [14:11<20:42,  6.95it/s]

tensor(1.5481, grad_fn=<NllLossBackward0>)
tensor(1.5297, grad_fn=<NllLossBackward0>)


 50%|█████     | 8793/17426 [14:12<20:59,  6.85it/s]

tensor(1.4953, grad_fn=<NllLossBackward0>)
tensor(1.4890, grad_fn=<NllLossBackward0>)


 50%|█████     | 8795/17426 [14:12<21:13,  6.78it/s]

tensor(1.5389, grad_fn=<NllLossBackward0>)
tensor(1.4751, grad_fn=<NllLossBackward0>)


 50%|█████     | 8797/17426 [14:12<21:08,  6.80it/s]

tensor(1.5046, grad_fn=<NllLossBackward0>)
tensor(1.5152, grad_fn=<NllLossBackward0>)


 50%|█████     | 8799/17426 [14:12<21:41,  6.63it/s]

tensor(1.4995, grad_fn=<NllLossBackward0>)
tensor(1.5185, grad_fn=<NllLossBackward0>)


 51%|█████     | 8801/17426 [14:13<19:46,  7.27it/s]

tensor(1.5305, grad_fn=<NllLossBackward0>)
tensor(1.5869, grad_fn=<NllLossBackward0>)


 51%|█████     | 8803/17426 [14:13<17:32,  8.19it/s]

tensor(1.4468, grad_fn=<NllLossBackward0>)
tensor(1.4878, grad_fn=<NllLossBackward0>)


 51%|█████     | 8805/17426 [14:13<16:29,  8.72it/s]

tensor(1.5244, grad_fn=<NllLossBackward0>)
tensor(1.5223, grad_fn=<NllLossBackward0>)


 51%|█████     | 8807/17426 [14:13<15:44,  9.12it/s]

tensor(1.5144, grad_fn=<NllLossBackward0>)
tensor(1.4711, grad_fn=<NllLossBackward0>)


 51%|█████     | 8809/17426 [14:14<16:34,  8.67it/s]

tensor(1.4593, grad_fn=<NllLossBackward0>)
tensor(1.5363, grad_fn=<NllLossBackward0>)


 51%|█████     | 8811/17426 [14:14<16:00,  8.97it/s]

tensor(1.4652, grad_fn=<NllLossBackward0>)
tensor(1.4943, grad_fn=<NllLossBackward0>)
tensor(1.5437, grad_fn=<NllLossBackward0>)


 51%|█████     | 8815/17426 [14:14<13:58, 10.27it/s]

tensor(1.4971, grad_fn=<NllLossBackward0>)
tensor(1.5568, grad_fn=<NllLossBackward0>)
tensor(1.4775, grad_fn=<NllLossBackward0>)


 51%|█████     | 8817/17426 [14:14<13:51, 10.36it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.4730, grad_fn=<NllLossBackward0>)
tensor(1.4947, grad_fn=<NllLossBackward0>)


 51%|█████     | 8821/17426 [14:15<13:47, 10.40it/s]

tensor(1.5149, grad_fn=<NllLossBackward0>)
tensor(1.5510, grad_fn=<NllLossBackward0>)
tensor(1.5058, grad_fn=<NllLossBackward0>)


 51%|█████     | 8823/17426 [14:15<13:49, 10.37it/s]

tensor(1.4855, grad_fn=<NllLossBackward0>)
tensor(1.4965, grad_fn=<NllLossBackward0>)
tensor(1.5205, grad_fn=<NllLossBackward0>)


 51%|█████     | 8827/17426 [14:15<13:28, 10.64it/s]

tensor(1.4941, grad_fn=<NllLossBackward0>)
tensor(1.4773, grad_fn=<NllLossBackward0>)
tensor(1.5205, grad_fn=<NllLossBackward0>)


 51%|█████     | 8829/17426 [14:16<13:38, 10.50it/s]

tensor(1.4992, grad_fn=<NllLossBackward0>)
tensor(1.4871, grad_fn=<NllLossBackward0>)
tensor(1.5274, grad_fn=<NllLossBackward0>)


 51%|█████     | 8833/17426 [14:16<13:36, 10.53it/s]

tensor(1.5599, grad_fn=<NllLossBackward0>)
tensor(1.4865, grad_fn=<NllLossBackward0>)
tensor(1.5676, grad_fn=<NllLossBackward0>)


 51%|█████     | 8835/17426 [14:16<13:47, 10.38it/s]

tensor(1.4705, grad_fn=<NllLossBackward0>)
tensor(1.5425, grad_fn=<NllLossBackward0>)
tensor(1.5188, grad_fn=<NllLossBackward0>)


 51%|█████     | 8839/17426 [14:16<13:24, 10.67it/s]

tensor(1.4978, grad_fn=<NllLossBackward0>)
tensor(1.5099, grad_fn=<NllLossBackward0>)
tensor(1.4924, grad_fn=<NllLossBackward0>)


 51%|█████     | 8841/17426 [14:17<13:55, 10.27it/s]

tensor(1.4934, grad_fn=<NllLossBackward0>)
tensor(1.5317, grad_fn=<NllLossBackward0>)
tensor(1.5082, grad_fn=<NllLossBackward0>)


 51%|█████     | 8845/17426 [14:17<13:24, 10.66it/s]

tensor(1.5363, grad_fn=<NllLossBackward0>)
tensor(1.4993, grad_fn=<NllLossBackward0>)
tensor(1.5145, grad_fn=<NllLossBackward0>)


 51%|█████     | 8847/17426 [14:17<13:42, 10.43it/s]

tensor(1.5057, grad_fn=<NllLossBackward0>)
tensor(1.5157, grad_fn=<NllLossBackward0>)
tensor(1.5000, grad_fn=<NllLossBackward0>)


 51%|█████     | 8851/17426 [14:18<13:22, 10.69it/s]

tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.5223, grad_fn=<NllLossBackward0>)
tensor(1.4996, grad_fn=<NllLossBackward0>)


 51%|█████     | 8853/17426 [14:18<13:55, 10.27it/s]

tensor(1.4815, grad_fn=<NllLossBackward0>)
tensor(1.4739, grad_fn=<NllLossBackward0>)
tensor(1.4970, grad_fn=<NllLossBackward0>)


 51%|█████     | 8857/17426 [14:18<13:20, 10.70it/s]

tensor(1.4834, grad_fn=<NllLossBackward0>)
tensor(1.5312, grad_fn=<NllLossBackward0>)
tensor(1.5265, grad_fn=<NllLossBackward0>)


 51%|█████     | 8859/17426 [14:18<13:33, 10.53it/s]

tensor(1.4694, grad_fn=<NllLossBackward0>)
tensor(1.5418, grad_fn=<NllLossBackward0>)
tensor(1.5422, grad_fn=<NllLossBackward0>)


 51%|█████     | 8863/17426 [14:19<13:35, 10.50it/s]

tensor(1.5054, grad_fn=<NllLossBackward0>)
tensor(1.4902, grad_fn=<NllLossBackward0>)
tensor(1.5137, grad_fn=<NllLossBackward0>)


 51%|█████     | 8865/17426 [14:19<13:44, 10.39it/s]

tensor(1.4830, grad_fn=<NllLossBackward0>)
tensor(1.4914, grad_fn=<NllLossBackward0>)
tensor(1.4650, grad_fn=<NllLossBackward0>)


 51%|█████     | 8869/17426 [14:19<13:20, 10.69it/s]

tensor(1.5456, grad_fn=<NllLossBackward0>)
tensor(1.4763, grad_fn=<NllLossBackward0>)
tensor(1.5488, grad_fn=<NllLossBackward0>)


 51%|█████     | 8871/17426 [14:20<13:30, 10.56it/s]

tensor(1.5153, grad_fn=<NllLossBackward0>)
tensor(1.5316, grad_fn=<NllLossBackward0>)
tensor(1.5110, grad_fn=<NllLossBackward0>)


 51%|█████     | 8875/17426 [14:20<13:43, 10.38it/s]

tensor(1.4900, grad_fn=<NllLossBackward0>)
tensor(1.4925, grad_fn=<NllLossBackward0>)
tensor(1.4980, grad_fn=<NllLossBackward0>)


 51%|█████     | 8877/17426 [14:20<13:42, 10.39it/s]

tensor(1.5194, grad_fn=<NllLossBackward0>)
tensor(1.5515, grad_fn=<NllLossBackward0>)
tensor(1.5084, grad_fn=<NllLossBackward0>)


 51%|█████     | 8881/17426 [14:20<13:09, 10.82it/s]

tensor(1.4849, grad_fn=<NllLossBackward0>)
tensor(1.5242, grad_fn=<NllLossBackward0>)
tensor(1.5328, grad_fn=<NllLossBackward0>)


 51%|█████     | 8883/17426 [14:21<13:22, 10.64it/s]

tensor(1.5172, grad_fn=<NllLossBackward0>)
tensor(1.5137, grad_fn=<NllLossBackward0>)
tensor(1.4864, grad_fn=<NllLossBackward0>)


 51%|█████     | 8887/17426 [14:21<13:26, 10.58it/s]

tensor(1.5223, grad_fn=<NllLossBackward0>)
tensor(1.4759, grad_fn=<NllLossBackward0>)
tensor(1.5619, grad_fn=<NllLossBackward0>)


 51%|█████     | 8889/17426 [14:21<13:43, 10.37it/s]

tensor(1.4997, grad_fn=<NllLossBackward0>)
tensor(1.4843, grad_fn=<NllLossBackward0>)
tensor(1.4998, grad_fn=<NllLossBackward0>)


 51%|█████     | 8893/17426 [14:22<13:39, 10.42it/s]

tensor(1.5119, grad_fn=<NllLossBackward0>)
tensor(1.5021, grad_fn=<NllLossBackward0>)
tensor(1.4848, grad_fn=<NllLossBackward0>)


 51%|█████     | 8895/17426 [14:22<13:41, 10.39it/s]

tensor(1.4793, grad_fn=<NllLossBackward0>)
tensor(1.5317, grad_fn=<NllLossBackward0>)
tensor(1.5094, grad_fn=<NllLossBackward0>)


 51%|█████     | 8899/17426 [14:22<13:16, 10.71it/s]

tensor(1.5273, grad_fn=<NllLossBackward0>)
tensor(1.5050, grad_fn=<NllLossBackward0>)
tensor(1.4511, grad_fn=<NllLossBackward0>)


 51%|█████     | 8901/17426 [14:22<13:25, 10.58it/s]

tensor(1.4770, grad_fn=<NllLossBackward0>)
tensor(1.5067, grad_fn=<NllLossBackward0>)
tensor(1.5039, grad_fn=<NllLossBackward0>)


 51%|█████     | 8903/17426 [14:23<13:57, 10.18it/s]

tensor(1.5100, grad_fn=<NllLossBackward0>)
tensor(1.5152, grad_fn=<NllLossBackward0>)


 51%|█████     | 8906/17426 [14:23<16:27,  8.63it/s]

tensor(1.5048, grad_fn=<NllLossBackward0>)
tensor(1.4982, grad_fn=<NllLossBackward0>)


 51%|█████     | 8908/17426 [14:23<17:52,  7.94it/s]

tensor(1.5518, grad_fn=<NllLossBackward0>)
tensor(1.5687, grad_fn=<NllLossBackward0>)


 51%|█████     | 8910/17426 [14:24<18:00,  7.88it/s]

tensor(1.5045, grad_fn=<NllLossBackward0>)
tensor(1.5064, grad_fn=<NllLossBackward0>)


 51%|█████     | 8912/17426 [14:24<18:22,  7.72it/s]

tensor(1.4830, grad_fn=<NllLossBackward0>)
tensor(1.5115, grad_fn=<NllLossBackward0>)


 51%|█████     | 8914/17426 [14:24<19:24,  7.31it/s]

tensor(1.5253, grad_fn=<NllLossBackward0>)
tensor(1.4722, grad_fn=<NllLossBackward0>)


 51%|█████     | 8916/17426 [14:24<19:20,  7.33it/s]

tensor(1.4807, grad_fn=<NllLossBackward0>)
tensor(1.5201, grad_fn=<NllLossBackward0>)


 51%|█████     | 8918/17426 [14:25<17:44,  7.99it/s]

tensor(1.5287, grad_fn=<NllLossBackward0>)
tensor(1.5092, grad_fn=<NllLossBackward0>)


 51%|█████     | 8920/17426 [14:25<19:04,  7.43it/s]

tensor(1.4911, grad_fn=<NllLossBackward0>)
tensor(1.4470, grad_fn=<NllLossBackward0>)


 51%|█████     | 8922/17426 [14:25<19:31,  7.26it/s]

tensor(1.5199, grad_fn=<NllLossBackward0>)
tensor(1.4896, grad_fn=<NllLossBackward0>)


 51%|█████     | 8924/17426 [14:26<20:15,  7.00it/s]

tensor(1.4996, grad_fn=<NllLossBackward0>)
tensor(1.4987, grad_fn=<NllLossBackward0>)


 51%|█████     | 8926/17426 [14:26<20:55,  6.77it/s]

tensor(1.5465, grad_fn=<NllLossBackward0>)
tensor(1.4781, grad_fn=<NllLossBackward0>)


 51%|█████     | 8928/17426 [14:26<22:02,  6.43it/s]

tensor(1.5117, grad_fn=<NllLossBackward0>)
tensor(1.4722, grad_fn=<NllLossBackward0>)


 51%|█████     | 8930/17426 [14:26<21:33,  6.57it/s]

tensor(1.5063, grad_fn=<NllLossBackward0>)
tensor(1.5014, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8932/17426 [14:27<21:15,  6.66it/s]

tensor(1.4942, grad_fn=<NllLossBackward0>)
tensor(1.5302, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8935/17426 [14:27<16:18,  8.68it/s]

tensor(1.4468, grad_fn=<NllLossBackward0>)
tensor(1.4627, grad_fn=<NllLossBackward0>)
tensor(1.4989, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8938/17426 [14:27<14:44,  9.59it/s]

tensor(1.4933, grad_fn=<NllLossBackward0>)
tensor(1.4968, grad_fn=<NllLossBackward0>)
tensor(1.5128, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8941/17426 [14:28<14:10,  9.98it/s]

tensor(1.5431, grad_fn=<NllLossBackward0>)
tensor(1.5234, grad_fn=<NllLossBackward0>)
tensor(1.5364, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8944/17426 [14:28<13:41, 10.33it/s]

tensor(1.5195, grad_fn=<NllLossBackward0>)
tensor(1.5051, grad_fn=<NllLossBackward0>)
tensor(1.4654, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8946/17426 [14:28<13:49, 10.23it/s]

tensor(1.5317, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)
tensor(1.4710, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8949/17426 [14:28<14:07, 10.00it/s]

tensor(1.4670, grad_fn=<NllLossBackward0>)
tensor(1.4798, grad_fn=<NllLossBackward0>)
tensor(1.5222, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8953/17426 [14:29<13:20, 10.59it/s]

tensor(1.4422, grad_fn=<NllLossBackward0>)
tensor(1.4830, grad_fn=<NllLossBackward0>)
tensor(1.4822, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8955/17426 [14:29<13:27, 10.49it/s]

tensor(1.4885, grad_fn=<NllLossBackward0>)
tensor(1.4833, grad_fn=<NllLossBackward0>)
tensor(1.4640, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8959/17426 [14:29<13:25, 10.51it/s]

tensor(1.5252, grad_fn=<NllLossBackward0>)
tensor(1.4817, grad_fn=<NllLossBackward0>)
tensor(1.4728, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8961/17426 [14:30<13:24, 10.52it/s]

tensor(1.4610, grad_fn=<NllLossBackward0>)
tensor(1.5222, grad_fn=<NllLossBackward0>)
tensor(1.4785, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8965/17426 [14:30<13:16, 10.63it/s]

tensor(1.5057, grad_fn=<NllLossBackward0>)
tensor(1.5188, grad_fn=<NllLossBackward0>)
tensor(1.4749, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8967/17426 [14:30<13:23, 10.53it/s]

tensor(1.4910, grad_fn=<NllLossBackward0>)
tensor(1.5197, grad_fn=<NllLossBackward0>)
tensor(1.4771, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8971/17426 [14:31<13:19, 10.58it/s]

tensor(1.5062, grad_fn=<NllLossBackward0>)
tensor(1.5092, grad_fn=<NllLossBackward0>)
tensor(1.5233, grad_fn=<NllLossBackward0>)


 51%|█████▏    | 8973/17426 [14:31<13:23, 10.52it/s]

tensor(1.5160, grad_fn=<NllLossBackward0>)
tensor(1.5447, grad_fn=<NllLossBackward0>)
tensor(1.5139, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 8977/17426 [14:31<13:04, 10.77it/s]

tensor(1.5146, grad_fn=<NllLossBackward0>)
tensor(1.5173, grad_fn=<NllLossBackward0>)
tensor(1.5547, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 8979/17426 [14:31<13:11, 10.67it/s]

tensor(1.5270, grad_fn=<NllLossBackward0>)
tensor(1.5042, grad_fn=<NllLossBackward0>)
tensor(1.4958, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 8983/17426 [14:32<13:14, 10.62it/s]

tensor(1.4601, grad_fn=<NllLossBackward0>)
tensor(1.4790, grad_fn=<NllLossBackward0>)
tensor(1.5149, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 8985/17426 [14:32<13:24, 10.49it/s]

tensor(1.5184, grad_fn=<NllLossBackward0>)
tensor(1.4312, grad_fn=<NllLossBackward0>)
tensor(1.4953, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 8989/17426 [14:32<12:54, 10.89it/s]

tensor(1.4788, grad_fn=<NllLossBackward0>)
tensor(1.5350, grad_fn=<NllLossBackward0>)
tensor(1.4793, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 8991/17426 [14:32<13:27, 10.45it/s]

tensor(1.4348, grad_fn=<NllLossBackward0>)
tensor(1.5247, grad_fn=<NllLossBackward0>)
tensor(1.5324, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 8995/17426 [14:33<13:06, 10.72it/s]

tensor(1.5259, grad_fn=<NllLossBackward0>)
tensor(1.4866, grad_fn=<NllLossBackward0>)
tensor(1.5137, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 8997/17426 [14:33<13:17, 10.57it/s]

tensor(1.4711, grad_fn=<NllLossBackward0>)
tensor(1.4759, grad_fn=<NllLossBackward0>)
tensor(1.4654, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9001/17426 [14:33<12:54, 10.87it/s]

tensor(1.5417, grad_fn=<NllLossBackward0>)
tensor(1.4901, grad_fn=<NllLossBackward0>)
tensor(1.5098, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9003/17426 [14:34<13:17, 10.56it/s]

tensor(1.5546, grad_fn=<NllLossBackward0>)
tensor(1.5139, grad_fn=<NllLossBackward0>)
tensor(1.5177, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9007/17426 [14:34<13:05, 10.71it/s]

tensor(1.5211, grad_fn=<NllLossBackward0>)
tensor(1.5287, grad_fn=<NllLossBackward0>)
tensor(1.5405, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9009/17426 [14:34<13:13, 10.61it/s]

tensor(1.4784, grad_fn=<NllLossBackward0>)
tensor(1.5782, grad_fn=<NllLossBackward0>)
tensor(1.4620, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9013/17426 [14:34<13:15, 10.57it/s]

tensor(1.5378, grad_fn=<NllLossBackward0>)
tensor(1.5432, grad_fn=<NllLossBackward0>)
tensor(1.5047, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9015/17426 [14:35<13:21, 10.49it/s]

tensor(1.5104, grad_fn=<NllLossBackward0>)
tensor(1.5221, grad_fn=<NllLossBackward0>)
tensor(1.5027, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9019/17426 [14:35<13:00, 10.78it/s]

tensor(1.5128, grad_fn=<NllLossBackward0>)
tensor(1.4496, grad_fn=<NllLossBackward0>)
tensor(1.4885, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9021/17426 [14:35<13:01, 10.76it/s]

tensor(1.4868, grad_fn=<NllLossBackward0>)
tensor(1.5162, grad_fn=<NllLossBackward0>)
tensor(1.5229, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9025/17426 [14:36<13:00, 10.76it/s]

tensor(1.5402, grad_fn=<NllLossBackward0>)
tensor(1.5348, grad_fn=<NllLossBackward0>)
tensor(1.4700, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9027/17426 [14:36<13:16, 10.54it/s]

tensor(1.4763, grad_fn=<NllLossBackward0>)
tensor(1.5098, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9031/17426 [14:36<13:03, 10.72it/s]

tensor(1.4943, grad_fn=<NllLossBackward0>)
tensor(1.5048, grad_fn=<NllLossBackward0>)
tensor(1.4687, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9033/17426 [14:36<13:08, 10.64it/s]

tensor(1.4837, grad_fn=<NllLossBackward0>)
tensor(1.4719, grad_fn=<NllLossBackward0>)
tensor(1.5636, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9037/17426 [14:37<13:39, 10.24it/s]

tensor(1.4813, grad_fn=<NllLossBackward0>)
tensor(1.5139, grad_fn=<NllLossBackward0>)
tensor(1.4816, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9039/17426 [14:37<15:21,  9.10it/s]

tensor(1.4912, grad_fn=<NllLossBackward0>)
tensor(1.5057, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9041/17426 [14:37<16:41,  8.37it/s]

tensor(1.4933, grad_fn=<NllLossBackward0>)
tensor(1.4907, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9043/17426 [14:38<17:09,  8.14it/s]

tensor(1.5250, grad_fn=<NllLossBackward0>)
tensor(1.5009, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9045/17426 [14:38<17:50,  7.83it/s]

tensor(1.4994, grad_fn=<NllLossBackward0>)
tensor(1.4720, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9047/17426 [14:38<17:09,  8.14it/s]

tensor(1.5047, grad_fn=<NllLossBackward0>)
tensor(1.5079, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9049/17426 [14:38<16:36,  8.41it/s]

tensor(1.4869, grad_fn=<NllLossBackward0>)
tensor(1.4827, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9051/17426 [14:39<16:14,  8.59it/s]

tensor(1.5176, grad_fn=<NllLossBackward0>)
tensor(1.5261, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9053/17426 [14:39<17:05,  8.17it/s]

tensor(1.5195, grad_fn=<NllLossBackward0>)
tensor(1.5235, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9055/17426 [14:39<16:22,  8.52it/s]

tensor(1.5137, grad_fn=<NllLossBackward0>)
tensor(1.4911, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9057/17426 [14:39<16:13,  8.60it/s]

tensor(1.5128, grad_fn=<NllLossBackward0>)
tensor(1.5203, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9059/17426 [14:40<16:21,  8.52it/s]

tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.5002, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9061/17426 [14:40<17:03,  8.18it/s]

tensor(1.5036, grad_fn=<NllLossBackward0>)
tensor(1.5162, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9063/17426 [14:40<18:40,  7.46it/s]

tensor(1.4706, grad_fn=<NllLossBackward0>)
tensor(1.4814, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9065/17426 [14:40<18:56,  7.36it/s]

tensor(1.4780, grad_fn=<NllLossBackward0>)
tensor(1.4982, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9067/17426 [14:41<19:31,  7.13it/s]

tensor(1.5180, grad_fn=<NllLossBackward0>)
tensor(1.4910, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9069/17426 [14:41<21:05,  6.60it/s]

tensor(1.5302, grad_fn=<NllLossBackward0>)
tensor(1.5122, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9071/17426 [14:41<21:24,  6.50it/s]

tensor(1.5347, grad_fn=<NllLossBackward0>)
tensor(1.5239, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9073/17426 [14:42<20:57,  6.64it/s]

tensor(1.5397, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9075/17426 [14:42<18:56,  7.35it/s]

tensor(1.5231, grad_fn=<NllLossBackward0>)
tensor(1.5065, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9077/17426 [14:42<18:05,  7.69it/s]

tensor(1.4599, grad_fn=<NllLossBackward0>)
tensor(1.4904, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9079/17426 [14:42<17:16,  8.05it/s]

tensor(1.5129, grad_fn=<NllLossBackward0>)
tensor(1.5212, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9081/17426 [14:43<16:42,  8.32it/s]

tensor(1.5055, grad_fn=<NllLossBackward0>)
tensor(1.4730, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9083/17426 [14:43<16:06,  8.63it/s]

tensor(1.4789, grad_fn=<NllLossBackward0>)
tensor(1.4772, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9085/17426 [14:43<15:21,  9.05it/s]

tensor(1.5048, grad_fn=<NllLossBackward0>)
tensor(1.5872, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9087/17426 [14:43<16:11,  8.59it/s]

tensor(1.4885, grad_fn=<NllLossBackward0>)
tensor(1.4904, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9089/17426 [14:43<15:33,  8.93it/s]

tensor(1.4897, grad_fn=<NllLossBackward0>)
tensor(1.5147, grad_fn=<NllLossBackward0>)
tensor(1.5426, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9093/17426 [14:44<13:40, 10.16it/s]

tensor(1.5329, grad_fn=<NllLossBackward0>)
tensor(1.4542, grad_fn=<NllLossBackward0>)
tensor(1.5271, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9096/17426 [14:44<13:19, 10.42it/s]

tensor(1.4889, grad_fn=<NllLossBackward0>)
tensor(1.4990, grad_fn=<NllLossBackward0>)
tensor(1.5253, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9098/17426 [14:44<13:41, 10.13it/s]

tensor(1.4980, grad_fn=<NllLossBackward0>)
tensor(1.4933, grad_fn=<NllLossBackward0>)
tensor(1.4899, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9102/17426 [14:45<13:09, 10.55it/s]

tensor(1.4953, grad_fn=<NllLossBackward0>)
tensor(1.5089, grad_fn=<NllLossBackward0>)
tensor(1.4940, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9104/17426 [14:45<13:15, 10.46it/s]

tensor(1.4915, grad_fn=<NllLossBackward0>)
tensor(1.4909, grad_fn=<NllLossBackward0>)
tensor(1.4890, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9108/17426 [14:45<13:00, 10.66it/s]

tensor(1.4830, grad_fn=<NllLossBackward0>)
tensor(1.4561, grad_fn=<NllLossBackward0>)
tensor(1.5048, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9110/17426 [14:45<13:19, 10.40it/s]

tensor(1.4952, grad_fn=<NllLossBackward0>)
tensor(1.5203, grad_fn=<NllLossBackward0>)
tensor(1.4894, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9114/17426 [14:46<13:10, 10.51it/s]

tensor(1.4961, grad_fn=<NllLossBackward0>)
tensor(1.5071, grad_fn=<NllLossBackward0>)
tensor(1.5227, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9116/17426 [14:46<13:13, 10.47it/s]

tensor(1.4669, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)
tensor(1.5087, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9120/17426 [14:46<12:58, 10.67it/s]

tensor(1.4786, grad_fn=<NllLossBackward0>)
tensor(1.5111, grad_fn=<NllLossBackward0>)
tensor(1.5006, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9122/17426 [14:47<12:58, 10.67it/s]

tensor(1.4975, grad_fn=<NllLossBackward0>)
tensor(1.5387, grad_fn=<NllLossBackward0>)
tensor(1.5611, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9126/17426 [14:47<12:52, 10.75it/s]

tensor(1.5132, grad_fn=<NllLossBackward0>)
tensor(1.5253, grad_fn=<NllLossBackward0>)
tensor(1.4882, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9128/17426 [14:47<13:06, 10.55it/s]

tensor(1.5500, grad_fn=<NllLossBackward0>)
tensor(1.5176, grad_fn=<NllLossBackward0>)
tensor(1.5341, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9132/17426 [14:47<13:09, 10.50it/s]

tensor(1.5125, grad_fn=<NllLossBackward0>)
tensor(1.5227, grad_fn=<NllLossBackward0>)
tensor(1.5511, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9134/17426 [14:48<13:20, 10.36it/s]

tensor(1.4847, grad_fn=<NllLossBackward0>)
tensor(1.5249, grad_fn=<NllLossBackward0>)
tensor(1.5110, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9138/17426 [14:48<12:55, 10.69it/s]

tensor(1.4770, grad_fn=<NllLossBackward0>)
tensor(1.4478, grad_fn=<NllLossBackward0>)
tensor(1.5072, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9140/17426 [14:48<13:04, 10.56it/s]

tensor(1.4537, grad_fn=<NllLossBackward0>)
tensor(1.5355, grad_fn=<NllLossBackward0>)
tensor(1.4835, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9144/17426 [14:49<13:10, 10.48it/s]

tensor(1.4716, grad_fn=<NllLossBackward0>)
tensor(1.5101, grad_fn=<NllLossBackward0>)
tensor(1.5507, grad_fn=<NllLossBackward0>)


 52%|█████▏    | 9146/17426 [14:49<13:18, 10.37it/s]

tensor(1.4701, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)
tensor(1.5171, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9150/17426 [14:49<12:51, 10.72it/s]

tensor(1.5432, grad_fn=<NllLossBackward0>)
tensor(1.4699, grad_fn=<NllLossBackward0>)
tensor(1.5096, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9152/17426 [14:49<13:26, 10.25it/s]

tensor(1.5073, grad_fn=<NllLossBackward0>)
tensor(1.4547, grad_fn=<NllLossBackward0>)
tensor(1.4851, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9156/17426 [14:50<12:53, 10.69it/s]

tensor(1.5111, grad_fn=<NllLossBackward0>)
tensor(1.5340, grad_fn=<NllLossBackward0>)
tensor(1.4899, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9158/17426 [14:50<13:01, 10.57it/s]

tensor(1.5158, grad_fn=<NllLossBackward0>)
tensor(1.5140, grad_fn=<NllLossBackward0>)
tensor(1.5063, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9162/17426 [14:50<12:45, 10.80it/s]

tensor(1.5073, grad_fn=<NllLossBackward0>)
tensor(1.4829, grad_fn=<NllLossBackward0>)
tensor(1.4702, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9164/17426 [14:51<13:16, 10.37it/s]

tensor(1.5244, grad_fn=<NllLossBackward0>)
tensor(1.4946, grad_fn=<NllLossBackward0>)
tensor(1.5119, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9168/17426 [14:51<12:54, 10.66it/s]

tensor(1.5286, grad_fn=<NllLossBackward0>)
tensor(1.4568, grad_fn=<NllLossBackward0>)
tensor(1.4852, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9170/17426 [14:51<12:57, 10.62it/s]

tensor(1.5301, grad_fn=<NllLossBackward0>)
tensor(1.4739, grad_fn=<NllLossBackward0>)
tensor(1.4894, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9174/17426 [14:52<13:00, 10.57it/s]

tensor(1.4943, grad_fn=<NllLossBackward0>)
tensor(1.5663, grad_fn=<NllLossBackward0>)
tensor(1.4859, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9176/17426 [14:52<14:21,  9.58it/s]

tensor(1.5021, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9178/17426 [14:52<15:55,  8.63it/s]

tensor(1.5008, grad_fn=<NllLossBackward0>)
tensor(1.4935, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9180/17426 [14:52<16:58,  8.09it/s]

tensor(1.5113, grad_fn=<NllLossBackward0>)
tensor(1.5459, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9182/17426 [14:53<17:35,  7.81it/s]

tensor(1.4606, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9184/17426 [14:53<18:29,  7.43it/s]

tensor(1.4268, grad_fn=<NllLossBackward0>)
tensor(1.5078, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9186/17426 [14:53<18:37,  7.37it/s]

tensor(1.5550, grad_fn=<NllLossBackward0>)
tensor(1.4649, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9188/17426 [14:53<18:52,  7.28it/s]

tensor(1.4926, grad_fn=<NllLossBackward0>)
tensor(1.5047, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9190/17426 [14:54<20:17,  6.76it/s]

tensor(1.4886, grad_fn=<NllLossBackward0>)
tensor(1.5198, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9192/17426 [14:54<18:06,  7.58it/s]

tensor(1.5264, grad_fn=<NllLossBackward0>)
tensor(1.5567, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9194/17426 [14:54<18:22,  7.46it/s]

tensor(1.5245, grad_fn=<NllLossBackward0>)
tensor(1.4955, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9196/17426 [14:55<19:51,  6.90it/s]

tensor(1.4909, grad_fn=<NllLossBackward0>)
tensor(1.5103, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9198/17426 [14:55<21:06,  6.50it/s]

tensor(1.5140, grad_fn=<NllLossBackward0>)
tensor(1.4940, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9200/17426 [14:55<20:54,  6.55it/s]

tensor(1.5742, grad_fn=<NllLossBackward0>)
tensor(1.4869, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9202/17426 [14:55<21:16,  6.44it/s]

tensor(1.4962, grad_fn=<NllLossBackward0>)
tensor(1.4965, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9204/17426 [14:56<21:33,  6.36it/s]

tensor(1.5020, grad_fn=<NllLossBackward0>)
tensor(1.4567, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9206/17426 [14:56<19:43,  6.95it/s]

tensor(1.5155, grad_fn=<NllLossBackward0>)
tensor(1.5056, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9209/17426 [14:56<15:31,  8.83it/s]

tensor(1.5289, grad_fn=<NllLossBackward0>)
tensor(1.4809, grad_fn=<NllLossBackward0>)
tensor(1.4814, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9211/17426 [14:57<15:23,  8.90it/s]

tensor(1.4970, grad_fn=<NllLossBackward0>)
tensor(1.4921, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9213/17426 [14:57<15:07,  9.05it/s]

tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.5380, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9216/17426 [14:57<14:13,  9.62it/s]

tensor(1.5039, grad_fn=<NllLossBackward0>)
tensor(1.4700, grad_fn=<NllLossBackward0>)
tensor(1.4792, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9218/17426 [14:57<14:31,  9.42it/s]

tensor(1.5119, grad_fn=<NllLossBackward0>)
tensor(1.5063, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9220/17426 [14:58<14:10,  9.65it/s]

tensor(1.4856, grad_fn=<NllLossBackward0>)
tensor(1.5145, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9224/17426 [14:58<13:27, 10.16it/s]

tensor(1.5081, grad_fn=<NllLossBackward0>)
tensor(1.5518, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9226/17426 [14:58<13:18, 10.27it/s]

tensor(1.5333, grad_fn=<NllLossBackward0>)
tensor(1.4647, grad_fn=<NllLossBackward0>)
tensor(1.4946, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9230/17426 [14:58<12:42, 10.75it/s]

tensor(1.4949, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)
tensor(1.5098, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9232/17426 [14:59<12:57, 10.54it/s]

tensor(1.5409, grad_fn=<NllLossBackward0>)
tensor(1.5533, grad_fn=<NllLossBackward0>)
tensor(1.5540, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9236/17426 [14:59<13:06, 10.41it/s]

tensor(1.4641, grad_fn=<NllLossBackward0>)
tensor(1.5125, grad_fn=<NllLossBackward0>)
tensor(1.5482, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9238/17426 [14:59<13:11, 10.35it/s]

tensor(1.5245, grad_fn=<NllLossBackward0>)
tensor(1.5050, grad_fn=<NllLossBackward0>)
tensor(1.5149, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9242/17426 [15:00<12:57, 10.53it/s]

tensor(1.4966, grad_fn=<NllLossBackward0>)
tensor(1.4959, grad_fn=<NllLossBackward0>)
tensor(1.5143, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9244/17426 [15:00<12:59, 10.50it/s]

tensor(1.4845, grad_fn=<NllLossBackward0>)
tensor(1.4964, grad_fn=<NllLossBackward0>)
tensor(1.5178, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9248/17426 [15:00<13:00, 10.48it/s]

tensor(1.4865, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)
tensor(1.5048, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9250/17426 [15:00<13:04, 10.43it/s]

tensor(1.4993, grad_fn=<NllLossBackward0>)
tensor(1.4942, grad_fn=<NllLossBackward0>)
tensor(1.4960, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9254/17426 [15:01<12:54, 10.55it/s]

tensor(1.5065, grad_fn=<NllLossBackward0>)
tensor(1.5065, grad_fn=<NllLossBackward0>)
tensor(1.5006, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9256/17426 [15:01<13:04, 10.42it/s]

tensor(1.4868, grad_fn=<NllLossBackward0>)
tensor(1.5152, grad_fn=<NllLossBackward0>)
tensor(1.5077, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9260/17426 [15:01<13:00, 10.46it/s]

tensor(1.4671, grad_fn=<NllLossBackward0>)
tensor(1.4970, grad_fn=<NllLossBackward0>)
tensor(1.4775, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9262/17426 [15:02<13:16, 10.25it/s]

tensor(1.4857, grad_fn=<NllLossBackward0>)
tensor(1.5020, grad_fn=<NllLossBackward0>)
tensor(1.4975, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9266/17426 [15:02<12:54, 10.53it/s]

tensor(1.5011, grad_fn=<NllLossBackward0>)
tensor(1.4940, grad_fn=<NllLossBackward0>)
tensor(1.5000, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9268/17426 [15:02<13:12, 10.29it/s]

tensor(1.4842, grad_fn=<NllLossBackward0>)
tensor(1.5185, grad_fn=<NllLossBackward0>)
tensor(1.4696, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9272/17426 [15:03<12:46, 10.64it/s]

tensor(1.5351, grad_fn=<NllLossBackward0>)
tensor(1.4840, grad_fn=<NllLossBackward0>)
tensor(1.4828, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9274/17426 [15:03<12:58, 10.47it/s]

tensor(1.4655, grad_fn=<NllLossBackward0>)
tensor(1.4469, grad_fn=<NllLossBackward0>)
tensor(1.5338, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9278/17426 [15:03<12:41, 10.69it/s]

tensor(1.4873, grad_fn=<NllLossBackward0>)
tensor(1.5008, grad_fn=<NllLossBackward0>)
tensor(1.5632, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9280/17426 [15:03<13:02, 10.42it/s]

tensor(1.5241, grad_fn=<NllLossBackward0>)
tensor(1.4838, grad_fn=<NllLossBackward0>)
tensor(1.5273, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9284/17426 [15:04<12:36, 10.77it/s]

tensor(1.4709, grad_fn=<NllLossBackward0>)
tensor(1.5136, grad_fn=<NllLossBackward0>)
tensor(1.5260, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9286/17426 [15:04<12:47, 10.60it/s]

tensor(1.4767, grad_fn=<NllLossBackward0>)
tensor(1.5223, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9290/17426 [15:04<12:48, 10.59it/s]

tensor(1.5197, grad_fn=<NllLossBackward0>)
tensor(1.4536, grad_fn=<NllLossBackward0>)
tensor(1.4910, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9292/17426 [15:04<12:54, 10.50it/s]

tensor(1.5088, grad_fn=<NllLossBackward0>)
tensor(1.5287, grad_fn=<NllLossBackward0>)
tensor(1.4749, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9296/17426 [15:05<12:34, 10.78it/s]

tensor(1.4470, grad_fn=<NllLossBackward0>)
tensor(1.4647, grad_fn=<NllLossBackward0>)
tensor(1.4723, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9298/17426 [15:05<12:39, 10.71it/s]

tensor(1.4944, grad_fn=<NllLossBackward0>)
tensor(1.5240, grad_fn=<NllLossBackward0>)
tensor(1.4608, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9302/17426 [15:05<12:32, 10.79it/s]

tensor(1.5345, grad_fn=<NllLossBackward0>)
tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.5253, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9304/17426 [15:06<12:54, 10.48it/s]

tensor(1.5011, grad_fn=<NllLossBackward0>)
tensor(1.5258, grad_fn=<NllLossBackward0>)
tensor(1.5519, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9308/17426 [15:06<12:39, 10.69it/s]

tensor(1.4710, grad_fn=<NllLossBackward0>)
tensor(1.4764, grad_fn=<NllLossBackward0>)
tensor(1.4924, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9310/17426 [15:06<14:07,  9.58it/s]

tensor(1.5289, grad_fn=<NllLossBackward0>)
tensor(1.4782, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9312/17426 [15:07<16:43,  8.08it/s]

tensor(1.5126, grad_fn=<NllLossBackward0>)
tensor(1.5330, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9314/17426 [15:07<17:16,  7.83it/s]

tensor(1.5113, grad_fn=<NllLossBackward0>)
tensor(1.4924, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9316/17426 [15:07<17:56,  7.54it/s]

tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.5413, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9318/17426 [15:07<18:22,  7.36it/s]

tensor(1.5312, grad_fn=<NllLossBackward0>)
tensor(1.5012, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9320/17426 [15:08<17:16,  7.82it/s]

tensor(1.5213, grad_fn=<NllLossBackward0>)
tensor(1.4743, grad_fn=<NllLossBackward0>)


 53%|█████▎    | 9322/17426 [15:08<17:32,  7.70it/s]

tensor(1.5219, grad_fn=<NllLossBackward0>)
tensor(1.4939, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9324/17426 [15:08<18:30,  7.29it/s]

tensor(1.5282, grad_fn=<NllLossBackward0>)
tensor(1.5036, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9326/17426 [15:08<19:38,  6.87it/s]

tensor(1.5227, grad_fn=<NllLossBackward0>)
tensor(1.5546, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9328/17426 [15:09<19:09,  7.05it/s]

tensor(1.4918, grad_fn=<NllLossBackward0>)
tensor(1.5142, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9330/17426 [15:09<19:36,  6.88it/s]

tensor(1.5451, grad_fn=<NllLossBackward0>)
tensor(1.5150, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9332/17426 [15:09<19:50,  6.80it/s]

tensor(1.4959, grad_fn=<NllLossBackward0>)
tensor(1.5134, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9334/17426 [15:10<19:44,  6.83it/s]

tensor(1.5213, grad_fn=<NllLossBackward0>)
tensor(1.4897, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9336/17426 [15:10<20:48,  6.48it/s]

tensor(1.5246, grad_fn=<NllLossBackward0>)
tensor(1.4512, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9338/17426 [15:10<20:23,  6.61it/s]

tensor(1.5069, grad_fn=<NllLossBackward0>)
tensor(1.5108, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9340/17426 [15:11<20:15,  6.65it/s]

tensor(1.4732, grad_fn=<NllLossBackward0>)
tensor(1.5188, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9342/17426 [15:11<18:31,  7.27it/s]

tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.4786, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9344/17426 [15:11<16:37,  8.10it/s]

tensor(1.5252, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9346/17426 [15:11<16:24,  8.20it/s]

tensor(1.5388, grad_fn=<NllLossBackward0>)
tensor(1.4946, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9348/17426 [15:11<15:15,  8.83it/s]

tensor(1.4883, grad_fn=<NllLossBackward0>)
tensor(1.5238, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9350/17426 [15:12<15:35,  8.63it/s]

tensor(1.4992, grad_fn=<NllLossBackward0>)
tensor(1.4987, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9352/17426 [15:12<15:09,  8.87it/s]

tensor(1.5127, grad_fn=<NllLossBackward0>)
tensor(1.5254, grad_fn=<NllLossBackward0>)
tensor(1.4929, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9356/17426 [15:12<13:08, 10.24it/s]

tensor(1.4850, grad_fn=<NllLossBackward0>)
tensor(1.5123, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9358/17426 [15:12<13:06, 10.26it/s]

tensor(1.4976, grad_fn=<NllLossBackward0>)
tensor(1.5035, grad_fn=<NllLossBackward0>)
tensor(1.4769, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9362/17426 [15:13<12:55, 10.40it/s]

tensor(1.5218, grad_fn=<NllLossBackward0>)
tensor(1.5046, grad_fn=<NllLossBackward0>)
tensor(1.4787, grad_fn=<NllLossBackward0>)


 54%|█████▎    | 9364/17426 [15:13<13:03, 10.29it/s]

tensor(1.5352, grad_fn=<NllLossBackward0>)
tensor(1.4804, grad_fn=<NllLossBackward0>)
tensor(1.5165, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9368/17426 [15:13<12:48, 10.49it/s]

tensor(1.4890, grad_fn=<NllLossBackward0>)
tensor(1.5142, grad_fn=<NllLossBackward0>)
tensor(1.5351, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9370/17426 [15:14<12:50, 10.46it/s]

tensor(1.4991, grad_fn=<NllLossBackward0>)
tensor(1.4995, grad_fn=<NllLossBackward0>)
tensor(1.4888, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9374/17426 [15:14<12:54, 10.40it/s]

tensor(1.4948, grad_fn=<NllLossBackward0>)
tensor(1.5429, grad_fn=<NllLossBackward0>)
tensor(1.5007, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9376/17426 [15:14<12:52, 10.42it/s]

tensor(1.5107, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)
tensor(1.4914, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9380/17426 [15:15<12:32, 10.69it/s]

tensor(1.5216, grad_fn=<NllLossBackward0>)
tensor(1.4926, grad_fn=<NllLossBackward0>)
tensor(1.4813, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9382/17426 [15:15<12:55, 10.37it/s]

tensor(1.5292, grad_fn=<NllLossBackward0>)
tensor(1.5327, grad_fn=<NllLossBackward0>)
tensor(1.5286, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9386/17426 [15:15<12:40, 10.58it/s]

tensor(1.5962, grad_fn=<NllLossBackward0>)
tensor(1.5179, grad_fn=<NllLossBackward0>)
tensor(1.5548, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9388/17426 [15:15<12:47, 10.47it/s]

tensor(1.4891, grad_fn=<NllLossBackward0>)
tensor(1.5045, grad_fn=<NllLossBackward0>)
tensor(1.5461, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9392/17426 [15:16<12:26, 10.77it/s]

tensor(1.5091, grad_fn=<NllLossBackward0>)
tensor(1.5086, grad_fn=<NllLossBackward0>)
tensor(1.4840, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9394/17426 [15:16<13:10, 10.16it/s]

tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.4818, grad_fn=<NllLossBackward0>)
tensor(1.4935, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9398/17426 [15:16<12:37, 10.59it/s]

tensor(1.4826, grad_fn=<NllLossBackward0>)
tensor(1.5006, grad_fn=<NllLossBackward0>)
tensor(1.4856, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9400/17426 [15:16<12:45, 10.48it/s]

tensor(1.5148, grad_fn=<NllLossBackward0>)
tensor(1.5185, grad_fn=<NllLossBackward0>)
tensor(1.5400, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9404/17426 [15:17<12:42, 10.52it/s]

tensor(1.4588, grad_fn=<NllLossBackward0>)
tensor(1.5333, grad_fn=<NllLossBackward0>)
tensor(1.5215, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9406/17426 [15:17<12:58, 10.31it/s]

tensor(1.4850, grad_fn=<NllLossBackward0>)
tensor(1.5167, grad_fn=<NllLossBackward0>)
tensor(1.4869, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9410/17426 [15:17<12:34, 10.62it/s]

tensor(1.5166, grad_fn=<NllLossBackward0>)
tensor(1.5398, grad_fn=<NllLossBackward0>)
tensor(1.4774, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9412/17426 [15:18<12:44, 10.48it/s]

tensor(1.4594, grad_fn=<NllLossBackward0>)
tensor(1.4794, grad_fn=<NllLossBackward0>)
tensor(1.5118, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9416/17426 [15:18<12:48, 10.42it/s]

tensor(1.4916, grad_fn=<NllLossBackward0>)
tensor(1.5196, grad_fn=<NllLossBackward0>)
tensor(1.4603, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9418/17426 [15:18<12:52, 10.37it/s]

tensor(1.5276, grad_fn=<NllLossBackward0>)
tensor(1.5019, grad_fn=<NllLossBackward0>)
tensor(1.4535, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9422/17426 [15:19<12:33, 10.62it/s]

tensor(1.5389, grad_fn=<NllLossBackward0>)
tensor(1.5062, grad_fn=<NllLossBackward0>)
tensor(1.4869, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9424/17426 [15:19<12:39, 10.54it/s]

tensor(1.4637, grad_fn=<NllLossBackward0>)
tensor(1.4966, grad_fn=<NllLossBackward0>)
tensor(1.5303, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9428/17426 [15:19<12:44, 10.46it/s]

tensor(1.4957, grad_fn=<NllLossBackward0>)
tensor(1.4388, grad_fn=<NllLossBackward0>)
tensor(1.5139, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9430/17426 [15:19<12:49, 10.39it/s]

tensor(1.5051, grad_fn=<NllLossBackward0>)
tensor(1.5325, grad_fn=<NllLossBackward0>)
tensor(1.5224, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9434/17426 [15:20<12:22, 10.76it/s]

tensor(1.4910, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)
tensor(1.4756, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9436/17426 [15:20<12:35, 10.58it/s]

tensor(1.4914, grad_fn=<NllLossBackward0>)
tensor(1.5169, grad_fn=<NllLossBackward0>)
tensor(1.4838, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9440/17426 [15:20<12:32, 10.61it/s]

tensor(1.5280, grad_fn=<NllLossBackward0>)
tensor(1.5090, grad_fn=<NllLossBackward0>)
tensor(1.4965, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9442/17426 [15:21<13:32,  9.82it/s]

tensor(1.5550, grad_fn=<NllLossBackward0>)
tensor(1.5024, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9444/17426 [15:21<15:12,  8.74it/s]

tensor(1.4978, grad_fn=<NllLossBackward0>)
tensor(1.4732, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9446/17426 [15:21<17:18,  7.68it/s]

tensor(1.5039, grad_fn=<NllLossBackward0>)
tensor(1.5181, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9448/17426 [15:21<17:56,  7.41it/s]

tensor(1.5069, grad_fn=<NllLossBackward0>)
tensor(1.4672, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9450/17426 [15:22<17:44,  7.49it/s]

tensor(1.5366, grad_fn=<NllLossBackward0>)
tensor(1.5121, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9452/17426 [15:22<17:07,  7.76it/s]

tensor(1.5200, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9454/17426 [15:22<17:45,  7.48it/s]

tensor(1.4949, grad_fn=<NllLossBackward0>)
tensor(1.5322, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9456/17426 [15:22<18:45,  7.08it/s]

tensor(1.4687, grad_fn=<NllLossBackward0>)
tensor(1.5300, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9458/17426 [15:23<17:57,  7.40it/s]

tensor(1.4886, grad_fn=<NllLossBackward0>)
tensor(1.4941, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9460/17426 [15:23<16:49,  7.89it/s]

tensor(1.4456, grad_fn=<NllLossBackward0>)
tensor(1.4839, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9462/17426 [15:23<17:11,  7.72it/s]

tensor(1.4883, grad_fn=<NllLossBackward0>)
tensor(1.5092, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9464/17426 [15:24<17:47,  7.46it/s]

tensor(1.5174, grad_fn=<NllLossBackward0>)
tensor(1.5588, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9466/17426 [15:24<18:35,  7.14it/s]

tensor(1.4842, grad_fn=<NllLossBackward0>)
tensor(1.4995, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9468/17426 [15:24<18:37,  7.12it/s]

tensor(1.4997, grad_fn=<NllLossBackward0>)
tensor(1.4730, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9470/17426 [15:24<19:41,  6.73it/s]

tensor(1.5451, grad_fn=<NllLossBackward0>)
tensor(1.5193, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9472/17426 [15:25<19:59,  6.63it/s]

tensor(1.4677, grad_fn=<NllLossBackward0>)
tensor(1.4902, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9474/17426 [15:25<20:28,  6.48it/s]

tensor(1.4791, grad_fn=<NllLossBackward0>)
tensor(1.4834, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9476/17426 [15:25<18:20,  7.23it/s]

tensor(1.5340, grad_fn=<NllLossBackward0>)
tensor(1.5315, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9479/17426 [15:26<14:59,  8.84it/s]

tensor(1.5026, grad_fn=<NllLossBackward0>)
tensor(1.4858, grad_fn=<NllLossBackward0>)
tensor(1.5051, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9481/17426 [15:26<14:41,  9.02it/s]

tensor(1.5099, grad_fn=<NllLossBackward0>)
tensor(1.5028, grad_fn=<NllLossBackward0>)
tensor(1.5596, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9485/17426 [15:26<12:53, 10.26it/s]

tensor(1.4936, grad_fn=<NllLossBackward0>)
tensor(1.4734, grad_fn=<NllLossBackward0>)
tensor(1.5487, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9487/17426 [15:26<12:55, 10.23it/s]

tensor(1.5402, grad_fn=<NllLossBackward0>)
tensor(1.4666, grad_fn=<NllLossBackward0>)
tensor(1.4935, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9491/17426 [15:27<12:36, 10.49it/s]

tensor(1.4962, grad_fn=<NllLossBackward0>)
tensor(1.5455, grad_fn=<NllLossBackward0>)
tensor(1.5125, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9493/17426 [15:27<12:35, 10.51it/s]

tensor(1.5219, grad_fn=<NllLossBackward0>)
tensor(1.4972, grad_fn=<NllLossBackward0>)
tensor(1.5422, grad_fn=<NllLossBackward0>)


 54%|█████▍    | 9497/17426 [15:27<12:25, 10.64it/s]

tensor(1.4856, grad_fn=<NllLossBackward0>)
tensor(1.5272, grad_fn=<NllLossBackward0>)
tensor(1.5052, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9499/17426 [15:28<12:49, 10.29it/s]

tensor(1.5245, grad_fn=<NllLossBackward0>)
tensor(1.5101, grad_fn=<NllLossBackward0>)
tensor(1.5134, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9503/17426 [15:28<12:30, 10.56it/s]

tensor(1.4824, grad_fn=<NllLossBackward0>)
tensor(1.4772, grad_fn=<NllLossBackward0>)
tensor(1.5277, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9505/17426 [15:28<12:46, 10.33it/s]

tensor(1.5116, grad_fn=<NllLossBackward0>)
tensor(1.5461, grad_fn=<NllLossBackward0>)
tensor(1.4484, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9509/17426 [15:28<12:40, 10.41it/s]

tensor(1.4892, grad_fn=<NllLossBackward0>)
tensor(1.4749, grad_fn=<NllLossBackward0>)
tensor(1.5283, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9511/17426 [15:29<12:49, 10.29it/s]

tensor(1.5281, grad_fn=<NllLossBackward0>)
tensor(1.5314, grad_fn=<NllLossBackward0>)
tensor(1.4757, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9515/17426 [15:29<12:21, 10.67it/s]

tensor(1.5204, grad_fn=<NllLossBackward0>)
tensor(1.4771, grad_fn=<NllLossBackward0>)
tensor(1.5013, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9517/17426 [15:29<12:31, 10.53it/s]

tensor(1.4965, grad_fn=<NllLossBackward0>)
tensor(1.5051, grad_fn=<NllLossBackward0>)
tensor(1.4910, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9521/17426 [15:30<12:37, 10.44it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.4800, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9523/17426 [15:30<12:50, 10.25it/s]

tensor(1.5321, grad_fn=<NllLossBackward0>)
tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.5099, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9527/17426 [15:30<12:23, 10.62it/s]

tensor(1.5245, grad_fn=<NllLossBackward0>)
tensor(1.5581, grad_fn=<NllLossBackward0>)
tensor(1.4686, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9529/17426 [15:30<12:39, 10.40it/s]

tensor(1.5509, grad_fn=<NllLossBackward0>)
tensor(1.5211, grad_fn=<NllLossBackward0>)
tensor(1.4956, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9533/17426 [15:31<12:27, 10.56it/s]

tensor(1.5334, grad_fn=<NllLossBackward0>)
tensor(1.5236, grad_fn=<NllLossBackward0>)
tensor(1.4470, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9535/17426 [15:31<12:34, 10.47it/s]

tensor(1.4861, grad_fn=<NllLossBackward0>)
tensor(1.4665, grad_fn=<NllLossBackward0>)
tensor(1.5031, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9539/17426 [15:31<12:28, 10.53it/s]

tensor(1.5485, grad_fn=<NllLossBackward0>)
tensor(1.4680, grad_fn=<NllLossBackward0>)
tensor(1.4913, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9541/17426 [15:32<12:38, 10.39it/s]

tensor(1.4857, grad_fn=<NllLossBackward0>)
tensor(1.4860, grad_fn=<NllLossBackward0>)
tensor(1.4787, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9545/17426 [15:32<12:36, 10.41it/s]

tensor(1.5489, grad_fn=<NllLossBackward0>)
tensor(1.4999, grad_fn=<NllLossBackward0>)
tensor(1.5540, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9547/17426 [15:32<12:47, 10.27it/s]

tensor(1.5293, grad_fn=<NllLossBackward0>)
tensor(1.4412, grad_fn=<NllLossBackward0>)
tensor(1.4867, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9551/17426 [15:33<12:27, 10.54it/s]

tensor(1.5444, grad_fn=<NllLossBackward0>)
tensor(1.5043, grad_fn=<NllLossBackward0>)
tensor(1.4543, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9553/17426 [15:33<13:02, 10.06it/s]

tensor(1.4909, grad_fn=<NllLossBackward0>)
tensor(1.4935, grad_fn=<NllLossBackward0>)
tensor(1.5215, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9557/17426 [15:33<12:31, 10.47it/s]

tensor(1.5173, grad_fn=<NllLossBackward0>)
tensor(1.5350, grad_fn=<NllLossBackward0>)
tensor(1.5174, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9559/17426 [15:33<12:42, 10.31it/s]

tensor(1.4947, grad_fn=<NllLossBackward0>)
tensor(1.5442, grad_fn=<NllLossBackward0>)
tensor(1.5288, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9563/17426 [15:34<12:28, 10.51it/s]

tensor(1.5060, grad_fn=<NllLossBackward0>)
tensor(1.4850, grad_fn=<NllLossBackward0>)
tensor(1.4612, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9565/17426 [15:34<12:35, 10.41it/s]

tensor(1.4450, grad_fn=<NllLossBackward0>)
tensor(1.4824, grad_fn=<NllLossBackward0>)
tensor(1.4969, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9569/17426 [15:34<12:22, 10.59it/s]

tensor(1.5400, grad_fn=<NllLossBackward0>)
tensor(1.5370, grad_fn=<NllLossBackward0>)
tensor(1.5082, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9571/17426 [15:34<12:31, 10.45it/s]

tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.4806, grad_fn=<NllLossBackward0>)
tensor(1.4509, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9575/17426 [15:35<12:31, 10.45it/s]

tensor(1.5175, grad_fn=<NllLossBackward0>)
tensor(1.4792, grad_fn=<NllLossBackward0>)
tensor(1.5298, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9577/17426 [15:35<13:43,  9.54it/s]

tensor(1.5442, grad_fn=<NllLossBackward0>)
tensor(1.5078, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9579/17426 [15:35<15:33,  8.40it/s]

tensor(1.4902, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9581/17426 [15:36<17:13,  7.59it/s]

tensor(1.5333, grad_fn=<NllLossBackward0>)
tensor(1.5012, grad_fn=<NllLossBackward0>)


 55%|█████▍    | 9583/17426 [15:36<17:18,  7.55it/s]

tensor(1.4884, grad_fn=<NllLossBackward0>)
tensor(1.4774, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9585/17426 [15:36<17:06,  7.64it/s]

tensor(1.4774, grad_fn=<NllLossBackward0>)
tensor(1.4781, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9587/17426 [15:37<18:07,  7.21it/s]

tensor(1.4988, grad_fn=<NllLossBackward0>)
tensor(1.4764, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9589/17426 [15:37<18:04,  7.23it/s]

tensor(1.5094, grad_fn=<NllLossBackward0>)
tensor(1.5108, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9591/17426 [15:37<18:04,  7.23it/s]

tensor(1.5086, grad_fn=<NllLossBackward0>)
tensor(1.4629, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9593/17426 [15:37<19:32,  6.68it/s]

tensor(1.4716, grad_fn=<NllLossBackward0>)
tensor(1.4991, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9595/17426 [15:38<19:12,  6.79it/s]

tensor(1.5199, grad_fn=<NllLossBackward0>)
tensor(1.4835, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9597/17426 [15:38<19:01,  6.86it/s]

tensor(1.4761, grad_fn=<NllLossBackward0>)
tensor(1.4884, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9599/17426 [15:38<18:33,  7.03it/s]

tensor(1.5242, grad_fn=<NllLossBackward0>)
tensor(1.5054, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9601/17426 [15:39<19:13,  6.78it/s]

tensor(1.5600, grad_fn=<NllLossBackward0>)
tensor(1.4680, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9603/17426 [15:39<19:47,  6.59it/s]

tensor(1.5186, grad_fn=<NllLossBackward0>)
tensor(1.5004, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9605/17426 [15:39<19:41,  6.62it/s]

tensor(1.4871, grad_fn=<NllLossBackward0>)
tensor(1.5772, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9607/17426 [15:39<17:53,  7.29it/s]

tensor(1.4791, grad_fn=<NllLossBackward0>)
tensor(1.5164, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9609/17426 [15:40<16:09,  8.06it/s]

tensor(1.5098, grad_fn=<NllLossBackward0>)
tensor(1.4445, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9611/17426 [15:40<15:20,  8.49it/s]

tensor(1.4806, grad_fn=<NllLossBackward0>)
tensor(1.4786, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9613/17426 [15:40<15:16,  8.53it/s]

tensor(1.5072, grad_fn=<NllLossBackward0>)
tensor(1.5305, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9615/17426 [15:40<14:37,  8.90it/s]

tensor(1.5070, grad_fn=<NllLossBackward0>)
tensor(1.4597, grad_fn=<NllLossBackward0>)
tensor(1.5247, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9618/17426 [15:41<13:42,  9.49it/s]

tensor(1.4930, grad_fn=<NllLossBackward0>)
tensor(1.5570, grad_fn=<NllLossBackward0>)
tensor(1.5141, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9621/17426 [15:41<13:13,  9.83it/s]

tensor(1.5063, grad_fn=<NllLossBackward0>)
tensor(1.4659, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9623/17426 [15:41<14:49,  8.77it/s]

tensor(1.5100, grad_fn=<NllLossBackward0>)
tensor(1.5033, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9625/17426 [15:41<14:37,  8.89it/s]

tensor(1.5054, grad_fn=<NllLossBackward0>)
tensor(1.4483, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9627/17426 [15:42<14:42,  8.83it/s]

tensor(1.5371, grad_fn=<NllLossBackward0>)
tensor(1.5535, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9629/17426 [15:42<14:03,  9.24it/s]

tensor(1.5272, grad_fn=<NllLossBackward0>)
tensor(1.4577, grad_fn=<NllLossBackward0>)
tensor(1.4648, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9633/17426 [15:42<13:00,  9.98it/s]

tensor(1.5300, grad_fn=<NllLossBackward0>)
tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.5142, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9636/17426 [15:43<12:45, 10.18it/s]

tensor(1.4759, grad_fn=<NllLossBackward0>)
tensor(1.5116, grad_fn=<NllLossBackward0>)
tensor(1.5139, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9639/17426 [15:43<12:36, 10.29it/s]

tensor(1.4966, grad_fn=<NllLossBackward0>)
tensor(1.5029, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9641/17426 [15:43<12:31, 10.36it/s]

tensor(1.4684, grad_fn=<NllLossBackward0>)
tensor(1.4497, grad_fn=<NllLossBackward0>)
tensor(1.4880, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9645/17426 [15:43<12:27, 10.40it/s]

tensor(1.5045, grad_fn=<NllLossBackward0>)
tensor(1.4967, grad_fn=<NllLossBackward0>)
tensor(1.4931, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9647/17426 [15:44<12:34, 10.31it/s]

tensor(1.5274, grad_fn=<NllLossBackward0>)
tensor(1.5411, grad_fn=<NllLossBackward0>)
tensor(1.4940, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9651/17426 [15:44<12:12, 10.61it/s]

tensor(1.4844, grad_fn=<NllLossBackward0>)
tensor(1.5280, grad_fn=<NllLossBackward0>)
tensor(1.4707, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9653/17426 [15:44<12:18, 10.53it/s]

tensor(1.5003, grad_fn=<NllLossBackward0>)
tensor(1.5490, grad_fn=<NllLossBackward0>)
tensor(1.4853, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9657/17426 [15:45<12:09, 10.65it/s]

tensor(1.5618, grad_fn=<NllLossBackward0>)
tensor(1.5019, grad_fn=<NllLossBackward0>)
tensor(1.5192, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9659/17426 [15:45<12:17, 10.53it/s]

tensor(1.5185, grad_fn=<NllLossBackward0>)
tensor(1.5049, grad_fn=<NllLossBackward0>)
tensor(1.5114, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9663/17426 [15:45<12:11, 10.61it/s]

tensor(1.4963, grad_fn=<NllLossBackward0>)
tensor(1.5219, grad_fn=<NllLossBackward0>)
tensor(1.4733, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9665/17426 [15:45<12:35, 10.27it/s]

tensor(1.4880, grad_fn=<NllLossBackward0>)
tensor(1.4978, grad_fn=<NllLossBackward0>)
tensor(1.5261, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9669/17426 [15:46<12:14, 10.56it/s]

tensor(1.4720, grad_fn=<NllLossBackward0>)
tensor(1.5211, grad_fn=<NllLossBackward0>)
tensor(1.5003, grad_fn=<NllLossBackward0>)


 55%|█████▌    | 9671/17426 [15:46<12:21, 10.46it/s]

tensor(1.4783, grad_fn=<NllLossBackward0>)
tensor(1.5268, grad_fn=<NllLossBackward0>)
tensor(1.5183, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9675/17426 [15:46<12:15, 10.54it/s]

tensor(1.4892, grad_fn=<NllLossBackward0>)
tensor(1.5185, grad_fn=<NllLossBackward0>)
tensor(1.4789, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9677/17426 [15:46<12:39, 10.20it/s]

tensor(1.5822, grad_fn=<NllLossBackward0>)
tensor(1.5153, grad_fn=<NllLossBackward0>)
tensor(1.4780, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9681/17426 [15:47<12:24, 10.41it/s]

tensor(1.4614, grad_fn=<NllLossBackward0>)
tensor(1.4264, grad_fn=<NllLossBackward0>)
tensor(1.5166, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9683/17426 [15:47<12:28, 10.35it/s]

tensor(1.5379, grad_fn=<NllLossBackward0>)
tensor(1.5724, grad_fn=<NllLossBackward0>)
tensor(1.4637, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9687/17426 [15:47<12:16, 10.51it/s]

tensor(1.5071, grad_fn=<NllLossBackward0>)
tensor(1.4982, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9689/17426 [15:48<12:22, 10.41it/s]

tensor(1.5767, grad_fn=<NllLossBackward0>)
tensor(1.4651, grad_fn=<NllLossBackward0>)
tensor(1.4701, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9693/17426 [15:48<12:03, 10.69it/s]

tensor(1.5217, grad_fn=<NllLossBackward0>)
tensor(1.4953, grad_fn=<NllLossBackward0>)
tensor(1.5031, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9695/17426 [15:48<12:19, 10.45it/s]

tensor(1.5387, grad_fn=<NllLossBackward0>)
tensor(1.5095, grad_fn=<NllLossBackward0>)
tensor(1.4911, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9699/17426 [15:49<12:13, 10.53it/s]

tensor(1.5125, grad_fn=<NllLossBackward0>)
tensor(1.4903, grad_fn=<NllLossBackward0>)
tensor(1.5311, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9701/17426 [15:49<12:17, 10.47it/s]

tensor(1.5008, grad_fn=<NllLossBackward0>)
tensor(1.4803, grad_fn=<NllLossBackward0>)
tensor(1.4960, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9705/17426 [15:49<12:14, 10.51it/s]

tensor(1.4976, grad_fn=<NllLossBackward0>)
tensor(1.5049, grad_fn=<NllLossBackward0>)
tensor(1.5352, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9707/17426 [15:49<13:32,  9.50it/s]

tensor(1.5449, grad_fn=<NllLossBackward0>)
tensor(1.5135, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9709/17426 [15:50<14:39,  8.77it/s]

tensor(1.5439, grad_fn=<NllLossBackward0>)
tensor(1.5342, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9711/17426 [15:50<16:09,  7.96it/s]

tensor(1.5116, grad_fn=<NllLossBackward0>)
tensor(1.5355, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9713/17426 [15:50<16:30,  7.79it/s]

tensor(1.5002, grad_fn=<NllLossBackward0>)
tensor(1.4785, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9715/17426 [15:51<17:20,  7.41it/s]

tensor(1.4668, grad_fn=<NllLossBackward0>)
tensor(1.5254, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9717/17426 [15:51<17:20,  7.41it/s]

tensor(1.5403, grad_fn=<NllLossBackward0>)
tensor(1.5038, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9719/17426 [15:51<17:21,  7.40it/s]

tensor(1.5010, grad_fn=<NllLossBackward0>)
tensor(1.4628, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9721/17426 [15:51<18:18,  7.02it/s]

tensor(1.4840, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9723/17426 [15:52<17:09,  7.48it/s]

tensor(1.5132, grad_fn=<NllLossBackward0>)
tensor(1.4819, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9725/17426 [15:52<16:15,  7.90it/s]

tensor(1.4995, grad_fn=<NllLossBackward0>)
tensor(1.5069, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9727/17426 [15:52<17:35,  7.29it/s]

tensor(1.4957, grad_fn=<NllLossBackward0>)
tensor(1.5271, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9729/17426 [15:52<17:58,  7.14it/s]

tensor(1.4554, grad_fn=<NllLossBackward0>)
tensor(1.5066, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9731/17426 [15:53<17:08,  7.48it/s]

tensor(1.4832, grad_fn=<NllLossBackward0>)
tensor(1.5237, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9733/17426 [15:53<18:40,  6.87it/s]

tensor(1.4841, grad_fn=<NllLossBackward0>)
tensor(1.5073, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9735/17426 [15:53<18:33,  6.91it/s]

tensor(1.5012, grad_fn=<NllLossBackward0>)
tensor(1.5023, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9737/17426 [15:54<19:37,  6.53it/s]

tensor(1.4414, grad_fn=<NllLossBackward0>)
tensor(1.4993, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9739/17426 [15:54<18:18,  7.00it/s]

tensor(1.4787, grad_fn=<NllLossBackward0>)
tensor(1.4680, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9741/17426 [15:54<16:54,  7.57it/s]

tensor(1.4902, grad_fn=<NllLossBackward0>)
tensor(1.4838, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9743/17426 [15:54<15:53,  8.06it/s]

tensor(1.4746, grad_fn=<NllLossBackward0>)
tensor(1.4597, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9745/17426 [15:55<14:48,  8.64it/s]

tensor(1.4822, grad_fn=<NllLossBackward0>)
tensor(1.5103, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9747/17426 [15:55<14:28,  8.84it/s]

tensor(1.5128, grad_fn=<NllLossBackward0>)
tensor(1.5077, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9750/17426 [15:55<13:13,  9.68it/s]

tensor(1.5089, grad_fn=<NllLossBackward0>)
tensor(1.5420, grad_fn=<NllLossBackward0>)
tensor(1.5285, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9752/17426 [15:55<13:32,  9.44it/s]

tensor(1.4885, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)
tensor(1.5231, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9756/17426 [15:56<12:35, 10.16it/s]

tensor(1.5983, grad_fn=<NllLossBackward0>)
tensor(1.5063, grad_fn=<NllLossBackward0>)
tensor(1.5123, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9758/17426 [15:56<13:51,  9.22it/s]

tensor(1.5185, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9760/17426 [15:56<13:55,  9.18it/s]

tensor(1.5103, grad_fn=<NllLossBackward0>)
tensor(1.4718, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9762/17426 [15:56<13:10,  9.70it/s]

tensor(1.5393, grad_fn=<NllLossBackward0>)
tensor(1.4782, grad_fn=<NllLossBackward0>)
tensor(1.5115, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9766/17426 [15:57<12:15, 10.42it/s]

tensor(1.5012, grad_fn=<NllLossBackward0>)
tensor(1.5087, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9768/17426 [15:57<12:41, 10.05it/s]

tensor(1.5503, grad_fn=<NllLossBackward0>)
tensor(1.5177, grad_fn=<NllLossBackward0>)
tensor(1.5256, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9772/17426 [15:57<12:16, 10.40it/s]

tensor(1.4781, grad_fn=<NllLossBackward0>)
tensor(1.5009, grad_fn=<NllLossBackward0>)
tensor(1.5101, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9774/17426 [15:58<12:14, 10.42it/s]

tensor(1.4627, grad_fn=<NllLossBackward0>)
tensor(1.4662, grad_fn=<NllLossBackward0>)
tensor(1.5267, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9778/17426 [15:58<11:58, 10.65it/s]

tensor(1.5110, grad_fn=<NllLossBackward0>)
tensor(1.5450, grad_fn=<NllLossBackward0>)
tensor(1.5014, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9780/17426 [15:58<12:21, 10.31it/s]

tensor(1.4917, grad_fn=<NllLossBackward0>)
tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.5226, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9784/17426 [15:58<11:57, 10.65it/s]

tensor(1.4887, grad_fn=<NllLossBackward0>)
tensor(1.4720, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9786/17426 [15:59<12:11, 10.44it/s]

tensor(1.5493, grad_fn=<NllLossBackward0>)
tensor(1.5182, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9790/17426 [15:59<12:12, 10.42it/s]

tensor(1.5409, grad_fn=<NllLossBackward0>)
tensor(1.4801, grad_fn=<NllLossBackward0>)
tensor(1.5242, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9792/17426 [15:59<12:17, 10.35it/s]

tensor(1.4971, grad_fn=<NllLossBackward0>)
tensor(1.5069, grad_fn=<NllLossBackward0>)
tensor(1.4651, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9796/17426 [16:00<12:00, 10.59it/s]

tensor(1.4941, grad_fn=<NllLossBackward0>)
tensor(1.4975, grad_fn=<NllLossBackward0>)
tensor(1.4912, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9798/17426 [16:00<12:12, 10.41it/s]

tensor(1.4694, grad_fn=<NllLossBackward0>)
tensor(1.4689, grad_fn=<NllLossBackward0>)
tensor(1.4850, grad_fn=<NllLossBackward0>)


 56%|█████▌    | 9802/17426 [16:00<12:14, 10.38it/s]

tensor(1.5133, grad_fn=<NllLossBackward0>)
tensor(1.5006, grad_fn=<NllLossBackward0>)
tensor(1.4656, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9804/17426 [16:00<12:28, 10.19it/s]

tensor(1.4955, grad_fn=<NllLossBackward0>)
tensor(1.5416, grad_fn=<NllLossBackward0>)
tensor(1.4665, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9808/17426 [16:01<12:00, 10.57it/s]

tensor(1.5123, grad_fn=<NllLossBackward0>)
tensor(1.4775, grad_fn=<NllLossBackward0>)
tensor(1.5216, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9810/17426 [16:01<12:10, 10.43it/s]

tensor(1.5486, grad_fn=<NllLossBackward0>)
tensor(1.5173, grad_fn=<NllLossBackward0>)
tensor(1.5234, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9814/17426 [16:01<12:06, 10.48it/s]

tensor(1.5640, grad_fn=<NllLossBackward0>)
tensor(1.4715, grad_fn=<NllLossBackward0>)
tensor(1.5197, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9816/17426 [16:02<12:18, 10.30it/s]

tensor(1.4600, grad_fn=<NllLossBackward0>)
tensor(1.5260, grad_fn=<NllLossBackward0>)
tensor(1.5090, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9820/17426 [16:02<11:58, 10.58it/s]

tensor(1.4871, grad_fn=<NllLossBackward0>)
tensor(1.5046, grad_fn=<NllLossBackward0>)
tensor(1.4855, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9822/17426 [16:02<12:32, 10.11it/s]

tensor(1.5187, grad_fn=<NllLossBackward0>)
tensor(1.4793, grad_fn=<NllLossBackward0>)
tensor(1.4915, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9826/17426 [16:03<11:55, 10.62it/s]

tensor(1.5165, grad_fn=<NllLossBackward0>)
tensor(1.4901, grad_fn=<NllLossBackward0>)
tensor(1.5577, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9828/17426 [16:03<12:04, 10.49it/s]

tensor(1.5011, grad_fn=<NllLossBackward0>)
tensor(1.5375, grad_fn=<NllLossBackward0>)
tensor(1.5066, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9832/17426 [16:03<11:50, 10.69it/s]

tensor(1.5014, grad_fn=<NllLossBackward0>)
tensor(1.5809, grad_fn=<NllLossBackward0>)
tensor(1.5054, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9834/17426 [16:03<12:21, 10.24it/s]

tensor(1.4568, grad_fn=<NllLossBackward0>)
tensor(1.5038, grad_fn=<NllLossBackward0>)
tensor(1.5257, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9838/17426 [16:04<11:53, 10.64it/s]

tensor(1.4605, grad_fn=<NllLossBackward0>)
tensor(1.5017, grad_fn=<NllLossBackward0>)
tensor(1.4886, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9840/17426 [16:04<13:53,  9.10it/s]

tensor(1.5017, grad_fn=<NllLossBackward0>)
tensor(1.5159, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9842/17426 [16:04<15:45,  8.02it/s]

tensor(1.4965, grad_fn=<NllLossBackward0>)
tensor(1.4773, grad_fn=<NllLossBackward0>)


 56%|█████▋    | 9844/17426 [16:05<17:07,  7.38it/s]

tensor(1.4857, grad_fn=<NllLossBackward0>)
tensor(1.4738, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9846/17426 [16:05<16:40,  7.58it/s]

tensor(1.5572, grad_fn=<NllLossBackward0>)
tensor(1.5371, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9848/17426 [16:05<16:52,  7.48it/s]

tensor(1.4855, grad_fn=<NllLossBackward0>)
tensor(1.5100, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9850/17426 [16:05<17:57,  7.03it/s]

tensor(1.4989, grad_fn=<NllLossBackward0>)
tensor(1.4848, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9852/17426 [16:06<18:10,  6.95it/s]

tensor(1.4961, grad_fn=<NllLossBackward0>)
tensor(1.4507, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9854/17426 [16:06<18:07,  6.96it/s]

tensor(1.5393, grad_fn=<NllLossBackward0>)
tensor(1.4957, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9856/17426 [16:06<18:13,  6.93it/s]

tensor(1.4988, grad_fn=<NllLossBackward0>)
tensor(1.4756, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9858/17426 [16:07<18:03,  6.98it/s]

tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.5001, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9860/17426 [16:07<18:00,  7.00it/s]

tensor(1.5600, grad_fn=<NllLossBackward0>)
tensor(1.5267, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9862/17426 [16:07<18:43,  6.73it/s]

tensor(1.4405, grad_fn=<NllLossBackward0>)
tensor(1.5188, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9864/17426 [16:07<18:50,  6.69it/s]

tensor(1.5108, grad_fn=<NllLossBackward0>)
tensor(1.4894, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9866/17426 [16:08<18:13,  6.91it/s]

tensor(1.4627, grad_fn=<NllLossBackward0>)
tensor(1.5040, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9868/17426 [16:08<17:54,  7.03it/s]

tensor(1.4904, grad_fn=<NllLossBackward0>)
tensor(1.5094, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9870/17426 [16:08<15:48,  7.97it/s]

tensor(1.5070, grad_fn=<NllLossBackward0>)
tensor(1.5242, grad_fn=<NllLossBackward0>)
tensor(1.5145, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9873/17426 [16:09<14:08,  8.90it/s]

tensor(1.4932, grad_fn=<NllLossBackward0>)
tensor(1.4991, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9875/17426 [16:09<14:06,  8.92it/s]

tensor(1.5203, grad_fn=<NllLossBackward0>)
tensor(1.4882, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9877/17426 [16:09<14:09,  8.89it/s]

tensor(1.4797, grad_fn=<NllLossBackward0>)
tensor(1.4952, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9879/17426 [16:09<14:04,  8.93it/s]

tensor(1.5217, grad_fn=<NllLossBackward0>)
tensor(1.4991, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9881/17426 [16:09<14:09,  8.88it/s]

tensor(1.4638, grad_fn=<NllLossBackward0>)
tensor(1.4889, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9883/17426 [16:10<14:29,  8.67it/s]

tensor(1.5354, grad_fn=<NllLossBackward0>)
tensor(1.4807, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9885/17426 [16:10<14:34,  8.62it/s]

tensor(1.4860, grad_fn=<NllLossBackward0>)
tensor(1.5040, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9887/17426 [16:10<14:17,  8.79it/s]

tensor(1.5393, grad_fn=<NllLossBackward0>)
tensor(1.5008, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9889/17426 [16:10<14:24,  8.72it/s]

tensor(1.4898, grad_fn=<NllLossBackward0>)
tensor(1.4835, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9891/17426 [16:11<14:33,  8.63it/s]

tensor(1.4926, grad_fn=<NllLossBackward0>)
tensor(1.5256, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9893/17426 [16:11<13:55,  9.01it/s]

tensor(1.5362, grad_fn=<NllLossBackward0>)
tensor(1.4867, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9895/17426 [16:11<14:07,  8.89it/s]

tensor(1.5291, grad_fn=<NllLossBackward0>)
tensor(1.4714, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9897/17426 [16:11<14:46,  8.49it/s]

tensor(1.5459, grad_fn=<NllLossBackward0>)
tensor(1.5613, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9899/17426 [16:12<14:24,  8.71it/s]

tensor(1.5108, grad_fn=<NllLossBackward0>)
tensor(1.4464, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9901/17426 [16:12<14:42,  8.53it/s]

tensor(1.5268, grad_fn=<NllLossBackward0>)
tensor(1.4814, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9903/17426 [16:12<14:01,  8.94it/s]

tensor(1.4984, grad_fn=<NllLossBackward0>)
tensor(1.4837, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9905/17426 [16:12<14:01,  8.93it/s]

tensor(1.5566, grad_fn=<NllLossBackward0>)
tensor(1.5242, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9907/17426 [16:12<13:44,  9.12it/s]

tensor(1.5524, grad_fn=<NllLossBackward0>)
tensor(1.4453, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9909/17426 [16:13<13:43,  9.13it/s]

tensor(1.4424, grad_fn=<NllLossBackward0>)
tensor(1.4980, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9911/17426 [16:13<14:01,  8.93it/s]

tensor(1.4336, grad_fn=<NllLossBackward0>)
tensor(1.5242, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9913/17426 [16:13<13:58,  8.96it/s]

tensor(1.5423, grad_fn=<NllLossBackward0>)
tensor(1.4880, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9915/17426 [16:13<13:43,  9.12it/s]

tensor(1.4839, grad_fn=<NllLossBackward0>)
tensor(1.4577, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9917/17426 [16:14<13:50,  9.04it/s]

tensor(1.4837, grad_fn=<NllLossBackward0>)
tensor(1.5428, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9919/17426 [16:14<14:13,  8.80it/s]

tensor(1.5057, grad_fn=<NllLossBackward0>)
tensor(1.4886, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9921/17426 [16:14<13:57,  8.96it/s]

tensor(1.5151, grad_fn=<NllLossBackward0>)
tensor(1.4933, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9923/17426 [16:14<13:55,  8.99it/s]

tensor(1.5335, grad_fn=<NllLossBackward0>)
tensor(1.4634, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9925/17426 [16:14<13:39,  9.15it/s]

tensor(1.4950, grad_fn=<NllLossBackward0>)
tensor(1.5587, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9927/17426 [16:15<13:43,  9.11it/s]

tensor(1.4910, grad_fn=<NllLossBackward0>)
tensor(1.4710, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9929/17426 [16:15<13:27,  9.28it/s]

tensor(1.4971, grad_fn=<NllLossBackward0>)
tensor(1.4926, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9931/17426 [16:15<13:31,  9.23it/s]

tensor(1.4668, grad_fn=<NllLossBackward0>)
tensor(1.5335, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9933/17426 [16:15<13:44,  9.09it/s]

tensor(1.5095, grad_fn=<NllLossBackward0>)
tensor(1.4990, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9935/17426 [16:16<13:40,  9.13it/s]

tensor(1.5252, grad_fn=<NllLossBackward0>)
tensor(1.4271, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9937/17426 [16:16<13:01,  9.59it/s]

tensor(1.4765, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)
tensor(1.5022, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9940/17426 [16:16<13:04,  9.54it/s]

tensor(1.4933, grad_fn=<NllLossBackward0>)
tensor(1.5097, grad_fn=<NllLossBackward0>)
tensor(1.4969, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9943/17426 [16:16<12:46,  9.76it/s]

tensor(1.5207, grad_fn=<NllLossBackward0>)
tensor(1.4844, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9945/17426 [16:17<13:09,  9.48it/s]

tensor(1.4885, grad_fn=<NllLossBackward0>)
tensor(1.4534, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9947/17426 [16:17<13:35,  9.18it/s]

tensor(1.4671, grad_fn=<NllLossBackward0>)
tensor(1.4745, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9949/17426 [16:17<13:15,  9.40it/s]

tensor(1.4832, grad_fn=<NllLossBackward0>)
tensor(1.5016, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9951/17426 [16:17<12:53,  9.66it/s]

tensor(1.5343, grad_fn=<NllLossBackward0>)
tensor(1.4968, grad_fn=<NllLossBackward0>)
tensor(1.4964, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9955/17426 [16:18<11:59, 10.38it/s]

tensor(1.4501, grad_fn=<NllLossBackward0>)
tensor(1.5095, grad_fn=<NllLossBackward0>)
tensor(1.4973, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9957/17426 [16:18<12:05, 10.30it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.4251, grad_fn=<NllLossBackward0>)
tensor(1.5181, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9960/17426 [16:18<13:20,  9.33it/s]

tensor(1.5156, grad_fn=<NllLossBackward0>)
tensor(1.5188, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9962/17426 [16:18<14:47,  8.41it/s]

tensor(1.5495, grad_fn=<NllLossBackward0>)
tensor(1.5121, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9964/17426 [16:19<16:12,  7.67it/s]

tensor(1.4995, grad_fn=<NllLossBackward0>)
tensor(1.4790, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9966/17426 [16:19<16:05,  7.73it/s]

tensor(1.5507, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9968/17426 [16:19<15:57,  7.79it/s]

tensor(1.5121, grad_fn=<NllLossBackward0>)
tensor(1.5508, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9970/17426 [16:19<15:36,  7.96it/s]

tensor(1.4495, grad_fn=<NllLossBackward0>)
tensor(1.4611, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9972/17426 [16:20<16:08,  7.70it/s]

tensor(1.5098, grad_fn=<NllLossBackward0>)
tensor(1.5214, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9974/17426 [16:20<17:00,  7.30it/s]

tensor(1.5476, grad_fn=<NllLossBackward0>)
tensor(1.5723, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9976/17426 [16:20<17:20,  7.16it/s]

tensor(1.4401, grad_fn=<NllLossBackward0>)
tensor(1.4579, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9978/17426 [16:21<16:42,  7.43it/s]

tensor(1.5047, grad_fn=<NllLossBackward0>)
tensor(1.5252, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9980/17426 [16:21<17:23,  7.14it/s]

tensor(1.5415, grad_fn=<NllLossBackward0>)
tensor(1.5123, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9982/17426 [16:21<17:55,  6.92it/s]

tensor(1.5111, grad_fn=<NllLossBackward0>)
tensor(1.4954, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9984/17426 [16:21<18:01,  6.88it/s]

tensor(1.4861, grad_fn=<NllLossBackward0>)
tensor(1.5266, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9986/17426 [16:22<18:56,  6.55it/s]

tensor(1.4268, grad_fn=<NllLossBackward0>)
tensor(1.4665, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9988/17426 [16:22<18:42,  6.63it/s]

tensor(1.5757, grad_fn=<NllLossBackward0>)
tensor(1.4986, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9990/17426 [16:22<19:15,  6.43it/s]

tensor(1.5206, grad_fn=<NllLossBackward0>)
tensor(1.4814, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9992/17426 [16:23<17:17,  7.16it/s]

tensor(1.5008, grad_fn=<NllLossBackward0>)
tensor(1.5282, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9994/17426 [16:23<16:29,  7.51it/s]

tensor(1.5081, grad_fn=<NllLossBackward0>)
tensor(1.5404, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9996/17426 [16:23<15:46,  7.85it/s]

tensor(1.5050, grad_fn=<NllLossBackward0>)
tensor(1.4819, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 9998/17426 [16:23<15:14,  8.12it/s]

tensor(1.4686, grad_fn=<NllLossBackward0>)
tensor(1.5142, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 10000/17426 [16:24<15:12,  8.14it/s]

tensor(1.5232, grad_fn=<NllLossBackward0>)
tensor(1.4723, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 10002/17426 [16:24<14:43,  8.40it/s]

tensor(1.4974, grad_fn=<NllLossBackward0>)
tensor(1.5445, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 10004/17426 [16:24<14:14,  8.69it/s]

tensor(1.4522, grad_fn=<NllLossBackward0>)
tensor(1.4855, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 10006/17426 [16:24<14:01,  8.82it/s]

tensor(1.5151, grad_fn=<NllLossBackward0>)
tensor(1.5051, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 10008/17426 [16:25<14:32,  8.50it/s]

tensor(1.5089, grad_fn=<NllLossBackward0>)
tensor(1.4743, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 10010/17426 [16:25<13:59,  8.83it/s]

tensor(1.5132, grad_fn=<NllLossBackward0>)
tensor(1.4895, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 10012/17426 [16:25<13:46,  8.97it/s]

tensor(1.5227, grad_fn=<NllLossBackward0>)
tensor(1.4462, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 10014/17426 [16:25<13:44,  8.99it/s]

tensor(1.5038, grad_fn=<NllLossBackward0>)
tensor(1.5062, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 10016/17426 [16:25<13:50,  8.93it/s]

tensor(1.4979, grad_fn=<NllLossBackward0>)
tensor(1.5438, grad_fn=<NllLossBackward0>)


 57%|█████▋    | 10018/17426 [16:26<14:06,  8.75it/s]

tensor(1.4615, grad_fn=<NllLossBackward0>)
tensor(1.5497, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10020/17426 [16:26<13:49,  8.92it/s]

tensor(1.4894, grad_fn=<NllLossBackward0>)
tensor(1.4809, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10022/17426 [16:26<13:44,  8.98it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5124, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10024/17426 [16:26<13:40,  9.02it/s]

tensor(1.5167, grad_fn=<NllLossBackward0>)
tensor(1.5215, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10026/17426 [16:26<12:53,  9.56it/s]

tensor(1.4984, grad_fn=<NllLossBackward0>)
tensor(1.4863, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10028/17426 [16:27<13:30,  9.13it/s]

tensor(1.5198, grad_fn=<NllLossBackward0>)
tensor(1.4730, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10030/17426 [16:27<13:33,  9.09it/s]

tensor(1.5218, grad_fn=<NllLossBackward0>)
tensor(1.5207, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10032/17426 [16:27<13:41,  9.00it/s]

tensor(1.5083, grad_fn=<NllLossBackward0>)
tensor(1.5237, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10034/17426 [16:27<13:24,  9.19it/s]

tensor(1.5085, grad_fn=<NllLossBackward0>)
tensor(1.5028, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10036/17426 [16:28<14:05,  8.74it/s]

tensor(1.4872, grad_fn=<NllLossBackward0>)
tensor(1.4549, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10039/17426 [16:28<12:46,  9.64it/s]

tensor(1.4925, grad_fn=<NllLossBackward0>)
tensor(1.5005, grad_fn=<NllLossBackward0>)
tensor(1.4919, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10042/17426 [16:28<12:15, 10.04it/s]

tensor(1.4825, grad_fn=<NllLossBackward0>)
tensor(1.5268, grad_fn=<NllLossBackward0>)
tensor(1.5190, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10044/17426 [16:28<12:52,  9.55it/s]

tensor(1.5186, grad_fn=<NllLossBackward0>)
tensor(1.4567, grad_fn=<NllLossBackward0>)
tensor(1.4677, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10047/17426 [16:29<12:45,  9.64it/s]

tensor(1.5099, grad_fn=<NllLossBackward0>)
tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.5283, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10050/17426 [16:29<12:27,  9.87it/s]

tensor(1.4972, grad_fn=<NllLossBackward0>)
tensor(1.4445, grad_fn=<NllLossBackward0>)
tensor(1.4941, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10054/17426 [16:29<11:46, 10.44it/s]

tensor(1.5087, grad_fn=<NllLossBackward0>)
tensor(1.5110, grad_fn=<NllLossBackward0>)
tensor(1.4967, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10056/17426 [16:30<12:04, 10.17it/s]

tensor(1.5338, grad_fn=<NllLossBackward0>)
tensor(1.5241, grad_fn=<NllLossBackward0>)
tensor(1.5018, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10059/17426 [16:30<12:28,  9.85it/s]

tensor(1.4770, grad_fn=<NllLossBackward0>)
tensor(1.5540, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10061/17426 [16:30<12:52,  9.53it/s]

tensor(1.4577, grad_fn=<NllLossBackward0>)
tensor(1.5033, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10063/17426 [16:30<13:11,  9.30it/s]

tensor(1.5249, grad_fn=<NllLossBackward0>)
tensor(1.4811, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10065/17426 [16:31<13:19,  9.21it/s]

tensor(1.5161, grad_fn=<NllLossBackward0>)
tensor(1.5241, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10067/17426 [16:31<13:42,  8.95it/s]

tensor(1.5172, grad_fn=<NllLossBackward0>)
tensor(1.5340, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10069/17426 [16:31<13:48,  8.88it/s]

tensor(1.5037, grad_fn=<NllLossBackward0>)
tensor(1.5283, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10071/17426 [16:31<13:32,  9.05it/s]

tensor(1.5119, grad_fn=<NllLossBackward0>)
tensor(1.4994, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10073/17426 [16:32<13:31,  9.07it/s]

tensor(1.5615, grad_fn=<NllLossBackward0>)
tensor(1.5320, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10075/17426 [16:32<14:18,  8.57it/s]

tensor(1.5541, grad_fn=<NllLossBackward0>)
tensor(1.4451, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10077/17426 [16:32<13:45,  8.90it/s]

tensor(1.4562, grad_fn=<NllLossBackward0>)
tensor(1.4901, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10079/17426 [16:32<13:03,  9.38it/s]

tensor(1.4675, grad_fn=<NllLossBackward0>)
tensor(1.4987, grad_fn=<NllLossBackward0>)
tensor(1.5392, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10081/17426 [16:32<12:42,  9.63it/s]

tensor(1.4908, grad_fn=<NllLossBackward0>)
tensor(1.5075, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10084/17426 [16:33<13:50,  8.84it/s]

tensor(1.4987, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10086/17426 [16:33<15:56,  7.67it/s]

tensor(1.5014, grad_fn=<NllLossBackward0>)
tensor(1.5199, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10088/17426 [16:33<16:21,  7.48it/s]

tensor(1.5008, grad_fn=<NllLossBackward0>)
tensor(1.4954, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10090/17426 [16:34<15:59,  7.65it/s]

tensor(1.4828, grad_fn=<NllLossBackward0>)
tensor(1.5035, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10092/17426 [16:34<16:30,  7.40it/s]

tensor(1.4747, grad_fn=<NllLossBackward0>)
tensor(1.5138, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10094/17426 [16:34<16:55,  7.22it/s]

tensor(1.4647, grad_fn=<NllLossBackward0>)
tensor(1.4716, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10096/17426 [16:34<17:52,  6.83it/s]

tensor(1.4739, grad_fn=<NllLossBackward0>)
tensor(1.5616, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10098/17426 [16:35<17:10,  7.11it/s]

tensor(1.5262, grad_fn=<NllLossBackward0>)
tensor(1.4559, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10100/17426 [16:35<15:45,  7.75it/s]

tensor(1.4808, grad_fn=<NllLossBackward0>)
tensor(1.5106, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10102/17426 [16:35<17:20,  7.04it/s]

tensor(1.4977, grad_fn=<NllLossBackward0>)
tensor(1.5298, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10104/17426 [16:36<17:33,  6.95it/s]

tensor(1.5239, grad_fn=<NllLossBackward0>)
tensor(1.4895, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10106/17426 [16:36<17:15,  7.07it/s]

tensor(1.5230, grad_fn=<NllLossBackward0>)
tensor(1.5116, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10108/17426 [16:36<17:03,  7.15it/s]

tensor(1.5063, grad_fn=<NllLossBackward0>)
tensor(1.5165, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10110/17426 [16:36<18:26,  6.61it/s]

tensor(1.5110, grad_fn=<NllLossBackward0>)
tensor(1.5236, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10112/17426 [16:37<17:41,  6.89it/s]

tensor(1.4875, grad_fn=<NllLossBackward0>)
tensor(1.5035, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10114/17426 [16:37<18:16,  6.67it/s]

tensor(1.4833, grad_fn=<NllLossBackward0>)
tensor(1.5596, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10116/17426 [16:37<16:24,  7.43it/s]

tensor(1.4711, grad_fn=<NllLossBackward0>)
tensor(1.5030, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10118/17426 [16:37<15:30,  7.86it/s]

tensor(1.4752, grad_fn=<NllLossBackward0>)
tensor(1.4648, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10120/17426 [16:38<15:22,  7.92it/s]

tensor(1.5097, grad_fn=<NllLossBackward0>)
tensor(1.4748, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10122/17426 [16:38<14:18,  8.51it/s]

tensor(1.4953, grad_fn=<NllLossBackward0>)
tensor(1.5314, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10124/17426 [16:38<14:02,  8.67it/s]

tensor(1.4839, grad_fn=<NllLossBackward0>)
tensor(1.5088, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10126/17426 [16:38<13:44,  8.86it/s]

tensor(1.5066, grad_fn=<NllLossBackward0>)
tensor(1.4897, grad_fn=<NllLossBackward0>)
tensor(1.4878, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10130/17426 [16:39<11:50, 10.27it/s]

tensor(1.5180, grad_fn=<NllLossBackward0>)
tensor(1.4638, grad_fn=<NllLossBackward0>)
tensor(1.5159, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10132/17426 [16:39<11:47, 10.31it/s]

tensor(1.5336, grad_fn=<NllLossBackward0>)
tensor(1.5443, grad_fn=<NllLossBackward0>)
tensor(1.5508, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10136/17426 [16:39<11:42, 10.38it/s]

tensor(1.5280, grad_fn=<NllLossBackward0>)
tensor(1.5668, grad_fn=<NllLossBackward0>)
tensor(1.4758, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10138/17426 [16:40<11:42, 10.37it/s]

tensor(1.4719, grad_fn=<NllLossBackward0>)
tensor(1.5001, grad_fn=<NllLossBackward0>)
tensor(1.4841, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10142/17426 [16:40<11:34, 10.49it/s]

tensor(1.4724, grad_fn=<NllLossBackward0>)
tensor(1.4970, grad_fn=<NllLossBackward0>)
tensor(1.5283, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10144/17426 [16:40<11:52, 10.22it/s]

tensor(1.4926, grad_fn=<NllLossBackward0>)
tensor(1.5059, grad_fn=<NllLossBackward0>)
tensor(1.4876, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10147/17426 [16:40<12:16,  9.88it/s]

tensor(1.4724, grad_fn=<NllLossBackward0>)
tensor(1.4820, grad_fn=<NllLossBackward0>)
tensor(1.5444, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10151/17426 [16:41<11:43, 10.35it/s]

tensor(1.5008, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)
tensor(1.4794, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10153/17426 [16:41<11:41, 10.36it/s]

tensor(1.4877, grad_fn=<NllLossBackward0>)
tensor(1.4814, grad_fn=<NllLossBackward0>)
tensor(1.5135, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10157/17426 [16:41<11:53, 10.19it/s]

tensor(1.5123, grad_fn=<NllLossBackward0>)
tensor(1.5416, grad_fn=<NllLossBackward0>)
tensor(1.4738, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10159/17426 [16:42<12:20,  9.81it/s]

tensor(1.5221, grad_fn=<NllLossBackward0>)
tensor(1.4799, grad_fn=<NllLossBackward0>)
tensor(1.4974, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10162/17426 [16:42<12:13,  9.90it/s]

tensor(1.4590, grad_fn=<NllLossBackward0>)
tensor(1.5013, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10164/17426 [16:42<12:49,  9.44it/s]

tensor(1.5290, grad_fn=<NllLossBackward0>)
tensor(1.5710, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10166/17426 [16:42<13:40,  8.84it/s]

tensor(1.5098, grad_fn=<NllLossBackward0>)
tensor(1.4800, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10168/17426 [16:43<13:26,  9.00it/s]

tensor(1.4774, grad_fn=<NllLossBackward0>)
tensor(1.4935, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10170/17426 [16:43<13:52,  8.72it/s]

tensor(1.5154, grad_fn=<NllLossBackward0>)
tensor(1.5194, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10172/17426 [16:43<13:38,  8.87it/s]

tensor(1.4889, grad_fn=<NllLossBackward0>)
tensor(1.4995, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10174/17426 [16:43<13:46,  8.78it/s]

tensor(1.5319, grad_fn=<NllLossBackward0>)
tensor(1.4890, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10176/17426 [16:44<13:46,  8.77it/s]

tensor(1.4905, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10178/17426 [16:44<13:37,  8.86it/s]

tensor(1.4957, grad_fn=<NllLossBackward0>)
tensor(1.5174, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10180/17426 [16:44<13:27,  8.97it/s]

tensor(1.5237, grad_fn=<NllLossBackward0>)
tensor(1.4640, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10182/17426 [16:44<13:19,  9.06it/s]

tensor(1.5017, grad_fn=<NllLossBackward0>)
tensor(1.4947, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10184/17426 [16:44<13:13,  9.13it/s]

tensor(1.4825, grad_fn=<NllLossBackward0>)
tensor(1.4776, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10186/17426 [16:45<13:56,  8.66it/s]

tensor(1.4572, grad_fn=<NllLossBackward0>)
tensor(1.5288, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10188/17426 [16:45<13:39,  8.83it/s]

tensor(1.5193, grad_fn=<NllLossBackward0>)
tensor(1.5066, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10190/17426 [16:45<12:46,  9.43it/s]

tensor(1.4939, grad_fn=<NllLossBackward0>)
tensor(1.4942, grad_fn=<NllLossBackward0>)
tensor(1.4982, grad_fn=<NllLossBackward0>)


 58%|█████▊    | 10194/17426 [16:45<11:35, 10.39it/s]

tensor(1.4522, grad_fn=<NllLossBackward0>)
tensor(1.5253, grad_fn=<NllLossBackward0>)
tensor(1.4807, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10197/17426 [16:46<12:09,  9.91it/s]

tensor(1.4599, grad_fn=<NllLossBackward0>)
tensor(1.4553, grad_fn=<NllLossBackward0>)
tensor(1.4656, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10200/17426 [16:46<11:46, 10.22it/s]

tensor(1.5113, grad_fn=<NllLossBackward0>)
tensor(1.4996, grad_fn=<NllLossBackward0>)
tensor(1.5104, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10202/17426 [16:46<11:51, 10.16it/s]

tensor(1.5011, grad_fn=<NllLossBackward0>)
tensor(1.4783, grad_fn=<NllLossBackward0>)
tensor(1.5112, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10206/17426 [16:47<11:36, 10.36it/s]

tensor(1.4828, grad_fn=<NllLossBackward0>)
tensor(1.4903, grad_fn=<NllLossBackward0>)
tensor(1.5214, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10208/17426 [16:47<11:39, 10.32it/s]

tensor(1.5151, grad_fn=<NllLossBackward0>)
tensor(1.4888, grad_fn=<NllLossBackward0>)
tensor(1.5107, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10211/17426 [16:47<12:57,  9.28it/s]

tensor(1.4516, grad_fn=<NllLossBackward0>)
tensor(1.5138, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10213/17426 [16:47<13:46,  8.72it/s]

tensor(1.4931, grad_fn=<NllLossBackward0>)
tensor(1.5054, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10215/17426 [16:48<15:50,  7.58it/s]

tensor(1.5090, grad_fn=<NllLossBackward0>)
tensor(1.5209, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10217/17426 [16:48<15:14,  7.88it/s]

tensor(1.4506, grad_fn=<NllLossBackward0>)
tensor(1.5001, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10219/17426 [16:48<14:51,  8.08it/s]

tensor(1.4641, grad_fn=<NllLossBackward0>)
tensor(1.5007, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10221/17426 [16:48<14:30,  8.28it/s]

tensor(1.5002, grad_fn=<NllLossBackward0>)
tensor(1.4464, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10223/17426 [16:49<14:47,  8.11it/s]

tensor(1.5086, grad_fn=<NllLossBackward0>)
tensor(1.5023, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10225/17426 [16:49<15:30,  7.74it/s]

tensor(1.5255, grad_fn=<NllLossBackward0>)
tensor(1.5184, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10227/17426 [16:49<15:05,  7.95it/s]

tensor(1.5435, grad_fn=<NllLossBackward0>)
tensor(1.5564, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10229/17426 [16:49<14:18,  8.38it/s]

tensor(1.5622, grad_fn=<NllLossBackward0>)
tensor(1.4952, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10231/17426 [16:50<14:05,  8.51it/s]

tensor(1.5260, grad_fn=<NllLossBackward0>)
tensor(1.5127, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10233/17426 [16:50<15:46,  7.60it/s]

tensor(1.5287, grad_fn=<NllLossBackward0>)
tensor(1.4981, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10235/17426 [16:50<16:35,  7.22it/s]

tensor(1.5125, grad_fn=<NllLossBackward0>)
tensor(1.5098, grad_fn=<NllLossBackward0>)


 59%|█████▊    | 10237/17426 [16:51<17:24,  6.88it/s]

tensor(1.4616, grad_fn=<NllLossBackward0>)
tensor(1.4500, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10239/17426 [16:51<16:40,  7.18it/s]

tensor(1.4912, grad_fn=<NllLossBackward0>)
tensor(1.5151, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10241/17426 [16:51<18:10,  6.59it/s]

tensor(1.4714, grad_fn=<NllLossBackward0>)
tensor(1.5259, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10243/17426 [16:51<17:55,  6.68it/s]

tensor(1.5241, grad_fn=<NllLossBackward0>)
tensor(1.5092, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10245/17426 [16:52<18:18,  6.54it/s]

tensor(1.5071, grad_fn=<NllLossBackward0>)
tensor(1.5122, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10247/17426 [16:52<17:37,  6.79it/s]

tensor(1.4680, grad_fn=<NllLossBackward0>)
tensor(1.4336, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10249/17426 [16:52<15:54,  7.52it/s]

tensor(1.5500, grad_fn=<NllLossBackward0>)
tensor(1.4829, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10251/17426 [16:53<15:06,  7.91it/s]

tensor(1.4976, grad_fn=<NllLossBackward0>)
tensor(1.5263, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10253/17426 [16:53<14:55,  8.01it/s]

tensor(1.4845, grad_fn=<NllLossBackward0>)
tensor(1.5859, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10255/17426 [16:53<14:55,  8.01it/s]

tensor(1.4836, grad_fn=<NllLossBackward0>)
tensor(1.5197, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10257/17426 [16:53<14:35,  8.19it/s]

tensor(1.4739, grad_fn=<NllLossBackward0>)
tensor(1.5190, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10259/17426 [16:54<13:53,  8.60it/s]

tensor(1.4818, grad_fn=<NllLossBackward0>)
tensor(1.5152, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10261/17426 [16:54<13:43,  8.70it/s]

tensor(1.5056, grad_fn=<NllLossBackward0>)
tensor(1.5107, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10263/17426 [16:54<13:18,  8.97it/s]

tensor(1.5363, grad_fn=<NllLossBackward0>)
tensor(1.5074, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10265/17426 [16:54<13:33,  8.81it/s]

tensor(1.5058, grad_fn=<NllLossBackward0>)
tensor(1.5051, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10267/17426 [16:54<13:34,  8.79it/s]

tensor(1.4988, grad_fn=<NllLossBackward0>)
tensor(1.5197, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10269/17426 [16:55<13:27,  8.86it/s]

tensor(1.4944, grad_fn=<NllLossBackward0>)
tensor(1.4960, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10271/17426 [16:55<12:40,  9.41it/s]

tensor(1.4897, grad_fn=<NllLossBackward0>)
tensor(1.4739, grad_fn=<NllLossBackward0>)
tensor(1.4868, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10274/17426 [16:55<12:55,  9.22it/s]

tensor(1.5031, grad_fn=<NllLossBackward0>)
tensor(1.4667, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10276/17426 [16:55<13:15,  8.98it/s]

tensor(1.4743, grad_fn=<NllLossBackward0>)
tensor(1.4878, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10278/17426 [16:56<13:24,  8.89it/s]

tensor(1.4876, grad_fn=<NllLossBackward0>)
tensor(1.4918, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10280/17426 [16:56<13:20,  8.93it/s]

tensor(1.4922, grad_fn=<NllLossBackward0>)
tensor(1.4940, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10282/17426 [16:56<13:25,  8.87it/s]

tensor(1.4876, grad_fn=<NllLossBackward0>)
tensor(1.4764, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10285/17426 [16:56<12:31,  9.50it/s]

tensor(1.5013, grad_fn=<NllLossBackward0>)
tensor(1.4558, grad_fn=<NllLossBackward0>)
tensor(1.4828, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10287/17426 [16:57<12:44,  9.33it/s]

tensor(1.4961, grad_fn=<NllLossBackward0>)
tensor(1.5056, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10289/17426 [16:57<12:56,  9.20it/s]

tensor(1.4808, grad_fn=<NllLossBackward0>)
tensor(1.4639, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10291/17426 [16:57<13:08,  9.04it/s]

tensor(1.4538, grad_fn=<NllLossBackward0>)
tensor(1.5248, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10293/17426 [16:57<13:32,  8.78it/s]

tensor(1.4929, grad_fn=<NllLossBackward0>)
tensor(1.4767, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10295/17426 [16:58<13:17,  8.94it/s]

tensor(1.4803, grad_fn=<NllLossBackward0>)
tensor(1.4988, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10297/17426 [16:58<13:10,  9.02it/s]

tensor(1.4742, grad_fn=<NllLossBackward0>)
tensor(1.4795, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10299/17426 [16:58<13:15,  8.96it/s]

tensor(1.4962, grad_fn=<NllLossBackward0>)
tensor(1.4716, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10301/17426 [16:58<13:04,  9.09it/s]

tensor(1.4510, grad_fn=<NllLossBackward0>)
tensor(1.5256, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10303/17426 [16:58<13:34,  8.74it/s]

tensor(1.5005, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10305/17426 [16:59<13:41,  8.66it/s]

tensor(1.4619, grad_fn=<NllLossBackward0>)
tensor(1.5197, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10307/17426 [16:59<13:15,  8.94it/s]

tensor(1.5214, grad_fn=<NllLossBackward0>)
tensor(1.4784, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10309/17426 [16:59<13:18,  8.92it/s]

tensor(1.4865, grad_fn=<NllLossBackward0>)
tensor(1.4889, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10311/17426 [16:59<13:43,  8.64it/s]

tensor(1.5022, grad_fn=<NllLossBackward0>)
tensor(1.4713, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10314/17426 [17:00<12:20,  9.61it/s]

tensor(1.4918, grad_fn=<NllLossBackward0>)
tensor(1.4930, grad_fn=<NllLossBackward0>)
tensor(1.5071, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10317/17426 [17:00<11:49, 10.02it/s]

tensor(1.5125, grad_fn=<NllLossBackward0>)
tensor(1.5126, grad_fn=<NllLossBackward0>)
tensor(1.5073, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10320/17426 [17:00<11:36, 10.20it/s]

tensor(1.5149, grad_fn=<NllLossBackward0>)
tensor(1.4613, grad_fn=<NllLossBackward0>)
tensor(1.4805, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10322/17426 [17:00<12:42,  9.32it/s]

tensor(1.4672, grad_fn=<NllLossBackward0>)
tensor(1.5029, grad_fn=<NllLossBackward0>)
tensor(1.4928, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10326/17426 [17:01<11:29, 10.30it/s]

tensor(1.4967, grad_fn=<NllLossBackward0>)
tensor(1.5472, grad_fn=<NllLossBackward0>)
tensor(1.4430, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10328/17426 [17:01<11:37, 10.18it/s]

tensor(1.4952, grad_fn=<NllLossBackward0>)
tensor(1.5090, grad_fn=<NllLossBackward0>)
tensor(1.5100, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10332/17426 [17:01<11:36, 10.19it/s]

tensor(1.5232, grad_fn=<NllLossBackward0>)
tensor(1.5270, grad_fn=<NllLossBackward0>)
tensor(1.4856, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10334/17426 [17:02<11:42, 10.09it/s]

tensor(1.5109, grad_fn=<NllLossBackward0>)
tensor(1.5146, grad_fn=<NllLossBackward0>)
tensor(1.5023, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10338/17426 [17:02<11:22, 10.39it/s]

tensor(1.4922, grad_fn=<NllLossBackward0>)
tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.4586, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10340/17426 [17:02<12:42,  9.29it/s]

tensor(1.4868, grad_fn=<NllLossBackward0>)
tensor(1.5179, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10342/17426 [17:03<14:10,  8.33it/s]

tensor(1.5060, grad_fn=<NllLossBackward0>)
tensor(1.4951, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10344/17426 [17:03<14:27,  8.17it/s]

tensor(1.5355, grad_fn=<NllLossBackward0>)
tensor(1.4696, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10346/17426 [17:03<15:08,  7.79it/s]

tensor(1.4893, grad_fn=<NllLossBackward0>)
tensor(1.4637, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10348/17426 [17:03<14:55,  7.90it/s]

tensor(1.4745, grad_fn=<NllLossBackward0>)
tensor(1.5163, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10350/17426 [17:04<15:22,  7.67it/s]

tensor(1.5644, grad_fn=<NllLossBackward0>)
tensor(1.4741, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10352/17426 [17:04<14:48,  7.96it/s]

tensor(1.5024, grad_fn=<NllLossBackward0>)
tensor(1.4979, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10354/17426 [17:04<16:15,  7.25it/s]

tensor(1.5255, grad_fn=<NllLossBackward0>)
tensor(1.5190, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10356/17426 [17:04<15:57,  7.39it/s]

tensor(1.5471, grad_fn=<NllLossBackward0>)
tensor(1.5210, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10358/17426 [17:05<15:44,  7.48it/s]

tensor(1.4698, grad_fn=<NllLossBackward0>)
tensor(1.4902, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10360/17426 [17:05<16:18,  7.22it/s]

tensor(1.4539, grad_fn=<NllLossBackward0>)
tensor(1.4968, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10362/17426 [17:05<15:55,  7.39it/s]

tensor(1.4829, grad_fn=<NllLossBackward0>)
tensor(1.5087, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10364/17426 [17:06<16:16,  7.23it/s]

tensor(1.5284, grad_fn=<NllLossBackward0>)
tensor(1.5073, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10366/17426 [17:06<17:58,  6.55it/s]

tensor(1.5626, grad_fn=<NllLossBackward0>)
tensor(1.4802, grad_fn=<NllLossBackward0>)


 59%|█████▉    | 10368/17426 [17:06<19:14,  6.11it/s]

tensor(1.5184, grad_fn=<NllLossBackward0>)
tensor(1.5333, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10370/17426 [17:06<17:29,  6.72it/s]

tensor(1.4600, grad_fn=<NllLossBackward0>)
tensor(1.4709, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10372/17426 [17:07<16:03,  7.32it/s]

tensor(1.5001, grad_fn=<NllLossBackward0>)
tensor(1.4680, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10374/17426 [17:07<15:37,  7.52it/s]

tensor(1.4768, grad_fn=<NllLossBackward0>)
tensor(1.5193, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10376/17426 [17:07<14:55,  7.88it/s]

tensor(1.5017, grad_fn=<NllLossBackward0>)
tensor(1.5155, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10378/17426 [17:07<14:28,  8.11it/s]

tensor(1.5160, grad_fn=<NllLossBackward0>)
tensor(1.4922, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10380/17426 [17:08<14:14,  8.24it/s]

tensor(1.5230, grad_fn=<NllLossBackward0>)
tensor(1.5100, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10382/17426 [17:08<14:44,  7.97it/s]

tensor(1.5022, grad_fn=<NllLossBackward0>)
tensor(1.4728, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10384/17426 [17:08<14:30,  8.09it/s]

tensor(1.5243, grad_fn=<NllLossBackward0>)
tensor(1.4993, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10386/17426 [17:08<13:42,  8.56it/s]

tensor(1.4794, grad_fn=<NllLossBackward0>)
tensor(1.4904, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10388/17426 [17:09<13:32,  8.66it/s]

tensor(1.4837, grad_fn=<NllLossBackward0>)
tensor(1.4840, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10390/17426 [17:09<13:33,  8.65it/s]

tensor(1.4816, grad_fn=<NllLossBackward0>)
tensor(1.4872, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10392/17426 [17:09<13:19,  8.80it/s]

tensor(1.4856, grad_fn=<NllLossBackward0>)
tensor(1.4676, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10394/17426 [17:09<13:16,  8.83it/s]

tensor(1.5210, grad_fn=<NllLossBackward0>)
tensor(1.5001, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10396/17426 [17:10<13:26,  8.71it/s]

tensor(1.4849, grad_fn=<NllLossBackward0>)
tensor(1.5032, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10398/17426 [17:10<13:19,  8.79it/s]

tensor(1.5098, grad_fn=<NllLossBackward0>)
tensor(1.4938, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10400/17426 [17:10<13:43,  8.53it/s]

tensor(1.4997, grad_fn=<NllLossBackward0>)
tensor(1.5138, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10402/17426 [17:10<13:07,  8.92it/s]

tensor(1.4985, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10404/17426 [17:10<13:16,  8.82it/s]

tensor(1.4480, grad_fn=<NllLossBackward0>)
tensor(1.4554, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10406/17426 [17:11<13:25,  8.72it/s]

tensor(1.4608, grad_fn=<NllLossBackward0>)
tensor(1.4580, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10408/17426 [17:11<13:16,  8.81it/s]

tensor(1.4766, grad_fn=<NllLossBackward0>)
tensor(1.4679, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10410/17426 [17:11<13:29,  8.67it/s]

tensor(1.4832, grad_fn=<NllLossBackward0>)
tensor(1.5410, grad_fn=<NllLossBackward0>)
tensor(1.4940, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10414/17426 [17:12<11:33, 10.12it/s]

tensor(1.4943, grad_fn=<NllLossBackward0>)
tensor(1.5020, grad_fn=<NllLossBackward0>)
tensor(1.5776, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10416/17426 [17:12<12:08,  9.62it/s]

tensor(1.5392, grad_fn=<NllLossBackward0>)
tensor(1.5197, grad_fn=<NllLossBackward0>)
tensor(1.4740, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10419/17426 [17:12<12:33,  9.30it/s]

tensor(1.4910, grad_fn=<NllLossBackward0>)
tensor(1.4460, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10421/17426 [17:12<12:47,  9.13it/s]

tensor(1.4846, grad_fn=<NllLossBackward0>)
tensor(1.5150, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10423/17426 [17:13<12:49,  9.10it/s]

tensor(1.5017, grad_fn=<NllLossBackward0>)
tensor(1.5042, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10425/17426 [17:13<13:04,  8.92it/s]

tensor(1.5160, grad_fn=<NllLossBackward0>)
tensor(1.4682, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10427/17426 [17:13<13:04,  8.92it/s]

tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.5158, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10429/17426 [17:13<13:40,  8.53it/s]

tensor(1.4897, grad_fn=<NllLossBackward0>)
tensor(1.4513, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10431/17426 [17:13<13:28,  8.65it/s]

tensor(1.4907, grad_fn=<NllLossBackward0>)
tensor(1.4729, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10433/17426 [17:14<13:29,  8.64it/s]

tensor(1.4929, grad_fn=<NllLossBackward0>)
tensor(1.4713, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10435/17426 [17:14<13:11,  8.83it/s]

tensor(1.5049, grad_fn=<NllLossBackward0>)
tensor(1.4882, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10437/17426 [17:14<13:35,  8.57it/s]

tensor(1.5418, grad_fn=<NllLossBackward0>)
tensor(1.4632, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10439/17426 [17:14<13:12,  8.82it/s]

tensor(1.5082, grad_fn=<NllLossBackward0>)
tensor(1.5032, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10441/17426 [17:15<12:26,  9.36it/s]

tensor(1.5050, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10443/17426 [17:15<12:48,  9.08it/s]

tensor(1.5433, grad_fn=<NllLossBackward0>)
tensor(1.5178, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10445/17426 [17:15<12:48,  9.08it/s]

tensor(1.4668, grad_fn=<NllLossBackward0>)
tensor(1.5421, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10447/17426 [17:15<12:28,  9.32it/s]

tensor(1.5045, grad_fn=<NllLossBackward0>)
tensor(1.4951, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10449/17426 [17:15<12:42,  9.14it/s]

tensor(1.5042, grad_fn=<NllLossBackward0>)
tensor(1.4977, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10451/17426 [17:16<12:41,  9.15it/s]

tensor(1.5051, grad_fn=<NllLossBackward0>)
tensor(1.4686, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10453/17426 [17:16<12:48,  9.07it/s]

tensor(1.5226, grad_fn=<NllLossBackward0>)
tensor(1.4996, grad_fn=<NllLossBackward0>)


 60%|█████▉    | 10455/17426 [17:16<12:41,  9.16it/s]

tensor(1.4970, grad_fn=<NllLossBackward0>)
tensor(1.5316, grad_fn=<NllLossBackward0>)


 60%|██████    | 10457/17426 [17:16<13:44,  8.45it/s]

tensor(1.5322, grad_fn=<NllLossBackward0>)
tensor(1.5156, grad_fn=<NllLossBackward0>)


 60%|██████    | 10459/17426 [17:17<15:15,  7.61it/s]

tensor(1.4904, grad_fn=<NllLossBackward0>)
tensor(1.5056, grad_fn=<NllLossBackward0>)


 60%|██████    | 10461/17426 [17:17<15:56,  7.28it/s]

tensor(1.5232, grad_fn=<NllLossBackward0>)
tensor(1.4985, grad_fn=<NllLossBackward0>)


 60%|██████    | 10463/17426 [17:17<15:34,  7.45it/s]

tensor(1.5213, grad_fn=<NllLossBackward0>)
tensor(1.5564, grad_fn=<NllLossBackward0>)


 60%|██████    | 10465/17426 [17:17<15:21,  7.55it/s]

tensor(1.4588, grad_fn=<NllLossBackward0>)
tensor(1.5178, grad_fn=<NllLossBackward0>)


 60%|██████    | 10467/17426 [17:18<14:32,  7.98it/s]

tensor(1.4645, grad_fn=<NllLossBackward0>)
tensor(1.4889, grad_fn=<NllLossBackward0>)


 60%|██████    | 10469/17426 [17:18<15:38,  7.41it/s]

tensor(1.5222, grad_fn=<NllLossBackward0>)
tensor(1.4809, grad_fn=<NllLossBackward0>)


 60%|██████    | 10471/17426 [17:18<15:35,  7.43it/s]

tensor(1.4700, grad_fn=<NllLossBackward0>)
tensor(1.5082, grad_fn=<NllLossBackward0>)


 60%|██████    | 10473/17426 [17:19<16:27,  7.04it/s]

tensor(1.5081, grad_fn=<NllLossBackward0>)
tensor(1.4451, grad_fn=<NllLossBackward0>)


 60%|██████    | 10475/17426 [17:19<16:46,  6.90it/s]

tensor(1.5328, grad_fn=<NllLossBackward0>)
tensor(1.4669, grad_fn=<NllLossBackward0>)


 60%|██████    | 10477/17426 [17:19<16:56,  6.84it/s]

tensor(1.4841, grad_fn=<NllLossBackward0>)
tensor(1.5057, grad_fn=<NllLossBackward0>)


 60%|██████    | 10479/17426 [17:20<17:34,  6.59it/s]

tensor(1.5210, grad_fn=<NllLossBackward0>)
tensor(1.5411, grad_fn=<NllLossBackward0>)


 60%|██████    | 10481/17426 [17:20<17:38,  6.56it/s]

tensor(1.5166, grad_fn=<NllLossBackward0>)
tensor(1.5073, grad_fn=<NllLossBackward0>)


 60%|██████    | 10483/17426 [17:20<18:04,  6.40it/s]

tensor(1.5125, grad_fn=<NllLossBackward0>)
tensor(1.5406, grad_fn=<NllLossBackward0>)


 60%|██████    | 10485/17426 [17:20<17:41,  6.54it/s]

tensor(1.5011, grad_fn=<NllLossBackward0>)
tensor(1.5134, grad_fn=<NllLossBackward0>)


 60%|██████    | 10487/17426 [17:21<17:23,  6.65it/s]

tensor(1.5310, grad_fn=<NllLossBackward0>)
tensor(1.4758, grad_fn=<NllLossBackward0>)


 60%|██████    | 10489/17426 [17:21<17:16,  6.69it/s]

tensor(1.5116, grad_fn=<NllLossBackward0>)
tensor(1.4854, grad_fn=<NllLossBackward0>)


 60%|██████    | 10491/17426 [17:21<15:22,  7.52it/s]

tensor(1.5292, grad_fn=<NllLossBackward0>)
tensor(1.5115, grad_fn=<NllLossBackward0>)


 60%|██████    | 10493/17426 [17:22<14:32,  7.94it/s]

tensor(1.5319, grad_fn=<NllLossBackward0>)
tensor(1.5057, grad_fn=<NllLossBackward0>)


 60%|██████    | 10495/17426 [17:22<14:36,  7.91it/s]

tensor(1.5324, grad_fn=<NllLossBackward0>)
tensor(1.4612, grad_fn=<NllLossBackward0>)


 60%|██████    | 10497/17426 [17:22<13:33,  8.52it/s]

tensor(1.4770, grad_fn=<NllLossBackward0>)
tensor(1.5186, grad_fn=<NllLossBackward0>)


 60%|██████    | 10499/17426 [17:22<13:07,  8.80it/s]

tensor(1.5232, grad_fn=<NllLossBackward0>)
tensor(1.4573, grad_fn=<NllLossBackward0>)


 60%|██████    | 10501/17426 [17:22<12:57,  8.91it/s]

tensor(1.5225, grad_fn=<NllLossBackward0>)
tensor(1.4633, grad_fn=<NllLossBackward0>)


 60%|██████    | 10503/17426 [17:23<13:07,  8.79it/s]

tensor(1.4598, grad_fn=<NllLossBackward0>)
tensor(1.4758, grad_fn=<NllLossBackward0>)


 60%|██████    | 10505/17426 [17:23<13:08,  8.78it/s]

tensor(1.4895, grad_fn=<NllLossBackward0>)
tensor(1.5208, grad_fn=<NllLossBackward0>)


 60%|██████    | 10507/17426 [17:23<13:01,  8.85it/s]

tensor(1.5108, grad_fn=<NllLossBackward0>)
tensor(1.5097, grad_fn=<NllLossBackward0>)


 60%|██████    | 10509/17426 [17:23<12:37,  9.13it/s]

tensor(1.5521, grad_fn=<NllLossBackward0>)
tensor(1.5436, grad_fn=<NllLossBackward0>)


 60%|██████    | 10511/17426 [17:24<12:41,  9.08it/s]

tensor(1.4875, grad_fn=<NllLossBackward0>)
tensor(1.4592, grad_fn=<NllLossBackward0>)


 60%|██████    | 10513/17426 [17:24<13:22,  8.62it/s]

tensor(1.5716, grad_fn=<NllLossBackward0>)
tensor(1.4948, grad_fn=<NllLossBackward0>)


 60%|██████    | 10516/17426 [17:24<12:04,  9.54it/s]

tensor(1.5404, grad_fn=<NllLossBackward0>)
tensor(1.4654, grad_fn=<NllLossBackward0>)
tensor(1.5330, grad_fn=<NllLossBackward0>)


 60%|██████    | 10518/17426 [17:24<12:32,  9.18it/s]

tensor(1.4633, grad_fn=<NllLossBackward0>)
tensor(1.4671, grad_fn=<NllLossBackward0>)


 60%|██████    | 10521/17426 [17:25<11:54,  9.67it/s]

tensor(1.5320, grad_fn=<NllLossBackward0>)
tensor(1.5243, grad_fn=<NllLossBackward0>)
tensor(1.4888, grad_fn=<NllLossBackward0>)


 60%|██████    | 10523/17426 [17:25<12:28,  9.23it/s]

tensor(1.4552, grad_fn=<NllLossBackward0>)
tensor(1.5267, grad_fn=<NllLossBackward0>)


 60%|██████    | 10525/17426 [17:25<12:36,  9.12it/s]

tensor(1.5006, grad_fn=<NllLossBackward0>)
tensor(1.4947, grad_fn=<NllLossBackward0>)


 60%|██████    | 10527/17426 [17:25<12:40,  9.08it/s]

tensor(1.4959, grad_fn=<NllLossBackward0>)
tensor(1.4887, grad_fn=<NllLossBackward0>)
tensor(1.5272, grad_fn=<NllLossBackward0>)


 60%|██████    | 10531/17426 [17:26<11:07, 10.33it/s]

tensor(1.4663, grad_fn=<NllLossBackward0>)
tensor(1.4759, grad_fn=<NllLossBackward0>)
tensor(1.4892, grad_fn=<NllLossBackward0>)


 60%|██████    | 10533/17426 [17:26<11:32,  9.95it/s]

tensor(1.4988, grad_fn=<NllLossBackward0>)
tensor(1.5548, grad_fn=<NllLossBackward0>)
tensor(1.4637, grad_fn=<NllLossBackward0>)


 60%|██████    | 10536/17426 [17:26<11:41,  9.83it/s]

tensor(1.5571, grad_fn=<NllLossBackward0>)
tensor(1.5153, grad_fn=<NllLossBackward0>)
tensor(1.5002, grad_fn=<NllLossBackward0>)


 60%|██████    | 10540/17426 [17:27<11:00, 10.42it/s]

tensor(1.5010, grad_fn=<NllLossBackward0>)
tensor(1.5007, grad_fn=<NllLossBackward0>)
tensor(1.4734, grad_fn=<NllLossBackward0>)


 60%|██████    | 10542/17426 [17:27<11:14, 10.21it/s]

tensor(1.5446, grad_fn=<NllLossBackward0>)
tensor(1.4472, grad_fn=<NllLossBackward0>)
tensor(1.4721, grad_fn=<NllLossBackward0>)


 61%|██████    | 10546/17426 [17:27<11:14, 10.20it/s]

tensor(1.5525, grad_fn=<NllLossBackward0>)
tensor(1.4890, grad_fn=<NllLossBackward0>)
tensor(1.4624, grad_fn=<NllLossBackward0>)


 61%|██████    | 10548/17426 [17:27<11:20, 10.11it/s]

tensor(1.5109, grad_fn=<NllLossBackward0>)
tensor(1.5335, grad_fn=<NllLossBackward0>)
tensor(1.4920, grad_fn=<NllLossBackward0>)


 61%|██████    | 10552/17426 [17:28<10:58, 10.44it/s]

tensor(1.5112, grad_fn=<NllLossBackward0>)
tensor(1.4891, grad_fn=<NllLossBackward0>)
tensor(1.4953, grad_fn=<NllLossBackward0>)


 61%|██████    | 10554/17426 [17:28<11:16, 10.15it/s]

tensor(1.5030, grad_fn=<NllLossBackward0>)
tensor(1.4821, grad_fn=<NllLossBackward0>)
tensor(1.4929, grad_fn=<NllLossBackward0>)


 61%|██████    | 10558/17426 [17:28<10:55, 10.48it/s]

tensor(1.4886, grad_fn=<NllLossBackward0>)
tensor(1.5069, grad_fn=<NllLossBackward0>)
tensor(1.4629, grad_fn=<NllLossBackward0>)


 61%|██████    | 10560/17426 [17:29<11:11, 10.22it/s]

tensor(1.5080, grad_fn=<NllLossBackward0>)
tensor(1.4892, grad_fn=<NllLossBackward0>)
tensor(1.4903, grad_fn=<NllLossBackward0>)


 61%|██████    | 10564/17426 [17:29<10:48, 10.58it/s]

tensor(1.5130, grad_fn=<NllLossBackward0>)
tensor(1.5331, grad_fn=<NllLossBackward0>)
tensor(1.5287, grad_fn=<NllLossBackward0>)


 61%|██████    | 10566/17426 [17:29<11:19, 10.10it/s]

tensor(1.4637, grad_fn=<NllLossBackward0>)
tensor(1.4918, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)


 61%|██████    | 10570/17426 [17:29<10:51, 10.52it/s]

tensor(1.4467, grad_fn=<NllLossBackward0>)
tensor(1.4975, grad_fn=<NllLossBackward0>)
tensor(1.5157, grad_fn=<NllLossBackward0>)


 61%|██████    | 10572/17426 [17:30<11:08, 10.25it/s]

tensor(1.4660, grad_fn=<NllLossBackward0>)
tensor(1.5178, grad_fn=<NllLossBackward0>)
tensor(1.4886, grad_fn=<NllLossBackward0>)


 61%|██████    | 10576/17426 [17:30<11:15, 10.15it/s]

tensor(1.4843, grad_fn=<NllLossBackward0>)
tensor(1.5217, grad_fn=<NllLossBackward0>)
tensor(1.4821, grad_fn=<NllLossBackward0>)


 61%|██████    | 10578/17426 [17:30<11:21, 10.05it/s]

tensor(1.4939, grad_fn=<NllLossBackward0>)
tensor(1.5068, grad_fn=<NllLossBackward0>)
tensor(1.4561, grad_fn=<NllLossBackward0>)


 61%|██████    | 10582/17426 [17:31<10:59, 10.38it/s]

tensor(1.4914, grad_fn=<NllLossBackward0>)
tensor(1.4839, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)


 61%|██████    | 10584/17426 [17:31<11:03, 10.31it/s]

tensor(1.5136, grad_fn=<NllLossBackward0>)
tensor(1.4974, grad_fn=<NllLossBackward0>)
tensor(1.4746, grad_fn=<NllLossBackward0>)


 61%|██████    | 10587/17426 [17:31<12:18,  9.25it/s]

tensor(1.5150, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)


 61%|██████    | 10589/17426 [17:31<13:17,  8.58it/s]

tensor(1.5377, grad_fn=<NllLossBackward0>)
tensor(1.5255, grad_fn=<NllLossBackward0>)


 61%|██████    | 10591/17426 [17:32<14:49,  7.69it/s]

tensor(1.5046, grad_fn=<NllLossBackward0>)
tensor(1.5215, grad_fn=<NllLossBackward0>)


 61%|██████    | 10593/17426 [17:32<15:32,  7.33it/s]

tensor(1.4651, grad_fn=<NllLossBackward0>)
tensor(1.4569, grad_fn=<NllLossBackward0>)


 61%|██████    | 10595/17426 [17:32<15:24,  7.39it/s]

tensor(1.4674, grad_fn=<NllLossBackward0>)
tensor(1.5012, grad_fn=<NllLossBackward0>)


 61%|██████    | 10597/17426 [17:33<16:16,  6.99it/s]

tensor(1.4931, grad_fn=<NllLossBackward0>)
tensor(1.4852, grad_fn=<NllLossBackward0>)


 61%|██████    | 10599/17426 [17:33<15:10,  7.50it/s]

tensor(1.5141, grad_fn=<NllLossBackward0>)
tensor(1.5342, grad_fn=<NllLossBackward0>)


 61%|██████    | 10601/17426 [17:33<15:53,  7.16it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5320, grad_fn=<NllLossBackward0>)


 61%|██████    | 10603/17426 [17:33<16:42,  6.81it/s]

tensor(1.5255, grad_fn=<NllLossBackward0>)
tensor(1.4676, grad_fn=<NllLossBackward0>)


 61%|██████    | 10605/17426 [17:34<16:50,  6.75it/s]

tensor(1.4919, grad_fn=<NllLossBackward0>)
tensor(1.4759, grad_fn=<NllLossBackward0>)


 61%|██████    | 10607/17426 [17:34<16:58,  6.69it/s]

tensor(1.5362, grad_fn=<NllLossBackward0>)
tensor(1.5301, grad_fn=<NllLossBackward0>)


 61%|██████    | 10609/17426 [17:34<17:00,  6.68it/s]

tensor(1.4764, grad_fn=<NllLossBackward0>)
tensor(1.4983, grad_fn=<NllLossBackward0>)


 61%|██████    | 10611/17426 [17:35<16:38,  6.82it/s]

tensor(1.4853, grad_fn=<NllLossBackward0>)
tensor(1.4989, grad_fn=<NllLossBackward0>)


 61%|██████    | 10613/17426 [17:35<16:14,  6.99it/s]

tensor(1.4806, grad_fn=<NllLossBackward0>)
tensor(1.5014, grad_fn=<NllLossBackward0>)


 61%|██████    | 10615/17426 [17:35<16:10,  7.02it/s]

tensor(1.5203, grad_fn=<NllLossBackward0>)
tensor(1.4843, grad_fn=<NllLossBackward0>)


 61%|██████    | 10617/17426 [17:35<14:59,  7.57it/s]

tensor(1.4638, grad_fn=<NllLossBackward0>)
tensor(1.5263, grad_fn=<NllLossBackward0>)


 61%|██████    | 10619/17426 [17:36<14:14,  7.97it/s]

tensor(1.4554, grad_fn=<NllLossBackward0>)
tensor(1.4911, grad_fn=<NllLossBackward0>)


 61%|██████    | 10621/17426 [17:36<13:37,  8.33it/s]

tensor(1.4887, grad_fn=<NllLossBackward0>)
tensor(1.5293, grad_fn=<NllLossBackward0>)


 61%|██████    | 10623/17426 [17:36<13:15,  8.55it/s]

tensor(1.4618, grad_fn=<NllLossBackward0>)
tensor(1.4905, grad_fn=<NllLossBackward0>)


 61%|██████    | 10625/17426 [17:36<14:08,  8.02it/s]

tensor(1.4595, grad_fn=<NllLossBackward0>)
tensor(1.5367, grad_fn=<NllLossBackward0>)


 61%|██████    | 10627/17426 [17:37<13:58,  8.11it/s]

tensor(1.5335, grad_fn=<NllLossBackward0>)
tensor(1.4550, grad_fn=<NllLossBackward0>)


 61%|██████    | 10629/17426 [17:37<13:35,  8.33it/s]

tensor(1.5155, grad_fn=<NllLossBackward0>)
tensor(1.4652, grad_fn=<NllLossBackward0>)


 61%|██████    | 10631/17426 [17:37<13:25,  8.43it/s]

tensor(1.4678, grad_fn=<NllLossBackward0>)
tensor(1.5311, grad_fn=<NllLossBackward0>)


 61%|██████    | 10633/17426 [17:37<13:13,  8.56it/s]

tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.5393, grad_fn=<NllLossBackward0>)


 61%|██████    | 10635/17426 [17:38<13:21,  8.47it/s]

tensor(1.5281, grad_fn=<NllLossBackward0>)
tensor(1.5110, grad_fn=<NllLossBackward0>)


 61%|██████    | 10637/17426 [17:38<13:09,  8.60it/s]

tensor(1.4757, grad_fn=<NllLossBackward0>)
tensor(1.5107, grad_fn=<NllLossBackward0>)


 61%|██████    | 10639/17426 [17:38<12:47,  8.84it/s]

tensor(1.5991, grad_fn=<NllLossBackward0>)
tensor(1.5403, grad_fn=<NllLossBackward0>)


 61%|██████    | 10641/17426 [17:38<12:45,  8.86it/s]

tensor(1.5211, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)


 61%|██████    | 10643/17426 [17:39<13:10,  8.58it/s]

tensor(1.4830, grad_fn=<NllLossBackward0>)
tensor(1.5121, grad_fn=<NllLossBackward0>)


 61%|██████    | 10645/17426 [17:39<12:55,  8.74it/s]

tensor(1.4733, grad_fn=<NllLossBackward0>)
tensor(1.5139, grad_fn=<NllLossBackward0>)


 61%|██████    | 10647/17426 [17:39<12:40,  8.91it/s]

tensor(1.4809, grad_fn=<NllLossBackward0>)
tensor(1.5327, grad_fn=<NllLossBackward0>)


 61%|██████    | 10649/17426 [17:39<12:44,  8.86it/s]

tensor(1.4668, grad_fn=<NllLossBackward0>)
tensor(1.5290, grad_fn=<NllLossBackward0>)


 61%|██████    | 10651/17426 [17:39<12:37,  8.94it/s]

tensor(1.4756, grad_fn=<NllLossBackward0>)
tensor(1.4718, grad_fn=<NllLossBackward0>)


 61%|██████    | 10653/17426 [17:40<12:59,  8.69it/s]

tensor(1.5042, grad_fn=<NllLossBackward0>)
tensor(1.4514, grad_fn=<NllLossBackward0>)


 61%|██████    | 10655/17426 [17:40<12:58,  8.70it/s]

tensor(1.5239, grad_fn=<NllLossBackward0>)
tensor(1.4934, grad_fn=<NllLossBackward0>)


 61%|██████    | 10657/17426 [17:40<12:08,  9.30it/s]

tensor(1.5035, grad_fn=<NllLossBackward0>)
tensor(1.4448, grad_fn=<NllLossBackward0>)


 61%|██████    | 10659/17426 [17:40<12:21,  9.13it/s]

tensor(1.5013, grad_fn=<NllLossBackward0>)
tensor(1.4828, grad_fn=<NllLossBackward0>)


 61%|██████    | 10661/17426 [17:41<12:35,  8.95it/s]

tensor(1.4677, grad_fn=<NllLossBackward0>)
tensor(1.5244, grad_fn=<NllLossBackward0>)


 61%|██████    | 10663/17426 [17:41<12:52,  8.76it/s]

tensor(1.4812, grad_fn=<NllLossBackward0>)
tensor(1.4665, grad_fn=<NllLossBackward0>)


 61%|██████    | 10665/17426 [17:41<12:39,  8.90it/s]

tensor(1.4901, grad_fn=<NllLossBackward0>)
tensor(1.4577, grad_fn=<NllLossBackward0>)


 61%|██████    | 10667/17426 [17:41<12:28,  9.03it/s]

tensor(1.4952, grad_fn=<NllLossBackward0>)
tensor(1.4992, grad_fn=<NllLossBackward0>)


 61%|██████    | 10669/17426 [17:41<12:42,  8.86it/s]

tensor(1.4931, grad_fn=<NllLossBackward0>)
tensor(1.4920, grad_fn=<NllLossBackward0>)


 61%|██████    | 10671/17426 [17:42<13:14,  8.50it/s]

tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.5375, grad_fn=<NllLossBackward0>)


 61%|██████    | 10673/17426 [17:42<12:38,  8.90it/s]

tensor(1.5364, grad_fn=<NllLossBackward0>)
tensor(1.5286, grad_fn=<NllLossBackward0>)
tensor(1.4406, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10677/17426 [17:42<11:28,  9.80it/s]

tensor(1.4992, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)
tensor(1.4626, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10679/17426 [17:43<11:52,  9.47it/s]

tensor(1.5566, grad_fn=<NllLossBackward0>)
tensor(1.4986, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10681/17426 [17:43<12:55,  8.70it/s]

tensor(1.4675, grad_fn=<NllLossBackward0>)
tensor(1.5200, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10683/17426 [17:43<12:28,  9.01it/s]

tensor(1.5070, grad_fn=<NllLossBackward0>)
tensor(1.4721, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10685/17426 [17:43<12:36,  8.91it/s]

tensor(1.5189, grad_fn=<NllLossBackward0>)
tensor(1.4814, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10687/17426 [17:43<12:20,  9.10it/s]

tensor(1.4786, grad_fn=<NllLossBackward0>)
tensor(1.4797, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10689/17426 [17:44<12:31,  8.96it/s]

tensor(1.4978, grad_fn=<NllLossBackward0>)
tensor(1.4317, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10691/17426 [17:44<12:57,  8.67it/s]

tensor(1.4510, grad_fn=<NllLossBackward0>)
tensor(1.4944, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10693/17426 [17:44<12:40,  8.86it/s]

tensor(1.4984, grad_fn=<NllLossBackward0>)
tensor(1.5134, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10695/17426 [17:44<12:27,  9.01it/s]

tensor(1.4823, grad_fn=<NllLossBackward0>)
tensor(1.4768, grad_fn=<NllLossBackward0>)
tensor(1.4479, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10698/17426 [17:45<11:45,  9.53it/s]

tensor(1.5278, grad_fn=<NllLossBackward0>)
tensor(1.4727, grad_fn=<NllLossBackward0>)
tensor(1.5091, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10702/17426 [17:45<10:59, 10.20it/s]

tensor(1.4936, grad_fn=<NllLossBackward0>)
tensor(1.5516, grad_fn=<NllLossBackward0>)
tensor(1.4982, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10704/17426 [17:45<11:41,  9.58it/s]

tensor(1.5122, grad_fn=<NllLossBackward0>)
tensor(1.4998, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10706/17426 [17:46<13:26,  8.33it/s]

tensor(1.4880, grad_fn=<NllLossBackward0>)
tensor(1.5194, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10708/17426 [17:46<14:36,  7.66it/s]

tensor(1.5073, grad_fn=<NllLossBackward0>)
tensor(1.5072, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10710/17426 [17:46<15:00,  7.46it/s]

tensor(1.5445, grad_fn=<NllLossBackward0>)
tensor(1.4854, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10712/17426 [17:46<15:06,  7.41it/s]

tensor(1.4856, grad_fn=<NllLossBackward0>)
tensor(1.5252, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10714/17426 [17:47<15:14,  7.34it/s]

tensor(1.5494, grad_fn=<NllLossBackward0>)
tensor(1.4346, grad_fn=<NllLossBackward0>)


 61%|██████▏   | 10716/17426 [17:47<15:06,  7.40it/s]

tensor(1.4835, grad_fn=<NllLossBackward0>)
tensor(1.5105, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10718/17426 [17:47<15:09,  7.38it/s]

tensor(1.5200, grad_fn=<NllLossBackward0>)
tensor(1.4930, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10720/17426 [17:48<15:57,  7.00it/s]

tensor(1.5102, grad_fn=<NllLossBackward0>)
tensor(1.4945, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10722/17426 [17:48<15:16,  7.32it/s]

tensor(1.4820, grad_fn=<NllLossBackward0>)
tensor(1.4937, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10724/17426 [17:48<15:30,  7.20it/s]

tensor(1.5059, grad_fn=<NllLossBackward0>)
tensor(1.5337, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10726/17426 [17:48<16:38,  6.71it/s]

tensor(1.4715, grad_fn=<NllLossBackward0>)
tensor(1.5289, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10728/17426 [17:49<15:42,  7.11it/s]

tensor(1.4756, grad_fn=<NllLossBackward0>)
tensor(1.4729, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10730/17426 [17:49<15:26,  7.23it/s]

tensor(1.5116, grad_fn=<NllLossBackward0>)
tensor(1.5417, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10732/17426 [17:49<16:12,  6.89it/s]

tensor(1.5312, grad_fn=<NllLossBackward0>)
tensor(1.4639, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10734/17426 [17:50<15:55,  7.01it/s]

tensor(1.5125, grad_fn=<NllLossBackward0>)
tensor(1.4925, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10736/17426 [17:50<16:38,  6.70it/s]

tensor(1.5149, grad_fn=<NllLossBackward0>)
tensor(1.4865, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10738/17426 [17:50<15:10,  7.35it/s]

tensor(1.5074, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10740/17426 [17:50<14:24,  7.73it/s]

tensor(1.5216, grad_fn=<NllLossBackward0>)
tensor(1.5013, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10742/17426 [17:51<13:41,  8.13it/s]

tensor(1.5192, grad_fn=<NllLossBackward0>)
tensor(1.4708, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10744/17426 [17:51<13:44,  8.10it/s]

tensor(1.4861, grad_fn=<NllLossBackward0>)
tensor(1.5025, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10746/17426 [17:51<13:50,  8.04it/s]

tensor(1.4883, grad_fn=<NllLossBackward0>)
tensor(1.5155, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10748/17426 [17:51<13:51,  8.03it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.4328, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10750/17426 [17:52<12:51,  8.65it/s]

tensor(1.5275, grad_fn=<NllLossBackward0>)
tensor(1.5157, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10752/17426 [17:52<12:46,  8.71it/s]

tensor(1.5262, grad_fn=<NllLossBackward0>)
tensor(1.5413, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10754/17426 [17:52<12:38,  8.79it/s]

tensor(1.4918, grad_fn=<NllLossBackward0>)
tensor(1.4760, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10756/17426 [17:52<12:24,  8.96it/s]

tensor(1.5145, grad_fn=<NllLossBackward0>)
tensor(1.4658, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10758/17426 [17:52<12:38,  8.79it/s]

tensor(1.4792, grad_fn=<NllLossBackward0>)
tensor(1.4724, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10760/17426 [17:53<12:47,  8.69it/s]

tensor(1.4751, grad_fn=<NllLossBackward0>)
tensor(1.4610, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10762/17426 [17:53<12:35,  8.82it/s]

tensor(1.4894, grad_fn=<NllLossBackward0>)
tensor(1.4786, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10764/17426 [17:53<12:26,  8.93it/s]

tensor(1.5236, grad_fn=<NllLossBackward0>)
tensor(1.4603, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10766/17426 [17:53<12:11,  9.10it/s]

tensor(1.5566, grad_fn=<NllLossBackward0>)
tensor(1.5282, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10768/17426 [17:54<12:30,  8.87it/s]

tensor(1.5013, grad_fn=<NllLossBackward0>)
tensor(1.5181, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10770/17426 [17:54<12:34,  8.82it/s]

tensor(1.4732, grad_fn=<NllLossBackward0>)
tensor(1.4597, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10772/17426 [17:54<12:24,  8.94it/s]

tensor(1.5104, grad_fn=<NllLossBackward0>)
tensor(1.4790, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10774/17426 [17:54<12:23,  8.94it/s]

tensor(1.4800, grad_fn=<NllLossBackward0>)
tensor(1.4995, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10776/17426 [17:54<12:50,  8.63it/s]

tensor(1.4972, grad_fn=<NllLossBackward0>)
tensor(1.4971, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10778/17426 [17:55<12:40,  8.74it/s]

tensor(1.4648, grad_fn=<NllLossBackward0>)
tensor(1.4344, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10780/17426 [17:55<12:32,  8.83it/s]

tensor(1.4738, grad_fn=<NllLossBackward0>)
tensor(1.4619, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10782/17426 [17:55<12:27,  8.89it/s]

tensor(1.4715, grad_fn=<NllLossBackward0>)
tensor(1.4599, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10784/17426 [17:55<12:22,  8.95it/s]

tensor(1.5506, grad_fn=<NllLossBackward0>)
tensor(1.5469, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10786/17426 [17:56<12:54,  8.58it/s]

tensor(1.5273, grad_fn=<NllLossBackward0>)
tensor(1.5510, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10788/17426 [17:56<12:35,  8.79it/s]

tensor(1.5247, grad_fn=<NllLossBackward0>)
tensor(1.4836, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10790/17426 [17:56<12:45,  8.67it/s]

tensor(1.5114, grad_fn=<NllLossBackward0>)
tensor(1.4949, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10792/17426 [17:56<12:33,  8.80it/s]

tensor(1.4873, grad_fn=<NllLossBackward0>)
tensor(1.4609, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10794/17426 [17:56<11:43,  9.43it/s]

tensor(1.4745, grad_fn=<NllLossBackward0>)
tensor(1.4646, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10796/17426 [17:57<12:20,  8.96it/s]

tensor(1.5008, grad_fn=<NllLossBackward0>)
tensor(1.4930, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10798/17426 [17:57<12:28,  8.85it/s]

tensor(1.4723, grad_fn=<NllLossBackward0>)
tensor(1.4879, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10800/17426 [17:57<12:29,  8.84it/s]

tensor(1.5209, grad_fn=<NllLossBackward0>)
tensor(1.4779, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10802/17426 [17:57<12:34,  8.78it/s]

tensor(1.5335, grad_fn=<NllLossBackward0>)
tensor(1.5219, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10804/17426 [17:58<13:04,  8.44it/s]

tensor(1.5042, grad_fn=<NllLossBackward0>)
tensor(1.5273, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10806/17426 [17:58<12:47,  8.62it/s]

tensor(1.5287, grad_fn=<NllLossBackward0>)
tensor(1.5113, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10808/17426 [17:58<12:39,  8.71it/s]

tensor(1.5547, grad_fn=<NllLossBackward0>)
tensor(1.4945, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10810/17426 [17:58<12:20,  8.94it/s]

tensor(1.4980, grad_fn=<NllLossBackward0>)
tensor(1.4734, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10812/17426 [17:59<12:32,  8.79it/s]

tensor(1.5162, grad_fn=<NllLossBackward0>)
tensor(1.5156, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10814/17426 [17:59<12:50,  8.58it/s]

tensor(1.5078, grad_fn=<NllLossBackward0>)
tensor(1.4940, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10816/17426 [17:59<12:40,  8.69it/s]

tensor(1.4614, grad_fn=<NllLossBackward0>)
tensor(1.4954, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10818/17426 [17:59<12:31,  8.80it/s]

tensor(1.5122, grad_fn=<NllLossBackward0>)
tensor(1.5385, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10820/17426 [17:59<12:25,  8.87it/s]

tensor(1.4546, grad_fn=<NllLossBackward0>)
tensor(1.4891, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10822/17426 [18:00<12:43,  8.65it/s]

tensor(1.4512, grad_fn=<NllLossBackward0>)
tensor(1.4961, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10824/17426 [18:00<13:23,  8.22it/s]

tensor(1.5266, grad_fn=<NllLossBackward0>)
tensor(1.5054, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10826/17426 [18:00<14:15,  7.72it/s]

tensor(1.4419, grad_fn=<NllLossBackward0>)
tensor(1.5083, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10828/17426 [18:01<14:48,  7.43it/s]

tensor(1.4724, grad_fn=<NllLossBackward0>)
tensor(1.4736, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10830/17426 [18:01<15:42,  7.00it/s]

tensor(1.5018, grad_fn=<NllLossBackward0>)
tensor(1.5239, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10832/17426 [18:01<15:40,  7.01it/s]

tensor(1.5258, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10834/17426 [18:01<16:07,  6.82it/s]

tensor(1.5035, grad_fn=<NllLossBackward0>)
tensor(1.4937, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10836/17426 [18:02<15:33,  7.06it/s]

tensor(1.4955, grad_fn=<NllLossBackward0>)
tensor(1.4962, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10838/17426 [18:02<15:56,  6.88it/s]

tensor(1.5517, grad_fn=<NllLossBackward0>)
tensor(1.5291, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10840/17426 [18:02<15:46,  6.96it/s]

tensor(1.5061, grad_fn=<NllLossBackward0>)
tensor(1.5544, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10842/17426 [18:03<15:11,  7.22it/s]

tensor(1.5039, grad_fn=<NllLossBackward0>)
tensor(1.4906, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10844/17426 [18:03<15:24,  7.12it/s]

tensor(1.4533, grad_fn=<NllLossBackward0>)
tensor(1.4857, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10846/17426 [18:03<16:26,  6.67it/s]

tensor(1.4696, grad_fn=<NllLossBackward0>)
tensor(1.4868, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10848/17426 [18:03<16:41,  6.57it/s]

tensor(1.5366, grad_fn=<NllLossBackward0>)
tensor(1.4969, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10850/17426 [18:04<17:13,  6.36it/s]

tensor(1.5218, grad_fn=<NllLossBackward0>)
tensor(1.4662, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10852/17426 [18:04<16:08,  6.79it/s]

tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.4616, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10854/17426 [18:04<16:55,  6.47it/s]

tensor(1.5055, grad_fn=<NllLossBackward0>)
tensor(1.5230, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10856/17426 [18:05<15:57,  6.86it/s]

tensor(1.5104, grad_fn=<NllLossBackward0>)
tensor(1.5304, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10858/17426 [18:05<14:15,  7.68it/s]

tensor(1.5252, grad_fn=<NllLossBackward0>)
tensor(1.4506, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10860/17426 [18:05<13:29,  8.11it/s]

tensor(1.4909, grad_fn=<NllLossBackward0>)
tensor(1.5370, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10862/17426 [18:05<13:47,  7.93it/s]

tensor(1.4887, grad_fn=<NllLossBackward0>)
tensor(1.4887, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10864/17426 [18:06<13:25,  8.15it/s]

tensor(1.4506, grad_fn=<NllLossBackward0>)
tensor(1.4567, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10866/17426 [18:06<13:12,  8.28it/s]

tensor(1.5164, grad_fn=<NllLossBackward0>)
tensor(1.5286, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10868/17426 [18:06<13:02,  8.38it/s]

tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.5493, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10870/17426 [18:06<13:01,  8.39it/s]

tensor(1.5321, grad_fn=<NllLossBackward0>)
tensor(1.5096, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10872/17426 [18:07<12:35,  8.68it/s]

tensor(1.4600, grad_fn=<NllLossBackward0>)
tensor(1.5507, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10874/17426 [18:07<12:34,  8.68it/s]

tensor(1.4748, grad_fn=<NllLossBackward0>)
tensor(1.4843, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10876/17426 [18:07<12:26,  8.77it/s]

tensor(1.4939, grad_fn=<NllLossBackward0>)
tensor(1.5057, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10878/17426 [18:07<12:31,  8.71it/s]

tensor(1.5030, grad_fn=<NllLossBackward0>)
tensor(1.4875, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10880/17426 [18:07<12:30,  8.72it/s]

tensor(1.5154, grad_fn=<NllLossBackward0>)
tensor(1.4842, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10882/17426 [18:08<12:42,  8.58it/s]

tensor(1.5154, grad_fn=<NllLossBackward0>)
tensor(1.5229, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10884/17426 [18:08<12:13,  8.91it/s]

tensor(1.5247, grad_fn=<NllLossBackward0>)
tensor(1.4852, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10886/17426 [18:08<12:23,  8.79it/s]

tensor(1.5025, grad_fn=<NllLossBackward0>)
tensor(1.4782, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10888/17426 [18:08<12:51,  8.48it/s]

tensor(1.5086, grad_fn=<NllLossBackward0>)
tensor(1.5601, grad_fn=<NllLossBackward0>)


 62%|██████▏   | 10890/17426 [18:09<12:27,  8.75it/s]

tensor(1.4922, grad_fn=<NllLossBackward0>)
tensor(1.5253, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10892/17426 [18:09<12:34,  8.66it/s]

tensor(1.4642, grad_fn=<NllLossBackward0>)
tensor(1.5006, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10894/17426 [18:09<12:20,  8.82it/s]

tensor(1.5304, grad_fn=<NllLossBackward0>)
tensor(1.4490, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10896/17426 [18:09<12:47,  8.51it/s]

tensor(1.4869, grad_fn=<NllLossBackward0>)
tensor(1.5311, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10898/17426 [18:10<12:52,  8.45it/s]

tensor(1.4600, grad_fn=<NllLossBackward0>)
tensor(1.5107, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10900/17426 [18:10<12:40,  8.58it/s]

tensor(1.4497, grad_fn=<NllLossBackward0>)
tensor(1.4456, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10902/17426 [18:10<12:42,  8.56it/s]

tensor(1.4908, grad_fn=<NllLossBackward0>)
tensor(1.4856, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10904/17426 [18:10<12:45,  8.52it/s]

tensor(1.4984, grad_fn=<NllLossBackward0>)
tensor(1.5042, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10906/17426 [18:10<13:04,  8.31it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5113, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10908/17426 [18:11<12:56,  8.39it/s]

tensor(1.4888, grad_fn=<NllLossBackward0>)
tensor(1.4809, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10910/17426 [18:11<12:51,  8.44it/s]

tensor(1.5112, grad_fn=<NllLossBackward0>)
tensor(1.5052, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10912/17426 [18:11<12:38,  8.59it/s]

tensor(1.4758, grad_fn=<NllLossBackward0>)
tensor(1.4754, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10914/17426 [18:11<12:31,  8.67it/s]

tensor(1.4816, grad_fn=<NllLossBackward0>)
tensor(1.5241, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10916/17426 [18:12<12:57,  8.37it/s]

tensor(1.4412, grad_fn=<NllLossBackward0>)
tensor(1.5057, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10918/17426 [18:12<12:34,  8.63it/s]

tensor(1.4921, grad_fn=<NllLossBackward0>)
tensor(1.4852, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10920/17426 [18:12<12:20,  8.79it/s]

tensor(1.4363, grad_fn=<NllLossBackward0>)
tensor(1.4917, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10922/17426 [18:12<12:35,  8.61it/s]

tensor(1.4848, grad_fn=<NllLossBackward0>)
tensor(1.5266, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10924/17426 [18:13<13:07,  8.26it/s]

tensor(1.5520, grad_fn=<NllLossBackward0>)
tensor(1.5268, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10926/17426 [18:13<12:34,  8.61it/s]

tensor(1.4835, grad_fn=<NllLossBackward0>)
tensor(1.4926, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10928/17426 [18:13<12:24,  8.73it/s]

tensor(1.4767, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10930/17426 [18:13<12:26,  8.71it/s]

tensor(1.4517, grad_fn=<NllLossBackward0>)
tensor(1.4827, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10932/17426 [18:13<12:21,  8.76it/s]

tensor(1.5224, grad_fn=<NllLossBackward0>)
tensor(1.5459, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10934/17426 [18:14<12:51,  8.41it/s]

tensor(1.4527, grad_fn=<NllLossBackward0>)
tensor(1.4852, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10936/17426 [18:14<12:32,  8.62it/s]

tensor(1.5186, grad_fn=<NllLossBackward0>)
tensor(1.4689, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10938/17426 [18:14<12:31,  8.64it/s]

tensor(1.4888, grad_fn=<NllLossBackward0>)
tensor(1.5274, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10940/17426 [18:14<12:22,  8.73it/s]

tensor(1.5086, grad_fn=<NllLossBackward0>)
tensor(1.5025, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10942/17426 [18:15<13:13,  8.18it/s]

tensor(1.5372, grad_fn=<NllLossBackward0>)
tensor(1.4817, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10944/17426 [18:15<14:20,  7.53it/s]

tensor(1.5156, grad_fn=<NllLossBackward0>)
tensor(1.5142, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10946/17426 [18:15<14:13,  7.59it/s]

tensor(1.5128, grad_fn=<NllLossBackward0>)
tensor(1.5634, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10948/17426 [18:16<14:55,  7.24it/s]

tensor(1.4792, grad_fn=<NllLossBackward0>)
tensor(1.4651, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10950/17426 [18:16<15:17,  7.06it/s]

tensor(1.4602, grad_fn=<NllLossBackward0>)
tensor(1.5103, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10952/17426 [18:16<14:39,  7.36it/s]

tensor(1.4916, grad_fn=<NllLossBackward0>)
tensor(1.5464, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10954/17426 [18:16<14:56,  7.22it/s]

tensor(1.4923, grad_fn=<NllLossBackward0>)
tensor(1.5118, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10956/17426 [18:17<15:20,  7.03it/s]

tensor(1.5301, grad_fn=<NllLossBackward0>)
tensor(1.5077, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10958/17426 [18:17<15:02,  7.17it/s]

tensor(1.4976, grad_fn=<NllLossBackward0>)
tensor(1.4546, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10960/17426 [18:17<14:44,  7.31it/s]

tensor(1.5215, grad_fn=<NllLossBackward0>)
tensor(1.5182, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10962/17426 [18:18<15:48,  6.82it/s]

tensor(1.5090, grad_fn=<NllLossBackward0>)
tensor(1.4911, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10964/17426 [18:18<15:26,  6.97it/s]

tensor(1.4945, grad_fn=<NllLossBackward0>)
tensor(1.5043, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10966/17426 [18:18<15:38,  6.88it/s]

tensor(1.4893, grad_fn=<NllLossBackward0>)
tensor(1.4657, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10968/17426 [18:18<15:29,  6.95it/s]

tensor(1.4677, grad_fn=<NllLossBackward0>)
tensor(1.4527, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10970/17426 [18:19<15:37,  6.89it/s]

tensor(1.4947, grad_fn=<NllLossBackward0>)
tensor(1.5244, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10972/17426 [18:19<16:13,  6.63it/s]

tensor(1.5034, grad_fn=<NllLossBackward0>)
tensor(1.4494, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10974/17426 [18:19<14:30,  7.41it/s]

tensor(1.4845, grad_fn=<NllLossBackward0>)
tensor(1.4918, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10976/17426 [18:19<13:38,  7.88it/s]

tensor(1.5010, grad_fn=<NllLossBackward0>)
tensor(1.4797, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10978/17426 [18:20<13:38,  7.88it/s]

tensor(1.4758, grad_fn=<NllLossBackward0>)
tensor(1.5089, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10980/17426 [18:20<13:18,  8.08it/s]

tensor(1.4701, grad_fn=<NllLossBackward0>)
tensor(1.5278, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10982/17426 [18:20<13:31,  7.94it/s]

tensor(1.4754, grad_fn=<NllLossBackward0>)
tensor(1.5227, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10984/17426 [18:20<12:57,  8.29it/s]

tensor(1.4980, grad_fn=<NllLossBackward0>)
tensor(1.5072, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10986/17426 [18:21<12:58,  8.27it/s]

tensor(1.5352, grad_fn=<NllLossBackward0>)
tensor(1.4599, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10988/17426 [18:21<13:01,  8.23it/s]

tensor(1.5023, grad_fn=<NllLossBackward0>)
tensor(1.4807, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10990/17426 [18:21<13:39,  7.85it/s]

tensor(1.4731, grad_fn=<NllLossBackward0>)
tensor(1.4458, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10992/17426 [18:21<13:16,  8.08it/s]

tensor(1.4429, grad_fn=<NllLossBackward0>)
tensor(1.4313, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10994/17426 [18:22<13:26,  7.97it/s]

tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.4915, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10996/17426 [18:22<13:08,  8.16it/s]

tensor(1.5239, grad_fn=<NllLossBackward0>)
tensor(1.5129, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 10998/17426 [18:22<12:37,  8.49it/s]

tensor(1.4983, grad_fn=<NllLossBackward0>)
tensor(1.4821, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11000/17426 [18:22<12:32,  8.54it/s]

tensor(1.4640, grad_fn=<NllLossBackward0>)
tensor(1.4989, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11002/17426 [18:23<12:36,  8.49it/s]

tensor(1.5098, grad_fn=<NllLossBackward0>)
tensor(1.5016, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11004/17426 [18:23<12:29,  8.57it/s]

tensor(1.4709, grad_fn=<NllLossBackward0>)
tensor(1.4607, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11006/17426 [18:23<12:31,  8.55it/s]

tensor(1.5357, grad_fn=<NllLossBackward0>)
tensor(1.4436, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11008/17426 [18:23<13:08,  8.13it/s]

tensor(1.4826, grad_fn=<NllLossBackward0>)
tensor(1.4916, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11010/17426 [18:24<12:41,  8.43it/s]

tensor(1.4847, grad_fn=<NllLossBackward0>)
tensor(1.5298, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11012/17426 [18:24<12:42,  8.41it/s]

tensor(1.5091, grad_fn=<NllLossBackward0>)
tensor(1.5216, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11014/17426 [18:24<12:18,  8.68it/s]

tensor(1.5520, grad_fn=<NllLossBackward0>)
tensor(1.5255, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11016/17426 [18:24<12:02,  8.87it/s]

tensor(1.5265, grad_fn=<NllLossBackward0>)
tensor(1.4493, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11018/17426 [18:25<12:43,  8.40it/s]

tensor(1.4950, grad_fn=<NllLossBackward0>)
tensor(1.4790, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11020/17426 [18:25<12:34,  8.49it/s]

tensor(1.5074, grad_fn=<NllLossBackward0>)
tensor(1.4998, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11022/17426 [18:25<12:23,  8.61it/s]

tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11024/17426 [18:25<12:18,  8.66it/s]

tensor(1.5351, grad_fn=<NllLossBackward0>)
tensor(1.4959, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11026/17426 [18:25<12:38,  8.43it/s]

tensor(1.5223, grad_fn=<NllLossBackward0>)
tensor(1.4600, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11028/17426 [18:26<12:39,  8.42it/s]

tensor(1.5116, grad_fn=<NllLossBackward0>)
tensor(1.4744, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11030/17426 [18:26<12:23,  8.60it/s]

tensor(1.5185, grad_fn=<NllLossBackward0>)
tensor(1.5450, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11032/17426 [18:26<12:19,  8.65it/s]

tensor(1.4756, grad_fn=<NllLossBackward0>)
tensor(1.4753, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11034/17426 [18:26<12:07,  8.78it/s]

tensor(1.4504, grad_fn=<NllLossBackward0>)
tensor(1.4930, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11036/17426 [18:27<12:27,  8.55it/s]

tensor(1.4653, grad_fn=<NllLossBackward0>)
tensor(1.4676, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11038/17426 [18:27<12:12,  8.72it/s]

tensor(1.4856, grad_fn=<NllLossBackward0>)
tensor(1.5232, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11040/17426 [18:27<12:08,  8.76it/s]

tensor(1.4819, grad_fn=<NllLossBackward0>)
tensor(1.5045, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11042/17426 [18:27<12:06,  8.79it/s]

tensor(1.4590, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11044/17426 [18:28<12:29,  8.51it/s]

tensor(1.4690, grad_fn=<NllLossBackward0>)
tensor(1.5149, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11046/17426 [18:28<12:28,  8.53it/s]

tensor(1.5016, grad_fn=<NllLossBackward0>)
tensor(1.5060, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11048/17426 [18:28<12:28,  8.52it/s]

tensor(1.4772, grad_fn=<NllLossBackward0>)
tensor(1.4367, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11050/17426 [18:28<12:20,  8.61it/s]

tensor(1.5160, grad_fn=<NllLossBackward0>)
tensor(1.5141, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11052/17426 [18:28<12:20,  8.61it/s]

tensor(1.5094, grad_fn=<NllLossBackward0>)
tensor(1.4782, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11054/17426 [18:29<12:46,  8.31it/s]

tensor(1.4846, grad_fn=<NllLossBackward0>)
tensor(1.4853, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11056/17426 [18:29<12:19,  8.61it/s]

tensor(1.4723, grad_fn=<NllLossBackward0>)
tensor(1.5340, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11058/17426 [18:29<13:41,  7.75it/s]

tensor(1.5039, grad_fn=<NllLossBackward0>)
tensor(1.5081, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11060/17426 [18:30<14:34,  7.28it/s]

tensor(1.4900, grad_fn=<NllLossBackward0>)
tensor(1.5050, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11062/17426 [18:30<15:19,  6.92it/s]

tensor(1.4939, grad_fn=<NllLossBackward0>)
tensor(1.5387, grad_fn=<NllLossBackward0>)


 63%|██████▎   | 11064/17426 [18:30<14:52,  7.13it/s]

tensor(1.4964, grad_fn=<NllLossBackward0>)
tensor(1.4681, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11066/17426 [18:30<15:11,  6.97it/s]

tensor(1.4684, grad_fn=<NllLossBackward0>)
tensor(1.4996, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11068/17426 [18:31<15:23,  6.88it/s]

tensor(1.5137, grad_fn=<NllLossBackward0>)
tensor(1.4739, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11070/17426 [18:31<16:06,  6.57it/s]

tensor(1.4952, grad_fn=<NllLossBackward0>)
tensor(1.5542, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11072/17426 [18:31<14:55,  7.09it/s]

tensor(1.4886, grad_fn=<NllLossBackward0>)
tensor(1.4612, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11074/17426 [18:32<13:33,  7.81it/s]

tensor(1.4386, grad_fn=<NllLossBackward0>)
tensor(1.4362, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11076/17426 [18:32<14:19,  7.39it/s]

tensor(1.5185, grad_fn=<NllLossBackward0>)
tensor(1.4917, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11078/17426 [18:32<14:40,  7.21it/s]

tensor(1.5383, grad_fn=<NllLossBackward0>)
tensor(1.4581, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11080/17426 [18:32<15:06,  7.00it/s]

tensor(1.5298, grad_fn=<NllLossBackward0>)
tensor(1.4500, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11082/17426 [18:33<15:57,  6.63it/s]

tensor(1.4910, grad_fn=<NllLossBackward0>)
tensor(1.5253, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11084/17426 [18:33<16:14,  6.51it/s]

tensor(1.4737, grad_fn=<NllLossBackward0>)
tensor(1.4641, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11086/17426 [18:33<14:59,  7.04it/s]

tensor(1.4798, grad_fn=<NllLossBackward0>)
tensor(1.4603, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11088/17426 [18:34<15:05,  7.00it/s]

tensor(1.4880, grad_fn=<NllLossBackward0>)
tensor(1.4608, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11090/17426 [18:34<14:40,  7.20it/s]

tensor(1.4518, grad_fn=<NllLossBackward0>)
tensor(1.5256, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11092/17426 [18:34<13:47,  7.66it/s]

tensor(1.4606, grad_fn=<NllLossBackward0>)
tensor(1.5078, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11094/17426 [18:34<13:08,  8.03it/s]

tensor(1.4853, grad_fn=<NllLossBackward0>)
tensor(1.5050, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11096/17426 [18:35<12:43,  8.29it/s]

tensor(1.4796, grad_fn=<NllLossBackward0>)
tensor(1.5236, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11098/17426 [18:35<12:44,  8.28it/s]

tensor(1.4773, grad_fn=<NllLossBackward0>)
tensor(1.4651, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11100/17426 [18:35<13:18,  7.92it/s]

tensor(1.4954, grad_fn=<NllLossBackward0>)
tensor(1.5007, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11102/17426 [18:35<13:03,  8.08it/s]

tensor(1.4833, grad_fn=<NllLossBackward0>)
tensor(1.4812, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11104/17426 [18:36<12:52,  8.18it/s]

tensor(1.4757, grad_fn=<NllLossBackward0>)
tensor(1.4669, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11106/17426 [18:36<12:56,  8.14it/s]

tensor(1.4775, grad_fn=<NllLossBackward0>)
tensor(1.5092, grad_fn=<NllLossBackward0>)


 64%|██████▎   | 11108/17426 [18:36<12:48,  8.22it/s]

tensor(1.4686, grad_fn=<NllLossBackward0>)
tensor(1.5382, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11110/17426 [18:36<12:38,  8.33it/s]

tensor(1.5464, grad_fn=<NllLossBackward0>)
tensor(1.4944, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11112/17426 [18:36<11:36,  9.06it/s]

tensor(1.4826, grad_fn=<NllLossBackward0>)
tensor(1.5086, grad_fn=<NllLossBackward0>)
tensor(1.4833, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11115/17426 [18:37<11:11,  9.40it/s]

tensor(1.4612, grad_fn=<NllLossBackward0>)
tensor(1.5113, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11117/17426 [18:37<11:34,  9.08it/s]

tensor(1.5207, grad_fn=<NllLossBackward0>)
tensor(1.5064, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11119/17426 [18:37<12:18,  8.54it/s]

tensor(1.4642, grad_fn=<NllLossBackward0>)
tensor(1.5125, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11121/17426 [18:37<12:09,  8.64it/s]

tensor(1.5036, grad_fn=<NllLossBackward0>)
tensor(1.5277, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11123/17426 [18:38<12:04,  8.70it/s]

tensor(1.5143, grad_fn=<NllLossBackward0>)
tensor(1.4872, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11125/17426 [18:38<12:09,  8.63it/s]

tensor(1.4718, grad_fn=<NllLossBackward0>)
tensor(1.4630, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11127/17426 [18:38<12:02,  8.71it/s]

tensor(1.5112, grad_fn=<NllLossBackward0>)
tensor(1.4957, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11129/17426 [18:38<12:22,  8.48it/s]

tensor(1.4789, grad_fn=<NllLossBackward0>)
tensor(1.5328, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11131/17426 [18:39<12:20,  8.50it/s]

tensor(1.4957, grad_fn=<NllLossBackward0>)
tensor(1.4771, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11133/17426 [18:39<12:06,  8.66it/s]

tensor(1.4856, grad_fn=<NllLossBackward0>)
tensor(1.4746, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11135/17426 [18:39<12:18,  8.52it/s]

tensor(1.5164, grad_fn=<NllLossBackward0>)
tensor(1.4785, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11137/17426 [18:39<12:49,  8.17it/s]

tensor(1.4581, grad_fn=<NllLossBackward0>)
tensor(1.4928, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11139/17426 [18:40<12:25,  8.44it/s]

tensor(1.4851, grad_fn=<NllLossBackward0>)
tensor(1.4733, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11141/17426 [18:40<12:24,  8.45it/s]

tensor(1.4724, grad_fn=<NllLossBackward0>)
tensor(1.4817, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11143/17426 [18:40<12:22,  8.47it/s]

tensor(1.4948, grad_fn=<NllLossBackward0>)
tensor(1.5580, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11145/17426 [18:40<12:02,  8.69it/s]

tensor(1.4979, grad_fn=<NllLossBackward0>)
tensor(1.5048, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11147/17426 [18:41<12:16,  8.53it/s]

tensor(1.4588, grad_fn=<NllLossBackward0>)
tensor(1.4986, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11149/17426 [18:41<12:01,  8.70it/s]

tensor(1.5488, grad_fn=<NllLossBackward0>)
tensor(1.4686, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11151/17426 [18:41<11:57,  8.74it/s]

tensor(1.5375, grad_fn=<NllLossBackward0>)
tensor(1.4839, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11153/17426 [18:41<12:05,  8.64it/s]

tensor(1.5090, grad_fn=<NllLossBackward0>)
tensor(1.5274, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11155/17426 [18:41<12:32,  8.33it/s]

tensor(1.5080, grad_fn=<NllLossBackward0>)
tensor(1.5442, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11157/17426 [18:42<12:19,  8.48it/s]

tensor(1.5145, grad_fn=<NllLossBackward0>)
tensor(1.5371, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11159/17426 [18:42<12:05,  8.64it/s]

tensor(1.4884, grad_fn=<NllLossBackward0>)
tensor(1.5062, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11161/17426 [18:42<12:08,  8.60it/s]

tensor(1.5125, grad_fn=<NllLossBackward0>)
tensor(1.5209, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11163/17426 [18:42<12:03,  8.66it/s]

tensor(1.5214, grad_fn=<NllLossBackward0>)
tensor(1.5231, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11165/17426 [18:43<12:46,  8.16it/s]

tensor(1.4925, grad_fn=<NllLossBackward0>)
tensor(1.4367, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11167/17426 [18:43<12:22,  8.43it/s]

tensor(1.4927, grad_fn=<NllLossBackward0>)
tensor(1.4762, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11169/17426 [18:43<12:04,  8.63it/s]

tensor(1.5157, grad_fn=<NllLossBackward0>)
tensor(1.4905, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11171/17426 [18:43<12:02,  8.66it/s]

tensor(1.4948, grad_fn=<NllLossBackward0>)
tensor(1.4683, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11173/17426 [18:44<12:14,  8.51it/s]

tensor(1.5004, grad_fn=<NllLossBackward0>)
tensor(1.4689, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11175/17426 [18:44<12:39,  8.24it/s]

tensor(1.5102, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11177/17426 [18:44<13:58,  7.45it/s]

tensor(1.4754, grad_fn=<NllLossBackward0>)
tensor(1.4477, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11179/17426 [18:44<14:46,  7.05it/s]

tensor(1.4832, grad_fn=<NllLossBackward0>)
tensor(1.4932, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11181/17426 [18:45<14:23,  7.23it/s]

tensor(1.5105, grad_fn=<NllLossBackward0>)
tensor(1.4832, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11183/17426 [18:45<13:51,  7.51it/s]

tensor(1.4583, grad_fn=<NllLossBackward0>)
tensor(1.5220, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11185/17426 [18:45<13:41,  7.60it/s]

tensor(1.5355, grad_fn=<NllLossBackward0>)
tensor(1.5089, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11187/17426 [18:45<13:37,  7.63it/s]

tensor(1.4800, grad_fn=<NllLossBackward0>)
tensor(1.5167, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11189/17426 [18:46<14:31,  7.15it/s]

tensor(1.5289, grad_fn=<NllLossBackward0>)
tensor(1.4611, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11191/17426 [18:46<14:39,  7.09it/s]

tensor(1.4716, grad_fn=<NllLossBackward0>)
tensor(1.4954, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11193/17426 [18:46<14:45,  7.04it/s]

tensor(1.5420, grad_fn=<NllLossBackward0>)
tensor(1.5208, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11195/17426 [18:47<15:27,  6.72it/s]

tensor(1.5346, grad_fn=<NllLossBackward0>)
tensor(1.4872, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11197/17426 [18:47<14:41,  7.07it/s]

tensor(1.4683, grad_fn=<NllLossBackward0>)
tensor(1.5290, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11199/17426 [18:47<14:22,  7.22it/s]

tensor(1.5368, grad_fn=<NllLossBackward0>)
tensor(1.5024, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11201/17426 [18:47<14:58,  6.93it/s]

tensor(1.4800, grad_fn=<NllLossBackward0>)
tensor(1.5346, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11203/17426 [18:48<15:58,  6.49it/s]

tensor(1.4948, grad_fn=<NllLossBackward0>)
tensor(1.5045, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11205/17426 [18:48<16:15,  6.38it/s]

tensor(1.4818, grad_fn=<NllLossBackward0>)
tensor(1.4920, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11207/17426 [18:48<14:40,  7.07it/s]

tensor(1.4392, grad_fn=<NllLossBackward0>)
tensor(1.5162, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11209/17426 [18:49<13:31,  7.66it/s]

tensor(1.5120, grad_fn=<NllLossBackward0>)
tensor(1.5247, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11211/17426 [18:49<13:18,  7.79it/s]

tensor(1.4886, grad_fn=<NllLossBackward0>)
tensor(1.4752, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11213/17426 [18:49<12:58,  7.99it/s]

tensor(1.5286, grad_fn=<NllLossBackward0>)
tensor(1.4972, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11215/17426 [18:49<12:39,  8.17it/s]

tensor(1.4992, grad_fn=<NllLossBackward0>)
tensor(1.4956, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11217/17426 [18:50<12:25,  8.33it/s]

tensor(1.4828, grad_fn=<NllLossBackward0>)
tensor(1.4817, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11219/17426 [18:50<12:34,  8.23it/s]

tensor(1.5120, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11221/17426 [18:50<13:12,  7.83it/s]

tensor(1.5167, grad_fn=<NllLossBackward0>)
tensor(1.5045, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11223/17426 [18:50<12:39,  8.17it/s]

tensor(1.5251, grad_fn=<NllLossBackward0>)
tensor(1.4829, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11225/17426 [18:51<12:48,  8.07it/s]

tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.4927, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11227/17426 [18:51<12:40,  8.15it/s]

tensor(1.4858, grad_fn=<NllLossBackward0>)
tensor(1.5093, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11229/17426 [18:51<12:59,  7.95it/s]

tensor(1.4753, grad_fn=<NllLossBackward0>)
tensor(1.5239, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11231/17426 [18:51<12:44,  8.10it/s]

tensor(1.4998, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11233/17426 [18:52<12:25,  8.30it/s]

tensor(1.5528, grad_fn=<NllLossBackward0>)
tensor(1.4803, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11235/17426 [18:52<12:36,  8.18it/s]

tensor(1.5087, grad_fn=<NllLossBackward0>)
tensor(1.4675, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11237/17426 [18:52<12:45,  8.08it/s]

tensor(1.4937, grad_fn=<NllLossBackward0>)
tensor(1.4702, grad_fn=<NllLossBackward0>)


 64%|██████▍   | 11239/17426 [18:52<12:26,  8.29it/s]

tensor(1.4755, grad_fn=<NllLossBackward0>)
tensor(1.4977, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11241/17426 [18:53<12:11,  8.45it/s]

tensor(1.5115, grad_fn=<NllLossBackward0>)
tensor(1.4562, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11243/17426 [18:53<12:11,  8.45it/s]

tensor(1.5057, grad_fn=<NllLossBackward0>)
tensor(1.5409, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11245/17426 [18:53<12:00,  8.58it/s]

tensor(1.5211, grad_fn=<NllLossBackward0>)
tensor(1.5098, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11247/17426 [18:53<12:07,  8.49it/s]

tensor(1.4954, grad_fn=<NllLossBackward0>)
tensor(1.4808, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11249/17426 [18:53<11:45,  8.75it/s]

tensor(1.4961, grad_fn=<NllLossBackward0>)
tensor(1.5164, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11251/17426 [18:54<11:58,  8.59it/s]

tensor(1.5122, grad_fn=<NllLossBackward0>)
tensor(1.4806, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11253/17426 [18:54<11:48,  8.72it/s]

tensor(1.4820, grad_fn=<NllLossBackward0>)
tensor(1.4909, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11255/17426 [18:54<12:15,  8.38it/s]

tensor(1.4845, grad_fn=<NllLossBackward0>)
tensor(1.4581, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11257/17426 [18:54<11:58,  8.58it/s]

tensor(1.4944, grad_fn=<NllLossBackward0>)
tensor(1.5028, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11260/17426 [18:55<10:53,  9.43it/s]

tensor(1.4723, grad_fn=<NllLossBackward0>)
tensor(1.4864, grad_fn=<NllLossBackward0>)
tensor(1.4943, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11262/17426 [18:55<11:17,  9.10it/s]

tensor(1.5041, grad_fn=<NllLossBackward0>)
tensor(1.4851, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11264/17426 [18:55<11:33,  8.89it/s]

tensor(1.4535, grad_fn=<NllLossBackward0>)
tensor(1.4657, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11266/17426 [18:55<12:10,  8.44it/s]

tensor(1.5616, grad_fn=<NllLossBackward0>)
tensor(1.5107, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11268/17426 [18:56<12:01,  8.54it/s]

tensor(1.4726, grad_fn=<NllLossBackward0>)
tensor(1.4941, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11270/17426 [18:56<12:05,  8.48it/s]

tensor(1.5223, grad_fn=<NllLossBackward0>)
tensor(1.5026, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11272/17426 [18:56<11:49,  8.67it/s]

tensor(1.5013, grad_fn=<NllLossBackward0>)
tensor(1.4632, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11274/17426 [18:56<12:13,  8.39it/s]

tensor(1.4520, grad_fn=<NllLossBackward0>)
tensor(1.4512, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11276/17426 [18:57<11:44,  8.73it/s]

tensor(1.4900, grad_fn=<NllLossBackward0>)
tensor(1.5473, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11278/17426 [18:57<11:52,  8.63it/s]

tensor(1.4975, grad_fn=<NllLossBackward0>)
tensor(1.5228, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11280/17426 [18:57<12:01,  8.52it/s]

tensor(1.4881, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11282/17426 [18:57<12:12,  8.38it/s]

tensor(1.4773, grad_fn=<NllLossBackward0>)
tensor(1.4825, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11284/17426 [18:58<12:02,  8.50it/s]

tensor(1.4758, grad_fn=<NllLossBackward0>)
tensor(1.5196, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11286/17426 [18:58<11:52,  8.62it/s]

tensor(1.5063, grad_fn=<NllLossBackward0>)
tensor(1.5016, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11288/17426 [18:58<11:50,  8.63it/s]

tensor(1.5331, grad_fn=<NllLossBackward0>)
tensor(1.5138, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11290/17426 [18:58<12:11,  8.39it/s]

tensor(1.4716, grad_fn=<NllLossBackward0>)
tensor(1.5034, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11292/17426 [18:59<13:35,  7.52it/s]

tensor(1.4676, grad_fn=<NllLossBackward0>)
tensor(1.5122, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11294/17426 [18:59<14:06,  7.24it/s]

tensor(1.4876, grad_fn=<NllLossBackward0>)
tensor(1.4920, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11296/17426 [18:59<14:42,  6.95it/s]

tensor(1.4701, grad_fn=<NllLossBackward0>)
tensor(1.4869, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11298/17426 [18:59<13:58,  7.31it/s]

tensor(1.4805, grad_fn=<NllLossBackward0>)
tensor(1.4781, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11300/17426 [19:00<14:24,  7.08it/s]

tensor(1.4823, grad_fn=<NllLossBackward0>)
tensor(1.4783, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11302/17426 [19:00<13:31,  7.54it/s]

tensor(1.5218, grad_fn=<NllLossBackward0>)
tensor(1.4717, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11304/17426 [19:00<13:46,  7.41it/s]

tensor(1.4871, grad_fn=<NllLossBackward0>)
tensor(1.4880, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11306/17426 [19:00<14:09,  7.20it/s]

tensor(1.4830, grad_fn=<NllLossBackward0>)
tensor(1.5348, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11308/17426 [19:01<14:56,  6.82it/s]

tensor(1.5025, grad_fn=<NllLossBackward0>)
tensor(1.5116, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11310/17426 [19:01<14:51,  6.86it/s]

tensor(1.4980, grad_fn=<NllLossBackward0>)
tensor(1.5565, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11312/17426 [19:01<15:50,  6.43it/s]

tensor(1.5044, grad_fn=<NllLossBackward0>)
tensor(1.5265, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11314/17426 [19:02<16:14,  6.27it/s]

tensor(1.4625, grad_fn=<NllLossBackward0>)
tensor(1.4978, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11316/17426 [19:02<15:24,  6.61it/s]

tensor(1.4927, grad_fn=<NllLossBackward0>)
tensor(1.4672, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11318/17426 [19:02<16:09,  6.30it/s]

tensor(1.5370, grad_fn=<NllLossBackward0>)
tensor(1.4586, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11320/17426 [19:03<16:31,  6.16it/s]

tensor(1.4994, grad_fn=<NllLossBackward0>)
tensor(1.4622, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11322/17426 [19:03<15:44,  6.46it/s]

tensor(1.4678, grad_fn=<NllLossBackward0>)
tensor(1.4839, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11324/17426 [19:03<14:06,  7.20it/s]

tensor(1.4960, grad_fn=<NllLossBackward0>)
tensor(1.4729, grad_fn=<NllLossBackward0>)


 65%|██████▍   | 11326/17426 [19:03<13:05,  7.76it/s]

tensor(1.4819, grad_fn=<NllLossBackward0>)
tensor(1.4948, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11328/17426 [19:04<12:48,  7.93it/s]

tensor(1.4559, grad_fn=<NllLossBackward0>)
tensor(1.5002, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11330/17426 [19:04<12:53,  7.88it/s]

tensor(1.4796, grad_fn=<NllLossBackward0>)
tensor(1.4624, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11332/17426 [19:04<12:20,  8.23it/s]

tensor(1.4655, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11334/17426 [19:04<12:16,  8.27it/s]

tensor(1.5158, grad_fn=<NllLossBackward0>)
tensor(1.5318, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11336/17426 [19:05<12:18,  8.24it/s]

tensor(1.5052, grad_fn=<NllLossBackward0>)
tensor(1.4806, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11338/17426 [19:05<12:55,  7.85it/s]

tensor(1.4512, grad_fn=<NllLossBackward0>)
tensor(1.4538, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11340/17426 [19:05<12:28,  8.13it/s]

tensor(1.5048, grad_fn=<NllLossBackward0>)
tensor(1.4956, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11342/17426 [19:05<12:07,  8.36it/s]

tensor(1.5652, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11344/17426 [19:06<12:34,  8.06it/s]

tensor(1.5175, grad_fn=<NllLossBackward0>)
tensor(1.5011, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11346/17426 [19:06<12:31,  8.09it/s]

tensor(1.5242, grad_fn=<NllLossBackward0>)
tensor(1.4952, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11348/17426 [19:06<12:21,  8.20it/s]

tensor(1.4813, grad_fn=<NllLossBackward0>)
tensor(1.4987, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11350/17426 [19:06<11:51,  8.54it/s]

tensor(1.5228, grad_fn=<NllLossBackward0>)
tensor(1.5024, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11352/17426 [19:07<11:52,  8.53it/s]

tensor(1.5246, grad_fn=<NllLossBackward0>)
tensor(1.4616, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11354/17426 [19:07<11:45,  8.60it/s]

tensor(1.5105, grad_fn=<NllLossBackward0>)
tensor(1.4815, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11356/17426 [19:07<12:11,  8.30it/s]

tensor(1.5115, grad_fn=<NllLossBackward0>)
tensor(1.4543, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11358/17426 [19:07<12:01,  8.41it/s]

tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.4939, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11360/17426 [19:08<11:49,  8.54it/s]

tensor(1.4848, grad_fn=<NllLossBackward0>)
tensor(1.4520, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11362/17426 [19:08<12:03,  8.38it/s]

tensor(1.4417, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11364/17426 [19:08<12:12,  8.28it/s]

tensor(1.4902, grad_fn=<NllLossBackward0>)
tensor(1.4717, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11367/17426 [19:08<10:45,  9.39it/s]

tensor(1.5080, grad_fn=<NllLossBackward0>)
tensor(1.5067, grad_fn=<NllLossBackward0>)
tensor(1.4764, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11369/17426 [19:09<11:09,  9.04it/s]

tensor(1.4781, grad_fn=<NllLossBackward0>)
tensor(1.5021, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11371/17426 [19:09<11:28,  8.80it/s]

tensor(1.5056, grad_fn=<NllLossBackward0>)
tensor(1.4769, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11373/17426 [19:09<12:09,  8.30it/s]

tensor(1.4403, grad_fn=<NllLossBackward0>)
tensor(1.4554, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11375/17426 [19:09<11:56,  8.44it/s]

tensor(1.4819, grad_fn=<NllLossBackward0>)
tensor(1.4731, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11377/17426 [19:10<11:46,  8.57it/s]

tensor(1.4835, grad_fn=<NllLossBackward0>)
tensor(1.4722, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11379/17426 [19:10<11:51,  8.49it/s]

tensor(1.5295, grad_fn=<NllLossBackward0>)
tensor(1.4819, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11381/17426 [19:10<12:08,  8.30it/s]

tensor(1.4926, grad_fn=<NllLossBackward0>)
tensor(1.5357, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11383/17426 [19:10<12:18,  8.18it/s]

tensor(1.4933, grad_fn=<NllLossBackward0>)
tensor(1.5056, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11385/17426 [19:11<11:54,  8.46it/s]

tensor(1.5025, grad_fn=<NllLossBackward0>)
tensor(1.5505, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11387/17426 [19:11<11:44,  8.57it/s]

tensor(1.4635, grad_fn=<NllLossBackward0>)
tensor(1.4661, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11389/17426 [19:11<11:50,  8.50it/s]

tensor(1.5232, grad_fn=<NllLossBackward0>)
tensor(1.4948, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11391/17426 [19:11<12:21,  8.14it/s]

tensor(1.4627, grad_fn=<NllLossBackward0>)
tensor(1.4839, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11393/17426 [19:11<11:56,  8.41it/s]

tensor(1.5029, grad_fn=<NllLossBackward0>)
tensor(1.4697, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11395/17426 [19:12<11:57,  8.41it/s]

tensor(1.5113, grad_fn=<NllLossBackward0>)
tensor(1.4353, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11397/17426 [19:12<11:41,  8.59it/s]

tensor(1.5022, grad_fn=<NllLossBackward0>)
tensor(1.4851, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11399/17426 [19:12<11:38,  8.63it/s]

tensor(1.5026, grad_fn=<NllLossBackward0>)
tensor(1.5062, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11401/17426 [19:12<11:50,  8.48it/s]

tensor(1.4754, grad_fn=<NllLossBackward0>)
tensor(1.4845, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11403/17426 [19:13<12:04,  8.31it/s]

tensor(1.5000, grad_fn=<NllLossBackward0>)
tensor(1.4858, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11405/17426 [19:13<12:40,  7.91it/s]

tensor(1.4890, grad_fn=<NllLossBackward0>)
tensor(1.5063, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11407/17426 [19:13<13:34,  7.39it/s]

tensor(1.5004, grad_fn=<NllLossBackward0>)
tensor(1.5105, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11409/17426 [19:13<14:04,  7.13it/s]

tensor(1.4792, grad_fn=<NllLossBackward0>)
tensor(1.4723, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11411/17426 [19:14<14:19,  7.00it/s]

tensor(1.4795, grad_fn=<NllLossBackward0>)
tensor(1.5100, grad_fn=<NllLossBackward0>)


 65%|██████▌   | 11413/17426 [19:14<14:16,  7.02it/s]

tensor(1.5326, grad_fn=<NllLossBackward0>)
tensor(1.5112, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11415/17426 [19:14<14:41,  6.82it/s]

tensor(1.4726, grad_fn=<NllLossBackward0>)
tensor(1.5410, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11417/17426 [19:15<13:57,  7.17it/s]

tensor(1.4896, grad_fn=<NllLossBackward0>)
tensor(1.5121, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11419/17426 [19:15<15:04,  6.64it/s]

tensor(1.5275, grad_fn=<NllLossBackward0>)
tensor(1.5212, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11421/17426 [19:15<13:32,  7.39it/s]

tensor(1.4835, grad_fn=<NllLossBackward0>)
tensor(1.4637, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11423/17426 [19:15<13:00,  7.69it/s]

tensor(1.4650, grad_fn=<NllLossBackward0>)
tensor(1.5221, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11425/17426 [19:16<12:45,  7.84it/s]

tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.4764, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11427/17426 [19:16<12:18,  8.13it/s]

tensor(1.4661, grad_fn=<NllLossBackward0>)
tensor(1.5192, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11429/17426 [19:16<14:09,  7.06it/s]

tensor(1.5115, grad_fn=<NllLossBackward0>)
tensor(1.4655, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11431/17426 [19:17<13:20,  7.49it/s]

tensor(1.4787, grad_fn=<NllLossBackward0>)
tensor(1.4916, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11433/17426 [19:17<14:25,  6.92it/s]

tensor(1.5785, grad_fn=<NllLossBackward0>)
tensor(1.5290, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11435/17426 [19:17<15:18,  6.53it/s]

tensor(1.5367, grad_fn=<NllLossBackward0>)
tensor(1.5228, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11437/17426 [19:17<15:20,  6.51it/s]

tensor(1.4872, grad_fn=<NllLossBackward0>)
tensor(1.5118, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11439/17426 [19:18<14:42,  6.79it/s]

tensor(1.4553, grad_fn=<NllLossBackward0>)
tensor(1.5032, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11441/17426 [19:18<13:19,  7.48it/s]

tensor(1.4884, grad_fn=<NllLossBackward0>)
tensor(1.5153, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11443/17426 [19:18<12:51,  7.76it/s]

tensor(1.4939, grad_fn=<NllLossBackward0>)
tensor(1.5286, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11445/17426 [19:18<12:27,  8.00it/s]

tensor(1.5216, grad_fn=<NllLossBackward0>)
tensor(1.5416, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11447/17426 [19:19<12:50,  7.76it/s]

tensor(1.5080, grad_fn=<NllLossBackward0>)
tensor(1.4889, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11449/17426 [19:19<12:08,  8.20it/s]

tensor(1.4939, grad_fn=<NllLossBackward0>)
tensor(1.4158, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11451/17426 [19:19<12:07,  8.21it/s]

tensor(1.4648, grad_fn=<NllLossBackward0>)
tensor(1.5267, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11453/17426 [19:19<11:53,  8.37it/s]

tensor(1.5020, grad_fn=<NllLossBackward0>)
tensor(1.5241, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11455/17426 [19:20<11:53,  8.37it/s]

tensor(1.4738, grad_fn=<NllLossBackward0>)
tensor(1.4749, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11457/17426 [19:20<12:03,  8.25it/s]

tensor(1.4681, grad_fn=<NllLossBackward0>)
tensor(1.4902, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11459/17426 [19:20<11:57,  8.32it/s]

tensor(1.5045, grad_fn=<NllLossBackward0>)
tensor(1.4374, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11461/17426 [19:20<11:39,  8.53it/s]

tensor(1.4673, grad_fn=<NllLossBackward0>)
tensor(1.5087, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11463/17426 [19:21<11:33,  8.60it/s]

tensor(1.4893, grad_fn=<NllLossBackward0>)
tensor(1.4945, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11465/17426 [19:21<12:02,  8.25it/s]

tensor(1.4481, grad_fn=<NllLossBackward0>)
tensor(1.5149, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11467/17426 [19:21<12:06,  8.20it/s]

tensor(1.5218, grad_fn=<NllLossBackward0>)
tensor(1.5005, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11469/17426 [19:21<11:44,  8.45it/s]

tensor(1.4732, grad_fn=<NllLossBackward0>)
tensor(1.4948, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11471/17426 [19:22<11:48,  8.40it/s]

tensor(1.5155, grad_fn=<NllLossBackward0>)
tensor(1.4436, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11473/17426 [19:22<12:05,  8.21it/s]

tensor(1.4914, grad_fn=<NllLossBackward0>)
tensor(1.4997, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11475/17426 [19:22<12:01,  8.25it/s]

tensor(1.5108, grad_fn=<NllLossBackward0>)
tensor(1.5187, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11477/17426 [19:22<11:50,  8.38it/s]

tensor(1.5032, grad_fn=<NllLossBackward0>)
tensor(1.4583, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11479/17426 [19:23<11:41,  8.47it/s]

tensor(1.5083, grad_fn=<NllLossBackward0>)
tensor(1.4776, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11481/17426 [19:23<11:27,  8.65it/s]

tensor(1.5370, grad_fn=<NllLossBackward0>)
tensor(1.4852, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11483/17426 [19:23<12:00,  8.25it/s]

tensor(1.4993, grad_fn=<NllLossBackward0>)
tensor(1.5163, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11485/17426 [19:23<11:49,  8.37it/s]

tensor(1.4722, grad_fn=<NllLossBackward0>)
tensor(1.5289, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11487/17426 [19:24<11:21,  8.71it/s]

tensor(1.5159, grad_fn=<NllLossBackward0>)
tensor(1.5333, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11489/17426 [19:24<11:30,  8.60it/s]

tensor(1.5077, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11491/17426 [19:24<11:26,  8.65it/s]

tensor(1.4831, grad_fn=<NllLossBackward0>)
tensor(1.4929, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11493/17426 [19:24<11:42,  8.44it/s]

tensor(1.4869, grad_fn=<NllLossBackward0>)
tensor(1.5091, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11495/17426 [19:24<11:28,  8.61it/s]

tensor(1.5065, grad_fn=<NllLossBackward0>)
tensor(1.4761, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11497/17426 [19:25<11:22,  8.68it/s]

tensor(1.5118, grad_fn=<NllLossBackward0>)
tensor(1.4987, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11499/17426 [19:25<11:18,  8.73it/s]

tensor(1.4989, grad_fn=<NllLossBackward0>)
tensor(1.4846, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11501/17426 [19:25<11:54,  8.29it/s]

tensor(1.4879, grad_fn=<NllLossBackward0>)
tensor(1.4859, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11503/17426 [19:25<11:20,  8.70it/s]

tensor(1.4935, grad_fn=<NllLossBackward0>)
tensor(1.4658, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11505/17426 [19:26<11:20,  8.70it/s]

tensor(1.5022, grad_fn=<NllLossBackward0>)
tensor(1.4819, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11507/17426 [19:26<11:08,  8.85it/s]

tensor(1.4725, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11509/17426 [19:26<11:17,  8.73it/s]

tensor(1.4854, grad_fn=<NllLossBackward0>)
tensor(1.5214, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11511/17426 [19:26<11:45,  8.38it/s]

tensor(1.5336, grad_fn=<NllLossBackward0>)
tensor(1.4914, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11513/17426 [19:27<11:33,  8.53it/s]

tensor(1.5029, grad_fn=<NllLossBackward0>)
tensor(1.4702, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11515/17426 [19:27<11:32,  8.54it/s]

tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.4960, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11517/17426 [19:27<11:18,  8.70it/s]

tensor(1.4947, grad_fn=<NllLossBackward0>)
tensor(1.5213, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11519/17426 [19:27<11:47,  8.35it/s]

tensor(1.4591, grad_fn=<NllLossBackward0>)
tensor(1.4692, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11521/17426 [19:27<11:26,  8.60it/s]

tensor(1.4961, grad_fn=<NllLossBackward0>)
tensor(1.4527, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11523/17426 [19:28<11:49,  8.32it/s]

tensor(1.4742, grad_fn=<NllLossBackward0>)
tensor(1.4563, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11525/17426 [19:28<12:20,  7.97it/s]

tensor(1.5199, grad_fn=<NllLossBackward0>)
tensor(1.5203, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11527/17426 [19:28<13:12,  7.45it/s]

tensor(1.4784, grad_fn=<NllLossBackward0>)
tensor(1.4953, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11529/17426 [19:29<13:40,  7.19it/s]

tensor(1.5056, grad_fn=<NllLossBackward0>)
tensor(1.4830, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11531/17426 [19:29<14:37,  6.72it/s]

tensor(1.5722, grad_fn=<NllLossBackward0>)
tensor(1.5018, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11533/17426 [19:29<13:53,  7.07it/s]

tensor(1.5284, grad_fn=<NllLossBackward0>)
tensor(1.5155, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11535/17426 [19:29<13:33,  7.25it/s]

tensor(1.4828, grad_fn=<NllLossBackward0>)
tensor(1.4859, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11537/17426 [19:30<13:25,  7.31it/s]

tensor(1.5216, grad_fn=<NllLossBackward0>)
tensor(1.4917, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11539/17426 [19:30<12:36,  7.78it/s]

tensor(1.5137, grad_fn=<NllLossBackward0>)
tensor(1.5037, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11541/17426 [19:30<12:20,  7.95it/s]

tensor(1.5127, grad_fn=<NllLossBackward0>)
tensor(1.4953, grad_fn=<NllLossBackward0>)


 66%|██████▌   | 11543/17426 [19:30<11:59,  8.18it/s]

tensor(1.5112, grad_fn=<NllLossBackward0>)
tensor(1.4731, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11545/17426 [19:31<13:10,  7.44it/s]

tensor(1.4814, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11547/17426 [19:31<13:48,  7.09it/s]

tensor(1.4869, grad_fn=<NllLossBackward0>)
tensor(1.5185, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11549/17426 [19:31<13:06,  7.47it/s]

tensor(1.4844, grad_fn=<NllLossBackward0>)
tensor(1.4938, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11551/17426 [19:32<13:38,  7.18it/s]

tensor(1.5226, grad_fn=<NllLossBackward0>)
tensor(1.4909, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11553/17426 [19:32<14:15,  6.87it/s]

tensor(1.5059, grad_fn=<NllLossBackward0>)
tensor(1.5244, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11555/17426 [19:32<13:36,  7.19it/s]

tensor(1.5003, grad_fn=<NllLossBackward0>)
tensor(1.5035, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11557/17426 [19:32<14:12,  6.89it/s]

tensor(1.4850, grad_fn=<NllLossBackward0>)
tensor(1.5072, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11559/17426 [19:33<13:43,  7.13it/s]

tensor(1.4726, grad_fn=<NllLossBackward0>)
tensor(1.4952, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11561/17426 [19:33<12:33,  7.78it/s]

tensor(1.5294, grad_fn=<NllLossBackward0>)
tensor(1.4789, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11563/17426 [19:33<12:04,  8.09it/s]

tensor(1.4552, grad_fn=<NllLossBackward0>)
tensor(1.5301, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11565/17426 [19:33<11:40,  8.37it/s]

tensor(1.5163, grad_fn=<NllLossBackward0>)
tensor(1.5205, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11567/17426 [19:34<11:29,  8.50it/s]

tensor(1.4507, grad_fn=<NllLossBackward0>)
tensor(1.4657, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11569/17426 [19:34<11:53,  8.21it/s]

tensor(1.5537, grad_fn=<NllLossBackward0>)
tensor(1.5348, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11571/17426 [19:34<11:50,  8.24it/s]

tensor(1.4725, grad_fn=<NllLossBackward0>)
tensor(1.4776, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11573/17426 [19:34<11:35,  8.41it/s]

tensor(1.4640, grad_fn=<NllLossBackward0>)
tensor(1.4952, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11575/17426 [19:35<11:25,  8.53it/s]

tensor(1.4808, grad_fn=<NllLossBackward0>)
tensor(1.5044, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11577/17426 [19:35<11:23,  8.56it/s]

tensor(1.4522, grad_fn=<NllLossBackward0>)
tensor(1.4959, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11579/17426 [19:35<11:35,  8.41it/s]

tensor(1.4763, grad_fn=<NllLossBackward0>)
tensor(1.4704, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11581/17426 [19:35<11:46,  8.27it/s]

tensor(1.5000, grad_fn=<NllLossBackward0>)
tensor(1.4796, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11583/17426 [19:36<11:48,  8.25it/s]

tensor(1.5166, grad_fn=<NllLossBackward0>)
tensor(1.4770, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11585/17426 [19:36<12:14,  7.95it/s]

tensor(1.5078, grad_fn=<NllLossBackward0>)
tensor(1.5232, grad_fn=<NllLossBackward0>)


 66%|██████▋   | 11587/17426 [19:36<12:06,  8.04it/s]

tensor(1.4933, grad_fn=<NllLossBackward0>)
tensor(1.4765, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11589/17426 [19:36<11:59,  8.11it/s]

tensor(1.5049, grad_fn=<NllLossBackward0>)
tensor(1.5325, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11591/17426 [19:37<11:43,  8.30it/s]

tensor(1.4746, grad_fn=<NllLossBackward0>)
tensor(1.5046, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11593/17426 [19:37<11:32,  8.42it/s]

tensor(1.4890, grad_fn=<NllLossBackward0>)
tensor(1.4848, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11595/17426 [19:37<11:48,  8.23it/s]

tensor(1.5370, grad_fn=<NllLossBackward0>)
tensor(1.5148, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11597/17426 [19:37<11:32,  8.42it/s]

tensor(1.5326, grad_fn=<NllLossBackward0>)
tensor(1.4646, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11599/17426 [19:37<11:30,  8.44it/s]

tensor(1.4920, grad_fn=<NllLossBackward0>)
tensor(1.4377, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11601/17426 [19:38<11:38,  8.34it/s]

tensor(1.4772, grad_fn=<NllLossBackward0>)
tensor(1.4868, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11603/17426 [19:38<11:53,  8.16it/s]

tensor(1.4675, grad_fn=<NllLossBackward0>)
tensor(1.4864, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11605/17426 [19:38<11:42,  8.28it/s]

tensor(1.4817, grad_fn=<NllLossBackward0>)
tensor(1.5168, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11607/17426 [19:38<11:28,  8.45it/s]

tensor(1.4912, grad_fn=<NllLossBackward0>)
tensor(1.5014, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11609/17426 [19:39<11:39,  8.32it/s]

tensor(1.5197, grad_fn=<NllLossBackward0>)
tensor(1.4667, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11611/17426 [19:39<11:48,  8.20it/s]

tensor(1.5224, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11613/17426 [19:39<11:35,  8.35it/s]

tensor(1.5042, grad_fn=<NllLossBackward0>)
tensor(1.4761, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11615/17426 [19:39<11:23,  8.51it/s]

tensor(1.5198, grad_fn=<NllLossBackward0>)
tensor(1.4544, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11617/17426 [19:40<11:17,  8.57it/s]

tensor(1.5021, grad_fn=<NllLossBackward0>)
tensor(1.5363, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11619/17426 [19:40<11:31,  8.39it/s]

tensor(1.5270, grad_fn=<NllLossBackward0>)
tensor(1.4621, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11621/17426 [19:40<11:51,  8.16it/s]

tensor(1.5105, grad_fn=<NllLossBackward0>)
tensor(1.4764, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11623/17426 [19:40<11:18,  8.55it/s]

tensor(1.4693, grad_fn=<NllLossBackward0>)
tensor(1.4867, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11625/17426 [19:41<11:12,  8.63it/s]

tensor(1.4960, grad_fn=<NllLossBackward0>)
tensor(1.5092, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11627/17426 [19:41<10:58,  8.81it/s]

tensor(1.4466, grad_fn=<NllLossBackward0>)
tensor(1.4899, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11629/17426 [19:41<11:02,  8.75it/s]

tensor(1.4727, grad_fn=<NllLossBackward0>)
tensor(1.4839, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11631/17426 [19:41<11:19,  8.53it/s]

tensor(1.5156, grad_fn=<NllLossBackward0>)
tensor(1.5063, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11633/17426 [19:41<10:56,  8.82it/s]

tensor(1.4464, grad_fn=<NllLossBackward0>)
tensor(1.5320, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11635/17426 [19:42<11:03,  8.72it/s]

tensor(1.4945, grad_fn=<NllLossBackward0>)
tensor(1.5069, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11637/17426 [19:42<11:01,  8.74it/s]

tensor(1.5009, grad_fn=<NllLossBackward0>)
tensor(1.4738, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11639/17426 [19:42<11:25,  8.44it/s]

tensor(1.4706, grad_fn=<NllLossBackward0>)
tensor(1.5345, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11641/17426 [19:42<11:22,  8.48it/s]

tensor(1.4914, grad_fn=<NllLossBackward0>)
tensor(1.4720, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11643/17426 [19:43<12:23,  7.77it/s]

tensor(1.4884, grad_fn=<NllLossBackward0>)
tensor(1.5216, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11645/17426 [19:43<12:58,  7.42it/s]

tensor(1.4811, grad_fn=<NllLossBackward0>)
tensor(1.4595, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11647/17426 [19:43<13:29,  7.14it/s]

tensor(1.4973, grad_fn=<NllLossBackward0>)
tensor(1.5054, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11649/17426 [19:44<13:55,  6.92it/s]

tensor(1.4805, grad_fn=<NllLossBackward0>)
tensor(1.4897, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11651/17426 [19:44<13:46,  6.98it/s]

tensor(1.5365, grad_fn=<NllLossBackward0>)
tensor(1.5053, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11653/17426 [19:44<14:14,  6.75it/s]

tensor(1.5221, grad_fn=<NllLossBackward0>)
tensor(1.5189, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11655/17426 [19:44<13:51,  6.94it/s]

tensor(1.4466, grad_fn=<NllLossBackward0>)
tensor(1.5242, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11657/17426 [19:45<13:30,  7.12it/s]

tensor(1.5205, grad_fn=<NllLossBackward0>)
tensor(1.5344, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11659/17426 [19:45<12:57,  7.42it/s]

tensor(1.4957, grad_fn=<NllLossBackward0>)
tensor(1.5459, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11661/17426 [19:45<13:08,  7.31it/s]

tensor(1.4788, grad_fn=<NllLossBackward0>)
tensor(1.5038, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11663/17426 [19:46<13:21,  7.19it/s]

tensor(1.4809, grad_fn=<NllLossBackward0>)
tensor(1.4890, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11665/17426 [19:46<13:08,  7.31it/s]

tensor(1.4749, grad_fn=<NllLossBackward0>)
tensor(1.4884, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11667/17426 [19:46<13:28,  7.12it/s]

tensor(1.5246, grad_fn=<NllLossBackward0>)
tensor(1.5159, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11669/17426 [19:46<14:00,  6.85it/s]

tensor(1.4834, grad_fn=<NllLossBackward0>)
tensor(1.4614, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11671/17426 [19:47<13:49,  6.94it/s]

tensor(1.5128, grad_fn=<NllLossBackward0>)
tensor(1.4518, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11673/17426 [19:47<14:18,  6.70it/s]

tensor(1.4923, grad_fn=<NllLossBackward0>)
tensor(1.5002, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11675/17426 [19:47<13:01,  7.36it/s]

tensor(1.5473, grad_fn=<NllLossBackward0>)
tensor(1.5302, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11677/17426 [19:48<12:22,  7.74it/s]

tensor(1.4727, grad_fn=<NllLossBackward0>)
tensor(1.5521, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11679/17426 [19:48<12:02,  7.95it/s]

tensor(1.5011, grad_fn=<NllLossBackward0>)
tensor(1.4416, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11681/17426 [19:48<11:48,  8.11it/s]

tensor(1.5036, grad_fn=<NllLossBackward0>)
tensor(1.4839, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11683/17426 [19:48<11:36,  8.24it/s]

tensor(1.4707, grad_fn=<NllLossBackward0>)
tensor(1.4850, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11685/17426 [19:48<11:15,  8.50it/s]

tensor(1.5047, grad_fn=<NllLossBackward0>)
tensor(1.4611, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11687/17426 [19:49<11:44,  8.14it/s]

tensor(1.4829, grad_fn=<NllLossBackward0>)
tensor(1.5062, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11689/17426 [19:49<11:21,  8.42it/s]

tensor(1.4635, grad_fn=<NllLossBackward0>)
tensor(1.4747, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11691/17426 [19:49<11:23,  8.40it/s]

tensor(1.5067, grad_fn=<NllLossBackward0>)
tensor(1.5384, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11693/17426 [19:49<11:31,  8.29it/s]

tensor(1.4695, grad_fn=<NllLossBackward0>)
tensor(1.4989, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11695/17426 [19:50<11:51,  8.05it/s]

tensor(1.5220, grad_fn=<NllLossBackward0>)
tensor(1.5087, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11697/17426 [19:50<11:32,  8.27it/s]

tensor(1.4911, grad_fn=<NllLossBackward0>)
tensor(1.4897, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11699/17426 [19:50<11:26,  8.34it/s]

tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.4743, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11701/17426 [19:50<11:33,  8.25it/s]

tensor(1.5453, grad_fn=<NllLossBackward0>)
tensor(1.4846, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11703/17426 [19:51<11:33,  8.26it/s]

tensor(1.4777, grad_fn=<NllLossBackward0>)
tensor(1.5149, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11705/17426 [19:51<12:00,  7.94it/s]

tensor(1.5023, grad_fn=<NllLossBackward0>)
tensor(1.4655, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11707/17426 [19:51<11:43,  8.13it/s]

tensor(1.5402, grad_fn=<NllLossBackward0>)
tensor(1.4528, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11709/17426 [19:51<11:25,  8.35it/s]

tensor(1.5116, grad_fn=<NllLossBackward0>)
tensor(1.5096, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11711/17426 [19:52<11:34,  8.23it/s]

tensor(1.4832, grad_fn=<NllLossBackward0>)
tensor(1.5084, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11713/17426 [19:52<11:56,  7.97it/s]

tensor(1.5169, grad_fn=<NllLossBackward0>)
tensor(1.5018, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11715/17426 [19:52<11:48,  8.06it/s]

tensor(1.5182, grad_fn=<NllLossBackward0>)
tensor(1.4941, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11717/17426 [19:52<11:26,  8.32it/s]

tensor(1.5060, grad_fn=<NllLossBackward0>)
tensor(1.4665, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11719/17426 [19:53<11:53,  8.00it/s]

tensor(1.4760, grad_fn=<NllLossBackward0>)
tensor(1.4784, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11721/17426 [19:53<12:04,  7.88it/s]

tensor(1.4652, grad_fn=<NllLossBackward0>)
tensor(1.4842, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11723/17426 [19:53<11:40,  8.14it/s]

tensor(1.4530, grad_fn=<NllLossBackward0>)
tensor(1.4837, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11725/17426 [19:53<11:27,  8.29it/s]

tensor(1.5328, grad_fn=<NllLossBackward0>)
tensor(1.5136, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11727/17426 [19:54<11:08,  8.52it/s]

tensor(1.4717, grad_fn=<NllLossBackward0>)
tensor(1.5012, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11729/17426 [19:54<11:06,  8.54it/s]

tensor(1.4916, grad_fn=<NllLossBackward0>)
tensor(1.4807, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11731/17426 [19:54<11:30,  8.25it/s]

tensor(1.4637, grad_fn=<NllLossBackward0>)
tensor(1.4834, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11733/17426 [19:54<11:03,  8.58it/s]

tensor(1.4664, grad_fn=<NllLossBackward0>)
tensor(1.4965, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11735/17426 [19:55<10:49,  8.76it/s]

tensor(1.4873, grad_fn=<NllLossBackward0>)
tensor(1.5643, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11737/17426 [19:55<11:04,  8.57it/s]

tensor(1.5296, grad_fn=<NllLossBackward0>)
tensor(1.5026, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11739/17426 [19:55<11:33,  8.20it/s]

tensor(1.5045, grad_fn=<NllLossBackward0>)
tensor(1.4413, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11741/17426 [19:55<11:16,  8.40it/s]

tensor(1.4445, grad_fn=<NllLossBackward0>)
tensor(1.4815, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11743/17426 [19:55<11:16,  8.40it/s]

tensor(1.5302, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11745/17426 [19:56<11:16,  8.40it/s]

tensor(1.5247, grad_fn=<NllLossBackward0>)
tensor(1.4807, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11747/17426 [19:56<11:16,  8.39it/s]

tensor(1.4976, grad_fn=<NllLossBackward0>)
tensor(1.5177, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11749/17426 [19:56<11:20,  8.35it/s]

tensor(1.4965, grad_fn=<NllLossBackward0>)
tensor(1.4887, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11751/17426 [19:56<11:17,  8.38it/s]

tensor(1.4853, grad_fn=<NllLossBackward0>)
tensor(1.4682, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11753/17426 [19:57<11:09,  8.48it/s]

tensor(1.5146, grad_fn=<NllLossBackward0>)
tensor(1.4933, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11755/17426 [19:57<11:01,  8.57it/s]

tensor(1.5113, grad_fn=<NllLossBackward0>)
tensor(1.5106, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11757/17426 [19:57<12:06,  7.80it/s]

tensor(1.4706, grad_fn=<NllLossBackward0>)
tensor(1.5355, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11759/17426 [19:57<12:43,  7.42it/s]

tensor(1.4829, grad_fn=<NllLossBackward0>)
tensor(1.4920, grad_fn=<NllLossBackward0>)


 67%|██████▋   | 11761/17426 [19:58<13:12,  7.15it/s]

tensor(1.5354, grad_fn=<NllLossBackward0>)
tensor(1.5107, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11763/17426 [19:58<13:10,  7.17it/s]

tensor(1.5492, grad_fn=<NllLossBackward0>)
tensor(1.5041, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11765/17426 [19:58<13:14,  7.13it/s]

tensor(1.4727, grad_fn=<NllLossBackward0>)
tensor(1.4669, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11767/17426 [19:59<12:57,  7.27it/s]

tensor(1.4844, grad_fn=<NllLossBackward0>)
tensor(1.5245, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11769/17426 [19:59<13:51,  6.80it/s]

tensor(1.4706, grad_fn=<NllLossBackward0>)
tensor(1.4847, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11771/17426 [19:59<13:24,  7.03it/s]

tensor(1.4920, grad_fn=<NllLossBackward0>)
tensor(1.4564, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11773/17426 [19:59<12:59,  7.26it/s]

tensor(1.4919, grad_fn=<NllLossBackward0>)
tensor(1.5058, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11775/17426 [20:00<13:15,  7.11it/s]

tensor(1.4702, grad_fn=<NllLossBackward0>)
tensor(1.5438, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11777/17426 [20:00<14:11,  6.64it/s]

tensor(1.4637, grad_fn=<NllLossBackward0>)
tensor(1.5105, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11779/17426 [20:00<13:55,  6.75it/s]

tensor(1.4969, grad_fn=<NllLossBackward0>)
tensor(1.5029, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11781/17426 [20:01<14:48,  6.36it/s]

tensor(1.4399, grad_fn=<NllLossBackward0>)
tensor(1.4512, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11783/17426 [20:01<14:21,  6.55it/s]

tensor(1.5482, grad_fn=<NllLossBackward0>)
tensor(1.4919, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11785/17426 [20:01<14:50,  6.33it/s]

tensor(1.4709, grad_fn=<NllLossBackward0>)
tensor(1.4956, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11787/17426 [20:02<13:23,  7.02it/s]

tensor(1.4813, grad_fn=<NllLossBackward0>)
tensor(1.4578, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11789/17426 [20:02<12:15,  7.66it/s]

tensor(1.5074, grad_fn=<NllLossBackward0>)
tensor(1.5376, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11791/17426 [20:02<11:49,  7.94it/s]

tensor(1.5177, grad_fn=<NllLossBackward0>)
tensor(1.5170, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11793/17426 [20:02<11:57,  7.85it/s]

tensor(1.5451, grad_fn=<NllLossBackward0>)
tensor(1.4798, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11795/17426 [20:03<11:50,  7.92it/s]

tensor(1.5213, grad_fn=<NllLossBackward0>)
tensor(1.4606, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11797/17426 [20:03<11:29,  8.17it/s]

tensor(1.5032, grad_fn=<NllLossBackward0>)
tensor(1.4521, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11799/17426 [20:03<11:23,  8.23it/s]

tensor(1.4916, grad_fn=<NllLossBackward0>)
tensor(1.5187, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11801/17426 [20:03<11:21,  8.26it/s]

tensor(1.5029, grad_fn=<NllLossBackward0>)
tensor(1.4966, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11803/17426 [20:04<12:03,  7.77it/s]

tensor(1.4564, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11805/17426 [20:04<11:36,  8.08it/s]

tensor(1.5217, grad_fn=<NllLossBackward0>)
tensor(1.5406, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11807/17426 [20:04<11:18,  8.29it/s]

tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.4971, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11809/17426 [20:04<11:21,  8.24it/s]

tensor(1.5300, grad_fn=<NllLossBackward0>)
tensor(1.5141, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11811/17426 [20:05<11:43,  7.98it/s]

tensor(1.5063, grad_fn=<NllLossBackward0>)
tensor(1.5267, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11813/17426 [20:05<11:45,  7.96it/s]

tensor(1.4849, grad_fn=<NllLossBackward0>)
tensor(1.5050, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11815/17426 [20:05<11:29,  8.14it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.4986, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11817/17426 [20:05<11:29,  8.13it/s]

tensor(1.4382, grad_fn=<NllLossBackward0>)
tensor(1.4960, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11819/17426 [20:06<11:24,  8.19it/s]

tensor(1.5000, grad_fn=<NllLossBackward0>)
tensor(1.5043, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11821/17426 [20:06<11:47,  7.92it/s]

tensor(1.4672, grad_fn=<NllLossBackward0>)
tensor(1.4880, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11823/17426 [20:06<11:39,  8.01it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.4986, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11825/17426 [20:06<11:25,  8.16it/s]

tensor(1.5403, grad_fn=<NllLossBackward0>)
tensor(1.4858, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11827/17426 [20:07<11:19,  8.24it/s]

tensor(1.4857, grad_fn=<NllLossBackward0>)
tensor(1.4812, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11829/17426 [20:07<12:06,  7.70it/s]

tensor(1.5036, grad_fn=<NllLossBackward0>)
tensor(1.4517, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11831/17426 [20:07<11:24,  8.17it/s]

tensor(1.5117, grad_fn=<NllLossBackward0>)
tensor(1.5300, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11833/17426 [20:07<11:11,  8.33it/s]

tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.5268, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11835/17426 [20:08<11:03,  8.43it/s]

tensor(1.5068, grad_fn=<NllLossBackward0>)
tensor(1.5371, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11837/17426 [20:08<11:12,  8.32it/s]

tensor(1.4854, grad_fn=<NllLossBackward0>)
tensor(1.4914, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11839/17426 [20:08<11:28,  8.11it/s]

tensor(1.5229, grad_fn=<NllLossBackward0>)
tensor(1.5292, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11841/17426 [20:08<11:10,  8.33it/s]

tensor(1.4390, grad_fn=<NllLossBackward0>)
tensor(1.5222, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11843/17426 [20:08<11:10,  8.32it/s]

tensor(1.5122, grad_fn=<NllLossBackward0>)
tensor(1.4331, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11845/17426 [20:09<10:58,  8.47it/s]

tensor(1.4819, grad_fn=<NllLossBackward0>)
tensor(1.4689, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11847/17426 [20:09<11:28,  8.10it/s]

tensor(1.4709, grad_fn=<NllLossBackward0>)
tensor(1.4438, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11849/17426 [20:09<11:20,  8.19it/s]

tensor(1.4608, grad_fn=<NllLossBackward0>)
tensor(1.4958, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11851/17426 [20:09<11:09,  8.32it/s]

tensor(1.4936, grad_fn=<NllLossBackward0>)
tensor(1.4691, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11853/17426 [20:10<11:04,  8.39it/s]

tensor(1.4784, grad_fn=<NllLossBackward0>)
tensor(1.5161, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11855/17426 [20:10<11:18,  8.21it/s]

tensor(1.4840, grad_fn=<NllLossBackward0>)
tensor(1.4954, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11857/17426 [20:10<11:28,  8.09it/s]

tensor(1.4674, grad_fn=<NllLossBackward0>)
tensor(1.4297, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11859/17426 [20:10<11:09,  8.32it/s]

tensor(1.5036, grad_fn=<NllLossBackward0>)
tensor(1.4865, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11861/17426 [20:11<10:51,  8.54it/s]

tensor(1.4599, grad_fn=<NllLossBackward0>)
tensor(1.4607, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11863/17426 [20:11<10:40,  8.69it/s]

tensor(1.5683, grad_fn=<NllLossBackward0>)
tensor(1.4466, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11865/17426 [20:11<10:45,  8.62it/s]

tensor(1.4528, grad_fn=<NllLossBackward0>)
tensor(1.4663, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11867/17426 [20:11<10:41,  8.67it/s]

tensor(1.5135, grad_fn=<NllLossBackward0>)
tensor(1.5184, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11869/17426 [20:12<11:21,  8.15it/s]

tensor(1.4626, grad_fn=<NllLossBackward0>)
tensor(1.4997, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11871/17426 [20:12<11:56,  7.75it/s]

tensor(1.5003, grad_fn=<NllLossBackward0>)
tensor(1.4726, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11873/17426 [20:12<12:48,  7.22it/s]

tensor(1.5129, grad_fn=<NllLossBackward0>)
tensor(1.5161, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11875/17426 [20:12<12:26,  7.44it/s]

tensor(1.5178, grad_fn=<NllLossBackward0>)
tensor(1.5277, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11877/17426 [20:13<12:02,  7.68it/s]

tensor(1.4730, grad_fn=<NllLossBackward0>)
tensor(1.4843, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11879/17426 [20:13<11:26,  8.08it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.4666, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11881/17426 [20:13<12:32,  7.37it/s]

tensor(1.5677, grad_fn=<NllLossBackward0>)
tensor(1.4265, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11883/17426 [20:13<12:42,  7.26it/s]

tensor(1.5175, grad_fn=<NllLossBackward0>)
tensor(1.4777, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11885/17426 [20:14<12:53,  7.16it/s]

tensor(1.5117, grad_fn=<NllLossBackward0>)
tensor(1.5052, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11887/17426 [20:14<12:35,  7.33it/s]

tensor(1.4452, grad_fn=<NllLossBackward0>)
tensor(1.5236, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11889/17426 [20:14<12:17,  7.50it/s]

tensor(1.4830, grad_fn=<NllLossBackward0>)
tensor(1.4789, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11891/17426 [20:15<13:29,  6.84it/s]

tensor(1.5434, grad_fn=<NllLossBackward0>)
tensor(1.4756, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11893/17426 [20:15<13:45,  6.70it/s]

tensor(1.4459, grad_fn=<NllLossBackward0>)
tensor(1.5489, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11895/17426 [20:15<13:12,  6.98it/s]

tensor(1.4913, grad_fn=<NllLossBackward0>)
tensor(1.4753, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11897/17426 [20:16<13:50,  6.66it/s]

tensor(1.4799, grad_fn=<NllLossBackward0>)
tensor(1.4919, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11899/17426 [20:16<13:03,  7.06it/s]

tensor(1.5494, grad_fn=<NllLossBackward0>)
tensor(1.4976, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11901/17426 [20:16<13:41,  6.72it/s]

tensor(1.4702, grad_fn=<NllLossBackward0>)
tensor(1.5191, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11903/17426 [20:16<12:30,  7.36it/s]

tensor(1.5004, grad_fn=<NllLossBackward0>)
tensor(1.5070, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11905/17426 [20:17<11:59,  7.67it/s]

tensor(1.4817, grad_fn=<NllLossBackward0>)
tensor(1.4904, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11907/17426 [20:17<11:33,  7.96it/s]

tensor(1.5227, grad_fn=<NllLossBackward0>)
tensor(1.5159, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11909/17426 [20:17<11:16,  8.16it/s]

tensor(1.4961, grad_fn=<NllLossBackward0>)
tensor(1.5262, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11911/17426 [20:17<11:15,  8.17it/s]

tensor(1.4864, grad_fn=<NllLossBackward0>)
tensor(1.4811, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11913/17426 [20:18<11:45,  7.81it/s]

tensor(1.4854, grad_fn=<NllLossBackward0>)
tensor(1.5096, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11915/17426 [20:18<11:26,  8.03it/s]

tensor(1.4524, grad_fn=<NllLossBackward0>)
tensor(1.4768, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11917/17426 [20:18<11:12,  8.20it/s]

tensor(1.4611, grad_fn=<NllLossBackward0>)
tensor(1.4813, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11919/17426 [20:18<11:09,  8.23it/s]

tensor(1.4940, grad_fn=<NllLossBackward0>)
tensor(1.5043, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11921/17426 [20:19<11:00,  8.34it/s]

tensor(1.5468, grad_fn=<NllLossBackward0>)
tensor(1.5278, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11923/17426 [20:19<11:08,  8.24it/s]

tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.4827, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11925/17426 [20:19<11:01,  8.32it/s]

tensor(1.4355, grad_fn=<NllLossBackward0>)
tensor(1.4672, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11927/17426 [20:19<10:58,  8.35it/s]

tensor(1.4981, grad_fn=<NllLossBackward0>)
tensor(1.4856, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11929/17426 [20:20<10:54,  8.40it/s]

tensor(1.4537, grad_fn=<NllLossBackward0>)
tensor(1.5221, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11931/17426 [20:20<11:11,  8.18it/s]

tensor(1.4638, grad_fn=<NllLossBackward0>)
tensor(1.4832, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11933/17426 [20:20<11:08,  8.22it/s]

tensor(1.4870, grad_fn=<NllLossBackward0>)
tensor(1.4499, grad_fn=<NllLossBackward0>)


 68%|██████▊   | 11935/17426 [20:20<11:13,  8.15it/s]

tensor(1.4793, grad_fn=<NllLossBackward0>)
tensor(1.5089, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11937/17426 [20:21<11:19,  8.08it/s]

tensor(1.4818, grad_fn=<NllLossBackward0>)
tensor(1.5190, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11939/17426 [20:21<11:41,  7.82it/s]

tensor(1.4782, grad_fn=<NllLossBackward0>)
tensor(1.4824, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11941/17426 [20:21<11:23,  8.02it/s]

tensor(1.4847, grad_fn=<NllLossBackward0>)
tensor(1.4794, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11943/17426 [20:21<11:09,  8.19it/s]

tensor(1.5276, grad_fn=<NllLossBackward0>)
tensor(1.4813, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11945/17426 [20:21<11:06,  8.22it/s]

tensor(1.5027, grad_fn=<NllLossBackward0>)
tensor(1.5020, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11947/17426 [20:22<11:42,  7.80it/s]

tensor(1.4989, grad_fn=<NllLossBackward0>)
tensor(1.4510, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11949/17426 [20:22<11:15,  8.11it/s]

tensor(1.4915, grad_fn=<NllLossBackward0>)
tensor(1.5098, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11951/17426 [20:22<11:09,  8.17it/s]

tensor(1.4537, grad_fn=<NllLossBackward0>)
tensor(1.5108, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11953/17426 [20:22<11:03,  8.24it/s]

tensor(1.5195, grad_fn=<NllLossBackward0>)
tensor(1.5264, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11955/17426 [20:23<11:06,  8.20it/s]

tensor(1.4870, grad_fn=<NllLossBackward0>)
tensor(1.5112, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11957/17426 [20:23<11:05,  8.21it/s]

tensor(1.4851, grad_fn=<NllLossBackward0>)
tensor(1.4340, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11959/17426 [20:23<11:02,  8.25it/s]

tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.4985, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11961/17426 [20:23<10:55,  8.34it/s]

tensor(1.5298, grad_fn=<NllLossBackward0>)
tensor(1.4657, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11963/17426 [20:24<10:59,  8.28it/s]

tensor(1.4915, grad_fn=<NllLossBackward0>)
tensor(1.4720, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11965/17426 [20:24<11:16,  8.08it/s]

tensor(1.4897, grad_fn=<NllLossBackward0>)
tensor(1.4474, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11967/17426 [20:24<11:13,  8.11it/s]

tensor(1.4893, grad_fn=<NllLossBackward0>)
tensor(1.4900, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11969/17426 [20:24<10:47,  8.43it/s]

tensor(1.4692, grad_fn=<NllLossBackward0>)
tensor(1.4834, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11971/17426 [20:25<10:37,  8.55it/s]

tensor(1.4371, grad_fn=<NllLossBackward0>)
tensor(1.5036, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11973/17426 [20:25<11:10,  8.13it/s]

tensor(1.4610, grad_fn=<NllLossBackward0>)
tensor(1.5604, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11975/17426 [20:25<10:50,  8.38it/s]

tensor(1.4763, grad_fn=<NllLossBackward0>)
tensor(1.4193, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11977/17426 [20:25<10:37,  8.55it/s]

tensor(1.5085, grad_fn=<NllLossBackward0>)
tensor(1.4713, grad_fn=<NllLossBackward0>)


 69%|██████▊   | 11979/17426 [20:26<10:35,  8.57it/s]

tensor(1.5028, grad_fn=<NllLossBackward0>)
tensor(1.4634, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 11981/17426 [20:26<10:41,  8.49it/s]

tensor(1.5101, grad_fn=<NllLossBackward0>)
tensor(1.5211, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 11983/17426 [20:26<10:51,  8.36it/s]

tensor(1.4959, grad_fn=<NllLossBackward0>)
tensor(1.5298, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 11985/17426 [20:26<11:11,  8.10it/s]

tensor(1.5381, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 11987/17426 [20:27<11:43,  7.73it/s]

tensor(1.4852, grad_fn=<NllLossBackward0>)
tensor(1.4803, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 11989/17426 [20:27<12:26,  7.28it/s]

tensor(1.4888, grad_fn=<NllLossBackward0>)
tensor(1.4873, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 11991/17426 [20:27<12:20,  7.34it/s]

tensor(1.5103, grad_fn=<NllLossBackward0>)
tensor(1.5474, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 11993/17426 [20:27<12:28,  7.26it/s]

tensor(1.4653, grad_fn=<NllLossBackward0>)
tensor(1.4867, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 11995/17426 [20:28<11:43,  7.72it/s]

tensor(1.4998, grad_fn=<NllLossBackward0>)
tensor(1.4900, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 11997/17426 [20:28<12:30,  7.24it/s]

tensor(1.4662, grad_fn=<NllLossBackward0>)
tensor(1.4414, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 11999/17426 [20:28<12:24,  7.29it/s]

tensor(1.4425, grad_fn=<NllLossBackward0>)
tensor(1.4605, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12001/17426 [20:29<11:51,  7.62it/s]

tensor(1.5172, grad_fn=<NllLossBackward0>)
tensor(1.5084, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12003/17426 [20:29<12:21,  7.31it/s]

tensor(1.4762, grad_fn=<NllLossBackward0>)
tensor(1.5085, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12005/17426 [20:29<12:41,  7.12it/s]

tensor(1.4826, grad_fn=<NllLossBackward0>)
tensor(1.4531, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12007/17426 [20:29<13:44,  6.58it/s]

tensor(1.5116, grad_fn=<NllLossBackward0>)
tensor(1.5139, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12009/17426 [20:30<13:10,  6.86it/s]

tensor(1.4871, grad_fn=<NllLossBackward0>)
tensor(1.5125, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12011/17426 [20:30<13:18,  6.78it/s]

tensor(1.4514, grad_fn=<NllLossBackward0>)
tensor(1.4577, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12013/17426 [20:30<13:25,  6.72it/s]

tensor(1.4689, grad_fn=<NllLossBackward0>)
tensor(1.5222, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12015/17426 [20:31<14:24,  6.26it/s]

tensor(1.4506, grad_fn=<NllLossBackward0>)
tensor(1.5112, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12017/17426 [20:31<12:49,  7.03it/s]

tensor(1.4909, grad_fn=<NllLossBackward0>)
tensor(1.4806, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12019/17426 [20:31<11:48,  7.63it/s]

tensor(1.4888, grad_fn=<NllLossBackward0>)
tensor(1.4589, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12021/17426 [20:31<12:02,  7.48it/s]

tensor(1.4958, grad_fn=<NllLossBackward0>)
tensor(1.5044, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12023/17426 [20:32<11:23,  7.91it/s]

tensor(1.4510, grad_fn=<NllLossBackward0>)
tensor(1.4690, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12025/17426 [20:32<11:18,  7.96it/s]

tensor(1.4786, grad_fn=<NllLossBackward0>)
tensor(1.4757, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12027/17426 [20:32<10:58,  8.20it/s]

tensor(1.5324, grad_fn=<NllLossBackward0>)
tensor(1.4957, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12029/17426 [20:32<10:47,  8.34it/s]

tensor(1.4998, grad_fn=<NllLossBackward0>)
tensor(1.4768, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12031/17426 [20:33<11:06,  8.09it/s]

tensor(1.5300, grad_fn=<NllLossBackward0>)
tensor(1.4888, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12033/17426 [20:33<10:54,  8.24it/s]

tensor(1.4742, grad_fn=<NllLossBackward0>)
tensor(1.4777, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12035/17426 [20:33<10:55,  8.22it/s]

tensor(1.5269, grad_fn=<NllLossBackward0>)
tensor(1.5490, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12037/17426 [20:33<10:49,  8.30it/s]

tensor(1.5296, grad_fn=<NllLossBackward0>)
tensor(1.5071, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12039/17426 [20:34<10:54,  8.23it/s]

tensor(1.4797, grad_fn=<NllLossBackward0>)
tensor(1.5211, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12041/17426 [20:34<10:45,  8.34it/s]

tensor(1.4635, grad_fn=<NllLossBackward0>)
tensor(1.4979, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12043/17426 [20:34<10:40,  8.40it/s]

tensor(1.5082, grad_fn=<NllLossBackward0>)
tensor(1.4839, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12045/17426 [20:34<10:42,  8.38it/s]

tensor(1.5384, grad_fn=<NllLossBackward0>)
tensor(1.4889, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12047/17426 [20:35<10:41,  8.39it/s]

tensor(1.4744, grad_fn=<NllLossBackward0>)
tensor(1.5607, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12049/17426 [20:35<10:57,  8.18it/s]

tensor(1.5126, grad_fn=<NllLossBackward0>)
tensor(1.4313, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12051/17426 [20:35<10:43,  8.35it/s]

tensor(1.5168, grad_fn=<NllLossBackward0>)
tensor(1.4750, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12053/17426 [20:35<10:48,  8.28it/s]

tensor(1.4747, grad_fn=<NllLossBackward0>)
tensor(1.4959, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12055/17426 [20:36<10:37,  8.42it/s]

tensor(1.5087, grad_fn=<NllLossBackward0>)
tensor(1.4837, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12057/17426 [20:36<11:02,  8.11it/s]

tensor(1.5318, grad_fn=<NllLossBackward0>)
tensor(1.5116, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12059/17426 [20:36<10:46,  8.30it/s]

tensor(1.4884, grad_fn=<NllLossBackward0>)
tensor(1.5024, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12061/17426 [20:36<10:56,  8.18it/s]

tensor(1.5219, grad_fn=<NllLossBackward0>)
tensor(1.5219, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12063/17426 [20:36<10:43,  8.33it/s]

tensor(1.4879, grad_fn=<NllLossBackward0>)
tensor(1.5028, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12065/17426 [20:37<10:57,  8.16it/s]

tensor(1.4913, grad_fn=<NllLossBackward0>)
tensor(1.4879, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12067/17426 [20:37<10:39,  8.37it/s]

tensor(1.4943, grad_fn=<NllLossBackward0>)
tensor(1.4924, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12069/17426 [20:37<10:47,  8.27it/s]

tensor(1.5455, grad_fn=<NllLossBackward0>)
tensor(1.5044, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12071/17426 [20:37<10:38,  8.39it/s]

tensor(1.4760, grad_fn=<NllLossBackward0>)
tensor(1.4993, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12073/17426 [20:38<10:36,  8.41it/s]

tensor(1.4850, grad_fn=<NllLossBackward0>)
tensor(1.5051, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12075/17426 [20:38<10:51,  8.21it/s]

tensor(1.4753, grad_fn=<NllLossBackward0>)
tensor(1.5329, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12077/17426 [20:38<10:53,  8.18it/s]

tensor(1.4452, grad_fn=<NllLossBackward0>)
tensor(1.5278, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12079/17426 [20:38<10:45,  8.28it/s]

tensor(1.5167, grad_fn=<NllLossBackward0>)
tensor(1.5026, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12081/17426 [20:39<10:56,  8.15it/s]

tensor(1.5518, grad_fn=<NllLossBackward0>)
tensor(1.5242, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12083/17426 [20:39<11:00,  8.08it/s]

tensor(1.4954, grad_fn=<NllLossBackward0>)
tensor(1.4521, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12085/17426 [20:39<11:01,  8.07it/s]

tensor(1.4591, grad_fn=<NllLossBackward0>)
tensor(1.4939, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12087/17426 [20:39<10:49,  8.22it/s]

tensor(1.5136, grad_fn=<NllLossBackward0>)
tensor(1.4955, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12089/17426 [20:40<10:46,  8.25it/s]

tensor(1.4900, grad_fn=<NllLossBackward0>)
tensor(1.4865, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12091/17426 [20:40<11:10,  7.95it/s]

tensor(1.4917, grad_fn=<NllLossBackward0>)
tensor(1.4918, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12093/17426 [20:40<10:55,  8.14it/s]

tensor(1.4811, grad_fn=<NllLossBackward0>)
tensor(1.4827, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12095/17426 [20:40<11:04,  8.02it/s]

tensor(1.5025, grad_fn=<NllLossBackward0>)
tensor(1.5278, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12097/17426 [20:41<10:58,  8.10it/s]

tensor(1.4656, grad_fn=<NllLossBackward0>)
tensor(1.5206, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12099/17426 [20:41<11:28,  7.73it/s]

tensor(1.4794, grad_fn=<NllLossBackward0>)
tensor(1.5767, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12101/17426 [20:41<12:33,  7.07it/s]

tensor(1.4786, grad_fn=<NllLossBackward0>)
tensor(1.4938, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12103/17426 [20:42<12:19,  7.20it/s]

tensor(1.4757, grad_fn=<NllLossBackward0>)
tensor(1.4564, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12105/17426 [20:42<12:45,  6.95it/s]

tensor(1.5097, grad_fn=<NllLossBackward0>)
tensor(1.4329, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12107/17426 [20:42<12:52,  6.89it/s]

tensor(1.4535, grad_fn=<NllLossBackward0>)
tensor(1.4588, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12109/17426 [20:42<12:30,  7.09it/s]

tensor(1.4362, grad_fn=<NllLossBackward0>)
tensor(1.4993, grad_fn=<NllLossBackward0>)


 69%|██████▉   | 12111/17426 [20:43<12:12,  7.26it/s]

tensor(1.5246, grad_fn=<NllLossBackward0>)
tensor(1.4965, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12113/17426 [20:43<12:15,  7.22it/s]

tensor(1.4400, grad_fn=<NllLossBackward0>)
tensor(1.5108, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12115/17426 [20:43<11:59,  7.39it/s]

tensor(1.4923, grad_fn=<NllLossBackward0>)
tensor(1.5047, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12117/17426 [20:43<12:34,  7.03it/s]

tensor(1.4770, grad_fn=<NllLossBackward0>)
tensor(1.4945, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12119/17426 [20:44<13:14,  6.68it/s]

tensor(1.4656, grad_fn=<NllLossBackward0>)
tensor(1.4846, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12121/17426 [20:44<13:27,  6.57it/s]

tensor(1.4864, grad_fn=<NllLossBackward0>)
tensor(1.4915, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12123/17426 [20:44<13:19,  6.63it/s]

tensor(1.4640, grad_fn=<NllLossBackward0>)
tensor(1.4868, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12125/17426 [20:45<13:45,  6.42it/s]

tensor(1.5081, grad_fn=<NllLossBackward0>)
tensor(1.5107, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12127/17426 [20:45<13:58,  6.32it/s]

tensor(1.4831, grad_fn=<NllLossBackward0>)
tensor(1.4455, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12129/17426 [20:45<13:18,  6.64it/s]

tensor(1.5435, grad_fn=<NllLossBackward0>)
tensor(1.5094, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12131/17426 [20:46<12:17,  7.18it/s]

tensor(1.4906, grad_fn=<NllLossBackward0>)
tensor(1.5025, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12133/17426 [20:46<11:32,  7.64it/s]

tensor(1.4624, grad_fn=<NllLossBackward0>)
tensor(1.4652, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12135/17426 [20:46<11:08,  7.92it/s]

tensor(1.4889, grad_fn=<NllLossBackward0>)
tensor(1.4600, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12137/17426 [20:46<10:51,  8.12it/s]

tensor(1.4803, grad_fn=<NllLossBackward0>)
tensor(1.4885, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12139/17426 [20:47<10:58,  8.03it/s]

tensor(1.5185, grad_fn=<NllLossBackward0>)
tensor(1.5094, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12141/17426 [20:47<10:44,  8.20it/s]

tensor(1.5185, grad_fn=<NllLossBackward0>)
tensor(1.5063, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12143/17426 [20:47<10:29,  8.39it/s]

tensor(1.5140, grad_fn=<NllLossBackward0>)
tensor(1.4864, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12145/17426 [20:47<10:40,  8.25it/s]

tensor(1.4332, grad_fn=<NllLossBackward0>)
tensor(1.5593, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12147/17426 [20:48<10:47,  8.15it/s]

tensor(1.5128, grad_fn=<NllLossBackward0>)
tensor(1.4760, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12149/17426 [20:48<10:42,  8.21it/s]

tensor(1.4720, grad_fn=<NllLossBackward0>)
tensor(1.4929, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12151/17426 [20:48<10:49,  8.13it/s]

tensor(1.4521, grad_fn=<NllLossBackward0>)
tensor(1.5087, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12153/17426 [20:48<10:41,  8.22it/s]

tensor(1.4657, grad_fn=<NllLossBackward0>)
tensor(1.5140, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12155/17426 [20:49<10:38,  8.25it/s]

tensor(1.5085, grad_fn=<NllLossBackward0>)
tensor(1.4829, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12157/17426 [20:49<10:48,  8.12it/s]

tensor(1.5099, grad_fn=<NllLossBackward0>)
tensor(1.4666, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12159/17426 [20:49<10:34,  8.31it/s]

tensor(1.5057, grad_fn=<NllLossBackward0>)
tensor(1.4516, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12161/17426 [20:49<10:19,  8.50it/s]

tensor(1.5082, grad_fn=<NllLossBackward0>)
tensor(1.4713, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12163/17426 [20:50<10:33,  8.31it/s]

tensor(1.5005, grad_fn=<NllLossBackward0>)
tensor(1.5429, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12165/17426 [20:50<11:02,  7.94it/s]

tensor(1.5072, grad_fn=<NllLossBackward0>)
tensor(1.4766, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12167/17426 [20:50<10:39,  8.22it/s]

tensor(1.4275, grad_fn=<NllLossBackward0>)
tensor(1.4942, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12169/17426 [20:50<10:27,  8.38it/s]

tensor(1.4660, grad_fn=<NllLossBackward0>)
tensor(1.4484, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12171/17426 [20:50<10:33,  8.30it/s]

tensor(1.4664, grad_fn=<NllLossBackward0>)
tensor(1.5089, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12173/17426 [20:51<10:49,  8.09it/s]

tensor(1.4700, grad_fn=<NllLossBackward0>)
tensor(1.5034, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12175/17426 [20:51<10:34,  8.27it/s]

tensor(1.5314, grad_fn=<NllLossBackward0>)
tensor(1.5094, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12177/17426 [20:51<10:46,  8.12it/s]

tensor(1.4800, grad_fn=<NllLossBackward0>)
tensor(1.4684, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12179/17426 [20:51<10:37,  8.22it/s]

tensor(1.4397, grad_fn=<NllLossBackward0>)
tensor(1.5306, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12181/17426 [20:52<10:31,  8.31it/s]

tensor(1.4825, grad_fn=<NllLossBackward0>)
tensor(1.4836, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12183/17426 [20:52<10:49,  8.07it/s]

tensor(1.5141, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12185/17426 [20:52<10:48,  8.08it/s]

tensor(1.4726, grad_fn=<NllLossBackward0>)
tensor(1.5121, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12187/17426 [20:52<10:47,  8.10it/s]

tensor(1.4496, grad_fn=<NllLossBackward0>)
tensor(1.4935, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12189/17426 [20:53<10:35,  8.24it/s]

tensor(1.4819, grad_fn=<NllLossBackward0>)
tensor(1.4777, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12191/17426 [20:53<10:48,  8.08it/s]

tensor(1.4506, grad_fn=<NllLossBackward0>)
tensor(1.4581, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12193/17426 [20:53<10:36,  8.23it/s]

tensor(1.5080, grad_fn=<NllLossBackward0>)
tensor(1.5045, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12195/17426 [20:53<10:37,  8.20it/s]

tensor(1.4711, grad_fn=<NllLossBackward0>)
tensor(1.5056, grad_fn=<NllLossBackward0>)


 70%|██████▉   | 12197/17426 [20:54<10:34,  8.24it/s]

tensor(1.4700, grad_fn=<NllLossBackward0>)
tensor(1.4941, grad_fn=<NllLossBackward0>)


 70%|███████   | 12199/17426 [20:54<11:03,  7.88it/s]

tensor(1.4716, grad_fn=<NllLossBackward0>)
tensor(1.5144, grad_fn=<NllLossBackward0>)


 70%|███████   | 12201/17426 [20:54<10:54,  7.99it/s]

tensor(1.5026, grad_fn=<NllLossBackward0>)
tensor(1.5034, grad_fn=<NllLossBackward0>)


 70%|███████   | 12203/17426 [20:54<10:58,  7.94it/s]

tensor(1.4856, grad_fn=<NllLossBackward0>)
tensor(1.4834, grad_fn=<NllLossBackward0>)


 70%|███████   | 12205/17426 [20:55<10:39,  8.17it/s]

tensor(1.4542, grad_fn=<NllLossBackward0>)
tensor(1.4554, grad_fn=<NllLossBackward0>)


 70%|███████   | 12207/17426 [20:55<10:22,  8.39it/s]

tensor(1.5033, grad_fn=<NllLossBackward0>)
tensor(1.4761, grad_fn=<NllLossBackward0>)


 70%|███████   | 12209/17426 [20:55<10:57,  7.94it/s]

tensor(1.4616, grad_fn=<NllLossBackward0>)
tensor(1.4854, grad_fn=<NllLossBackward0>)


 70%|███████   | 12211/17426 [20:55<11:40,  7.44it/s]

tensor(1.4858, grad_fn=<NllLossBackward0>)
tensor(1.4677, grad_fn=<NllLossBackward0>)


 70%|███████   | 12213/17426 [20:56<12:26,  6.98it/s]

tensor(1.4974, grad_fn=<NllLossBackward0>)
tensor(1.5105, grad_fn=<NllLossBackward0>)


 70%|███████   | 12215/17426 [20:56<12:45,  6.81it/s]

tensor(1.5130, grad_fn=<NllLossBackward0>)
tensor(1.4670, grad_fn=<NllLossBackward0>)


 70%|███████   | 12217/17426 [20:56<12:42,  6.83it/s]

tensor(1.4404, grad_fn=<NllLossBackward0>)
tensor(1.4420, grad_fn=<NllLossBackward0>)


 70%|███████   | 12219/17426 [20:57<12:07,  7.16it/s]

tensor(1.5202, grad_fn=<NllLossBackward0>)
tensor(1.4986, grad_fn=<NllLossBackward0>)


 70%|███████   | 12221/17426 [20:57<11:38,  7.45it/s]

tensor(1.5023, grad_fn=<NllLossBackward0>)
tensor(1.4971, grad_fn=<NllLossBackward0>)


 70%|███████   | 12223/17426 [20:57<11:52,  7.30it/s]

tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.5069, grad_fn=<NllLossBackward0>)


 70%|███████   | 12225/17426 [20:57<11:16,  7.69it/s]

tensor(1.5104, grad_fn=<NllLossBackward0>)
tensor(1.4817, grad_fn=<NllLossBackward0>)


 70%|███████   | 12227/17426 [20:58<11:30,  7.53it/s]

tensor(1.5241, grad_fn=<NllLossBackward0>)
tensor(1.4569, grad_fn=<NllLossBackward0>)


 70%|███████   | 12229/17426 [20:58<11:29,  7.54it/s]

tensor(1.4840, grad_fn=<NllLossBackward0>)
tensor(1.5073, grad_fn=<NllLossBackward0>)


 70%|███████   | 12231/17426 [20:58<11:42,  7.39it/s]

tensor(1.4980, grad_fn=<NllLossBackward0>)
tensor(1.4943, grad_fn=<NllLossBackward0>)


 70%|███████   | 12233/17426 [20:59<12:06,  7.15it/s]

tensor(1.5023, grad_fn=<NllLossBackward0>)
tensor(1.5057, grad_fn=<NllLossBackward0>)


 70%|███████   | 12235/17426 [20:59<11:42,  7.39it/s]

tensor(1.4774, grad_fn=<NllLossBackward0>)
tensor(1.4882, grad_fn=<NllLossBackward0>)


 70%|███████   | 12237/17426 [20:59<12:29,  6.93it/s]

tensor(1.4868, grad_fn=<NllLossBackward0>)
tensor(1.5513, grad_fn=<NllLossBackward0>)


 70%|███████   | 12239/17426 [20:59<12:46,  6.77it/s]

tensor(1.4699, grad_fn=<NllLossBackward0>)
tensor(1.4663, grad_fn=<NllLossBackward0>)


 70%|███████   | 12241/17426 [21:00<13:42,  6.30it/s]

tensor(1.4545, grad_fn=<NllLossBackward0>)
tensor(1.4703, grad_fn=<NllLossBackward0>)


 70%|███████   | 12243/17426 [21:00<12:08,  7.12it/s]

tensor(1.5064, grad_fn=<NllLossBackward0>)
tensor(1.4591, grad_fn=<NllLossBackward0>)


 70%|███████   | 12245/17426 [21:00<11:16,  7.66it/s]

tensor(1.5200, grad_fn=<NllLossBackward0>)
tensor(1.5090, grad_fn=<NllLossBackward0>)


 70%|███████   | 12247/17426 [21:00<11:13,  7.69it/s]

tensor(1.4592, grad_fn=<NllLossBackward0>)
tensor(1.5082, grad_fn=<NllLossBackward0>)


 70%|███████   | 12249/17426 [21:01<11:03,  7.80it/s]

tensor(1.5346, grad_fn=<NllLossBackward0>)
tensor(1.5174, grad_fn=<NllLossBackward0>)


 70%|███████   | 12251/17426 [21:01<10:31,  8.19it/s]

tensor(1.4372, grad_fn=<NllLossBackward0>)
tensor(1.5049, grad_fn=<NllLossBackward0>)


 70%|███████   | 12253/17426 [21:01<10:25,  8.27it/s]

tensor(1.4544, grad_fn=<NllLossBackward0>)
tensor(1.4431, grad_fn=<NllLossBackward0>)


 70%|███████   | 12255/17426 [21:01<10:59,  7.84it/s]

tensor(1.4878, grad_fn=<NllLossBackward0>)
tensor(1.5022, grad_fn=<NllLossBackward0>)


 70%|███████   | 12257/17426 [21:02<10:44,  8.02it/s]

tensor(1.5052, grad_fn=<NllLossBackward0>)
tensor(1.4652, grad_fn=<NllLossBackward0>)


 70%|███████   | 12259/17426 [21:02<10:34,  8.14it/s]

tensor(1.4992, grad_fn=<NllLossBackward0>)
tensor(1.5475, grad_fn=<NllLossBackward0>)


 70%|███████   | 12261/17426 [21:02<10:21,  8.32it/s]

tensor(1.4591, grad_fn=<NllLossBackward0>)
tensor(1.4520, grad_fn=<NllLossBackward0>)


 70%|███████   | 12263/17426 [21:02<10:21,  8.31it/s]

tensor(1.4874, grad_fn=<NllLossBackward0>)
tensor(1.4591, grad_fn=<NllLossBackward0>)


 70%|███████   | 12265/17426 [21:03<10:42,  8.03it/s]

tensor(1.5421, grad_fn=<NllLossBackward0>)
tensor(1.4347, grad_fn=<NllLossBackward0>)


 70%|███████   | 12267/17426 [21:03<10:34,  8.13it/s]

tensor(1.4648, grad_fn=<NllLossBackward0>)
tensor(1.5336, grad_fn=<NllLossBackward0>)


 70%|███████   | 12269/17426 [21:03<10:18,  8.34it/s]

tensor(1.4621, grad_fn=<NllLossBackward0>)
tensor(1.4862, grad_fn=<NllLossBackward0>)


 70%|███████   | 12271/17426 [21:03<10:14,  8.39it/s]

tensor(1.4685, grad_fn=<NllLossBackward0>)
tensor(1.4436, grad_fn=<NllLossBackward0>)


 70%|███████   | 12273/17426 [21:04<10:45,  7.99it/s]

tensor(1.4661, grad_fn=<NllLossBackward0>)
tensor(1.5057, grad_fn=<NllLossBackward0>)


 70%|███████   | 12275/17426 [21:04<10:28,  8.20it/s]

tensor(1.4760, grad_fn=<NllLossBackward0>)
tensor(1.5591, grad_fn=<NllLossBackward0>)


 70%|███████   | 12277/17426 [21:04<10:26,  8.22it/s]

tensor(1.5255, grad_fn=<NllLossBackward0>)
tensor(1.4849, grad_fn=<NllLossBackward0>)


 70%|███████   | 12279/17426 [21:04<10:24,  8.24it/s]

tensor(1.5003, grad_fn=<NllLossBackward0>)
tensor(1.4999, grad_fn=<NllLossBackward0>)


 70%|███████   | 12281/17426 [21:05<10:50,  7.91it/s]

tensor(1.4909, grad_fn=<NllLossBackward0>)
tensor(1.4997, grad_fn=<NllLossBackward0>)


 70%|███████   | 12283/17426 [21:05<10:44,  7.98it/s]

tensor(1.5279, grad_fn=<NllLossBackward0>)
tensor(1.5066, grad_fn=<NllLossBackward0>)


 70%|███████   | 12285/17426 [21:05<10:42,  8.00it/s]

tensor(1.4946, grad_fn=<NllLossBackward0>)
tensor(1.5217, grad_fn=<NllLossBackward0>)


 71%|███████   | 12287/17426 [21:05<10:20,  8.28it/s]

tensor(1.5001, grad_fn=<NllLossBackward0>)
tensor(1.4968, grad_fn=<NllLossBackward0>)


 71%|███████   | 12289/17426 [21:06<10:17,  8.32it/s]

tensor(1.4869, grad_fn=<NllLossBackward0>)
tensor(1.4748, grad_fn=<NllLossBackward0>)


 71%|███████   | 12291/17426 [21:06<10:43,  7.99it/s]

tensor(1.4668, grad_fn=<NllLossBackward0>)
tensor(1.4525, grad_fn=<NllLossBackward0>)


 71%|███████   | 12293/17426 [21:06<10:27,  8.18it/s]

tensor(1.4615, grad_fn=<NllLossBackward0>)
tensor(1.4738, grad_fn=<NllLossBackward0>)


 71%|███████   | 12295/17426 [21:06<10:35,  8.08it/s]

tensor(1.4619, grad_fn=<NllLossBackward0>)
tensor(1.5124, grad_fn=<NllLossBackward0>)


 71%|███████   | 12297/17426 [21:07<10:26,  8.18it/s]

tensor(1.5380, grad_fn=<NllLossBackward0>)
tensor(1.4579, grad_fn=<NllLossBackward0>)


 71%|███████   | 12299/17426 [21:07<10:43,  7.96it/s]

tensor(1.5266, grad_fn=<NllLossBackward0>)
tensor(1.5006, grad_fn=<NllLossBackward0>)


 71%|███████   | 12301/17426 [21:07<10:26,  8.18it/s]

tensor(1.4952, grad_fn=<NllLossBackward0>)
tensor(1.5148, grad_fn=<NllLossBackward0>)


 71%|███████   | 12303/17426 [21:07<10:34,  8.07it/s]

tensor(1.4996, grad_fn=<NllLossBackward0>)
tensor(1.4332, grad_fn=<NllLossBackward0>)


 71%|███████   | 12305/17426 [21:08<10:13,  8.34it/s]

tensor(1.5020, grad_fn=<NllLossBackward0>)
tensor(1.5510, grad_fn=<NllLossBackward0>)


 71%|███████   | 12307/17426 [21:08<10:54,  7.82it/s]

tensor(1.4478, grad_fn=<NllLossBackward0>)
tensor(1.5020, grad_fn=<NllLossBackward0>)


 71%|███████   | 12309/17426 [21:08<10:26,  8.17it/s]

tensor(1.5381, grad_fn=<NllLossBackward0>)
tensor(1.5411, grad_fn=<NllLossBackward0>)


 71%|███████   | 12311/17426 [21:08<10:09,  8.39it/s]

tensor(1.4575, grad_fn=<NllLossBackward0>)
tensor(1.4934, grad_fn=<NllLossBackward0>)


 71%|███████   | 12313/17426 [21:09<10:14,  8.32it/s]

tensor(1.4761, grad_fn=<NllLossBackward0>)
tensor(1.5005, grad_fn=<NllLossBackward0>)


 71%|███████   | 12315/17426 [21:09<10:16,  8.29it/s]

tensor(1.5112, grad_fn=<NllLossBackward0>)
tensor(1.4681, grad_fn=<NllLossBackward0>)


 71%|███████   | 12317/17426 [21:09<10:42,  7.95it/s]

tensor(1.4800, grad_fn=<NllLossBackward0>)
tensor(1.5340, grad_fn=<NllLossBackward0>)


 71%|███████   | 12319/17426 [21:09<10:18,  8.26it/s]

tensor(1.4618, grad_fn=<NllLossBackward0>)
tensor(1.5400, grad_fn=<NllLossBackward0>)


 71%|███████   | 12321/17426 [21:10<10:10,  8.36it/s]

tensor(1.4750, grad_fn=<NllLossBackward0>)
tensor(1.5301, grad_fn=<NllLossBackward0>)


 71%|███████   | 12323/17426 [21:10<10:35,  8.03it/s]

tensor(1.4853, grad_fn=<NllLossBackward0>)
tensor(1.4713, grad_fn=<NllLossBackward0>)


 71%|███████   | 12325/17426 [21:10<11:29,  7.39it/s]

tensor(1.5117, grad_fn=<NllLossBackward0>)
tensor(1.4684, grad_fn=<NllLossBackward0>)


 71%|███████   | 12327/17426 [21:10<11:55,  7.13it/s]

tensor(1.4279, grad_fn=<NllLossBackward0>)
tensor(1.4490, grad_fn=<NllLossBackward0>)


 71%|███████   | 12329/17426 [21:11<11:28,  7.40it/s]

tensor(1.5022, grad_fn=<NllLossBackward0>)
tensor(1.5024, grad_fn=<NllLossBackward0>)


 71%|███████   | 12331/17426 [21:11<11:45,  7.22it/s]

tensor(1.5470, grad_fn=<NllLossBackward0>)
tensor(1.4763, grad_fn=<NllLossBackward0>)


 71%|███████   | 12333/17426 [21:11<11:25,  7.43it/s]

tensor(1.5015, grad_fn=<NllLossBackward0>)
tensor(1.4765, grad_fn=<NllLossBackward0>)


 71%|███████   | 12335/17426 [21:11<11:21,  7.47it/s]

tensor(1.4596, grad_fn=<NllLossBackward0>)
tensor(1.4602, grad_fn=<NllLossBackward0>)


 71%|███████   | 12337/17426 [21:12<12:10,  6.97it/s]

tensor(1.4736, grad_fn=<NllLossBackward0>)
tensor(1.5320, grad_fn=<NllLossBackward0>)


 71%|███████   | 12339/17426 [21:12<11:51,  7.15it/s]

tensor(1.5044, grad_fn=<NllLossBackward0>)
tensor(1.5108, grad_fn=<NllLossBackward0>)


 71%|███████   | 12341/17426 [21:12<11:55,  7.11it/s]

tensor(1.4969, grad_fn=<NllLossBackward0>)
tensor(1.4406, grad_fn=<NllLossBackward0>)


 71%|███████   | 12343/17426 [21:13<12:07,  6.99it/s]

tensor(1.4934, grad_fn=<NllLossBackward0>)
tensor(1.5110, grad_fn=<NllLossBackward0>)


 71%|███████   | 12345/17426 [21:13<12:32,  6.75it/s]

tensor(1.4603, grad_fn=<NllLossBackward0>)
tensor(1.4927, grad_fn=<NllLossBackward0>)


 71%|███████   | 12347/17426 [21:13<12:43,  6.65it/s]

tensor(1.5211, grad_fn=<NllLossBackward0>)
tensor(1.5626, grad_fn=<NllLossBackward0>)


 71%|███████   | 12349/17426 [21:14<12:33,  6.73it/s]

tensor(1.4558, grad_fn=<NllLossBackward0>)
tensor(1.5589, grad_fn=<NllLossBackward0>)


 71%|███████   | 12351/17426 [21:14<12:55,  6.55it/s]

tensor(1.4721, grad_fn=<NllLossBackward0>)
tensor(1.4532, grad_fn=<NllLossBackward0>)


 71%|███████   | 12353/17426 [21:14<12:16,  6.89it/s]

tensor(1.4894, grad_fn=<NllLossBackward0>)
tensor(1.5227, grad_fn=<NllLossBackward0>)


 71%|███████   | 12355/17426 [21:14<13:05,  6.46it/s]

tensor(1.5007, grad_fn=<NllLossBackward0>)
tensor(1.4472, grad_fn=<NllLossBackward0>)


 71%|███████   | 12357/17426 [21:15<12:39,  6.67it/s]

tensor(1.5025, grad_fn=<NllLossBackward0>)
tensor(1.5514, grad_fn=<NllLossBackward0>)


 71%|███████   | 12359/17426 [21:15<11:14,  7.51it/s]

tensor(1.4976, grad_fn=<NllLossBackward0>)
tensor(1.4495, grad_fn=<NllLossBackward0>)


 71%|███████   | 12361/17426 [21:15<10:48,  7.81it/s]

tensor(1.4875, grad_fn=<NllLossBackward0>)
tensor(1.5120, grad_fn=<NllLossBackward0>)


 71%|███████   | 12363/17426 [21:15<10:44,  7.86it/s]

tensor(1.4989, grad_fn=<NllLossBackward0>)
tensor(1.5027, grad_fn=<NllLossBackward0>)


 71%|███████   | 12365/17426 [21:16<10:26,  8.07it/s]

tensor(1.4913, grad_fn=<NllLossBackward0>)
tensor(1.5100, grad_fn=<NllLossBackward0>)


 71%|███████   | 12367/17426 [21:16<10:20,  8.15it/s]

tensor(1.4928, grad_fn=<NllLossBackward0>)
tensor(1.4955, grad_fn=<NllLossBackward0>)


 71%|███████   | 12369/17426 [21:16<10:16,  8.20it/s]

tensor(1.4788, grad_fn=<NllLossBackward0>)
tensor(1.4847, grad_fn=<NllLossBackward0>)


 71%|███████   | 12371/17426 [21:16<10:36,  7.94it/s]

tensor(1.4720, grad_fn=<NllLossBackward0>)
tensor(1.4701, grad_fn=<NllLossBackward0>)


 71%|███████   | 12373/17426 [21:17<10:25,  8.08it/s]

tensor(1.4726, grad_fn=<NllLossBackward0>)
tensor(1.4903, grad_fn=<NllLossBackward0>)


 71%|███████   | 12375/17426 [21:17<10:21,  8.12it/s]

tensor(1.5267, grad_fn=<NllLossBackward0>)
tensor(1.5551, grad_fn=<NllLossBackward0>)


 71%|███████   | 12377/17426 [21:17<10:09,  8.28it/s]

tensor(1.5026, grad_fn=<NllLossBackward0>)
tensor(1.5098, grad_fn=<NllLossBackward0>)


 71%|███████   | 12379/17426 [21:17<10:29,  8.02it/s]

tensor(1.4243, grad_fn=<NllLossBackward0>)
tensor(1.6018, grad_fn=<NllLossBackward0>)


 71%|███████   | 12381/17426 [21:18<10:15,  8.20it/s]

tensor(1.4787, grad_fn=<NllLossBackward0>)
tensor(1.5236, grad_fn=<NllLossBackward0>)


 71%|███████   | 12383/17426 [21:18<10:04,  8.34it/s]

tensor(1.4962, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)


 71%|███████   | 12385/17426 [21:18<10:05,  8.33it/s]

tensor(1.5580, grad_fn=<NllLossBackward0>)
tensor(1.4737, grad_fn=<NllLossBackward0>)


 71%|███████   | 12387/17426 [21:18<10:16,  8.17it/s]

tensor(1.4837, grad_fn=<NllLossBackward0>)
tensor(1.4717, grad_fn=<NllLossBackward0>)


 71%|███████   | 12389/17426 [21:19<10:28,  8.01it/s]

tensor(1.4988, grad_fn=<NllLossBackward0>)
tensor(1.4912, grad_fn=<NllLossBackward0>)


 71%|███████   | 12391/17426 [21:19<10:08,  8.27it/s]

tensor(1.4680, grad_fn=<NllLossBackward0>)
tensor(1.4915, grad_fn=<NllLossBackward0>)


 71%|███████   | 12393/17426 [21:19<10:08,  8.27it/s]

tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)


 71%|███████   | 12395/17426 [21:19<10:30,  7.98it/s]

tensor(1.5027, grad_fn=<NllLossBackward0>)
tensor(1.4657, grad_fn=<NllLossBackward0>)


 71%|███████   | 12397/17426 [21:20<10:33,  7.94it/s]

tensor(1.4803, grad_fn=<NllLossBackward0>)
tensor(1.5260, grad_fn=<NllLossBackward0>)


 71%|███████   | 12399/17426 [21:20<10:15,  8.17it/s]

tensor(1.5092, grad_fn=<NllLossBackward0>)
tensor(1.4730, grad_fn=<NllLossBackward0>)


 71%|███████   | 12401/17426 [21:20<10:06,  8.29it/s]

tensor(1.4470, grad_fn=<NllLossBackward0>)
tensor(1.4767, grad_fn=<NllLossBackward0>)


 71%|███████   | 12403/17426 [21:20<10:12,  8.21it/s]

tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.4927, grad_fn=<NllLossBackward0>)


 71%|███████   | 12405/17426 [21:21<10:43,  7.80it/s]

tensor(1.5119, grad_fn=<NllLossBackward0>)
tensor(1.4669, grad_fn=<NllLossBackward0>)


 71%|███████   | 12407/17426 [21:21<10:53,  7.68it/s]

tensor(1.5221, grad_fn=<NllLossBackward0>)
tensor(1.4612, grad_fn=<NllLossBackward0>)


 71%|███████   | 12409/17426 [21:21<10:26,  8.01it/s]

tensor(1.5214, grad_fn=<NllLossBackward0>)
tensor(1.4910, grad_fn=<NllLossBackward0>)


 71%|███████   | 12411/17426 [21:21<10:21,  8.07it/s]

tensor(1.4839, grad_fn=<NllLossBackward0>)
tensor(1.5043, grad_fn=<NllLossBackward0>)


 71%|███████   | 12413/17426 [21:22<10:31,  7.94it/s]

tensor(1.4683, grad_fn=<NllLossBackward0>)
tensor(1.4736, grad_fn=<NllLossBackward0>)


 71%|███████   | 12415/17426 [21:22<10:22,  8.04it/s]

tensor(1.4549, grad_fn=<NllLossBackward0>)
tensor(1.4619, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12417/17426 [21:22<10:24,  8.02it/s]

tensor(1.4494, grad_fn=<NllLossBackward0>)
tensor(1.4464, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12419/17426 [21:22<10:14,  8.14it/s]

tensor(1.5150, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12421/17426 [21:23<10:04,  8.28it/s]

tensor(1.5237, grad_fn=<NllLossBackward0>)
tensor(1.5246, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12423/17426 [21:23<10:18,  8.09it/s]

tensor(1.5102, grad_fn=<NllLossBackward0>)
tensor(1.4845, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12425/17426 [21:23<10:07,  8.23it/s]

tensor(1.4709, grad_fn=<NllLossBackward0>)
tensor(1.4990, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12427/17426 [21:23<09:59,  8.33it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5604, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12429/17426 [21:24<09:58,  8.34it/s]

tensor(1.4634, grad_fn=<NllLossBackward0>)
tensor(1.4975, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12431/17426 [21:24<10:08,  8.21it/s]

tensor(1.5355, grad_fn=<NllLossBackward0>)
tensor(1.4961, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12433/17426 [21:24<09:53,  8.41it/s]

tensor(1.4628, grad_fn=<NllLossBackward0>)
tensor(1.4461, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12435/17426 [21:24<09:56,  8.37it/s]

tensor(1.4758, grad_fn=<NllLossBackward0>)
tensor(1.4883, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12437/17426 [21:25<09:57,  8.35it/s]

tensor(1.4683, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12439/17426 [21:25<10:46,  7.71it/s]

tensor(1.4902, grad_fn=<NllLossBackward0>)
tensor(1.4488, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12441/17426 [21:25<11:33,  7.19it/s]

tensor(1.4912, grad_fn=<NllLossBackward0>)
tensor(1.4412, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12443/17426 [21:25<11:43,  7.08it/s]

tensor(1.4765, grad_fn=<NllLossBackward0>)
tensor(1.4567, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12445/17426 [21:26<10:53,  7.62it/s]

tensor(1.5155, grad_fn=<NllLossBackward0>)
tensor(1.4742, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12447/17426 [21:26<11:33,  7.18it/s]

tensor(1.5024, grad_fn=<NllLossBackward0>)
tensor(1.4525, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12449/17426 [21:26<11:37,  7.14it/s]

tensor(1.5014, grad_fn=<NllLossBackward0>)
tensor(1.4760, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12451/17426 [21:27<11:45,  7.06it/s]

tensor(1.5096, grad_fn=<NllLossBackward0>)
tensor(1.4712, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12453/17426 [21:27<11:50,  7.00it/s]

tensor(1.5136, grad_fn=<NllLossBackward0>)
tensor(1.4998, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12455/17426 [21:27<11:47,  7.02it/s]

tensor(1.5422, grad_fn=<NllLossBackward0>)
tensor(1.4966, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12457/17426 [21:27<11:38,  7.11it/s]

tensor(1.4761, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)


 71%|███████▏  | 12459/17426 [21:28<11:27,  7.22it/s]

tensor(1.4948, grad_fn=<NllLossBackward0>)
tensor(1.5293, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12461/17426 [21:28<12:06,  6.83it/s]

tensor(1.4788, grad_fn=<NllLossBackward0>)
tensor(1.4792, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12463/17426 [21:28<12:00,  6.89it/s]

tensor(1.4792, grad_fn=<NllLossBackward0>)
tensor(1.4713, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12465/17426 [21:29<13:01,  6.35it/s]

tensor(1.4822, grad_fn=<NllLossBackward0>)
tensor(1.4782, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12467/17426 [21:29<12:30,  6.61it/s]

tensor(1.5206, grad_fn=<NllLossBackward0>)
tensor(1.5119, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12469/17426 [21:29<13:06,  6.30it/s]

tensor(1.5888, grad_fn=<NllLossBackward0>)
tensor(1.5432, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12471/17426 [21:29<12:04,  6.84it/s]

tensor(1.5224, grad_fn=<NllLossBackward0>)
tensor(1.4871, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12473/17426 [21:30<11:03,  7.46it/s]

tensor(1.4936, grad_fn=<NllLossBackward0>)
tensor(1.4781, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12475/17426 [21:30<10:48,  7.63it/s]

tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.5067, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12477/17426 [21:30<10:39,  7.74it/s]

tensor(1.4812, grad_fn=<NllLossBackward0>)
tensor(1.5264, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12479/17426 [21:31<10:21,  7.96it/s]

tensor(1.4799, grad_fn=<NllLossBackward0>)
tensor(1.4711, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12481/17426 [21:31<10:19,  7.99it/s]

tensor(1.5261, grad_fn=<NllLossBackward0>)
tensor(1.4925, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12483/17426 [21:31<10:11,  8.09it/s]

tensor(1.5003, grad_fn=<NllLossBackward0>)
tensor(1.4784, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12485/17426 [21:31<10:14,  8.04it/s]

tensor(1.4458, grad_fn=<NllLossBackward0>)
tensor(1.5050, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12487/17426 [21:31<09:58,  8.25it/s]

tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.4733, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12489/17426 [21:32<10:08,  8.12it/s]

tensor(1.5201, grad_fn=<NllLossBackward0>)
tensor(1.4977, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12491/17426 [21:32<09:50,  8.35it/s]

tensor(1.5225, grad_fn=<NllLossBackward0>)
tensor(1.4850, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12493/17426 [21:32<09:48,  8.39it/s]

tensor(1.4861, grad_fn=<NllLossBackward0>)
tensor(1.5095, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12495/17426 [21:32<10:12,  8.05it/s]

tensor(1.4823, grad_fn=<NllLossBackward0>)
tensor(1.4859, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12497/17426 [21:33<09:57,  8.24it/s]

tensor(1.4694, grad_fn=<NllLossBackward0>)
tensor(1.4996, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12499/17426 [21:33<09:48,  8.38it/s]

tensor(1.4746, grad_fn=<NllLossBackward0>)
tensor(1.4644, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12501/17426 [21:33<09:42,  8.46it/s]

tensor(1.4809, grad_fn=<NllLossBackward0>)
tensor(1.5135, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12503/17426 [21:33<10:17,  7.98it/s]

tensor(1.5428, grad_fn=<NllLossBackward0>)
tensor(1.5026, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12505/17426 [21:34<10:10,  8.06it/s]

tensor(1.5127, grad_fn=<NllLossBackward0>)
tensor(1.4483, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12507/17426 [21:34<09:53,  8.29it/s]

tensor(1.4859, grad_fn=<NllLossBackward0>)
tensor(1.5138, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12509/17426 [21:34<09:40,  8.47it/s]

tensor(1.4866, grad_fn=<NllLossBackward0>)
tensor(1.4633, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12511/17426 [21:34<09:46,  8.38it/s]

tensor(1.4706, grad_fn=<NllLossBackward0>)
tensor(1.4926, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12513/17426 [21:35<10:10,  8.04it/s]

tensor(1.4963, grad_fn=<NllLossBackward0>)
tensor(1.5157, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12515/17426 [21:35<09:53,  8.27it/s]

tensor(1.5123, grad_fn=<NllLossBackward0>)
tensor(1.4918, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12517/17426 [21:35<09:51,  8.29it/s]

tensor(1.4910, grad_fn=<NllLossBackward0>)
tensor(1.5264, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12519/17426 [21:35<09:47,  8.35it/s]

tensor(1.4878, grad_fn=<NllLossBackward0>)
tensor(1.4822, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12521/17426 [21:36<10:09,  8.05it/s]

tensor(1.4719, grad_fn=<NllLossBackward0>)
tensor(1.5221, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12523/17426 [21:36<10:00,  8.17it/s]

tensor(1.4752, grad_fn=<NllLossBackward0>)
tensor(1.5007, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12525/17426 [21:36<09:50,  8.31it/s]

tensor(1.4687, grad_fn=<NllLossBackward0>)
tensor(1.5305, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12527/17426 [21:36<09:29,  8.60it/s]

tensor(1.4775, grad_fn=<NllLossBackward0>)
tensor(1.4945, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12529/17426 [21:37<10:01,  8.14it/s]

tensor(1.4605, grad_fn=<NllLossBackward0>)
tensor(1.4698, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12531/17426 [21:37<10:05,  8.08it/s]

tensor(1.5094, grad_fn=<NllLossBackward0>)
tensor(1.4856, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12533/17426 [21:37<10:10,  8.02it/s]

tensor(1.5310, grad_fn=<NllLossBackward0>)
tensor(1.4672, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12535/17426 [21:37<10:16,  7.93it/s]

tensor(1.4741, grad_fn=<NllLossBackward0>)
tensor(1.4798, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12537/17426 [21:38<10:22,  7.85it/s]

tensor(1.5072, grad_fn=<NllLossBackward0>)
tensor(1.5091, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12539/17426 [21:38<10:22,  7.85it/s]

tensor(1.4997, grad_fn=<NllLossBackward0>)
tensor(1.4592, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12541/17426 [21:38<10:11,  7.99it/s]

tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.5025, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12543/17426 [21:38<10:00,  8.13it/s]

tensor(1.5076, grad_fn=<NllLossBackward0>)
tensor(1.4938, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12545/17426 [21:39<09:53,  8.23it/s]

tensor(1.5081, grad_fn=<NllLossBackward0>)
tensor(1.4753, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12547/17426 [21:39<10:14,  7.94it/s]

tensor(1.4659, grad_fn=<NllLossBackward0>)
tensor(1.4932, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12549/17426 [21:39<10:16,  7.91it/s]

tensor(1.4962, grad_fn=<NllLossBackward0>)
tensor(1.4750, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12551/17426 [21:39<09:52,  8.22it/s]

tensor(1.4920, grad_fn=<NllLossBackward0>)
tensor(1.4828, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12553/17426 [21:40<10:46,  7.53it/s]

tensor(1.4635, grad_fn=<NllLossBackward0>)
tensor(1.4371, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12555/17426 [21:40<11:36,  6.99it/s]

tensor(1.4976, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12557/17426 [21:40<11:59,  6.77it/s]

tensor(1.5252, grad_fn=<NllLossBackward0>)
tensor(1.5041, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12559/17426 [21:41<11:44,  6.91it/s]

tensor(1.5002, grad_fn=<NllLossBackward0>)
tensor(1.4939, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12561/17426 [21:41<11:28,  7.07it/s]

tensor(1.5465, grad_fn=<NllLossBackward0>)
tensor(1.5025, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12563/17426 [21:41<11:09,  7.26it/s]

tensor(1.4800, grad_fn=<NllLossBackward0>)
tensor(1.4784, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12565/17426 [21:41<11:13,  7.22it/s]

tensor(1.4945, grad_fn=<NllLossBackward0>)
tensor(1.5164, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12567/17426 [21:42<10:19,  7.85it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5318, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12569/17426 [21:42<09:58,  8.11it/s]

tensor(1.5174, grad_fn=<NllLossBackward0>)
tensor(1.5045, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12571/17426 [21:42<10:12,  7.93it/s]

tensor(1.5214, grad_fn=<NllLossBackward0>)
tensor(1.4816, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12573/17426 [21:42<10:29,  7.71it/s]

tensor(1.4720, grad_fn=<NllLossBackward0>)
tensor(1.4758, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12575/17426 [21:43<11:04,  7.30it/s]

tensor(1.4637, grad_fn=<NllLossBackward0>)
tensor(1.4862, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12577/17426 [21:43<11:14,  7.19it/s]

tensor(1.5115, grad_fn=<NllLossBackward0>)
tensor(1.4871, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12579/17426 [21:43<11:28,  7.04it/s]

tensor(1.4970, grad_fn=<NllLossBackward0>)
tensor(1.4628, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12581/17426 [21:44<11:38,  6.94it/s]

tensor(1.5520, grad_fn=<NllLossBackward0>)
tensor(1.5003, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12583/17426 [21:44<11:28,  7.04it/s]

tensor(1.4994, grad_fn=<NllLossBackward0>)
tensor(1.4983, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12585/17426 [21:44<11:27,  7.04it/s]

tensor(1.5010, grad_fn=<NllLossBackward0>)
tensor(1.5051, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12587/17426 [21:44<11:31,  7.00it/s]

tensor(1.5238, grad_fn=<NllLossBackward0>)
tensor(1.4818, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12589/17426 [21:45<10:44,  7.50it/s]

tensor(1.5289, grad_fn=<NllLossBackward0>)
tensor(1.4912, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12591/17426 [21:45<10:06,  7.98it/s]

tensor(1.4921, grad_fn=<NllLossBackward0>)
tensor(1.5031, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12593/17426 [21:45<09:52,  8.16it/s]

tensor(1.4435, grad_fn=<NllLossBackward0>)
tensor(1.5093, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12595/17426 [21:45<09:57,  8.09it/s]

tensor(1.5070, grad_fn=<NllLossBackward0>)
tensor(1.4674, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12597/17426 [21:46<09:54,  8.12it/s]

tensor(1.4906, grad_fn=<NllLossBackward0>)
tensor(1.5007, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12599/17426 [21:46<09:51,  8.16it/s]

tensor(1.5080, grad_fn=<NllLossBackward0>)
tensor(1.4938, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12601/17426 [21:46<09:37,  8.36it/s]

tensor(1.4930, grad_fn=<NllLossBackward0>)
tensor(1.4967, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12603/17426 [21:46<09:53,  8.13it/s]

tensor(1.4915, grad_fn=<NllLossBackward0>)
tensor(1.4593, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12605/17426 [21:47<09:47,  8.20it/s]

tensor(1.5202, grad_fn=<NllLossBackward0>)
tensor(1.5171, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12607/17426 [21:47<09:44,  8.25it/s]

tensor(1.4568, grad_fn=<NllLossBackward0>)
tensor(1.4857, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12609/17426 [21:47<09:30,  8.44it/s]

tensor(1.4991, grad_fn=<NllLossBackward0>)
tensor(1.4974, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12611/17426 [21:47<09:24,  8.52it/s]

tensor(1.5198, grad_fn=<NllLossBackward0>)
tensor(1.4740, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12613/17426 [21:48<09:37,  8.34it/s]

tensor(1.4683, grad_fn=<NllLossBackward0>)
tensor(1.4752, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12615/17426 [21:48<09:33,  8.39it/s]

tensor(1.4618, grad_fn=<NllLossBackward0>)
tensor(1.4984, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12617/17426 [21:48<09:31,  8.41it/s]

tensor(1.4847, grad_fn=<NllLossBackward0>)
tensor(1.5253, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12619/17426 [21:48<09:33,  8.38it/s]

tensor(1.5234, grad_fn=<NllLossBackward0>)
tensor(1.4632, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12621/17426 [21:48<10:09,  7.88it/s]

tensor(1.5427, grad_fn=<NllLossBackward0>)
tensor(1.4803, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12623/17426 [21:49<10:07,  7.91it/s]

tensor(1.4704, grad_fn=<NllLossBackward0>)
tensor(1.5234, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12625/17426 [21:49<09:51,  8.12it/s]

tensor(1.4366, grad_fn=<NllLossBackward0>)
tensor(1.4710, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12627/17426 [21:49<09:44,  8.21it/s]

tensor(1.5142, grad_fn=<NllLossBackward0>)
tensor(1.5048, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12629/17426 [21:49<09:27,  8.46it/s]

tensor(1.5194, grad_fn=<NllLossBackward0>)
tensor(1.4605, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12631/17426 [21:50<09:50,  8.12it/s]

tensor(1.4960, grad_fn=<NllLossBackward0>)
tensor(1.5099, grad_fn=<NllLossBackward0>)


 72%|███████▏  | 12633/17426 [21:50<09:33,  8.36it/s]

tensor(1.4911, grad_fn=<NllLossBackward0>)
tensor(1.4602, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12635/17426 [21:50<09:40,  8.25it/s]

tensor(1.4872, grad_fn=<NllLossBackward0>)
tensor(1.4735, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12637/17426 [21:50<09:33,  8.35it/s]

tensor(1.5004, grad_fn=<NllLossBackward0>)
tensor(1.5013, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12639/17426 [21:51<10:07,  7.88it/s]

tensor(1.5255, grad_fn=<NllLossBackward0>)
tensor(1.4937, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12641/17426 [21:51<09:57,  8.00it/s]

tensor(1.4691, grad_fn=<NllLossBackward0>)
tensor(1.5036, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12643/17426 [21:51<09:46,  8.15it/s]

tensor(1.5241, grad_fn=<NllLossBackward0>)
tensor(1.4979, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12645/17426 [21:51<09:51,  8.08it/s]

tensor(1.5012, grad_fn=<NllLossBackward0>)
tensor(1.4488, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12647/17426 [21:52<09:47,  8.14it/s]

tensor(1.4875, grad_fn=<NllLossBackward0>)
tensor(1.4796, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12649/17426 [21:52<09:42,  8.19it/s]

tensor(1.5019, grad_fn=<NllLossBackward0>)
tensor(1.4704, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12651/17426 [21:52<09:46,  8.14it/s]

tensor(1.4667, grad_fn=<NllLossBackward0>)
tensor(1.4780, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12653/17426 [21:52<09:51,  8.07it/s]

tensor(1.4381, grad_fn=<NllLossBackward0>)
tensor(1.4899, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12655/17426 [21:53<09:49,  8.09it/s]

tensor(1.4873, grad_fn=<NllLossBackward0>)
tensor(1.5490, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12657/17426 [21:53<09:46,  8.13it/s]

tensor(1.5453, grad_fn=<NllLossBackward0>)
tensor(1.4885, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12659/17426 [21:53<09:31,  8.34it/s]

tensor(1.4592, grad_fn=<NllLossBackward0>)
tensor(1.4757, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12661/17426 [21:53<09:31,  8.34it/s]

tensor(1.5137, grad_fn=<NllLossBackward0>)
tensor(1.4447, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12663/17426 [21:54<09:31,  8.34it/s]

tensor(1.4494, grad_fn=<NllLossBackward0>)
tensor(1.4770, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12665/17426 [21:54<09:56,  7.98it/s]

tensor(1.4997, grad_fn=<NllLossBackward0>)
tensor(1.4856, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12667/17426 [21:54<09:41,  8.18it/s]

tensor(1.4766, grad_fn=<NllLossBackward0>)
tensor(1.4918, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12669/17426 [21:54<09:57,  7.96it/s]

tensor(1.4776, grad_fn=<NllLossBackward0>)
tensor(1.5079, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12671/17426 [21:55<10:16,  7.72it/s]

tensor(1.4962, grad_fn=<NllLossBackward0>)
tensor(1.4840, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12673/17426 [21:55<11:15,  7.04it/s]

tensor(1.5162, grad_fn=<NllLossBackward0>)
tensor(1.4937, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12675/17426 [21:55<11:03,  7.16it/s]

tensor(1.4703, grad_fn=<NllLossBackward0>)
tensor(1.5175, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12677/17426 [21:55<10:29,  7.55it/s]

tensor(1.5033, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12679/17426 [21:56<10:16,  7.70it/s]

tensor(1.4572, grad_fn=<NllLossBackward0>)
tensor(1.4898, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12681/17426 [21:56<10:27,  7.57it/s]

tensor(1.4404, grad_fn=<NllLossBackward0>)
tensor(1.4252, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12683/17426 [21:56<10:07,  7.81it/s]

tensor(1.4862, grad_fn=<NllLossBackward0>)
tensor(1.4983, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12685/17426 [21:57<09:42,  8.15it/s]

tensor(1.4338, grad_fn=<NllLossBackward0>)
tensor(1.4809, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12687/17426 [21:57<10:19,  7.65it/s]

tensor(1.4494, grad_fn=<NllLossBackward0>)
tensor(1.5172, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12689/17426 [21:57<11:17,  6.99it/s]

tensor(1.5103, grad_fn=<NllLossBackward0>)
tensor(1.4612, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12691/17426 [21:57<11:38,  6.77it/s]

tensor(1.4886, grad_fn=<NllLossBackward0>)
tensor(1.5303, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12693/17426 [21:58<12:04,  6.53it/s]

tensor(1.4794, grad_fn=<NllLossBackward0>)
tensor(1.4593, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12695/17426 [21:58<11:46,  6.69it/s]

tensor(1.4974, grad_fn=<NllLossBackward0>)
tensor(1.5276, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12697/17426 [21:58<12:22,  6.37it/s]

tensor(1.4994, grad_fn=<NllLossBackward0>)
tensor(1.5376, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12699/17426 [21:59<12:07,  6.49it/s]

tensor(1.4659, grad_fn=<NllLossBackward0>)
tensor(1.4623, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12701/17426 [21:59<12:01,  6.55it/s]

tensor(1.5067, grad_fn=<NllLossBackward0>)
tensor(1.4731, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12703/17426 [21:59<11:17,  6.97it/s]

tensor(1.5046, grad_fn=<NllLossBackward0>)
tensor(1.5270, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12705/17426 [21:59<10:14,  7.68it/s]

tensor(1.4551, grad_fn=<NllLossBackward0>)
tensor(1.4882, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12707/17426 [22:00<09:50,  7.99it/s]

tensor(1.5242, grad_fn=<NllLossBackward0>)
tensor(1.4654, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12709/17426 [22:00<09:51,  7.97it/s]

tensor(1.4875, grad_fn=<NllLossBackward0>)
tensor(1.4578, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12711/17426 [22:00<09:34,  8.21it/s]

tensor(1.5226, grad_fn=<NllLossBackward0>)
tensor(1.4573, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12713/17426 [22:00<09:35,  8.19it/s]

tensor(1.4679, grad_fn=<NllLossBackward0>)
tensor(1.5280, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12715/17426 [22:01<09:41,  8.10it/s]

tensor(1.4652, grad_fn=<NllLossBackward0>)
tensor(1.5132, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12717/17426 [22:01<09:40,  8.11it/s]

tensor(1.5082, grad_fn=<NllLossBackward0>)
tensor(1.4897, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12719/17426 [22:01<09:24,  8.33it/s]

tensor(1.5035, grad_fn=<NllLossBackward0>)
tensor(1.4534, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12721/17426 [22:01<09:51,  7.95it/s]

tensor(1.5049, grad_fn=<NllLossBackward0>)
tensor(1.4298, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12723/17426 [22:02<09:49,  7.97it/s]

tensor(1.5098, grad_fn=<NllLossBackward0>)
tensor(1.4550, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12725/17426 [22:02<09:36,  8.15it/s]

tensor(1.5306, grad_fn=<NllLossBackward0>)
tensor(1.5101, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12727/17426 [22:02<09:34,  8.18it/s]

tensor(1.4717, grad_fn=<NllLossBackward0>)
tensor(1.4485, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12729/17426 [22:02<10:01,  7.81it/s]

tensor(1.4768, grad_fn=<NllLossBackward0>)
tensor(1.5186, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12731/17426 [22:03<09:47,  8.00it/s]

tensor(1.5001, grad_fn=<NllLossBackward0>)
tensor(1.5019, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12733/17426 [22:03<09:36,  8.14it/s]

tensor(1.5188, grad_fn=<NllLossBackward0>)
tensor(1.4990, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12735/17426 [22:03<09:32,  8.19it/s]

tensor(1.4677, grad_fn=<NllLossBackward0>)
tensor(1.4581, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12737/17426 [22:03<09:24,  8.31it/s]

tensor(1.4832, grad_fn=<NllLossBackward0>)
tensor(1.4885, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12739/17426 [22:04<09:38,  8.10it/s]

tensor(1.5317, grad_fn=<NllLossBackward0>)
tensor(1.5176, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12741/17426 [22:04<09:26,  8.27it/s]

tensor(1.4834, grad_fn=<NllLossBackward0>)
tensor(1.5253, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12743/17426 [22:04<09:21,  8.34it/s]

tensor(1.4690, grad_fn=<NllLossBackward0>)
tensor(1.5016, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12745/17426 [22:04<09:36,  8.12it/s]

tensor(1.4560, grad_fn=<NllLossBackward0>)
tensor(1.4930, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12747/17426 [22:05<10:07,  7.70it/s]

tensor(1.5307, grad_fn=<NllLossBackward0>)
tensor(1.4926, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12749/17426 [22:05<09:35,  8.12it/s]

tensor(1.5318, grad_fn=<NllLossBackward0>)
tensor(1.4705, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12751/17426 [22:05<09:34,  8.14it/s]

tensor(1.4624, grad_fn=<NllLossBackward0>)
tensor(1.5599, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12753/17426 [22:05<09:30,  8.20it/s]

tensor(1.5421, grad_fn=<NllLossBackward0>)
tensor(1.5347, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12755/17426 [22:06<09:45,  7.98it/s]

tensor(1.4741, grad_fn=<NllLossBackward0>)
tensor(1.4747, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12757/17426 [22:06<09:28,  8.22it/s]

tensor(1.4488, grad_fn=<NllLossBackward0>)
tensor(1.4605, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12759/17426 [22:06<09:18,  8.35it/s]

tensor(1.4953, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12761/17426 [22:06<09:11,  8.46it/s]

tensor(1.4366, grad_fn=<NllLossBackward0>)
tensor(1.5418, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12763/17426 [22:07<09:12,  8.45it/s]

tensor(1.4953, grad_fn=<NllLossBackward0>)
tensor(1.4690, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12765/17426 [22:07<09:47,  7.93it/s]

tensor(1.4775, grad_fn=<NllLossBackward0>)
tensor(1.4994, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12767/17426 [22:07<09:28,  8.20it/s]

tensor(1.5194, grad_fn=<NllLossBackward0>)
tensor(1.4520, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12769/17426 [22:07<09:20,  8.31it/s]

tensor(1.5174, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12771/17426 [22:08<09:15,  8.38it/s]

tensor(1.5031, grad_fn=<NllLossBackward0>)
tensor(1.5062, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12773/17426 [22:08<09:43,  7.97it/s]

tensor(1.4468, grad_fn=<NllLossBackward0>)
tensor(1.4446, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12775/17426 [22:08<09:26,  8.20it/s]

tensor(1.4710, grad_fn=<NllLossBackward0>)
tensor(1.4791, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12777/17426 [22:08<09:24,  8.24it/s]

tensor(1.4575, grad_fn=<NllLossBackward0>)
tensor(1.4790, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12779/17426 [22:09<09:23,  8.25it/s]

tensor(1.5274, grad_fn=<NllLossBackward0>)
tensor(1.4807, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12781/17426 [22:09<09:55,  7.80it/s]

tensor(1.5001, grad_fn=<NllLossBackward0>)
tensor(1.4998, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12783/17426 [22:09<09:47,  7.91it/s]

tensor(1.4426, grad_fn=<NllLossBackward0>)
tensor(1.4998, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12785/17426 [22:09<10:37,  7.28it/s]

tensor(1.4872, grad_fn=<NllLossBackward0>)
tensor(1.5141, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12787/17426 [22:10<11:08,  6.94it/s]

tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.5092, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12789/17426 [22:10<11:06,  6.96it/s]

tensor(1.4849, grad_fn=<NllLossBackward0>)
tensor(1.4710, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12791/17426 [22:10<11:12,  6.89it/s]

tensor(1.5165, grad_fn=<NllLossBackward0>)
tensor(1.4807, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12793/17426 [22:10<10:24,  7.42it/s]

tensor(1.5258, grad_fn=<NllLossBackward0>)
tensor(1.5024, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12795/17426 [22:11<10:03,  7.67it/s]

tensor(1.5337, grad_fn=<NllLossBackward0>)
tensor(1.4627, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12797/17426 [22:11<10:02,  7.68it/s]

tensor(1.5192, grad_fn=<NllLossBackward0>)
tensor(1.5034, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12799/17426 [22:11<10:00,  7.71it/s]

tensor(1.4405, grad_fn=<NllLossBackward0>)
tensor(1.5466, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12801/17426 [22:11<09:21,  8.24it/s]

tensor(1.4813, grad_fn=<NllLossBackward0>)
tensor(1.5023, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12803/17426 [22:12<09:13,  8.35it/s]

tensor(1.4633, grad_fn=<NllLossBackward0>)
tensor(1.4680, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12805/17426 [22:12<09:24,  8.19it/s]

tensor(1.4733, grad_fn=<NllLossBackward0>)
tensor(1.5316, grad_fn=<NllLossBackward0>)


 73%|███████▎  | 12807/17426 [22:12<10:04,  7.64it/s]

tensor(1.4698, grad_fn=<NllLossBackward0>)
tensor(1.4860, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12809/17426 [22:13<10:46,  7.14it/s]

tensor(1.4917, grad_fn=<NllLossBackward0>)
tensor(1.4879, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12811/17426 [22:13<10:55,  7.05it/s]

tensor(1.5431, grad_fn=<NllLossBackward0>)
tensor(1.4713, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12813/17426 [22:13<10:43,  7.17it/s]

tensor(1.4967, grad_fn=<NllLossBackward0>)
tensor(1.4820, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12815/17426 [22:13<11:32,  6.66it/s]

tensor(1.4730, grad_fn=<NllLossBackward0>)
tensor(1.4493, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12817/17426 [22:14<11:23,  6.74it/s]

tensor(1.4932, grad_fn=<NllLossBackward0>)
tensor(1.4689, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12819/17426 [22:14<11:24,  6.73it/s]

tensor(1.5043, grad_fn=<NllLossBackward0>)
tensor(1.4560, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12821/17426 [22:14<10:45,  7.13it/s]

tensor(1.5425, grad_fn=<NllLossBackward0>)
tensor(1.4766, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12823/17426 [22:15<09:46,  7.85it/s]

tensor(1.4776, grad_fn=<NllLossBackward0>)
tensor(1.4920, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12825/17426 [22:15<09:29,  8.07it/s]

tensor(1.4700, grad_fn=<NllLossBackward0>)
tensor(1.4931, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12827/17426 [22:15<09:21,  8.20it/s]

tensor(1.5284, grad_fn=<NllLossBackward0>)
tensor(1.4428, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12829/17426 [22:15<09:28,  8.08it/s]

tensor(1.4300, grad_fn=<NllLossBackward0>)
tensor(1.5540, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12831/17426 [22:16<09:46,  7.83it/s]

tensor(1.5246, grad_fn=<NllLossBackward0>)
tensor(1.4942, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12833/17426 [22:16<09:34,  8.00it/s]

tensor(1.4820, grad_fn=<NllLossBackward0>)
tensor(1.4760, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12835/17426 [22:16<09:28,  8.07it/s]

tensor(1.4674, grad_fn=<NllLossBackward0>)
tensor(1.4640, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12837/17426 [22:16<09:19,  8.21it/s]

tensor(1.4646, grad_fn=<NllLossBackward0>)
tensor(1.4466, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12839/17426 [22:17<09:28,  8.06it/s]

tensor(1.5140, grad_fn=<NllLossBackward0>)
tensor(1.4970, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12841/17426 [22:17<09:20,  8.18it/s]

tensor(1.4618, grad_fn=<NllLossBackward0>)
tensor(1.4684, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12843/17426 [22:17<09:17,  8.21it/s]

tensor(1.4941, grad_fn=<NllLossBackward0>)
tensor(1.5833, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12845/17426 [22:17<09:17,  8.22it/s]

tensor(1.4659, grad_fn=<NllLossBackward0>)
tensor(1.5109, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12847/17426 [22:18<09:48,  7.78it/s]

tensor(1.5114, grad_fn=<NllLossBackward0>)
tensor(1.4789, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12849/17426 [22:18<09:18,  8.19it/s]

tensor(1.5091, grad_fn=<NllLossBackward0>)
tensor(1.4842, grad_fn=<NllLossBackward0>)


 74%|███████▎  | 12851/17426 [22:18<09:05,  8.38it/s]

tensor(1.4627, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12853/17426 [22:18<09:01,  8.44it/s]

tensor(1.4663, grad_fn=<NllLossBackward0>)
tensor(1.5190, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12855/17426 [22:18<08:55,  8.53it/s]

tensor(1.4560, grad_fn=<NllLossBackward0>)
tensor(1.4698, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12857/17426 [22:19<09:28,  8.04it/s]

tensor(1.5239, grad_fn=<NllLossBackward0>)
tensor(1.5111, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12859/17426 [22:19<09:21,  8.14it/s]

tensor(1.4645, grad_fn=<NllLossBackward0>)
tensor(1.4832, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12861/17426 [22:19<09:29,  8.02it/s]

tensor(1.5067, grad_fn=<NllLossBackward0>)
tensor(1.4864, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12863/17426 [22:19<09:16,  8.21it/s]

tensor(1.4865, grad_fn=<NllLossBackward0>)
tensor(1.4910, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12865/17426 [22:20<09:43,  7.82it/s]

tensor(1.5034, grad_fn=<NllLossBackward0>)
tensor(1.4296, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12867/17426 [22:20<09:27,  8.03it/s]

tensor(1.4470, grad_fn=<NllLossBackward0>)
tensor(1.4727, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12869/17426 [22:20<09:12,  8.25it/s]

tensor(1.4810, grad_fn=<NllLossBackward0>)
tensor(1.4459, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12871/17426 [22:20<09:24,  8.06it/s]

tensor(1.5147, grad_fn=<NllLossBackward0>)
tensor(1.4662, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12873/17426 [22:21<09:50,  7.71it/s]

tensor(1.4817, grad_fn=<NllLossBackward0>)
tensor(1.4889, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12875/17426 [22:21<09:31,  7.96it/s]

tensor(1.4494, grad_fn=<NllLossBackward0>)
tensor(1.4744, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12877/17426 [22:21<09:18,  8.15it/s]

tensor(1.4493, grad_fn=<NllLossBackward0>)
tensor(1.4312, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12879/17426 [22:21<08:57,  8.45it/s]

tensor(1.4682, grad_fn=<NllLossBackward0>)
tensor(1.4994, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12881/17426 [22:22<09:06,  8.32it/s]

tensor(1.4796, grad_fn=<NllLossBackward0>)
tensor(1.4999, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12883/17426 [22:22<09:32,  7.94it/s]

tensor(1.4642, grad_fn=<NllLossBackward0>)
tensor(1.5072, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12885/17426 [22:22<09:17,  8.14it/s]

tensor(1.4011, grad_fn=<NllLossBackward0>)
tensor(1.5003, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12887/17426 [22:22<09:10,  8.24it/s]

tensor(1.5079, grad_fn=<NllLossBackward0>)
tensor(1.4916, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12889/17426 [22:23<09:19,  8.11it/s]

tensor(1.5378, grad_fn=<NllLossBackward0>)
tensor(1.5074, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12891/17426 [22:23<09:35,  7.88it/s]

tensor(1.4827, grad_fn=<NllLossBackward0>)
tensor(1.4581, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12893/17426 [22:23<09:36,  7.87it/s]

tensor(1.4800, grad_fn=<NllLossBackward0>)
tensor(1.5025, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12895/17426 [22:23<09:05,  8.31it/s]

tensor(1.4730, grad_fn=<NllLossBackward0>)
tensor(1.4725, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12897/17426 [22:24<09:17,  8.12it/s]

tensor(1.5133, grad_fn=<NllLossBackward0>)
tensor(1.4674, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12899/17426 [22:24<09:41,  7.78it/s]

tensor(1.4706, grad_fn=<NllLossBackward0>)
tensor(1.4934, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12901/17426 [22:24<09:57,  7.58it/s]

tensor(1.5029, grad_fn=<NllLossBackward0>)
tensor(1.4848, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12903/17426 [22:24<09:48,  7.68it/s]

tensor(1.5228, grad_fn=<NllLossBackward0>)
tensor(1.4738, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12905/17426 [22:25<10:43,  7.02it/s]

tensor(1.5240, grad_fn=<NllLossBackward0>)
tensor(1.4764, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12907/17426 [22:25<11:27,  6.58it/s]

tensor(1.4135, grad_fn=<NllLossBackward0>)
tensor(1.5320, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12909/17426 [22:25<10:44,  7.01it/s]

tensor(1.4776, grad_fn=<NllLossBackward0>)
tensor(1.4772, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12911/17426 [22:26<09:42,  7.76it/s]

tensor(1.5285, grad_fn=<NllLossBackward0>)
tensor(1.4795, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12913/17426 [22:26<09:09,  8.21it/s]

tensor(1.4474, grad_fn=<NllLossBackward0>)
tensor(1.4955, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12915/17426 [22:26<09:28,  7.93it/s]

tensor(1.4941, grad_fn=<NllLossBackward0>)
tensor(1.4535, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12917/17426 [22:26<10:39,  7.05it/s]

tensor(1.4969, grad_fn=<NllLossBackward0>)
tensor(1.5021, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12919/17426 [22:27<10:54,  6.89it/s]

tensor(1.4915, grad_fn=<NllLossBackward0>)
tensor(1.5383, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12921/17426 [22:27<10:52,  6.90it/s]

tensor(1.4788, grad_fn=<NllLossBackward0>)
tensor(1.4845, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12923/17426 [22:27<10:59,  6.83it/s]

tensor(1.4624, grad_fn=<NllLossBackward0>)
tensor(1.4972, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12925/17426 [22:28<11:29,  6.53it/s]

tensor(1.5457, grad_fn=<NllLossBackward0>)
tensor(1.4853, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12927/17426 [22:28<10:50,  6.91it/s]

tensor(1.4847, grad_fn=<NllLossBackward0>)
tensor(1.5169, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12929/17426 [22:28<10:20,  7.25it/s]

tensor(1.4561, grad_fn=<NllLossBackward0>)
tensor(1.5090, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12931/17426 [22:28<11:13,  6.68it/s]

tensor(1.4483, grad_fn=<NllLossBackward0>)
tensor(1.5005, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12933/17426 [22:29<10:57,  6.83it/s]

tensor(1.4651, grad_fn=<NllLossBackward0>)
tensor(1.4969, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12935/17426 [22:29<10:40,  7.01it/s]

tensor(1.4840, grad_fn=<NllLossBackward0>)
tensor(1.5143, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12937/17426 [22:29<10:56,  6.83it/s]

tensor(1.5107, grad_fn=<NllLossBackward0>)
tensor(1.4514, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12939/17426 [22:30<09:48,  7.62it/s]

tensor(1.4926, grad_fn=<NllLossBackward0>)
tensor(1.5026, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12941/17426 [22:30<09:17,  8.04it/s]

tensor(1.4567, grad_fn=<NllLossBackward0>)
tensor(1.5145, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12943/17426 [22:30<08:59,  8.31it/s]

tensor(1.5373, grad_fn=<NllLossBackward0>)
tensor(1.4905, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12945/17426 [22:30<08:44,  8.55it/s]

tensor(1.5162, grad_fn=<NllLossBackward0>)
tensor(1.5316, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12947/17426 [22:31<08:58,  8.31it/s]

tensor(1.4798, grad_fn=<NllLossBackward0>)
tensor(1.5201, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12949/17426 [22:31<08:46,  8.51it/s]

tensor(1.5264, grad_fn=<NllLossBackward0>)
tensor(1.4860, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12951/17426 [22:31<08:43,  8.55it/s]

tensor(1.4795, grad_fn=<NllLossBackward0>)
tensor(1.5120, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12953/17426 [22:31<08:43,  8.54it/s]

tensor(1.5063, grad_fn=<NllLossBackward0>)
tensor(1.4986, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12955/17426 [22:31<08:53,  8.37it/s]

tensor(1.4937, grad_fn=<NllLossBackward0>)
tensor(1.4531, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12957/17426 [22:32<08:45,  8.50it/s]

tensor(1.5045, grad_fn=<NllLossBackward0>)
tensor(1.5232, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12959/17426 [22:32<08:44,  8.52it/s]

tensor(1.4860, grad_fn=<NllLossBackward0>)
tensor(1.4753, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12961/17426 [22:32<08:45,  8.50it/s]

tensor(1.4874, grad_fn=<NllLossBackward0>)
tensor(1.4498, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12963/17426 [22:32<08:41,  8.56it/s]

tensor(1.4992, grad_fn=<NllLossBackward0>)
tensor(1.5011, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12965/17426 [22:33<09:21,  7.95it/s]

tensor(1.4921, grad_fn=<NllLossBackward0>)
tensor(1.5135, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12967/17426 [22:33<08:57,  8.30it/s]

tensor(1.4776, grad_fn=<NllLossBackward0>)
tensor(1.4437, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12969/17426 [22:33<08:46,  8.46it/s]

tensor(1.5309, grad_fn=<NllLossBackward0>)
tensor(1.4998, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12971/17426 [22:33<08:43,  8.51it/s]

tensor(1.4647, grad_fn=<NllLossBackward0>)
tensor(1.4956, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12973/17426 [22:34<09:04,  8.18it/s]

tensor(1.4783, grad_fn=<NllLossBackward0>)
tensor(1.4671, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12975/17426 [22:34<08:55,  8.31it/s]

tensor(1.4643, grad_fn=<NllLossBackward0>)
tensor(1.4965, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12977/17426 [22:34<08:51,  8.38it/s]

tensor(1.4650, grad_fn=<NllLossBackward0>)
tensor(1.5332, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12979/17426 [22:34<08:47,  8.42it/s]

tensor(1.4926, grad_fn=<NllLossBackward0>)
tensor(1.5323, grad_fn=<NllLossBackward0>)


 74%|███████▍  | 12981/17426 [22:35<08:46,  8.44it/s]

tensor(1.4743, grad_fn=<NllLossBackward0>)
tensor(1.5056, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 12983/17426 [22:35<09:11,  8.06it/s]

tensor(1.4850, grad_fn=<NllLossBackward0>)
tensor(1.5456, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 12985/17426 [22:35<08:57,  8.26it/s]

tensor(1.5120, grad_fn=<NllLossBackward0>)
tensor(1.4641, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 12987/17426 [22:35<08:46,  8.42it/s]

tensor(1.4975, grad_fn=<NllLossBackward0>)
tensor(1.4684, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 12989/17426 [22:36<08:44,  8.45it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.4840, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 12991/17426 [22:36<09:38,  7.67it/s]

tensor(1.4910, grad_fn=<NllLossBackward0>)
tensor(1.4864, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 12993/17426 [22:36<09:13,  8.01it/s]

tensor(1.4984, grad_fn=<NllLossBackward0>)
tensor(1.4763, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 12995/17426 [22:36<08:52,  8.32it/s]

tensor(1.4410, grad_fn=<NllLossBackward0>)
tensor(1.5069, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 12997/17426 [22:37<08:43,  8.46it/s]

tensor(1.4646, grad_fn=<NllLossBackward0>)
tensor(1.4755, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 12999/17426 [22:37<08:43,  8.46it/s]

tensor(1.4626, grad_fn=<NllLossBackward0>)
tensor(1.5199, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13001/17426 [22:37<09:05,  8.12it/s]

tensor(1.4765, grad_fn=<NllLossBackward0>)
tensor(1.4791, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13003/17426 [22:37<08:55,  8.27it/s]

tensor(1.5069, grad_fn=<NllLossBackward0>)
tensor(1.4940, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13005/17426 [22:37<08:54,  8.27it/s]

tensor(1.4972, grad_fn=<NllLossBackward0>)
tensor(1.5275, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13007/17426 [22:38<08:59,  8.19it/s]

tensor(1.5097, grad_fn=<NllLossBackward0>)
tensor(1.5059, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13009/17426 [22:38<09:06,  8.08it/s]

tensor(1.4735, grad_fn=<NllLossBackward0>)
tensor(1.5003, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13011/17426 [22:38<08:53,  8.28it/s]

tensor(1.4913, grad_fn=<NllLossBackward0>)
tensor(1.4900, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13013/17426 [22:38<08:50,  8.31it/s]

tensor(1.4776, grad_fn=<NllLossBackward0>)
tensor(1.5122, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13015/17426 [22:39<09:01,  8.14it/s]

tensor(1.4857, grad_fn=<NllLossBackward0>)
tensor(1.4936, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13017/17426 [22:39<09:15,  7.93it/s]

tensor(1.4307, grad_fn=<NllLossBackward0>)
tensor(1.4741, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13019/17426 [22:39<08:57,  8.19it/s]

tensor(1.4742, grad_fn=<NllLossBackward0>)
tensor(1.5272, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13021/17426 [22:40<09:49,  7.47it/s]

tensor(1.4581, grad_fn=<NllLossBackward0>)
tensor(1.5690, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13023/17426 [22:40<10:41,  6.86it/s]

tensor(1.4613, grad_fn=<NllLossBackward0>)
tensor(1.4924, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13025/17426 [22:40<10:39,  6.88it/s]

tensor(1.4548, grad_fn=<NllLossBackward0>)
tensor(1.5379, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13027/17426 [22:40<10:24,  7.05it/s]

tensor(1.4994, grad_fn=<NllLossBackward0>)
tensor(1.5415, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13029/17426 [22:41<10:37,  6.90it/s]

tensor(1.4920, grad_fn=<NllLossBackward0>)
tensor(1.4797, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13031/17426 [22:41<09:50,  7.45it/s]

tensor(1.4787, grad_fn=<NllLossBackward0>)
tensor(1.4553, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13033/17426 [22:41<09:43,  7.52it/s]

tensor(1.4485, grad_fn=<NllLossBackward0>)
tensor(1.4627, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13035/17426 [22:41<09:20,  7.83it/s]

tensor(1.5027, grad_fn=<NllLossBackward0>)
tensor(1.4457, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13037/17426 [22:42<09:25,  7.77it/s]

tensor(1.4766, grad_fn=<NllLossBackward0>)
tensor(1.4951, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13039/17426 [22:42<08:50,  8.27it/s]

tensor(1.4397, grad_fn=<NllLossBackward0>)
tensor(1.4954, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13041/17426 [22:42<08:34,  8.53it/s]

tensor(1.4689, grad_fn=<NllLossBackward0>)
tensor(1.4540, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13043/17426 [22:42<09:38,  7.58it/s]

tensor(1.4854, grad_fn=<NllLossBackward0>)
tensor(1.5035, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13045/17426 [22:43<10:25,  7.00it/s]

tensor(1.4364, grad_fn=<NllLossBackward0>)
tensor(1.4747, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13047/17426 [22:43<09:35,  7.60it/s]

tensor(1.5252, grad_fn=<NllLossBackward0>)
tensor(1.4885, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13049/17426 [22:43<10:07,  7.20it/s]

tensor(1.4936, grad_fn=<NllLossBackward0>)
tensor(1.4950, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13051/17426 [22:44<10:32,  6.91it/s]

tensor(1.4935, grad_fn=<NllLossBackward0>)
tensor(1.4562, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13053/17426 [22:44<10:30,  6.93it/s]

tensor(1.4948, grad_fn=<NllLossBackward0>)
tensor(1.4630, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13055/17426 [22:44<10:44,  6.78it/s]

tensor(1.4832, grad_fn=<NllLossBackward0>)
tensor(1.4886, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13057/17426 [22:44<10:09,  7.17it/s]

tensor(1.4806, grad_fn=<NllLossBackward0>)
tensor(1.4848, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13059/17426 [22:45<09:44,  7.47it/s]

tensor(1.4747, grad_fn=<NllLossBackward0>)
tensor(1.5410, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13061/17426 [22:45<09:11,  7.91it/s]

tensor(1.4781, grad_fn=<NllLossBackward0>)
tensor(1.5338, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13063/17426 [22:45<09:00,  8.08it/s]

tensor(1.5022, grad_fn=<NllLossBackward0>)
tensor(1.4869, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13065/17426 [22:45<08:42,  8.35it/s]

tensor(1.4865, grad_fn=<NllLossBackward0>)
tensor(1.4904, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13067/17426 [22:46<08:44,  8.31it/s]

tensor(1.4838, grad_fn=<NllLossBackward0>)
tensor(1.4861, grad_fn=<NllLossBackward0>)


 75%|███████▍  | 13069/17426 [22:46<08:44,  8.31it/s]

tensor(1.4835, grad_fn=<NllLossBackward0>)
tensor(1.4931, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13071/17426 [22:46<08:48,  8.25it/s]

tensor(1.4818, grad_fn=<NllLossBackward0>)
tensor(1.4950, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13073/17426 [22:46<08:44,  8.30it/s]

tensor(1.4777, grad_fn=<NllLossBackward0>)
tensor(1.4869, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13075/17426 [22:47<09:03,  8.00it/s]

tensor(1.4830, grad_fn=<NllLossBackward0>)
tensor(1.5067, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13077/17426 [22:47<09:08,  7.92it/s]

tensor(1.5077, grad_fn=<NllLossBackward0>)
tensor(1.4548, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13079/17426 [22:47<08:52,  8.17it/s]

tensor(1.4827, grad_fn=<NllLossBackward0>)
tensor(1.4829, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13081/17426 [22:47<08:53,  8.15it/s]

tensor(1.5353, grad_fn=<NllLossBackward0>)
tensor(1.4732, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13083/17426 [22:48<08:58,  8.07it/s]

tensor(1.4989, grad_fn=<NllLossBackward0>)
tensor(1.4933, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13085/17426 [22:48<08:56,  8.10it/s]

tensor(1.4933, grad_fn=<NllLossBackward0>)
tensor(1.5094, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13087/17426 [22:48<08:47,  8.22it/s]

tensor(1.4921, grad_fn=<NllLossBackward0>)
tensor(1.4929, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13089/17426 [22:48<08:49,  8.19it/s]

tensor(1.5242, grad_fn=<NllLossBackward0>)
tensor(1.5231, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13091/17426 [22:49<08:54,  8.11it/s]

tensor(1.4698, grad_fn=<NllLossBackward0>)
tensor(1.4843, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13093/17426 [22:49<08:50,  8.17it/s]

tensor(1.4538, grad_fn=<NllLossBackward0>)
tensor(1.5155, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13095/17426 [22:49<08:42,  8.29it/s]

tensor(1.4550, grad_fn=<NllLossBackward0>)
tensor(1.5334, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13097/17426 [22:49<08:37,  8.36it/s]

tensor(1.4797, grad_fn=<NllLossBackward0>)
tensor(1.4881, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13099/17426 [22:50<08:40,  8.32it/s]

tensor(1.4569, grad_fn=<NllLossBackward0>)
tensor(1.4921, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13101/17426 [22:50<08:59,  8.01it/s]

tensor(1.5096, grad_fn=<NllLossBackward0>)
tensor(1.4918, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13103/17426 [22:50<09:06,  7.92it/s]

tensor(1.4748, grad_fn=<NllLossBackward0>)
tensor(1.5002, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13105/17426 [22:50<08:48,  8.18it/s]

tensor(1.5269, grad_fn=<NllLossBackward0>)
tensor(1.4569, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13107/17426 [22:51<08:46,  8.21it/s]

tensor(1.4198, grad_fn=<NllLossBackward0>)
tensor(1.4626, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13109/17426 [22:51<09:02,  7.96it/s]

tensor(1.5225, grad_fn=<NllLossBackward0>)
tensor(1.5011, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13111/17426 [22:51<08:52,  8.10it/s]

tensor(1.5355, grad_fn=<NllLossBackward0>)
tensor(1.4802, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13113/17426 [22:51<08:33,  8.39it/s]

tensor(1.4453, grad_fn=<NllLossBackward0>)
tensor(1.5107, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13115/17426 [22:52<08:27,  8.50it/s]

tensor(1.5164, grad_fn=<NllLossBackward0>)
tensor(1.4949, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13117/17426 [22:52<08:42,  8.24it/s]

tensor(1.4621, grad_fn=<NllLossBackward0>)
tensor(1.4955, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13119/17426 [22:52<08:52,  8.09it/s]

tensor(1.4991, grad_fn=<NllLossBackward0>)
tensor(1.4759, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13121/17426 [22:52<08:41,  8.26it/s]

tensor(1.4531, grad_fn=<NllLossBackward0>)
tensor(1.5143, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13123/17426 [22:52<08:34,  8.36it/s]

tensor(1.4476, grad_fn=<NllLossBackward0>)
tensor(1.4834, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13125/17426 [22:53<08:28,  8.46it/s]

tensor(1.4605, grad_fn=<NllLossBackward0>)
tensor(1.4665, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13127/17426 [22:53<08:44,  8.19it/s]

tensor(1.4846, grad_fn=<NllLossBackward0>)
tensor(1.5108, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13129/17426 [22:53<08:40,  8.26it/s]

tensor(1.5368, grad_fn=<NllLossBackward0>)
tensor(1.4958, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13131/17426 [22:53<08:25,  8.50it/s]

tensor(1.5270, grad_fn=<NllLossBackward0>)
tensor(1.4697, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13133/17426 [22:54<08:28,  8.44it/s]

tensor(1.4872, grad_fn=<NllLossBackward0>)
tensor(1.4627, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13135/17426 [22:54<08:54,  8.03it/s]

tensor(1.4592, grad_fn=<NllLossBackward0>)
tensor(1.4573, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13137/17426 [22:54<09:24,  7.59it/s]

tensor(1.5019, grad_fn=<NllLossBackward0>)
tensor(1.4660, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13139/17426 [22:55<09:49,  7.27it/s]

tensor(1.4859, grad_fn=<NllLossBackward0>)
tensor(1.4493, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13141/17426 [22:55<09:56,  7.19it/s]

tensor(1.4615, grad_fn=<NllLossBackward0>)
tensor(1.4825, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13143/17426 [22:55<10:22,  6.88it/s]

tensor(1.4836, grad_fn=<NllLossBackward0>)
tensor(1.4528, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13145/17426 [22:55<10:03,  7.09it/s]

tensor(1.4492, grad_fn=<NllLossBackward0>)
tensor(1.5176, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13147/17426 [22:56<09:13,  7.73it/s]

tensor(1.4595, grad_fn=<NllLossBackward0>)
tensor(1.4830, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13149/17426 [22:56<08:52,  8.03it/s]

tensor(1.4942, grad_fn=<NllLossBackward0>)
tensor(1.5202, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13151/17426 [22:56<09:21,  7.61it/s]

tensor(1.4348, grad_fn=<NllLossBackward0>)
tensor(1.4665, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13153/17426 [22:56<10:20,  6.89it/s]

tensor(1.5190, grad_fn=<NllLossBackward0>)
tensor(1.4908, grad_fn=<NllLossBackward0>)


 75%|███████▌  | 13155/17426 [22:57<10:11,  6.99it/s]

tensor(1.4880, grad_fn=<NllLossBackward0>)
tensor(1.4446, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13157/17426 [22:57<09:55,  7.16it/s]

tensor(1.5070, grad_fn=<NllLossBackward0>)
tensor(1.5215, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13159/17426 [22:57<10:18,  6.90it/s]

tensor(1.4925, grad_fn=<NllLossBackward0>)
tensor(1.4936, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13161/17426 [22:58<10:37,  6.69it/s]

tensor(1.4898, grad_fn=<NllLossBackward0>)
tensor(1.5280, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13163/17426 [22:58<10:11,  6.97it/s]

tensor(1.4705, grad_fn=<NllLossBackward0>)
tensor(1.4693, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13165/17426 [22:58<10:04,  7.05it/s]

tensor(1.5072, grad_fn=<NllLossBackward0>)
tensor(1.4781, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13167/17426 [22:58<10:48,  6.57it/s]

tensor(1.5071, grad_fn=<NllLossBackward0>)
tensor(1.5122, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13169/17426 [22:59<10:23,  6.83it/s]

tensor(1.4486, grad_fn=<NllLossBackward0>)
tensor(1.5186, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13171/17426 [22:59<11:04,  6.41it/s]

tensor(1.5051, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13173/17426 [22:59<10:08,  6.99it/s]

tensor(1.5065, grad_fn=<NllLossBackward0>)
tensor(1.4832, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13175/17426 [23:00<09:20,  7.58it/s]

tensor(1.5145, grad_fn=<NllLossBackward0>)
tensor(1.4921, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13177/17426 [23:00<08:51,  7.99it/s]

tensor(1.4901, grad_fn=<NllLossBackward0>)
tensor(1.4716, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13179/17426 [23:00<08:52,  7.97it/s]

tensor(1.4997, grad_fn=<NllLossBackward0>)
tensor(1.5075, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13181/17426 [23:00<08:35,  8.24it/s]

tensor(1.4801, grad_fn=<NllLossBackward0>)
tensor(1.5392, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13183/17426 [23:01<08:44,  8.08it/s]

tensor(1.4964, grad_fn=<NllLossBackward0>)
tensor(1.4843, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13185/17426 [23:01<08:36,  8.21it/s]

tensor(1.5008, grad_fn=<NllLossBackward0>)
tensor(1.4709, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13187/17426 [23:01<08:33,  8.25it/s]

tensor(1.4969, grad_fn=<NllLossBackward0>)
tensor(1.4608, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13189/17426 [23:01<08:37,  8.19it/s]

tensor(1.5005, grad_fn=<NllLossBackward0>)
tensor(1.5025, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13191/17426 [23:02<08:47,  8.03it/s]

tensor(1.5143, grad_fn=<NllLossBackward0>)
tensor(1.5151, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13193/17426 [23:02<08:42,  8.10it/s]

tensor(1.4372, grad_fn=<NllLossBackward0>)
tensor(1.4765, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13195/17426 [23:02<08:25,  8.37it/s]

tensor(1.4946, grad_fn=<NllLossBackward0>)
tensor(1.4707, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13197/17426 [23:02<08:19,  8.47it/s]

tensor(1.4537, grad_fn=<NllLossBackward0>)
tensor(1.4582, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13199/17426 [23:03<08:31,  8.27it/s]

tensor(1.5082, grad_fn=<NllLossBackward0>)
tensor(1.4858, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13201/17426 [23:03<08:35,  8.19it/s]

tensor(1.4688, grad_fn=<NllLossBackward0>)
tensor(1.4731, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13203/17426 [23:03<08:32,  8.25it/s]

tensor(1.5366, grad_fn=<NllLossBackward0>)
tensor(1.4746, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13205/17426 [23:03<08:23,  8.38it/s]

tensor(1.4795, grad_fn=<NllLossBackward0>)
tensor(1.4619, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13207/17426 [23:03<08:21,  8.41it/s]

tensor(1.5228, grad_fn=<NllLossBackward0>)
tensor(1.5739, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13209/17426 [23:04<08:37,  8.15it/s]

tensor(1.5155, grad_fn=<NllLossBackward0>)
tensor(1.5197, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13211/17426 [23:04<08:26,  8.32it/s]

tensor(1.4704, grad_fn=<NllLossBackward0>)
tensor(1.4465, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13213/17426 [23:04<08:17,  8.46it/s]

tensor(1.4868, grad_fn=<NllLossBackward0>)
tensor(1.4819, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13215/17426 [23:04<08:29,  8.27it/s]

tensor(1.5278, grad_fn=<NllLossBackward0>)
tensor(1.4049, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13217/17426 [23:05<08:43,  8.03it/s]

tensor(1.5158, grad_fn=<NllLossBackward0>)
tensor(1.5093, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13219/17426 [23:05<08:33,  8.20it/s]

tensor(1.4759, grad_fn=<NllLossBackward0>)
tensor(1.4514, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13221/17426 [23:05<08:18,  8.43it/s]

tensor(1.5091, grad_fn=<NllLossBackward0>)
tensor(1.4322, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13223/17426 [23:05<08:16,  8.47it/s]

tensor(1.4736, grad_fn=<NllLossBackward0>)
tensor(1.4975, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13225/17426 [23:06<08:08,  8.59it/s]

tensor(1.5227, grad_fn=<NllLossBackward0>)
tensor(1.4553, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13227/17426 [23:06<08:33,  8.18it/s]

tensor(1.4582, grad_fn=<NllLossBackward0>)
tensor(1.4266, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13229/17426 [23:06<08:26,  8.28it/s]

tensor(1.5259, grad_fn=<NllLossBackward0>)
tensor(1.4996, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13231/17426 [23:06<08:21,  8.37it/s]

tensor(1.4715, grad_fn=<NllLossBackward0>)
tensor(1.5323, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13233/17426 [23:07<08:21,  8.36it/s]

tensor(1.5766, grad_fn=<NllLossBackward0>)
tensor(1.4821, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13235/17426 [23:07<08:47,  7.95it/s]

tensor(1.4879, grad_fn=<NllLossBackward0>)
tensor(1.5021, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13237/17426 [23:07<08:38,  8.07it/s]

tensor(1.4679, grad_fn=<NllLossBackward0>)
tensor(1.4749, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13239/17426 [23:07<08:32,  8.17it/s]

tensor(1.5166, grad_fn=<NllLossBackward0>)
tensor(1.4562, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13241/17426 [23:08<08:22,  8.33it/s]

tensor(1.4531, grad_fn=<NllLossBackward0>)
tensor(1.4922, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13243/17426 [23:08<08:26,  8.27it/s]

tensor(1.4962, grad_fn=<NllLossBackward0>)
tensor(1.4677, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13245/17426 [23:08<08:34,  8.13it/s]

tensor(1.4895, grad_fn=<NllLossBackward0>)
tensor(1.4988, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13247/17426 [23:08<08:25,  8.27it/s]

tensor(1.4918, grad_fn=<NllLossBackward0>)
tensor(1.4773, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13249/17426 [23:09<08:18,  8.37it/s]

tensor(1.4650, grad_fn=<NllLossBackward0>)
tensor(1.5184, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13251/17426 [23:09<08:13,  8.45it/s]

tensor(1.4662, grad_fn=<NllLossBackward0>)
tensor(1.4394, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13253/17426 [23:09<08:38,  8.05it/s]

tensor(1.5255, grad_fn=<NllLossBackward0>)
tensor(1.5145, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13255/17426 [23:09<09:20,  7.44it/s]

tensor(1.4841, grad_fn=<NllLossBackward0>)
tensor(1.4904, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13257/17426 [23:10<09:33,  7.27it/s]

tensor(1.5218, grad_fn=<NllLossBackward0>)
tensor(1.4870, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13259/17426 [23:10<10:10,  6.82it/s]

tensor(1.4953, grad_fn=<NllLossBackward0>)
tensor(1.4634, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13261/17426 [23:10<09:55,  6.99it/s]

tensor(1.5027, grad_fn=<NllLossBackward0>)
tensor(1.4876, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13263/17426 [23:10<09:16,  7.48it/s]

tensor(1.5109, grad_fn=<NllLossBackward0>)
tensor(1.4891, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13265/17426 [23:11<08:47,  7.88it/s]

tensor(1.4763, grad_fn=<NllLossBackward0>)
tensor(1.4988, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13267/17426 [23:11<08:34,  8.09it/s]

tensor(1.5057, grad_fn=<NllLossBackward0>)
tensor(1.5560, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13269/17426 [23:11<09:12,  7.52it/s]

tensor(1.5276, grad_fn=<NllLossBackward0>)
tensor(1.4607, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13271/17426 [23:11<08:33,  8.09it/s]

tensor(1.4769, grad_fn=<NllLossBackward0>)
tensor(1.4929, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13273/17426 [23:12<08:22,  8.26it/s]

tensor(1.5084, grad_fn=<NllLossBackward0>)
tensor(1.4521, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13275/17426 [23:12<08:16,  8.36it/s]

tensor(1.4819, grad_fn=<NllLossBackward0>)
tensor(1.5257, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13277/17426 [23:12<08:10,  8.46it/s]

tensor(1.4395, grad_fn=<NllLossBackward0>)
tensor(1.4895, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13279/17426 [23:12<09:04,  7.61it/s]

tensor(1.4998, grad_fn=<NllLossBackward0>)
tensor(1.4942, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13281/17426 [23:13<09:49,  7.03it/s]

tensor(1.4720, grad_fn=<NllLossBackward0>)
tensor(1.4545, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13283/17426 [23:13<10:12,  6.77it/s]

tensor(1.4940, grad_fn=<NllLossBackward0>)
tensor(1.4546, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13285/17426 [23:13<10:25,  6.62it/s]

tensor(1.4483, grad_fn=<NllLossBackward0>)
tensor(1.4714, grad_fn=<NllLossBackward0>)


 76%|███████▌  | 13287/17426 [23:14<09:55,  6.95it/s]

tensor(1.4796, grad_fn=<NllLossBackward0>)
tensor(1.4700, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13289/17426 [23:14<10:45,  6.41it/s]

tensor(1.4625, grad_fn=<NllLossBackward0>)
tensor(1.4530, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13291/17426 [23:14<09:29,  7.26it/s]

tensor(1.4641, grad_fn=<NllLossBackward0>)
tensor(1.4981, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13293/17426 [23:14<09:01,  7.64it/s]

tensor(1.4910, grad_fn=<NllLossBackward0>)
tensor(1.5556, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13295/17426 [23:15<08:54,  7.73it/s]

tensor(1.5477, grad_fn=<NllLossBackward0>)
tensor(1.4869, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13297/17426 [23:15<08:53,  7.73it/s]

tensor(1.4598, grad_fn=<NllLossBackward0>)
tensor(1.5602, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13299/17426 [23:15<08:30,  8.08it/s]

tensor(1.4676, grad_fn=<NllLossBackward0>)
tensor(1.4536, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13301/17426 [23:15<08:27,  8.14it/s]

tensor(1.4726, grad_fn=<NllLossBackward0>)
tensor(1.4919, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13303/17426 [23:16<08:29,  8.09it/s]

tensor(1.4412, grad_fn=<NllLossBackward0>)
tensor(1.5084, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13305/17426 [23:16<08:16,  8.30it/s]

tensor(1.4779, grad_fn=<NllLossBackward0>)
tensor(1.5143, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13307/17426 [23:16<08:11,  8.38it/s]

tensor(1.4992, grad_fn=<NllLossBackward0>)
tensor(1.4902, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13309/17426 [23:16<08:13,  8.34it/s]

tensor(1.4652, grad_fn=<NllLossBackward0>)
tensor(1.4777, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13311/17426 [23:17<08:16,  8.30it/s]

tensor(1.5064, grad_fn=<NllLossBackward0>)
tensor(1.4870, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13313/17426 [23:17<08:14,  8.32it/s]

tensor(1.5103, grad_fn=<NllLossBackward0>)
tensor(1.5185, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13315/17426 [23:17<08:08,  8.42it/s]

tensor(1.4770, grad_fn=<NllLossBackward0>)
tensor(1.4927, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13317/17426 [23:17<08:12,  8.35it/s]

tensor(1.4895, grad_fn=<NllLossBackward0>)
tensor(1.5427, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13319/17426 [23:18<08:27,  8.10it/s]

tensor(1.5054, grad_fn=<NllLossBackward0>)
tensor(1.5037, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13321/17426 [23:18<08:25,  8.12it/s]

tensor(1.5435, grad_fn=<NllLossBackward0>)
tensor(1.5154, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13323/17426 [23:18<08:14,  8.29it/s]

tensor(1.4667, grad_fn=<NllLossBackward0>)
tensor(1.4876, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13325/17426 [23:18<08:09,  8.38it/s]

tensor(1.4842, grad_fn=<NllLossBackward0>)
tensor(1.4634, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13327/17426 [23:19<08:34,  7.96it/s]

tensor(1.5101, grad_fn=<NllLossBackward0>)
tensor(1.5074, grad_fn=<NllLossBackward0>)


 76%|███████▋  | 13329/17426 [23:19<08:22,  8.15it/s]

tensor(1.4706, grad_fn=<NllLossBackward0>)
tensor(1.4585, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13331/17426 [23:19<08:29,  8.03it/s]

tensor(1.5604, grad_fn=<NllLossBackward0>)
tensor(1.4562, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13333/17426 [23:19<08:16,  8.24it/s]

tensor(1.4674, grad_fn=<NllLossBackward0>)
tensor(1.4906, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13335/17426 [23:20<08:31,  7.99it/s]

tensor(1.4987, grad_fn=<NllLossBackward0>)
tensor(1.5034, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13337/17426 [23:20<08:36,  7.91it/s]

tensor(1.4948, grad_fn=<NllLossBackward0>)
tensor(1.4569, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13339/17426 [23:20<08:24,  8.09it/s]

tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.4882, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13341/17426 [23:20<08:18,  8.19it/s]

tensor(1.4822, grad_fn=<NllLossBackward0>)
tensor(1.5495, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13343/17426 [23:21<08:16,  8.22it/s]

tensor(1.5284, grad_fn=<NllLossBackward0>)
tensor(1.5459, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13345/17426 [23:21<08:35,  7.91it/s]

tensor(1.5035, grad_fn=<NllLossBackward0>)
tensor(1.5346, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13347/17426 [23:21<08:22,  8.12it/s]

tensor(1.4753, grad_fn=<NllLossBackward0>)
tensor(1.4524, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13349/17426 [23:21<08:21,  8.13it/s]

tensor(1.5160, grad_fn=<NllLossBackward0>)
tensor(1.5025, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13351/17426 [23:22<08:25,  8.06it/s]

tensor(1.4993, grad_fn=<NllLossBackward0>)
tensor(1.5049, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13353/17426 [23:22<08:46,  7.74it/s]

tensor(1.4694, grad_fn=<NllLossBackward0>)
tensor(1.4851, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13355/17426 [23:22<08:26,  8.04it/s]

tensor(1.4696, grad_fn=<NllLossBackward0>)
tensor(1.4637, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13357/17426 [23:22<08:10,  8.30it/s]

tensor(1.5015, grad_fn=<NllLossBackward0>)
tensor(1.4672, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13359/17426 [23:23<08:11,  8.28it/s]

tensor(1.4660, grad_fn=<NllLossBackward0>)
tensor(1.4812, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13361/17426 [23:23<08:46,  7.73it/s]

tensor(1.5018, grad_fn=<NllLossBackward0>)
tensor(1.4596, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13363/17426 [23:23<08:33,  7.92it/s]

tensor(1.5187, grad_fn=<NllLossBackward0>)
tensor(1.4509, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13365/17426 [23:23<08:06,  8.35it/s]

tensor(1.4961, grad_fn=<NllLossBackward0>)
tensor(1.4953, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13367/17426 [23:24<08:17,  8.16it/s]

tensor(1.4735, grad_fn=<NllLossBackward0>)
tensor(1.5058, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13369/17426 [23:24<08:17,  8.16it/s]

tensor(1.4712, grad_fn=<NllLossBackward0>)
tensor(1.4531, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13371/17426 [23:24<09:15,  7.30it/s]

tensor(1.4966, grad_fn=<NllLossBackward0>)
tensor(1.5076, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13373/17426 [23:24<09:51,  6.85it/s]

tensor(1.4881, grad_fn=<NllLossBackward0>)
tensor(1.5005, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13375/17426 [23:25<09:58,  6.77it/s]

tensor(1.5197, grad_fn=<NllLossBackward0>)
tensor(1.4713, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13377/17426 [23:25<10:20,  6.52it/s]

tensor(1.5434, grad_fn=<NllLossBackward0>)
tensor(1.4001, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13379/17426 [23:25<09:59,  6.75it/s]

tensor(1.5177, grad_fn=<NllLossBackward0>)
tensor(1.5026, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13381/17426 [23:26<10:14,  6.59it/s]

tensor(1.5016, grad_fn=<NllLossBackward0>)
tensor(1.4968, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13383/17426 [23:26<09:24,  7.17it/s]

tensor(1.4784, grad_fn=<NllLossBackward0>)
tensor(1.4579, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13385/17426 [23:26<09:21,  7.20it/s]

tensor(1.4962, grad_fn=<NllLossBackward0>)
tensor(1.5262, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13387/17426 [23:26<08:31,  7.89it/s]

tensor(1.4923, grad_fn=<NllLossBackward0>)
tensor(1.4922, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13389/17426 [23:27<08:05,  8.31it/s]

tensor(1.4822, grad_fn=<NllLossBackward0>)
tensor(1.4925, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13391/17426 [23:27<09:01,  7.45it/s]

tensor(1.4952, grad_fn=<NllLossBackward0>)
tensor(1.4544, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13393/17426 [23:27<09:24,  7.15it/s]

tensor(1.5016, grad_fn=<NllLossBackward0>)
tensor(1.5208, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13395/17426 [23:28<09:38,  6.97it/s]

tensor(1.5137, grad_fn=<NllLossBackward0>)
tensor(1.5088, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13397/17426 [23:28<09:48,  6.85it/s]

tensor(1.4697, grad_fn=<NllLossBackward0>)
tensor(1.4748, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13399/17426 [23:28<10:02,  6.68it/s]

tensor(1.5353, grad_fn=<NllLossBackward0>)
tensor(1.4843, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13401/17426 [23:28<09:46,  6.86it/s]

tensor(1.5110, grad_fn=<NllLossBackward0>)
tensor(1.5376, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13403/17426 [23:29<10:08,  6.62it/s]

tensor(1.5162, grad_fn=<NllLossBackward0>)
tensor(1.4735, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13405/17426 [23:29<09:15,  7.24it/s]

tensor(1.4562, grad_fn=<NllLossBackward0>)
tensor(1.4580, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13407/17426 [23:29<08:46,  7.63it/s]

tensor(1.4769, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13409/17426 [23:30<08:30,  7.87it/s]

tensor(1.4684, grad_fn=<NllLossBackward0>)
tensor(1.4583, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13411/17426 [23:30<08:22,  7.98it/s]

tensor(1.4480, grad_fn=<NllLossBackward0>)
tensor(1.4915, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13413/17426 [23:30<08:21,  7.99it/s]

tensor(1.4620, grad_fn=<NllLossBackward0>)
tensor(1.4176, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13415/17426 [23:30<08:24,  7.94it/s]

tensor(1.4769, grad_fn=<NllLossBackward0>)
tensor(1.5053, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13417/17426 [23:31<08:17,  8.06it/s]

tensor(1.5088, grad_fn=<NllLossBackward0>)
tensor(1.4887, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13419/17426 [23:31<08:11,  8.15it/s]

tensor(1.4713, grad_fn=<NllLossBackward0>)
tensor(1.4385, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13421/17426 [23:31<08:05,  8.26it/s]

tensor(1.5120, grad_fn=<NllLossBackward0>)
tensor(1.4807, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13423/17426 [23:31<08:06,  8.22it/s]

tensor(1.5036, grad_fn=<NllLossBackward0>)
tensor(1.4702, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13425/17426 [23:32<08:22,  7.97it/s]

tensor(1.5179, grad_fn=<NllLossBackward0>)
tensor(1.4882, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13427/17426 [23:32<08:26,  7.89it/s]

tensor(1.4718, grad_fn=<NllLossBackward0>)
tensor(1.5311, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13429/17426 [23:32<08:13,  8.10it/s]

tensor(1.4966, grad_fn=<NllLossBackward0>)
tensor(1.5017, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13431/17426 [23:32<08:24,  7.91it/s]

tensor(1.4486, grad_fn=<NllLossBackward0>)
tensor(1.4806, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13433/17426 [23:33<08:14,  8.07it/s]

tensor(1.4915, grad_fn=<NllLossBackward0>)
tensor(1.4827, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13435/17426 [23:33<08:04,  8.23it/s]

tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.5038, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13437/17426 [23:33<08:09,  8.14it/s]

tensor(1.4989, grad_fn=<NllLossBackward0>)
tensor(1.5170, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13439/17426 [23:33<08:10,  8.13it/s]

tensor(1.4917, grad_fn=<NllLossBackward0>)
tensor(1.4793, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13441/17426 [23:34<08:34,  7.75it/s]

tensor(1.4859, grad_fn=<NllLossBackward0>)
tensor(1.4945, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13443/17426 [23:34<08:19,  7.97it/s]

tensor(1.5284, grad_fn=<NllLossBackward0>)
tensor(1.4677, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13445/17426 [23:34<08:14,  8.06it/s]

tensor(1.5267, grad_fn=<NllLossBackward0>)
tensor(1.4949, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13447/17426 [23:34<08:18,  7.98it/s]

tensor(1.4764, grad_fn=<NllLossBackward0>)
tensor(1.4926, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13449/17426 [23:34<08:00,  8.28it/s]

tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.4896, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13451/17426 [23:35<08:04,  8.20it/s]

tensor(1.4623, grad_fn=<NllLossBackward0>)
tensor(1.4729, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13453/17426 [23:35<07:55,  8.36it/s]

tensor(1.4314, grad_fn=<NllLossBackward0>)
tensor(1.4829, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13455/17426 [23:35<08:01,  8.25it/s]

tensor(1.5048, grad_fn=<NllLossBackward0>)
tensor(1.5108, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13457/17426 [23:35<08:05,  8.18it/s]

tensor(1.4568, grad_fn=<NllLossBackward0>)
tensor(1.5224, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13459/17426 [23:36<08:29,  7.78it/s]

tensor(1.5078, grad_fn=<NllLossBackward0>)
tensor(1.4801, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13461/17426 [23:36<08:22,  7.90it/s]

tensor(1.4675, grad_fn=<NllLossBackward0>)
tensor(1.4325, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13463/17426 [23:36<08:18,  7.95it/s]

tensor(1.4802, grad_fn=<NllLossBackward0>)
tensor(1.5018, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13465/17426 [23:36<08:16,  7.98it/s]

tensor(1.4694, grad_fn=<NllLossBackward0>)
tensor(1.4430, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13467/17426 [23:37<08:37,  7.65it/s]

tensor(1.4954, grad_fn=<NllLossBackward0>)
tensor(1.5034, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13469/17426 [23:37<08:25,  7.82it/s]

tensor(1.4604, grad_fn=<NllLossBackward0>)
tensor(1.4960, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13471/17426 [23:37<08:14,  7.99it/s]

tensor(1.5226, grad_fn=<NllLossBackward0>)
tensor(1.4690, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13473/17426 [23:37<08:08,  8.10it/s]

tensor(1.5294, grad_fn=<NllLossBackward0>)
tensor(1.4855, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13475/17426 [23:38<08:12,  8.02it/s]

tensor(1.4778, grad_fn=<NllLossBackward0>)
tensor(1.4709, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13477/17426 [23:38<08:03,  8.17it/s]

tensor(1.4692, grad_fn=<NllLossBackward0>)
tensor(1.4770, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13479/17426 [23:38<08:07,  8.09it/s]

tensor(1.4737, grad_fn=<NllLossBackward0>)
tensor(1.5199, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13481/17426 [23:38<07:56,  8.29it/s]

tensor(1.4759, grad_fn=<NllLossBackward0>)
tensor(1.5136, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13483/17426 [23:39<08:01,  8.18it/s]

tensor(1.4611, grad_fn=<NllLossBackward0>)
tensor(1.5089, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13485/17426 [23:39<08:33,  7.68it/s]

tensor(1.5198, grad_fn=<NllLossBackward0>)
tensor(1.4919, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13487/17426 [23:39<08:58,  7.31it/s]

tensor(1.5097, grad_fn=<NllLossBackward0>)
tensor(1.4531, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13489/17426 [23:40<09:10,  7.15it/s]

tensor(1.4849, grad_fn=<NllLossBackward0>)
tensor(1.4235, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13491/17426 [23:40<09:43,  6.74it/s]

tensor(1.4534, grad_fn=<NllLossBackward0>)
tensor(1.4773, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13493/17426 [23:40<09:08,  7.17it/s]

tensor(1.4860, grad_fn=<NllLossBackward0>)
tensor(1.4959, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13495/17426 [23:40<09:09,  7.16it/s]

tensor(1.4895, grad_fn=<NllLossBackward0>)
tensor(1.4867, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13497/17426 [23:41<08:34,  7.63it/s]

tensor(1.5077, grad_fn=<NllLossBackward0>)
tensor(1.4744, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13499/17426 [23:41<08:42,  7.52it/s]

tensor(1.5662, grad_fn=<NllLossBackward0>)
tensor(1.4603, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13501/17426 [23:41<09:13,  7.09it/s]

tensor(1.4576, grad_fn=<NllLossBackward0>)
tensor(1.4729, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13503/17426 [23:41<08:32,  7.66it/s]

tensor(1.5053, grad_fn=<NllLossBackward0>)
tensor(1.4767, grad_fn=<NllLossBackward0>)


 77%|███████▋  | 13505/17426 [23:42<08:07,  8.04it/s]

tensor(1.4721, grad_fn=<NllLossBackward0>)
tensor(1.5142, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13507/17426 [23:42<09:02,  7.22it/s]

tensor(1.4870, grad_fn=<NllLossBackward0>)
tensor(1.5230, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13509/17426 [23:42<09:13,  7.08it/s]

tensor(1.4671, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13511/17426 [23:43<09:14,  7.06it/s]

tensor(1.4921, grad_fn=<NllLossBackward0>)
tensor(1.4393, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13513/17426 [23:43<09:39,  6.75it/s]

tensor(1.4697, grad_fn=<NllLossBackward0>)
tensor(1.5168, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13515/17426 [23:43<09:54,  6.58it/s]

tensor(1.4533, grad_fn=<NllLossBackward0>)
tensor(1.4624, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13517/17426 [23:43<09:44,  6.68it/s]

tensor(1.4935, grad_fn=<NllLossBackward0>)
tensor(1.5142, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13519/17426 [23:44<09:03,  7.18it/s]

tensor(1.4838, grad_fn=<NllLossBackward0>)
tensor(1.4648, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13521/17426 [23:44<08:26,  7.70it/s]

tensor(1.5071, grad_fn=<NllLossBackward0>)
tensor(1.5428, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13523/17426 [23:44<08:21,  7.78it/s]

tensor(1.4600, grad_fn=<NllLossBackward0>)
tensor(1.4696, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13525/17426 [23:44<07:52,  8.25it/s]

tensor(1.4630, grad_fn=<NllLossBackward0>)
tensor(1.4520, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13527/17426 [23:45<07:49,  8.30it/s]

tensor(1.4974, grad_fn=<NllLossBackward0>)
tensor(1.4822, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13529/17426 [23:45<07:47,  8.33it/s]

tensor(1.4610, grad_fn=<NllLossBackward0>)
tensor(1.5251, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13531/17426 [23:45<07:57,  8.16it/s]

tensor(1.5361, grad_fn=<NllLossBackward0>)
tensor(1.5025, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13533/17426 [23:45<08:02,  8.06it/s]

tensor(1.4616, grad_fn=<NllLossBackward0>)
tensor(1.5137, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13535/17426 [23:46<07:54,  8.21it/s]

tensor(1.4676, grad_fn=<NllLossBackward0>)
tensor(1.4673, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13537/17426 [23:46<07:45,  8.35it/s]

tensor(1.4895, grad_fn=<NllLossBackward0>)
tensor(1.4171, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13539/17426 [23:46<07:44,  8.37it/s]

tensor(1.4370, grad_fn=<NllLossBackward0>)
tensor(1.5136, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13541/17426 [23:46<07:51,  8.24it/s]

tensor(1.4715, grad_fn=<NllLossBackward0>)
tensor(1.4918, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13543/17426 [23:47<07:52,  8.22it/s]

tensor(1.4351, grad_fn=<NllLossBackward0>)
tensor(1.5247, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13545/17426 [23:47<07:38,  8.47it/s]

tensor(1.4583, grad_fn=<NllLossBackward0>)
tensor(1.4987, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13547/17426 [23:47<07:40,  8.43it/s]

tensor(1.4862, grad_fn=<NllLossBackward0>)
tensor(1.4596, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13549/17426 [23:47<07:49,  8.26it/s]

tensor(1.5310, grad_fn=<NllLossBackward0>)
tensor(1.5300, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13551/17426 [23:48<07:44,  8.34it/s]

tensor(1.5093, grad_fn=<NllLossBackward0>)
tensor(1.4618, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13553/17426 [23:48<07:36,  8.48it/s]

tensor(1.4898, grad_fn=<NllLossBackward0>)
tensor(1.4400, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13555/17426 [23:48<07:36,  8.47it/s]

tensor(1.5093, grad_fn=<NllLossBackward0>)
tensor(1.5164, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13557/17426 [23:48<07:30,  8.58it/s]

tensor(1.5038, grad_fn=<NllLossBackward0>)
tensor(1.5281, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13559/17426 [23:49<07:51,  8.19it/s]

tensor(1.4265, grad_fn=<NllLossBackward0>)
tensor(1.4730, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13561/17426 [23:49<07:46,  8.28it/s]

tensor(1.4860, grad_fn=<NllLossBackward0>)
tensor(1.4770, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13563/17426 [23:49<07:46,  8.28it/s]

tensor(1.5074, grad_fn=<NllLossBackward0>)
tensor(1.5093, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13565/17426 [23:49<07:42,  8.35it/s]

tensor(1.5049, grad_fn=<NllLossBackward0>)
tensor(1.5038, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13567/17426 [23:50<08:03,  7.98it/s]

tensor(1.4965, grad_fn=<NllLossBackward0>)
tensor(1.5017, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13569/17426 [23:50<07:57,  8.09it/s]

tensor(1.4946, grad_fn=<NllLossBackward0>)
tensor(1.5034, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13571/17426 [23:50<07:47,  8.24it/s]

tensor(1.4898, grad_fn=<NllLossBackward0>)
tensor(1.4839, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13573/17426 [23:50<07:36,  8.44it/s]

tensor(1.4637, grad_fn=<NllLossBackward0>)
tensor(1.5035, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13575/17426 [23:51<07:36,  8.44it/s]

tensor(1.4746, grad_fn=<NllLossBackward0>)
tensor(1.4617, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13577/17426 [23:51<07:49,  8.20it/s]

tensor(1.5046, grad_fn=<NllLossBackward0>)
tensor(1.4942, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13579/17426 [23:51<07:44,  8.28it/s]

tensor(1.4835, grad_fn=<NllLossBackward0>)
tensor(1.4630, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13581/17426 [23:51<07:37,  8.41it/s]

tensor(1.4681, grad_fn=<NllLossBackward0>)
tensor(1.4981, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13583/17426 [23:51<07:30,  8.53it/s]

tensor(1.4880, grad_fn=<NllLossBackward0>)
tensor(1.5228, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13585/17426 [23:52<07:49,  8.18it/s]

tensor(1.4925, grad_fn=<NllLossBackward0>)
tensor(1.4635, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13587/17426 [23:52<07:37,  8.40it/s]

tensor(1.4888, grad_fn=<NllLossBackward0>)
tensor(1.4764, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13589/17426 [23:52<07:49,  8.17it/s]

tensor(1.4705, grad_fn=<NllLossBackward0>)
tensor(1.5037, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13591/17426 [23:52<07:40,  8.33it/s]

tensor(1.5335, grad_fn=<NllLossBackward0>)
tensor(1.5062, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13593/17426 [23:53<08:02,  7.95it/s]

tensor(1.4974, grad_fn=<NllLossBackward0>)
tensor(1.5117, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13595/17426 [23:53<07:46,  8.21it/s]

tensor(1.4690, grad_fn=<NllLossBackward0>)
tensor(1.5262, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13597/17426 [23:53<07:51,  8.12it/s]

tensor(1.4968, grad_fn=<NllLossBackward0>)
tensor(1.5019, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13599/17426 [23:53<07:50,  8.13it/s]

tensor(1.4709, grad_fn=<NllLossBackward0>)
tensor(1.5084, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13601/17426 [23:54<08:00,  7.97it/s]

tensor(1.4322, grad_fn=<NllLossBackward0>)
tensor(1.4945, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13603/17426 [23:54<08:40,  7.35it/s]

tensor(1.5112, grad_fn=<NllLossBackward0>)
tensor(1.4919, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13605/17426 [23:54<08:54,  7.15it/s]

tensor(1.4677, grad_fn=<NllLossBackward0>)
tensor(1.5387, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13607/17426 [23:55<08:53,  7.16it/s]

tensor(1.4485, grad_fn=<NllLossBackward0>)
tensor(1.4872, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13609/17426 [23:55<09:17,  6.85it/s]

tensor(1.4920, grad_fn=<NllLossBackward0>)
tensor(1.4659, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13611/17426 [23:55<08:56,  7.10it/s]

tensor(1.5144, grad_fn=<NllLossBackward0>)
tensor(1.4799, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13613/17426 [23:55<08:19,  7.63it/s]

tensor(1.5031, grad_fn=<NllLossBackward0>)
tensor(1.4711, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13615/17426 [23:56<09:01,  7.03it/s]

tensor(1.4855, grad_fn=<NllLossBackward0>)
tensor(1.4839, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13617/17426 [23:56<08:31,  7.45it/s]

tensor(1.5124, grad_fn=<NllLossBackward0>)
tensor(1.4979, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13619/17426 [23:56<08:23,  7.56it/s]

tensor(1.4754, grad_fn=<NllLossBackward0>)
tensor(1.4564, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13621/17426 [23:56<08:25,  7.53it/s]

tensor(1.5120, grad_fn=<NllLossBackward0>)
tensor(1.5389, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13623/17426 [23:57<08:55,  7.10it/s]

tensor(1.5273, grad_fn=<NllLossBackward0>)
tensor(1.5128, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13625/17426 [23:57<09:21,  6.77it/s]

tensor(1.4317, grad_fn=<NllLossBackward0>)
tensor(1.4664, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13627/17426 [23:57<09:12,  6.87it/s]

tensor(1.4907, grad_fn=<NllLossBackward0>)
tensor(1.4942, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13629/17426 [23:58<09:16,  6.83it/s]

tensor(1.4882, grad_fn=<NllLossBackward0>)
tensor(1.4534, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13631/17426 [23:58<09:21,  6.75it/s]

tensor(1.5073, grad_fn=<NllLossBackward0>)
tensor(1.4845, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13633/17426 [23:58<09:57,  6.35it/s]

tensor(1.5083, grad_fn=<NllLossBackward0>)
tensor(1.4987, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13635/17426 [23:59<08:40,  7.28it/s]

tensor(1.4955, grad_fn=<NllLossBackward0>)
tensor(1.5491, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13637/17426 [23:59<08:17,  7.61it/s]

tensor(1.5135, grad_fn=<NllLossBackward0>)
tensor(1.5309, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13639/17426 [23:59<07:56,  7.94it/s]

tensor(1.4329, grad_fn=<NllLossBackward0>)
tensor(1.4854, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13641/17426 [23:59<07:58,  7.90it/s]

tensor(1.4860, grad_fn=<NllLossBackward0>)
tensor(1.4579, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13643/17426 [24:00<07:46,  8.11it/s]

tensor(1.4790, grad_fn=<NllLossBackward0>)
tensor(1.4898, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13645/17426 [24:00<07:34,  8.32it/s]

tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.5114, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13647/17426 [24:00<07:35,  8.29it/s]

tensor(1.4734, grad_fn=<NllLossBackward0>)
tensor(1.5411, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13649/17426 [24:00<07:49,  8.04it/s]

tensor(1.4516, grad_fn=<NllLossBackward0>)
tensor(1.4539, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13651/17426 [24:00<07:43,  8.14it/s]

tensor(1.4673, grad_fn=<NllLossBackward0>)
tensor(1.4787, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13653/17426 [24:01<07:29,  8.39it/s]

tensor(1.5127, grad_fn=<NllLossBackward0>)
tensor(1.5104, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13655/17426 [24:01<07:28,  8.40it/s]

tensor(1.4904, grad_fn=<NllLossBackward0>)
tensor(1.5213, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13657/17426 [24:01<07:33,  8.31it/s]

tensor(1.4700, grad_fn=<NllLossBackward0>)
tensor(1.5120, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13659/17426 [24:01<07:36,  8.25it/s]

tensor(1.4769, grad_fn=<NllLossBackward0>)
tensor(1.4939, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13661/17426 [24:02<07:34,  8.28it/s]

tensor(1.4765, grad_fn=<NllLossBackward0>)
tensor(1.4905, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13663/17426 [24:02<07:31,  8.34it/s]

tensor(1.4888, grad_fn=<NllLossBackward0>)
tensor(1.5235, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13665/17426 [24:02<07:32,  8.32it/s]

tensor(1.5308, grad_fn=<NllLossBackward0>)
tensor(1.4833, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13667/17426 [24:02<07:48,  8.02it/s]

tensor(1.4898, grad_fn=<NllLossBackward0>)
tensor(1.4615, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13669/17426 [24:03<07:32,  8.29it/s]

tensor(1.4051, grad_fn=<NllLossBackward0>)
tensor(1.5161, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13671/17426 [24:03<07:34,  8.26it/s]

tensor(1.4596, grad_fn=<NllLossBackward0>)
tensor(1.4991, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13673/17426 [24:03<07:37,  8.20it/s]

tensor(1.4521, grad_fn=<NllLossBackward0>)
tensor(1.5041, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13675/17426 [24:03<07:24,  8.44it/s]

tensor(1.4480, grad_fn=<NllLossBackward0>)
tensor(1.5203, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13677/17426 [24:04<07:44,  8.06it/s]

tensor(1.4950, grad_fn=<NllLossBackward0>)
tensor(1.4651, grad_fn=<NllLossBackward0>)


 78%|███████▊  | 13679/17426 [24:04<07:35,  8.22it/s]

tensor(1.4161, grad_fn=<NllLossBackward0>)
tensor(1.4685, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13681/17426 [24:04<07:39,  8.14it/s]

tensor(1.4588, grad_fn=<NllLossBackward0>)
tensor(1.4867, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13683/17426 [24:04<07:32,  8.28it/s]

tensor(1.4822, grad_fn=<NllLossBackward0>)
tensor(1.4274, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13685/17426 [24:05<07:41,  8.11it/s]

tensor(1.5046, grad_fn=<NllLossBackward0>)
tensor(1.5367, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13687/17426 [24:05<07:25,  8.39it/s]

tensor(1.5354, grad_fn=<NllLossBackward0>)
tensor(1.5353, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13689/17426 [24:05<07:23,  8.42it/s]

tensor(1.4817, grad_fn=<NllLossBackward0>)
tensor(1.4520, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13691/17426 [24:05<07:28,  8.33it/s]

tensor(1.4645, grad_fn=<NllLossBackward0>)
tensor(1.4959, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13693/17426 [24:06<07:22,  8.44it/s]

tensor(1.4823, grad_fn=<NllLossBackward0>)
tensor(1.5484, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13695/17426 [24:06<07:37,  8.16it/s]

tensor(1.4931, grad_fn=<NllLossBackward0>)
tensor(1.4560, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13697/17426 [24:06<07:31,  8.25it/s]

tensor(1.5092, grad_fn=<NllLossBackward0>)
tensor(1.5034, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13699/17426 [24:06<07:28,  8.30it/s]

tensor(1.4798, grad_fn=<NllLossBackward0>)
tensor(1.4792, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13701/17426 [24:07<07:23,  8.39it/s]

tensor(1.5341, grad_fn=<NllLossBackward0>)
tensor(1.5059, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13703/17426 [24:07<07:58,  7.78it/s]

tensor(1.4460, grad_fn=<NllLossBackward0>)
tensor(1.4793, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13705/17426 [24:07<07:35,  8.18it/s]

tensor(1.5229, grad_fn=<NllLossBackward0>)
tensor(1.4649, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13707/17426 [24:07<07:24,  8.36it/s]

tensor(1.4893, grad_fn=<NllLossBackward0>)
tensor(1.4910, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13709/17426 [24:08<07:19,  8.45it/s]

tensor(1.4900, grad_fn=<NllLossBackward0>)
tensor(1.5534, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13711/17426 [24:08<07:27,  8.30it/s]

tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.4542, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13713/17426 [24:08<07:29,  8.26it/s]

tensor(1.5099, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13715/17426 [24:08<07:28,  8.27it/s]

tensor(1.5288, grad_fn=<NllLossBackward0>)
tensor(1.4836, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13717/17426 [24:09<07:52,  7.86it/s]

tensor(1.5042, grad_fn=<NllLossBackward0>)
tensor(1.4812, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13719/17426 [24:09<08:11,  7.53it/s]

tensor(1.5195, grad_fn=<NllLossBackward0>)
tensor(1.5137, grad_fn=<NllLossBackward0>)


 79%|███████▊  | 13721/17426 [24:09<08:15,  7.48it/s]

tensor(1.5064, grad_fn=<NllLossBackward0>)
tensor(1.5116, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13723/17426 [24:09<08:13,  7.51it/s]

tensor(1.4506, grad_fn=<NllLossBackward0>)
tensor(1.4877, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13725/17426 [24:10<08:20,  7.40it/s]

tensor(1.5096, grad_fn=<NllLossBackward0>)
tensor(1.4909, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13727/17426 [24:10<07:54,  7.79it/s]

tensor(1.5527, grad_fn=<NllLossBackward0>)
tensor(1.4465, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13729/17426 [24:10<07:53,  7.81it/s]

tensor(1.4673, grad_fn=<NllLossBackward0>)
tensor(1.4636, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13731/17426 [24:10<08:22,  7.36it/s]

tensor(1.5276, grad_fn=<NllLossBackward0>)
tensor(1.4941, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13733/17426 [24:11<08:04,  7.62it/s]

tensor(1.4549, grad_fn=<NllLossBackward0>)
tensor(1.5194, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13735/17426 [24:11<07:37,  8.08it/s]

tensor(1.4810, grad_fn=<NllLossBackward0>)
tensor(1.4740, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13737/17426 [24:11<08:11,  7.51it/s]

tensor(1.4712, grad_fn=<NllLossBackward0>)
tensor(1.5137, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13739/17426 [24:11<08:50,  6.95it/s]

tensor(1.4894, grad_fn=<NllLossBackward0>)
tensor(1.5108, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13741/17426 [24:12<08:50,  6.95it/s]

tensor(1.5084, grad_fn=<NllLossBackward0>)
tensor(1.4745, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13743/17426 [24:12<08:42,  7.05it/s]

tensor(1.4663, grad_fn=<NllLossBackward0>)
tensor(1.4959, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13745/17426 [24:12<09:16,  6.61it/s]

tensor(1.4613, grad_fn=<NllLossBackward0>)
tensor(1.4785, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13747/17426 [24:13<09:29,  6.46it/s]

tensor(1.5129, grad_fn=<NllLossBackward0>)
tensor(1.4903, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13749/17426 [24:13<09:28,  6.47it/s]

tensor(1.4906, grad_fn=<NllLossBackward0>)
tensor(1.4908, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13751/17426 [24:13<09:11,  6.66it/s]

tensor(1.4849, grad_fn=<NllLossBackward0>)
tensor(1.5371, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13753/17426 [24:14<08:13,  7.45it/s]

tensor(1.5045, grad_fn=<NllLossBackward0>)
tensor(1.4971, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13755/17426 [24:14<07:48,  7.84it/s]

tensor(1.5079, grad_fn=<NllLossBackward0>)
tensor(1.5638, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13757/17426 [24:14<07:28,  8.17it/s]

tensor(1.4632, grad_fn=<NllLossBackward0>)
tensor(1.4845, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13759/17426 [24:14<07:37,  8.02it/s]

tensor(1.5106, grad_fn=<NllLossBackward0>)
tensor(1.4914, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13761/17426 [24:15<07:31,  8.11it/s]

tensor(1.4918, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13763/17426 [24:15<07:23,  8.25it/s]

tensor(1.4919, grad_fn=<NllLossBackward0>)
tensor(1.4867, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13765/17426 [24:15<07:13,  8.44it/s]

tensor(1.5580, grad_fn=<NllLossBackward0>)
tensor(1.5140, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13767/17426 [24:15<07:17,  8.36it/s]

tensor(1.5025, grad_fn=<NllLossBackward0>)
tensor(1.4769, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13769/17426 [24:15<07:25,  8.20it/s]

tensor(1.4860, grad_fn=<NllLossBackward0>)
tensor(1.4714, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13771/17426 [24:16<07:12,  8.45it/s]

tensor(1.4702, grad_fn=<NllLossBackward0>)
tensor(1.4635, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13773/17426 [24:16<07:08,  8.52it/s]

tensor(1.5364, grad_fn=<NllLossBackward0>)
tensor(1.4674, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13775/17426 [24:16<07:06,  8.55it/s]

tensor(1.4751, grad_fn=<NllLossBackward0>)
tensor(1.4930, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13777/17426 [24:16<07:24,  8.22it/s]

tensor(1.4817, grad_fn=<NllLossBackward0>)
tensor(1.5215, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13779/17426 [24:17<07:22,  8.24it/s]

tensor(1.5417, grad_fn=<NllLossBackward0>)
tensor(1.4895, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13781/17426 [24:17<07:09,  8.48it/s]

tensor(1.4795, grad_fn=<NllLossBackward0>)
tensor(1.4902, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13783/17426 [24:17<07:14,  8.38it/s]

tensor(1.4393, grad_fn=<NllLossBackward0>)
tensor(1.4718, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13785/17426 [24:17<07:07,  8.52it/s]

tensor(1.5255, grad_fn=<NllLossBackward0>)
tensor(1.4681, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13787/17426 [24:18<07:26,  8.14it/s]

tensor(1.4493, grad_fn=<NllLossBackward0>)
tensor(1.4833, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13789/17426 [24:18<07:12,  8.41it/s]

tensor(1.4776, grad_fn=<NllLossBackward0>)
tensor(1.4589, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13791/17426 [24:18<07:10,  8.44it/s]

tensor(1.4608, grad_fn=<NllLossBackward0>)
tensor(1.4799, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13793/17426 [24:18<07:12,  8.40it/s]

tensor(1.5101, grad_fn=<NllLossBackward0>)
tensor(1.4732, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13795/17426 [24:19<07:33,  8.01it/s]

tensor(1.5231, grad_fn=<NllLossBackward0>)
tensor(1.4649, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13797/17426 [24:19<07:36,  7.96it/s]

tensor(1.4335, grad_fn=<NllLossBackward0>)
tensor(1.5201, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13799/17426 [24:19<07:23,  8.18it/s]

tensor(1.5347, grad_fn=<NllLossBackward0>)
tensor(1.5120, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13801/17426 [24:19<07:22,  8.19it/s]

tensor(1.5351, grad_fn=<NllLossBackward0>)
tensor(1.4555, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13803/17426 [24:20<07:11,  8.39it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.4738, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13805/17426 [24:20<07:28,  8.08it/s]

tensor(1.4475, grad_fn=<NllLossBackward0>)
tensor(1.4949, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13807/17426 [24:20<07:25,  8.12it/s]

tensor(1.5326, grad_fn=<NllLossBackward0>)
tensor(1.4913, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13809/17426 [24:20<07:15,  8.31it/s]

tensor(1.5040, grad_fn=<NllLossBackward0>)
tensor(1.5369, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13811/17426 [24:21<07:19,  8.22it/s]

tensor(1.4704, grad_fn=<NllLossBackward0>)
tensor(1.4957, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13813/17426 [24:21<07:35,  7.94it/s]

tensor(1.4928, grad_fn=<NllLossBackward0>)
tensor(1.5147, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13815/17426 [24:21<07:35,  7.92it/s]

tensor(1.4851, grad_fn=<NllLossBackward0>)
tensor(1.4912, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13817/17426 [24:21<07:27,  8.06it/s]

tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.4985, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13819/17426 [24:22<07:28,  8.05it/s]

tensor(1.4680, grad_fn=<NllLossBackward0>)
tensor(1.4634, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13821/17426 [24:22<07:39,  7.84it/s]

tensor(1.4985, grad_fn=<NllLossBackward0>)
tensor(1.4414, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13823/17426 [24:22<07:23,  8.13it/s]

tensor(1.5164, grad_fn=<NllLossBackward0>)
tensor(1.4821, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13825/17426 [24:22<07:23,  8.11it/s]

tensor(1.4957, grad_fn=<NllLossBackward0>)
tensor(1.4689, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13827/17426 [24:23<07:19,  8.20it/s]

tensor(1.4411, grad_fn=<NllLossBackward0>)
tensor(1.5304, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13829/17426 [24:23<07:21,  8.14it/s]

tensor(1.4665, grad_fn=<NllLossBackward0>)
tensor(1.4659, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13831/17426 [24:23<07:14,  8.26it/s]

tensor(1.5225, grad_fn=<NllLossBackward0>)
tensor(1.4820, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13833/17426 [24:23<07:25,  8.06it/s]

tensor(1.4880, grad_fn=<NllLossBackward0>)
tensor(1.4672, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13835/17426 [24:24<07:46,  7.69it/s]

tensor(1.4739, grad_fn=<NllLossBackward0>)
tensor(1.5287, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13837/17426 [24:24<07:54,  7.56it/s]

tensor(1.4711, grad_fn=<NllLossBackward0>)
tensor(1.5533, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13839/17426 [24:24<08:09,  7.33it/s]

tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.4810, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13841/17426 [24:24<07:54,  7.55it/s]

tensor(1.4865, grad_fn=<NllLossBackward0>)
tensor(1.4831, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13843/17426 [24:25<07:48,  7.64it/s]

tensor(1.5114, grad_fn=<NllLossBackward0>)
tensor(1.4755, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13845/17426 [24:25<07:23,  8.08it/s]

tensor(1.4820, grad_fn=<NllLossBackward0>)
tensor(1.4653, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13847/17426 [24:25<07:35,  7.85it/s]

tensor(1.4703, grad_fn=<NllLossBackward0>)
tensor(1.5436, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13849/17426 [24:25<07:31,  7.93it/s]

tensor(1.4473, grad_fn=<NllLossBackward0>)
tensor(1.4702, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13851/17426 [24:26<07:12,  8.26it/s]

tensor(1.4922, grad_fn=<NllLossBackward0>)
tensor(1.4681, grad_fn=<NllLossBackward0>)


 79%|███████▉  | 13853/17426 [24:26<07:01,  8.47it/s]

tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.4495, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13855/17426 [24:26<07:12,  8.25it/s]

tensor(1.4567, grad_fn=<NllLossBackward0>)
tensor(1.5313, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13857/17426 [24:26<07:45,  7.67it/s]

tensor(1.4825, grad_fn=<NllLossBackward0>)
tensor(1.4821, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13859/17426 [24:27<08:19,  7.13it/s]

tensor(1.4551, grad_fn=<NllLossBackward0>)
tensor(1.4905, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13861/17426 [24:27<08:05,  7.34it/s]

tensor(1.4724, grad_fn=<NllLossBackward0>)
tensor(1.5013, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13863/17426 [24:27<08:08,  7.30it/s]

tensor(1.4961, grad_fn=<NllLossBackward0>)
tensor(1.4580, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13865/17426 [24:28<08:36,  6.89it/s]

tensor(1.4707, grad_fn=<NllLossBackward0>)
tensor(1.4791, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13867/17426 [24:28<08:22,  7.08it/s]

tensor(1.4744, grad_fn=<NllLossBackward0>)
tensor(1.5008, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13869/17426 [24:28<09:10,  6.47it/s]

tensor(1.5032, grad_fn=<NllLossBackward0>)
tensor(1.4841, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13871/17426 [24:28<08:38,  6.85it/s]

tensor(1.4794, grad_fn=<NllLossBackward0>)
tensor(1.5007, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13873/17426 [24:29<07:42,  7.68it/s]

tensor(1.4776, grad_fn=<NllLossBackward0>)
tensor(1.5131, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13875/17426 [24:29<07:20,  8.06it/s]

tensor(1.4578, grad_fn=<NllLossBackward0>)
tensor(1.4916, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13877/17426 [24:29<07:15,  8.14it/s]

tensor(1.4744, grad_fn=<NllLossBackward0>)
tensor(1.4669, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13879/17426 [24:29<07:17,  8.11it/s]

tensor(1.5097, grad_fn=<NllLossBackward0>)
tensor(1.4909, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13881/17426 [24:30<07:15,  8.15it/s]

tensor(1.5218, grad_fn=<NllLossBackward0>)
tensor(1.5265, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13883/17426 [24:30<07:14,  8.14it/s]

tensor(1.5015, grad_fn=<NllLossBackward0>)
tensor(1.4211, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13885/17426 [24:30<07:05,  8.31it/s]

tensor(1.5368, grad_fn=<NllLossBackward0>)
tensor(1.4471, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13887/17426 [24:30<07:03,  8.36it/s]

tensor(1.4410, grad_fn=<NllLossBackward0>)
tensor(1.4789, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13889/17426 [24:31<07:10,  8.22it/s]

tensor(1.4710, grad_fn=<NllLossBackward0>)
tensor(1.4881, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13891/17426 [24:31<07:04,  8.32it/s]

tensor(1.4856, grad_fn=<NllLossBackward0>)
tensor(1.5255, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13893/17426 [24:31<07:00,  8.40it/s]

tensor(1.4465, grad_fn=<NllLossBackward0>)
tensor(1.5468, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13895/17426 [24:31<07:01,  8.37it/s]

tensor(1.4681, grad_fn=<NllLossBackward0>)
tensor(1.4863, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13897/17426 [24:32<07:05,  8.30it/s]

tensor(1.4782, grad_fn=<NllLossBackward0>)
tensor(1.4921, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13899/17426 [24:32<07:08,  8.22it/s]

tensor(1.5086, grad_fn=<NllLossBackward0>)
tensor(1.4572, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13901/17426 [24:32<07:01,  8.36it/s]

tensor(1.5278, grad_fn=<NllLossBackward0>)
tensor(1.4882, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13903/17426 [24:32<06:58,  8.41it/s]

tensor(1.5236, grad_fn=<NllLossBackward0>)
tensor(1.4740, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13905/17426 [24:32<06:56,  8.45it/s]

tensor(1.4709, grad_fn=<NllLossBackward0>)
tensor(1.4984, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13907/17426 [24:33<07:08,  8.22it/s]

tensor(1.5082, grad_fn=<NllLossBackward0>)
tensor(1.4803, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13909/17426 [24:33<07:00,  8.36it/s]

tensor(1.4907, grad_fn=<NllLossBackward0>)
tensor(1.4888, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13911/17426 [24:33<06:55,  8.47it/s]

tensor(1.5160, grad_fn=<NllLossBackward0>)
tensor(1.5048, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13913/17426 [24:33<06:49,  8.59it/s]

tensor(1.4788, grad_fn=<NllLossBackward0>)
tensor(1.4721, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13915/17426 [24:34<07:01,  8.33it/s]

tensor(1.4688, grad_fn=<NllLossBackward0>)
tensor(1.4694, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13917/17426 [24:34<06:54,  8.46it/s]

tensor(1.4690, grad_fn=<NllLossBackward0>)
tensor(1.4821, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13919/17426 [24:34<07:00,  8.34it/s]

tensor(1.4732, grad_fn=<NllLossBackward0>)
tensor(1.5024, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13921/17426 [24:34<06:56,  8.42it/s]

tensor(1.5047, grad_fn=<NllLossBackward0>)
tensor(1.5116, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13923/17426 [24:35<06:51,  8.51it/s]

tensor(1.4959, grad_fn=<NllLossBackward0>)
tensor(1.5029, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13925/17426 [24:35<07:04,  8.25it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.4528, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13927/17426 [24:35<06:59,  8.33it/s]

tensor(1.4244, grad_fn=<NllLossBackward0>)
tensor(1.5494, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13929/17426 [24:35<06:58,  8.36it/s]

tensor(1.4641, grad_fn=<NllLossBackward0>)
tensor(1.4516, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13931/17426 [24:36<06:56,  8.39it/s]

tensor(1.4878, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13933/17426 [24:36<07:20,  7.93it/s]

tensor(1.4585, grad_fn=<NllLossBackward0>)
tensor(1.4893, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13935/17426 [24:36<07:06,  8.18it/s]

tensor(1.4552, grad_fn=<NllLossBackward0>)
tensor(1.4793, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13937/17426 [24:36<06:53,  8.44it/s]

tensor(1.5178, grad_fn=<NllLossBackward0>)
tensor(1.4594, grad_fn=<NllLossBackward0>)


 80%|███████▉  | 13939/17426 [24:37<07:20,  7.92it/s]

tensor(1.4628, grad_fn=<NllLossBackward0>)
tensor(1.5310, grad_fn=<NllLossBackward0>)


 80%|████████  | 13941/17426 [24:37<08:45,  6.63it/s]

tensor(1.5091, grad_fn=<NllLossBackward0>)
tensor(1.4634, grad_fn=<NllLossBackward0>)


 80%|████████  | 13943/17426 [24:37<08:28,  6.85it/s]

tensor(1.4764, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)


 80%|████████  | 13945/17426 [24:37<07:34,  7.66it/s]

tensor(1.4714, grad_fn=<NllLossBackward0>)
tensor(1.4517, grad_fn=<NllLossBackward0>)


 80%|████████  | 13947/17426 [24:38<07:10,  8.07it/s]

tensor(1.4483, grad_fn=<NllLossBackward0>)
tensor(1.4922, grad_fn=<NllLossBackward0>)


 80%|████████  | 13949/17426 [24:38<07:25,  7.80it/s]

tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.5131, grad_fn=<NllLossBackward0>)


 80%|████████  | 13951/17426 [24:38<07:33,  7.66it/s]

tensor(1.5378, grad_fn=<NllLossBackward0>)
tensor(1.4958, grad_fn=<NllLossBackward0>)


 80%|████████  | 13953/17426 [24:38<07:30,  7.70it/s]

tensor(1.4826, grad_fn=<NllLossBackward0>)
tensor(1.5107, grad_fn=<NllLossBackward0>)


 80%|████████  | 13955/17426 [24:39<08:04,  7.16it/s]

tensor(1.4980, grad_fn=<NllLossBackward0>)
tensor(1.4904, grad_fn=<NllLossBackward0>)


 80%|████████  | 13957/17426 [24:39<08:24,  6.88it/s]

tensor(1.4783, grad_fn=<NllLossBackward0>)
tensor(1.4871, grad_fn=<NllLossBackward0>)


 80%|████████  | 13959/17426 [24:39<08:18,  6.95it/s]

tensor(1.4700, grad_fn=<NllLossBackward0>)
tensor(1.4508, grad_fn=<NllLossBackward0>)


 80%|████████  | 13961/17426 [24:40<07:59,  7.23it/s]

tensor(1.4527, grad_fn=<NllLossBackward0>)
tensor(1.5000, grad_fn=<NllLossBackward0>)


 80%|████████  | 13963/17426 [24:40<07:43,  7.48it/s]

tensor(1.4480, grad_fn=<NllLossBackward0>)
tensor(1.4916, grad_fn=<NllLossBackward0>)


 80%|████████  | 13965/17426 [24:40<07:40,  7.52it/s]

tensor(1.4482, grad_fn=<NllLossBackward0>)
tensor(1.5083, grad_fn=<NllLossBackward0>)


 80%|████████  | 13967/17426 [24:40<08:09,  7.06it/s]

tensor(1.4824, grad_fn=<NllLossBackward0>)
tensor(1.4666, grad_fn=<NllLossBackward0>)


 80%|████████  | 13969/17426 [24:41<07:38,  7.55it/s]

tensor(1.5091, grad_fn=<NllLossBackward0>)
tensor(1.5455, grad_fn=<NllLossBackward0>)


 80%|████████  | 13971/17426 [24:41<07:38,  7.54it/s]

tensor(1.4892, grad_fn=<NllLossBackward0>)
tensor(1.5032, grad_fn=<NllLossBackward0>)


 80%|████████  | 13973/17426 [24:41<08:11,  7.03it/s]

tensor(1.4763, grad_fn=<NllLossBackward0>)
tensor(1.4849, grad_fn=<NllLossBackward0>)


 80%|████████  | 13975/17426 [24:42<08:22,  6.86it/s]

tensor(1.5161, grad_fn=<NllLossBackward0>)
tensor(1.4595, grad_fn=<NllLossBackward0>)


 80%|████████  | 13977/17426 [24:42<08:28,  6.78it/s]

tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.4975, grad_fn=<NllLossBackward0>)


 80%|████████  | 13979/17426 [24:42<08:36,  6.68it/s]

tensor(1.4518, grad_fn=<NllLossBackward0>)
tensor(1.4776, grad_fn=<NllLossBackward0>)


 80%|████████  | 13981/17426 [24:42<08:27,  6.79it/s]

tensor(1.5175, grad_fn=<NllLossBackward0>)
tensor(1.4316, grad_fn=<NllLossBackward0>)


 80%|████████  | 13983/17426 [24:43<08:33,  6.71it/s]

tensor(1.5114, grad_fn=<NllLossBackward0>)
tensor(1.4948, grad_fn=<NllLossBackward0>)


 80%|████████  | 13985/17426 [24:43<08:02,  7.14it/s]

tensor(1.4582, grad_fn=<NllLossBackward0>)
tensor(1.4659, grad_fn=<NllLossBackward0>)


 80%|████████  | 13987/17426 [24:43<07:46,  7.37it/s]

tensor(1.4673, grad_fn=<NllLossBackward0>)
tensor(1.4697, grad_fn=<NllLossBackward0>)


 80%|████████  | 13989/17426 [24:44<07:09,  7.99it/s]

tensor(1.4546, grad_fn=<NllLossBackward0>)
tensor(1.4717, grad_fn=<NllLossBackward0>)


 80%|████████  | 13991/17426 [24:44<06:59,  8.19it/s]

tensor(1.5042, grad_fn=<NllLossBackward0>)
tensor(1.4491, grad_fn=<NllLossBackward0>)


 80%|████████  | 13993/17426 [24:44<06:49,  8.39it/s]

tensor(1.4833, grad_fn=<NllLossBackward0>)
tensor(1.4538, grad_fn=<NllLossBackward0>)


 80%|████████  | 13995/17426 [24:44<06:43,  8.51it/s]

tensor(1.5265, grad_fn=<NllLossBackward0>)
tensor(1.4924, grad_fn=<NllLossBackward0>)


 80%|████████  | 13997/17426 [24:45<06:55,  8.24it/s]

tensor(1.5087, grad_fn=<NllLossBackward0>)
tensor(1.4928, grad_fn=<NllLossBackward0>)


 80%|████████  | 13999/17426 [24:45<06:51,  8.33it/s]

tensor(1.5026, grad_fn=<NllLossBackward0>)
tensor(1.4941, grad_fn=<NllLossBackward0>)


 80%|████████  | 14001/17426 [24:45<06:54,  8.26it/s]

tensor(1.4808, grad_fn=<NllLossBackward0>)
tensor(1.4895, grad_fn=<NllLossBackward0>)


 80%|████████  | 14003/17426 [24:45<07:13,  7.90it/s]

tensor(1.5188, grad_fn=<NllLossBackward0>)
tensor(1.5014, grad_fn=<NllLossBackward0>)


 80%|████████  | 14005/17426 [24:46<07:33,  7.55it/s]

tensor(1.4891, grad_fn=<NllLossBackward0>)
tensor(1.5154, grad_fn=<NllLossBackward0>)


 80%|████████  | 14007/17426 [24:46<07:12,  7.91it/s]

tensor(1.4756, grad_fn=<NllLossBackward0>)
tensor(1.5398, grad_fn=<NllLossBackward0>)


 80%|████████  | 14009/17426 [24:46<07:01,  8.11it/s]

tensor(1.5747, grad_fn=<NllLossBackward0>)
tensor(1.4461, grad_fn=<NllLossBackward0>)


 80%|████████  | 14011/17426 [24:46<06:56,  8.19it/s]

tensor(1.4937, grad_fn=<NllLossBackward0>)
tensor(1.4779, grad_fn=<NllLossBackward0>)


 80%|████████  | 14013/17426 [24:47<07:07,  7.98it/s]

tensor(1.4656, grad_fn=<NllLossBackward0>)
tensor(1.5017, grad_fn=<NllLossBackward0>)


 80%|████████  | 14015/17426 [24:47<07:01,  8.09it/s]

tensor(1.5002, grad_fn=<NllLossBackward0>)
tensor(1.5155, grad_fn=<NllLossBackward0>)


 80%|████████  | 14017/17426 [24:47<06:51,  8.29it/s]

tensor(1.4876, grad_fn=<NllLossBackward0>)
tensor(1.5132, grad_fn=<NllLossBackward0>)


 80%|████████  | 14019/17426 [24:47<06:48,  8.35it/s]

tensor(1.4491, grad_fn=<NllLossBackward0>)
tensor(1.5129, grad_fn=<NllLossBackward0>)


 80%|████████  | 14021/17426 [24:47<06:50,  8.30it/s]

tensor(1.5226, grad_fn=<NllLossBackward0>)
tensor(1.4530, grad_fn=<NllLossBackward0>)


 80%|████████  | 14023/17426 [24:48<07:05,  7.99it/s]

tensor(1.5276, grad_fn=<NllLossBackward0>)
tensor(1.4535, grad_fn=<NllLossBackward0>)


 80%|████████  | 14025/17426 [24:48<06:53,  8.23it/s]

tensor(1.4729, grad_fn=<NllLossBackward0>)
tensor(1.4549, grad_fn=<NllLossBackward0>)


 80%|████████  | 14027/17426 [24:48<06:41,  8.46it/s]

tensor(1.5441, grad_fn=<NllLossBackward0>)
tensor(1.5032, grad_fn=<NllLossBackward0>)


 81%|████████  | 14029/17426 [24:48<06:38,  8.52it/s]

tensor(1.5135, grad_fn=<NllLossBackward0>)
tensor(1.4907, grad_fn=<NllLossBackward0>)


 81%|████████  | 14031/17426 [24:49<06:58,  8.11it/s]

tensor(1.4552, grad_fn=<NllLossBackward0>)
tensor(1.4582, grad_fn=<NllLossBackward0>)


 81%|████████  | 14033/17426 [24:49<06:49,  8.29it/s]

tensor(1.4935, grad_fn=<NllLossBackward0>)
tensor(1.4841, grad_fn=<NllLossBackward0>)


 81%|████████  | 14035/17426 [24:49<06:48,  8.30it/s]

tensor(1.4640, grad_fn=<NllLossBackward0>)
tensor(1.4627, grad_fn=<NllLossBackward0>)


 81%|████████  | 14037/17426 [24:49<06:49,  8.28it/s]

tensor(1.5027, grad_fn=<NllLossBackward0>)
tensor(1.5285, grad_fn=<NllLossBackward0>)


 81%|████████  | 14039/17426 [24:50<06:49,  8.27it/s]

tensor(1.4895, grad_fn=<NllLossBackward0>)
tensor(1.4685, grad_fn=<NllLossBackward0>)


 81%|████████  | 14041/17426 [24:50<06:47,  8.32it/s]

tensor(1.4754, grad_fn=<NllLossBackward0>)
tensor(1.4865, grad_fn=<NllLossBackward0>)


 81%|████████  | 14043/17426 [24:50<06:42,  8.41it/s]

tensor(1.5092, grad_fn=<NllLossBackward0>)
tensor(1.4791, grad_fn=<NllLossBackward0>)


 81%|████████  | 14045/17426 [24:50<06:51,  8.21it/s]

tensor(1.5098, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)


 81%|████████  | 14047/17426 [24:51<06:41,  8.42it/s]

tensor(1.5124, grad_fn=<NllLossBackward0>)
tensor(1.4827, grad_fn=<NllLossBackward0>)


 81%|████████  | 14049/17426 [24:51<06:54,  8.14it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5032, grad_fn=<NllLossBackward0>)


 81%|████████  | 14051/17426 [24:51<06:49,  8.24it/s]

tensor(1.5127, grad_fn=<NllLossBackward0>)
tensor(1.4525, grad_fn=<NllLossBackward0>)


 81%|████████  | 14053/17426 [24:51<06:55,  8.12it/s]

tensor(1.4828, grad_fn=<NllLossBackward0>)
tensor(1.4342, grad_fn=<NllLossBackward0>)


 81%|████████  | 14055/17426 [24:52<06:52,  8.17it/s]

tensor(1.4936, grad_fn=<NllLossBackward0>)
tensor(1.5111, grad_fn=<NllLossBackward0>)


 81%|████████  | 14057/17426 [24:52<07:05,  7.93it/s]

tensor(1.4925, grad_fn=<NllLossBackward0>)
tensor(1.4990, grad_fn=<NllLossBackward0>)


 81%|████████  | 14059/17426 [24:52<06:47,  8.25it/s]

tensor(1.4825, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)


 81%|████████  | 14061/17426 [24:52<06:45,  8.29it/s]

tensor(1.4739, grad_fn=<NllLossBackward0>)
tensor(1.4375, grad_fn=<NllLossBackward0>)


 81%|████████  | 14063/17426 [24:53<06:46,  8.27it/s]

tensor(1.5169, grad_fn=<NllLossBackward0>)
tensor(1.4191, grad_fn=<NllLossBackward0>)


 81%|████████  | 14065/17426 [24:53<06:47,  8.25it/s]

tensor(1.5055, grad_fn=<NllLossBackward0>)
tensor(1.4845, grad_fn=<NllLossBackward0>)


 81%|████████  | 14067/17426 [24:53<07:22,  7.60it/s]

tensor(1.4677, grad_fn=<NllLossBackward0>)
tensor(1.4848, grad_fn=<NllLossBackward0>)


 81%|████████  | 14069/17426 [24:53<07:48,  7.16it/s]

tensor(1.4322, grad_fn=<NllLossBackward0>)
tensor(1.4710, grad_fn=<NllLossBackward0>)


 81%|████████  | 14071/17426 [24:54<07:24,  7.55it/s]

tensor(1.4609, grad_fn=<NllLossBackward0>)
tensor(1.4678, grad_fn=<NllLossBackward0>)


 81%|████████  | 14073/17426 [24:54<07:32,  7.41it/s]

tensor(1.4872, grad_fn=<NllLossBackward0>)
tensor(1.5484, grad_fn=<NllLossBackward0>)


 81%|████████  | 14075/17426 [24:54<07:33,  7.38it/s]

tensor(1.5044, grad_fn=<NllLossBackward0>)
tensor(1.4832, grad_fn=<NllLossBackward0>)


 81%|████████  | 14077/17426 [24:54<07:38,  7.31it/s]

tensor(1.4437, grad_fn=<NllLossBackward0>)
tensor(1.5243, grad_fn=<NllLossBackward0>)


 81%|████████  | 14079/17426 [24:55<07:18,  7.63it/s]

tensor(1.4800, grad_fn=<NllLossBackward0>)
tensor(1.4952, grad_fn=<NllLossBackward0>)


 81%|████████  | 14081/17426 [24:55<07:38,  7.29it/s]

tensor(1.4678, grad_fn=<NllLossBackward0>)
tensor(1.4957, grad_fn=<NllLossBackward0>)


 81%|████████  | 14083/17426 [24:55<07:35,  7.34it/s]

tensor(1.4947, grad_fn=<NllLossBackward0>)
tensor(1.5043, grad_fn=<NllLossBackward0>)


 81%|████████  | 14085/17426 [24:56<07:46,  7.17it/s]

tensor(1.4203, grad_fn=<NllLossBackward0>)
tensor(1.4396, grad_fn=<NllLossBackward0>)


 81%|████████  | 14087/17426 [24:56<08:30,  6.54it/s]

tensor(1.4848, grad_fn=<NllLossBackward0>)
tensor(1.5408, grad_fn=<NllLossBackward0>)


 81%|████████  | 14089/17426 [24:56<08:42,  6.39it/s]

tensor(1.4830, grad_fn=<NllLossBackward0>)
tensor(1.4731, grad_fn=<NllLossBackward0>)


 81%|████████  | 14091/17426 [24:57<08:33,  6.50it/s]

tensor(1.5008, grad_fn=<NllLossBackward0>)
tensor(1.4367, grad_fn=<NllLossBackward0>)


 81%|████████  | 14093/17426 [24:57<08:50,  6.28it/s]

tensor(1.4984, grad_fn=<NllLossBackward0>)
tensor(1.4950, grad_fn=<NllLossBackward0>)


 81%|████████  | 14095/17426 [24:57<08:32,  6.49it/s]

tensor(1.4962, grad_fn=<NllLossBackward0>)
tensor(1.4885, grad_fn=<NllLossBackward0>)


 81%|████████  | 14097/17426 [24:57<08:44,  6.34it/s]

tensor(1.4797, grad_fn=<NllLossBackward0>)
tensor(1.4838, grad_fn=<NllLossBackward0>)


 81%|████████  | 14099/17426 [24:58<07:53,  7.03it/s]

tensor(1.4686, grad_fn=<NllLossBackward0>)
tensor(1.4786, grad_fn=<NllLossBackward0>)


 81%|████████  | 14101/17426 [24:58<07:11,  7.71it/s]

tensor(1.4895, grad_fn=<NllLossBackward0>)
tensor(1.4634, grad_fn=<NllLossBackward0>)


 81%|████████  | 14103/17426 [24:58<06:49,  8.11it/s]

tensor(1.4718, grad_fn=<NllLossBackward0>)
tensor(1.4944, grad_fn=<NllLossBackward0>)


 81%|████████  | 14105/17426 [24:58<07:01,  7.88it/s]

tensor(1.4745, grad_fn=<NllLossBackward0>)
tensor(1.4602, grad_fn=<NllLossBackward0>)


 81%|████████  | 14107/17426 [24:59<06:47,  8.14it/s]

tensor(1.5112, grad_fn=<NllLossBackward0>)
tensor(1.5285, grad_fn=<NllLossBackward0>)


 81%|████████  | 14109/17426 [24:59<06:35,  8.40it/s]

tensor(1.4822, grad_fn=<NllLossBackward0>)
tensor(1.4607, grad_fn=<NllLossBackward0>)


 81%|████████  | 14111/17426 [24:59<06:30,  8.49it/s]

tensor(1.4880, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)


 81%|████████  | 14113/17426 [24:59<06:34,  8.41it/s]

tensor(1.4880, grad_fn=<NllLossBackward0>)
tensor(1.4784, grad_fn=<NllLossBackward0>)


 81%|████████  | 14115/17426 [25:00<06:36,  8.35it/s]

tensor(1.4748, grad_fn=<NllLossBackward0>)
tensor(1.4985, grad_fn=<NllLossBackward0>)


 81%|████████  | 14117/17426 [25:00<06:47,  8.12it/s]

tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.4432, grad_fn=<NllLossBackward0>)


 81%|████████  | 14119/17426 [25:00<06:45,  8.15it/s]

tensor(1.4921, grad_fn=<NllLossBackward0>)
tensor(1.4968, grad_fn=<NllLossBackward0>)


 81%|████████  | 14121/17426 [25:00<06:47,  8.11it/s]

tensor(1.5013, grad_fn=<NllLossBackward0>)
tensor(1.4737, grad_fn=<NllLossBackward0>)


 81%|████████  | 14123/17426 [25:01<06:47,  8.11it/s]

tensor(1.4670, grad_fn=<NllLossBackward0>)
tensor(1.4969, grad_fn=<NllLossBackward0>)


 81%|████████  | 14125/17426 [25:01<06:40,  8.23it/s]

tensor(1.4885, grad_fn=<NllLossBackward0>)
tensor(1.5249, grad_fn=<NllLossBackward0>)


 81%|████████  | 14127/17426 [25:01<06:43,  8.17it/s]

tensor(1.4496, grad_fn=<NllLossBackward0>)
tensor(1.5107, grad_fn=<NllLossBackward0>)


 81%|████████  | 14129/17426 [25:01<06:31,  8.42it/s]

tensor(1.5083, grad_fn=<NllLossBackward0>)
tensor(1.4994, grad_fn=<NllLossBackward0>)


 81%|████████  | 14131/17426 [25:02<06:39,  8.24it/s]

tensor(1.4759, grad_fn=<NllLossBackward0>)
tensor(1.5119, grad_fn=<NllLossBackward0>)


 81%|████████  | 14133/17426 [25:02<06:35,  8.33it/s]

tensor(1.4633, grad_fn=<NllLossBackward0>)
tensor(1.4936, grad_fn=<NllLossBackward0>)


 81%|████████  | 14135/17426 [25:02<06:33,  8.35it/s]

tensor(1.4925, grad_fn=<NllLossBackward0>)
tensor(1.4654, grad_fn=<NllLossBackward0>)


 81%|████████  | 14137/17426 [25:02<06:36,  8.29it/s]

tensor(1.4861, grad_fn=<NllLossBackward0>)
tensor(1.5182, grad_fn=<NllLossBackward0>)


 81%|████████  | 14139/17426 [25:03<06:39,  8.24it/s]

tensor(1.4817, grad_fn=<NllLossBackward0>)
tensor(1.5069, grad_fn=<NllLossBackward0>)


 81%|████████  | 14141/17426 [25:03<06:44,  8.11it/s]

tensor(1.4868, grad_fn=<NllLossBackward0>)
tensor(1.4803, grad_fn=<NllLossBackward0>)


 81%|████████  | 14143/17426 [25:03<06:39,  8.22it/s]

tensor(1.4350, grad_fn=<NllLossBackward0>)
tensor(1.5059, grad_fn=<NllLossBackward0>)


 81%|████████  | 14145/17426 [25:03<06:43,  8.13it/s]

tensor(1.4635, grad_fn=<NllLossBackward0>)
tensor(1.4746, grad_fn=<NllLossBackward0>)


 81%|████████  | 14147/17426 [25:04<06:46,  8.07it/s]

tensor(1.4904, grad_fn=<NllLossBackward0>)
tensor(1.4282, grad_fn=<NllLossBackward0>)


 81%|████████  | 14149/17426 [25:04<06:41,  8.16it/s]

tensor(1.4945, grad_fn=<NllLossBackward0>)
tensor(1.4962, grad_fn=<NllLossBackward0>)


 81%|████████  | 14151/17426 [25:04<06:35,  8.29it/s]

tensor(1.4930, grad_fn=<NllLossBackward0>)
tensor(1.4768, grad_fn=<NllLossBackward0>)


 81%|████████  | 14153/17426 [25:04<06:33,  8.32it/s]

tensor(1.4424, grad_fn=<NllLossBackward0>)
tensor(1.4785, grad_fn=<NllLossBackward0>)


 81%|████████  | 14155/17426 [25:05<06:23,  8.53it/s]

tensor(1.4970, grad_fn=<NllLossBackward0>)
tensor(1.4961, grad_fn=<NllLossBackward0>)


 81%|████████  | 14157/17426 [25:05<06:33,  8.30it/s]

tensor(1.4869, grad_fn=<NllLossBackward0>)
tensor(1.4808, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14159/17426 [25:05<06:29,  8.40it/s]

tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.4813, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14161/17426 [25:05<06:24,  8.49it/s]

tensor(1.4632, grad_fn=<NllLossBackward0>)
tensor(1.4875, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14163/17426 [25:05<06:26,  8.44it/s]

tensor(1.4481, grad_fn=<NllLossBackward0>)
tensor(1.4813, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14165/17426 [25:06<06:46,  8.02it/s]

tensor(1.5139, grad_fn=<NllLossBackward0>)
tensor(1.4476, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14167/17426 [25:06<06:36,  8.22it/s]

tensor(1.4600, grad_fn=<NllLossBackward0>)
tensor(1.4677, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14169/17426 [25:06<06:29,  8.36it/s]

tensor(1.4950, grad_fn=<NllLossBackward0>)
tensor(1.5031, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14171/17426 [25:06<06:26,  8.42it/s]

tensor(1.5150, grad_fn=<NllLossBackward0>)
tensor(1.4582, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14173/17426 [25:07<06:30,  8.34it/s]

tensor(1.4794, grad_fn=<NllLossBackward0>)
tensor(1.4521, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14175/17426 [25:07<06:41,  8.10it/s]

tensor(1.4565, grad_fn=<NllLossBackward0>)
tensor(1.4850, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14177/17426 [25:07<06:43,  8.06it/s]

tensor(1.5310, grad_fn=<NllLossBackward0>)
tensor(1.5195, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14179/17426 [25:07<06:31,  8.30it/s]

tensor(1.4737, grad_fn=<NllLossBackward0>)
tensor(1.4906, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14181/17426 [25:08<06:48,  7.94it/s]

tensor(1.4744, grad_fn=<NllLossBackward0>)
tensor(1.4855, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14183/17426 [25:08<07:15,  7.45it/s]

tensor(1.4579, grad_fn=<NllLossBackward0>)
tensor(1.4799, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14185/17426 [25:08<07:15,  7.44it/s]

tensor(1.4767, grad_fn=<NllLossBackward0>)
tensor(1.5114, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14187/17426 [25:09<06:59,  7.73it/s]

tensor(1.4561, grad_fn=<NllLossBackward0>)
tensor(1.4855, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14189/17426 [25:09<06:48,  7.92it/s]

tensor(1.4641, grad_fn=<NllLossBackward0>)
tensor(1.4992, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14191/17426 [25:09<06:43,  8.01it/s]

tensor(1.4788, grad_fn=<NllLossBackward0>)
tensor(1.4882, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14193/17426 [25:09<06:31,  8.26it/s]

tensor(1.4511, grad_fn=<NllLossBackward0>)
tensor(1.4711, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14195/17426 [25:10<06:43,  8.01it/s]

tensor(1.4340, grad_fn=<NllLossBackward0>)
tensor(1.4745, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14197/17426 [25:10<06:31,  8.25it/s]

tensor(1.4500, grad_fn=<NllLossBackward0>)
tensor(1.4678, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14199/17426 [25:10<06:56,  7.74it/s]

tensor(1.4586, grad_fn=<NllLossBackward0>)
tensor(1.4955, grad_fn=<NllLossBackward0>)


 81%|████████▏ | 14201/17426 [25:10<06:41,  8.04it/s]

tensor(1.4339, grad_fn=<NllLossBackward0>)
tensor(1.5214, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14203/17426 [25:11<07:16,  7.39it/s]

tensor(1.5115, grad_fn=<NllLossBackward0>)
tensor(1.4979, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14205/17426 [25:11<07:50,  6.85it/s]

tensor(1.5069, grad_fn=<NllLossBackward0>)
tensor(1.5081, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14207/17426 [25:11<07:43,  6.95it/s]

tensor(1.4662, grad_fn=<NllLossBackward0>)
tensor(1.5083, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14209/17426 [25:11<07:59,  6.72it/s]

tensor(1.4583, grad_fn=<NllLossBackward0>)
tensor(1.4782, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14211/17426 [25:12<07:35,  7.06it/s]

tensor(1.4789, grad_fn=<NllLossBackward0>)
tensor(1.4792, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14213/17426 [25:12<07:44,  6.92it/s]

tensor(1.4900, grad_fn=<NllLossBackward0>)
tensor(1.4783, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14215/17426 [25:12<07:34,  7.07it/s]

tensor(1.4757, grad_fn=<NllLossBackward0>)
tensor(1.4478, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14217/17426 [25:13<06:58,  7.67it/s]

tensor(1.5229, grad_fn=<NllLossBackward0>)
tensor(1.5124, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14219/17426 [25:13<06:40,  8.01it/s]

tensor(1.4606, grad_fn=<NllLossBackward0>)
tensor(1.5300, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14221/17426 [25:13<06:31,  8.18it/s]

tensor(1.4832, grad_fn=<NllLossBackward0>)
tensor(1.4905, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14223/17426 [25:13<06:58,  7.66it/s]

tensor(1.5071, grad_fn=<NllLossBackward0>)
tensor(1.4983, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14225/17426 [25:14<06:38,  8.03it/s]

tensor(1.4925, grad_fn=<NllLossBackward0>)
tensor(1.5034, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14227/17426 [25:14<06:30,  8.19it/s]

tensor(1.4836, grad_fn=<NllLossBackward0>)
tensor(1.4380, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14229/17426 [25:14<06:41,  7.97it/s]

tensor(1.4573, grad_fn=<NllLossBackward0>)
tensor(1.4644, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14231/17426 [25:14<06:35,  8.07it/s]

tensor(1.4551, grad_fn=<NllLossBackward0>)
tensor(1.4723, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14233/17426 [25:15<06:37,  8.03it/s]

tensor(1.5055, grad_fn=<NllLossBackward0>)
tensor(1.5416, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14235/17426 [25:15<06:27,  8.22it/s]

tensor(1.4662, grad_fn=<NllLossBackward0>)
tensor(1.4798, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14237/17426 [25:15<06:25,  8.28it/s]

tensor(1.5089, grad_fn=<NllLossBackward0>)
tensor(1.4556, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14239/17426 [25:15<06:34,  8.09it/s]

tensor(1.4875, grad_fn=<NllLossBackward0>)
tensor(1.4863, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14241/17426 [25:16<06:34,  8.07it/s]

tensor(1.4598, grad_fn=<NllLossBackward0>)
tensor(1.4853, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14243/17426 [25:16<06:31,  8.13it/s]

tensor(1.4687, grad_fn=<NllLossBackward0>)
tensor(1.4662, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14245/17426 [25:16<06:25,  8.26it/s]

tensor(1.5325, grad_fn=<NllLossBackward0>)
tensor(1.5264, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14247/17426 [25:16<06:23,  8.29it/s]

tensor(1.4820, grad_fn=<NllLossBackward0>)
tensor(1.4745, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14249/17426 [25:16<06:26,  8.22it/s]

tensor(1.4942, grad_fn=<NllLossBackward0>)
tensor(1.4566, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14251/17426 [25:17<06:32,  8.09it/s]

tensor(1.5288, grad_fn=<NllLossBackward0>)
tensor(1.5219, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14253/17426 [25:17<06:24,  8.25it/s]

tensor(1.4731, grad_fn=<NllLossBackward0>)
tensor(1.5176, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14255/17426 [25:17<06:28,  8.16it/s]

tensor(1.4991, grad_fn=<NllLossBackward0>)
tensor(1.4829, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14257/17426 [25:17<06:27,  8.17it/s]

tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.5091, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14259/17426 [25:18<06:18,  8.36it/s]

tensor(1.5229, grad_fn=<NllLossBackward0>)
tensor(1.5171, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14261/17426 [25:18<06:12,  8.50it/s]

tensor(1.4693, grad_fn=<NllLossBackward0>)
tensor(1.5132, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14263/17426 [25:18<06:30,  8.11it/s]

tensor(1.5288, grad_fn=<NllLossBackward0>)
tensor(1.5183, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14265/17426 [25:18<06:21,  8.28it/s]

tensor(1.4579, grad_fn=<NllLossBackward0>)
tensor(1.4594, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14267/17426 [25:19<06:30,  8.10it/s]

tensor(1.4586, grad_fn=<NllLossBackward0>)
tensor(1.4399, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14269/17426 [25:19<06:15,  8.40it/s]

tensor(1.4579, grad_fn=<NllLossBackward0>)
tensor(1.5050, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14271/17426 [25:19<06:12,  8.48it/s]

tensor(1.4635, grad_fn=<NllLossBackward0>)
tensor(1.4114, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14273/17426 [25:19<06:11,  8.49it/s]

tensor(1.4746, grad_fn=<NllLossBackward0>)
tensor(1.4848, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14275/17426 [25:20<06:21,  8.26it/s]

tensor(1.4444, grad_fn=<NllLossBackward0>)
tensor(1.5180, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14277/17426 [25:20<06:16,  8.35it/s]

tensor(1.4538, grad_fn=<NllLossBackward0>)
tensor(1.4775, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14279/17426 [25:20<06:12,  8.45it/s]

tensor(1.5089, grad_fn=<NllLossBackward0>)
tensor(1.4858, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14281/17426 [25:20<06:13,  8.43it/s]

tensor(1.5075, grad_fn=<NllLossBackward0>)
tensor(1.4610, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14283/17426 [25:21<06:08,  8.53it/s]

tensor(1.4429, grad_fn=<NllLossBackward0>)
tensor(1.5052, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14285/17426 [25:21<06:21,  8.24it/s]

tensor(1.5329, grad_fn=<NllLossBackward0>)
tensor(1.5114, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14287/17426 [25:21<06:15,  8.35it/s]

tensor(1.5119, grad_fn=<NllLossBackward0>)
tensor(1.4652, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14289/17426 [25:21<06:14,  8.39it/s]

tensor(1.5187, grad_fn=<NllLossBackward0>)
tensor(1.5051, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14291/17426 [25:22<06:18,  8.28it/s]

tensor(1.5005, grad_fn=<NllLossBackward0>)
tensor(1.4856, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14293/17426 [25:22<06:29,  8.04it/s]

tensor(1.4941, grad_fn=<NllLossBackward0>)
tensor(1.5400, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14295/17426 [25:22<06:20,  8.22it/s]

tensor(1.5164, grad_fn=<NllLossBackward0>)
tensor(1.5211, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14297/17426 [25:22<06:42,  7.77it/s]

tensor(1.4441, grad_fn=<NllLossBackward0>)
tensor(1.5293, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14299/17426 [25:23<07:10,  7.26it/s]

tensor(1.4585, grad_fn=<NllLossBackward0>)
tensor(1.4700, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14301/17426 [25:23<07:45,  6.71it/s]

tensor(1.5232, grad_fn=<NllLossBackward0>)
tensor(1.5365, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14303/17426 [25:23<07:14,  7.19it/s]

tensor(1.4709, grad_fn=<NllLossBackward0>)
tensor(1.4490, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14305/17426 [25:23<07:03,  7.38it/s]

tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14307/17426 [25:24<06:44,  7.71it/s]

tensor(1.4682, grad_fn=<NllLossBackward0>)
tensor(1.5391, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14309/17426 [25:24<06:38,  7.83it/s]

tensor(1.4258, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14311/17426 [25:24<06:46,  7.67it/s]

tensor(1.4784, grad_fn=<NllLossBackward0>)
tensor(1.4762, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14313/17426 [25:24<06:43,  7.71it/s]

tensor(1.4408, grad_fn=<NllLossBackward0>)
tensor(1.4363, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14315/17426 [25:25<06:30,  7.96it/s]

tensor(1.4738, grad_fn=<NllLossBackward0>)
tensor(1.5026, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14317/17426 [25:25<06:36,  7.85it/s]

tensor(1.4884, grad_fn=<NllLossBackward0>)
tensor(1.4800, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14319/17426 [25:25<06:52,  7.54it/s]

tensor(1.4831, grad_fn=<NllLossBackward0>)
tensor(1.4976, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14321/17426 [25:26<07:17,  7.10it/s]

tensor(1.5213, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14323/17426 [25:26<07:20,  7.04it/s]

tensor(1.4666, grad_fn=<NllLossBackward0>)
tensor(1.4663, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14325/17426 [25:26<07:18,  7.06it/s]

tensor(1.5177, grad_fn=<NllLossBackward0>)
tensor(1.4754, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14327/17426 [25:26<07:56,  6.51it/s]

tensor(1.4833, grad_fn=<NllLossBackward0>)
tensor(1.4457, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14329/17426 [25:27<07:25,  6.96it/s]

tensor(1.5270, grad_fn=<NllLossBackward0>)
tensor(1.5070, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14331/17426 [25:27<07:23,  6.97it/s]

tensor(1.5118, grad_fn=<NllLossBackward0>)
tensor(1.5173, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14333/17426 [25:27<07:00,  7.36it/s]

tensor(1.4919, grad_fn=<NllLossBackward0>)
tensor(1.5029, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14335/17426 [25:28<06:45,  7.62it/s]

tensor(1.4662, grad_fn=<NllLossBackward0>)
tensor(1.5076, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14337/17426 [25:28<06:30,  7.91it/s]

tensor(1.5326, grad_fn=<NllLossBackward0>)
tensor(1.4883, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14339/17426 [25:28<06:11,  8.30it/s]

tensor(1.4798, grad_fn=<NllLossBackward0>)
tensor(1.4345, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14341/17426 [25:28<06:14,  8.24it/s]

tensor(1.4020, grad_fn=<NllLossBackward0>)
tensor(1.5085, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14343/17426 [25:28<06:07,  8.39it/s]

tensor(1.4714, grad_fn=<NllLossBackward0>)
tensor(1.5050, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14345/17426 [25:29<06:04,  8.45it/s]

tensor(1.5070, grad_fn=<NllLossBackward0>)
tensor(1.4672, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14347/17426 [25:29<06:07,  8.39it/s]

tensor(1.4695, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14349/17426 [25:29<06:16,  8.17it/s]

tensor(1.5110, grad_fn=<NllLossBackward0>)
tensor(1.5301, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14351/17426 [25:29<06:12,  8.26it/s]

tensor(1.4822, grad_fn=<NllLossBackward0>)
tensor(1.4816, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14353/17426 [25:30<06:06,  8.37it/s]

tensor(1.4760, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14355/17426 [25:30<06:07,  8.35it/s]

tensor(1.4909, grad_fn=<NllLossBackward0>)
tensor(1.4422, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14357/17426 [25:30<06:08,  8.33it/s]

tensor(1.5228, grad_fn=<NllLossBackward0>)
tensor(1.5060, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14359/17426 [25:30<06:19,  8.08it/s]

tensor(1.4594, grad_fn=<NllLossBackward0>)
tensor(1.5181, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14361/17426 [25:31<06:11,  8.25it/s]

tensor(1.4411, grad_fn=<NllLossBackward0>)
tensor(1.4766, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14363/17426 [25:31<06:05,  8.39it/s]

tensor(1.4740, grad_fn=<NllLossBackward0>)
tensor(1.5405, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14365/17426 [25:31<06:06,  8.36it/s]

tensor(1.4860, grad_fn=<NllLossBackward0>)
tensor(1.4667, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14367/17426 [25:31<06:32,  7.79it/s]

tensor(1.5330, grad_fn=<NllLossBackward0>)
tensor(1.4879, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14369/17426 [25:32<06:16,  8.13it/s]

tensor(1.5015, grad_fn=<NllLossBackward0>)
tensor(1.4466, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14371/17426 [25:32<06:02,  8.42it/s]

tensor(1.4870, grad_fn=<NllLossBackward0>)
tensor(1.4886, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14373/17426 [25:32<05:56,  8.56it/s]

tensor(1.4718, grad_fn=<NllLossBackward0>)
tensor(1.4861, grad_fn=<NllLossBackward0>)


 82%|████████▏ | 14375/17426 [25:32<05:52,  8.65it/s]

tensor(1.4544, grad_fn=<NllLossBackward0>)
tensor(1.4799, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14377/17426 [25:33<06:03,  8.39it/s]

tensor(1.4631, grad_fn=<NllLossBackward0>)
tensor(1.4874, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14379/17426 [25:33<06:02,  8.40it/s]

tensor(1.5360, grad_fn=<NllLossBackward0>)
tensor(1.4838, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14381/17426 [25:33<06:12,  8.18it/s]

tensor(1.4595, grad_fn=<NllLossBackward0>)
tensor(1.4463, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14383/17426 [25:33<06:03,  8.37it/s]

tensor(1.5272, grad_fn=<NllLossBackward0>)
tensor(1.4901, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14385/17426 [25:34<06:13,  8.14it/s]

tensor(1.5154, grad_fn=<NllLossBackward0>)
tensor(1.5094, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14387/17426 [25:34<06:07,  8.27it/s]

tensor(1.4333, grad_fn=<NllLossBackward0>)
tensor(1.4534, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14389/17426 [25:34<05:57,  8.48it/s]

tensor(1.5325, grad_fn=<NllLossBackward0>)
tensor(1.4880, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14391/17426 [25:34<05:54,  8.56it/s]

tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.5107, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14393/17426 [25:35<06:19,  8.00it/s]

tensor(1.4474, grad_fn=<NllLossBackward0>)
tensor(1.4838, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14395/17426 [25:35<06:11,  8.15it/s]

tensor(1.5003, grad_fn=<NllLossBackward0>)
tensor(1.5409, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14397/17426 [25:35<06:09,  8.20it/s]

tensor(1.4615, grad_fn=<NllLossBackward0>)
tensor(1.4873, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14399/17426 [25:35<06:02,  8.35it/s]

tensor(1.4681, grad_fn=<NllLossBackward0>)
tensor(1.4537, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14401/17426 [25:35<05:59,  8.42it/s]

tensor(1.5146, grad_fn=<NllLossBackward0>)
tensor(1.4752, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14403/17426 [25:36<06:11,  8.13it/s]

tensor(1.5038, grad_fn=<NllLossBackward0>)
tensor(1.4419, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14405/17426 [25:36<06:15,  8.05it/s]

tensor(1.5053, grad_fn=<NllLossBackward0>)
tensor(1.4866, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14407/17426 [25:36<06:12,  8.10it/s]

tensor(1.4637, grad_fn=<NllLossBackward0>)
tensor(1.4767, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14409/17426 [25:36<06:13,  8.09it/s]

tensor(1.4975, grad_fn=<NllLossBackward0>)
tensor(1.4854, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14411/17426 [25:37<06:17,  8.00it/s]

tensor(1.4699, grad_fn=<NllLossBackward0>)
tensor(1.5245, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14413/17426 [25:37<06:04,  8.27it/s]

tensor(1.4835, grad_fn=<NllLossBackward0>)
tensor(1.4655, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14415/17426 [25:37<06:34,  7.63it/s]

tensor(1.4805, grad_fn=<NllLossBackward0>)
tensor(1.4930, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14417/17426 [25:38<06:51,  7.32it/s]

tensor(1.4829, grad_fn=<NllLossBackward0>)
tensor(1.4632, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14419/17426 [25:38<07:01,  7.13it/s]

tensor(1.4955, grad_fn=<NllLossBackward0>)
tensor(1.4808, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14421/17426 [25:38<06:44,  7.42it/s]

tensor(1.4479, grad_fn=<NllLossBackward0>)
tensor(1.4741, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14423/17426 [25:38<06:47,  7.37it/s]

tensor(1.5396, grad_fn=<NllLossBackward0>)
tensor(1.5014, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14425/17426 [25:39<06:21,  7.87it/s]

tensor(1.4370, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14427/17426 [25:39<06:24,  7.81it/s]

tensor(1.5198, grad_fn=<NllLossBackward0>)
tensor(1.5112, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14429/17426 [25:39<06:43,  7.42it/s]

tensor(1.4744, grad_fn=<NllLossBackward0>)
tensor(1.5089, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14431/17426 [25:39<06:15,  7.98it/s]

tensor(1.4860, grad_fn=<NllLossBackward0>)
tensor(1.5125, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14433/17426 [25:40<06:00,  8.30it/s]

tensor(1.4647, grad_fn=<NllLossBackward0>)
tensor(1.4871, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14435/17426 [25:40<06:19,  7.89it/s]

tensor(1.5251, grad_fn=<NllLossBackward0>)
tensor(1.4847, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14437/17426 [25:40<06:40,  7.47it/s]

tensor(1.4797, grad_fn=<NllLossBackward0>)
tensor(1.4968, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14439/17426 [25:40<07:00,  7.11it/s]

tensor(1.5054, grad_fn=<NllLossBackward0>)
tensor(1.4658, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14441/17426 [25:41<07:03,  7.05it/s]

tensor(1.4503, grad_fn=<NllLossBackward0>)
tensor(1.4696, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14443/17426 [25:41<07:12,  6.90it/s]

tensor(1.5196, grad_fn=<NllLossBackward0>)
tensor(1.5180, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14445/17426 [25:41<07:24,  6.70it/s]

tensor(1.5115, grad_fn=<NllLossBackward0>)
tensor(1.4901, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14447/17426 [25:42<06:59,  7.11it/s]

tensor(1.4388, grad_fn=<NllLossBackward0>)
tensor(1.5119, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14449/17426 [25:42<07:37,  6.51it/s]

tensor(1.4980, grad_fn=<NllLossBackward0>)
tensor(1.4957, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14451/17426 [25:42<06:55,  7.17it/s]

tensor(1.4781, grad_fn=<NllLossBackward0>)
tensor(1.4484, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14453/17426 [25:42<06:16,  7.90it/s]

tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.4934, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14455/17426 [25:43<06:07,  8.09it/s]

tensor(1.4835, grad_fn=<NllLossBackward0>)
tensor(1.4835, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14457/17426 [25:43<05:59,  8.25it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5226, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14459/17426 [25:43<06:16,  7.89it/s]

tensor(1.4766, grad_fn=<NllLossBackward0>)
tensor(1.5098, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14461/17426 [25:43<06:02,  8.19it/s]

tensor(1.5164, grad_fn=<NllLossBackward0>)
tensor(1.5002, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14463/17426 [25:44<06:00,  8.23it/s]

tensor(1.5365, grad_fn=<NllLossBackward0>)
tensor(1.4519, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14465/17426 [25:44<05:55,  8.32it/s]

tensor(1.4490, grad_fn=<NllLossBackward0>)
tensor(1.4775, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14467/17426 [25:44<05:57,  8.28it/s]

tensor(1.4896, grad_fn=<NllLossBackward0>)
tensor(1.5195, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14469/17426 [25:44<06:02,  8.15it/s]

tensor(1.4490, grad_fn=<NllLossBackward0>)
tensor(1.5364, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14471/17426 [25:45<05:53,  8.36it/s]

tensor(1.4613, grad_fn=<NllLossBackward0>)
tensor(1.5331, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14473/17426 [25:45<05:52,  8.38it/s]

tensor(1.5354, grad_fn=<NllLossBackward0>)
tensor(1.4149, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14475/17426 [25:45<05:48,  8.48it/s]

tensor(1.4889, grad_fn=<NllLossBackward0>)
tensor(1.4753, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14477/17426 [25:45<05:57,  8.25it/s]

tensor(1.4987, grad_fn=<NllLossBackward0>)
tensor(1.4337, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14479/17426 [25:46<05:55,  8.30it/s]

tensor(1.4897, grad_fn=<NllLossBackward0>)
tensor(1.4636, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14481/17426 [25:46<05:52,  8.36it/s]

tensor(1.5048, grad_fn=<NllLossBackward0>)
tensor(1.5067, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14483/17426 [25:46<06:18,  7.78it/s]

tensor(1.5042, grad_fn=<NllLossBackward0>)
tensor(1.4931, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14485/17426 [25:46<06:25,  7.63it/s]

tensor(1.4909, grad_fn=<NllLossBackward0>)
tensor(1.4413, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14487/17426 [25:47<06:01,  8.14it/s]

tensor(1.5263, grad_fn=<NllLossBackward0>)
tensor(1.4940, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14489/17426 [25:47<06:03,  8.07it/s]

tensor(1.4727, grad_fn=<NllLossBackward0>)
tensor(1.4665, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14491/17426 [25:47<05:52,  8.32it/s]

tensor(1.4693, grad_fn=<NllLossBackward0>)
tensor(1.5120, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14493/17426 [25:47<06:07,  7.99it/s]

tensor(1.4330, grad_fn=<NllLossBackward0>)
tensor(1.4519, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14495/17426 [25:48<05:57,  8.20it/s]

tensor(1.5447, grad_fn=<NllLossBackward0>)
tensor(1.4357, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14497/17426 [25:48<05:42,  8.55it/s]

tensor(1.5048, grad_fn=<NllLossBackward0>)
tensor(1.4739, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14499/17426 [25:48<05:44,  8.49it/s]

tensor(1.4944, grad_fn=<NllLossBackward0>)
tensor(1.5140, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14501/17426 [25:48<05:42,  8.53it/s]

tensor(1.4965, grad_fn=<NllLossBackward0>)
tensor(1.4494, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14503/17426 [25:49<05:54,  8.24it/s]

tensor(1.5258, grad_fn=<NllLossBackward0>)
tensor(1.4891, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14505/17426 [25:49<05:53,  8.27it/s]

tensor(1.4929, grad_fn=<NllLossBackward0>)
tensor(1.4790, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14507/17426 [25:49<05:45,  8.44it/s]

tensor(1.4641, grad_fn=<NllLossBackward0>)
tensor(1.5277, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14509/17426 [25:49<05:42,  8.52it/s]

tensor(1.4969, grad_fn=<NllLossBackward0>)
tensor(1.5397, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14511/17426 [25:49<06:02,  8.04it/s]

tensor(1.4531, grad_fn=<NllLossBackward0>)
tensor(1.4950, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14513/17426 [25:50<05:59,  8.10it/s]

tensor(1.5034, grad_fn=<NllLossBackward0>)
tensor(1.4982, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14515/17426 [25:50<05:57,  8.13it/s]

tensor(1.5076, grad_fn=<NllLossBackward0>)
tensor(1.5101, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14517/17426 [25:50<05:55,  8.18it/s]

tensor(1.5018, grad_fn=<NllLossBackward0>)
tensor(1.4322, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14519/17426 [25:50<05:49,  8.33it/s]

tensor(1.5016, grad_fn=<NllLossBackward0>)
tensor(1.4669, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14521/17426 [25:51<06:00,  8.06it/s]

tensor(1.4941, grad_fn=<NllLossBackward0>)
tensor(1.4882, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14523/17426 [25:51<05:56,  8.13it/s]

tensor(1.4279, grad_fn=<NllLossBackward0>)
tensor(1.5235, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14525/17426 [25:51<05:51,  8.26it/s]

tensor(1.4637, grad_fn=<NllLossBackward0>)
tensor(1.4800, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14527/17426 [25:51<05:51,  8.25it/s]

tensor(1.4888, grad_fn=<NllLossBackward0>)
tensor(1.4812, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14529/17426 [25:52<05:57,  8.11it/s]

tensor(1.4824, grad_fn=<NllLossBackward0>)
tensor(1.5087, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14531/17426 [25:52<05:50,  8.27it/s]

tensor(1.4944, grad_fn=<NllLossBackward0>)
tensor(1.4938, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14533/17426 [25:52<06:09,  7.84it/s]

tensor(1.4874, grad_fn=<NllLossBackward0>)
tensor(1.4948, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14535/17426 [25:53<06:49,  7.06it/s]

tensor(1.4695, grad_fn=<NllLossBackward0>)
tensor(1.4512, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14537/17426 [25:53<07:03,  6.82it/s]

tensor(1.5046, grad_fn=<NllLossBackward0>)
tensor(1.4868, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14539/17426 [25:53<06:47,  7.09it/s]

tensor(1.5103, grad_fn=<NllLossBackward0>)
tensor(1.4019, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14541/17426 [25:53<06:43,  7.15it/s]

tensor(1.4665, grad_fn=<NllLossBackward0>)
tensor(1.4288, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14543/17426 [25:54<06:32,  7.34it/s]

tensor(1.5430, grad_fn=<NllLossBackward0>)
tensor(1.4556, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14545/17426 [25:54<06:48,  7.05it/s]

tensor(1.4980, grad_fn=<NllLossBackward0>)
tensor(1.5196, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14547/17426 [25:54<06:59,  6.86it/s]

tensor(1.5172, grad_fn=<NllLossBackward0>)
tensor(1.4511, grad_fn=<NllLossBackward0>)


 83%|████████▎ | 14549/17426 [25:54<06:16,  7.65it/s]

tensor(1.4971, grad_fn=<NllLossBackward0>)
tensor(1.4749, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14551/17426 [25:55<05:54,  8.10it/s]

tensor(1.5451, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14553/17426 [25:55<05:54,  8.11it/s]

tensor(1.4610, grad_fn=<NllLossBackward0>)
tensor(1.4690, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14555/17426 [25:55<05:54,  8.09it/s]

tensor(1.4943, grad_fn=<NllLossBackward0>)
tensor(1.4933, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14557/17426 [25:55<06:30,  7.34it/s]

tensor(1.4502, grad_fn=<NllLossBackward0>)
tensor(1.4877, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14559/17426 [25:56<06:41,  7.15it/s]

tensor(1.4800, grad_fn=<NllLossBackward0>)
tensor(1.4873, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14561/17426 [25:56<06:50,  6.97it/s]

tensor(1.4524, grad_fn=<NllLossBackward0>)
tensor(1.4669, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14563/17426 [25:56<07:00,  6.81it/s]

tensor(1.4506, grad_fn=<NllLossBackward0>)
tensor(1.4671, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14565/17426 [25:57<06:50,  6.97it/s]

tensor(1.4703, grad_fn=<NllLossBackward0>)
tensor(1.5028, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14567/17426 [25:57<06:52,  6.93it/s]

tensor(1.4684, grad_fn=<NllLossBackward0>)
tensor(1.4946, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14569/17426 [25:57<06:28,  7.36it/s]

tensor(1.4503, grad_fn=<NllLossBackward0>)
tensor(1.4806, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14571/17426 [25:57<06:08,  7.74it/s]

tensor(1.5338, grad_fn=<NllLossBackward0>)
tensor(1.4735, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14573/17426 [25:58<06:00,  7.91it/s]

tensor(1.4455, grad_fn=<NllLossBackward0>)
tensor(1.5059, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14575/17426 [25:58<05:52,  8.10it/s]

tensor(1.4828, grad_fn=<NllLossBackward0>)
tensor(1.4836, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14577/17426 [25:58<06:00,  7.90it/s]

tensor(1.4705, grad_fn=<NllLossBackward0>)
tensor(1.5069, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14579/17426 [25:58<05:52,  8.08it/s]

tensor(1.4808, grad_fn=<NllLossBackward0>)
tensor(1.5290, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14581/17426 [25:59<05:49,  8.14it/s]

tensor(1.4666, grad_fn=<NllLossBackward0>)
tensor(1.4519, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14583/17426 [25:59<05:36,  8.44it/s]

tensor(1.4937, grad_fn=<NllLossBackward0>)
tensor(1.4354, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14585/17426 [25:59<06:00,  7.89it/s]

tensor(1.5041, grad_fn=<NllLossBackward0>)
tensor(1.5139, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14587/17426 [25:59<06:07,  7.73it/s]

tensor(1.4993, grad_fn=<NllLossBackward0>)
tensor(1.4698, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14589/17426 [26:00<05:51,  8.06it/s]

tensor(1.4880, grad_fn=<NllLossBackward0>)
tensor(1.4301, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14591/17426 [26:00<05:50,  8.10it/s]

tensor(1.4479, grad_fn=<NllLossBackward0>)
tensor(1.4544, grad_fn=<NllLossBackward0>)


 84%|████████▎ | 14593/17426 [26:00<06:02,  7.81it/s]

tensor(1.4542, grad_fn=<NllLossBackward0>)
tensor(1.4702, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14595/17426 [26:00<05:49,  8.10it/s]

tensor(1.4732, grad_fn=<NllLossBackward0>)
tensor(1.4905, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14597/17426 [26:01<05:42,  8.25it/s]

tensor(1.4927, grad_fn=<NllLossBackward0>)
tensor(1.5014, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14599/17426 [26:01<05:42,  8.24it/s]

tensor(1.4942, grad_fn=<NllLossBackward0>)
tensor(1.4932, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14601/17426 [26:01<05:36,  8.39it/s]

tensor(1.5317, grad_fn=<NllLossBackward0>)
tensor(1.4248, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14603/17426 [26:01<05:40,  8.29it/s]

tensor(1.4374, grad_fn=<NllLossBackward0>)
tensor(1.4823, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14605/17426 [26:02<05:37,  8.36it/s]

tensor(1.4821, grad_fn=<NllLossBackward0>)
tensor(1.4727, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14607/17426 [26:02<05:36,  8.38it/s]

tensor(1.4741, grad_fn=<NllLossBackward0>)
tensor(1.4864, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14609/17426 [26:02<05:35,  8.40it/s]

tensor(1.4810, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14611/17426 [26:02<05:46,  8.11it/s]

tensor(1.4889, grad_fn=<NllLossBackward0>)
tensor(1.4746, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14613/17426 [26:03<05:40,  8.27it/s]

tensor(1.5143, grad_fn=<NllLossBackward0>)
tensor(1.4757, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14615/17426 [26:03<05:33,  8.42it/s]

tensor(1.5131, grad_fn=<NllLossBackward0>)
tensor(1.4794, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14617/17426 [26:03<05:34,  8.41it/s]

tensor(1.4466, grad_fn=<NllLossBackward0>)
tensor(1.4722, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14619/17426 [26:03<05:32,  8.44it/s]

tensor(1.5152, grad_fn=<NllLossBackward0>)
tensor(1.5129, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14621/17426 [26:04<05:45,  8.12it/s]

tensor(1.4437, grad_fn=<NllLossBackward0>)
tensor(1.5113, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14623/17426 [26:04<05:37,  8.31it/s]

tensor(1.4658, grad_fn=<NllLossBackward0>)
tensor(1.4733, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14625/17426 [26:04<05:35,  8.36it/s]

tensor(1.4492, grad_fn=<NllLossBackward0>)
tensor(1.5154, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14627/17426 [26:04<05:33,  8.39it/s]

tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14629/17426 [26:05<05:46,  8.07it/s]

tensor(1.5143, grad_fn=<NllLossBackward0>)
tensor(1.4441, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14631/17426 [26:05<05:32,  8.41it/s]

tensor(1.4991, grad_fn=<NllLossBackward0>)
tensor(1.4916, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14633/17426 [26:05<05:23,  8.63it/s]

tensor(1.4818, grad_fn=<NllLossBackward0>)
tensor(1.4377, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14635/17426 [26:05<05:23,  8.64it/s]

tensor(1.4543, grad_fn=<NllLossBackward0>)
tensor(1.4792, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14637/17426 [26:05<05:34,  8.34it/s]

tensor(1.4615, grad_fn=<NllLossBackward0>)
tensor(1.5023, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14639/17426 [26:06<05:42,  8.13it/s]

tensor(1.4804, grad_fn=<NllLossBackward0>)
tensor(1.4383, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14641/17426 [26:06<05:31,  8.39it/s]

tensor(1.4637, grad_fn=<NllLossBackward0>)
tensor(1.4892, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14643/17426 [26:06<05:24,  8.58it/s]

tensor(1.4972, grad_fn=<NllLossBackward0>)
tensor(1.5289, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14645/17426 [26:06<05:20,  8.68it/s]

tensor(1.4825, grad_fn=<NllLossBackward0>)
tensor(1.5047, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14647/17426 [26:07<05:43,  8.08it/s]

tensor(1.4815, grad_fn=<NllLossBackward0>)
tensor(1.4786, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14649/17426 [26:07<05:35,  8.29it/s]

tensor(1.4746, grad_fn=<NllLossBackward0>)
tensor(1.4941, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14651/17426 [26:07<05:58,  7.73it/s]

tensor(1.4407, grad_fn=<NllLossBackward0>)
tensor(1.5126, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14653/17426 [26:07<06:16,  7.37it/s]

tensor(1.4333, grad_fn=<NllLossBackward0>)
tensor(1.4695, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14655/17426 [26:08<06:21,  7.26it/s]

tensor(1.4974, grad_fn=<NllLossBackward0>)
tensor(1.4984, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14657/17426 [26:08<06:06,  7.55it/s]

tensor(1.5011, grad_fn=<NllLossBackward0>)
tensor(1.4758, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14659/17426 [26:08<06:20,  7.28it/s]

tensor(1.4632, grad_fn=<NllLossBackward0>)
tensor(1.4278, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14661/17426 [26:09<05:56,  7.76it/s]

tensor(1.4474, grad_fn=<NllLossBackward0>)
tensor(1.4421, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14663/17426 [26:09<05:59,  7.68it/s]

tensor(1.5129, grad_fn=<NllLossBackward0>)
tensor(1.4482, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14665/17426 [26:09<05:44,  8.02it/s]

tensor(1.5394, grad_fn=<NllLossBackward0>)
tensor(1.4836, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14667/17426 [26:09<06:03,  7.58it/s]

tensor(1.4691, grad_fn=<NllLossBackward0>)
tensor(1.4428, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14669/17426 [26:10<05:41,  8.07it/s]

tensor(1.5062, grad_fn=<NllLossBackward0>)
tensor(1.5177, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14671/17426 [26:10<05:35,  8.22it/s]

tensor(1.4746, grad_fn=<NllLossBackward0>)
tensor(1.4544, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14673/17426 [26:10<05:39,  8.10it/s]

tensor(1.4727, grad_fn=<NllLossBackward0>)
tensor(1.5302, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14675/17426 [26:10<06:17,  7.30it/s]

tensor(1.4872, grad_fn=<NllLossBackward0>)
tensor(1.4796, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14677/17426 [26:11<06:31,  7.01it/s]

tensor(1.4703, grad_fn=<NllLossBackward0>)
tensor(1.5223, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14679/17426 [26:11<06:40,  6.85it/s]

tensor(1.4796, grad_fn=<NllLossBackward0>)
tensor(1.4591, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14681/17426 [26:11<06:50,  6.68it/s]

tensor(1.4350, grad_fn=<NllLossBackward0>)
tensor(1.4729, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14683/17426 [26:12<06:46,  6.75it/s]

tensor(1.5254, grad_fn=<NllLossBackward0>)
tensor(1.5185, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14685/17426 [26:12<06:50,  6.67it/s]

tensor(1.4978, grad_fn=<NllLossBackward0>)
tensor(1.4881, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14687/17426 [26:12<06:40,  6.83it/s]

tensor(1.4572, grad_fn=<NllLossBackward0>)
tensor(1.4946, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14689/17426 [26:12<06:02,  7.55it/s]

tensor(1.5261, grad_fn=<NllLossBackward0>)
tensor(1.5029, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14691/17426 [26:13<05:44,  7.93it/s]

tensor(1.4642, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14693/17426 [26:13<05:30,  8.28it/s]

tensor(1.5373, grad_fn=<NllLossBackward0>)
tensor(1.5134, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14695/17426 [26:13<05:28,  8.32it/s]

tensor(1.4268, grad_fn=<NllLossBackward0>)
tensor(1.4932, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14697/17426 [26:13<05:38,  8.06it/s]

tensor(1.4944, grad_fn=<NllLossBackward0>)
tensor(1.4797, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14699/17426 [26:14<05:33,  8.19it/s]

tensor(1.4935, grad_fn=<NllLossBackward0>)
tensor(1.5153, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14701/17426 [26:14<05:41,  7.97it/s]

tensor(1.4619, grad_fn=<NllLossBackward0>)
tensor(1.4682, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14703/17426 [26:14<05:30,  8.25it/s]

tensor(1.5214, grad_fn=<NllLossBackward0>)
tensor(1.5254, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14705/17426 [26:14<05:42,  7.95it/s]

tensor(1.5582, grad_fn=<NllLossBackward0>)
tensor(1.4782, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14707/17426 [26:15<05:26,  8.34it/s]

tensor(1.4542, grad_fn=<NllLossBackward0>)
tensor(1.4421, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14709/17426 [26:15<05:27,  8.31it/s]

tensor(1.5244, grad_fn=<NllLossBackward0>)
tensor(1.5089, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14711/17426 [26:15<05:20,  8.48it/s]

tensor(1.4893, grad_fn=<NllLossBackward0>)
tensor(1.4975, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14713/17426 [26:15<05:48,  7.77it/s]

tensor(1.4583, grad_fn=<NllLossBackward0>)
tensor(1.5062, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14715/17426 [26:16<05:32,  8.15it/s]

tensor(1.4698, grad_fn=<NllLossBackward0>)
tensor(1.5417, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14717/17426 [26:16<05:29,  8.23it/s]

tensor(1.4972, grad_fn=<NllLossBackward0>)
tensor(1.4753, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14719/17426 [26:16<05:21,  8.41it/s]

tensor(1.4911, grad_fn=<NllLossBackward0>)
tensor(1.5011, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14721/17426 [26:16<05:20,  8.43it/s]

tensor(1.4642, grad_fn=<NllLossBackward0>)
tensor(1.4622, grad_fn=<NllLossBackward0>)


 84%|████████▍ | 14723/17426 [26:16<05:22,  8.39it/s]

tensor(1.4417, grad_fn=<NllLossBackward0>)
tensor(1.4705, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14725/17426 [26:17<05:28,  8.23it/s]

tensor(1.5009, grad_fn=<NllLossBackward0>)
tensor(1.4775, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14727/17426 [26:17<05:20,  8.42it/s]

tensor(1.4949, grad_fn=<NllLossBackward0>)
tensor(1.4474, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14729/17426 [26:17<05:21,  8.38it/s]

tensor(1.5161, grad_fn=<NllLossBackward0>)
tensor(1.4580, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14731/17426 [26:17<05:31,  8.13it/s]

tensor(1.5099, grad_fn=<NllLossBackward0>)
tensor(1.4761, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14733/17426 [26:18<05:25,  8.27it/s]

tensor(1.4769, grad_fn=<NllLossBackward0>)
tensor(1.4758, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14735/17426 [26:18<05:25,  8.27it/s]

tensor(1.4779, grad_fn=<NllLossBackward0>)
tensor(1.4623, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14737/17426 [26:18<05:19,  8.41it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.4917, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14739/17426 [26:18<05:26,  8.22it/s]

tensor(1.4610, grad_fn=<NllLossBackward0>)
tensor(1.5005, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14741/17426 [26:19<05:22,  8.31it/s]

tensor(1.4521, grad_fn=<NllLossBackward0>)
tensor(1.4781, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14743/17426 [26:19<05:28,  8.18it/s]

tensor(1.4542, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14745/17426 [26:19<05:33,  8.03it/s]

tensor(1.4974, grad_fn=<NllLossBackward0>)
tensor(1.4496, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14747/17426 [26:19<05:22,  8.32it/s]

tensor(1.4534, grad_fn=<NllLossBackward0>)
tensor(1.4921, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14749/17426 [26:20<05:40,  7.86it/s]

tensor(1.4542, grad_fn=<NllLossBackward0>)
tensor(1.4736, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14751/17426 [26:20<05:26,  8.19it/s]

tensor(1.4737, grad_fn=<NllLossBackward0>)
tensor(1.4360, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14753/17426 [26:20<05:21,  8.32it/s]

tensor(1.4591, grad_fn=<NllLossBackward0>)
tensor(1.4357, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14755/17426 [26:20<05:15,  8.47it/s]

tensor(1.4800, grad_fn=<NllLossBackward0>)
tensor(1.4582, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14757/17426 [26:21<05:33,  8.01it/s]

tensor(1.4901, grad_fn=<NllLossBackward0>)
tensor(1.4709, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14759/17426 [26:21<05:23,  8.25it/s]

tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.5068, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14761/17426 [26:21<05:19,  8.35it/s]

tensor(1.4754, grad_fn=<NllLossBackward0>)
tensor(1.4666, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14763/17426 [26:21<05:10,  8.57it/s]

tensor(1.4973, grad_fn=<NllLossBackward0>)
tensor(1.5333, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14765/17426 [26:22<05:19,  8.32it/s]

tensor(1.4679, grad_fn=<NllLossBackward0>)
tensor(1.4672, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14767/17426 [26:22<05:34,  7.95it/s]

tensor(1.4632, grad_fn=<NllLossBackward0>)
tensor(1.5007, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14769/17426 [26:22<05:37,  7.87it/s]

tensor(1.4590, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14771/17426 [26:22<05:48,  7.63it/s]

tensor(1.4837, grad_fn=<NllLossBackward0>)
tensor(1.5116, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14773/17426 [26:23<06:17,  7.04it/s]

tensor(1.4818, grad_fn=<NllLossBackward0>)
tensor(1.4861, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14775/17426 [26:23<05:59,  7.36it/s]

tensor(1.4504, grad_fn=<NllLossBackward0>)
tensor(1.4658, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14777/17426 [26:23<05:57,  7.41it/s]

tensor(1.4778, grad_fn=<NllLossBackward0>)
tensor(1.4944, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14779/17426 [26:24<06:16,  7.03it/s]

tensor(1.4345, grad_fn=<NllLossBackward0>)
tensor(1.4830, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14781/17426 [26:24<05:56,  7.42it/s]

tensor(1.4841, grad_fn=<NllLossBackward0>)
tensor(1.4637, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14783/17426 [26:24<06:10,  7.14it/s]

tensor(1.4287, grad_fn=<NllLossBackward0>)
tensor(1.4669, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14785/17426 [26:24<06:12,  7.08it/s]

tensor(1.4918, grad_fn=<NllLossBackward0>)
tensor(1.5149, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14787/17426 [26:25<06:11,  7.11it/s]

tensor(1.4552, grad_fn=<NllLossBackward0>)
tensor(1.5199, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14789/17426 [26:25<06:13,  7.07it/s]

tensor(1.4510, grad_fn=<NllLossBackward0>)
tensor(1.4507, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14791/17426 [26:25<06:33,  6.69it/s]

tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.4532, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14793/17426 [26:26<06:36,  6.65it/s]

tensor(1.4939, grad_fn=<NllLossBackward0>)
tensor(1.4745, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14795/17426 [26:26<06:44,  6.50it/s]

tensor(1.4940, grad_fn=<NllLossBackward0>)
tensor(1.4601, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14797/17426 [26:26<07:21,  5.95it/s]

tensor(1.4943, grad_fn=<NllLossBackward0>)
tensor(1.4700, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14799/17426 [26:26<06:14,  7.02it/s]

tensor(1.5157, grad_fn=<NllLossBackward0>)
tensor(1.4918, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14801/17426 [26:27<05:53,  7.44it/s]

tensor(1.5097, grad_fn=<NllLossBackward0>)
tensor(1.4560, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14803/17426 [26:27<05:35,  7.81it/s]

tensor(1.5048, grad_fn=<NllLossBackward0>)
tensor(1.4401, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14805/17426 [26:27<05:32,  7.88it/s]

tensor(1.4887, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14807/17426 [26:27<05:17,  8.24it/s]

tensor(1.4962, grad_fn=<NllLossBackward0>)
tensor(1.4746, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14809/17426 [26:28<05:14,  8.33it/s]

tensor(1.4640, grad_fn=<NllLossBackward0>)
tensor(1.4811, grad_fn=<NllLossBackward0>)


 85%|████████▍ | 14811/17426 [26:28<05:10,  8.43it/s]

tensor(1.4893, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14813/17426 [26:28<05:19,  8.17it/s]

tensor(1.4633, grad_fn=<NllLossBackward0>)
tensor(1.5131, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14815/17426 [26:28<05:04,  8.57it/s]

tensor(1.4514, grad_fn=<NllLossBackward0>)
tensor(1.5255, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14817/17426 [26:29<05:09,  8.44it/s]

tensor(1.4757, grad_fn=<NllLossBackward0>)
tensor(1.4615, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14819/17426 [26:29<05:08,  8.46it/s]

tensor(1.4088, grad_fn=<NllLossBackward0>)
tensor(1.5204, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14821/17426 [26:29<05:19,  8.16it/s]

tensor(1.4985, grad_fn=<NllLossBackward0>)
tensor(1.4747, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14823/17426 [26:29<05:16,  8.22it/s]

tensor(1.4942, grad_fn=<NllLossBackward0>)
tensor(1.5136, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14825/17426 [26:30<05:04,  8.53it/s]

tensor(1.4682, grad_fn=<NllLossBackward0>)
tensor(1.4765, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14827/17426 [26:30<05:11,  8.35it/s]

tensor(1.4426, grad_fn=<NllLossBackward0>)
tensor(1.4970, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14829/17426 [26:30<05:07,  8.44it/s]

tensor(1.5410, grad_fn=<NllLossBackward0>)
tensor(1.5062, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14831/17426 [26:30<05:19,  8.13it/s]

tensor(1.4214, grad_fn=<NllLossBackward0>)
tensor(1.4754, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14833/17426 [26:31<05:26,  7.93it/s]

tensor(1.4725, grad_fn=<NllLossBackward0>)
tensor(1.4535, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14835/17426 [26:31<05:13,  8.26it/s]

tensor(1.4788, grad_fn=<NllLossBackward0>)
tensor(1.5008, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14837/17426 [26:31<05:08,  8.41it/s]

tensor(1.5215, grad_fn=<NllLossBackward0>)
tensor(1.4266, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14839/17426 [26:31<05:22,  8.02it/s]

tensor(1.4964, grad_fn=<NllLossBackward0>)
tensor(1.5070, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14841/17426 [26:32<05:10,  8.32it/s]

tensor(1.4818, grad_fn=<NllLossBackward0>)
tensor(1.4599, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14843/17426 [26:32<05:21,  8.03it/s]

tensor(1.4932, grad_fn=<NllLossBackward0>)
tensor(1.5018, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14845/17426 [26:32<05:07,  8.40it/s]

tensor(1.4590, grad_fn=<NllLossBackward0>)
tensor(1.4800, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14847/17426 [26:32<05:14,  8.21it/s]

tensor(1.4849, grad_fn=<NllLossBackward0>)
tensor(1.5094, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14849/17426 [26:33<05:13,  8.23it/s]

tensor(1.5374, grad_fn=<NllLossBackward0>)
tensor(1.4650, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14851/17426 [26:33<05:13,  8.22it/s]

tensor(1.5333, grad_fn=<NllLossBackward0>)
tensor(1.4967, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14853/17426 [26:33<05:05,  8.43it/s]

tensor(1.4670, grad_fn=<NllLossBackward0>)
tensor(1.4742, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14855/17426 [26:33<05:06,  8.39it/s]

tensor(1.4730, grad_fn=<NllLossBackward0>)
tensor(1.5191, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14857/17426 [26:33<05:08,  8.34it/s]

tensor(1.4450, grad_fn=<NllLossBackward0>)
tensor(1.5018, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14859/17426 [26:34<05:06,  8.37it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5097, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14861/17426 [26:34<05:05,  8.39it/s]

tensor(1.4771, grad_fn=<NllLossBackward0>)
tensor(1.4801, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14863/17426 [26:34<05:13,  8.19it/s]

tensor(1.4723, grad_fn=<NllLossBackward0>)
tensor(1.4987, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14865/17426 [26:34<05:26,  7.83it/s]

tensor(1.4645, grad_fn=<NllLossBackward0>)
tensor(1.4561, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14867/17426 [26:35<05:14,  8.14it/s]

tensor(1.4788, grad_fn=<NllLossBackward0>)
tensor(1.5158, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14869/17426 [26:35<05:08,  8.29it/s]

tensor(1.5223, grad_fn=<NllLossBackward0>)
tensor(1.4512, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14871/17426 [26:35<05:06,  8.35it/s]

tensor(1.5245, grad_fn=<NllLossBackward0>)
tensor(1.5292, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14873/17426 [26:35<05:03,  8.43it/s]

tensor(1.5032, grad_fn=<NllLossBackward0>)
tensor(1.4685, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14875/17426 [26:36<05:09,  8.25it/s]

tensor(1.5105, grad_fn=<NllLossBackward0>)
tensor(1.4973, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14877/17426 [26:36<05:04,  8.37it/s]

tensor(1.4536, grad_fn=<NllLossBackward0>)
tensor(1.4902, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14879/17426 [26:36<05:03,  8.40it/s]

tensor(1.4642, grad_fn=<NllLossBackward0>)
tensor(1.5001, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14881/17426 [26:36<05:39,  7.50it/s]

tensor(1.4689, grad_fn=<NllLossBackward0>)
tensor(1.4962, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14883/17426 [26:37<05:53,  7.19it/s]

tensor(1.4702, grad_fn=<NllLossBackward0>)
tensor(1.5030, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14885/17426 [26:37<05:49,  7.28it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.4801, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14887/17426 [26:37<06:10,  6.86it/s]

tensor(1.4801, grad_fn=<NllLossBackward0>)
tensor(1.4994, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14889/17426 [26:38<06:03,  6.98it/s]

tensor(1.4067, grad_fn=<NllLossBackward0>)
tensor(1.5100, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14891/17426 [26:38<05:46,  7.32it/s]

tensor(1.5308, grad_fn=<NllLossBackward0>)
tensor(1.4695, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14893/17426 [26:38<05:23,  7.84it/s]

tensor(1.4510, grad_fn=<NllLossBackward0>)
tensor(1.4774, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14895/17426 [26:38<05:22,  7.86it/s]

tensor(1.4765, grad_fn=<NllLossBackward0>)
tensor(1.4970, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14897/17426 [26:39<05:18,  7.95it/s]

tensor(1.5015, grad_fn=<NllLossBackward0>)
tensor(1.4806, grad_fn=<NllLossBackward0>)


 85%|████████▌ | 14899/17426 [26:39<05:30,  7.65it/s]

tensor(1.4997, grad_fn=<NllLossBackward0>)
tensor(1.4664, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14901/17426 [26:39<05:36,  7.51it/s]

tensor(1.4769, grad_fn=<NllLossBackward0>)
tensor(1.4708, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14903/17426 [26:39<05:23,  7.80it/s]

tensor(1.4504, grad_fn=<NllLossBackward0>)
tensor(1.4984, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14905/17426 [26:40<05:37,  7.48it/s]

tensor(1.5311, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14907/17426 [26:40<05:49,  7.21it/s]

tensor(1.4721, grad_fn=<NllLossBackward0>)
tensor(1.5314, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14909/17426 [26:40<05:53,  7.11it/s]

tensor(1.4910, grad_fn=<NllLossBackward0>)
tensor(1.4457, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14911/17426 [26:40<05:53,  7.11it/s]

tensor(1.4959, grad_fn=<NllLossBackward0>)
tensor(1.4887, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14913/17426 [26:41<06:17,  6.65it/s]

tensor(1.5182, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14915/17426 [26:41<05:51,  7.15it/s]

tensor(1.4741, grad_fn=<NllLossBackward0>)
tensor(1.5223, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14917/17426 [26:41<05:20,  7.83it/s]

tensor(1.4881, grad_fn=<NllLossBackward0>)
tensor(1.5646, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14919/17426 [26:42<05:13,  7.99it/s]

tensor(1.4757, grad_fn=<NllLossBackward0>)
tensor(1.4621, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14921/17426 [26:42<05:05,  8.20it/s]

tensor(1.4824, grad_fn=<NllLossBackward0>)
tensor(1.4943, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14923/17426 [26:42<05:15,  7.93it/s]

tensor(1.4998, grad_fn=<NllLossBackward0>)
tensor(1.4679, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14925/17426 [26:42<05:04,  8.21it/s]

tensor(1.4965, grad_fn=<NllLossBackward0>)
tensor(1.4833, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14927/17426 [26:43<04:58,  8.36it/s]

tensor(1.4598, grad_fn=<NllLossBackward0>)
tensor(1.5125, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14929/17426 [26:43<04:55,  8.44it/s]

tensor(1.4925, grad_fn=<NllLossBackward0>)
tensor(1.4789, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14931/17426 [26:43<04:49,  8.62it/s]

tensor(1.5214, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14933/17426 [26:43<04:53,  8.48it/s]

tensor(1.4912, grad_fn=<NllLossBackward0>)
tensor(1.5059, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14935/17426 [26:43<04:50,  8.57it/s]

tensor(1.4294, grad_fn=<NllLossBackward0>)
tensor(1.4818, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14937/17426 [26:44<04:47,  8.65it/s]

tensor(1.4991, grad_fn=<NllLossBackward0>)
tensor(1.4693, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14939/17426 [26:44<04:45,  8.70it/s]

tensor(1.4783, grad_fn=<NllLossBackward0>)
tensor(1.4902, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14941/17426 [26:44<05:05,  8.14it/s]

tensor(1.4654, grad_fn=<NllLossBackward0>)
tensor(1.4590, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14943/17426 [26:44<04:53,  8.46it/s]

tensor(1.5321, grad_fn=<NllLossBackward0>)
tensor(1.4398, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14945/17426 [26:45<04:57,  8.34it/s]

tensor(1.4922, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14947/17426 [26:45<05:10,  7.99it/s]

tensor(1.4779, grad_fn=<NllLossBackward0>)
tensor(1.4896, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14949/17426 [26:45<05:03,  8.17it/s]

tensor(1.4750, grad_fn=<NllLossBackward0>)
tensor(1.4884, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14951/17426 [26:45<05:06,  8.07it/s]

tensor(1.4550, grad_fn=<NllLossBackward0>)
tensor(1.5081, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14953/17426 [26:46<05:05,  8.10it/s]

tensor(1.4580, grad_fn=<NllLossBackward0>)
tensor(1.4940, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14955/17426 [26:46<05:00,  8.23it/s]

tensor(1.4825, grad_fn=<NllLossBackward0>)
tensor(1.5147, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14957/17426 [26:46<05:06,  8.07it/s]

tensor(1.4761, grad_fn=<NllLossBackward0>)
tensor(1.4908, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14959/17426 [26:46<05:23,  7.63it/s]

tensor(1.4630, grad_fn=<NllLossBackward0>)
tensor(1.4797, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14961/17426 [26:47<05:16,  7.78it/s]

tensor(1.4380, grad_fn=<NllLossBackward0>)
tensor(1.4966, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14963/17426 [26:47<05:08,  7.99it/s]

tensor(1.4919, grad_fn=<NllLossBackward0>)
tensor(1.4633, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14965/17426 [26:47<04:57,  8.27it/s]

tensor(1.5252, grad_fn=<NllLossBackward0>)
tensor(1.5207, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14967/17426 [26:47<05:12,  7.86it/s]

tensor(1.4874, grad_fn=<NllLossBackward0>)
tensor(1.5330, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14969/17426 [26:48<05:06,  8.02it/s]

tensor(1.4845, grad_fn=<NllLossBackward0>)
tensor(1.4907, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14971/17426 [26:48<04:59,  8.20it/s]

tensor(1.5122, grad_fn=<NllLossBackward0>)
tensor(1.5114, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14973/17426 [26:48<04:57,  8.26it/s]

tensor(1.4577, grad_fn=<NllLossBackward0>)
tensor(1.4951, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14975/17426 [26:48<04:50,  8.43it/s]

tensor(1.5049, grad_fn=<NllLossBackward0>)
tensor(1.4489, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14977/17426 [26:49<04:59,  8.17it/s]

tensor(1.5066, grad_fn=<NllLossBackward0>)
tensor(1.4765, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14979/17426 [26:49<04:59,  8.18it/s]

tensor(1.4496, grad_fn=<NllLossBackward0>)
tensor(1.4839, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14981/17426 [26:49<04:56,  8.23it/s]

tensor(1.4786, grad_fn=<NllLossBackward0>)
tensor(1.5515, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14983/17426 [26:49<04:50,  8.42it/s]

tensor(1.4830, grad_fn=<NllLossBackward0>)
tensor(1.4689, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14985/17426 [26:50<05:05,  7.98it/s]

tensor(1.5121, grad_fn=<NllLossBackward0>)
tensor(1.5052, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14987/17426 [26:50<05:05,  7.99it/s]

tensor(1.4027, grad_fn=<NllLossBackward0>)
tensor(1.4908, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14989/17426 [26:50<04:59,  8.14it/s]

tensor(1.4989, grad_fn=<NllLossBackward0>)
tensor(1.4514, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14991/17426 [26:50<04:55,  8.23it/s]

tensor(1.4392, grad_fn=<NllLossBackward0>)
tensor(1.4876, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14993/17426 [26:51<05:10,  7.84it/s]

tensor(1.4793, grad_fn=<NllLossBackward0>)
tensor(1.4874, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14995/17426 [26:51<05:00,  8.10it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.4405, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14997/17426 [26:51<05:09,  7.84it/s]

tensor(1.4572, grad_fn=<NllLossBackward0>)
tensor(1.4926, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 14999/17426 [26:51<05:18,  7.61it/s]

tensor(1.5346, grad_fn=<NllLossBackward0>)
tensor(1.5040, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15001/17426 [26:52<05:30,  7.33it/s]

tensor(1.5215, grad_fn=<NllLossBackward0>)
tensor(1.4507, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15003/17426 [26:52<05:34,  7.25it/s]

tensor(1.4776, grad_fn=<NllLossBackward0>)
tensor(1.4866, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15005/17426 [26:52<05:21,  7.53it/s]

tensor(1.5064, grad_fn=<NllLossBackward0>)
tensor(1.4135, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15007/17426 [26:52<05:00,  8.06it/s]

tensor(1.4706, grad_fn=<NllLossBackward0>)
tensor(1.4416, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15009/17426 [26:53<05:20,  7.54it/s]

tensor(1.4755, grad_fn=<NllLossBackward0>)
tensor(1.4557, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15011/17426 [26:53<05:30,  7.31it/s]

tensor(1.4802, grad_fn=<NllLossBackward0>)
tensor(1.5002, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15013/17426 [26:53<05:16,  7.62it/s]

tensor(1.5007, grad_fn=<NllLossBackward0>)
tensor(1.4696, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15015/17426 [26:53<04:53,  8.21it/s]

tensor(1.4670, grad_fn=<NllLossBackward0>)
tensor(1.4877, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15017/17426 [26:54<05:07,  7.83it/s]

tensor(1.5309, grad_fn=<NllLossBackward0>)
tensor(1.4787, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15019/17426 [26:54<04:59,  8.05it/s]

tensor(1.4552, grad_fn=<NllLossBackward0>)
tensor(1.4746, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15021/17426 [26:54<05:15,  7.61it/s]

tensor(1.4735, grad_fn=<NllLossBackward0>)
tensor(1.4529, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15023/17426 [26:55<05:26,  7.36it/s]

tensor(1.4857, grad_fn=<NllLossBackward0>)
tensor(1.4728, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15025/17426 [26:55<05:46,  6.94it/s]

tensor(1.4663, grad_fn=<NllLossBackward0>)
tensor(1.4595, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15027/17426 [26:55<05:46,  6.93it/s]

tensor(1.4194, grad_fn=<NllLossBackward0>)
tensor(1.4675, grad_fn=<NllLossBackward0>)


 86%|████████▌ | 15029/17426 [26:55<05:47,  6.90it/s]

tensor(1.4715, grad_fn=<NllLossBackward0>)
tensor(1.4816, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15031/17426 [26:56<05:46,  6.91it/s]

tensor(1.4892, grad_fn=<NllLossBackward0>)
tensor(1.5305, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15033/17426 [26:56<05:34,  7.16it/s]

tensor(1.4907, grad_fn=<NllLossBackward0>)
tensor(1.4309, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15035/17426 [26:56<05:06,  7.81it/s]

tensor(1.4857, grad_fn=<NllLossBackward0>)
tensor(1.4593, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15037/17426 [26:56<04:54,  8.11it/s]

tensor(1.4793, grad_fn=<NllLossBackward0>)
tensor(1.5061, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15039/17426 [26:57<04:53,  8.14it/s]

tensor(1.4529, grad_fn=<NllLossBackward0>)
tensor(1.4318, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15041/17426 [26:57<05:09,  7.71it/s]

tensor(1.5039, grad_fn=<NllLossBackward0>)
tensor(1.4481, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15043/17426 [26:57<05:02,  7.89it/s]

tensor(1.4608, grad_fn=<NllLossBackward0>)
tensor(1.4301, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15045/17426 [26:57<04:56,  8.03it/s]

tensor(1.4831, grad_fn=<NllLossBackward0>)
tensor(1.4875, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15047/17426 [26:58<04:52,  8.13it/s]

tensor(1.4594, grad_fn=<NllLossBackward0>)
tensor(1.4824, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15049/17426 [26:58<05:07,  7.73it/s]

tensor(1.4647, grad_fn=<NllLossBackward0>)
tensor(1.4372, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15051/17426 [26:58<04:53,  8.10it/s]

tensor(1.4910, grad_fn=<NllLossBackward0>)
tensor(1.4800, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15053/17426 [26:58<04:50,  8.16it/s]

tensor(1.5313, grad_fn=<NllLossBackward0>)
tensor(1.4603, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15055/17426 [26:59<04:43,  8.37it/s]

tensor(1.4643, grad_fn=<NllLossBackward0>)
tensor(1.4607, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15057/17426 [26:59<04:52,  8.09it/s]

tensor(1.4894, grad_fn=<NllLossBackward0>)
tensor(1.4806, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15059/17426 [26:59<04:45,  8.28it/s]

tensor(1.4419, grad_fn=<NllLossBackward0>)
tensor(1.5013, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15061/17426 [26:59<04:43,  8.33it/s]

tensor(1.4627, grad_fn=<NllLossBackward0>)
tensor(1.4737, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15063/17426 [27:00<04:40,  8.44it/s]

tensor(1.5005, grad_fn=<NllLossBackward0>)
tensor(1.4757, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15065/17426 [27:00<04:43,  8.33it/s]

tensor(1.4913, grad_fn=<NllLossBackward0>)
tensor(1.4952, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15067/17426 [27:00<04:45,  8.26it/s]

tensor(1.4979, grad_fn=<NllLossBackward0>)
tensor(1.4590, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15069/17426 [27:00<04:47,  8.21it/s]

tensor(1.5085, grad_fn=<NllLossBackward0>)
tensor(1.5248, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15071/17426 [27:01<04:44,  8.26it/s]

tensor(1.5317, grad_fn=<NllLossBackward0>)
tensor(1.4598, grad_fn=<NllLossBackward0>)


 86%|████████▋ | 15073/17426 [27:01<04:46,  8.22it/s]

tensor(1.4836, grad_fn=<NllLossBackward0>)
tensor(1.4879, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15075/17426 [27:01<04:56,  7.93it/s]

tensor(1.5106, grad_fn=<NllLossBackward0>)
tensor(1.4844, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15077/17426 [27:01<04:48,  8.16it/s]

tensor(1.4726, grad_fn=<NllLossBackward0>)
tensor(1.5019, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15079/17426 [27:02<04:43,  8.27it/s]

tensor(1.4472, grad_fn=<NllLossBackward0>)
tensor(1.4986, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15081/17426 [27:02<04:46,  8.18it/s]

tensor(1.4773, grad_fn=<NllLossBackward0>)
tensor(1.5514, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15083/17426 [27:02<04:47,  8.14it/s]

tensor(1.5193, grad_fn=<NllLossBackward0>)
tensor(1.4636, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15085/17426 [27:02<04:44,  8.23it/s]

tensor(1.4822, grad_fn=<NllLossBackward0>)
tensor(1.5040, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15087/17426 [27:03<04:39,  8.36it/s]

tensor(1.4394, grad_fn=<NllLossBackward0>)
tensor(1.5030, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15089/17426 [27:03<04:44,  8.20it/s]

tensor(1.4295, grad_fn=<NllLossBackward0>)
tensor(1.5187, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15091/17426 [27:03<04:44,  8.21it/s]

tensor(1.5087, grad_fn=<NllLossBackward0>)
tensor(1.5022, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15093/17426 [27:03<04:42,  8.26it/s]

tensor(1.4313, grad_fn=<NllLossBackward0>)
tensor(1.5272, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15095/17426 [27:04<04:35,  8.45it/s]

tensor(1.4705, grad_fn=<NllLossBackward0>)
tensor(1.5231, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15097/17426 [27:04<04:38,  8.36it/s]

tensor(1.4422, grad_fn=<NllLossBackward0>)
tensor(1.4636, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15099/17426 [27:04<04:38,  8.36it/s]

tensor(1.4877, grad_fn=<NllLossBackward0>)
tensor(1.4965, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15101/17426 [27:04<04:51,  7.96it/s]

tensor(1.4888, grad_fn=<NllLossBackward0>)
tensor(1.5047, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15103/17426 [27:05<04:53,  7.91it/s]

tensor(1.4892, grad_fn=<NllLossBackward0>)
tensor(1.5140, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15105/17426 [27:05<04:47,  8.08it/s]

tensor(1.4837, grad_fn=<NllLossBackward0>)
tensor(1.4856, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15107/17426 [27:05<04:45,  8.14it/s]

tensor(1.4609, grad_fn=<NllLossBackward0>)
tensor(1.4698, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15109/17426 [27:05<04:57,  7.80it/s]

tensor(1.4337, grad_fn=<NllLossBackward0>)
tensor(1.5320, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15111/17426 [27:06<04:48,  8.01it/s]

tensor(1.5029, grad_fn=<NllLossBackward0>)
tensor(1.4359, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15113/17426 [27:06<04:46,  8.08it/s]

tensor(1.4635, grad_fn=<NllLossBackward0>)
tensor(1.5148, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15115/17426 [27:06<05:29,  7.01it/s]

tensor(1.4616, grad_fn=<NllLossBackward0>)
tensor(1.4587, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15117/17426 [27:06<05:40,  6.78it/s]

tensor(1.4822, grad_fn=<NllLossBackward0>)
tensor(1.4650, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15119/17426 [27:07<05:39,  6.80it/s]

tensor(1.4704, grad_fn=<NllLossBackward0>)
tensor(1.5505, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15121/17426 [27:07<05:18,  7.23it/s]

tensor(1.5131, grad_fn=<NllLossBackward0>)
tensor(1.4933, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15123/17426 [27:07<05:06,  7.52it/s]

tensor(1.4927, grad_fn=<NllLossBackward0>)
tensor(1.4835, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15125/17426 [27:08<05:11,  7.40it/s]

tensor(1.5031, grad_fn=<NllLossBackward0>)
tensor(1.5365, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15127/17426 [27:08<05:06,  7.50it/s]

tensor(1.4805, grad_fn=<NllLossBackward0>)
tensor(1.5167, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15129/17426 [27:08<05:04,  7.54it/s]

tensor(1.4417, grad_fn=<NllLossBackward0>)
tensor(1.4598, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15131/17426 [27:08<05:21,  7.15it/s]

tensor(1.5192, grad_fn=<NllLossBackward0>)
tensor(1.4590, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15133/17426 [27:09<05:23,  7.08it/s]

tensor(1.4679, grad_fn=<NllLossBackward0>)
tensor(1.4843, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15135/17426 [27:09<05:50,  6.54it/s]

tensor(1.4617, grad_fn=<NllLossBackward0>)
tensor(1.4853, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15137/17426 [27:09<05:49,  6.55it/s]

tensor(1.4880, grad_fn=<NllLossBackward0>)
tensor(1.4500, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15139/17426 [27:10<05:17,  7.21it/s]

tensor(1.4704, grad_fn=<NllLossBackward0>)
tensor(1.4644, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15141/17426 [27:10<05:40,  6.71it/s]

tensor(1.5441, grad_fn=<NllLossBackward0>)
tensor(1.5083, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15143/17426 [27:10<05:23,  7.06it/s]

tensor(1.5120, grad_fn=<NllLossBackward0>)
tensor(1.4744, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15145/17426 [27:10<05:42,  6.65it/s]

tensor(1.5004, grad_fn=<NllLossBackward0>)
tensor(1.4871, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15147/17426 [27:11<05:32,  6.86it/s]

tensor(1.4993, grad_fn=<NllLossBackward0>)
tensor(1.4655, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15149/17426 [27:11<05:05,  7.45it/s]

tensor(1.4616, grad_fn=<NllLossBackward0>)
tensor(1.4660, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15151/17426 [27:11<04:57,  7.66it/s]

tensor(1.4706, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15153/17426 [27:11<04:43,  8.01it/s]

tensor(1.4589, grad_fn=<NllLossBackward0>)
tensor(1.4881, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15155/17426 [27:12<04:33,  8.31it/s]

tensor(1.4497, grad_fn=<NllLossBackward0>)
tensor(1.4473, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15157/17426 [27:12<04:41,  8.07it/s]

tensor(1.5306, grad_fn=<NllLossBackward0>)
tensor(1.4614, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15159/17426 [27:12<04:36,  8.20it/s]

tensor(1.4739, grad_fn=<NllLossBackward0>)
tensor(1.4710, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15161/17426 [27:12<04:28,  8.44it/s]

tensor(1.4750, grad_fn=<NllLossBackward0>)
tensor(1.5023, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15163/17426 [27:13<04:43,  7.98it/s]

tensor(1.4895, grad_fn=<NllLossBackward0>)
tensor(1.5194, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15165/17426 [27:13<04:48,  7.84it/s]

tensor(1.4802, grad_fn=<NllLossBackward0>)
tensor(1.5214, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15167/17426 [27:13<04:41,  8.03it/s]

tensor(1.5481, grad_fn=<NllLossBackward0>)
tensor(1.4512, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15169/17426 [27:13<04:36,  8.16it/s]

tensor(1.5262, grad_fn=<NllLossBackward0>)
tensor(1.4890, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15171/17426 [27:14<04:30,  8.33it/s]

tensor(1.5082, grad_fn=<NllLossBackward0>)
tensor(1.4806, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15173/17426 [27:14<04:38,  8.09it/s]

tensor(1.4541, grad_fn=<NllLossBackward0>)
tensor(1.5056, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15175/17426 [27:14<04:33,  8.23it/s]

tensor(1.5006, grad_fn=<NllLossBackward0>)
tensor(1.4906, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15177/17426 [27:14<04:28,  8.37it/s]

tensor(1.4820, grad_fn=<NllLossBackward0>)
tensor(1.4357, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15179/17426 [27:15<04:24,  8.50it/s]

tensor(1.4971, grad_fn=<NllLossBackward0>)
tensor(1.4795, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15181/17426 [27:15<04:25,  8.45it/s]

tensor(1.4745, grad_fn=<NllLossBackward0>)
tensor(1.4647, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15183/17426 [27:15<04:33,  8.20it/s]

tensor(1.4325, grad_fn=<NllLossBackward0>)
tensor(1.4764, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15185/17426 [27:15<04:34,  8.17it/s]

tensor(1.4657, grad_fn=<NllLossBackward0>)
tensor(1.5151, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15187/17426 [27:16<04:26,  8.42it/s]

tensor(1.4600, grad_fn=<NllLossBackward0>)
tensor(1.4811, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15189/17426 [27:16<04:23,  8.50it/s]

tensor(1.4692, grad_fn=<NllLossBackward0>)
tensor(1.5082, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15191/17426 [27:16<04:35,  8.12it/s]

tensor(1.4929, grad_fn=<NllLossBackward0>)
tensor(1.4997, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15193/17426 [27:16<04:33,  8.16it/s]

tensor(1.5158, grad_fn=<NllLossBackward0>)
tensor(1.4906, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15195/17426 [27:17<04:32,  8.20it/s]

tensor(1.4470, grad_fn=<NllLossBackward0>)
tensor(1.5296, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15197/17426 [27:17<04:31,  8.22it/s]

tensor(1.5102, grad_fn=<NllLossBackward0>)
tensor(1.4958, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15199/17426 [27:17<04:31,  8.20it/s]

tensor(1.4743, grad_fn=<NllLossBackward0>)
tensor(1.4511, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15201/17426 [27:17<04:33,  8.14it/s]

tensor(1.4345, grad_fn=<NllLossBackward0>)
tensor(1.4569, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15203/17426 [27:18<04:27,  8.32it/s]

tensor(1.4655, grad_fn=<NllLossBackward0>)
tensor(1.5179, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15205/17426 [27:18<04:22,  8.46it/s]

tensor(1.4756, grad_fn=<NllLossBackward0>)
tensor(1.5058, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15207/17426 [27:18<04:17,  8.61it/s]

tensor(1.4679, grad_fn=<NllLossBackward0>)
tensor(1.4943, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15209/17426 [27:18<04:30,  8.19it/s]

tensor(1.4701, grad_fn=<NllLossBackward0>)
tensor(1.5034, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15211/17426 [27:18<04:30,  8.19it/s]

tensor(1.4943, grad_fn=<NllLossBackward0>)
tensor(1.4362, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15213/17426 [27:19<04:28,  8.24it/s]

tensor(1.4391, grad_fn=<NllLossBackward0>)
tensor(1.4584, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15215/17426 [27:19<04:23,  8.38it/s]

tensor(1.4702, grad_fn=<NllLossBackward0>)
tensor(1.5270, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15217/17426 [27:19<04:37,  7.97it/s]

tensor(1.4668, grad_fn=<NllLossBackward0>)
tensor(1.4784, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15219/17426 [27:19<04:27,  8.25it/s]

tensor(1.4785, grad_fn=<NllLossBackward0>)
tensor(1.5371, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15221/17426 [27:20<04:26,  8.28it/s]

tensor(1.5151, grad_fn=<NllLossBackward0>)
tensor(1.4662, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15223/17426 [27:20<04:21,  8.42it/s]

tensor(1.5217, grad_fn=<NllLossBackward0>)
tensor(1.4831, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15225/17426 [27:20<04:20,  8.46it/s]

tensor(1.5136, grad_fn=<NllLossBackward0>)
tensor(1.5222, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15227/17426 [27:20<04:31,  8.11it/s]

tensor(1.4893, grad_fn=<NllLossBackward0>)
tensor(1.4305, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15229/17426 [27:21<04:35,  7.97it/s]

tensor(1.5289, grad_fn=<NllLossBackward0>)
tensor(1.5714, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15231/17426 [27:21<04:50,  7.56it/s]

tensor(1.4969, grad_fn=<NllLossBackward0>)
tensor(1.4808, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15233/17426 [27:21<04:58,  7.36it/s]

tensor(1.4586, grad_fn=<NllLossBackward0>)
tensor(1.4540, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15235/17426 [27:22<04:57,  7.36it/s]

tensor(1.4814, grad_fn=<NllLossBackward0>)
tensor(1.4754, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15237/17426 [27:22<05:03,  7.20it/s]

tensor(1.4805, grad_fn=<NllLossBackward0>)
tensor(1.4815, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15239/17426 [27:22<05:01,  7.26it/s]

tensor(1.4493, grad_fn=<NllLossBackward0>)
tensor(1.5120, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15241/17426 [27:22<05:22,  6.77it/s]

tensor(1.5444, grad_fn=<NllLossBackward0>)
tensor(1.4552, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15243/17426 [27:23<05:18,  6.86it/s]

tensor(1.4636, grad_fn=<NllLossBackward0>)
tensor(1.4441, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15245/17426 [27:23<05:23,  6.75it/s]

tensor(1.4681, grad_fn=<NllLossBackward0>)
tensor(1.4841, grad_fn=<NllLossBackward0>)


 87%|████████▋ | 15247/17426 [27:23<05:27,  6.65it/s]

tensor(1.4841, grad_fn=<NllLossBackward0>)
tensor(1.4542, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15249/17426 [27:24<05:00,  7.24it/s]

tensor(1.4822, grad_fn=<NllLossBackward0>)
tensor(1.4735, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15251/17426 [27:24<05:25,  6.68it/s]

tensor(1.4919, grad_fn=<NllLossBackward0>)
tensor(1.5243, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15253/17426 [27:24<05:15,  6.89it/s]

tensor(1.4802, grad_fn=<NllLossBackward0>)
tensor(1.5069, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15255/17426 [27:24<05:12,  6.95it/s]

tensor(1.4949, grad_fn=<NllLossBackward0>)
tensor(1.4639, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15257/17426 [27:25<05:14,  6.89it/s]

tensor(1.4558, grad_fn=<NllLossBackward0>)
tensor(1.5098, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15259/17426 [27:25<05:14,  6.90it/s]

tensor(1.4420, grad_fn=<NllLossBackward0>)
tensor(1.5008, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15261/17426 [27:25<05:25,  6.65it/s]

tensor(1.4588, grad_fn=<NllLossBackward0>)
tensor(1.4805, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15263/17426 [27:26<04:54,  7.34it/s]

tensor(1.4976, grad_fn=<NllLossBackward0>)
tensor(1.5279, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15265/17426 [27:26<04:46,  7.55it/s]

tensor(1.4609, grad_fn=<NllLossBackward0>)
tensor(1.4621, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15267/17426 [27:26<04:41,  7.66it/s]

tensor(1.4940, grad_fn=<NllLossBackward0>)
tensor(1.4564, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15269/17426 [27:26<04:26,  8.08it/s]

tensor(1.5010, grad_fn=<NllLossBackward0>)
tensor(1.5113, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15271/17426 [27:27<04:18,  8.32it/s]

tensor(1.4682, grad_fn=<NllLossBackward0>)
tensor(1.5344, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15273/17426 [27:27<04:25,  8.12it/s]

tensor(1.4927, grad_fn=<NllLossBackward0>)
tensor(1.4572, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15275/17426 [27:27<04:20,  8.26it/s]

tensor(1.4591, grad_fn=<NllLossBackward0>)
tensor(1.4531, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15277/17426 [27:27<04:15,  8.41it/s]

tensor(1.5079, grad_fn=<NllLossBackward0>)
tensor(1.4605, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15279/17426 [27:28<04:14,  8.44it/s]

tensor(1.4864, grad_fn=<NllLossBackward0>)
tensor(1.5106, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15281/17426 [27:28<04:18,  8.31it/s]

tensor(1.5439, grad_fn=<NllLossBackward0>)
tensor(1.5067, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15283/17426 [27:28<04:19,  8.27it/s]

tensor(1.4892, grad_fn=<NllLossBackward0>)
tensor(1.4952, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15285/17426 [27:28<04:21,  8.18it/s]

tensor(1.5084, grad_fn=<NllLossBackward0>)
tensor(1.4730, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15287/17426 [27:28<04:18,  8.28it/s]

tensor(1.4621, grad_fn=<NllLossBackward0>)
tensor(1.4972, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15289/17426 [27:29<04:19,  8.23it/s]

tensor(1.4403, grad_fn=<NllLossBackward0>)
tensor(1.4983, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15291/17426 [27:29<04:22,  8.12it/s]

tensor(1.5258, grad_fn=<NllLossBackward0>)
tensor(1.5292, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15293/17426 [27:29<04:15,  8.35it/s]

tensor(1.5233, grad_fn=<NllLossBackward0>)
tensor(1.4669, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15295/17426 [27:29<04:14,  8.36it/s]

tensor(1.4675, grad_fn=<NllLossBackward0>)
tensor(1.4781, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15297/17426 [27:30<04:25,  8.03it/s]

tensor(1.4527, grad_fn=<NllLossBackward0>)
tensor(1.4820, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15299/17426 [27:30<04:34,  7.76it/s]

tensor(1.4994, grad_fn=<NllLossBackward0>)
tensor(1.4420, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15301/17426 [27:30<04:23,  8.06it/s]

tensor(1.5306, grad_fn=<NllLossBackward0>)
tensor(1.4809, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15303/17426 [27:30<04:17,  8.26it/s]

tensor(1.4852, grad_fn=<NllLossBackward0>)
tensor(1.5093, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15305/17426 [27:31<04:15,  8.29it/s]

tensor(1.4575, grad_fn=<NllLossBackward0>)
tensor(1.4812, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15307/17426 [27:31<04:21,  8.11it/s]

tensor(1.5009, grad_fn=<NllLossBackward0>)
tensor(1.4953, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15309/17426 [27:31<04:25,  7.96it/s]

tensor(1.5050, grad_fn=<NllLossBackward0>)
tensor(1.4539, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15311/17426 [27:31<04:16,  8.24it/s]

tensor(1.4965, grad_fn=<NllLossBackward0>)
tensor(1.5205, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15313/17426 [27:32<04:20,  8.12it/s]

tensor(1.4583, grad_fn=<NllLossBackward0>)
tensor(1.5546, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15315/17426 [27:32<04:13,  8.32it/s]

tensor(1.5050, grad_fn=<NllLossBackward0>)
tensor(1.4750, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15317/17426 [27:32<04:19,  8.14it/s]

tensor(1.4855, grad_fn=<NllLossBackward0>)
tensor(1.4729, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15319/17426 [27:32<04:10,  8.42it/s]

tensor(1.5375, grad_fn=<NllLossBackward0>)
tensor(1.4718, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15321/17426 [27:33<04:08,  8.47it/s]

tensor(1.4635, grad_fn=<NllLossBackward0>)
tensor(1.5054, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15323/17426 [27:33<04:10,  8.40it/s]

tensor(1.4649, grad_fn=<NllLossBackward0>)
tensor(1.4826, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15325/17426 [27:33<04:22,  8.01it/s]

tensor(1.5055, grad_fn=<NllLossBackward0>)
tensor(1.4484, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15327/17426 [27:33<04:10,  8.38it/s]

tensor(1.4758, grad_fn=<NllLossBackward0>)
tensor(1.4393, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15329/17426 [27:34<04:05,  8.53it/s]

tensor(1.4616, grad_fn=<NllLossBackward0>)
tensor(1.4721, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15331/17426 [27:34<04:06,  8.50it/s]

tensor(1.4716, grad_fn=<NllLossBackward0>)
tensor(1.5123, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15333/17426 [27:34<04:05,  8.51it/s]

tensor(1.4963, grad_fn=<NllLossBackward0>)
tensor(1.5129, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15335/17426 [27:34<04:12,  8.28it/s]

tensor(1.5311, grad_fn=<NllLossBackward0>)
tensor(1.5390, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15337/17426 [27:35<04:06,  8.48it/s]

tensor(1.5081, grad_fn=<NllLossBackward0>)
tensor(1.4984, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15339/17426 [27:35<04:08,  8.40it/s]

tensor(1.4741, grad_fn=<NllLossBackward0>)
tensor(1.4894, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15341/17426 [27:35<04:04,  8.51it/s]

tensor(1.4410, grad_fn=<NllLossBackward0>)
tensor(1.4882, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15343/17426 [27:35<04:21,  7.96it/s]

tensor(1.5345, grad_fn=<NllLossBackward0>)
tensor(1.5185, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15345/17426 [27:36<04:43,  7.33it/s]

tensor(1.4770, grad_fn=<NllLossBackward0>)
tensor(1.4967, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15347/17426 [27:36<04:41,  7.38it/s]

tensor(1.4596, grad_fn=<NllLossBackward0>)
tensor(1.4466, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15349/17426 [27:36<05:08,  6.73it/s]

tensor(1.5216, grad_fn=<NllLossBackward0>)
tensor(1.5235, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15351/17426 [27:36<04:53,  7.06it/s]

tensor(1.4745, grad_fn=<NllLossBackward0>)
tensor(1.4663, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15353/17426 [27:37<04:39,  7.42it/s]

tensor(1.4492, grad_fn=<NllLossBackward0>)
tensor(1.4746, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15355/17426 [27:37<04:36,  7.49it/s]

tensor(1.4339, grad_fn=<NllLossBackward0>)
tensor(1.4700, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15357/17426 [27:37<04:44,  7.27it/s]

tensor(1.4462, grad_fn=<NllLossBackward0>)
tensor(1.4815, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15359/17426 [27:38<05:13,  6.59it/s]

tensor(1.4534, grad_fn=<NllLossBackward0>)
tensor(1.4747, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15361/17426 [27:38<04:57,  6.94it/s]

tensor(1.4511, grad_fn=<NllLossBackward0>)
tensor(1.4619, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15363/17426 [27:38<04:49,  7.13it/s]

tensor(1.5403, grad_fn=<NllLossBackward0>)
tensor(1.4995, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15365/17426 [27:38<05:02,  6.82it/s]

tensor(1.5020, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15367/17426 [27:39<04:58,  6.91it/s]

tensor(1.5173, grad_fn=<NllLossBackward0>)
tensor(1.4802, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15369/17426 [27:39<04:53,  7.01it/s]

tensor(1.4764, grad_fn=<NllLossBackward0>)
tensor(1.4758, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15371/17426 [27:39<05:03,  6.76it/s]

tensor(1.4149, grad_fn=<NllLossBackward0>)
tensor(1.4996, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15373/17426 [27:40<04:57,  6.91it/s]

tensor(1.4796, grad_fn=<NllLossBackward0>)
tensor(1.4588, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15375/17426 [27:40<05:02,  6.78it/s]

tensor(1.5028, grad_fn=<NllLossBackward0>)
tensor(1.4839, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15377/17426 [27:40<04:39,  7.33it/s]

tensor(1.4788, grad_fn=<NllLossBackward0>)
tensor(1.4925, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15379/17426 [27:40<04:17,  7.94it/s]

tensor(1.4462, grad_fn=<NllLossBackward0>)
tensor(1.4877, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15381/17426 [27:41<04:18,  7.91it/s]

tensor(1.5047, grad_fn=<NllLossBackward0>)
tensor(1.4382, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15383/17426 [27:41<04:10,  8.15it/s]

tensor(1.4881, grad_fn=<NllLossBackward0>)
tensor(1.4734, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15385/17426 [27:41<04:18,  7.90it/s]

tensor(1.4536, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15387/17426 [27:41<04:09,  8.18it/s]

tensor(1.5076, grad_fn=<NllLossBackward0>)
tensor(1.5297, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15389/17426 [27:42<04:11,  8.09it/s]

tensor(1.4315, grad_fn=<NllLossBackward0>)
tensor(1.5190, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15391/17426 [27:42<04:05,  8.28it/s]

tensor(1.4426, grad_fn=<NllLossBackward0>)
tensor(1.4879, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15393/17426 [27:42<04:05,  8.29it/s]

tensor(1.4761, grad_fn=<NllLossBackward0>)
tensor(1.4872, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15395/17426 [27:42<04:05,  8.28it/s]

tensor(1.4392, grad_fn=<NllLossBackward0>)
tensor(1.4929, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15397/17426 [27:43<04:03,  8.35it/s]

tensor(1.5110, grad_fn=<NllLossBackward0>)
tensor(1.4573, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15399/17426 [27:43<04:07,  8.18it/s]

tensor(1.4868, grad_fn=<NllLossBackward0>)
tensor(1.4908, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15401/17426 [27:43<04:05,  8.27it/s]

tensor(1.4940, grad_fn=<NllLossBackward0>)
tensor(1.4700, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15403/17426 [27:43<03:58,  8.48it/s]

tensor(1.4752, grad_fn=<NllLossBackward0>)
tensor(1.4668, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15405/17426 [27:44<03:59,  8.45it/s]

tensor(1.5056, grad_fn=<NllLossBackward0>)
tensor(1.4978, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15407/17426 [27:44<04:16,  7.88it/s]

tensor(1.4746, grad_fn=<NllLossBackward0>)
tensor(1.4723, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15409/17426 [27:44<04:08,  8.11it/s]

tensor(1.4721, grad_fn=<NllLossBackward0>)
tensor(1.5081, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15411/17426 [27:44<04:05,  8.20it/s]

tensor(1.5152, grad_fn=<NllLossBackward0>)
tensor(1.4711, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15413/17426 [27:45<04:09,  8.08it/s]

tensor(1.5182, grad_fn=<NllLossBackward0>)
tensor(1.5167, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15415/17426 [27:45<04:04,  8.23it/s]

tensor(1.4804, grad_fn=<NllLossBackward0>)
tensor(1.4750, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15417/17426 [27:45<04:12,  7.96it/s]

tensor(1.4604, grad_fn=<NllLossBackward0>)
tensor(1.4838, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15419/17426 [27:45<04:09,  8.04it/s]

tensor(1.4933, grad_fn=<NllLossBackward0>)
tensor(1.5263, grad_fn=<NllLossBackward0>)


 88%|████████▊ | 15421/17426 [27:46<04:03,  8.25it/s]

tensor(1.4552, grad_fn=<NllLossBackward0>)
tensor(1.5171, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15423/17426 [27:46<04:03,  8.22it/s]

tensor(1.4795, grad_fn=<NllLossBackward0>)
tensor(1.4790, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15425/17426 [27:46<04:18,  7.74it/s]

tensor(1.4848, grad_fn=<NllLossBackward0>)
tensor(1.4682, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15427/17426 [27:46<04:10,  7.97it/s]

tensor(1.4966, grad_fn=<NllLossBackward0>)
tensor(1.4682, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15429/17426 [27:47<04:01,  8.27it/s]

tensor(1.5017, grad_fn=<NllLossBackward0>)
tensor(1.4542, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15431/17426 [27:47<04:05,  8.12it/s]

tensor(1.4907, grad_fn=<NllLossBackward0>)
tensor(1.4597, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15433/17426 [27:47<04:07,  8.06it/s]

tensor(1.4970, grad_fn=<NllLossBackward0>)
tensor(1.4788, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15435/17426 [27:47<03:56,  8.43it/s]

tensor(1.5307, grad_fn=<NllLossBackward0>)
tensor(1.4569, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15437/17426 [27:48<03:50,  8.64it/s]

tensor(1.4487, grad_fn=<NllLossBackward0>)
tensor(1.4899, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15439/17426 [27:48<03:48,  8.71it/s]

tensor(1.4258, grad_fn=<NllLossBackward0>)
tensor(1.4778, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15441/17426 [27:48<03:57,  8.37it/s]

tensor(1.5038, grad_fn=<NllLossBackward0>)
tensor(1.5219, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15443/17426 [27:48<04:07,  8.02it/s]

tensor(1.4866, grad_fn=<NllLossBackward0>)
tensor(1.4541, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15445/17426 [27:49<03:57,  8.34it/s]

tensor(1.4807, grad_fn=<NllLossBackward0>)
tensor(1.4572, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15447/17426 [27:49<03:54,  8.46it/s]

tensor(1.4649, grad_fn=<NllLossBackward0>)
tensor(1.4574, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15449/17426 [27:49<03:54,  8.42it/s]

tensor(1.4495, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15451/17426 [27:49<04:13,  7.80it/s]

tensor(1.4493, grad_fn=<NllLossBackward0>)
tensor(1.4793, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15453/17426 [27:49<04:02,  8.13it/s]

tensor(1.4477, grad_fn=<NllLossBackward0>)
tensor(1.4882, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15455/17426 [27:50<03:59,  8.24it/s]

tensor(1.4571, grad_fn=<NllLossBackward0>)
tensor(1.4700, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15457/17426 [27:50<03:56,  8.31it/s]

tensor(1.4779, grad_fn=<NllLossBackward0>)
tensor(1.4996, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15459/17426 [27:50<04:23,  7.46it/s]

tensor(1.4968, grad_fn=<NllLossBackward0>)
tensor(1.4979, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15461/17426 [27:51<04:34,  7.15it/s]

tensor(1.4467, grad_fn=<NllLossBackward0>)
tensor(1.4583, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15463/17426 [27:51<04:20,  7.53it/s]

tensor(1.4773, grad_fn=<NllLossBackward0>)
tensor(1.4838, grad_fn=<NllLossBackward0>)


 89%|████████▊ | 15465/17426 [27:51<04:22,  7.48it/s]

tensor(1.5162, grad_fn=<NllLossBackward0>)
tensor(1.4849, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15467/17426 [27:51<04:30,  7.25it/s]

tensor(1.4661, grad_fn=<NllLossBackward0>)
tensor(1.5003, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15469/17426 [27:52<04:09,  7.84it/s]

tensor(1.4593, grad_fn=<NllLossBackward0>)
tensor(1.4775, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15471/17426 [27:52<04:07,  7.89it/s]

tensor(1.4750, grad_fn=<NllLossBackward0>)
tensor(1.4550, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15473/17426 [27:52<04:09,  7.82it/s]

tensor(1.4573, grad_fn=<NllLossBackward0>)
tensor(1.4973, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15475/17426 [27:52<04:05,  7.96it/s]

tensor(1.4971, grad_fn=<NllLossBackward0>)
tensor(1.4958, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15477/17426 [27:53<04:00,  8.10it/s]

tensor(1.4719, grad_fn=<NllLossBackward0>)
tensor(1.5297, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15479/17426 [27:53<03:51,  8.40it/s]

tensor(1.4591, grad_fn=<NllLossBackward0>)
tensor(1.4530, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15481/17426 [27:53<03:54,  8.31it/s]

tensor(1.5051, grad_fn=<NllLossBackward0>)
tensor(1.5547, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15483/17426 [27:53<04:27,  7.25it/s]

tensor(1.4722, grad_fn=<NllLossBackward0>)
tensor(1.4665, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15485/17426 [27:54<04:42,  6.88it/s]

tensor(1.4781, grad_fn=<NllLossBackward0>)
tensor(1.4623, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15487/17426 [27:54<04:16,  7.55it/s]

tensor(1.4655, grad_fn=<NllLossBackward0>)
tensor(1.4800, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15489/17426 [27:54<04:37,  6.99it/s]

tensor(1.4796, grad_fn=<NllLossBackward0>)
tensor(1.5026, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15491/17426 [27:54<04:29,  7.17it/s]

tensor(1.5103, grad_fn=<NllLossBackward0>)
tensor(1.4788, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15493/17426 [27:55<04:33,  7.06it/s]

tensor(1.4819, grad_fn=<NllLossBackward0>)
tensor(1.4961, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15495/17426 [27:55<04:34,  7.03it/s]

tensor(1.4938, grad_fn=<NllLossBackward0>)
tensor(1.4674, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15497/17426 [27:55<04:08,  7.77it/s]

tensor(1.4696, grad_fn=<NllLossBackward0>)
tensor(1.5085, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15499/17426 [27:56<04:06,  7.83it/s]

tensor(1.4698, grad_fn=<NllLossBackward0>)
tensor(1.4834, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15501/17426 [27:56<04:09,  7.70it/s]

tensor(1.4241, grad_fn=<NllLossBackward0>)
tensor(1.4368, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15503/17426 [27:56<03:58,  8.07it/s]

tensor(1.4992, grad_fn=<NllLossBackward0>)
tensor(1.5157, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15505/17426 [27:56<03:54,  8.20it/s]

tensor(1.4283, grad_fn=<NllLossBackward0>)
tensor(1.4398, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15507/17426 [27:57<03:58,  8.06it/s]

tensor(1.4765, grad_fn=<NllLossBackward0>)
tensor(1.5020, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15509/17426 [27:57<03:59,  7.99it/s]

tensor(1.5798, grad_fn=<NllLossBackward0>)
tensor(1.4783, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15511/17426 [27:57<03:53,  8.21it/s]

tensor(1.4768, grad_fn=<NllLossBackward0>)
tensor(1.4942, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15513/17426 [27:57<03:53,  8.19it/s]

tensor(1.4686, grad_fn=<NllLossBackward0>)
tensor(1.4437, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15515/17426 [27:58<03:53,  8.17it/s]

tensor(1.4387, grad_fn=<NllLossBackward0>)
tensor(1.4394, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15517/17426 [27:58<03:56,  8.07it/s]

tensor(1.5019, grad_fn=<NllLossBackward0>)
tensor(1.4702, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15519/17426 [27:58<03:57,  8.04it/s]

tensor(1.5073, grad_fn=<NllLossBackward0>)
tensor(1.4652, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15521/17426 [27:58<03:47,  8.37it/s]

tensor(1.4884, grad_fn=<NllLossBackward0>)
tensor(1.5165, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15523/17426 [27:59<03:47,  8.37it/s]

tensor(1.4817, grad_fn=<NllLossBackward0>)
tensor(1.4551, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15525/17426 [27:59<03:54,  8.12it/s]

tensor(1.4607, grad_fn=<NllLossBackward0>)
tensor(1.4381, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15527/17426 [27:59<03:49,  8.29it/s]

tensor(1.4783, grad_fn=<NllLossBackward0>)
tensor(1.5263, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15529/17426 [27:59<03:45,  8.42it/s]

tensor(1.5076, grad_fn=<NllLossBackward0>)
tensor(1.4562, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15531/17426 [27:59<03:44,  8.43it/s]

tensor(1.4796, grad_fn=<NllLossBackward0>)
tensor(1.4785, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15533/17426 [28:00<03:44,  8.43it/s]

tensor(1.5215, grad_fn=<NllLossBackward0>)
tensor(1.4974, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15535/17426 [28:00<03:57,  7.98it/s]

tensor(1.4850, grad_fn=<NllLossBackward0>)
tensor(1.4647, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15537/17426 [28:00<03:49,  8.23it/s]

tensor(1.4956, grad_fn=<NllLossBackward0>)
tensor(1.4958, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15539/17426 [28:00<03:48,  8.27it/s]

tensor(1.4985, grad_fn=<NllLossBackward0>)
tensor(1.4897, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15541/17426 [28:01<03:46,  8.33it/s]

tensor(1.4899, grad_fn=<NllLossBackward0>)
tensor(1.4788, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15543/17426 [28:01<03:47,  8.26it/s]

tensor(1.4800, grad_fn=<NllLossBackward0>)
tensor(1.4750, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15545/17426 [28:01<03:49,  8.21it/s]

tensor(1.4966, grad_fn=<NllLossBackward0>)
tensor(1.4473, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15547/17426 [28:01<03:43,  8.41it/s]

tensor(1.4236, grad_fn=<NllLossBackward0>)
tensor(1.4582, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15549/17426 [28:02<03:43,  8.40it/s]

tensor(1.4759, grad_fn=<NllLossBackward0>)
tensor(1.4918, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15551/17426 [28:02<03:43,  8.40it/s]

tensor(1.4805, grad_fn=<NllLossBackward0>)
tensor(1.5091, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15553/17426 [28:02<03:52,  8.06it/s]

tensor(1.4151, grad_fn=<NllLossBackward0>)
tensor(1.4909, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15555/17426 [28:02<03:46,  8.26it/s]

tensor(1.4626, grad_fn=<NllLossBackward0>)
tensor(1.5253, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15557/17426 [28:03<03:43,  8.36it/s]

tensor(1.4915, grad_fn=<NllLossBackward0>)
tensor(1.5235, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15559/17426 [28:03<03:39,  8.51it/s]

tensor(1.5564, grad_fn=<NllLossBackward0>)
tensor(1.4589, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15561/17426 [28:03<03:49,  8.13it/s]

tensor(1.4675, grad_fn=<NllLossBackward0>)
tensor(1.4942, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15563/17426 [28:03<03:39,  8.50it/s]

tensor(1.5149, grad_fn=<NllLossBackward0>)
tensor(1.4765, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15565/17426 [28:04<03:40,  8.44it/s]

tensor(1.4567, grad_fn=<NllLossBackward0>)
tensor(1.4776, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15567/17426 [28:04<03:37,  8.55it/s]

tensor(1.4522, grad_fn=<NllLossBackward0>)
tensor(1.4461, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15569/17426 [28:04<03:42,  8.34it/s]

tensor(1.4946, grad_fn=<NllLossBackward0>)
tensor(1.4844, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15571/17426 [28:04<03:50,  8.06it/s]

tensor(1.5005, grad_fn=<NllLossBackward0>)
tensor(1.5136, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15573/17426 [28:05<03:43,  8.27it/s]

tensor(1.4704, grad_fn=<NllLossBackward0>)
tensor(1.4808, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15575/17426 [28:05<03:43,  8.28it/s]

tensor(1.4993, grad_fn=<NllLossBackward0>)
tensor(1.4654, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15577/17426 [28:05<03:53,  7.91it/s]

tensor(1.4704, grad_fn=<NllLossBackward0>)
tensor(1.4971, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15579/17426 [28:05<04:14,  7.27it/s]

tensor(1.4495, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15581/17426 [28:06<04:09,  7.38it/s]

tensor(1.4537, grad_fn=<NllLossBackward0>)
tensor(1.4941, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15583/17426 [28:06<04:06,  7.48it/s]

tensor(1.4545, grad_fn=<NllLossBackward0>)
tensor(1.4632, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15585/17426 [28:06<04:07,  7.45it/s]

tensor(1.4091, grad_fn=<NllLossBackward0>)
tensor(1.4560, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15587/17426 [28:06<04:12,  7.28it/s]

tensor(1.4869, grad_fn=<NllLossBackward0>)
tensor(1.4700, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15589/17426 [28:07<03:58,  7.70it/s]

tensor(1.4525, grad_fn=<NllLossBackward0>)
tensor(1.5219, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15591/17426 [28:07<03:58,  7.70it/s]

tensor(1.4640, grad_fn=<NllLossBackward0>)
tensor(1.5047, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15593/17426 [28:07<04:13,  7.23it/s]

tensor(1.5535, grad_fn=<NllLossBackward0>)
tensor(1.4704, grad_fn=<NllLossBackward0>)


 89%|████████▉ | 15595/17426 [28:08<04:06,  7.42it/s]

tensor(1.4987, grad_fn=<NllLossBackward0>)
tensor(1.4576, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15597/17426 [28:08<04:18,  7.06it/s]

tensor(1.4812, grad_fn=<NllLossBackward0>)
tensor(1.4438, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15599/17426 [28:08<04:23,  6.94it/s]

tensor(1.4821, grad_fn=<NllLossBackward0>)
tensor(1.4451, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15601/17426 [28:08<04:29,  6.78it/s]

tensor(1.4393, grad_fn=<NllLossBackward0>)
tensor(1.5225, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15603/17426 [28:09<04:32,  6.69it/s]

tensor(1.4868, grad_fn=<NllLossBackward0>)
tensor(1.4835, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15605/17426 [28:09<04:40,  6.49it/s]

tensor(1.4752, grad_fn=<NllLossBackward0>)
tensor(1.5030, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15607/17426 [28:09<04:40,  6.49it/s]

tensor(1.4733, grad_fn=<NllLossBackward0>)
tensor(1.4905, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15609/17426 [28:10<04:46,  6.35it/s]

tensor(1.4463, grad_fn=<NllLossBackward0>)
tensor(1.4807, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15611/17426 [28:10<04:26,  6.80it/s]

tensor(1.4717, grad_fn=<NllLossBackward0>)
tensor(1.4873, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15613/17426 [28:10<04:07,  7.33it/s]

tensor(1.4839, grad_fn=<NllLossBackward0>)
tensor(1.5190, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15615/17426 [28:10<03:55,  7.69it/s]

tensor(1.5036, grad_fn=<NllLossBackward0>)
tensor(1.5104, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15617/17426 [28:11<03:48,  7.91it/s]

tensor(1.4795, grad_fn=<NllLossBackward0>)
tensor(1.5185, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15619/17426 [28:11<03:50,  7.85it/s]

tensor(1.4904, grad_fn=<NllLossBackward0>)
tensor(1.4728, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15621/17426 [28:11<03:46,  7.98it/s]

tensor(1.4945, grad_fn=<NllLossBackward0>)
tensor(1.4919, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15623/17426 [28:11<03:39,  8.21it/s]

tensor(1.4728, grad_fn=<NllLossBackward0>)
tensor(1.4752, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15625/17426 [28:12<03:44,  8.02it/s]

tensor(1.4671, grad_fn=<NllLossBackward0>)
tensor(1.4782, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15627/17426 [28:12<03:46,  7.94it/s]

tensor(1.4929, grad_fn=<NllLossBackward0>)
tensor(1.4656, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15629/17426 [28:12<03:40,  8.16it/s]

tensor(1.4604, grad_fn=<NllLossBackward0>)
tensor(1.4505, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15631/17426 [28:12<03:33,  8.40it/s]

tensor(1.4625, grad_fn=<NllLossBackward0>)
tensor(1.4974, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15633/17426 [28:13<03:33,  8.41it/s]

tensor(1.4690, grad_fn=<NllLossBackward0>)
tensor(1.4654, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15635/17426 [28:13<03:43,  8.00it/s]

tensor(1.4861, grad_fn=<NllLossBackward0>)
tensor(1.4536, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15637/17426 [28:13<03:35,  8.29it/s]

tensor(1.4699, grad_fn=<NllLossBackward0>)
tensor(1.4562, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15639/17426 [28:13<03:32,  8.42it/s]

tensor(1.5048, grad_fn=<NllLossBackward0>)
tensor(1.4888, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15641/17426 [28:14<03:41,  8.06it/s]

tensor(1.5027, grad_fn=<NllLossBackward0>)
tensor(1.4884, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15643/17426 [28:14<03:42,  8.01it/s]

tensor(1.4842, grad_fn=<NllLossBackward0>)
tensor(1.4776, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15645/17426 [28:14<03:51,  7.70it/s]

tensor(1.4852, grad_fn=<NllLossBackward0>)
tensor(1.5083, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15647/17426 [28:14<03:39,  8.09it/s]

tensor(1.4658, grad_fn=<NllLossBackward0>)
tensor(1.4771, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15649/17426 [28:15<03:35,  8.23it/s]

tensor(1.4770, grad_fn=<NllLossBackward0>)
tensor(1.4582, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15651/17426 [28:15<03:43,  7.95it/s]

tensor(1.5254, grad_fn=<NllLossBackward0>)
tensor(1.4488, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15653/17426 [28:15<03:53,  7.60it/s]

tensor(1.4829, grad_fn=<NllLossBackward0>)
tensor(1.4613, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15655/17426 [28:15<03:44,  7.87it/s]

tensor(1.4877, grad_fn=<NllLossBackward0>)
tensor(1.5293, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15657/17426 [28:16<03:38,  8.09it/s]

tensor(1.4764, grad_fn=<NllLossBackward0>)
tensor(1.4629, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15659/17426 [28:16<03:41,  7.97it/s]

tensor(1.4925, grad_fn=<NllLossBackward0>)
tensor(1.4545, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15661/17426 [28:16<03:45,  7.83it/s]

tensor(1.5276, grad_fn=<NllLossBackward0>)
tensor(1.4900, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15663/17426 [28:16<03:42,  7.92it/s]

tensor(1.4587, grad_fn=<NllLossBackward0>)
tensor(1.4639, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15665/17426 [28:17<03:40,  7.99it/s]

tensor(1.4728, grad_fn=<NllLossBackward0>)
tensor(1.4861, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15667/17426 [28:17<03:47,  7.73it/s]

tensor(1.4982, grad_fn=<NllLossBackward0>)
tensor(1.4777, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15669/17426 [28:17<03:54,  7.48it/s]

tensor(1.4727, grad_fn=<NllLossBackward0>)
tensor(1.5012, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15671/17426 [28:17<03:39,  7.99it/s]

tensor(1.4854, grad_fn=<NllLossBackward0>)
tensor(1.4667, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15673/17426 [28:18<03:34,  8.17it/s]

tensor(1.4858, grad_fn=<NllLossBackward0>)
tensor(1.4828, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15675/17426 [28:18<03:32,  8.25it/s]

tensor(1.5007, grad_fn=<NllLossBackward0>)
tensor(1.5091, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15677/17426 [28:18<03:31,  8.26it/s]

tensor(1.4714, grad_fn=<NllLossBackward0>)
tensor(1.4449, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15679/17426 [28:18<03:34,  8.14it/s]

tensor(1.4737, grad_fn=<NllLossBackward0>)
tensor(1.4918, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15681/17426 [28:19<03:29,  8.35it/s]

tensor(1.4696, grad_fn=<NllLossBackward0>)
tensor(1.5269, grad_fn=<NllLossBackward0>)


 90%|████████▉ | 15683/17426 [28:19<03:27,  8.41it/s]

tensor(1.4616, grad_fn=<NllLossBackward0>)
tensor(1.4771, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15685/17426 [28:19<03:28,  8.35it/s]

tensor(1.4548, grad_fn=<NllLossBackward0>)
tensor(1.4627, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15687/17426 [28:19<03:39,  7.93it/s]

tensor(1.4777, grad_fn=<NllLossBackward0>)
tensor(1.4684, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15689/17426 [28:20<03:29,  8.31it/s]

tensor(1.4914, grad_fn=<NllLossBackward0>)
tensor(1.5514, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15691/17426 [28:20<03:32,  8.18it/s]

tensor(1.4842, grad_fn=<NllLossBackward0>)
tensor(1.4780, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15693/17426 [28:20<03:45,  7.68it/s]

tensor(1.4441, grad_fn=<NllLossBackward0>)
tensor(1.4678, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15695/17426 [28:20<04:08,  6.98it/s]

tensor(1.4704, grad_fn=<NllLossBackward0>)
tensor(1.4766, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15697/17426 [28:21<03:55,  7.35it/s]

tensor(1.4889, grad_fn=<NllLossBackward0>)
tensor(1.4624, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15699/17426 [28:21<03:50,  7.50it/s]

tensor(1.4853, grad_fn=<NllLossBackward0>)
tensor(1.4498, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15701/17426 [28:21<03:53,  7.38it/s]

tensor(1.5391, grad_fn=<NllLossBackward0>)
tensor(1.4707, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15703/17426 [28:22<03:54,  7.35it/s]

tensor(1.5154, grad_fn=<NllLossBackward0>)
tensor(1.5027, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15705/17426 [28:22<03:35,  7.98it/s]

tensor(1.5133, grad_fn=<NllLossBackward0>)
tensor(1.4506, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15707/17426 [28:22<03:35,  7.97it/s]

tensor(1.4542, grad_fn=<NllLossBackward0>)
tensor(1.4869, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15709/17426 [28:22<03:34,  8.00it/s]

tensor(1.5056, grad_fn=<NllLossBackward0>)
tensor(1.4394, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15711/17426 [28:22<03:25,  8.36it/s]

tensor(1.4911, grad_fn=<NllLossBackward0>)
tensor(1.4931, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15713/17426 [28:23<03:29,  8.19it/s]

tensor(1.5004, grad_fn=<NllLossBackward0>)
tensor(1.4774, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15715/17426 [28:23<03:20,  8.53it/s]

tensor(1.5002, grad_fn=<NllLossBackward0>)
tensor(1.4788, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15717/17426 [28:23<03:40,  7.73it/s]

tensor(1.4929, grad_fn=<NllLossBackward0>)
tensor(1.4635, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15719/17426 [28:24<03:49,  7.43it/s]

tensor(1.5144, grad_fn=<NllLossBackward0>)
tensor(1.5093, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15721/17426 [28:24<03:59,  7.13it/s]

tensor(1.5169, grad_fn=<NllLossBackward0>)
tensor(1.5351, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15723/17426 [28:24<04:09,  6.82it/s]

tensor(1.4642, grad_fn=<NllLossBackward0>)
tensor(1.5069, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15725/17426 [28:24<04:19,  6.56it/s]

tensor(1.4852, grad_fn=<NllLossBackward0>)
tensor(1.4865, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15727/17426 [28:25<04:10,  6.78it/s]

tensor(1.4666, grad_fn=<NllLossBackward0>)
tensor(1.4396, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15729/17426 [28:25<04:27,  6.35it/s]

tensor(1.4938, grad_fn=<NllLossBackward0>)
tensor(1.4830, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15731/17426 [28:25<03:56,  7.17it/s]

tensor(1.4979, grad_fn=<NllLossBackward0>)
tensor(1.4902, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15733/17426 [28:26<03:37,  7.80it/s]

tensor(1.4978, grad_fn=<NllLossBackward0>)
tensor(1.4912, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15735/17426 [28:26<03:30,  8.05it/s]

tensor(1.5418, grad_fn=<NllLossBackward0>)
tensor(1.4867, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15737/17426 [28:26<03:36,  7.80it/s]

tensor(1.4737, grad_fn=<NllLossBackward0>)
tensor(1.4933, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15739/17426 [28:26<03:30,  8.03it/s]

tensor(1.5008, grad_fn=<NllLossBackward0>)
tensor(1.4353, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15741/17426 [28:27<03:23,  8.27it/s]

tensor(1.5012, grad_fn=<NllLossBackward0>)
tensor(1.4990, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15743/17426 [28:27<03:20,  8.38it/s]

tensor(1.4659, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15745/17426 [28:27<03:26,  8.13it/s]

tensor(1.4837, grad_fn=<NllLossBackward0>)
tensor(1.4796, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15747/17426 [28:27<03:33,  7.87it/s]

tensor(1.4869, grad_fn=<NllLossBackward0>)
tensor(1.5338, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15749/17426 [28:28<03:31,  7.92it/s]

tensor(1.4818, grad_fn=<NllLossBackward0>)
tensor(1.4536, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15751/17426 [28:28<03:29,  7.98it/s]

tensor(1.4574, grad_fn=<NllLossBackward0>)
tensor(1.4428, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15753/17426 [28:28<03:38,  7.64it/s]

tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.4603, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15755/17426 [28:28<03:31,  7.92it/s]

tensor(1.4537, grad_fn=<NllLossBackward0>)
tensor(1.4614, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15757/17426 [28:29<03:28,  8.00it/s]

tensor(1.5041, grad_fn=<NllLossBackward0>)
tensor(1.5127, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15759/17426 [28:29<03:24,  8.17it/s]

tensor(1.4789, grad_fn=<NllLossBackward0>)
tensor(1.4878, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15761/17426 [28:29<03:29,  7.94it/s]

tensor(1.4363, grad_fn=<NllLossBackward0>)
tensor(1.4221, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15763/17426 [28:29<03:34,  7.74it/s]

tensor(1.5063, grad_fn=<NllLossBackward0>)
tensor(1.4766, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15765/17426 [28:30<03:27,  8.02it/s]

tensor(1.4966, grad_fn=<NllLossBackward0>)
tensor(1.4145, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15767/17426 [28:30<03:27,  7.99it/s]

tensor(1.4394, grad_fn=<NllLossBackward0>)
tensor(1.5024, grad_fn=<NllLossBackward0>)


 90%|█████████ | 15769/17426 [28:30<03:27,  7.98it/s]

tensor(1.5213, grad_fn=<NllLossBackward0>)
tensor(1.4318, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15771/17426 [28:30<03:28,  7.93it/s]

tensor(1.4869, grad_fn=<NllLossBackward0>)
tensor(1.4387, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15773/17426 [28:31<03:24,  8.07it/s]

tensor(1.4861, grad_fn=<NllLossBackward0>)
tensor(1.5014, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15775/17426 [28:31<03:20,  8.23it/s]

tensor(1.4619, grad_fn=<NllLossBackward0>)
tensor(1.5313, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15777/17426 [28:31<03:21,  8.18it/s]

tensor(1.5339, grad_fn=<NllLossBackward0>)
tensor(1.4741, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15779/17426 [28:31<03:29,  7.84it/s]

tensor(1.5064, grad_fn=<NllLossBackward0>)
tensor(1.4378, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15781/17426 [28:32<03:24,  8.05it/s]

tensor(1.4818, grad_fn=<NllLossBackward0>)
tensor(1.4949, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15783/17426 [28:32<03:28,  7.89it/s]

tensor(1.4927, grad_fn=<NllLossBackward0>)
tensor(1.4982, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15785/17426 [28:32<03:23,  8.05it/s]

tensor(1.4757, grad_fn=<NllLossBackward0>)
tensor(1.5064, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15787/17426 [28:32<03:28,  7.85it/s]

tensor(1.4585, grad_fn=<NllLossBackward0>)
tensor(1.4755, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15789/17426 [28:33<03:22,  8.10it/s]

tensor(1.4232, grad_fn=<NllLossBackward0>)
tensor(1.4887, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15791/17426 [28:33<03:24,  8.00it/s]

tensor(1.4740, grad_fn=<NllLossBackward0>)
tensor(1.4906, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15793/17426 [28:33<03:25,  7.95it/s]

tensor(1.4587, grad_fn=<NllLossBackward0>)
tensor(1.4628, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15795/17426 [28:33<03:37,  7.49it/s]

tensor(1.4854, grad_fn=<NllLossBackward0>)
tensor(1.4566, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15797/17426 [28:34<03:31,  7.72it/s]

tensor(1.4924, grad_fn=<NllLossBackward0>)
tensor(1.4663, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15799/17426 [28:34<03:24,  7.95it/s]

tensor(1.4529, grad_fn=<NllLossBackward0>)
tensor(1.4912, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15801/17426 [28:34<03:29,  7.76it/s]

tensor(1.4249, grad_fn=<NllLossBackward0>)
tensor(1.4941, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15803/17426 [28:34<03:29,  7.76it/s]

tensor(1.4742, grad_fn=<NllLossBackward0>)
tensor(1.4281, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15805/17426 [28:35<03:19,  8.12it/s]

tensor(1.4409, grad_fn=<NllLossBackward0>)
tensor(1.4742, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15807/17426 [28:35<03:16,  8.24it/s]

tensor(1.4691, grad_fn=<NllLossBackward0>)
tensor(1.4904, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15809/17426 [28:35<03:21,  8.02it/s]

tensor(1.5226, grad_fn=<NllLossBackward0>)
tensor(1.4858, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15811/17426 [28:35<03:42,  7.24it/s]

tensor(1.4534, grad_fn=<NllLossBackward0>)
tensor(1.4607, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15813/17426 [28:36<04:02,  6.64it/s]

tensor(1.4975, grad_fn=<NllLossBackward0>)
tensor(1.4800, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15815/17426 [28:36<04:04,  6.59it/s]

tensor(1.4693, grad_fn=<NllLossBackward0>)
tensor(1.4388, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15817/17426 [28:36<04:09,  6.45it/s]

tensor(1.4726, grad_fn=<NllLossBackward0>)
tensor(1.4652, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15819/17426 [28:37<04:03,  6.60it/s]

tensor(1.5110, grad_fn=<NllLossBackward0>)
tensor(1.4817, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15821/17426 [28:37<03:57,  6.77it/s]

tensor(1.5031, grad_fn=<NllLossBackward0>)
tensor(1.4812, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15823/17426 [28:37<03:56,  6.78it/s]

tensor(1.4783, grad_fn=<NllLossBackward0>)
tensor(1.4977, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15825/17426 [28:38<04:03,  6.58it/s]

tensor(1.4577, grad_fn=<NllLossBackward0>)
tensor(1.4230, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15827/17426 [28:38<03:42,  7.17it/s]

tensor(1.4801, grad_fn=<NllLossBackward0>)
tensor(1.4679, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15829/17426 [28:38<03:57,  6.72it/s]

tensor(1.5111, grad_fn=<NllLossBackward0>)
tensor(1.4360, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15831/17426 [28:38<03:53,  6.84it/s]

tensor(1.5451, grad_fn=<NllLossBackward0>)
tensor(1.4377, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15833/17426 [28:39<03:51,  6.88it/s]

tensor(1.4879, grad_fn=<NllLossBackward0>)
tensor(1.4544, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15835/17426 [28:39<04:03,  6.53it/s]

tensor(1.5035, grad_fn=<NllLossBackward0>)
tensor(1.4570, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15837/17426 [28:39<04:04,  6.51it/s]

tensor(1.4593, grad_fn=<NllLossBackward0>)
tensor(1.4836, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15839/17426 [28:40<04:03,  6.52it/s]

tensor(1.4507, grad_fn=<NllLossBackward0>)
tensor(1.4973, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15841/17426 [28:40<03:57,  6.68it/s]

tensor(1.4892, grad_fn=<NllLossBackward0>)
tensor(1.4809, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15843/17426 [28:40<03:32,  7.43it/s]

tensor(1.5031, grad_fn=<NllLossBackward0>)
tensor(1.4630, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15845/17426 [28:40<03:32,  7.45it/s]

tensor(1.5056, grad_fn=<NllLossBackward0>)
tensor(1.4706, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15847/17426 [28:41<03:26,  7.64it/s]

tensor(1.4450, grad_fn=<NllLossBackward0>)
tensor(1.4837, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15849/17426 [28:41<03:22,  7.80it/s]

tensor(1.5112, grad_fn=<NllLossBackward0>)
tensor(1.4801, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15851/17426 [28:41<03:17,  7.97it/s]

tensor(1.4814, grad_fn=<NllLossBackward0>)
tensor(1.4483, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15853/17426 [28:41<03:24,  7.71it/s]

tensor(1.5043, grad_fn=<NllLossBackward0>)
tensor(1.5124, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15855/17426 [28:42<03:25,  7.66it/s]

tensor(1.4443, grad_fn=<NllLossBackward0>)
tensor(1.4869, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15857/17426 [28:42<03:20,  7.82it/s]

tensor(1.4537, grad_fn=<NllLossBackward0>)
tensor(1.4543, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15859/17426 [28:42<03:21,  7.76it/s]

tensor(1.5337, grad_fn=<NllLossBackward0>)
tensor(1.5022, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15861/17426 [28:42<03:15,  8.01it/s]

tensor(1.4894, grad_fn=<NllLossBackward0>)
tensor(1.4280, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15863/17426 [28:43<03:13,  8.09it/s]

tensor(1.4744, grad_fn=<NllLossBackward0>)
tensor(1.4969, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15865/17426 [28:43<03:13,  8.07it/s]

tensor(1.4731, grad_fn=<NllLossBackward0>)
tensor(1.5188, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15867/17426 [28:43<03:08,  8.29it/s]

tensor(1.4494, grad_fn=<NllLossBackward0>)
tensor(1.5000, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15869/17426 [28:43<03:11,  8.11it/s]

tensor(1.5175, grad_fn=<NllLossBackward0>)
tensor(1.5053, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15871/17426 [28:44<03:15,  7.96it/s]

tensor(1.4833, grad_fn=<NllLossBackward0>)
tensor(1.4651, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15873/17426 [28:44<03:16,  7.89it/s]

tensor(1.4894, grad_fn=<NllLossBackward0>)
tensor(1.4634, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15875/17426 [28:44<03:07,  8.25it/s]

tensor(1.4495, grad_fn=<NllLossBackward0>)
tensor(1.4648, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15877/17426 [28:44<03:12,  8.05it/s]

tensor(1.4770, grad_fn=<NllLossBackward0>)
tensor(1.4676, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15879/17426 [28:45<03:15,  7.92it/s]

tensor(1.4704, grad_fn=<NllLossBackward0>)
tensor(1.5163, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15881/17426 [28:45<03:07,  8.23it/s]

tensor(1.4295, grad_fn=<NllLossBackward0>)
tensor(1.5028, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15883/17426 [28:45<03:04,  8.38it/s]

tensor(1.4944, grad_fn=<NllLossBackward0>)
tensor(1.5587, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15885/17426 [28:45<03:04,  8.36it/s]

tensor(1.4770, grad_fn=<NllLossBackward0>)
tensor(1.4225, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15887/17426 [28:46<03:11,  8.02it/s]

tensor(1.4526, grad_fn=<NllLossBackward0>)
tensor(1.4813, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15889/17426 [28:46<03:07,  8.21it/s]

tensor(1.4851, grad_fn=<NllLossBackward0>)
tensor(1.4644, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15891/17426 [28:46<03:10,  8.06it/s]

tensor(1.5001, grad_fn=<NllLossBackward0>)
tensor(1.5043, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15893/17426 [28:46<03:09,  8.09it/s]

tensor(1.4573, grad_fn=<NllLossBackward0>)
tensor(1.4752, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15895/17426 [28:47<03:11,  8.00it/s]

tensor(1.5175, grad_fn=<NllLossBackward0>)
tensor(1.4855, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15897/17426 [28:47<03:07,  8.14it/s]

tensor(1.5026, grad_fn=<NllLossBackward0>)
tensor(1.4404, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15899/17426 [28:47<03:14,  7.85it/s]

tensor(1.4900, grad_fn=<NllLossBackward0>)
tensor(1.4939, grad_fn=<NllLossBackward0>)


 91%|█████████ | 15901/17426 [28:47<03:19,  7.64it/s]

tensor(1.4797, grad_fn=<NllLossBackward0>)
tensor(1.4409, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15903/17426 [28:48<03:10,  7.98it/s]

tensor(1.4431, grad_fn=<NllLossBackward0>)
tensor(1.5089, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15905/17426 [28:48<03:09,  8.02it/s]

tensor(1.4169, grad_fn=<NllLossBackward0>)
tensor(1.4173, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15907/17426 [28:48<03:11,  7.94it/s]

tensor(1.4731, grad_fn=<NllLossBackward0>)
tensor(1.4586, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15909/17426 [28:48<03:09,  8.01it/s]

tensor(1.4773, grad_fn=<NllLossBackward0>)
tensor(1.4930, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15911/17426 [28:49<03:07,  8.10it/s]

tensor(1.4330, grad_fn=<NllLossBackward0>)
tensor(1.5309, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15913/17426 [28:49<03:05,  8.17it/s]

tensor(1.4822, grad_fn=<NllLossBackward0>)
tensor(1.5129, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15915/17426 [28:49<03:01,  8.34it/s]

tensor(1.4677, grad_fn=<NllLossBackward0>)
tensor(1.4372, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15917/17426 [28:49<03:10,  7.93it/s]

tensor(1.4853, grad_fn=<NllLossBackward0>)
tensor(1.4691, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15919/17426 [28:50<03:10,  7.92it/s]

tensor(1.5085, grad_fn=<NllLossBackward0>)
tensor(1.5030, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15921/17426 [28:50<03:17,  7.62it/s]

tensor(1.4654, grad_fn=<NllLossBackward0>)
tensor(1.4795, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15923/17426 [28:50<03:26,  7.28it/s]

tensor(1.4900, grad_fn=<NllLossBackward0>)
tensor(1.4470, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15925/17426 [28:51<03:48,  6.57it/s]

tensor(1.4637, grad_fn=<NllLossBackward0>)
tensor(1.5358, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15927/17426 [28:51<03:33,  7.02it/s]

tensor(1.4803, grad_fn=<NllLossBackward0>)
tensor(1.4978, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15929/17426 [28:51<03:24,  7.32it/s]

tensor(1.4676, grad_fn=<NllLossBackward0>)
tensor(1.4926, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15931/17426 [28:51<03:21,  7.44it/s]

tensor(1.4600, grad_fn=<NllLossBackward0>)
tensor(1.4721, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15933/17426 [28:52<03:14,  7.68it/s]

tensor(1.4857, grad_fn=<NllLossBackward0>)
tensor(1.5509, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15935/17426 [28:52<03:07,  7.97it/s]

tensor(1.4718, grad_fn=<NllLossBackward0>)
tensor(1.4971, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15937/17426 [28:52<03:02,  8.17it/s]

tensor(1.4327, grad_fn=<NllLossBackward0>)
tensor(1.4776, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15939/17426 [28:52<03:08,  7.89it/s]

tensor(1.5082, grad_fn=<NllLossBackward0>)
tensor(1.4640, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15941/17426 [28:53<03:20,  7.39it/s]

tensor(1.4819, grad_fn=<NllLossBackward0>)
tensor(1.4316, grad_fn=<NllLossBackward0>)


 91%|█████████▏| 15943/17426 [28:53<03:33,  6.95it/s]

tensor(1.4286, grad_fn=<NllLossBackward0>)
tensor(1.4687, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15945/17426 [28:53<03:24,  7.26it/s]

tensor(1.5144, grad_fn=<NllLossBackward0>)
tensor(1.4349, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15947/17426 [28:53<03:22,  7.30it/s]

tensor(1.5026, grad_fn=<NllLossBackward0>)
tensor(1.4887, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15949/17426 [28:54<03:35,  6.86it/s]

tensor(1.4787, grad_fn=<NllLossBackward0>)
tensor(1.4693, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15951/17426 [28:54<03:36,  6.80it/s]

tensor(1.4846, grad_fn=<NllLossBackward0>)
tensor(1.5407, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15953/17426 [28:54<03:28,  7.05it/s]

tensor(1.4619, grad_fn=<NllLossBackward0>)
tensor(1.4555, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15955/17426 [28:55<03:10,  7.74it/s]

tensor(1.4965, grad_fn=<NllLossBackward0>)
tensor(1.4504, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15957/17426 [28:55<03:09,  7.75it/s]

tensor(1.4390, grad_fn=<NllLossBackward0>)
tensor(1.5001, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15959/17426 [28:55<03:12,  7.64it/s]

tensor(1.4370, grad_fn=<NllLossBackward0>)
tensor(1.4552, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15961/17426 [28:55<03:08,  7.77it/s]

tensor(1.4584, grad_fn=<NllLossBackward0>)
tensor(1.4799, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15963/17426 [28:56<02:59,  8.15it/s]

tensor(1.4896, grad_fn=<NllLossBackward0>)
tensor(1.5206, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15965/17426 [28:56<03:01,  8.05it/s]

tensor(1.5010, grad_fn=<NllLossBackward0>)
tensor(1.4834, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15967/17426 [28:56<03:01,  8.05it/s]

tensor(1.4632, grad_fn=<NllLossBackward0>)
tensor(1.4883, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15969/17426 [28:56<03:04,  7.89it/s]

tensor(1.4969, grad_fn=<NllLossBackward0>)
tensor(1.4778, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15971/17426 [28:57<02:58,  8.15it/s]

tensor(1.4970, grad_fn=<NllLossBackward0>)
tensor(1.4912, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15973/17426 [28:57<03:01,  8.02it/s]

tensor(1.5092, grad_fn=<NllLossBackward0>)
tensor(1.4701, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15975/17426 [28:57<03:04,  7.87it/s]

tensor(1.5227, grad_fn=<NllLossBackward0>)
tensor(1.4794, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15977/17426 [28:57<03:06,  7.76it/s]

tensor(1.4979, grad_fn=<NllLossBackward0>)
tensor(1.5104, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15979/17426 [28:58<03:03,  7.89it/s]

tensor(1.5289, grad_fn=<NllLossBackward0>)
tensor(1.4910, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15981/17426 [28:58<02:59,  8.04it/s]

tensor(1.4490, grad_fn=<NllLossBackward0>)
tensor(1.4743, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15983/17426 [28:58<03:03,  7.85it/s]

tensor(1.4758, grad_fn=<NllLossBackward0>)
tensor(1.4636, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15985/17426 [28:58<02:58,  8.07it/s]

tensor(1.5584, grad_fn=<NllLossBackward0>)
tensor(1.4718, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15987/17426 [28:59<02:54,  8.27it/s]

tensor(1.4689, grad_fn=<NllLossBackward0>)
tensor(1.4668, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15989/17426 [28:59<02:50,  8.41it/s]

tensor(1.5293, grad_fn=<NllLossBackward0>)
tensor(1.5135, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15991/17426 [28:59<02:57,  8.08it/s]

tensor(1.5083, grad_fn=<NllLossBackward0>)
tensor(1.5209, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15993/17426 [28:59<02:55,  8.17it/s]

tensor(1.4702, grad_fn=<NllLossBackward0>)
tensor(1.4477, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15995/17426 [29:00<02:51,  8.35it/s]

tensor(1.4379, grad_fn=<NllLossBackward0>)
tensor(1.4924, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15997/17426 [29:00<02:50,  8.38it/s]

tensor(1.5321, grad_fn=<NllLossBackward0>)
tensor(1.5242, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 15999/17426 [29:00<03:00,  7.89it/s]

tensor(1.4540, grad_fn=<NllLossBackward0>)
tensor(1.4710, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16001/17426 [29:00<02:58,  8.00it/s]

tensor(1.4194, grad_fn=<NllLossBackward0>)
tensor(1.4994, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16003/17426 [29:01<02:54,  8.14it/s]

tensor(1.4751, grad_fn=<NllLossBackward0>)
tensor(1.4631, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16005/17426 [29:01<02:54,  8.14it/s]

tensor(1.5014, grad_fn=<NllLossBackward0>)
tensor(1.4465, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16007/17426 [29:01<02:57,  7.99it/s]

tensor(1.4958, grad_fn=<NllLossBackward0>)
tensor(1.4547, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16009/17426 [29:01<03:04,  7.70it/s]

tensor(1.4972, grad_fn=<NllLossBackward0>)
tensor(1.4643, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16011/17426 [29:02<02:57,  7.97it/s]

tensor(1.4405, grad_fn=<NllLossBackward0>)
tensor(1.4697, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16013/17426 [29:02<02:52,  8.21it/s]

tensor(1.4922, grad_fn=<NllLossBackward0>)
tensor(1.4898, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16015/17426 [29:02<02:51,  8.23it/s]

tensor(1.4367, grad_fn=<NllLossBackward0>)
tensor(1.4746, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16017/17426 [29:02<02:55,  8.04it/s]

tensor(1.4745, grad_fn=<NllLossBackward0>)
tensor(1.5280, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16019/17426 [29:03<02:53,  8.12it/s]

tensor(1.5221, grad_fn=<NllLossBackward0>)
tensor(1.4989, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16021/17426 [29:03<02:52,  8.17it/s]

tensor(1.4910, grad_fn=<NllLossBackward0>)
tensor(1.4560, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16023/17426 [29:03<02:50,  8.25it/s]

tensor(1.4838, grad_fn=<NllLossBackward0>)
tensor(1.4661, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16025/17426 [29:03<03:00,  7.74it/s]

tensor(1.4313, grad_fn=<NllLossBackward0>)
tensor(1.5158, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16027/17426 [29:04<02:52,  8.11it/s]

tensor(1.4575, grad_fn=<NllLossBackward0>)
tensor(1.5206, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16029/17426 [29:04<02:50,  8.19it/s]

tensor(1.5185, grad_fn=<NllLossBackward0>)
tensor(1.4572, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16031/17426 [29:04<02:49,  8.25it/s]

tensor(1.4517, grad_fn=<NllLossBackward0>)
tensor(1.4092, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16033/17426 [29:04<02:53,  8.02it/s]

tensor(1.4763, grad_fn=<NllLossBackward0>)
tensor(1.5047, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16035/17426 [29:05<03:09,  7.35it/s]

tensor(1.4785, grad_fn=<NllLossBackward0>)
tensor(1.4790, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16037/17426 [29:05<03:06,  7.43it/s]

tensor(1.4873, grad_fn=<NllLossBackward0>)
tensor(1.4894, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16039/17426 [29:05<03:15,  7.09it/s]

tensor(1.4611, grad_fn=<NllLossBackward0>)
tensor(1.4846, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16041/17426 [29:05<03:18,  6.98it/s]

tensor(1.5016, grad_fn=<NllLossBackward0>)
tensor(1.5409, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16043/17426 [29:06<03:30,  6.56it/s]

tensor(1.4368, grad_fn=<NllLossBackward0>)
tensor(1.5011, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16045/17426 [29:06<03:33,  6.47it/s]

tensor(1.5327, grad_fn=<NllLossBackward0>)
tensor(1.4692, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16047/17426 [29:06<03:36,  6.37it/s]

tensor(1.4937, grad_fn=<NllLossBackward0>)
tensor(1.5809, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16049/17426 [29:07<03:32,  6.46it/s]

tensor(1.4824, grad_fn=<NllLossBackward0>)
tensor(1.4890, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16051/17426 [29:07<03:15,  7.03it/s]

tensor(1.5057, grad_fn=<NllLossBackward0>)
tensor(1.5004, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16053/17426 [29:07<03:24,  6.73it/s]

tensor(1.4919, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16055/17426 [29:08<03:34,  6.38it/s]

tensor(1.4985, grad_fn=<NllLossBackward0>)
tensor(1.4717, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16057/17426 [29:08<03:26,  6.63it/s]

tensor(1.4591, grad_fn=<NllLossBackward0>)
tensor(1.4766, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16059/17426 [29:08<03:29,  6.53it/s]

tensor(1.4701, grad_fn=<NllLossBackward0>)
tensor(1.4594, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16061/17426 [29:09<03:32,  6.42it/s]

tensor(1.4895, grad_fn=<NllLossBackward0>)
tensor(1.4784, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16063/17426 [29:09<03:19,  6.82it/s]

tensor(1.5031, grad_fn=<NllLossBackward0>)
tensor(1.4689, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16065/17426 [29:09<03:06,  7.30it/s]

tensor(1.4631, grad_fn=<NllLossBackward0>)
tensor(1.4513, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16067/17426 [29:09<02:56,  7.70it/s]

tensor(1.5002, grad_fn=<NllLossBackward0>)
tensor(1.4850, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16069/17426 [29:10<02:49,  8.01it/s]

tensor(1.5024, grad_fn=<NllLossBackward0>)
tensor(1.4966, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16071/17426 [29:10<02:56,  7.67it/s]

tensor(1.4581, grad_fn=<NllLossBackward0>)
tensor(1.4553, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16073/17426 [29:10<02:51,  7.87it/s]

tensor(1.5023, grad_fn=<NllLossBackward0>)
tensor(1.4726, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16075/17426 [29:10<02:48,  8.03it/s]

tensor(1.5260, grad_fn=<NllLossBackward0>)
tensor(1.4298, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16077/17426 [29:11<02:44,  8.18it/s]

tensor(1.4584, grad_fn=<NllLossBackward0>)
tensor(1.4672, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16079/17426 [29:11<02:43,  8.22it/s]

tensor(1.4692, grad_fn=<NllLossBackward0>)
tensor(1.4662, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16081/17426 [29:11<02:46,  8.08it/s]

tensor(1.4760, grad_fn=<NllLossBackward0>)
tensor(1.4808, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16083/17426 [29:11<02:50,  7.86it/s]

tensor(1.4295, grad_fn=<NllLossBackward0>)
tensor(1.4981, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16085/17426 [29:12<02:48,  7.95it/s]

tensor(1.4814, grad_fn=<NllLossBackward0>)
tensor(1.4357, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16087/17426 [29:12<02:45,  8.10it/s]

tensor(1.5024, grad_fn=<NllLossBackward0>)
tensor(1.4653, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16089/17426 [29:12<02:45,  8.10it/s]

tensor(1.4651, grad_fn=<NllLossBackward0>)
tensor(1.4646, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16091/17426 [29:12<02:42,  8.20it/s]

tensor(1.4814, grad_fn=<NllLossBackward0>)
tensor(1.4939, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16093/17426 [29:13<02:43,  8.16it/s]

tensor(1.4191, grad_fn=<NllLossBackward0>)
tensor(1.4593, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16095/17426 [29:13<02:41,  8.26it/s]

tensor(1.5496, grad_fn=<NllLossBackward0>)
tensor(1.4789, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16097/17426 [29:13<02:48,  7.87it/s]

tensor(1.4843, grad_fn=<NllLossBackward0>)
tensor(1.5065, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16099/17426 [29:13<02:40,  8.26it/s]

tensor(1.5018, grad_fn=<NllLossBackward0>)
tensor(1.4843, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16101/17426 [29:14<02:41,  8.19it/s]

tensor(1.4393, grad_fn=<NllLossBackward0>)
tensor(1.4836, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16103/17426 [29:14<02:39,  8.30it/s]

tensor(1.4761, grad_fn=<NllLossBackward0>)
tensor(1.4763, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16105/17426 [29:14<02:35,  8.47it/s]

tensor(1.4702, grad_fn=<NllLossBackward0>)
tensor(1.4453, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16107/17426 [29:14<02:41,  8.18it/s]

tensor(1.4651, grad_fn=<NllLossBackward0>)
tensor(1.4652, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16109/17426 [29:14<02:41,  8.17it/s]

tensor(1.4419, grad_fn=<NllLossBackward0>)
tensor(1.5111, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16111/17426 [29:15<02:40,  8.21it/s]

tensor(1.4915, grad_fn=<NllLossBackward0>)
tensor(1.4862, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16113/17426 [29:15<02:36,  8.39it/s]

tensor(1.4581, grad_fn=<NllLossBackward0>)
tensor(1.4978, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16115/17426 [29:15<02:44,  7.99it/s]

tensor(1.5232, grad_fn=<NllLossBackward0>)
tensor(1.4741, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16117/17426 [29:15<02:39,  8.19it/s]

tensor(1.5126, grad_fn=<NllLossBackward0>)
tensor(1.5100, grad_fn=<NllLossBackward0>)


 92%|█████████▏| 16119/17426 [29:16<02:41,  8.11it/s]

tensor(1.5001, grad_fn=<NllLossBackward0>)
tensor(1.4555, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16121/17426 [29:16<02:44,  7.93it/s]

tensor(1.4995, grad_fn=<NllLossBackward0>)
tensor(1.4225, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16123/17426 [29:16<02:44,  7.93it/s]

tensor(1.5005, grad_fn=<NllLossBackward0>)
tensor(1.4857, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16125/17426 [29:16<02:39,  8.16it/s]

tensor(1.4741, grad_fn=<NllLossBackward0>)
tensor(1.5093, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16127/17426 [29:17<02:42,  8.02it/s]

tensor(1.4383, grad_fn=<NllLossBackward0>)
tensor(1.4579, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16129/17426 [29:17<02:43,  7.95it/s]

tensor(1.4457, grad_fn=<NllLossBackward0>)
tensor(1.4692, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16131/17426 [29:17<02:46,  7.79it/s]

tensor(1.4580, grad_fn=<NllLossBackward0>)
tensor(1.5162, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16133/17426 [29:17<02:40,  8.06it/s]

tensor(1.5108, grad_fn=<NllLossBackward0>)
tensor(1.4631, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16135/17426 [29:18<02:53,  7.45it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5166, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16137/17426 [29:18<02:45,  7.81it/s]

tensor(1.4395, grad_fn=<NllLossBackward0>)
tensor(1.4683, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16139/17426 [29:18<02:36,  8.25it/s]

tensor(1.4724, grad_fn=<NllLossBackward0>)
tensor(1.4878, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16141/17426 [29:18<02:37,  8.16it/s]

tensor(1.4624, grad_fn=<NllLossBackward0>)
tensor(1.5268, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16143/17426 [29:19<02:44,  7.82it/s]

tensor(1.4957, grad_fn=<NllLossBackward0>)
tensor(1.4731, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16145/17426 [29:19<02:45,  7.72it/s]

tensor(1.4577, grad_fn=<NllLossBackward0>)
tensor(1.5039, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16147/17426 [29:19<02:45,  7.75it/s]

tensor(1.4237, grad_fn=<NllLossBackward0>)
tensor(1.4708, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16149/17426 [29:20<02:50,  7.51it/s]

tensor(1.4981, grad_fn=<NllLossBackward0>)
tensor(1.5328, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16151/17426 [29:20<02:50,  7.46it/s]

tensor(1.4809, grad_fn=<NllLossBackward0>)
tensor(1.4921, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16153/17426 [29:20<02:45,  7.69it/s]

tensor(1.4893, grad_fn=<NllLossBackward0>)
tensor(1.4788, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16155/17426 [29:20<02:38,  8.00it/s]

tensor(1.4840, grad_fn=<NllLossBackward0>)
tensor(1.4692, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16157/17426 [29:21<02:48,  7.55it/s]

tensor(1.4609, grad_fn=<NllLossBackward0>)
tensor(1.5111, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16159/17426 [29:21<02:56,  7.18it/s]

tensor(1.4399, grad_fn=<NllLossBackward0>)
tensor(1.4745, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16161/17426 [29:21<02:54,  7.26it/s]

tensor(1.4864, grad_fn=<NllLossBackward0>)
tensor(1.4822, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16163/17426 [29:21<02:48,  7.48it/s]

tensor(1.4630, grad_fn=<NllLossBackward0>)
tensor(1.4301, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16165/17426 [29:22<02:48,  7.46it/s]

tensor(1.4859, grad_fn=<NllLossBackward0>)
tensor(1.4636, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16167/17426 [29:22<03:01,  6.93it/s]

tensor(1.4595, grad_fn=<NllLossBackward0>)
tensor(1.5450, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16169/17426 [29:22<02:56,  7.12it/s]

tensor(1.4739, grad_fn=<NllLossBackward0>)
tensor(1.4787, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16171/17426 [29:23<03:07,  6.69it/s]

tensor(1.5075, grad_fn=<NllLossBackward0>)
tensor(1.4654, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16173/17426 [29:23<03:08,  6.63it/s]

tensor(1.4935, grad_fn=<NllLossBackward0>)
tensor(1.5027, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16175/17426 [29:23<03:06,  6.71it/s]

tensor(1.4435, grad_fn=<NllLossBackward0>)
tensor(1.4477, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16177/17426 [29:23<03:02,  6.84it/s]

tensor(1.4908, grad_fn=<NllLossBackward0>)
tensor(1.4329, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16179/17426 [29:24<02:55,  7.09it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5080, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16181/17426 [29:24<02:46,  7.46it/s]

tensor(1.4703, grad_fn=<NllLossBackward0>)
tensor(1.4590, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16183/17426 [29:24<02:39,  7.77it/s]

tensor(1.4769, grad_fn=<NllLossBackward0>)
tensor(1.4802, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16185/17426 [29:25<02:32,  8.15it/s]

tensor(1.4692, grad_fn=<NllLossBackward0>)
tensor(1.4762, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16187/17426 [29:25<02:31,  8.16it/s]

tensor(1.4982, grad_fn=<NllLossBackward0>)
tensor(1.5238, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16189/17426 [29:25<02:35,  7.94it/s]

tensor(1.4841, grad_fn=<NllLossBackward0>)
tensor(1.4951, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16191/17426 [29:25<02:34,  8.00it/s]

tensor(1.4643, grad_fn=<NllLossBackward0>)
tensor(1.4329, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16193/17426 [29:26<02:32,  8.11it/s]

tensor(1.4816, grad_fn=<NllLossBackward0>)
tensor(1.5036, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16195/17426 [29:26<02:30,  8.17it/s]

tensor(1.4777, grad_fn=<NllLossBackward0>)
tensor(1.4915, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16197/17426 [29:26<02:34,  7.96it/s]

tensor(1.4639, grad_fn=<NllLossBackward0>)
tensor(1.4642, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16199/17426 [29:26<02:35,  7.88it/s]

tensor(1.5301, grad_fn=<NllLossBackward0>)
tensor(1.4588, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16201/17426 [29:27<02:30,  8.13it/s]

tensor(1.4816, grad_fn=<NllLossBackward0>)
tensor(1.4871, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16203/17426 [29:27<02:27,  8.30it/s]

tensor(1.4755, grad_fn=<NllLossBackward0>)
tensor(1.4556, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16205/17426 [29:27<02:34,  7.90it/s]

tensor(1.4870, grad_fn=<NllLossBackward0>)
tensor(1.4671, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16207/17426 [29:27<02:28,  8.20it/s]

tensor(1.4996, grad_fn=<NllLossBackward0>)
tensor(1.4962, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16209/17426 [29:28<02:29,  8.16it/s]

tensor(1.4897, grad_fn=<NllLossBackward0>)
tensor(1.4664, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16211/17426 [29:28<02:30,  8.05it/s]

tensor(1.4402, grad_fn=<NllLossBackward0>)
tensor(1.4325, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16213/17426 [29:28<02:31,  7.99it/s]

tensor(1.4789, grad_fn=<NllLossBackward0>)
tensor(1.5004, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16215/17426 [29:28<02:27,  8.18it/s]

tensor(1.4683, grad_fn=<NllLossBackward0>)
tensor(1.5110, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16217/17426 [29:29<02:28,  8.16it/s]

tensor(1.4737, grad_fn=<NllLossBackward0>)
tensor(1.4815, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16219/17426 [29:29<02:25,  8.32it/s]

tensor(1.4712, grad_fn=<NllLossBackward0>)
tensor(1.4639, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16221/17426 [29:29<02:23,  8.39it/s]

tensor(1.4689, grad_fn=<NllLossBackward0>)
tensor(1.5073, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16223/17426 [29:29<02:27,  8.17it/s]

tensor(1.4832, grad_fn=<NllLossBackward0>)
tensor(1.4492, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16225/17426 [29:29<02:31,  7.94it/s]

tensor(1.4523, grad_fn=<NllLossBackward0>)
tensor(1.4486, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16227/17426 [29:30<02:26,  8.19it/s]

tensor(1.4888, grad_fn=<NllLossBackward0>)
tensor(1.4578, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16229/17426 [29:30<02:24,  8.27it/s]

tensor(1.4558, grad_fn=<NllLossBackward0>)
tensor(1.4703, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16231/17426 [29:30<02:29,  7.99it/s]

tensor(1.5251, grad_fn=<NllLossBackward0>)
tensor(1.4638, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16233/17426 [29:30<02:26,  8.17it/s]

tensor(1.4838, grad_fn=<NllLossBackward0>)
tensor(1.4998, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16235/17426 [29:31<02:23,  8.31it/s]

tensor(1.5243, grad_fn=<NllLossBackward0>)
tensor(1.5024, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16237/17426 [29:31<02:22,  8.35it/s]

tensor(1.4853, grad_fn=<NllLossBackward0>)
tensor(1.4908, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16239/17426 [29:31<02:30,  7.90it/s]

tensor(1.5179, grad_fn=<NllLossBackward0>)
tensor(1.4744, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16241/17426 [29:31<02:25,  8.16it/s]

tensor(1.4842, grad_fn=<NllLossBackward0>)
tensor(1.5082, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16243/17426 [29:32<02:24,  8.18it/s]

tensor(1.4814, grad_fn=<NllLossBackward0>)
tensor(1.4507, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16245/17426 [29:32<02:24,  8.16it/s]

tensor(1.4827, grad_fn=<NllLossBackward0>)
tensor(1.4890, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16247/17426 [29:32<02:23,  8.22it/s]

tensor(1.4579, grad_fn=<NllLossBackward0>)
tensor(1.5002, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16249/17426 [29:32<02:27,  7.97it/s]

tensor(1.4840, grad_fn=<NllLossBackward0>)
tensor(1.5032, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16251/17426 [29:33<02:24,  8.16it/s]

tensor(1.5173, grad_fn=<NllLossBackward0>)
tensor(1.4849, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16253/17426 [29:33<02:22,  8.23it/s]

tensor(1.4812, grad_fn=<NllLossBackward0>)
tensor(1.4659, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16255/17426 [29:33<02:24,  8.09it/s]

tensor(1.4967, grad_fn=<NllLossBackward0>)
tensor(1.4787, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16257/17426 [29:33<02:28,  7.89it/s]

tensor(1.4885, grad_fn=<NllLossBackward0>)
tensor(1.4917, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16259/17426 [29:34<02:30,  7.76it/s]

tensor(1.4861, grad_fn=<NllLossBackward0>)
tensor(1.5031, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16261/17426 [29:34<02:31,  7.67it/s]

tensor(1.4954, grad_fn=<NllLossBackward0>)
tensor(1.4887, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16263/17426 [29:34<02:36,  7.42it/s]

tensor(1.4674, grad_fn=<NllLossBackward0>)
tensor(1.4816, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16265/17426 [29:35<02:42,  7.15it/s]

tensor(1.4855, grad_fn=<NllLossBackward0>)
tensor(1.5053, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16267/17426 [29:35<02:32,  7.58it/s]

tensor(1.4526, grad_fn=<NllLossBackward0>)
tensor(1.4811, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16269/17426 [29:35<02:24,  7.98it/s]

tensor(1.4583, grad_fn=<NllLossBackward0>)
tensor(1.4627, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16271/17426 [29:35<02:25,  7.96it/s]

tensor(1.4757, grad_fn=<NllLossBackward0>)
tensor(1.5267, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16273/17426 [29:36<02:34,  7.44it/s]

tensor(1.4711, grad_fn=<NllLossBackward0>)
tensor(1.4890, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16275/17426 [29:36<02:39,  7.20it/s]

tensor(1.4934, grad_fn=<NllLossBackward0>)
tensor(1.4562, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16277/17426 [29:36<02:37,  7.30it/s]

tensor(1.4888, grad_fn=<NllLossBackward0>)
tensor(1.5063, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16279/17426 [29:36<02:45,  6.92it/s]

tensor(1.4774, grad_fn=<NllLossBackward0>)
tensor(1.4752, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16281/17426 [29:37<02:51,  6.68it/s]

tensor(1.4712, grad_fn=<NllLossBackward0>)
tensor(1.5103, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16283/17426 [29:37<02:47,  6.82it/s]

tensor(1.4828, grad_fn=<NllLossBackward0>)
tensor(1.4775, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16285/17426 [29:37<02:42,  7.03it/s]

tensor(1.5136, grad_fn=<NllLossBackward0>)
tensor(1.4796, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16287/17426 [29:38<02:50,  6.69it/s]

tensor(1.4592, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16289/17426 [29:38<02:48,  6.74it/s]

tensor(1.4552, grad_fn=<NllLossBackward0>)
tensor(1.4618, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16291/17426 [29:38<02:51,  6.61it/s]

tensor(1.5015, grad_fn=<NllLossBackward0>)
tensor(1.4785, grad_fn=<NllLossBackward0>)


 93%|█████████▎| 16293/17426 [29:38<02:41,  7.02it/s]

tensor(1.4939, grad_fn=<NllLossBackward0>)
tensor(1.4477, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16295/17426 [29:39<02:39,  7.10it/s]

tensor(1.4833, grad_fn=<NllLossBackward0>)
tensor(1.5086, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16297/17426 [29:39<02:28,  7.59it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.5200, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16299/17426 [29:39<02:23,  7.87it/s]

tensor(1.5246, grad_fn=<NllLossBackward0>)
tensor(1.4511, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16301/17426 [29:39<02:18,  8.14it/s]

tensor(1.5020, grad_fn=<NllLossBackward0>)
tensor(1.4609, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16303/17426 [29:40<02:23,  7.84it/s]

tensor(1.4820, grad_fn=<NllLossBackward0>)
tensor(1.4980, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16305/17426 [29:40<02:21,  7.91it/s]

tensor(1.5214, grad_fn=<NllLossBackward0>)
tensor(1.4546, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16307/17426 [29:40<02:18,  8.09it/s]

tensor(1.5026, grad_fn=<NllLossBackward0>)
tensor(1.4965, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16309/17426 [29:40<02:15,  8.22it/s]

tensor(1.4422, grad_fn=<NllLossBackward0>)
tensor(1.5060, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16311/17426 [29:41<02:18,  8.08it/s]

tensor(1.4399, grad_fn=<NllLossBackward0>)
tensor(1.4736, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16313/17426 [29:41<02:15,  8.18it/s]

tensor(1.5378, grad_fn=<NllLossBackward0>)
tensor(1.4784, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16315/17426 [29:41<02:19,  7.98it/s]

tensor(1.5129, grad_fn=<NllLossBackward0>)
tensor(1.4468, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16317/17426 [29:41<02:15,  8.18it/s]

tensor(1.4534, grad_fn=<NllLossBackward0>)
tensor(1.4182, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16319/17426 [29:42<02:13,  8.27it/s]

tensor(1.4747, grad_fn=<NllLossBackward0>)
tensor(1.5015, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16321/17426 [29:42<02:18,  7.97it/s]

tensor(1.4859, grad_fn=<NllLossBackward0>)
tensor(1.4553, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16323/17426 [29:42<02:15,  8.16it/s]

tensor(1.4647, grad_fn=<NllLossBackward0>)
tensor(1.5440, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16325/17426 [29:42<02:15,  8.14it/s]

tensor(1.4682, grad_fn=<NllLossBackward0>)
tensor(1.5249, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16327/17426 [29:43<02:12,  8.31it/s]

tensor(1.4944, grad_fn=<NllLossBackward0>)
tensor(1.4690, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16329/17426 [29:43<02:18,  7.94it/s]

tensor(1.5316, grad_fn=<NllLossBackward0>)
tensor(1.4586, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16331/17426 [29:43<02:14,  8.16it/s]

tensor(1.4589, grad_fn=<NllLossBackward0>)
tensor(1.4823, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16333/17426 [29:43<02:15,  8.05it/s]

tensor(1.4748, grad_fn=<NllLossBackward0>)
tensor(1.4587, grad_fn=<NllLossBackward0>)


 94%|█████████▎| 16335/17426 [29:44<02:12,  8.24it/s]

tensor(1.4981, grad_fn=<NllLossBackward0>)
tensor(1.5300, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16337/17426 [29:44<02:09,  8.41it/s]

tensor(1.4787, grad_fn=<NllLossBackward0>)
tensor(1.4895, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16339/17426 [29:44<02:10,  8.33it/s]

tensor(1.4772, grad_fn=<NllLossBackward0>)
tensor(1.5003, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16341/17426 [29:44<02:09,  8.37it/s]

tensor(1.4656, grad_fn=<NllLossBackward0>)
tensor(1.4561, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16343/17426 [29:45<02:18,  7.83it/s]

tensor(1.4518, grad_fn=<NllLossBackward0>)
tensor(1.5104, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16345/17426 [29:45<02:14,  8.06it/s]

tensor(1.4626, grad_fn=<NllLossBackward0>)
tensor(1.5098, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16347/17426 [29:45<02:15,  7.98it/s]

tensor(1.4338, grad_fn=<NllLossBackward0>)
tensor(1.4583, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16349/17426 [29:45<02:10,  8.28it/s]

tensor(1.4960, grad_fn=<NllLossBackward0>)
tensor(1.5079, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16351/17426 [29:46<02:14,  8.00it/s]

tensor(1.4651, grad_fn=<NllLossBackward0>)
tensor(1.4917, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16353/17426 [29:46<02:12,  8.12it/s]

tensor(1.5083, grad_fn=<NllLossBackward0>)
tensor(1.4413, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16355/17426 [29:46<02:13,  8.04it/s]

tensor(1.5177, grad_fn=<NllLossBackward0>)
tensor(1.5351, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16357/17426 [29:46<02:11,  8.12it/s]

tensor(1.4926, grad_fn=<NllLossBackward0>)
tensor(1.4605, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16359/17426 [29:47<02:08,  8.28it/s]

tensor(1.4831, grad_fn=<NllLossBackward0>)
tensor(1.4978, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16361/17426 [29:47<02:08,  8.26it/s]

tensor(1.4035, grad_fn=<NllLossBackward0>)
tensor(1.4664, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16363/17426 [29:47<02:07,  8.35it/s]

tensor(1.4604, grad_fn=<NllLossBackward0>)
tensor(1.4452, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16365/17426 [29:47<02:10,  8.11it/s]

tensor(1.4491, grad_fn=<NllLossBackward0>)
tensor(1.4781, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16367/17426 [29:48<02:09,  8.20it/s]

tensor(1.4467, grad_fn=<NllLossBackward0>)
tensor(1.4840, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16369/17426 [29:48<02:17,  7.70it/s]

tensor(1.5081, grad_fn=<NllLossBackward0>)
tensor(1.4866, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16371/17426 [29:48<02:16,  7.71it/s]

tensor(1.5355, grad_fn=<NllLossBackward0>)
tensor(1.4709, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16373/17426 [29:48<02:25,  7.26it/s]

tensor(1.4654, grad_fn=<NllLossBackward0>)
tensor(1.5382, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16375/17426 [29:49<02:23,  7.31it/s]

tensor(1.5033, grad_fn=<NllLossBackward0>)
tensor(1.4745, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16377/17426 [29:49<02:26,  7.17it/s]

tensor(1.5117, grad_fn=<NllLossBackward0>)
tensor(1.4786, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16379/17426 [29:49<02:23,  7.28it/s]

tensor(1.4829, grad_fn=<NllLossBackward0>)
tensor(1.4695, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16381/17426 [29:50<02:32,  6.86it/s]

tensor(1.4545, grad_fn=<NllLossBackward0>)
tensor(1.5221, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16383/17426 [29:50<02:22,  7.31it/s]

tensor(1.4655, grad_fn=<NllLossBackward0>)
tensor(1.4789, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16385/17426 [29:50<02:12,  7.85it/s]

tensor(1.4919, grad_fn=<NllLossBackward0>)
tensor(1.4981, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16387/17426 [29:50<02:07,  8.16it/s]

tensor(1.4540, grad_fn=<NllLossBackward0>)
tensor(1.4720, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16389/17426 [29:51<02:10,  7.93it/s]

tensor(1.4625, grad_fn=<NllLossBackward0>)
tensor(1.4141, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16391/17426 [29:51<02:08,  8.08it/s]

tensor(1.5146, grad_fn=<NllLossBackward0>)
tensor(1.5136, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16393/17426 [29:51<02:03,  8.36it/s]

tensor(1.4554, grad_fn=<NllLossBackward0>)
tensor(1.4972, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16395/17426 [29:51<02:03,  8.36it/s]

tensor(1.4776, grad_fn=<NllLossBackward0>)
tensor(1.5363, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16397/17426 [29:52<02:02,  8.43it/s]

tensor(1.4428, grad_fn=<NllLossBackward0>)
tensor(1.4811, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16399/17426 [29:52<02:19,  7.36it/s]

tensor(1.4771, grad_fn=<NllLossBackward0>)
tensor(1.5392, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16401/17426 [29:52<02:31,  6.78it/s]

tensor(1.4994, grad_fn=<NllLossBackward0>)
tensor(1.5257, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16403/17426 [29:52<02:25,  7.05it/s]

tensor(1.4824, grad_fn=<NllLossBackward0>)
tensor(1.5023, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16405/17426 [29:53<02:26,  6.98it/s]

tensor(1.4703, grad_fn=<NllLossBackward0>)
tensor(1.5048, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16407/17426 [29:53<02:24,  7.06it/s]

tensor(1.5213, grad_fn=<NllLossBackward0>)
tensor(1.4297, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16409/17426 [29:53<02:25,  7.01it/s]

tensor(1.4628, grad_fn=<NllLossBackward0>)
tensor(1.5243, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16411/17426 [29:54<02:29,  6.78it/s]

tensor(1.4748, grad_fn=<NllLossBackward0>)
tensor(1.4993, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16413/17426 [29:54<02:19,  7.24it/s]

tensor(1.4801, grad_fn=<NllLossBackward0>)
tensor(1.4911, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16415/17426 [29:54<02:12,  7.64it/s]

tensor(1.5169, grad_fn=<NllLossBackward0>)
tensor(1.4376, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16417/17426 [29:54<02:05,  8.04it/s]

tensor(1.4821, grad_fn=<NllLossBackward0>)
tensor(1.4781, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16419/17426 [29:55<02:05,  8.01it/s]

tensor(1.5136, grad_fn=<NllLossBackward0>)
tensor(1.4293, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16421/17426 [29:55<02:06,  7.91it/s]

tensor(1.4921, grad_fn=<NllLossBackward0>)
tensor(1.4847, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16423/17426 [29:55<02:02,  8.16it/s]

tensor(1.4971, grad_fn=<NllLossBackward0>)
tensor(1.5190, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16425/17426 [29:55<01:59,  8.37it/s]

tensor(1.5021, grad_fn=<NllLossBackward0>)
tensor(1.4583, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16427/17426 [29:56<01:58,  8.45it/s]

tensor(1.4923, grad_fn=<NllLossBackward0>)
tensor(1.5040, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16429/17426 [29:56<02:02,  8.15it/s]

tensor(1.5146, grad_fn=<NllLossBackward0>)
tensor(1.4562, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16431/17426 [29:56<02:04,  8.00it/s]

tensor(1.4989, grad_fn=<NllLossBackward0>)
tensor(1.5047, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16433/17426 [29:56<01:59,  8.33it/s]

tensor(1.5259, grad_fn=<NllLossBackward0>)
tensor(1.4704, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16435/17426 [29:57<01:57,  8.45it/s]

tensor(1.4981, grad_fn=<NllLossBackward0>)
tensor(1.4978, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16437/17426 [29:57<01:57,  8.43it/s]

tensor(1.4518, grad_fn=<NllLossBackward0>)
tensor(1.4886, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16439/17426 [29:57<02:03,  8.00it/s]

tensor(1.4937, grad_fn=<NllLossBackward0>)
tensor(1.4956, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16441/17426 [29:57<02:02,  8.05it/s]

tensor(1.4788, grad_fn=<NllLossBackward0>)
tensor(1.4859, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16443/17426 [29:58<02:01,  8.08it/s]

tensor(1.4726, grad_fn=<NllLossBackward0>)
tensor(1.4982, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16445/17426 [29:58<02:03,  7.95it/s]

tensor(1.4383, grad_fn=<NllLossBackward0>)
tensor(1.4710, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16447/17426 [29:58<02:01,  8.08it/s]

tensor(1.4170, grad_fn=<NllLossBackward0>)
tensor(1.5030, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16449/17426 [29:58<01:59,  8.18it/s]

tensor(1.4957, grad_fn=<NllLossBackward0>)
tensor(1.4875, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16451/17426 [29:58<01:56,  8.40it/s]

tensor(1.4496, grad_fn=<NllLossBackward0>)
tensor(1.5342, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16453/17426 [29:59<01:56,  8.36it/s]

tensor(1.5046, grad_fn=<NllLossBackward0>)
tensor(1.4770, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16455/17426 [29:59<01:53,  8.55it/s]

tensor(1.5000, grad_fn=<NllLossBackward0>)
tensor(1.5119, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16457/17426 [29:59<01:57,  8.23it/s]

tensor(1.4644, grad_fn=<NllLossBackward0>)
tensor(1.4884, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16459/17426 [29:59<01:58,  8.18it/s]

tensor(1.5201, grad_fn=<NllLossBackward0>)
tensor(1.5184, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16461/17426 [30:00<01:56,  8.26it/s]

tensor(1.4835, grad_fn=<NllLossBackward0>)
tensor(1.4913, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16463/17426 [30:00<01:54,  8.40it/s]

tensor(1.5253, grad_fn=<NllLossBackward0>)
tensor(1.4745, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16465/17426 [30:00<02:01,  7.88it/s]

tensor(1.5017, grad_fn=<NllLossBackward0>)
tensor(1.4827, grad_fn=<NllLossBackward0>)


 94%|█████████▍| 16467/17426 [30:00<01:57,  8.19it/s]

tensor(1.4646, grad_fn=<NllLossBackward0>)
tensor(1.4863, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16469/17426 [30:01<01:55,  8.32it/s]

tensor(1.5304, grad_fn=<NllLossBackward0>)
tensor(1.4830, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16471/17426 [30:01<01:56,  8.22it/s]

tensor(1.5085, grad_fn=<NllLossBackward0>)
tensor(1.4863, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16473/17426 [30:01<01:55,  8.29it/s]

tensor(1.4602, grad_fn=<NllLossBackward0>)
tensor(1.4735, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16475/17426 [30:01<01:58,  8.03it/s]

tensor(1.5136, grad_fn=<NllLossBackward0>)
tensor(1.4595, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16477/17426 [30:02<01:57,  8.07it/s]

tensor(1.4952, grad_fn=<NllLossBackward0>)
tensor(1.4699, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16479/17426 [30:02<01:57,  8.09it/s]

tensor(1.4718, grad_fn=<NllLossBackward0>)
tensor(1.5122, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16481/17426 [30:02<01:54,  8.27it/s]

tensor(1.4910, grad_fn=<NllLossBackward0>)
tensor(1.4404, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16483/17426 [30:02<02:00,  7.82it/s]

tensor(1.4877, grad_fn=<NllLossBackward0>)
tensor(1.4740, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16485/17426 [30:03<02:00,  7.81it/s]

tensor(1.4579, grad_fn=<NllLossBackward0>)
tensor(1.5062, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16487/17426 [30:03<01:55,  8.10it/s]

tensor(1.4596, grad_fn=<NllLossBackward0>)
tensor(1.5416, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16489/17426 [30:03<01:53,  8.22it/s]

tensor(1.4790, grad_fn=<NllLossBackward0>)
tensor(1.4528, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16491/17426 [30:03<01:56,  8.04it/s]

tensor(1.4492, grad_fn=<NllLossBackward0>)
tensor(1.4638, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16493/17426 [30:04<01:55,  8.05it/s]

tensor(1.4879, grad_fn=<NllLossBackward0>)
tensor(1.4936, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16495/17426 [30:04<02:03,  7.54it/s]

tensor(1.4942, grad_fn=<NllLossBackward0>)
tensor(1.5047, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16497/17426 [30:04<02:11,  7.09it/s]

tensor(1.5247, grad_fn=<NllLossBackward0>)
tensor(1.4854, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16499/17426 [30:05<02:17,  6.76it/s]

tensor(1.4740, grad_fn=<NllLossBackward0>)
tensor(1.5039, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16501/17426 [30:05<02:13,  6.91it/s]

tensor(1.4957, grad_fn=<NllLossBackward0>)
tensor(1.5077, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16503/17426 [30:05<02:09,  7.12it/s]

tensor(1.5013, grad_fn=<NllLossBackward0>)
tensor(1.4673, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16505/17426 [30:05<02:02,  7.53it/s]

tensor(1.4845, grad_fn=<NllLossBackward0>)
tensor(1.5094, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16507/17426 [30:06<02:02,  7.50it/s]

tensor(1.4401, grad_fn=<NllLossBackward0>)
tensor(1.4995, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16509/17426 [30:06<02:03,  7.44it/s]

tensor(1.4613, grad_fn=<NllLossBackward0>)
tensor(1.5013, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16511/17426 [30:06<01:56,  7.85it/s]

tensor(1.4819, grad_fn=<NllLossBackward0>)
tensor(1.4637, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16513/17426 [30:06<01:55,  7.92it/s]

tensor(1.5410, grad_fn=<NllLossBackward0>)
tensor(1.5283, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16515/17426 [30:07<01:55,  7.88it/s]

tensor(1.4883, grad_fn=<NllLossBackward0>)
tensor(1.5170, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16517/17426 [30:07<02:04,  7.28it/s]

tensor(1.4885, grad_fn=<NllLossBackward0>)
tensor(1.4972, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16519/17426 [30:07<02:05,  7.21it/s]

tensor(1.4801, grad_fn=<NllLossBackward0>)
tensor(1.4840, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16521/17426 [30:08<02:10,  6.95it/s]

tensor(1.4842, grad_fn=<NllLossBackward0>)
tensor(1.5063, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16523/17426 [30:08<02:16,  6.60it/s]

tensor(1.4638, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16525/17426 [30:08<02:22,  6.34it/s]

tensor(1.4694, grad_fn=<NllLossBackward0>)
tensor(1.4852, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16527/17426 [30:08<02:12,  6.80it/s]

tensor(1.5088, grad_fn=<NllLossBackward0>)
tensor(1.4852, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16529/17426 [30:09<02:07,  7.04it/s]

tensor(1.5249, grad_fn=<NllLossBackward0>)
tensor(1.4893, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16531/17426 [30:09<01:54,  7.79it/s]

tensor(1.5253, grad_fn=<NllLossBackward0>)
tensor(1.4752, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16533/17426 [30:09<01:50,  8.10it/s]

tensor(1.4961, grad_fn=<NllLossBackward0>)
tensor(1.4766, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16535/17426 [30:09<01:48,  8.20it/s]

tensor(1.4472, grad_fn=<NllLossBackward0>)
tensor(1.4957, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16537/17426 [30:10<01:55,  7.70it/s]

tensor(1.5055, grad_fn=<NllLossBackward0>)
tensor(1.4747, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16539/17426 [30:10<01:52,  7.87it/s]

tensor(1.4943, grad_fn=<NllLossBackward0>)
tensor(1.4420, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16541/17426 [30:10<01:52,  7.86it/s]

tensor(1.4880, grad_fn=<NllLossBackward0>)
tensor(1.5010, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16543/17426 [30:11<01:51,  7.89it/s]

tensor(1.4686, grad_fn=<NllLossBackward0>)
tensor(1.4886, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16545/17426 [30:11<01:52,  7.86it/s]

tensor(1.4500, grad_fn=<NllLossBackward0>)
tensor(1.5029, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16547/17426 [30:11<01:50,  7.93it/s]

tensor(1.4874, grad_fn=<NllLossBackward0>)
tensor(1.4590, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16549/17426 [30:11<01:50,  7.95it/s]

tensor(1.4477, grad_fn=<NllLossBackward0>)
tensor(1.5215, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16551/17426 [30:12<01:46,  8.19it/s]

tensor(1.5039, grad_fn=<NllLossBackward0>)
tensor(1.4226, grad_fn=<NllLossBackward0>)


 95%|█████████▍| 16553/17426 [30:12<01:52,  7.77it/s]

tensor(1.4708, grad_fn=<NllLossBackward0>)
tensor(1.4987, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16555/17426 [30:12<01:50,  7.90it/s]

tensor(1.5110, grad_fn=<NllLossBackward0>)
tensor(1.5278, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16557/17426 [30:12<01:46,  8.16it/s]

tensor(1.4725, grad_fn=<NllLossBackward0>)
tensor(1.4954, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16559/17426 [30:13<01:45,  8.21it/s]

tensor(1.4461, grad_fn=<NllLossBackward0>)
tensor(1.5029, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16561/17426 [30:13<01:47,  8.03it/s]

tensor(1.4520, grad_fn=<NllLossBackward0>)
tensor(1.4646, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16563/17426 [30:13<01:50,  7.79it/s]

tensor(1.4619, grad_fn=<NllLossBackward0>)
tensor(1.4932, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16565/17426 [30:13<01:48,  7.95it/s]

tensor(1.4982, grad_fn=<NllLossBackward0>)
tensor(1.4480, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16567/17426 [30:14<01:46,  8.10it/s]

tensor(1.4519, grad_fn=<NllLossBackward0>)
tensor(1.4287, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16569/17426 [30:14<01:49,  7.80it/s]

tensor(1.4206, grad_fn=<NllLossBackward0>)
tensor(1.4079, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16571/17426 [30:14<01:51,  7.69it/s]

tensor(1.4932, grad_fn=<NllLossBackward0>)
tensor(1.4347, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16573/17426 [30:14<01:47,  7.93it/s]

tensor(1.4875, grad_fn=<NllLossBackward0>)
tensor(1.4522, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16575/17426 [30:15<01:47,  7.95it/s]

tensor(1.5198, grad_fn=<NllLossBackward0>)
tensor(1.4440, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16577/17426 [30:15<01:46,  7.96it/s]

tensor(1.4888, grad_fn=<NllLossBackward0>)
tensor(1.4751, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16579/17426 [30:15<01:46,  7.93it/s]

tensor(1.5042, grad_fn=<NllLossBackward0>)
tensor(1.4624, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16581/17426 [30:15<01:45,  8.03it/s]

tensor(1.4415, grad_fn=<NllLossBackward0>)
tensor(1.4554, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16583/17426 [30:16<01:41,  8.29it/s]

tensor(1.4840, grad_fn=<NllLossBackward0>)
tensor(1.4815, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16585/17426 [30:16<01:44,  8.05it/s]

tensor(1.4820, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16587/17426 [30:16<01:41,  8.25it/s]

tensor(1.4148, grad_fn=<NllLossBackward0>)
tensor(1.4856, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16589/17426 [30:16<01:42,  8.13it/s]

tensor(1.4922, grad_fn=<NllLossBackward0>)
tensor(1.4927, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16591/17426 [30:17<01:41,  8.24it/s]

tensor(1.5068, grad_fn=<NllLossBackward0>)
tensor(1.4336, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16593/17426 [30:17<01:40,  8.29it/s]

tensor(1.4114, grad_fn=<NllLossBackward0>)
tensor(1.5093, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16595/17426 [30:17<01:41,  8.22it/s]

tensor(1.5243, grad_fn=<NllLossBackward0>)
tensor(1.4655, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16597/17426 [30:17<01:42,  8.07it/s]

tensor(1.4988, grad_fn=<NllLossBackward0>)
tensor(1.4755, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16599/17426 [30:17<01:38,  8.36it/s]

tensor(1.4548, grad_fn=<NllLossBackward0>)
tensor(1.4938, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16601/17426 [30:18<01:41,  8.11it/s]

tensor(1.5267, grad_fn=<NllLossBackward0>)
tensor(1.4567, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16603/17426 [30:18<01:45,  7.80it/s]

tensor(1.4868, grad_fn=<NllLossBackward0>)
tensor(1.5196, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16605/17426 [30:18<01:48,  7.54it/s]

tensor(1.4929, grad_fn=<NllLossBackward0>)
tensor(1.4716, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16607/17426 [30:19<01:50,  7.39it/s]

tensor(1.4632, grad_fn=<NllLossBackward0>)
tensor(1.4878, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16609/17426 [30:19<01:59,  6.86it/s]

tensor(1.4719, grad_fn=<NllLossBackward0>)
tensor(1.5001, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16611/17426 [30:19<02:03,  6.62it/s]

tensor(1.5409, grad_fn=<NllLossBackward0>)
tensor(1.4636, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16613/17426 [30:19<01:53,  7.16it/s]

tensor(1.4765, grad_fn=<NllLossBackward0>)
tensor(1.4867, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16615/17426 [30:20<01:44,  7.76it/s]

tensor(1.4699, grad_fn=<NllLossBackward0>)
tensor(1.4656, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16617/17426 [30:20<01:39,  8.17it/s]

tensor(1.4755, grad_fn=<NllLossBackward0>)
tensor(1.4984, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16619/17426 [30:20<01:37,  8.28it/s]

tensor(1.5202, grad_fn=<NllLossBackward0>)
tensor(1.5393, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16621/17426 [30:20<01:48,  7.44it/s]

tensor(1.5054, grad_fn=<NllLossBackward0>)
tensor(1.4930, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16623/17426 [30:21<01:48,  7.42it/s]

tensor(1.4851, grad_fn=<NllLossBackward0>)
tensor(1.5040, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16625/17426 [30:21<01:39,  8.02it/s]

tensor(1.4988, grad_fn=<NllLossBackward0>)
tensor(1.4821, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16627/17426 [30:21<01:51,  7.18it/s]

tensor(1.4661, grad_fn=<NllLossBackward0>)
tensor(1.4562, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16629/17426 [30:22<02:01,  6.58it/s]

tensor(1.4841, grad_fn=<NllLossBackward0>)
tensor(1.4900, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16631/17426 [30:22<02:00,  6.62it/s]

tensor(1.4763, grad_fn=<NllLossBackward0>)
tensor(1.4812, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16633/17426 [30:22<02:06,  6.27it/s]

tensor(1.4895, grad_fn=<NllLossBackward0>)
tensor(1.4890, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16635/17426 [30:23<02:05,  6.28it/s]

tensor(1.4734, grad_fn=<NllLossBackward0>)
tensor(1.5009, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16637/17426 [30:23<02:04,  6.35it/s]

tensor(1.4766, grad_fn=<NllLossBackward0>)
tensor(1.4663, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16639/17426 [30:23<01:59,  6.57it/s]

tensor(1.4955, grad_fn=<NllLossBackward0>)
tensor(1.4576, grad_fn=<NllLossBackward0>)


 95%|█████████▌| 16641/17426 [30:23<01:51,  7.05it/s]

tensor(1.4910, grad_fn=<NllLossBackward0>)
tensor(1.4608, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16643/17426 [30:24<01:50,  7.08it/s]

tensor(1.5068, grad_fn=<NllLossBackward0>)
tensor(1.4977, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16645/17426 [30:24<01:43,  7.55it/s]

tensor(1.5075, grad_fn=<NllLossBackward0>)
tensor(1.4977, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16647/17426 [30:24<01:43,  7.53it/s]

tensor(1.4587, grad_fn=<NllLossBackward0>)
tensor(1.4923, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16649/17426 [30:24<01:39,  7.82it/s]

tensor(1.4737, grad_fn=<NllLossBackward0>)
tensor(1.4383, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16651/17426 [30:25<01:44,  7.39it/s]

tensor(1.4709, grad_fn=<NllLossBackward0>)
tensor(1.4981, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16653/17426 [30:25<01:37,  7.94it/s]

tensor(1.5159, grad_fn=<NllLossBackward0>)
tensor(1.4680, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16655/17426 [30:25<01:34,  8.19it/s]

tensor(1.4617, grad_fn=<NllLossBackward0>)
tensor(1.5287, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16657/17426 [30:25<01:32,  8.31it/s]

tensor(1.4876, grad_fn=<NllLossBackward0>)
tensor(1.4258, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16659/17426 [30:26<01:33,  8.17it/s]

tensor(1.4828, grad_fn=<NllLossBackward0>)
tensor(1.4965, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16661/17426 [30:26<01:38,  7.78it/s]

tensor(1.4480, grad_fn=<NllLossBackward0>)
tensor(1.4480, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16663/17426 [30:26<01:34,  8.09it/s]

tensor(1.4651, grad_fn=<NllLossBackward0>)
tensor(1.5249, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16665/17426 [30:26<01:37,  7.80it/s]

tensor(1.5005, grad_fn=<NllLossBackward0>)
tensor(1.4877, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16667/17426 [30:27<01:34,  7.99it/s]

tensor(1.5042, grad_fn=<NllLossBackward0>)
tensor(1.5172, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16669/17426 [30:27<01:35,  7.94it/s]

tensor(1.4787, grad_fn=<NllLossBackward0>)
tensor(1.4740, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16671/17426 [30:27<01:32,  8.14it/s]

tensor(1.4334, grad_fn=<NllLossBackward0>)
tensor(1.4566, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16673/17426 [30:27<01:29,  8.41it/s]

tensor(1.4549, grad_fn=<NllLossBackward0>)
tensor(1.4568, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16675/17426 [30:28<01:31,  8.20it/s]

tensor(1.4628, grad_fn=<NllLossBackward0>)
tensor(1.4632, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16677/17426 [30:28<01:32,  8.07it/s]

tensor(1.5070, grad_fn=<NllLossBackward0>)
tensor(1.5019, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16679/17426 [30:28<01:30,  8.25it/s]

tensor(1.4759, grad_fn=<NllLossBackward0>)
tensor(1.4949, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16681/17426 [30:28<01:28,  8.43it/s]

tensor(1.4951, grad_fn=<NllLossBackward0>)
tensor(1.4884, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16683/17426 [30:29<01:29,  8.32it/s]

tensor(1.4753, grad_fn=<NllLossBackward0>)
tensor(1.4776, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16685/17426 [30:29<01:29,  8.32it/s]

tensor(1.4864, grad_fn=<NllLossBackward0>)
tensor(1.5070, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16687/17426 [30:29<01:31,  8.10it/s]

tensor(1.4699, grad_fn=<NllLossBackward0>)
tensor(1.4494, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16689/17426 [30:29<01:30,  8.13it/s]

tensor(1.4547, grad_fn=<NllLossBackward0>)
tensor(1.4568, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16691/17426 [30:30<01:29,  8.25it/s]

tensor(1.4421, grad_fn=<NllLossBackward0>)
tensor(1.4248, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16693/17426 [30:30<01:29,  8.23it/s]

tensor(1.4580, grad_fn=<NllLossBackward0>)
tensor(1.5167, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16695/17426 [30:30<01:32,  7.94it/s]

tensor(1.4671, grad_fn=<NllLossBackward0>)
tensor(1.4292, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16697/17426 [30:30<01:32,  7.92it/s]

tensor(1.5446, grad_fn=<NllLossBackward0>)
tensor(1.4795, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16699/17426 [30:31<01:29,  8.13it/s]

tensor(1.5176, grad_fn=<NllLossBackward0>)
tensor(1.4554, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16701/17426 [30:31<01:28,  8.21it/s]

tensor(1.4760, grad_fn=<NllLossBackward0>)
tensor(1.4661, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16703/17426 [30:31<01:31,  7.86it/s]

tensor(1.5193, grad_fn=<NllLossBackward0>)
tensor(1.4828, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16705/17426 [30:31<01:29,  8.06it/s]

tensor(1.4520, grad_fn=<NllLossBackward0>)
tensor(1.4323, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16707/17426 [30:32<01:28,  8.11it/s]

tensor(1.5112, grad_fn=<NllLossBackward0>)
tensor(1.5302, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16709/17426 [30:32<01:28,  8.11it/s]

tensor(1.4500, grad_fn=<NllLossBackward0>)
tensor(1.4981, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16711/17426 [30:32<01:26,  8.22it/s]

tensor(1.4887, grad_fn=<NllLossBackward0>)
tensor(1.4968, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16713/17426 [30:32<01:29,  7.99it/s]

tensor(1.4768, grad_fn=<NllLossBackward0>)
tensor(1.4378, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16715/17426 [30:33<01:26,  8.26it/s]

tensor(1.4562, grad_fn=<NllLossBackward0>)
tensor(1.4544, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16717/17426 [30:33<01:27,  8.12it/s]

tensor(1.4850, grad_fn=<NllLossBackward0>)
tensor(1.4380, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16719/17426 [30:33<01:27,  8.07it/s]

tensor(1.4687, grad_fn=<NllLossBackward0>)
tensor(1.4898, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16721/17426 [30:33<01:38,  7.14it/s]

tensor(1.4869, grad_fn=<NllLossBackward0>)
tensor(1.5142, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16723/17426 [30:34<01:37,  7.18it/s]

tensor(1.5014, grad_fn=<NllLossBackward0>)
tensor(1.4980, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16725/17426 [30:34<01:34,  7.45it/s]

tensor(1.4681, grad_fn=<NllLossBackward0>)
tensor(1.5057, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16727/17426 [30:34<01:32,  7.54it/s]

tensor(1.4929, grad_fn=<NllLossBackward0>)
tensor(1.4848, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16729/17426 [30:34<01:34,  7.34it/s]

tensor(1.4699, grad_fn=<NllLossBackward0>)
tensor(1.4925, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16731/17426 [30:35<01:29,  7.77it/s]

tensor(1.4574, grad_fn=<NllLossBackward0>)
tensor(1.4899, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16733/17426 [30:35<01:25,  8.09it/s]

tensor(1.4797, grad_fn=<NllLossBackward0>)
tensor(1.4171, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16735/17426 [30:35<01:26,  7.96it/s]

tensor(1.4330, grad_fn=<NllLossBackward0>)
tensor(1.5127, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16737/17426 [30:35<01:24,  8.17it/s]

tensor(1.5025, grad_fn=<NllLossBackward0>)
tensor(1.4520, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16739/17426 [30:36<01:25,  8.06it/s]

tensor(1.4907, grad_fn=<NllLossBackward0>)
tensor(1.4583, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16741/17426 [30:36<01:22,  8.26it/s]

tensor(1.4056, grad_fn=<NllLossBackward0>)
tensor(1.5095, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16743/17426 [30:36<01:30,  7.55it/s]

tensor(1.4531, grad_fn=<NllLossBackward0>)
tensor(1.5114, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16745/17426 [30:37<01:34,  7.21it/s]

tensor(1.4921, grad_fn=<NllLossBackward0>)
tensor(1.4843, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16747/17426 [30:37<01:34,  7.17it/s]

tensor(1.5140, grad_fn=<NllLossBackward0>)
tensor(1.4719, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16749/17426 [30:37<01:35,  7.11it/s]

tensor(1.4755, grad_fn=<NllLossBackward0>)
tensor(1.4958, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16751/17426 [30:37<01:40,  6.72it/s]

tensor(1.4519, grad_fn=<NllLossBackward0>)
tensor(1.5144, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16753/17426 [30:38<01:46,  6.31it/s]

tensor(1.5045, grad_fn=<NllLossBackward0>)
tensor(1.4574, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16755/17426 [30:38<01:43,  6.49it/s]

tensor(1.4580, grad_fn=<NllLossBackward0>)
tensor(1.4574, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16757/17426 [30:38<01:40,  6.65it/s]

tensor(1.4851, grad_fn=<NllLossBackward0>)
tensor(1.4488, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16759/17426 [30:39<01:32,  7.20it/s]

tensor(1.4639, grad_fn=<NllLossBackward0>)
tensor(1.4762, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16761/17426 [30:39<01:29,  7.42it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.5309, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16763/17426 [30:39<01:23,  7.93it/s]

tensor(1.4636, grad_fn=<NllLossBackward0>)
tensor(1.4820, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16765/17426 [30:39<01:23,  7.91it/s]

tensor(1.4927, grad_fn=<NllLossBackward0>)
tensor(1.5130, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16767/17426 [30:40<01:19,  8.27it/s]

tensor(1.4625, grad_fn=<NllLossBackward0>)
tensor(1.4774, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16769/17426 [30:40<01:23,  7.85it/s]

tensor(1.5481, grad_fn=<NllLossBackward0>)
tensor(1.4490, grad_fn=<NllLossBackward0>)


 96%|█████████▌| 16771/17426 [30:40<01:20,  8.15it/s]

tensor(1.4326, grad_fn=<NllLossBackward0>)
tensor(1.5050, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16773/17426 [30:40<01:18,  8.31it/s]

tensor(1.4588, grad_fn=<NllLossBackward0>)
tensor(1.4907, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16775/17426 [30:41<01:18,  8.30it/s]

tensor(1.4892, grad_fn=<NllLossBackward0>)
tensor(1.4602, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16777/17426 [30:41<01:21,  7.96it/s]

tensor(1.4820, grad_fn=<NllLossBackward0>)
tensor(1.4076, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16779/17426 [30:41<01:21,  7.93it/s]

tensor(1.5362, grad_fn=<NllLossBackward0>)
tensor(1.4757, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16781/17426 [30:41<01:21,  7.95it/s]

tensor(1.5205, grad_fn=<NllLossBackward0>)
tensor(1.4489, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16783/17426 [30:42<01:18,  8.19it/s]

tensor(1.4775, grad_fn=<NllLossBackward0>)
tensor(1.4570, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16785/17426 [30:42<01:18,  8.13it/s]

tensor(1.5079, grad_fn=<NllLossBackward0>)
tensor(1.5083, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16787/17426 [30:42<01:24,  7.57it/s]

tensor(1.4809, grad_fn=<NllLossBackward0>)
tensor(1.4599, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16789/17426 [30:42<01:18,  8.14it/s]

tensor(1.4863, grad_fn=<NllLossBackward0>)
tensor(1.4537, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16791/17426 [30:43<01:16,  8.25it/s]

tensor(1.4944, grad_fn=<NllLossBackward0>)
tensor(1.4778, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16793/17426 [30:43<01:16,  8.33it/s]

tensor(1.4908, grad_fn=<NllLossBackward0>)
tensor(1.5285, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16795/17426 [30:43<01:18,  7.99it/s]

tensor(1.4590, grad_fn=<NllLossBackward0>)
tensor(1.4666, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16797/17426 [30:43<01:17,  8.12it/s]

tensor(1.4202, grad_fn=<NllLossBackward0>)
tensor(1.5042, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16799/17426 [30:44<01:15,  8.32it/s]

tensor(1.4432, grad_fn=<NllLossBackward0>)
tensor(1.5221, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16801/17426 [30:44<01:14,  8.41it/s]

tensor(1.4645, grad_fn=<NllLossBackward0>)
tensor(1.4981, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16803/17426 [30:44<01:15,  8.21it/s]

tensor(1.4610, grad_fn=<NllLossBackward0>)
tensor(1.5320, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16805/17426 [30:44<01:15,  8.22it/s]

tensor(1.4892, grad_fn=<NllLossBackward0>)
tensor(1.4771, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16807/17426 [30:45<01:14,  8.32it/s]

tensor(1.4851, grad_fn=<NllLossBackward0>)
tensor(1.5084, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16809/17426 [30:45<01:13,  8.38it/s]

tensor(1.4756, grad_fn=<NllLossBackward0>)
tensor(1.5457, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16811/17426 [30:45<01:13,  8.40it/s]

tensor(1.5204, grad_fn=<NllLossBackward0>)
tensor(1.5359, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16813/17426 [30:45<01:17,  7.96it/s]

tensor(1.4916, grad_fn=<NllLossBackward0>)
tensor(1.4722, grad_fn=<NllLossBackward0>)


 96%|█████████▋| 16815/17426 [30:45<01:14,  8.19it/s]

tensor(1.4487, grad_fn=<NllLossBackward0>)
tensor(1.4517, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16817/17426 [30:46<01:15,  8.05it/s]

tensor(1.4666, grad_fn=<NllLossBackward0>)
tensor(1.4749, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16819/17426 [30:46<01:12,  8.40it/s]

tensor(1.4636, grad_fn=<NllLossBackward0>)
tensor(1.4955, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16821/17426 [30:46<01:16,  7.91it/s]

tensor(1.5066, grad_fn=<NllLossBackward0>)
tensor(1.4931, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16823/17426 [30:46<01:12,  8.28it/s]

tensor(1.4918, grad_fn=<NllLossBackward0>)
tensor(1.5011, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16825/17426 [30:47<01:15,  7.98it/s]

tensor(1.4341, grad_fn=<NllLossBackward0>)
tensor(1.4656, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16827/17426 [30:47<01:11,  8.34it/s]

tensor(1.4526, grad_fn=<NllLossBackward0>)
tensor(1.4842, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16829/17426 [30:47<01:10,  8.42it/s]

tensor(1.4600, grad_fn=<NllLossBackward0>)
tensor(1.5264, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16831/17426 [30:47<01:14,  8.01it/s]

tensor(1.4619, grad_fn=<NllLossBackward0>)
tensor(1.4417, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16833/17426 [30:48<01:12,  8.18it/s]

tensor(1.4614, grad_fn=<NllLossBackward0>)
tensor(1.5111, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16835/17426 [30:48<01:11,  8.31it/s]

tensor(1.4844, grad_fn=<NllLossBackward0>)
tensor(1.4382, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16837/17426 [30:48<01:15,  7.84it/s]

tensor(1.4761, grad_fn=<NllLossBackward0>)
tensor(1.4876, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16839/17426 [30:48<01:18,  7.43it/s]

tensor(1.4677, grad_fn=<NllLossBackward0>)
tensor(1.4823, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16841/17426 [30:49<01:19,  7.35it/s]

tensor(1.5177, grad_fn=<NllLossBackward0>)
tensor(1.4478, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16843/17426 [30:49<01:21,  7.17it/s]

tensor(1.5388, grad_fn=<NllLossBackward0>)
tensor(1.4389, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16845/17426 [30:49<01:27,  6.64it/s]

tensor(1.5091, grad_fn=<NllLossBackward0>)
tensor(1.5088, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16847/17426 [30:50<01:34,  6.10it/s]

tensor(1.4154, grad_fn=<NllLossBackward0>)
tensor(1.4537, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16849/17426 [30:50<01:28,  6.49it/s]

tensor(1.4441, grad_fn=<NllLossBackward0>)
tensor(1.4907, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16851/17426 [30:50<01:27,  6.59it/s]

tensor(1.4471, grad_fn=<NllLossBackward0>)
tensor(1.4584, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16853/17426 [30:51<01:25,  6.72it/s]

tensor(1.5159, grad_fn=<NllLossBackward0>)
tensor(1.4492, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16855/17426 [30:51<01:22,  6.92it/s]

tensor(1.4759, grad_fn=<NllLossBackward0>)
tensor(1.5041, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16857/17426 [30:51<01:22,  6.91it/s]

tensor(1.5560, grad_fn=<NllLossBackward0>)
tensor(1.4282, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16859/17426 [30:51<01:24,  6.70it/s]

tensor(1.4959, grad_fn=<NllLossBackward0>)
tensor(1.4955, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16861/17426 [30:52<01:25,  6.58it/s]

tensor(1.5222, grad_fn=<NllLossBackward0>)
tensor(1.4175, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16863/17426 [30:52<01:23,  6.70it/s]

tensor(1.5002, grad_fn=<NllLossBackward0>)
tensor(1.4751, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16865/17426 [30:52<01:27,  6.44it/s]

tensor(1.4717, grad_fn=<NllLossBackward0>)
tensor(1.4430, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16867/17426 [30:53<01:24,  6.59it/s]

tensor(1.5398, grad_fn=<NllLossBackward0>)
tensor(1.4885, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16869/17426 [30:53<01:29,  6.26it/s]

tensor(1.5259, grad_fn=<NllLossBackward0>)
tensor(1.4603, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16871/17426 [30:53<01:20,  6.93it/s]

tensor(1.5032, grad_fn=<NllLossBackward0>)
tensor(1.5023, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16873/17426 [30:54<01:15,  7.31it/s]

tensor(1.4705, grad_fn=<NllLossBackward0>)
tensor(1.4859, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16875/17426 [30:54<01:14,  7.43it/s]

tensor(1.4568, grad_fn=<NllLossBackward0>)
tensor(1.5021, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16877/17426 [30:54<01:09,  7.93it/s]

tensor(1.4977, grad_fn=<NllLossBackward0>)
tensor(1.4796, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16879/17426 [30:54<01:08,  8.03it/s]

tensor(1.4833, grad_fn=<NllLossBackward0>)
tensor(1.4996, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16881/17426 [30:55<01:05,  8.35it/s]

tensor(1.4452, grad_fn=<NllLossBackward0>)
tensor(1.4589, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16883/17426 [30:55<01:08,  7.95it/s]

tensor(1.5375, grad_fn=<NllLossBackward0>)
tensor(1.4626, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16885/17426 [30:55<01:05,  8.25it/s]

tensor(1.4459, grad_fn=<NllLossBackward0>)
tensor(1.4705, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16887/17426 [30:55<01:04,  8.42it/s]

tensor(1.5326, grad_fn=<NllLossBackward0>)
tensor(1.5008, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16889/17426 [30:56<01:03,  8.45it/s]

tensor(1.4842, grad_fn=<NllLossBackward0>)
tensor(1.4612, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16891/17426 [30:56<01:07,  7.92it/s]

tensor(1.5179, grad_fn=<NllLossBackward0>)
tensor(1.5357, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16893/17426 [30:56<01:07,  7.87it/s]

tensor(1.4586, grad_fn=<NllLossBackward0>)
tensor(1.5067, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16895/17426 [30:56<01:06,  7.93it/s]

tensor(1.5060, grad_fn=<NllLossBackward0>)
tensor(1.4551, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16897/17426 [30:57<01:07,  7.83it/s]

tensor(1.4510, grad_fn=<NllLossBackward0>)
tensor(1.4716, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16899/17426 [30:57<01:08,  7.64it/s]

tensor(1.4955, grad_fn=<NllLossBackward0>)
tensor(1.4452, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16901/17426 [30:57<01:07,  7.74it/s]

tensor(1.4269, grad_fn=<NllLossBackward0>)
tensor(1.4588, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16903/17426 [30:57<01:07,  7.72it/s]

tensor(1.5026, grad_fn=<NllLossBackward0>)
tensor(1.4889, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16905/17426 [30:58<01:04,  8.14it/s]

tensor(1.5108, grad_fn=<NllLossBackward0>)
tensor(1.4811, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16907/17426 [30:58<01:04,  8.09it/s]

tensor(1.4644, grad_fn=<NllLossBackward0>)
tensor(1.4727, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16909/17426 [30:58<01:06,  7.76it/s]

tensor(1.4632, grad_fn=<NllLossBackward0>)
tensor(1.4681, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16911/17426 [30:58<01:03,  8.10it/s]

tensor(1.4846, grad_fn=<NllLossBackward0>)
tensor(1.4468, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16913/17426 [30:59<01:03,  8.09it/s]

tensor(1.4713, grad_fn=<NllLossBackward0>)
tensor(1.4500, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16915/17426 [30:59<01:02,  8.24it/s]

tensor(1.4442, grad_fn=<NllLossBackward0>)
tensor(1.4754, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16917/17426 [30:59<01:04,  7.86it/s]

tensor(1.5090, grad_fn=<NllLossBackward0>)
tensor(1.4507, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16919/17426 [30:59<01:01,  8.25it/s]

tensor(1.4981, grad_fn=<NllLossBackward0>)
tensor(1.5330, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16921/17426 [31:00<01:00,  8.34it/s]

tensor(1.4392, grad_fn=<NllLossBackward0>)
tensor(1.4602, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16923/17426 [31:00<01:00,  8.36it/s]

tensor(1.5163, grad_fn=<NllLossBackward0>)
tensor(1.4634, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16925/17426 [31:00<00:58,  8.49it/s]

tensor(1.4532, grad_fn=<NllLossBackward0>)
tensor(1.4825, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16927/17426 [31:00<01:01,  8.15it/s]

tensor(1.4697, grad_fn=<NllLossBackward0>)
tensor(1.4931, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16929/17426 [31:01<00:59,  8.33it/s]

tensor(1.5036, grad_fn=<NllLossBackward0>)
tensor(1.4627, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16931/17426 [31:01<01:00,  8.20it/s]

tensor(1.4860, grad_fn=<NllLossBackward0>)
tensor(1.4639, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16933/17426 [31:01<00:58,  8.38it/s]

tensor(1.4973, grad_fn=<NllLossBackward0>)
tensor(1.4642, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16935/17426 [31:01<01:05,  7.51it/s]

tensor(1.4954, grad_fn=<NllLossBackward0>)
tensor(1.5340, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16937/17426 [31:02<01:00,  8.03it/s]

tensor(1.5226, grad_fn=<NllLossBackward0>)
tensor(1.4304, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16939/17426 [31:02<00:58,  8.27it/s]

tensor(1.4820, grad_fn=<NllLossBackward0>)
tensor(1.5126, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16941/17426 [31:02<00:57,  8.44it/s]

tensor(1.4930, grad_fn=<NllLossBackward0>)
tensor(1.4643, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16943/17426 [31:02<00:59,  8.12it/s]

tensor(1.4812, grad_fn=<NllLossBackward0>)
tensor(1.5284, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16945/17426 [31:02<00:59,  8.03it/s]

tensor(1.5123, grad_fn=<NllLossBackward0>)
tensor(1.4736, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16947/17426 [31:03<00:57,  8.36it/s]

tensor(1.4456, grad_fn=<NllLossBackward0>)
tensor(1.4809, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16949/17426 [31:03<00:56,  8.39it/s]

tensor(1.4795, grad_fn=<NllLossBackward0>)
tensor(1.4841, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16951/17426 [31:03<00:59,  7.97it/s]

tensor(1.4612, grad_fn=<NllLossBackward0>)
tensor(1.4681, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16953/17426 [31:04<01:02,  7.52it/s]

tensor(1.4508, grad_fn=<NllLossBackward0>)
tensor(1.5371, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16955/17426 [31:04<01:00,  7.76it/s]

tensor(1.4776, grad_fn=<NllLossBackward0>)
tensor(1.4946, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16957/17426 [31:04<01:00,  7.72it/s]

tensor(1.5134, grad_fn=<NllLossBackward0>)
tensor(1.5191, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16959/17426 [31:04<01:04,  7.21it/s]

tensor(1.4974, grad_fn=<NllLossBackward0>)
tensor(1.5065, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16961/17426 [31:05<01:02,  7.50it/s]

tensor(1.4488, grad_fn=<NllLossBackward0>)
tensor(1.4891, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16963/17426 [31:05<00:58,  7.94it/s]

tensor(1.4453, grad_fn=<NllLossBackward0>)
tensor(1.5135, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16965/17426 [31:05<00:56,  8.14it/s]

tensor(1.5279, grad_fn=<NllLossBackward0>)
tensor(1.4611, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16967/17426 [31:05<00:56,  8.17it/s]

tensor(1.4500, grad_fn=<NllLossBackward0>)
tensor(1.5036, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16969/17426 [31:06<00:59,  7.68it/s]

tensor(1.5005, grad_fn=<NllLossBackward0>)
tensor(1.4707, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16971/17426 [31:06<00:56,  8.09it/s]

tensor(1.4450, grad_fn=<NllLossBackward0>)
tensor(1.4979, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16973/17426 [31:06<00:54,  8.33it/s]

tensor(1.4752, grad_fn=<NllLossBackward0>)
tensor(1.4543, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16975/17426 [31:06<00:58,  7.77it/s]

tensor(1.4878, grad_fn=<NllLossBackward0>)
tensor(1.4816, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16977/17426 [31:07<01:01,  7.27it/s]

tensor(1.4212, grad_fn=<NllLossBackward0>)
tensor(1.4786, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16979/17426 [31:07<01:03,  7.08it/s]

tensor(1.4353, grad_fn=<NllLossBackward0>)
tensor(1.4778, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16981/17426 [31:07<01:01,  7.20it/s]

tensor(1.4834, grad_fn=<NllLossBackward0>)
tensor(1.4546, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16983/17426 [31:07<01:04,  6.90it/s]

tensor(1.4270, grad_fn=<NllLossBackward0>)
tensor(1.4537, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16985/17426 [31:08<01:03,  6.90it/s]

tensor(1.4655, grad_fn=<NllLossBackward0>)
tensor(1.4471, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16987/17426 [31:08<01:07,  6.48it/s]

tensor(1.4625, grad_fn=<NllLossBackward0>)
tensor(1.4789, grad_fn=<NllLossBackward0>)


 97%|█████████▋| 16989/17426 [31:08<01:00,  7.18it/s]

tensor(1.4652, grad_fn=<NllLossBackward0>)
tensor(1.4903, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 16991/17426 [31:09<00:54,  7.92it/s]

tensor(1.5013, grad_fn=<NllLossBackward0>)
tensor(1.5016, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 16993/17426 [31:09<00:55,  7.85it/s]

tensor(1.5063, grad_fn=<NllLossBackward0>)
tensor(1.4942, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 16995/17426 [31:09<00:52,  8.25it/s]

tensor(1.5380, grad_fn=<NllLossBackward0>)
tensor(1.4594, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 16997/17426 [31:09<00:50,  8.48it/s]

tensor(1.5381, grad_fn=<NllLossBackward0>)
tensor(1.4310, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 16999/17426 [31:10<00:51,  8.26it/s]

tensor(1.4909, grad_fn=<NllLossBackward0>)
tensor(1.4260, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17001/17426 [31:10<00:52,  8.04it/s]

tensor(1.4611, grad_fn=<NllLossBackward0>)
tensor(1.4833, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17003/17426 [31:10<00:51,  8.26it/s]

tensor(1.4997, grad_fn=<NllLossBackward0>)
tensor(1.5504, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17005/17426 [31:10<00:54,  7.69it/s]

tensor(1.5037, grad_fn=<NllLossBackward0>)
tensor(1.4425, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17007/17426 [31:11<00:56,  7.47it/s]

tensor(1.4837, grad_fn=<NllLossBackward0>)
tensor(1.4853, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17009/17426 [31:11<00:55,  7.51it/s]

tensor(1.4551, grad_fn=<NllLossBackward0>)
tensor(1.5105, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17011/17426 [31:11<00:53,  7.70it/s]

tensor(1.4646, grad_fn=<NllLossBackward0>)
tensor(1.4992, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17013/17426 [31:11<00:53,  7.75it/s]

tensor(1.4701, grad_fn=<NllLossBackward0>)
tensor(1.4767, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17015/17426 [31:12<00:51,  7.97it/s]

tensor(1.4739, grad_fn=<NllLossBackward0>)
tensor(1.4793, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17017/17426 [31:12<00:51,  7.95it/s]

tensor(1.4408, grad_fn=<NllLossBackward0>)
tensor(1.4670, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17019/17426 [31:12<00:51,  7.92it/s]

tensor(1.5356, grad_fn=<NllLossBackward0>)
tensor(1.5038, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17021/17426 [31:12<00:49,  8.11it/s]

tensor(1.4477, grad_fn=<NllLossBackward0>)
tensor(1.4625, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17023/17426 [31:13<00:50,  8.02it/s]

tensor(1.4885, grad_fn=<NllLossBackward0>)
tensor(1.4844, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17025/17426 [31:13<00:50,  7.97it/s]

tensor(1.4658, grad_fn=<NllLossBackward0>)
tensor(1.4934, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17027/17426 [31:13<00:50,  7.92it/s]

tensor(1.4809, grad_fn=<NllLossBackward0>)
tensor(1.4646, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17029/17426 [31:13<00:49,  8.04it/s]

tensor(1.5193, grad_fn=<NllLossBackward0>)
tensor(1.5063, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17031/17426 [31:14<00:47,  8.26it/s]

tensor(1.4610, grad_fn=<NllLossBackward0>)
tensor(1.5012, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17033/17426 [31:14<00:47,  8.34it/s]

tensor(1.4981, grad_fn=<NllLossBackward0>)
tensor(1.5176, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17035/17426 [31:14<00:48,  8.02it/s]

tensor(1.4614, grad_fn=<NllLossBackward0>)
tensor(1.5512, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17037/17426 [31:14<00:48,  8.01it/s]

tensor(1.5318, grad_fn=<NllLossBackward0>)
tensor(1.4432, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17039/17426 [31:15<00:46,  8.33it/s]

tensor(1.4790, grad_fn=<NllLossBackward0>)
tensor(1.4951, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17041/17426 [31:15<00:47,  8.15it/s]

tensor(1.4660, grad_fn=<NllLossBackward0>)
tensor(1.4868, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17043/17426 [31:15<00:48,  7.90it/s]

tensor(1.5296, grad_fn=<NllLossBackward0>)
tensor(1.4687, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17045/17426 [31:15<00:47,  8.09it/s]

tensor(1.5031, grad_fn=<NllLossBackward0>)
tensor(1.5276, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17047/17426 [31:16<00:45,  8.27it/s]

tensor(1.4882, grad_fn=<NllLossBackward0>)
tensor(1.5006, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17049/17426 [31:16<00:46,  8.16it/s]

tensor(1.4898, grad_fn=<NllLossBackward0>)
tensor(1.4958, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17051/17426 [31:16<00:45,  8.25it/s]

tensor(1.5066, grad_fn=<NllLossBackward0>)
tensor(1.4997, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17053/17426 [31:16<00:46,  8.00it/s]

tensor(1.4914, grad_fn=<NllLossBackward0>)
tensor(1.4827, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17055/17426 [31:17<00:44,  8.26it/s]

tensor(1.4461, grad_fn=<NllLossBackward0>)
tensor(1.4324, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17057/17426 [31:17<00:44,  8.36it/s]

tensor(1.4481, grad_fn=<NllLossBackward0>)
tensor(1.4803, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17059/17426 [31:17<00:43,  8.39it/s]

tensor(1.4618, grad_fn=<NllLossBackward0>)
tensor(1.4348, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17061/17426 [31:17<00:46,  7.90it/s]

tensor(1.4780, grad_fn=<NllLossBackward0>)
tensor(1.4282, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17063/17426 [31:18<00:43,  8.25it/s]

tensor(1.4891, grad_fn=<NllLossBackward0>)
tensor(1.4821, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17065/17426 [31:18<00:43,  8.39it/s]

tensor(1.5035, grad_fn=<NllLossBackward0>)
tensor(1.4445, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17067/17426 [31:18<00:43,  8.34it/s]

tensor(1.4692, grad_fn=<NllLossBackward0>)
tensor(1.4727, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17069/17426 [31:18<00:44,  8.08it/s]

tensor(1.4515, grad_fn=<NllLossBackward0>)
tensor(1.5354, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17071/17426 [31:19<00:46,  7.64it/s]

tensor(1.4922, grad_fn=<NllLossBackward0>)
tensor(1.4984, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17073/17426 [31:19<00:48,  7.23it/s]

tensor(1.5042, grad_fn=<NllLossBackward0>)
tensor(1.4846, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17075/17426 [31:19<00:52,  6.69it/s]

tensor(1.4865, grad_fn=<NllLossBackward0>)
tensor(1.5674, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17077/17426 [31:19<00:52,  6.59it/s]

tensor(1.4718, grad_fn=<NllLossBackward0>)
tensor(1.5150, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17079/17426 [31:20<00:48,  7.19it/s]

tensor(1.4379, grad_fn=<NllLossBackward0>)
tensor(1.5001, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17081/17426 [31:20<00:47,  7.19it/s]

tensor(1.4741, grad_fn=<NllLossBackward0>)
tensor(1.4733, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17083/17426 [31:20<00:50,  6.83it/s]

tensor(1.4676, grad_fn=<NllLossBackward0>)
tensor(1.4775, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17085/17426 [31:21<00:48,  7.00it/s]

tensor(1.4991, grad_fn=<NllLossBackward0>)
tensor(1.5008, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17087/17426 [31:21<00:47,  7.19it/s]

tensor(1.4657, grad_fn=<NllLossBackward0>)
tensor(1.5352, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17089/17426 [31:21<00:46,  7.28it/s]

tensor(1.5037, grad_fn=<NllLossBackward0>)
tensor(1.4641, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17091/17426 [31:21<00:48,  6.94it/s]

tensor(1.5164, grad_fn=<NllLossBackward0>)
tensor(1.5102, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17093/17426 [31:22<00:49,  6.71it/s]

tensor(1.4654, grad_fn=<NllLossBackward0>)
tensor(1.4728, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17095/17426 [31:22<00:48,  6.83it/s]

tensor(1.4982, grad_fn=<NllLossBackward0>)
tensor(1.5133, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17097/17426 [31:22<00:49,  6.71it/s]

tensor(1.4182, grad_fn=<NllLossBackward0>)
tensor(1.5281, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17099/17426 [31:23<00:50,  6.54it/s]

tensor(1.4416, grad_fn=<NllLossBackward0>)
tensor(1.4973, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17101/17426 [31:23<00:48,  6.76it/s]

tensor(1.4917, grad_fn=<NllLossBackward0>)
tensor(1.4854, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17103/17426 [31:23<00:43,  7.46it/s]

tensor(1.5431, grad_fn=<NllLossBackward0>)
tensor(1.5085, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17105/17426 [31:23<00:41,  7.77it/s]

tensor(1.5451, grad_fn=<NllLossBackward0>)
tensor(1.4938, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17107/17426 [31:24<00:41,  7.61it/s]

tensor(1.4619, grad_fn=<NllLossBackward0>)
tensor(1.4963, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17109/17426 [31:24<00:39,  8.00it/s]

tensor(1.5012, grad_fn=<NllLossBackward0>)
tensor(1.4583, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17111/17426 [31:24<00:39,  7.93it/s]

tensor(1.5007, grad_fn=<NllLossBackward0>)
tensor(1.4493, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17113/17426 [31:24<00:38,  8.16it/s]

tensor(1.4506, grad_fn=<NllLossBackward0>)
tensor(1.4427, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17115/17426 [31:25<00:39,  7.93it/s]

tensor(1.4419, grad_fn=<NllLossBackward0>)
tensor(1.4670, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17117/17426 [31:25<00:40,  7.72it/s]

tensor(1.4851, grad_fn=<NllLossBackward0>)
tensor(1.4692, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17119/17426 [31:25<00:38,  8.02it/s]

tensor(1.4565, grad_fn=<NllLossBackward0>)
tensor(1.4943, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17121/17426 [31:25<00:36,  8.25it/s]

tensor(1.4769, grad_fn=<NllLossBackward0>)
tensor(1.4897, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17123/17426 [31:26<00:35,  8.45it/s]

tensor(1.4869, grad_fn=<NllLossBackward0>)
tensor(1.4863, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17125/17426 [31:26<00:38,  7.91it/s]

tensor(1.5464, grad_fn=<NllLossBackward0>)
tensor(1.4780, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17127/17426 [31:26<00:37,  7.92it/s]

tensor(1.4568, grad_fn=<NllLossBackward0>)
tensor(1.4785, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17129/17426 [31:26<00:36,  8.14it/s]

tensor(1.4775, grad_fn=<NllLossBackward0>)
tensor(1.5038, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17131/17426 [31:27<00:35,  8.21it/s]

tensor(1.4885, grad_fn=<NllLossBackward0>)
tensor(1.4689, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17133/17426 [31:27<00:37,  7.91it/s]

tensor(1.5010, grad_fn=<NllLossBackward0>)
tensor(1.4519, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17135/17426 [31:27<00:35,  8.17it/s]

tensor(1.4807, grad_fn=<NllLossBackward0>)
tensor(1.4696, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17137/17426 [31:27<00:34,  8.31it/s]

tensor(1.4866, grad_fn=<NllLossBackward0>)
tensor(1.5154, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17139/17426 [31:28<00:34,  8.32it/s]

tensor(1.4606, grad_fn=<NllLossBackward0>)
tensor(1.5106, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17141/17426 [31:28<00:33,  8.49it/s]

tensor(1.4804, grad_fn=<NllLossBackward0>)
tensor(1.4784, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17143/17426 [31:28<00:34,  8.16it/s]

tensor(1.5064, grad_fn=<NllLossBackward0>)
tensor(1.4548, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17145/17426 [31:28<00:33,  8.27it/s]

tensor(1.4720, grad_fn=<NllLossBackward0>)
tensor(1.5208, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17147/17426 [31:29<00:33,  8.42it/s]

tensor(1.4543, grad_fn=<NllLossBackward0>)
tensor(1.4207, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17149/17426 [31:29<00:33,  8.38it/s]

tensor(1.5132, grad_fn=<NllLossBackward0>)
tensor(1.4412, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17151/17426 [31:29<00:34,  8.00it/s]

tensor(1.4428, grad_fn=<NllLossBackward0>)
tensor(1.4629, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17153/17426 [31:29<00:33,  8.17it/s]

tensor(1.4407, grad_fn=<NllLossBackward0>)
tensor(1.4900, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17155/17426 [31:30<00:32,  8.27it/s]

tensor(1.4993, grad_fn=<NllLossBackward0>)
tensor(1.4723, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17157/17426 [31:30<00:32,  8.38it/s]

tensor(1.4619, grad_fn=<NllLossBackward0>)
tensor(1.5141, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17159/17426 [31:30<00:32,  8.17it/s]

tensor(1.5084, grad_fn=<NllLossBackward0>)
tensor(1.4760, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17161/17426 [31:30<00:31,  8.37it/s]

tensor(1.5117, grad_fn=<NllLossBackward0>)
tensor(1.5121, grad_fn=<NllLossBackward0>)


 98%|█████████▊| 17163/17426 [31:31<00:31,  8.37it/s]

tensor(1.5245, grad_fn=<NllLossBackward0>)
tensor(1.4889, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17165/17426 [31:31<00:31,  8.41it/s]

tensor(1.5082, grad_fn=<NllLossBackward0>)
tensor(1.4855, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17167/17426 [31:31<00:30,  8.47it/s]

tensor(1.4968, grad_fn=<NllLossBackward0>)
tensor(1.4435, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17169/17426 [31:31<00:31,  8.15it/s]

tensor(1.4703, grad_fn=<NllLossBackward0>)
tensor(1.4796, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17171/17426 [31:31<00:30,  8.36it/s]

tensor(1.4641, grad_fn=<NllLossBackward0>)
tensor(1.4624, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17173/17426 [31:32<00:30,  8.28it/s]

tensor(1.4691, grad_fn=<NllLossBackward0>)
tensor(1.4427, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17175/17426 [31:32<00:30,  8.18it/s]

tensor(1.4883, grad_fn=<NllLossBackward0>)
tensor(1.4544, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17177/17426 [31:32<00:30,  8.12it/s]

tensor(1.4922, grad_fn=<NllLossBackward0>)
tensor(1.5086, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17179/17426 [31:32<00:29,  8.36it/s]

tensor(1.5017, grad_fn=<NllLossBackward0>)
tensor(1.4885, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17181/17426 [31:33<00:29,  8.40it/s]

tensor(1.4691, grad_fn=<NllLossBackward0>)
tensor(1.5104, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17183/17426 [31:33<00:30,  8.01it/s]

tensor(1.5010, grad_fn=<NllLossBackward0>)
tensor(1.4816, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17185/17426 [31:33<00:31,  7.67it/s]

tensor(1.4986, grad_fn=<NllLossBackward0>)
tensor(1.5308, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17187/17426 [31:33<00:31,  7.66it/s]

tensor(1.4741, grad_fn=<NllLossBackward0>)
tensor(1.4569, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17189/17426 [31:34<00:30,  7.75it/s]

tensor(1.5119, grad_fn=<NllLossBackward0>)
tensor(1.5221, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17191/17426 [31:34<00:29,  7.88it/s]

tensor(1.5107, grad_fn=<NllLossBackward0>)
tensor(1.4987, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17193/17426 [31:34<00:29,  8.02it/s]

tensor(1.5285, grad_fn=<NllLossBackward0>)
tensor(1.4623, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17195/17426 [31:35<00:28,  8.05it/s]

tensor(1.4766, grad_fn=<NllLossBackward0>)
tensor(1.4809, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17197/17426 [31:35<00:27,  8.26it/s]

tensor(1.4840, grad_fn=<NllLossBackward0>)
tensor(1.4563, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17199/17426 [31:35<00:27,  8.16it/s]

tensor(1.4436, grad_fn=<NllLossBackward0>)
tensor(1.4760, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17201/17426 [31:35<00:26,  8.51it/s]

tensor(1.4848, grad_fn=<NllLossBackward0>)
tensor(1.4584, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17203/17426 [31:35<00:27,  8.12it/s]

tensor(1.4710, grad_fn=<NllLossBackward0>)
tensor(1.5128, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17205/17426 [31:36<00:27,  8.02it/s]

tensor(1.5178, grad_fn=<NllLossBackward0>)
tensor(1.4752, grad_fn=<NllLossBackward0>)


 99%|█████████▊| 17207/17426 [31:36<00:29,  7.44it/s]

tensor(1.5154, grad_fn=<NllLossBackward0>)
tensor(1.4861, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17209/17426 [31:36<00:29,  7.31it/s]

tensor(1.4363, grad_fn=<NllLossBackward0>)
tensor(1.4408, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17211/17426 [31:37<00:29,  7.18it/s]

tensor(1.4874, grad_fn=<NllLossBackward0>)
tensor(1.4857, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17213/17426 [31:37<00:30,  7.06it/s]

tensor(1.4830, grad_fn=<NllLossBackward0>)
tensor(1.4904, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17215/17426 [31:37<00:30,  6.85it/s]

tensor(1.4814, grad_fn=<NllLossBackward0>)
tensor(1.4835, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17217/17426 [31:37<00:29,  7.00it/s]

tensor(1.4995, grad_fn=<NllLossBackward0>)
tensor(1.4484, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17219/17426 [31:38<00:31,  6.61it/s]

tensor(1.4799, grad_fn=<NllLossBackward0>)
tensor(1.4719, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17221/17426 [31:38<00:28,  7.28it/s]

tensor(1.4955, grad_fn=<NllLossBackward0>)
tensor(1.5164, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17223/17426 [31:38<00:25,  7.90it/s]

tensor(1.4968, grad_fn=<NllLossBackward0>)
tensor(1.4810, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17225/17426 [31:38<00:24,  8.22it/s]

tensor(1.5346, grad_fn=<NllLossBackward0>)
tensor(1.4688, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17227/17426 [31:39<00:25,  7.86it/s]

tensor(1.4846, grad_fn=<NllLossBackward0>)
tensor(1.4591, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17229/17426 [31:39<00:24,  8.20it/s]

tensor(1.4815, grad_fn=<NllLossBackward0>)
tensor(1.4552, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17231/17426 [31:39<00:23,  8.31it/s]

tensor(1.4367, grad_fn=<NllLossBackward0>)
tensor(1.4930, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17233/17426 [31:39<00:22,  8.52it/s]

tensor(1.4417, grad_fn=<NllLossBackward0>)
tensor(1.4876, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17235/17426 [31:40<00:22,  8.50it/s]

tensor(1.4600, grad_fn=<NllLossBackward0>)
tensor(1.4514, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17237/17426 [31:40<00:23,  8.01it/s]

tensor(1.5145, grad_fn=<NllLossBackward0>)
tensor(1.4639, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17239/17426 [31:40<00:22,  8.28it/s]

tensor(1.4919, grad_fn=<NllLossBackward0>)
tensor(1.5326, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17241/17426 [31:40<00:21,  8.54it/s]

tensor(1.4792, grad_fn=<NllLossBackward0>)
tensor(1.4886, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17243/17426 [31:41<00:21,  8.50it/s]

tensor(1.4410, grad_fn=<NllLossBackward0>)
tensor(1.4652, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17245/17426 [31:41<00:22,  8.08it/s]

tensor(1.4654, grad_fn=<NllLossBackward0>)
tensor(1.5038, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17247/17426 [31:41<00:21,  8.20it/s]

tensor(1.4826, grad_fn=<NllLossBackward0>)
tensor(1.4847, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17249/17426 [31:41<00:21,  8.42it/s]

tensor(1.4776, grad_fn=<NllLossBackward0>)
tensor(1.4638, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17251/17426 [31:42<00:20,  8.40it/s]

tensor(1.4949, grad_fn=<NllLossBackward0>)
tensor(1.4888, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17253/17426 [31:42<00:20,  8.33it/s]

tensor(1.4909, grad_fn=<NllLossBackward0>)
tensor(1.4588, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17255/17426 [31:42<00:21,  7.84it/s]

tensor(1.4918, grad_fn=<NllLossBackward0>)
tensor(1.5035, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17257/17426 [31:42<00:20,  8.21it/s]

tensor(1.5146, grad_fn=<NllLossBackward0>)
tensor(1.4457, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17259/17426 [31:43<00:20,  8.29it/s]

tensor(1.4960, grad_fn=<NllLossBackward0>)
tensor(1.4776, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17261/17426 [31:43<00:19,  8.26it/s]

tensor(1.4866, grad_fn=<NllLossBackward0>)
tensor(1.5084, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17263/17426 [31:43<00:19,  8.21it/s]

tensor(1.5046, grad_fn=<NllLossBackward0>)
tensor(1.4422, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17265/17426 [31:43<00:19,  8.38it/s]

tensor(1.5153, grad_fn=<NllLossBackward0>)
tensor(1.5014, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17267/17426 [31:44<00:18,  8.56it/s]

tensor(1.4493, grad_fn=<NllLossBackward0>)
tensor(1.5181, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17269/17426 [31:44<00:18,  8.43it/s]

tensor(1.4762, grad_fn=<NllLossBackward0>)
tensor(1.4525, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17271/17426 [31:44<00:19,  7.97it/s]

tensor(1.4125, grad_fn=<NllLossBackward0>)
tensor(1.4712, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17273/17426 [31:44<00:18,  8.11it/s]

tensor(1.4709, grad_fn=<NllLossBackward0>)
tensor(1.5074, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17275/17426 [31:45<00:17,  8.44it/s]

tensor(1.4671, grad_fn=<NllLossBackward0>)
tensor(1.4796, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17277/17426 [31:45<00:17,  8.43it/s]

tensor(1.5138, grad_fn=<NllLossBackward0>)
tensor(1.4603, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17279/17426 [31:45<00:17,  8.54it/s]

tensor(1.4459, grad_fn=<NllLossBackward0>)
tensor(1.4938, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17281/17426 [31:45<00:17,  8.16it/s]

tensor(1.5204, grad_fn=<NllLossBackward0>)
tensor(1.5082, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17283/17426 [31:45<00:16,  8.48it/s]

tensor(1.4373, grad_fn=<NllLossBackward0>)
tensor(1.4917, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17285/17426 [31:46<00:16,  8.47it/s]

tensor(1.4536, grad_fn=<NllLossBackward0>)
tensor(1.4550, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17287/17426 [31:46<00:16,  8.38it/s]

tensor(1.4283, grad_fn=<NllLossBackward0>)
tensor(1.5208, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17289/17426 [31:46<00:17,  8.03it/s]

tensor(1.4580, grad_fn=<NllLossBackward0>)
tensor(1.5000, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17291/17426 [31:46<00:16,  8.27it/s]

tensor(1.5172, grad_fn=<NllLossBackward0>)
tensor(1.4325, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17293/17426 [31:47<00:15,  8.38it/s]

tensor(1.4458, grad_fn=<NllLossBackward0>)
tensor(1.4841, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17295/17426 [31:47<00:15,  8.44it/s]

tensor(1.4906, grad_fn=<NllLossBackward0>)
tensor(1.4535, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17297/17426 [31:47<00:15,  8.49it/s]

tensor(1.4888, grad_fn=<NllLossBackward0>)
tensor(1.5073, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17299/17426 [31:47<00:15,  8.26it/s]

tensor(1.4750, grad_fn=<NllLossBackward0>)
tensor(1.4660, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17301/17426 [31:48<00:15,  8.24it/s]

tensor(1.4711, grad_fn=<NllLossBackward0>)
tensor(1.4324, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17303/17426 [31:48<00:15,  8.07it/s]

tensor(1.4681, grad_fn=<NllLossBackward0>)
tensor(1.4954, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17305/17426 [31:48<00:15,  7.73it/s]

tensor(1.4660, grad_fn=<NllLossBackward0>)
tensor(1.4887, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17307/17426 [31:48<00:15,  7.70it/s]

tensor(1.4860, grad_fn=<NllLossBackward0>)
tensor(1.5187, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17309/17426 [31:49<00:16,  7.11it/s]

tensor(1.5170, grad_fn=<NllLossBackward0>)
tensor(1.4815, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17311/17426 [31:49<00:15,  7.23it/s]

tensor(1.4523, grad_fn=<NllLossBackward0>)
tensor(1.4859, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17313/17426 [31:49<00:14,  7.58it/s]

tensor(1.4615, grad_fn=<NllLossBackward0>)
tensor(1.4701, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17315/17426 [31:50<00:14,  7.83it/s]

tensor(1.4348, grad_fn=<NllLossBackward0>)
tensor(1.5053, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17317/17426 [31:50<00:13,  8.01it/s]

tensor(1.4598, grad_fn=<NllLossBackward0>)
tensor(1.5340, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17319/17426 [31:50<00:13,  7.94it/s]

tensor(1.4498, grad_fn=<NllLossBackward0>)
tensor(1.4479, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17321/17426 [31:50<00:13,  7.70it/s]

tensor(1.4990, grad_fn=<NllLossBackward0>)
tensor(1.4990, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17323/17426 [31:51<00:13,  7.82it/s]

tensor(1.5158, grad_fn=<NllLossBackward0>)
tensor(1.4860, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17325/17426 [31:51<00:12,  7.78it/s]

tensor(1.4594, grad_fn=<NllLossBackward0>)
tensor(1.4741, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17327/17426 [31:51<00:12,  7.64it/s]

tensor(1.4699, grad_fn=<NllLossBackward0>)
tensor(1.4424, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17329/17426 [31:51<00:13,  7.09it/s]

tensor(1.4987, grad_fn=<NllLossBackward0>)
tensor(1.4912, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17331/17426 [31:52<00:13,  7.29it/s]

tensor(1.5112, grad_fn=<NllLossBackward0>)
tensor(1.5248, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17333/17426 [31:52<00:12,  7.59it/s]

tensor(1.4737, grad_fn=<NllLossBackward0>)
tensor(1.4346, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17335/17426 [31:52<00:13,  6.99it/s]

tensor(1.4930, grad_fn=<NllLossBackward0>)
tensor(1.4742, grad_fn=<NllLossBackward0>)


 99%|█████████▉| 17337/17426 [31:52<00:12,  7.04it/s]

tensor(1.4598, grad_fn=<NllLossBackward0>)
tensor(1.4679, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17339/17426 [31:53<00:12,  6.75it/s]

tensor(1.4816, grad_fn=<NllLossBackward0>)
tensor(1.5221, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17341/17426 [31:53<00:12,  6.95it/s]

tensor(1.4549, grad_fn=<NllLossBackward0>)
tensor(1.4511, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17343/17426 [31:53<00:10,  7.65it/s]

tensor(1.4688, grad_fn=<NllLossBackward0>)
tensor(1.4840, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17345/17426 [31:54<00:10,  7.88it/s]

tensor(1.4764, grad_fn=<NllLossBackward0>)
tensor(1.4708, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17347/17426 [31:54<00:10,  7.88it/s]

tensor(1.4971, grad_fn=<NllLossBackward0>)
tensor(1.4514, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17349/17426 [31:54<00:09,  8.09it/s]

tensor(1.4406, grad_fn=<NllLossBackward0>)
tensor(1.4795, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17351/17426 [31:54<00:09,  8.27it/s]

tensor(1.4797, grad_fn=<NllLossBackward0>)
tensor(1.4332, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17353/17426 [31:55<00:08,  8.23it/s]

tensor(1.4979, grad_fn=<NllLossBackward0>)
tensor(1.4758, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17355/17426 [31:55<00:08,  8.01it/s]

tensor(1.4758, grad_fn=<NllLossBackward0>)
tensor(1.4297, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17357/17426 [31:55<00:08,  8.14it/s]

tensor(1.4421, grad_fn=<NllLossBackward0>)
tensor(1.4839, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17359/17426 [31:55<00:07,  8.45it/s]

tensor(1.5120, grad_fn=<NllLossBackward0>)
tensor(1.4993, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17361/17426 [31:56<00:07,  8.49it/s]

tensor(1.4901, grad_fn=<NllLossBackward0>)
tensor(1.4649, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17363/17426 [31:56<00:07,  8.17it/s]

tensor(1.4961, grad_fn=<NllLossBackward0>)
tensor(1.4726, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17365/17426 [31:56<00:07,  7.93it/s]

tensor(1.4512, grad_fn=<NllLossBackward0>)
tensor(1.4753, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17367/17426 [31:56<00:07,  8.01it/s]

tensor(1.4669, grad_fn=<NllLossBackward0>)
tensor(1.4670, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17369/17426 [31:57<00:06,  8.26it/s]

tensor(1.4767, grad_fn=<NllLossBackward0>)
tensor(1.4680, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17371/17426 [31:57<00:06,  8.19it/s]

tensor(1.5121, grad_fn=<NllLossBackward0>)
tensor(1.4659, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17373/17426 [31:57<00:06,  8.20it/s]

tensor(1.4976, grad_fn=<NllLossBackward0>)
tensor(1.4652, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17375/17426 [31:57<00:06,  8.32it/s]

tensor(1.5054, grad_fn=<NllLossBackward0>)
tensor(1.5162, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17377/17426 [31:57<00:05,  8.40it/s]

tensor(1.5169, grad_fn=<NllLossBackward0>)
tensor(1.5216, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17379/17426 [31:58<00:05,  8.50it/s]

tensor(1.4617, grad_fn=<NllLossBackward0>)
tensor(1.4692, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17381/17426 [31:58<00:05,  8.05it/s]

tensor(1.4933, grad_fn=<NllLossBackward0>)
tensor(1.4254, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17383/17426 [31:58<00:05,  8.12it/s]

tensor(1.4921, grad_fn=<NllLossBackward0>)
tensor(1.4666, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17385/17426 [31:58<00:04,  8.34it/s]

tensor(1.4972, grad_fn=<NllLossBackward0>)
tensor(1.4851, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17387/17426 [31:59<00:04,  8.47it/s]

tensor(1.4591, grad_fn=<NllLossBackward0>)
tensor(1.5165, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17389/17426 [31:59<00:04,  8.11it/s]

tensor(1.4627, grad_fn=<NllLossBackward0>)
tensor(1.4693, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17391/17426 [31:59<00:04,  7.91it/s]

tensor(1.4575, grad_fn=<NllLossBackward0>)
tensor(1.4811, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17393/17426 [31:59<00:04,  8.25it/s]

tensor(1.4585, grad_fn=<NllLossBackward0>)
tensor(1.4872, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17395/17426 [32:00<00:03,  8.34it/s]

tensor(1.4628, grad_fn=<NllLossBackward0>)
tensor(1.5275, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17397/17426 [32:00<00:03,  8.39it/s]

tensor(1.4443, grad_fn=<NllLossBackward0>)
tensor(1.4808, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17399/17426 [32:00<00:03,  8.04it/s]

tensor(1.4561, grad_fn=<NllLossBackward0>)
tensor(1.5124, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17401/17426 [32:00<00:03,  8.21it/s]

tensor(1.4612, grad_fn=<NllLossBackward0>)
tensor(1.4889, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17403/17426 [32:01<00:02,  8.32it/s]

tensor(1.4543, grad_fn=<NllLossBackward0>)
tensor(1.5017, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17405/17426 [32:01<00:02,  8.43it/s]

tensor(1.4849, grad_fn=<NllLossBackward0>)
tensor(1.4711, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17407/17426 [32:01<00:02,  7.80it/s]

tensor(1.4836, grad_fn=<NllLossBackward0>)
tensor(1.4554, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17409/17426 [32:01<00:02,  7.88it/s]

tensor(1.4876, grad_fn=<NllLossBackward0>)
tensor(1.4585, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17411/17426 [32:02<00:01,  7.98it/s]

tensor(1.4571, grad_fn=<NllLossBackward0>)
tensor(1.4735, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17413/17426 [32:02<00:01,  8.17it/s]

tensor(1.5217, grad_fn=<NllLossBackward0>)
tensor(1.4872, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17415/17426 [32:02<00:01,  7.44it/s]

tensor(1.4715, grad_fn=<NllLossBackward0>)
tensor(1.4672, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17417/17426 [32:02<00:01,  7.95it/s]

tensor(1.4794, grad_fn=<NllLossBackward0>)
tensor(1.4679, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17419/17426 [32:03<00:00,  8.14it/s]

tensor(1.4694, grad_fn=<NllLossBackward0>)
tensor(1.4927, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17421/17426 [32:03<00:00,  8.32it/s]

tensor(1.5139, grad_fn=<NllLossBackward0>)
tensor(1.4849, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17423/17426 [32:03<00:00,  7.74it/s]

tensor(1.4708, grad_fn=<NllLossBackward0>)
tensor(1.5800, grad_fn=<NllLossBackward0>)


100%|█████████▉| 17425/17426 [32:03<00:00,  7.20it/s]

tensor(1.5183, grad_fn=<NllLossBackward0>)
tensor(1.5273, grad_fn=<NllLossBackward0>)


100%|██████████| 17426/17426 [32:04<00:00,  7.06it/s]

tensor(1.4895, grad_fn=<NllLossBackward0>)


100%|██████████| 17426/17426 [32:04<00:00,  9.06it/s]


In [56]:
# Greedy prediction check: index of the highest-scoring entry the model emits
# for the first sequence at position 8, to be compared against target[8].
# This is inference only, so the forward pass runs under torch.no_grad() to
# avoid building an autograd graph that is never used.
# NOTE(review): assumes model(input_sequence) is indexed as
# [sequence][position] -> scores over the vocabulary — confirm against the model.
with torch.no_grad():
    predicted_index_8 = torch.argmax(model(input_sequence)[0][8])
predicted_index_8

tensor(41)

In [55]:
# Show the target indices from position 8 onward; the leading values are the
# reference for the argmax checks on model(input_sequence)[0][8], [9] and [10].
# NOTE(review): `target` is defined in an earlier cell and is indexed here as a
# 1-D tensor — presumably the flattened targets of the last batch; confirm.
target[8:]

tensor([41, 46,  1,  ..., 56, 43,  1])

In [57]:
# Greedy prediction check: index of the highest-scoring entry the model emits
# for the first sequence at position 9, to be compared against target[9].
# This is inference only, so the forward pass runs under torch.no_grad() to
# avoid building an autograd graph that is never used.
# NOTE(review): assumes model(input_sequence) is indexed as
# [sequence][position] -> scores over the vocabulary — confirm against the model.
with torch.no_grad():
    predicted_index_9 = torch.argmax(model(input_sequence)[0][9])
predicted_index_9

tensor(46)

In [58]:
# Greedy prediction check: index of the highest-scoring entry the model emits
# for the first sequence at position 10, to be compared against target[10].
# This is inference only, so the forward pass runs under torch.no_grad() to
# avoid building an autograd graph that is never used.
# NOTE(review): assumes model(input_sequence) is indexed as
# [sequence][position] -> scores over the vocabulary — confirm against the model.
with torch.no_grad():
    predicted_index_10 = torch.argmax(model(input_sequence)[0][10])
predicted_index_10

tensor(1)