In [1]:
import pandas as pd
import numpy as np
import torch
import tqdm
import os
import random
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

2021-10-24 09:53:07.088063: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="4"
tf_device='/gpu:0'

In [3]:
tokenizer = BertTokenizer.from_pretrained("indolem/indobert-base-uncased")
config = BertConfig.from_pretrained("indolem/indobert-base-uncased")
config.num_labels = 3
model = BertForSequenceClassification.from_pretrained("indolem/indobert-base-uncased", config=config)

Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indober

In [4]:
class DocumentSentimentDataset(Dataset):
    # Static constant variable (We need to have this part to comply with IndoNLU standard)
    LABEL2INDEX = {'negative': 0, 'neutral': 1, 'positive': 2} # Label string to index
    INDEX2LABEL = {0:'negative', 1:'neutral', 2: 'positive'} # Index to label string
    NUM_LABELS = 3 # Number of label
   
    def load_dataset(self, path):
        df = pd.read_csv(path, sep='\t', header=None) # Read tsv file with pandas
        df.columns = ['text','sentiment'] # Rename the columns
        df['sentiment'] = df['sentiment'].apply(lambda lab: self.LABEL2INDEX[lab]) # Convert string label into index
        return df
   
    def __init__(self, dataset_path, tokenizer, *args, **kwargs):
        self.data = self.load_dataset(dataset_path) # Load the tsv file

        # Assign the tokenizer for tokenization
        # here we use subword tokenizer from HuggingFace
        self.tokenizer = tokenizer
    
    def __getitem__(self, index):
        data = self.data.loc[index,:] # Taking data from a specific row from Pandas
        text, sentiment = data['text'], data['sentiment'] # Take text and sentiment from the row
        subwords = self.tokenizer.encode(text) # Tokenize the text with tokenizer
        return np.array(subwords), np.array(sentiment), data['text']
   
    def __len__(self):
        return len(self.data)  # Return the length of the dataset

In [5]:
class DocumentSentimentDataLoader(DataLoader):
    def __init__(self, max_seq_len=512, *args, **kwargs):
        super(DocumentSentimentDataLoader, self).__init__(*args, **kwargs)
        self.max_seq_len = max_seq_len # Assign max limit of the sequence length
        self.collate_fn = self._collate_fn # Assign the collate_fn function with our function
       
    def _collate_fn(self, batch):
        batch_size = len(batch) # Take the batch size
        max_seq_len = max(map(lambda x: len(x[0]), batch)) # Find maximum sequence length from the batch 
        max_seq_len = min(self.max_seq_len, max_seq_len) # Compare with our defined limit
       
    # Create buffer for subword, mask, and sentiment labels, initialize all with 0
        subword_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
        mask_batch = np.zeros((batch_size, max_seq_len), dtype=np.float32)
        sentiment_batch = np.zeros((batch_size, 1), dtype=np.int64)
       
    # Fill all of the buffer
        seq_list = []
        for i, (subwords, sentiment, raw_seq) in enumerate(batch):
            subwords = subwords[:max_seq_len]
            subword_batch[i,:len(subwords)] = subwords
            mask_batch[i,:len(subwords)] = 1
            sentiment_batch[i,0] = sentiment
            
            seq_list.append(raw_seq)
           
    # Return the subword, mask, and sentiment data
        return subword_batch, mask_batch, sentiment_batch, seq_list

In [6]:
def forward_sequence_classification(model, batch_data, i2w, is_test=False, device = 'cuda', **kwargs):
    # Unpack batch data
    if len(batch_data) == 3:
        (subword_batch, mask_batch, label_batch) = batch_data
        token_type_batch = None
    elif len(batch_data) == 4:
        (subword_batch, mask_batch, token_type_batch, label_batch) = batch_data
    
    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch)
    mask_batch = torch.FloatTensor(mask_batch)
    token_type_batch = torch.LongTensor(token_type_batch) if token_type_batch is not None else None
    label_batch = torch.LongTensor(label_batch)
            
    if device == "cuda":
        subword_batch = subword_batch.cuda()
        mask_batch = mask_batch.cuda()
        token_type_batch = token_type_batch.cuda() if token_type_batch is not None else None
        label_batch = label_batch.cuda()
    
    # Forward model
    outputs = model(subword_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    print(outputs)
    loss, logits = outputs[:2]
    
    # generate prediction & label list
    list_hyp = []
    list_label = []
    hyp = torch.topk(logits, 1)[1]
    for j in range(len(hyp)):
        list_hyp.append(i2w[hyp[j].item()])
        list_label.append(i2w[label_batch[j][0].item()])
    return loss, list_hyp, list_label

In [7]:
def document_sentiment_metrics_fn(list_hyp, list_label):
    metrics = {}
    metrics["ACC"] = accuracy_score(list_label, list_hyp)
    metrics["F1"] = f1_score(list_label, list_hyp, average='macro')
    metrics["REC"] = recall_score(list_label, list_hyp, average='macro')
    metrics["PRE"] = precision_score(list_label, list_hyp, average='macro')
    return metrics

In [8]:
import itertools
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']
    
def count_param(module, trainable=False):
    if trainable:
        return sum(p.numel() for p in module.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in module.parameters())

def metrics_to_string(metric_dict):
    string_list = []
    for key, value in metric_dict.items():
        string_list.append('{}:{:.2f}'.format(key, value))
    return ' '.join(string_list)

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
set_seed(241021)

In [9]:
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'negative': 0, 'neutral': 1, 'positive': 2}
{0: 'negative', 1: 'neutral', 2: 'positive'}


In [10]:
text =  'satu satu aku sayang ibu, dua dua juga sayang ayah, tiga-tiga sayang adik kakak, satu dua tiga sayang semuanya'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
print(subwords)
logits = model(subwords)[0]
print(logits)
print(torch.topk(logits, k=1))
label = torch.topk(logits, k=1)[1].squeeze().item()
print(f'Text: {text} | Label : {i2w[label]} ({torch.nn.functional.softmax(logits, dim=-1).squeeze()[label] * 100}%)')

tensor([[   3, 1713, 1713, 2254, 5458, 2477,   16, 1854, 1854, 1614, 5458, 3095,
           16, 2139,   17, 2139, 5458, 4962, 6962,   16, 1713, 1854, 2139, 5458,
         4398,    4]])
tensor([[-0.0717, -0.1426, -0.1880]], grad_fn=<AddmmBackward0>)
torch.return_types.topk(
values=tensor([[-0.0717]], grad_fn=<TopkBackward0>),
indices=tensor([[0]]))
Text: satu satu aku sayang ibu, dua dua juga sayang ayah, tiga-tiga sayang adik kakak, satu dua tiga sayang semuanya | Label : negative (35.43781280517578%)


In [11]:
print(f'Text: {text} | Label : {i2w[label]} ({torch.nn.functional.softmax(logits, dim=-1).squeeze()[label] * 100}%)')

Text: satu satu aku sayang ibu, dua dua juga sayang ayah, tiga-tiga sayang adik kakak, satu dua tiga sayang semuanya | Label : negative (35.43781280517578%)


In [12]:
train = DocumentSentimentDataset('dataset/train_preprocess.tsv', tokenizer)
valid = DocumentSentimentDataset('dataset/valid_preprocess.tsv', tokenizer)
train_loader = DocumentSentimentDataLoader(dataset=train, max_seq_len=100, batch_size=32, num_workers=1, shuffle=True)
valid_loader = DocumentSentimentDataLoader(dataset=valid, max_seq_len=100, batch_size=32, num_workers=1, shuffle=True)

In [13]:
torch.cuda.empty_cache()

In [14]:
def count_param(module, trainable=True):
    if trainable:
        return sum(p.numel() for p in module.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in module.parameters())
    
count_param(model)

110560515

In [15]:
optimizer = torch.optim.Adam(model.parameters(), lr=3e-6)
model = model.cuda()

In [16]:
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm.tqdm(iter(train_loader), leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

(Epoch 1) TRAIN LOSS:1.1248 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:07,  3.33it/s]

SequenceClassifierOutput(loss=tensor(1.1248, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.6925e-01,  1.9000e-01, -4.6412e-01],
        [-1.3331e-01,  2.0073e-01,  1.3766e-01],
        [ 4.1188e-02,  3.6316e-01, -4.0002e-01],
        [ 1.1602e-01, -2.2928e-01, -8.4232e-02],
        [ 3.6628e-01,  2.1939e-03, -7.7200e-02],
        [-9.3783e-02,  4.2242e-01, -8.1898e-02],
        [ 2.8314e-01,  3.9384e-01, -5.4406e-02],
        [ 2.4333e-01, -6.2231e-02,  5.0183e-02],
        [ 8.3639e-02,  5.0924e-02, -3.3784e-03],
        [ 3.6789e-01, -9.4257e-02, -3.1544e-01],
        [ 2.6537e-01, -2.2608e-01,  3.0173e-01],
        [ 4.1376e-01, -2.0723e-02, -9.7583e-02],
        [ 1.6983e-01, -1.7244e-01, -8.5138e-02],
        [-3.8402e-02, -7.5647e-02,  1.7384e-01],
        [ 2.5068e-01,  1.9365e-01, -2.4366e-01],
        [ 2.1272e-01,  6.6451e-02,  2.3712e-01],
        [-1.4738e-01, -1.5499e-01, -1.8508e-01],
        [-2.6014e-01,  7.0196e-04, -3.5519e-01],
        [ 1.4630e-02

(Epoch 1) TRAIN LOSS:1.1244 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:08,  2.79it/s]

SequenceClassifierOutput(loss=tensor(1.0610, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 3.3358e-01,  2.7102e-02,  1.7650e-01],
        [-1.6536e-01, -2.9028e-01, -1.5298e-01],
        [-1.3705e-01, -3.3812e-01,  1.9219e-01],
        [ 8.6969e-02,  5.7553e-02, -2.3976e-01],
        [-2.4553e-01, -1.2515e-04, -1.9184e-01],
        [-2.2806e-01, -1.2971e-01,  1.4047e-01],
        [-4.4066e-01,  2.6273e-02,  6.6142e-02],
        [ 5.9680e-02, -1.1481e-01,  7.5484e-02],
        [-1.4658e-01,  9.3736e-02, -1.7792e-01],
        [ 2.8832e-02, -1.0281e-01,  3.6410e-01],
        [ 2.2029e-01,  6.0928e-02,  7.9498e-02],
        [ 2.3717e-01, -1.0097e-01, -4.8112e-01],
        [ 9.6440e-02, -1.7729e-01,  7.1182e-02],
        [-9.4717e-03, -2.3322e-02, -1.3896e-01],
        [ 3.6283e-01, -2.9003e-01, -6.5457e-03],
        [-2.1048e-01, -3.0933e-02, -7.2701e-02],
        [ 3.3075e-01, -5.3597e-02,  5.7673e-02],
        [ 9.8947e-02, -6.1667e-02, -1.4133e-01],
        [ 3.1645e-01

(Epoch 1) TRAIN LOSS:1.1033 LR:0.00000300:  12%|████▉                                    | 3/25 [00:01<00:08,  2.61it/s]

SequenceClassifierOutput(loss=tensor(1.0615, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1340, -0.5509,  0.3776],
        [-0.0694, -0.6235,  0.1631],
        [ 0.5687,  0.0137,  0.0080],
        [-0.4053, -0.2414,  0.1331],
        [ 0.3528, -0.4841, -0.2105],
        [ 0.4094, -0.0494,  0.2016],
        [ 0.4795, -0.5914,  0.4766],
        [ 0.0547, -0.0368,  0.1275],
        [ 0.0724, -0.4891,  0.5190],
        [ 0.0820, -0.2980, -0.0596],
        [-0.0238, -0.2505,  0.2299],
        [ 0.4447, -0.0023,  0.4011],
        [-0.2058, -0.3361,  0.3087],
        [-0.2067, -0.3249, -0.1905],
        [ 0.2569, -0.3917, -0.0438],
        [-0.1258, -0.5785,  0.5227],
        [-0.3510, -0.4580,  0.2713],
        [-0.2333, -0.4424,  0.5394],
        [ 0.0653, -0.3911,  0.2993],
        [-0.0888,  0.1530,  0.0632],
        [ 0.1534, -0.2568,  0.0868],
        [ 0.0470, -0.2686,  0.4769],
        [-0.0308, -0.2131, -0.0741],
        [-0.4789, -0.4295,  0.2014],
        [-0.13

(Epoch 1) TRAIN LOSS:1.0928 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:01<00:08,  2.41it/s]

SequenceClassifierOutput(loss=tensor(1.0372, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1493, -0.6989,  0.3508],
        [-0.0971, -0.0396,  0.0840],
        [ 0.0215, -0.3114,  0.0156],
        [-0.1057, -0.5323,  0.2386],
        [-0.1444, -0.3982,  0.3872],
        [-0.0866, -0.4815,  0.1459],
        [ 0.0849, -0.2159,  0.1663],
        [ 0.0723, -0.2304,  0.1425],
        [-0.0508, -0.0150,  0.0508],
        [-0.1079, -0.1526,  0.3381],
        [-0.2642, -0.3151,  0.2060],
        [ 0.1091, -0.0841,  0.4527],
        [ 0.3440,  0.0392,  0.2497],
        [ 0.2210,  0.0796,  0.0032],
        [-0.0499, -0.0354, -0.1896],
        [-0.0130, -0.1318,  0.2586],
        [ 0.0374, -0.2186,  0.1086],
        [ 0.1585, -0.1679,  0.0833],
        [ 0.3560, -0.3254,  0.2173],
        [-0.0426, -0.5343,  0.1369],
        [ 0.0199, -0.5955, -0.0829],
        [-0.0822,  0.0454,  0.2163],
        [ 0.1003, -0.2734,  0.5006],
        [ 0.0669, -0.2741,  0.0727],
        [ 0.09

(Epoch 1) TRAIN LOSS:1.0817 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:08,  2.46it/s]

SequenceClassifierOutput(loss=tensor(1.0365, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0667, -0.1287,  0.0376],
        [ 0.0999, -0.3311,  0.1549],
        [-0.2925, -0.2376,  0.2684],
        [ 0.0451, -0.0779, -0.1364],
        [-0.0611,  0.0108,  0.2624],
        [ 0.1488, -0.1283,  0.0836],
        [-0.2128, -0.3336,  0.3780],
        [ 0.1999, -0.1691, -0.2566],
        [ 0.1041, -0.1666,  0.1574],
        [-0.0218, -0.2759,  0.2302],
        [ 0.1763,  0.0167,  0.2072],
        [ 0.0189, -0.3799,  0.1364],
        [ 0.3692, -0.1574,  0.4087],
        [ 0.2367, -0.5093,  0.6395],
        [ 0.1913, -0.0918,  0.4638],
        [ 0.2617, -0.6148,  0.2005],
        [-0.0523, -0.1130,  0.0901],
        [-0.2293,  0.0314, -0.1460],
        [ 0.0047, -0.3441,  0.0410],
        [-0.0805,  0.1155,  0.1840],
        [ 0.3428, -0.1029,  0.1371],
        [ 0.0398, -0.1462,  0.1763],
        [-0.0709, -0.4675,  0.2537],
        [ 0.0277, -0.4445,  0.2619],
        [ 0.03

(Epoch 1) TRAIN LOSS:1.0742 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:02<00:07,  2.50it/s]

SequenceClassifierOutput(loss=tensor(0.9702, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0222, -0.2227,  0.2113],
        [ 0.0096, -0.1628, -0.0314],
        [ 0.1040, -0.0338, -0.1019],
        [ 0.0868, -0.2884,  0.1040],
        [-0.0271, -0.0932,  0.3653],
        [ 0.2423, -0.3177,  0.2804],
        [ 0.1364, -0.4017,  0.2334],
        [-0.1969, -0.3921, -0.0836],
        [-0.2177, -0.0127,  0.0388],
        [ 0.3047, -0.2319,  0.1149],
        [ 0.0889, -0.4452,  0.2171],
        [-0.2579, -0.5966,  0.1794],
        [ 0.2408, -0.3814,  0.1841],
        [ 0.0130, -0.2900,  0.1651],
        [ 0.1286, -0.3962,  0.2196],
        [ 0.2029, -0.1170,  0.3518],
        [-0.0498, -0.0455,  0.2099],
        [ 0.2243,  0.1677, -0.0605],
        [-0.0816, -0.2837,  0.5820],
        [ 0.2512, -0.2328,  0.1844],
        [-0.0056, -0.3039,  0.5057],
        [ 0.2966, -0.3696,  0.2931],
        [ 0.1467, -0.5735,  0.1385],
        [ 0.1041, -0.2432,  0.3079],
        [ 0.03

(Epoch 1) TRAIN LOSS:1.0593 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:02<00:07,  2.52it/s]

SequenceClassifierOutput(loss=tensor(0.9969, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 7.9393e-02, -9.7987e-02,  2.4586e-01],
        [-5.7275e-02, -2.4713e-01,  1.2641e-01],
        [-2.4816e-01, -6.5503e-01,  2.5040e-01],
        [-4.5414e-01, -3.4971e-01,  1.6484e-01],
        [-3.2194e-02, -1.3143e-01,  3.9579e-01],
        [-2.1518e-01, -1.6353e-01,  2.9668e-01],
        [ 6.2327e-02, -2.3232e-01,  2.0404e-01],
        [ 1.0601e-02, -1.8525e-01, -3.7496e-01],
        [-3.2031e-02, -1.6302e-01, -3.8806e-02],
        [-1.0071e-01, -1.4529e-01,  1.2113e-04],
        [-2.5240e-01, -3.5174e-01,  1.6851e-01],
        [ 4.8148e-03, -7.1397e-02,  1.3092e-02],
        [-2.2161e-01, -2.5575e-01,  1.1327e-02],
        [ 7.1641e-02, -2.1320e-02,  2.0522e-01],
        [ 7.0831e-02, -7.1277e-01,  4.1497e-01],
        [-2.0854e-01, -5.6149e-01,  1.6750e-01],
        [ 3.2728e-02, -2.2744e-02,  3.4253e-01],
        [-1.3173e-01, -1.8153e-01,  4.0255e-01],
        [-1.6737e-01

(Epoch 1) TRAIN LOSS:1.0515 LR:0.00000300:  32%|█████████████                            | 8/25 [00:03<00:06,  2.58it/s]

SequenceClassifierOutput(loss=tensor(0.9002, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2016, -0.4135,  0.2004],
        [ 0.0547, -0.0815,  0.6142],
        [ 0.1484, -0.4244,  0.2624],
        [ 0.2684, -0.6842,  0.2830],
        [ 0.0296, -0.2577,  0.6357],
        [-0.0600, -0.4174, -0.1249],
        [ 0.0382,  0.1387,  0.3408],
        [-0.2638, -0.4285,  0.2095],
        [ 0.0949, -0.6501,  0.4609],
        [-0.0784, -0.5600, -0.0347],
        [-0.0195, -0.3495,  0.4478],
        [-0.2140, -0.4175,  0.2795],
        [ 0.0196, -0.2563,  0.0139],
        [-0.1589, -0.6613,  0.7786],
        [-0.0683, -0.3798,  0.2919],
        [-0.2344,  0.0354,  0.2088],
        [ 0.0272, -0.5480,  0.2825],
        [ 0.2917, -0.3479,  0.6506],
        [-0.0819, -0.2094,  0.3546],
        [-0.2368, -0.5442,  0.1293],
        [ 0.0602, -0.3466,  0.2831],
        [ 0.0912, -0.2863,  0.2597],
        [ 0.1390, -0.2674,  0.4325],
        [-0.1569, -0.1676,  0.4193],
        [-0.25

(Epoch 1) TRAIN LOSS:1.0347 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:03<00:06,  2.60it/s]

SequenceClassifierOutput(loss=tensor(0.9324, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3963, -0.5222,  0.3155],
        [ 0.2065, -0.4531,  0.6299],
        [ 0.3739, -0.4688,  0.1563],
        [ 0.1123,  0.0737,  0.2484],
        [-0.2079, -0.0664,  0.3209],
        [-0.1542, -0.6695,  0.3463],
        [ 0.1677, -0.2735,  0.5586],
        [-0.1993, -0.2785,  0.3509],
        [-0.1477, -0.5803,  0.6436],
        [-0.0484, -0.4623,  0.2004],
        [ 0.3414,  0.0339,  0.4165],
        [-0.0751, -0.6740,  0.5520],
        [-0.0626, -0.6240,  0.4403],
        [ 0.1521, -0.6802,  0.1305],
        [-0.0605, -0.6274,  0.3598],
        [-0.0013, -0.6324,  0.4078],
        [ 0.3415, -0.7303,  0.5804],
        [-0.2959, -0.6862,  0.4366],
        [ 0.1570, -0.4957,  0.0550],
        [-0.1901, -0.3536,  0.1790],
        [-0.0957, -0.5148,  0.2876],
        [-0.1523, -0.7670,  0.3344],
        [ 0.0273, -0.6518,  0.0783],
        [-0.2589, -0.2921,  0.2518],
        [-0.28

(Epoch 1) TRAIN LOSS:1.0245 LR:0.00000300:  40%|████████████████                        | 10/25 [00:03<00:05,  2.52it/s]

SequenceClassifierOutput(loss=tensor(0.9166, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0306, -0.6399,  0.3528],
        [ 0.1832, -0.7018,  0.2969],
        [-0.0793, -0.2859,  0.6106],
        [-0.1038, -0.0375,  0.3993],
        [-0.1286, -0.4076,  0.6667],
        [ 0.3442, -0.4900,  0.4649],
        [ 0.1167, -0.9258,  0.3196],
        [-0.0804, -0.4257,  0.0058],
        [-0.1057, -0.8058,  0.1649],
        [ 0.1470, -0.4924,  0.1195],
        [-0.0261, -0.2380,  0.3106],
        [ 0.5308, -0.5494,  0.5333],
        [-0.2835, -0.2392,  0.1654],
        [ 0.0284, -0.2479,  0.5108],
        [-0.6869,  0.2942,  0.3682],
        [ 0.4942, -0.6269,  0.4972],
        [ 0.0867, -0.3611,  0.5693],
        [-0.1904, -0.3808,  0.4595],
        [-0.0365, -0.7043,  0.7587],
        [ 0.1986, -0.4265,  0.1028],
        [-0.1341, -0.3920,  0.7128],
        [ 0.0945, -0.5226,  0.4172],
        [ 0.1335, -0.6054,  0.7292],
        [-0.1767, -0.4574,  0.4550],
        [ 0.24

(Epoch 1) TRAIN LOSS:1.0147 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:04<00:05,  2.53it/s]

SequenceClassifierOutput(loss=tensor(0.9021, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0846, -0.7251,  0.5418],
        [ 0.0031, -0.5051, -0.0968],
        [ 0.0818, -0.1684,  0.5874],
        [ 0.4047, -0.3564,  0.6372],
        [ 0.0841, -0.8040,  0.4113],
        [ 0.2465, -0.5708,  0.4717],
        [ 0.3120, -0.1966,  0.0911],
        [-0.1031, -0.4685,  0.6242],
        [ 0.2910, -0.5022,  0.5820],
        [ 0.0840, -0.7423,  0.8136],
        [-0.2388, -0.6925,  0.3818],
        [ 0.0704, -0.7290,  0.5985],
        [ 0.0054, -0.4618,  0.4556],
        [-0.0764, -0.6533,  0.1922],
        [-0.0145, -0.4946,  0.6377],
        [ 0.2504, -0.4083,  0.4510],
        [-0.2310, -0.4149,  0.2688],
        [-0.0633, -0.7442,  0.6351],
        [-0.2014, -0.4025,  0.5354],
        [ 0.0788, -0.3669,  0.4509],
        [-0.2864, -0.5872, -0.0945],
        [ 0.0895, -0.4694,  0.7252],
        [ 0.1168, -0.4552,  0.1592],
        [-0.0743, -0.1858,  0.3195],
        [-0.05

(Epoch 1) TRAIN LOSS:1.0053 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:04<00:05,  2.52it/s]

SequenceClassifierOutput(loss=tensor(0.9332, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0415, -0.6027,  0.6417],
        [-0.1980, -0.4362,  0.4287],
        [ 0.3696, -0.4611,  0.5162],
        [ 0.2274, -0.5220,  0.2358],
        [-0.0764, -0.8370,  0.3601],
        [-0.2669, -0.2173,  0.3753],
        [-0.1100, -0.7778,  0.5228],
        [-0.2142, -0.2869,  0.4333],
        [ 0.1425, -0.5088,  0.9641],
        [ 0.0171, -0.3709,  0.3535],
        [-0.1190, -0.5203,  0.4814],
        [-0.0508, -0.4496,  0.3007],
        [-0.4326, -0.7226,  0.5245],
        [-0.2371, -0.4252,  0.8325],
        [ 0.0515, -0.4939,  0.5270],
        [-0.0174, -0.5728,  0.3409],
        [ 0.1288, -0.5749,  0.6537],
        [-0.0431, -0.3727,  0.2271],
        [-0.3141, -0.4360,  0.4376],
        [ 0.2178, -0.4167,  0.7684],
        [-0.0111, -0.2785,  0.4638],
        [ 0.1730, -0.5005,  0.5179],
        [-0.2116,  0.0127,  0.1443],
        [-0.1392, -1.1283,  0.5485],
        [ 0.03

(Epoch 1) TRAIN LOSS:0.9909 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:05<00:03,  2.77it/s]

SequenceClassifierOutput(loss=tensor(0.8757, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1142, -0.7027,  0.6708],
        [ 0.0698, -0.5978,  0.7303],
        [ 0.0481, -0.4122,  1.1504],
        [ 0.2226, -0.2647,  0.3410],
        [ 0.2492, -0.3842,  0.7737],
        [-0.0282, -0.8701,  0.5156],
        [-0.2132, -0.5232,  0.3349],
        [ 0.2309, -0.3823,  0.3188],
        [ 0.4349, -0.3952,  0.7973],
        [-0.0216, -0.8941,  0.4774],
        [-0.0296, -0.4920,  0.7232],
        [-0.2689, -0.5302,  0.5977],
        [ 0.0271, -0.2440,  0.5131],
        [ 0.1858, -0.0746,  0.7798],
        [-0.1045, -0.3827,  0.2579],
        [ 0.1493, -0.2731,  0.3004],
        [-0.1409, -0.2613,  0.4228],
        [-0.1956, -0.6479,  0.6974],
        [ 0.0857, -0.3690,  0.3855],
        [-0.4522, -0.4030,  0.7865],
        [ 0.0787, -0.6925,  0.9883],
        [-0.1011, -0.2046,  0.3528],
        [ 0.0189, -0.6803,  0.5599],
        [ 0.3292, -0.9006,  0.7602],
        [-0.07

(Epoch 1) TRAIN LOSS:0.9796 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:05<00:03,  3.21it/s]

SequenceClassifierOutput(loss=tensor(0.8224, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.8648e-02, -6.7016e-01,  9.2199e-01],
        [ 8.9469e-02, -6.0849e-01,  4.5248e-01],
        [-2.2142e-01, -4.5064e-01,  9.5107e-01],
        [-1.6129e-01, -6.7572e-01,  7.2915e-01],
        [ 3.0077e-01, -7.0519e-01,  4.1560e-01],
        [ 2.4125e-01, -8.5453e-02,  2.2386e-01],
        [ 1.4607e-01, -5.2336e-01,  2.5771e-01],
        [ 3.5518e-01, -3.6471e-01,  5.9907e-01],
        [-1.2070e-01, -3.3562e-01,  5.7717e-01],
        [-2.0808e-01, -5.3816e-01,  6.3059e-01],
        [-3.9521e-02, -6.7447e-01,  2.5797e-01],
        [ 3.6892e-01, -5.3289e-01,  5.1673e-01],
        [-1.3885e-01, -4.6798e-01,  8.9434e-01],
        [ 2.1320e-02, -8.5664e-01,  7.6790e-01],
        [-1.8935e-01, -2.3584e-01,  4.7885e-01],
        [ 2.7568e-02, -6.6484e-01,  3.4655e-01],
        [-1.2317e-02, -4.6198e-01,  8.7088e-01],
        [ 3.7116e-02, -5.7991e-01,  5.6252e-01],
        [-3.3225e-01

(Epoch 1) TRAIN LOSS:0.9817 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:05<00:02,  3.55it/s]

SequenceClassifierOutput(loss=tensor(1.0124, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-6.5656e-02, -6.6036e-01,  5.2107e-01],
        [ 8.5843e-02, -4.0896e-01,  2.6328e-01],
        [ 2.6714e-01, -8.7140e-02,  1.8863e-01],
        [ 1.5771e-01, -6.6834e-01,  4.4659e-01],
        [-1.8599e-01, -4.5941e-01,  5.3161e-01],
        [-7.0813e-02, -7.5072e-01,  2.7333e-01],
        [ 1.2279e-01, -6.9127e-01,  9.8342e-01],
        [ 2.4457e-01, -6.1196e-01,  3.1311e-01],
        [ 8.0251e-02, -6.3897e-01,  3.2901e-01],
        [-2.5606e-01, -7.2804e-01,  5.3054e-01],
        [-1.1152e-01, -4.1241e-01,  6.5080e-01],
        [ 4.3108e-01, -1.0459e+00,  6.0192e-01],
        [-3.9425e-01, -3.2938e-01,  7.8016e-01],
        [-2.3681e-01, -3.6838e-01,  5.1663e-01],
        [ 4.9295e-01, -4.9648e-01,  9.8322e-01],
        [ 2.6647e-01, -6.4179e-01,  5.8438e-01],
        [ 9.0721e-02, -5.4125e-01,  3.2321e-01],
        [-6.5454e-02, -4.6786e-01,  7.6955e-01],
        [-1.3051e-01

(Epoch 1) TRAIN LOSS:0.9784 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:05<00:02,  3.88it/s]

SequenceClassifierOutput(loss=tensor(0.9263, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1813, -0.3994,  0.1493],
        [-0.0935, -0.5233,  0.2611],
        [-0.0326, -0.5417,  0.9535],
        [ 0.0483, -0.1171,  0.4649],
        [-0.4558, -0.5221,  0.9158],
        [ 0.1035, -1.0802,  0.5600],
        [ 0.1901, -0.3123,  0.3322],
        [-0.1204, -0.8926,  0.7585],
        [-0.1391, -0.5241,  0.6593],
        [ 0.2183, -0.0234,  0.8859],
        [ 0.1024, -0.1639,  0.6858],
        [-0.2391, -0.5666,  0.3030],
        [ 0.1807, -0.5536,  0.6969],
        [-0.1155, -0.7157,  0.4988],
        [-0.3781, -0.4375,  0.6251],
        [ 0.0036, -1.0070,  0.6655],
        [ 0.1640, -1.0617,  0.3713],
        [ 0.4110, -0.3088,  0.9205],
        [-0.1032, -0.1886,  0.5006],
        [-0.0210, -0.6375,  0.6977],
        [-0.1555, -0.2444,  0.8272],
        [-0.1417, -0.8076,  0.9280],
        [-0.1142, -0.6713,  0.4324],
        [ 0.2160, -0.6792,  0.5045],
        [ 0.21

(Epoch 1) TRAIN LOSS:0.9834 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:06<00:01,  4.07it/s]

SequenceClassifierOutput(loss=tensor(1.0669, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0175, -0.7519,  1.1467],
        [ 0.0292, -1.1152,  0.7949],
        [-0.0588, -0.4687,  0.5534],
        [-0.1458, -0.4300,  0.4703],
        [ 0.1464, -0.9850,  0.5694],
        [-0.0892, -0.6576,  1.0174],
        [-0.0403, -0.5205,  0.8553],
        [ 0.1207, -0.5727,  0.9107],
        [ 0.0497, -0.8648,  0.6952],
        [ 0.0246, -0.1661,  0.2200],
        [-0.2581, -0.4526,  0.5135],
        [-0.0271, -0.5902,  0.5469],
        [ 0.0745, -0.5474,  0.4889],
        [-0.0989, -0.1044,  0.4537],
        [ 0.1883, -0.6639,  0.2835],
        [ 0.0896, -0.4013,  0.4431],
        [ 0.0147, -0.1434,  0.5790],
        [ 0.0679, -0.9187,  0.7031],
        [-0.0963, -0.5039,  0.4173],
        [-0.1806, -1.0145,  0.6005],
        [ 0.0030, -0.8518,  1.1681],
        [-0.1002, -0.6827,  1.1897],
        [-0.0521, -0.5946,  0.6365],
        [ 0.4418, -0.2941,  0.6963],
        [-0.24

(Epoch 1) TRAIN LOSS:0.9809 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:06<00:01,  4.27it/s]

SequenceClassifierOutput(loss=tensor(0.9369, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1475, -0.6777,  0.7068],
        [-0.2156, -0.7146,  0.8661],
        [ 0.2613, -0.6071,  0.6907],
        [ 0.0078, -0.5580,  0.7244],
        [ 0.2373, -0.6909,  0.5463],
        [ 0.0288, -0.8159,  1.0018],
        [-0.1305, -0.7856,  0.9309],
        [ 0.2106, -0.5452,  0.4343],
        [-0.1274, -0.6384,  0.4587],
        [ 0.0446, -0.5684,  0.2207],
        [-0.1441, -0.2612,  0.4499],
        [ 0.2031, -0.9887,  0.8707],
        [-0.1727, -0.6291,  0.6705],
        [ 0.1326, -1.0208,  0.6739],
        [-0.0247, -0.8737,  0.6033],
        [-0.3223, -0.8048,  0.8545],
        [-0.0063, -0.6078,  0.6252],
        [ 0.2938, -0.5873,  0.2774],
        [-0.1666, -0.6250,  1.0847],
        [-0.0430, -0.6180,  0.7827],
        [ 0.0381, -0.4871,  0.2965],
        [ 0.2482, -0.8873,  0.7857],
        [ 0.0862, -0.5559,  0.7614],
        [-0.1692, -0.6421,  0.9235],
        [-0.35

(Epoch 1) TRAIN LOSS:0.9700 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:06<00:01,  4.31it/s]

SequenceClassifierOutput(loss=tensor(0.7635, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-1.0126e-01, -4.7615e-01,  6.4448e-01],
        [ 1.2918e-01, -9.4977e-01,  9.0531e-01],
        [-1.7065e-01, -5.6584e-01,  3.2853e-01],
        [-1.0213e-02, -9.8314e-01,  9.6738e-01],
        [ 3.0964e-01, -7.9875e-01,  8.0656e-01],
        [ 2.3948e-01, -4.0519e-01,  7.7497e-01],
        [ 2.2472e-01, -8.8481e-01,  7.8148e-01],
        [-1.2846e-04, -5.8809e-01,  4.4413e-01],
        [ 8.8781e-02, -7.4617e-01,  1.0308e+00],
        [-5.8330e-02, -8.5676e-01,  1.3882e+00],
        [ 1.9730e-01, -1.0764e+00,  3.0932e-01],
        [-4.7853e-02, -4.5759e-01,  2.8254e-01],
        [-2.9463e-02, -6.3303e-01,  6.3553e-01],
        [-2.0228e-01, -8.2991e-01,  6.7451e-01],
        [-2.5075e-01, -2.1260e-01,  8.9432e-01],
        [-2.5003e-01, -7.1013e-01,  8.0725e-01],
        [-7.4495e-02, -9.4418e-01,  6.0694e-01],
        [ 1.0079e-01, -5.1695e-01,  6.8412e-01],
        [-6.5351e-02

(Epoch 1) TRAIN LOSS:0.9642 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:06<00:00,  4.43it/s]

SequenceClassifierOutput(loss=tensor(0.8478, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1111, -0.7749,  0.5282],
        [ 0.0362, -0.6698,  0.8365],
        [ 0.0671, -0.3720,  0.2022],
        [-0.2045, -0.5167,  0.6711],
        [-0.0123, -0.7486,  0.7273],
        [ 0.1546, -0.4823,  0.7538],
        [-0.1031, -0.5235,  1.1433],
        [ 0.1628, -0.6466,  0.8370],
        [ 0.0971, -0.9451,  1.1519],
        [ 0.2274, -0.7514,  1.1286],
        [ 0.1628, -1.2230,  0.7909],
        [-0.0732, -0.4958,  0.3679],
        [ 0.3587, -0.5303,  0.4468],
        [-0.4759, -0.6393,  0.4885],
        [ 0.3110, -0.6209,  0.7514],
        [ 0.1259, -0.6767,  1.0410],
        [ 0.1548, -0.5350,  0.9006],
        [ 0.2270, -0.4543,  0.5755],
        [ 0.2075, -1.1786,  0.9231],
        [-0.0072, -0.7681,  0.4082],
        [-0.1082, -0.9831,  0.6909],
        [ 0.1667, -0.8078,  0.2322],
        [ 0.1699, -0.5147,  0.7579],
        [-0.2242, -0.5411,  1.0666],
        [ 0.15

(Epoch 1) TRAIN LOSS:0.9639 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:07<00:00,  4.51it/s]

SequenceClassifierOutput(loss=tensor(0.9582, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1906, -0.6284,  0.7881],
        [ 0.2414, -0.9075,  0.7046],
        [ 0.2195,  0.0851,  0.6061],
        [-0.1698, -0.9217,  0.5380],
        [ 0.3476, -0.7425,  1.0558],
        [-0.0828, -0.6574,  0.6908],
        [-0.1532, -0.7833,  0.6784],
        [ 0.1079, -0.8592,  0.2451],
        [-0.3385, -0.7335,  0.8486],
        [-0.1847, -0.5605,  1.0569],
        [ 0.1868, -0.8518,  0.7720],
        [-0.1446, -0.8799,  0.9109],
        [-0.0507, -0.8690,  1.1429],
        [-0.0561, -0.3774,  0.5954],
        [ 0.1124, -0.6568,  0.6986],
        [-0.2282, -0.5949,  0.5425],
        [-0.2697, -0.6703,  0.1957],
        [ 0.1769, -0.9458,  1.2094],
        [ 0.0564, -0.7471,  0.8505],
        [ 0.0388, -0.2339,  0.3161],
        [ 0.0115, -0.5494,  0.7050],
        [ 0.2472, -0.0644,  0.4554],
        [-0.1494, -1.0546,  0.3427],
        [ 0.1992, -0.7906,  0.5804],
        [ 0.19

(Epoch 1) TRAIN LOSS:0.9600 LR:0.00000300:  92%|████████████████████████████████████▊   | 23/25 [00:07<00:00,  4.46it/s]

SequenceClassifierOutput(loss=tensor(0.8738, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0966, -0.9325,  0.5917],
        [-0.0083, -0.8107,  0.6669],
        [-0.0710, -0.9265,  0.7445],
        [-0.0421, -1.2009,  1.3356],
        [-0.0015, -0.7218,  0.8566],
        [ 0.2343, -0.6988,  1.0578],
        [ 0.4248, -0.3604,  0.7132],
        [ 0.3032, -0.6501,  0.7092],
        [-0.2319, -0.6621,  0.9878],
        [-0.0912, -0.5627,  0.8475],
        [ 0.2327, -0.1340,  0.4819],
        [-0.0852, -0.4447,  0.4359],
        [ 0.2031, -0.9364,  0.9172],
        [-0.5070, -0.5297,  0.8280],
        [-0.1197, -0.6349,  1.2565],
        [-0.0231, -0.8581,  0.9153],
        [ 0.3066, -0.6603,  0.9881],
        [-0.0477, -0.8443,  1.0945],
        [-0.0488, -0.6422,  0.8274],
        [ 0.0173, -0.8333,  0.9097],
        [-0.2000, -0.7317,  1.1298],
        [ 0.0801, -1.0383,  1.0741],
        [ 0.0679, -0.8831,  1.0554],
        [-0.0814, -1.0360,  0.8463],
        [-0.06

(Epoch 1) TRAIN LOSS:0.9584 LR:0.00000300:  96%|██████████████████████████████████████▍ | 24/25 [00:07<00:00,  4.55it/s]

SequenceClassifierOutput(loss=tensor(0.9201, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1817, -0.7084,  0.7431],
        [-0.1144, -0.6407,  1.2811],
        [-0.0264, -0.8520,  0.8112],
        [ 0.1851, -1.1412,  1.0010],
        [ 0.0499, -0.7380,  0.9502],
        [ 0.1091, -1.0725,  1.4149],
        [-0.3822, -0.6273,  0.6779],
        [-0.1868, -0.5445,  0.4649],
        [-0.1455, -0.7085,  1.1274],
        [ 0.0073,  0.0634,  0.4055],
        [-0.0738, -0.8954,  1.3477],
        [ 0.1789, -0.4016,  0.6285],
        [-0.1088, -0.6050,  1.0420],
        [-0.0967, -0.3660,  0.3591],
        [-0.0166, -0.7161,  0.8765],
        [-0.1621, -0.9309,  0.8155],
        [-0.0089, -0.7893,  0.7741],
        [ 0.1848, -0.3442,  0.8682],
        [ 0.2443, -0.6570,  0.6353],
        [ 0.3746, -0.1889,  0.2220],
        [-0.1030, -0.2997,  0.6890],
        [-0.0065, -0.4413,  0.7149],
        [ 0.3530, -0.5450,  0.6345],
        [-0.2478, -0.5321,  0.4992],
        [ 0.19

(Epoch 1) TRAIN LOSS:0.9577 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  4.66it/s]

SequenceClassifierOutput(loss=tensor(0.9416, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1525, -1.0648,  1.0386],
        [-0.0680, -0.9170,  1.0968],
        [-0.3841, -0.2706,  0.5594],
        [-0.0020, -0.8182,  0.9177],
        [ 0.1029, -0.8539,  0.8301],
        [ 0.0989, -1.0192,  0.4793],
        [-0.3886, -0.3726,  0.4933],
        [ 0.0671, -0.6122,  1.2697],
        [-0.0944, -0.9342,  0.9069],
        [ 0.1613, -0.5023,  0.9792],
        [-0.0214, -0.5828,  0.9330],
        [-0.2935, -0.3914,  0.1125],
        [-0.3040, -0.8037,  0.4042],
        [-0.0989, -0.6412,  0.6864],
        [ 0.0992, -0.2099,  0.2200],
        [-0.2286, -0.8483,  0.5324],
        [-0.4303, -0.6675,  0.5514],
        [-0.1782, -0.9155,  0.7959],
        [-0.0299, -0.6706,  0.7173],
        [-0.1217, -0.9560,  0.8457],
        [ 0.2989, -0.8207,  0.4650],
        [ 0.0535, -0.5303,  0.5354],
        [-0.3089, -0.7810,  1.1352],
        [ 0.1884, -0.4083,  1.0585],
        [-0.19

(Epoch 1) TRAIN LOSS:0.9577 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  3.20it/s]

(Epoch 1) TRAIN LOSS:0.9577 ACC:0.54 F1:0.31 REC:0.34 PRE:0.34 LR:0.00000300



(Epoch 2) TRAIN LOSS:0.7039 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:06,  3.89it/s]

SequenceClassifierOutput(loss=tensor(0.7039, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1592, -0.7999,  0.9578],
        [ 0.0149, -0.9191,  0.9650],
        [-0.0853, -0.8161,  1.1864],
        [ 0.1720, -0.6981,  0.7955],
        [-0.0794, -0.5787,  0.4588],
        [ 0.1329, -0.8311,  0.7978],
        [ 0.2198, -0.8174,  0.9731],
        [ 0.0364, -0.5772,  1.0846],
        [-0.0785, -1.0725,  0.7858],
        [ 0.0486, -0.7979,  0.5895],
        [-0.5730, -1.0902,  0.7295],
        [-0.0933, -0.6289,  1.0875],
        [-0.2773, -1.0191,  1.0735],
        [-0.0633, -0.5084,  0.6485],
        [ 0.2443, -0.9031,  1.0926],
        [-0.0583, -0.7818,  1.2266],
        [-0.0708, -0.8995,  0.7057],
        [ 0.2447, -0.8757,  0.8887],
        [-0.0908, -0.7672,  0.8157],
        [ 0.0797, -0.8186,  0.8219],
        [ 0.0412, -0.7753,  0.9460],
        [-0.1528, -0.5956,  0.8554],
        [-0.2130, -0.8975,  0.9808],
        [ 0.2563, -0.5503,  0.6028],
        [-0.17

(Epoch 2) TRAIN LOSS:0.7388 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:05,  4.47it/s]

SequenceClassifierOutput(loss=tensor(0.7737, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0877, -0.6930,  0.8591],
        [-0.2624, -0.9598,  1.2055],
        [ 0.1150, -0.2457,  0.0524],
        [-0.2175, -0.9836,  1.2614],
        [-0.0928, -0.7885,  0.5072],
        [ 0.0121, -1.0300,  0.9461],
        [ 0.0427, -1.1419,  1.3136],
        [ 0.1796, -0.8759,  0.6328],
        [-0.2517, -1.1903,  0.8747],
        [ 0.2695, -0.4115,  0.5780],
        [-0.1392, -1.0124,  1.2839],
        [-0.1637, -0.5965,  1.0481],
        [ 0.1304, -0.7986,  0.9268],
        [ 0.0482, -1.0282,  0.3370],
        [-0.2639, -0.5984,  0.8372],
        [ 0.1259, -0.7802,  0.5065],
        [ 0.1218, -0.6533,  1.1693],
        [ 0.1921, -0.5880,  0.4709],
        [ 0.2051, -0.4137,  0.8174],
        [ 0.2927, -0.6360,  0.3383],
        [-0.0515, -0.7687,  1.0627],
        [-0.1409, -0.4915,  1.0582],
        [ 0.3583, -0.8780,  0.9225],
        [ 0.1979, -0.2652,  0.5231],
        [-0.46

(Epoch 2) TRAIN LOSS:0.7791 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:04,  4.54it/s]

SequenceClassifierOutput(loss=tensor(0.8598, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1854, -0.9913,  0.7684],
        [ 0.3111, -0.8245,  0.9603],
        [-0.1071, -0.6187,  0.6274],
        [-0.1115, -0.9826,  1.4855],
        [-0.3831, -1.0843,  0.7512],
        [ 0.2607, -0.8475,  1.1805],
        [ 0.1872, -0.8522,  0.2137],
        [ 0.0736, -0.9340,  1.1696],
        [ 0.1887, -0.4951,  0.2805],
        [-0.0095, -0.8522,  1.2556],
        [-0.1884, -1.1059,  0.4581],
        [-0.1672, -0.6347,  0.8099],
        [ 0.0546, -0.8841,  1.1973],
        [ 0.3528, -0.6149,  0.4689],
        [ 0.0730, -0.9926,  1.2586],
        [ 0.3256, -1.0211,  1.2078],
        [ 0.1361, -0.4053,  0.5721],
        [ 0.0635, -0.7035,  0.8255],
        [-0.0350, -0.7496,  1.0948],
        [ 0.2590, -0.3401,  0.3846],
        [ 0.0485, -0.4946,  0.1024],
        [-0.2991, -1.4822,  0.6738],
        [-0.0557, -0.7434,  0.6846],
        [ 0.1605, -0.6740,  0.9809],
        [ 0.19

(Epoch 2) TRAIN LOSS:0.7928 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:00<00:04,  4.63it/s]

SequenceClassifierOutput(loss=tensor(0.8337, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2667, -0.9292,  1.1384],
        [ 0.0102, -0.6333,  1.1972],
        [-0.1371, -1.0745,  1.2027],
        [-0.1057, -1.1758,  1.0331],
        [ 0.3707, -0.8163,  0.9717],
        [ 0.0680, -0.7311,  0.2469],
        [ 0.1332, -0.3686,  0.6985],
        [-0.2888, -0.9675,  1.3273],
        [-0.0590, -1.2216,  0.8063],
        [ 0.2739, -0.5588,  0.8806],
        [ 0.0148, -1.1877,  1.3692],
        [-0.0810, -0.9107,  1.2522],
        [ 0.0328, -0.8248,  0.6787],
        [-0.2439, -1.1022,  1.0139],
        [ 0.0741, -0.5982,  0.8639],
        [ 0.1750, -0.6898,  0.7798],
        [ 0.0298, -0.6738,  0.6937],
        [-0.2105, -0.4878,  0.4183],
        [ 0.1758, -0.9512,  0.9887],
        [-0.2305, -0.4862,  0.0080],
        [-0.0482, -0.5761,  0.7483],
        [-0.1536, -0.6930,  0.6631],
        [-0.1762, -0.8862,  1.4509],
        [ 0.1658, -1.0474,  0.9394],
        [-0.22

(Epoch 2) TRAIN LOSS:0.8211 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:04,  4.73it/s]

SequenceClassifierOutput(loss=tensor(0.9345, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0046, -0.4317,  0.7966],
        [ 0.0634, -0.5394,  0.2638],
        [ 0.0869, -0.9770,  1.3569],
        [-0.0181, -0.7895,  1.0054],
        [ 0.0109, -0.8327,  1.0849],
        [ 0.2751, -0.5610,  0.2727],
        [-0.0877, -1.1207,  1.0566],
        [ 0.4621, -0.7337,  1.0626],
        [-0.0323, -1.0569,  1.4318],
        [ 0.4125, -0.4275,  0.5938],
        [ 0.4792, -0.6268,  0.5286],
        [ 0.1522, -0.6391,  0.8460],
        [ 0.4077, -0.8957,  0.4269],
        [ 0.1431, -0.8486,  0.9604],
        [-0.1963, -0.8149,  1.1442],
        [ 0.1959, -0.3986,  0.4555],
        [-0.1955, -1.0887,  1.3514],
        [ 0.2355, -0.9736,  0.7303],
        [-0.1112, -0.9156,  0.2781],
        [ 0.1508, -1.1231,  0.8335],
        [-0.1270, -0.8903,  0.9743],
        [ 0.2866,  0.1514,  0.2328],
        [ 0.0933, -0.8592,  1.2949],
        [ 0.0354, -0.9795,  0.5696],
        [ 0.09

(Epoch 2) TRAIN LOSS:0.8063 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:01<00:04,  4.71it/s]

SequenceClassifierOutput(loss=tensor(0.7320, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3521, -0.7442,  1.1025],
        [-0.0095, -0.9306,  0.9018],
        [-0.1526, -0.6442,  1.0494],
        [-0.1389, -0.8162,  1.0220],
        [-0.0356, -0.9548,  1.3152],
        [-0.2115, -1.2563,  1.3084],
        [ 0.0862, -0.1124,  0.2437],
        [-0.1764, -0.7041,  0.8586],
        [ 0.2833, -0.7206,  1.2775],
        [-0.0297, -1.0472,  1.1267],
        [-0.3685, -0.9092,  0.8978],
        [ 0.1264, -1.1413,  0.7602],
        [-0.0683, -0.8304,  0.7574],
        [-0.0740, -1.0766,  1.2606],
        [ 0.0039, -0.5197,  0.7837],
        [ 0.2413, -0.6752,  0.6923],
        [ 0.3124, -0.2027,  0.5331],
        [ 0.1175, -1.3382,  0.7812],
        [ 0.0937, -1.1937,  1.2864],
        [-0.4383, -1.0280,  0.5841],
        [-0.1183, -0.9283,  1.1077],
        [ 0.1445, -0.4876,  0.3006],
        [-0.0332, -1.1685,  1.3712],
        [-0.0742, -1.2082,  1.2343],
        [-0.19

(Epoch 2) TRAIN LOSS:0.8475 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:01<00:03,  4.71it/s]

SequenceClassifierOutput(loss=tensor(1.0946, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0176e-01, -8.0723e-01,  1.2632e+00],
        [ 2.0327e-01, -9.1339e-01,  1.3769e+00],
        [ 1.7947e-01, -1.0199e+00,  7.3266e-01],
        [ 3.0451e-03, -6.2212e-01,  3.0336e-01],
        [ 1.4360e-01, -6.5112e-01,  4.5201e-01],
        [ 5.0962e-01, -8.1016e-01,  1.0531e+00],
        [ 1.3729e-01, -7.4656e-01,  7.0792e-01],
        [ 4.3834e-01, -1.2564e+00,  1.1688e+00],
        [ 2.1686e-01, -3.0127e-02,  5.1363e-02],
        [ 2.2642e-01, -8.6005e-01,  6.9542e-01],
        [ 7.7182e-02, -2.7203e-01,  7.5045e-01],
        [-2.6672e-01, -1.0322e+00,  1.0214e+00],
        [ 5.3505e-02, -8.7808e-01,  1.3750e+00],
        [ 2.6488e-01, -8.7053e-01,  9.5814e-01],
        [-1.5178e-01, -8.6134e-01,  5.3941e-01],
        [ 1.7993e-01, -7.8811e-01,  7.9299e-01],
        [-3.0431e-04, -8.5028e-01,  1.2657e+00],
        [ 1.1067e-01, -8.3452e-01,  7.0563e-01],
        [-4.7118e-02

(Epoch 2) TRAIN LOSS:0.8497 LR:0.00000300:  32%|█████████████                            | 8/25 [00:01<00:03,  4.74it/s]

SequenceClassifierOutput(loss=tensor(0.8654, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6152, -0.5402,  0.6130],
        [ 0.0562, -0.5798,  0.3144],
        [ 0.3274, -0.7128,  0.6443],
        [ 0.0370, -0.7991,  0.6129],
        [ 0.2620, -0.8702,  1.0319],
        [-0.3500, -0.9858,  1.3103],
        [-0.1707, -0.9167,  1.1227],
        [ 0.0942, -0.8499,  1.2670],
        [ 0.4300, -0.2544,  0.2157],
        [-0.2011, -0.5379,  0.3225],
        [ 0.1216, -0.9174,  0.8455],
        [-0.1591, -0.8084,  0.8021],
        [-0.1619, -0.9330,  0.8767],
        [-0.2076, -0.7051,  0.6383],
        [ 0.0480, -0.8080,  1.1312],
        [ 0.0196, -1.1087,  0.9321],
        [ 0.0071, -0.7752,  0.8737],
        [ 0.1053, -0.7030,  0.2493],
        [ 0.0044, -1.0563,  1.3377],
        [ 0.4290, -0.0068,  0.5714],
        [-0.1376, -0.8140,  0.3172],
        [ 0.2849, -0.8523,  1.1735],
        [ 0.2101, -1.0044,  0.8823],
        [ 0.1622, -0.8600,  0.2021],
        [ 0.48

(Epoch 2) TRAIN LOSS:0.8456 LR:0.00000300:  40%|████████████████                        | 10/25 [00:02<00:03,  4.83it/s]

SequenceClassifierOutput(loss=tensor(0.8670, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-4.5760e-02, -1.5043e+00,  1.0717e+00],
        [ 7.7669e-01, -4.8818e-01,  4.4058e-01],
        [-2.4262e-01, -8.6449e-01,  1.0691e+00],
        [ 5.7549e-03, -9.7002e-01,  1.4218e+00],
        [ 4.2959e-01, -5.7071e-01,  1.0371e+00],
        [ 2.9325e-02, -7.5787e-01,  1.0130e+00],
        [ 1.0450e-01, -2.8565e-01,  1.9077e-01],
        [-8.2019e-02, -6.9507e-01,  4.6602e-01],
        [ 1.3446e-01, -1.2387e-01,  6.9971e-01],
        [ 1.3128e-01, -9.8105e-01,  1.0332e+00],
        [ 2.0683e-01, -2.1771e-01,  2.4458e-01],
        [-2.6119e-03, -7.8773e-01,  9.7055e-01],
        [-3.1458e-01, -9.9457e-01,  1.4177e+00],
        [ 2.6851e-02, -7.7641e-01,  1.1388e+00],
        [ 4.5816e-03, -5.0916e-01,  4.5685e-01],
        [-4.2355e-02, -8.1727e-01,  7.8341e-01],
        [-5.5161e-02, -9.5588e-01,  1.0758e+00],
        [ 1.0476e-01, -1.1008e+00,  1.2492e+00],
        [ 3.7620e-01

(Epoch 2) TRAIN LOSS:0.8493 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:02<00:02,  4.72it/s]

SequenceClassifierOutput(loss=tensor(0.8863, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2477, -0.3345,  0.2441],
        [ 0.0088, -0.8496,  1.2487],
        [ 0.0373, -0.8642,  0.4183],
        [ 0.1080, -1.0074,  0.6974],
        [ 0.3196, -0.8210,  0.5859],
        [ 0.5176, -0.3483,  0.3085],
        [ 0.0387, -0.3915,  0.1263],
        [-0.0948, -1.0232,  1.2533],
        [ 0.3366, -0.7016,  0.4849],
        [ 0.1814, -1.0448,  0.7600],
        [ 0.0996, -0.9767,  1.3664],
        [ 0.0944, -0.9461,  1.2905],
        [ 0.0130, -0.8597,  0.7523],
        [ 0.1128, -1.2993,  1.1264],
        [ 0.3583, -0.8039,  0.9910],
        [ 0.1827, -0.7216,  0.1866],
        [ 0.1590, -0.7996,  0.5456],
        [ 0.3457, -0.3787,  1.3621],
        [ 0.5184, -0.7858,  0.9721],
        [-0.0251, -0.9919,  0.9626],
        [ 0.1294, -1.0333,  0.4877],
        [ 0.0660, -0.7720,  1.1214],
        [-0.0825, -0.9418,  1.1023],
        [ 0.0982, -0.7541,  0.4304],
        [ 0.00

(Epoch 2) TRAIN LOSS:0.8423 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:02<00:02,  4.54it/s]

SequenceClassifierOutput(loss=tensor(0.7651, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 4.6697e-01, -9.3096e-01,  1.1157e+00],
        [ 4.4385e-02, -1.1211e+00,  8.3171e-01],
        [-3.7517e-02, -1.3544e+00,  8.8888e-01],
        [ 1.2627e-01, -1.0407e+00,  1.0679e+00],
        [ 4.3626e-02, -6.7411e-01,  1.2798e+00],
        [-1.7936e-01, -7.3593e-01,  9.4224e-01],
        [ 3.4264e-01, -2.1939e-01,  1.5757e-01],
        [ 2.6555e-02, -9.5775e-01,  1.2653e+00],
        [ 9.7455e-03, -1.2518e+00,  1.4533e+00],
        [-1.2085e-01, -1.1335e+00,  8.6772e-01],
        [ 5.2248e-01, -6.6131e-01,  3.8688e-01],
        [ 3.9611e-01, -1.2382e+00,  1.2887e+00],
        [ 2.2883e-01, -8.1186e-01,  1.0305e+00],
        [-6.8743e-05, -9.8086e-01,  1.5254e+00],
        [ 1.5081e-01, -1.0865e+00,  6.7293e-01],
        [-4.8483e-02, -1.1566e+00,  1.1095e+00],
        [-4.1225e-02, -1.0702e+00,  9.7986e-01],
        [-3.0608e-01, -1.0398e+00,  7.2309e-01],
        [ 3.0671e-01

(Epoch 2) TRAIN LOSS:0.8462 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:02<00:02,  5.02it/s]

SequenceClassifierOutput(loss=tensor(0.8929, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0748, -0.9109,  1.3235],
        [ 0.2293, -0.5705,  0.7299],
        [ 0.3791, -0.6322,  0.4943],
        [-0.1343, -0.9608,  1.3584],
        [ 0.0910, -0.5177,  0.3981],
        [ 0.0242, -0.5850,  0.6216],
        [ 0.0592, -0.8782,  1.1776],
        [-0.1656, -0.7426,  1.2020],
        [-0.0373, -0.9281,  0.9831],
        [ 0.5689, -0.4708,  0.5433],
        [ 0.1997, -0.2444,  0.2462],
        [ 0.0299, -0.9398,  0.6622],
        [ 0.3835, -0.8153,  0.4889],
        [ 0.4609, -1.0282,  1.0794],
        [-0.2253, -0.7796,  1.3117],
        [ 0.0212, -1.0697,  1.1578],
        [ 0.1310, -0.6253,  1.5314],
        [ 0.3239, -0.8129,  0.5027],
        [-0.0026, -1.0728,  0.9069],
        [-0.1705, -1.0322,  0.7877],
        [ 0.3754, -0.9816,  0.8263],
        [ 0.1925, -0.6332,  1.1335],
        [-0.0341, -0.7240,  0.4933],
        [-0.0956, -0.5998,  1.0466],
        [ 0.08

(Epoch 2) TRAIN LOSS:0.8526 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:03<00:02,  3.76it/s]

SequenceClassifierOutput(loss=tensor(0.8225, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3078, -0.4849,  0.2691],
        [ 0.0222, -0.9684,  0.8712],
        [-0.1289, -0.7857,  1.4670],
        [ 0.4083, -0.6740,  0.5201],
        [-0.0935, -1.0873,  0.7709],
        [ 0.4544, -1.0906,  1.1817],
        [ 0.2310, -1.1792,  0.8238],
        [ 0.2349, -0.5178,  0.3009],
        [ 0.0635, -1.1543,  0.4927],
        [ 0.1141, -0.7910,  1.0713],
        [ 0.1786, -0.8902,  0.5179],
        [ 0.3322, -0.6745,  0.8766],
        [ 0.3805, -1.1251,  1.4948],
        [-0.4852, -1.1850,  1.4165],
        [ 0.1964, -0.8939,  1.0807],
        [-0.1326, -1.1654,  1.0228],
        [ 0.4063, -0.5576,  0.5014],
        [ 0.4877, -0.1854,  0.3751],
        [-0.1466, -0.8432,  0.4918],
        [ 0.1337, -0.5157,  0.7449],
        [ 0.1194, -1.1470,  1.3643],
        [ 0.2110, -0.0610,  0.3054],
        [ 0.2003, -0.8093,  1.1795],
        [-0.0247, -0.9813,  0.7891],
        [ 0.02

(Epoch 2) TRAIN LOSS:0.8506 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:03<00:03,  3.29it/s]

SequenceClassifierOutput(loss=tensor(0.8360, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0438, -1.1512,  1.1811],
        [-0.3751, -0.7753,  1.2070],
        [ 0.4919, -1.0280,  1.3430],
        [ 0.5129, -0.9499,  0.8745],
        [ 0.4135, -0.8516,  0.8302],
        [ 0.3860, -0.7295,  0.7339],
        [ 0.2161, -0.8114,  0.8397],
        [ 0.2187, -1.0404,  0.7202],
        [ 0.1772, -1.4596,  1.0622],
        [ 0.2666, -0.7956,  0.4386],
        [-0.2644, -0.7720,  1.3472],
        [-0.2110, -0.8835,  1.2706],
        [ 0.1243, -0.9184,  0.5223],
        [ 0.2555, -1.0024,  0.6677],
        [-0.0682, -0.8714,  1.2335],
        [-0.2750, -0.9955,  0.9138],
        [ 0.1540, -0.2813,  0.3956],
        [-0.1888, -1.2091,  1.0832],
        [ 0.5724, -0.3134,  0.6620],
        [ 0.0330, -0.9669,  0.7195],
        [ 0.1837, -0.2432,  0.2719],
        [-0.2521, -1.2470,  1.1284],
        [-0.0065, -1.0319,  0.9162],
        [ 0.2338, -0.8545,  0.8399],
        [ 0.32

(Epoch 2) TRAIN LOSS:0.8497 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:03<00:03,  2.91it/s]

SequenceClassifierOutput(loss=tensor(0.8402, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4975, -0.9545,  0.6433],
        [-0.2931, -0.5337,  0.6551],
        [ 0.1872, -0.6929,  1.3597],
        [ 0.4425, -0.7112,  1.2482],
        [-0.2163, -1.1052,  0.8714],
        [ 0.2767, -1.3205,  0.9862],
        [ 0.1864, -0.9834,  0.8258],
        [ 0.2556, -0.9623,  1.2576],
        [ 0.0092, -0.9564,  1.0297],
        [-0.1884, -0.8576,  1.2069],
        [ 0.0769, -0.1124, -0.0393],
        [ 0.1421, -0.9237,  0.5313],
        [-0.1457, -0.9969,  1.2950],
        [-0.3279, -0.8533,  1.2190],
        [ 0.1922, -0.7410,  0.7343],
        [ 0.1770, -0.2475,  0.3326],
        [ 0.0222, -1.1052,  1.2275],
        [ 0.2551, -0.5444,  0.3817],
        [ 0.5478, -0.4616,  0.3508],
        [ 0.1267, -1.0704,  1.0569],
        [ 0.0278, -0.9406,  0.9138],
        [ 0.0372, -0.8945,  1.3828],
        [-0.2011, -0.1561,  0.3617],
        [-0.0838, -0.8743,  1.2034],
        [ 0.17

(Epoch 2) TRAIN LOSS:0.8492 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:04<00:02,  2.83it/s]

SequenceClassifierOutput(loss=tensor(0.8456, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0674, -1.1564,  1.1779],
        [ 0.4111, -1.2923,  0.6727],
        [-0.0833, -0.2386,  0.5603],
        [-0.0476, -1.3371,  1.3545],
        [-0.1939, -0.9372,  1.1663],
        [-0.0651, -1.0036,  1.1274],
        [-0.0433, -0.8824,  1.0176],
        [-0.1479, -0.7227,  0.9529],
        [ 0.4658, -0.3802,  0.4443],
        [ 0.1501, -1.0894,  0.9579],
        [ 0.2001, -0.9396,  1.5522],
        [ 0.3410, -0.8002,  0.3917],
        [-0.2585, -0.4324,  0.7537],
        [-0.1404, -0.9240,  1.2594],
        [ 0.1661, -0.7832,  0.8496],
        [ 0.4179, -1.4293,  1.2554],
        [ 0.3823, -0.8117,  1.0462],
        [ 0.1726, -0.9849,  1.3200],
        [ 0.2854, -0.9854,  1.2755],
        [-0.2430, -0.8554,  1.2437],
        [ 0.1574, -0.2669,  0.5089],
        [ 0.4123, -0.6690,  0.5312],
        [-0.4440, -0.9470,  0.7351],
        [ 0.5400, -0.4100,  0.6607],
        [ 0.17

(Epoch 2) TRAIN LOSS:0.8490 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:04<00:02,  2.65it/s]

SequenceClassifierOutput(loss=tensor(0.9012, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2755, -0.5840,  0.3449],
        [-0.3158, -0.9953,  1.0951],
        [ 0.4117, -1.0016,  0.9531],
        [ 0.2102, -0.8042,  0.7346],
        [ 0.7859, -0.9654,  0.9466],
        [ 0.1869, -0.8179,  1.3038],
        [ 0.4460, -0.9530,  0.7901],
        [ 0.7906, -0.5781,  0.5131],
        [ 0.6112, -0.5356,  0.4636],
        [-0.1475, -0.9860,  1.1460],
        [ 0.0841, -0.6708,  0.8260],
        [ 0.5502, -0.3269,  0.3085],
        [ 0.0021, -1.0817,  1.2298],
        [-0.1091, -1.2893,  1.2385],
        [-0.0561, -1.2197,  1.1971],
        [ 0.0090, -0.1124,  0.2028],
        [ 0.4566, -0.6387,  0.3251],
        [ 0.6586, -1.0864,  0.7793],
        [ 0.3831, -0.8554,  0.8498],
        [ 0.3023, -1.0869,  0.9780],
        [-0.0078, -0.8250,  1.2071],
        [ 0.3862, -1.0807,  1.0641],
        [ 0.1776, -0.4306,  0.2759],
        [-0.0021, -1.3171,  1.3407],
        [ 0.35

(Epoch 2) TRAIN LOSS:0.8517 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:05<00:02,  2.60it/s]

SequenceClassifierOutput(loss=tensor(0.7065, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4325, -1.0613,  1.1843],
        [ 0.5970, -0.1357,  0.0257],
        [ 0.2526, -0.9219,  0.7665],
        [ 0.1310, -0.9561,  0.8232],
        [-0.0494, -0.8066,  1.0536],
        [-0.1436, -1.2067,  1.5645],
        [-0.1092, -0.9877,  1.1951],
        [ 0.3662, -0.6569,  0.3993],
        [ 0.2526, -0.8962,  0.1521],
        [ 0.0201, -1.1655,  0.9154],
        [ 0.2051, -1.0757,  1.3224],
        [-0.1939, -1.0450,  1.2789],
        [ 0.1823, -0.8342,  0.9953],
        [-0.1703, -1.0106,  1.3030],
        [ 0.5915, -0.5390,  0.3694],
        [ 0.3045, -0.9281,  0.8106],
        [ 0.3986, -0.6939,  0.2312],
        [ 0.3624, -1.0404,  0.8919],
        [ 0.0998, -0.7992,  0.2085],
        [ 0.3287, -0.6894,  0.6545],
        [ 0.2902, -1.0433,  0.9595],
        [ 0.3721, -1.1146,  0.9098],
        [-0.1249, -0.5326,  0.4312],
        [ 0.2562, -0.9726,  1.0318],
        [ 0.09

(Epoch 2) TRAIN LOSS:0.8444 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:05<00:01,  2.60it/s]

SequenceClassifierOutput(loss=tensor(0.8510, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1051, -0.3685,  0.1505],
        [ 0.0100, -0.0442,  0.2851],
        [ 0.0904, -0.7676,  0.8169],
        [ 0.4916, -1.1934,  0.7157],
        [ 0.0886, -1.0784,  1.2934],
        [ 0.6036, -0.9968,  0.5650],
        [-0.2683, -0.8118,  1.1486],
        [ 0.1562, -0.7981,  0.9152],
        [ 0.3244, -1.2397,  0.8965],
        [ 0.1950, -1.3666,  0.7612],
        [ 0.0059, -0.9800,  1.2985],
        [-0.0537, -1.1807,  0.6825],
        [ 0.2570, -0.7746,  0.5829],
        [-0.1614, -0.9099,  1.4343],
        [-0.4072, -0.5619,  0.0172],
        [-0.2242, -0.9736,  1.1954],
        [ 0.3822, -0.8470,  0.7568],
        [ 0.2164, -0.9629,  0.7556],
        [ 0.5129, -0.7794,  0.5123],
        [ 0.2718, -0.6780,  1.0120],
        [ 0.0822, -0.8999,  1.1999],
        [ 0.3284, -1.0870,  0.6923],
        [-0.0137, -0.1740,  0.2687],
        [ 0.2986, -0.6777,  1.0083],
        [ 0.15

(Epoch 2) TRAIN LOSS:0.8448 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:05<00:01,  2.54it/s]

SequenceClassifierOutput(loss=tensor(0.6223, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0221, -0.5572,  1.1103],
        [ 0.0990, -1.3479,  1.2579],
        [ 0.3691, -0.9317,  0.8459],
        [-0.0356, -1.2689,  1.0656],
        [ 0.4289, -0.5875,  0.5198],
        [ 0.2539, -1.0927,  1.2153],
        [ 0.0457, -1.1542,  1.3486],
        [-0.0687, -1.2967,  1.1176],
        [ 0.1612, -0.9980,  0.9726],
        [ 0.3463, -0.4505,  0.6605],
        [ 0.0863, -0.7678,  1.2658],
        [ 0.1620, -0.9795,  1.0464],
        [ 0.4650, -0.9405,  0.8119],
        [ 0.3523, -0.6553,  0.4143],
        [ 0.0901, -0.9154,  1.4367],
        [ 0.2121, -1.1805,  1.1769],
        [-0.2109, -0.8756,  1.0905],
        [ 0.3410, -0.6066,  0.2745],
        [-0.1194, -1.0894,  1.1321],
        [ 0.1911, -0.9442,  0.5906],
        [ 0.4433, -0.5778,  0.4640],
        [ 0.1584, -0.9105,  1.4111],
        [ 0.0815, -0.8548,  1.1689],
        [-0.2904, -0.8790,  1.2693],
        [ 0.03

(Epoch 2) TRAIN LOSS:0.8346 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:06<00:01,  2.56it/s]

SequenceClassifierOutput(loss=tensor(0.7567, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5235, -0.4691,  0.2735],
        [-0.2022, -0.8725,  1.2389],
        [ 0.2764,  0.1065,  0.2590],
        [-0.1617, -0.8830,  1.2146],
        [ 0.2289, -0.8432,  0.0729],
        [ 0.1139, -0.7843,  0.7278],
        [ 0.4932, -0.3249,  0.2102],
        [ 0.1380, -0.7633,  0.8901],
        [ 0.4270, -0.8079,  1.0831],
        [ 0.3278, -0.1446,  0.3991],
        [-0.1611, -0.9756,  1.4388],
        [ 0.1835, -0.9484,  1.1058],
        [ 0.3716, -1.1248,  1.2218],
        [ 0.2768, -1.3044,  1.1797],
        [ 0.4224, -0.5450,  0.6679],
        [ 0.2246, -1.1249,  1.0694],
        [ 0.1903, -1.0047,  1.7038],
        [ 0.1415, -0.8924,  1.2654],
        [ 0.0935, -1.1187,  1.3240],
        [-0.2986, -0.9708,  1.4376],
        [ 0.4715, -0.6620,  0.6708],
        [ 0.5553, -0.1779,  0.0653],
        [-0.1344, -1.1122,  0.9666],
        [-0.0970, -0.9814,  1.3884],
        [ 0.54

(Epoch 2) TRAIN LOSS:0.8313 LR:0.00000300:  92%|████████████████████████████████████▊   | 23/25 [00:06<00:00,  2.57it/s]

SequenceClassifierOutput(loss=tensor(0.9337, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 3.2339e-01, -7.0839e-01,  3.4629e-01],
        [ 5.4264e-01, -9.6412e-01,  6.9278e-01],
        [ 1.8134e-01, -9.5148e-01,  6.7730e-01],
        [ 4.2538e-01, -1.0035e+00,  6.5619e-01],
        [ 3.3227e-02, -8.9489e-01,  8.9877e-01],
        [ 2.1303e-01, -7.1892e-01,  1.9778e-01],
        [-4.9885e-02, -1.1819e+00,  5.9463e-01],
        [ 1.9051e-01, -7.3810e-01,  2.5069e-01],
        [ 2.3751e-01, -6.3382e-01, -1.1156e-01],
        [ 1.3085e-01, -1.0402e-01,  3.9632e-01],
        [ 1.4425e-01, -9.8437e-01,  8.7930e-01],
        [ 3.6156e-04, -7.9293e-01,  8.1379e-01],
        [-9.2349e-02, -1.0328e+00,  1.2826e+00],
        [-9.6631e-02, -1.1029e+00,  1.2870e+00],
        [ 1.9741e-01, -9.8555e-01,  8.1123e-01],
        [-2.2549e-02, -1.1275e+00,  1.4293e+00],
        [ 3.3038e-01, -5.0395e-01,  1.2521e-01],
        [-3.0399e-01, -1.2574e+00,  1.5732e+00],
        [ 1.3000e-01

(Epoch 2) TRAIN LOSS:0.8326 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  3.10it/s]

SequenceClassifierOutput(loss=tensor(0.7618, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2385, -0.9433,  0.8924],
        [ 0.2505, -1.2524,  1.1816],
        [ 0.2208, -0.6909,  0.5340],
        [ 0.2181, -1.2090,  1.2164],
        [ 0.2019, -0.8690,  1.0733],
        [ 0.1142, -1.1859,  1.1021],
        [ 0.3410, -1.4427,  1.0686],
        [ 0.1322, -0.8554,  1.2189],
        [ 0.4192, -0.8475,  0.8376],
        [ 0.5038, -0.6000,  0.2535],
        [ 0.3406, -0.8370,  0.3082],
        [ 0.1733, -0.7748,  1.1350],
        [ 0.4742, -0.9012,  0.5266],
        [ 0.1851, -1.2110,  0.9930],
        [ 0.4484, -0.6431,  0.0319],
        [ 0.4000, -1.2496,  0.9858],
        [ 0.1496, -0.5018,  0.4164],
        [ 0.0369, -0.9024,  1.1027],
        [ 0.0049, -0.8853,  0.8515],
        [ 0.1377, -1.1039,  0.7810],
        [-0.0960, -1.0959,  1.3320],
        [ 0.3733, -0.7708,  0.5167],
        [ 0.4451, -0.9657,  0.3067],
        [ 0.7508, -0.7235,  0.0528],
        [ 0.55

(Epoch 2) TRAIN LOSS:0.8326 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  3.37it/s]

(Epoch 2) TRAIN LOSS:0.8326 ACC:0.61 F1:0.33 REC:0.37 PRE:0.40 LR:0.00000300



  _warn_prf(average, modifier, msg_start, len(result))
(Epoch 3) TRAIN LOSS:0.8746 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:07,  3.41it/s]

SequenceClassifierOutput(loss=tensor(0.8746, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2967, -0.8865,  0.9710],
        [ 0.3775, -0.7771,  1.0419],
        [ 0.0788, -0.9675,  0.8421],
        [ 0.0107, -1.1029,  1.2875],
        [ 0.3483, -1.2951,  1.0705],
        [ 0.2658, -0.4679,  0.4147],
        [-0.0073, -0.7989,  0.6183],
        [ 0.2695, -0.7273,  0.3143],
        [ 0.3622, -0.5671,  0.2783],
        [ 0.2182, -1.0593,  0.3235],
        [ 0.1905, -0.9454,  0.8407],
        [ 0.1123, -1.0165,  0.7491],
        [ 0.3646, -1.3566,  1.2406],
        [ 0.1935, -0.3574,  0.0582],
        [ 0.0734, -0.4184,  0.3009],
        [-0.0322, -0.4740,  0.3989],
        [ 0.5026, -1.3559,  0.6595],
        [ 0.1507, -0.6332,  0.6527],
        [ 0.4751, -0.9251,  0.6043],
        [ 0.2415, -0.5646,  0.0943],
        [ 0.4045, -1.0802,  0.7755],
        [ 0.3821, -0.2584,  1.0404],
        [-0.1440, -1.1048,  1.4258],
        [ 0.5985, -0.5399,  0.5322],
        [-0.17

(Epoch 3) TRAIN LOSS:0.8110 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:05,  4.04it/s]

SequenceClassifierOutput(loss=tensor(0.7474, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3340, -0.9585,  0.5865],
        [ 0.3479, -1.0597,  1.2809],
        [ 0.3818, -1.2053,  1.0131],
        [ 0.2522, -1.2343,  0.7457],
        [ 0.1104, -0.9746,  1.4478],
        [-0.3259, -1.2100,  1.4313],
        [ 0.1068, -1.1088,  1.3603],
        [-0.0333, -1.4093,  1.3449],
        [ 0.3924, -0.9285,  0.5662],
        [ 0.2830, -0.8376,  0.0839],
        [-0.1549, -1.1007,  1.2620],
        [-0.1644, -1.1777,  0.9182],
        [ 0.2316, -0.8826,  0.4622],
        [-0.1464, -1.1171,  1.5775],
        [-0.0926, -0.9131,  1.0159],
        [ 0.3043, -0.9318,  0.9096],
        [ 0.4273, -0.5745,  0.5913],
        [ 0.2905, -0.5196,  0.8345],
        [-0.2574, -1.3904,  1.1889],
        [ 0.1000, -1.2563,  1.3594],
        [ 0.0982, -1.0099,  0.6050],
        [ 0.1945, -0.3390,  0.4751],
        [ 0.2353, -0.1768,  0.3760],
        [ 0.0962, -1.3443,  1.3660],
        [-0.01

(Epoch 3) TRAIN LOSS:0.8142 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:05,  4.38it/s]

SequenceClassifierOutput(loss=tensor(0.8207, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 3.0204e-01, -1.1400e+00,  8.1329e-01],
        [ 2.6536e-01, -2.1560e-01,  1.3802e-01],
        [ 3.6345e-04, -9.5236e-01,  1.4791e+00],
        [ 1.4364e-01, -1.2678e+00,  1.5176e+00],
        [ 1.2089e-01, -7.4667e-01,  1.5667e+00],
        [ 2.4247e-01, -9.3243e-01,  1.1724e+00],
        [-1.9372e-02, -5.9826e-01,  6.9467e-01],
        [ 3.0239e-01, -8.9872e-01,  9.3306e-01],
        [-1.9738e-01, -2.2527e-01,  3.9944e-02],
        [ 5.0238e-01, -2.8617e-01,  3.7243e-01],
        [ 4.9542e-01, -5.1529e-01,  1.8928e-01],
        [ 1.8575e-01, -9.5149e-01,  9.6580e-01],
        [ 3.9506e-01, -8.2329e-01,  7.9339e-01],
        [ 1.3805e-01, -6.7990e-01,  1.2863e+00],
        [-5.3616e-02, -1.0666e+00,  1.2516e+00],
        [-5.6706e-02, -6.7935e-01,  6.1387e-01],
        [ 1.8205e-01, -1.2071e+00,  1.0483e+00],
        [ 3.4479e-01, -1.0720e+00,  1.3175e+00],
        [-3.1537e-03

(Epoch 3) TRAIN LOSS:0.8146 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:00<00:04,  4.58it/s]

SequenceClassifierOutput(loss=tensor(0.8157, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0445, -0.9769,  1.1717],
        [ 0.4457, -1.2407,  0.7527],
        [ 0.1986, -0.9216,  0.3353],
        [ 0.0484, -0.0222,  0.1503],
        [-0.2063, -1.0860,  1.2117],
        [-0.2456, -1.1811,  1.2308],
        [ 0.6402, -0.4343,  0.4473],
        [ 0.2532, -0.3442,  0.2256],
        [-0.0060, -0.9975,  1.5430],
        [ 0.0747, -0.6540,  0.6364],
        [ 0.0611, -1.0699,  1.1320],
        [ 0.0173, -0.9548,  1.5011],
        [ 0.1032, -0.9356,  1.4141],
        [ 0.5697, -0.7870,  0.6239],
        [ 0.2413, -0.2724,  0.3474],
        [ 0.4550, -0.4244,  0.4243],
        [ 0.2909, -0.7465,  1.1355],
        [-0.1620, -1.3256,  1.3142],
        [ 0.3837, -0.5406,  0.3507],
        [ 0.6050, -0.4997,  0.0040],
        [ 0.2152, -0.4536,  0.0676],
        [ 0.0364, -1.2689,  1.3745],
        [ 0.0103, -1.1718,  1.3614],
        [ 0.3909, -0.5870,  0.4722],
        [ 0.40

(Epoch 3) TRAIN LOSS:0.8254 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:04,  4.53it/s]

SequenceClassifierOutput(loss=tensor(0.8689, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1376, -0.6062,  1.1794],
        [ 0.1527, -1.1155,  1.5053],
        [-0.0697, -1.2184,  1.3200],
        [ 0.1222, -0.9927,  1.2814],
        [ 0.3322, -0.6588,  0.8618],
        [ 0.7973, -1.1226,  0.6973],
        [ 0.3785, -1.2371,  0.6165],
        [ 0.6827, -0.1507, -0.0328],
        [ 0.0900, -0.9262,  0.6074],
        [ 0.3028, -0.8938,  0.5232],
        [ 0.0491, -0.9275,  1.4118],
        [ 0.3488, -0.2221,  0.2569],
        [-0.0825, -1.2239,  1.1572],
        [ 0.0536, -1.1608,  1.2469],
        [ 0.1891, -0.9337,  0.5450],
        [-0.1399, -1.2673,  1.2037],
        [ 0.0111, -0.9385,  1.2913],
        [ 0.4909, -0.7883,  0.8333],
        [-0.0359, -0.6820,  0.5073],
        [ 0.5270, -1.1949,  1.1182],
        [ 0.1779, -1.1601,  1.0164],
        [ 0.1175, -0.8434,  1.0948],
        [-0.1615, -0.6159,  0.4826],
        [-0.0485, -1.0993,  1.2437],
        [ 0.54

(Epoch 3) TRAIN LOSS:0.8018 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:01<00:04,  4.58it/s]

SequenceClassifierOutput(loss=tensor(0.6839, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0705, -0.8531,  0.5187],
        [ 0.5016, -0.9907,  0.9072],
        [ 0.4597, -0.5231,  0.8314],
        [ 0.4681, -0.9557,  0.7121],
        [-0.2592, -0.9497,  1.5616],
        [-0.2293, -1.1523,  1.0377],
        [ 0.3606, -1.2941,  1.4631],
        [-0.1948, -1.1090,  1.3787],
        [ 0.8723, -1.1041,  0.6741],
        [-0.0491, -0.9318,  1.2464],
        [ 0.5281, -0.3608,  0.2742],
        [-0.1298, -0.8297,  1.1075],
        [ 0.1515, -0.8380,  1.0894],
        [ 0.4232, -0.7255,  0.4797],
        [ 0.4792, -0.8141, -0.1596],
        [-0.1170, -1.0780,  1.3857],
        [ 0.4854, -1.1369,  1.2235],
        [ 0.3830, -1.1830,  1.0781],
        [-0.0963, -1.1711,  1.2862],
        [ 0.0778, -1.1742,  0.8356],
        [ 0.2852, -0.6415,  0.4364],
        [ 0.1669, -0.7718,  1.1569],
        [ 0.2676, -0.8146,  0.2523],
        [ 0.5860, -0.3746,  0.3719],
        [ 0.09

(Epoch 3) TRAIN LOSS:0.7983 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:01<00:03,  4.57it/s]

SequenceClassifierOutput(loss=tensor(0.7767, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3608, -0.0939, -0.0824],
        [-0.0483, -1.2408,  1.1609],
        [-0.2033, -0.6927,  1.1465],
        [ 0.0159, -0.9566,  0.9550],
        [-0.1962, -0.7544,  1.6399],
        [ 0.5251, -0.0723,  0.2409],
        [ 0.0888, -1.2266,  1.4313],
        [-0.1005, -1.1383,  1.2082],
        [ 0.6709, -0.5605,  0.1480],
        [ 0.2926, -1.3651,  0.9538],
        [ 0.1439, -0.9912,  0.6327],
        [ 0.2184, -0.3356,  0.3924],
        [-0.1304, -0.7090,  1.3398],
        [ 0.0337, -1.1426,  0.9239],
        [ 0.0972, -0.8356,  1.2851],
        [ 0.2177, -1.3518,  0.9221],
        [ 0.4764, -1.0416,  0.9526],
        [ 0.5406, -1.0166,  0.8192],
        [ 0.1455, -1.1152,  1.2751],
        [ 1.1359, -0.9244,  0.5600],
        [-0.1755, -1.1697,  1.3698],
        [ 0.1982, -0.8467,  0.4267],
        [ 0.4730, -0.0404,  0.1104],
        [ 0.0503, -1.2181,  1.2954],
        [-0.04

(Epoch 3) TRAIN LOSS:0.7994 LR:0.00000300:  32%|█████████████                            | 8/25 [00:01<00:03,  4.70it/s]

SequenceClassifierOutput(loss=tensor(0.8076, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1479, -0.9943,  1.0337],
        [ 0.4042, -1.0530,  0.4197],
        [ 0.3946, -0.9042,  1.1722],
        [ 0.3579, -0.9607,  0.5948],
        [ 0.8490, -0.3435,  0.0711],
        [ 0.6563, -0.6816,  0.4738],
        [ 0.3655, -0.5975,  0.5721],
        [ 0.5716, -0.4561,  0.1185],
        [-0.1949, -1.2396,  0.6453],
        [ 0.3708, -0.2614,  0.4005],
        [-0.0020, -1.3818,  1.5324],
        [ 0.7557, -0.6549,  0.4918],
        [-0.0539, -0.9228,  1.4572],
        [ 0.3645, -0.9727,  1.0804],
        [ 0.4870, -0.1638, -0.1968],
        [ 0.0954, -1.0997,  1.0588],
        [ 0.7274, -0.3852,  0.2499],
        [ 0.6055,  0.0835,  0.1253],
        [-0.0909, -0.7686,  1.3829],
        [ 0.3583, -0.8320,  0.3560],
        [ 0.3310, -1.1217,  0.7923],
        [ 0.7330, -0.5454,  0.6251],
        [ 0.2149, -0.6651,  0.6764],
        [ 0.5085, -1.0211,  0.5649],
        [-0.04

(Epoch 3) TRAIN LOSS:0.7981 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:01<00:03,  4.65it/s]

SequenceClassifierOutput(loss=tensor(0.7879, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3960, -0.3181,  0.0712],
        [-0.1953, -0.8066,  1.4806],
        [ 0.0329, -0.6724,  0.4770],
        [ 0.0935, -1.0257,  1.3809],
        [ 0.6099, -0.1465,  0.1870],
        [ 0.4754, -1.0723,  0.5467],
        [ 0.1311, -0.8342,  1.0763],
        [-0.3364, -0.8449,  1.6147],
        [ 0.1522, -0.5630,  0.4632],
        [-0.2208, -0.8691,  1.3592],
        [ 0.5119, -0.1946,  0.1453],
        [ 0.1421, -1.0766,  1.5107],
        [ 0.2212, -0.5143,  0.6523],
        [ 0.1951, -1.0663,  0.8373],
        [ 0.4410, -0.9467,  1.0503],
        [ 0.1952, -0.7953,  0.7946],
        [-0.2556, -0.8097,  1.2484],
        [ 0.2601, -0.7662,  0.6673],
        [ 0.2414, -0.3302,  0.1285],
        [ 0.1745, -0.2120,  0.7094],
        [ 0.1433, -1.1757,  1.4216],
        [ 0.5294, -0.7335,  1.2249],
        [ 0.4155, -0.8906,  0.3260],
        [ 0.6250, -0.6842,  0.3505],
        [ 0.14

(Epoch 3) TRAIN LOSS:0.8100 LR:0.00000300:  40%|████████████████                        | 10/25 [00:02<00:03,  4.60it/s]

SequenceClassifierOutput(loss=tensor(0.9167, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5843, -0.2321,  0.3082],
        [ 0.3587, -0.2332,  0.2043],
        [ 0.3053, -1.0790,  1.1502],
        [ 0.1228, -1.1709,  1.2013],
        [ 0.0928, -1.0449,  1.2828],
        [ 0.2395, -1.1707,  1.3551],
        [ 0.8446, -1.1721,  0.8281],
        [ 0.4444, -0.4791,  0.0055],
        [ 0.3288, -0.5535,  0.1840],
        [ 0.4897, -0.7274,  0.4500],
        [ 0.2870, -0.9755,  1.2462],
        [ 0.3138, -1.2050,  1.0380],
        [ 0.1627, -0.6293,  0.6484],
        [-0.1302, -0.9401,  1.1282],
        [ 0.5441, -1.1170,  1.0493],
        [ 0.2481, -0.7404,  1.0384],
        [ 0.4151, -0.7482,  0.7480],
        [ 0.3547, -1.2851,  1.4117],
        [ 0.5100, -1.1865,  0.8707],
        [ 0.6226, -0.5123,  0.4017],
        [ 0.1980, -1.1391,  1.3660],
        [ 0.1469, -1.0046,  1.3057],
        [ 0.0238, -0.8610,  1.1798],
        [ 0.8621, -0.1809, -0.0388],
        [ 0.04

(Epoch 3) TRAIN LOSS:0.7982 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:02<00:02,  4.70it/s]

SequenceClassifierOutput(loss=tensor(0.6801, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1742, -0.8272,  1.2904],
        [ 0.0844, -0.6366,  0.2423],
        [-0.3105, -0.7557,  1.3142],
        [-0.2050, -1.0409,  1.1532],
        [ 0.2357, -0.6837,  0.3129],
        [-0.0324, -1.2864,  1.4147],
        [ 0.8137, -0.3537,  0.3321],
        [ 0.0232, -0.7526,  1.3309],
        [-0.2362, -1.0273,  1.1192],
        [ 0.6840, -0.7484,  0.2353],
        [ 0.3558, -1.0121,  1.1759],
        [-0.0328, -0.8432,  1.2037],
        [ 0.2277, -0.2176,  0.0665],
        [-0.1706, -1.1612,  1.0493],
        [ 0.0817, -0.8410,  1.1474],
        [-0.3849, -0.8035,  1.5781],
        [ 0.2171, -0.1145, -0.0882],
        [ 0.3240, -1.2374,  0.6393],
        [ 0.3802, -1.1198, -0.0614],
        [ 0.5027, -0.6664,  0.5785],
        [ 0.2878, -0.7993,  0.5244],
        [ 0.2747, -0.7580,  0.5764],
        [ 0.1331, -0.5889,  0.6428],
        [ 0.3038, -0.6976,  1.0150],
        [ 0.57

(Epoch 3) TRAIN LOSS:0.7794 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:02<00:02,  4.59it/s]

SequenceClassifierOutput(loss=tensor(0.5723, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5329, -0.9000,  0.5347],
        [ 0.1416, -1.2774,  1.6079],
        [ 0.1420, -0.9558,  1.4231],
        [-0.1542, -0.8679,  1.3454],
        [ 0.7409, -0.8840,  1.2575],
        [ 0.2736, -1.2921,  0.9474],
        [ 0.4555, -0.8399,  0.6068],
        [-0.0470, -1.1801,  1.5501],
        [-0.0087, -0.8571,  1.0694],
        [ 0.2447, -0.8305,  0.7007],
        [-0.0712, -1.4066,  1.2054],
        [-0.3371, -1.1218,  1.7416],
        [-0.0307, -1.2663,  1.3267],
        [ 0.0844, -1.1727,  1.2956],
        [ 0.4597, -0.7707,  1.1241],
        [-0.0333, -1.0025,  1.3670],
        [ 0.0383, -1.1546,  1.5040],
        [ 0.4194, -0.9455,  1.0481],
        [ 0.5395, -1.1727,  0.9831],
        [-0.0812, -0.8052,  1.2699],
        [ 0.2543, -0.9565,  0.8930],
        [-0.1007, -1.0059,  0.8098],
        [ 0.1883, -0.9918,  1.1495],
        [ 0.3347, -0.8105,  0.8812],
        [ 0.03

(Epoch 3) TRAIN LOSS:0.7727 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:02<00:02,  4.66it/s]

SequenceClassifierOutput(loss=tensor(0.6928, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1632, -1.2824,  1.2262],
        [ 0.3331, -1.0059,  1.4620],
        [ 0.2467, -0.4941,  0.2221],
        [ 0.3724, -0.6547,  0.1027],
        [ 0.1117, -0.7780,  1.6255],
        [ 0.7248, -0.3709,  0.2468],
        [-0.1376, -1.2250,  1.0701],
        [ 0.4386, -0.7253,  0.4025],
        [ 0.4928, -0.7928,  1.1026],
        [-0.0720, -0.8550,  1.0913],
        [ 0.1797, -0.7677,  0.9079],
        [-0.1648, -1.0932,  1.3068],
        [-0.1161, -1.0880,  1.0824],
        [ 0.3669, -0.3504,  0.3942],
        [-0.0443, -1.2447,  1.4114],
        [-0.0278, -1.3634,  1.1292],
        [ 0.0188, -1.0302,  0.4594],
        [-0.1001, -1.1828,  1.3913],
        [-0.0533, -1.1672,  1.7426],
        [ 0.4713, -0.8510,  0.6630],
        [ 0.3453, -1.1599,  1.3922],
        [ 0.1622, -1.0046,  1.1931],
        [-0.1727, -0.8876,  1.1643],
        [ 0.1229, -1.0152,  1.2919],
        [ 0.35

(Epoch 3) TRAIN LOSS:0.7757 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:03<00:02,  4.71it/s]

SequenceClassifierOutput(loss=tensor(0.8153, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0589, -1.1075,  1.0600],
        [ 0.1359, -0.8748,  1.1786],
        [-0.0968, -1.0219,  1.0761],
        [ 0.4495, -0.4488,  0.4102],
        [ 0.1779, -0.5095,  0.4061],
        [ 0.5091, -0.9574,  0.8493],
        [ 0.1763, -0.7894,  0.4727],
        [ 0.6252, -0.6681,  0.4838],
        [ 0.1020, -0.9522,  1.3816],
        [-0.0395, -1.1662,  1.0867],
        [ 0.0475, -1.1147,  1.0822],
        [ 0.1405, -1.2404,  0.9674],
        [ 0.4770, -0.3011,  0.4821],
        [ 0.1117, -0.7412,  0.8670],
        [ 0.4841, -0.9798,  1.1979],
        [ 0.3343, -1.1879,  0.4044],
        [ 0.2126, -0.7193,  0.6319],
        [ 0.0445, -0.9822,  0.6718],
        [ 0.2699, -1.2779,  1.1847],
        [ 0.0947, -0.8487,  0.6943],
        [ 0.7416,  0.0070, -0.1105],
        [ 0.4368, -0.6427,  0.6275],
        [-0.0775, -0.7623,  1.4242],
        [ 0.0756, -0.8512,  0.7698],
        [ 0.45

(Epoch 3) TRAIN LOSS:0.7589 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:03<00:01,  4.78it/s]

SequenceClassifierOutput(loss=tensor(0.5899, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0977, -1.2726,  1.2985],
        [ 0.4056, -1.2048,  1.0061],
        [ 0.2016, -1.0648,  0.4508],
        [-0.0832, -1.1366,  1.0718],
        [ 0.1662, -0.3796,  0.4163],
        [ 0.3574, -1.2305,  0.8679],
        [ 0.1536, -1.1745,  1.4846],
        [-0.1578, -0.7180,  1.5250],
        [ 0.4635, -0.9111,  1.1710],
        [ 0.3887, -0.9518,  1.1320],
        [ 0.0769, -1.3100,  1.3529],
        [-0.3386, -1.0841,  1.2206],
        [ 0.3556, -0.6315, -0.0997],
        [ 0.3512, -0.1951,  0.2656],
        [ 0.0308, -1.2825,  1.4118],
        [ 0.1248, -0.9796,  1.2740],
        [ 0.2898, -1.1687,  0.5475],
        [ 0.0439, -0.9763,  0.7967],
        [ 0.1604, -0.5682,  0.1242],
        [ 0.1450, -0.2420,  0.4340],
        [-0.2600, -1.2267,  1.3800],
        [ 0.5615, -0.1762,  0.4551],
        [-0.0302, -1.1724,  1.3734],
        [-0.2310, -1.3205,  1.3820],
        [-0.11

(Epoch 3) TRAIN LOSS:0.7651 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:03<00:01,  4.77it/s]

SequenceClassifierOutput(loss=tensor(0.8638, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3245, -0.7786,  0.1718],
        [-0.2667, -1.1589,  1.4302],
        [ 0.4417, -0.7694,  0.6123],
        [-0.0420, -0.8136,  0.5925],
        [ 0.6150, -0.5190,  0.6525],
        [-0.1901, -0.9749,  1.3872],
        [-0.2821, -1.2158,  1.4611],
        [ 0.4447, -1.1065,  0.9864],
        [-0.0771, -1.1509,  1.3934],
        [ 0.0550, -0.7077,  1.4506],
        [-0.1315, -1.1348,  1.5424],
        [-0.0775, -1.1270,  1.6155],
        [-0.1634, -1.0382,  1.2327],
        [ 0.7887, -0.7014,  0.1458],
        [-0.0033, -0.8524,  1.1298],
        [-0.0377, -1.2545,  0.7480],
        [-0.0625, -1.2586,  1.8247],
        [-0.1847, -1.1537,  1.1916],
        [ 0.2160, -1.2313,  0.7731],
        [ 0.6843, -0.4520,  0.3997],
        [ 0.3484, -0.9381,  1.0345],
        [-0.2481, -1.0210,  1.4904],
        [ 0.6083, -0.6662,  0.3985],
        [ 0.0071, -1.3274,  1.5207],
        [-0.26

(Epoch 3) TRAIN LOSS:0.7690 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:03<00:01,  4.65it/s]

SequenceClassifierOutput(loss=tensor(0.8355, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4338, -0.5847,  0.3926],
        [ 0.2948, -0.2251,  0.1758],
        [ 0.2711, -0.8622,  0.4281],
        [ 0.1676, -0.9879,  0.9982],
        [-0.0144, -1.0540,  1.3871],
        [ 0.3939,  0.0233, -0.0900],
        [-0.2599, -0.5553,  1.2164],
        [ 0.2712, -1.1857,  1.1240],
        [ 0.3835, -0.1453, -0.1638],
        [ 0.5463, -1.0825,  0.7722],
        [ 0.1356, -0.6308,  0.3607],
        [ 0.5237, -0.0039,  0.3483],
        [-0.1359, -1.2801,  1.4395],
        [ 0.7424, -0.3266,  0.3892],
        [ 0.5519, -0.1065, -0.2964],
        [-0.5665, -0.8869,  1.4236],
        [-0.3067, -1.1525,  1.7389],
        [ 0.2278, -0.7980,  0.5264],
        [ 0.5404, -0.9904,  0.7591],
        [ 0.1225, -0.1046, -0.1874],
        [-0.0569, -1.1712,  1.4804],
        [-0.3959, -1.3410,  1.8153],
        [ 0.0624, -0.7283,  0.5943],
        [ 0.0739, -1.0037,  1.1728],
        [ 0.55

(Epoch 3) TRAIN LOSS:0.7679 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:04<00:01,  4.76it/s]

SequenceClassifierOutput(loss=tensor(0.7475, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0093, -1.0934,  1.5229],
        [ 0.6967, -0.1260,  0.0713],
        [ 0.0417, -1.1507,  1.5843],
        [ 0.3445, -0.6393,  0.5208],
        [ 0.0433, -1.0351,  1.4021],
        [ 0.3345, -0.9878,  1.1543],
        [ 0.2438, -0.6653,  0.3135],
        [ 0.2405, -1.2538,  1.6334],
        [ 0.0940, -1.2989,  1.2276],
        [ 0.9351, -0.1713,  0.1359],
        [ 0.1203, -0.4387,  0.4008],
        [ 0.1264, -1.0914,  1.5407],
        [-0.1063, -1.3892,  1.2420],
        [-0.0536, -1.1750,  1.0009],
        [ 0.2672, -1.2833,  1.1047],
        [-0.2034, -1.3949,  1.5233],
        [ 0.4143, -0.8748,  0.7754],
        [-0.3560, -0.9095,  1.0807],
        [ 0.1601, -0.3181,  0.2649],
        [ 0.1173, -0.5980,  0.3560],
        [ 0.3766, -0.6669,  0.0879],
        [ 0.5472, -1.2082,  1.0081],
        [ 0.3609, -1.0962,  0.6639],
        [-0.0470, -1.5197,  1.5528],
        [ 0.32

(Epoch 3) TRAIN LOSS:0.7679 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:04<00:01,  4.71it/s]

SequenceClassifierOutput(loss=tensor(0.7689, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0772, -0.8231,  1.6315],
        [ 0.0616, -0.9996,  1.1588],
        [-0.0247, -0.7493,  0.7426],
        [ 0.5170, -0.3729,  0.3106],
        [ 0.3281, -0.3502,  0.2993],
        [ 0.5688, -0.4338,  0.3007],
        [ 0.0489, -0.5581,  0.8195],
        [ 0.1628, -0.6155,  0.6031],
        [ 0.2255, -0.5635,  0.0680],
        [-0.1605, -1.1975,  1.4121],
        [-0.0231, -0.9955,  1.1926],
        [ 0.5266, -0.7431,  0.4868],
        [ 0.2587, -0.6091,  0.6635],
        [ 0.1915, -0.4520,  0.5651],
        [ 0.4252, -0.7733,  0.2596],
        [ 0.3023, -1.1306,  1.0221],
        [-0.4499, -1.2095,  1.1960],
        [ 0.3491, -0.5713,  0.7009],
        [ 0.6541, -0.5941,  0.3694],
        [ 0.0227, -0.2614,  0.2265],
        [ 0.1580, -1.2111,  0.4266],
        [-0.0905, -1.2425,  1.4592],
        [ 0.5740, -0.5663,  0.4609],
        [ 0.1300, -0.4248,  0.1367],
        [ 0.23

(Epoch 3) TRAIN LOSS:0.7714 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:04<00:00,  4.76it/s]

SequenceClassifierOutput(loss=tensor(0.8408, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3184, -0.4477, -0.3262],
        [-0.2574, -1.1097,  1.1888],
        [ 0.4519, -0.4849,  0.3595],
        [-0.0088, -1.1648,  1.2940],
        [ 0.3771, -0.7525,  0.5682],
        [ 0.6029, -0.2032,  0.4630],
        [ 0.2920, -0.6641,  0.5878],
        [ 0.0695, -1.1024,  1.3952],
        [ 0.6595, -0.9999,  1.1753],
        [ 0.5441, -0.2900,  0.4520],
        [ 0.4842, -0.8012,  0.7120],
        [ 0.0122, -1.0989,  1.6280],
        [ 0.6846, -0.3846,  0.5432],
        [ 0.2211, -1.0488,  1.5007],
        [-0.3145, -1.0510,  1.4719],
        [ 0.4346, -1.2056,  0.8367],
        [-0.4040, -0.7886,  1.0975],
        [ 0.4900, -0.5300, -0.0047],
        [ 0.3165, -0.3416,  0.4332],
        [ 0.5094, -1.0214,  1.0980],
        [ 0.7558, -0.2773,  0.5124],
        [-0.3206, -1.1403,  1.4799],
        [ 0.0789, -1.1628,  1.5476],
        [-0.0793, -0.9799,  0.6771],
        [ 0.07

(Epoch 3) TRAIN LOSS:0.7719 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:04<00:00,  4.71it/s]

SequenceClassifierOutput(loss=tensor(0.7822, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1603, -1.1338,  1.0656],
        [ 0.1643, -0.7072,  0.9020],
        [ 0.0890, -1.1196,  1.6489],
        [ 0.2259, -1.4216,  1.0029],
        [ 0.3294, -0.4267,  0.2362],
        [-0.3941, -1.1829,  1.3206],
        [-0.0613, -1.3108,  1.3521],
        [ 0.4218, -0.6892,  0.3934],
        [ 0.3048, -0.7505,  0.2509],
        [ 0.3580, -1.0377,  0.7203],
        [ 0.7062, -0.7530,  0.8555],
        [-0.3861, -1.0275,  1.6021],
        [ 0.0616, -1.1668,  1.4935],
        [-0.1184, -1.5346,  1.7246],
        [ 0.3285, -0.1365,  0.3318],
        [ 0.1397, -0.7467,  0.8782],
        [ 0.0057, -1.0700,  1.7687],
        [-0.1053, -1.2512,  1.5699],
        [ 0.0597, -0.9302,  1.2783],
        [ 0.4053, -1.0540,  0.8572],
        [ 0.0093, -0.3517,  0.1445],
        [ 0.1336, -0.8907,  0.9014],
        [ 0.7219, -0.2874,  0.4708],
        [ 0.5123, -0.3876,  0.5293],
        [ 0.33

(Epoch 3) TRAIN LOSS:0.7673 LR:0.00000300:  96%|██████████████████████████████████████▍ | 24/25 [00:05<00:00,  4.86it/s]

SequenceClassifierOutput(loss=tensor(0.6104, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1600, -1.0254,  0.8222],
        [ 0.4167, -0.6333,  0.3968],
        [ 0.5937, -0.6336,  0.2484],
        [ 0.2199, -0.6195,  0.7091],
        [ 0.7465, -0.3041,  0.4996],
        [-0.0255, -1.2248,  1.4522],
        [-0.0618, -1.1488,  1.2320],
        [ 0.0546, -1.1738,  1.4908],
        [ 0.0366, -0.9936,  1.8297],
        [ 0.5560, -1.1510,  1.1499],
        [ 0.1944, -1.0085,  0.6992],
        [ 0.6282, -0.6447,  0.5879],
        [ 0.5336, -0.9812,  0.5841],
        [ 0.1683, -1.2695,  1.1606],
        [ 0.0461, -1.4797,  1.3239],
        [ 0.0364, -1.2169,  0.6893],
        [ 0.4367, -0.2295,  0.1700],
        [ 0.8825, -1.0979,  1.0157],
        [ 0.3396, -0.6777,  0.3965],
        [-0.0304, -1.2798,  1.2736],
        [ 0.2854, -1.1160,  0.9898],
        [ 0.2341, -0.8285,  0.0561],
        [ 0.3381, -0.9723,  0.9658],
        [-0.3943, -0.9098,  1.4068],
        [-0.39

(Epoch 3) TRAIN LOSS:0.7740 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:05<00:00,  4.61it/s]

(Epoch 3) TRAIN LOSS:0.7740 ACC:0.65 F1:0.41 REC:0.43 PRE:0.41 LR:0.00000300



(Epoch 4) TRAIN LOSS:0.6703 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:06,  3.81it/s]

SequenceClassifierOutput(loss=tensor(0.6703, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4352, -0.9619,  0.8310],
        [ 0.3527, -0.6924,  0.8681],
        [ 0.2360, -0.7818,  0.9928],
        [ 0.0451, -1.2780,  1.6329],
        [-0.1343, -1.3679,  1.5627],
        [ 0.2331, -0.8135,  0.5518],
        [ 0.1348, -1.2140,  1.3201],
        [-0.2101, -1.1838,  1.7062],
        [-0.1320, -1.4310,  1.2797],
        [ 0.8109, -1.1685,  0.6304],
        [-0.0664, -0.8260,  1.2781],
        [-0.1620, -1.0813,  1.3547],
        [-0.0990, -0.9527,  1.5762],
        [-0.0134, -1.0072,  1.5628],
        [ 0.0084, -0.9172,  0.6233],
        [-0.2005, -1.3769,  1.5767],
        [ 0.4850, -0.0381, -0.1642],
        [ 0.1004, -0.8224,  1.3502],
        [ 0.3932, -0.7108,  0.9644],
        [ 0.4173, -1.0655,  0.1269],
        [ 0.3227, -1.0601,  0.7454],
        [-0.1859, -1.2076,  1.5316],
        [-0.2920, -1.1387,  1.4452],
        [ 0.6455, -0.3968, -0.1732],
        [ 0.85

(Epoch 4) TRAIN LOSS:0.6741 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:05,  4.18it/s]

SequenceClassifierOutput(loss=tensor(0.6778, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0439, -1.2198,  1.5646],
        [-0.2320, -1.0462,  1.7612],
        [ 0.5231, -0.4630,  0.0587],
        [-0.3878, -1.3186,  1.4290],
        [ 0.6280, -0.9809,  0.8795],
        [-0.0668, -1.5498,  1.4362],
        [-0.2557, -1.1361,  1.2614],
        [-0.0152, -1.2096,  1.3740],
        [ 0.6205, -0.6755,  0.9887],
        [-0.2933, -1.4688,  1.9115],
        [ 0.4145, -0.6790, -0.0640],
        [-0.1015, -0.9132,  1.3326],
        [ 0.2549, -1.4149,  1.1984],
        [ 0.3366, -0.8563,  0.7151],
        [ 0.2208, -0.6166,  0.4158],
        [-0.1571, -1.4663,  1.4062],
        [-0.3285, -1.2706,  1.4237],
        [-0.2687, -1.1568,  1.5969],
        [-0.3307, -1.3503,  1.6159],
        [ 0.6838, -0.5561,  0.0400],
        [ 0.1699, -1.4181,  1.5072],
        [ 0.6369, -0.9055,  1.0023],
        [ 0.2442, -1.0481,  1.0279],
        [ 0.0512, -0.6547,  0.6151],
        [-0.17

(Epoch 4) TRAIN LOSS:0.7127 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:05,  4.30it/s]

SequenceClassifierOutput(loss=tensor(0.7899, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2994, -1.1152,  1.4262],
        [-0.0349, -1.0548,  1.0340],
        [ 0.4557, -0.2373,  0.2725],
        [ 0.3903, -0.8077,  0.2734],
        [ 0.1374, -1.4639,  1.5621],
        [-0.3269, -1.0014,  1.6911],
        [ 0.5408, -0.9006,  0.6452],
        [ 0.4795, -0.9747,  0.6214],
        [-0.3501, -1.1853,  1.2124],
        [-0.1543, -0.9877,  1.3170],
        [ 0.4126, -0.4716,  0.5860],
        [-0.1049, -1.1220,  1.4917],
        [ 0.4118, -0.5073,  0.4684],
        [ 0.2664,  0.0018,  0.2925],
        [ 0.6843, -0.8837,  0.2249],
        [ 0.4942, -0.3471,  0.2351],
        [ 0.4342, -0.4266,  0.1772],
        [ 0.3335, -0.3110,  0.4560],
        [ 0.0940, -1.1517,  1.3582],
        [-0.4652, -0.9007,  1.3084],
        [ 0.4152, -0.9031,  0.2025],
        [-0.3272, -1.1311,  1.4550],
        [ 0.0447, -0.8433,  1.3785],
        [ 0.3092, -0.8604,  1.0848],
        [ 0.15

(Epoch 4) TRAIN LOSS:0.7002 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:00<00:04,  4.35it/s]

SequenceClassifierOutput(loss=tensor(0.6628, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.4363e-01, -1.1132e+00,  1.0987e+00],
        [-2.9626e-01, -1.2409e+00,  1.3787e+00],
        [ 1.5497e-02, -8.4855e-01,  1.3037e+00],
        [ 5.1305e-01, -1.3060e+00,  1.2916e+00],
        [-3.6935e-01, -8.7932e-01,  1.7004e+00],
        [ 2.6624e-01, -6.2568e-02,  4.2904e-02],
        [ 2.6460e-01, -8.5964e-01,  6.9648e-01],
        [-2.9957e-01, -9.6210e-01,  1.4332e+00],
        [ 4.1705e-01, -1.8950e-01,  2.8140e-01],
        [ 4.6091e-02, -9.8220e-01,  1.1342e+00],
        [-3.9233e-01, -9.2149e-01,  1.3644e+00],
        [ 3.8806e-01, -8.2717e-01, -1.9407e-01],
        [ 6.6435e-01, -1.6348e-01,  1.4036e-01],
        [ 6.2651e-01, -3.7561e-01, -2.1024e-02],
        [ 7.2033e-01, -3.4264e-01,  7.5958e-02],
        [-7.8436e-02, -4.5483e-01, -1.8970e-01],
        [ 7.1934e-02, -9.7704e-01,  1.6009e+00],
        [ 4.2989e-01, -1.2167e+00,  1.1112e+00],
        [-1.0460e-01

(Epoch 4) TRAIN LOSS:0.6727 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:04,  4.31it/s]

SequenceClassifierOutput(loss=tensor(0.5628, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0885, -1.2384,  1.8114],
        [ 0.4896, -0.5965, -0.1712],
        [ 0.3692, -0.3917,  0.2009],
        [-0.3498, -0.7950,  1.2219],
        [ 0.4809, -0.9072,  0.8794],
        [ 0.0858, -1.1756,  1.1864],
        [ 0.4469, -1.1767,  0.7568],
        [ 0.4353, -0.3341,  0.4748],
        [ 0.4657, -0.8213,  0.4281],
        [ 0.3679, -0.8350,  1.2513],
        [-0.4952, -1.2111,  1.8857],
        [ 0.1691, -0.9845,  1.0325],
        [ 0.5897, -0.6825,  0.1020],
        [ 0.5003, -0.6049,  0.5729],
        [-0.3181, -1.4188,  1.5358],
        [-0.1389, -1.1359,  1.3116],
        [-0.1011, -0.8114,  1.3286],
        [ 0.3762, -1.0441,  1.2505],
        [ 0.2209, -1.0779,  0.9585],
        [ 0.2254, -0.8914,  0.9501],
        [ 0.6357, -0.8223,  0.2626],
        [-0.1701, -1.3948,  1.2183],
        [-0.2155, -1.1607,  1.5029],
        [-0.1339, -0.9239,  1.7603],
        [ 0.10

(Epoch 4) TRAIN LOSS:0.6927 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:01<00:04,  4.46it/s]

SequenceClassifierOutput(loss=tensor(0.7927, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2231, -1.0161,  1.5254],
        [ 0.1185, -0.6796,  0.4176],
        [ 0.5778, -0.9115,  0.5483],
        [ 0.5548, -0.4996,  0.1010],
        [ 0.0390, -1.0181,  1.1431],
        [-0.0572, -1.4976,  1.7521],
        [-0.0033, -1.4665,  1.8177],
        [ 0.3331, -0.8239,  0.0356],
        [ 0.6735, -1.0176,  0.9047],
        [ 0.5568, -1.0928,  0.9271],
        [ 0.6818, -0.7710,  0.9112],
        [ 0.7584, -0.8373,  0.4212],
        [ 0.2516, -1.2242,  1.3715],
        [ 0.5390, -1.1512,  0.8325],
        [ 0.5490, -0.1013,  0.1978],
        [ 0.6123, -0.2217,  0.1724],
        [ 0.2895, -0.3857,  0.2303],
        [ 0.2359, -1.2257,  1.5011],
        [-0.0209, -1.0831,  1.5695],
        [ 0.2693, -1.2808,  1.7348],
        [-0.0641, -1.5866,  1.5571],
        [-0.3570, -1.1579,  2.0464],
        [ 0.4590, -1.1663,  1.3127],
        [ 0.0936, -1.2283,  1.3727],
        [ 0.73

(Epoch 4) TRAIN LOSS:0.6833 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:01<00:04,  4.49it/s]

SequenceClassifierOutput(loss=tensor(0.6271, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4219, -1.3572,  1.4661],
        [ 0.5030, -0.9163,  0.0847],
        [ 0.1323, -1.1074,  1.4281],
        [ 0.5155, -1.6463,  1.2941],
        [ 0.1244, -0.6108,  0.4165],
        [-0.1005, -1.3744,  1.6833],
        [ 0.6500, -1.1061,  0.5279],
        [-0.4848, -1.1363,  1.8315],
        [-0.0394, -1.0671,  1.4282],
        [-0.1960, -1.0684,  1.8368],
        [-0.2175, -1.2403,  1.4826],
        [ 0.3130, -1.0090,  0.5570],
        [ 0.3691, -0.8340,  0.5907],
        [-0.3729, -1.3880,  1.4524],
        [ 0.2684, -1.1563,  1.0004],
        [-0.0516, -0.9303,  1.3938],
        [ 0.7126, -0.1828, -0.1222],
        [ 0.6288, -0.8192,  0.5298],
        [-0.2212, -0.5793,  1.3285],
        [ 0.6584, -1.0500,  0.9600],
        [ 0.3627, -0.8986,  0.9012],
        [-0.1628, -1.2748,  1.6742],
        [ 0.2400, -0.4659,  0.8351],
        [ 0.2977, -0.8541,  0.8865],
        [ 0.60

(Epoch 4) TRAIN LOSS:0.7107 LR:0.00000300:  32%|█████████████                            | 8/25 [00:01<00:03,  4.62it/s]

SequenceClassifierOutput(loss=tensor(0.9023, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0756, -0.7681,  1.4318],
        [ 0.1563, -1.2989,  1.3445],
        [ 0.3746, -1.1465,  0.5904],
        [ 0.3984, -1.1465,  1.4498],
        [ 0.4322, -1.0677,  0.7718],
        [ 0.0469, -0.1560,  0.4146],
        [ 0.6142, -1.1026,  0.6404],
        [ 0.1229, -0.4884,  1.0564],
        [-0.3079, -0.9470,  1.4770],
        [ 0.0985, -1.0500,  1.4913],
        [ 0.3959, -1.0917,  0.6754],
        [-0.5738, -1.2401,  1.6933],
        [ 0.2122, -1.4523,  1.2829],
        [ 0.5408, -0.8652,  0.7205],
        [ 0.3459, -0.8808,  0.5119],
        [ 0.3612, -0.8430,  0.2225],
        [ 0.1442, -1.0823,  1.0190],
        [ 0.4630, -0.7330,  0.7326],
        [-0.0078, -1.3724,  0.9479],
        [-0.3735, -1.0242,  1.5768],
        [-0.0289, -1.3339,  0.9482],
        [-0.1887, -1.1559,  1.1606],
        [ 0.4473, -0.8838,  0.2775],
        [-0.2265, -1.2182,  1.4595],
        [-0.23

(Epoch 4) TRAIN LOSS:0.7059 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:02<00:03,  4.66it/s]

SequenceClassifierOutput(loss=tensor(0.6671, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5989, -0.9148,  1.1940],
        [ 0.4509, -0.3462,  0.1662],
        [ 0.0746, -1.5632,  1.6377],
        [-0.0920, -0.8991,  1.2312],
        [-0.0333, -1.1211,  1.4919],
        [ 0.0022, -1.1205,  1.5240],
        [ 0.6087, -0.4050, -0.0668],
        [-0.1327, -1.1193,  1.7454],
        [ 0.3864, -0.5374, -0.0884],
        [ 0.6102, -0.8116,  0.0119],
        [ 0.3417, -1.1129,  1.3936],
        [ 0.1110, -1.2814,  1.2646],
        [ 0.1490, -1.2294,  1.4797],
        [-0.1863, -1.3657,  1.7023],
        [ 0.0467, -1.1487,  1.2739],
        [ 0.4758, -0.5120,  0.1924],
        [ 0.5668, -0.4367, -0.1017],
        [ 0.0975, -0.1620,  0.1438],
        [ 0.2194, -0.5495, -0.3696],
        [ 0.4590, -1.1449,  1.1586],
        [-0.1060, -1.3431,  1.6642],
        [-0.0657, -1.2862,  1.5280],
        [ 0.5440, -0.5369,  0.1247],
        [-0.1665, -1.0966,  1.1861],
        [ 0.27

(Epoch 4) TRAIN LOSS:0.7040 LR:0.00000300:  40%|████████████████                        | 10/25 [00:02<00:03,  4.52it/s]

SequenceClassifierOutput(loss=tensor(0.6869, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1984, -1.1297,  1.3106],
        [ 0.3495, -1.0053,  1.0486],
        [ 0.4352, -0.9687,  1.3593],
        [ 0.0829, -0.7757,  0.3324],
        [ 0.3515, -1.3590,  1.0501],
        [ 0.1391, -0.9559,  0.8182],
        [ 0.5292, -1.0177,  1.0364],
        [ 0.0658, -1.1793,  1.6568],
        [-0.2401, -1.0895,  1.1005],
        [-0.1206, -0.9263,  1.7508],
        [ 0.7295, -0.5655,  0.3468],
        [-0.1034, -1.1982,  1.5507],
        [ 0.2212, -0.9283,  0.8438],
        [-0.2328, -1.0778,  1.2713],
        [ 0.3690, -1.2484,  1.1829],
        [ 0.5382, -0.5663,  0.0813],
        [ 0.6005, -0.8345,  0.1867],
        [ 0.7012, -1.1842,  0.9142],
        [-0.0708, -1.0233,  1.3506],
        [-0.0624, -0.9374,  1.4882],
        [ 0.8383, -0.3222,  0.2299],
        [ 0.1614, -0.1686, -0.1007],
        [ 0.0532, -0.9285,  0.4613],
        [ 0.0938, -0.4485,  0.5199],
        [ 0.92

(Epoch 4) TRAIN LOSS:0.7164 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:02<00:03,  4.42it/s]

SequenceClassifierOutput(loss=tensor(0.8403, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1508, -1.1761,  1.5084],
        [ 0.2943, -0.5369, -0.1564],
        [ 0.2506, -1.2644,  1.4961],
        [ 0.2715, -0.6718,  1.0152],
        [ 0.0577, -1.0872,  1.4266],
        [ 0.3935, -0.5903, -0.2074],
        [-0.3311, -1.2555,  1.4692],
        [ 0.3153, -1.4609,  1.3846],
        [ 0.3705, -0.7027,  0.5915],
        [-0.2313, -1.1974,  1.7204],
        [ 0.1595, -0.6253,  0.4987],
        [ 0.4206, -0.5843,  0.2878],
        [ 0.0336, -1.4798,  1.1624],
        [ 0.3122, -0.8959,  0.7164],
        [ 0.6891, -0.8604,  0.3924],
        [ 0.2557, -0.9102,  0.8070],
        [ 0.5138, -0.9258,  0.6718],
        [ 0.0212, -1.4142,  1.5384],
        [ 0.3298, -1.1125,  1.1924],
        [ 0.2460, -1.2618,  1.1421],
        [ 0.5625, -0.3585,  0.1567],
        [ 0.5799, -0.8591,  0.3930],
        [-0.0161, -1.1794,  1.7963],
        [ 0.4055, -1.0890,  0.4163],
        [ 0.34

(Epoch 4) TRAIN LOSS:0.7266 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:02<00:03,  3.56it/s]

SequenceClassifierOutput(loss=tensor(0.7112, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 3.4318e-01, -1.2681e+00,  1.0535e+00],
        [ 9.4912e-01, -2.3435e-01,  2.2562e-01],
        [ 3.6251e-02, -1.4111e+00,  1.8726e+00],
        [ 4.3643e-01, -1.1777e+00,  9.0361e-01],
        [-2.1620e-01, -1.1862e+00,  1.4096e+00],
        [ 3.5182e-01, -6.7524e-01,  2.5537e-01],
        [ 2.7286e-01, -8.9624e-01,  1.3774e-02],
        [ 4.0792e-01, -6.1850e-01,  5.6925e-01],
        [ 1.7022e-01, -1.2285e+00,  1.1785e+00],
        [-2.1907e-01, -1.4057e+00,  1.7428e+00],
        [ 6.8988e-01, -5.6789e-01,  4.1506e-03],
        [ 3.4333e-01, -1.1600e+00,  4.2935e-01],
        [ 4.0121e-01, -1.2165e+00,  1.3229e+00],
        [ 2.5035e-01, -9.7853e-01,  6.2613e-01],
        [ 6.4877e-01, -8.8228e-01,  1.0443e+00],
        [ 4.5091e-01, -6.4780e-01,  3.1182e-01],
        [ 6.8464e-01, -6.7580e-01,  3.4522e-01],
        [ 4.4978e-01, -1.1429e+00,  1.1341e+00],
        [ 4.2106e-01

(Epoch 4) TRAIN LOSS:0.7254 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:03<00:03,  3.19it/s]

SequenceClassifierOutput(loss=tensor(0.7518, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3550, -1.3169,  1.2960],
        [ 0.3589, -0.3684,  0.3564],
        [ 0.4935, -0.0748, -0.2816],
        [ 0.5037, -0.0477,  0.1296],
        [-0.2328, -1.0689,  1.2744],
        [ 0.4030, -0.3193, -0.0479],
        [ 0.0282, -1.1163,  1.3809],
        [ 0.3470, -0.6625,  0.8308],
        [ 0.3368, -1.1393,  0.8089],
        [-0.0343, -0.9032,  1.2781],
        [ 0.3397, -1.3557,  1.1005],
        [-0.2850, -1.2452,  1.5318],
        [ 0.5776, -1.2199,  1.3209],
        [-0.3275, -1.2512,  1.6134],
        [-0.1537, -1.2726,  1.4736],
        [ 0.2471, -0.8862,  0.9136],
        [ 0.4550, -1.1328,  1.3884],
        [ 0.0672, -0.8403,  1.5468],
        [-0.0160, -1.0195,  1.9204],
        [ 0.1658, -1.1503,  1.5423],
        [ 0.2531, -0.6753,  0.4903],
        [ 0.4862, -0.8267,  0.9343],
        [ 0.0852, -1.2008,  1.3394],
        [-0.2010, -1.3293,  1.6985],
        [ 0.07

(Epoch 4) TRAIN LOSS:0.7273 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:03<00:03,  2.94it/s]

SequenceClassifierOutput(loss=tensor(0.7574, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6338, -0.6855,  0.6413],
        [ 0.0034, -0.8021,  0.2078],
        [-0.2428, -1.0124,  1.5941],
        [ 0.5374, -0.8041,  0.1674],
        [-0.3384, -0.5256,  0.2419],
        [ 0.1322, -0.2209, -0.1033],
        [ 0.1597, -0.8429,  1.2025],
        [ 0.3245, -0.2076, -0.0420],
        [ 0.2365, -0.8457,  0.6089],
        [-0.0121, -1.1629,  1.1228],
        [ 0.3250, -1.0613,  1.2825],
        [ 0.7790, -0.9116,  0.8153],
        [ 0.3297, -1.1699,  1.1738],
        [-0.3215, -1.1549,  1.6379],
        [ 0.5850, -0.1909,  0.1394],
        [-0.0189, -1.0576,  1.6062],
        [ 0.3215, -1.1918,  1.3153],
        [ 0.2932, -0.9583,  1.0041],
        [ 0.2819, -0.6279,  0.3928],
        [-0.3014, -0.9190,  1.7777],
        [ 0.3191, -0.9660,  1.3031],
        [-0.4315, -0.9616,  1.5420],
        [-0.0847, -1.3611,  1.1779],
        [-0.0890, -1.1606,  1.8147],
        [-0.17

(Epoch 4) TRAIN LOSS:0.7293 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:04<00:03,  2.85it/s]

SequenceClassifierOutput(loss=tensor(0.6509, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.3423e-01, -3.7328e-01, -7.8533e-02],
        [ 5.2999e-01, -1.0097e+00,  6.1272e-01],
        [ 1.6957e-01, -1.3545e+00,  1.0519e+00],
        [-2.6102e-01, -9.8135e-01,  1.6189e+00],
        [-7.7897e-02, -1.4972e+00,  1.4488e+00],
        [ 2.3697e-01, -1.3720e+00,  1.3200e+00],
        [ 5.1143e-01, -1.1193e+00,  3.9979e-01],
        [ 2.5037e-01, -1.0126e+00,  1.5525e+00],
        [ 1.1181e-01, -9.2577e-01,  1.3679e+00],
        [ 4.6430e-01, -1.4526e+00,  1.3805e+00],
        [-2.4900e-01, -1.4018e+00,  1.9429e+00],
        [-2.3798e-02, -1.4510e+00,  1.4176e+00],
        [ 5.1532e-01, -7.2281e-01,  8.1001e-01],
        [ 9.3476e-01, -6.5399e-01,  6.4813e-01],
        [ 5.3891e-02, -1.2810e+00,  1.3129e+00],
        [ 3.8817e-01, -1.0694e+00,  1.3054e+00],
        [ 6.6367e-01, -1.2386e+00,  3.5925e-01],
        [ 1.7038e-01, -1.3472e+00,  1.1302e+00],
        [ 3.8262e-01

(Epoch 4) TRAIN LOSS:0.7244 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:04<00:03,  2.93it/s]

SequenceClassifierOutput(loss=tensor(0.6461, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3836, -0.3934,  0.1405],
        [-0.0160, -1.3372,  1.1541],
        [-0.1292, -1.4292,  1.5028],
        [-0.0024, -0.9587,  1.9425],
        [ 0.1014, -0.8226,  1.3376],
        [-0.0430, -1.2813,  1.5641],
        [ 0.3074, -0.9544,  1.0321],
        [ 0.5031, -1.3083,  1.2955],
        [ 0.6856, -0.7995,  0.3929],
        [ 0.0819, -0.2569,  0.2909],
        [ 0.5873, -1.4244,  0.8057],
        [ 0.2423, -0.6590,  0.0494],
        [-0.2703, -1.0110,  1.4519],
        [ 0.0091, -0.9459,  1.6621],
        [ 0.3592, -1.2315,  1.0829],
        [ 0.3958, -1.3230,  1.2050],
        [ 0.6180, -0.8399,  0.6589],
        [ 0.0389, -1.3485,  1.5521],
        [ 0.1734, -0.4221,  0.3646],
        [-0.2512, -1.1699,  1.8233],
        [ 0.0478, -1.4260,  1.7824],
        [ 0.4570, -0.8216,  0.4559],
        [ 0.6758, -0.7577,  0.6399],
        [ 0.3056, -0.6985,  1.0586],
        [ 0.61

(Epoch 4) TRAIN LOSS:0.7198 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:04<00:02,  2.80it/s]

SequenceClassifierOutput(loss=tensor(0.7641, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0798, -1.1934,  1.8570],
        [ 0.7345, -0.7898,  0.4733],
        [ 0.7448, -0.3910,  0.1947],
        [-0.2986, -0.8808,  1.6270],
        [-0.0574, -0.9073,  0.9596],
        [ 0.8074, -0.5255,  0.1125],
        [ 0.5268, -0.4732, -0.0724],
        [-0.2574, -1.4063,  1.5669],
        [-0.2927, -0.9960,  1.8215],
        [ 0.1569, -1.1134,  0.8518],
        [ 0.3906, -0.9752,  0.9865],
        [ 0.3588, -0.8415,  0.9332],
        [ 0.4078, -0.5537,  0.1767],
        [-0.1268, -0.7097,  1.4310],
        [ 0.6735, -0.8427,  0.5786],
        [-0.2677, -1.2609,  1.8345],
        [-0.2163, -1.0759,  1.7283],
        [ 0.3466, -0.3653,  0.2763],
        [ 0.1967, -0.5086,  0.3346],
        [-0.0873, -1.2044,  1.8682],
        [ 0.2461, -1.1593,  1.4858],
        [ 0.5147, -0.6034,  0.0344],
        [-0.0088, -1.0318,  1.2788],
        [ 0.3575, -0.6972,  0.2426],
        [-0.28

(Epoch 4) TRAIN LOSS:0.7223 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:05<00:02,  2.74it/s]

SequenceClassifierOutput(loss=tensor(0.5851, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3150, -1.0931,  1.6167],
        [ 0.0416, -1.3648,  1.2598],
        [-0.4707, -1.2632,  1.8257],
        [-0.0743, -0.9348,  1.4921],
        [ 0.1969, -0.3631,  0.1457],
        [-0.3733, -1.1584,  1.6272],
        [ 0.3379, -0.2742,  0.0322],
        [ 0.4363, -0.5072, -0.2436],
        [ 0.0093, -1.1843,  1.2935],
        [ 0.0677, -1.1388,  0.7445],
        [ 0.1524, -1.1133,  1.2304],
        [-0.2049, -1.2184,  1.6867],
        [-0.0492, -1.3873,  1.4564],
        [-0.2143, -1.2915,  1.4770],
        [ 0.3948, -0.1525,  0.0988],
        [ 0.7938, -0.3819, -0.1337],
        [ 0.1282, -1.2057,  1.5256],
        [ 0.7294, -0.3302,  0.2190],
        [ 0.6356, -1.0414,  0.8399],
        [-0.2816, -1.0948,  1.7102],
        [ 0.5288, -1.0872,  0.8061],
        [ 0.6261, -0.6740,  0.0027],
        [ 0.1300, -0.8594,  1.4626],
        [ 0.8078, -0.4272,  0.1913],
        [ 0.62

(Epoch 4) TRAIN LOSS:0.7151 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:05<00:02,  2.70it/s]

SequenceClassifierOutput(loss=tensor(0.8706, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4962, -0.8354,  0.3518],
        [ 0.2960, -0.7794,  0.5315],
        [ 0.2111, -1.1271,  1.2009],
        [ 0.0942, -0.8262,  1.5104],
        [ 0.4288, -0.8764,  0.6994],
        [ 0.6119, -0.1546, -0.1021],
        [ 0.0065, -1.1914,  1.5409],
        [ 0.4801, -0.2516,  0.3474],
        [ 0.0883, -0.8429,  1.3113],
        [-0.3687, -1.2351,  1.5470],
        [ 0.8013,  0.0226, -0.2005],
        [-0.1541, -1.1652,  1.5092],
        [ 0.1527, -1.1401,  1.3356],
        [ 0.3016, -0.4228, -0.2117],
        [ 0.0412, -1.2580,  1.2079],
        [ 0.6750, -0.9399,  0.6322],
        [ 0.5416, -0.6767,  0.4642],
        [ 0.2066, -1.1154,  0.9540],
        [ 0.2429, -0.7153,  0.7797],
        [ 0.6047, -0.2909,  0.4561],
        [ 0.0547, -1.3150,  1.4165],
        [-0.0730, -0.9936,  1.3113],
        [-0.4025, -1.1994,  1.8285],
        [ 0.6245, -0.5440,  0.1647],
        [ 0.46

(Epoch 4) TRAIN LOSS:0.7228 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:05<00:01,  2.62it/s]

SequenceClassifierOutput(loss=tensor(0.6847, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0215, -1.0730,  1.4642],
        [ 0.1573, -1.4959,  1.2731],
        [ 0.7236, -0.2153,  0.0888],
        [ 0.2386, -1.1412,  0.8220],
        [-0.1264, -1.2074,  1.3504],
        [ 0.5004, -0.4683,  0.1543],
        [ 0.2373, -1.0234,  1.4164],
        [ 0.5871, -1.0290,  0.5985],
        [ 0.5228, -0.6006,  0.3003],
        [ 0.3880, -0.6995, -0.2787],
        [-0.1419, -1.3066,  2.1303],
        [ 0.1228, -0.5326,  1.2166],
        [-0.1940, -1.2769,  1.5470],
        [ 0.2809, -0.4960,  0.2749],
        [ 0.4148, -0.9436,  0.4169],
        [-0.1022, -1.4748,  1.7218],
        [ 0.6534, -0.0491, -0.1846],
        [ 0.3692, -0.4807, -0.0892],
        [ 0.8099, -0.5777,  0.3708],
        [ 0.0924, -1.0720,  1.2029],
        [-0.0500, -1.0328,  0.9616],
        [ 0.8084, -0.3268,  0.1294],
        [-0.1234, -1.3684,  1.6582],
        [-0.3385, -1.1278,  1.5409],
        [ 0.44

(Epoch 4) TRAIN LOSS:0.7210 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:06<00:01,  2.53it/s]

SequenceClassifierOutput(loss=tensor(0.7315, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0334, -1.0285,  1.3843],
        [ 0.1024, -1.0590,  1.1229],
        [ 0.0690, -0.9374,  1.7186],
        [ 0.2241, -1.2913,  0.9829],
        [ 0.5247,  0.3140,  0.3781],
        [ 0.6283, -0.7269,  0.5191],
        [ 0.4129, -0.5446,  0.2326],
        [ 0.1641, -1.2545,  1.4155],
        [ 0.0205, -1.0207,  1.2108],
        [ 0.3890, -0.7062,  1.3837],
        [ 0.6473,  0.0532, -0.1622],
        [ 0.1396, -1.2750,  1.4528],
        [ 0.4593, -0.8028,  0.3375],
        [ 0.1109, -1.0976,  1.4722],
        [-0.1582, -1.1406,  1.8258],
        [ 0.2551, -0.5576, -0.0406],
        [-0.4412, -1.3185,  1.7424],
        [ 0.2231, -0.5938,  0.0304],
        [-0.0597, -1.1635,  1.3087],
        [ 0.0047, -1.0390,  1.3192],
        [ 0.6384, -0.5731,  0.6465],
        [ 0.0146, -0.8642,  1.4501],
        [ 0.3208, -0.5789, -0.2855],
        [-0.2064, -1.0652,  1.3004],
        [-0.01

(Epoch 4) TRAIN LOSS:0.7215 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:06<00:01,  2.47it/s]

SequenceClassifierOutput(loss=tensor(0.5556, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1080, -1.0916,  1.8350],
        [ 0.2996, -1.0780,  1.0626],
        [ 0.1058, -0.7634,  0.7955],
        [ 0.1828, -0.5139,  0.0348],
        [-0.0806, -1.1656,  1.9409],
        [ 0.0847, -0.2613, -0.0663],
        [-0.0690, -1.2199,  1.6600],
        [-0.0691, -0.7182,  1.7066],
        [-0.1133, -0.9011,  1.6771],
        [ 0.0177, -1.0366,  1.2547],
        [ 0.5547, -0.7040,  0.1483],
        [-0.1564, -1.2711,  1.6864],
        [-0.0927, -1.1219,  1.6405],
        [-0.0238, -1.0715,  1.3498],
        [ 0.1228, -1.0520,  1.7327],
        [ 0.2834, -0.5333, -0.1000],
        [ 0.2457, -0.3911,  0.5048],
        [-0.1651, -0.9811,  1.6111],
        [ 0.2528, -0.5225, -0.2601],
        [ 0.4006, -0.4129, -0.1352],
        [ 0.2118, -0.8728,  1.3197],
        [ 0.4681, -0.9075,  0.7501],
        [-0.2630, -1.0844,  1.6623],
        [-0.1223, -1.1557,  1.8396],
        [-0.05

(Epoch 4) TRAIN LOSS:0.7143 LR:0.00000300:  92%|████████████████████████████████████▊   | 23/25 [00:07<00:00,  2.44it/s]

SequenceClassifierOutput(loss=tensor(1.0038, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4295, -0.7669,  0.4636],
        [ 0.3184, -0.2132,  0.2020],
        [-0.2394, -0.8145,  1.3898],
        [ 0.6995, -0.9385,  0.3232],
        [-0.1619, -1.3522,  1.1542],
        [-0.0503, -0.9626,  1.4674],
        [ 0.6506, -0.9778,  1.3348],
        [ 0.0783, -1.1949,  1.8000],
        [-0.0815, -1.3913,  1.4541],
        [ 0.6868, -0.4508, -0.1618],
        [ 0.2959, -0.9873,  1.7118],
        [ 0.3037, -0.3407,  0.0283],
        [-0.2063, -0.9767,  1.4646],
        [ 0.6556, -0.6663,  0.8125],
        [ 0.2392, -0.8819,  1.5466],
        [ 0.2603, -0.2311,  0.5113],
        [ 0.5458,  0.0151, -0.1268],
        [ 0.3082, -0.2643,  0.2922],
        [ 0.0202, -1.0794,  1.4269],
        [ 0.7395, -0.2567, -0.3208],
        [ 0.6092, -0.7053,  0.3550],
        [ 0.1838, -0.1211, -0.0926],
        [ 0.2953, -1.0745,  0.4061],
        [ 0.0629, -1.2693,  1.6438],
        [ 0.36

(Epoch 4) TRAIN LOSS:0.7263 LR:0.00000300:  96%|██████████████████████████████████████▍ | 24/25 [00:07<00:00,  2.50it/s]

SequenceClassifierOutput(loss=tensor(0.7006, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1905, -1.2308,  1.8517],
        [-0.1417, -1.2052,  1.6734],
        [ 0.6524, -0.9637,  0.4609],
        [ 0.3459, -0.8628,  0.6713],
        [ 0.7333, -1.0952,  0.9200],
        [ 0.0897, -1.2262,  1.4357],
        [ 0.1956, -0.9946,  1.0851],
        [-0.2097, -0.6353,  1.6363],
        [ 0.0048, -0.9879,  0.7143],
        [ 0.4864, -0.5807,  0.3438],
        [ 0.4087, -1.2044,  1.2426],
        [-0.2939, -1.1081,  1.3644],
        [ 0.3372, -1.0577,  0.9683],
        [ 0.4608, -0.5952,  0.0918],
        [ 0.2676, -1.1485,  0.6122],
        [ 0.7666, -0.4928,  0.4119],
        [ 0.9159, -0.8441,  0.1736],
        [ 0.4449, -0.9581,  0.5640],
        [-0.3199, -1.1952,  1.5309],
        [-0.0970, -1.1168,  1.5859],
        [ 0.1165, -0.7213,  0.5646],
        [ 0.3243, -0.9417,  0.4459],
        [ 0.7698, -0.4891,  0.1992],
        [ 0.5121, -0.2213,  0.2837],
        [ 0.40

(Epoch 4) TRAIN LOSS:0.7253 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:08<00:00,  3.09it/s]

(Epoch 4) TRAIN LOSS:0.7253 ACC:0.69 F1:0.46 REC:0.48 PRE:0.44 LR:0.00000300



  _warn_prf(average, modifier, msg_start, len(result))
(Epoch 5) TRAIN LOSS:0.6513 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:08,  2.85it/s]

SequenceClassifierOutput(loss=tensor(0.6513, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0101, -0.9864,  1.6246],
        [-0.1088, -1.1664,  1.3460],
        [-0.0397, -0.8092,  1.4960],
        [-0.2022, -1.3932,  1.5209],
        [ 0.5455, -1.1089,  0.9286],
        [ 0.9567, -0.8082,  0.3522],
        [ 0.4089, -0.9221,  0.9104],
        [ 0.7027, -0.5035, -0.0993],
        [ 0.2394, -0.3643,  0.3834],
        [ 0.2895, -0.2055,  0.0796],
        [-0.0074, -1.0868,  1.0989],
        [-0.4057, -1.1211,  2.0676],
        [ 0.3430, -0.4494, -0.3771],
        [ 0.2449, -0.2450,  0.2094],
        [ 0.6159, -0.2953,  0.1747],
        [-0.1698, -1.3589,  1.7035],
        [-0.2208, -0.9525,  1.6567],
        [ 0.2702,  0.1958,  0.1978],
        [-0.0340, -1.3002,  0.9963],
        [ 0.4554, -0.9954,  0.7062],
        [ 0.4922, -0.4217,  0.0803],
        [ 0.3120, -0.6517,  0.0085],
        [ 0.4605, -1.1145,  0.8948],
        [ 0.7981, -0.7156,  0.3125],
        [ 0.76

(Epoch 5) TRAIN LOSS:0.6450 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:06,  3.74it/s]

SequenceClassifierOutput(loss=tensor(0.6387, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3459, -1.0737,  0.9916],
        [ 0.3367, -1.0120,  0.9850],
        [-0.2699, -1.0235,  1.7723],
        [ 0.2278, -0.0958, -0.4814],
        [ 0.1415, -0.0349,  0.1620],
        [ 0.6685, -0.8216,  0.2404],
        [ 0.0654, -1.2744,  1.5321],
        [ 0.0595, -1.4183,  1.5483],
        [-0.1080, -1.3285,  1.8493],
        [-0.0293, -1.0737,  1.8870],
        [ 0.6082, -0.7396,  0.1665],
        [ 0.7597, -0.4600,  0.3905],
        [ 0.6155, -0.5384,  0.1439],
        [-0.0866, -1.1346,  1.6549],
        [ 0.6185, -0.2835,  0.5222],
        [ 0.7692, -0.5057,  0.4077],
        [ 0.2506, -1.2815,  0.9318],
        [ 0.4557, -0.7635,  0.6927],
        [ 0.2289, -1.0516,  0.9231],
        [ 0.7538, -0.7022,  0.9297],
        [ 0.5699, -0.8668,  0.1182],
        [ 0.0110, -1.3976,  1.8192],
        [-0.3633, -1.1761,  1.7028],
        [-0.3766, -1.0424,  2.0576],
        [ 0.35

(Epoch 5) TRAIN LOSS:0.6602 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:05,  4.05it/s]

SequenceClassifierOutput(loss=tensor(0.6907, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 6.6414e-01, -4.7640e-01,  2.9191e-01],
        [-2.7349e-01, -7.9254e-01,  1.6700e+00],
        [ 5.1298e-01, -4.7145e-01,  3.4507e-01],
        [-2.6787e-01, -8.9955e-01,  1.6113e+00],
        [-3.9639e-01, -1.3628e+00,  1.6642e+00],
        [ 5.8107e-01, -2.8990e-01, -2.4836e-01],
        [ 5.6703e-01, -9.2329e-01,  1.1871e+00],
        [-2.8562e-01, -1.1165e+00,  1.3077e+00],
        [ 4.9602e-01,  6.1972e-02, -3.3076e-01],
        [ 4.8454e-01, -3.6876e-01,  5.5666e-01],
        [-3.8669e-01, -1.1305e+00,  1.8368e+00],
        [ 6.0524e-02, -8.1861e-01,  8.7765e-01],
        [-5.3309e-02, -7.7185e-01,  8.6050e-01],
        [ 5.9495e-01, -6.0788e-01, -9.2919e-02],
        [ 2.3160e-01, -1.2391e+00,  1.0836e+00],
        [ 3.3435e-01, -1.2905e+00,  1.4639e+00],
        [ 2.8297e-01, -4.3687e-01, -5.8841e-02],
        [ 4.4371e-01, -2.3887e-01,  1.7630e-01],
        [ 5.3122e-01

(Epoch 5) TRAIN LOSS:0.6872 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:01<00:05,  4.19it/s]

SequenceClassifierOutput(loss=tensor(0.7683, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2581, -1.2516,  1.6913],
        [ 0.2907, -0.4044, -0.2142],
        [-0.1217, -1.0416,  1.6095],
        [ 0.0034, -0.9685,  2.0715],
        [ 0.3176, -0.5801,  1.0574],
        [ 0.0042, -0.8101,  0.5233],
        [ 0.4484, -0.4689,  0.2882],
        [ 0.7047, -0.2959, -0.2166],
        [-0.4183, -1.0968,  1.5045],
        [ 0.3622, -0.2270, -0.1847],
        [ 0.0985, -1.1945,  1.3890],
        [ 0.2585, -0.6460,  0.5801],
        [-0.2276, -1.1903,  1.5275],
        [-0.0417, -0.9064,  1.1564],
        [ 0.6702, -0.2827, -0.2786],
        [ 0.3223, -0.6420,  0.2914],
        [ 0.0647, -1.2124,  1.7221],
        [ 0.2808, -0.7689,  0.3866],
        [ 0.1680, -0.6339,  0.9192],
        [ 0.2438, -1.3911,  1.7270],
        [-0.1731, -1.0682,  1.9048],
        [ 0.6366, -0.2345,  0.0030],
        [ 0.8122, -0.5898,  0.3331],
        [-0.1093, -1.3403,  1.4698],
        [-0.26

(Epoch 5) TRAIN LOSS:0.6824 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:04,  4.43it/s]

SequenceClassifierOutput(loss=tensor(0.6628, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3324, -0.9308,  0.5169],
        [ 0.2891, -0.1788, -0.1752],
        [ 0.2295, -0.9618,  0.9322],
        [ 0.5859, -0.4227,  0.3942],
        [ 0.1199, -1.1756,  1.1340],
        [ 0.0414, -1.4093,  1.2868],
        [ 0.8038, -0.3953,  0.1398],
        [ 0.6706, -0.2151, -0.0636],
        [-0.1504, -1.0513,  1.2270],
        [ 0.2695, -0.9020,  0.6743],
        [ 0.1880, -0.6784,  0.8875],
        [ 0.3845, -0.4871,  0.7307],
        [-0.0933, -1.1795,  1.2052],
        [ 0.2439, -0.4148,  0.0238],
        [ 0.8376, -0.0469, -0.2761],
        [ 0.0750, -1.2717,  1.6682],
        [ 0.6881, -1.0085,  0.3062],
        [ 0.1457, -1.0037,  1.5321],
        [ 0.7106, -0.1695, -0.3201],
        [ 0.1703, -1.1561,  0.9661],
        [-0.1326, -1.2100,  1.3744],
        [ 0.5260, -0.7561,  0.4167],
        [ 0.5207, -0.4306,  0.0724],
        [-0.0704, -1.0053,  1.1326],
        [-0.19

(Epoch 5) TRAIN LOSS:0.6942 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:01<00:04,  4.52it/s]

SequenceClassifierOutput(loss=tensor(0.7533, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1967, -1.2528,  1.4044],
        [ 0.7091, -0.2866,  0.2068],
        [ 0.5111, -0.6570,  0.1456],
        [ 0.6699, -0.7121,  0.6106],
        [-0.2432, -1.2545,  1.8309],
        [ 0.0486, -0.7308,  1.0057],
        [ 0.0470, -0.1006,  0.1453],
        [-0.0272, -1.1730,  0.8832],
        [-0.1113, -0.6886,  1.3072],
        [-0.3486, -1.1991,  1.6455],
        [ 0.0717, -0.8272,  0.8549],
        [ 0.6292, -1.0366,  0.8388],
        [ 0.2552, -0.2931,  0.3683],
        [ 0.0952, -0.8442,  0.8238],
        [ 0.2242, -0.8966,  0.7098],
        [-0.3089, -1.3694,  1.8429],
        [-0.1756, -1.3763,  2.1971],
        [-0.0587, -0.9494,  1.5253],
        [ 0.3418, -0.2950, -0.1600],
        [ 0.1212, -0.9497,  1.0223],
        [ 0.2395, -0.3748,  0.0253],
        [ 0.3492, -0.9871,  1.2870],
        [-0.3270, -1.0305,  1.3608],
        [ 0.4639, -0.5369,  0.1067],
        [ 0.49

(Epoch 5) TRAIN LOSS:0.7137 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:01<00:03,  4.56it/s]

SequenceClassifierOutput(loss=tensor(0.8310, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 6.5068e-01, -3.7986e-02, -2.9447e-01],
        [ 4.0290e-01, -9.3403e-01,  8.3934e-01],
        [-2.0391e-01, -1.2847e+00,  1.4483e+00],
        [ 5.5031e-01, -2.4548e-01, -1.5128e-01],
        [ 5.0874e-01, -3.9062e-01,  3.1302e-01],
        [ 6.1704e-01, -7.7745e-01,  4.8155e-01],
        [-3.0281e-01, -1.2404e+00,  1.3591e+00],
        [ 6.6372e-01, -9.1527e-01,  6.7497e-01],
        [-2.6320e-02, -9.8759e-01,  1.2780e+00],
        [ 1.1804e-02, -1.3189e+00,  2.0983e+00],
        [-3.6611e-01, -1.2236e+00,  1.5503e+00],
        [-1.9666e-01, -9.4290e-01,  1.1674e+00],
        [ 7.3513e-01, -1.3695e-01, -2.9061e-01],
        [-9.5382e-02, -1.1405e+00,  1.7698e+00],
        [ 6.0730e-03, -7.3825e-01,  1.3423e+00],
        [-9.1043e-02, -1.2621e+00,  1.1160e+00],
        [ 5.1843e-01, -1.0419e+00,  2.3974e-01],
        [-1.4766e-03, -1.1673e+00,  1.6724e+00],
        [ 4.2264e-01

(Epoch 5) TRAIN LOSS:0.7177 LR:0.00000300:  32%|█████████████                            | 8/25 [00:01<00:03,  4.71it/s]

SequenceClassifierOutput(loss=tensor(0.7453, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2827, -1.1634,  1.5620],
        [-0.1180, -0.2705,  0.0505],
        [ 0.5472, -0.2051, -0.1196],
        [-0.1422, -1.2603,  1.3996],
        [-0.3537, -1.2664,  1.3411],
        [-0.1501, -1.0021,  1.7503],
        [ 0.1218, -1.1410,  1.4544],
        [ 0.5139, -0.8241,  1.3208],
        [ 0.1844, -1.1901,  1.5351],
        [-0.0818, -1.1076,  1.2715],
        [-0.1777, -1.1278,  1.2059],
        [-0.0161, -1.0121,  1.3318],
        [ 0.7166, -0.7368,  0.6357],
        [-0.1536, -0.8888,  1.2675],
        [ 0.2295, -1.1830,  0.9737],
        [ 0.0982, -0.0157,  0.1060],
        [-0.0517, -1.0791,  1.8419],
        [ 0.0199, -1.4932,  1.5390],
        [ 0.8252, -1.1708,  0.1785],
        [ 0.1439, -1.1896,  1.0038],
        [ 0.5948, -0.4266, -0.1573],
        [ 0.7002, -0.3414, -0.0991],
        [ 0.0606, -1.2820,  1.6930],
        [ 0.4870, -0.8811,  0.7606],
        [ 0.43

(Epoch 5) TRAIN LOSS:0.7088 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:02<00:03,  4.53it/s]

SequenceClassifierOutput(loss=tensor(0.6382, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1611, -1.1930,  1.8894],
        [-0.2599, -1.2590,  1.9653],
        [ 0.0415, -0.8467,  1.1312],
        [ 0.5059, -0.6318,  0.2219],
        [-0.2729, -1.0932,  1.6063],
        [-0.0711, -0.9826,  1.6978],
        [-0.2081, -1.4007,  1.5997],
        [ 0.3994, -1.0382,  1.1569],
        [-0.1322, -0.9381,  1.3256],
        [-0.3956, -0.9586,  1.7567],
        [ 0.3082, -0.2409,  0.0724],
        [-0.3077, -0.8368,  1.5812],
        [-0.3369, -0.8410,  1.4701],
        [-0.1235, -1.0617,  1.4430],
        [-0.3324, -0.8999,  1.5968],
        [-0.1217, -1.4214,  1.6743],
        [ 0.8366, -0.0439, -0.2019],
        [ 0.0078, -1.6075,  1.3901],
        [-0.3002, -1.0853,  1.8388],
        [-0.3605, -1.0586,  1.6686],
        [-0.1373, -1.1936,  1.8373],
        [ 0.7037, -0.6299, -0.3708],
        [ 0.2882, -1.2145,  1.0032],
        [ 0.7166, -0.3829, -0.2601],
        [ 0.50

(Epoch 5) TRAIN LOSS:0.7093 LR:0.00000300:  40%|████████████████                        | 10/25 [00:02<00:03,  4.61it/s]

SequenceClassifierOutput(loss=tensor(0.7139, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5425, -0.1956, -0.2919],
        [-0.3665, -0.9458,  1.7041],
        [ 0.6050,  0.0149, -0.7059],
        [-0.5540, -1.4822,  1.6830],
        [ 0.6203, -0.9566,  0.5028],
        [ 0.4292, -1.0058,  1.2484],
        [ 0.6382, -0.8686,  0.2827],
        [ 0.5547,  0.1756, -0.5377],
        [-0.2023, -1.0928,  1.6585],
        [ 0.3612, -0.1582, -0.0927],
        [-0.0175, -1.0404,  1.0504],
        [ 0.3909, -0.8511,  0.3240],
        [ 0.2386, -0.2644,  0.4688],
        [ 0.4006, -0.4255, -0.0527],
        [-0.1786, -1.2227,  1.4449],
        [-0.1089, -1.1432,  1.5419],
        [ 0.5865, -0.2927,  0.1582],
        [ 0.8301, -0.9015,  0.5734],
        [-0.1389, -1.2431,  1.6645],
        [-0.0666, -1.0692,  1.6130],
        [ 0.4331, -0.2558,  0.0375],
        [ 0.6200, -0.5451,  0.5759],
        [ 0.2331, -1.0361,  1.0690],
        [ 0.4481, -0.6600,  0.9127],
        [-0.21

(Epoch 5) TRAIN LOSS:0.7194 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:02<00:02,  4.74it/s]

SequenceClassifierOutput(loss=tensor(0.8235, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0215, -0.2271,  0.1305],
        [ 0.8368, -0.8229,  0.4525],
        [ 0.6592, -1.1413,  0.9946],
        [ 0.7391, -0.8029,  0.1085],
        [ 0.2356, -0.7705,  1.1570],
        [ 0.9801, -0.1404, -0.4258],
        [ 0.3225, -0.8570,  1.1869],
        [ 0.7043, -0.6097,  0.1710],
        [-0.0645, -1.0367,  1.7047],
        [ 0.3906, -0.5324,  0.6189],
        [ 0.4279, -0.4775, -0.4228],
        [ 0.3248, -1.0195,  1.3731],
        [-0.0233, -0.7214,  1.1748],
        [ 0.4633, -0.8483,  0.5445],
        [ 0.9270, -0.0835, -0.3975],
        [ 0.2080, -0.9118,  0.9894],
        [ 0.4417, -0.0662, -0.3392],
        [ 0.7745, -0.3597,  0.1648],
        [ 0.5483, -0.1293, -0.3124],
        [ 0.0716, -0.8372,  0.7472],
        [ 0.4894, -0.1759, -0.1185],
        [ 0.5107, -1.0718,  0.4686],
        [ 0.0285, -0.2129,  0.1251],
        [ 0.0951, -1.1820,  1.7637],
        [-0.45

(Epoch 5) TRAIN LOSS:0.7110 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:02<00:02,  4.73it/s]

SequenceClassifierOutput(loss=tensor(0.6096, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1255, -0.7832,  1.3425],
        [ 0.4418, -0.0812, -0.1820],
        [-0.1613, -1.0566,  1.3463],
        [-0.1679, -0.5563,  1.5229],
        [ 0.1094, -0.7159, -0.2676],
        [ 0.3737, -0.8544,  1.3126],
        [ 0.6497, -1.0246,  0.4056],
        [ 0.4089, -0.7669,  1.0194],
        [-0.4469, -1.2135,  1.4740],
        [ 0.8821, -0.7086,  0.5219],
        [-0.3046, -1.0923,  2.0446],
        [ 0.0934, -1.4322,  1.0928],
        [ 0.2984, -0.5058,  0.8624],
        [ 0.5569, -1.1349,  1.1215],
        [-0.1451, -1.0900,  1.7592],
        [ 0.6045, -0.1774,  0.0077],
        [ 0.7357, -0.9442,  0.1831],
        [ 0.5774, -0.0083, -0.9037],
        [ 0.4279, -1.2124,  1.6972],
        [ 0.8055, -0.7345,  0.2180],
        [-0.0146, -1.2596,  1.3033],
        [-0.4584, -1.2316,  1.6655],
        [ 0.8107, -0.3195,  0.0313],
        [ 0.4042, -0.7706,  0.8299],
        [ 0.11

(Epoch 5) TRAIN LOSS:0.7047 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:03<00:02,  4.60it/s]

SequenceClassifierOutput(loss=tensor(0.6231, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5883, -0.7130,  0.1527],
        [ 0.8695,  0.1222, -0.2017],
        [-0.0907, -1.0983,  1.5159],
        [ 0.2956, -0.5248,  0.2368],
        [ 0.2148, -0.8889,  0.9334],
        [-0.3594, -1.2718,  1.7862],
        [-0.1323, -1.2912,  1.3896],
        [-0.1607, -1.0608,  1.2541],
        [ 0.0273, -0.9201,  1.3768],
        [ 0.1475, -0.7232,  0.5735],
        [ 0.1100, -0.9035,  1.0520],
        [ 0.4029, -0.5344,  0.1098],
        [-0.1913, -1.0680,  1.8670],
        [ 0.2975, -0.2159,  0.5594],
        [ 0.6611, -0.7204, -0.1076],
        [-0.3146, -0.8979,  1.6654],
        [ 0.4140, -0.1224, -0.2373],
        [ 0.6222, -0.6566,  0.5936],
        [-0.1762, -1.1613,  1.2647],
        [ 0.8997, -0.4322, -0.4810],
        [ 0.6386, -0.9355,  0.5211],
        [ 0.1959, -0.3147, -0.1787],
        [-0.1630, -1.1093,  1.7886],
        [ 0.2483, -0.4683,  0.7841],
        [-0.64

(Epoch 5) TRAIN LOSS:0.7053 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:03<00:02,  4.51it/s]

SequenceClassifierOutput(loss=tensor(0.7134, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1121, -0.9362,  1.5652],
        [ 0.4072, -0.7634,  0.5396],
        [-0.1877, -1.0070,  1.9314],
        [ 0.5370, -0.3631, -0.0150],
        [ 0.5086, -0.5756,  0.4046],
        [ 0.5789, -0.4125, -0.1890],
        [-0.4159, -1.2410,  1.3976],
        [ 0.0817, -0.7564,  0.6056],
        [ 0.5756, -0.4473, -0.1112],
        [-0.0416, -0.9399,  1.3795],
        [-0.1665, -1.2450,  1.7502],
        [-0.1392, -1.0628,  1.3383],
        [-0.2244, -1.1698,  1.6334],
        [ 0.1378, -1.1993,  1.1144],
        [-0.0345, -1.3863,  1.4856],
        [ 0.2024,  0.0531, -0.1986],
        [-0.0297, -0.8674,  1.3631],
        [ 0.8210, -0.3635, -0.4749],
        [-0.1878, -1.0711,  1.1238],
        [ 0.4865, -0.5125,  0.1374],
        [ 0.8427, -0.0191, -0.2010],
        [ 0.6178,  0.0661,  0.1721],
        [ 0.7640, -0.8924, -0.0171],
        [-0.0354, -0.9743,  1.7557],
        [ 0.36

(Epoch 5) TRAIN LOSS:0.7061 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:03<00:01,  4.51it/s]

SequenceClassifierOutput(loss=tensor(0.7179, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-8.7048e-02, -1.5496e+00,  2.0972e+00],
        [-6.0157e-01, -1.0183e+00,  1.4887e+00],
        [ 9.4696e-02, -1.0884e+00,  1.5585e+00],
        [-3.4642e-01, -9.1212e-01,  1.6333e+00],
        [ 1.3100e-03, -1.0699e+00,  1.7101e+00],
        [ 8.2065e-02,  2.2976e-01, -1.1633e-01],
        [-9.2244e-02, -1.2002e+00,  1.4756e+00],
        [ 5.2851e-01, -4.4018e-01, -1.5876e-01],
        [ 5.3215e-01, -1.0918e+00,  3.7733e-01],
        [ 3.2903e-01, -5.5519e-01,  5.0291e-01],
        [ 5.1802e-02, -1.0884e+00,  1.3003e+00],
        [-3.5236e-02, -2.2078e-01,  1.2318e-01],
        [ 7.0760e-01, -6.9414e-01,  5.3443e-01],
        [ 6.9951e-01,  5.0605e-02, -3.4593e-01],
        [ 4.4673e-01, -6.6468e-01,  3.8862e-01],
        [ 4.7989e-01, -5.7586e-01, -1.1742e-01],
        [-2.2323e-02, -1.1097e+00,  1.0609e+00],
        [ 1.7699e-01,  3.5332e-01, -3.6998e-01],
        [ 6.1757e-02

(Epoch 5) TRAIN LOSS:0.7022 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:03<00:01,  4.51it/s]

SequenceClassifierOutput(loss=tensor(0.6403, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3136,  0.3165, -0.8201],
        [ 0.1213, -1.1345,  1.3311],
        [ 0.2575, -0.9132,  1.6222],
        [ 0.3163, -0.8281,  0.8779],
        [ 0.0177, -0.9861,  1.5660],
        [ 0.6763, -0.0854, -0.1836],
        [ 0.2719, -1.0389,  1.3791],
        [-0.1893, -0.8953,  1.5602],
        [ 0.7075, -0.6515, -0.4232],
        [-0.3707, -1.2903,  1.6929],
        [ 0.4034, -0.8317,  0.7484],
        [-0.0905, -1.1286,  1.2289],
        [ 0.4560, -0.8290,  0.5463],
        [ 0.3225, -0.7750,  0.4795],
        [-0.0538, -1.0230,  1.6001],
        [-0.3676, -1.1593,  1.8974],
        [ 0.3352, -1.0016,  1.0541],
        [-0.1205, -1.2739,  1.8149],
        [-0.1811, -1.3900,  1.7371],
        [ 0.6465, -0.3507,  0.4003],
        [ 0.2552,  0.0329, -0.7708],
        [ 0.3602, -0.3083, -0.5677],
        [ 0.2461, -0.4775, -0.1039],
        [ 0.1660, -0.7561,  0.7332],
        [-0.16

(Epoch 5) TRAIN LOSS:0.6899 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:04<00:01,  4.65it/s]

SequenceClassifierOutput(loss=tensor(0.4806, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1320, -1.2126,  1.2200],
        [ 0.4355, -0.8470,  0.2435],
        [ 0.6955, -0.2109,  0.0490],
        [-0.2919, -1.1093,  1.8863],
        [-0.0420, -0.8844,  1.6790],
        [ 0.0255, -0.9171,  1.6877],
        [ 0.3836, -0.7301,  0.7164],
        [-0.1738, -1.2018,  1.5622],
        [-0.2379, -1.3043,  1.4178],
        [ 0.0070, -1.3395,  1.1758],
        [ 0.7400,  0.0503, -0.2699],
        [ 0.5365,  0.0358, -0.4010],
        [ 0.5570, -0.3264, -0.4041],
        [ 0.3815, -1.0159,  0.7348],
        [-0.1714, -1.2724,  1.7410],
        [ 0.2640, -1.0432,  1.2485],
        [ 0.0331, -1.3577,  1.2490],
        [ 0.0979, -1.3416,  1.8675],
        [-0.2347, -1.0286,  1.4904],
        [-0.3387, -1.3073,  1.6138],
        [-0.1748, -1.1448,  1.6443],
        [ 0.1042, -1.1412,  1.6323],
        [ 0.6747, -0.8278,  0.6198],
        [-0.3312, -1.1831,  1.9965],
        [-0.56

(Epoch 5) TRAIN LOSS:0.6969 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:04<00:01,  4.86it/s]

SequenceClassifierOutput(loss=tensor(0.7676, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6422, -0.5410, -0.2431],
        [ 0.0299, -1.1330,  1.7695],
        [ 0.3332, -1.4232,  1.3995],
        [ 0.6044, -0.3569,  0.0528],
        [-0.2157, -1.1849,  1.9834],
        [ 0.4933, -0.2327, -0.3402],
        [-0.1293, -1.0805,  1.2163],
        [ 0.8390, -0.2084, -0.4730],
        [-0.0477, -1.4168,  1.9623],
        [-0.2285, -1.1614,  1.8540],
        [ 0.3849, -0.3701, -0.1321],
        [ 0.2059, -1.0583,  1.4413],
        [ 0.3959, -0.7564,  0.6337],
        [ 0.4631, -1.1627,  1.2337],
        [-0.2023, -1.0762,  1.5220],
        [ 0.4064, -0.9572,  0.7247],
        [ 0.6117,  0.0689,  0.1423],
        [ 0.4874, -0.4372, -0.2469],
        [ 0.2465, -0.6836,  0.9922],
        [-0.2032, -1.0714,  1.6331],
        [ 0.1351, -1.0435,  1.1303],
        [ 0.5291, -0.6169,  0.1830],
        [ 0.3047, -0.1814, -0.3164],
        [ 0.1233, -0.9083,  1.2395],
        [-0.15

(Epoch 5) TRAIN LOSS:0.6909 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:04<00:00,  4.90it/s]

SequenceClassifierOutput(loss=tensor(0.5706, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 7.3992e-01, -3.6262e-02, -7.3157e-02],
        [ 4.5136e-01, -9.2948e-01,  8.7793e-01],
        [-3.0402e-01, -1.0155e+00,  1.6400e+00],
        [ 1.9110e-01, -1.0245e+00,  1.2967e+00],
        [ 2.7760e-01, -9.5683e-01,  7.7680e-01],
        [-4.7703e-03, -1.2715e+00,  1.5438e+00],
        [ 2.5637e-01, -3.8309e-01, -2.5242e-01],
        [ 1.0498e-01, -6.4457e-01, -2.6528e-01],
        [-2.4621e-01, -1.1939e+00,  1.6637e+00],
        [-2.9291e-01, -1.3012e+00,  1.7877e+00],
        [ 1.9014e-01, -1.3848e+00,  1.2418e+00],
        [-1.9416e-01, -1.0779e+00,  1.6887e+00],
        [-3.3502e-01, -1.2359e+00,  1.7591e+00],
        [ 1.0240e-01, -1.3203e+00,  1.5451e+00],
        [-1.8993e-01, -1.2532e+00,  1.8346e+00],
        [-5.7140e-02, -1.2243e+00,  1.2412e+00],
        [ 6.3409e-01, -7.6894e-01,  4.3400e-01],
        [-2.5845e-01, -9.3670e-01,  1.4218e+00],
        [ 4.2776e-01

(Epoch 5) TRAIN LOSS:0.6961 LR:0.00000300:  92%|████████████████████████████████████▊   | 23/25 [00:04<00:00,  4.94it/s]

SequenceClassifierOutput(loss=tensor(0.7149, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3705, -0.7395,  0.9060],
        [ 0.1752, -1.3011,  1.8233],
        [ 0.6027, -0.3214, -0.3854],
        [ 0.7757, -1.0589,  0.4501],
        [ 0.3025, -1.0776,  1.2982],
        [ 0.6679, -0.5196, -0.0349],
        [-0.2727, -1.5757,  1.7969],
        [ 0.2617, -0.5957,  0.5812],
        [-0.0656, -1.0296,  1.2018],
        [ 0.2192, -0.9760,  1.1304],
        [-0.4212, -1.2497,  2.0978],
        [ 0.3402, -0.5381,  0.0622],
        [ 0.0617, -0.8809,  0.8191],
        [ 0.8251, -0.9285,  0.5130],
        [-0.2348, -1.0439,  1.5443],
        [ 0.3117, -0.4883,  0.2169],
        [ 0.7127, -1.0201,  1.0518],
        [ 0.2936,  0.3985, -0.2032],
        [ 0.5331, -0.3411, -0.2431],
        [ 0.8106, -1.0332,  0.1251],
        [ 0.2444, -0.1517,  0.1154],
        [ 0.1243, -0.9959,  1.4072],
        [-0.2511, -1.4271,  2.0147],
        [ 0.8650, -0.6761,  0.2681],
        [ 0.70

(Epoch 5) TRAIN LOSS:0.6942 LR:0.00000300:  96%|██████████████████████████████████████▍ | 24/25 [00:05<00:00,  4.84it/s]

SequenceClassifierOutput(loss=tensor(0.6512, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5264, -1.0094,  0.7444],
        [ 0.5530, -0.3874,  0.0897],
        [ 0.5461, -0.8554,  0.7315],
        [-0.2439, -1.3071,  1.9595],
        [ 0.0848, -0.7749,  0.9836],
        [ 0.3561, -0.0305,  0.2817],
        [ 0.0186, -1.1009,  1.3180],
        [ 0.3855, -0.4347, -0.0473],
        [ 0.1150, -1.4856,  1.8044],
        [-0.0437, -1.0876,  1.2426],
        [ 0.6846, -1.0684,  0.3415],
        [ 0.4171, -0.1747, -0.2404],
        [ 0.5863, -0.6076,  0.0751],
        [ 0.8394, -0.5086,  0.4466],
        [ 0.3077, -0.2936,  0.0546],
        [ 0.3517, -0.7136,  0.4525],
        [ 0.5248, -1.1433,  1.1586],
        [ 0.7698, -0.0398, -0.0626],
        [ 0.0930, -1.2659,  0.6536],
        [-0.1416, -1.2394,  1.6363],
        [-0.3628, -1.0428,  1.5007],
        [ 0.0104, -1.0264,  1.6490],
        [-0.4160, -1.2829,  1.8450],
        [ 0.2648, -1.1974,  1.6996],
        [ 0.06

(Epoch 5) TRAIN LOSS:0.6881 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:05<00:00,  4.74it/s]

SequenceClassifierOutput(loss=tensor(0.5406, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2170, -1.1530,  1.9000],
        [ 0.0954, -0.8046,  1.1812],
        [ 0.1784, -1.2949,  1.2397],
        [ 0.0431, -1.1374,  1.3283],
        [-0.1534, -1.1166,  2.1292],
        [-0.6912, -1.0416,  1.7776],
        [ 0.5323, -0.6962,  0.7629],
        [ 0.5236, -0.3880, -0.0959],
        [ 0.0448, -1.0597,  1.0400],
        [ 0.1562, -1.1492,  1.0673],
        [ 0.8386, -0.8939,  0.1256],
        [ 0.0194, -0.8154,  1.8549],
        [ 0.5095, -1.0683,  1.0492],
        [-0.1420, -0.9095,  1.6495],
        [ 0.8429, -0.0089, -0.4466],
        [ 0.5040, -0.9539,  0.6725],
        [-0.4825, -1.2548,  1.8330],
        [-0.0746, -1.6333,  1.6540],
        [-0.1147, -1.2387,  1.2672],
        [ 0.3993, -1.1763,  0.6167],
        [-0.1816, -1.3164,  1.5404],
        [ 0.6250, -0.5635, -0.1082],
        [-0.1653, -1.3311,  1.7848],
        [ 0.7658, -0.2460, -0.3721],
        [ 0.51

(Epoch 5) TRAIN LOSS:0.6881 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:05<00:00,  4.50it/s]

(Epoch 5) TRAIN LOSS:0.6881 ACC:0.72 F1:0.49 REC:0.51 PRE:0.54 LR:0.00000300



(Epoch 6) TRAIN LOSS:0.5868 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:07,  3.29it/s]

SequenceClassifierOutput(loss=tensor(0.5868, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4451, -1.0859,  1.0066],
        [-0.4454, -0.8982,  1.3909],
        [ 0.0238, -0.9891,  1.1365],
        [ 0.0854, -0.7338,  0.9916],
        [-0.3723, -1.2230,  1.9401],
        [ 0.0265, -1.2666,  1.4238],
        [ 0.1209, -1.1068,  1.0059],
        [-0.0565, -1.3336,  1.4652],
        [ 0.3510, -0.9986,  1.0590],
        [ 0.6444, -0.7508, -0.0919],
        [ 0.4792, -1.1669,  0.4861],
        [-0.0196, -0.9049,  1.6929],
        [ 0.0179, -0.6637,  0.5913],
        [-0.4247, -1.3947,  1.9032],
        [-0.1595, -1.1886,  1.5945],
        [ 0.4936, -0.0966, -0.1095],
        [ 0.3960, -0.1163, -0.6804],
        [ 0.3570, -1.2962,  1.5200],
        [-0.0457, -1.0099,  1.5209],
        [-0.1610, -1.1648,  1.7706],
        [-0.2903, -1.1569,  1.9441],
        [ 0.1413, -1.2076,  1.0546],
        [-0.0519, -1.3632,  1.5013],
        [ 0.2515, -0.8586,  1.1986],
        [ 0.33

(Epoch 6) TRAIN LOSS:0.5846 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:05,  4.02it/s]

SequenceClassifierOutput(loss=tensor(0.5824, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2933, -1.1196,  1.0102],
        [ 0.5575, -0.3773, -0.1442],
        [ 0.1287, -1.0556,  1.6321],
        [ 0.4022, -0.9798,  0.7505],
        [-0.1420, -1.1294,  1.4202],
        [ 0.0535, -1.1848,  1.5654],
        [-0.1196, -1.0587,  1.4499],
        [ 0.3392, -0.3330,  0.1786],
        [-0.0121,  0.1280, -0.2179],
        [-0.1944, -1.4873,  1.7336],
        [-0.1004, -1.3240,  1.6289],
        [ 0.6739, -0.2487, -0.2150],
        [-0.0735, -1.3958,  1.5943],
        [-0.1560, -1.4117,  1.6556],
        [-0.0171, -1.0502,  1.6809],
        [ 0.7144,  0.1698, -0.4002],
        [ 0.0472, -1.0417,  1.3969],
        [-0.1587, -1.0869,  1.8105],
        [ 0.8944, -0.4952,  0.5366],
        [ 0.0664, -1.1198,  1.7043],
        [ 0.4277,  0.0482, -0.0690],
        [ 0.2727, -1.1161,  1.4380],
        [-0.0127, -0.5049,  0.0526],
        [ 0.2487, -1.0428,  1.3694],
        [-0.19

(Epoch 6) TRAIN LOSS:0.5943 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:05,  4.24it/s]

SequenceClassifierOutput(loss=tensor(0.6138, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3725, -0.4818,  0.4867],
        [ 0.4622, -0.9213,  0.9415],
        [ 0.8626, -0.2767,  0.1769],
        [-0.1001, -1.3165,  1.9680],
        [-0.0276, -0.9504,  1.3704],
        [-0.2564, -0.9997,  2.0077],
        [ 0.5122, -0.2771, -0.0243],
        [-0.3454, -1.1734,  2.0278],
        [-0.0801, -0.4158,  0.4189],
        [ 0.5026, -0.2212,  0.0112],
        [ 0.7434, -0.4185, -0.2311],
        [ 0.6297, -0.5552, -0.0445],
        [ 0.6858, -1.1552,  1.1731],
        [ 0.2125, -0.1870, -0.3772],
        [ 0.4853, -1.1626,  0.7811],
        [-0.4401, -1.1843,  1.8392],
        [-0.4277, -1.3093,  1.8190],
        [ 0.0220, -0.8849,  1.6949],
        [-0.2184, -1.2806,  1.6979],
        [-0.3309, -1.3740,  1.7096],
        [ 0.4253, -1.2394,  1.4995],
        [ 0.0196, -1.4261,  1.2815],
        [-0.3162, -1.1914,  1.6408],
        [-0.1858, -1.0859,  1.6581],
        [-0.46

(Epoch 6) TRAIN LOSS:0.6107 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:00<00:04,  4.44it/s]

SequenceClassifierOutput(loss=tensor(0.6598, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5490, -0.9814,  0.2717],
        [ 0.1240, -1.1164,  1.3781],
        [ 0.8452, -0.7705,  0.3696],
        [ 0.9283, -0.2235, -0.2003],
        [ 0.5808, -0.4860,  0.6015],
        [ 0.5804, -0.4356,  0.2807],
        [ 0.5736, -0.1997, -0.0155],
        [-0.3347, -1.4341,  2.1084],
        [ 0.6280, -0.1340, -0.4249],
        [ 0.5581, -0.8716,  0.4012],
        [ 0.4629, -0.4740,  0.4027],
        [ 0.4973, -0.4573,  0.3512],
        [ 0.0121, -1.3225,  1.3028],
        [-0.1575, -0.9855,  1.3483],
        [ 0.1181, -1.4217,  1.9743],
        [-0.3329, -1.2382,  2.1058],
        [ 0.0859, -1.2705,  1.7176],
        [ 0.5591, -0.1935, -0.1454],
        [ 0.8346, -0.0324, -0.1659],
        [-0.0697, -1.6044,  1.3238],
        [-0.2846, -1.1831,  1.9487],
        [ 0.5932, -0.3406,  0.2549],
        [ 0.3925, -0.6836,  0.5575],
        [ 0.0381, -1.2927,  1.7529],
        [ 0.70

(Epoch 6) TRAIN LOSS:0.6283 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:04,  4.37it/s]

SequenceClassifierOutput(loss=tensor(0.6985, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5250, -1.2515,  1.0080],
        [-0.1316, -1.3546,  1.7577],
        [-0.4678, -1.4779,  1.9738],
        [ 0.2491, -1.0015,  1.2125],
        [ 0.1412, -1.2283,  1.2665],
        [-0.0378, -1.2977,  1.7681],
        [ 0.3731, -0.1861, -0.2975],
        [ 0.2473, -1.1117,  1.2675],
        [ 0.6523, -0.6483,  0.6113],
        [-0.2354, -1.3373,  1.8310],
        [ 0.4739, -0.4341, -0.1894],
        [-0.0370, -1.2745,  1.3586],
        [ 0.3786, -1.0453,  1.5152],
        [ 0.5057, -1.1976,  1.3202],
        [ 0.2849, -0.5570,  0.7302],
        [ 0.3560, -1.0714,  0.7093],
        [-0.1934, -1.2242,  1.4464],
        [ 0.0731, -0.9311,  1.6592],
        [ 0.4101, -0.2002,  0.0173],
        [ 0.4021, -1.0590,  1.0637],
        [-0.3845, -1.0338,  1.6388],
        [ 0.1014, -1.1590,  0.7897],
        [ 0.5371, -0.5355,  0.4664],
        [-0.2749, -1.1374,  1.7036],
        [ 0.52

(Epoch 6) TRAIN LOSS:0.6611 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:01<00:05,  3.72it/s]

SequenceClassifierOutput(loss=tensor(0.6811, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1771,  0.1353,  0.0917],
        [-0.1970, -1.0007,  1.2138],
        [-0.2336, -1.2363,  1.8245],
        [ 0.0452, -1.3820,  1.4813],
        [-0.1760, -1.0208,  1.8293],
        [ 0.0801, -1.2405,  1.3724],
        [ 0.5065, -0.3344,  0.3861],
        [ 0.4571,  0.1183, -0.2964],
        [ 0.0794, -1.0102,  1.0082],
        [-0.4629, -0.9156,  1.7336],
        [ 0.8159, -0.4205,  0.4807],
        [-0.1747, -1.3190,  1.7104],
        [ 0.0835, -0.3630,  0.4307],
        [ 0.4309, -0.6874,  0.6491],
        [-0.1697, -1.3127,  1.5976],
        [ 0.1626, -1.0393,  1.3448],
        [-0.3579, -1.3232,  1.8112],
        [ 0.6061, -0.6671,  0.8276],
        [-0.4083, -1.2075,  2.0650],
        [ 0.6632, -0.1202, -0.2730],
        [-0.0748, -1.0009,  1.3918],
        [ 0.7500, -0.3731, -0.2154],
        [ 0.6455, -0.7301,  0.2376],
        [ 0.5870, -0.3688, -0.4040],
        [ 0.92

(Epoch 6) TRAIN LOSS:0.6640 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:01<00:05,  3.39it/s]

SequenceClassifierOutput(loss=tensor(0.5554, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1056, -0.6558,  1.5036],
        [ 0.7839,  0.0231, -0.1233],
        [ 0.1591, -0.7366,  0.7426],
        [-0.3698, -0.9986,  1.7502],
        [ 0.1706, -1.2367,  1.6451],
        [ 0.2516, -1.1959,  1.0479],
        [ 0.2074, -0.8971,  0.8025],
        [-0.1129, -0.7281,  1.0046],
        [ 0.6262, -0.8947,  1.0541],
        [-0.1265, -1.1745,  1.1216],
        [ 0.0609, -1.2577,  1.3439],
        [-0.3698, -1.2868,  1.4504],
        [ 0.8112, -0.7655, -0.0605],
        [ 0.2150, -1.0808,  1.5251],
        [-0.2389, -1.2231,  1.7801],
        [ 0.4689, -0.1068, -0.2374],
        [ 0.0975, -1.2947,  1.2461],
        [ 0.1924, -1.0806,  1.0405],
        [ 0.2388, -1.0182,  1.1016],
        [-0.1680, -1.5991,  2.1760],
        [ 0.9125, -0.5755, -0.2201],
        [ 0.6480,  0.1373, -0.2278],
        [ 0.3101, -1.1009,  1.4310],
        [ 0.6757, -0.3067, -0.3049],
        [-0.44

(Epoch 6) TRAIN LOSS:0.6504 LR:0.00000300:  32%|█████████████                            | 8/25 [00:02<00:05,  3.22it/s]

SequenceClassifierOutput(loss=tensor(0.7702, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.9093, -0.3899, -0.2618],
        [ 0.5856, -0.5507,  0.5168],
        [ 0.5202, -0.4474, -0.3870],
        [ 0.1742, -0.7432,  0.6427],
        [-0.5469, -1.4740,  2.1503],
        [ 0.0319, -0.2651, -0.0314],
        [ 0.8876, -0.5793,  0.0477],
        [ 0.1087, -0.9767,  1.5958],
        [ 0.4157, -0.9367,  1.1871],
        [ 0.4592, -0.8468,  1.2824],
        [-0.1419, -1.1966,  1.4727],
        [ 0.6671, -0.2966,  0.1385],
        [ 0.5088, -0.1102, -0.6649],
        [ 0.3758, -0.4359,  0.8385],
        [ 0.6495, -0.7086,  0.4074],
        [ 0.1675, -1.1717,  0.7365],
        [-0.0665, -0.8647,  1.8037],
        [-0.0810, -0.8950,  1.3068],
        [ 0.2254, -1.0483,  1.4464],
        [-0.2878, -1.3367,  2.0968],
        [ 0.6958,  0.2816, -0.4469],
        [ 0.1294, -1.1312,  1.2113],
        [ 0.4805, -0.1760, -0.0767],
        [-0.6631, -1.1725,  2.0190],
        [-0.43

(Epoch 6) TRAIN LOSS:0.6637 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:02<00:05,  3.12it/s]

SequenceClassifierOutput(loss=tensor(0.6910, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3058, -1.3240,  1.8987],
        [ 0.6164, -0.1019, -0.6718],
        [ 0.4202, -1.3018,  1.1046],
        [ 0.3650, -1.2704,  0.7768],
        [ 0.3266, -1.0076,  0.8626],
        [-0.0309, -1.5206,  2.0337],
        [-0.1940, -1.3631,  2.0374],
        [ 0.3303, -0.2044,  0.3542],
        [-0.2599, -1.2360,  2.0264],
        [ 0.6455, -0.8408,  0.6136],
        [ 0.0484, -1.3120,  1.5729],
        [-0.1009, -1.3053,  1.6163],
        [ 0.5453, -0.7243,  0.1739],
        [ 0.6951, -0.8393,  0.4639],
        [ 0.4690, -1.0449,  0.8616],
        [ 0.1193, -1.3657,  1.7534],
        [ 0.2947, -0.8088,  0.7285],
        [ 0.6893, -0.4880,  0.0662],
        [ 0.5308, -0.7344,  0.8453],
        [ 0.8454, -1.1446,  0.7735],
        [ 0.4851, -1.3132,  0.2896],
        [ 0.2465, -1.0840,  1.0344],
        [-0.1679, -1.0154,  1.8544],
        [ 0.5662, -0.3825,  0.4295],
        [-0.03

(Epoch 6) TRAIN LOSS:0.6665 LR:0.00000300:  40%|████████████████                        | 10/25 [00:02<00:05,  2.85it/s]

SequenceClassifierOutput(loss=tensor(0.7753, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0351, -1.0804,  1.5165],
        [ 0.7808, -0.3396,  0.3309],
        [ 0.2927, -0.3087,  0.1401],
        [-0.1759, -1.2537,  1.6694],
        [ 0.0590, -1.4764,  1.9407],
        [-0.0362, -1.2840,  1.2512],
        [ 0.5575, -0.6161,  0.6811],
        [-0.2282, -1.6116,  1.9157],
        [ 0.2098, -1.3021,  1.6204],
        [ 0.4723, -0.1979, -0.3364],
        [ 0.5134, -0.2582, -0.2446],
        [-0.5090, -1.4371,  2.0488],
        [-0.1384, -0.9394,  1.3860],
        [ 0.3041,  0.1600, -0.2741],
        [-0.5473, -1.2978,  1.8311],
        [ 0.7139, -1.2353,  0.2215],
        [ 0.3913, -0.1297, -0.0601],
        [ 0.5246, -0.3203,  0.2968],
        [-0.2811, -1.0350,  1.1401],
        [ 0.1274, -1.0894,  0.4750],
        [ 0.3479, -1.0171,  1.1311],
        [ 0.1314, -0.4225,  0.7198],
        [ 0.2067, -0.7689,  0.8677],
        [ 0.5098, -0.9130,  0.4701],
        [ 0.58

(Epoch 6) TRAIN LOSS:0.6764 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:03<00:04,  2.86it/s]

SequenceClassifierOutput(loss=tensor(0.6274, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 4.1802e-01, -8.0090e-01,  4.3086e-01],
        [ 1.5113e-01, -9.6886e-01,  4.6558e-01],
        [ 3.9160e-01, -7.8117e-03, -1.9857e-02],
        [-2.3906e-02, -1.3844e+00,  1.8182e+00],
        [ 3.1011e-01, -1.2730e+00,  1.8646e+00],
        [ 4.2198e-01, -8.0766e-01,  9.1751e-01],
        [ 6.2900e-01, -7.0869e-01,  5.3712e-01],
        [ 5.0269e-01, -4.2770e-01,  4.7022e-01],
        [ 4.9527e-01,  8.2860e-02, -3.4345e-01],
        [-1.0446e-01, -1.3154e+00,  1.3786e+00],
        [ 9.2443e-01, -6.3815e-01, -1.9933e-01],
        [ 9.4434e-02, -1.0777e+00,  1.7719e+00],
        [ 2.3755e-01, -9.3268e-01,  5.9023e-01],
        [-3.0912e-01, -1.1281e+00,  2.2253e+00],
        [ 1.0257e+00, -7.4023e-01,  4.6560e-01],
        [ 6.8384e-01, -4.4474e-01,  7.4125e-01],
        [ 7.8757e-01, -9.6452e-01,  7.5653e-01],
        [-5.3030e-01, -1.1147e+00,  2.2244e+00],
        [-1.6993e-01

(Epoch 6) TRAIN LOSS:0.6723 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:03<00:04,  2.77it/s]

SequenceClassifierOutput(loss=tensor(0.7383, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2306, -1.3741,  1.7897],
        [-0.1484, -1.3039,  1.5134],
        [-0.2576, -1.1254,  1.8986],
        [-0.2728, -1.3530,  1.9905],
        [ 0.7666, -0.3470, -0.4007],
        [ 0.7243, -0.4699,  0.1797],
        [ 0.8032, -0.8969,  0.4975],
        [ 0.7109, -0.4351,  0.6617],
        [ 0.5712,  0.3492, -0.4799],
        [ 0.7781,  0.1138, -0.5891],
        [ 0.8315, -0.1979, -0.4003],
        [ 0.1554, -1.1754,  1.6855],
        [ 0.6131, -0.5945, -0.0418],
        [ 0.8101, -0.2476, -0.4204],
        [ 0.7826, -0.7119, -0.0828],
        [ 0.1114, -1.3760,  1.9469],
        [ 0.3262, -1.0991,  1.0767],
        [ 0.3635,  0.3162, -0.4881],
        [ 0.2293, -1.1806,  1.6478],
        [-0.2175, -1.3922,  1.7821],
        [ 0.6280, -0.7776,  0.2179],
        [ 0.4898, -0.7733,  0.6269],
        [-0.0815, -1.4397,  2.1153],
        [ 0.6609, -0.7378,  0.0992],
        [ 0.17

(Epoch 6) TRAIN LOSS:0.6773 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:04<00:04,  2.70it/s]

SequenceClassifierOutput(loss=tensor(0.6137, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3001, -1.1647,  1.0612],
        [ 0.6287, -0.5205,  0.5401],
        [ 0.7330, -1.0131,  0.5582],
        [-0.3020, -1.2743,  2.1851],
        [-0.3093, -1.3885,  1.6840],
        [ 0.4877, -1.1217,  1.3516],
        [ 0.1653, -0.2394,  0.5061],
        [ 0.3129, -0.8710,  0.5509],
        [ 0.1556, -1.0991,  1.8624],
        [ 0.6430, -0.2905, -0.4656],
        [-0.0619, -1.0592,  1.9140],
        [-0.0774, -1.1537,  1.4804],
        [-0.3726, -1.3110,  1.8887],
        [ 0.3348, -0.5937,  0.4155],
        [ 1.0003, -0.4213, -0.2272],
        [-0.0597, -1.6053,  2.0751],
        [ 0.6489, -0.2715,  0.0439],
        [-0.3814, -1.2883,  1.8603],
        [ 0.2552, -1.1695,  1.5254],
        [-0.2932, -1.4099,  1.9270],
        [-0.2239, -1.0551,  1.4067],
        [ 0.8356, -0.6265,  0.0768],
        [ 0.1937, -1.2179,  1.4866],
        [-0.3642, -1.3649,  2.1365],
        [-0.07

(Epoch 6) TRAIN LOSS:0.6728 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:04<00:04,  2.62it/s]

SequenceClassifierOutput(loss=tensor(0.7017, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-4.0871e-02, -1.4756e+00,  1.6326e+00],
        [ 3.8294e-01, -9.6290e-01,  1.3340e+00],
        [-3.9703e-01, -1.1759e+00,  1.9834e+00],
        [ 1.7829e-01, -7.7346e-01,  1.1236e+00],
        [-1.7086e-02, -1.2882e+00,  1.5036e+00],
        [ 7.1222e-01, -7.4076e-01,  4.6252e-01],
        [-4.2039e-01, -1.5028e+00,  1.8980e+00],
        [ 4.1994e-01,  7.3603e-02, -4.6471e-01],
        [ 4.6686e-01, -2.2278e-01,  7.6074e-04],
        [ 2.3838e-01, -6.1661e-01, -1.2631e-01],
        [ 8.9764e-01, -9.8214e-01,  5.2731e-01],
        [-2.7549e-01, -7.8662e-01,  1.2012e+00],
        [ 3.0870e-01,  3.8578e-01, -5.2642e-01],
        [ 8.9265e-01, -8.3444e-01,  3.1737e-01],
        [-1.9719e-01, -1.3409e+00,  1.7863e+00],
        [ 9.0851e-01, -6.6393e-01,  4.4374e-01],
        [-9.5222e-03, -8.3356e-01,  1.2154e+00],
        [-1.6868e-01, -1.3437e+00,  1.5945e+00],
        [ 3.9390e-01

(Epoch 6) TRAIN LOSS:0.6747 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:04<00:03,  2.63it/s]

SequenceClassifierOutput(loss=tensor(0.6754, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0992, -1.0197,  1.6803],
        [-0.2869, -1.3413,  1.9214],
        [ 0.6317, -0.2354,  0.0838],
        [-0.4261, -0.6545,  1.7672],
        [-0.0828, -1.1537,  1.7181],
        [-0.4055, -0.9899,  1.9339],
        [-0.5823, -1.4405,  1.8044],
        [ 0.7394, -0.2557, -0.7355],
        [-0.3231, -0.9182,  1.7135],
        [ 0.4300, -0.5334, -0.2433],
        [ 0.2558, -1.3615,  1.9745],
        [ 0.2272, -0.3921,  0.0674],
        [ 0.2668, -0.5781,  0.9312],
        [ 0.7215, -0.8245,  0.4305],
        [-0.2649, -1.5009,  2.2365],
        [ 0.7766, -0.4420, -0.0220],
        [ 1.3026, -0.8586,  0.4177],
        [-0.0648, -0.9975,  1.4042],
        [ 0.6201, -0.2654, -0.5218],
        [-0.1049, -1.3373,  1.5971],
        [ 0.7346, -0.7695,  0.0692],
        [ 0.3663, -0.3557, -0.2858],
        [-0.1497, -1.6076,  2.0521],
        [-0.0024, -1.2926,  1.9578],
        [ 0.41

(Epoch 6) TRAIN LOSS:0.6748 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:05<00:03,  2.57it/s]

SequenceClassifierOutput(loss=tensor(0.6953, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 4.8653e-01, -1.3379e-01,  7.6564e-01],
        [ 3.8896e-01, -4.8969e-01, -2.2592e-01],
        [ 5.2885e-01, -6.8308e-01,  2.5097e-01],
        [-2.6160e-01, -1.3295e+00,  1.8842e+00],
        [ 2.8153e-01, -1.2580e+00,  1.4971e+00],
        [ 5.5628e-01, -2.6111e-01, -3.7925e-02],
        [ 8.3098e-01, -1.1453e-01, -6.9766e-01],
        [-1.1697e-01, -1.2484e+00,  1.7314e+00],
        [-2.6995e-01, -1.5453e+00,  1.8976e+00],
        [ 3.4745e-01, -8.9167e-01,  6.0852e-01],
        [ 3.1523e-02, -1.4661e+00,  1.7472e+00],
        [ 4.4532e-01, -4.2357e-01, -1.5405e-01],
        [ 7.1506e-01, -7.9355e-02, -3.0809e-01],
        [ 6.8377e-01, -6.2038e-01,  1.7385e-01],
        [ 6.1214e-01, -5.7802e-01, -1.6135e-01],
        [ 3.8042e-01, -8.1675e-01,  8.4631e-01],
        [ 5.8198e-01,  5.7640e-02, -4.3447e-01],
        [-3.4158e-01, -1.3253e+00,  2.3347e+00],
        [ 4.5958e-01

(Epoch 6) TRAIN LOSS:0.6760 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:05<00:03,  2.50it/s]

SequenceClassifierOutput(loss=tensor(0.6768, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5440, -0.1570, -0.3879],
        [ 0.1314,  0.5636,  0.0471],
        [ 0.0759, -0.7873,  1.4871],
        [-0.1689, -1.4876,  1.6386],
        [ 0.9217, -0.5614,  0.1134],
        [ 0.1911, -0.4785,  0.3867],
        [ 0.8547,  0.0318, -0.4733],
        [ 0.6019, -0.2359, -1.0462],
        [ 0.5447,  0.1770, -0.2901],
        [ 0.0617, -0.6783,  1.0928],
        [-0.2846, -1.2090,  1.5360],
        [ 0.0831, -1.2730,  1.5948],
        [-0.0202, -1.2904,  1.9386],
        [-0.1289, -1.2387,  1.6523],
        [ 0.8623, -0.5182,  0.1621],
        [ 0.7484, -0.0807, -0.5576],
        [ 0.5394, -0.6443,  0.7000],
        [ 0.6352, -0.7526,  0.7613],
        [-0.1641, -1.3945,  2.0422],
        [ 0.5140, -0.6682,  0.4411],
        [ 0.8083, -0.5971, -0.1219],
        [-0.3385, -0.9470,  1.8765],
        [ 0.3488,  0.1255, -0.5454],
        [ 0.5015, -0.1950,  0.0336],
        [-0.00

(Epoch 6) TRAIN LOSS:0.6842 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:06<00:02,  2.76it/s]

SequenceClassifierOutput(loss=tensor(0.8320, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.4454, -1.5673,  1.6748],
        [-0.3938, -0.9999,  1.7222],
        [ 0.7473, -0.3283, -0.5537],
        [ 0.2969, -0.7397,  0.2717],
        [-0.1603, -1.2669,  1.9601],
        [-0.1545, -1.2876,  1.3567],
        [ 0.3656, -0.0966, -0.9616],
        [-0.1113, -0.8334,  1.8474],
        [ 0.2071, -1.1595,  1.2193],
        [ 0.3718, -0.9101,  1.1399],
        [ 0.4369, -0.4610, -0.1450],
        [ 0.1889, -0.0758, -0.1407],
        [ 0.3809, -0.8908,  0.8063],
        [-0.1083, -1.0628,  1.6654],
        [ 0.3186,  0.1188, -0.0185],
        [ 0.3946, -1.4671,  1.3858],
        [ 0.4445, -0.2092, -0.3875],
        [ 0.1915, -0.5624,  0.8495],
        [ 0.2421, -0.2464, -0.8103],
        [ 0.0668, -0.5483,  0.8459],
        [ 0.5110, -0.8065,  0.3828],
        [ 0.6236, -1.0171,  0.4254],
        [ 0.7184, -0.6239, -0.4679],
        [ 0.8725, -0.5871,  0.4205],
        [ 0.80

(Epoch 6) TRAIN LOSS:0.6770 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:06<00:01,  3.15it/s]

SequenceClassifierOutput(loss=tensor(0.5401, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 9.4148e-01, -2.4666e-01, -7.8157e-01],
        [ 2.7457e-04, -1.0746e+00,  1.4817e+00],
        [-4.9695e-02, -8.6037e-01,  9.5189e-01],
        [ 6.3176e-01, -1.4425e-01, -5.4380e-01],
        [ 2.1101e-01, -1.1811e+00,  1.5018e+00],
        [ 7.7105e-01, -8.5849e-01,  1.3069e-01],
        [ 4.6669e-01, -1.3158e-01, -6.0788e-01],
        [-1.9124e-01, -1.5482e+00,  1.6670e+00],
        [-2.8222e-01, -1.3736e+00,  1.4522e+00],
        [ 5.1480e-01, -3.7999e-01,  8.8091e-02],
        [ 4.0902e-01, -7.4510e-01, -3.4325e-02],
        [ 2.8751e-01, -7.2237e-01,  1.0361e+00],
        [-4.3795e-02, -1.2548e+00,  1.8746e+00],
        [ 4.4197e-01, -3.3150e-01,  1.7541e-01],
        [ 2.2991e-01, -1.2477e+00,  2.0621e+00],
        [ 5.5897e-01, -6.3799e-01,  2.2464e-01],
        [ 3.2297e-01, -1.1681e+00,  1.3747e+00],
        [ 6.3577e-01, -1.2319e+00,  1.3700e+00],
        [ 2.1641e-01

(Epoch 6) TRAIN LOSS:0.6676 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:07<00:00,  3.81it/s]

SequenceClassifierOutput(loss=tensor(0.5384, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 2.1259e-01, -1.3108e+00,  1.1225e+00],
        [ 9.1437e-01, -4.2918e-01, -3.5383e-01],
        [ 6.1555e-01,  2.0938e-01, -2.8787e-01],
        [ 5.1784e-01, -8.5230e-01,  1.1349e+00],
        [-3.2051e-01, -1.4061e+00,  2.1625e+00],
        [-1.2792e-01, -1.0022e+00,  1.9157e+00],
        [-5.3066e-01, -1.2697e+00,  2.0116e+00],
        [-4.2471e-01, -1.5310e+00,  2.2207e+00],
        [-4.4006e-01, -1.0598e+00,  2.0369e+00],
        [ 6.5902e-01, -9.4600e-02, -3.1831e-01],
        [ 5.4620e-01, -5.3171e-01,  7.2392e-01],
        [-2.7499e-01, -1.0018e+00,  1.6848e+00],
        [-4.6855e-01, -1.2349e+00,  1.9676e+00],
        [-3.3398e-01, -1.3489e+00,  2.2387e+00],
        [ 8.2493e-01, -9.9229e-03, -6.3505e-01],
        [ 1.2633e-03, -8.7608e-01,  1.3741e+00],
        [-4.6859e-02, -1.0089e+00,  1.9273e+00],
        [-3.0791e-01, -1.1601e+00,  2.1239e+00],
        [ 5.6471e-01

(Epoch 6) TRAIN LOSS:0.6605 LR:0.00000300:  92%|████████████████████████████████████▊   | 23/25 [00:07<00:00,  4.20it/s]

SequenceClassifierOutput(loss=tensor(0.5038, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3899, -1.1187,  0.8180],
        [-0.2485, -1.3134,  1.5180],
        [ 0.5261,  0.2098,  0.0270],
        [ 0.6951, -0.4956, -0.2786],
        [ 0.0721, -0.9275,  1.1567],
        [-0.4258, -1.1922,  1.7262],
        [-0.3675, -1.1409,  2.0269],
        [ 0.0233, -1.6084,  1.5218],
        [-0.2635, -1.2308,  1.8151],
        [-0.3702, -0.7648,  1.8332],
        [-0.2626, -0.9917,  1.7647],
        [ 0.2535, -1.1098,  1.4588],
        [ 0.4513, -0.1874, -0.6229],
        [ 0.0128, -0.0134, -0.5381],
        [ 0.6048,  0.2417, -0.7112],
        [ 0.9011, -0.6579,  0.2966],
        [-0.0854, -1.5304,  1.7761],
        [-0.1178, -1.1822,  1.3974],
        [-0.1352, -1.6795,  1.4732],
        [ 0.5654, -1.0602,  0.9476],
        [-0.2951, -1.3475,  2.0462],
        [-0.3113, -1.3130,  1.9685],
        [-0.1244, -1.0791,  1.9447],
        [-0.4352, -1.0736,  1.6933],
        [ 0.52

(Epoch 6) TRAIN LOSS:0.6535 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  4.51it/s]

SequenceClassifierOutput(loss=tensor(0.4861, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3026, -1.1928,  2.0000],
        [ 0.4349, -0.3526, -0.1495],
        [ 0.1233, -1.1685,  1.3492],
        [ 0.2014, -1.4586,  1.8329],
        [ 0.4954, -0.5083,  0.0283],
        [ 0.3920,  0.2177, -0.3794],
        [ 0.0489, -0.9957,  1.1944],
        [-0.4380, -1.2977,  2.0192],
        [-0.4575, -1.3343,  2.0422],
        [ 0.4473, -1.3698,  1.3015],
        [ 0.3971, -0.6120,  0.9149],
        [ 0.4640, -0.1581, -0.3478],
        [-0.2726, -1.2032,  1.8506],
        [ 0.7943, -0.3126,  0.0218],
        [-0.0699, -0.9101,  1.6114],
        [ 0.1053, -1.1617,  1.1457],
        [ 0.0537, -1.1042,  1.3653],
        [ 1.0096, -0.3061, -0.4508],
        [ 0.1687, -0.7912,  1.0364],
        [ 0.5500, -0.0089,  0.0135],
        [-0.1948, -1.5748,  1.5696],
        [ 0.6903, -1.0757,  0.3485],
        [ 0.3036, -1.4492,  1.5334],
        [ 0.0103, -1.0428,  1.6727],
        [-0.27

(Epoch 6) TRAIN LOSS:0.6535 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  3.23it/s]

(Epoch 6) TRAIN LOSS:0.6535 ACC:0.73 F1:0.50 REC:0.53 PRE:0.58 LR:0.00000300



(Epoch 7) TRAIN LOSS:0.5809 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:06,  3.90it/s]

SequenceClassifierOutput(loss=tensor(0.5809, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-1.9457e-01, -1.1355e+00,  1.4541e+00],
        [-2.1105e-01, -1.4821e+00,  1.9202e+00],
        [-2.6946e-01, -1.4378e+00,  1.9315e+00],
        [ 6.8145e-01, -2.8764e-01, -4.0523e-01],
        [ 9.0550e-03, -1.2699e+00,  1.7100e+00],
        [ 1.7481e-01, -1.0856e+00,  1.4497e+00],
        [-2.1961e-01, -1.1145e+00,  2.1023e+00],
        [ 1.6092e-01, -1.3210e+00,  1.5343e+00],
        [ 1.0303e-01, -1.2888e+00,  1.6425e+00],
        [ 1.2930e-01, -9.1325e-01,  1.1266e+00],
        [ 5.9539e-01,  2.4032e-01, -4.0079e-01],
        [-2.5364e-01, -1.0802e+00,  2.0897e+00],
        [ 3.4214e-01, -1.1900e+00,  1.4734e+00],
        [ 4.6844e-01, -2.4694e-01, -2.3411e-01],
        [-4.1811e-01, -1.3464e+00,  2.1206e+00],
        [ 2.4132e-01, -1.0929e+00,  6.3999e-01],
        [-2.1603e-01, -1.0476e+00,  1.8893e+00],
        [ 3.2697e-01, -1.4251e+00,  1.3472e+00],
        [ 9.5023e-01

(Epoch 7) TRAIN LOSS:0.6127 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:05,  4.33it/s]

SequenceClassifierOutput(loss=tensor(0.6446, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0138, -1.4905,  1.8709],
        [ 0.4992, -0.7461, -0.2403],
        [ 0.1961, -0.0243, -0.2189],
        [ 0.0040, -1.1701,  1.4657],
        [-0.1380, -0.6783,  0.4827],
        [-0.4555, -1.2254,  2.3791],
        [-0.0399, -1.3558,  1.3026],
        [ 0.6079,  0.0336, -0.1563],
        [ 1.0263, -0.1992, -0.8299],
        [ 0.2634, -1.3473,  1.6005],
        [ 0.2124,  0.1363, -0.5520],
        [-0.1563, -1.0399,  1.3174],
        [-0.1214, -1.4434,  1.6599],
        [ 0.1995, -0.9858,  1.0327],
        [-0.3137, -1.3099,  1.7287],
        [-0.0973, -1.2302,  1.7459],
        [ 0.2341, -0.1926,  0.2177],
        [-0.0618, -1.1269,  1.6204],
        [ 0.5142, -0.2692, -0.1657],
        [ 0.3033, -0.7921,  0.9857],
        [ 0.4970, -0.9939,  1.1208],
        [-0.0598, -0.8345,  1.1846],
        [-0.1213, -0.9361,  1.1105],
        [-0.3612, -1.4528,  2.2767],
        [-0.32

(Epoch 7) TRAIN LOSS:0.5819 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:04,  4.43it/s]

SequenceClassifierOutput(loss=tensor(0.5202, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2716, -1.7523,  2.2153],
        [-0.7304, -1.1733,  2.0875],
        [ 0.5903, -0.3785,  0.0972],
        [ 0.6925, -0.3079, -0.1375],
        [ 0.2249, -1.1068,  0.6176],
        [-0.1614, -1.3279,  2.0276],
        [ 0.3050, -0.3946, -0.4901],
        [ 0.1995, -1.3910,  1.3430],
        [-0.2310, -1.4316,  2.0624],
        [ 0.6190, -0.6231,  0.6950],
        [-0.2766, -1.2328,  2.0813],
        [-0.2018, -0.9764,  1.7195],
        [ 0.9392, -1.0169,  0.0437],
        [-0.3769, -1.3062,  1.7796],
        [ 0.3877,  0.1237, -0.3433],
        [-0.4905, -1.3364,  2.0704],
        [ 1.0516,  0.1973, -0.9955],
        [ 0.1703, -1.1761,  1.6704],
        [-0.1201, -1.0224,  1.6064],
        [ 0.8490, -0.5084,  0.2923],
        [-0.3496, -1.0008,  1.7449],
        [ 0.5904, -0.5892, -0.4091],
        [ 0.5599, -0.2029, -0.3299],
        [ 0.8823, -0.7456,  0.5113],
        [ 0.86

(Epoch 7) TRAIN LOSS:0.5812 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:00<00:04,  4.77it/s]

SequenceClassifierOutput(loss=tensor(0.5793, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0698, -1.5749,  1.2949],
        [ 0.3684, -0.8459,  0.7956],
        [-0.1489, -0.7682,  1.5044],
        [-0.3092, -1.5290,  2.1479],
        [ 0.8361,  0.0222, -0.1454],
        [ 0.6588, -1.2402,  0.8959],
        [-0.1461, -1.1617,  2.1053],
        [ 0.4992, -0.7529,  0.2846],
        [ 0.2261, -1.0287,  0.9635],
        [-0.1047, -0.9588,  0.7366],
        [ 0.4649, -0.8894,  0.1601],
        [ 0.7905, -0.5870, -0.0511],
        [ 0.0043, -0.8816,  1.0416],
        [-0.0293, -1.1968,  1.9793],
        [-0.2978, -1.3666,  2.1421],
        [ 0.5239, -0.6704,  0.0713],
        [ 0.0598, -1.0025,  0.9501],
        [ 0.9472, -0.8656,  0.1508],
        [ 0.6589,  0.4142, -0.2477],
        [ 0.1958,  0.0945, -0.3453],
        [-0.4027, -1.5400,  2.1450],
        [-0.4438, -1.3162,  1.9102],
        [ 0.5870, -0.4004,  0.6212],
        [-0.3411, -1.6182,  2.1490],
        [ 0.75

(Epoch 7) TRAIN LOSS:0.6179 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:01<00:04,  4.65it/s]

SequenceClassifierOutput(loss=tensor(0.6019, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-4.9513e-01, -1.5262e+00,  2.2316e+00],
        [ 4.5703e-01,  4.9284e-02, -6.2587e-01],
        [-2.0842e-01, -1.5776e+00,  2.1810e+00],
        [-3.4962e-02, -9.0959e-01,  1.5837e+00],
        [ 7.4587e-01, -2.1015e-01, -8.2011e-02],
        [ 5.3606e-01, -3.4453e-01,  4.2278e-02],
        [ 6.0776e-02, -1.4128e+00,  1.4061e+00],
        [-6.7472e-01, -1.3162e+00,  2.1417e+00],
        [ 8.2944e-02, -1.3956e+00,  1.4633e+00],
        [ 8.3896e-02, -9.8912e-01,  1.8054e+00],
        [ 3.1985e-01,  1.8402e-01, -3.2264e-01],
        [ 6.9604e-01, -1.0201e+00,  9.1830e-01],
        [ 2.8465e-01, -1.2184e+00,  1.3702e+00],
        [ 5.8532e-03, -1.5174e+00,  1.5788e+00],
        [ 4.7927e-02, -1.4085e+00,  1.7535e+00],
        [-3.1815e-01, -1.2861e+00,  2.2341e+00],
        [ 6.6404e-01, -7.3674e-01,  2.9084e-01],
        [ 5.7148e-02, -9.4109e-01,  1.7230e+00],
        [ 9.8917e-01

(Epoch 7) TRAIN LOSS:0.5979 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:01<00:03,  4.73it/s]

SequenceClassifierOutput(loss=tensor(0.4783, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.4471, -1.2888,  1.4648],
        [ 0.0814, -1.2940,  1.9599],
        [ 0.7791, -1.2503,  0.8330],
        [ 0.7992, -0.8747,  0.3230],
        [-0.1308, -1.3308,  2.1926],
        [-0.2112, -0.0448,  0.0679],
        [ 0.6407,  0.3067, -0.8175],
        [ 0.4075, -0.7769,  0.3965],
        [-0.1173, -1.2640,  1.6817],
        [ 0.6349, -0.0472, -0.4976],
        [-0.1929, -1.3920,  2.0145],
        [-0.1171, -1.3221,  2.0178],
        [ 0.4199, -1.1403,  1.1440],
        [ 0.0085, -1.2559,  1.8665],
        [-0.2900, -1.2987,  1.9253],
        [ 0.0842, -1.1005,  1.7426],
        [ 0.1707, -1.0467,  0.9560],
        [ 0.0309, -1.2624,  1.9409],
        [ 0.2192, -1.3608,  1.7977],
        [-0.0862, -1.0619,  1.3962],
        [-0.1149, -1.3974,  1.5512],
        [ 0.1072, -0.1221, -0.2008],
        [ 0.2574, -0.7855,  0.8782],
        [ 0.1660, -1.5961,  1.3066],
        [-0.09

(Epoch 7) TRAIN LOSS:0.5928 LR:0.00000300:  32%|█████████████                            | 8/25 [00:01<00:03,  4.66it/s]

SequenceClassifierOutput(loss=tensor(0.5564, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2587, -1.4633,  1.9506],
        [ 0.2049, -1.5055,  1.4904],
        [-0.0318, -1.5727,  2.3040],
        [-0.2041, -1.3854,  1.7999],
        [ 0.6028, -0.0647, -0.3072],
        [-0.0986, -1.3387,  1.7563],
        [ 0.4896, -0.3517,  0.2862],
        [ 0.0123, -1.1696,  1.7001],
        [ 0.2571, -0.1520, -0.3674],
        [ 0.5793, -0.2479, -0.2211],
        [-0.4462, -1.6765,  1.9860],
        [ 0.6496, -0.2905, -0.3700],
        [ 0.7194, -0.7886,  0.1742],
        [-0.0069, -1.4287,  1.6385],
        [ 0.1740, -1.5306,  1.5631],
        [-0.0150, -1.5839,  1.6411],
        [-0.5987, -1.6641,  2.3622],
        [ 0.5218, -0.3448, -0.1203],
        [ 0.5317, -0.7920,  0.5135],
        [-0.2229, -1.2064,  2.2523],
        [ 0.2340, -0.5195,  0.2318],
        [ 0.7947, -0.6900,  0.1948],
        [-0.1580, -0.9899,  1.9351],
        [ 1.0119, -0.6067, -0.1545],
        [ 0.12

(Epoch 7) TRAIN LOSS:0.5796 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:01<00:03,  4.67it/s]

SequenceClassifierOutput(loss=tensor(0.4743, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3784, -1.2471,  1.9397],
        [ 0.1482, -1.2748,  1.4813],
        [-0.1684, -1.3213,  1.7770],
        [ 1.0791, -0.9977,  0.3275],
        [-0.0965, -1.3867,  2.1641],
        [-0.1988, -1.5612,  1.7800],
        [ 0.6367, -0.5422, -0.2503],
        [-0.1989, -1.1093,  2.1703],
        [ 0.8902, -0.0345, -0.5012],
        [ 0.9428, -0.8533,  0.2157],
        [ 0.6543,  0.0090, -0.5650],
        [ 0.1450, -0.7234,  0.8735],
        [-0.2953, -1.2844,  2.2327],
        [ 0.9463, -0.6124,  0.1364],
        [ 0.3303, -1.1202,  0.9664],
        [-0.4195, -1.1801,  2.2138],
        [-0.2781, -1.2351,  1.9836],
        [-0.3918, -1.5114,  2.1423],
        [ 0.0484, -1.3147,  1.5503],
        [-0.0878, -1.2978,  1.8132],
        [ 0.3944, -0.5863,  0.7906],
        [ 0.8561, -0.1251, -0.3384],
        [-0.2752, -1.5274,  1.8402],
        [ 0.9911, -0.1989, -0.5982],
        [-0.08

(Epoch 7) TRAIN LOSS:0.5747 LR:0.00000300:  40%|████████████████                        | 10/25 [00:02<00:03,  4.56it/s]

SequenceClassifierOutput(loss=tensor(0.5306, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0448, -1.3162,  1.6141],
        [-0.3616, -1.1996,  1.9850],
        [ 0.4613, -0.9085,  0.6909],
        [-0.3828, -1.2833,  1.9060],
        [ 0.9792, -0.9087,  0.0111],
        [-0.2664, -1.2681,  1.9564],
        [ 0.4636, -0.5893,  0.0755],
        [ 0.0886, -0.9337,  1.7404],
        [-0.4704, -1.3597,  2.3619],
        [ 0.1555, -1.4446,  1.9772],
        [ 0.6094, -0.6446,  0.2243],
        [-0.0330, -0.9557,  1.2311],
        [ 0.5676, -0.0157, -0.3091],
        [ 0.5157, -0.3639,  0.1310],
        [-0.5643, -1.5158,  2.2236],
        [ 1.1645, -0.7105,  0.0540],
        [ 0.0184, -1.2708,  1.0090],
        [-0.0954, -1.3651,  2.1908],
        [ 0.3886, -0.6256,  0.3078],
        [ 0.0281, -1.2168,  1.8859],
        [-0.1342, -0.9429,  1.7988],
        [ 0.4317, -0.9448,  1.0178],
        [ 0.0065, -1.3048,  1.9435],
        [-0.4716, -1.5940,  1.9712],
        [-0.22

(Epoch 7) TRAIN LOSS:0.5649 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:02<00:03,  4.57it/s]

SequenceClassifierOutput(loss=tensor(0.4672, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 7.4999e-01, -6.9106e-01,  2.8398e-01],
        [-5.0729e-01, -1.2221e+00,  2.3323e+00],
        [ 2.7024e-01, -5.0794e-01,  3.7724e-01],
        [ 9.3489e-02, -1.1880e+00,  1.2878e+00],
        [ 2.7280e-01, -6.0961e-01,  9.3351e-01],
        [ 7.0398e-01, -1.0831e-01, -5.1626e-01],
        [ 6.1588e-01, -9.2532e-01,  3.0610e-01],
        [-5.0296e-01, -1.1912e+00,  2.2896e+00],
        [-3.3067e-01, -1.2235e+00,  1.9834e+00],
        [-1.1234e-01, -1.2044e+00,  2.0685e+00],
        [ 2.9693e-01, -7.7127e-01,  8.6979e-01],
        [ 1.6912e-03, -6.2727e-02,  1.3741e-01],
        [ 4.3310e-01, -1.6348e+00,  1.4370e+00],
        [-4.7194e-01, -1.3419e+00,  2.2060e+00],
        [-4.2678e-01, -1.4950e+00,  2.1709e+00],
        [-7.9194e-02, -1.1388e+00,  1.0144e+00],
        [ 7.1406e-01,  5.8043e-02, -2.6655e-01],
        [ 3.6434e-01, -8.1284e-01,  4.1468e-01],
        [ 6.0939e-01

(Epoch 7) TRAIN LOSS:0.5612 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:02<00:02,  4.94it/s]

SequenceClassifierOutput(loss=tensor(0.4728, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3264, -1.2663,  1.7865],
        [ 0.1611, -1.1890,  1.9990],
        [ 0.8265, -0.5899,  0.4111],
        [ 0.9176, -0.0830, -0.3062],
        [-0.3946, -1.1635,  1.9776],
        [ 1.0642, -0.2435, -0.5038],
        [-0.4321, -1.4610,  2.0576],
        [ 0.1345, -1.4654,  1.5644],
        [-0.5113, -1.4282,  2.0842],
        [ 0.2486, -0.6328,  0.5492],
        [ 0.2520, -1.1477,  1.5995],
        [-0.3958, -1.3282,  2.0286],
        [ 0.5724, -0.5081,  0.1120],
        [-0.3066, -1.6570,  2.1108],
        [-0.1902, -1.6610,  1.6908],
        [ 0.4138,  0.2573, -0.5020],
        [ 0.1648, -0.2425,  0.3300],
        [ 0.3849, -0.5634,  0.0088],
        [-0.2015, -1.2469,  1.6924],
        [-0.3976, -1.4178,  2.3734],
        [ 0.3996, -1.1204,  1.5570],
        [ 0.4737, -1.1927,  1.0710],
        [ 0.3016, -0.9952,  0.2757],
        [-0.4954, -1.0155,  1.8052],
        [ 1.08

(Epoch 7) TRAIN LOSS:0.5646 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:03<00:02,  4.91it/s]

SequenceClassifierOutput(loss=tensor(0.5591, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3779, -1.2578,  2.1011],
        [ 0.1644, -1.2336,  1.8337],
        [ 0.3053, -1.3778,  2.0858],
        [ 0.3243, -0.1969, -0.2451],
        [-0.2159, -1.3630,  1.7660],
        [ 0.6196, -0.3040,  0.0346],
        [-0.4261, -1.3544,  1.3532],
        [-0.5118, -1.4141,  2.0421],
        [ 0.7506, -0.1680, -0.3623],
        [ 0.3716, -0.0206,  0.1774],
        [-0.2234, -1.3434,  2.1810],
        [-0.3435, -1.1117,  2.0011],
        [ 0.5881, -0.2722, -0.4061],
        [ 0.6991, -0.9715,  0.8190],
        [ 0.2307, -1.0892,  1.8042],
        [-0.4129, -1.5478,  1.9466],
        [ 0.4170, -0.2492, -0.1392],
        [ 0.7935, -0.0805, -0.0296],
        [ 0.2777, -0.1004, -0.1049],
        [-0.3884, -1.2932,  1.8956],
        [-0.1793, -1.6677,  1.8737],
        [ 0.5625, -1.0050,  0.5841],
        [ 0.0564, -1.2611,  1.0240],
        [ 0.6364, -0.2972, -0.1418],
        [-0.23

(Epoch 7) TRAIN LOSS:0.5752 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:03<00:01,  4.88it/s]

SequenceClassifierOutput(loss=tensor(0.7339, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2110, -0.3084, -0.0301],
        [ 0.4243, -0.6970,  0.4298],
        [ 0.9045, -0.5399, -0.4081],
        [-0.2690, -1.3130,  2.2744],
        [-0.2874, -0.7184,  1.9519],
        [-0.2684, -1.3032,  1.9946],
        [ 0.4544, -1.0889,  1.3376],
        [-0.4092, -1.0149,  1.6734],
        [ 0.5703, -0.4576,  0.7096],
        [ 0.4901, -0.9140,  0.8239],
        [-0.4931, -1.1666,  2.2011],
        [ 0.4178, -0.6412,  1.1665],
        [ 0.2360, -0.2872, -0.4326],
        [-0.0894, -1.5266,  1.4507],
        [ 0.5622, -0.5278,  0.3189],
        [ 0.6053,  0.1942, -0.3566],
        [-0.3860, -1.2157,  1.5774],
        [ 0.8909, -0.2291,  0.0466],
        [-0.1790, -1.5130,  2.3831],
        [-0.0605, -1.5252,  2.0047],
        [-0.6292, -1.3004,  2.2925],
        [-0.2538, -1.3841,  2.1492],
        [ 0.1444, -1.2718,  1.4988],
        [-0.1601, -1.5050,  1.5072],
        [ 0.56

(Epoch 7) TRAIN LOSS:0.5874 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:03<00:01,  4.71it/s]

SequenceClassifierOutput(loss=tensor(0.7833, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3029, -1.6049,  1.9822],
        [-0.1953, -1.4606,  2.0499],
        [ 0.4526,  0.0304, -0.3282],
        [ 0.5964, -0.8839,  0.8127],
        [ 0.6228, -0.5182, -0.0368],
        [-0.5600, -1.4142,  2.3628],
        [ 0.4203,  0.1019, -0.3726],
        [ 1.0133, -0.7344,  0.4603],
        [-0.3129, -1.4609,  1.8630],
        [ 0.7326, -1.2950,  1.0208],
        [-0.4795, -1.1530,  2.1768],
        [-0.1930, -1.3928,  1.9547],
        [ 0.8666, -0.3061,  0.2636],
        [ 0.4254, -0.4914, -0.1032],
        [ 1.1593, -0.3593, -0.2129],
        [ 0.9823, -0.5577,  0.1501],
        [-0.2766, -1.2117,  2.3422],
        [ 0.2487, -0.4427,  0.1778],
        [ 0.8654, -1.0009,  0.5675],
        [ 0.2229, -0.9448,  1.1489],
        [ 0.7739, -0.5276, -0.3184],
        [ 0.5173, -0.6980, -0.0422],
        [ 0.7357,  0.1348, -0.2829],
        [-0.3887, -1.3920,  2.2946],
        [-0.02

(Epoch 7) TRAIN LOSS:0.5995 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:03<00:01,  4.59it/s]

SequenceClassifierOutput(loss=tensor(0.8041, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 9.9433e-01, -1.7157e-01, -3.8272e-01],
        [ 3.3033e-01, -1.1566e+00,  1.5072e+00],
        [ 2.7008e-01, -1.4630e+00,  1.7011e+00],
        [ 6.2066e-01, -1.9743e-01, -5.0432e-01],
        [ 8.4729e-02, -1.3756e+00,  1.0894e+00],
        [-4.4601e-01, -1.0433e+00,  2.3944e+00],
        [-2.4736e-01, -1.1145e+00,  1.6550e+00],
        [ 6.3004e-01,  1.2991e-01, -8.6348e-01],
        [-1.9594e-01, -1.3815e+00,  2.0506e+00],
        [-1.7696e-01, -1.4813e+00,  1.9318e+00],
        [-2.4297e-01, -1.2244e+00,  1.7695e+00],
        [ 2.6152e-01, -1.3784e+00,  1.4543e+00],
        [ 6.6794e-01, -3.9274e-01,  3.6476e-01],
        [-3.9256e-01, -7.6196e-01,  1.7339e+00],
        [ 7.5369e-01, -2.7895e-01,  4.1589e-01],
        [-3.2128e-01, -1.1910e+00,  2.0308e+00],
        [-8.5417e-02, -1.3530e+00,  1.6242e+00],
        [-1.4077e-02, -1.5881e+00,  1.9177e+00],
        [ 4.2143e-01

(Epoch 7) TRAIN LOSS:0.5955 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:04<00:01,  4.69it/s]

SequenceClassifierOutput(loss=tensor(0.5237, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.6347, -1.3174,  2.4169],
        [ 0.2730, -0.3246,  0.1055],
        [ 0.2705, -0.0830, -0.2330],
        [ 0.8351, -0.9717,  0.4469],
        [-0.3721, -1.4617,  1.9775],
        [ 1.2285, -0.4850, -0.2508],
        [ 0.1832, -0.1320, -0.5590],
        [-0.3091, -1.1067,  2.0156],
        [ 0.1445,  0.4269, -0.1643],
        [-0.0659, -1.2250,  1.3588],
        [ 0.7177,  0.1071, -0.5994],
        [-0.3246, -1.4435,  2.0302],
        [ 0.4742,  0.1670, -0.0975],
        [ 0.5530, -0.8750,  0.5707],
        [-0.4691, -1.6593,  2.2576],
        [ 0.7137, -0.6183,  0.5066],
        [-0.4260, -1.0308,  2.1783],
        [ 0.6546, -0.1882,  0.0276],
        [ 0.6825, -0.4621, -0.4289],
        [ 0.4219, -0.6536, -0.0345],
        [-0.5787, -1.5595,  2.0891],
        [-0.0221, -1.0413,  1.5013],
        [ 0.7680, -0.7864, -0.1057],
        [ 0.8319, -0.5826,  0.1737],
        [-0.22

(Epoch 7) TRAIN LOSS:0.6080 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:04<00:01,  4.76it/s]

SequenceClassifierOutput(loss=tensor(0.8457, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7940, -0.7110, -0.0160],
        [-0.1230, -0.8791,  1.3403],
        [ 0.4975, -0.9587, -0.0762],
        [ 0.4276,  0.1688, -0.1170],
        [ 0.1043, -1.7534,  1.9018],
        [-0.1717, -1.4576,  1.6725],
        [-0.4037, -1.2431,  2.4070],
        [-0.4449, -1.5575,  2.0917],
        [-0.4821, -1.5648,  2.2778],
        [ 0.0896, -1.3894,  1.4772],
        [-0.0557, -1.2074,  2.1432],
        [ 1.1827, -0.8868,  0.2815],
        [ 0.8003,  0.1952, -0.6865],
        [ 0.7104, -0.7344,  0.2829],
        [ 0.6205, -0.5161, -0.2515],
        [ 1.0162, -0.5099, -0.1784],
        [ 0.4021, -0.6677,  0.3096],
        [-0.0698, -1.4299,  1.6142],
        [-0.4486, -1.6842,  2.1991],
        [-0.2006, -1.5759,  2.1145],
        [-0.7605, -1.5581,  2.3286],
        [ 0.8590, -0.5921, -0.1632],
        [-0.2915, -1.2179,  1.8870],
        [-0.1739, -1.2786,  2.3229],
        [ 0.92

(Epoch 7) TRAIN LOSS:0.6212 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:04<00:00,  4.92it/s]

SequenceClassifierOutput(loss=tensor(0.7273, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0470e+00, -4.8465e-01,  2.9603e-01],
        [ 5.8802e-01, -8.4030e-01,  3.7367e-01],
        [ 7.0238e-01, -7.3344e-01,  6.3944e-01],
        [ 5.2387e-01, -7.6494e-01,  1.4838e+00],
        [ 7.6090e-01, -3.7336e-01, -1.1280e-01],
        [ 5.3332e-01,  4.9848e-02, -4.7238e-02],
        [-3.8848e-01, -1.4077e+00,  2.1404e+00],
        [ 6.5528e-01, -9.7341e-01,  5.6989e-01],
        [-3.9917e-01, -1.3153e+00,  2.0721e+00],
        [-3.8550e-01, -1.4777e+00,  2.3378e+00],
        [ 2.6071e-01, -4.2255e-01,  1.9118e-01],
        [ 5.0420e-01,  1.4580e-01, -5.2961e-01],
        [ 4.2209e-01, -1.8447e-01, -1.4851e-01],
        [ 3.2292e-01, -6.6779e-01,  2.9964e-01],
        [ 3.1131e-01, -1.4347e+00,  1.6848e+00],
        [ 8.1915e-02, -3.0615e-01,  3.5325e-01],
        [ 1.7959e-01, -1.3723e+00,  1.8682e+00],
        [ 3.3650e-01, -3.8224e-01,  3.5151e-01],
        [-4.2967e-01

(Epoch 7) TRAIN LOSS:0.6172 LR:0.00000300:  92%|████████████████████████████████████▊   | 23/25 [00:04<00:00,  4.91it/s]

SequenceClassifierOutput(loss=tensor(0.5295, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.6835, -1.0974,  1.9464],
        [ 0.1257, -1.4412,  2.1220],
        [-0.1745, -1.3651,  2.1456],
        [-0.1207, -1.3350,  1.4712],
        [ 0.4954, -0.0544, -0.2641],
        [-0.4364, -1.4613,  2.3661],
        [-0.0029, -1.4718,  1.9680],
        [ 0.8795,  0.2949, -0.4611],
        [ 0.5110, -0.9635,  0.9934],
        [-0.5876, -1.1916,  2.0109],
        [-0.4639, -1.2652,  2.4906],
        [ 0.1389, -1.0062,  0.9526],
        [ 0.1108, -0.9403,  1.4783],
        [ 0.5805, -0.4560, -0.3383],
        [-0.0135, -1.1702,  1.6626],
        [ 0.0340, -1.3538,  1.4637],
        [ 1.0353, -0.2962, -0.7584],
        [ 0.0274, -1.5118,  1.7982],
        [-0.0296, -1.6495,  2.1417],
        [ 0.3602, -0.2487,  0.0976],
        [ 0.5737, -0.5419,  0.1336],
        [ 0.6593, -1.1270,  1.0497],
        [-0.3043, -1.0692,  2.2684],
        [-0.2475, -1.6803,  2.5172],
        [ 0.52

(Epoch 7) TRAIN LOSS:0.6200 LR:0.00000300:  96%|██████████████████████████████████████▍ | 24/25 [00:05<00:00,  4.86it/s]

SequenceClassifierOutput(loss=tensor(0.6853, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1047, -1.3990,  1.5173],
        [ 1.0758, -0.7384,  0.0819],
        [ 0.6664, -0.9472,  0.7187],
        [-0.2645, -1.1917,  2.2647],
        [ 0.9832, -0.8753,  0.2756],
        [ 0.1694, -0.9601,  1.3482],
        [ 0.7964, -0.0423, -0.5269],
        [ 0.1240, -1.1190,  0.8267],
        [ 1.1843, -0.5651, -0.1630],
        [ 0.6302,  0.1641, -0.2663],
        [-0.0474, -1.4165,  1.7815],
        [ 0.2945, -1.3142,  1.3833],
        [-0.6625, -1.3298,  1.7620],
        [-0.2108, -1.3380,  2.1612],
        [ 0.8528, -0.5377,  0.3869],
        [ 0.6203, -1.2148,  1.3063],
        [-0.1887, -1.3560,  2.1174],
        [-0.0173, -0.9082,  1.5734],
        [ 0.9648, -0.2234, -0.6309],
        [-0.0712, -1.0632,  1.8638],
        [ 0.3677, -1.4169,  1.7833],
        [ 0.6953, -0.1427, -0.8653],
        [-0.0390, -1.5759,  1.7660],
        [ 0.5020,  0.0217, -0.1021],
        [-0.15

(Epoch 7) TRAIN LOSS:0.6153 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:05<00:00,  4.77it/s]

SequenceClassifierOutput(loss=tensor(0.5019, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8661, -0.7618,  0.4852],
        [ 0.0493, -1.2962,  1.9436],
        [ 0.6168, -0.2061, -0.0461],
        [-0.4220, -1.4656,  2.0148],
        [ 0.7932,  0.1367, -0.8506],
        [ 0.6083, -0.4627, -0.0949],
        [ 0.8817, -0.6383,  0.6835],
        [ 0.5952, -0.1654, -0.4599],
        [-0.6444, -1.5405,  2.5026],
        [-0.5242, -1.3992,  2.2883],
        [ 0.0315, -1.5831,  1.7080],
        [ 0.6535, -0.2710, -0.2804],
        [ 0.1558, -1.2936,  1.2514],
        [-0.5528, -1.3960,  2.2483],
        [-0.4712, -1.5905,  1.9171],
        [ 0.9541, -0.7367,  0.6513],
        [ 0.0162, -1.2037,  1.5650],
        [-0.1459, -1.3419,  1.8326],
        [ 0.5115, -0.0735, -0.7206],
        [ 0.0671, -1.3452,  1.6678],
        [ 0.2516, -1.3895,  1.3573],
        [-0.3780, -1.2696,  2.1109],
        [-0.3689, -1.1579,  1.8131],
        [ 0.1888, -1.3885,  1.4431],
        [ 0.99

(Epoch 7) TRAIN LOSS:0.6153 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:05<00:00,  4.64it/s]

(Epoch 7) TRAIN LOSS:0.6153 ACC:0.74 F1:0.51 REC:0.54 PRE:0.48 LR:0.00000300



(Epoch 8) TRAIN LOSS:0.4679 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:05,  4.20it/s]

SequenceClassifierOutput(loss=tensor(0.4679, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4410, -0.8806,  0.7240],
        [-0.1296, -1.3728,  1.8056],
        [-0.4432, -1.4709,  1.9792],
        [ 0.2117,  0.0551, -0.0267],
        [-0.2163, -1.2053,  1.6143],
        [ 0.8782, -0.7665,  0.5175],
        [ 0.6315, -0.8835,  0.6521],
        [ 0.5503, -0.8612,  0.2762],
        [-0.3292, -1.3133,  2.1651],
        [-0.3245, -1.3702,  1.6883],
        [-0.3460, -1.6322,  1.5069],
        [ 0.0841, -1.4179,  1.6653],
        [-0.0723, -1.5557,  2.1439],
        [ 1.0158, -0.3437, -0.7776],
        [ 0.3376, -1.2225,  1.4763],
        [ 0.1661, -1.2198,  1.6867],
        [-0.5158, -1.2473,  1.8987],
        [ 0.7478, -0.4447,  0.0313],
        [-0.2225, -1.2456,  2.1967],
        [ 0.2001, -1.6325,  1.5822],
        [-0.1569, -1.2004,  1.6669],
        [ 0.6285, -0.4241, -0.5264],
        [-0.5338, -1.3951,  1.6994],
        [-0.4357, -1.2798,  1.8044],
        [-0.13

(Epoch 8) TRAIN LOSS:0.4960 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:05,  4.44it/s]

SequenceClassifierOutput(loss=tensor(0.5240, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2978, -1.1970,  1.1728],
        [ 0.9852, -0.5206,  0.3116],
        [ 0.7927, -0.1829, -0.4701],
        [-0.5146, -1.2825,  2.1476],
        [ 0.3554, -1.3198,  0.9446],
        [ 0.0897, -1.6542,  2.0652],
        [-0.4730, -1.2477,  2.1134],
        [ 0.6311,  0.1842, -0.2779],
        [-0.4194, -1.6137,  2.0654],
        [ 0.0059, -1.4536,  1.8883],
        [ 0.5717, -0.1408, -0.6876],
        [-0.1238, -1.6192,  2.3457],
        [-0.4892, -1.3874,  2.2708],
        [ 0.6952, -0.2078, -0.2296],
        [ 1.1457, -1.1499,  0.1322],
        [ 0.0944, -1.3707,  1.4651],
        [ 0.0879, -1.0823,  1.8435],
        [-0.0855, -1.1427,  1.8651],
        [ 0.1415, -0.5504,  1.0766],
        [ 0.8470,  0.2018, -0.6746],
        [ 0.2856, -0.5203,  0.3159],
        [ 0.4765, -0.9311,  1.0687],
        [ 0.2236, -0.8588,  0.6818],
        [-0.5292, -1.2304,  2.1968],
        [-0.12

(Epoch 8) TRAIN LOSS:0.5421 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:05,  4.13it/s]

SequenceClassifierOutput(loss=tensor(0.6343, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0339, -1.2520,  1.9018],
        [-0.0983, -1.7263,  2.2415],
        [-0.2870, -0.9865,  1.3689],
        [-0.0199, -1.2465,  1.9309],
        [ 0.9566, -0.5509,  0.3995],
        [ 0.8386, -1.3153,  0.5492],
        [ 0.4665, -0.0987, -0.3748],
        [ 0.8923,  0.0271, -0.3067],
        [-0.1269, -1.2913,  1.9076],
        [ 0.5306,  0.3985, -0.4068],
        [ 0.1132, -1.2537,  1.4784],
        [-0.1397, -1.5024,  1.7849],
        [-0.1468, -1.1610,  2.1507],
        [ 0.5608, -0.7388, -0.0632],
        [ 0.3354, -0.4222, -0.1561],
        [ 0.5317, -1.0094,  0.6788],
        [-0.3236, -1.4100,  2.2679],
        [ 0.2439, -1.3923,  1.6252],
        [ 0.1555, -0.2912, -0.2258],
        [ 0.3696, -1.2793,  1.1420],
        [ 1.2121, -0.9533, -0.0140],
        [-0.1171, -1.2982,  1.6947],
        [-0.2797, -1.3775,  2.0187],
        [ 0.7200,  0.2152, -0.5523],
        [ 0.64

(Epoch 8) TRAIN LOSS:0.5472 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:00<00:04,  4.23it/s]

SequenceClassifierOutput(loss=tensor(0.5626, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7069, -0.1680, -0.2841],
        [-0.4558, -1.3528,  2.4426],
        [-0.0487, -1.6053,  1.8722],
        [ 0.5376, -0.0601, -0.6073],
        [-0.0600, -0.8800,  2.0050],
        [ 0.8444, -0.0471, -0.3384],
        [-0.0679, -1.1029,  1.3274],
        [ 0.0530, -1.4858,  1.5309],
        [-0.2077, -1.0171,  1.7398],
        [ 0.5494, -0.1233,  0.1193],
        [-0.4343, -1.2039,  2.0989],
        [ 0.9245, -0.7640, -0.0370],
        [-0.2047, -1.4207,  2.2623],
        [-0.3135, -1.1579,  1.7839],
        [ 0.9405, -0.6743, -0.2239],
        [-0.1634, -1.2278,  1.9108],
        [-0.1219, -1.1931,  1.7688],
        [-0.4470, -1.2054,  1.9402],
        [ 0.9779, -0.6708, -0.0098],
        [ 0.6910,  0.0747, -0.4569],
        [ 0.6981, -0.8565,  1.0480],
        [ 0.1481, -1.2096,  1.6270],
        [ 0.7020, -0.2156, -0.6680],
        [-0.1027, -1.4705,  1.9970],
        [-0.35

(Epoch 8) TRAIN LOSS:0.5747 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:05,  3.40it/s]

SequenceClassifierOutput(loss=tensor(0.5920, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0478, -0.4637, -0.0023],
        [ 0.7899, -0.1556, -0.2058],
        [-0.0835, -1.2699,  1.7392],
        [-0.7470, -1.5102,  2.2727],
        [-0.4128, -1.2553,  1.8543],
        [ 0.1325, -1.2010,  1.8382],
        [ 0.6912, -0.1008, -0.3622],
        [-0.3769, -1.2997,  2.1791],
        [ 0.4561, -0.0712, -0.4711],
        [-0.1014, -1.7314,  2.0654],
        [ 0.8953, -0.2100, -0.5898],
        [-0.3752, -1.4955,  2.1667],
        [ 0.0711, -1.1176,  1.7206],
        [-0.0264, -1.3236,  1.6557],
        [ 0.9369, -0.2056,  0.0899],
        [ 0.0347, -1.4050,  1.9040],
        [ 0.3339, -0.8595,  0.1931],
        [-0.0278, -0.8425,  1.6350],
        [ 0.3681, -0.8357,  0.7380],
        [ 0.3159, -1.5346,  1.8882],
        [-0.1319, -1.3336,  2.0220],
        [-0.4025, -1.3308,  1.8302],
        [-0.1053, -1.6377,  1.9903],
        [-0.3171, -1.4607,  2.1472],
        [ 0.55

(Epoch 8) TRAIN LOSS:0.5776 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:01<00:05,  3.20it/s]

SequenceClassifierOutput(loss=tensor(0.7802, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3220, -1.1728,  2.3538],
        [ 0.7426,  0.2434, -0.9913],
        [ 0.8974, -0.5734,  0.4398],
        [-0.0927, -1.3069,  1.8148],
        [ 0.6215, -0.3614,  0.1429],
        [ 0.8547, -1.1048,  0.5013],
        [ 0.1765, -1.0058,  1.4843],
        [-0.3864, -1.3619,  1.6420],
        [ 0.4754, -0.5866, -0.2193],
        [ 0.1215, -1.2664,  1.6915],
        [-0.1753, -1.5908,  1.9864],
        [ 0.7450,  0.1354, -0.5860],
        [ 0.5335, -0.1640, -0.3930],
        [-0.2306, -1.0934,  2.0304],
        [-0.5154, -1.4146,  1.7305],
        [-0.0807, -1.2901,  1.8302],
        [ 0.5326,  0.4686, -0.7347],
        [-0.2413, -1.3933,  2.3023],
        [ 0.3755, -1.6383,  1.4192],
        [-0.5177, -1.2854,  2.2091],
        [ 0.6698, -0.5530, -0.1182],
        [-0.1287, -1.5904,  1.8784],
        [ 0.7950, -0.8935,  0.3781],
        [ 0.3172, -0.0896, -0.2504],
        [ 0.82

(Epoch 8) TRAIN LOSS:0.6065 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:02<00:06,  2.94it/s]

SequenceClassifierOutput(loss=tensor(0.4782, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5226, -0.0913, -0.6719],
        [ 0.2350, -1.1147,  1.1473],
        [ 0.2109, -1.2106,  1.4285],
        [-0.1929, -1.4545,  1.1831],
        [-0.3268, -1.3423,  2.1524],
        [-0.0678, -1.4167,  1.8026],
        [ 0.7361, -0.1998, -0.6805],
        [ 0.8756, -0.5087,  0.1049],
        [-0.2815, -1.3263,  1.4771],
        [ 0.0882, -0.9898,  1.6165],
        [ 0.1455, -1.3196,  1.4835],
        [-0.1475, -1.4291,  2.0901],
        [ 0.9447, -0.0154, -0.1531],
        [ 0.6840, -1.1942,  0.7536],
        [-0.1453, -1.2778,  1.9425],
        [ 0.6647, -0.9931,  1.2180],
        [-0.3182, -1.4096,  1.7740],
        [ 0.3159, -1.6447,  1.0395],
        [ 0.3935, -1.1559,  1.7899],
        [ 0.0643, -1.2417,  1.3742],
        [ 0.7961, -0.7029,  0.0142],
        [ 0.7985, -0.2341, -0.3447],
        [-0.3112, -1.3890,  2.6435],
        [ 0.6516, -0.0456, -0.4561],
        [-0.39

(Epoch 8) TRAIN LOSS:0.5905 LR:0.00000300:  32%|█████████████                            | 8/25 [00:02<00:05,  2.85it/s]

SequenceClassifierOutput(loss=tensor(0.4806, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4960, -0.4124, -0.1635],
        [-0.6697, -1.5156,  2.5069],
        [ 0.5117, -0.7736,  0.4293],
        [ 0.2743, -0.2968,  0.1965],
        [ 1.0837, -0.5770, -0.0800],
        [-0.3529, -1.2688,  2.2306],
        [-0.0456, -1.2307,  2.0553],
        [ 0.8110, -0.8379,  0.3948],
        [-0.4635, -1.3118,  2.4659],
        [ 0.9124, -0.2968,  0.5341],
        [-0.2227, -1.2562,  2.3531],
        [-0.2295, -1.3392,  1.6835],
        [-0.3495, -1.5506,  2.0434],
        [-0.3251, -1.4438,  2.0577],
        [ 0.9732, -0.7924,  0.1330],
        [-0.2100, -1.4738,  1.8581],
        [ 0.7888, -1.0404,  0.2781],
        [-0.4924, -0.6868,  1.5480],
        [ 0.0996, -1.6674,  1.9011],
        [-0.3838, -1.4027,  2.4245],
        [ 0.1024, -1.3063,  1.9244],
        [ 0.7203, -0.4583,  0.1800],
        [ 0.9301, -0.1094, -0.0632],
        [ 0.3999,  0.2627, -0.1252],
        [ 0.19

(Epoch 8) TRAIN LOSS:0.5783 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:02<00:06,  2.67it/s]

SequenceClassifierOutput(loss=tensor(0.5015, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.9193, -0.1568, -0.1244],
        [ 0.4653, -0.2250, -0.0799],
        [ 0.1048, -1.6081,  2.2779],
        [-0.3515, -1.5922,  2.2050],
        [ 0.5887, -0.9572,  1.2352],
        [-0.3744, -1.1606,  2.1630],
        [ 0.1176, -1.2438,  1.7990],
        [-0.1270, -1.2110,  1.4560],
        [ 0.4567, -0.0959, -0.2375],
        [-0.0086, -0.9388,  1.4031],
        [-0.2270, -1.5881,  1.2354],
        [ 1.1573, -0.6049, -0.2970],
        [ 0.4936, -0.5746,  0.4297],
        [ 0.4876, -0.2705, -0.5408],
        [-0.3830, -1.4947,  2.4252],
        [-0.1760, -1.5242,  2.0655],
        [ 0.5538,  0.1529, -0.3207],
        [ 0.2641, -0.9127,  0.5987],
        [-0.2924, -1.3810,  2.1439],
        [ 0.6707, -0.7276,  0.3631],
        [-0.4498, -1.2971,  2.0815],
        [ 0.9056, -0.1967, -0.4322],
        [-0.4508, -1.1240,  2.3051],
        [ 0.0890, -0.8821,  1.7187],
        [ 1.17

(Epoch 8) TRAIN LOSS:0.5706 LR:0.00000300:  40%|████████████████                        | 10/25 [00:03<00:05,  2.65it/s]

SequenceClassifierOutput(loss=tensor(0.6601, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.9081, -0.3213, -0.4201],
        [-0.0518, -1.2049,  2.0138],
        [ 0.8753, -0.5890,  0.0383],
        [-0.1503, -1.4710,  1.8725],
        [ 0.8680, -0.5443, -0.3552],
        [ 0.4588, -0.6512,  0.3740],
        [-0.5683, -1.4732,  2.3364],
        [-0.5332, -1.2001,  2.5231],
        [ 0.9279, -0.7581,  0.5229],
        [ 0.7710, -1.3888,  0.5980],
        [ 0.7427, -0.7432,  0.3297],
        [-0.3570, -1.6224,  2.2476],
        [ 0.7702,  0.1447, -0.1473],
        [-0.6939, -1.1508,  2.2078],
        [ 0.8615, -0.7180,  0.2843],
        [ 0.4273, -0.2410, -0.2839],
        [ 0.4392, -1.2857,  0.6083],
        [ 0.4516, -1.3629,  1.5582],
        [ 0.3013, -1.1419,  1.3162],
        [ 0.8357,  0.1937, -0.7667],
        [ 0.0857, -0.8398,  1.3606],
        [ 0.5530, -0.0764, -0.4271],
        [ 1.0807, -0.6784, -0.0188],
        [ 0.4463, -0.1894, -0.4289],
        [ 0.67

(Epoch 8) TRAIN LOSS:0.5787 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:03<00:05,  2.66it/s]

SequenceClassifierOutput(loss=tensor(0.7184, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2301, -0.4239,  0.6949],
        [-0.4226, -1.4290,  2.2380],
        [ 0.0190, -1.2714,  1.5663],
        [ 0.4798, -0.4425,  0.1222],
        [-0.0321, -1.2457,  1.6345],
        [ 0.6858,  0.1729, -0.7630],
        [ 0.3035, -1.2628,  1.7255],
        [-0.1040, -1.4419,  2.1822],
        [ 0.5272, -0.4306,  0.2648],
        [ 0.3553, -0.9724,  1.0209],
        [-0.3650, -1.4918,  2.4443],
        [ 0.6904, -0.9265,  0.2921],
        [ 0.6151,  0.4002, -0.8928],
        [ 0.6574, -0.0673, -0.1890],
        [ 0.2287, -1.1539,  1.1199],
        [ 0.7462, -1.1308,  1.1940],
        [ 0.8210, -0.1938, -0.3084],
        [-0.6194, -1.7177,  2.4915],
        [ 0.4746,  0.1822, -0.4832],
        [ 0.6459, -0.9317,  0.8325],
        [-0.5108, -1.4573,  1.8312],
        [ 1.2109, -0.6318,  0.3051],
        [-0.7556, -1.1870,  1.8661],
        [-0.2875, -1.6118,  2.1638],
        [ 1.13

(Epoch 8) TRAIN LOSS:0.5904 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:04<00:04,  2.65it/s]

SequenceClassifierOutput(loss=tensor(0.6938, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8701,  0.2834, -0.5280],
        [-0.2394, -1.3318,  1.7467],
        [ 0.3531, -0.7030,  0.4008],
        [ 0.1272, -1.1750,  1.5523],
        [ 0.1029, -0.9443,  1.4262],
        [-0.4107, -0.9529,  1.6584],
        [-0.4218, -1.6736,  2.2637],
        [-0.3526, -1.3497,  2.0612],
        [-0.1333, -1.3100,  2.0542],
        [ 0.9090,  0.0690, -0.3714],
        [-0.3097, -1.2121,  1.9536],
        [ 0.3289, -0.9713,  1.5474],
        [ 0.6713, -1.0818,  1.0417],
        [ 0.7245, -0.9273,  0.6416],
        [-0.0650, -1.3650,  1.6358],
        [-0.2258, -1.3315,  2.1358],
        [-0.2621, -1.1604,  2.1034],
        [ 0.4441, -0.2127, -0.2533],
        [ 0.7842, -0.4247, -0.0058],
        [ 0.0212, -1.0810,  1.1120],
        [ 0.1203, -0.5872,  0.6508],
        [ 0.2885, -1.3799,  1.0828],
        [-0.3940, -1.1643,  1.7789],
        [ 1.1033,  0.2972, -1.0160],
        [ 0.58

(Epoch 8) TRAIN LOSS:0.5983 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:04<00:04,  2.59it/s]

SequenceClassifierOutput(loss=tensor(0.6669, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.7828e-01, -8.1674e-01,  1.4740e+00],
        [ 6.5275e-01,  2.6509e-01, -6.3280e-01],
        [-5.9710e-01, -1.5318e+00,  1.8951e+00],
        [ 9.0069e-01, -9.9992e-01, -4.4296e-02],
        [ 5.5652e-01, -2.8543e-01, -5.6856e-01],
        [ 6.1746e-01, -2.6040e-01, -1.9167e-01],
        [-3.6357e-01, -1.0736e+00,  1.8423e+00],
        [ 3.0780e-01, -6.2466e-02,  1.9225e-02],
        [ 6.7634e-01, -4.5002e-01, -1.0074e-01],
        [ 2.5257e-01,  1.4519e-01, -5.1806e-01],
        [-7.8077e-01, -1.1709e+00,  1.9356e+00],
        [-2.4938e-03, -1.1001e+00,  1.9169e+00],
        [-3.5741e-01, -1.1928e+00,  2.1901e+00],
        [ 6.5756e-01, -1.7858e-01, -5.6319e-01],
        [ 5.9322e-01, -5.9237e-01, -1.3362e-01],
        [ 9.9766e-01, -8.6842e-01,  4.2528e-01],
        [ 1.4847e+00, -3.1494e-01, -2.0871e-01],
        [ 5.6719e-01, -1.0838e+00,  1.3541e+00],
        [-3.0125e-01

(Epoch 8) TRAIN LOSS:0.6032 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:04<00:04,  2.55it/s]

SequenceClassifierOutput(loss=tensor(0.5498, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3773, -1.3701,  2.6443],
        [-0.1992, -1.6408,  1.9627],
        [ 0.6083,  0.3657, -0.6718],
        [ 0.3979,  0.2726,  0.1359],
        [-0.0193, -1.4799,  1.9917],
        [ 0.3544, -1.2454,  1.5546],
        [ 0.6060, -0.2687, -0.7036],
        [-0.6321, -1.2577,  2.0074],
        [-0.0542, -1.4392,  1.7980],
        [ 0.5027, -0.5077,  0.0285],
        [-0.2900, -1.6476,  2.2060],
        [ 0.0587, -1.6641,  1.6750],
        [ 0.6859, -0.1631, -0.5422],
        [ 0.5744, -0.3353, -0.4717],
        [ 0.9833, -0.1894, -0.4222],
        [ 0.8083, -0.0309, -0.6382],
        [-0.1209, -1.5341,  1.9379],
        [ 0.6696, -0.4545, -0.0538],
        [ 0.4775, -1.4093,  1.3952],
        [ 0.4881, -0.9036,  0.7272],
        [-0.1894, -1.0655,  1.7835],
        [ 0.5481,  0.1061, -0.9393],
        [-0.1455, -1.3128,  2.1770],
        [ 0.8324,  0.0131, -0.5898],
        [-0.55

(Epoch 8) TRAIN LOSS:0.5997 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:05<00:04,  2.48it/s]

SequenceClassifierOutput(loss=tensor(0.4832, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-1.0654e-01, -1.2741e+00,  2.2239e+00],
        [ 4.7005e-01, -2.7969e-01, -9.5869e-02],
        [ 3.2905e-01, -2.5104e-01, -1.5559e-02],
        [-4.5613e-01, -1.3373e+00,  2.0328e+00],
        [-2.4293e-01, -1.1254e+00,  2.0650e+00],
        [ 7.3452e-01, -4.1097e-02, -4.9164e-02],
        [ 1.7101e-02, -4.3161e-01,  6.0064e-01],
        [-2.1969e-01, -1.7020e+00,  2.1331e+00],
        [-3.1984e-02, -1.5266e+00,  1.7940e+00],
        [-5.6534e-01, -1.4822e+00,  2.3056e+00],
        [ 1.1313e+00, -3.3630e-01, -2.8976e-01],
        [-1.6081e-01, -1.4460e+00,  1.6355e+00],
        [ 6.3919e-01, -7.6071e-01,  2.8910e-01],
        [ 4.7525e-01, -8.8880e-01,  7.8248e-01],
        [ 4.6135e-01, -1.8222e-01, -1.8853e-01],
        [-1.9491e-01, -1.6009e+00,  2.4420e+00],
        [ 9.2602e-02, -1.5055e+00,  1.5264e+00],
        [ 4.4207e-01, -7.1618e-01,  9.2193e-01],
        [ 4.3772e-01

(Epoch 8) TRAIN LOSS:0.5924 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:05<00:03,  2.54it/s]

SequenceClassifierOutput(loss=tensor(0.5198, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0145, -1.5448,  1.4765],
        [ 0.2567, -0.1007, -0.0810],
        [ 0.7809, -0.0110, -0.2207],
        [ 0.1482, -1.2626,  1.6185],
        [ 0.9377, -0.6733,  0.0244],
        [-0.5775, -1.1128,  2.2667],
        [-0.4501, -1.4705,  2.5753],
        [-0.1775, -1.3342,  1.8935],
        [ 0.2645, -1.3974,  1.7556],
        [ 0.4436, -0.3459,  0.1908],
        [ 0.2526, -1.3650,  1.7383],
        [-0.2755, -1.3921,  1.6045],
        [ 1.1196, -0.1939, -0.6842],
        [ 0.5314,  0.2029, -0.9838],
        [ 0.7754, -0.7655, -0.3146],
        [-0.0937, -1.3773,  2.3243],
        [-0.1271, -1.3100,  1.7884],
        [-0.4641, -1.2163,  2.3064],
        [-0.1446, -1.2122,  1.7611],
        [ 0.5222, -0.4843,  0.6301],
        [-0.3412, -1.7144,  2.3666],
        [ 0.0877, -1.3789,  1.7906],
        [-0.0972, -1.3938,  2.0707],
        [-0.1877, -1.1121,  1.5281],
        [ 0.64

(Epoch 8) TRAIN LOSS:0.5881 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:06<00:03,  2.54it/s]

SequenceClassifierOutput(loss=tensor(0.6551, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7425, -0.2556, -0.1996],
        [ 0.5207,  0.4647, -0.4285],
        [ 0.8643, -0.9627,  0.3619],
        [ 0.1165, -1.3303,  2.0102],
        [-0.5101, -1.1362,  2.5252],
        [ 0.9477, -0.0321, -0.4843],
        [ 0.0561, -1.0360,  1.9411],
        [-0.6847, -1.4616,  2.2019],
        [ 0.3020, -0.2656,  0.0361],
        [-0.3780, -1.1097,  2.0995],
        [ 0.8766, -0.4917,  0.4453],
        [-0.3796, -1.2797,  2.2613],
        [ 0.0841, -1.3823,  1.9832],
        [ 1.1309, -0.6518, -0.0452],
        [-0.4553, -1.2066,  2.3377],
        [ 0.4910, -0.7560,  0.2414],
        [ 0.5155, -0.8918,  0.6932],
        [ 0.3629,  0.2448, -0.4686],
        [ 1.2892, -0.4866, -0.0653],
        [ 1.3788,  0.0852,  0.3750],
        [-0.1979, -1.2799,  1.8907],
        [ 0.8185, -0.6146, -0.0609],
        [ 0.7561, -0.6909,  0.4237],
        [-0.0072, -1.4925,  2.1792],
        [-0.14

(Epoch 8) TRAIN LOSS:0.5918 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:06<00:02,  2.63it/s]

SequenceClassifierOutput(loss=tensor(0.6843, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0386, -1.1963,  2.0073],
        [-0.5211, -1.4069,  2.3976],
        [ 0.6274,  0.1042, -0.7258],
        [ 0.4335, -1.3602,  1.6206],
        [ 1.0120, -0.1065, -0.9405],
        [ 0.2095, -0.5735, -0.3065],
        [ 0.6947,  0.0951, -0.6641],
        [ 0.6357, -0.2387,  0.1493],
        [ 0.4859, -0.0786,  0.0194],
        [ 0.2304, -0.3797, -0.3879],
        [ 0.8307, -1.1971,  0.5708],
        [ 0.2952, -0.0852, -0.5476],
        [ 0.1646, -0.7134,  0.7741],
        [ 0.5489, -0.0612, -0.8171],
        [ 0.1952,  0.0816, -0.2308],
        [-0.6604, -1.8071,  2.1721],
        [-0.2088, -0.7954,  1.2442],
        [ 0.5601, -0.0269, -0.3655],
        [ 0.2263, -1.3808,  1.7677],
        [ 0.4810, -0.4992,  0.3753],
        [-0.2367, -1.4611,  1.8192],
        [ 0.5969, -1.3995,  0.9440],
        [ 0.0334, -1.4321,  1.9296],
        [-0.3249, -1.7137,  2.2936],
        [ 0.16

(Epoch 8) TRAIN LOSS:0.5967 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:06<00:02,  2.61it/s]

SequenceClassifierOutput(loss=tensor(0.4622, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-6.3979e-02, -1.1259e+00,  1.7387e+00],
        [-5.9144e-01, -1.1572e+00,  1.5775e+00],
        [ 1.1199e+00, -3.9557e-01, -4.5178e-01],
        [ 1.0447e+00, -2.6245e-01, -1.5627e-01],
        [-4.0947e-01, -1.5548e+00,  2.3415e+00],
        [-1.6921e-01, -1.1420e+00,  1.5234e+00],
        [ 4.8670e-01, -2.5733e-01, -3.7183e-03],
        [ 2.8822e-01, -1.0024e+00,  1.1661e+00],
        [-7.4669e-01, -1.6210e+00,  2.2113e+00],
        [ 7.9905e-01, -7.1362e-01,  5.6369e-01],
        [-4.4189e-01, -1.0359e+00,  1.4155e+00],
        [ 2.1592e-01, -1.4038e+00,  1.4563e+00],
        [-2.8741e-01, -1.2066e+00,  2.0129e+00],
        [ 1.0256e-01, -1.2783e+00,  1.6967e+00],
        [ 6.1799e-01, -5.2395e-01,  2.9663e-01],
        [ 9.6046e-01, -4.4510e-01, -6.0991e-01],
        [ 4.7771e-01, -1.2825e-01, -4.0211e-01],
        [ 2.8744e-01, -1.2220e+00,  1.7821e+00],
        [ 1.0133e+00

(Epoch 8) TRAIN LOSS:0.5900 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:07<00:01,  2.58it/s]

SequenceClassifierOutput(loss=tensor(0.4611, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 9.4233e-01, -5.4417e-01,  4.4974e-01],
        [ 1.3807e-02, -1.0062e+00,  1.8490e+00],
        [ 4.4917e-01,  5.2034e-01, -8.7264e-01],
        [-3.5677e-01, -1.5058e+00,  2.1968e+00],
        [ 2.1206e-01, -2.1099e-01, -4.5037e-01],
        [ 3.2810e-01,  2.2075e-01, -7.4607e-04],
        [ 2.9992e-01, -3.2927e-02, -2.1960e-01],
        [ 7.3676e-01, -3.7669e-01, -3.9157e-01],
        [ 3.3053e-01, -1.3847e+00,  1.1525e+00],
        [-4.3965e-01, -1.3242e+00,  1.9916e+00],
        [ 1.0510e+00, -1.9780e-01, -4.7947e-01],
        [ 7.5074e-01,  1.6301e-01, -1.2697e+00],
        [-6.1837e-01, -1.7063e+00,  2.3229e+00],
        [-4.4034e-01, -1.2197e+00,  2.0915e+00],
        [-2.3393e-01, -1.2172e+00,  2.1030e+00],
        [-1.7507e-01, -8.4123e-01,  1.7520e+00],
        [ 5.7337e-01, -1.7737e-01, -6.8669e-02],
        [-2.2919e-01, -1.4328e+00,  1.7860e+00],
        [ 4.4137e-01

(Epoch 8) TRAIN LOSS:0.5783 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:07<00:01,  2.99it/s]

SequenceClassifierOutput(loss=tensor(0.4621, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 4.3483e-01,  2.3205e-01, -5.8333e-01],
        [-2.4167e-01, -1.3619e+00,  2.2938e+00],
        [ 6.1080e-01, -6.1511e-02, -6.1829e-01],
        [ 7.9729e-02, -1.4564e+00,  1.8305e+00],
        [-4.1077e-01, -1.5161e+00,  2.5543e+00],
        [ 6.4751e-01, -6.2668e-01, -2.4213e-01],
        [-1.4389e-01, -5.8318e-01, -2.1346e-01],
        [ 6.6847e-01, -2.0792e-01, -5.7349e-01],
        [-9.9776e-02, -1.3417e+00,  1.4012e+00],
        [ 4.8367e-01, -1.3430e+00,  1.6855e+00],
        [-6.9706e-02, -1.0659e+00,  1.4297e+00],
        [-3.6369e-01, -1.3812e+00,  2.4281e+00],
        [-1.5329e-01, -1.7279e+00,  1.4491e+00],
        [-3.6754e-02, -1.5792e+00,  2.1035e+00],
        [ 3.7758e-01,  1.0050e-01, -5.0416e-01],
        [-4.1074e-01, -1.5101e+00,  2.3250e+00],
        [-3.6676e-02, -1.3980e+00,  1.8153e+00],
        [ 2.2669e-01, -9.0836e-01,  7.7642e-01],
        [ 6.1357e-01

(Epoch 8) TRAIN LOSS:0.5874 LR:0.00000300:  96%|██████████████████████████████████████▍ | 24/25 [00:08<00:00,  3.82it/s]

SequenceClassifierOutput(loss=tensor(0.7802, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0568, -1.3100,  2.0219],
        [ 0.6814,  0.2166, -0.6994],
        [ 1.0501, -0.2475, -0.4895],
        [ 1.1887, -0.6633,  0.1068],
        [-0.3613, -1.3927,  2.1873],
        [-0.2192, -1.0479,  2.0929],
        [ 0.5498, -0.0778, -0.5251],
        [ 0.6156, -0.9523,  0.5400],
        [ 0.7016,  0.2122, -0.7792],
        [ 0.4281, -1.2551,  1.5368],
        [ 1.2067,  0.0705, -0.8936],
        [ 0.3221,  0.3305, -0.6367],
        [ 0.6473, -0.7720,  0.8210],
        [ 0.6353,  0.1106, -0.4467],
        [ 0.1055, -0.8381,  1.2033],
        [-0.2374, -1.2302,  2.1910],
        [ 0.6259, -0.4141,  0.6134],
        [ 0.6027, -0.7606,  0.7399],
        [-0.4219, -1.7157,  2.3199],
        [-0.0122, -1.5805,  1.3779],
        [ 0.9982, -0.2811, -0.7574],
        [-0.0186, -1.4138,  1.7457],
        [ 0.8352, -0.6319,  0.4637],
        [ 0.3105, -1.2617,  1.6031],
        [-0.43

(Epoch 8) TRAIN LOSS:0.5936 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:08<00:00,  2.95it/s]

(Epoch 8) TRAIN LOSS:0.5936 ACC:0.76 F1:0.53 REC:0.56 PRE:0.55 LR:0.00000300



(Epoch 9) TRAIN LOSS:0.4299 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:06,  3.54it/s]

SequenceClassifierOutput(loss=tensor(0.4299, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4073, -0.1033, -0.4930],
        [ 0.5009, -1.3202,  1.5777],
        [ 0.6599, -1.1946,  0.7877],
        [ 0.6519,  0.4713, -0.8756],
        [ 0.7595, -0.2837, -0.5385],
        [ 0.1299, -0.2028,  0.4005],
        [ 0.0860, -1.4355,  1.7787],
        [ 0.2825, -1.1676,  1.4514],
        [-0.3811, -1.6626,  1.9884],
        [-0.4238, -1.4798,  2.1098],
        [-0.4627, -1.4652,  2.2465],
        [ 0.2077, -1.1118,  1.1859],
        [-0.2188, -0.9090,  1.5817],
        [ 0.4629, -0.9400,  0.8099],
        [ 0.5295, -1.4219,  1.4973],
        [-0.3700, -1.6253,  1.9685],
        [ 0.9360, -0.4219, -0.0773],
        [ 0.4470, -0.1314, -0.0790],
        [-0.1447, -1.6937,  2.1944],
        [ 1.1907, -0.8519, -0.1085],
        [ 0.3372,  0.0074, -0.8982],
        [ 1.3071, -0.6740, -0.3344],
        [ 0.8923,  0.3500, -0.3501],
        [ 1.0480, -0.7204,  0.0750],
        [-0.35

(Epoch 9) TRAIN LOSS:0.4426 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:05,  4.21it/s]

SequenceClassifierOutput(loss=tensor(0.4553, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-7.2462e-02, -9.4337e-01,  1.9171e+00],
        [ 7.5859e-01, -1.1903e+00,  1.2137e+00],
        [ 2.3756e-01, -6.9613e-02, -4.6723e-01],
        [ 2.7072e-01, -3.1166e-01,  9.5731e-02],
        [ 2.1391e-01, -1.3826e+00,  1.8814e+00],
        [-2.6285e-01, -1.1682e+00,  1.4387e+00],
        [-4.3090e-01, -1.2093e+00,  2.2585e+00],
        [ 1.0676e+00, -7.7540e-01,  1.4572e-01],
        [-1.3496e-01, -8.8188e-01,  1.3180e+00],
        [-4.3754e-01, -1.7622e+00,  2.2501e+00],
        [ 6.1886e-01, -9.9477e-01,  4.8062e-01],
        [ 4.0162e-01, -1.0885e+00,  1.5563e+00],
        [-1.1125e-01, -1.1983e+00,  2.3067e+00],
        [-6.8090e-02, -1.6152e+00,  2.1133e+00],
        [ 7.2837e-01,  1.7867e-01, -4.2422e-01],
        [-9.0605e-02, -1.4796e+00,  1.8682e+00],
        [ 1.0815e+00, -7.8261e-01,  9.4047e-02],
        [-2.6101e-02, -1.5509e+00,  1.8253e+00],
        [-5.0376e-01

(Epoch 9) TRAIN LOSS:0.5136 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:04,  4.52it/s]

SequenceClassifierOutput(loss=tensor(0.6556, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 8.4574e-01, -1.1445e+00,  7.4075e-01],
        [ 4.3513e-01,  1.2089e-02,  5.1644e-02],
        [ 8.4760e-01,  1.3739e-01, -4.3323e-01],
        [-5.5729e-01, -1.5732e+00,  2.5930e+00],
        [-2.2881e-01, -1.6559e+00,  2.2204e+00],
        [ 4.0779e-03, -1.5862e+00,  2.0090e+00],
        [ 4.3342e-01,  7.0137e-02, -6.9534e-01],
        [-7.5945e-01, -1.8232e+00,  2.2507e+00],
        [ 8.4921e-01, -8.0196e-01, -1.8447e-01],
        [ 6.7417e-01,  5.5063e-01, -8.8386e-01],
        [ 1.7125e-01, -8.6444e-01,  9.7046e-01],
        [-2.3640e-01, -1.1329e+00,  1.9740e+00],
        [-4.6449e-01, -1.2188e+00,  2.0842e+00],
        [ 7.5133e-01,  2.7989e-02, -1.0190e-01],
        [ 3.4334e-01, -1.3264e+00,  1.7371e+00],
        [ 2.8588e-01, -1.1850e+00,  1.2638e+00],
        [ 4.8167e-01, -2.3530e-03, -1.8561e-01],
        [ 9.4099e-01, -7.9648e-02, -3.2726e-01],
        [ 4.2295e-01

(Epoch 9) TRAIN LOSS:0.5272 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:04,  4.75it/s]

SequenceClassifierOutput(loss=tensor(0.6656, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7409, -0.1156, -0.5020],
        [ 0.9496, -1.2613,  0.6871],
        [ 0.7759, -0.3452, -0.4861],
        [ 0.7039,  0.5314, -0.4981],
        [-0.3306, -1.4136,  2.2678],
        [-0.2153, -1.6274,  2.2367],
        [ 0.9863,  0.2961, -0.7101],
        [-0.0716, -1.3521,  1.5184],
        [-0.0037, -1.1342,  2.1855],
        [ 0.5497, -1.2612,  1.4812],
        [ 0.2190, -1.5627,  1.8677],
        [ 0.5065,  0.0954, -0.6246],
        [ 0.5257, -0.8992,  1.1346],
        [ 0.6947,  0.1098, -0.5146],
        [ 0.8934, -0.4117, -0.8149],
        [ 0.8130, -0.1179,  0.0616],
        [ 1.4882, -0.7147, -0.2809],
        [ 1.0284, -1.0840, -0.2713],
        [-0.1857, -1.4945,  2.4241],
        [ 1.3353, -0.4999, -0.3641],
        [ 0.4770, -0.0635, -0.5340],
        [-0.0706, -1.5346,  2.2961],
        [-0.3509, -1.7430,  2.0102],
        [-0.0267, -1.3606,  2.0154],
        [ 0.30

(Epoch 9) TRAIN LOSS:0.5460 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:01<00:03,  4.86it/s]

SequenceClassifierOutput(loss=tensor(0.6082, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0133, -0.3294, -0.2452],
        [-0.5555, -1.2949,  2.2390],
        [-0.6924, -1.0544,  2.2172],
        [-0.3192, -1.3607,  2.5433],
        [ 1.1353, -0.6113, -0.1045],
        [-0.3520, -1.5141,  2.0361],
        [-0.1466, -1.5572,  1.9072],
        [ 0.2808, -0.0847,  0.2789],
        [ 0.3857,  0.1834, -0.6032],
        [ 0.4525,  0.4388, -0.5413],
        [ 0.7974, -0.6879,  0.0180],
        [ 1.0675, -1.0896,  0.7686],
        [ 0.1410, -1.2146,  1.3593],
        [ 0.6888, -0.1886,  0.1945],
        [ 1.2927, -0.1798, -0.8223],
        [ 0.0818, -1.7707,  2.0400],
        [ 0.0960, -1.1166,  0.7496],
        [ 0.4626, -0.0574, -0.6258],
        [ 0.3801,  0.0195, -0.1904],
        [ 0.7395, -0.2919, -0.2199],
        [ 0.5799, -1.4628,  0.9380],
        [-0.1182, -1.2418,  1.8343],
        [ 0.5252,  0.0200, -1.1135],
        [ 0.5686, -1.4021,  2.0074],
        [ 0.81

(Epoch 9) TRAIN LOSS:0.5561 LR:0.00000300:  32%|█████████████                            | 8/25 [00:01<00:03,  4.74it/s]

SequenceClassifierOutput(loss=tensor(0.6267, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2316, -1.3167,  1.5987],
        [ 0.3084, -1.2929,  1.4582],
        [ 0.2128, -1.2176,  1.8630],
        [-0.4831, -1.2056,  2.2674],
        [-0.4145, -0.9060,  1.9945],
        [-0.3575, -1.3551,  2.2579],
        [-0.1498, -1.6986,  2.2878],
        [-0.7562, -1.6689,  2.7127],
        [-0.1198, -1.6907,  2.1052],
        [ 0.5955,  0.0172, -0.9415],
        [ 0.9103, -0.1709, -0.6070],
        [ 0.0582, -1.3578,  1.2826],
        [ 1.5482, -0.9234, -0.0325],
        [ 0.4522, -0.4702,  0.2285],
        [-0.1616, -1.2733,  2.0103],
        [ 1.1190,  0.2319, -0.5365],
        [ 0.8821, -0.7798,  0.0436],
        [ 1.0746, -0.8567,  0.2294],
        [ 0.9894, -0.3477,  0.5481],
        [ 0.8809, -0.0380, -0.5761],
        [-0.0442, -1.1735,  1.7351],
        [-0.2651, -1.3577,  1.9884],
        [ 0.1604,  0.1354, -0.1794],
        [-0.2180, -1.5362,  2.0436],
        [ 0.14

(Epoch 9) TRAIN LOSS:0.5682 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:01<00:03,  4.92it/s]

SequenceClassifierOutput(loss=tensor(0.6651, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5211, -1.3411,  1.8605],
        [ 0.5630,  0.4100, -1.2757],
        [ 1.2694, -0.1706, -0.3188],
        [ 0.3756, -1.1189,  1.0183],
        [-0.5051, -1.6774,  2.3870],
        [ 0.5325,  0.4755, -0.8289],
        [-0.4379, -1.0599,  2.4874],
        [-0.2177, -1.3694,  1.9601],
        [ 0.3442, -0.0148, -0.5474],
        [ 0.8954, -0.2686, -0.3637],
        [ 0.4767, -1.3424,  0.8737],
        [-0.4325, -1.4596,  2.4335],
        [ 0.7709, -1.1458,  0.8862],
        [ 0.3683, -0.6966,  0.5636],
        [ 0.1320, -1.0773,  1.2515],
        [ 0.7020, -0.2374, -0.2136],
        [ 1.0229, -0.6592,  0.1321],
        [ 0.4392, -0.8060,  1.2661],
        [ 0.4750,  0.0901, -0.6646],
        [ 0.5886, -0.0096, -0.3920],
        [ 0.3807, -1.3283,  1.5855],
        [-0.2332, -1.4717,  2.7394],
        [ 0.2324, -1.6059,  1.4607],
        [ 0.8989, -0.3835, -0.9753],
        [ 0.76

(Epoch 9) TRAIN LOSS:0.5975 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:02<00:02,  4.74it/s]

SequenceClassifierOutput(loss=tensor(0.7985, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1829, -1.6657,  2.2389],
        [ 0.7492, -0.9920,  0.3660],
        [ 0.7477, -1.1538,  0.6991],
        [-0.2467, -1.2473,  1.7342],
        [-0.2552, -1.5795,  2.5849],
        [ 0.1264, -0.7568,  0.7715],
        [-0.4002, -1.6248,  2.0610],
        [ 0.8768, -1.1388,  0.6653],
        [-0.0699, -1.1873,  2.3161],
        [ 0.0250, -1.6842,  1.9551],
        [ 0.6934, -0.0416, -0.3145],
        [-0.5188, -1.5947,  2.2723],
        [ 0.8667, -0.6996, -0.0072],
        [ 0.5470, -0.8955,  0.4385],
        [ 0.7156,  0.4026, -0.4372],
        [ 0.6184, -1.3510,  1.1335],
        [ 1.0979, -0.6893, -0.4107],
        [ 0.5527, -1.7875,  1.5387],
        [ 1.0500, -0.7554,  0.2308],
        [ 0.8366, -0.6191,  0.8555],
        [ 0.9393, -0.6475, -0.2658],
        [-0.2739, -1.4343,  2.1562],
        [ 0.1439, -1.4094,  2.2747],
        [-0.1205, -1.1618,  2.0200],
        [ 0.13

(Epoch 9) TRAIN LOSS:0.5796 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:02<00:02,  4.79it/s]

SequenceClassifierOutput(loss=tensor(0.3828, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8379,  0.1767, -0.0827],
        [ 0.0079, -1.4915,  2.0666],
        [-0.3322, -1.4880,  2.4811],
        [-0.5419, -1.4133,  2.6029],
        [-0.3037, -1.4371,  2.6455],
        [-0.1083, -1.7702,  2.0107],
        [ 1.1587,  0.4011, -0.9973],
        [ 0.3744, -1.1397,  0.6921],
        [ 1.0683, -0.1039, -0.4979],
        [-0.7909, -1.2761,  2.3599],
        [-0.5747, -1.4818,  2.5504],
        [ 0.4832, -0.1752, -0.3518],
        [-0.2267, -1.3369,  1.9671],
        [ 0.8306,  0.6953, -1.1716],
        [ 0.9033, -0.5219, -0.2213],
        [ 0.0232, -1.4616,  2.0018],
        [ 0.4768, -1.5652,  1.4079],
        [-0.1353, -1.1441,  2.0205],
        [ 0.5753, -0.9236,  1.2655],
        [ 1.2835, -0.8595, -0.3991],
        [ 1.1353, -0.6883, -0.7298],
        [ 0.5402, -0.9430,  1.0920],
        [ 0.0525, -1.4084,  2.0126],
        [-0.4904, -1.3186,  2.3513],
        [ 0.13

(Epoch 9) TRAIN LOSS:0.5758 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:02<00:02,  4.67it/s]

SequenceClassifierOutput(loss=tensor(0.5305, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6562,  0.1707, -1.0631],
        [-0.2916, -1.6327,  2.2024],
        [ 0.1927, -0.9646,  1.0303],
        [-0.5828, -1.6391,  2.5947],
        [ 0.5278, -0.3735,  0.5820],
        [ 1.1387, -0.4872, -0.3344],
        [-0.6345, -1.4087,  2.4875],
        [-0.1855, -1.5058,  2.0209],
        [-0.7259, -1.2543,  2.6886],
        [-0.1837, -1.5007,  1.5487],
        [-0.4602, -1.4982,  2.4045],
        [-0.6731, -1.5452,  2.2153],
        [ 1.2247,  0.0278, -0.7853],
        [-0.0172, -1.4903,  1.9207],
        [-0.5110, -1.7750,  2.6225],
        [ 1.1490, -0.9645,  0.4296],
        [-0.4710, -1.4624,  1.9715],
        [ 0.7818,  0.1130, -0.1659],
        [ 0.7253, -0.4520, -0.0421],
        [-0.2302, -1.3540,  2.5331],
        [-0.4343, -1.5910,  2.1716],
        [-0.1865, -1.6128,  2.4437],
        [ 0.0307, -1.7685,  1.4797],
        [-0.4991, -1.5367,  2.4192],
        [ 0.66

(Epoch 9) TRAIN LOSS:0.5828 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:03<00:02,  4.48it/s]

SequenceClassifierOutput(loss=tensor(0.6730, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2681,  0.1707, -0.6790],
        [-0.3373, -1.5188,  2.0852],
        [ 0.9863, -0.2845, -0.6453],
        [ 0.1213, -0.9423,  1.3687],
        [-0.3009, -1.3515,  2.0546],
        [ 1.0759, -0.0089, -0.7337],
        [-0.3587, -1.6309,  2.0485],
        [ 0.9666, -0.0624, -0.1917],
        [-0.2986, -1.5661,  1.9044],
        [ 0.1565, -1.7915,  1.7150],
        [ 1.0940, -0.5789,  0.0969],
        [ 0.0979, -1.3515,  1.7719],
        [ 0.9702, -1.0453,  0.4359],
        [ 1.2059,  0.0582, -1.0776],
        [ 0.8137,  0.0404, -0.0549],
        [ 0.2853, -1.2935,  1.5671],
        [ 0.1491,  0.3066,  0.0402],
        [ 1.0773, -0.7480, -0.1010],
        [ 0.0080, -1.3106,  1.8652],
        [ 0.9429, -0.0584, -0.3744],
        [ 0.4593, -0.5344,  0.1908],
        [ 0.6658, -1.2796,  1.0136],
        [ 0.9606,  0.1297, -0.9931],
        [ 0.4252, -0.4903, -0.1608],
        [-0.33

(Epoch 9) TRAIN LOSS:0.5763 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:03<00:02,  4.58it/s]

SequenceClassifierOutput(loss=tensor(0.4853, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4847, -0.4364, -0.1780],
        [ 0.9907, -0.4778, -0.8599],
        [ 0.3978, -0.5007, -0.1263],
        [-0.5026, -1.4051,  2.4446],
        [ 0.6560, -0.4171,  0.2186],
        [-0.4032, -1.4503,  2.2866],
        [ 0.5953,  0.2862, -0.9211],
        [-0.1133, -1.4084,  1.7133],
        [ 0.2457, -1.4726,  1.8032],
        [ 0.2609, -1.3888,  1.9800],
        [ 0.7539,  0.1154, -0.5609],
        [ 0.1270, -0.9446,  1.3060],
        [ 0.3891, -1.1844,  1.5515],
        [ 0.4011,  0.3086, -0.5602],
        [ 0.8004, -0.6178,  0.0970],
        [ 0.3577, -1.5945,  1.7257],
        [-0.1650, -1.2460,  2.0214],
        [ 0.5435, -0.2420, -0.4046],
        [ 0.8226, -0.0520,  0.0269],
        [-0.4224, -1.4850,  2.4742],
        [ 0.6660, -0.8018, -0.0513],
        [ 1.0131, -0.7702,  0.2073],
        [ 1.1285, -0.7188, -0.3423],
        [-0.3827, -1.6851,  2.6727],
        [ 1.05

(Epoch 9) TRAIN LOSS:0.5731 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:03<00:02,  3.67it/s]

SequenceClassifierOutput(loss=tensor(0.4601, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.4982, -1.4941,  2.4113],
        [-0.0606, -1.3853,  2.2525],
        [-0.6712, -1.3547,  2.6862],
        [ 0.6582, -0.3775, -0.3444],
        [ 0.5607,  0.3949, -0.9357],
        [-0.4189, -1.3047,  2.3259],
        [ 0.4453,  0.3878, -1.1479],
        [-0.4473, -1.5836,  2.4406],
        [ 0.6112, -1.5493,  0.8501],
        [ 0.5885,  0.4187, -0.2223],
        [-0.5597, -1.8197,  1.9940],
        [ 0.6020, -1.1571,  1.3621],
        [ 0.2531, -0.1683, -0.2747],
        [ 0.6716, -0.3482,  0.3927],
        [ 1.0494, -0.7382,  0.1603],
        [ 0.8105,  0.1737, -1.2723],
        [-0.2913, -1.6178,  2.3548],
        [-0.4225, -1.7887,  2.1027],
        [-0.4168, -1.5898,  2.5520],
        [-0.0421, -1.5703,  2.0051],
        [ 0.0292, -1.5161,  2.3110],
        [ 0.8665, -0.1096, -0.2302],
        [ 0.1208, -1.2597,  1.4804],
        [ 0.2778, -1.3015,  1.7821],
        [ 0.94

(Epoch 9) TRAIN LOSS:0.5664 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:04<00:02,  3.27it/s]

SequenceClassifierOutput(loss=tensor(0.6450, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0113, -1.7015,  2.2302],
        [-0.3194, -1.2177,  2.1214],
        [ 0.6721,  0.0837, -0.5999],
        [-0.1804, -1.7148,  2.1149],
        [ 1.2711, -0.9046, -0.3180],
        [ 0.5672, -1.3450,  0.9935],
        [ 1.4798, -0.7209,  0.0988],
        [ 1.1270, -0.4404, -0.4944],
        [ 1.0913, -1.2697,  0.4904],
        [-0.4535, -1.5700,  2.4259],
        [ 0.8092,  0.2423, -1.0270],
        [ 1.0491, -0.6072,  0.1934],
        [ 0.7660, -0.6541,  0.2333],
        [ 0.7401, -0.3469, -0.0172],
        [ 0.5989, -0.5388, -0.3705],
        [ 0.7208,  0.1138, -0.4153],
        [-0.4794, -1.6106,  2.4543],
        [-0.3371, -1.9349,  2.3823],
        [ 0.4336, -0.3947,  0.4723],
        [ 0.7850,  0.3163, -0.8685],
        [ 0.4743,  0.1017, -0.1516],
        [ 1.0623, -0.0757, -0.6241],
        [ 0.1355, -1.5073,  2.1850],
        [ 0.8226, -0.3895, -0.7750],
        [-0.52

(Epoch 9) TRAIN LOSS:0.5708 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:04<00:02,  2.99it/s]

SequenceClassifierOutput(loss=tensor(0.5567, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0132e+00, -1.8907e-01, -5.2494e-01],
        [ 5.4529e-01, -1.2598e+00,  1.3273e+00],
        [ 3.2169e-01,  4.0610e-01, -1.9455e-01],
        [-2.7015e-01, -1.3307e+00,  1.9953e+00],
        [ 4.8885e-02, -1.3672e+00,  1.2868e+00],
        [ 8.1725e-01, -8.4479e-01,  9.2042e-01],
        [ 1.0622e+00, -3.3758e-01, -5.5993e-01],
        [-4.9206e-01, -1.1564e+00,  2.5140e+00],
        [ 2.1117e-01, -1.1542e+00,  1.4460e+00],
        [-9.2927e-02, -1.5823e+00,  1.7021e+00],
        [ 7.0942e-01,  1.9276e-01, -7.3284e-01],
        [ 1.1721e+00, -2.2891e-01, -2.8744e-01],
        [ 3.6772e-01, -1.3946e+00,  2.1012e+00],
        [ 1.4040e+00, -8.5654e-01, -4.3154e-01],
        [-3.4226e-04, -1.0421e+00,  1.8081e+00],
        [-5.1738e-01, -1.6653e+00,  2.3742e+00],
        [ 4.1939e-01,  1.3259e-02,  7.9846e-02],
        [ 1.0441e+00, -3.6576e-01, -5.2560e-01],
        [ 9.6144e-01

(Epoch 9) TRAIN LOSS:0.5701 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:04<00:02,  2.75it/s]

SequenceClassifierOutput(loss=tensor(0.5863, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.2955, -0.4782, -0.5458],
        [ 0.2371, -0.8994,  0.9040],
        [ 0.2022, -1.0921,  0.9925],
        [ 0.2394, -0.1568,  0.1813],
        [-0.0653, -1.5060,  2.0922],
        [ 0.0137, -1.5475,  2.2202],
        [-0.2349, -1.6063,  2.6164],
        [ 0.9665,  0.1411, -0.4633],
        [ 0.2628, -1.3299,  1.5654],
        [ 0.1614,  0.0176, -0.3468],
        [ 1.0738, -0.2730, -0.9218],
        [ 0.5426, -0.4990,  0.3035],
        [ 0.3829, -0.6864,  0.6641],
        [-0.4299, -1.4903,  2.1550],
        [ 0.3743,  0.3347, -0.3999],
        [ 0.2838,  0.1589, -0.0537],
        [ 0.8923, -0.2901, -0.3875],
        [-0.2751, -1.1096,  1.7229],
        [ 0.9394, -1.1516, -0.2469],
        [ 0.4121,  0.0714, -0.3924],
        [-0.1080, -1.5285,  1.6088],
        [ 0.5377, -1.0638,  0.8813],
        [ 0.0047, -1.0887,  1.8719],
        [-0.4127, -1.5650,  2.1972],
        [-0.12

(Epoch 9) TRAIN LOSS:0.5709 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:05<00:01,  2.63it/s]

SequenceClassifierOutput(loss=tensor(0.5043, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.9625,  0.0098, -1.0914],
        [-0.0726, -1.7363,  1.7232],
        [-0.6089, -1.6790,  2.5903],
        [-0.0288, -1.5085,  1.8855],
        [-0.1546, -1.3292,  1.7885],
        [ 0.0677, -1.1155,  1.4385],
        [ 0.8561, -0.0272, -1.0989],
        [-0.0038, -1.5606,  2.0522],
        [ 0.3459, -1.1861,  1.5209],
        [ 0.6533,  0.3553, -0.6743],
        [ 0.6687, -1.0130,  0.3594],
        [ 0.6901, -1.3056,  0.8856],
        [ 1.2214, -0.4072, -0.7030],
        [-0.0248, -1.6421,  1.9649],
        [ 0.6653,  0.3886, -0.7722],
        [ 0.5119, -0.0793, -0.5616],
        [ 0.7203,  0.2382, -0.6278],
        [ 1.1706, -0.3560, -0.7478],
        [ 0.7101,  0.0342, -0.2885],
        [-0.0082, -1.4592,  1.6889],
        [-0.2617, -1.6297,  2.1394],
        [-0.1952, -1.5534,  2.1128],
        [-0.3214, -1.6345,  2.3269],
        [-0.2258, -1.4244,  2.1143],
        [ 0.57

(Epoch 9) TRAIN LOSS:0.5677 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:05<00:01,  2.53it/s]

SequenceClassifierOutput(loss=tensor(0.4373, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0035, -0.4763, -0.3110],
        [-0.2263, -1.5220,  2.1553],
        [ 0.4176, -0.3424, -0.0670],
        [ 0.1840, -1.6378,  1.9320],
        [-0.0444, -1.3007,  2.0827],
        [ 1.0499,  0.0037, -0.1544],
        [ 1.2502, -0.8005, -0.3813],
        [-0.1691, -1.4169,  1.9360],
        [ 0.0801, -1.4065,  1.6019],
        [ 0.3158, -1.4347,  1.8830],
        [-0.1667, -1.7667,  1.8346],
        [-0.1344, -1.7153,  2.4559],
        [ 0.7325,  0.0370, -0.6646],
        [ 0.5923, -1.0358,  0.7080],
        [ 1.1192,  0.1357, -1.1112],
        [-0.0887, -1.8101,  2.2224],
        [ 0.9454, -0.5191,  0.1158],
        [-0.5348, -1.3598,  2.3835],
        [ 0.6825, -0.7229,  0.4257],
        [ 1.0882, -0.3462, -0.6912],
        [ 1.2889, -0.2243, -0.8510],
        [-0.3079, -1.3375,  1.9070],
        [-0.1938, -1.2211,  2.0490],
        [ 0.1688, -0.9322,  1.3938],
        [ 1.43

(Epoch 9) TRAIN LOSS:0.5618 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:06<00:01,  2.61it/s]

SequenceClassifierOutput(loss=tensor(0.4449, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1986, -1.6079,  1.7597],
        [ 1.3287, -0.5815, -0.1746],
        [ 0.9931,  0.0792, -0.4098],
        [ 1.2717,  0.0878, -0.9521],
        [-0.2211, -1.6211,  2.3641],
        [-0.2432, -1.3787,  2.6741],
        [-0.5063, -1.6006,  2.3919],
        [ 0.7209, -1.5669,  1.7128],
        [ 0.2761, -1.4438,  1.9245],
        [ 0.9629,  0.0834, -0.5395],
        [-0.2858, -1.5195,  2.3630],
        [ 1.1814, -1.1382,  0.0862],
        [ 1.0451, -1.0028,  0.2824],
        [-0.4680, -1.5023,  2.5152],
        [-0.5279, -1.4412,  2.4770],
        [ 0.4765, -0.0597, -0.8055],
        [-0.3211, -1.8256,  2.3112],
        [-0.1762, -1.4525,  2.1302],
        [ 1.1752, -1.1247,  0.3002],
        [-0.1792, -1.6602,  2.0033],
        [-0.5124, -1.5930,  2.3709],
        [-0.5380, -1.7603,  2.4549],
        [ 0.7529, -0.4755,  0.4520],
        [-0.2753, -1.2992,  1.3458],
        [ 0.94

(Epoch 9) TRAIN LOSS:0.5567 LR:0.00000300:  92%|████████████████████████████████████▊   | 23/25 [00:06<00:00,  2.51it/s]

SequenceClassifierOutput(loss=tensor(0.6543, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0461, -1.0479,  1.4418],
        [ 0.4010,  0.0473, -0.8426],
        [ 1.5540, -0.8613, -0.0081],
        [-0.5347, -1.2586,  2.1878],
        [-0.3624, -1.5733,  2.1026],
        [ 1.1164, -0.5794, -0.5516],
        [-0.3103, -1.7896,  1.9915],
        [ 0.3746, -0.7822,  0.2177],
        [ 1.1759, -0.4531, -0.6056],
        [-0.0152, -1.4311,  2.0037],
        [ 0.0888, -1.7382,  1.6935],
        [-0.1189, -1.2610,  1.4371],
        [-0.4603, -1.4709,  2.4270],
        [ 0.8066,  0.3010, -0.4466],
        [ 0.6411, -0.0339, -0.7158],
        [ 1.2593, -0.9489,  0.4838],
        [ 0.9167,  0.4199, -0.9641],
        [ 0.4772, -0.9125,  1.1101],
        [ 0.8168, -0.3320,  0.4578],
        [ 0.3641, -0.1411, -0.0739],
        [-0.1018, -1.2954,  1.4001],
        [-0.4745, -1.4806,  2.4111],
        [ 0.3551, -1.4201,  1.2701],
        [ 1.2001, -0.2962, -0.0325],
        [-0.43

(Epoch 9) TRAIN LOSS:0.5563 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  2.82it/s]

SequenceClassifierOutput(loss=tensor(0.4496, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0809, -1.5325,  1.9588],
        [ 0.2026, -1.4468,  1.6630],
        [-0.2774, -1.4547,  2.2825],
        [ 1.0408, -0.8199, -0.4160],
        [-0.2176, -1.6487,  2.4329],
        [ 0.9702, -0.2984, -0.4537],
        [ 0.8625, -1.2152,  0.5372],
        [ 1.4527, -0.2732, -0.7615],
        [-0.0786, -1.4405,  1.8366],
        [ 0.3101, -0.6591,  0.4167],
        [-0.6723, -1.5964,  2.4443],
        [-0.4999, -1.2134,  1.7340],
        [-0.5711, -1.5182,  2.2731],
        [ 0.0149, -1.3303,  2.1522],
        [-0.4478, -1.3516,  2.3386],
        [ 0.8288,  0.1433, -0.3549],
        [ 0.8302, -1.5989,  1.1037],
        [ 0.5128, -0.9228,  1.0576],
        [-0.5194, -1.4455,  2.7805],
        [-0.5294, -1.4785,  2.6936],
        [ 0.4886,  0.4172, -0.4680],
        [-0.2427, -1.4544,  2.5750],
        [-0.7793, -1.1319,  2.1090],
        [ 1.1736, -0.6908, -0.5038],
        [ 0.97

(Epoch 9) TRAIN LOSS:0.5563 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  3.45it/s]

(Epoch 9) TRAIN LOSS:0.5563 ACC:0.77 F1:0.54 REC:0.57 PRE:0.61 LR:0.00000300



(Epoch 10) TRAIN LOSS:0.4121 LR:0.00000300:   4%|█▌                                      | 1/25 [00:00<00:06,  3.48it/s]

SequenceClassifierOutput(loss=tensor(0.4121, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.6262, -1.6002,  2.5272],
        [-0.1401, -0.7691,  1.2487],
        [ 0.0081, -1.2400,  1.7486],
        [ 0.0358, -1.3931,  1.5456],
        [-0.1714, -1.2328,  1.9792],
        [ 0.5504,  0.6643, -0.7774],
        [ 0.0863, -1.4560,  1.4483],
        [ 0.4833, -1.2797,  0.3345],
        [-0.6539, -1.4968,  2.6983],
        [ 1.0314, -1.1774,  0.1223],
        [-0.5569, -1.3546,  2.6173],
        [-0.5850, -1.5151,  2.6765],
        [-0.5005, -1.6447,  2.7221],
        [ 0.0173, -1.4036,  2.2180],
        [ 0.5785,  0.0341, -0.6480],
        [ 0.2750, -1.5107,  1.2120],
        [-0.5607, -1.3763,  2.6332],
        [ 0.8016, -0.2190, -0.1178],
        [ 0.9962, -1.3706,  0.6939],
        [ 1.2922, -0.9124,  0.0078],
        [ 0.8839, -0.4640, -0.0806],
        [-0.0070, -1.2053,  1.8016],
        [ 0.4055, -1.1798,  1.6878],
        [ 0.0957, -1.6569,  1.8551],
        [-0.50

(Epoch 10) TRAIN LOSS:0.4812 LR:0.00000300:   8%|███▏                                    | 2/25 [00:00<00:05,  4.19it/s]

SequenceClassifierOutput(loss=tensor(0.5503, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-1.8275e-01, -1.8480e+00,  2.4456e+00],
        [-6.8372e-01, -1.1548e+00,  2.3544e+00],
        [ 9.5479e-01,  1.1066e-01, -7.4559e-01],
        [ 9.4307e-01, -1.5593e+00,  1.2611e+00],
        [-3.5781e-01, -1.4967e+00,  2.6675e+00],
        [ 6.8527e-01, -1.1320e+00,  6.1493e-01],
        [-6.1675e-01, -1.5229e+00,  2.6920e+00],
        [-2.9848e-01, -1.0743e+00,  1.9564e+00],
        [-1.0944e-01, -1.7378e+00,  2.0885e+00],
        [ 3.1813e-01,  2.4021e-01, -5.2378e-01],
        [ 3.8261e-01, -4.3051e-01,  1.7665e-01],
        [ 6.4862e-01, -8.9561e-01,  1.0852e+00],
        [ 6.6810e-01, -1.1831e+00,  9.7210e-01],
        [ 1.4373e-04, -4.2617e-01, -5.4449e-02],
        [-8.4147e-01, -1.3376e+00,  2.7101e+00],
        [-4.0787e-01, -1.4766e+00,  2.2389e+00],
        [ 8.7831e-01, -1.4747e-01, -2.1014e-01],
        [ 5.0206e-01, -8.2567e-01,  1.0581e+00],
        [-2.0162e-01

(Epoch 10) TRAIN LOSS:0.4428 LR:0.00000300:  12%|████▊                                   | 3/25 [00:00<00:05,  4.28it/s]

SequenceClassifierOutput(loss=tensor(0.3658, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0576, -1.4829,  1.8551],
        [ 1.1799, -0.2475, -1.0548],
        [ 0.5570, -0.7668,  0.7690],
        [-0.0840, -1.4671,  1.9934],
        [-0.8468, -1.4723,  2.3701],
        [ 0.1648, -1.6819,  2.1417],
        [-0.1840, -1.5523,  2.0860],
        [-0.1144, -1.2476,  2.1473],
        [ 0.6418,  0.0777, -0.7079],
        [ 0.0844, -1.6121,  2.1851],
        [-0.4062, -1.4684,  2.5138],
        [ 0.0350, -1.6272,  1.7723],
        [ 0.9852, -0.2374, -0.6127],
        [-0.3522, -1.5068,  2.2733],
        [ 0.2320, -1.6316,  1.3094],
        [ 0.7381,  0.1520, -0.5998],
        [-0.3766, -1.4825,  2.5974],
        [ 1.2289, -0.5419, -0.5055],
        [ 0.8812, -0.4826, -0.6102],
        [ 0.3131, -0.2009, -0.6792],
        [ 0.4345, -0.2081, -0.2157],
        [ 0.4712,  0.6405, -0.9939],
        [ 0.6095, -0.0815,  0.0967],
        [ 0.1758, -1.4018,  1.7790],
        [-0.66

(Epoch 10) TRAIN LOSS:0.4937 LR:0.00000300:  16%|██████▍                                 | 4/25 [00:00<00:04,  4.42it/s]

SequenceClassifierOutput(loss=tensor(0.6466, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2420,  0.2216, -0.3959],
        [ 0.9528, -0.1263, -0.4709],
        [ 0.0602, -1.0407,  1.5284],
        [-0.1749, -1.3632,  2.2566],
        [ 0.7360, -1.3269,  1.2252],
        [-0.5735, -1.4212,  2.4368],
        [ 0.9675,  0.2909, -0.9820],
        [ 0.1408, -1.3751,  1.5517],
        [ 0.7296, -0.2057, -0.1277],
        [ 0.5362, -1.2934,  1.4680],
        [ 0.7864, -0.2888,  0.0812],
        [-0.4405, -1.7278,  2.5183],
        [ 1.0431,  0.5761, -0.7226],
        [ 0.1090, -1.3021,  1.8566],
        [-0.0220, -1.4636,  1.2689],
        [ 0.1231, -1.5860,  1.7027],
        [ 0.0898, -1.8441,  2.0811],
        [ 0.5221, -0.5089,  0.1852],
        [ 0.5171,  0.1327, -0.4118],
        [ 0.8199,  0.0806, -0.8421],
        [-0.4545, -1.5937,  2.3099],
        [ 1.2489,  0.0196, -1.0804],
        [ 1.2135, -0.5341,  0.3151],
        [ 0.6055, -0.8753,  0.5750],
        [ 0.59

(Epoch 10) TRAIN LOSS:0.4626 LR:0.00000300:  20%|████████                                | 5/25 [00:01<00:04,  4.54it/s]

SequenceClassifierOutput(loss=tensor(0.3383, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.5923, -1.8343,  2.5235],
        [-0.0314, -1.3126,  1.9581],
        [ 0.0753, -1.4811,  2.0056],
        [ 0.3027, -1.5411,  1.5928],
        [-0.3804, -1.5694,  2.1614],
        [ 0.5725, -1.3692,  1.4605],
        [ 0.5171, -0.1200, -0.9089],
        [ 0.2842, -1.5237,  1.8247],
        [-0.3896, -1.4722,  2.4720],
        [ 0.9505, -0.2210, -0.6819],
        [-0.1699, -1.3166,  1.7053],
        [-0.4534, -0.5308,  0.8446],
        [-0.2854, -1.5582,  2.2769],
        [-0.4510, -1.4653,  2.6756],
        [-0.2209, -1.4461,  2.0767],
        [-0.3855, -1.7907,  2.3547],
        [-0.8212, -1.2729,  2.4126],
        [ 0.0305, -1.5852,  2.3118],
        [-0.7690, -1.3486,  2.6832],
        [-0.2512, -1.7815,  2.2080],
        [ 0.3943,  0.3256, -0.4573],
        [-0.1507, -1.5598,  2.1807],
        [-0.6734, -1.3242,  2.2048],
        [ 1.1893, -0.6942, -0.3545],
        [-0.49

(Epoch 10) TRAIN LOSS:0.4805 LR:0.00000300:  24%|█████████▌                              | 6/25 [00:01<00:04,  4.53it/s]

SequenceClassifierOutput(loss=tensor(0.5699, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.5180, -1.1750,  2.6297],
        [-0.1395, -1.2951,  2.1915],
        [-0.0202, -1.6409,  2.4035],
        [ 0.5001, -0.0676, -0.0987],
        [ 1.0253,  0.2236, -1.2750],
        [-0.2137, -1.5206,  2.5767],
        [-0.7419, -1.1476,  2.3735],
        [-0.4489, -1.7114,  2.5838],
        [ 0.5252,  0.0123, -0.4347],
        [ 0.9117, -0.0401, -0.0697],
        [-0.1606, -1.4117,  1.9094],
        [ 0.9505, -0.3768, -0.3624],
        [ 1.0346, -0.0342, -0.9164],
        [ 0.8031,  0.0909, -0.6486],
        [ 0.2533, -1.5081,  2.0842],
        [ 0.6612, -0.1948, -0.6215],
        [ 0.7234, -0.5281, -0.2709],
        [ 0.6266, -0.3137,  0.1971],
        [ 1.2081, -0.1015, -0.9415],
        [ 0.0763, -1.3662,  1.5500],
        [ 1.0408, -1.0851,  0.9619],
        [ 0.1342, -1.1115,  1.5359],
        [ 1.6397, -0.6061, -0.6044],
        [ 1.1799, -0.9356,  0.0831],
        [ 0.84

(Epoch 10) TRAIN LOSS:0.4990 LR:0.00000300:  28%|███████████▏                            | 7/25 [00:01<00:03,  4.56it/s]

SequenceClassifierOutput(loss=tensor(0.6095, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6398,  0.4240, -0.9122],
        [ 0.6685, -1.4452,  1.6564],
        [ 1.0502,  0.0120, -0.7710],
        [-0.1667, -1.2054,  1.9523],
        [-0.2256, -1.7905,  2.3587],
        [ 1.2120, -0.5121, -0.5149],
        [-0.2980, -1.8518,  2.2922],
        [-0.1829, -1.7564,  2.4886],
        [ 0.0720, -1.4146,  1.7409],
        [-0.0500, -1.4604,  1.7216],
        [ 0.0281, -1.8183,  2.1367],
        [ 1.2946, -0.6346, -0.7446],
        [-0.2048, -1.5685,  1.8556],
        [-0.1923, -1.6398,  1.8884],
        [-0.4142, -1.2225,  2.3130],
        [ 1.3081, -0.3895, -0.1447],
        [ 0.6886, -0.8595,  1.1248],
        [-0.1535, -1.2696,  2.3652],
        [ 0.4610, -0.0484, -0.7827],
        [ 0.6486, -1.0887,  0.9567],
        [ 1.1718, -0.8013, -0.0099],
        [ 0.3545, -1.5880,  1.8932],
        [ 0.4452, -0.0175, -0.6117],
        [ 1.0070, -0.6282,  0.1476],
        [ 0.99

(Epoch 10) TRAIN LOSS:0.5222 LR:0.00000300:  32%|████████████▊                           | 8/25 [00:01<00:03,  4.55it/s]

SequenceClassifierOutput(loss=tensor(0.6851, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.6759, -1.8262,  2.7391],
        [-0.7031, -1.4470,  2.5900],
        [ 0.4950, -0.9553,  0.9767],
        [ 0.9844, -0.4803, -0.2237],
        [ 1.2349, -0.3354, -0.5381],
        [ 0.7485, -0.5223,  0.3391],
        [-0.4183, -1.3898,  2.3417],
        [ 1.2473, -0.5860, -0.1061],
        [-0.6437, -1.3450,  2.2503],
        [ 1.1984, -0.4120, -0.3936],
        [ 1.1508, -0.8167,  0.1476],
        [ 0.1368,  0.2995, -0.1802],
        [ 0.0347,  0.5387, -0.1234],
        [ 0.7498, -1.3993,  1.2840],
        [ 1.1380, -0.3579, -0.7201],
        [-0.0323, -1.8368,  1.9237],
        [ 0.6928, -1.3369,  1.3575],
        [ 0.3948, -0.2023, -0.3139],
        [ 1.0088,  0.1830, -0.7296],
        [ 0.9361, -0.9878,  0.1629],
        [ 0.3742, -1.5236,  1.4704],
        [ 0.3135,  0.4461, -0.5728],
        [ 0.0156, -1.5308,  1.8352],
        [ 0.7164, -0.2009, -0.4834],
        [ 0.41

(Epoch 10) TRAIN LOSS:0.5413 LR:0.00000300:  36%|██████████████▍                         | 9/25 [00:02<00:03,  4.61it/s]

SequenceClassifierOutput(loss=tensor(0.6937, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0447, -0.1383, -0.0505],
        [ 1.2639, -0.2295, -0.9772],
        [-0.5085, -1.2809,  2.3663],
        [-0.1829, -1.4314,  2.4963],
        [ 1.1583, -0.3702, -0.2403],
        [ 0.2808,  0.1257, -0.1154],
        [-0.1283, -1.5367,  1.9885],
        [ 0.5981,  0.4171, -0.5867],
        [ 0.6799,  0.2416, -0.4663],
        [ 0.9000, -0.0196, -0.1830],
        [ 0.6750, -0.1049,  0.0687],
        [ 0.0772, -1.2331,  1.6832],
        [-0.5052, -1.4763,  2.5504],
        [ 0.8635,  0.1742, -0.8315],
        [ 1.1096, -0.7023, -0.4660],
        [ 0.0752, -1.4353,  1.8982],
        [-0.4205, -1.3845,  2.5254],
        [ 0.5119,  0.3346, -1.0737],
        [ 0.7524,  0.3936, -0.9975],
        [-0.3263, -1.5346,  2.2250],
        [ 0.3076,  0.2623, -0.7123],
        [ 0.3461, -1.5896,  1.8780],
        [-0.3978, -1.4224,  2.7242],
        [-0.4799, -1.2681,  2.3005],
        [ 0.16

(Epoch 10) TRAIN LOSS:0.5272 LR:0.00000300:  40%|███████████████▌                       | 10/25 [00:02<00:03,  4.53it/s]

SequenceClassifierOutput(loss=tensor(0.4000, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.5219, -1.9141,  2.8285],
        [ 0.0496, -1.4022,  1.6827],
        [-0.7545, -1.3432,  2.8659],
        [ 0.6645, -0.0579, -0.5323],
        [ 0.0427, -0.9118,  1.6244],
        [-0.3732, -1.4495,  2.2732],
        [ 0.5164, -0.7630,  0.1852],
        [-0.6437, -1.3953,  2.8644],
        [-0.5283, -1.2308,  2.4665],
        [-0.2354, -1.7444,  2.1730],
        [-0.1127, -1.1130,  1.4519],
        [ 1.5684, -0.3001, -0.7150],
        [-0.2940, -1.3250,  2.2304],
        [ 0.0637, -1.3526,  1.6539],
        [ 0.6514, -1.5730,  1.7189],
        [ 0.9009, -0.1782, -0.8900],
        [ 0.6054, -0.3843,  0.2302],
        [-0.1346, -1.9148,  2.4332],
        [ 0.7056,  0.0619, -0.9335],
        [-0.4479, -1.7142,  2.6375],
        [ 0.7021,  0.1878, -0.3635],
        [ 0.7311, -0.7243,  0.1815],
        [ 0.0479, -1.8863,  1.9301],
        [-0.4286, -1.4057,  2.0190],
        [ 1.14

(Epoch 10) TRAIN LOSS:0.5237 LR:0.00000300:  44%|█████████████████▏                     | 11/25 [00:02<00:03,  4.60it/s]

SequenceClassifierOutput(loss=tensor(0.4894, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8582, -0.5853, -0.1473],
        [ 0.6545, -0.6680,  0.3259],
        [ 0.8423, -0.4023, -0.6199],
        [ 0.0216, -1.4874,  2.0493],
        [-0.1077, -1.1479,  1.9110],
        [ 0.5866,  0.3479, -0.4561],
        [ 0.9869,  0.0085, -0.2521],
        [ 1.0936, -1.1016,  0.3721],
        [ 1.4172, -0.5465, -0.0708],
        [ 0.4086, -0.0172, -0.3198],
        [ 0.1511, -1.5765,  1.9102],
        [-0.3708, -1.3945,  1.9191],
        [ 0.0992, -1.4173,  1.8399],
        [ 0.6344, -1.2670,  0.9866],
        [-0.5317, -1.7847,  2.7722],
        [ 1.3209, -0.3463, -0.5134],
        [-0.0520, -0.8484,  1.2596],
        [ 0.2374, -1.3927,  1.2394],
        [-0.5765, -1.4614,  2.5075],
        [-0.7121, -1.6181,  2.7467],
        [ 0.9817,  0.1766, -0.5938],
        [ 0.9786, -0.3546, -0.5563],
        [-0.2471, -1.9512,  2.2730],
        [ 0.8297, -0.9067,  0.5560],
        [-0.36

(Epoch 10) TRAIN LOSS:0.5235 LR:0.00000300:  48%|██████████████████▋                    | 12/25 [00:02<00:02,  4.59it/s]

SequenceClassifierOutput(loss=tensor(0.5217, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6891, -0.7577, -0.0880],
        [ 1.2012, -0.3144, -0.4175],
        [-0.4019, -1.2663,  2.3928],
        [ 0.0334, -1.5550,  1.9205],
        [ 0.3062,  0.2060, -0.0869],
        [ 0.6594, -0.9719,  0.5690],
        [-0.6678, -1.5442,  2.5595],
        [ 0.1983, -0.2564,  0.0027],
        [-0.6807, -1.5616,  2.6574],
        [-0.4556, -1.5556,  2.1702],
        [-0.0456, -1.8367,  2.1361],
        [ 0.4545, -0.4141, -0.3129],
        [ 1.0101, -0.5068, -0.6140],
        [-0.0845, -1.1675,  2.1894],
        [ 0.5842,  0.4084, -0.5140],
        [ 1.1422, -0.4379,  0.1417],
        [-0.4125, -1.3625,  2.2214],
        [-0.0044, -1.0869,  1.8081],
        [ 0.3509,  0.0178, -0.0817],
        [-0.2553, -1.1594,  2.2068],
        [ 0.6511,  0.2000, -0.6194],
        [ 0.1040, -1.3326,  2.0689],
        [ 0.3090, -1.2788,  2.0144],
        [ 0.9210, -0.6147, -0.0169],
        [-0.30

(Epoch 10) TRAIN LOSS:0.5209 LR:0.00000300:  52%|████████████████████▎                  | 13/25 [00:02<00:02,  4.64it/s]

SequenceClassifierOutput(loss=tensor(0.4893, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 6.1274e-01, -1.1037e+00,  8.2198e-01],
        [ 7.5231e-01,  6.0046e-01, -1.0793e+00],
        [-1.4787e-01, -1.5671e+00,  2.0872e+00],
        [ 5.3730e-01, -1.4548e+00,  1.0775e+00],
        [-3.2933e-01, -1.2040e+00,  2.1153e+00],
        [ 5.7591e-01,  1.3223e-01, -6.0440e-01],
        [ 1.3026e-01, -1.3558e+00,  2.1321e+00],
        [-5.9741e-01, -1.5703e+00,  2.5264e+00],
        [ 4.2294e-01, -5.1017e-02,  3.4076e-01],
        [ 1.0878e+00, -2.2518e-01, -4.2373e-01],
        [ 7.4883e-01, -3.6914e-01, -1.2167e-01],
        [ 6.2978e-01, -9.1597e-01,  7.9336e-01],
        [ 8.7572e-02, -1.3989e+00,  1.8483e+00],
        [-2.8750e-01, -1.6711e+00,  2.7332e+00],
        [-5.0996e-01, -1.6911e+00,  2.2150e+00],
        [-1.0726e-01, -1.6151e+00,  2.1784e+00],
        [ 1.2400e+00, -5.6459e-01, -3.2775e-01],
        [-6.3407e-01, -8.9351e-01,  2.4425e+00],
        [-3.6611e-01

(Epoch 10) TRAIN LOSS:0.5250 LR:0.00000300:  56%|█████████████████████▊                 | 14/25 [00:03<00:02,  4.72it/s]

SequenceClassifierOutput(loss=tensor(0.5775, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.1398e+00, -1.7399e-01, -7.9141e-01],
        [ 8.6201e-01, -1.0205e-01, -3.4946e-01],
        [ 1.0321e+00, -1.1522e+00,  6.3486e-01],
        [-1.7548e-02, -1.6393e+00,  1.7153e+00],
        [-4.9576e-01, -1.4960e+00,  2.0946e+00],
        [ 4.9008e-01, -5.2126e-04, -6.8388e-01],
        [-6.0585e-01, -1.6799e+00,  2.3258e+00],
        [ 8.0041e-01, -1.4332e+00,  1.2350e+00],
        [ 2.8765e-01, -1.2746e+00,  1.2272e+00],
        [-2.8071e-01, -1.4383e+00,  2.4535e+00],
        [ 6.4876e-01, -1.4752e+00,  1.5838e+00],
        [ 1.2948e+00, -4.2376e-01, -6.8618e-01],
        [-1.8411e-01, -1.7652e+00,  2.4733e+00],
        [ 8.6693e-01, -7.2192e-01, -3.2614e-01],
        [-1.4326e-01, -1.5387e+00,  1.8216e+00],
        [ 1.0035e+00, -1.7470e-01, -7.2185e-01],
        [-4.5928e-01, -1.0825e+00,  2.2560e+00],
        [-5.3658e-02, -8.8951e-01,  1.6602e+00],
        [-4.1852e-01

(Epoch 10) TRAIN LOSS:0.5221 LR:0.00000300:  60%|███████████████████████▍               | 15/25 [00:03<00:02,  4.60it/s]

SequenceClassifierOutput(loss=tensor(0.4824, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-6.1114e-01, -1.6127e+00,  2.6332e+00],
        [ 1.1272e-01, -1.4644e+00,  1.2561e+00],
        [ 7.3279e-01, -2.7571e-01, -9.7062e-01],
        [-3.7554e-01, -1.0829e+00,  2.3199e+00],
        [-1.9478e-01, -1.8195e+00,  2.2652e+00],
        [-4.5047e-02, -1.0638e+00,  2.0292e+00],
        [ 7.4159e-02,  2.1079e-01, -8.2685e-01],
        [-6.8610e-01, -1.1349e+00,  2.3538e+00],
        [ 1.3541e+00, -5.9037e-01,  1.6281e-01],
        [ 9.1488e-03, -1.7401e+00,  1.9923e+00],
        [-3.9443e-01, -1.5676e+00,  1.7919e+00],
        [-2.8847e-01, -1.3298e+00,  1.8183e+00],
        [-4.6903e-01, -1.7033e+00,  2.4438e+00],
        [ 6.7539e-01,  2.8756e-01, -8.8865e-01],
        [-8.3708e-01, -1.4109e+00,  2.6590e+00],
        [ 1.0254e+00, -5.6493e-01,  1.1782e-03],
        [ 6.4307e-01, -6.4157e-01,  4.2783e-01],
        [-5.8537e-01, -1.6007e+00,  2.6345e+00],
        [-7.4628e-01

(Epoch 10) TRAIN LOSS:0.5309 LR:0.00000300:  64%|████████████████████████▉              | 16/25 [00:03<00:01,  4.70it/s]

SequenceClassifierOutput(loss=tensor(0.6625, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8086,  0.0305, -0.6513],
        [-0.4990, -1.5874,  2.9068],
        [-0.1312, -1.3309,  1.9255],
        [-0.4206, -1.6648,  2.6617],
        [ 0.4231, -0.3619, -0.0841],
        [-0.1325, -1.7507,  2.3869],
        [-0.7662, -1.5261,  2.7649],
        [-0.1122, -1.7659,  2.0632],
        [ 0.7106,  0.5135, -0.1857],
        [-0.3804, -1.3481,  2.0239],
        [ 0.5800, -0.1950, -0.1004],
        [ 0.9953, -0.1415, -1.0555],
        [ 0.6243,  0.1556, -0.7893],
        [-0.4408, -1.3097,  2.6202],
        [ 0.5982, -0.1159, -0.1994],
        [ 0.7608,  0.7394, -0.5175],
        [ 0.1802, -1.2719,  2.2626],
        [-0.1625, -1.6404,  1.6787],
        [ 1.5711, -0.6700, -0.6071],
        [ 0.6070, -0.1317, -0.1267],
        [ 1.3586, -0.3000, -1.2734],
        [-0.4252, -1.5603,  2.6499],
        [ 0.7141, -0.4742,  0.3762],
        [ 0.3447, -1.7146,  2.0515],
        [-0.37

(Epoch 10) TRAIN LOSS:0.5362 LR:0.00000300:  68%|██████████████████████████▌            | 17/25 [00:03<00:01,  4.63it/s]

SequenceClassifierOutput(loss=tensor(0.6217, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.6809, -1.5182,  2.3369],
        [ 0.8243,  0.1039, -0.5702],
        [ 0.4625,  0.2766, -0.9553],
        [ 0.4436, -1.4817,  1.3040],
        [ 0.1996, -1.2609,  1.8327],
        [-0.2422, -1.3504,  2.0220],
        [ 0.9623, -1.3645,  1.0102],
        [ 0.0986, -1.6720,  2.0077],
        [ 0.8625, -0.6790,  0.0810],
        [-0.2622, -1.4960,  2.4557],
        [ 0.2251, -0.4975,  0.6250],
        [-0.2394, -1.4288,  2.0699],
        [-0.5619, -1.6014,  2.5477],
        [ 0.7871, -0.7269,  0.2291],
        [-0.2699, -1.5559,  2.1357],
        [-0.4918, -1.5397,  2.4235],
        [ 0.1494, -1.3303,  2.1571],
        [-0.2400, -1.8328,  2.5085],
        [-0.0225, -1.6505,  1.8360],
        [-0.8991, -1.9246,  2.6206],
        [-0.4745, -1.5015,  2.7075],
        [ 0.9804, -0.9526,  0.4797],
        [ 0.0782, -1.2707,  1.9308],
        [-0.3390, -1.6679,  2.6807],
        [-0.18

(Epoch 10) TRAIN LOSS:0.5465 LR:0.00000300:  72%|████████████████████████████           | 18/25 [00:03<00:01,  4.97it/s]

SequenceClassifierOutput(loss=tensor(0.7210, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5831,  0.2471, -0.8604],
        [ 0.2394,  0.0215, -0.2708],
        [ 1.4770, -0.9226, -0.0287],
        [ 0.5643, -0.8158,  0.4925],
        [ 0.5790, -0.3430, -0.4833],
        [ 0.3979,  0.2424, -0.2706],
        [ 0.7573, -0.6329, -0.3406],
        [ 1.2292, -1.0581,  0.1499],
        [-0.2108, -1.5325,  2.2412],
        [ 0.6975, -0.9272,  1.0283],
        [ 0.2720, -0.7977,  0.3236],
        [ 0.5354, -1.6920,  1.4540],
        [ 1.0545, -0.3284, -0.3625],
        [ 0.5701, -1.4473,  1.4846],
        [ 0.3615,  0.1879, -0.4718],
        [ 0.7246,  0.6532, -0.9185],
        [-0.3920, -1.2395,  2.4879],
        [-0.3255, -1.1730,  1.9682],
        [ 0.2509,  0.3066, -0.5419],
        [-0.3613, -0.9582,  1.7949],
        [-0.0398, -1.5422,  1.9079],
        [ 0.0874, -1.6896,  1.9022],
        [ 0.1410, -1.4347,  1.6059],
        [ 0.5302,  0.5387, -0.7377],
        [ 0.02

(Epoch 10) TRAIN LOSS:0.5440 LR:0.00000300:  80%|███████████████████████████████▏       | 20/25 [00:04<00:00,  5.24it/s]

SequenceClassifierOutput(loss=tensor(0.4930, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1366, -0.1345, -0.5028],
        [-0.2374, -1.5802,  2.2471],
        [ 0.8530, -0.1996, -0.2924],
        [-0.3879, -1.6501,  1.9439],
        [ 0.2078,  0.4071, -0.5113],
        [ 1.5967, -0.5873, -0.2453],
        [ 0.3477,  0.4204, -0.2846],
        [ 0.7633,  0.8034, -1.0868],
        [-0.0670, -1.3938,  1.8667],
        [-0.8109, -1.3354,  2.6618],
        [-0.5424, -0.7132,  1.1624],
        [ 0.1649, -1.8714,  2.2704],
        [ 0.5342, -1.7536,  1.7249],
        [ 0.4012, -1.1290,  1.5709],
        [ 0.2321, -0.3792,  0.7769],
        [ 0.8146,  0.4807, -0.8665],
        [-0.3566, -1.6964,  2.2612],
        [-0.7358, -1.6567,  2.4106],
        [ 0.0437, -1.4180,  1.0292],
        [ 0.0529, -0.9069,  1.2114],
        [-0.3974, -1.1012,  2.1327],
        [-0.4652, -1.0936,  1.9370],
        [ 0.4307, -1.5550,  1.3531],
        [-0.5933, -1.5921,  2.2417],
        [ 0.83

(Epoch 10) TRAIN LOSS:0.5399 LR:0.00000300:  88%|██████████████████████████████████▎    | 22/25 [00:04<00:00,  5.17it/s]

SequenceClassifierOutput(loss=tensor(0.4645, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2876, -1.9064,  2.9282],
        [ 0.8507, -0.6325, -0.0593],
        [ 1.1027,  0.0234, -0.7657],
        [-0.1785, -1.4291,  1.5404],
        [-0.1429, -1.6188,  2.5474],
        [ 0.6420,  0.2317, -0.6293],
        [-0.0439, -1.6022,  1.5497],
        [ 0.9459, -0.2528, -0.7888],
        [-0.4257, -1.4999,  2.4633],
        [ 1.0668, -0.3987, -0.6975],
        [ 0.1267, -1.5511,  2.0538],
        [-0.6141, -1.2123,  2.0727],
        [-0.4175, -1.6112,  2.5818],
        [-0.0735, -1.4620,  2.1315],
        [ 0.4072, -1.0764,  0.7746],
        [ 0.9970, -0.8589, -0.2913],
        [ 1.2742,  0.2453, -1.1233],
        [-0.2504, -1.8085,  2.5381],
        [ 0.0690,  0.6121, -0.4256],
        [ 0.8892, -0.7481,  0.0342],
        [-0.7248, -1.4474,  3.0753],
        [ 1.0374,  0.0255, -1.0321],
        [-0.5533, -1.4206,  2.2893],
        [ 0.2447, -1.0129,  1.5775],
        [ 0.41

(Epoch 10) TRAIN LOSS:0.5379 LR:0.00000300:  92%|███████████████████████████████████▉   | 23/25 [00:04<00:00,  4.81it/s]

SequenceClassifierOutput(loss=tensor(0.4035, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.5815, -1.7565,  2.8998],
        [-0.1026, -1.3515,  2.1387],
        [-0.2231, -1.6775,  2.3368],
        [ 1.3565, -0.1997, -0.2362],
        [ 0.5401, -0.3834,  0.3302],
        [ 0.0814, -1.2113,  1.8388],
        [-0.2713, -1.4638,  2.2241],
        [-0.3238, -1.3101,  2.4380],
        [-0.2755, -1.4247,  2.0695],
        [-0.0429, -1.5892,  2.4731],
        [ 0.5096, -1.4636,  1.6393],
        [ 0.7692, -0.5971, -0.1007],
        [ 1.0171, -0.3747, -0.0632],
        [-0.2785, -1.5649,  2.4251],
        [ 1.3134, -0.6599, -0.5485],
        [ 0.7682,  0.0579, -0.8875],
        [-0.2617, -1.8513,  2.5391],
        [-0.2929, -1.8388,  2.2553],
        [ 0.6644, -0.2513, -0.2193],
        [-0.7791, -1.5146,  2.6226],
        [ 1.0019,  0.1375, -0.5276],
        [ 1.0932, -0.2564, -0.4420],
        [-0.4276, -1.6528,  2.8680],
        [ 0.3293,  0.4427, -0.6958],
        [-0.26

(Epoch 10) TRAIN LOSS:0.5323 LR:0.00000300:  96%|█████████████████████████████████████▍ | 24/25 [00:05<00:00,  4.38it/s]

SequenceClassifierOutput(loss=tensor(0.3319, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7370,  0.0128, -0.6244],
        [-0.1166, -1.3575,  1.6745],
        [-0.8515, -1.4502,  2.7131],
        [-0.5716, -1.5921,  2.4546],
        [ 0.5327,  0.2404,  0.3993],
        [-0.3798, -1.3231,  2.6679],
        [ 0.1771, -0.1609, -0.3348],
        [ 0.1630, -1.2608,  1.7416],
        [ 0.9299, -1.1613,  0.4822],
        [-0.8081, -1.3478,  2.3312],
        [-0.1985, -1.5164,  1.9518],
        [-0.3088, -1.4026,  1.9062],
        [ 1.2152, -0.5553, -0.6626],
        [ 0.5356, -0.6048,  0.5450],
        [-0.6771, -1.8147,  2.9505],
        [-0.5173, -1.2673,  2.5818],
        [ 1.1437, -0.7188, -0.2206],
        [ 0.0199, -1.4023,  1.6556],
        [ 0.8118, -0.5193, -0.3582],
        [-0.3664, -1.5383,  2.4392],
        [-0.2068, -1.9064,  2.4537],
        [-0.3996, -1.5846,  2.3852],
        [ 1.0358, -0.0620, -0.7977],
        [-0.4287, -1.6363,  2.5055],
        [-0.07

(Epoch 10) TRAIN LOSS:0.5243 LR:0.00000300: 100%|███████████████████████████████████████| 25/25 [00:05<00:00,  4.38it/s]

(Epoch 10) TRAIN LOSS:0.5243 ACC:0.78 F1:0.58 REC:0.59 PRE:0.63 LR:0.00000300





In [17]:
# Evaluate on validation
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm.tqdm(iter(valid_loader), leave=True, total=len(valid_loader))
for i, batch_data in enumerate(pbar):
    batch_seq = batch_data[-1]
    loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
    
    # Calculate total loss
    valid_loss = loss.item()
    total_loss = total_loss + valid_loss

    # Calculate evaluation metrics
    list_hyp += batch_hyp
    list_label += batch_label
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)

    pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
    
metrics = document_sentiment_metrics_fn(list_hyp, list_label)
print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
    total_loss/(i+1), metrics_to_string(metrics)))

VALID LOSS:0.6434 ACC:0.77 F1:0.70 REC:0.69 PRE:0.72:  29%|████████▊                      | 2/7 [00:00<00:00,  6.71it/s]

SequenceClassifierOutput(loss=tensor(0.7156, device='cuda:0'), logits=tensor([[ 0.9848, -1.1901,  0.6303],
        [ 0.9267,  0.1584, -0.4856],
        [ 0.0654, -1.5826,  2.0053],
        [-0.1290, -1.5016,  2.1812],
        [-0.2285, -1.6376,  2.4970],
        [ 0.7982,  0.3852, -1.0152],
        [ 0.3923,  0.3987, -0.4227],
        [ 0.3894,  0.2551, -0.6009],
        [ 0.0919, -1.2441,  1.1212],
        [ 0.5897,  0.5582, -0.8677],
        [ 1.2826, -0.1770, -1.1496],
        [-0.1299, -0.3111,  0.5171],
        [ 0.2921,  0.3444, -0.7956],
        [ 0.6041,  0.5338, -1.0841],
        [ 0.3990, -0.4024, -0.0963],
        [ 0.7475, -0.6365,  0.4139],
        [ 0.4781, -0.8977,  1.1694],
        [-0.3028, -1.4150,  2.6588],
        [ 0.1691,  0.3640, -0.7206],
        [ 0.8735, -0.3155, -0.5683],
        [ 0.5605,  0.8164, -1.5068],
        [ 0.8352, -1.0822,  0.3040],
        [-0.1292, -1.6275,  2.5480],
        [ 0.2016,  0.1394, -0.6559],
        [ 0.2465, -0.0344, -0.3429],
     

VALID LOSS:0.5922 ACC:0.77 F1:0.67 REC:0.66 PRE:0.70:  57%|█████████████████▋             | 4/7 [00:00<00:00,  6.56it/s]

SequenceClassifierOutput(loss=tensor(0.4090, device='cuda:0'), logits=tensor([[ 0.0661,  0.2467, -0.3563],
        [-0.6512, -1.7945,  2.7131],
        [ 1.3169, -0.9718, -0.3663],
        [ 0.5506, -0.9534,  0.8528],
        [-0.4600, -1.4777,  2.3229],
        [ 1.1033, -0.9344,  0.4311],
        [-0.9087, -1.4477,  2.5952],
        [-0.7602, -1.5999,  2.7431],
        [ 0.0925,  0.5303, -0.6430],
        [ 1.0660, -0.5138, -0.0588],
        [-0.5783, -1.7088,  2.6895],
        [-0.6451, -1.4130,  2.5237],
        [ 0.1043, -1.2336,  1.4781],
        [ 1.3178, -0.9273, -0.0950],
        [-0.7764, -1.3475,  2.3681],
        [-0.2857, -1.7744,  2.4125],
        [-0.5193, -1.7904,  2.7724],
        [-0.5416, -1.5884,  2.3327],
        [-0.6641, -1.5208,  2.5290],
        [ 0.8212, -1.1392,  1.0042],
        [-0.9900, -1.7187,  2.7551],
        [-0.4491, -1.5059,  2.3621],
        [ 0.3366,  0.2360, -0.6432],
        [ 1.2116, -1.0346,  0.0141],
        [ 1.2889, -0.1418, -0.9872],
     

VALID LOSS:0.5492 ACC:0.79 F1:0.70 REC:0.68 PRE:0.72:  86%|██████████████████████████▌    | 6/7 [00:00<00:00,  6.97it/s]

SequenceClassifierOutput(loss=tensor(0.3296, device='cuda:0'), logits=tensor([[ 1.3106, -0.4925, -0.8903],
        [ 1.4060, -0.2686, -0.2209],
        [-0.7131, -1.5191,  2.6861],
        [-0.0631, -1.5453,  1.9726],
        [-0.6472, -1.3325,  2.3653],
        [-0.1313, -1.4732,  2.1454],
        [ 1.2802, -0.1038, -0.8897],
        [-0.3357, -1.1656,  1.9239],
        [-0.3917, -1.0866,  1.8739],
        [ 1.3551, -0.2239, -0.9130],
        [ 0.8051, -0.8648, -0.4330],
        [-0.4231, -1.4522,  2.1662],
        [-0.3290, -1.7561,  2.4750],
        [ 0.1985, -1.0920,  0.9990],
        [ 0.9586, -0.1668, -0.8865],
        [ 0.9274, -1.2588,  0.5371],
        [ 0.7759, -0.7079,  0.4429],
        [-0.3891, -1.4798,  2.7241],
        [-0.0632, -1.1153,  2.0037],
        [ 0.2904,  0.4111, -0.0504],
        [-0.3072, -1.7968,  2.5616],
        [ 1.0335, -1.2880,  0.5263],
        [-0.0695, -1.7118,  2.0791],
        [ 0.3459,  0.1229, -0.9239],
        [ 1.1111, -1.2679,  0.4993],
     

VALID LOSS:0.5617 ACC:0.79 F1:0.70 REC:0.69 PRE:0.72: 100%|███████████████████████████████| 7/7 [00:01<00:00,  6.37it/s]

(Epoch 10) VALID LOSS:0.5617 ACC:0.79 F1:0.70 REC:0.69 PRE:0.72



