In [1]:
import pandas as pd
import numpy as np
import torch
import tqdm
import os
import random
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

2021-10-24 10:02:04.669615: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="4"
tf_device='/gpu:0'

In [3]:
tokenizer = BertTokenizer.from_pretrained("indolem/indobertweet-base-uncased")
config = BertConfig.from_pretrained("indolem/indobertweet-base-uncased")
config.num_labels = 3
model = BertForSequenceClassification.from_pretrained("indolem/indobertweet-base-uncased", config=config)

Some weights of the model checkpoint at indolem/indobertweet-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/ind

In [4]:
class DocumentSentimentDataset(Dataset):
    # Static constant variable (We need to have this part to comply with IndoNLU standard)
    LABEL2INDEX = {'negative': 0, 'neutral': 1, 'positive': 2} # Label string to index
    INDEX2LABEL = {0:'negative', 1:'neutral', 2: 'positive'} # Index to label string
    NUM_LABELS = 3 # Number of label
   
    def load_dataset(self, path):
        df = pd.read_csv(path, sep='\t', header=None) # Read tsv file with pandas
        df.columns = ['text','sentiment'] # Rename the columns
        df['sentiment'] = df['sentiment'].apply(lambda lab: self.LABEL2INDEX[lab]) # Convert string label into index
        return df
   
    def __init__(self, dataset_path, tokenizer, *args, **kwargs):
        self.data = self.load_dataset(dataset_path) # Load the tsv file

        # Assign the tokenizer for tokenization
        # here we use subword tokenizer from HuggingFace
        self.tokenizer = tokenizer
    
    def __getitem__(self, index):
        data = self.data.loc[index,:] # Taking data from a specific row from Pandas
        text, sentiment = data['text'], data['sentiment'] # Take text and sentiment from the row
        subwords = self.tokenizer.encode(text) # Tokenize the text with tokenizer
        return np.array(subwords), np.array(sentiment), data['text']
   
    def __len__(self):
        return len(self.data)  # Return the length of the dataset

In [5]:
class DocumentSentimentDataLoader(DataLoader):
    def __init__(self, max_seq_len=512, *args, **kwargs):
        super(DocumentSentimentDataLoader, self).__init__(*args, **kwargs)
        self.max_seq_len = max_seq_len # Assign max limit of the sequence length
        self.collate_fn = self._collate_fn # Assign the collate_fn function with our function
       
    def _collate_fn(self, batch):
        batch_size = len(batch) # Take the batch size
        max_seq_len = max(map(lambda x: len(x[0]), batch)) # Find maximum sequence length from the batch 
        max_seq_len = min(self.max_seq_len, max_seq_len) # Compare with our defined limit
       
    # Create buffer for subword, mask, and sentiment labels, initialize all with 0
        subword_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
        mask_batch = np.zeros((batch_size, max_seq_len), dtype=np.float32)
        sentiment_batch = np.zeros((batch_size, 1), dtype=np.int64)
       
    # Fill all of the buffer
        seq_list = []
        for i, (subwords, sentiment, raw_seq) in enumerate(batch):
            subwords = subwords[:max_seq_len]
            subword_batch[i,:len(subwords)] = subwords
            mask_batch[i,:len(subwords)] = 1
            sentiment_batch[i,0] = sentiment
            
            seq_list.append(raw_seq)
           
    # Return the subword, mask, and sentiment data
        return subword_batch, mask_batch, sentiment_batch, seq_list

In [6]:
def forward_sequence_classification(model, batch_data, i2w, is_test=False, device = 'cuda', **kwargs):
    # Unpack batch data
    if len(batch_data) == 3:
        (subword_batch, mask_batch, label_batch) = batch_data
        token_type_batch = None
    elif len(batch_data) == 4:
        (subword_batch, mask_batch, token_type_batch, label_batch) = batch_data
    
    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch)
    mask_batch = torch.FloatTensor(mask_batch)
    token_type_batch = torch.LongTensor(token_type_batch) if token_type_batch is not None else None
    label_batch = torch.LongTensor(label_batch)
            
    if device == "cuda":
        subword_batch = subword_batch.cuda()
        mask_batch = mask_batch.cuda()
        token_type_batch = token_type_batch.cuda() if token_type_batch is not None else None
        label_batch = label_batch.cuda()
    
    # Forward model
    outputs = model(subword_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    print(outputs)
    loss, logits = outputs[:2]
    
    # generate prediction & label list
    list_hyp = []
    list_label = []
    hyp = torch.topk(logits, 1)[1]
    for j in range(len(hyp)):
        list_hyp.append(i2w[hyp[j].item()])
        list_label.append(i2w[label_batch[j][0].item()])
    return loss, list_hyp, list_label

In [7]:
def document_sentiment_metrics_fn(list_hyp, list_label):
    metrics = {}
    metrics["ACC"] = accuracy_score(list_label, list_hyp)
    metrics["F1"] = f1_score(list_label, list_hyp, average='macro')
    metrics["REC"] = recall_score(list_label, list_hyp, average='macro')
    metrics["PRE"] = precision_score(list_label, list_hyp, average='macro')
    return metrics

In [8]:
import itertools
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']
    
def count_param(module, trainable=False):
    if trainable:
        return sum(p.numel() for p in module.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in module.parameters())

def metrics_to_string(metric_dict):
    string_list = []
    for key, value in metric_dict.items():
        string_list.append('{}:{:.2f}'.format(key, value))
    return ' '.join(string_list)

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
set_seed(241021)

In [9]:
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'negative': 0, 'neutral': 1, 'positive': 2}
{0: 'negative', 1: 'neutral', 2: 'positive'}


In [10]:
text =  'satu satu aku sayang ibu, dua dua juga sayang ayah, tiga-tiga sayang adik kakak, satu dua tiga sayang semuanya'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
print(subwords)
logits = model(subwords)[0]
print(logits)
print(torch.topk(logits, k=1))
label = torch.topk(logits, k=1)[1].squeeze().item()
print(f'Text: {text} | Label : {i2w[label]} ({torch.nn.functional.softmax(logits, dim=-1).squeeze()[label] * 100}%)')

tensor([[   3, 1713, 1713, 2254, 5458, 2477,   16, 1854, 1854, 1614, 5458, 3095,
           16, 2139,   17, 2139, 5458, 4962, 6962,   16, 1713, 1854, 2139, 5458,
         4398,    4]])
tensor([[0.1063, 0.3589, 0.2616]], grad_fn=<AddmmBackward0>)
torch.return_types.topk(
values=tensor([[0.3589]], grad_fn=<TopkBackward0>),
indices=tensor([[1]]))
Text: satu satu aku sayang ibu, dua dua juga sayang ayah, tiga-tiga sayang adik kakak, satu dua tiga sayang semuanya | Label : neutral (37.25643539428711%)


In [11]:
print(f'Text: {text} | Label : {i2w[label]} ({torch.nn.functional.softmax(logits, dim=-1).squeeze()[label] * 100}%)')

Text: satu satu aku sayang ibu, dua dua juga sayang ayah, tiga-tiga sayang adik kakak, satu dua tiga sayang semuanya | Label : neutral (37.25643539428711%)


In [12]:
train = DocumentSentimentDataset('dataset/train_preprocess.tsv', tokenizer)
valid = DocumentSentimentDataset('dataset/valid_preprocess.tsv', tokenizer)
train_loader = DocumentSentimentDataLoader(dataset=train, max_seq_len=100, batch_size=32, num_workers=1, shuffle=True)
valid_loader = DocumentSentimentDataLoader(dataset=valid, max_seq_len=100, batch_size=32, num_workers=1, shuffle=True)

In [13]:
torch.cuda.empty_cache()

In [14]:
def count_param(module, trainable=True):
    if trainable:
        return sum(p.numel() for p in module.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in module.parameters())
    
count_param(model)

110560515

In [15]:
optimizer = torch.optim.Adam(model.parameters(), lr=3e-6)
model = model.cuda()

In [16]:
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm.tqdm(iter(train_loader), leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

(Epoch 1) TRAIN LOSS:1.1632 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:11,  2.16it/s]

SequenceClassifierOutput(loss=tensor(1.1632, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 3.3615e-02,  3.5671e-01, -1.2313e-01],
        [ 1.5387e-01,  4.2158e-01, -1.7582e-01],
        [-6.1776e-02,  2.8236e-01, -2.6932e-01],
        [-1.3965e-01,  2.6802e-01, -9.1601e-03],
        [ 1.3911e-01,  3.6378e-01,  3.1713e-01],
        [ 2.8474e-01,  1.0744e-01,  1.5907e-01],
        [ 4.1207e-01,  2.3071e-02, -3.6158e-02],
        [ 4.2197e-01,  1.6374e-01,  1.5196e-01],
        [ 1.2188e-01,  4.4394e-01, -1.5308e-01],
        [ 8.6493e-02,  3.1165e-01,  3.0392e-01],
        [ 2.1207e-01,  2.7171e-01,  5.5797e-02],
        [-5.1121e-02,  3.3995e-01,  4.7404e-02],
        [ 1.9331e-01,  1.0391e-01,  1.6870e-02],
        [ 1.5999e-01,  3.6368e-01,  2.0139e-01],
        [-3.1938e-04,  3.3215e-01, -3.2678e-02],
        [ 1.1143e-02,  3.8876e-01,  5.1317e-02],
        [ 2.2058e-01,  1.7777e-02,  1.1571e-01],
        [-7.8299e-02,  1.1653e-02,  3.1399e-01],
        [ 7.7959e-02

(Epoch 1) TRAIN LOSS:1.1261 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:07,  3.20it/s]

SequenceClassifierOutput(loss=tensor(1.0889, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3122,  0.3217, -0.1440],
        [ 0.1463, -0.0136,  0.2308],
        [ 0.1727, -0.0009,  0.1943],
        [-0.1019,  0.5760,  0.1275],
        [ 0.3802,  0.5496,  0.2276],
        [ 0.1621,  0.1478,  0.0158],
        [-0.0870,  0.1240, -0.0224],
        [-0.1019,  0.4882,  0.2198],
        [ 0.4551,  0.2383,  0.1481],
        [ 0.3522,  0.2099,  0.2823],
        [ 0.3960,  0.5070,  0.0869],
        [ 0.0337,  0.4887,  0.2073],
        [ 0.1098,  0.2982,  0.0875],
        [ 0.0508,  0.1102,  0.6286],
        [ 0.1664,  0.3772, -0.0606],
        [ 0.1752,  0.2354,  0.1167],
        [ 0.0652,  0.4043,  0.1094],
        [ 0.1208,  0.3034,  0.0591],
        [ 0.1132,  0.4409,  0.1675],
        [-0.0445,  0.1736, -0.1837],
        [ 0.1837, -0.2312, -0.0061],
        [-0.0467,  0.1526, -0.1352],
        [-0.0084,  0.0827,  0.0168],
        [ 0.2302,  0.4138,  0.1310],
        [ 0.19

(Epoch 1) TRAIN LOSS:1.1143 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:05,  3.74it/s]

SequenceClassifierOutput(loss=tensor(1.0907, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0346, -0.1154,  0.0232],
        [ 0.0946,  0.3326, -0.0672],
        [ 0.0473,  0.1677,  0.5480],
        [ 0.2483,  0.0150,  0.3518],
        [-0.1228,  0.1418,  0.0245],
        [-0.0732,  0.1857,  0.0721],
        [-0.0128,  0.1618,  0.4092],
        [ 0.0220,  0.0244,  0.2622],
        [ 0.4095,  0.2438,  0.2791],
        [ 0.0531,  0.2163,  0.4700],
        [ 0.0047,  0.3045,  0.1316],
        [ 0.2263,  0.2079,  0.2548],
        [ 0.1671,  0.0629,  0.3603],
        [ 0.0163, -0.2083,  0.0765],
        [-0.0568,  0.0006, -0.2054],
        [ 0.1987,  0.0741,  0.1593],
        [ 0.0212,  0.1719,  0.0480],
        [ 0.1269,  0.2134,  0.2186],
        [ 0.2333,  0.3685,  0.0422],
        [ 0.3334,  0.6208,  0.1982],
        [ 0.2157,  0.5636,  0.3385],
        [ 0.2314,  0.4046,  0.1742],
        [ 0.0250,  0.3406,  0.1347],
        [ 0.2231,  0.4070,  0.1667],
        [ 0.45

(Epoch 1) TRAIN LOSS:1.1036 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:01<00:05,  3.95it/s]

SequenceClassifierOutput(loss=tensor(1.0717, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1779,  0.1836,  0.3339],
        [-0.1602,  0.0715,  0.2446],
        [-0.0354, -0.0240, -0.1259],
        [ 0.1368,  0.1706,  0.2710],
        [ 0.3767,  0.1387,  0.4991],
        [ 0.3811,  0.1332,  0.2071],
        [ 0.0289, -0.2043,  0.1218],
        [ 0.2331,  0.0055,  0.2861],
        [ 0.3541,  0.3454,  0.1720],
        [ 0.5143,  0.0630,  0.0650],
        [ 0.4185,  0.2955,  0.0117],
        [-0.0214,  0.1872,  0.2479],
        [ 0.1413, -0.0402,  0.5571],
        [ 0.2887,  0.1130, -0.0361],
        [ 0.3541,  0.5412, -0.0089],
        [-0.0117, -0.0556, -0.1471],
        [ 0.0153,  0.5127,  0.2475],
        [ 0.2821, -0.0272,  0.1499],
        [ 0.0735,  0.0700,  0.2326],
        [ 0.3910, -0.2387,  0.2258],
        [ 0.1976,  0.0706,  0.1737],
        [-0.0403,  0.0411,  0.6489],
        [ 0.2538,  0.1999,  0.2554],
        [ 0.1640,  0.0334,  0.0730],
        [ 0.19

(Epoch 1) TRAIN LOSS:1.0977 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:04,  4.21it/s]

SequenceClassifierOutput(loss=tensor(1.0737, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3800,  0.4238,  0.1094],
        [ 0.0883, -0.0359,  0.3706],
        [ 0.0068,  0.2451,  0.2106],
        [ 0.1511,  0.0304,  0.5555],
        [-0.1037, -0.0115, -0.2279],
        [-0.0563,  0.1585,  0.2377],
        [ 0.2327, -0.0511,  0.3024],
        [ 0.2004, -0.0319, -0.0461],
        [ 0.2943,  0.4594,  0.0964],
        [ 0.0364,  0.1609, -0.0008],
        [ 0.0621, -0.1988,  0.0929],
        [ 0.3664, -0.1276,  0.0967],
        [ 0.4536,  0.2783,  0.2247],
        [ 0.3239,  0.1249,  0.4309],
        [ 0.1331,  0.1816,  0.3295],
        [ 0.1631,  0.2209,  0.3257],
        [ 0.2331,  0.1388,  0.4487],
        [ 0.1395,  0.1768,  0.0309],
        [ 0.2213,  0.0689,  0.4927],
        [ 0.1049,  0.1899,  0.3320],
        [-0.1437,  0.0936,  0.2778],
        [ 0.0192,  0.3447, -0.0718],
        [ 0.1185,  0.2725,  0.3021],
        [ 0.2322,  0.1166, -0.0502],
        [ 0.21

(Epoch 1) TRAIN LOSS:1.0856 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:01<00:04,  4.43it/s]

SequenceClassifierOutput(loss=tensor(1.0253, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.4632e-01,  2.2505e-01,  5.2646e-01],
        [ 1.1008e-01,  3.8590e-01,  1.6911e-01],
        [-3.2958e-02,  4.9773e-01,  4.0870e-02],
        [ 1.1178e-01,  1.1290e-01,  2.3639e-01],
        [-2.0053e-01,  1.5535e-04,  2.5413e-01],
        [ 2.7108e-01,  1.3135e-01,  3.4013e-01],
        [-9.3649e-02,  1.1348e-01,  3.9905e-01],
        [ 1.6078e-01,  1.1042e-01,  7.1562e-01],
        [ 9.9061e-03,  2.3589e-01,  1.6695e-01],
        [ 2.0406e-01,  2.4353e-01, -9.9176e-02],
        [ 6.1976e-02,  6.4625e-02,  1.3870e-01],
        [ 1.5852e-02,  2.4861e-01, -1.6768e-01],
        [ 1.3974e-01, -3.3256e-01,  4.4891e-01],
        [ 1.5082e-01, -2.4520e-01,  1.2757e-01],
        [ 3.4516e-01,  8.3872e-02,  2.5975e-01],
        [ 2.9715e-01,  1.3830e-01,  2.4326e-01],
        [ 3.2477e-01, -2.8221e-02,  6.4699e-01],
        [ 1.3829e-01,  1.3651e-01,  2.5305e-01],
        [ 3.2085e-01

(Epoch 1) TRAIN LOSS:1.0774 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:01<00:03,  4.52it/s]

SequenceClassifierOutput(loss=tensor(1.0284, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1435,  0.3482,  0.3586],
        [ 0.1687,  0.3788,  0.2131],
        [ 0.0390,  0.3178,  0.1185],
        [ 0.2675,  0.4399,  0.2029],
        [-0.0938, -0.0586,  0.3242],
        [-0.0405,  0.1517,  0.3000],
        [ 0.0795,  0.1326,  0.2573],
        [ 0.0888,  0.1415,  0.3274],
        [ 0.2008,  0.1633,  0.4362],
        [ 0.1004, -0.0054,  0.3486],
        [ 0.1883,  0.2686,  0.1385],
        [ 0.4234, -0.1943,  0.2828],
        [-0.1389,  0.4773,  0.2079],
        [ 0.1929,  0.0219,  0.5587],
        [ 0.0669,  0.1082,  0.1145],
        [-0.0150, -0.2466,  0.3978],
        [ 0.1270, -0.0110,  0.2795],
        [-0.1440, -0.0970,  0.2425],
        [ 0.2546,  0.1942,  0.2479],
        [-0.0796,  0.1024,  0.0620],
        [-0.2318,  0.1133,  0.4236],
        [ 0.1977,  0.1428,  0.3591],
        [ 0.1862,  0.0037,  0.7541],
        [ 0.1120,  0.2429,  0.0267],
        [ 0.18

(Epoch 1) TRAIN LOSS:1.0742 LR:0.00000300:  32%|█████████████                            | 8/25 [00:01<00:03,  4.64it/s]

SequenceClassifierOutput(loss=tensor(1.0518, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0211,  0.2136,  0.2360],
        [ 0.0622,  0.2408,  0.3441],
        [ 0.4178,  0.0025,  0.0829],
        [ 0.1068, -0.0751,  0.4202],
        [ 0.5732, -0.0214,  0.2879],
        [ 0.1352, -0.1526,  0.4413],
        [-0.0757,  0.0536,  0.3459],
        [ 0.0491,  0.4485,  0.5639],
        [ 0.0258,  0.0816,  0.1567],
        [-0.1120, -0.0809, -0.0846],
        [-0.0909, -0.1669,  0.2331],
        [ 0.1757,  0.3923, -0.1252],
        [ 0.0930, -0.0842,  0.3955],
        [ 0.2376, -0.1622,  0.3943],
        [ 0.0484,  0.0668,  0.2885],
        [-0.2041, -0.1363,  0.2743],
        [ 0.0626, -0.1138,  0.3137],
        [ 0.3527, -0.0017,  0.0937],
        [ 0.1753, -0.0902,  0.4379],
        [-0.1431,  0.1859,  0.1353],
        [-0.0344,  0.0970,  0.3163],
        [-0.0685,  0.1532,  0.0442],
        [-0.0459,  0.2018,  0.3540],
        [-0.0263, -0.0804, -0.2046],
        [ 0.11

(Epoch 1) TRAIN LOSS:1.0632 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:02<00:03,  4.72it/s]

SequenceClassifierOutput(loss=tensor(0.9753, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0654,  0.0260,  0.4611],
        [-0.0095,  0.1559,  0.1571],
        [-0.1771, -0.2093,  0.4811],
        [ 0.2397, -0.2414,  0.3328],
        [ 0.4533,  0.1881,  0.1530],
        [ 0.2439,  0.0161,  0.3569],
        [ 0.0552,  0.0768,  0.0969],
        [ 0.0720,  0.1938,  0.2781],
        [ 0.3149, -0.1415,  0.0785],
        [-0.2503,  0.2865,  0.2077],
        [-0.0726, -0.4214,  0.5060],
        [ 0.0349,  0.0157,  0.2002],
        [-0.2447, -0.0839,  0.1976],
        [ 0.0538,  0.0385,  0.4742],
        [-0.1708, -0.0419,  0.1885],
        [ 0.1654, -0.2533,  0.4973],
        [-0.0206, -0.0518, -0.0402],
        [-0.2215, -0.2677,  0.4523],
        [ 0.1256, -0.0708,  0.1864],
        [-0.0667,  0.0138,  0.2301],
        [ 0.2010,  0.1210,  0.6152],
        [ 0.0969, -0.2085,  0.4912],
        [ 0.2080, -0.0883,  0.6034],
        [-0.0015, -0.0340,  0.7022],
        [-0.26

(Epoch 1) TRAIN LOSS:1.0507 LR:0.00000300:  40%|████████████████                        | 10/25 [00:02<00:03,  4.54it/s]

SequenceClassifierOutput(loss=tensor(0.9377, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0952,  0.0142, -0.0154],
        [ 0.0181, -0.3425,  0.6935],
        [ 0.2310,  0.1459, -0.0152],
        [ 0.2471, -0.0196, -0.0222],
        [ 0.1790, -0.4854,  0.6544],
        [ 0.1744, -0.1172,  0.5780],
        [-0.0046, -0.1343, -0.1329],
        [-0.1482,  0.0271,  0.1835],
        [-0.1814, -0.2079,  0.4475],
        [ 0.1617, -0.0024,  0.4702],
        [ 0.2567, -0.2011,  0.0726],
        [ 0.1263,  0.2090,  0.0734],
        [-0.0041,  0.0221,  0.5450],
        [ 0.1670,  0.0815,  0.3292],
        [ 0.1116, -0.1293,  0.3362],
        [ 0.1305,  0.1710, -0.0585],
        [ 0.1604,  0.0640,  0.5775],
        [ 0.1993, -0.3088,  0.5399],
        [-0.2596, -0.0526,  0.0703],
        [-0.2429, -0.0724,  0.6605],
        [-0.0136, -0.1328,  0.1320],
        [ 0.0592, -0.3034,  0.6413],
        [ 0.1876,  0.0577,  0.1148],
        [-0.2234,  0.0537, -0.1334],
        [ 0.05

(Epoch 1) TRAIN LOSS:1.0402 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:02<00:03,  3.64it/s]

SequenceClassifierOutput(loss=tensor(0.9388, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2847, -0.0352,  0.6732],
        [ 0.0045, -0.1666,  0.5717],
        [ 0.1738, -0.1878,  0.4245],
        [ 0.0655, -0.0144, -0.0075],
        [ 0.1539, -0.0438,  0.2409],
        [ 0.3111, -0.2560,  0.2870],
        [ 0.4893, -0.1282,  0.1604],
        [ 0.3078,  0.4038,  0.2104],
        [ 0.1393,  0.1048,  0.3515],
        [-0.1446, -0.1325,  0.4203],
        [ 0.2759, -0.3401,  0.4008],
        [ 0.0407, -0.1961,  0.4747],
        [ 0.1226,  0.3966,  0.0136],
        [ 0.1219, -0.1161,  0.4323],
        [ 0.1033, -0.2111,  0.5300],
        [ 0.2307, -0.0277, -0.1317],
        [ 0.2762, -0.1899,  0.6036],
        [-0.0489, -0.0571,  0.3588],
        [ 0.1211,  0.0450,  0.5194],
        [-0.0226, -0.0135,  0.3289],
        [ 0.0578,  0.0857, -0.0755],
        [ 0.0594, -0.4068,  0.4063],
        [ 0.3397,  0.2485,  0.2146],
        [-0.0284, -0.1481,  0.6147],
        [ 0.03

(Epoch 1) TRAIN LOSS:1.0318 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:03<00:04,  3.16it/s]

SequenceClassifierOutput(loss=tensor(0.9680, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1880, -0.2436, -0.0226],
        [ 0.0995, -0.0487,  0.1149],
        [ 0.0634, -0.1088,  0.4050],
        [-0.2105, -0.1383,  0.3894],
        [ 0.1123, -0.1020,  0.8911],
        [ 0.0198, -0.1941,  0.5967],
        [ 0.0313, -0.0242,  0.6320],
        [-0.1294, -0.2380,  0.3605],
        [ 0.1384, -0.4084,  0.1579],
        [ 0.0573, -0.1177,  0.3217],
        [-0.0036, -0.1958,  0.5182],
        [ 0.4034, -0.0573, -0.0289],
        [-0.0113,  0.1275,  0.2910],
        [ 0.1280, -0.3072,  0.5462],
        [-0.1099, -0.1122,  0.2535],
        [ 0.0825, -0.0385,  0.2080],
        [ 0.3484, -0.0568,  0.7027],
        [-0.1210, -0.2459,  0.6476],
        [ 0.0821, -0.2091,  0.0960],
        [-0.1664,  0.0910,  0.3898],
        [-0.0814, -0.1625,  0.5151],
        [ 0.1252, -0.1863,  0.1425],
        [-0.1148, -0.2218,  0.5329],
        [-0.0926,  0.0794,  0.4790],
        [ 0.22

(Epoch 1) TRAIN LOSS:1.0269 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:03<00:04,  2.91it/s]

SequenceClassifierOutput(loss=tensor(0.9227, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.5669e-01,  1.7161e-01,  4.8726e-01],
        [ 2.5650e-01, -6.1859e-02,  7.6263e-01],
        [ 6.5961e-02,  1.5375e-01,  5.8818e-01],
        [-1.1471e-01,  7.6990e-02,  3.2956e-01],
        [ 4.4886e-01, -1.9057e-01,  5.6162e-01],
        [-4.1124e-01, -8.4116e-02,  6.5219e-01],
        [ 1.3260e-02,  1.1269e-01,  2.0795e-02],
        [ 3.1602e-02,  9.1837e-02,  4.8793e-01],
        [ 5.2250e-01,  7.7133e-02,  3.1888e-01],
        [ 3.9734e-02, -3.0344e-01,  6.4502e-01],
        [-2.8506e-02, -1.0631e-01,  4.4588e-01],
        [-4.4185e-01, -3.7657e-01,  6.7974e-01],
        [-6.5713e-03, -3.8201e-01,  8.6423e-01],
        [-1.5745e-02, -3.4702e-01,  6.8411e-02],
        [ 2.3777e-02, -1.3794e-01,  5.9564e-01],
        [ 2.5667e-01,  3.8775e-02,  3.2281e-01],
        [-6.8894e-02,  5.9304e-02,  2.4150e-01],
        [-1.7443e-01, -1.0220e-02,  5.6287e-01],
        [ 2.3086e-01

(Epoch 1) TRAIN LOSS:1.0194 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:03<00:03,  2.91it/s]

SequenceClassifierOutput(loss=tensor(0.9221, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0451, -0.3207,  0.4624],
        [-0.1224, -0.1512,  0.5602],
        [-0.0225, -0.1250,  0.4991],
        [ 0.3136,  0.0609,  0.6629],
        [-0.0208, -0.3120,  0.5284],
        [ 0.0537, -0.1175,  0.4387],
        [ 0.4022, -0.3733,  0.5198],
        [-0.0498,  0.0087,  0.3195],
        [-0.0638, -0.4058,  0.2772],
        [-0.0036, -0.3631,  0.6601],
        [-0.0351, -0.1681,  0.5050],
        [ 0.0347, -0.1743,  0.2916],
        [ 0.1153, -0.1689,  0.6436],
        [ 0.2047, -0.0477,  0.3926],
        [-0.0056,  0.3595,  0.3318],
        [ 0.1590, -0.5489,  0.7143],
        [ 0.1474, -0.1948,  0.5694],
        [ 0.2534, -0.1556,  0.3776],
        [ 0.1983, -0.3940,  0.4939],
        [-0.0247, -0.1403,  0.2813],
        [ 0.2248,  0.0687,  0.3843],
        [ 0.1582, -0.1898,  0.5041],
        [-0.2488,  0.0355,  0.2466],
        [-0.0793, -0.2928,  0.1425],
        [ 0.01

(Epoch 1) TRAIN LOSS:1.0129 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:04<00:03,  2.82it/s]

SequenceClassifierOutput(loss=tensor(0.9507, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2475, -0.3100,  0.7337],
        [-0.1170, -0.0802,  0.4186],
        [ 0.0649,  0.2886,  0.2410],
        [ 0.1324, -0.2868,  0.3278],
        [-0.0258, -0.3943,  0.5283],
        [ 0.0575, -0.2932,  0.3039],
        [-0.0190, -0.4336,  0.7453],
        [ 0.3842,  0.3109,  0.4333],
        [ 0.3171,  0.0210,  0.4287],
        [ 0.3210, -0.1348,  0.6658],
        [ 0.1433, -0.3327,  0.5064],
        [ 0.1181, -0.2703,  0.2112],
        [ 0.1663, -0.1061,  0.6632],
        [ 0.0068, -0.0251,  0.4476],
        [ 0.2090, -0.0931,  0.5834],
        [ 0.2435, -0.3350,  0.2677],
        [ 0.4352,  0.1171,  0.5682],
        [ 0.5377, -0.1009,  0.2698],
        [ 0.0777,  0.1658,  0.2454],
        [-0.1654, -0.4863,  0.4510],
        [ 0.0295, -0.0426,  0.3659],
        [ 0.0047, -0.1549,  0.3848],
        [ 0.0916, -0.1208,  0.8354],
        [ 0.0779, -0.6612,  0.7221],
        [-0.23

(Epoch 1) TRAIN LOSS:1.0090 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:04<00:03,  2.74it/s]

SequenceClassifierOutput(loss=tensor(0.9331, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.7826e-01, -1.7189e-01,  4.9071e-01],
        [-1.4641e-01,  3.7307e-02,  6.8115e-01],
        [ 2.9522e-01, -1.3556e-01,  7.3100e-01],
        [ 2.2002e-01,  2.4106e-01,  4.9834e-01],
        [ 6.0245e-03, -2.2497e-01,  7.7372e-01],
        [ 1.7167e-01, -5.3972e-01,  7.4234e-01],
        [-2.1647e-01, -9.0831e-02,  3.9956e-01],
        [ 8.3728e-02, -2.4035e-01,  1.0204e+00],
        [ 1.2591e-01, -5.2274e-02,  6.9486e-01],
        [ 1.4842e-01,  6.2981e-02,  5.8393e-01],
        [ 2.9257e-02, -1.6970e-01,  2.1088e-01],
        [-1.1448e-01, -2.0720e-01,  5.4482e-01],
        [ 1.9277e-01, -3.1683e-01,  5.2898e-01],
        [-7.8491e-04, -1.6886e-01,  3.1800e-01],
        [-9.8475e-02, -3.0214e-01,  8.6216e-01],
        [ 2.6175e-01, -5.5212e-01,  5.2346e-01],
        [ 2.2240e-01, -3.1184e-01,  4.2763e-01],
        [ 1.4383e-01, -3.2771e-01,  6.9076e-01],
        [ 6.7983e-03

(Epoch 1) TRAIN LOSS:1.0046 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:05<00:02,  2.77it/s]

SequenceClassifierOutput(loss=tensor(0.9980, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1528, -0.0297,  0.6874],
        [ 0.1480, -0.5076,  0.8674],
        [ 0.1622, -0.0681,  0.5430],
        [ 0.0926, -0.2591,  0.2083],
        [ 0.2900,  0.0519,  0.4511],
        [ 0.2757,  0.1181,  0.0183],
        [ 0.2098, -0.3967,  0.7105],
        [-0.2165, -0.5656,  0.7688],
        [-0.1139, -0.3664,  0.4858],
        [ 0.0673,  0.0033,  0.0308],
        [ 0.1156, -0.3816,  0.5515],
        [ 0.1907,  0.2912,  0.3199],
        [ 0.1371, -0.4024,  0.7178],
        [ 0.0900,  0.1274,  0.0681],
        [ 0.0890, -0.0970,  0.5141],
        [-0.0141, -0.3072,  0.6972],
        [-0.1895, -0.1284,  0.3271],
        [ 0.1821, -0.2589,  0.3622],
        [ 0.2374,  0.0179,  0.3389],
        [ 0.0359, -0.1101,  0.4872],
        [-0.2877, -0.4013,  0.5496],
        [-0.1774, -0.4258,  0.7162],
        [ 0.1340, -0.3988,  0.3037],
        [-0.2356, -0.3109,  0.3265],
        [-0.00

(Epoch 1) TRAIN LOSS:1.0042 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:05<00:02,  2.61it/s]

SequenceClassifierOutput(loss=tensor(0.8929, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0895, -0.2145,  0.0696],
        [-0.0424, -0.4404,  0.7486],
        [-0.0218, -0.4907,  0.3845],
        [-0.2799, -0.5168,  0.6393],
        [ 0.2931, -0.1061,  0.2544],
        [ 0.0652, -0.2784,  0.8176],
        [ 0.0925, -0.4879,  0.8055],
        [ 0.2636, -0.3290,  0.5786],
        [ 0.0650,  0.0783,  0.3490],
        [ 0.0686, -0.2800,  0.0201],
        [-0.1119, -0.6616,  0.5488],
        [ 0.0598, -0.1632,  0.9117],
        [-0.3197, -0.3234,  0.4199],
        [ 0.2990,  0.0354,  0.4913],
        [ 0.1687,  0.1075,  0.3698],
        [-0.2432,  0.0088,  0.6488],
        [-0.0039, -0.3120,  0.6893],
        [ 0.0652, -0.6308,  0.7867],
        [ 0.2370, -0.4461,  0.4771],
        [-0.1479, -0.4232,  0.3757],
        [ 0.3016, -0.1583,  0.1131],
        [ 0.4542, -0.1389,  0.3178],
        [-0.0738, -0.0580,  0.6983],
        [-0.0981, -0.2624,  0.5539],
        [-0.03

(Epoch 1) TRAIN LOSS:0.9984 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:05<00:02,  2.63it/s]

SequenceClassifierOutput(loss=tensor(0.8090, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0926, -0.4711,  0.9825],
        [ 0.0601, -0.2038,  0.9013],
        [ 0.0466, -0.2763,  0.4268],
        [ 0.1532, -0.3957,  0.8441],
        [-0.0404, -0.0414,  0.4842],
        [ 0.0182, -0.5810,  1.0860],
        [-0.3568, -0.3493,  0.2905],
        [ 0.0939, -0.3446,  0.7750],
        [ 0.1932, -0.2988,  0.3006],
        [-0.0956, -0.0127,  1.1177],
        [ 0.1966, -0.0521,  0.5624],
        [ 0.1046, -0.0464,  0.5281],
        [ 0.3623, -0.1542,  0.0224],
        [ 0.0103, -0.1719,  0.3181],
        [-0.2956, -0.2955,  0.8218],
        [ 0.2450, -0.5801,  0.6851],
        [ 0.0156, -0.4081,  0.8622],
        [-0.0238, -0.3122,  0.6368],
        [-0.0211, -0.5449,  0.7177],
        [-0.0095, -0.3061,  0.7019],
        [ 0.2236, -0.2223,  0.6173],
        [-0.0102, -0.5798,  0.6483],
        [ 0.0388, -0.4046,  0.4578],
        [-0.2294, -0.2719,  0.7804],
        [ 0.09

(Epoch 1) TRAIN LOSS:0.9834 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:06<00:01,  2.71it/s]

SequenceClassifierOutput(loss=tensor(0.8738, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2544, -0.3203,  1.0004],
        [-0.1712,  0.1657,  0.7463],
        [ 0.2213, -0.0437,  0.4696],
        [-0.2031, -0.1869,  0.5860],
        [-0.3520, -0.5612,  0.8349],
        [ 0.0292, -0.1386,  0.4496],
        [-0.2365, -0.5428,  0.6302],
        [-0.1434, -0.4777,  0.8841],
        [ 0.1256,  0.0072,  0.7338],
        [-0.0026, -0.4222,  0.8406],
        [ 0.2210, -0.2102,  0.6114],
        [ 0.1037, -0.2111,  0.2777],
        [-0.0551, -0.0219,  0.5206],
        [ 0.3147, -0.3829,  0.5481],
        [ 0.0070, -0.4885,  0.8030],
        [ 0.3023, -0.5018,  1.0148],
        [-0.0489, -0.3772,  0.1629],
        [ 0.2567, -0.2287,  0.5198],
        [ 0.2458, -0.3195,  0.4224],
        [ 0.1906, -0.2802,  0.9993],
        [ 0.0230, -0.4690,  0.7551],
        [ 0.0147,  0.1936,  0.1503],
        [ 0.3359,  0.0960,  0.1897],
        [-0.2303, -0.6489,  0.8248],
        [-0.11

(Epoch 1) TRAIN LOSS:0.9826 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:06<00:00,  3.09it/s]

SequenceClassifierOutput(loss=tensor(0.9663, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0705, -0.2272,  0.7832],
        [ 0.0171, -0.3407,  0.9444],
        [-0.1291, -0.0955,  0.2520],
        [-0.2857, -0.2315,  0.5477],
        [-0.1622, -0.2040,  1.1259],
        [-0.0442, -0.3574,  0.2900],
        [-0.0146, -0.6492,  1.2479],
        [-0.1082, -0.5740,  0.7654],
        [-0.2193, -0.4423,  0.7027],
        [-0.0347, -0.4649,  1.1056],
        [ 0.1302, -0.3053,  0.3867],
        [-0.0334, -0.0906,  0.6915],
        [ 0.0935, -0.0723,  0.5565],
        [ 0.0455, -0.3937,  0.1527],
        [-0.4163, -0.3180,  0.6204],
        [ 0.0613,  0.0016, -0.0924],
        [ 0.3314, -0.3300,  0.3058],
        [ 0.1277, -0.2881,  0.8859],
        [ 0.2118, -0.3597,  0.7017],
        [ 0.1753, -0.2145,  0.6596],
        [ 0.2929, -0.4088,  0.6603],
        [ 0.1051, -0.1577,  0.5538],
        [ 0.2377,  0.0911,  0.2283],
        [-0.0752, -0.3846,  0.5503],
        [ 0.07

(Epoch 1) TRAIN LOSS:0.9756 LR:0.00000300:  92%|████████████████████████████████████▊   | 23/25 [00:07<00:00,  3.39it/s]

SequenceClassifierOutput(loss=tensor(0.8209, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0583, -0.0393,  0.6441],
        [ 0.1927, -0.4679,  1.0421],
        [-0.1440, -0.6019,  1.0912],
        [-0.1005, -0.3315,  1.1292],
        [-0.1344, -0.4165,  0.7995],
        [ 0.0432, -0.3461,  0.3728],
        [-0.0591, -0.0783,  0.5139],
        [ 0.1548, -0.2175,  0.5781],
        [-0.2683, -0.4432,  0.8557],
        [-0.2599, -0.4660,  0.7956],
        [-0.1094, -0.2359,  0.5192],
        [ 0.0269, -0.0164,  0.4021],
        [ 0.1565, -0.6390,  0.9139],
        [-0.2687, -0.4518,  0.5693],
        [-0.0152, -0.7030,  0.6874],
        [-0.2601, -0.2159,  0.6619],
        [-0.0780, -0.3248,  0.8466],
        [ 0.0257, -0.4485,  0.9123],
        [ 0.1327, -0.3912,  0.9927],
        [-0.0064, -0.4154,  0.8970],
        [-0.2916, -0.5509,  0.9999],
        [ 0.0286, -0.5775,  1.0258],
        [-0.1317, -0.0204,  0.7073],
        [-0.0152, -0.8227,  0.8831],
        [-0.25

(Epoch 1) TRAIN LOSS:0.9730 LR:0.00000300:  96%|██████████████████████████████████████▍ | 24/25 [00:07<00:00,  3.70it/s]

SequenceClassifierOutput(loss=tensor(0.9139, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2215, -0.2951,  0.9331],
        [ 0.0655, -0.3762,  1.1953],
        [-0.1491, -0.3201,  1.0529],
        [ 0.3128, -0.3719,  0.7003],
        [-0.2097, -0.6901,  1.1596],
        [ 0.0969, -0.4986,  0.5601],
        [-0.0664, -0.3521,  1.0286],
        [ 0.0400, -0.1667,  0.3581],
        [ 0.2673, -0.3497,  0.8935],
        [-0.1424,  0.0894,  0.5231],
        [-0.3132, -0.3263,  1.1323],
        [ 0.0415, -0.1093,  0.2296],
        [-0.1119, -0.3920,  0.6793],
        [ 0.0708, -0.1240,  0.4580],
        [ 0.1198, -0.5354,  0.6172],
        [-0.1454, -0.2403,  0.8883],
        [-0.0746, -0.3557,  0.6500],
        [ 0.1336, -0.1393,  0.5574],
        [-0.0124, -0.4764,  0.8833],
        [ 0.0113,  0.1902,  0.5354],
        [-0.0884, -0.2256,  0.5113],
        [-0.2141, -0.3041,  0.6974],
        [ 0.1885, -0.0763,  0.4230],
        [-0.2474, -0.0235,  0.6635],
        [ 0.08

(Epoch 1) TRAIN LOSS:0.9696 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  4.02it/s]

SequenceClassifierOutput(loss=tensor(0.8872, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0429, -0.3487,  0.6806],
        [ 0.0376, -0.6897,  0.7598],
        [-0.0853, -0.1779,  0.3228],
        [-0.1573, -0.3785,  0.7225],
        [-0.1494, -0.2701,  0.2614],
        [ 0.1332, -0.3535,  0.2764],
        [-0.2130, -0.0293,  0.2526],
        [-0.0814, -0.3831,  0.6191],
        [-0.0556, -0.8925,  0.7705],
        [-0.0429, -0.3297,  0.6567],
        [-0.1299, -0.8571,  0.9607],
        [ 0.3299,  0.0340,  0.6166],
        [ 0.3056, -0.2369,  0.4459],
        [ 0.1177, -0.2492,  0.5440],
        [-0.1897, -0.6010,  0.6013],
        [ 0.2163, -0.0465,  0.5071],
        [ 0.0551, -0.5142,  0.8879],
        [ 0.2738, -0.4921,  0.6917],
        [-0.1505, -0.3622,  0.8188],
        [-0.0544, -0.3036,  0.2901],
        [ 0.1383, -0.2599,  0.7302],
        [ 0.1697, -0.1556,  0.1826],
        [ 0.0420, -0.5407,  0.8829],
        [-0.0209, -0.5133,  0.3706],
        [ 0.05

(Epoch 1) TRAIN LOSS:0.9696 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  3.30it/s]

(Epoch 1) TRAIN LOSS:0.9696 ACC:0.56 F1:0.38 REC:0.41 PRE:0.41 LR:0.00000300



(Epoch 2) TRAIN LOSS:0.6926 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:05,  4.12it/s]

SequenceClassifierOutput(loss=tensor(0.6926, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3838, -0.0308,  0.4997],
        [ 0.2220, -0.5973,  0.4469],
        [-0.1834, -0.2605,  0.8632],
        [ 0.2405, -0.2445,  0.5152],
        [ 0.0535,  0.1790,  0.3729],
        [ 0.2241, -0.0514,  0.5406],
        [ 0.4910, -0.4519,  0.2460],
        [ 0.0450, -0.2795,  0.8734],
        [ 0.0430,  0.0196,  0.2605],
        [-0.0122, -0.5570,  1.0677],
        [-0.0335, -0.3802,  1.1443],
        [-0.1566, -0.4173,  0.8440],
        [-0.0099, -0.5804,  0.9777],
        [ 0.2041,  0.0318,  0.1819],
        [ 0.0564, -0.4786,  0.7143],
        [-0.0315, -0.4098,  0.5365],
        [-0.1302, -0.6152,  0.8558],
        [ 0.1038, -0.4631,  0.8012],
        [-0.2255, -0.5291,  1.1621],
        [-0.1762, -0.6405,  0.6723],
        [-0.0716, -0.5086,  1.2985],
        [ 0.1008, -0.4796,  0.6069],
        [-0.1502, -0.4791,  0.8321],
        [ 0.4613, -0.3739,  0.3257],
        [-0.16

(Epoch 2) TRAIN LOSS:0.7643 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:04,  4.56it/s]

SequenceClassifierOutput(loss=tensor(0.8468, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1995, -0.6301,  1.0480],
        [-0.0759, -0.4319,  0.6210],
        [-0.0977, -0.0722,  0.5480],
        [-0.2575, -0.3913,  1.0524],
        [ 0.3377, -0.0543,  0.0894],
        [-0.2535, -0.7046,  0.9454],
        [ 0.3786, -0.5551,  0.7133],
        [-0.1819, -0.3623,  0.7475],
        [-0.2493, -0.5134,  1.1986],
        [-0.2358, -0.5530,  1.1330],
        [ 0.1391, -0.5364,  0.3825],
        [ 0.1720, -0.2447,  0.8341],
        [-0.0099, -0.4474,  0.5099],
        [ 0.1462, -0.3140,  0.5008],
        [-0.0717, -0.4021,  0.9253],
        [ 0.0596, -0.5270,  0.8875],
        [ 0.1784, -0.5182,  0.8227],
        [ 0.1730, -0.1128,  0.0557],
        [-0.0956, -0.7123,  0.9665],
        [ 0.0079, -0.2204,  0.4442],
        [ 0.3711, -0.0318,  0.5756],
        [-0.1217, -0.6151,  1.0883],
        [ 0.0981, -0.3177,  0.8792],
        [ 0.1056, -0.6681,  0.8919],
        [-0.19

(Epoch 2) TRAIN LOSS:0.7671 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:00<00:04,  4.66it/s]

SequenceClassifierOutput(loss=tensor(0.7755, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1379, -0.1906,  0.5258],
        [ 0.2254, -0.0329,  0.6446],
        [-0.0360, -0.6044,  0.7331],
        [ 0.2489, -0.6000,  1.0028],
        [-0.1993, -0.0530,  0.3354],
        [ 0.3577, -0.2558,  0.5381],
        [-0.0916, -0.3759,  0.4483],
        [-0.0333, -0.4685,  0.7886],
        [ 0.1280, -0.7590,  1.3806],
        [ 0.3752,  0.0884,  0.1971],
        [ 0.0385, -0.6302,  1.0049],
        [ 0.1180, -0.4945,  1.0960],
        [ 0.0643, -0.5099,  0.5584],
        [ 0.4581, -0.5798,  1.0609],
        [ 0.0490, -0.7626,  1.0048],
        [ 0.1554, -0.4111,  0.9554],
        [ 0.4253, -0.3479,  0.1644],
        [ 0.3994, -0.3549,  0.3756],
        [ 0.1902, -0.2381,  0.6289],
        [ 0.2328, -0.2980,  0.6522],
        [ 0.1484, -0.4076,  0.2611],
        [ 0.1033, -0.5249,  1.2157],
        [ 0.0969, -0.7611,  1.0684],
        [-0.1201, -0.5895,  0.8574],
        [-0.07

(Epoch 2) TRAIN LOSS:0.7884 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:04,  4.75it/s]

SequenceClassifierOutput(loss=tensor(0.8733, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1107, -0.0798,  0.5268],
        [ 0.0235, -0.2055,  0.5415],
        [ 0.0622, -0.6667,  0.8830],
        [ 0.1499, -0.5769,  1.1101],
        [ 0.0507, -0.5962,  1.0457],
        [ 0.1455, -0.3349,  0.2665],
        [-0.0337, -0.8830,  1.0160],
        [ 0.2292, -0.3977,  1.1597],
        [-0.0845, -0.5866,  1.0462],
        [-0.1097, -0.4414,  0.5187],
        [ 0.1456, -0.3995,  0.5511],
        [-0.1741, -0.8473,  1.1647],
        [-0.0926, -0.2577,  0.3817],
        [-0.1607, -0.5911,  1.0185],
        [-0.1252, -0.8269,  0.9546],
        [-0.0983, -0.4417,  1.1000],
        [ 0.0511,  0.1131,  0.4507],
        [ 0.0414, -0.4004,  0.8423],
        [ 0.3161,  0.1256,  0.3842],
        [ 0.1031, -0.1496,  0.8199],
        [-0.2111, -0.5048,  1.1979],
        [ 0.2945, -0.2418,  0.4130],
        [-0.0884, -0.4676,  0.6183],
        [-0.1735, -0.6673,  0.2962],
        [ 0.07

(Epoch 2) TRAIN LOSS:0.7761 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:01<00:03,  4.76it/s]

SequenceClassifierOutput(loss=tensor(0.7148, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3910, -0.6554,  0.8782],
        [-0.1712, -0.7958,  0.9044],
        [-0.3994, -0.8173,  1.1556],
        [-0.2350, -0.5376,  1.0281],
        [-0.1420, -0.8422,  1.0202],
        [-0.1566, -0.7700,  0.9621],
        [ 0.1336, -0.3088,  0.5136],
        [-0.0143, -0.5835,  1.1758],
        [-0.1822, -0.3872,  0.6709],
        [-0.3199, -0.4186,  1.0659],
        [-0.3857, -0.6215,  0.6510],
        [ 0.3737,  0.0837,  0.1030],
        [-0.2291, -0.3766,  1.0830],
        [-0.2484, -0.8610,  0.8861],
        [-0.1350, -0.3794,  0.9941],
        [ 0.0502, -0.4751,  0.6165],
        [ 0.3078, -0.0434,  0.1920],
        [ 0.0240, -0.7044,  0.7589],
        [-0.1089, -0.3015,  0.9008],
        [-0.2283, -0.7340,  0.7750],
        [ 0.0445, -0.4990,  1.0382],
        [ 0.1899, -0.3142,  0.6965],
        [-0.2430, -0.8448,  1.1853],
        [-0.1839, -0.8161,  0.7457],
        [ 0.02

(Epoch 2) TRAIN LOSS:0.7985 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:01<00:03,  4.73it/s]

SequenceClassifierOutput(loss=tensor(0.9329, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-8.4416e-02, -8.4364e-01,  9.5788e-01],
        [-1.8786e-01, -4.7250e-01,  9.1759e-01],
        [ 6.2008e-02, -4.4229e-01,  5.4059e-01],
        [ 8.4778e-02, -3.5815e-01,  7.9185e-01],
        [ 3.1494e-02, -1.5980e-01,  2.3331e-01],
        [ 1.9628e-01, -3.5170e-01,  5.2719e-01],
        [-2.9580e-01, -4.6998e-01,  1.1235e+00],
        [-1.8770e-01, -9.7880e-01,  1.1713e+00],
        [ 1.3967e-01, -1.6683e-01,  4.8099e-01],
        [ 4.3191e-01,  5.2843e-02,  5.6095e-01],
        [ 2.1756e-01, -1.4571e-01, -1.6120e-02],
        [ 6.0691e-04, -9.0691e-01,  1.1298e+00],
        [ 7.5742e-02, -2.4652e-01,  3.0748e-01],
        [ 3.0542e-01, -1.7144e-01,  2.5543e-01],
        [-1.6734e-01, -8.5817e-01,  6.7102e-01],
        [ 1.1618e-01, -5.3880e-01,  5.9139e-01],
        [-6.1275e-02, -7.9202e-01,  1.0932e+00],
        [ 2.2766e-01, -7.3078e-01,  4.5058e-01],
        [-1.2777e-01

(Epoch 2) TRAIN LOSS:0.8037 LR:0.00000300:  32%|█████████████                            | 8/25 [00:01<00:03,  4.75it/s]

SequenceClassifierOutput(loss=tensor(0.8397, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1415, -0.1871,  0.3946],
        [ 0.0377, -0.5478,  0.8941],
        [-0.1469, -0.8871,  0.9316],
        [-0.0552, -0.6924,  0.9684],
        [ 0.4113, -0.3216,  0.1917],
        [-0.0765, -0.6304,  1.0205],
        [ 0.1257, -0.1274,  0.3085],
        [-0.0217, -0.7699,  0.9369],
        [ 0.0799, -0.2473,  0.5063],
        [ 0.2626, -0.0946,  0.5139],
        [ 0.0616, -0.3199,  0.7608],
        [-0.0327, -0.6046,  0.9648],
        [-0.1185, -0.3937,  0.9709],
        [ 0.0848, -0.2624,  0.0888],
        [ 0.1075, -0.6786,  1.2889],
        [ 0.1563, -0.3590,  0.8416],
        [-0.0297, -0.7062,  0.7022],
        [-0.0172, -0.2656,  0.5113],
        [-0.2651, -0.8534,  1.0112],
        [-0.0913, -0.4284,  0.7116],
        [ 0.1512,  0.1602,  0.6554],
        [ 0.1662, -0.2879,  1.1467],
        [ 0.1405, -0.2838,  0.5714],
        [-0.0890, -0.5545,  0.4114],
        [-0.27

(Epoch 2) TRAIN LOSS:0.8043 LR:0.00000300:  40%|████████████████                        | 10/25 [00:02<00:03,  4.89it/s]

SequenceClassifierOutput(loss=tensor(0.8780, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-2.9867e-01, -5.4512e-01,  1.2471e+00],
        [ 5.0236e-01, -3.5819e-01,  3.9259e-01],
        [-1.5257e-01, -6.1161e-01,  9.1159e-01],
        [-1.9799e-01, -7.5011e-01,  1.2656e+00],
        [-3.6306e-02, -3.9373e-02,  2.3965e-01],
        [-3.2132e-02, -7.9288e-01,  9.0028e-01],
        [-8.4829e-02, -5.4229e-01,  4.7960e-01],
        [ 1.2315e-01, -5.6333e-01,  7.0203e-01],
        [-4.9113e-04, -1.9568e-01,  4.7629e-01],
        [ 2.4102e-01, -4.6338e-01,  5.9852e-01],
        [-9.0890e-02, -6.1848e-01,  2.2413e-01],
        [ 4.8957e-02, -7.8289e-01,  1.0074e+00],
        [-9.0248e-02, -4.6828e-01,  8.9408e-01],
        [-3.5971e-02, -7.6418e-01,  1.1207e+00],
        [ 3.4027e-01, -1.6756e-01,  5.3609e-01],
        [ 1.3519e-01, -8.1762e-01,  1.0430e+00],
        [ 2.2998e-01, -2.2944e-01,  5.9604e-01],
        [-3.4660e-01, -4.8001e-01,  9.3618e-01],
        [ 9.0141e-02

(Epoch 2) TRAIN LOSS:0.8062 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:02<00:02,  4.72it/s]

SequenceClassifierOutput(loss=tensor(0.8255, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1122, -0.6668,  0.8769],
        [-0.0130, -0.9179,  0.9514],
        [-0.1190, -0.7676,  1.1365],
        [ 0.4282, -0.2798,  0.4044],
        [ 0.1307, -0.2577,  0.5151],
        [ 0.4855, -0.7756,  0.2916],
        [ 0.2276, -0.3840,  0.6068],
        [-0.3630, -0.9996,  1.4091],
        [ 0.4290, -0.2273, -0.0327],
        [ 0.0695, -0.6883,  1.0352],
        [ 0.0917, -0.5955,  0.8479],
        [-0.4790, -0.8707,  1.5531],
        [ 0.4195, -0.2610,  0.6472],
        [-0.0859, -0.6163,  0.7809],
        [ 0.0313, -0.7377,  0.8163],
        [ 0.0100, -0.4258,  0.3166],
        [ 0.1250, -0.3740,  0.7316],
        [ 0.0837, -0.5478,  0.9578],
        [ 0.2869, -0.8089,  1.2645],
        [-0.0650, -0.7438,  0.7418],
        [ 0.4796, -0.3174,  0.0841],
        [ 0.0749, -0.5623,  0.9390],
        [ 0.1351, -0.6686,  0.7233],
        [-0.1976, -0.6098,  0.8538],
        [ 0.28

(Epoch 2) TRAIN LOSS:0.8045 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:02<00:02,  4.82it/s]

SequenceClassifierOutput(loss=tensor(0.7848, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4143, -0.1361,  0.4562],
        [-0.0642, -0.7661,  1.0883],
        [-0.1407, -0.6377,  0.9355],
        [-0.0211, -0.8459,  1.2502],
        [-0.2829, -0.8641,  1.1607],
        [-0.1624, -0.6927,  1.0526],
        [ 0.2606, -0.1180,  0.3758],
        [-0.1301, -0.7552,  1.0640],
        [-0.1449, -0.7572,  1.1046],
        [-0.1846, -0.9365,  1.3750],
        [ 0.1741, -0.7102,  0.6707],
        [ 0.1509, -0.5906,  0.7213],
        [-0.1022, -0.8664,  1.0483],
        [ 0.0810, -0.6095,  0.8829],
        [-0.0102, -0.5146,  0.3657],
        [-0.0528, -0.8103,  1.0632],
        [-0.1631, -0.7791,  1.2343],
        [ 0.0550, -0.6745,  1.1522],
        [ 0.1298, -0.6281,  0.4956],
        [ 0.0695, -0.7275,  0.8904],
        [ 0.2241, -0.4609,  0.5767],
        [-0.0169, -0.6023,  0.8725],
        [ 0.1366, -0.2227,  0.4570],
        [ 0.1556, -0.5140,  1.0656],
        [ 0.08

(Epoch 2) TRAIN LOSS:0.8080 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:02<00:02,  4.78it/s]

SequenceClassifierOutput(loss=tensor(0.7870, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.7250e-03, -8.2254e-01,  1.1945e+00],
        [ 3.6058e-01, -1.7327e-01,  5.1324e-01],
        [ 3.0244e-01, -3.2462e-01,  7.0572e-01],
        [ 2.7721e-01, -2.1766e-01,  6.3241e-01],
        [ 5.6883e-02, -6.4751e-01,  7.3417e-01],
        [ 6.4451e-02, -7.8605e-01,  1.1866e+00],
        [-7.5380e-03, -2.2635e-01,  3.6542e-01],
        [-2.0371e-01, -7.6384e-01,  1.2071e+00],
        [-1.5457e-02, -8.9941e-01,  1.3627e+00],
        [-7.1684e-02, -8.9724e-01,  8.7260e-01],
        [ 2.1947e-01, -8.8444e-01,  8.0581e-01],
        [-2.6515e-01, -5.4827e-01,  1.1303e+00],
        [ 2.4689e-01, -3.0821e-01,  5.1840e-01],
        [-8.6697e-04, -8.0221e-01,  1.2516e+00],
        [-2.5674e-01, -9.1247e-01,  1.2003e+00],
        [ 5.5843e-01, -3.1041e-01,  4.2988e-01],
        [ 1.0312e-01, -2.8049e-01,  2.0044e-01],
        [ 9.3469e-02, -6.1283e-01,  7.8752e-01],
        [ 6.9606e-02

(Epoch 2) TRAIN LOSS:0.8074 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:03<00:02,  4.64it/s]

SequenceClassifierOutput(loss=tensor(0.7991, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2995, -0.1573,  0.7587],
        [ 0.0143, -0.6391,  0.9050],
        [-0.1258, -1.0005,  1.0402],
        [ 0.2447, -0.5540,  0.4315],
        [-0.0501, -0.7894,  1.0145],
        [ 0.2493, -0.8292,  0.7799],
        [ 0.2915, -0.0922,  0.2096],
        [ 0.2858, -0.4869,  0.6111],
        [ 0.0176, -0.5114,  0.6103],
        [-0.1455, -0.5756,  0.8845],
        [-0.0627, -0.4522,  0.6014],
        [ 0.1928, -0.4934,  0.6298],
        [ 0.0196, -0.8446,  1.2724],
        [-0.0330, -1.0352,  1.1362],
        [ 0.0127, -0.3924,  0.9629],
        [ 0.1176, -0.5622,  0.9157],
        [ 0.1626, -0.5699,  0.3351],
        [ 0.4640,  0.0071,  0.4527],
        [ 0.2739, -0.2094,  0.5598],
        [ 0.0304, -0.4900,  0.6826],
        [-0.1148, -0.8897,  1.3075],
        [ 0.2756, -0.4143,  0.3699],
        [-0.0343, -0.6533,  0.5869],
        [-0.1251, -0.9113,  1.0604],
        [-0.32

(Epoch 2) TRAIN LOSS:0.8049 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:03<00:02,  4.42it/s]

SequenceClassifierOutput(loss=tensor(0.7680, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1869, -0.9562,  1.2602],
        [-0.0433, -0.9565,  0.8394],
        [ 0.1737, -0.3997,  0.7942],
        [ 0.4035, -0.3010,  0.6474],
        [ 0.0548, -0.8867,  0.8023],
        [ 0.1724, -0.3863,  0.6022],
        [ 0.0417, -0.4876,  0.2334],
        [-0.0358, -0.5817,  0.6243],
        [ 0.0735, -0.4807,  0.8985],
        [ 0.1401, -0.4611,  0.6235],
        [-0.2430, -0.9696,  1.2439],
        [ 0.0223, -0.8234,  1.3470],
        [ 0.0960, -0.5372,  0.5551],
        [ 0.2487, -0.5880,  0.9035],
        [-0.1113, -0.8467,  1.2230],
        [ 0.2300, -0.8980,  1.3175],
        [ 0.4512, -0.2200,  0.4398],
        [-0.4447, -0.9316,  1.1002],
        [ 0.2356, -0.3076,  0.3873],
        [ 0.1697, -0.6955,  1.1892],
        [ 0.2534, -0.4123,  0.4719],
        [-0.3229, -0.8889,  1.0899],
        [ 0.2667, -0.6993,  0.9680],
        [ 0.3340, -0.1956,  0.3657],
        [-0.02

(Epoch 2) TRAIN LOSS:0.8074 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:03<00:02,  3.66it/s]

SequenceClassifierOutput(loss=tensor(0.7656, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0760, -0.7678,  1.2095],
        [ 0.0235, -0.7652,  1.2381],
        [-0.1574, -0.1101, -0.2609],
        [-0.1246, -0.7658,  1.2204],
        [ 0.0889, -0.5163,  0.5530],
        [ 0.0381, -0.7658,  0.9427],
        [ 0.0261, -1.1138,  1.1278],
        [-0.1167, -0.3822,  1.2099],
        [ 0.2864, -0.1476,  0.5015],
        [-0.1803, -0.9651,  1.1297],
        [-0.1390, -1.1169,  1.1463],
        [ 0.3735, -0.5756,  0.5895],
        [-0.1260, -0.8059,  1.0806],
        [-0.0049, -0.8577,  1.1367],
        [-0.0632, -0.3691,  0.7157],
        [ 0.1438, -0.7204,  0.9143],
        [-0.0768, -0.8733,  1.4009],
        [ 0.0271, -0.7992,  1.3148],
        [ 0.0368, -0.6577,  1.2870],
        [-0.1374, -0.7116,  1.3464],
        [ 0.2780, -0.1903,  0.3561],
        [ 0.3822, -0.4404,  0.2549],
        [-0.2284, -0.8520,  1.4904],
        [ 0.1198, -0.4124,  0.2551],
        [-0.00

(Epoch 2) TRAIN LOSS:0.8051 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:04<00:02,  3.11it/s]

SequenceClassifierOutput(loss=tensor(0.7855, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1908, -0.2349,  0.3754],
        [-0.2726, -0.6812,  1.0370],
        [ 0.2151, -0.4966,  0.4221],
        [-0.0354, -0.0410,  0.2731],
        [ 0.1433, -0.1148,  0.4650],
        [ 0.0490, -0.4693,  0.9318],
        [-0.0921, -1.0931,  1.1833],
        [ 0.3302, -0.4735,  0.1178],
        [ 0.3794, -0.5013,  0.3380],
        [-0.2621, -0.8863,  1.3504],
        [ 0.5350, -0.3516,  0.1531],
        [ 0.4705, -0.5464,  0.1382],
        [-0.0415, -0.7211,  0.8389],
        [-0.1469, -0.9087,  1.2030],
        [-0.0371, -1.1592,  1.4198],
        [ 0.1594, -0.4954,  0.6173],
        [ 0.1800, -0.5156,  0.4195],
        [ 0.4986, -0.4602,  0.5908],
        [ 0.1840, -0.2331,  0.4167],
        [ 0.3895, -0.3935,  0.7324],
        [ 0.1474, -0.6869,  1.0015],
        [ 0.0063, -0.6545,  1.2440],
        [ 0.2253, -0.3648,  0.4357],
        [-0.2933, -0.8123,  1.3566],
        [ 0.22

(Epoch 2) TRAIN LOSS:0.8040 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:04<00:02,  2.89it/s]

SequenceClassifierOutput(loss=tensor(0.6034, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1822, -0.7281,  1.4296],
        [ 0.1846, -0.3602,  0.5328],
        [-0.0704, -0.8183,  1.1344],
        [ 0.0269, -0.6004,  0.7146],
        [ 0.3296, -0.8302,  0.3955],
        [-0.3061, -0.9740,  1.2457],
        [ 0.0167, -0.9392,  1.2453],
        [ 0.2299, -0.2615,  0.6517],
        [ 0.5073, -0.4883,  0.2180],
        [ 0.1495, -0.9023,  1.2805],
        [ 0.1934, -0.7139,  1.1625],
        [-0.2016, -0.7805,  1.1850],
        [ 0.2927, -0.2935,  0.3699],
        [-0.1867, -1.0741,  1.2414],
        [ 0.1544, -0.5849,  0.7666],
        [-0.0079, -0.8849,  1.0211],
        [ 0.4673, -0.2642,  0.0852],
        [ 0.3300, -0.2250,  0.2687],
        [ 0.0389, -0.6522,  0.7811],
        [ 0.3001, -0.6833,  0.5542],
        [-0.1438, -0.9562,  1.1429],
        [ 0.1853, -0.8746,  1.1584],
        [-0.0695, -0.6028,  1.3033],
        [ 0.1037, -0.3538,  0.8019],
        [-0.32

(Epoch 2) TRAIN LOSS:0.7940 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:05<00:01,  2.79it/s]

SequenceClassifierOutput(loss=tensor(0.7598, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2130, -0.1393,  0.4594],
        [ 0.2373, -0.3176, -0.0340],
        [ 0.1983, -0.5550,  1.0120],
        [ 0.1059, -0.5601,  0.1591],
        [ 0.2398, -0.9054,  0.9092],
        [ 0.1030, -0.4475,  0.4320],
        [ 0.1137, -0.7084,  1.0681],
        [-0.1512, -0.8369,  1.0458],
        [-0.0790, -0.7102,  0.7377],
        [ 0.5987, -0.5604,  0.6320],
        [-0.2359, -0.9037,  1.2390],
        [-0.1925, -0.5621,  1.2134],
        [ 0.2065, -0.1759,  0.2569],
        [-0.1025, -0.8357,  1.2248],
        [ 0.1578, -0.1240,  0.1940],
        [-0.1391, -0.5009,  1.0772],
        [ 0.2214, -0.0643,  0.5112],
        [-0.1037, -0.7506,  1.2324],
        [ 0.0995, -0.5852,  0.4388],
        [ 0.2248, -0.7820,  0.8602],
        [-0.1788, -0.6484,  0.8949],
        [-0.0967, -0.8550,  1.2379],
        [-0.0310, -0.0456,  0.0644],
        [ 0.4714, -0.1435,  0.4622],
        [-0.04

(Epoch 2) TRAIN LOSS:0.7924 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:05<00:01,  2.70it/s]

SequenceClassifierOutput(loss=tensor(0.6228, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0742, -0.7018,  1.0123],
        [ 0.2123, -0.9628,  1.2123],
        [ 0.3778, -0.5166,  0.5536],
        [-0.2960, -1.1107,  1.3625],
        [ 0.0577, -0.1178,  0.5335],
        [-0.0456, -0.6911,  1.0583],
        [-0.4677, -0.5647,  0.8716],
        [-0.1590, -0.7810,  1.1942],
        [ 0.0635, -0.8878,  0.7975],
        [ 0.1436, -0.4843,  0.0362],
        [-0.3248, -0.7353,  0.8482],
        [ 0.3295, -0.4740,  0.3816],
        [ 0.2428, -0.2095,  0.5247],
        [ 0.0404, -0.4962,  0.5651],
        [-0.1374, -0.8345,  1.1541],
        [-0.1015, -0.9643,  1.2077],
        [-0.1473, -0.9175,  0.8280],
        [ 0.5102, -0.5173,  0.0435],
        [-0.2575, -0.8098,  1.1048],
        [-0.0200, -0.7852,  1.0991],
        [ 0.5996, -0.6304,  0.4310],
        [ 0.0184, -0.8940,  0.9861],
        [-0.1830, -1.1208,  1.3211],
        [-0.1136, -0.7629,  1.4231],
        [-0.12

(Epoch 2) TRAIN LOSS:0.7847 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:05<00:01,  2.66it/s]

SequenceClassifierOutput(loss=tensor(0.7021, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0318, -0.5874,  0.3332],
        [ 0.1758, -1.0161,  1.5406],
        [ 0.1988, -0.5493,  0.4110],
        [ 0.0938, -0.9108,  1.1432],
        [-0.0952, -0.8786,  1.0831],
        [ 0.1338, -0.9743,  1.0829],
        [ 0.3519, -0.6268,  0.6262],
        [ 0.1642, -0.5093,  0.5548],
        [-0.0527, -0.4670,  1.2100],
        [-0.0258, -0.6642,  0.4964],
        [ 0.0845, -1.2703,  1.0407],
        [-0.0690, -0.9340,  1.2812],
        [-0.4242, -0.8250,  1.4929],
        [ 0.0725, -1.0123,  1.1978],
        [ 0.0254, -0.6926,  1.3896],
        [ 0.0771, -0.7587,  0.9945],
        [-0.0777, -1.0004,  0.9179],
        [-0.0602, -1.0110,  0.9425],
        [-0.0989, -0.5624,  0.9162],
        [ 0.0494, -0.9090,  1.4342],
        [-0.2009, -0.5932,  0.4239],
        [ 0.3095, -0.2464,  0.1990],
        [-0.0836, -0.6228,  1.2363],
        [-0.3262, -0.9374,  1.2635],
        [ 0.14

(Epoch 2) TRAIN LOSS:0.7811 LR:0.00000300:  92%|████████████████████████████████████▊   | 23/25 [00:06<00:00,  2.63it/s]

SequenceClassifierOutput(loss=tensor(0.8595, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2898, -0.3303,  0.6277],
        [ 0.1830, -0.1431,  0.4743],
        [ 0.3083, -0.4979,  0.3910],
        [ 0.3044, -0.6189,  0.7779],
        [-0.1845, -0.9091,  1.1653],
        [ 0.1083, -0.5549,  0.7667],
        [ 0.3318, -0.3019,  0.3293],
        [ 0.2980, -0.0281,  0.4824],
        [ 0.4138, -0.0903,  0.2951],
        [ 0.5030, -0.7917,  0.5217],
        [-0.1902, -0.8703,  1.2569],
        [ 0.2448, -0.2775,  0.3278],
        [-0.0614, -0.5746,  1.2312],
        [ 0.1099, -0.7288,  0.8569],
        [ 0.0198, -0.7177,  1.3479],
        [ 0.0492, -0.8670,  1.3469],
        [ 0.1827, -0.3432,  0.1660],
        [ 0.0519, -0.9130,  1.3010],
        [-0.0351, -0.7545,  1.4965],
        [ 0.0522,  0.1321,  0.3471],
        [ 0.2123, -0.0967,  0.5594],
        [-0.1505, -0.0181,  0.2703],
        [ 0.3477, -0.2274,  0.3743],
        [ 0.4080, -0.5426,  0.8558],
        [ 0.14

(Epoch 2) TRAIN LOSS:0.7843 LR:0.00000300:  96%|██████████████████████████████████████▍ | 24/25 [00:06<00:00,  2.56it/s]

SequenceClassifierOutput(loss=tensor(0.6965, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 8.9369e-02, -9.1794e-01,  1.2764e+00],
        [-1.9113e-03, -1.0916e+00,  1.4090e+00],
        [ 5.9255e-01, -5.2560e-01,  1.3576e-01],
        [ 1.2028e-01, -6.4179e-01,  1.1804e+00],
        [-2.5452e-02, -6.1194e-01,  8.9158e-01],
        [ 1.0249e-02, -7.5305e-01,  1.1795e+00],
        [ 1.2614e-01, -9.7726e-01,  1.2005e+00],
        [ 3.8006e-01, -5.4781e-01,  6.8086e-01],
        [ 9.7962e-02, -4.9311e-01,  7.2073e-01],
        [-1.1400e-01, -8.5390e-01,  1.4761e+00],
        [ 9.1150e-02, -5.9802e-01,  3.0041e-01],
        [-1.9343e-01, -7.4324e-01,  1.3368e+00],
        [ 4.6218e-01, -8.3600e-01,  8.7261e-01],
        [ 1.8133e-01, -9.1279e-01,  1.2915e+00],
        [ 4.2733e-01, -3.1367e-01,  3.8922e-01],
        [ 6.3168e-02, -6.7498e-01,  1.0246e+00],
        [ 6.6425e-01, -3.3786e-01,  1.6677e-01],
        [ 1.0993e-01, -9.7327e-01,  1.2580e+00],
        [ 5.8916e-02

(Epoch 2) TRAIN LOSS:0.7808 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  3.49it/s]

(Epoch 2) TRAIN LOSS:0.7808 ACC:0.62 F1:0.34 REC:0.38 PRE:0.42 LR:0.00000300



(Epoch 3) TRAIN LOSS:0.8154 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:05,  4.52it/s]

SequenceClassifierOutput(loss=tensor(0.8154, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3053, -1.0446,  1.3534],
        [ 0.3900, -0.3903,  0.8157],
        [-0.1294, -0.7635,  1.0142],
        [-0.1253, -0.7655,  1.2098],
        [-0.0213, -0.9758,  1.2603],
        [-0.0859, -0.1398,  0.1394],
        [-0.0677, -0.7072,  1.4496],
        [ 0.1804, -0.3367,  0.1227],
        [ 0.5626,  0.6017,  0.1345],
        [ 0.5449, -0.3271,  0.6944],
        [ 0.0117, -0.9913,  0.8195],
        [ 0.4238, -0.8540,  0.3971],
        [-0.2146, -0.8947,  1.1159],
        [-0.0648, -0.3221,  0.0476],
        [ 0.1373, -0.0917,  0.1355],
        [ 0.4737, -0.1081,  0.3538],
        [ 0.6083, -0.7187,  0.2808],
        [ 0.0711, -0.3421,  0.4045],
        [ 0.1448, -0.4391,  0.7699],
        [ 0.5295, -0.0962,  0.0752],
        [ 0.2733, -0.4269,  0.9420],
        [-0.0620, -0.5888,  0.4568],
        [-0.2695, -0.7910,  1.4981],
        [ 0.1736, -0.3916,  0.2846],
        [ 0.00

(Epoch 3) TRAIN LOSS:0.7416 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:04,  4.70it/s]

SequenceClassifierOutput(loss=tensor(0.7385, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0681, -0.9093,  0.9550],
        [ 0.4742, -0.4703,  0.2944],
        [ 0.0960, -0.9630,  1.1816],
        [ 0.0882, -0.8658,  1.2033],
        [-0.0089, -0.9665,  1.1407],
        [-0.0853, -1.2036,  1.2988],
        [ 0.1757, -0.9954,  1.1876],
        [ 0.3771, -0.7664,  0.9529],
        [ 0.1220, -0.5532,  0.3297],
        [ 0.3073, -0.0315,  0.6300],
        [ 0.3176, -0.4670,  0.8754],
        [ 0.1163, -0.9623,  1.0724],
        [ 0.2223, -0.3190,  0.6393],
        [ 0.1307, -0.7540,  1.2164],
        [-0.2459, -0.6533,  1.2405],
        [ 0.3635, -0.3105,  0.3827],
        [-0.3731, -1.0951,  1.2932],
        [-0.0230, -0.8199,  1.3445],
        [-0.1835, -0.9204,  0.9840],
        [-0.1766, -0.8144,  1.2510],
        [-0.0307, -0.4722,  0.4852],
        [ 0.1275, -0.4489,  0.1718],
        [-0.2064, -0.9319,  1.3897],
        [ 0.0792, -0.7048,  0.0327],
        [ 0.35

(Epoch 3) TRAIN LOSS:0.7551 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:00<00:04,  4.75it/s]

SequenceClassifierOutput(loss=tensor(0.7957, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0250, -0.8240,  0.9113],
        [ 0.7005, -0.6338,  0.3410],
        [-0.0776, -0.6800,  1.3058],
        [ 0.4496, -0.6478,  0.6390],
        [ 0.0831, -0.5867,  1.3451],
        [-0.0274, -0.9575,  0.9587],
        [ 0.3388, -0.5391,  0.2947],
        [ 0.3363,  0.0425,  0.5665],
        [-0.0499, -0.9498,  1.0653],
        [ 0.2845, -0.0026,  0.4265],
        [-0.0493, -0.8092,  1.1911],
        [-0.0816, -0.8844,  1.2396],
        [-0.0897, -0.7824,  0.9299],
        [ 0.2472, -0.0708,  0.5813],
        [-0.0277, -0.3565,  0.3332],
        [-0.2760, -0.8601,  1.1631],
        [ 0.0019, -0.6820,  1.1214],
        [-0.1093, -0.9029,  1.4182],
        [ 0.1264, -0.7107,  0.8129],
        [ 0.0873, -0.5320,  0.2196],
        [ 0.1323, -0.1669,  0.3127],
        [-0.0151, -1.1297,  1.3696],
        [-0.0710, -0.7341,  0.8702],
        [ 0.0658, -0.4577,  0.2015],
        [ 0.47

(Epoch 3) TRAIN LOSS:0.7643 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:04,  4.63it/s]

SequenceClassifierOutput(loss=tensor(0.8010, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1604, -0.7453,  1.0091],
        [-0.1330, -1.1099,  1.4562],
        [-0.1063, -0.9326,  1.2440],
        [-0.4572, -0.6846,  1.3577],
        [ 0.1628, -1.0188,  0.6573],
        [ 0.2581, -0.5743,  0.3848],
        [ 0.2814, -0.7362,  0.7642],
        [ 0.2174, -0.7588,  0.3769],
        [ 0.4455, -0.4868,  1.0201],
        [ 0.5143, -0.1024,  0.2355],
        [-0.4145, -0.7236,  1.1720],
        [ 0.0851, -0.5354,  1.0203],
        [ 0.0333, -1.1373,  1.3042],
        [-0.0058, -0.5930,  0.9050],
        [-0.0415, -0.9545,  1.3970],
        [ 0.0237, -0.8332,  1.1957],
        [-0.0538, -0.9938,  1.4032],
        [ 0.5579, -0.6271,  0.5143],
        [ 0.2318, -0.2532,  0.3778],
        [ 0.1641, -0.7147,  0.9842],
        [ 0.2389, -0.6547,  0.9550],
        [ 0.0156, -1.1629,  1.4358],
        [ 0.1809, -0.5986,  0.8963],
        [-0.1797, -1.3616,  1.3705],
        [ 0.38

(Epoch 3) TRAIN LOSS:0.7426 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:01<00:04,  4.68it/s]

SequenceClassifierOutput(loss=tensor(0.6343, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4889, -0.4706,  0.6338],
        [ 0.0662, -0.8674,  1.2478],
        [ 0.5978, -0.5641,  0.3909],
        [ 0.6422, -0.5895,  0.3823],
        [-0.1411, -1.1469,  1.1768],
        [-0.1922, -0.9314,  1.4822],
        [-0.0244, -0.8313,  1.2373],
        [ 0.0123, -1.0407,  1.1058],
        [ 0.4318, -0.1722,  0.1840],
        [-0.0067, -0.8318,  1.2213],
        [ 0.4141, -0.1966,  0.5257],
        [ 0.1472, -0.7900,  1.3602],
        [ 0.0350, -0.9000,  1.2859],
        [-0.1764,  0.2149, -0.2683],
        [ 0.2156, -0.4393,  0.6451],
        [-0.2192, -0.9028,  1.3362],
        [ 0.1008, -0.7963,  1.2536],
        [ 0.1249, -0.8499,  1.4026],
        [-0.4610, -0.9453,  1.2725],
        [ 0.1306, -0.9437,  1.2078],
        [ 0.3366, -0.6312,  0.3989],
        [-0.1556, -0.9515,  1.2581],
        [ 0.0819, -0.6702,  1.0168],
        [ 0.3902, -0.5544,  0.9521],
        [-0.12

(Epoch 3) TRAIN LOSS:0.7386 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:01<00:03,  4.67it/s]

SequenceClassifierOutput(loss=tensor(0.7147, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5600, -0.0992,  0.3148],
        [-0.0970, -0.7873,  0.9177],
        [-0.3744, -1.0086,  1.3731],
        [-0.1567, -0.7922,  1.2436],
        [-0.1693, -0.9565,  1.6214],
        [ 0.1477, -0.3003,  0.4729],
        [ 0.1394, -0.9269,  1.3331],
        [-0.1839, -1.1001,  1.2982],
        [ 0.3752, -0.9944,  0.3036],
        [-0.1599, -0.8529,  1.3319],
        [ 0.2164, -0.5188,  0.6153],
        [-0.1444, -0.4374,  0.1009],
        [-0.0813, -1.0118,  1.5867],
        [-0.3256, -0.9443,  1.3973],
        [-0.1733, -0.5617,  1.4370],
        [-0.1258, -0.7526,  0.8422],
        [ 0.4441, -0.3962,  0.2939],
        [ 0.3521, -0.3443,  0.3174],
        [ 0.2476, -0.7828,  1.2374],
        [ 0.1655, -0.4038,  0.4922],
        [ 0.0254, -0.8261,  1.0063],
        [ 0.4866, -0.6783,  0.3769],
        [ 0.4480, -0.5525,  0.4054],
        [-0.2415, -1.2154,  1.4681],
        [ 0.01

(Epoch 3) TRAIN LOSS:0.7419 LR:0.00000300:  32%|█████████████                            | 8/25 [00:01<00:03,  4.77it/s]

SequenceClassifierOutput(loss=tensor(0.7647, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0412, -0.9236,  0.9619],
        [-0.0870, -0.7439,  0.5863],
        [ 0.3000, -0.3746,  0.6518],
        [ 0.1241, -0.3817,  0.7629],
        [ 0.2820, -0.8556,  0.7815],
        [ 0.1051, -1.1197,  1.3711],
        [ 0.1510, -0.4634,  0.4570],
        [ 0.2944, -0.3722,  0.5265],
        [-0.1563, -0.9539,  1.2299],
        [-0.0288, -0.0772,  0.5195],
        [ 0.1016, -1.1345,  1.4248],
        [ 0.3793, -0.7205,  0.5949],
        [-0.0230, -0.8114,  0.9443],
        [ 0.0792, -0.5365,  0.3237],
        [ 0.3494, -0.2398,  0.5846],
        [ 0.0815, -0.6585,  1.0219],
        [ 0.1764, -0.5515,  0.3816],
        [ 0.2396, -0.4748,  0.2803],
        [-0.0325, -0.9457,  1.1869],
        [ 0.5020, -0.5910,  0.4315],
        [ 0.2469, -0.5230,  0.7379],
        [ 0.4361, -0.5912,  0.5868],
        [ 0.4296, -0.3831, -0.1278],
        [ 0.1525, -1.0475,  1.0616],
        [-0.08

(Epoch 3) TRAIN LOSS:0.7384 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:01<00:03,  4.59it/s]

SequenceClassifierOutput(loss=tensor(0.7103, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3955, -0.9464,  0.5785],
        [-0.2633, -0.7977,  1.3640],
        [ 0.3878, -0.4928,  0.2395],
        [-0.1445, -0.5821,  1.1787],
        [ 0.2228, -0.4713,  0.4997],
        [ 0.2060, -0.2153,  0.4194],
        [-0.1350, -0.9298,  1.4299],
        [-0.0833, -1.0347,  0.9045],
        [ 0.1376, -0.8464,  1.1204],
        [-0.0486, -1.0631,  0.8682],
        [ 0.3045, -0.1355,  0.2411],
        [ 0.0342, -0.9983,  1.3921],
        [-0.0312, -1.2369,  1.6258],
        [ 0.2533, -0.9848,  1.1112],
        [ 0.4700, -0.6563,  0.5126],
        [-0.1080,  0.0331,  0.1524],
        [ 0.0023, -0.8438,  1.1875],
        [ 0.0966, -0.9153,  1.2194],
        [ 0.3296, -0.5358,  0.6250],
        [ 0.3165, -0.0678,  0.3236],
        [-0.3312, -1.0845,  1.2757],
        [ 0.2445, -0.6172,  0.4525],
        [ 0.4767, -0.2216,  0.1074],
        [ 0.5284, -0.8393,  0.8713],
        [-0.20

(Epoch 3) TRAIN LOSS:0.7457 LR:0.00000300:  40%|████████████████                        | 10/25 [00:02<00:03,  4.53it/s]

SequenceClassifierOutput(loss=tensor(0.8121, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5069, -0.2219,  0.2664],
        [ 0.3453, -0.4053,  0.2763],
        [ 0.2172, -0.8850,  1.1321],
        [-0.1241, -0.8343,  1.1556],
        [ 0.1446, -0.7720,  1.0346],
        [-0.1341, -0.9381,  1.3049],
        [ 0.1312, -1.0530,  0.7801],
        [-0.0776, -0.7498,  0.7477],
        [ 0.5714, -0.4007,  0.3880],
        [ 0.5713, -0.7208,  0.5169],
        [-0.0640, -0.7822,  0.7953],
        [ 0.1853, -0.6688,  0.6111],
        [ 0.4152, -0.1994,  0.5728],
        [ 0.1305, -1.1893,  1.2552],
        [ 0.4686, -0.2482,  0.1266],
        [ 0.1718, -0.5411,  0.6346],
        [ 0.3085, -0.5198,  0.7830],
        [-0.1264, -0.8230,  1.1790],
        [-0.0247, -0.7829,  1.5419],
        [ 0.2395,  0.0023,  0.1890],
        [ 0.1336, -0.8589,  1.4602],
        [-0.1003, -0.5368,  0.9066],
        [-0.0356, -0.9714,  1.1951],
        [ 0.4978, -0.3151,  0.3340],
        [ 0.33

(Epoch 3) TRAIN LOSS:0.7324 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:02<00:03,  4.62it/s]

SequenceClassifierOutput(loss=tensor(0.5985, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3041, -0.8433,  1.3273],
        [ 0.5150,  0.0468,  0.3794],
        [-0.1766, -0.9408,  1.4526],
        [-0.1372, -1.0389,  1.5288],
        [ 0.1200, -0.2459,  0.1623],
        [-0.4362, -0.8719,  1.1179],
        [ 0.1769, -0.5712,  0.1161],
        [-0.3890, -1.1873,  1.3739],
        [-0.1638, -1.1459,  1.3376],
        [ 0.3631, -0.9397,  1.2520],
        [ 0.7081, -0.3425,  0.2762],
        [-0.2551, -1.1803,  1.5078],
        [ 0.0942, -0.3689,  0.4809],
        [-0.1567, -0.7355,  1.1520],
        [ 0.1049, -1.0503,  1.0739],
        [-0.2865, -0.6425,  1.3640],
        [ 0.3635, -0.1695,  0.0470],
        [ 0.5289, -0.4772,  0.3250],
        [ 0.0494, -0.6794,  0.4074],
        [-0.0724, -0.8608,  1.2651],
        [ 0.3745, -0.4822,  0.6712],
        [ 0.5018, -0.4519,  0.4565],
        [ 0.1978,  0.0944,  0.4661],
        [-0.2479, -0.7841,  1.1632],
        [ 0.31

(Epoch 3) TRAIN LOSS:0.7143 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:02<00:02,  4.55it/s]

SequenceClassifierOutput(loss=tensor(0.5155, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0668, -0.1275,  0.6135],
        [-0.4199, -0.9428,  1.6496],
        [ 0.2823, -1.0624,  1.0290],
        [ 0.0465, -1.0893,  1.5178],
        [ 0.4234, -0.8240,  0.1991],
        [ 0.1356, -0.8116,  1.2035],
        [ 0.0452, -0.8808,  1.0721],
        [-0.2642, -1.1784,  1.4016],
        [-0.1937, -1.0943,  1.4656],
        [ 0.6387, -0.6895,  0.2067],
        [ 0.1180, -0.7065,  1.0583],
        [-0.2122, -1.1206,  1.2798],
        [ 0.2710, -0.9281,  1.2107],
        [ 0.1485, -0.8795,  1.0901],
        [-0.2196, -0.8857,  1.1092],
        [-0.2103, -1.1012,  1.5841],
        [-0.2090, -1.0167,  1.4538],
        [-0.2908, -1.1921,  1.4249],
        [-0.0143, -0.7183,  0.6446],
        [ 0.0899, -0.7422,  1.1190],
        [-0.0720, -0.7031,  1.0362],
        [-0.2680, -1.1288,  1.4097],
        [ 0.1391, -0.7505,  1.0604],
        [ 0.3804, -0.4829,  0.6649],
        [-0.07

(Epoch 3) TRAIN LOSS:0.7038 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:02<00:02,  4.59it/s]

SequenceClassifierOutput(loss=tensor(0.5777, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0163, -1.0132,  0.9977],
        [-0.2192, -1.1057,  1.2460],
        [ 0.0053, -0.2626,  0.0918],
        [ 0.3847, -0.2541,  0.1491],
        [ 0.0220, -1.0361,  1.0869],
        [ 0.3913, -0.6455,  0.2299],
        [-0.1544, -0.8204,  1.5520],
        [ 0.4555,  0.0717,  0.2171],
        [ 0.1302, -1.2079,  1.1177],
        [-0.3292, -0.8251,  1.0852],
        [-0.2187, -1.3519,  1.4591],
        [-0.1518, -0.8235,  1.5133],
        [-0.1563, -0.9281,  1.0774],
        [ 0.7473, -0.0121,  0.3172],
        [-0.0912, -1.3174,  1.3542],
        [ 0.2689, -0.7020,  0.9769],
        [ 0.3447,  0.0464,  0.7119],
        [-0.4228, -1.1227,  1.8468],
        [-0.0331, -1.1995,  1.3111],
        [ 0.2945, -0.3623,  0.4796],
        [-0.0533, -1.0602,  1.1963],
        [ 0.1570, -1.1205,  1.0106],
        [ 0.0652, -0.9041,  1.3683],
        [ 0.1748, -1.0592,  1.4775],
        [-0.00

(Epoch 3) TRAIN LOSS:0.7006 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:03<00:02,  4.68it/s]

SequenceClassifierOutput(loss=tensor(0.6587, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1961, -0.7348,  1.7454],
        [-0.0434, -1.0303,  1.4543],
        [ 0.0051, -0.8567,  1.3478],
        [ 0.4910, -0.1416,  0.5293],
        [ 0.2206, -0.5612, -0.1563],
        [ 0.7167, -0.9208,  0.2284],
        [ 0.8923, -0.4854,  0.1402],
        [ 0.4102, -0.2501,  0.1991],
        [-0.2219, -0.8639,  0.9378],
        [-0.1299, -1.0065,  1.2422],
        [ 0.0916, -0.8497,  1.2248],
        [ 0.0328, -1.2036,  1.3851],
        [ 0.3666, -0.2455,  0.0560],
        [ 0.0052, -0.7061,  1.2529],
        [-0.0238, -1.2580,  1.3391],
        [ 0.4634, -0.7761,  0.1472],
        [ 0.1190, -0.7720,  0.8406],
        [ 0.0798, -0.7731,  1.1646],
        [ 0.0373, -1.2273,  1.2372],
        [ 0.1300, -0.1451,  0.3723],
        [ 0.0659, -0.5238,  0.2503],
        [-0.0463, -0.8839,  1.4379],
        [ 0.0120, -0.9619,  1.1173],
        [ 0.1491, -0.5794,  1.1832],
        [ 0.31

(Epoch 3) TRAIN LOSS:0.6869 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:03<00:01,  4.79it/s]

SequenceClassifierOutput(loss=tensor(0.5915, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1714, -0.8147,  1.0989],
        [ 0.4394, -0.6816,  0.5413],
        [ 0.1276, -1.1658,  1.3383],
        [-0.3351, -0.9943,  1.4979],
        [ 0.3001, -0.0531,  0.5983],
        [-0.2705, -0.9320,  1.5550],
        [ 0.0854, -0.9253,  1.2490],
        [ 0.1027, -0.9040,  1.0272],
        [ 0.4502, -0.8809,  1.1248],
        [-0.2729, -0.9449,  1.5357],
        [-0.0313, -0.9362,  1.2261],
        [-0.2087, -0.7831,  1.3247],
        [ 0.3317, -0.5461,  0.5011],
        [-0.1413, -1.1079,  1.4906],
        [-0.3329, -0.8700,  1.6997],
        [ 0.1122, -0.7196,  1.1881],
        [ 0.0196, -0.5656,  0.3610],
        [-0.0591, -0.6188,  1.2484],
        [ 0.5007, -0.7596,  0.7236],
        [ 0.5506, -0.2648,  0.4131],
        [-0.2387, -1.1509,  1.3555],
        [ 0.6035, -0.2764,  0.4185],
        [-0.1234, -0.8326,  0.9207],
        [-0.3463, -1.1238,  1.5278],
        [ 0.07

(Epoch 3) TRAIN LOSS:0.6882 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:03<00:01,  4.73it/s]

SequenceClassifierOutput(loss=tensor(0.7094, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0805, -0.7497,  0.6791],
        [-0.2594, -0.9418,  1.2294],
        [ 0.3905, -0.3682,  0.4522],
        [-0.0559, -1.2053,  1.4855],
        [ 0.3576, -0.1696,  0.4569],
        [-0.3301, -1.1992,  1.5845],
        [-0.1156, -0.9892,  1.3269],
        [ 0.4518, -0.4683,  0.7504],
        [ 0.0170, -0.9370,  1.5890],
        [-0.1371, -0.6866,  1.5986],
        [-0.0701, -0.8793,  1.2853],
        [-0.0832, -1.2233,  1.3616],
        [-0.2869, -0.8467,  1.3462],
        [ 0.4784, -0.2657,  0.3565],
        [-0.1039, -1.1055,  1.4584],
        [-0.0353, -1.0120,  0.9236],
        [-0.4309, -0.9899,  1.3538],
        [-0.0731, -1.1133,  1.6568],
        [ 0.6935, -0.5183,  0.2275],
        [ 0.6581, -0.6524, -0.0195],
        [ 0.4181, -0.4601,  0.8174],
        [ 0.0034, -0.6956,  0.6958],
        [ 0.5776,  0.1753,  0.5593],
        [-0.1285, -1.1385,  1.4554],
        [-0.36

(Epoch 3) TRAIN LOSS:0.6909 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:03<00:01,  4.63it/s]

SequenceClassifierOutput(loss=tensor(0.7373, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3146, -0.5148,  0.6738],
        [ 0.2092, -0.6238,  0.2640],
        [-0.0202, -0.2245,  0.5053],
        [-0.0152, -0.9219,  1.5373],
        [-0.2055, -0.8005,  1.3090],
        [ 0.7432, -0.3574,  0.0808],
        [-0.0291, -1.1090,  1.6817],
        [ 0.1371, -0.9267,  1.1185],
        [-0.0373, -0.5892,  0.2606],
        [ 0.6124, -0.3371,  0.2270],
        [ 0.4657, -0.6451,  0.0610],
        [ 0.5241, -0.4496,  0.0596],
        [-0.3216, -1.3185,  1.5393],
        [ 0.1671, -0.1334, -0.3635],
        [ 0.2861,  0.2166,  0.2738],
        [-0.0270, -1.3133,  1.5382],
        [-0.2903, -1.0462,  1.4129],
        [-0.2975, -0.6369,  1.0798],
        [ 0.2643, -1.0551,  1.1838],
        [ 0.3107, -0.2526, -0.0122],
        [-0.0523, -1.1073,  1.5185],
        [-0.0620, -1.1794,  1.4534],
        [-0.1287, -0.9098,  1.4152],
        [-0.2626, -1.1422,  1.4402],
        [ 0.56

(Epoch 3) TRAIN LOSS:0.6889 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:04<00:01,  4.73it/s]

SequenceClassifierOutput(loss=tensor(0.6526, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2661, -1.1948,  1.4330],
        [ 0.5694, -0.2717,  0.5069],
        [-0.2579, -0.7993,  1.0280],
        [ 0.5890, -0.1984,  0.5248],
        [-0.3695, -1.0937,  1.5736],
        [ 0.6190, -0.6345,  0.3536],
        [ 0.4532, -0.2533,  0.6004],
        [ 0.0076, -1.4078,  1.3812],
        [-0.1825, -0.9479,  1.2003],
        [ 0.3253, -0.4567,  0.4159],
        [ 0.3055, -0.3270,  0.1420],
        [-0.2994, -0.8579,  1.5475],
        [-0.1889, -1.2332,  1.5844],
        [-0.1952, -0.7858,  1.5282],
        [-0.3688, -1.1545,  1.5836],
        [-0.4731, -0.8575,  1.2646],
        [ 0.1544, -0.8273,  0.5916],
        [ 0.0340, -1.3401,  1.4092],
        [ 0.5374, -0.3697,  0.4119],
        [ 0.4184, -0.5219,  0.2571],
        [ 0.7923, -0.5504, -0.1620],
        [ 0.4992, -0.5649,  0.4554],
        [ 0.2514, -0.1975,  0.2143],
        [-0.2492, -1.2408,  1.5610],
        [ 0.18

(Epoch 3) TRAIN LOSS:0.6886 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:04<00:01,  4.70it/s]

SequenceClassifierOutput(loss=tensor(0.6820, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0856, -1.2878,  1.3046],
        [ 0.1669, -0.8328,  0.9493],
        [ 0.0816, -0.4510,  0.2767],
        [ 0.5324, -0.2740,  0.2825],
        [-0.0904, -1.1235,  1.2622],
        [ 0.2715, -0.1193,  0.3040],
        [ 0.2435, -0.8093,  0.9130],
        [ 0.2367, -0.8534,  0.7903],
        [-0.3584,  0.1530, -0.1451],
        [-0.0851, -0.7788,  1.3848],
        [ 0.2180, -0.6752,  0.9481],
        [ 0.4667, -0.4100,  0.0754],
        [ 0.5225, -0.5641,  0.4027],
        [-0.1576, -1.1288,  1.8689],
        [ 0.0939, -0.8271,  1.0480],
        [ 0.8831, -0.8500, -0.0287],
        [-0.3314, -1.0080,  1.5318],
        [-0.2136, -0.6035,  0.3584],
        [ 0.5322, -0.3696,  0.3553],
        [ 0.5333, -0.1092, -0.2869],
        [ 0.3965, -1.0232,  0.7883],
        [-0.2221, -1.2424,  1.4271],
        [ 0.7754, -0.5896,  0.5918],
        [ 1.2209, -0.6484,  0.0558],
        [-0.10

(Epoch 3) TRAIN LOSS:0.6939 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:04<00:00,  4.77it/s]

SequenceClassifierOutput(loss=tensor(0.8006, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.4774e-01, -3.5364e-01,  3.1655e-01],
        [ 3.9350e-02, -1.3434e+00,  1.3100e+00],
        [ 5.6686e-01, -4.2231e-01,  5.5273e-01],
        [-4.5291e-01, -1.2907e+00,  1.6312e+00],
        [ 5.0145e-01, -5.6616e-01,  5.2296e-01],
        [ 1.8780e-01, -5.5837e-01,  3.3922e-01],
        [-1.4475e-01, -9.8491e-01,  1.3883e+00],
        [-4.3548e-02, -8.1456e-01,  1.0889e+00],
        [-6.1574e-03, -5.4881e-01,  7.0607e-01],
        [ 3.4270e-01, -1.3134e-01,  2.1718e-01],
        [-6.3426e-02, -4.7601e-01,  1.8429e-01],
        [-3.0948e-01, -1.3001e+00,  1.3818e+00],
        [ 3.9329e-01, -3.1894e-01,  2.8412e-01],
        [-9.9144e-05, -9.8949e-01,  1.2642e+00],
        [-3.8743e-01, -1.0457e+00,  1.2398e+00],
        [ 9.7381e-02, -3.4162e-01,  3.1576e-01],
        [-1.2499e-01, -8.3066e-01,  8.0314e-01],
        [ 6.6511e-01, -3.9199e-01, -1.4338e-01],
        [ 5.6181e-01

(Epoch 3) TRAIN LOSS:0.6937 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:04<00:00,  4.73it/s]

SequenceClassifierOutput(loss=tensor(0.6906, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0286, -0.7367,  1.2124],
        [ 0.4240, -0.4462,  0.2638],
        [-0.3642, -1.2576,  1.7318],
        [-0.1325, -1.0522,  0.8987],
        [-0.2259, -0.8696,  1.5181],
        [-0.1060, -1.0976,  1.3776],
        [-0.2128, -1.1692,  1.5349],
        [ 0.8035, -0.5631,  0.2399],
        [ 0.2512, -0.2698, -0.0864],
        [ 0.0665, -1.0140,  1.2057],
        [ 0.4880, -0.5993,  0.3512],
        [-0.3665, -1.1786,  1.3703],
        [-0.0970, -1.0418,  1.3107],
        [-0.3785, -1.0000,  1.7526],
        [ 0.0761, -0.5103,  0.4653],
        [ 0.1847, -1.0872,  1.5044],
        [-0.1738, -1.2909,  1.6300],
        [-0.5408, -1.1675,  1.6474],
        [-0.2660, -0.9414,  1.3269],
        [-0.0994, -1.1497,  1.4096],
        [ 0.3726, -0.4144,  0.0256],
        [ 0.5407, -0.1117,  0.2024],
        [ 0.5640, -0.6267,  0.2714],
        [ 0.5571, -0.6153,  0.3603],
        [ 0.49

(Epoch 3) TRAIN LOSS:0.6848 LR:0.00000300:  96%|██████████████████████████████████████▍ | 24/25 [00:05<00:00,  4.93it/s]

SequenceClassifierOutput(loss=tensor(0.5179, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-1.2942e-03, -1.0303e+00,  1.5649e+00],
        [ 6.5058e-01, -6.8994e-01,  5.1136e-01],
        [ 2.2641e-01, -4.8770e-01,  3.9999e-01],
        [ 2.7913e-01, -3.6906e-01,  5.5316e-01],
        [ 6.3719e-01, -3.2517e-01,  5.5161e-03],
        [-1.0304e-01, -1.3089e+00,  1.7806e+00],
        [-1.3349e-01, -1.0890e+00,  1.4974e+00],
        [-6.9086e-02, -1.0204e+00,  1.3163e+00],
        [ 8.5793e-02, -1.1666e+00,  1.5220e+00],
        [-9.7432e-02, -1.1294e+00,  1.7227e+00],
        [-1.1103e-01, -1.0598e+00,  1.3256e+00],
        [-1.0149e-01, -1.2661e+00,  1.7572e+00],
        [-2.2245e-03, -6.1569e-01,  6.0023e-01],
        [ 7.6643e-03, -6.4893e-01,  7.6971e-01],
        [ 6.1041e-02, -1.2875e+00,  1.4145e+00],
        [ 7.9717e-01, -1.0037e+00,  1.0737e-02],
        [ 3.0334e-01,  1.6843e-01,  4.0899e-01],
        [ 7.5633e-01, -5.5191e-01,  2.9832e-01],
        [ 7.1583e-01

(Epoch 3) TRAIN LOSS:0.6905 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:05<00:00,  4.65it/s]

(Epoch 3) TRAIN LOSS:0.6905 ACC:0.72 F1:0.49 REC:0.50 PRE:0.64 LR:0.00000300



(Epoch 4) TRAIN LOSS:0.5834 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:06,  3.82it/s]

SequenceClassifierOutput(loss=tensor(0.5834, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2620, -0.7343,  0.3238],
        [ 0.1801, -0.8801,  1.1610],
        [ 0.4403,  0.3153,  0.0257],
        [-0.2396, -1.2049,  1.4852],
        [-0.2548, -1.2177,  1.8195],
        [ 0.1838, -0.8880,  1.1477],
        [-0.4400, -1.2532,  1.6483],
        [-0.1112, -1.1393,  1.6642],
        [-0.4505, -0.7210,  1.3255],
        [ 0.4357, -0.3900,  0.1488],
        [-0.2048, -1.0898,  1.7667],
        [-0.0351, -0.8612,  1.3343],
        [-0.3171, -1.2163,  1.5628],
        [-0.0024, -1.1862,  1.2148],
        [-0.1287, -1.1289,  1.4030],
        [ 0.0254, -1.0286,  1.3470],
        [ 0.5275, -0.4173,  0.1750],
        [ 0.4019, -0.3904,  0.1833],
        [ 0.7352, -0.4347,  0.4763],
        [ 0.6752, -0.4447,  0.7031],
        [ 0.8629, -0.7739, -0.0742],
        [-0.3137, -1.1884,  1.3836],
        [-0.2443, -1.0385,  1.4088],
        [ 0.7228, -0.5072,  0.1096],
        [ 0.43

(Epoch 4) TRAIN LOSS:0.5901 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:05,  4.23it/s]

SequenceClassifierOutput(loss=tensor(0.5968, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2218, -1.2449,  1.5627],
        [ 0.0162, -0.9383,  1.2281],
        [ 0.1285, -0.3754,  0.3146],
        [-0.2886, -0.9969,  1.7604],
        [-0.0329, -0.8197,  1.7570],
        [-0.0967, -1.2763,  1.3535],
        [ 0.1565, -0.6412,  1.2917],
        [-0.3524, -1.1844,  1.5387],
        [ 0.4853, -0.4088,  0.0302],
        [-0.0291, -1.3139,  1.3039],
        [ 0.4606, -0.3837, -0.0825],
        [-0.2587, -0.9315,  1.5461],
        [-0.0776, -0.8984,  1.6410],
        [ 0.6980, -0.7242,  0.7040],
        [ 0.7352, -0.5858,  0.1418],
        [ 0.1645, -1.0518,  1.5266],
        [-0.2227, -1.0385,  1.4031],
        [-0.2046, -0.9211,  1.3842],
        [-0.0279, -1.1006,  1.4286],
        [ 0.6939, -0.3353,  0.2612],
        [-0.1641, -1.0713,  1.5714],
        [ 0.0278, -0.8008,  1.1196],
        [ 0.6078, -0.2832,  0.0334],
        [ 0.4027, -0.3570,  0.2298],
        [-0.19

(Epoch 4) TRAIN LOSS:0.6289 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:04,  4.43it/s]

SequenceClassifierOutput(loss=tensor(0.7066, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 3.7355e-01, -7.2494e-01,  8.5390e-01],
        [ 3.6375e-01, -9.5366e-01,  6.0576e-01],
        [ 4.7513e-01, -3.6708e-01,  4.0740e-01],
        [ 7.0916e-01, -4.6515e-01,  3.8814e-01],
        [ 2.0808e-01, -1.2127e+00,  1.3069e+00],
        [-5.7193e-01, -7.0221e-01,  1.7015e+00],
        [ 5.2097e-01, -7.9224e-01,  5.0480e-01],
        [ 5.4843e-01, -5.3601e-01,  2.9265e-01],
        [ 6.8211e-02, -9.4717e-01,  1.2812e+00],
        [-1.1216e-01, -1.3133e+00,  1.1930e+00],
        [ 4.4223e-01,  1.7899e-02,  1.3386e-01],
        [ 6.1968e-02, -1.1841e+00,  1.3385e+00],
        [ 3.9666e-01, -7.7280e-01,  5.9551e-01],
        [ 2.3755e-01, -2.9409e-01, -4.3242e-04],
        [ 6.0410e-01, -7.3062e-01,  4.7563e-01],
        [ 2.9671e-01, -6.4629e-01,  3.1831e-01],
        [-3.9553e-01, -8.5363e-01,  1.5286e+00],
        [ 5.0887e-01, -5.6454e-01,  2.6632e-01],
        [-3.5371e-02

(Epoch 4) TRAIN LOSS:0.6262 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:00<00:05,  4.17it/s]

SequenceClassifierOutput(loss=tensor(0.6180, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0115, -1.1213,  1.2033],
        [-0.2607, -0.7501,  1.6389],
        [ 0.1042, -1.1635,  1.3037],
        [-0.2996, -1.0643,  1.7450],
        [ 0.1622, -1.0649,  1.4829],
        [ 0.3485, -0.3078,  0.2468],
        [ 0.9739, -0.0321,  0.4961],
        [-0.2722, -1.1533,  1.6101],
        [ 0.7343, -0.5619, -0.0442],
        [ 0.6132, -0.6451,  0.3189],
        [-0.2227, -1.0421,  1.3319],
        [ 0.4775, -0.4632,  0.3487],
        [ 0.7644, -0.3408,  0.2778],
        [ 0.2070, -0.6598, -0.1724],
        [ 0.0647,  0.0061,  0.0141],
        [ 0.0115, -0.2688,  0.1541],
        [ 0.0259, -1.2394,  1.5665],
        [ 0.0353, -0.8294,  1.1288],
        [ 0.3248, -0.6741,  1.3322],
        [ 0.7076, -0.0942,  0.2710],
        [ 0.4542, -0.1203, -0.0830],
        [-0.0224, -0.6469,  1.4555],
        [ 0.3745, -0.3174,  0.3829],
        [ 0.1598, -1.1111,  1.5036],
        [-0.42

(Epoch 4) TRAIN LOSS:0.5956 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:05,  3.95it/s]

SequenceClassifierOutput(loss=tensor(0.7081, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1754, -1.1499,  1.6544],
        [ 0.1136, -0.9902,  1.3702],
        [ 0.9068, -0.6539, -0.0879],
        [ 0.7000,  0.0211,  0.2885],
        [ 0.2573, -0.8679,  0.5716],
        [-0.1617, -0.9800,  1.4456],
        [-0.2598, -1.1393,  1.4997],
        [ 0.4799, -0.5064,  0.1989],
        [ 0.4761, -0.7505,  0.7830],
        [ 0.4966, -0.7570,  0.8069],
        [ 0.6406, -0.4446, -0.0980],
        [ 0.4500, -0.5557,  0.5090],
        [ 0.0051, -0.9620,  1.4379],
        [ 0.1380, -1.1636,  1.2842],
        [ 0.2090, -0.3109,  0.6366],
        [ 0.7634, -0.3845,  0.0900],
        [ 0.8894, -0.3289,  0.4607],
        [-0.0414, -1.2041,  1.3090],
        [-0.2215, -1.2203,  1.7235],
        [ 0.1509, -0.9550,  1.3573],
        [-0.2696, -1.2346,  1.7436],
        [-0.2655, -1.0315,  1.4889],
        [ 0.4281, -0.8536,  1.1613],
        [ 0.1267, -0.4949,  1.1101],
        [ 0.79

(Epoch 4) TRAIN LOSS:0.6144 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:01<00:05,  3.35it/s]

SequenceClassifierOutput(loss=tensor(0.4874, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 4.0193e-01, -1.0223e+00,  1.2001e+00],
        [ 2.9730e-01, -6.7590e-04,  4.4529e-01],
        [-2.8278e-01, -1.1260e+00,  1.5293e+00],
        [-3.5444e-01, -1.0021e+00,  1.6952e+00],
        [ 1.0082e+00,  3.0029e-02, -2.1540e-01],
        [-1.9182e-01, -1.0886e+00,  1.6484e+00],
        [ 7.9073e-01, -7.1743e-01,  2.1189e-01],
        [-1.9965e-01, -8.2860e-01,  1.3336e+00],
        [-1.9901e-01, -8.9030e-01,  1.3384e+00],
        [-6.0388e-01, -1.2218e+00,  1.6331e+00],
        [-7.7481e-02, -1.3388e+00,  1.6688e+00],
        [ 6.3632e-01, -3.3441e-01, -9.6159e-02],
        [-2.3451e-01, -8.1426e-01,  1.5056e+00],
        [-5.5221e-01, -1.0060e+00,  1.5108e+00],
        [-1.7639e-01, -7.1469e-01,  9.7929e-01],
        [-5.1597e-02, -8.9880e-01,  1.1117e+00],
        [ 4.0689e-01, -4.6400e-01,  3.9230e-01],
        [-1.6970e-01, -1.4528e+00,  1.5113e+00],
        [-2.5456e-02

(Epoch 4) TRAIN LOSS:0.5962 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:02<00:06,  2.94it/s]

SequenceClassifierOutput(loss=tensor(0.7028, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0057, -0.8773,  1.0817],
        [ 0.0661, -1.0966,  1.5384],
        [ 0.2966, -0.7092,  0.4413],
        [-0.1999, -1.1848,  1.2992],
        [ 0.6914, -0.6337,  0.2182],
        [-0.3158, -0.9783,  1.4120],
        [ 0.9374, -0.7823, -0.0258],
        [ 0.4649, -0.1187,  0.0170],
        [-0.2254, -1.1126,  1.3195],
        [-0.1667, -0.9916,  1.2922],
        [ 0.4264, -0.5843,  0.2440],
        [-0.3549, -1.1821,  1.6234],
        [ 0.2106, -1.0335,  0.9974],
        [ 0.6090,  0.3239, -0.0579],
        [ 0.5468, -0.1627,  0.3165],
        [ 0.5992, -0.1155,  0.2144],
        [-0.4116, -1.1589,  1.6106],
        [ 0.7913, -0.4166,  0.2549],
        [-0.2154, -0.8276,  1.1776],
        [-0.2356, -1.1319,  1.7173],
        [ 0.2426, -1.1137,  1.1174],
        [-0.2318, -1.1083,  1.6001],
        [ 0.5422, -0.3852,  0.4424],
        [ 0.0862, -1.0958,  1.3778],
        [-0.14

(Epoch 4) TRAIN LOSS:0.6096 LR:0.00000300:  32%|█████████████                            | 8/25 [00:02<00:06,  2.82it/s]

SequenceClassifierOutput(loss=tensor(0.6022, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 5.3074e-01, -6.4772e-01, -8.5501e-03],
        [ 3.9776e-01,  3.3327e-02,  8.8133e-02],
        [-5.9112e-02, -1.0364e+00,  1.1517e+00],
        [-1.4717e-03, -1.0998e+00,  1.3635e+00],
        [-2.5529e-01, -8.5236e-01,  1.6707e+00],
        [-2.0060e-01, -1.3092e+00,  1.5981e+00],
        [ 4.6616e-01, -1.7548e-01, -5.7334e-02],
        [-7.6672e-02, -1.0329e+00,  1.1236e+00],
        [ 4.1256e-01, -1.3628e-01,  2.9148e-01],
        [ 5.7631e-01, -7.0034e-01,  4.9094e-01],
        [-7.0358e-02, -1.1673e+00,  1.5360e+00],
        [-3.4024e-01, -1.2187e+00,  1.5584e+00],
        [ 9.1920e-02, -5.6190e-01,  5.1667e-01],
        [-2.2452e-01, -1.2963e+00,  1.4455e+00],
        [ 1.3329e-01, -1.0696e+00,  1.0576e+00],
        [-1.7937e-01, -9.2738e-01,  1.5616e+00],
        [ 2.9698e-01,  3.0443e-01,  3.6568e-02],
        [ 1.0620e+00, -3.4521e-01,  1.0291e-01],
        [ 1.0054e-01

(Epoch 4) TRAIN LOSS:0.6087 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:02<00:05,  2.72it/s]

SequenceClassifierOutput(loss=tensor(0.5725, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6126, -0.7719,  0.6750],
        [ 0.0252, -0.7048,  1.0113],
        [-0.0573, -1.1233,  1.5560],
        [ 1.1389, -0.5375,  0.1802],
        [ 0.0125, -1.1189,  1.2992],
        [ 0.1400, -1.0254,  0.7927],
        [ 0.5505, -0.7862,  0.6037],
        [-0.3293, -1.1217,  1.3127],
        [-0.2795, -1.3151,  1.4044],
        [-0.0787, -1.4229,  1.5657],
        [ 0.8389, -0.5433, -0.1932],
        [-0.4802, -1.3039,  1.5536],
        [ 0.3838, -0.6573,  0.6343],
        [ 0.0603, -0.9876,  1.3290],
        [ 0.0382, -1.0652,  1.4707],
        [ 0.0555,  0.0934, -0.2474],
        [ 0.9965,  0.0524,  0.1546],
        [ 0.8710, -0.6557, -0.0085],
        [-0.1366, -0.7782,  1.5506],
        [-0.2596, -1.1036,  1.8153],
        [ 0.4843, -0.4364,  0.2341],
        [ 0.4748, -0.1181,  0.1329],
        [ 0.3845, -0.7439,  1.0873],
        [ 0.3964, -0.0536,  0.1135],
        [ 0.57

(Epoch 4) TRAIN LOSS:0.6051 LR:0.00000300:  40%|████████████████                        | 10/25 [00:03<00:05,  2.68it/s]

SequenceClassifierOutput(loss=tensor(0.7517, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0641, -1.1067,  1.3668],
        [ 0.2073, -1.1697,  1.2944],
        [-0.3109, -1.2101,  1.8900],
        [ 0.0187, -0.9758,  1.3592],
        [-0.0310, -1.0714,  1.0385],
        [ 0.0661, -0.8944,  0.9983],
        [-0.1740, -0.9842,  1.7210],
        [-0.1327, -0.8701,  1.5879],
        [ 0.1522, -0.5884,  0.8021],
        [-0.1446, -1.2252,  1.5619],
        [ 0.2251,  0.0571,  0.3124],
        [ 0.2293, -0.2479,  0.1265],
        [ 0.1427, -1.0471,  1.1321],
        [ 0.6550, -0.6129,  0.5250],
        [ 0.2844, -0.4839,  0.9285],
        [-0.2983, -1.0706,  1.1986],
        [ 0.4413, -1.2876,  1.0108],
        [-0.2413, -1.2808,  1.5429],
        [ 0.2930, -1.2931,  1.0101],
        [-0.1312, -1.0393,  1.4659],
        [ 0.3682, -0.3789,  0.2263],
        [ 0.4892, -0.8331,  0.4494],
        [-0.2688, -1.2761,  1.8269],
        [ 0.6017, -0.6468,  0.2633],
        [ 0.16

(Epoch 4) TRAIN LOSS:0.6184 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:03<00:05,  2.61it/s]

SequenceClassifierOutput(loss=tensor(0.7738, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3316, -0.1958,  0.4143],
        [ 0.1182, -1.3319,  1.4330],
        [-0.3424, -0.8762,  1.6958],
        [ 0.5265, -0.6190,  0.3662],
        [ 0.7110, -0.1811, -0.0857],
        [-0.2329, -1.1689,  1.7622],
        [-0.0105, -0.9937,  1.7922],
        [ 0.5775, -0.1289,  0.0878],
        [-0.0982, -1.2291,  1.6128],
        [ 1.1028, -0.2341, -0.0361],
        [ 0.7592, -0.1720,  0.6791],
        [ 0.0364, -0.7145,  0.2783],
        [ 0.8413, -0.6462,  0.4019],
        [ 0.1676, -1.2435,  1.3203],
        [ 0.4462, -0.0164, -0.1854],
        [ 0.3901, -0.8168,  0.2360],
        [ 0.2620, -1.0329,  1.1162],
        [-0.1032, -1.0682,  1.9572],
        [ 0.5574, -0.5693,  0.5114],
        [-0.2541, -1.4098,  1.6481],
        [-0.0071, -1.1555,  1.7086],
        [-0.1394, -1.1565,  1.7769],
        [ 0.4051, -0.9001,  0.8851],
        [ 0.3228, -0.2419, -0.0149],
        [ 0.77

(Epoch 4) TRAIN LOSS:0.6314 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:04<00:05,  2.58it/s]

SequenceClassifierOutput(loss=tensor(0.5213, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4149, -0.6593,  0.7520],
        [ 0.8828, -0.5826,  0.1751],
        [-0.5960, -1.2771,  1.5989],
        [ 1.4300, -0.6452, -0.0392],
        [-0.0794, -1.0415,  1.1275],
        [ 0.3228, -0.1724,  0.3846],
        [ 0.3218,  0.2778,  0.0647],
        [-0.2583, -1.1920,  1.6100],
        [-0.1114, -1.0296,  1.2942],
        [-0.0317, -1.2947,  1.7174],
        [ 0.9300, -0.3868, -0.2590],
        [ 0.6498, -0.3509, -0.2713],
        [ 0.1154, -0.9518,  0.9051],
        [-0.2328, -1.2527,  1.1448],
        [ 0.1876, -0.9890,  0.9942],
        [ 0.3101, -0.6626,  0.6560],
        [ 0.9024, -0.4022,  0.2050],
        [ 0.2732, -0.8116,  1.2072],
        [ 0.5632, -0.4093,  0.4724],
        [-0.4412, -1.2363,  1.6442],
        [-0.2696, -1.3226,  1.9197],
        [-0.0521, -1.3080,  1.3127],
        [ 0.7527, -0.4041,  0.2486],
        [-0.3533, -1.2195,  1.5120],
        [ 0.40

(Epoch 4) TRAIN LOSS:0.6229 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:04<00:04,  2.61it/s]

SequenceClassifierOutput(loss=tensor(0.6697, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0576, -1.1289,  1.1885],
        [ 0.3705,  0.0345, -0.2273],
        [ 0.3384,  0.1812,  0.1071],
        [ 0.3741, -0.2053,  0.1637],
        [ 0.0949, -1.2073,  1.4308],
        [ 0.1315, -0.2680,  0.8333],
        [-0.1241, -1.0299,  1.3341],
        [ 0.4579, -0.4360,  0.6349],
        [-0.0608, -1.0875,  1.2136],
        [ 0.2044, -0.6864,  1.2513],
        [-0.0709, -0.9443,  1.1964],
        [-0.0739, -1.4701,  1.7117],
        [-0.0637, -0.5642,  1.1019],
        [-0.2708, -1.1708,  1.2336],
        [-0.0609, -0.6888,  0.7720],
        [ 0.0395, -0.9046,  1.2376],
        [ 0.0438, -0.5788,  1.4540],
        [-0.4275, -0.9248,  1.2044],
        [ 0.0471, -1.1646,  1.6004],
        [-0.2925, -1.1553,  1.7820],
        [ 0.1028, -1.1374,  1.5196],
        [ 0.0888, -1.0620,  0.7602],
        [ 0.1146, -1.2475,  1.3931],
        [-0.1859, -1.1380,  1.5601],
        [-0.02

(Epoch 4) TRAIN LOSS:0.6263 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:04<00:04,  2.57it/s]

SequenceClassifierOutput(loss=tensor(0.5708, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5193, -0.2506,  0.0286],
        [ 0.4453,  0.1947,  0.1213],
        [-0.1791, -1.1284,  1.8113],
        [ 0.7996, -1.1396, -0.2148],
        [ 0.7404, -0.4261,  0.0961],
        [ 0.0826, -0.0652, -0.2638],
        [ 0.0045, -0.7284,  0.9624],
        [ 0.4592, -0.4875,  0.3607],
        [ 0.6165, -0.0438, -0.0441],
        [ 0.3251, -0.6661,  0.5248],
        [-0.2961, -1.1000,  1.5088],
        [ 0.5615, -0.5801,  0.1585],
        [ 0.3499, -1.1503,  1.3415],
        [-0.3161, -0.7982,  1.7286],
        [ 0.2949,  0.0945,  0.6164],
        [-0.2552, -1.2441,  1.5338],
        [-0.2976, -1.2169,  1.6788],
        [ 1.1733, -0.7512,  0.3080],
        [ 0.0765, -0.3546,  0.3555],
        [-0.2181, -1.2127,  1.4805],
        [-0.2271, -1.1079,  1.3386],
        [-0.3747, -1.2466,  1.7873],
        [-0.0105, -1.1814,  1.3653],
        [-0.2570, -0.8788,  1.9777],
        [ 0.01

(Epoch 4) TRAIN LOSS:0.6226 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:05<00:03,  2.59it/s]

SequenceClassifierOutput(loss=tensor(0.5080, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8173, -0.2603,  0.2645],
        [ 0.7289, -0.2487,  0.0583],
        [-0.0238, -0.7521,  1.2924],
        [-0.4485, -1.0739,  1.4757],
        [-0.2901, -1.2718,  1.5696],
        [-0.2350, -0.9922,  1.3850],
        [ 0.8354, -0.3909, -0.0165],
        [-0.2643, -1.1087,  1.2867],
        [-0.1016, -0.5526,  1.4671],
        [ 0.1834, -0.8543,  1.2383],
        [-0.3816, -1.4232,  1.8241],
        [-0.4316, -1.0429,  1.4577],
        [ 0.1314, -1.0779,  1.4612],
        [ 0.5322, -0.0513, -0.0609],
        [-0.0091, -1.2040,  1.7198],
        [-0.0142, -1.0778,  1.0334],
        [ 0.4206, -0.8697,  0.9366],
        [-0.0573, -0.9617,  1.1760],
        [-0.4810, -0.7838,  1.6221],
        [-0.0416, -1.1443,  1.5314],
        [-0.2102, -1.2599,  1.4216],
        [ 0.2887, -0.4725,  0.5447],
        [ 0.0820, -1.2584,  1.3825],
        [ 0.7962, -0.3951,  0.3202],
        [ 0.69

(Epoch 4) TRAIN LOSS:0.6154 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:05<00:03,  2.72it/s]

SequenceClassifierOutput(loss=tensor(0.5539, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6365, -0.5535,  0.1821],
        [-0.2882, -1.0970,  1.8538],
        [ 0.0984, -1.2600,  0.9565],
        [-0.0241, -1.2207,  1.8218],
        [-0.4618, -0.8262,  1.5649],
        [-0.3806, -1.2030,  1.8307],
        [-0.0367, -0.9904,  1.7652],
        [-0.0135, -0.9824,  1.5910],
        [ 0.6677, -0.5462,  0.4670],
        [-0.0524, -0.0835,  0.1501],
        [ 0.7168, -0.7384,  0.7416],
        [ 0.3121, -0.2482, -0.0436],
        [-0.3891, -1.0405,  1.6247],
        [-0.1703, -1.1288,  1.9413],
        [ 0.7590, -0.9097,  0.2722],
        [-0.0703, -1.1111,  1.2419],
        [ 0.0632, -1.2124,  1.5381],
        [-0.3698, -1.2018,  1.4384],
        [ 0.3978, -0.2890,  0.0434],
        [-0.6256, -1.0711,  1.8260],
        [ 0.0410, -1.3654,  1.4071],
        [ 0.6257, -0.6055,  0.0296],
        [ 0.5578, -0.2025,  0.2805],
        [ 0.5221, -0.2159,  0.3257],
        [ 0.54

(Epoch 4) TRAIN LOSS:0.6102 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:06<00:02,  2.87it/s]

SequenceClassifierOutput(loss=tensor(0.5834, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 3.6648e-02, -1.1419e+00,  1.2878e+00],
        [ 3.1421e-01, -2.2773e-01,  2.2441e-01],
        [ 1.1334e+00, -6.3392e-01, -2.7550e-01],
        [-2.7021e-01, -1.1616e+00,  1.6745e+00],
        [-2.3142e-01, -1.3377e+00,  1.7562e+00],
        [ 9.6139e-01, -3.1681e-01,  1.5652e-02],
        [ 4.5162e-01, -1.9511e-01, -2.1280e-02],
        [-5.0553e-01, -1.2784e+00,  1.6076e+00],
        [-1.9987e-01, -1.2532e+00,  1.7155e+00],
        [ 7.2914e-01, -3.9891e-01,  8.0269e-02],
        [ 1.2143e+00, -5.7698e-01, -5.4065e-02],
        [ 5.0894e-01,  9.9840e-02,  9.6619e-03],
        [ 5.0301e-01, -4.2729e-01,  3.1741e-01],
        [-1.8834e-01, -8.4940e-01,  1.4213e+00],
        [ 5.0831e-01, -6.8400e-01,  2.7459e-01],
        [ 1.9597e-03, -1.2933e+00,  1.3921e+00],
        [-3.8886e-01, -1.4049e+00,  1.7076e+00],
        [ 2.9689e-01, -1.2925e-01,  1.9151e-01],
        [ 7.6126e-01

(Epoch 4) TRAIN LOSS:0.6029 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:06<00:01,  3.24it/s]

SequenceClassifierOutput(loss=tensor(0.4715, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2844, -1.0993,  1.7798],
        [ 0.4442, -1.0535,  0.6464],
        [-0.3740, -1.1052,  1.9523],
        [-0.1144, -0.9825,  1.2206],
        [ 0.1409, -0.4352,  0.5084],
        [-0.1317, -1.0608,  1.7131],
        [ 0.4139,  0.0465,  0.1456],
        [ 0.6238, -0.3389,  0.1443],
        [-0.0225, -0.7478,  1.4701],
        [-0.0664, -1.0628,  1.3131],
        [-0.1304, -1.0805,  1.5092],
        [-0.2055, -1.3214,  1.6162],
        [-0.2144, -1.1066,  1.8149],
        [-0.4552, -1.2241,  1.7676],
        [ 0.2019, -0.6712,  0.8005],
        [ 0.7090, -0.5080,  0.2190],
        [-0.0607, -1.3608,  1.6209],
        [ 0.2971, -0.2668,  0.3032],
        [ 1.2181, -0.6414,  0.1075],
        [-0.5442, -1.2707,  1.6880],
        [ 0.9910, -0.9073, -0.1401],
        [ 0.8144, -0.3752,  0.1816],
        [ 0.0442, -0.9821,  1.7529],
        [ 0.8339, -0.4667,  0.4764],
        [ 0.91

(Epoch 4) TRAIN LOSS:0.6076 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:06<00:01,  3.56it/s]

SequenceClassifierOutput(loss=tensor(0.6973, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4336, -0.3442, -0.2001],
        [-0.0208, -1.1897,  1.4983],
        [-0.0406, -0.8769,  1.0626],
        [-0.2588, -1.2308,  1.5475],
        [-0.2488, -1.4633,  1.8072],
        [ 0.8040, -0.4730,  0.0049],
        [ 0.3335, -1.2775,  1.0816],
        [ 0.4720, -0.3098,  0.2047],
        [-0.2331, -1.0359,  1.5814],
        [-0.1114, -0.9630,  1.5405],
        [ 1.0114,  0.0180, -0.0256],
        [-0.2626, -1.5179,  1.8041],
        [-0.0162, -1.3494,  1.1067],
        [ 0.6251, -0.4113,  0.2824],
        [ 0.0540, -1.2677,  1.3741],
        [ 1.0845, -0.3804, -0.2113],
        [ 0.7757, -0.2581, -0.1422],
        [-0.2220, -0.8316,  1.6940],
        [ 0.5632, -0.3636,  0.2076],
        [ 0.5188, -0.6182,  0.1832],
        [-0.0884, -1.2118,  1.1594],
        [-0.0122, -1.0676,  1.3984],
        [-0.5100, -1.2250,  1.9104],
        [ 0.7500, -0.7692, -0.1291],
        [ 0.96

(Epoch 4) TRAIN LOSS:0.6072 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:06<00:01,  3.78it/s]

SequenceClassifierOutput(loss=tensor(0.5984, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1374, -0.9315,  0.9491],
        [ 0.1985, -1.0579,  1.6201],
        [ 0.4282,  0.5549, -0.0586],
        [-0.2347, -1.0833,  1.5390],
        [-0.2720, -1.1658,  1.4367],
        [ 0.4323, -0.4456,  0.1404],
        [-0.2765, -1.6334,  1.4853],
        [ 0.4673, -0.1724, -0.1513],
        [ 0.3159, -0.5114, -0.1025],
        [ 0.7755, -0.1623, -0.4051],
        [-0.2903, -1.1236,  1.4856],
        [-0.1597, -1.2700,  1.3529],
        [-0.3470, -1.4450,  2.0079],
        [ 0.6702, -0.0304,  0.0429],
        [ 0.3647, -0.1023,  0.3433],
        [-0.2885, -1.3556,  1.4991],
        [ 0.6101, -0.0437,  0.3988],
        [ 0.3400, -0.3354,  0.1457],
        [ 0.2676,  0.2107,  0.1102],
        [ 0.1257, -1.0445,  1.5305],
        [ 0.0038, -1.3295,  1.9001],
        [ 1.2956, -0.3352, -0.0527],
        [-0.2740, -1.5627,  1.6222],
        [-0.2671, -1.0848,  1.5699],
        [ 0.08

(Epoch 4) TRAIN LOSS:0.6084 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:07<00:00,  3.95it/s]

SequenceClassifierOutput(loss=tensor(0.6348, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2046, -1.0358,  1.6497],
        [ 0.0733, -1.1146,  1.2055],
        [-0.3418, -0.9974,  1.6225],
        [ 0.0699, -1.2224,  1.3330],
        [-0.1356, -0.4662,  0.7208],
        [ 0.9043, -1.0183, -0.0852],
        [ 0.4577, -1.0696,  1.2674],
        [-0.5911, -1.1168,  1.6684],
        [-0.3490, -1.2296,  1.8136],
        [-0.0554, -1.2333,  1.8048],
        [ 0.4750,  0.0330,  0.1232],
        [ 0.2348, -0.8730,  1.0532],
        [ 0.1737, -0.6193,  0.1690],
        [-0.5641, -1.2578,  1.6914],
        [ 0.1108, -0.9838,  1.1319],
        [ 0.3215, -0.1294,  0.0678],
        [-0.2715, -1.2906,  1.7844],
        [ 0.7962,  0.1260, -0.0484],
        [ 0.0435, -0.8646,  0.9378],
        [-0.2143, -1.1786,  1.7951],
        [ 0.4354,  0.1158,  0.2342],
        [-0.4923, -1.1597,  1.6536],
        [ 0.5265, -0.2509,  0.0503],
        [ 0.1573, -1.0664,  0.8746],
        [-0.13

(Epoch 4) TRAIN LOSS:0.6051 LR:0.00000300:  92%|████████████████████████████████████▊   | 23/25 [00:07<00:00,  4.05it/s]

SequenceClassifierOutput(loss=tensor(0.5326, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2439, -0.9996,  1.4843],
        [-0.0341, -1.1572,  1.5223],
        [-0.4133, -0.8984,  1.4757],
        [ 0.0834,  0.1612,  0.2380],
        [-0.2460, -0.9793,  1.7821],
        [ 0.5131,  0.1094, -0.0066],
        [-0.2590, -1.4099,  1.7343],
        [ 0.3890, -0.9880,  1.1773],
        [-0.0685, -0.9645,  1.6350],
        [-0.2718, -0.9860,  1.7458],
        [ 0.3869,  0.1455,  0.1876],
        [-0.3066, -1.2331,  1.5540],
        [-0.0286, -1.2235,  1.5717],
        [ 0.0345, -1.1783,  1.4668],
        [-0.2481, -0.8609,  1.3313],
        [ 0.3660, -0.3950,  0.5354],
        [ 0.0490, -0.4283,  0.6279],
        [-0.0130, -1.3856,  1.5858],
        [ 0.3888, -0.1686, -0.0283],
        [ 0.4939,  0.1858, -0.2223],
        [-0.3831, -0.9867,  1.4948],
        [ 0.7101, -0.2832, -0.3231],
        [-0.3667, -1.1061,  1.9314],
        [-0.4772, -1.4658,  1.5536],
        [ 0.24

(Epoch 4) TRAIN LOSS:0.6175 LR:0.00000300:  96%|██████████████████████████████████████▍ | 24/25 [00:07<00:00,  4.30it/s]

SequenceClassifierOutput(loss=tensor(0.9015, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7678, -0.1796,  0.0396],
        [ 0.2424, -0.3676,  0.0801],
        [-0.1791, -0.7379,  0.8678],
        [ 0.5313, -0.1462, -0.0303],
        [ 0.3164, -0.9701,  0.9798],
        [ 0.2843, -1.2592,  1.3870],
        [ 1.1677, -0.4549, -0.0800],
        [-0.0513, -1.3451,  1.6278],
        [-0.4947, -0.9848,  1.3087],
        [ 0.6548, -0.1572, -0.1668],
        [-0.3153, -1.3007,  1.7827],
        [ 0.4596, -0.3564,  0.2850],
        [-0.4190, -1.0635,  1.8572],
        [ 0.1386,  0.0212,  0.4892],
        [-0.1290, -1.2961,  1.6610],
        [ 0.4889, -0.0833,  0.0147],
        [ 0.2654, -0.5849,  0.3023],
        [ 0.4691, -0.9026,  0.5158],
        [-0.1422, -0.9112,  1.6613],
        [ 0.6736, -0.3559, -0.3755],
        [ 0.2303, -1.4734,  1.2321],
        [ 0.8304, -0.3085,  0.0629],
        [ 0.4407, -0.2753,  0.2299],
        [-0.4564, -1.1553,  1.7478],
        [ 0.45

(Epoch 4) TRAIN LOSS:0.6145 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  3.21it/s]

(Epoch 4) TRAIN LOSS:0.6145 ACC:0.77 F1:0.55 REC:0.56 PRE:0.83 LR:0.00000300



(Epoch 5) TRAIN LOSS:0.5311 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:06,  3.94it/s]

SequenceClassifierOutput(loss=tensor(0.5311, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.4508, -0.8768,  1.4291],
        [-0.5598, -1.2575,  1.4183],
        [ 0.2932, -1.3204,  1.5480],
        [-0.1463, -1.5184,  1.7271],
        [ 0.7228, -0.9052,  0.6806],
        [ 0.8294, -0.5217, -0.1438],
        [ 1.0444, -0.8276, -0.0331],
        [ 0.5679, -0.1524, -0.3593],
        [ 0.5896,  0.1994, -0.0549],
        [ 0.5291,  0.2629,  0.2216],
        [-0.2949, -1.1120,  1.5714],
        [-0.2274, -1.1946,  1.4228],
        [ 0.7934, -0.1793,  0.0525],
        [ 0.2104,  0.0437, -0.0986],
        [ 0.0834, -0.3535,  0.6913],
        [-0.2880, -1.2674,  1.7737],
        [ 0.0349, -1.1022,  1.6448],
        [ 0.3527, -0.1914,  0.0868],
        [ 0.3013, -0.7627,  0.6358],
        [ 0.5268, -0.5743, -0.1856],
        [ 0.1545,  0.1216,  0.2845],
        [ 0.5817, -0.0723,  0.4115],
        [ 0.0950, -1.2448,  1.3003],
        [ 0.8651, -0.4931, -0.0610],
        [ 0.89

(Epoch 5) TRAIN LOSS:0.5505 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:04,  4.43it/s]

SequenceClassifierOutput(loss=tensor(0.5461, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0917, -0.2065, -0.3340],
        [-0.5077, -1.2701,  1.3607],
        [ 0.7924, -0.1179,  0.1009],
        [-0.2964, -1.6632,  1.5767],
        [-0.4715, -1.4210,  1.8280],
        [ 0.7716, -0.1788, -0.0792],
        [-0.4273, -0.6909,  0.9275],
        [ 0.0735, -1.0477,  1.2082],
        [ 0.5583, -0.0522,  0.1552],
        [ 0.7345, -0.0189, -0.2148],
        [-0.1210, -1.2778,  1.6441],
        [ 0.1334, -1.1844,  1.2444],
        [-0.5701, -0.7181,  1.3692],
        [ 0.8640,  0.0475, -0.0571],
        [-0.3235, -0.9850,  1.6578],
        [-0.0132, -1.0888,  1.1824],
        [ 0.2879, -0.0584,  0.2021],
        [ 0.6447, -0.7189,  0.2905],
        [-0.3606, -1.0841,  1.6205],
        [ 0.2642, -0.4443,  0.4079],
        [ 1.2889, -0.3267,  0.2301],
        [-0.3845, -1.1468,  1.8145],
        [ 0.4617, -0.1255,  0.1240],
        [-0.4085, -0.9545,  1.6142],
        [ 0.59

(Epoch 5) TRAIN LOSS:0.5712 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:00<00:04,  4.43it/s]

SequenceClassifierOutput(loss=tensor(0.6330, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1201, -1.0770,  1.7377],
        [ 0.8195,  0.2772, -0.2604],
        [-0.2674, -0.9001,  1.2303],
        [-0.4198, -1.3740,  1.6241],
        [-0.0763, -0.0400,  0.4304],
        [ 0.3505,  0.0600, -0.1288],
        [ 0.4322, -0.0328,  0.4524],
        [ 0.7057, -0.7549,  0.1236],
        [-0.4745, -0.8735,  1.3985],
        [ 0.9506, -0.1899, -0.4033],
        [-0.0257, -1.1086,  1.6352],
        [ 0.0534, -0.8195,  0.6702],
        [-0.0680, -1.1762,  1.5057],
        [-0.1796, -0.9013,  1.4060],
        [ 0.4650, -0.3299,  0.1464],
        [ 0.7335, -0.3050,  0.0140],
        [-0.3370, -1.2891,  1.5902],
        [ 0.7716, -0.1443, -0.0191],
        [ 1.1095, -0.3928, -0.1958],
        [-0.3100, -1.1431,  1.7081],
        [-0.1239, -1.1621,  2.0314],
        [ 0.6587, -0.9441, -0.1628],
        [ 0.9161, -0.3053, -0.0978],
        [-0.2853, -1.3591,  1.6584],
        [-0.33

(Epoch 5) TRAIN LOSS:0.5632 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:04,  4.56it/s]

SequenceClassifierOutput(loss=tensor(0.5313, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0492e+00, -5.1883e-01, -1.0416e-01],
        [ 7.8598e-01, -3.3912e-01, -1.3481e-01],
        [ 6.9344e-01, -6.7630e-01,  5.1834e-01],
        [ 8.8333e-01, -4.5711e-01,  2.8945e-01],
        [-3.5076e-01, -1.3451e+00,  1.6567e+00],
        [-4.3596e-01, -1.1379e+00,  1.8596e+00],
        [ 7.6532e-01, -3.0021e-01, -2.0241e-01],
        [ 3.3368e-01, -1.3964e-01, -1.2639e-01],
        [ 3.1001e-01, -3.2441e-01,  6.1574e-01],
        [ 2.2577e-01, -8.7702e-01,  2.8658e-01],
        [-6.9538e-01, -1.0135e+00,  1.4919e+00],
        [-4.8590e-01, -1.1787e+00,  1.7101e+00],
        [-1.2019e-01, -1.4553e+00,  1.9380e+00],
        [ 8.9533e-01, -1.4896e-01, -1.0780e-01],
        [ 6.2508e-01, -3.7313e-02,  3.0658e-01],
        [-6.1943e-03, -1.1715e+00,  1.4299e+00],
        [ 2.1729e-01, -1.1794e+00,  1.2800e+00],
        [-1.0796e-01, -1.4487e+00,  1.6107e+00],
        [ 4.8181e-01

(Epoch 5) TRAIN LOSS:0.5670 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:01<00:04,  4.59it/s]

SequenceClassifierOutput(loss=tensor(0.5862, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2532, -0.9623,  1.4488],
        [ 0.4071, -0.5302,  0.0303],
        [ 0.6884, -0.5271,  0.3110],
        [ 0.0592,  0.0842,  0.3831],
        [-0.4025, -1.3302,  1.4878],
        [-0.0972, -1.3048,  1.6978],
        [ 0.0536,  0.1117, -0.2641],
        [-0.2754, -1.3000,  1.4222],
        [-0.2606, -1.2426,  1.7085],
        [-0.4161, -0.9923,  1.6594],
        [-0.3034, -1.2633,  1.3726],
        [ 0.1405, -1.0805,  0.7992],
        [ 0.7468, -0.3200,  0.1249],
        [ 0.0863, -0.2176,  0.1229],
        [ 0.1932, -1.3153,  0.9860],
        [-0.2666, -1.4941,  1.9994],
        [-0.0705, -1.4186,  1.7352],
        [-0.2234, -1.1937,  2.0305],
        [ 0.2889,  0.5086, -0.2198],
        [-0.2421, -1.0448,  1.3939],
        [ 1.0331, -0.0915, -0.0839],
        [-0.0372, -1.0926,  1.3716],
        [-0.0314, -1.1555,  1.5461],
        [ 0.6061, -0.2955, -0.0034],
        [ 0.94

(Epoch 5) TRAIN LOSS:0.5934 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:01<00:03,  4.63it/s]

SequenceClassifierOutput(loss=tensor(0.7517, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.9559, -0.7576, -0.2281],
        [ 0.1344, -0.4509,  0.1696],
        [-0.0690, -1.0003,  1.3657],
        [ 0.7274,  0.0362,  0.0588],
        [ 0.3856, -0.7758,  0.2159],
        [ 0.8588, -0.0141, -0.1041],
        [-0.3049, -1.3521,  1.7984],
        [ 0.4737, -0.5585,  0.1178],
        [-0.0114, -1.0290,  1.1434],
        [-0.4534, -1.1382,  2.0540],
        [-0.2891, -1.3661,  1.3734],
        [-0.2843, -1.1500,  1.9074],
        [ 0.2332,  0.0063,  0.1130],
        [-0.0682, -0.8184,  0.9851],
        [-0.2028, -0.7594,  1.2004],
        [-0.2956, -1.3637,  1.7712],
        [ 0.6277, -0.6653,  0.1400],
        [-0.1171, -1.4293,  1.9052],
        [ 0.8389,  0.1293, -0.2079],
        [ 0.5137, -0.0663,  0.0782],
        [ 0.8002, -0.1309,  0.3644],
        [ 0.7326, -0.1383, -0.3712],
        [-0.5606, -1.2858,  1.9271],
        [-0.0348, -1.1163,  1.8682],
        [ 0.24

(Epoch 5) TRAIN LOSS:0.5875 LR:0.00000300:  32%|█████████████                            | 8/25 [00:01<00:03,  4.75it/s]

SequenceClassifierOutput(loss=tensor(0.5458, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.4978, -1.2805,  1.8231],
        [-0.2610, -0.5239,  0.8693],
        [ 0.7748, -0.2842,  0.1600],
        [-0.3622, -0.9787,  1.8017],
        [-0.6548, -1.1227,  2.0013],
        [-0.2145, -1.3171,  1.1849],
        [ 0.0599, -1.1840,  1.8207],
        [-0.2187, -1.3548,  1.7002],
        [ 0.3698, -0.8986,  0.6171],
        [-0.2376, -0.9963,  1.6907],
        [-0.2609, -1.0584,  1.7602],
        [ 0.0305, -1.3544,  1.4069],
        [ 0.9649, -0.7974, -0.2262],
        [-0.3260, -1.1262,  1.6489],
        [-0.0433, -1.4851,  1.4586],
        [ 0.2682,  0.0297,  0.0059],
        [-0.2165, -1.3512,  1.8720],
        [ 0.1012, -0.9480,  1.3561],
        [ 0.7446, -0.2763, -0.1759],
        [-0.2871, -1.4108,  1.4012],
        [ 0.5485,  0.0998, -0.0179],
        [ 0.1657, -0.9722,  1.3097],
        [-0.3160, -1.4211,  1.7519],
        [ 0.6558, -0.5266,  0.3088],
        [-0.24

(Epoch 5) TRAIN LOSS:0.5749 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:01<00:03,  4.62it/s]

SequenceClassifierOutput(loss=tensor(0.4741, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1456, -1.0336,  1.3338],
        [-0.0216, -1.3121,  1.8157],
        [ 0.0881, -0.9204,  1.1722],
        [ 0.1641, -1.0447,  1.2870],
        [-0.6257, -1.2910,  2.0067],
        [-0.5060, -1.1209,  1.4884],
        [-0.4283, -1.4970,  2.0123],
        [ 0.0321, -0.8508,  1.1364],
        [ 0.1751, -0.9436,  1.4029],
        [-0.6495, -1.1732,  1.9395],
        [ 0.5683, -0.2587, -0.3764],
        [-0.2793, -1.1312,  1.9921],
        [-0.7373, -1.2661,  1.8960],
        [-0.4726, -0.7128,  1.5419],
        [-0.3104, -1.3097,  1.6457],
        [ 0.1337, -0.9043,  1.6141],
        [ 0.1636, -0.0506, -0.0815],
        [-0.3448, -1.2370,  1.8439],
        [-0.3625, -1.1385,  1.4820],
        [-0.3887, -1.2013,  1.8542],
        [-0.1010, -1.2489,  1.3646],
        [ 0.3419, -0.7529,  0.8576],
        [-0.1505, -1.4866,  1.2520],
        [ 0.7086, -0.4264, -0.1922],
        [ 0.99

(Epoch 5) TRAIN LOSS:0.5781 LR:0.00000300:  40%|████████████████                        | 10/25 [00:02<00:03,  4.61it/s]

SequenceClassifierOutput(loss=tensor(0.6068, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6961,  0.3498, -0.2099],
        [-0.2817, -0.9975,  1.7389],
        [ 1.3704, -0.5116,  0.0666],
        [-0.3853, -1.1990,  1.9788],
        [ 0.3344, -0.5751,  0.1235],
        [ 1.2969, -0.4083, -0.0793],
        [ 1.3558, -0.4158, -0.1800],
        [ 0.8136, -0.2193,  0.1512],
        [-0.0324, -1.2849,  1.6917],
        [ 0.3832, -0.6648,  0.7362],
        [ 0.2592, -1.1071,  1.3052],
        [ 0.6583, -0.5395,  0.1097],
        [-0.1957, -0.7293,  1.2795],
        [ 0.4336,  0.2830,  0.2296],
        [-0.4637, -1.3169,  1.7084],
        [-0.2048, -1.3267,  1.9242],
        [ 0.8299, -0.6945,  0.8476],
        [ 0.7036, -0.6275,  0.2361],
        [-0.2754, -1.0362,  1.9364],
        [ 0.0712, -1.2113,  1.5058],
        [ 0.5890, -0.2683, -0.0540],
        [ 0.3411,  0.2374, -0.0361],
        [ 0.5009, -0.6993,  0.4318],
        [ 0.9588, -0.4863,  0.0788],
        [-0.36

(Epoch 5) TRAIN LOSS:0.5803 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:02<00:02,  4.83it/s]

SequenceClassifierOutput(loss=tensor(0.6032, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 3.2563e-01,  4.7409e-01, -3.1063e-01],
        [-3.2090e-01, -1.2778e+00,  1.8418e+00],
        [-1.8045e-01, -1.3735e+00,  1.4888e+00],
        [ 7.0843e-01, -5.3187e-01,  3.6900e-01],
        [ 5.0429e-01, -8.0977e-01,  7.3271e-01],
        [-7.7173e-02, -1.1385e+00,  1.5059e+00],
        [-8.7750e-02, -1.2756e+00,  1.6309e+00],
        [ 9.6245e-01, -5.8612e-01,  1.7895e-01],
        [-1.7833e-01, -9.7286e-01,  1.5884e+00],
        [-4.1844e-01, -1.3554e+00,  1.9827e+00],
        [-2.2798e-01, -1.3405e+00,  1.9388e+00],
        [-4.3361e-01, -1.1789e+00,  1.8163e+00],
        [ 9.1519e-01, -2.7817e-01,  1.4858e-03],
        [ 9.1446e-01,  9.5083e-02, -3.5757e-01],
        [ 5.9059e-02, -8.9025e-01,  1.4017e+00],
        [-1.2017e-01, -8.7251e-01,  1.0462e+00],
        [ 6.8995e-01, -9.6424e-01,  7.6913e-01],
        [ 2.2618e-01, -8.7033e-02, -4.6961e-01],
        [ 8.1669e-02

(Epoch 5) TRAIN LOSS:0.5783 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:02<00:02,  4.81it/s]

SequenceClassifierOutput(loss=tensor(0.4471, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1772, -1.1973,  1.1220],
        [-0.1111,  0.3223, -0.0893],
        [-0.3920, -1.3733,  1.8029],
        [-0.2370, -1.4235,  1.9639],
        [ 0.4159, -0.6861,  0.1470],
        [-0.3679, -1.3798,  1.6862],
        [ 1.0851, -0.4206, -0.2020],
        [-0.1627, -1.0586,  1.5982],
        [-0.4103, -1.2764,  2.0291],
        [ 1.0169, -0.6672,  0.0832],
        [-0.0625, -0.9417,  1.2457],
        [-0.5404, -1.2280,  1.8250],
        [ 0.1404, -0.8126,  1.1049],
        [ 0.9722, -0.4283, -0.1517],
        [-0.5440, -1.1970,  1.5544],
        [ 0.9486, -0.3775,  0.0673],
        [ 0.5274, -0.3319, -0.1635],
        [ 0.8089, -0.3005,  0.4441],
        [-0.3328, -0.9672,  1.3698],
        [-0.1854, -0.9559,  1.3796],
        [-0.4059, -1.4483,  1.8173],
        [-0.3514, -1.1705,  1.5655],
        [ 0.9509, -1.1758,  0.9292],
        [ 0.5403, -1.3730,  0.8078],
        [-0.53

(Epoch 5) TRAIN LOSS:0.5702 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:03<00:02,  4.66it/s]

SequenceClassifierOutput(loss=tensor(0.4645, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.3255, -0.7223, -0.2670],
        [ 0.7257, -0.9283,  0.4867],
        [-0.1523, -1.1782,  1.8483],
        [ 0.3691, -0.1763,  0.1984],
        [-0.3822, -0.8697,  1.6069],
        [-0.4436, -1.1018,  1.7342],
        [-0.0855, -1.0080,  1.4829],
        [-0.2412, -0.8253,  1.4140],
        [ 0.1780, -1.0284,  1.6547],
        [-0.1760, -1.3017,  1.5240],
        [ 0.3518, -0.9848,  0.7405],
        [ 0.5080, -0.1795, -0.0450],
        [-0.2400, -1.5296,  1.8742],
        [ 0.1194, -0.6526,  1.1581],
        [ 0.7360, -0.1886, -0.1194],
        [-0.5277, -0.9827,  1.7690],
        [ 0.7632, -0.5265,  0.1595],
        [ 0.7139, -0.6615,  0.1788],
        [-0.1553, -1.1337,  1.7266],
        [ 0.8671, -0.4143, -0.2188],
        [ 0.6490, -0.5981,  0.8925],
        [ 0.5181,  0.3363,  0.1994],
        [-0.2815, -1.3175,  1.8570],
        [ 1.0606, -0.5766, -0.1788],
        [-0.65

(Epoch 5) TRAIN LOSS:0.5723 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:03<00:02,  4.58it/s]

SequenceClassifierOutput(loss=tensor(0.6014, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.4343, -1.1232,  1.3636],
        [ 0.1717, -0.5626,  0.1448],
        [-0.6974, -1.2589,  1.9265],
        [ 0.6170,  0.4236, -0.4293],
        [ 0.3172, -0.6733,  0.9459],
        [ 0.9317, -0.6410,  0.0206],
        [ 0.0180, -1.5719,  1.7589],
        [-0.3639, -1.2027,  1.8308],
        [ 1.4633, -0.4936, -0.3632],
        [-0.0972, -1.6173,  1.5739],
        [ 0.1891, -1.1328,  1.3232],
        [-0.5204, -1.3045,  1.7089],
        [-0.5226, -0.9904,  1.4964],
        [ 0.2324, -1.3051,  1.4693],
        [-0.1899, -1.5481,  1.5731],
        [ 0.5948,  0.6487, -0.2351],
        [-0.3051, -1.2243,  1.8030],
        [ 0.9389, -0.3408, -0.2962],
        [ 0.5519, -0.4529,  0.8997],
        [ 0.9706, -0.2108, -0.1786],
        [ 0.5128,  0.3845,  0.0292],
        [ 0.3457, -0.9961,  0.8137],
        [ 0.6109, -0.7962,  0.7839],
        [-0.2750, -1.4278,  1.7369],
        [ 0.72

(Epoch 5) TRAIN LOSS:0.5733 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:03<00:01,  4.58it/s]

SequenceClassifierOutput(loss=tensor(0.5885, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3411, -1.2949,  2.0472],
        [-0.2021, -0.8998,  1.0738],
        [-0.0888, -0.9504,  1.5032],
        [-0.2899, -1.4462,  1.5486],
        [-0.2872, -1.5896,  1.5975],
        [ 0.6052, -0.7032,  0.5187],
        [-0.5448, -1.0078,  1.9183],
        [ 1.0821, -0.5655, -0.4191],
        [ 0.9925, -0.3368, -0.3237],
        [ 0.1282,  0.0423,  0.3746],
        [-0.1197, -1.1687,  1.6346],
        [ 0.4331,  0.1856, -0.1014],
        [ 1.1639, -0.3929, -0.1363],
        [ 0.5875,  0.7253, -0.1897],
        [ 0.5732, -0.4884,  0.4930],
        [ 0.3645,  0.2379, -0.1460],
        [ 0.3979, -0.5357,  0.9083],
        [ 0.5266, -0.1607,  0.2038],
        [ 0.1062,  0.3434,  0.1937],
        [ 0.4461, -1.2272,  1.1110],
        [ 0.6537, -0.2679, -0.1568],
        [ 0.0252, -1.0364,  1.1465],
        [ 0.7518, -0.1854,  0.2225],
        [-0.3800, -1.3789,  1.7875],
        [-0.28

(Epoch 5) TRAIN LOSS:0.5682 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:03<00:01,  4.58it/s]

SequenceClassifierOutput(loss=tensor(0.4862, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6524, -0.0462, -0.1158],
        [-0.2315, -1.4421,  1.7516],
        [-0.2239, -1.4032,  1.8447],
        [ 0.9754, -0.1213, -0.3349],
        [-0.1133, -1.2296,  1.7555],
        [ 0.5680,  0.1264, -0.2311],
        [-0.4569, -1.4296,  1.9674],
        [-0.1711, -1.4756,  1.5455],
        [ 1.1240, -0.0814, -0.5726],
        [-0.3638, -0.7737,  1.3748],
        [ 0.1557, -1.2481,  1.4530],
        [-0.1697, -1.2321,  1.4633],
        [-0.4158, -0.8389,  2.0091],
        [ 1.1610, -0.2649, -0.5439],
        [-0.4208, -1.5318,  1.8323],
        [-0.3947, -1.5902,  1.8583],
        [ 0.3575, -0.7997,  0.7131],
        [-0.2656, -1.4805,  1.8303],
        [-0.3456, -1.3474,  1.7899],
        [ 1.0213, -0.4035, -0.3405],
        [ 0.5081, -0.3481, -0.1991],
        [ 0.2899,  0.1228, -0.2087],
        [ 0.4847, -0.0801, -0.0963],
        [ 0.5230, -0.1368, -0.2926],
        [-0.14

(Epoch 5) TRAIN LOSS:0.5550 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:03<00:01,  4.68it/s]

SequenceClassifierOutput(loss=tensor(0.3308, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2745, -1.0171,  1.7645],
        [-0.1236, -1.4520,  1.5729],
        [ 0.7441, -0.4583, -0.3503],
        [-0.4702, -0.8098,  1.7005],
        [-0.2251, -1.1021,  1.9020],
        [-0.3288, -1.0772,  1.9757],
        [ 0.8903, -0.3973, -0.0761],
        [-0.4580, -1.0034,  1.6495],
        [-0.1559, -1.2965,  1.6714],
        [-0.4839, -0.7841,  1.6956],
        [ 0.8283, -0.2082,  0.0068],
        [ 0.3108, -0.0682, -0.3125],
        [ 0.8805, -0.3430, -0.2176],
        [-0.0458, -1.0224,  0.8434],
        [ 0.4176, -1.4940,  1.4877],
        [-0.0879, -1.2088,  1.5633],
        [-0.6279, -1.1205,  1.7915],
        [-0.1940, -1.4328,  2.0013],
        [-0.4632, -1.2040,  2.0497],
        [-0.1386, -1.4156,  1.9471],
        [-0.1193, -1.4204,  1.1321],
        [-0.2317, -1.0378,  1.2839],
        [-0.0223, -1.0729,  1.0979],
        [-0.1121, -1.1328,  1.8081],
        [-0.55

(Epoch 5) TRAIN LOSS:0.5564 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:04<00:01,  4.78it/s]

SequenceClassifierOutput(loss=tensor(0.5810, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.1709, -0.9757, -0.2712],
        [ 0.8267, -0.2882,  0.1265],
        [ 0.4877, -0.3653, -0.0873],
        [ 1.1041, -0.0214, -0.7882],
        [ 0.8005, -0.5889,  0.2096],
        [-0.4414, -1.6636,  2.1694],
        [ 0.9309, -0.2810, -0.2161],
        [-0.2766, -0.9604,  1.8088],
        [-0.0479, -0.4907,  0.2799],
        [ 0.0412, -1.5303,  1.7920],
        [-0.6756, -1.3948,  1.7814],
        [-0.0058, -0.9557,  1.4179],
        [ 0.4745, -0.1878, -0.2925],
        [ 0.8545, -0.6248, -0.2235],
        [ 0.5542, -0.0620, -0.2724],
        [-0.3068, -1.5525,  1.9775],
        [ 0.4830, -0.5808,  0.7928],
        [-0.4370, -1.5097,  2.0460],
        [ 0.7305, -0.5299,  0.1201],
        [ 0.2698, -0.2927, -0.3031],
        [ 0.3491, -0.6131,  0.0756],
        [ 1.0454, -0.3362, -0.2284],
        [-0.4619, -1.5630,  2.0277],
        [ 0.9571, -0.2405, -0.1055],
        [ 0.30

(Epoch 5) TRAIN LOSS:0.5573 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:04<00:00,  4.73it/s]

SequenceClassifierOutput(loss=tensor(0.4621, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 8.6292e-01, -5.8122e-01, -3.3430e-01],
        [ 7.8943e-01, -3.9630e-01,  3.2192e-02],
        [-1.6980e-01, -9.8805e-01,  1.1447e+00],
        [-1.6820e-01, -8.6534e-01,  1.5543e+00],
        [ 1.3410e-01, -9.6712e-01,  1.1518e+00],
        [-4.5888e-01, -1.3578e+00,  1.7273e+00],
        [ 2.9876e-01,  7.3978e-02, -2.4838e-01],
        [ 3.8842e-01,  1.3810e-01, -3.6033e-01],
        [-9.3580e-02, -1.6273e+00,  1.7198e+00],
        [-5.7932e-01, -1.2507e+00,  2.3522e+00],
        [-1.4390e-01, -1.2732e+00,  1.5284e+00],
        [-3.0292e-02, -1.6509e+00,  1.8478e+00],
        [-5.4816e-01, -1.0686e+00,  2.0314e+00],
        [-7.0316e-01, -1.5598e+00,  1.9801e+00],
        [-2.7653e-01, -1.1346e+00,  1.4787e+00],
        [-1.2458e-01, -1.4473e+00,  1.4478e+00],
        [ 1.2455e+00, -2.6998e-01,  4.4237e-04],
        [-1.9560e-01, -1.2209e+00,  1.9014e+00],
        [ 1.0622e+00

(Epoch 5) TRAIN LOSS:0.5570 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:04<00:00,  4.27it/s]

SequenceClassifierOutput(loss=tensor(0.5576, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 9.2798e-01, -5.0087e-01, -4.6136e-02],
        [ 1.5450e-01, -1.0373e+00,  1.2945e+00],
        [ 9.1538e-01, -4.8260e-01, -1.2635e-02],
        [ 1.2713e+00, -4.2135e-01, -3.1986e-01],
        [-1.2435e-01, -1.1714e+00,  1.1042e+00],
        [ 6.1309e-01, -1.3377e-02,  1.4099e-01],
        [-5.1500e-01, -1.5506e+00,  2.0440e+00],
        [ 8.8072e-01, -8.0276e-01, -1.6052e-04],
        [-3.8580e-03, -1.2548e+00,  1.5784e+00],
        [-1.3384e-01, -1.0708e+00,  1.8910e+00],
        [-3.3006e-01, -1.0866e+00,  1.6710e+00],
        [ 5.9911e-01,  8.4528e-02,  2.3153e-01],
        [-2.3838e-01, -1.1640e+00,  1.8486e+00],
        [ 8.8751e-01,  1.5010e-02,  3.0827e-01],
        [-4.7043e-01, -1.4184e+00,  1.8995e+00],
        [ 9.4520e-01, -1.6553e-01, -5.3849e-01],
        [ 1.3242e+00, -3.8334e-01,  6.7567e-02],
        [ 7.3788e-01, -2.4799e-01,  3.7312e-01],
        [ 5.8156e-01

(Epoch 5) TRAIN LOSS:0.5570 LR:0.00000300:  92%|████████████████████████████████████▊   | 23/25 [00:05<00:00,  3.59it/s]

SequenceClassifierOutput(loss=tensor(0.5608, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.8146, -0.4958, -0.2320],
        [ 0.6401, -0.7677,  0.6868],
        [-0.0488, -1.0507,  1.4086],
        [-0.3390, -1.4885,  1.7307],
        [-0.2300, -1.3232,  1.8357],
        [ 0.7344,  0.2560, -0.1826],
        [-0.2595, -1.3018,  1.7181],
        [ 0.7231, -0.2548, -0.2515],
        [-0.1397, -1.1206,  1.7195],
        [-0.0772, -1.1275,  1.8122],
        [ 1.4085, -0.2121,  0.0561],
        [ 0.2928, -0.0117, -0.0052],
        [ 0.3632, -0.4577,  0.2147],
        [ 1.1824, -0.0712, -0.4703],
        [ 0.3067,  0.2327,  0.2059],
        [ 0.9071, -0.5603, -0.2860],
        [ 1.0237, -0.1079, -0.1848],
        [ 1.0218, -0.3237, -0.6625],
        [ 0.0167, -1.3966,  1.1761],
        [-0.3528, -1.4873,  1.8800],
        [ 0.2298, -1.1958,  0.9242],
        [-0.3218, -1.1923,  1.4248],
        [-0.4828, -1.4595,  2.0289],
        [ 0.1826, -1.2764,  1.4483],
        [-0.11

(Epoch 5) TRAIN LOSS:0.5572 LR:0.00000300:  96%|██████████████████████████████████████▍ | 24/25 [00:05<00:00,  3.16it/s]

SequenceClassifierOutput(loss=tensor(0.3226, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1239, -1.3235,  1.6099],
        [-0.2582, -1.2631,  1.8543],
        [-0.0247, -1.3334,  1.5242],
        [ 0.0258, -1.1871,  1.4648],
        [-0.5592, -1.1093,  2.1185],
        [-0.2351, -1.3931,  1.9347],
        [ 0.3864, -0.9046,  0.3532],
        [ 1.1582, -0.2267, -0.0526],
        [-0.5222, -1.0040,  1.7083],
        [-0.1006, -0.8048,  1.6124],
        [ 1.4725, -0.6485, -0.6837],
        [-0.0304, -1.5294,  1.6199],
        [ 0.0226, -1.2464,  1.1228],
        [-0.1545, -1.0312,  1.9975],
        [ 0.1185, -0.5391,  0.7588],
        [ 0.8941, -0.8006, -0.0158],
        [-0.0981, -1.3666,  1.8890],
        [ 0.0678, -1.4067,  1.4495],
        [ 0.1069, -1.0334,  1.6373],
        [-0.2600, -1.3107,  1.7459],
        [-0.7246, -1.3760,  2.1557],
        [ 0.6285, -0.8241,  0.2885],
        [-0.8863, -1.5066,  2.1647],
        [ 1.3213, -0.5782, -0.7730],
        [-0.09

(Epoch 5) TRAIN LOSS:0.5478 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:06<00:00,  4.07it/s]

(Epoch 5) TRAIN LOSS:0.5478 ACC:0.80 F1:0.62 REC:0.61 PRE:0.80 LR:0.00000300



  0%|                                                                                            | 0/25 [00:00<?, ?it/s]

SequenceClassifierOutput(loss=tensor(0.4638, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0550, -1.1721,  1.5863],
        [-0.6265, -1.3269,  1.9321],
        [-0.2866, -1.1767,  1.4773],
        [ 0.5093, -1.4224,  0.9024],
        [-0.5640, -1.4066,  2.2198],
        [-0.4250, -1.0955,  1.8076],
        [-0.0332, -1.4328,  1.9556],
        [-0.4275, -1.1501,  1.7343],
        [ 0.9091, -0.3770,  0.0603],
        [ 1.2630, -0.2534,  0.0551],
        [ 0.5615, -0.8034,  0.5986],
        [-0.3360, -1.4574,  1.7275],
        [-0.1831, -1.1296,  1.2053],
        [-0.5810, -1.1695,  2.0522],
        [-0.2841, -1.4310,  1.5195],
        [ 0.5949, -0.0329,  0.1396],
        [ 0.3817, -0.1459, -0.2616],
        [-0.1885, -1.7128,  1.7975],
        [ 0.1049, -1.3325,  1.5019],
        [-0.3764, -1.4564,  1.8445],
        [-0.5075, -1.3079,  2.0162],
        [-0.3899, -1.1531,  1.5991],
        [-0.3379, -0.9744,  1.7787],
        [-0.0787, -1.2052,  1.5380],
        [ 0.46

(Epoch 6) TRAIN LOSS:0.4638 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:11,  2.08it/s]

SequenceClassifierOutput(loss=tensor(0.4927, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3866, -1.2546,  1.7808],
        [ 1.0298, -0.8253,  0.0963],
        [-0.1558, -1.1459,  1.2012],
        [ 0.7238, -1.3867,  0.8998],
        [ 0.0098, -0.9585,  0.7746],
        [ 0.0261, -1.3799,  1.2916],
        [ 0.1549, -1.2288,  1.2200],
        [ 0.4381,  0.1121,  0.1037],
        [ 0.4175,  0.2482,  0.2180],
        [-0.7752, -1.2487,  1.9328],
        [-0.5754, -1.0983,  1.9146],
        [ 1.0556, -0.5142, -0.3742],
        [ 0.2640, -0.7827,  0.8379],
        [-0.3402, -1.2647,  1.8335],
        [-0.3839, -1.4011,  1.9554],
        [ 0.6436,  0.8365, -0.6067],
        [-0.6421, -0.9576,  1.7428],
        [-0.1877, -1.3070,  1.0384],
        [ 0.8777, -1.1689,  0.6133],
        [-0.1514, -1.4779,  1.8658],
        [ 0.7182, -0.3896, -0.0057],
        [-0.2185, -1.1139,  1.3491],
        [ 0.5144, -0.4899,  0.2385],
        [-0.2262, -1.1118,  1.9327],
        [-0.26

(Epoch 6) TRAIN LOSS:0.4783 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:09,  2.31it/s]

SequenceClassifierOutput(loss=tensor(0.4360, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6881, -0.0741, -0.2506],
        [ 0.0033, -1.1345,  1.1557],
        [ 0.8458, -0.6830, -0.1407],
        [-0.5722, -1.0461,  1.8704],
        [-0.0806, -1.3646,  1.9429],
        [-0.1438, -1.5759,  1.8441],
        [ 1.0631, -0.4561, -0.2478],
        [-0.0886, -1.0860,  1.9020],
        [ 0.2775,  0.3426,  0.0256],
        [ 0.6412, -0.8885,  0.0664],
        [ 0.9309, -0.4911, -0.3014],
        [ 0.7356, -0.6892,  0.4671],
        [ 0.9997, -0.7119,  0.1763],
        [ 0.2736,  0.5197, -0.1634],
        [ 0.7628, -0.1896, -0.3766],
        [-0.3931, -1.7282,  1.9330],
        [-0.4948, -1.6521,  1.8727],
        [-0.3074, -1.2379,  2.0066],
        [-0.4771, -1.6293,  1.9458],
        [-0.4981, -1.4013,  1.9021],
        [ 0.3359, -0.9346,  0.8998],
        [-0.2547, -1.2312,  1.6913],
        [-0.2477, -1.1676,  1.7426],
        [-0.4753, -1.6242,  2.0450],
        [-0.39

(Epoch 6) TRAIN LOSS:0.4642 LR:0.00000300:  12%|████▉                                    | 3/25 [00:01<00:09,  2.36it/s]

SequenceClassifierOutput(loss=tensor(0.5537, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 4.5198e-01, -4.8817e-01,  2.1426e-01],
        [ 3.6540e-01, -1.6053e+00,  1.4665e+00],
        [ 1.1344e+00, -2.5327e-01, -2.9981e-01],
        [ 3.9941e-01, -5.2382e-01,  4.4748e-01],
        [ 6.9307e-01, -1.5028e+00,  1.2155e+00],
        [-4.6469e-02, -6.5969e-01,  2.7588e-01],
        [ 5.7203e-01, -7.6667e-01,  3.2901e-01],
        [-6.2194e-01, -1.5149e+00,  1.8261e+00],
        [ 7.1763e-01, -2.0825e-01, -5.6655e-02],
        [ 4.2493e-01, -9.4355e-01,  1.0532e+00],
        [ 1.0964e+00, -4.9062e-01, -7.6641e-02],
        [ 5.1932e-01, -3.6093e-01, -3.1395e-01],
        [-2.3324e-01, -1.0664e+00,  1.4614e+00],
        [-5.3543e-02, -1.3762e+00,  1.2704e+00],
        [-3.6203e-02, -1.3421e+00,  1.8457e+00],
        [-4.2110e-01, -1.6909e+00,  2.0115e+00],
        [-3.0091e-02, -1.0119e+00,  1.5959e+00],
        [ 5.7402e-01,  2.8677e-01, -5.0366e-01],
        [ 9.3933e-01

(Epoch 6) TRAIN LOSS:0.4865 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:01<00:08,  2.54it/s]

SequenceClassifierOutput(loss=tensor(0.5834, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8386, -0.8316,  0.3433],
        [-0.1495, -1.1498,  1.7159],
        [-0.1938, -1.0882,  1.7155],
        [-0.1440, -1.3793,  1.0807],
        [ 0.0400, -1.3165,  1.7605],
        [ 0.0193, -1.1830,  1.4654],
        [ 0.7034,  0.0558,  0.0560],
        [ 0.0851, -1.3884,  1.8371],
        [ 0.9852, -0.5710,  0.0635],
        [-0.2888, -0.9473,  1.7586],
        [ 1.7269, -0.4485, -0.3400],
        [-0.3969, -1.5530,  1.9188],
        [-0.2041, -1.3721,  1.7787],
        [ 0.4079, -1.3201,  0.9935],
        [-0.2406, -1.1288,  1.1179],
        [ 0.7327, -0.6781,  0.0091],
        [ 0.0952, -1.2282,  1.6535],
        [-0.4923, -1.4798,  2.0656],
        [ 0.4474,  0.4716, -0.5612],
        [ 1.1597, -0.7714, -0.2170],
        [-0.4383, -0.7547,  1.5374],
        [-0.4511, -1.3361,  1.7875],
        [ 0.1743, -0.1191,  0.4147],
        [-0.1315, -1.4468,  1.6982],
        [ 1.13

(Epoch 6) TRAIN LOSS:0.5059 LR:0.00000300:  20%|████████▏                                | 5/25 [00:02<00:07,  2.57it/s]

SequenceClassifierOutput(loss=tensor(0.6056, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.9285, -0.4309, -0.1378],
        [ 0.3271,  0.3860, -0.2946],
        [ 0.9252, -0.1521, -0.2598],
        [-0.6952, -1.3647,  1.8228],
        [ 1.1179, -0.1120, -0.4850],
        [ 0.9893, -0.0682, -0.3955],
        [ 0.2948, -0.3334,  0.0140],
        [ 0.5698, -0.7552,  0.1874],
        [-0.5501, -1.2716,  2.0347],
        [ 0.3881,  0.5167, -0.5064],
        [ 0.7284,  0.3293, -0.1577],
        [ 1.5543, -0.5408, -0.3396],
        [-0.0855, -0.9953,  1.7748],
        [ 0.2730,  0.1567, -0.0309],
        [ 0.5571,  0.0406, -0.2360],
        [-0.2703, -1.3092,  1.2445],
        [ 0.6482, -0.9910,  0.2877],
        [-0.3656, -1.4113,  2.0526],
        [-0.6851, -1.3237,  2.1168],
        [-0.4787, -0.9640,  1.7075],
        [-0.7664, -1.3777,  2.1918],
        [-0.5791, -1.1977,  1.5364],
        [-0.1423, -1.1675,  1.7788],
        [ 0.2469, -0.3785,  0.2353],
        [ 0.61

(Epoch 6) TRAIN LOSS:0.5225 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:02<00:07,  2.60it/s]

SequenceClassifierOutput(loss=tensor(0.5642, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4979,  0.4791, -0.3867],
        [-0.3175, -1.2514,  2.1974],
        [-0.5234, -1.2541,  1.9810],
        [ 0.2648, -1.1755,  1.3870],
        [-0.4951, -1.4045,  1.8931],
        [-0.0796, -1.3388,  1.6992],
        [ 0.9642, -0.4438, -0.4272],
        [ 0.9464,  0.0681, -0.1812],
        [-0.1581, -1.3933,  1.6541],
        [-0.2507, -1.4484,  2.1128],
        [ 0.7672,  0.3081, -0.5326],
        [-0.1690, -1.0382,  1.2122],
        [ 0.7043,  0.1324, -0.1485],
        [-0.1756, -1.2456,  1.6374],
        [ 0.0402, -1.2942,  1.1943],
        [-0.3106, -1.5091,  1.9602],
        [-0.5108, -1.2452,  2.0354],
        [ 1.0562, -0.8102, -0.1221],
        [-0.5942, -1.0359,  1.6852],
        [ 0.4325,  0.2500, -0.3644],
        [-0.0205, -1.1775,  1.7317],
        [ 1.6078, -0.0197, -0.4481],
        [ 0.7637, -0.9599,  0.0055],
        [ 0.2922, -0.5208,  0.4232],
        [ 0.63

(Epoch 6) TRAIN LOSS:0.5225 LR:0.00000300:  32%|█████████████                            | 8/25 [00:03<00:06,  2.82it/s]

SequenceClassifierOutput(loss=tensor(0.4804, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1058, -1.0335,  1.4482],
        [ 0.8674, -0.2689,  0.1464],
        [-0.3989, -0.9635,  2.0101],
        [-0.4789, -1.3892,  2.1320],
        [-0.1436, -1.5885,  1.7572],
        [ 0.0772, -0.7332,  1.5693],
        [ 0.3124, -1.0616,  1.4732],
        [ 0.4117, -0.1557,  0.2930],
        [ 1.0628, -0.6719, -0.0821],
        [ 0.0610, -1.2443,  1.5059],
        [-0.4446, -1.4762,  1.8345],
        [-0.5586, -1.2715,  1.7042],
        [ 0.5120, -0.4493,  0.5780],
        [ 0.3302, -1.0729,  1.0175],
        [-0.4647, -1.1318,  1.8314],
        [ 0.8300, -0.2557, -0.1688],
        [-0.3168, -0.9158,  1.4017],
        [-0.3256, -1.1963,  1.7412],
        [ 0.0579, -1.1986,  1.7407],
        [-0.6142, -1.2398,  1.8769],
        [ 1.1999, -0.3167, -0.5267],
        [ 1.2433, -0.1236, -0.1689],
        [ 0.0260, -1.4770,  1.7423],
        [ 1.2911, -0.7524, -0.0899],
        [-0.01

(Epoch 6) TRAIN LOSS:0.5257 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:03<00:04,  3.33it/s]

SequenceClassifierOutput(loss=tensor(0.5518, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7334,  0.0026, -0.2270],
        [ 0.2213, -0.2179,  0.6849],
        [ 0.9263, -0.1991, -0.3233],
        [ 0.6620, -0.5448,  0.4082],
        [-0.6159, -1.2862,  2.2102],
        [ 1.1083, -0.2026, -0.2560],
        [ 0.0143, -1.5665,  1.4485],
        [ 0.2745, -0.6438,  0.7247],
        [-0.4454, -1.3715,  1.6144],
        [-0.3643, -1.0458,  1.8362],
        [-0.0252, -1.0527,  1.2048],
        [ 0.9344, -0.3080, -0.0722],
        [ 0.6232, -0.4309, -0.0484],
        [-0.4669, -1.4873,  1.5237],
        [ 1.4631, -0.7765, -0.4517],
        [ 0.2213, -0.9146,  1.4591],
        [-0.3821, -1.1679,  1.4520],
        [-0.1882, -1.1170,  1.7083],
        [ 0.1549, -1.3582,  1.7929],
        [-0.5739, -1.6354,  1.7551],
        [ 0.1746,  0.3176,  0.2238],
        [-0.5748, -1.3622,  2.1000],
        [-0.2864, -0.5699,  0.4344],
        [-0.4766, -0.9013,  1.3696],
        [-0.11

(Epoch 6) TRAIN LOSS:0.5265 LR:0.00000300:  40%|████████████████                        | 10/25 [00:03<00:04,  3.59it/s]

SequenceClassifierOutput(loss=tensor(0.5333, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0586, -1.3896,  1.2153],
        [ 0.2319, -0.2521,  0.0671],
        [ 0.4371, -1.0638,  1.1224],
        [ 0.1182, -1.2265,  0.9226],
        [ 0.9553, -0.4851,  0.2210],
        [-0.2550, -1.4245,  1.7375],
        [-0.3513, -1.4471,  1.8399],
        [ 1.3992, -0.3748, -0.1473],
        [-0.0658, -1.4051,  1.8768],
        [ 1.6188, -0.5482, -0.4443],
        [-0.1157, -1.1956,  2.0739],
        [-0.2035, -1.0822,  1.7887],
        [ 0.6917, -0.4579,  0.6013],
        [ 0.0917, -1.0230,  1.1207],
        [ 1.0445, -0.4180,  0.4713],
        [-0.7757, -1.4497,  2.0493],
        [ 0.2815, -0.4513,  0.0851],
        [ 0.4561,  0.5265, -0.1112],
        [ 0.0793, -1.1483,  2.0361],
        [ 1.3965, -0.6050, -0.1122],
        [ 1.4734, -0.6994, -0.4078],
        [-0.4821, -1.2840,  1.8749],
        [-0.1425, -0.6522,  0.9981],
        [ 1.0500, -0.5098, -0.1178],
        [ 0.08

(Epoch 6) TRAIN LOSS:0.5301 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:03<00:03,  4.01it/s]

SequenceClassifierOutput(loss=tensor(0.5662, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.6503, -0.9572,  1.4337],
        [ 1.0369, -0.9288,  0.3138],
        [ 0.2613, -0.3220,  0.6024],
        [ 0.3080, -1.4010,  1.5669],
        [ 0.2047, -1.5300,  1.7195],
        [ 0.0425, -1.4743,  1.7590],
        [ 1.1201, -0.0036, -0.1672],
        [-0.4840, -1.4117,  1.5791],
        [-0.1265, -1.1237,  1.4486],
        [ 0.5757,  0.4332, -0.4221],
        [ 0.9790, -0.2845, -0.3931],
        [-0.2037, -1.3426,  1.9501],
        [-0.3093, -1.4962,  1.9596],
        [ 0.4454, -0.2914, -0.1126],
        [-0.4784, -1.6582,  1.9908],
        [ 0.2549, -0.5521,  0.6392],
        [ 0.6740,  0.2249,  0.0667],
        [ 0.4799, -0.2131,  0.0279],
        [-0.6179, -1.2070,  2.1452],
        [ 0.6142, -0.4095, -0.0828],
        [-0.4161, -1.0804,  1.8718],
        [-0.7886, -1.3936,  1.8655],
        [-0.3286, -1.0679,  2.1431],
        [ 0.2135, -0.7899,  1.0423],
        [ 0.23

(Epoch 6) TRAIN LOSS:0.5211 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:04<00:02,  4.42it/s]

SequenceClassifierOutput(loss=tensor(0.5552, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0558, -1.1897,  1.4114],
        [-0.2568, -0.8264,  1.2804],
        [-0.5676, -0.8931,  1.7420],
        [-0.0977, -1.5998,  1.9585],
        [ 1.1620, -0.8188, -0.4065],
        [ 1.2063, -0.4428, -0.0568],
        [-0.2429, -1.2146,  1.7050],
        [ 1.0944, -0.5286, -0.4468],
        [ 0.5461, -0.0226, -0.7445],
        [ 0.2913,  0.1779, -0.3642],
        [ 1.0772, -0.0571, -0.7809],
        [ 0.1957, -1.4738,  1.9238],
        [ 0.7836, -0.0372, -0.0216],
        [ 0.8530,  0.0465, -0.4045],
        [ 0.8486, -0.5139, -0.2990],
        [-0.2920, -1.1321,  1.9823],
        [ 0.4325, -1.1858,  0.9172],
        [ 0.4016,  1.2438, -0.3215],
        [ 0.2421, -1.5712,  1.4215],
        [-0.0908, -1.4151,  1.6441],
        [ 0.8409, -0.4112, -0.0642],
        [ 0.9015, -0.6526, -0.0711],
        [-0.5237, -1.3500,  1.9471],
        [ 0.9118, -0.2031, -0.7258],
        [ 0.72

(Epoch 6) TRAIN LOSS:0.5181 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:04<00:02,  4.50it/s]

SequenceClassifierOutput(loss=tensor(0.4796, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0770, -1.1539,  1.1014],
        [ 0.8142, -0.8722,  0.1430],
        [ 1.5524, -0.7243, -0.3811],
        [-0.5388, -1.4741,  2.1378],
        [-0.5561, -1.5835,  2.1200],
        [ 0.2326, -1.2551,  1.6636],
        [ 0.1386, -0.9745,  1.2085],
        [-0.1896, -1.3066,  1.7922],
        [-0.5256, -1.1081,  1.7622],
        [ 1.1836, -0.0125, -0.4232],
        [-0.0288, -0.9324,  1.3434],
        [-0.4599, -0.7796,  1.6752],
        [ 0.2087, -1.5921,  1.3425],
        [ 0.9525, -0.5950,  0.5750],
        [ 0.5466, -0.3938, -0.3041],
        [-0.2076, -1.5412,  1.9859],
        [ 1.3337, -0.4757, -0.2539],
        [-0.2126, -1.2459,  1.2597],
        [ 0.1970, -1.1177,  1.0405],
        [-0.7154, -1.5140,  2.0701],
        [-0.3184, -0.9968,  1.6202],
        [ 0.9810, -0.6128,  0.2566],
        [-0.2725, -1.5288,  1.8824],
        [ 0.2547, -1.1896,  1.8521],
        [-0.31

(Epoch 6) TRAIN LOSS:0.5170 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:04<00:01,  4.56it/s]

SequenceClassifierOutput(loss=tensor(0.4991, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.4965, -0.8918,  1.4977],
        [-0.6749, -1.5437,  2.0570],
        [ 0.8379,  0.1998, -0.3424],
        [-0.2000, -1.4924,  1.5458],
        [-0.2239, -1.5405,  1.7202],
        [-0.4426, -1.1535,  1.4348],
        [-0.7624, -1.3507,  2.1734],
        [ 0.2407,  0.0183, -0.0174],
        [-0.2828, -1.1320,  1.7662],
        [ 1.1833, -0.5901,  0.0605],
        [ 0.4687, -1.3386,  1.5196],
        [ 0.9986, -0.5896,  0.1288],
        [ 0.9756, -0.1229,  0.0728],
        [ 1.4195, -0.5003, -0.3459],
        [-0.2518, -1.1908,  2.1834],
        [-0.1773, -1.1300,  1.5981],
        [ 0.9286, -0.7547, -0.4100],
        [-0.4221, -1.4808,  1.8915],
        [ 0.4725, -0.0102,  0.1760],
        [ 0.4441, -1.3556,  0.7603],
        [ 1.4108, -0.3108, -0.3498],
        [ 1.5306, -0.5167, -0.7481],
        [-0.1800, -1.5866,  1.9169],
        [-0.4237, -1.3498,  2.0524],
        [ 1.13

(Epoch 6) TRAIN LOSS:0.5179 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:04<00:01,  4.50it/s]

SequenceClassifierOutput(loss=tensor(0.5324, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1893, -0.6112,  0.5451],
        [ 0.8421, -0.1288, -0.4863],
        [ 0.7506, -0.3705, -0.0872],
        [-0.5690, -1.1860,  2.0977],
        [-0.7916, -1.2597,  1.8363],
        [ 1.0744, -0.2426, -0.3162],
        [ 0.7474, -0.0516, -0.2546],
        [ 0.1213, -1.3003,  1.2698],
        [-0.0786, -1.5197,  1.8327],
        [ 0.9462,  0.1488, -0.5692],
        [-0.7097, -1.5164,  2.1293],
        [ 1.4122, -0.4959, -0.2291],
        [ 0.7202, -0.2312, -0.2264],
        [ 0.4385,  0.2120, -0.2933],
        [ 0.9071, -0.2411, -0.2016],
        [-0.6251, -1.2765,  1.9049],
        [ 0.8451, -0.2458,  0.1072],
        [-0.1644, -1.4722,  1.3014],
        [ 0.2351, -0.1154,  0.2300],
        [-0.3359, -1.5593,  1.9304],
        [-0.2996, -1.3692,  1.5495],
        [ 0.0699, -1.3050,  1.4543],
        [-0.3132, -1.5234,  2.0541],
        [ 0.1824, -1.0280,  1.1810],
        [-0.68

(Epoch 6) TRAIN LOSS:0.5171 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:05<00:01,  4.60it/s]

SequenceClassifierOutput(loss=tensor(0.5039, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2768, -0.3377,  0.0332],
        [ 0.5462, -0.5365, -0.3578],
        [ 0.0730, -1.3348,  1.3519],
        [-0.2012, -1.3304,  2.1317],
        [ 1.4046, -0.3984, -0.7163],
        [ 0.5022, -0.2208, -0.1082],
        [ 1.1054, -0.2476, -0.1919],
        [ 0.7813,  0.1362, -0.5739],
        [ 0.9490, -0.5923,  0.2268],
        [-0.6389, -1.4111,  1.7275],
        [-0.6366, -1.1997,  1.8225],
        [-0.2362, -1.4137,  2.2795],
        [-0.5330, -1.3552,  1.7312],
        [-0.3197, -1.5069,  1.7823],
        [ 0.9378, -0.6599,  0.0130],
        [ 0.8677,  0.0082, -0.3048],
        [-0.4869, -1.3692,  1.8235],
        [-0.4327, -1.0797,  2.0613],
        [ 0.1750, -1.4700,  1.4034],
        [ 0.9010, -0.8978,  0.0496],
        [ 0.6988, -0.6439, -0.3017],
        [-0.7117, -1.5433,  1.9178],
        [ 0.7472, -0.1131, -0.2959],
        [ 0.6415,  0.7739, -0.4632],
        [-0.48

(Epoch 6) TRAIN LOSS:0.5202 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:05<00:01,  4.52it/s]

SequenceClassifierOutput(loss=tensor(0.5749, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3657, -1.3615,  2.0279],
        [-0.6423, -1.4608,  1.9063],
        [ 0.3732, -0.2848, -0.1986],
        [ 0.2177,  0.0728, -0.2890],
        [-0.4153, -1.3479,  2.0949],
        [-0.6817, -0.5845,  1.7628],
        [ 0.4412, -0.2488,  0.1576],
        [-0.3645, -1.5718,  1.8932],
        [ 0.4700, -0.8620,  0.6824],
        [-0.3450, -1.5046,  1.6971],
        [ 0.2544, -0.7254,  0.8591],
        [ 0.4262,  0.3013, -0.6112],
        [-0.4054, -1.3291,  1.8062],
        [-0.2022, -1.1761,  1.7228],
        [ 0.9018, -0.4746, -0.4636],
        [ 0.1633, -1.1171,  1.7159],
        [ 0.9094,  0.2469, -0.1004],
        [-0.0078,  0.2672,  0.1497],
        [ 0.5273,  0.6954, -0.6899],
        [-0.6435, -1.2728,  2.1051],
        [ 0.6418, -0.1431, -0.3340],
        [ 1.8232, -0.7648, -0.2454],
        [ 0.6640,  0.2213, -0.2321],
        [ 1.2222,  0.0237, -0.1791],
        [ 1.27

(Epoch 6) TRAIN LOSS:0.5146 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:05<00:01,  4.56it/s]

SequenceClassifierOutput(loss=tensor(0.4090, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0769, -0.3418, -0.2967],
        [-0.2338, -1.4021,  1.8904],
        [-0.6782, -0.9486,  1.7942],
        [ 0.8817, -0.8244,  0.1769],
        [-0.4027, -1.3928,  1.6834],
        [ 1.0743, -0.7091, -0.0455],
        [ 0.7323,  0.1314, -0.3034],
        [-0.3345, -1.2174,  1.6744],
        [-0.7574, -1.5482,  2.0311],
        [ 0.8268, -0.4581, -0.8288],
        [ 1.1375, -0.7038,  0.1962],
        [-0.2350, -1.4991,  1.8536],
        [-0.4022, -1.7843,  1.9086],
        [ 0.2792,  0.6292, -0.5218],
        [-0.2868, -1.5279,  1.8582],
        [ 0.7991, -0.4573,  0.5548],
        [-0.2511, -1.2489,  1.5584],
        [ 0.0270, -1.3543,  2.0515],
        [ 0.5671, -0.9134,  0.6497],
        [-0.4647, -1.1335,  1.6489],
        [-0.5107, -1.3143,  2.0405],
        [ 0.4335, -0.9825,  0.7355],
        [-0.5773, -1.3657,  2.1001],
        [ 1.3129, -0.2525, -0.1246],
        [-0.30

(Epoch 6) TRAIN LOSS:0.5111 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:05<00:00,  4.71it/s]

SequenceClassifierOutput(loss=tensor(0.4421, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5192,  0.1732,  0.0454],
        [ 0.8700,  0.3546, -0.4922],
        [ 0.9451, -0.6231,  0.3401],
        [-0.3644, -1.2292,  1.9936],
        [-0.1571, -1.0275,  1.4054],
        [-0.6573, -1.4446,  2.1665],
        [ 0.3897, -1.1562,  1.1657],
        [-0.6356, -1.3909,  1.9681],
        [ 1.0866, -0.4507, -0.1794],
        [ 0.6662, -0.1905, -0.4177],
        [ 0.2575,  0.4600,  0.0856],
        [-0.2247, -1.3608,  1.8784],
        [ 1.1477,  0.0933, -0.4657],
        [ 1.1109, -0.8400,  0.2888],
        [ 0.9530,  0.1178, -0.2090],
        [ 0.6565,  1.0680, -0.2685],
        [-0.6911, -0.8923,  1.8896],
        [ 0.3578, -0.7210,  0.1946],
        [ 1.1283, -0.4365, -0.6849],
        [-0.5201, -1.6376,  2.1781],
        [-0.4199, -1.3300,  1.6796],
        [-0.4704, -1.3967,  1.5740],
        [ 0.7504, -0.4996,  0.0952],
        [ 1.4713, -0.4634, -0.6284],
        [-0.11

(Epoch 6) TRAIN LOSS:0.5066 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:06<00:00,  4.60it/s]

SequenceClassifierOutput(loss=tensor(0.4110, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.5074, -1.1978,  1.9966],
        [ 0.6948, -0.3322,  0.1901],
        [-0.0876, -0.6832,  0.4248],
        [ 0.7757, -0.9668,  1.1770],
        [-0.6216, -1.5433,  2.1702],
        [-0.5623, -1.4273,  2.0654],
        [-0.7139, -1.4937,  2.0818],
        [-0.6612, -1.4674,  2.3281],
        [-0.7533, -1.5396,  2.2885],
        [ 1.0093, -0.2961, -0.2168],
        [-0.1779, -1.0702,  1.5135],
        [-0.4487, -1.5009,  2.0989],
        [-0.5673, -1.1876,  1.4294],
        [-0.7671, -0.7820,  1.8742],
        [ 1.0091, -0.4255, -0.3796],
        [-0.3889, -0.9646,  1.5958],
        [ 0.0652, -1.2119,  1.3281],
        [-0.5934, -1.5158,  2.1513],
        [ 0.0420,  0.2599, -0.0728],
        [ 0.7125,  0.0414, -0.5163],
        [ 0.7477, -0.2572, -0.0409],
        [-0.7208, -1.4962,  2.1284],
        [-0.3026, -1.3056,  1.9865],
        [-0.3053, -1.5190,  2.0470],
        [-0.56

(Epoch 6) TRAIN LOSS:0.5027 LR:0.00000300:  92%|████████████████████████████████████▊   | 23/25 [00:06<00:00,  4.82it/s]

SequenceClassifierOutput(loss=tensor(0.4168, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2399, -1.4174,  1.8064],
        [ 0.0442, -1.5484,  2.0185],
        [ 0.9906,  0.1948, -0.1188],
        [ 0.5842, -0.2439,  0.3684],
        [-0.4190, -1.6546,  1.8647],
        [-0.1649, -1.4278,  1.9877],
        [-0.3884, -1.7035,  2.1157],
        [-0.4453, -1.1933,  1.7238],
        [-0.4156, -1.3703,  1.6574],
        [-0.4957, -1.0048,  1.7321],
        [-0.5057, -1.6802,  2.0281],
        [-0.9172, -1.6406,  2.0293],
        [ 0.6373, -0.3345, -0.3099],
        [ 0.1776, -0.0232,  0.1402],
        [ 1.1892, -0.4296, -0.1209],
        [ 0.9189, -0.5111, -0.1458],
        [-0.5604, -1.1460,  1.9620],
        [-0.2500, -1.4109,  1.8138],
        [-0.4838, -1.1653,  1.8113],
        [ 1.0276, -0.2904, -0.2952],
        [-0.3812, -1.3453,  2.2362],
        [-0.3242, -1.7839,  2.0339],
        [-0.6203, -1.3430,  1.6511],
        [-0.4573, -1.3155,  2.0849],
        [ 1.22

(Epoch 6) TRAIN LOSS:0.4998 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:06<00:00,  4.84it/s]

SequenceClassifierOutput(loss=tensor(0.3745, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1179, -1.3899,  1.9192],
        [ 0.8518, -0.4942,  0.3883],
        [-0.6208, -0.9491,  1.7086],
        [-0.6116, -1.6726,  1.8508],
        [ 0.2037,  0.1467, -0.2655],
        [ 0.3607, -0.7145,  0.3250],
        [ 0.3628, -1.2158,  0.6960],
        [-0.5078, -1.1798,  1.8215],
        [-0.7196, -1.4412,  2.2346],
        [ 0.2033, -0.4926,  0.7739],
        [ 0.0570, -0.9553,  1.1992],
        [ 0.5174, -0.2654, -0.1849],
        [-0.4732, -1.3361,  1.9431],
        [ 0.9698, -0.2211, -0.3708],
        [-0.4543, -1.4084,  1.6598],
        [-0.3608, -1.5454,  1.7622],
        [ 0.9042, -1.1211,  0.5421],
        [ 1.0504, -0.0924, -0.4551],
        [-0.5186, -0.8886,  1.5835],
        [ 0.5975,  0.2783, -0.3567],
        [-0.2558, -1.4533,  1.8924],
        [ 0.4549, -0.7948,  0.0268],
        [-0.0117, -0.8757,  1.0692],
        [-0.0467, -0.5768,  1.5930],
        [-0.57

(Epoch 6) TRAIN LOSS:0.4998 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:06<00:00,  3.72it/s]

(Epoch 6) TRAIN LOSS:0.4998 ACC:0.82 F1:0.70 REC:0.67 PRE:0.84 LR:0.00000300



(Epoch 7) TRAIN LOSS:0.4305 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:06,  3.94it/s]

SequenceClassifierOutput(loss=tensor(0.4305, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3061, -1.2725,  1.3540],
        [ 0.0310, -1.6019,  1.4971],
        [-0.6533, -1.3527,  2.2538],
        [ 0.5611,  0.0895, -0.4163],
        [-0.1808, -1.4472,  2.0116],
        [-0.5286, -1.4547,  1.9287],
        [-0.4914, -1.4163,  2.2138],
        [ 0.2600, -1.2756,  1.2427],
        [ 0.8899, -0.9983,  0.3296],
        [-0.3653, -1.2231,  1.6306],
        [ 0.4562, -0.1237, -0.5180],
        [-0.4506, -1.2902,  2.1142],
        [-0.2310, -1.2290,  1.8675],
        [ 0.2746,  0.4481, -0.5839],
        [-0.6782, -1.4582,  2.2315],
        [ 0.6275, -0.4181, -0.0401],
        [-0.4843, -1.2850,  2.0246],
        [-0.5019, -1.2267,  1.7989],
        [ 0.9051, -0.8177,  0.6356],
        [-0.1847, -1.2456,  1.8681],
        [ 1.6975,  0.1373, -0.5650],
        [ 0.1641, -0.1582,  0.2732],
        [ 0.5218,  0.6269, -0.5526],
        [-0.7016, -1.0991,  2.2756],
        [ 1.28

(Epoch 7) TRAIN LOSS:0.4194 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:05,  4.35it/s]

SequenceClassifierOutput(loss=tensor(0.4083, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.5646, -1.5020,  2.0691],
        [ 1.1382, -0.0922, -0.2866],
        [ 0.0770, -0.0848,  0.3754],
        [ 0.7748, -1.3874,  1.1908],
        [ 0.5315, -0.4355, -0.1992],
        [-0.6583, -1.4858,  2.1361],
        [-0.2328, -1.1071,  1.7095],
        [ 1.2878, -0.2702, -0.4438],
        [ 1.3430, -0.2541, -0.5249],
        [ 0.3271, -1.4202,  1.1674],
        [ 0.9838, -0.6364, -0.1563],
        [-0.1666, -1.5336,  1.6354],
        [-0.4250, -1.0775,  2.1174],
        [ 0.0178, -1.4117,  1.4562],
        [-0.5485, -1.4425,  2.1852],
        [ 0.2187, -1.4687,  1.6713],
        [ 0.0925, -0.4919,  0.3218],
        [-0.0858, -0.8488,  1.1882],
        [ 0.9157,  0.0445, -0.3774],
        [-0.3882, -1.0795,  1.8021],
        [ 0.9130, -1.2550,  0.8445],
        [-0.7850, -1.4042,  2.1425],
        [-0.7376, -1.0230,  2.1459],
        [-0.3432, -1.2162,  2.1650],
        [-0.03

(Epoch 7) TRAIN LOSS:0.3886 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:05,  4.38it/s]

SequenceClassifierOutput(loss=tensor(0.3270, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1662, -1.5064,  2.1467],
        [-0.4711, -1.3260,  1.6988],
        [ 1.2387, -0.3826, -0.4272],
        [ 0.3766, -0.2381, -0.3102],
        [-0.1523, -0.9117,  1.2450],
        [-0.6127, -1.6164,  2.2021],
        [ 0.9831, -0.6444,  0.1346],
        [-0.4571, -1.1099,  1.7164],
        [-0.3664, -1.6664,  2.3697],
        [ 0.1859, -0.9701,  1.5742],
        [-0.7450, -1.0991,  1.9439],
        [-0.3748, -1.1994,  1.6658],
        [ 0.6247, -0.6839,  0.1370],
        [-0.3894, -1.1944,  1.8288],
        [ 1.1352, -0.1114, -0.6896],
        [-0.6251, -1.4038,  2.1899],
        [ 1.1841, -0.1923, -0.2899],
        [-0.3509, -1.2057,  1.9451],
        [-0.5693, -1.5205,  2.2912],
        [ 1.3118, -0.7273, -0.1567],
        [-0.4534, -1.5432,  1.6808],
        [ 0.7669, -0.8027,  0.0248],
        [ 1.2729, -0.0678, -0.4850],
        [ 1.5084, -0.7162, -0.5967],
        [ 1.20

(Epoch 7) TRAIN LOSS:0.3764 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:00<00:04,  4.76it/s]

SequenceClassifierOutput(loss=tensor(0.3397, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0940, -1.6828,  1.6756],
        [ 1.3586, -0.4597, -0.4535],
        [-0.4949, -1.1802,  1.9109],
        [-0.7575, -1.2978,  2.0396],
        [ 0.9697, -0.1148, -0.1922],
        [-0.4950, -1.3221,  1.9213],
        [-0.3556, -1.3716,  2.0251],
        [ 1.4043, -0.4861, -0.4665],
        [ 1.0455, -0.8561,  0.3392],
        [ 0.7671, -0.3215,  0.4101],
        [ 1.1157, -0.7848, -0.4088],
        [ 0.9778, -0.4784, -0.4084],
        [-0.3098, -1.5393,  1.9039],
        [ 0.1176, -1.4624,  1.7886],
        [-0.6154, -1.3848,  2.2815],
        [ 0.1094,  0.2748, -0.0138],
        [-0.2451, -1.0820,  1.8486],
        [ 0.9449, -1.0370,  0.7021],
        [ 1.3056, -0.3083,  0.1405],
        [ 0.8096, -0.2600, -0.2978],
        [-0.4560, -1.2336,  1.5884],
        [-0.7519, -1.2647,  1.9471],
        [ 0.6185, -0.1285, -0.5524],
        [-0.1924, -1.3495,  1.9989],
        [ 0.09

(Epoch 7) TRAIN LOSS:0.4356 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:01<00:04,  4.64it/s]

SequenceClassifierOutput(loss=tensor(0.4307, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3213, -1.5851,  2.0312],
        [ 0.7279,  0.3253, -0.6749],
        [-0.1595, -1.5179,  2.1646],
        [-0.4851, -1.1732,  1.8147],
        [ 0.4113, -0.5350,  0.3338],
        [ 0.8434, -0.4290, -0.0637],
        [-0.3750, -1.2889,  1.8383],
        [-0.4790, -1.5356,  2.2068],
        [-0.3345, -1.3663,  1.9221],
        [-0.1655, -1.1736,  1.7134],
        [ 0.2773,  0.6472, -0.5289],
        [ 0.9802, -0.3623, -0.1233],
        [-0.3206, -1.2598,  1.8437],
        [-0.2096, -1.5181,  1.5376],
        [-0.8120, -0.9362,  2.0709],
        [-0.4357, -1.3328,  2.0619],
        [ 1.2014,  0.0720, -0.5360],
        [-0.7986, -1.4905,  2.0897],
        [ 0.9349, -0.2207, -0.0879],
        [-0.0560, -1.0367,  1.3649],
        [ 0.0062, -1.3374,  1.6721],
        [-0.6983, -1.5462,  2.2155],
        [-0.2511, -1.1176,  1.7050],
        [ 1.3648, -0.7290, -0.3011],
        [-0.15

(Epoch 7) TRAIN LOSS:0.4233 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:01<00:03,  4.90it/s]

SequenceClassifierOutput(loss=tensor(0.3495, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3272, -1.2970,  1.6096],
        [-0.0408, -1.3366,  1.9142],
        [ 0.7935, -1.2792,  1.1997],
        [ 0.9065, -0.3374, -0.6914],
        [-0.9853, -1.5410,  2.2231],
        [ 1.3450, -0.1527, -0.2149],
        [ 1.0193, -0.2159, -0.3045],
        [ 1.5411, -0.3734, -0.4057],
        [-0.5270, -1.3590,  1.8140],
        [ 0.8682,  0.2492, -0.3079],
        [-0.4990, -1.5606,  2.2659],
        [-0.1888, -1.5097,  1.7124],
        [ 0.5539, -1.5416,  1.5244],
        [ 0.0335, -1.1797,  1.7864],
        [-0.2316, -1.0800,  2.0663],
        [-0.4063, -1.2592,  1.9566],
        [ 1.1607, -0.4705,  0.3175],
        [-0.3211, -1.4958,  2.1334],
        [ 0.1185, -1.1029,  1.6147],
        [ 0.1716, -0.8223,  1.1415],
        [-0.1951, -1.1908,  1.4663],
        [ 1.4961, -0.3968, -0.6831],
        [-0.5037, -0.8766,  1.5316],
        [-0.1439, -1.3908,  1.7276],
        [-0.25

(Epoch 7) TRAIN LOSS:0.4128 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:01<00:03,  4.74it/s]

SequenceClassifierOutput(loss=tensor(0.3433, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2136, -1.4709,  2.2225],
        [-0.3201, -1.0844,  1.5811],
        [-0.3146, -1.0930,  2.0390],
        [ 1.7345, -0.8399, -0.4082],
        [-0.8142, -1.4792,  2.3465],
        [-0.5839, -1.4187,  1.8022],
        [ 0.7263,  0.1791, -0.4548],
        [-0.4867, -1.5275,  2.2348],
        [ 0.7030, -0.1953,  0.0783],
        [ 0.6314, -0.5437,  0.6313],
        [ 1.0805, -0.2641, -0.3817],
        [-0.5777, -1.4206,  2.2965],
        [-0.3276, -1.2401,  2.2538],
        [ 1.3103,  0.2050, -0.3169],
        [ 0.6566, -0.4656, -0.2474],
        [-0.5606, -1.4325,  2.1635],
        [-0.5298, -1.3379,  1.8608],
        [-0.6852, -1.4999,  2.3171],
        [-0.7427, -1.3688,  2.1350],
        [-0.3739, -1.0078,  1.6112],
        [ 0.2010, -1.3024,  1.4817],
        [ 1.4942, -0.0072, -0.3253],
        [-0.5666, -1.4391,  2.1913],
        [ 0.4107,  0.4945, -0.3949],
        [-0.38

(Epoch 7) TRAIN LOSS:0.4115 LR:0.00000300:  40%|████████████████                        | 10/25 [00:02<00:03,  4.37it/s]

SequenceClassifierOutput(loss=tensor(0.3992, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3074, -1.3631,  1.5340],
        [-0.0457, -1.8221,  1.9154],
        [ 0.4090, -1.2876,  1.1994],
        [-0.6420, -1.3247,  2.1144],
        [ 1.1553, -0.3280, -0.1290],
        [-0.0653, -0.8419,  1.3013],
        [ 1.5531, -0.2702, -0.4356],
        [-0.5620, -1.5742,  2.1370],
        [-0.9801, -1.3589,  2.3526],
        [-0.8799, -1.0862,  1.9705],
        [-0.1807, -0.1456, -0.0895],
        [-0.5470, -1.5153,  2.3670],
        [ 0.5514, -0.5412,  0.3973],
        [ 0.2475,  0.5395, -0.3241],
        [-0.8502, -1.1875,  2.0080],
        [ 1.3464, -0.9020, -0.3621],
        [-0.6323, -1.0444,  2.1986],
        [-0.7836, -1.5164,  2.2863],
        [-0.0424, -0.6764,  1.0645],
        [-0.2800, -1.5132,  2.0388],
        [-0.3421, -1.2648,  1.8990],
        [-0.5452, -1.1911,  1.8787],
        [-0.1138, -1.1392,  1.8124],
        [-0.6783, -1.6234,  2.3144],
        [-0.01

(Epoch 7) TRAIN LOSS:0.4090 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:02<00:03,  4.29it/s]

SequenceClassifierOutput(loss=tensor(0.3844, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8528, -0.6841,  0.2355],
        [-0.5819, -1.2029,  1.7006],
        [ 1.0363, -0.3508, -0.3929],
        [-0.0914, -0.8741,  0.9058],
        [-0.1146, -0.8270,  1.1501],
        [ 1.0329, -0.5941, -0.2829],
        [ 1.1102, -1.0190,  0.0912],
        [-0.5394, -1.1590,  1.7322],
        [-0.6096, -0.9312,  2.0486],
        [-0.6236, -1.2935,  1.9074],
        [-0.0184, -1.3047,  1.5375],
        [-0.0166,  0.4882,  0.0336],
        [-0.0309, -1.3679,  1.6674],
        [-0.4186, -1.6380,  2.2615],
        [-0.5075, -1.5936,  2.3495],
        [ 0.5575, -0.8519,  0.8319],
        [ 0.4766, -0.0839,  0.0590],
        [ 1.3715,  0.2714, -0.4425],
        [ 0.6378, -0.2437, -0.4121],
        [-0.7374, -1.4456,  2.2449],
        [ 1.1769, -0.6682, -0.6213],
        [-0.5871, -1.3933,  2.1186],
        [-0.1156, -1.5511,  1.6217],
        [ 0.8249, -0.1946,  0.0066],
        [-0.59

(Epoch 7) TRAIN LOSS:0.4129 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:02<00:03,  3.61it/s]

SequenceClassifierOutput(loss=tensor(0.3385, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.4479, -1.0625,  1.8186],
        [-0.4240, -1.2663,  1.9489],
        [ 0.4359, -0.5843, -0.0116],
        [ 1.0024, -0.3932, -0.0616],
        [-0.5975, -1.5295,  2.0800],
        [ 0.8113, -0.3736, -0.3390],
        [-0.1841, -1.3985,  1.7458],
        [-0.1209, -1.3521,  1.8154],
        [-0.5223, -1.6489,  2.2166],
        [ 0.3390,  0.2555, -0.1703],
        [-0.1144, -1.3677,  2.1508],
        [-0.5321, -1.5151,  2.0059],
        [ 0.7235, -0.6121, -0.3258],
        [-0.0470, -1.2763,  1.6033],
        [-0.0527, -1.3109,  1.8197],
        [ 0.5943, -0.1864, -0.0581],
        [ 0.5083,  1.3638, -0.4071],
        [ 1.1981, -0.3344, -0.3357],
        [-0.5782, -1.5156,  2.1590],
        [-0.4784, -1.5283,  2.4049],
        [-0.4154, -0.9388,  1.6542],
        [ 1.4999, -0.6883, -0.6663],
        [-0.0878, -1.0851,  1.6101],
        [-0.6793, -1.1010,  1.8983],
        [ 0.93

(Epoch 7) TRAIN LOSS:0.4071 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:03<00:03,  3.34it/s]

SequenceClassifierOutput(loss=tensor(0.4381, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.1962, -0.5081, -0.4535],
        [ 0.7051,  0.2181, -0.0181],
        [ 0.6429,  0.5080, -0.1838],
        [ 1.1280, -1.4029,  0.7786],
        [ 1.6377, -0.2601, -0.6627],
        [ 0.0475,  0.3407, -0.3333],
        [-0.2982, -1.3141,  1.9321],
        [-0.3260, -1.4250,  1.9582],
        [-0.6532, -0.9734,  1.7523],
        [ 1.0787, -1.0465,  0.5251],
        [ 0.9529, -0.5758, -0.4495],
        [-0.5643, -1.9106,  2.1438],
        [-0.4803, -1.5266,  2.0815],
        [ 1.3532,  0.1489, -0.7361],
        [ 1.0694, -0.4412, -0.2749],
        [ 0.8656, -1.3792,  0.9367],
        [ 1.9246, -0.6357, -0.8930],
        [ 1.3171, -0.6032, -0.2320],
        [ 0.3162,  0.4742, -0.0519],
        [ 0.3548, -0.6044,  0.5286],
        [-0.0872, -1.5132,  1.5927],
        [-0.5834, -0.9064,  1.8934],
        [ 1.6166, -0.5209, -0.5043],
        [-0.5972, -1.5878,  2.2138],
        [ 0.03

(Epoch 7) TRAIN LOSS:0.4094 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:03<00:03,  3.19it/s]

SequenceClassifierOutput(loss=tensor(0.4129, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3111, -1.5776,  2.1029],
        [-0.2943, -0.9103,  1.5914],
        [-0.2175, -1.1312,  2.2652],
        [ 1.3542, -0.1566, -0.0509],
        [-0.3273, -1.7747,  1.8712],
        [ 0.5353, -0.4469, -0.2271],
        [-0.4433, -1.2376,  2.1146],
        [-0.7608, -1.4573,  2.5124],
        [ 0.6152, -0.5530,  0.0514],
        [ 1.1175, -0.6680, -0.1404],
        [-0.8864, -1.3108,  2.3314],
        [-0.8593, -1.3214,  2.4692],
        [ 1.1267, -0.5541, -0.1197],
        [ 0.9082,  0.0913, -0.2416],
        [-0.4678, -1.1708,  1.3914],
        [ 0.0332, -1.4980,  1.5775],
        [ 1.1299, -0.1359, -0.4170],
        [ 1.1599, -0.2785, -0.1194],
        [ 0.5045, -0.2273,  0.7260],
        [-0.5240, -1.5253,  1.9773],
        [-0.5438, -1.6802,  1.9930],
        [ 1.5967, -0.7449, -0.1730],
        [-0.1033, -0.9171,  1.4409],
        [ 0.1916,  0.0815,  0.4502],
        [-0.68

(Epoch 7) TRAIN LOSS:0.4096 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:03<00:03,  2.94it/s]

SequenceClassifierOutput(loss=tensor(0.4756, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-2.6671e-01, -1.1097e+00,  1.5750e+00],
        [ 2.5285e-01, -5.8696e-01,  1.0202e+00],
        [ 8.3214e-01, -8.6924e-01,  7.9816e-01],
        [-5.9082e-01, -1.1337e+00,  1.6004e+00],
        [-7.7371e-01, -1.2651e+00,  1.7505e+00],
        [ 2.2948e-01, -1.7371e+00,  1.5927e+00],
        [ 1.0757e+00, -9.9971e-01,  5.9298e-02],
        [-9.9841e-01, -1.0558e+00,  2.1164e+00],
        [ 7.9669e-01, -7.5945e-02,  7.3872e-02],
        [-4.7030e-01, -1.4990e+00,  1.8105e+00],
        [-4.4935e-01, -1.5395e+00,  2.1162e+00],
        [ 8.0474e-01, -5.4539e-01, -8.8808e-02],
        [ 3.6554e-01,  6.5224e-02, -5.4692e-01],
        [-3.4881e-01, -1.4401e+00,  2.4296e+00],
        [ 8.4299e-02, -1.1477e+00,  1.2912e+00],
        [ 3.9235e-01, -5.3842e-01,  6.7855e-01],
        [ 6.9892e-02, -1.1067e+00,  1.4807e+00],
        [ 2.9054e-01, -5.8699e-01,  8.0405e-01],
        [-4.8251e-01

(Epoch 7) TRAIN LOSS:0.4137 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:04<00:03,  2.85it/s]

SequenceClassifierOutput(loss=tensor(0.5639, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2305, -1.3865,  2.1032],
        [-0.1810, -1.3157,  1.9058],
        [ 0.0307,  0.4983,  0.4976],
        [ 0.4856, -1.3040,  1.0390],
        [ 1.3843, -0.7025, -0.3808],
        [-0.9035, -1.5572,  2.2647],
        [-0.0688, -0.0737, -0.2574],
        [ 0.7048, -0.1779, -0.4169],
        [-0.2210, -1.2919,  1.9167],
        [ 1.0994, -0.4514, -0.2206],
        [-0.5469, -1.5633,  2.2100],
        [-0.2650, -1.6538,  1.6926],
        [ 1.0085, -0.3801, -0.2251],
        [-0.1782, -1.1456,  1.7216],
        [ 0.9404,  0.2072, -0.5368],
        [ 1.6447, -1.0311, -0.4334],
        [-0.5906, -1.5604,  1.9047],
        [ 0.3123,  0.3275, -0.1418],
        [ 0.9221, -0.3604, -0.3906],
        [-0.5502, -1.1579,  2.1435],
        [ 1.5289, -0.3788, -0.6864],
        [ 0.7379, -1.7381,  1.1092],
        [ 0.0499, -0.2438, -0.0603],
        [-0.6869, -1.4644,  1.8910],
        [ 0.07

(Epoch 7) TRAIN LOSS:0.4225 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:04<00:02,  2.67it/s]

SequenceClassifierOutput(loss=tensor(0.6193, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.9521, -0.5535, -0.6737],
        [ 0.4850, -1.3211,  1.2621],
        [ 0.1678, -1.5662,  1.7064],
        [ 0.2211,  0.6210, -0.2043],
        [-0.3755, -1.3366,  1.6169],
        [-0.3789, -1.6953,  2.0921],
        [-0.1563, -0.9995,  1.3066],
        [ 0.3636,  0.5877, -0.3370],
        [-0.5586, -1.6029,  2.3530],
        [-0.4512, -1.3230,  1.7545],
        [-0.5050, -1.2107,  1.6761],
        [-0.1746, -1.4509,  1.8756],
        [ 0.3719,  0.3946,  0.3765],
        [-0.6724, -1.5232,  2.4198],
        [ 0.6059, -0.5968,  0.5700],
        [-0.3589, -1.2935,  2.2772],
        [ 0.0796, -1.4522,  1.6850],
        [-0.0861, -1.2416,  1.6315],
        [ 0.6324,  0.2832, -0.4095],
        [-0.3702, -1.4730,  1.9273],
        [-0.3047, -1.7649,  1.7362],
        [-0.4946, -1.1176,  1.6100],
        [ 0.3030,  0.4058, -0.2908],
        [ 1.5174, -0.4876, -0.4627],
        [ 0.01

(Epoch 7) TRAIN LOSS:0.4335 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:05<00:02,  2.55it/s]

SequenceClassifierOutput(loss=tensor(0.3867, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.9593, -1.1764,  2.1201],
        [-0.1030, -0.5328,  0.9994],
        [ 1.1305, -0.2204, -0.0129],
        [ 2.2339, -0.3863, -0.6114],
        [-0.5611, -1.2062,  2.4425],
        [ 1.1463, -0.2173, -0.1877],
        [ 1.3196, -0.1870, -0.8309],
        [-0.5158, -1.6587,  2.0493],
        [-0.2902, -0.2230, -0.0514],
        [-0.3705, -1.2828,  1.9653],
        [ 1.6797,  0.0817, -0.7467],
        [-0.2482, -1.6981,  1.8021],
        [ 1.0368, -0.2391, -0.4289],
        [ 0.4981, -0.4079,  0.4232],
        [-0.3153, -1.5269,  2.0575],
        [ 0.1475,  0.2074,  0.3552],
        [-0.9042, -1.0410,  2.1823],
        [ 1.0417,  0.1424, -0.2788],
        [ 0.6161, -0.8498,  0.5933],
        [ 1.6279, -0.3055, -0.6933],
        [-0.5625, -1.4932,  2.2084],
        [-0.7363, -1.5517,  2.1714],
        [ 0.7478, -0.6071,  0.1593],
        [ 1.3139, -0.6779, -0.3703],
        [-0.28

(Epoch 7) TRAIN LOSS:0.4310 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:05<00:02,  2.57it/s]

SequenceClassifierOutput(loss=tensor(0.6821, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.3749, -0.4883, -0.3131],
        [-0.3981, -1.3666,  1.9566],
        [ 0.4954, -0.2931,  0.0222],
        [-0.0686,  0.5259, -0.0605],
        [ 0.3917, -1.5572,  1.6280],
        [-0.6766, -1.4822,  2.0610],
        [-0.9377, -1.3981,  2.2159],
        [-0.6250, -1.4017,  2.3157],
        [-0.3935, -1.5584,  2.3639],
        [ 0.6307, -0.8722,  0.7780],
        [-0.5162, -1.4511,  1.9591],
        [ 1.9588, -0.8882, -0.4477],
        [ 0.4260,  0.0071,  0.2530],
        [ 1.3158,  0.1242, -0.3394],
        [ 1.2327, -0.4740, -0.3078],
        [ 0.8683, -1.5625,  0.5841],
        [ 0.9358, -0.5202,  0.3591],
        [-0.1430, -1.6774,  1.8738],
        [-0.6929, -1.1924,  1.8951],
        [ 0.2127, -1.6929,  1.5240],
        [-0.5028, -1.6598,  2.3868],
        [ 0.3622,  0.2010, -0.4025],
        [-0.3697, -1.5582,  2.0943],
        [-0.5932, -1.5349,  2.2631],
        [ 0.33

(Epoch 7) TRAIN LOSS:0.4436 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:05<00:01,  2.60it/s]

SequenceClassifierOutput(loss=tensor(0.6016, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-3.9613e-01, -1.2495e+00,  2.0985e+00],
        [ 1.4410e-01, -4.2757e-01,  9.5143e-01],
        [-5.9493e-01, -1.4682e+00,  2.0479e+00],
        [ 1.7874e-01, -1.1032e+00,  1.4921e+00],
        [ 2.2667e-01,  2.6932e-01, -3.3310e-01],
        [ 1.5109e+00, -8.6344e-01, -3.4946e-01],
        [-7.7342e-01, -1.5571e+00,  1.9465e+00],
        [ 8.8968e-01, -6.3049e-01, -2.7767e-01],
        [ 1.4096e+00, -1.3896e-01, -7.1165e-03],
        [-3.5272e-01, -1.7939e+00,  1.9721e+00],
        [-1.2015e-01, -1.3964e+00,  1.5959e+00],
        [-7.1999e-01, -1.4105e+00,  2.0922e+00],
        [ 2.8363e-03,  6.9790e-01,  1.5484e-02],
        [-6.2099e-01, -1.2837e+00,  1.9845e+00],
        [ 6.1951e-02, -4.1069e-01,  1.3970e-01],
        [-2.6721e-01, -9.7083e-01,  1.8106e+00],
        [ 1.3121e-01, -6.9961e-02,  4.8326e-01],
        [ 7.5049e-01,  5.8178e-02,  1.7425e-01],
        [-4.9282e-01

(Epoch 7) TRAIN LOSS:0.4511 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:06<00:01,  2.63it/s]

SequenceClassifierOutput(loss=tensor(0.4617, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.3190, -0.5447, -0.3859],
        [ 0.3556,  0.3224, -0.1752],
        [ 0.4997, -1.4974,  1.3443],
        [-0.0203, -1.2513,  1.6699],
        [ 1.0958, -0.4360, -0.2965],
        [ 0.5842,  0.4361, -0.1606],
        [-0.6249, -1.5545,  2.1937],
        [ 1.4603, -0.9477, -0.3664],
        [-0.1944, -1.6552,  1.9854],
        [-0.2985, -1.7506,  2.2715],
        [ 0.8573, -0.3576, -0.4340],
        [ 0.6703, -0.3606,  0.1344],
        [ 0.1910,  0.7414, -0.2415],
        [ 0.6355, -0.9302,  0.6543],
        [-0.5567, -1.1698,  1.6990],
        [ 0.8781,  0.0171, -0.3186],
        [-0.2798, -1.2718,  1.4670],
        [ 0.5066,  0.5443, -0.2901],
        [-0.9298, -1.4572,  2.1397],
        [ 1.5805, -0.5355, -0.6718],
        [-0.6608, -1.4223,  2.1231],
        [-0.2215, -1.5928,  2.0863],
        [ 1.3508, -0.4948, -0.8404],
        [ 0.2861,  0.1646, -0.0613],
        [-0.67

(Epoch 7) TRAIN LOSS:0.4516 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:06<00:01,  2.63it/s]

SequenceClassifierOutput(loss=tensor(0.3771, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.7585, -1.5097,  2.1563],
        [-0.5340, -1.5081,  2.2275],
        [-0.3915, -1.6887,  2.0018],
        [ 0.0472, -0.7287,  0.7043],
        [ 0.7899,  0.5608, -0.6042],
        [-0.8217, -1.6304,  2.4838],
        [-0.7120, -1.2157,  2.0614],
        [ 1.3272,  0.1397, -0.8235],
        [-0.0420, -0.6984,  1.2643],
        [-0.2850, -1.2911,  1.6819],
        [-0.1793, -1.4947,  1.7332],
        [ 0.2055, -1.5505,  1.7552],
        [-0.2775, -1.4107,  1.9086],
        [ 1.4229, -0.1945, -0.7116],
        [-0.3971, -1.2534,  1.3942],
        [-0.3461, -1.4248,  1.8236],
        [ 1.1209,  0.4425, -0.6900],
        [ 0.2247, -1.9612,  1.7604],
        [-0.3696, -1.4853,  2.2435],
        [ 0.1124, -0.0221,  0.6412],
        [ 1.3957, -0.7030, -0.7840],
        [ 0.5376, -1.3157,  1.2567],
        [-0.8017, -1.5778,  2.3374],
        [-0.3568, -1.5774,  2.0479],
        [ 0.74

(Epoch 7) TRAIN LOSS:0.4540 LR:0.00000300:  96%|██████████████████████████████████████▍ | 24/25 [00:07<00:00,  2.95it/s]

SequenceClassifierOutput(loss=tensor(0.5839, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 2.5505e-01, -1.3703e+00,  1.9142e+00],
        [ 1.6317e+00, -7.9493e-01, -7.3088e-01],
        [ 1.5644e+00, -6.0786e-01, -7.4190e-01],
        [-5.7662e-01, -1.3972e+00,  2.0870e+00],
        [ 1.1776e+00, -5.9795e-01, -3.3217e-01],
        [-6.3671e-01, -1.0604e+00,  2.1786e+00],
        [ 9.4432e-01, -7.7702e-01,  5.8998e-01],
        [ 6.8514e-01, -3.0337e-01,  3.4150e-01],
        [ 1.1375e+00, -1.7602e-01, -3.5417e-01],
        [ 4.3146e-01,  6.5587e-01, -7.0246e-01],
        [ 2.2546e-01, -1.5716e+00,  1.4106e+00],
        [-3.1468e-01, -8.2521e-01,  1.2374e+00],
        [-4.2828e-01, -1.1760e+00,  2.4219e+00],
        [-2.1856e-01, -1.7771e+00,  1.8185e+00],
        [ 1.3191e+00,  1.1745e-03, -5.1732e-01],
        [ 1.1971e-02, -1.4338e+00,  1.7042e+00],
        [-3.2773e-01, -8.8219e-01,  1.2174e+00],
        [ 1.5765e-01, -9.2040e-01,  1.4705e+00],
        [ 1.1631e+00

(Epoch 7) TRAIN LOSS:0.4482 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  3.28it/s]

SequenceClassifierOutput(loss=tensor(0.3096, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5843, -0.1718, -0.2414],
        [-0.3014, -1.6160,  1.8942],
        [ 0.1967, -0.5090,  0.3783],
        [-0.4581, -1.4043,  2.1237],
        [ 0.8619, -0.3247, -0.4541],
        [ 1.2384, -0.2590, -0.6898],
        [ 1.5887, -0.6134, -0.6573],
        [ 1.8836, -0.5626, -0.6784],
        [-0.5438, -1.0200,  1.5638],
        [-0.9014, -1.5594,  2.5814],
        [-0.7475, -1.2660,  1.7691],
        [ 0.5758,  0.7171, -0.4314],
        [ 0.0836, -1.6281,  2.0325],
        [-0.5011, -1.6460,  2.2978],
        [-0.6366, -1.7187,  2.1654],
        [ 1.2692, -0.5076, -0.5650],
        [-0.4358, -1.0996,  1.7368],
        [-0.0687, -1.3741,  1.9450],
        [ 0.2128,  0.4885, -0.4785],
        [ 0.0514, -1.3406,  1.0409],
        [-0.3664, -1.5736,  2.2213],
        [-0.7570, -1.3568,  2.3171],
        [-0.5111, -1.3495,  2.0460],
        [ 0.8720, -1.2047,  1.0723],
        [ 1.18

(Epoch 7) TRAIN LOSS:0.4482 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  3.27it/s]

(Epoch 7) TRAIN LOSS:0.4482 ACC:0.85 F1:0.76 REC:0.72 PRE:0.86 LR:0.00000300



(Epoch 8) TRAIN LOSS:0.3530 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:06,  3.69it/s]

SequenceClassifierOutput(loss=tensor(0.3530, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8421, -0.4837, -0.0613],
        [ 0.4534, -1.5069,  1.5170],
        [ 0.0946, -1.4347,  1.9515],
        [ 0.4690, -0.1806, -0.1990],
        [-0.6642, -1.0488,  2.0899],
        [ 1.4949, -0.9131, -0.4045],
        [ 0.3408, -0.4906,  0.5707],
        [ 0.8039, -0.3159, -0.3187],
        [-0.1504, -1.4231,  2.1597],
        [-0.1656, -1.2466,  1.6566],
        [-0.2917, -1.8760,  1.9445],
        [-0.1626, -1.5276,  1.9246],
        [-0.2755, -1.4274,  1.5862],
        [ 1.3515, -0.4267, -0.9880],
        [-0.0254, -1.5983,  1.9092],
        [-0.4272, -1.3228,  2.0850],
        [-0.5379, -0.8875,  1.9284],
        [ 1.2237, -0.3197, -0.5628],
        [-0.7936, -1.3698,  2.1670],
        [ 0.1271, -1.5750,  1.7343],
        [ 0.2580, -1.4799,  1.6612],
        [ 0.7676, -0.8677,  0.5585],
        [-0.6662, -1.4617,  2.1490],
        [-0.7157, -1.6066,  2.2308],
        [-0.70

(Epoch 8) TRAIN LOSS:0.3707 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:05,  4.14it/s]

SequenceClassifierOutput(loss=tensor(0.3884, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1589, -1.0386,  1.0061],
        [ 1.3066, -0.3721, -0.6512],
        [ 1.2108, -0.3099, -0.1285],
        [-0.7627, -1.4202,  2.5130],
        [-0.3182, -1.1128,  1.8894],
        [-0.2688, -1.6791,  2.1465],
        [-0.9190, -1.5212,  2.2000],
        [ 0.0297, -0.1238, -0.2926],
        [-0.5142, -1.4317,  1.9020],
        [-0.0440, -1.8724,  1.8507],
        [ 0.2869,  0.3918, -0.3634],
        [-0.6671, -1.6793,  2.0949],
        [-0.3386, -1.3522,  2.0272],
        [ 1.8669, -0.4210, -0.4617],
        [ 1.7603, -0.7023, -0.5005],
        [ 0.6581, -0.6660,  0.7457],
        [-0.0933, -1.2286,  2.1920],
        [-0.3209, -1.4354,  2.1063],
        [-0.1413,  0.2995,  0.1376],
        [ 1.0972, -0.3433, -0.6859],
        [ 0.2754, -0.7217,  0.9647],
        [ 0.1403, -1.5721,  1.9283],
        [-0.7875, -1.4177,  2.2167],
        [-0.9474, -1.3489,  2.4064],
        [ 0.57

(Epoch 8) TRAIN LOSS:0.3779 LR:0.00000300:  12%|████▉                                    | 3/25 [00:00<00:05,  4.33it/s]

SequenceClassifierOutput(loss=tensor(0.3922, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.4701, -1.4944,  2.0741],
        [ 0.9322, -1.6636,  0.8434],
        [-0.5922, -1.3860,  2.2021],
        [-0.4247, -1.3443,  2.2851],
        [ 0.3882, -0.2781, -0.0254],
        [ 1.0825, -0.4533, -0.0377],
        [ 0.3667,  0.7857, -0.4334],
        [ 1.2046, -0.4084,  0.0057],
        [-0.7313, -1.6282,  2.2138],
        [ 1.3110, -0.1367, -0.5127],
        [-0.3550, -1.3041,  1.8899],
        [-0.5104, -1.9070,  2.1970],
        [ 0.2845, -1.3832,  1.4166],
        [ 1.9060, -0.2769, -0.6905],
        [ 0.1810, -0.9263,  1.2530],
        [ 0.7331, -1.0463,  0.9675],
        [-0.5265, -1.3192,  1.9033],
        [-0.5144, -1.1685,  2.0758],
        [ 0.2225, -0.2753, -0.0243],
        [ 1.3677, -0.3126, -0.6489],
        [ 1.5113, -0.0799, -0.8896],
        [-0.0673, -1.1679,  1.5413],
        [-0.2550, -1.5168,  2.0614],
        [ 1.0519,  0.0475, -0.3163],
        [ 0.14

(Epoch 8) TRAIN LOSS:0.3933 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:00<00:04,  4.35it/s]

SequenceClassifierOutput(loss=tensor(0.4394, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0598, -0.6190,  0.2827],
        [-0.2545, -1.3700,  2.4167],
        [-0.1577, -1.2860,  1.8888],
        [ 0.6860,  0.6155, -0.7549],
        [-0.2035, -1.3644,  1.8603],
        [ 1.0055, -0.7860, -0.3354],
        [ 0.3040, -1.8236,  1.3825],
        [-0.5129, -1.6490,  1.7684],
        [-0.3323, -1.4280,  1.8080],
        [ 0.6440, -0.0139,  0.1893],
        [-0.4574, -1.5701,  2.0663],
        [ 1.3621, -1.6635,  0.9441],
        [-0.5251, -1.5032,  2.3147],
        [-0.6499, -1.2676,  2.1947],
        [ 2.0356, -0.7251, -0.5655],
        [-0.2815, -1.6034,  1.7985],
        [-0.4827, -1.4626,  2.2535],
        [-0.4103, -1.6039,  2.1975],
        [ 1.5842, -0.7951, -0.3054],
        [ 0.0482,  0.9684, -0.1598],
        [ 0.1440, -0.4124,  0.6779],
        [-0.3362, -1.4054,  1.8336],
        [ 1.3184, -0.0183, -0.3154],
        [-0.5141, -1.3869,  1.4182],
        [-0.29

(Epoch 8) TRAIN LOSS:0.4002 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:04,  4.44it/s]

SequenceClassifierOutput(loss=tensor(0.4281, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6101, -0.0465,  0.4132],
        [ 0.4146,  0.6017, -0.5971],
        [-0.8309, -1.6366,  2.1937],
        [ 0.2217,  0.6445, -0.2205],
        [-0.6713, -1.3501,  2.2986],
        [-0.2467, -1.4654,  2.1384],
        [-0.0067, -1.5890,  1.6865],
        [-0.0207, -1.4592,  1.9263],
        [ 0.2504, -1.8858,  1.5426],
        [ 1.3848, -0.7512, -0.2724],
        [-0.3944, -1.5833,  2.2485],
        [ 1.4620, -0.5664, -0.2280],
        [ 1.0199, -0.9757,  0.4465],
        [ 1.1370, -0.6807, -0.8501],
        [ 1.3608, -0.4995, -0.3791],
        [ 0.2932, -1.5899,  1.5770],
        [ 0.8578, -0.5017,  0.0783],
        [ 1.0061,  0.0161,  0.2863],
        [-0.1268, -1.2107,  1.4293],
        [-0.4197, -1.3319,  2.2264],
        [-0.5317, -1.5061,  2.5220],
        [ 0.6095, -1.5652,  1.1441],
        [-0.5354, -1.2925,  2.1214],
        [-0.6523, -1.3698,  2.3093],
        [-0.06

(Epoch 8) TRAIN LOSS:0.4052 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:01<00:04,  4.63it/s]

SequenceClassifierOutput(loss=tensor(0.4299, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 8.8105e-01,  6.0095e-01, -4.6934e-01],
        [ 5.1240e-01,  1.1482e+00, -6.0592e-01],
        [-5.8288e-01, -1.2867e+00,  2.0221e+00],
        [-6.0969e-01, -1.6169e+00,  2.3304e+00],
        [-5.4579e-01, -1.6058e+00,  1.8550e+00],
        [-1.5709e-01, -1.2096e+00,  1.4274e+00],
        [ 1.6472e+00, -3.7642e-01, -7.7808e-01],
        [-3.7929e-01, -1.4935e+00,  2.3020e+00],
        [ 7.9918e-01,  8.0821e-02, -1.1076e-01],
        [-4.8230e-01, -1.7071e+00,  2.1531e+00],
        [ 1.9918e+00, -5.6120e-01, -7.6886e-01],
        [ 2.5118e-01, -1.5204e+00,  2.1065e+00],
        [-4.8566e-01, -1.5325e+00,  2.1950e+00],
        [-4.1002e-01, -1.5831e+00,  2.2682e+00],
        [ 9.7181e-01, -1.3060e-04, -8.3112e-01],
        [-3.8712e-01, -1.4923e+00,  2.3819e+00],
        [-5.3892e-01, -5.0613e-01,  1.4474e+00],
        [-8.1726e-01, -1.3403e+00,  2.4119e+00],
        [ 5.6174e-01

(Epoch 8) TRAIN LOSS:0.4129 LR:0.00000300:  32%|█████████████                            | 8/25 [00:01<00:03,  4.77it/s]

SequenceClassifierOutput(loss=tensor(0.3541, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.8325, -1.0918, -0.3373],
        [-0.7137, -1.2590,  1.8332],
        [-0.2144, -1.6825,  1.9195],
        [ 0.5569, -0.8687,  0.5084],
        [-0.5983, -1.5089,  1.8624],
        [-0.3299, -1.5039,  2.4376],
        [ 1.3020,  0.0047, -0.2934],
        [ 1.4713, -0.6986, -0.2788],
        [ 0.2250, -1.8129,  1.6252],
        [-0.2846, -1.3529,  1.5698],
        [ 0.0825, -1.2621,  1.7586],
        [-0.6626, -1.9131,  1.9236],
        [ 0.7951,  0.6133, -0.0245],
        [ 1.3741, -1.3719,  0.3069],
        [-0.3475, -1.3014,  2.0799],
        [ 0.7237, -1.3820,  0.6431],
        [-0.7226, -1.1296,  2.1574],
        [ 0.1926, -1.4947,  1.5954],
        [-0.0790, -1.8259,  1.8692],
        [-0.2541, -1.6525,  1.9188],
        [ 1.2199, -0.4871, -0.0470],
        [ 1.5376, -0.3988, -0.6587],
        [-1.0525, -1.5958,  2.3960],
        [ 0.3465,  0.6712, -0.5962],
        [-0.42

(Epoch 8) TRAIN LOSS:0.3976 LR:0.00000300:  36%|██████████████▊                          | 9/25 [00:01<00:03,  4.61it/s]

SequenceClassifierOutput(loss=tensor(0.2749, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.3266, -0.4418, -0.7561],
        [-0.7254, -1.4846,  2.3002],
        [ 1.1841, -0.8218, -0.2367],
        [ 0.1167,  0.6266, -0.4524],
        [ 1.5498, -0.3542, -0.7613],
        [-0.6654, -1.5220,  2.2992],
        [ 0.1271, -1.4582,  1.3567],
        [ 1.5629, -0.3337, -0.2384],
        [-0.4981, -1.5862,  2.1566],
        [ 1.4860, -0.3889, -0.5280],
        [-0.7317, -1.6328,  2.0926],
        [-0.6209, -1.5216,  2.4189],
        [-0.3881, -1.7589,  1.8879],
        [-0.4312, -1.4265,  1.7580],
        [ 1.9162, -0.6578, -0.7308],
        [-0.6865, -1.3204,  2.3864],
        [ 1.6326,  0.0549, -0.5441],
        [-0.6923, -1.1935,  2.1777],
        [-0.2003, -1.6057,  1.8767],
        [-0.4755, -1.6098,  2.3948],
        [ 0.5952, -1.3897,  0.9668],
        [ 1.5419,  0.0728, -0.4188],
        [ 1.3008, -0.3308, -0.6380],
        [ 0.4328,  0.2948, -0.5296],
        [-0.01

(Epoch 8) TRAIN LOSS:0.3980 LR:0.00000300:  40%|████████████████                        | 10/25 [00:02<00:03,  4.70it/s]

SequenceClassifierOutput(loss=tensor(0.4022, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0409,  0.7801, -0.3614],
        [ 1.2007, -0.4347, -0.5877],
        [ 0.4840, -1.4691,  1.8307],
        [-0.5015, -1.4161,  2.2433],
        [ 0.3441, -0.0139,  0.5017],
        [-0.6014, -1.2617,  2.0506],
        [-0.2682, -1.5361,  2.3363],
        [-0.2830, -1.0672,  1.6102],
        [ 0.8977,  0.1080, -0.2390],
        [-0.1795, -1.6941,  1.9278],
        [-0.2519, -1.5930,  2.1228],
        [ 0.8908, -0.2685, -0.3034],
        [ 0.6946, -0.3523, -0.4338],
        [ 0.6399, -0.2419,  0.1602],
        [-0.3880, -1.3920,  1.7725],
        [-0.6412, -1.7491,  2.1900],
        [ 0.2981,  0.3549, -0.2091],
        [ 0.3286, -0.2589,  0.0217],
        [-0.5874, -0.8945,  1.5082],
        [ 1.2027, -0.7138, -0.0307],
        [-0.8571, -1.6467,  2.3573],
        [ 1.0790, -0.6782, -0.2326],
        [-0.4764, -1.5417,  2.1570],
        [-0.3223, -1.5904,  1.9772],
        [ 1.94

(Epoch 8) TRAIN LOSS:0.4037 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:02<00:02,  4.79it/s]

SequenceClassifierOutput(loss=tensor(0.4798, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 2.4062e-03, -1.6618e-02,  4.1750e-01],
        [-4.5536e-01, -1.0166e+00,  1.6476e+00],
        [-2.7071e-01, -9.6659e-01,  1.5304e+00],
        [ 5.8923e-01,  2.9379e-01,  3.0210e-01],
        [ 2.3485e-01, -1.8357e+00,  1.6748e+00],
        [ 1.9118e-01, -7.9272e-02, -1.5654e-01],
        [-4.8031e-01, -1.3252e+00,  1.9184e+00],
        [-5.6640e-01, -1.6507e+00,  2.4576e+00],
        [ 4.4538e-01, -3.6521e-01,  3.9283e-01],
        [ 3.3703e-01, -1.0813e+00,  5.5987e-01],
        [-5.1136e-01, -1.6193e+00,  2.0662e+00],
        [ 1.2962e+00, -3.2953e-01, -5.0337e-01],
        [ 1.6012e-01, -6.2844e-01,  2.3885e-01],
        [ 6.5802e-01,  1.2711e+00, -3.4839e-01],
        [-5.5162e-01, -1.3960e+00,  2.0204e+00],
        [ 1.7559e+00, -3.7179e-01, -6.9338e-01],
        [ 5.2226e-01,  4.8968e-01, -1.4245e-01],
        [-3.2488e-01, -1.4854e+00,  2.0428e+00],
        [-8.3035e-02

(Epoch 8) TRAIN LOSS:0.4101 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:02<00:02,  4.66it/s]

SequenceClassifierOutput(loss=tensor(0.4870, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7427, -0.0297, -0.5271],
        [-0.6083, -1.2660,  2.3734],
        [ 0.7334, -0.4946,  0.5721],
        [-0.4454, -1.2013,  1.8558],
        [-0.0890, -1.3249,  1.5123],
        [-0.6453, -0.7707,  1.9322],
        [-0.5994, -1.4510,  2.5128],
        [-0.5866, -1.6350,  2.5137],
        [ 0.8159, -1.2759,  0.9396],
        [ 1.3210, -0.5089, -0.6642],
        [-0.6416, -0.9861,  2.0331],
        [-0.3082, -1.5716,  2.1194],
        [ 1.1404, -1.1882,  0.6433],
        [-0.0058, -1.3597,  1.7849],
        [-0.3627, -1.4283,  2.2728],
        [-0.5963, -1.5615,  1.9861],
        [-0.8142, -1.1373,  2.2917],
        [-0.0484,  0.4750,  0.0803],
        [ 1.2429, -0.4924, -0.5023],
        [-0.0973, -1.5348,  0.9402],
        [-0.3681, -1.2738,  2.3310],
        [ 0.1495, -1.6192,  2.1022],
        [-0.3159, -1.6967,  1.8090],
        [ 0.9661, -0.3927, -0.1754],
        [ 0.63

(Epoch 8) TRAIN LOSS:0.4108 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:03<00:02,  4.66it/s]

SequenceClassifierOutput(loss=tensor(0.4204, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-6.7240e-01, -1.4853e+00,  2.2184e+00],
        [ 9.9895e-01, -3.6352e-01, -5.3953e-01],
        [-2.5197e-01, -1.8403e+00,  2.0753e+00],
        [ 1.8315e+00, -5.6017e-01, -7.5696e-01],
        [ 1.8984e-01,  3.2717e-01,  5.9690e-02],
        [ 5.1663e-01, -8.1297e-01,  2.8779e-01],
        [-6.6059e-01, -1.8238e+00,  2.3402e+00],
        [ 2.1785e-02,  9.7878e-01, -4.7081e-01],
        [ 1.0107e+00,  1.9041e-02, -8.3778e-02],
        [ 3.9667e-01, -2.3265e-01,  1.3215e-01],
        [-6.9074e-01, -1.1468e+00,  1.4323e+00],
        [-7.6464e-01, -1.5261e+00,  2.2007e+00],
        [-5.2632e-01, -1.6620e+00,  2.2916e+00],
        [-5.9856e-01, -1.1365e+00,  2.1188e+00],
        [ 8.4086e-01, -4.1811e-01, -1.2591e-01],
        [ 2.0357e+00, -7.3932e-01, -3.2074e-01],
        [ 1.8020e+00, -9.3972e-01, -4.3777e-01],
        [-3.7179e-01, -1.6907e+00,  2.2182e+00],
        [-4.9118e-01

(Epoch 8) TRAIN LOSS:0.4102 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:03<00:02,  4.58it/s]

SequenceClassifierOutput(loss=tensor(0.4011, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.7924, -1.6306,  2.6090],
        [-0.3866, -1.5548,  2.0744],
        [ 0.4826, -0.3979,  0.4145],
        [ 0.0608,  0.6073, -0.5742],
        [-0.1852, -1.8483,  1.8008],
        [-0.2907, -1.3747,  2.0417],
        [ 0.6204, -0.1719, -0.4165],
        [-0.9855, -1.5115,  2.3901],
        [-0.3260, -1.7751,  2.2018],
        [ 1.1218, -0.2798, -0.3322],
        [-0.6697, -1.8463,  2.3317],
        [-0.2866, -1.7641,  2.1746],
        [ 0.8886, -0.6104,  0.2680],
        [ 1.2999, -0.7951, -0.0087],
        [ 1.8119, -0.4767, -0.6538],
        [ 1.5464,  0.0435, -0.1483],
        [-0.3484, -1.5053,  2.1197],
        [ 1.2488,  0.2782, -0.6814],
        [ 1.0145, -1.5299,  0.8644],
        [ 1.8207, -0.2272, -0.6680],
        [-0.5458, -1.0327,  1.8616],
        [ 0.1028,  0.7731, -0.2693],
        [-0.5344, -1.6043,  2.1944],
        [ 1.4054, -0.7976, -0.7378],
        [-0.95

(Epoch 8) TRAIN LOSS:0.4000 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:03<00:01,  4.68it/s]

SequenceClassifierOutput(loss=tensor(0.2468, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.5885, -1.8242,  2.1586],
        [ 1.0676, -0.5370, -0.6904],
        [ 0.2928, -0.7965,  0.3785],
        [-0.7903, -1.4609,  2.1463],
        [-0.8467, -1.6443,  2.1701],
        [ 1.1915, -0.4387, -0.7818],
        [-0.3545, -0.9714,  1.5346],
        [-0.1542, -1.8637,  1.8462],
        [-0.5518, -1.4124,  2.1286],
        [-0.4495, -1.6913,  2.1717],
        [ 1.2196, -0.4584, -0.9767],
        [-0.2426, -1.8103,  2.1008],
        [-0.9475, -1.3445,  2.5073],
        [-0.4893, -1.1170,  1.8302],
        [ 1.4060, -0.4175,  0.1067],
        [-0.6752, -1.1225,  2.1705],
        [-0.1564, -1.5653,  1.8759],
        [-0.3176, -1.4054,  1.9993],
        [ 1.8404, -0.2865, -0.8357],
        [-0.5368, -1.1891,  2.0524],
        [ 0.6733,  0.1322,  0.0297],
        [ 1.1267, -0.3862, -0.7606],
        [-0.8123, -1.4211,  2.2827],
        [ 1.1916, -0.9831, -0.4202],
        [ 1.62

(Epoch 8) TRAIN LOSS:0.3972 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:03<00:01,  4.75it/s]

SequenceClassifierOutput(loss=tensor(0.3535, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.9092, -1.3087,  0.2885],
        [ 0.3623,  0.3915, -0.3837],
        [ 0.2879,  0.3111, -0.2744],
        [ 0.0470, -1.1246,  1.6604],
        [ 0.9146,  0.0377, -0.5245],
        [-1.0094, -1.5889,  2.1601],
        [-0.7424, -1.5954,  2.3643],
        [-0.3282, -1.4737,  1.6957],
        [-0.2834, -1.4835,  1.9365],
        [-0.0609, -0.2807,  0.0183],
        [ 1.2110, -1.5609,  0.7698],
        [ 0.1359, -1.1981,  1.6209],
        [ 1.5262, -0.3191, -0.7359],
        [ 1.2744, -0.5192, -0.4993],
        [ 1.2233, -0.0473, -0.5200],
        [-0.1272, -1.8264,  2.1934],
        [-0.5599, -1.2695,  1.4966],
        [-0.5328, -1.8031,  2.3900],
        [-0.6416, -1.0983,  2.0005],
        [ 1.5867, -0.5504, -0.7954],
        [-0.5940, -1.5394,  1.9069],
        [-0.5305, -1.1266,  2.4171],
        [-0.5390, -1.6639,  2.4811],
        [-0.1774, -0.8348,  1.4848],
        [ 0.10

(Epoch 8) TRAIN LOSS:0.3983 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:04<00:01,  4.93it/s]

SequenceClassifierOutput(loss=tensor(0.4740, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.5411, -1.2889,  2.0975],
        [-0.6986, -1.5569,  2.0133],
        [ 0.0697,  0.8497, -0.5710],
        [ 0.3097, -1.6292,  1.7739],
        [ 0.1951,  0.5343, -0.4128],
        [-0.2953,  0.6324,  0.0382],
        [ 0.7232, -0.3445, -0.0323],
        [ 1.2661, -0.5705, -0.0669],
        [ 1.2106, -0.1294, -0.6240],
        [ 1.3319, -0.2753, -0.2751],
        [ 1.1975, -1.0339,  0.6217],
        [ 0.5951,  0.8437, -0.3682],
        [-0.3029, -1.2908,  2.1659],
        [ 0.7651,  0.4641, -0.2553],
        [ 0.2875,  0.5520, -0.5864],
        [-1.0504, -1.5082,  2.4916],
        [-0.6660, -1.3297,  2.3516],
        [ 1.8192, -0.6324, -0.8994],
        [ 0.0173, -1.5246,  1.7722],
        [ 0.7952,  0.1007, -0.4321],
        [-1.0221, -1.6971,  2.7107],
        [ 0.9325, -1.0917,  0.2691],
        [-0.4417, -1.2770,  1.9423],
        [-0.6286, -1.6404,  2.3438],
        [-0.69

(Epoch 8) TRAIN LOSS:0.3980 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:04<00:01,  4.88it/s]

SequenceClassifierOutput(loss=tensor(0.3915, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0053, -1.6812,  1.8202],
        [-1.0576, -1.1466,  2.3645],
        [ 1.6131, -0.2668, -0.5003],
        [ 1.0350,  0.1235, -0.5510],
        [-0.6388, -1.7237,  2.3488],
        [-0.5292, -1.6853,  2.0391],
        [-0.0972, -0.9030,  0.6810],
        [-0.3642, -1.2421,  1.7725],
        [-1.1491, -1.2732,  2.7117],
        [ 0.7451, -0.1398, -0.4982],
        [ 0.4156, -1.5567,  1.1541],
        [-0.5213, -1.3591,  2.1000],
        [-0.7522, -1.3192,  2.1552],
        [-0.1863, -1.5392,  2.0838],
        [ 1.5239, -0.7187, -0.7555],
        [ 1.3283, -0.5380, -0.3455],
        [ 0.4446, -0.3916,  0.6482],
        [ 0.1155, -1.9574,  1.4640],
        [ 1.0260,  0.1962, -0.5086],
        [ 1.1239, -0.4694, -0.3293],
        [-0.8044, -1.4524,  2.3068],
        [ 1.5080, -0.4703, -0.3560],
        [-0.5283, -1.5059,  2.3931],
        [-0.7699, -1.4341,  2.1531],
        [-0.52

(Epoch 8) TRAIN LOSS:0.3927 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:04<00:00,  4.71it/s]

SequenceClassifierOutput(loss=tensor(0.2865, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.4519, -0.5535, -0.2781],
        [-0.9196, -1.4700,  2.3146],
        [ 0.7009, -0.0136, -0.4269],
        [-0.9758, -1.8080,  2.4001],
        [ 1.3412, -0.2991, -0.5971],
        [ 1.1068, -0.2116, -0.4936],
        [ 0.9315,  0.2299, -0.2064],
        [ 1.1520, -1.5177,  0.8301],
        [-0.2944, -1.3870,  2.1203],
        [-0.8878, -1.3162,  2.1117],
        [ 1.7097, -0.8631, -0.7654],
        [ 1.1423, -0.9127, -0.5907],
        [-0.7800, -1.7370,  2.2256],
        [-0.9448, -1.4777,  2.4871],
        [-0.5748, -1.4878,  2.3901],
        [-0.6985, -1.6703,  2.1495],
        [ 1.6805, -1.0456, -0.6159],
        [-0.1924, -1.5422,  1.9529],
        [ 0.2880,  1.0087, -0.3516],
        [-0.6070, -1.5202,  2.1267],
        [ 1.8629, -0.7779, -0.6965],
        [-0.2188, -1.5318,  1.5727],
        [ 1.2745, -0.1041, -0.8886],
        [ 2.0795, -0.7899, -0.8604],
        [-0.51

(Epoch 8) TRAIN LOSS:0.3903 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:04<00:00,  4.74it/s]

SequenceClassifierOutput(loss=tensor(0.3414, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0262, -0.5888,  0.4295],
        [-0.4921, -1.4908,  2.3578],
        [ 0.4180,  0.5628, -0.4154],
        [-0.8261, -1.4258,  2.2041],
        [-0.8770, -1.8288,  2.3021],
        [ 1.9924, -0.7815, -0.6289],
        [ 0.5073, -0.4524,  0.3750],
        [ 0.2342,  0.7111, -0.5196],
        [-0.4922, -1.5184,  2.3452],
        [-0.4811, -1.3707,  2.2187],
        [-0.5415, -1.5582,  2.5312],
        [-0.6884, -1.5911,  2.2769],
        [-0.0592, -1.9614,  1.9174],
        [-0.4794, -1.7257,  1.9376],
        [ 0.9473, -0.5018, -0.2883],
        [-0.5759, -1.1743,  2.1265],
        [-0.5909, -1.6866,  2.4263],
        [ 0.2981, -1.1383,  1.2570],
        [ 0.3903,  0.1466, -0.3283],
        [ 0.1246,  0.4486,  0.2416],
        [ 0.6654, -1.5884,  1.2427],
        [-0.3078, -1.5296,  2.3834],
        [-0.8431, -1.2907,  2.3509],
        [ 1.4456, -1.2539,  0.4500],
        [-0.41

(Epoch 8) TRAIN LOSS:0.4022 LR:0.00000300:  96%|██████████████████████████████████████▍ | 24/25 [00:05<00:00,  4.67it/s]

SequenceClassifierOutput(loss=tensor(0.5981, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-8.5998e-01, -1.5930e+00,  2.4774e+00],
        [ 7.1135e-01,  4.2459e-01, -7.8548e-01],
        [ 1.1800e+00,  6.2367e-03, -6.9297e-01],
        [ 1.3477e+00, -9.6046e-01,  3.1295e-01],
        [-9.1423e-01, -1.7246e+00,  2.4645e+00],
        [-1.2060e+00, -1.3873e+00,  2.2324e+00],
        [ 5.8192e-02,  2.2217e-02, -5.3346e-02],
        [ 1.2863e+00, -5.3035e-01,  3.8994e-02],
        [ 2.1739e-01,  2.8078e-01,  1.0833e-01],
        [-3.3501e-01, -1.2089e+00,  1.6439e+00],
        [ 1.4421e+00, -3.3771e-01, -3.5798e-02],
        [ 5.1878e-02, -9.7970e-02,  3.6026e-01],
        [ 1.0454e+00, -4.5654e-01,  1.0308e-01],
        [-6.0023e-03,  3.5114e-01, -2.5712e-01],
        [ 4.4765e-01, -1.0005e+00,  1.5871e+00],
        [-9.0671e-01, -1.5393e+00,  2.4820e+00],
        [-1.5253e-01, -1.2986e+00,  1.8266e+00],
        [ 1.3401e+00, -1.0708e+00, -1.9163e-01],
        [-6.0616e-01

(Epoch 8) TRAIN LOSS:0.4066 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:05<00:00,  4.42it/s]

(Epoch 8) TRAIN LOSS:0.4066 ACC:0.86 F1:0.80 REC:0.77 PRE:0.86 LR:0.00000300



  0%|                                                                                            | 0/25 [00:00<?, ?it/s]

SequenceClassifierOutput(loss=tensor(0.3386, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.3305, -0.3262, -0.7163],
        [ 1.1817, -1.2350,  0.4769],
        [ 0.2055, -0.9623,  0.6468],
        [ 0.2155,  0.0217,  0.0074],
        [ 1.4317, -0.3156, -0.5818],
        [-0.4693, -1.6028,  2.2722],
        [-0.0854, -1.1442,  1.5525],
        [-0.5468, -1.4164,  2.3477],
        [-0.3896, -1.6690,  1.9441],
        [-0.8383, -1.2071,  2.4006],
        [-0.5530, -1.4272,  2.3801],
        [-0.4036, -1.5256,  2.1832],
        [-0.5778, -1.7401,  2.2849],
        [ 0.9332, -0.2352, -0.2193],
        [-0.5936, -1.5804,  2.1358],
        [-0.3701, -1.8294,  2.0353],
        [ 1.4143, -1.0199, -0.4846],
        [ 0.7519, -0.1935, -0.1466],
        [-0.3582, -1.5678,  1.8894],
        [ 1.9757, -0.5416, -0.7755],
        [ 0.8804,  0.0844, -0.5197],
        [ 1.9080, -0.7660, -0.3973],
        [ 1.3739,  0.1077, -0.5577],
        [ 1.0109,  0.7594, -0.9630],
        [-0.11

(Epoch 9) TRAIN LOSS:0.3386 LR:0.00000300:   4%|█▋                                       | 1/25 [00:00<00:11,  2.15it/s]

SequenceClassifierOutput(loss=tensor(0.3125, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.9447, -0.8329,  2.0315],
        [-0.4851, -1.1824,  1.9601],
        [ 0.3254, -0.5419,  0.4693],
        [-0.1118, -0.7566,  1.0168],
        [ 0.4027, -0.6485,  0.8558],
        [-0.5169, -1.3191,  2.1539],
        [-0.5594, -1.4343,  1.9135],
        [ 1.6112, -0.1697, -0.5327],
        [-0.0446, -1.2225,  1.6813],
        [-0.5719, -1.6616,  2.3933],
        [ 0.3697, -0.3077,  0.6090],
        [-0.4317, -1.5346,  1.9776],
        [-0.7823, -1.9342,  2.2418],
        [-0.6939, -1.3965,  2.2776],
        [ 1.2093, -0.0822, -0.3997],
        [-0.6145, -1.3973,  2.1480],
        [ 1.0953, -0.6206, -0.3678],
        [-1.0654, -1.4337,  1.9836],
        [-0.9717, -1.3247,  2.1263],
        [ 0.3835, -0.4788,  0.2627],
        [-0.0208,  1.0169, -0.3643],
        [-0.9941, -0.9083,  2.2580],
        [-0.6647, -1.5258,  2.2730],
        [ 0.3377,  0.6842, -0.4377],
        [ 0.99

(Epoch 9) TRAIN LOSS:0.3256 LR:0.00000300:   8%|███▎                                     | 2/25 [00:00<00:09,  2.39it/s]

SequenceClassifierOutput(loss=tensor(0.4551, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.9625, -0.6208, -0.6012],
        [ 0.0701,  0.6149,  0.1517],
        [ 0.5360,  0.1089, -0.1773],
        [-0.7810, -1.5004,  2.2827],
        [-0.4038, -1.8537,  2.1215],
        [-0.3231, -1.5206,  2.1577],
        [ 1.6563, -0.6631, -0.6985],
        [-0.7617, -1.6551,  2.5163],
        [ 0.8222, -1.6325,  1.0088],
        [ 0.0079,  0.9237, -0.4497],
        [ 0.0518, -1.5559,  1.8143],
        [-0.5997, -1.1512,  2.1660],
        [-0.7802, -1.5788,  2.2375],
        [ 0.8970, -0.8017,  0.5542],
        [ 0.0940, -1.7461,  2.0203],
        [-0.5220, -1.4201,  2.2447],
        [ 0.9315, -0.1232, -0.6224],
        [ 0.6315,  0.4335,  0.0503],
        [ 1.2647, -1.0168, -0.1590],
        [ 0.8091, -0.1916, -0.3262],
        [-0.9901, -1.6918,  2.5987],
        [-0.7811, -1.7076,  2.1740],
        [ 1.2242, -0.3762, -0.6143],
        [-0.0516, -1.8925,  1.6931],
        [-0.99

(Epoch 9) TRAIN LOSS:0.3687 LR:0.00000300:  12%|████▉                                    | 3/25 [00:01<00:08,  2.50it/s]

SequenceClassifierOutput(loss=tensor(0.2973, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2709, -1.4025,  1.7000],
        [-0.8612, -1.4883,  2.3441],
        [-0.4358, -1.5137,  2.3110],
        [ 0.9379, -0.2491, -0.1747],
        [ 0.4369, -1.5343,  1.6080],
        [ 1.0671, -0.9660,  0.2280],
        [ 1.5277, -0.4506, -0.9721],
        [ 1.1902, -0.3572, -0.2371],
        [-0.3407, -0.8449,  1.4877],
        [ 0.4333,  1.0000, -0.4212],
        [-0.6465, -1.5522,  2.0589],
        [-0.5981, -1.3872,  2.0898],
        [-0.9273, -1.6117,  2.0170],
        [ 1.6398, -0.5164, -0.3777],
        [-0.7543, -1.4402,  2.3964],
        [-0.8951, -1.2308,  2.5826],
        [-0.9272, -1.1946,  2.4297],
        [ 1.5690, -0.8476, -0.1692],
        [-0.5793, -1.2664,  2.1355],
        [ 1.9829, -0.4508, -0.8464],
        [ 1.0690, -0.6117, -0.4925],
        [ 0.4638, -1.2613,  1.0046],
        [-0.2608, -1.4396,  2.0448],
        [ 1.5655, -1.0801, -0.7945],
        [ 0.22

(Epoch 9) TRAIN LOSS:0.3509 LR:0.00000300:  16%|██████▌                                  | 4/25 [00:01<00:08,  2.54it/s]

SequenceClassifierOutput(loss=tensor(0.4916, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 6.1471e-01,  5.6628e-01, -8.1408e-01],
        [ 6.2529e-01, -1.0990e+00,  9.2577e-01],
        [ 8.7378e-01, -3.2934e-01, -2.9974e-01],
        [ 4.9731e-01, -3.7260e-01,  5.3318e-01],
        [-2.2406e-01, -1.5887e+00,  2.0474e+00],
        [-6.8700e-01, -1.5623e+00,  2.3505e+00],
        [ 1.6082e+00, -1.4753e-01, -3.5311e-01],
        [ 2.2585e-01, -1.6543e+00,  1.7368e+00],
        [-9.0356e-01, -1.5775e+00,  2.3992e+00],
        [-2.8394e-01, -5.0746e-01,  1.2922e+00],
        [-7.6917e-01, -1.3788e+00,  2.2813e+00],
        [ 1.0216e+00, -3.4174e-01, -2.4703e-01],
        [ 7.5976e-01, -2.9501e-01,  5.9687e-02],
        [ 4.5694e-01,  1.0224e+00, -3.6858e-01],
        [ 1.2998e+00, -3.6347e-01, -3.9849e-01],
        [ 1.1617e+00, -1.5880e-01, -8.3311e-01],
        [ 1.5664e+00, -4.0277e-01, -3.1339e-01],
        [ 1.8519e+00,  4.2301e-02, -6.9542e-01],
        [-4.1549e-01

(Epoch 9) TRAIN LOSS:0.3790 LR:0.00000300:  20%|████████▏                                | 5/25 [00:01<00:07,  2.66it/s]

SequenceClassifierOutput(loss=tensor(0.3788, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8421, -0.2534,  0.1141],
        [ 1.3222, -0.4555, -0.3328],
        [ 1.6695, -0.7404, -0.1927],
        [ 0.4818,  0.0824, -0.1923],
        [ 0.9113, -0.1568, -0.3182],
        [-0.3772, -1.5594,  1.9910],
        [-0.0136, -1.3063,  1.8685],
        [-0.7440, -1.8334,  2.5333],
        [-0.6663, -1.3638,  2.1527],
        [ 0.5863,  0.7590, -0.4648],
        [ 1.4321, -0.7368, -0.5032],
        [-0.6395, -1.7021,  2.5465],
        [ 0.5671, -1.3922,  1.2880],
        [-0.9196, -1.4433,  2.5199],
        [-0.7552, -1.5388,  2.2412],
        [-0.6597, -1.6572,  2.2371],
        [-0.8007, -1.5224,  2.4570],
        [-0.7256, -1.6364,  2.3108],
        [-0.7532, -1.3501,  2.5548],
        [ 1.3263, -0.1004, -0.6279],
        [ 0.3961, -1.1043,  1.2425],
        [ 1.1551, -0.3770, -0.5177],
        [-0.2997, -1.5086,  2.1991],
        [-0.6391, -1.5838,  2.0962],
        [-0.70

(Epoch 9) TRAIN LOSS:0.3790 LR:0.00000300:  24%|█████████▊                               | 6/25 [00:02<00:07,  2.65it/s]

SequenceClassifierOutput(loss=tensor(0.3084, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.8098, -0.6875, -0.8688],
        [-0.8813, -1.3278,  2.2290],
        [-0.8975, -1.2172,  2.1619],
        [-0.6374, -1.8067,  2.3676],
        [ 1.3779, -0.0442, -0.6100],
        [-0.4887, -1.7058,  2.1540],
        [-0.0385, -2.0135,  1.6980],
        [-0.2219, -0.0782,  0.4342],
        [ 1.3522, -0.2691, -0.5514],
        [ 0.7210,  0.6831, -0.8322],
        [ 1.3328, -0.4211, -0.6640],
        [ 2.1084, -0.6720, -0.5357],
        [-0.3814, -1.3013,  2.2725],
        [ 0.9721, -0.2925, -0.4340],
        [ 1.5159, -0.6037, -0.9751],
        [-0.7605, -1.7056,  2.1035],
        [ 1.0803, -0.7523,  0.3381],
        [ 1.3329,  0.1996, -0.7007],
        [-0.0164,  0.6483, -0.4676],
        [-0.2291, -0.5341,  0.5421],
        [ 1.8103, -1.4523,  0.1027],
        [-0.3752, -1.5569,  2.3863],
        [-0.0468, -0.8602,  0.8647],
        [ 0.3576, -1.7986,  1.5791],
        [ 0.10

(Epoch 9) TRAIN LOSS:0.3689 LR:0.00000300:  28%|███████████▍                             | 7/25 [00:02<00:06,  2.59it/s]

SequenceClassifierOutput(loss=tensor(0.4131, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-1.2876, -0.8077,  1.8407],
        [-0.6897, -1.5008,  2.0581],
        [-0.5809, -1.6438,  2.2332],
        [ 0.7328, -1.9138,  1.3337],
        [-1.0560, -1.3035,  2.0524],
        [-0.0883, -1.6062,  2.2509],
        [-0.8953, -1.7554,  2.5285],
        [-0.5639, -1.6612,  2.5999],
        [-0.4027, -1.7104,  1.8129],
        [ 0.4988, -0.7005,  0.6613],
        [ 1.0420, -0.1803, -0.4403],
        [-0.1848, -1.5242,  1.7692],
        [ 1.1889,  0.0684, -0.6194],
        [ 1.5787, -1.6556,  0.5942],
        [-0.4549, -1.5971,  2.2637],
        [ 1.2740,  0.2220, -0.9025],
        [ 1.5134,  0.1877, -1.0075],
        [ 1.6730, -0.8251, -0.6341],
        [ 1.6980, -0.3064, -0.4453],
        [ 0.5672,  0.2325, -0.2232],
        [ 1.2991, -1.4343,  0.8653],
        [-0.8520, -1.4189,  2.7123],
        [-0.1335,  0.9070, -0.1667],
        [-0.8408, -1.4742,  2.0548],
        [-0.74

(Epoch 9) TRAIN LOSS:0.3744 LR:0.00000300:  32%|█████████████                            | 8/25 [00:03<00:06,  2.56it/s]

SequenceClassifierOutput(loss=tensor(0.4091, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7331, -1.3281,  0.8153],
        [ 0.0542,  0.9797, -0.1745],
        [ 1.2035, -0.1879, -1.0508],
        [ 0.1851, -1.4231,  1.3331],
        [-0.7897, -1.5792,  2.3437],
        [ 0.6685, -0.8995, -0.0189],
        [-0.9978, -1.4453,  2.4833],
        [-1.0177, -1.6010,  2.4377],
        [ 1.1177, -0.3468, -0.6772],
        [ 1.7763, -0.5172, -0.4589],
        [ 0.8717, -1.2107,  0.5757],
        [-0.7778, -1.5084,  2.5011],
        [-0.0905, -1.3289,  1.8098],
        [-0.5010, -0.5783,  1.0120],
        [-0.5910, -1.4233,  2.2252],
        [ 0.7831, -0.4817,  0.0369],
        [ 1.4750, -0.9328, -0.5863],
        [ 1.0815, -1.0536,  0.7643],
        [-0.0036,  0.9857, -0.1826],
        [ 0.3440, -0.5817,  0.3885],
        [ 0.5994, -1.3774,  0.6610],
        [-0.9095, -1.4318,  2.3909],
        [-0.3040, -1.6295,  1.8852],
        [ 1.3201, -0.3664, -0.3374],
        [-0.14

(Epoch 9) TRAIN LOSS:0.3954 LR:0.00000300:  40%|████████████████                        | 10/25 [00:03<00:05,  2.82it/s]

SequenceClassifierOutput(loss=tensor(0.5495, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2295,  0.6535, -0.2806],
        [ 0.4675, -2.0019,  1.4457],
        [-0.7132, -1.2828,  2.0033],
        [-0.1670, -0.9004,  1.4563],
        [ 0.8955, -0.5160, -0.1185],
        [-0.7669, -1.7427,  2.4342],
        [ 1.9636, -0.1927, -0.4328],
        [ 1.6789, -0.6404, -0.7177],
        [ 0.9707, -0.7200, -0.0575],
        [-0.2556, -1.8242,  2.1087],
        [ 1.0794, -0.4441, -0.2194],
        [-0.9048, -1.1783,  2.4367],
        [-0.8719, -1.1220,  2.3271],
        [-0.3062, -1.4545,  2.2319],
        [-0.1105,  1.1182, -0.6105],
        [-0.8446, -1.8163,  2.4280],
        [-0.8343, -1.2994,  2.2510],
        [-0.5481, -1.6883,  2.1120],
        [ 1.2167,  0.1294, -0.5817],
        [-0.6389, -1.3429,  2.6074],
        [ 0.1569, -1.6655,  1.9742],
        [ 1.7735, -0.1989, -1.1968],
        [ 0.9942, -1.1701,  0.6135],
        [ 0.1540, -1.4790,  1.2088],
        [-1.15

(Epoch 9) TRAIN LOSS:0.4056 LR:0.00000300:  44%|█████████████████▌                      | 11/25 [00:04<00:04,  3.22it/s]

SequenceClassifierOutput(loss=tensor(0.5070, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0064, -1.7911,  2.0093],
        [-0.1870, -1.5742,  2.0387],
        [ 1.2589, -0.4788, -0.2449],
        [-0.9612, -1.7215,  2.5212],
        [-0.4885, -1.4575,  1.8228],
        [-0.2577, -1.5968,  2.0564],
        [-0.8148, -1.5402,  2.5260],
        [ 0.9677, -1.4298,  0.9910],
        [-0.4564, -1.0893,  1.9401],
        [ 0.2263, -1.6054,  1.8467],
        [ 0.5371, -0.9935,  0.5855],
        [-1.0187, -1.7005,  2.4186],
        [ 1.0372, -0.6468,  0.2828],
        [-0.2311, -0.1690,  0.4442],
        [ 1.0378,  0.2101, -0.8482],
        [ 1.2105, -1.4027,  0.2898],
        [ 1.1935,  0.1120, -0.7937],
        [ 0.7995, -1.5404,  1.1481],
        [ 1.2087, -0.8374,  0.4339],
        [ 1.3189, -0.1923, -0.5072],
        [ 0.9739, -0.4637, -0.1870],
        [-0.1090, -1.8041,  2.0058],
        [-0.7886, -1.0218,  1.8908],
        [-0.9863, -1.3932,  2.3094],
        [ 1.14

(Epoch 9) TRAIN LOSS:0.3958 LR:0.00000300:  48%|███████████████████▏                    | 12/25 [00:04<00:03,  3.58it/s]

SequenceClassifierOutput(loss=tensor(0.2883, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.6900, -0.5961, -0.3139],
        [ 0.1747, -1.5227,  1.4876],
        [-0.7431, -1.6132,  2.3499],
        [-0.8222, -1.5298,  2.5236],
        [-0.4196, -1.6037,  2.3673],
        [ 0.3410, -1.6265,  1.5010],
        [ 1.1793, -0.2835, -0.5732],
        [ 0.6135, -1.3659,  1.2582],
        [ 0.7419, -0.5886, -0.1062],
        [-0.7001, -1.0309,  2.0861],
        [-0.7485, -1.1635,  2.0558],
        [ 0.9161,  0.0384, -0.4659],
        [-0.6284, -1.6708,  2.0937],
        [ 1.4617, -0.1586, -0.7239],
        [ 1.6103, -0.7909, -0.2358],
        [-0.8967, -1.6684,  2.3506],
        [-0.1333, -1.5765,  2.0634],
        [-0.7067, -1.4601,  2.2061],
        [ 0.0234,  0.0276,  0.7798],
        [ 1.7448, -0.8257, -1.3429],
        [ 1.4850, -0.5676, -0.9122],
        [-0.0574, -1.0105,  1.8085],
        [-0.8311, -1.3975,  1.9384],
        [-0.8291, -1.5897,  2.4096],
        [ 0.06

(Epoch 9) TRAIN LOSS:0.3925 LR:0.00000300:  52%|████████████████████▊                   | 13/25 [00:04<00:03,  3.81it/s]

SequenceClassifierOutput(loss=tensor(0.3532, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.3369,  0.2823, -1.0825],
        [-0.6153, -1.3668,  2.2732],
        [ 0.2455, -0.6736,  1.0373],
        [-0.8306, -1.3386,  1.9742],
        [-0.1599, -0.5349,  0.9244],
        [ 1.6841, -0.4701, -1.1869],
        [-1.1525, -1.6471,  2.6618],
        [-0.2629, -1.6274,  1.8820],
        [-0.7887, -1.5130,  2.3891],
        [-0.7195, -1.1813,  2.1437],
        [-0.7428, -1.7297,  2.7112],
        [-0.9074, -1.6752,  2.6611],
        [ 0.8363,  0.2295, -0.0564],
        [ 1.2508, -1.6738,  0.8719],
        [-0.7726, -1.6062,  2.3531],
        [ 1.9197, -0.0636, -0.7134],
        [-0.8950, -1.5564,  2.0137],
        [ 0.8628,  0.3325, -0.1025],
        [ 0.2845,  0.7603, -0.1152],
        [-0.4113, -1.2227,  1.9202],
        [-1.0436, -1.7701,  2.4386],
        [-0.3899, -1.2283,  1.7674],
        [ 0.0939, -1.6383,  2.0480],
        [-1.0007, -1.6786,  2.4674],
        [ 0.88

(Epoch 9) TRAIN LOSS:0.3911 LR:0.00000300:  56%|██████████████████████▍                 | 14/25 [00:04<00:02,  3.98it/s]

SequenceClassifierOutput(loss=tensor(0.3731, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2058,  0.6980, -0.2330],
        [-0.0843, -1.9133,  1.9813],
        [ 1.9773, -0.5770, -0.7356],
        [-0.5344, -1.3309,  2.0679],
        [-0.8453, -1.3991,  2.0213],
        [ 0.4501,  0.6461, -0.3326],
        [ 1.3569, -1.6782,  0.7633],
        [ 1.1269, -0.4084, -0.6858],
        [-0.5910, -1.5683,  2.1256],
        [-0.5411, -1.5729,  2.0566],
        [ 0.5116, -0.0370, -0.4773],
        [-0.7809, -1.3199,  2.1515],
        [ 1.5544, -0.3510, -0.7252],
        [ 1.1427, -0.3818, -0.9751],
        [-0.1529,  0.0470,  0.7148],
        [-0.6012, -1.2743,  1.7722],
        [-0.0459,  0.7277,  0.0904],
        [ 1.0262,  0.0221, -0.0098],
        [-0.3128, -1.6157,  2.0596],
        [ 1.6693, -0.6771, -0.6097],
        [ 0.9163, -0.4576,  0.0326],
        [ 1.4294, -0.6000,  0.0505],
        [ 0.3807,  1.4755, -0.3964],
        [ 1.8419, -0.7590, -0.8797],
        [-0.41

(Epoch 9) TRAIN LOSS:0.3816 LR:0.00000300:  60%|████████████████████████                | 15/25 [00:04<00:02,  4.24it/s]

SequenceClassifierOutput(loss=tensor(0.2481, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.1080, -0.2714, -0.8989],
        [ 2.2078, -0.3832, -0.8425],
        [ 1.6345, -0.7008, -0.8863],
        [-0.8142, -1.4658,  2.4387],
        [ 1.5802, -0.1267, -1.0969],
        [-0.7461, -1.3535,  2.4288],
        [ 1.8685, -0.1753, -0.7231],
        [-0.7926, -1.1905,  1.9575],
        [-0.6617, -1.9246,  2.0826],
        [-0.7526, -1.4214,  2.2235],
        [ 0.3484,  0.5593, -0.6107],
        [ 0.3166, -1.4248,  1.2814],
        [-0.6977, -1.0989,  2.0290],
        [ 1.1319, -0.0163, -0.6253],
        [ 1.8603, -0.5069, -0.7902],
        [-0.4291, -1.5660,  2.1177],
        [-0.3884, -1.6821,  2.3519],
        [ 1.3716, -0.3652, -0.4522],
        [-0.0445, -0.2367,  0.6301],
        [-0.6146, -1.9020,  2.3129],
        [ 1.8583, -0.1360, -0.7516],
        [ 1.8236, -0.6948, -0.6920],
        [ 1.6261, -0.5971, -0.5999],
        [-0.8596, -1.8333,  2.6246],
        [ 1.49

(Epoch 9) TRAIN LOSS:0.3802 LR:0.00000300:  64%|█████████████████████████▌              | 16/25 [00:05<00:02,  4.37it/s]

SequenceClassifierOutput(loss=tensor(0.3591, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.2726e+00, -1.8310e-03, -6.8563e-01],
        [ 1.9718e-01, -4.7952e-01,  3.9780e-01],
        [-9.2198e-01, -1.4850e+00,  2.5089e+00],
        [ 1.3265e+00, -2.6721e-03, -5.9623e-01],
        [ 6.6012e-01, -1.8470e+00,  1.3440e+00],
        [ 9.4016e-01,  8.0927e-01, -8.9516e-01],
        [ 2.6432e-01,  9.1505e-01, -2.8789e-01],
        [-8.8769e-01, -1.5402e+00,  2.3797e+00],
        [ 2.8444e-01, -7.9961e-01,  7.2051e-01],
        [ 3.4634e-01, -1.4923e+00,  1.2827e+00],
        [-7.2742e-01, -9.2872e-01,  1.9364e+00],
        [ 2.8464e-01, -1.0933e+00,  1.0518e+00],
        [ 7.4427e-01, -2.6659e-01, -1.8938e-01],
        [-6.4549e-01, -1.5203e+00,  1.8995e+00],
        [ 1.4298e-01, -1.7878e+00,  1.7185e+00],
        [ 8.2576e-01, -1.8554e+00,  1.1793e+00],
        [-6.4185e-01, -1.5532e+00,  2.4545e+00],
        [-8.4372e-01, -1.5202e+00,  2.2602e+00],
        [-7.6861e-01

(Epoch 9) TRAIN LOSS:0.3724 LR:0.00000300:  68%|███████████████████████████▏            | 17/25 [00:05<00:01,  4.53it/s]

SequenceClassifierOutput(loss=tensor(0.2478, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.5987, -1.6806,  2.2791],
        [-0.6959, -1.5452,  2.1115],
        [-1.3122, -1.5642,  2.4337],
        [-0.3022, -1.3124,  1.4107],
        [ 0.5268,  1.2578, -0.9288],
        [-1.0138, -1.4330,  2.3483],
        [ 1.0250,  1.1195, -1.0195],
        [-0.9308, -1.4800,  2.3047],
        [ 0.2172, -1.8122,  1.3939],
        [ 1.4812, -0.4545, -0.3604],
        [-0.3791, -1.4933,  2.0387],
        [-0.4564, -0.7027,  1.6352],
        [ 0.0429, -0.0749,  0.1924],
        [ 0.0423, -0.6441,  1.1824],
        [ 1.8041, -0.8192, -0.9077],
        [ 1.2985, -0.3816, -0.6248],
        [-0.8794, -1.5927,  2.5361],
        [-0.5544, -1.7537,  2.0499],
        [-0.8799, -1.5110,  2.0703],
        [-0.3245, -1.7889,  2.0703],
        [-0.6973, -1.5493,  2.1895],
        [ 1.2307, -0.4979, -0.4680],
        [-0.5542, -1.2277,  1.7515],
        [-1.0203, -1.5783,  2.5904],
        [ 2.17

(Epoch 9) TRAIN LOSS:0.3743 LR:0.00000300:  72%|████████████████████████████▊           | 18/25 [00:05<00:01,  4.65it/s]

SequenceClassifierOutput(loss=tensor(0.4064, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2336, -1.6805,  2.1518],
        [-0.6777, -1.1765,  1.6001],
        [ 0.8195, -0.3824, -0.2959],
        [-0.2450, -1.1716,  1.7164],
        [ 1.7545, -0.2726, -0.5643],
        [-0.2570, -1.5171,  2.1160],
        [ 1.3723,  0.0651, -0.4322],
        [ 1.0429,  0.3935, -0.8190],
        [ 1.2601, -0.4563, -0.5833],
        [-0.7141, -1.6357,  2.4106],
        [-0.0095,  0.7903, -0.6132],
        [ 1.7506, -0.6035, -0.5413],
        [ 1.5232, -0.5994, -0.4538],
        [ 1.4218, -0.1378, -0.9463],
        [ 1.7907, -0.3021, -0.9981],
        [ 0.7229,  0.8660, -0.7794],
        [-0.6883, -1.0708,  2.0813],
        [-0.4429, -1.6557,  2.2823],
        [ 0.5128,  0.3884, -0.1751],
        [ 1.7566, -0.2138, -0.8890],
        [-0.4159, -0.3088,  0.5740],
        [ 1.4958, -0.6068, -0.8587],
        [ 0.5451, -1.4126,  0.8813],
        [ 1.3096, -0.6799, -0.7778],
        [-0.92

(Epoch 9) TRAIN LOSS:0.3722 LR:0.00000300:  76%|██████████████████████████████▍         | 19/25 [00:05<00:01,  4.57it/s]

SequenceClassifierOutput(loss=tensor(0.3353, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0926, -0.4373, -0.2740],
        [-0.4529, -1.3255,  2.2601],
        [ 0.0952,  1.2094, -0.6428],
        [-0.5545, -1.5123,  2.5484],
        [ 0.0318, -1.5951,  1.3841],
        [ 0.9504, -1.3086,  0.6117],
        [ 0.9181,  0.0970, -0.6414],
        [-0.8425, -1.6014,  2.5168],
        [-0.6547, -1.3994,  1.8985],
        [ 0.0652, -1.5999,  1.4981],
        [ 1.2640,  0.6638, -1.0533],
        [ 1.6989, -0.0241, -0.8863],
        [ 0.8370, -1.3216,  0.9464],
        [ 1.9419, -0.3274, -0.6644],
        [-0.1845, -0.8050,  2.0465],
        [-0.8513, -1.6160,  2.6486],
        [-0.2590, -1.4163,  1.7365],
        [ 2.0526, -0.5076, -0.5360],
        [ 1.3857,  0.1037, -0.9999],
        [-0.3259, -1.8669,  2.0796],
        [-0.5493, -1.4681,  2.4457],
        [-0.1363, -1.6751,  2.1438],
        [-0.4248, -1.5015,  1.8591],
        [-0.5589, -1.5999,  2.1770],
        [ 0.22

(Epoch 9) TRAIN LOSS:0.3742 LR:0.00000300:  80%|████████████████████████████████        | 20/25 [00:05<00:01,  4.57it/s]

SequenceClassifierOutput(loss=tensor(0.4121, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0109, -1.1162,  0.4465],
        [-0.9512, -1.4436,  2.1255],
        [-0.9267, -1.3215,  2.1674],
        [ 0.0246,  0.9666, -0.0454],
        [ 0.1954, -1.4920,  1.8110],
        [-0.4774, -1.9541,  2.1503],
        [-0.9751, -1.6731,  2.2697],
        [-0.0849,  0.7330, -0.6424],
        [-0.5502, -0.3860,  1.1135],
        [ 0.0996, -0.1471,  0.4178],
        [ 0.7068,  0.3289, -0.7943],
        [ 1.2863, -0.9413, -0.3025],
        [-0.1318,  0.9820, -0.1652],
        [-0.7760, -1.3854,  2.0970],
        [ 0.6364, -0.3877,  0.4581],
        [ 1.3445,  0.1210, -0.4084],
        [ 2.1173, -0.7937, -1.1405],
        [-1.0079, -1.3471,  2.0937],
        [ 1.3908, -0.6207, -0.8673],
        [ 0.1209,  0.5963,  0.0763],
        [ 0.0207, -1.4864,  1.8082],
        [ 1.2125, -1.3050,  0.0459],
        [-0.6982, -1.4428,  2.6448],
        [-0.7485, -1.2031,  1.8332],
        [-0.84

(Epoch 9) TRAIN LOSS:0.3734 LR:0.00000300:  84%|█████████████████████████████████▌      | 21/25 [00:06<00:00,  4.52it/s]

SequenceClassifierOutput(loss=tensor(0.3571, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.9896, -0.8050, -1.0593],
        [ 0.8417, -1.6667,  1.0367],
        [-0.7513, -1.5640,  2.6057],
        [-0.5015, -1.4665,  2.3759],
        [-0.3176, -1.4832,  1.9255],
        [-0.2084, -1.2263,  1.1905],
        [ 0.0356,  0.5052, -0.2853],
        [-0.3831, -1.5075,  2.4217],
        [-0.0190, -1.2772,  0.9591],
        [ 1.2424, -0.2850, -0.5966],
        [ 0.4498, -0.3639,  0.0436],
        [ 1.5278, -1.2527, -0.0174],
        [ 1.8907, -0.1604, -0.8946],
        [ 0.5210, -1.6357,  1.6154],
        [-0.0358,  1.0098, -0.6927],
        [ 1.9708, -0.4873, -1.0407],
        [ 1.5921, -0.4104, -0.3555],
        [ 1.8044, -0.8165, -1.0001],
        [ 1.7522,  0.4176, -1.1606],
        [-0.6500, -1.0612,  1.6333],
        [-0.7811, -1.8533,  2.3052],
        [-0.8507, -1.1710,  2.3166],
        [-0.6105, -1.3728,  2.4142],
        [-1.2713, -1.6083,  2.4372],
        [ 0.18

(Epoch 9) TRAIN LOSS:0.3688 LR:0.00000300:  88%|███████████████████████████████████▏    | 22/25 [00:06<00:00,  4.75it/s]

SequenceClassifierOutput(loss=tensor(0.2720, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.6749, -0.6876, -0.4247],
        [-0.9540, -1.7801,  2.7564],
        [ 0.2424,  0.1653,  0.0795],
        [-0.9124, -1.3273,  2.0662],
        [-0.4467, -1.4743,  2.2374],
        [ 1.7202, -0.4948, -0.9618],
        [ 1.5654, -0.5080, -0.9473],
        [-1.0490, -1.6265,  2.4169],
        [-0.5539, -0.8615,  1.7003],
        [-0.7154, -1.4749,  2.0089],
        [-0.4679, -1.4153,  2.1565],
        [-0.5719, -1.8583,  2.0929],
        [ 0.5706,  0.9196, -0.1578],
        [ 0.3032, -0.2762,  0.4163],
        [ 0.4739,  0.7466, -0.7986],
        [-0.7793, -1.7904,  2.3362],
        [ 1.9868, -0.5427, -0.6383],
        [-1.3800, -1.6021,  2.5808],
        [-0.5722, -1.1993,  2.0604],
        [ 0.1864,  0.4913, -0.2793],
        [ 1.8758, -0.8091, -0.8805],
        [-0.6470, -1.4927,  2.3501],
        [-1.1221, -1.2910,  2.2226],
        [ 0.7023, -0.4507,  0.7340],
        [ 1.81

(Epoch 9) TRAIN LOSS:0.3623 LR:0.00000300:  92%|████████████████████████████████████▊   | 23/25 [00:06<00:00,  4.63it/s]

SequenceClassifierOutput(loss=tensor(0.2197, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.5414, -1.6062,  2.5057],
        [ 2.0954, -0.4581, -0.7054],
        [ 1.9302,  0.0594, -0.8728],
        [ 0.3531,  0.2705, -0.0602],
        [-0.2905, -1.3760,  1.8366],
        [-0.6743, -1.5176,  2.3690],
        [-0.6661, -1.8058,  2.5709],
        [ 0.4717, -1.2683,  0.9769],
        [-0.3060, -1.6488,  2.2194],
        [ 2.0659,  0.6124, -1.0920],
        [-0.7716, -1.2811,  1.9976],
        [ 1.9868, -0.9090, -0.6368],
        [ 1.4744, -0.6224, -0.5872],
        [-1.1921, -1.6632,  2.4565],
        [-1.1734, -1.4264,  2.1029],
        [ 1.8957, -0.4840, -0.5872],
        [-0.1970, -1.6827,  1.8572],
        [-0.9779, -1.3511,  2.2885],
        [ 2.4373, -0.7426, -0.7463],
        [-1.0278, -0.9099,  2.3368],
        [-1.0391, -1.8173,  2.5174],
        [-0.5344, -1.8827,  2.0795],
        [ 0.5037,  0.9428,  0.0266],
        [-0.2896, -1.5779,  2.0213],
        [ 1.21

(Epoch 9) TRAIN LOSS:0.3652 LR:0.00000300:  96%|██████████████████████████████████████▍ | 24/25 [00:06<00:00,  4.56it/s]

SequenceClassifierOutput(loss=tensor(0.4315, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1792, -0.7190,  1.5559],
        [ 0.5288, -0.0517, -0.2430],
        [ 1.2223, -0.4080, -0.3296],
        [-0.8844, -1.5941,  2.2935],
        [-0.7792, -1.7351,  2.3904],
        [ 2.1251, -0.8153, -0.8729],
        [-0.4197, -2.1093,  2.4086],
        [ 1.1688, -1.0111,  0.5650],
        [ 1.6356,  0.0617, -0.8343],
        [-1.2280, -1.4908,  2.5385],
        [ 0.8893, -1.3632,  0.9721],
        [-0.2618, -1.0857,  1.6723],
        [-0.9016, -1.2609,  2.5895],
        [ 1.4248, -0.4012, -0.9505],
        [ 0.2325,  1.0719, -0.1384],
        [ 1.5558, -0.6396, -0.3429],
        [ 1.5348, -0.6512, -0.5484],
        [ 0.5854, -1.0255,  1.2405],
        [ 1.1110, -0.6900,  0.2723],
        [ 1.7212, -0.6752, -0.8588],
        [ 0.3258, -1.2241,  1.3677],
        [-0.6131, -1.5577,  2.5592],
        [ 1.0122, -1.4221,  1.0635],
        [ 0.2071, -0.1890,  0.1041],
        [-0.21

(Epoch 9) TRAIN LOSS:0.3620 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:06<00:00,  4.67it/s]

SequenceClassifierOutput(loss=tensor(0.2863, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1600, -1.5828,  1.9420],
        [-0.4525, -1.1581,  1.8991],
        [-0.5231, -1.3390,  2.2661],
        [ 1.0445,  0.0359, -0.3624],
        [-0.4109, -1.8363,  2.4208],
        [ 1.2770,  0.4297, -0.3689],
        [ 2.0116, -0.7908, -0.9627],
        [ 1.5182, -0.4884, -0.6706],
        [ 0.6116, -1.7519,  1.2533],
        [ 0.0556, -0.1376,  0.3520],
        [-0.4237, -1.5256,  1.7583],
        [-0.3295, -0.9787,  1.4116],
        [-1.1315, -1.7549,  2.5776],
        [ 0.1459, -1.7118,  1.6776],
        [-0.9138, -1.6640,  2.8140],
        [ 1.5100, -0.5674, -0.7164],
        [ 0.0080, -1.1976,  1.2563],
        [-0.7014, -1.6011,  2.1988],
        [-1.1219, -1.4526,  2.5243],
        [-0.3894, -2.0169,  2.4810],
        [ 1.0236, -0.2649, -0.4226],
        [-0.8594, -1.5883,  2.3997],
        [-1.0699, -1.5187,  2.5066],
        [-0.0175, -0.4618,  0.5732],
        [ 1.57

(Epoch 9) TRAIN LOSS:0.3620 LR:0.00000300: 100%|████████████████████████████████████████| 25/25 [00:07<00:00,  3.50it/s]

(Epoch 9) TRAIN LOSS:0.3620 ACC:0.89 F1:0.84 REC:0.81 PRE:0.89 LR:0.00000300



(Epoch 10) TRAIN LOSS:0.2704 LR:0.00000300:   4%|█▌                                      | 1/25 [00:00<00:06,  3.57it/s]

SequenceClassifierOutput(loss=tensor(0.2704, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-1.0568, -1.6390,  2.4290],
        [-1.0872, -1.1358,  2.3149],
        [-1.1261, -1.4054,  2.1968],
        [-0.6626, -1.6000,  2.4527],
        [-0.5063, -1.5447,  2.0460],
        [ 0.0360,  0.3852, -0.3764],
        [-0.6867, -1.4365,  2.1594],
        [ 1.3671, -0.5221, -0.4960],
        [-0.9752, -1.5318,  2.7633],
        [ 2.1859, -0.6337, -0.9389],
        [-0.5708, -1.7431,  2.6307],
        [-1.1533, -1.5867,  2.8110],
        [-1.0569, -1.3868,  2.3707],
        [-0.3997, -1.8964,  2.4149],
        [ 1.0310,  0.1080, -0.3312],
        [ 1.3129, -1.1705,  0.1949],
        [-0.8499, -1.6514,  2.5677],
        [ 2.1539, -0.6059, -0.8442],
        [ 0.8835, -1.1354,  0.4333],
        [ 1.7329, -0.6609, -0.7435],
        [ 2.1505, -0.6363, -0.8751],
        [-0.0454, -1.1770,  1.5710],
        [-0.3867, -1.5436,  2.3068],
        [-0.3729, -1.5449,  2.2925],
        [-1.09

(Epoch 10) TRAIN LOSS:0.3153 LR:0.00000300:   8%|███▏                                    | 2/25 [00:00<00:05,  4.26it/s]

SequenceClassifierOutput(loss=tensor(0.3602, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.4347, -1.8153,  2.2072],
        [-0.9833, -1.6166,  2.4477],
        [ 2.0323, -0.5906, -0.5853],
        [ 1.2015, -1.3406,  0.8522],
        [-0.8853, -1.6817,  2.6161],
        [ 0.5559, -0.9348,  0.7534],
        [-1.0824, -1.7383,  2.7354],
        [-0.2514, -0.9014,  1.2594],
        [-0.9275, -1.6262,  2.5319],
        [-0.4052,  1.1856, -0.2674],
        [ 0.6941, -0.5880, -0.2609],
        [ 0.9332, -1.4845,  0.8195],
        [ 0.9335, -1.4140,  0.9738],
        [ 0.2734,  0.3331,  0.3464],
        [-0.7467, -1.1675,  2.0501],
        [-1.1101, -1.5610,  2.5772],
        [-0.2015, -0.1411,  0.8533],
        [-0.6249, -1.0672,  1.7152],
        [-0.0951, -1.3196,  2.0407],
        [-0.6081, -1.6595,  2.1040],
        [-0.9504, -1.6699,  2.4580],
        [-0.3000, -0.8250,  1.0767],
        [ 1.6453, -0.3751, -0.5581],
        [ 1.2368,  0.1454, -0.4110],
        [ 0.86

(Epoch 10) TRAIN LOSS:0.2894 LR:0.00000300:  12%|████▊                                   | 3/25 [00:00<00:05,  4.32it/s]

SequenceClassifierOutput(loss=tensor(0.2376, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.7865, -1.4901,  2.4548],
        [ 1.6410, -1.0763, -0.7890],
        [ 0.5121, -0.3060,  0.0823],
        [-0.4957, -1.3404,  1.8892],
        [-1.1646, -1.7666,  2.7706],
        [-1.1846, -1.4484,  2.1397],
        [-0.8552, -1.3436,  2.2759],
        [-0.8255, -1.8171,  2.6729],
        [ 1.3750, -0.4174, -0.9536],
        [-0.5912, -1.3373,  2.3503],
        [-0.6014, -1.6783,  2.4593],
        [-0.4848, -1.5442,  2.1795],
        [ 1.6319, -0.0534, -0.6250],
        [-0.3450, -1.6043,  2.1765],
        [ 0.3314, -1.5150,  1.8350],
        [ 0.8391,  0.0969, -0.4630],
        [-0.0904, -1.2387,  1.3640],
        [ 2.2252, -0.5186, -0.9337],
        [ 2.2943, -0.2277, -1.0406],
        [-0.5639,  1.0056, -0.1504],
        [ 1.7470, -0.6817, -0.5394],
        [ 0.3004,  0.7479, -0.5973],
        [ 1.4263,  0.1495, -0.6089],
        [-0.0230, -1.3270,  2.0544],
        [-0.61

(Epoch 10) TRAIN LOSS:0.3259 LR:0.00000300:  16%|██████▍                                 | 4/25 [00:00<00:04,  4.48it/s]

SequenceClassifierOutput(loss=tensor(0.4352, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.1240, -0.2917, -0.4500],
        [ 0.1884,  0.5970, -0.4542],
        [-0.0358, -1.6534,  1.5401],
        [-0.8719, -1.5090,  2.0107],
        [ 1.0274, -1.1523,  0.3773],
        [-1.0013, -1.2927,  2.5799],
        [ 1.4820,  0.1000, -0.8824],
        [-0.8699, -1.2886,  1.9975],
        [ 2.0328, -0.4632, -0.9453],
        [ 1.0649, -1.4529,  0.6057],
        [ 0.7209, -0.1977, -0.2181],
        [-1.1578, -1.4951,  2.7842],
        [ 0.0918,  0.7457, -0.5949],
        [-1.0260, -1.4055,  2.2475],
        [ 0.0815, -1.8568,  1.6413],
        [ 0.0648, -2.1863,  1.9277],
        [ 0.1205, -1.6769,  1.6249],
        [ 0.1292, -0.4774,  0.4674],
        [ 0.6749,  0.3070, -0.7469],
        [ 1.6216, -0.1616, -1.0095],
        [-0.6438, -1.6294,  2.5631],
        [ 1.8149, -0.3597, -0.7672],
        [ 1.4698, -0.4964, -0.7793],
        [ 1.6835, -0.6784, -0.7162],
        [ 0.88

(Epoch 10) TRAIN LOSS:0.3081 LR:0.00000300:  20%|████████                                | 5/25 [00:01<00:04,  4.61it/s]

SequenceClassifierOutput(loss=tensor(0.2372, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.9789, -1.6595,  2.6592],
        [-0.9144, -1.3613,  2.2689],
        [-0.9641, -1.6025,  2.2402],
        [-0.4798, -1.3477,  2.3299],
        [-0.1743, -1.6596,  2.2053],
        [ 0.2164, -1.5812,  1.7271],
        [ 0.0374,  0.2243,  0.2594],
        [ 0.0535, -1.3894,  1.6796],
        [-0.7899, -1.7521,  2.4111],
        [ 1.7737, -0.7148, -0.6706],
        [-0.2220, -1.2478,  1.7927],
        [-0.9250, -0.9574,  2.3106],
        [-0.9199, -1.3022,  2.3096],
        [-0.8090, -1.8844,  2.4226],
        [-0.6781, -0.9227,  1.4774],
        [-1.0373, -1.3606,  2.2790],
        [-1.4540, -1.4806,  2.5220],
        [-0.8926, -1.8116,  2.6418],
        [-1.2541, -1.7372,  2.8043],
        [-0.8011, -1.7517,  2.7878],
        [ 0.9540, -0.0107, -0.3167],
        [-1.1352, -1.3408,  2.3069],
        [-0.9163, -1.4803,  2.5879],
        [ 2.1611, -0.5404, -0.6187],
        [-1.23

(Epoch 10) TRAIN LOSS:0.3196 LR:0.00000300:  24%|█████████▌                              | 6/25 [00:01<00:04,  4.56it/s]

SequenceClassifierOutput(loss=tensor(0.3770, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.9665, -1.6658,  2.3949],
        [-0.4096, -1.6910,  2.0960],
        [-0.7754, -1.6410,  2.3224],
        [ 0.9927, -0.7471, -0.1886],
        [-0.1122,  1.2844, -0.3368],
        [-1.1206, -1.5238,  2.4279],
        [-0.6545, -1.1450,  2.0867],
        [-0.8705, -1.8271,  2.7309],
        [-0.1864,  1.1392, -0.4955],
        [ 1.7032, -0.4253, -0.4256],
        [-1.0415, -1.2437,  2.5246],
        [ 2.0759, -0.2820, -1.4042],
        [ 1.4603, -0.1167, -0.5916],
        [ 1.7187, -0.2186, -0.7856],
        [-0.5932, -2.0162,  2.1773],
        [ 1.9009, -0.7577, -0.9021],
        [ 1.6920,  0.1465, -0.4793],
        [-0.1336,  1.0274, -0.6832],
        [ 1.7173, -0.6003, -0.8328],
        [-0.0681, -1.4826,  2.2818],
        [ 0.7885, -0.2051, -0.2304],
        [ 0.1337, -1.5133,  1.6206],
        [ 1.7653, -0.3026, -0.8543],
        [ 2.2605, -0.7614, -0.6663],
        [-0.30

(Epoch 10) TRAIN LOSS:0.3305 LR:0.00000300:  28%|███████████▏                            | 7/25 [00:01<00:03,  4.66it/s]

SequenceClassifierOutput(loss=tensor(0.3960, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.4997, -0.0581, -0.2602],
        [ 0.1239, -1.5635,  1.5814],
        [ 1.7247, -0.4208, -0.5598],
        [-1.2391, -1.3787,  2.3230],
        [-0.4019, -1.8931,  2.2990],
        [ 2.1135, -0.5690, -0.6615],
        [-0.8767, -1.4160,  2.5247],
        [-0.8601, -1.8666,  2.5427],
        [-0.3679, -1.8696,  1.9440],
        [-0.7620, -1.4600,  2.3462],
        [ 0.5802, -1.6717,  1.5652],
        [ 1.9351, -0.7582, -1.0153],
        [-1.0614, -1.7006,  2.5784],
        [-0.4890, -1.5106,  1.9237],
        [-0.5953, -1.5052,  2.1135],
        [ 1.5345, -0.5421, -0.4367],
        [ 0.5982, -0.2518,  0.2330],
        [-0.8944, -1.2943,  2.3986],
        [ 1.0826,  0.2074, -0.1889],
        [ 1.0706, -0.9633, -0.0143],
        [ 2.6570, -0.6869, -0.9532],
        [ 1.1592, -1.4857,  0.8465],
        [ 0.2411,  0.8790, -0.4230],
        [ 1.2959, -0.7390, -0.1600],
        [-0.29

(Epoch 10) TRAIN LOSS:0.3393 LR:0.00000300:  32%|████████████▊                           | 8/25 [00:01<00:03,  4.64it/s]

SequenceClassifierOutput(loss=tensor(0.4005, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-1.2860, -1.3948,  2.9794],
        [-0.7189, -1.8983,  2.6911],
        [ 0.6045, -0.1635, -0.0350],
        [ 1.6203, -0.4588, -0.8955],
        [ 2.0751, -0.5046, -0.9556],
        [ 2.1270, -0.7766, -0.6980],
        [-0.8793, -1.4724,  2.7095],
        [ 2.3239, -0.6942, -1.1323],
        [-1.4142, -1.4494,  2.6775],
        [ 2.1482, -0.5575, -0.6677],
        [ 1.8669, -1.1815, -0.5472],
        [-0.0494,  1.0333, -0.0123],
        [ 0.1795, -0.1333, -0.1487],
        [ 1.1120, -1.8699,  0.9583],
        [ 0.7885,  0.3117, -0.7356],
        [-0.4071, -1.4419,  2.3452],
        [ 1.2995, -1.6349,  0.5838],
        [ 0.3845, -1.2839,  1.0833],
        [ 1.8416, -0.3614, -0.6641],
        [ 1.6876, -0.1694, -0.8731],
        [-0.4058, -1.7943,  1.8104],
        [ 0.5437,  0.0484,  0.0269],
        [-0.8349, -1.3627,  2.3007],
        [-0.7832,  0.0224,  0.5466],
        [ 0.44

(Epoch 10) TRAIN LOSS:0.3430 LR:0.00000300:  36%|██████████████▍                         | 9/25 [00:01<00:03,  4.72it/s]

SequenceClassifierOutput(loss=tensor(0.3729, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.8549,  0.3522, -0.7452],
        [ 0.8431,  0.1813, -0.5593],
        [-0.8762, -1.4504,  2.3463],
        [-1.0203, -1.6988,  2.5080],
        [ 1.5715, -0.4448, -0.6151],
        [ 0.2679,  0.7774, -0.3510],
        [-0.4388, -1.5483,  2.6319],
        [ 0.0428,  0.7016, -0.3597],
        [ 0.7846, -0.4283,  0.0659],
        [ 1.5345, -0.0527, -0.6337],
        [ 1.3195, -1.1347, -0.5406],
        [-0.5747, -1.2707,  2.1414],
        [-0.9446, -1.5259,  2.5127],
        [ 0.0349,  0.9371, -0.2367],
        [ 1.7657, -0.2612, -1.2784],
        [ 1.3494, -1.6341,  0.7279],
        [-0.3270, -1.9155,  2.5539],
        [-0.4016, -0.8674,  0.9177],
        [ 0.0850,  0.7705, -0.2778],
        [-0.6640, -1.7360,  2.3237],
        [ 1.6599, -0.1946, -0.9608],
        [ 0.2870, -1.9311,  1.7245],
        [-1.4458, -1.6647,  2.6717],
        [-1.0641, -1.1127,  2.5026],
        [ 0.25

(Epoch 10) TRAIN LOSS:0.3268 LR:0.00000300:  40%|███████████████▌                       | 10/25 [00:02<00:03,  4.61it/s]

SequenceClassifierOutput(loss=tensor(0.1805, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.8640, -1.6209,  2.4860],
        [-0.9254, -1.3824,  1.8930],
        [-1.4887, -1.2752,  2.8118],
        [ 1.1529, -0.7044,  0.1072],
        [-0.7779, -1.6784,  2.3496],
        [-1.2277, -1.3751,  2.1584],
        [ 1.5991, -0.0753, -0.8380],
        [-0.7853, -1.7309,  2.6071],
        [-1.3923, -1.1830,  2.4819],
        [-0.9048, -1.3690,  2.5660],
        [-0.4407, -1.3165,  1.4613],
        [ 2.2111, -0.3799, -0.7798],
        [-0.7456, -1.3666,  2.0477],
        [-0.9486, -1.3826,  2.3887],
        [ 1.4290, -1.3810,  0.5154],
        [ 2.0555, -0.6085, -1.1274],
        [-0.4090, -1.3454,  2.0885],
        [-0.5582, -1.6986,  2.3113],
        [ 1.1695, -0.8500, -0.4129],
        [-1.2649, -1.4754,  2.3810],
        [ 0.1243,  1.1228, -0.9710],
        [ 0.4497, -1.5148,  1.4709],
        [ 0.0578, -1.3655,  1.3784],
        [-1.3762, -1.1763,  2.1803],
        [ 2.17

(Epoch 10) TRAIN LOSS:0.3249 LR:0.00000300:  44%|█████████████████▏                     | 11/25 [00:02<00:02,  4.72it/s]

SequenceClassifierOutput(loss=tensor(0.3058, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8426,  0.1166,  0.1429],
        [ 1.8339, -1.2347, -0.0298],
        [ 1.0359,  0.4407, -0.5998],
        [ 0.6926, -1.1888,  0.8952],
        [-0.7259, -1.4996,  2.6087],
        [ 0.1753, -0.6481,  0.4977],
        [ 1.7067, -1.0334, -0.8096],
        [ 2.2415, -0.9159, -0.9566],
        [ 1.7745, -0.9871, -0.2048],
        [ 0.1999,  0.1539,  0.3811],
        [-0.3383, -1.5072,  2.2568],
        [-1.0662, -1.0788,  2.6084],
        [ 0.1651, -1.4692,  1.7517],
        [-0.2879, -1.6144,  1.6205],
        [-0.8545, -1.5846,  2.1413],
        [ 1.8593, -0.3978, -0.6606],
        [-0.7317, -1.4299,  2.2482],
        [-0.0689, -1.9455,  1.9940],
        [-0.8979, -1.8232,  2.3810],
        [-0.6089, -1.4631,  2.5514],
        [ 1.4468, -0.8001,  0.1970],
        [ 1.6954, -0.0490, -0.7409],
        [-1.2647, -1.4802,  2.5452],
        [ 1.0792, -0.2221, -0.1666],
        [-1.04

(Epoch 10) TRAIN LOSS:0.3240 LR:0.00000300:  48%|██████████████████▋                    | 12/25 [00:02<00:02,  4.60it/s]

SequenceClassifierOutput(loss=tensor(0.3143, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.8745, -0.0368, -0.8722],
        [ 1.9449, -0.9112, -0.9572],
        [ 0.7798, -1.9660,  1.1117],
        [-0.6853, -1.0960,  2.0140],
        [ 1.5741, -0.8504, -0.2501],
        [-0.6241, -0.8131,  1.6867],
        [-1.1379, -1.6330,  2.6439],
        [-0.6830, -0.7396,  1.1536],
        [-0.9663, -1.2716,  2.4885],
        [-0.9933, -1.2524,  2.1068],
        [-0.5650, -1.3652,  2.5791],
        [ 1.4969,  0.1866, -0.7117],
        [ 1.7089, -0.9003, -0.7806],
        [-0.5495, -1.8285,  2.5326],
        [ 1.0253, -0.1959, -0.6785],
        [ 1.5813, -0.7428, -0.8540],
        [-1.2539, -1.3326,  2.6444],
        [-0.9349, -1.5071,  2.2829],
        [ 0.8027, -0.2039, -0.5038],
        [ 0.1033, -1.8154,  1.6276],
        [ 0.9480,  0.4947, -0.9183],
        [-0.6197, -0.7648,  1.8715],
        [-0.2418, -1.5370,  1.8566],
        [ 1.4895, -0.3139, -0.9606],
        [-1.05

(Epoch 10) TRAIN LOSS:0.3227 LR:0.00000300:  52%|████████████████████▎                  | 13/25 [00:02<00:02,  4.64it/s]

SequenceClassifierOutput(loss=tensor(0.3076, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.4528, -0.4335, -0.2958],
        [ 1.3401,  0.3194, -0.7942],
        [-0.8755, -1.5825,  2.1952],
        [ 0.4740, -1.7924,  1.8200],
        [-1.0827, -1.9580,  2.5432],
        [ 1.5444, -0.2017, -0.6732],
        [ 0.3757, -1.6300,  1.0019],
        [-1.0861, -1.8678,  2.5599],
        [-0.3446, -0.5861,  1.0129],
        [ 1.6353, -0.7920, -0.5520],
        [ 1.3620, -0.8850, -0.4436],
        [-0.3753, -1.1070,  1.4782],
        [-0.1367, -1.9039,  1.6820],
        [-0.2084, -1.9087,  2.2566],
        [-0.3992, -1.4085,  2.3231],
        [-0.8829, -1.4629,  2.1473],
        [ 1.8062, -0.5249, -0.6892],
        [-1.1876, -1.0763,  2.2517],
        [-0.8464, -1.7781,  2.4279],
        [ 0.0500,  0.7438, -0.9200],
        [ 0.0771,  0.8690, -0.2178],
        [ 1.3334,  0.4695, -0.8012],
        [-0.7066, -1.4426,  2.0090],
        [-0.0541, -0.5532,  1.0536],
        [-0.56

(Epoch 10) TRAIN LOSS:0.3263 LR:0.00000300:  56%|█████████████████████▊                 | 14/25 [00:03<00:02,  4.72it/s]

SequenceClassifierOutput(loss=tensor(0.3726, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.6228, -0.4677, -0.6411],
        [ 0.8749, -0.5314, -0.2595],
        [ 1.9677, -1.2572, -0.3176],
        [-0.4118, -1.1015,  1.7554],
        [-0.8575, -1.7554,  2.7922],
        [ 2.1196, -0.0703, -1.2025],
        [-0.2352, -2.0071,  2.5228],
        [ 0.5831, -1.2754,  0.8350],
        [-0.0997, -1.6227,  1.7291],
        [-1.0443, -1.7865,  2.6322],
        [-0.2505, -1.7939,  2.0953],
        [ 2.0587, -0.5745, -1.0226],
        [-1.3785, -1.5598,  2.7416],
        [ 1.7915, -0.4015, -0.7987],
        [-0.6781, -1.4968,  1.6823],
        [ 0.4706, -0.4725,  0.3257],
        [-1.1632, -1.0401,  2.3656],
        [-1.0750, -0.9946,  1.9222],
        [-0.7537, -1.3443,  2.1828],
        [-0.2170,  1.7250, -0.6437],
        [ 1.5994, -0.6188, -0.7576],
        [ 0.6400,  0.8271, -0.1069],
        [-0.2683, -0.1282,  0.8511],
        [ 1.9493, -0.8165, -0.5051],
        [ 1.30

(Epoch 10) TRAIN LOSS:0.3229 LR:0.00000300:  60%|███████████████████████▍               | 15/25 [00:03<00:02,  4.61it/s]

SequenceClassifierOutput(loss=tensor(0.2761, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.6504e-01, -1.6850e+00,  2.0517e+00],
        [ 1.4464e+00, -1.0451e+00,  2.3832e-01],
        [ 1.3697e+00, -1.4775e-02, -7.0724e-01],
        [-1.2626e+00, -1.3437e+00,  2.4459e+00],
        [-7.4711e-01, -1.7961e+00,  2.6344e+00],
        [-9.6170e-01, -1.2951e+00,  2.2981e+00],
        [ 4.1993e-01, -4.3674e-01,  8.9375e-01],
        [-9.5558e-01, -1.0619e+00,  2.4596e+00],
        [ 1.4293e+00, -6.7626e-01, -6.8781e-01],
        [ 9.3044e-01, -1.9846e+00,  7.9415e-01],
        [-9.2234e-01, -1.5721e+00,  2.4493e+00],
        [-1.0526e+00, -1.6042e+00,  2.8274e+00],
        [-1.1901e+00, -1.4325e+00,  2.6074e+00],
        [ 1.3059e+00, -9.8295e-02, -8.2750e-01],
        [-1.1963e+00, -1.6318e+00,  2.9675e+00],
        [ 1.4621e+00, -3.1348e-01, -8.9601e-01],
        [ 1.9596e+00, -2.5602e-01, -1.1075e+00],
        [-9.9620e-01, -1.7572e+00,  2.8056e+00],
        [-9.7734e-01

(Epoch 10) TRAIN LOSS:0.3299 LR:0.00000300:  64%|████████████████████████▉              | 16/25 [00:03<00:01,  4.66it/s]

SequenceClassifierOutput(loss=tensor(0.4342, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6893, -0.0293, -0.4217],
        [-0.8876, -1.8529,  2.7506],
        [-0.0223, -1.4010,  1.2966],
        [-0.4342, -1.7769,  2.0972],
        [ 1.0842, -0.2918, -0.6937],
        [-0.8149, -1.7225,  2.6888],
        [-0.7101, -1.7578,  2.4699],
        [ 0.3109, -1.3110,  0.9942],
        [ 1.3568, -0.4005, -0.6180],
        [-0.9817, -1.4879,  2.3845],
        [ 1.4032, -0.4757, -0.7482],
        [ 1.5306, -0.2751, -0.8034],
        [ 1.2273,  0.3993, -0.4896],
        [-1.1164, -1.3665,  2.8087],
        [-0.1260,  1.0826,  0.1478],
        [ 0.0599,  1.0632, -0.5630],
        [-0.1478, -1.5417,  1.7860],
        [-0.4687, -1.9851,  1.8424],
        [ 2.0803, -0.7675, -0.9055],
        [-0.1442,  0.7209, -0.2176],
        [ 2.0101, -0.5880, -0.9189],
        [-1.2111, -1.4752,  2.4943],
        [ 2.1049, -0.5834, -0.9359],
        [ 0.8816, -1.8513,  1.2631],
        [-0.52

(Epoch 10) TRAIN LOSS:0.3331 LR:0.00000300:  68%|██████████████████████████▌            | 17/25 [00:03<00:01,  4.41it/s]

SequenceClassifierOutput(loss=tensor(0.3846, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.9524, -1.5669,  2.5458],
        [ 1.5436, -0.5713, -0.5893],
        [ 0.0871,  1.3531, -1.0965],
        [ 1.3873, -1.6141,  0.7265],
        [-1.0637, -1.7112,  2.4671],
        [-0.8528, -1.7587,  2.5450],
        [ 0.6193, -1.4850,  1.4147],
        [ 0.3526, -2.0602,  1.7134],
        [ 1.7532,  0.0884, -1.0362],
        [-0.4274, -1.7250,  2.3580],
        [-0.4891, -0.5502,  0.9350],
        [-0.8324, -1.3658,  2.6204],
        [-1.1500, -1.3655,  2.8303],
        [ 0.6353, -1.8191,  1.1916],
        [-1.0701, -1.5187,  2.8641],
        [-0.6896, -1.2932,  1.8069],
        [ 0.8191, -1.5740,  0.9811],
        [-0.1381, -1.5059,  1.5559],
        [ 0.8297, -1.7132,  0.9300],
        [-1.3576, -1.7662,  2.5497],
        [-0.7458, -1.7703,  2.4992],
        [ 0.3861,  0.5998, -0.4483],
        [-0.9628, -1.4447,  2.5614],
        [-0.9024, -1.4903,  2.3302],
        [-0.64

(Epoch 10) TRAIN LOSS:0.3327 LR:0.00000300:  72%|████████████████████████████           | 18/25 [00:03<00:01,  4.99it/s]

SequenceClassifierOutput(loss=tensor(0.3254, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1607,  1.2027, -0.5653],
        [ 1.3008, -0.7113,  0.2553],
        [ 1.3123, -0.8053, -0.4500],
        [ 0.0332, -0.3400,  0.4248],
        [ 1.2696, -0.6043, -0.3902],
        [-0.5202,  1.0065,  0.1063],
        [ 1.7500, -0.1148, -0.7924],
        [ 2.2114, -0.6427, -1.0463],
        [-0.9306, -1.4931,  2.6323],
        [ 0.9222, -1.0629,  0.6539],
        [-1.0431, -1.3470,  2.4453],
        [-0.5058, -1.4251,  1.7346],
        [ 2.2052, -0.4642, -0.8060],
        [ 1.5359, -1.8016,  0.3858],
        [ 0.7208, -0.7644,  0.0677],
        [ 1.6418, -0.2347, -0.7915],
        [-0.7916, -1.7957,  2.6801],
        [-0.8588, -1.5291,  2.3661],
        [ 0.2920, -0.4480,  0.1320],
        [-0.7117, -1.1780,  2.0574],
        [-1.3101, -0.8933,  2.2295],
        [-0.3522, -1.1605,  1.3795],
        [ 1.2725, -1.5570,  0.4734],
        [ 2.1236, -0.5894, -0.6724],
        [ 0.36

(Epoch 10) TRAIN LOSS:0.3313 LR:0.00000300:  76%|█████████████████████████████▋         | 19/25 [00:04<00:01,  3.95it/s]

SequenceClassifierOutput(loss=tensor(0.2901, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.2161, -0.0771,  0.6206],
        [-0.2127, -1.7829,  2.1507],
        [ 1.3526, -0.2341, -1.0634],
        [-0.9412, -1.5313,  2.7251],
        [ 0.6843, -0.0603, -0.0062],
        [ 2.0186, -0.9033, -0.7207],
        [ 0.8343, -0.2628, -0.2028],
        [ 0.9040,  0.5909, -0.2279],
        [ 0.1514, -1.7205,  2.1317],
        [-1.2223, -1.5216,  2.7401],
        [-1.1230, -1.2209,  2.1643],
        [-0.3942, -1.2702,  2.1967],
        [ 0.6289, -1.5064,  0.8800],
        [-0.5859, -0.9055,  1.2509],
        [-0.2974,  0.1699,  0.2464],
        [ 0.6312,  0.9461, -0.6571],
        [-0.2723, -1.7247,  2.2295],
        [-1.2809, -1.2441,  2.3556],
        [ 1.5430, -1.1704, -0.4630],
        [ 0.9356, -1.8932,  1.1461],
        [-1.0563, -1.5323,  2.4326],
        [-1.0868, -0.9171,  2.4575],
        [-1.0389, -1.5487,  2.4450],
        [-1.2116, -1.1987,  2.4805],
        [ 0.93

(Epoch 10) TRAIN LOSS:0.3292 LR:0.00000300:  80%|███████████████████████████████▏       | 20/25 [00:04<00:01,  3.58it/s]

SequenceClassifierOutput(loss=tensor(0.3315, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0132, -0.9338, -0.1681],
        [-0.8387, -1.5825,  2.2711],
        [-0.1608, -1.4711,  2.3119],
        [-0.5571, -1.5023,  2.2051],
        [ 1.1897, -0.3585, -0.3097],
        [ 0.1828,  0.9932, -0.5658],
        [-0.2557,  0.3510,  0.5926],
        [ 0.2603, -1.5043,  1.4573],
        [ 1.7114, -0.4458, -0.9955],
        [-0.7907, -1.3932,  2.4840],
        [ 0.3883,  1.1939, -0.9759],
        [-1.1671, -1.4541,  2.6144],
        [ 0.3832,  1.4794, -0.5581],
        [-1.0230, -1.5022,  2.1488],
        [-1.3478, -1.3803,  2.3444],
        [ 1.4759, -1.6798,  0.7205],
        [-1.0341, -1.4415,  2.6030],
        [ 0.8383, -1.7281,  1.1741],
        [-0.8692, -1.1626,  2.4974],
        [ 2.0604,  0.0252, -0.9311],
        [-0.4489, -1.4827,  2.0095],
        [ 2.0155, -0.1586, -1.1900],
        [-0.9608, -1.9766,  2.5644],
        [-1.0333, -1.5132,  2.3648],
        [-0.90

(Epoch 10) TRAIN LOSS:0.3293 LR:0.00000300:  84%|████████████████████████████████▊      | 21/25 [00:05<00:01,  3.14it/s]

SequenceClassifierOutput(loss=tensor(0.2881, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-1.1524, -1.5326,  2.7446],
        [ 0.8950, -0.6388,  0.1615],
        [ 1.3024, -0.1364, -0.5430],
        [-1.1892, -1.6190,  2.4597],
        [-0.7433, -1.7096,  2.2743],
        [ 1.1432, -0.0723, -0.6037],
        [ 1.0851, -1.9005,  0.4526],
        [ 1.2515, -0.5813, -0.9096],
        [-1.3305, -1.5414,  2.6409],
        [ 2.0333, -0.7677, -0.7111],
        [-0.9399, -1.8319,  2.7009],
        [-0.8781, -1.0488,  1.9252],
        [-1.0478, -1.4435,  2.4541],
        [-0.7927, -1.6235,  2.4123],
        [ 0.5673, -1.5495,  1.2250],
        [ 2.0510, -0.2025, -0.9492],
        [ 1.7396, -0.5404, -0.6124],
        [-1.0447, -1.4376,  2.5055],
        [ 1.4607, -0.1412, -0.2884],
        [ 1.1979, -1.1543,  0.5057],
        [-1.4402, -1.2295,  2.4130],
        [ 1.6726, -0.3312, -1.0297],
        [-0.9445, -1.2398,  2.7402],
        [-0.6833, -1.0457,  1.9000],
        [ 0.75

(Epoch 10) TRAIN LOSS:0.3275 LR:0.00000300:  88%|██████████████████████████████████▎    | 22/25 [00:05<00:01,  2.99it/s]

SequenceClassifierOutput(loss=tensor(0.2520, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3166, -1.4379,  1.9120],
        [ 0.2031, -0.4526,  0.9344],
        [-0.8485, -1.4795,  2.4394],
        [-1.3036, -1.6170,  2.4171],
        [ 2.4262, -0.8715, -0.9670],
        [-0.2680, -1.9160,  2.0872],
        [-0.9795, -1.7237,  2.3196],
        [-1.3972, -1.7561,  2.3941],
        [-1.1383, -1.4942,  2.4114],
        [-0.3356, -1.8176,  1.9931],
        [-0.8992, -1.4825,  2.4122],
        [-0.4798, -1.8930,  2.1067],
        [ 2.1462, -0.8828, -1.0103],
        [ 0.1443,  0.2351,  0.7991],
        [ 0.1474,  0.7296, -0.1420],
        [-0.1007, -1.2246,  1.0128],
        [ 0.9325,  0.8147, -0.2480],
        [ 0.2148,  1.0384, -0.4434],
        [ 1.7897, -0.2325, -0.5253],
        [-1.0794, -1.6796,  2.3080],
        [ 1.7288,  0.1476, -0.8662],
        [ 1.9403, -1.6503,  0.2026],
        [-0.4151,  0.9805,  0.0541],
        [ 0.1379,  0.5374, -0.3594],
        [-1.32

(Epoch 10) TRAIN LOSS:0.3242 LR:0.00000300:  92%|███████████████████████████████████▉   | 23/25 [00:05<00:00,  2.86it/s]

SequenceClassifierOutput(loss=tensor(0.2222, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.7593, -1.7014,  2.6840],
        [-1.0556, -1.2331,  2.2801],
        [-0.4463, -1.9238,  2.2743],
        [ 2.3666, -0.4371, -0.9243],
        [ 0.2176,  0.7629, -0.4441],
        [-0.9501, -1.4784,  1.9500],
        [-0.8143, -1.6836,  2.5572],
        [-1.0624, -1.5287,  2.1900],
        [-0.5993, -1.5207,  2.5075],
        [-0.6684, -1.4846,  2.3416],
        [ 0.9272, -1.5966,  0.6404],
        [ 0.8095, -0.1139, -0.3778],
        [ 0.5753,  0.2949, -0.0052],
        [-0.7647, -1.5445,  2.4138],
        [ 1.7884, -0.6643, -0.8930],
        [ 1.2789, -0.5516, -0.2849],
        [-0.7480, -1.9532,  2.4277],
        [-1.0281, -1.9058,  2.5153],
        [ 1.6887, -1.0337, -0.9989],
        [-1.2005, -1.6476,  2.7367],
        [ 2.1977, -0.2597, -1.1488],
        [ 1.8828, -0.6325, -0.7564],
        [-1.0299, -1.6052,  2.3500],
        [-0.1262,  1.5366, -0.5841],
        [-0.64

(Epoch 10) TRAIN LOSS:0.3199 LR:0.00000300:  96%|█████████████████████████████████████▍ | 24/25 [00:06<00:00,  2.68it/s]

SequenceClassifierOutput(loss=tensor(0.2559, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7254,  0.0990, -0.4878],
        [ 0.2523, -1.3308,  1.3309],
        [-1.3066, -1.1074,  2.7711],
        [-1.1408, -1.6297,  2.6447],
        [-0.2022, -0.2565,  0.3066],
        [-0.8928, -1.5844,  2.6296],
        [ 0.0105, -0.1805,  0.4547],
        [-0.6614, -0.7295,  2.0005],
        [ 1.2113,  0.0264, -0.9041],
        [-1.2164, -1.5238,  2.7896],
        [-1.0757, -1.5189,  2.1256],
        [-1.3953, -0.8725,  2.4782],
        [ 1.8007, -0.2554, -0.8710],
        [-0.3457, -0.1102,  0.4699],
        [-1.2518, -1.8067,  2.5920],
        [-1.0845, -1.7266,  2.6442],
        [ 2.2348, -0.5817, -1.1785],
        [ 0.1511, -1.8020,  1.7730],
        [ 1.7021, -0.6425, -0.9898],
        [-0.8346, -1.9479,  2.3100],
        [-0.7446, -1.6225,  2.3538],
        [-0.9127, -1.6759,  2.5642],
        [ 1.0551, -0.6367, -0.0240],
        [-1.1320, -1.4569,  2.7415],
        [-0.56

(Epoch 10) TRAIN LOSS:0.3174 LR:0.00000300: 100%|███████████████████████████████████████| 25/25 [00:06<00:00,  3.71it/s]

(Epoch 10) TRAIN LOSS:0.3174 ACC:0.91 F1:0.87 REC:0.85 PRE:0.92 LR:0.00000300





In [17]:
# Evaluate on validation
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm.tqdm(iter(valid_loader), leave=True, total=len(valid_loader))
for i, batch_data in enumerate(pbar):
    batch_seq = batch_data[-1]
    loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
    
    # Calculate total loss
    valid_loss = loss.item()
    total_loss = total_loss + valid_loss

    # Calculate evaluation metrics
    list_hyp += batch_hyp
    list_label += batch_label
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)

    pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
    
metrics = document_sentiment_metrics_fn(list_hyp, list_label)
print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
    total_loss/(i+1), metrics_to_string(metrics)))

VALID LOSS:0.4129 ACC:0.84 F1:0.80 REC:0.77 PRE:0.84:  29%|████████▊                      | 2/7 [00:00<00:00,  6.79it/s]

SequenceClassifierOutput(loss=tensor(0.5209, device='cuda:0'), logits=tensor([[ 1.6022,  0.0610, -0.6325],
        [ 1.5415, -0.7153,  0.0057],
        [ 0.0251, -1.4754,  1.5961],
        [-0.5106, -1.8198,  2.1493],
        [ 0.6256, -2.3428,  1.6935],
        [ 1.9691, -0.7293, -0.5360],
        [ 1.2407, -0.6683,  0.0056],
        [ 1.1910, -0.8565,  0.2248],
        [-0.1905, -0.4875,  1.1694],
        [ 1.5163,  0.2109, -0.4007],
        [ 1.9952, -0.3252, -1.1078],
        [ 0.1421,  0.7415,  0.1316],
        [-0.0185, -0.4474,  0.9149],
        [ 1.7116,  0.0200, -0.8571],
        [ 2.3021, -0.8258, -0.7692],
        [ 0.3830, -0.0900, -0.5695],
        [-1.0319, -1.2908,  2.3917],
        [-0.4671, -1.9519,  2.4526],
        [-0.1232, -0.6399,  1.1979],
        [ 0.0417,  0.3995,  0.2068],
        [ 0.6060,  1.4272, -0.7496],
        [ 1.5699, -1.2414, -0.3771],
        [-0.3993, -1.7767,  2.0441],
        [-0.0807, -0.0514, -0.2329],
        [ 1.2252,  0.2961, -0.3163],
     

VALID LOSS:0.3638 ACC:0.88 F1:0.85 REC:0.83 PRE:0.88:  57%|█████████████████▋             | 4/7 [00:00<00:00,  6.47it/s]

SequenceClassifierOutput(loss=tensor(0.2174, device='cuda:0'), logits=tensor([[ 0.2917,  1.2946, -0.5943],
        [-0.9586, -1.8916,  2.7693],
        [ 2.1209, -0.6995, -0.5268],
        [-0.2695, -0.6276,  1.0945],
        [-1.2645, -1.7874,  2.9435],
        [ 1.8473, -0.5752, -1.0842],
        [-1.4841, -1.4897,  2.8823],
        [-1.3052, -1.7162,  3.1231],
        [ 0.0418, -0.1484,  0.3796],
        [ 1.1245, -0.5135, -0.2867],
        [-1.1683, -1.4840,  2.5991],
        [-1.1683, -1.7737,  2.9686],
        [-0.9982, -1.6459,  2.5148],
        [ 2.4516, -0.9369, -0.7569],
        [-1.3920, -1.5936,  2.8308],
        [-0.7937, -1.6749,  2.3265],
        [-0.9768, -1.8861,  2.7870],
        [-0.9322, -1.7926,  2.6087],
        [-1.3273, -1.7056,  2.7538],
        [ 1.6630, -1.1063, -0.0391],
        [-1.2704, -1.6034,  2.9662],
        [-1.4786, -1.4340,  2.9404],
        [ 0.9151, -0.2246, -0.1893],
        [ 1.9432, -0.8665, -0.5152],
        [ 1.8698, -0.7514, -0.6658],
     

VALID LOSS:0.3218 ACC:0.91 F1:0.89 REC:0.88 PRE:0.90:  86%|██████████████████████████▌    | 6/7 [00:00<00:00,  6.76it/s]

SequenceClassifierOutput(loss=tensor(0.1935, device='cuda:0'), logits=tensor([[ 2.2209, -0.4182, -1.1661],
        [ 2.1915, -0.7597, -1.0946],
        [-1.3349, -1.5440,  2.7933],
        [-0.9303, -1.1808,  2.4547],
        [-1.3404, -1.4546,  2.6862],
        [-0.3688, -1.4697,  2.0464],
        [ 1.7009, -0.5646, -0.5219],
        [-1.1828, -1.4538,  2.5593],
        [-1.0880, -1.7102,  2.6170],
        [ 1.6788, -0.6300, -0.6948],
        [ 2.2376, -0.8655, -0.3439],
        [-1.2149, -1.5052,  2.8163],
        [-0.9936, -2.0224,  2.6162],
        [-0.4291, -0.9042,  1.6399],
        [ 1.1135, -0.2687, -0.3817],
        [ 1.6848, -0.9922, -0.3020],
        [ 1.1803, -0.8877, -0.0915],
        [-1.1092, -1.4820,  2.6541],
        [-0.1325, -1.6265,  2.0049],
        [-0.5748,  1.1407,  0.1592],
        [-0.9847, -1.8617,  2.9025],
        [ 1.7903, -0.9908, -0.4954],
        [-1.0382, -1.8737,  2.6168],
        [ 1.7707, -0.1145, -0.8552],
        [ 1.9438, -0.2474, -1.5069],
     

VALID LOSS:0.3368 ACC:0.91 F1:0.89 REC:0.88 PRE:0.90: 100%|███████████████████████████████| 7/7 [00:01<00:00,  6.29it/s]

SequenceClassifierOutput(loss=tensor(0.4266, device='cuda:0'), logits=tensor([[ 1.4578,  0.5876, -1.0129],
        [ 1.0917, -0.5079, -0.5128],
        [ 0.7812, -1.1402,  0.1224],
        [-1.3073, -1.8274,  3.0044],
        [ 0.7625, -1.3047,  0.8563],
        [ 2.1716, -0.3405, -1.2770],
        [ 0.7731, -0.0829,  0.1464],
        [-1.1277, -1.8785,  2.9902]], device='cuda:0'), hidden_states=None, attentions=None)
(Epoch 10) VALID LOSS:0.3368 ACC:0.91 F1:0.89 REC:0.88 PRE:0.90



