In [1]:
!pip install transformers
!pip install datasets
from google.colab import drive
# drive.mount('/content/drive')
# % cd /content/drive/MyDrive/TranslationArtifact_CodesAndGraphs

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 47.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 51.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 47.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [2]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
from transformers import get_linear_schedule_with_warmup, get_constant_schedule
import torch
import torch.nn as nn
import torch.nn.functional as F 
from datasets import load_dataset
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
import math
from tqdm import tqdm
import copy
import pickle
import os
import random

In [3]:
def find_answer(question_text, answer, ref_text, tokenizer) -> bool:
    model_input = tokenizer(question_text, ref_text, truncation=True, padding=True, return_token_type_ids=True, add_special_tokens=True)['input_ids'] #model input has both question text and reference text
    answer = tokenizer(answer, truncation=True, padding=True, return_token_type_ids=True, add_special_tokens=False)['input_ids']
    if len(model_input) > 512:
        return -1, -1
    for i in range(0, len(model_input) - len(answer) + 1):
        if answer == model_input[i: i + len(answer)]:
            return (i, i + len(answer))
    return -1, -1
# prepair input
def prepare_inputs(indexes, data, tokenizer):
    contexts = []
    questions = []
    answer_starts = []
    answer_ends = []
    for i in indexes:
        t = data[i]
        question = t['question']
        answer = t['answers']['text'][0]
        context = t['context']
        s, e = find_answer(question, answer, context, tokenizer)
        if s == -1:
            continue
        contexts.append(context)
        questions.append(question)
        answer_starts.append(s)
        answer_ends.append(e)
    return contexts, questions, answer_starts, answer_ends

In [4]:
def get_scheduler(optimizer, scheduler: str, warmup_steps: int, t_total: int):
    """
    Returns the correct learning rate scheduler
    """
    scheduler = scheduler.lower()
    if scheduler=='constantlr':
        return get_constant_schedule(optimizer)
    elif scheduler=='warmupconstant':
        return get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
    elif scheduler=='warmuplinear':
        return get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
    elif scheduler=='warmupcosine':
        return get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
    elif scheduler=='warmupcosinewithhardrestarts':
        return get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
    else:
        raise ValueError("Unknown scheduler {}".format(scheduler))
        
def train(model, optimizer, scheduler, train_data, dev_data, batch_size, device, max_grad_norm, tokenizer, best_acc = -1):
    loss_fn = nn.CrossEntropyLoss()

    step_cnt = 0
    best_model_weights = None
    
    contexts, questions, answer_starts, answer_ends = train_data

    for pointer in tqdm(range(0, len(contexts), batch_size), desc='training',ascii = True,leave = True):
        model.train() # model was in eval mode in evaluate(); re-activate the train mode
        optimizer.zero_grad() # clear gradients first
        torch.cuda.empty_cache() # releases all unoccupied cached memory 
        # t = torch.cuda.get_device_properties(0).total_memory
        # r = torch.cuda.memory_reserved(0) 
        # a = torch.cuda.memory_allocated(0)
        # print(t,r,a)

        step_cnt += 1
        
        input = tokenizer(contexts[pointer:min(pointer + batch_size, len(contexts))], questions[pointer:min(pointer + batch_size, len(contexts))], return_tensors="pt",truncation=True, padding=True, return_token_type_ids=True, add_special_tokens=True)
        input.to(device)
        
        answer_start, answer_end = (answer_starts[pointer:min(pointer + batch_size, len(contexts))], answer_ends[pointer:min(pointer + batch_size, len(contexts))])
        true_labels1 = torch.LongTensor(np.array(answer_start)).to(device)
        true_labels2 = torch.LongTensor(np.array(answer_end)).to(device)
            
        output = model(**input)
        if output is None: continue
        pred_indicies1 = output['start_logits']
        pred_indicies2 = output['end_logits']
        # print(pred_indicies1.shape, true_labels1.shape)
        loss1 = loss_fn(pred_indicies1,true_labels1)
        loss2 = loss_fn(pred_indicies2,true_labels2)
        loss = loss1 + loss2

        # back propagate
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        # update weights 
        optimizer.step()

        # update training rate
        scheduler.step()

        if step_cnt%int(len(contexts)/batch_size/10) == 0 or step_cnt == math.ceil(len(contexts)*1./batch_size):
            acc = evaluate(model,dev_data,device,tokenizer,mute=True)
            print('==> step {} dev acc: {}'.format(step_cnt,acc))
            if acc > best_acc:
                best_acc = acc
                best_model_weights = copy.deepcopy(model.cpu().state_dict())
                model.to(device)

    return best_model_weights
    
def evaluate(model, test_data, device, tokenizer, mute=False, batch_size=10):
    model.eval()
    contexts, questions, answer_starts, answer_ends = test_data
    all_labels = []
    all_predict = np.array([])
    with torch.no_grad():
        for pointer in range(0, len(contexts), batch_size):            
            input = tokenizer(contexts[pointer:min(pointer + batch_size, len(contexts))], questions[pointer:min(pointer + batch_size, len(contexts))], return_tensors="pt",truncation=True, padding=True, return_token_type_ids=True, add_special_tokens=True)
            input.to(device)

            answer_start, answer_end = (answer_starts[pointer:min(pointer + batch_size, len(contexts))], answer_ends[pointer:min(pointer + batch_size, len(contexts))])
            all_labels = all_labels + answer_start 
            all_labels = all_labels + answer_end
            
            outputs = model(**input)
            
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits
            predict_start = [np.argmax(pp) for pp in start_logits.cpu()]
            predict_end = [np.argmax(pp) for pp in end_logits.cpu()]
            all_predict = np.concatenate((all_predict, predict_start), axis=None)
            all_predict = np.concatenate((all_predict, predict_end), axis=None)
    assert len(all_predict) == len(all_labels)


    acc = len([i for i in range(len(all_labels)) if all_predict[i]==all_labels[i]])*1./len(all_labels)

    if not mute:
        print('==>acc<==', acc)

    return acc

In [5]:
train_d = load_dataset('tydiqa', name = 'primary_task', split = 'train')
train_d[0]

Downloading builder script:   0%|          | 0.00/3.49k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading and preparing dataset tydiqa/primary_task (download: 1.82 GiB, generated: 5.62 GiB, post-processed: Unknown size, total: 7.44 GiB) to /root/.cache/huggingface/datasets/tydiqa/primary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/161M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/58.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.62M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/166916 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/18670 [00:00<?, ? examples/s]

Dataset tydiqa downloaded and prepared to /root/.cache/huggingface/datasets/tydiqa/primary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148. Subsequent calls will reuse this data.


{'annotations': {'minimal_answers_end_byte': [-1],
  'minimal_answers_start_byte': [-1],
  'passage_answer_candidate_index': [-1],
  'yes_no_answer': ['NONE']},
 'document_plaintext': '\ntransl.\n\nRas (dari bahasa Prancis race, yang sendirinya dari bahasa Latin radix, "akar") adalah suatu sistem klasifikasi yang digunakan untuk mengkategorikan manusia dalam populasi atau kelompok besar dan berbeda melalui ciri fenotipe, asal usul geografis, tampang jasmani dan kesukuan yang terwarisi. Di awal abad ke-20 istilah ini sering digunakan dalam arti biologis untuk menunjuk populasi manusia yang beraneka ragam dari segi genetik dengan anggota yang memiliki fenotipe (tampang luar) yang sama.[1] Arti "ras" ini masih digunakan dalam antropologi forensik (dalam menganalisa sisa tulang), penelitian biomedis dan kedokteran berdasarkan asal usul.[2]\nDi samping itu, di Amerika Serikat misalnya, penegak hukum menggunakan istilah "ras" dalam menentukan profil tersangka dan penggambaran kembali tampang

In [6]:
l = 'english'

train_d = load_dataset('tydiqa', name = 'secondary_task', split = 'train')
train_d = train_d.shuffle()
test_d = load_dataset('tydiqa', name = 'secondary_task', split = 'validation')

tokenizer=AutoTokenizer.from_pretrained('xlm-roberta-base')
languages = [(t['id'].split("-")[0]) for t in train_d]
# find target language indicies
indexes_train = [i for i,x in enumerate(languages) if x == l]
indexes_dev = indexes_train[-100:]
indexes_train = indexes_train[:-100]
languages = [(t['id'].split("-")[0]) for t in test_d]
indexes_test = [i for i,x in enumerate(languages) if x == l]

train_data = prepare_inputs(indexes_train, train_d, tokenizer)
dev_data = prepare_inputs(indexes_dev, train_d, tokenizer)
test_data = prepare_inputs(indexes_test, test_d, tokenizer)

Downloading and preparing dataset tydiqa/secondary_task (download: 1.82 GiB, generated: 55.27 MiB, post-processed: Unknown size, total: 1.87 GiB) to /root/.cache/huggingface/datasets/tydiqa/secondary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/49881 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5077 [00:00<?, ? examples/s]

Dataset tydiqa downloaded and prepared to /root/.cache/huggingface/datasets/tydiqa/secondary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148. Subsequent calls will reuse this data.


Reusing dataset tydiqa (/root/.cache/huggingface/datasets/tydiqa/secondary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148)


Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

In [7]:
epoch_num = 100
batch_size = 5
warmup_percent = 0.2
max_grad_norm = 1
scheduler_setting = 'WarmupLinear'
device = 'cuda'
total_steps = math.ceil(epoch_num*len(train_data[0])*1./batch_size)
warmup_steps = int(total_steps*warmup_percent)

In [None]:
model_path = 'ModelWeights'
iter = 0
# for i in range(0, epoch_num):
#     if os.path.exists('./model weights/' + model_path + str(i)):
#       iter = i
identifier = "xlm-roberta-base" if iter == 0 else './model weights/' + model_path + str(iter)
iter = iter + 1 if iter != 0 else iter

print(identifier)
model = AutoModelForQuestionAnswering.from_pretrained('xlm-roberta-base')
tokenizer=AutoTokenizer.from_pretrained('xlm-roberta-base')
model.to(device)
optimizer = AdamW(model.parameters(),lr=2e-5,eps=1e-6,correct_bias=False)
scheduler = get_scheduler(optimizer, scheduler_setting, warmup_steps=warmup_steps, t_total=total_steps) 
for i in range(0, epoch_num):
    best_weight = train(model, optimizer, scheduler, train_data, dev_data, batch_size, device, max_grad_norm, tokenizer, best_acc = -1)
    model.load_state_dict(best_weight)
    # model.save_pretrained(save_directory = './model weights/'+ model_path + str(i))

xlm-roberta-base


Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForQuestionAnswering: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream tas

==> step 64 dev acc: 0.016483516483516484


training:  20%|#9        | 128/649 [00:54<08:40,  1.00it/s]

==> step 128 dev acc: 0.016483516483516484


training:  30%|##9       | 192/649 [01:20<08:06,  1.06s/it]

==> step 192 dev acc: 0.016483516483516484


training:  39%|###9      | 256/649 [01:48<07:31,  1.15s/it]

==> step 256 dev acc: 0.01098901098901099


training:  49%|####9     | 320/649 [02:18<06:22,  1.16s/it]

==> step 320 dev acc: 0.01098901098901099


training:  59%|#####9    | 384/649 [02:47<05:01,  1.14s/it]

==> step 384 dev acc: 0.01098901098901099


training:  69%|######9   | 448/649 [03:17<03:35,  1.07s/it]

==> step 448 dev acc: 0.016483516483516484


training:  79%|#######8  | 512/649 [03:45<02:34,  1.13s/it]

==> step 512 dev acc: 0.016483516483516484


training:  89%|########8 | 575/649 [04:12<00:29,  2.49it/s]

==> step 576 dev acc: 0.02197802197802198


training:  99%|#########8| 640/649 [04:46<00:09,  1.08s/it]

==> step 640 dev acc: 0.02197802197802198


training: 100%|##########| 649/649 [04:53<00:00,  2.21it/s]

==> step 649 dev acc: 0.02197802197802198



training:  10%|9         | 63/649 [00:25<04:03,  2.40it/s]

==> step 64 dev acc: 0.02197802197802198


training:  20%|#9        | 128/649 [01:00<09:32,  1.10s/it]

==> step 128 dev acc: 0.02197802197802198


training:  29%|##9       | 191/649 [01:25<03:29,  2.19it/s]

==> step 192 dev acc: 0.027472527472527472


training:  39%|###9      | 256/649 [02:00<07:39,  1.17s/it]

==> step 256 dev acc: 0.02197802197802198


training:  49%|####9     | 320/649 [02:29<06:13,  1.14s/it]

==> step 320 dev acc: 0.027472527472527472


training:  59%|#####9    | 383/649 [02:56<02:04,  2.14it/s]

==> step 384 dev acc: 0.03296703296703297


training:  69%|######9   | 448/649 [03:30<03:36,  1.08s/it]

==> step 448 dev acc: 0.03296703296703297


training:  79%|#######8  | 512/649 [03:59<02:33,  1.12s/it]

==> step 512 dev acc: 0.03296703296703297


training:  89%|########8 | 575/649 [04:25<00:29,  2.50it/s]

==> step 576 dev acc: 0.04945054945054945


training:  99%|#########8| 640/649 [04:58<00:09,  1.07s/it]

==> step 640 dev acc: 0.027472527472527472


training: 100%|##########| 649/649 [05:05<00:00,  2.13it/s]

==> step 649 dev acc: 0.027472527472527472



training:  10%|9         | 63/649 [00:25<04:02,  2.42it/s]

==> step 64 dev acc: 0.03296703296703297


training:  20%|#9        | 128/649 [00:59<09:25,  1.09s/it]

==> step 128 dev acc: 0.03296703296703297


training:  30%|##9       | 192/649 [01:27<08:29,  1.12s/it]

==> step 192 dev acc: 0.016483516483516484


training:  39%|###9      | 256/649 [01:56<07:32,  1.15s/it]

==> step 256 dev acc: 0.02197802197802198


training:  49%|####9     | 320/649 [02:25<06:12,  1.13s/it]

==> step 320 dev acc: 0.016483516483516484


training:  59%|#####9    | 384/649 [02:54<05:05,  1.15s/it]

==> step 384 dev acc: 0.02197802197802198


training:  69%|######9   | 448/649 [03:23<03:33,  1.06s/it]

==> step 448 dev acc: 0.02197802197802198


training:  79%|#######8  | 512/649 [03:52<02:32,  1.11s/it]

==> step 512 dev acc: 0.027472527472527472


training:  89%|########8 | 576/649 [04:21<01:19,  1.09s/it]

==> step 576 dev acc: 0.02197802197802198


training:  99%|#########8| 640/649 [04:49<00:09,  1.07s/it]

==> step 640 dev acc: 0.027472527472527472


training: 100%|##########| 649/649 [04:55<00:00,  2.19it/s]

==> step 649 dev acc: 0.03296703296703297



training:  10%|9         | 63/649 [00:25<04:03,  2.41it/s]

==> step 64 dev acc: 0.038461538461538464


training:  20%|#9        | 128/649 [00:59<09:26,  1.09s/it]

==> step 128 dev acc: 0.02197802197802198


training:  30%|##9       | 192/649 [01:27<08:30,  1.12s/it]

==> step 192 dev acc: 0.02197802197802198


training:  39%|###9      | 256/649 [01:55<07:31,  1.15s/it]

==> step 256 dev acc: 0.02197802197802198


training:  49%|####9     | 320/649 [02:24<06:10,  1.13s/it]

==> step 320 dev acc: 0.027472527472527472


training:  59%|#####9    | 384/649 [02:54<05:04,  1.15s/it]

==> step 384 dev acc: 0.016483516483516484


training:  69%|######9   | 448/649 [03:23<03:33,  1.06s/it]

==> step 448 dev acc: 0.02197802197802198


training:  79%|#######8  | 512/649 [03:51<02:32,  1.12s/it]

==> step 512 dev acc: 0.03296703296703297


training:  89%|########8 | 576/649 [04:20<01:19,  1.09s/it]

==> step 576 dev acc: 0.016483516483516484


training:  99%|#########8| 640/649 [04:49<00:09,  1.07s/it]

==> step 640 dev acc: 0.03296703296703297


training: 100%|##########| 649/649 [04:55<00:00,  2.19it/s]

==> step 649 dev acc: 0.016483516483516484



training:  10%|9         | 63/649 [00:25<04:02,  2.42it/s]

==> step 64 dev acc: 0.027472527472527472


training:  20%|#9        | 128/649 [00:59<09:25,  1.09s/it]

==> step 128 dev acc: 0.027472527472527472


training:  30%|##9       | 192/649 [01:27<08:28,  1.11s/it]

==> step 192 dev acc: 0.027472527472527472


training:  39%|###9      | 256/649 [01:55<07:32,  1.15s/it]

==> step 256 dev acc: 0.027472527472527472


training:  49%|####9     | 320/649 [02:24<06:10,  1.13s/it]

==> step 320 dev acc: 0.02197802197802198


training:  59%|#####9    | 383/649 [02:51<02:03,  2.16it/s]

==> step 384 dev acc: 0.038461538461538464


training:  69%|######8   | 447/649 [03:23<01:09,  2.90it/s]

==> step 448 dev acc: 0.04395604395604396


training:  79%|#######8  | 512/649 [03:56<02:32,  1.11s/it]

==> step 512 dev acc: 0.027472527472527472


training:  89%|########8 | 576/649 [04:25<01:19,  1.08s/it]

==> step 576 dev acc: 0.038461538461538464


training:  99%|#########8| 640/649 [04:53<00:09,  1.06s/it]

==> step 640 dev acc: 0.02197802197802198


training: 100%|##########| 649/649 [05:00<00:00,  2.16it/s]

==> step 649 dev acc: 0.01098901098901099



training:  10%|9         | 63/649 [00:25<04:03,  2.40it/s]

==> step 64 dev acc: 0.04945054945054945


training:  20%|#9        | 128/649 [00:59<09:24,  1.08s/it]

==> step 128 dev acc: 0.04945054945054945


training:  29%|##9       | 191/649 [01:24<03:25,  2.22it/s]

==> step 192 dev acc: 0.06043956043956044


training:  39%|###9      | 256/649 [01:57<07:30,  1.15s/it]

==> step 256 dev acc: 0.04395604395604396


training:  49%|####9     | 320/649 [02:26<06:11,  1.13s/it]

==> step 320 dev acc: 0.038461538461538464


training:  59%|#####9    | 384/649 [02:55<05:03,  1.15s/it]

==> step 384 dev acc: 0.04945054945054945


training:  69%|######9   | 448/649 [03:25<03:33,  1.06s/it]

==> step 448 dev acc: 0.038461538461538464


training:  79%|#######8  | 512/649 [03:53<02:32,  1.11s/it]

==> step 512 dev acc: 0.03296703296703297


training:  89%|########8 | 576/649 [04:22<01:19,  1.09s/it]

==> step 576 dev acc: 0.038461538461538464


training:  99%|#########8| 640/649 [04:50<00:09,  1.06s/it]

==> step 640 dev acc: 0.03296703296703297


training: 100%|##########| 649/649 [04:57<00:00,  2.18it/s]

==> step 649 dev acc: 0.03296703296703297



training:  10%|9         | 63/649 [00:25<04:02,  2.41it/s]

==> step 64 dev acc: 0.04945054945054945


training:  20%|#9        | 128/649 [00:59<09:25,  1.09s/it]

==> step 128 dev acc: 0.04395604395604396


training:  30%|##9       | 192/649 [01:27<08:28,  1.11s/it]

==> step 192 dev acc: 0.027472527472527472


training:  39%|###9      | 256/649 [01:55<07:31,  1.15s/it]

==> step 256 dev acc: 0.03296703296703297


training:  49%|####9     | 320/649 [02:24<06:11,  1.13s/it]

==> step 320 dev acc: 0.038461538461538464


training:  59%|#####9    | 384/649 [02:54<05:04,  1.15s/it]

==> step 384 dev acc: 0.04945054945054945


training:  69%|######9   | 448/649 [03:23<03:34,  1.07s/it]

==> step 448 dev acc: 0.04395604395604396


training:  79%|#######8  | 512/649 [03:51<02:32,  1.11s/it]

==> step 512 dev acc: 0.04395604395604396


training:  89%|########8 | 576/649 [04:20<01:19,  1.09s/it]

==> step 576 dev acc: 0.02197802197802198


training:  99%|#########8| 640/649 [04:48<00:09,  1.07s/it]

==> step 640 dev acc: 0.01098901098901099


training: 100%|##########| 649/649 [04:55<00:00,  2.20it/s]

==> step 649 dev acc: 0.02197802197802198



training:  10%|9         | 63/649 [00:25<04:02,  2.41it/s]

==> step 64 dev acc: 0.03296703296703297


training:  20%|#9        | 127/649 [00:56<03:37,  2.40it/s]

==> step 128 dev acc: 0.04395604395604396


training:  30%|##9       | 192/649 [01:29<08:26,  1.11s/it]

==> step 192 dev acc: 0.03296703296703297


training:  39%|###9      | 256/649 [01:57<07:32,  1.15s/it]

==> step 256 dev acc: 0.03296703296703297


training:  49%|####9     | 320/649 [02:26<06:11,  1.13s/it]

==> step 320 dev acc: 0.027472527472527472


training:  59%|#####9    | 384/649 [02:55<05:04,  1.15s/it]

==> step 384 dev acc: 0.04395604395604396


training:  69%|######9   | 448/649 [03:25<03:33,  1.06s/it]

==> step 448 dev acc: 0.038461538461538464


training:  79%|#######8  | 511/649 [03:50<01:02,  2.19it/s]

==> step 512 dev acc: 0.04945054945054945


training:  89%|########8 | 576/649 [04:24<01:19,  1.08s/it]

==> step 576 dev acc: 0.03296703296703297


training:  99%|#########8| 640/649 [04:52<00:09,  1.06s/it]

==> step 640 dev acc: 0.04395604395604396


training: 100%|##########| 649/649 [04:58<00:00,  2.17it/s]

==> step 649 dev acc: 0.03296703296703297



training:  10%|9         | 63/649 [00:25<04:02,  2.41it/s]

==> step 64 dev acc: 0.03296703296703297


training:  20%|#9        | 127/649 [00:56<03:37,  2.40it/s]

==> step 128 dev acc: 0.04395604395604396


training:  29%|##9       | 191/649 [01:26<03:25,  2.23it/s]

==> step 192 dev acc: 0.054945054945054944


training:  39%|###9      | 256/649 [01:59<07:31,  1.15s/it]

==> step 256 dev acc: 0.03296703296703297


training:  49%|####9     | 320/649 [02:28<06:11,  1.13s/it]

==> step 320 dev acc: 0.04945054945054945


training:  59%|#####9    | 384/649 [02:57<05:04,  1.15s/it]

==> step 384 dev acc: 0.038461538461538464


training:  69%|######9   | 448/649 [03:26<03:33,  1.06s/it]

==> step 448 dev acc: 0.054945054945054944


training:  79%|#######8  | 511/649 [03:52<01:03,  2.18it/s]

==> step 512 dev acc: 0.07142857142857142


training:  89%|########8 | 576/649 [04:26<01:18,  1.08s/it]

==> step 576 dev acc: 0.04945054945054945


training:  99%|#########8| 640/649 [04:54<00:09,  1.06s/it]

==> step 640 dev acc: 0.06043956043956044


training: 100%|##########| 649/649 [05:00<00:00,  2.16it/s]

==> step 649 dev acc: 0.054945054945054944



training:  10%|9         | 63/649 [00:25<04:02,  2.42it/s]

==> step 64 dev acc: 0.03296703296703297


training:  20%|#9        | 128/649 [00:59<09:24,  1.08s/it]

==> step 128 dev acc: 0.005494505494505495


training:  29%|##9       | 191/649 [01:24<03:25,  2.23it/s]

==> step 192 dev acc: 0.04395604395604396


training:  39%|###9      | 256/649 [01:57<07:31,  1.15s/it]

==> step 256 dev acc: 0.04395604395604396


training:  49%|####9     | 319/649 [02:23<02:22,  2.31it/s]

==> step 320 dev acc: 0.04945054945054945


training:  59%|#####9    | 384/649 [02:58<05:03,  1.15s/it]

==> step 384 dev acc: 0.027472527472527472


training:  69%|######8   | 447/649 [03:24<01:09,  2.90it/s]

==> step 448 dev acc: 0.06043956043956044


training:  79%|#######8  | 511/649 [03:54<01:02,  2.20it/s]

==> step 512 dev acc: 0.07142857142857142


training:  89%|########8 | 576/649 [04:28<01:18,  1.08s/it]

==> step 576 dev acc: 0.06593406593406594


training:  99%|#########8| 640/649 [04:56<00:09,  1.06s/it]

==> step 640 dev acc: 0.06593406593406594


training: 100%|#########9| 648/649 [05:00<00:00,  1.76it/s]

==> step 649 dev acc: 0.0989010989010989


training: 100%|##########| 649/649 [05:05<00:00,  2.13it/s]
training:  10%|9         | 63/649 [00:25<04:00,  2.43it/s]

==> step 64 dev acc: 0.07692307692307693


training:  20%|#9        | 128/649 [00:58<09:23,  1.08s/it]

==> step 128 dev acc: 0.07142857142857142


training:  30%|##9       | 192/649 [01:26<08:26,  1.11s/it]

==> step 192 dev acc: 0.06593406593406594


training:  39%|###9      | 256/649 [01:54<07:31,  1.15s/it]

==> step 256 dev acc: 0.04945054945054945


training:  49%|####9     | 319/649 [02:21<02:21,  2.33it/s]

==> step 320 dev acc: 0.08791208791208792


training:  59%|#####9    | 384/649 [02:55<05:04,  1.15s/it]

==> step 384 dev acc: 0.04395604395604396


training:  69%|######9   | 448/649 [03:24<03:33,  1.06s/it]

==> step 448 dev acc: 0.07692307692307693


training:  79%|#######8  | 512/649 [03:52<02:31,  1.11s/it]

==> step 512 dev acc: 0.08791208791208792


training:  89%|########8 | 576/649 [04:21<01:19,  1.08s/it]

==> step 576 dev acc: 0.07142857142857142


training:  99%|#########8| 640/649 [04:49<00:09,  1.06s/it]

==> step 640 dev acc: 0.054945054945054944


training: 100%|##########| 649/649 [04:56<00:00,  2.19it/s]

==> step 649 dev acc: 0.07142857142857142



training:  10%|9         | 63/649 [00:25<04:02,  2.41it/s]

==> step 64 dev acc: 0.06043956043956044


training:  20%|#9        | 127/649 [00:56<03:37,  2.40it/s]

==> step 128 dev acc: 0.06593406593406594


training:  30%|##9       | 192/649 [01:29<08:28,  1.11s/it]

==> step 192 dev acc: 0.04945054945054945


training:  39%|###9      | 256/649 [01:57<07:31,  1.15s/it]

==> step 256 dev acc: 0.06593406593406594


training:  49%|####9     | 319/649 [02:23<02:22,  2.32it/s]

==> step 320 dev acc: 0.07692307692307693


training:  59%|#####9    | 384/649 [02:58<05:04,  1.15s/it]

==> step 384 dev acc: 0.06043956043956044


training:  69%|######9   | 448/649 [03:27<03:33,  1.06s/it]

==> step 448 dev acc: 0.07142857142857142


training:  79%|#######8  | 511/649 [03:52<01:03,  2.19it/s]

==> step 512 dev acc: 0.08791208791208792


training:  89%|########8 | 575/649 [04:23<00:29,  2.50it/s]

==> step 576 dev acc: 0.09340659340659341


training:  99%|#########8| 640/649 [04:56<00:09,  1.05s/it]

==> step 640 dev acc: 0.08791208791208792


training: 100%|##########| 649/649 [05:03<00:00,  2.14it/s]

==> step 649 dev acc: 0.09340659340659341



training:  10%|9         | 63/649 [00:25<04:00,  2.43it/s]

==> step 64 dev acc: 0.10989010989010989


training:  20%|#9        | 128/649 [00:59<09:22,  1.08s/it]

==> step 128 dev acc: 0.04945054945054945


training:  30%|##9       | 192/649 [01:26<08:26,  1.11s/it]

==> step 192 dev acc: 0.06593406593406594


training:  39%|###9      | 256/649 [01:55<07:31,  1.15s/it]

==> step 256 dev acc: 0.07142857142857142


training:  49%|####9     | 320/649 [02:24<06:11,  1.13s/it]

==> step 320 dev acc: 0.07692307692307693


training:  59%|#####9    | 384/649 [02:53<05:04,  1.15s/it]

==> step 384 dev acc: 0.04395604395604396


training:  69%|######9   | 448/649 [03:22<03:33,  1.06s/it]

==> step 448 dev acc: 0.07692307692307693


training:  79%|#######8  | 512/649 [03:50<02:32,  1.11s/it]

==> step 512 dev acc: 0.07142857142857142


training:  89%|########8 | 576/649 [04:19<01:19,  1.09s/it]

==> step 576 dev acc: 0.1043956043956044


training:  99%|#########8| 640/649 [04:47<00:09,  1.06s/it]

==> step 640 dev acc: 0.08241758241758242


training: 100%|##########| 649/649 [04:54<00:00,  2.20it/s]

==> step 649 dev acc: 0.07142857142857142



training:  10%|9         | 63/649 [00:25<04:02,  2.42it/s]

==> step 64 dev acc: 0.07142857142857142


training:  20%|#9        | 128/649 [00:59<09:24,  1.08s/it]

==> step 128 dev acc: 0.06593406593406594


training:  30%|##9       | 192/649 [01:26<08:28,  1.11s/it]

==> step 192 dev acc: 0.054945054945054944


training:  39%|###9      | 256/649 [01:55<07:31,  1.15s/it]

==> step 256 dev acc: 0.04945054945054945


training:  49%|####9     | 319/649 [02:21<02:21,  2.33it/s]

==> step 320 dev acc: 0.07692307692307693


training:  59%|#####9    | 383/649 [02:53<02:02,  2.17it/s]

==> step 384 dev acc: 0.08241758241758242


training:  69%|######9   | 448/649 [03:27<03:33,  1.06s/it]

==> step 448 dev acc: 0.08241758241758242


training:  79%|#######8  | 512/649 [03:55<02:31,  1.11s/it]

==> step 512 dev acc: 0.07692307692307693


training:  89%|########8 | 576/649 [04:24<01:19,  1.08s/it]

==> step 576 dev acc: 0.04395604395604396


training:  99%|#########8| 640/649 [04:52<00:09,  1.06s/it]

==> step 640 dev acc: 0.07692307692307693


training: 100%|#########9| 648/649 [04:56<00:00,  1.75it/s]

==> step 649 dev acc: 0.08791208791208792


training: 100%|##########| 649/649 [05:01<00:00,  2.15it/s]
training:  10%|9         | 63/649 [00:25<04:02,  2.42it/s]

==> step 64 dev acc: 0.08241758241758242


training:  20%|#9        | 127/649 [00:56<03:37,  2.40it/s]

==> step 128 dev acc: 0.08791208791208792


training:  30%|##9       | 192/649 [01:28<08:25,  1.11s/it]

==> step 192 dev acc: 0.07142857142857142


training:  39%|###9      | 256/649 [01:56<07:30,  1.15s/it]

==> step 256 dev acc: 0.08791208791208792


training:  49%|####9     | 320/649 [02:25<06:09,  1.12s/it]

==> step 320 dev acc: 0.07692307692307693


training:  59%|#####9    | 384/649 [02:55<05:03,  1.15s/it]

==> step 384 dev acc: 0.06593406593406594


training:  69%|######9   | 448/649 [03:24<03:33,  1.06s/it]

==> step 448 dev acc: 0.06043956043956044


training:  79%|#######8  | 511/649 [03:49<01:03,  2.19it/s]

==> step 512 dev acc: 0.0989010989010989


training:  89%|########8 | 576/649 [04:23<01:18,  1.08s/it]

==> step 576 dev acc: 0.07142857142857142


training:  99%|#########8| 640/649 [04:51<00:09,  1.06s/it]

==> step 640 dev acc: 0.09340659340659341


training: 100%|#########9| 648/649 [04:55<00:00,  1.76it/s]

==> step 649 dev acc: 0.12087912087912088


training: 100%|##########| 649/649 [05:00<00:00,  2.16it/s]
training:  10%|9         | 63/649 [00:25<04:01,  2.42it/s]

==> step 64 dev acc: 0.07142857142857142


training:  20%|#9        | 127/649 [00:56<03:36,  2.41it/s]

==> step 128 dev acc: 0.07692307692307693


training:  30%|##9       | 192/649 [01:28<08:24,  1.10s/it]

==> step 192 dev acc: 0.07692307692307693


training:  39%|###9      | 255/649 [01:54<03:31,  1.86it/s]

==> step 256 dev acc: 0.09340659340659341


training:  49%|####9     | 320/649 [02:28<06:09,  1.12s/it]

==> step 320 dev acc: 0.06593406593406594


training:  59%|#####9    | 384/649 [02:57<05:03,  1.15s/it]

==> step 384 dev acc: 0.09340659340659341


training:  69%|######9   | 448/649 [03:26<03:33,  1.06s/it]

==> step 448 dev acc: 0.08241758241758242


training:  79%|#######8  | 512/649 [03:54<02:32,  1.11s/it]

==> step 512 dev acc: 0.08791208791208792


training:  89%|########8 | 575/649 [04:20<00:29,  2.50it/s]

==> step 576 dev acc: 0.11538461538461539


training:  99%|#########8| 640/649 [04:53<00:09,  1.06s/it]

==> step 640 dev acc: 0.07142857142857142


training: 100%|##########| 649/649 [05:00<00:00,  2.16it/s]

==> step 649 dev acc: 0.11538461538461539



training:  10%|9         | 63/649 [00:25<04:03,  2.41it/s]

==> step 64 dev acc: 0.08241758241758242


training:  20%|#9        | 128/649 [00:59<09:24,  1.08s/it]

==> step 128 dev acc: 0.04945054945054945


training:  29%|##9       | 191/649 [01:24<03:26,  2.22it/s]

==> step 192 dev acc: 0.09340659340659341


training:  39%|###9      | 255/649 [01:54<03:31,  1.86it/s]

==> step 256 dev acc: 0.12637362637362637


training:  49%|####9     | 320/649 [02:28<06:09,  1.12s/it]

==> step 320 dev acc: 0.0989010989010989


training:  59%|#####9    | 384/649 [02:57<05:03,  1.15s/it]

==> step 384 dev acc: 0.09340659340659341


training:  69%|######8   | 447/649 [03:23<01:09,  2.90it/s]

==> step 448 dev acc: 0.13186813186813187


training:  79%|#######8  | 512/649 [03:56<02:31,  1.11s/it]

==> step 512 dev acc: 0.0989010989010989


training:  89%|########8 | 576/649 [04:25<01:18,  1.08s/it]

==> step 576 dev acc: 0.13186813186813187


training:  99%|#########8| 640/649 [04:53<00:09,  1.06s/it]

==> step 640 dev acc: 0.07142857142857142


training: 100%|#########9| 648/649 [04:57<00:00,  1.75it/s]

==> step 649 dev acc: 0.13736263736263737


training: 100%|##########| 649/649 [05:02<00:00,  2.14it/s]
training:  10%|9         | 63/649 [00:25<04:03,  2.41it/s]

==> step 64 dev acc: 0.07142857142857142


training:  20%|#9        | 128/649 [00:59<09:24,  1.08s/it]

==> step 128 dev acc: 0.04395604395604396


training:  30%|##9       | 192/649 [01:26<08:26,  1.11s/it]

==> step 192 dev acc: 0.06043956043956044


training:  39%|###9      | 255/649 [01:52<03:32,  1.85it/s]

==> step 256 dev acc: 0.1043956043956044


training:  49%|####9     | 320/649 [02:26<06:10,  1.12s/it]

==> step 320 dev acc: 0.04395604395604396


training:  59%|#####9    | 384/649 [02:55<05:04,  1.15s/it]

==> step 384 dev acc: 0.08791208791208792


training:  69%|######9   | 448/649 [03:24<03:33,  1.06s/it]

==> step 448 dev acc: 0.0989010989010989


training:  79%|#######8  | 512/649 [03:52<02:32,  1.11s/it]

==> step 512 dev acc: 0.08241758241758242


training:  89%|########8 | 576/649 [04:21<01:19,  1.09s/it]

==> step 576 dev acc: 0.0989010989010989


training:  98%|#########8| 639/649 [04:47<00:03,  2.52it/s]

==> step 640 dev acc: 0.10989010989010989


training: 100%|#########9| 648/649 [04:56<00:00,  1.64it/s]

==> step 649 dev acc: 0.11538461538461539


training: 100%|##########| 649/649 [05:00<00:00,  2.16it/s]
training:  10%|9         | 63/649 [00:25<04:00,  2.43it/s]

==> step 64 dev acc: 0.09340659340659341


training:  20%|#9        | 128/649 [00:59<09:37,  1.11s/it]

==> step 128 dev acc: 0.054945054945054944


training:  30%|##9       | 192/649 [01:27<08:31,  1.12s/it]

==> step 192 dev acc: 0.09340659340659341


training:  39%|###9      | 255/649 [01:53<03:32,  1.85it/s]

==> step 256 dev acc: 0.10989010989010989


training:  49%|####9     | 320/649 [02:27<06:16,  1.14s/it]

==> step 320 dev acc: 0.07692307692307693


training:  59%|#####9    | 384/649 [02:57<05:06,  1.16s/it]

==> step 384 dev acc: 0.09340659340659341


training:  69%|######9   | 448/649 [03:26<03:33,  1.06s/it]

==> step 448 dev acc: 0.09340659340659341


training:  79%|#######8  | 512/649 [03:54<02:33,  1.12s/it]

==> step 512 dev acc: 0.09340659340659341


training:  89%|########8 | 576/649 [04:23<01:20,  1.10s/it]

==> step 576 dev acc: 0.08791208791208792


training:  98%|#########8| 639/649 [04:49<00:04,  2.49it/s]

==> step 640 dev acc: 0.11538461538461539


training: 100%|##########| 649/649 [05:01<00:00,  2.16it/s]

==> step 649 dev acc: 0.11538461538461539



training:  10%|9         | 63/649 [00:25<04:04,  2.40it/s]

==> step 64 dev acc: 0.07692307692307693


training:  20%|#9        | 127/649 [00:57<03:39,  2.38it/s]

==> step 128 dev acc: 0.0989010989010989


training:  30%|##9       | 192/649 [01:29<08:33,  1.12s/it]

==> step 192 dev acc: 0.07142857142857142


training:  39%|###9      | 256/649 [01:58<07:36,  1.16s/it]

==> step 256 dev acc: 0.04945054945054945


training:  49%|####9     | 320/649 [02:27<06:12,  1.13s/it]

==> step 320 dev acc: 0.06043956043956044


training:  59%|#####9    | 384/649 [02:56<05:05,  1.15s/it]

==> step 384 dev acc: 0.054945054945054944


training:  69%|######8   | 447/649 [03:23<01:09,  2.89it/s]

==> step 448 dev acc: 0.11538461538461539


training:  79%|#######8  | 512/649 [03:56<02:33,  1.12s/it]

==> step 512 dev acc: 0.08241758241758242


training:  89%|########8 | 576/649 [04:25<01:20,  1.10s/it]

==> step 576 dev acc: 0.08791208791208792


training:  98%|#########8| 639/649 [04:51<00:04,  2.50it/s]

==> step 640 dev acc: 0.12637362637362637


training: 100%|##########| 649/649 [05:03<00:00,  2.14it/s]

==> step 649 dev acc: 0.11538461538461539



training:  10%|9         | 63/649 [00:25<04:04,  2.40it/s]

==> step 64 dev acc: 0.07692307692307693


training:  20%|#9        | 128/649 [00:59<09:35,  1.10s/it]

==> step 128 dev acc: 0.054945054945054944


training:  30%|##9       | 192/649 [01:27<08:33,  1.12s/it]

==> step 192 dev acc: 0.06593406593406594


training:  39%|###9      | 255/649 [01:53<03:34,  1.83it/s]

==> step 256 dev acc: 0.09340659340659341


training:  49%|####9     | 320/649 [02:27<06:14,  1.14s/it]

==> step 320 dev acc: 0.06593406593406594


training:  59%|#####9    | 384/649 [02:57<05:06,  1.16s/it]

==> step 384 dev acc: 0.07142857142857142


training:  69%|######9   | 448/649 [03:26<03:33,  1.06s/it]

==> step 448 dev acc: 0.07692307692307693


training:  79%|#######8  | 511/649 [03:52<01:03,  2.17it/s]

==> step 512 dev acc: 0.0989010989010989


training:  89%|########8 | 575/649 [04:23<00:29,  2.48it/s]

==> step 576 dev acc: 0.12087912087912088


training:  99%|#########8| 640/649 [04:56<00:09,  1.08s/it]

==> step 640 dev acc: 0.10989010989010989


training: 100%|##########| 649/649 [05:03<00:00,  2.14it/s]

==> step 649 dev acc: 0.0989010989010989



training:  10%|9         | 63/649 [00:25<04:01,  2.42it/s]

==> step 64 dev acc: 0.07142857142857142


training:  20%|#9        | 128/649 [00:59<09:34,  1.10s/it]

==> step 128 dev acc: 0.04395604395604396


training:  30%|##9       | 192/649 [01:27<08:34,  1.13s/it]

==> step 192 dev acc: 0.06593406593406594


training:  39%|###9      | 256/649 [01:56<07:32,  1.15s/it]

==> step 256 dev acc: 0.07142857142857142


training:  49%|####9     | 320/649 [02:25<06:14,  1.14s/it]

==> step 320 dev acc: 0.07142857142857142


training:  59%|#####9    | 383/649 [02:52<02:03,  2.15it/s]

==> step 384 dev acc: 0.07692307692307693


training:  69%|######8   | 447/649 [03:23<01:10,  2.88it/s]

==> step 448 dev acc: 0.08241758241758242


training:  79%|#######8  | 511/649 [03:54<01:03,  2.17it/s]

==> step 512 dev acc: 0.1043956043956044


training:  89%|########8 | 575/649 [04:25<00:29,  2.49it/s]

==> step 576 dev acc: 0.12087912087912088


training:  98%|#########8| 639/649 [04:55<00:04,  2.49it/s]

==> step 640 dev acc: 0.13186813186813187


training: 100%|##########| 649/649 [05:07<00:00,  2.11it/s]

==> step 649 dev acc: 0.12637362637362637



training:  10%|9         | 63/649 [00:25<04:04,  2.40it/s]

==> step 64 dev acc: 0.07142857142857142


training:  20%|#9        | 128/649 [00:59<09:33,  1.10s/it]

==> step 128 dev acc: 0.06593406593406594


training:  30%|##9       | 192/649 [01:27<08:32,  1.12s/it]

==> step 192 dev acc: 0.04945054945054945


training:  39%|###9      | 255/649 [01:53<03:34,  1.83it/s]

==> step 256 dev acc: 0.08241758241758242


training:  49%|####9     | 319/649 [02:25<02:23,  2.30it/s]

==> step 320 dev acc: 0.09340659340659341


training:  59%|#####9    | 384/649 [02:59<05:06,  1.16s/it]

==> step 384 dev acc: 0.054945054945054944


training:  69%|######9   | 448/649 [03:29<03:34,  1.07s/it]

==> step 448 dev acc: 0.06593406593406594


training:  79%|#######8  | 512/649 [03:57<02:32,  1.12s/it]

==> step 512 dev acc: 0.08791208791208792


training:  89%|########8 | 576/649 [04:26<01:19,  1.09s/it]

==> step 576 dev acc: 0.08241758241758242


training:  98%|#########8| 639/649 [04:52<00:03,  2.52it/s]

==> step 640 dev acc: 0.12087912087912088


training: 100%|##########| 649/649 [05:03<00:00,  2.14it/s]

==> step 649 dev acc: 0.11538461538461539



training:  10%|9         | 63/649 [00:25<04:05,  2.39it/s]

==> step 64 dev acc: 0.12087912087912088


training:  20%|#9        | 128/649 [00:59<09:31,  1.10s/it]

==> step 128 dev acc: 0.04395604395604396


training:  30%|##9       | 192/649 [01:27<08:33,  1.12s/it]

==> step 192 dev acc: 0.07142857142857142


training:  39%|###9      | 256/649 [01:56<07:34,  1.16s/it]

==> step 256 dev acc: 0.0989010989010989


training:  49%|####9     | 320/649 [02:25<06:13,  1.13s/it]

==> step 320 dev acc: 0.0989010989010989


training:  59%|#####9    | 384/649 [02:55<05:06,  1.16s/it]

==> step 384 dev acc: 0.07142857142857142


training:  69%|######8   | 447/649 [03:21<01:09,  2.90it/s]

==> step 448 dev acc: 0.12637362637362637


training:  79%|#######8  | 512/649 [03:54<02:33,  1.12s/it]

==> step 512 dev acc: 0.0989010989010989


training:  89%|########8 | 576/649 [04:23<01:20,  1.10s/it]

==> step 576 dev acc: 0.0989010989010989


training:  99%|#########8| 640/649 [04:51<00:09,  1.07s/it]

==> step 640 dev acc: 0.0989010989010989


training: 100%|##########| 649/649 [04:58<00:00,  2.17it/s]

==> step 649 dev acc: 0.08791208791208792



training:  10%|9         | 63/649 [00:25<04:04,  2.40it/s]

==> step 64 dev acc: 0.0989010989010989


training:  20%|#9        | 128/649 [00:59<09:34,  1.10s/it]

==> step 128 dev acc: 0.06043956043956044


training:  30%|##9       | 192/649 [01:27<08:31,  1.12s/it]

==> step 192 dev acc: 0.04395604395604396


training:  39%|###9      | 256/649 [01:56<07:33,  1.15s/it]

==> step 256 dev acc: 0.0989010989010989


training:  49%|####9     | 320/649 [02:25<06:13,  1.13s/it]

==> step 320 dev acc: 0.06593406593406594


training:  59%|#####9    | 384/649 [02:54<05:06,  1.16s/it]

==> step 384 dev acc: 0.07692307692307693


training:  69%|######8   | 447/649 [03:21<01:10,  2.87it/s]

==> step 448 dev acc: 0.10989010989010989


training:  79%|#######8  | 512/649 [03:54<02:33,  1.12s/it]

==> step 512 dev acc: 0.08791208791208792


training:  89%|########8 | 576/649 [04:24<01:19,  1.10s/it]

==> step 576 dev acc: 0.08791208791208792


training:  98%|#########8| 639/649 [04:49<00:03,  2.51it/s]

==> step 640 dev acc: 0.14285714285714285


training: 100%|##########| 649/649 [05:01<00:00,  2.16it/s]

==> step 649 dev acc: 0.13736263736263737



training:  10%|9         | 63/649 [00:25<04:06,  2.38it/s]

==> step 64 dev acc: 0.08791208791208792


training:  20%|#9        | 128/649 [00:59<09:31,  1.10s/it]

==> step 128 dev acc: 0.07142857142857142


training:  30%|##9       | 192/649 [01:27<08:30,  1.12s/it]

==> step 192 dev acc: 0.08241758241758242


training:  39%|###9      | 256/649 [01:56<07:36,  1.16s/it]

==> step 256 dev acc: 0.07692307692307693


training:  49%|####9     | 320/649 [02:25<06:13,  1.14s/it]

==> step 320 dev acc: 0.06593406593406594


training:  59%|#####9    | 383/649 [02:52<02:03,  2.15it/s]

==> step 384 dev acc: 0.09340659340659341


training:  69%|######9   | 448/649 [03:26<03:36,  1.07s/it]

==> step 448 dev acc: 0.06043956043956044


training:  79%|#######8  | 511/649 [03:52<01:03,  2.16it/s]

==> step 512 dev acc: 0.1043956043956044


training:  89%|########8 | 576/649 [04:26<01:20,  1.10s/it]

==> step 576 dev acc: 0.07142857142857142


training:  98%|#########8| 639/649 [04:51<00:03,  2.51it/s]

==> step 640 dev acc: 0.12087912087912088


training: 100%|##########| 649/649 [05:03<00:00,  2.14it/s]

==> step 649 dev acc: 0.0989010989010989



training:  10%|9         | 63/649 [00:25<04:04,  2.40it/s]

==> step 64 dev acc: 0.08241758241758242


training:  20%|#9        | 128/649 [00:59<09:35,  1.10s/it]

==> step 128 dev acc: 0.054945054945054944


training:  30%|##9       | 192/649 [01:27<08:34,  1.13s/it]

==> step 192 dev acc: 0.054945054945054944


training:  39%|###9      | 255/649 [01:53<03:35,  1.83it/s]

==> step 256 dev acc: 0.08791208791208792


training:  49%|####9     | 320/649 [02:27<06:15,  1.14s/it]

==> step 320 dev acc: 0.054945054945054944


training:  59%|#####9    | 384/649 [02:57<05:08,  1.16s/it]

==> step 384 dev acc: 0.06593406593406594


training:  69%|######9   | 448/649 [03:27<03:34,  1.07s/it]

==> step 448 dev acc: 0.06593406593406594


training:  79%|#######8  | 512/649 [03:55<02:33,  1.12s/it]

==> step 512 dev acc: 0.08241758241758242


training:  89%|########8 | 576/649 [04:24<01:19,  1.09s/it]

==> step 576 dev acc: 0.06593406593406594


training:  98%|#########8| 639/649 [04:49<00:03,  2.50it/s]

==> step 640 dev acc: 0.1043956043956044


training: 100%|##########| 649/649 [05:01<00:00,  2.15it/s]

==> step 649 dev acc: 0.0989010989010989



training:  10%|9         | 63/649 [00:25<04:04,  2.39it/s]

==> step 64 dev acc: 0.07142857142857142


training:  20%|#9        | 127/649 [00:57<03:39,  2.38it/s]

==> step 128 dev acc: 0.07692307692307693


training:  29%|##9       | 191/649 [01:27<03:29,  2.19it/s]

==> step 192 dev acc: 0.09340659340659341


training:  39%|###9      | 256/649 [02:00<07:37,  1.16s/it]

==> step 256 dev acc: 0.07142857142857142


training:  49%|####9     | 320/649 [02:30<06:15,  1.14s/it]

==> step 320 dev acc: 0.08791208791208792


training:  59%|#####9    | 384/649 [02:59<05:07,  1.16s/it]

==> step 384 dev acc: 0.09340659340659341


training:  69%|######9   | 448/649 [03:29<03:36,  1.08s/it]

==> step 448 dev acc: 0.06593406593406594


training:  79%|#######8  | 512/649 [03:57<02:33,  1.12s/it]

==> step 512 dev acc: 0.09340659340659341


training:  89%|########8 | 576/649 [04:26<01:19,  1.09s/it]

==> step 576 dev acc: 0.09340659340659341


training:  98%|#########8| 639/649 [04:52<00:04,  2.49it/s]

==> step 640 dev acc: 0.1043956043956044


training: 100%|##########| 649/649 [05:03<00:00,  2.14it/s]

==> step 649 dev acc: 0.08241758241758242



training:  10%|9         | 63/649 [00:25<04:04,  2.39it/s]

==> step 64 dev acc: 0.10989010989010989


training:  20%|#9        | 128/649 [00:59<09:31,  1.10s/it]

==> step 128 dev acc: 0.07692307692307693


training:  30%|##9       | 192/649 [01:27<08:31,  1.12s/it]

==> step 192 dev acc: 0.08241758241758242


training:  39%|###9      | 256/649 [01:56<07:36,  1.16s/it]

==> step 256 dev acc: 0.08791208791208792


training:  49%|####9     | 320/649 [02:25<06:15,  1.14s/it]

==> step 320 dev acc: 0.06593406593406594


training:  59%|#####9    | 384/649 [02:55<05:06,  1.16s/it]

==> step 384 dev acc: 0.06043956043956044


training:  69%|######9   | 448/649 [03:24<03:35,  1.07s/it]

==> step 448 dev acc: 0.07692307692307693


training:  79%|#######8  | 512/649 [03:52<02:32,  1.12s/it]

==> step 512 dev acc: 0.0989010989010989


training:  89%|########8 | 576/649 [04:21<01:19,  1.09s/it]

==> step 576 dev acc: 0.06593406593406594


training:  99%|#########8| 640/649 [04:50<00:09,  1.07s/it]

==> step 640 dev acc: 0.06043956043956044


training: 100%|##########| 649/649 [04:56<00:00,  2.19it/s]

==> step 649 dev acc: 0.08241758241758242



training:  10%|9         | 63/649 [00:25<04:06,  2.38it/s]

==> step 64 dev acc: 0.09340659340659341


training:  20%|#9        | 128/649 [00:59<09:34,  1.10s/it]

==> step 128 dev acc: 0.06593406593406594


training:  30%|##9       | 192/649 [01:27<08:32,  1.12s/it]

==> step 192 dev acc: 0.07142857142857142


training:  39%|###9      | 256/649 [01:56<07:37,  1.16s/it]

==> step 256 dev acc: 0.07142857142857142


training:  49%|####9     | 320/649 [02:26<06:16,  1.14s/it]

==> step 320 dev acc: 0.08791208791208792


training:  59%|#####9    | 384/649 [02:55<05:08,  1.17s/it]

==> step 384 dev acc: 0.08791208791208792


training:  69%|######9   | 448/649 [03:25<03:36,  1.08s/it]

==> step 448 dev acc: 0.08791208791208792


training:  79%|#######8  | 512/649 [03:53<02:33,  1.12s/it]

==> step 512 dev acc: 0.07692307692307693


training:  89%|########8 | 576/649 [04:22<01:19,  1.09s/it]

==> step 576 dev acc: 0.08791208791208792


training:  99%|#########8| 640/649 [04:50<00:09,  1.07s/it]

==> step 640 dev acc: 0.09340659340659341


training: 100%|##########| 649/649 [04:57<00:00,  2.18it/s]

==> step 649 dev acc: 0.09340659340659341



training:  10%|9         | 63/649 [00:25<04:03,  2.40it/s]

==> step 64 dev acc: 0.09340659340659341


training:  20%|#9        | 128/649 [00:59<09:33,  1.10s/it]

==> step 128 dev acc: 0.08241758241758242


training:  30%|##9       | 192/649 [01:27<08:34,  1.13s/it]

==> step 192 dev acc: 0.08791208791208792


training:  39%|###9      | 256/649 [01:56<07:35,  1.16s/it]

==> step 256 dev acc: 0.06043956043956044


training:  49%|####9     | 320/649 [02:25<06:13,  1.14s/it]

==> step 320 dev acc: 0.08241758241758242


training:  59%|#####9    | 383/649 [02:52<02:03,  2.15it/s]

==> step 384 dev acc: 0.1043956043956044


training:  69%|######9   | 448/649 [03:27<03:36,  1.08s/it]

==> step 448 dev acc: 0.1043956043956044


training:  79%|#######8  | 512/649 [03:55<02:33,  1.12s/it]

==> step 512 dev acc: 0.08791208791208792


training:  89%|########8 | 576/649 [04:24<01:19,  1.09s/it]

==> step 576 dev acc: 0.08791208791208792


training:  99%|#########8| 640/649 [04:52<00:09,  1.07s/it]

==> step 640 dev acc: 0.1043956043956044


training: 100%|#########9| 648/649 [04:57<00:00,  1.73it/s]

==> step 649 dev acc: 0.10989010989010989


training: 100%|##########| 649/649 [05:01<00:00,  2.15it/s]
training:  10%|9         | 63/649 [00:25<04:05,  2.38it/s]

==> step 64 dev acc: 0.07692307692307693


training:  20%|#9        | 128/649 [00:59<09:33,  1.10s/it]

==> step 128 dev acc: 0.06043956043956044


training:  30%|##9       | 192/649 [01:27<08:31,  1.12s/it]

==> step 192 dev acc: 0.07142857142857142


training:  39%|###9      | 255/649 [01:53<03:34,  1.84it/s]

==> step 256 dev acc: 0.08791208791208792


training:  49%|####9     | 320/649 [02:27<06:16,  1.14s/it]

==> step 320 dev acc: 0.08241758241758242


training:  59%|#####9    | 383/649 [02:54<02:03,  2.15it/s]

==> step 384 dev acc: 0.10989010989010989


training:  69%|######9   | 448/649 [03:29<03:36,  1.08s/it]

==> step 448 dev acc: 0.07142857142857142


training:  79%|#######8  | 512/649 [03:57<02:33,  1.12s/it]

==> step 512 dev acc: 0.09340659340659341


training:  89%|########8 | 576/649 [04:26<01:19,  1.09s/it]

==> step 576 dev acc: 0.07692307692307693


training:  98%|#########8| 639/649 [04:52<00:04,  2.49it/s]

==> step 640 dev acc: 0.13186813186813187


training: 100%|##########| 649/649 [05:03<00:00,  2.14it/s]

==> step 649 dev acc: 0.12087912087912088



training:  10%|9         | 63/649 [00:25<04:05,  2.39it/s]

==> step 64 dev acc: 0.12637362637362637


training:  20%|#9        | 128/649 [00:59<09:30,  1.10s/it]

==> step 128 dev acc: 0.0989010989010989


training:  30%|##9       | 192/649 [01:27<08:32,  1.12s/it]

==> step 192 dev acc: 0.08791208791208792


training:  39%|###9      | 256/649 [01:56<07:36,  1.16s/it]

==> step 256 dev acc: 0.08241758241758242


training:  49%|####9     | 320/649 [02:25<06:13,  1.14s/it]

==> step 320 dev acc: 0.06593406593406594


training:  59%|#####9    | 384/649 [02:55<05:06,  1.16s/it]

==> step 384 dev acc: 0.07692307692307693


training:  69%|######9   | 448/649 [03:24<03:35,  1.07s/it]

==> step 448 dev acc: 0.07692307692307693


training:  79%|#######8  | 512/649 [03:53<02:33,  1.12s/it]

==> step 512 dev acc: 0.08791208791208792


training:  89%|########8 | 576/649 [04:22<01:20,  1.10s/it]

==> step 576 dev acc: 0.07142857142857142


training:  99%|#########8| 640/649 [04:50<00:09,  1.07s/it]

==> step 640 dev acc: 0.1043956043956044


training: 100%|##########| 649/649 [04:57<00:00,  2.18it/s]

==> step 649 dev acc: 0.10989010989010989



training:  10%|9         | 63/649 [00:25<04:04,  2.40it/s]

==> step 64 dev acc: 0.1043956043956044


training:  20%|#9        | 128/649 [00:59<09:31,  1.10s/it]

==> step 128 dev acc: 0.07692307692307693


training:  30%|##9       | 192/649 [01:27<08:33,  1.12s/it]

==> step 192 dev acc: 0.08241758241758242


training:  39%|###9      | 256/649 [01:56<07:36,  1.16s/it]

==> step 256 dev acc: 0.1043956043956044


training:  49%|####9     | 320/649 [02:25<06:13,  1.14s/it]

==> step 320 dev acc: 0.0989010989010989


training:  59%|#####9    | 384/649 [02:55<05:06,  1.16s/it]

==> step 384 dev acc: 0.0989010989010989


training:  69%|######9   | 448/649 [03:24<03:35,  1.07s/it]

==> step 448 dev acc: 0.08791208791208792


training:  79%|#######8  | 512/649 [03:53<02:33,  1.12s/it]

==> step 512 dev acc: 0.07142857142857142


training:  89%|########8 | 576/649 [04:22<01:19,  1.09s/it]

==> step 576 dev acc: 0.08241758241758242


training:  98%|#########8| 639/649 [04:47<00:03,  2.51it/s]

==> step 640 dev acc: 0.12637362637362637


training: 100%|##########| 649/649 [04:59<00:00,  2.17it/s]

==> step 649 dev acc: 0.11538461538461539



training:  10%|9         | 63/649 [00:25<04:04,  2.39it/s]

==> step 64 dev acc: 0.08241758241758242


training:  20%|#9        | 128/649 [00:59<09:34,  1.10s/it]

==> step 128 dev acc: 0.07142857142857142


training:  30%|##9       | 192/649 [01:27<08:32,  1.12s/it]

==> step 192 dev acc: 0.07142857142857142


training:  39%|###9      | 256/649 [01:56<07:33,  1.15s/it]

==> step 256 dev acc: 0.07142857142857142


training:  49%|####9     | 319/649 [02:22<02:22,  2.31it/s]

==> step 320 dev acc: 0.0989010989010989


training:  59%|#####9    | 384/649 [02:57<05:09,  1.17s/it]

==> step 384 dev acc: 0.08241758241758242


training:  69%|######9   | 448/649 [03:26<03:35,  1.07s/it]

==> step 448 dev acc: 0.09340659340659341


training:  79%|#######8  | 512/649 [03:55<02:33,  1.12s/it]

==> step 512 dev acc: 0.0989010989010989


training:  89%|########8 | 576/649 [04:24<01:19,  1.10s/it]

==> step 576 dev acc: 0.08791208791208792


training:  99%|#########8| 640/649 [04:52<00:09,  1.07s/it]

==> step 640 dev acc: 0.08791208791208792


training: 100%|#########9| 648/649 [04:56<00:00,  1.73it/s]

==> step 649 dev acc: 0.12087912087912088


training: 100%|##########| 649/649 [05:01<00:00,  2.15it/s]
training:  10%|9         | 63/649 [00:25<04:04,  2.40it/s]

==> step 64 dev acc: 0.0989010989010989


training:  20%|#9        | 128/649 [00:59<09:32,  1.10s/it]

==> step 128 dev acc: 0.06593406593406594


training:  30%|##9       | 192/649 [01:27<08:32,  1.12s/it]

==> step 192 dev acc: 0.06043956043956044


training:  39%|###9      | 256/649 [01:56<07:35,  1.16s/it]

==> step 256 dev acc: 0.08241758241758242


training:  49%|####9     | 320/649 [02:25<06:16,  1.14s/it]

==> step 320 dev acc: 0.06593406593406594


training:  59%|#####9    | 384/649 [02:55<05:07,  1.16s/it]

==> step 384 dev acc: 0.08791208791208792


training:  69%|######9   | 448/649 [03:25<03:35,  1.07s/it]

==> step 448 dev acc: 0.08791208791208792


training:  79%|#######8  | 512/649 [03:53<02:33,  1.12s/it]

==> step 512 dev acc: 0.09340659340659341


training:  89%|########8 | 575/649 [04:19<00:29,  2.48it/s]

==> step 576 dev acc: 0.1043956043956044


training:  98%|#########8| 639/649 [04:50<00:04,  2.49it/s]

==> step 640 dev acc: 0.10989010989010989


training: 100%|#########9| 648/649 [04:59<00:00,  1.63it/s]

==> step 649 dev acc: 0.11538461538461539


training: 100%|##########| 649/649 [05:03<00:00,  2.14it/s]
training:  10%|9         | 63/649 [00:25<04:05,  2.39it/s]

==> step 64 dev acc: 0.09340659340659341


training:  20%|#9        | 128/649 [00:59<09:30,  1.10s/it]

==> step 128 dev acc: 0.08241758241758242


training:  30%|##9       | 192/649 [01:27<08:33,  1.12s/it]

==> step 192 dev acc: 0.07142857142857142


training:  39%|###9      | 255/649 [01:53<03:33,  1.84it/s]

==> step 256 dev acc: 0.0989010989010989


training:  49%|####9     | 320/649 [02:28<06:17,  1.15s/it]

==> step 320 dev acc: 0.08791208791208792


training:  59%|#####9    | 384/649 [02:57<05:05,  1.15s/it]

==> step 384 dev acc: 0.07692307692307693


training:  69%|######9   | 448/649 [03:26<03:34,  1.07s/it]

==> step 448 dev acc: 0.08241758241758242


training:  79%|#######8  | 512/649 [03:55<02:33,  1.12s/it]

==> step 512 dev acc: 0.08241758241758242


training:  89%|########8 | 576/649 [04:24<01:19,  1.10s/it]

==> step 576 dev acc: 0.08241758241758242


training:  99%|#########8| 640/649 [04:52<00:09,  1.07s/it]

==> step 640 dev acc: 0.07142857142857142


training: 100%|##########| 649/649 [04:59<00:00,  2.17it/s]

==> step 649 dev acc: 0.06593406593406594



training:  10%|9         | 63/649 [00:25<04:04,  2.40it/s]

==> step 64 dev acc: 0.09340659340659341


training:  20%|#9        | 128/649 [00:59<09:34,  1.10s/it]

==> step 128 dev acc: 0.08241758241758242


training:  29%|##9       | 191/649 [01:25<03:28,  2.20it/s]

==> step 192 dev acc: 0.10989010989010989


training:  39%|###9      | 256/649 [01:58<07:38,  1.17s/it]

==> step 256 dev acc: 0.0989010989010989


training:  49%|####9     | 320/649 [02:27<06:14,  1.14s/it]

==> step 320 dev acc: 0.08241758241758242


training:  59%|#####9    | 384/649 [02:57<05:06,  1.16s/it]

==> step 384 dev acc: 0.08241758241758242


training:  69%|######9   | 448/649 [03:26<03:36,  1.08s/it]

==> step 448 dev acc: 0.08791208791208792


training:  79%|#######8  | 512/649 [03:55<02:34,  1.13s/it]

==> step 512 dev acc: 0.1043956043956044


training:  89%|########8 | 576/649 [04:24<01:20,  1.10s/it]

==> step 576 dev acc: 0.09340659340659341


training:  99%|#########8| 640/649 [04:52<00:09,  1.07s/it]

==> step 640 dev acc: 0.1043956043956044


training: 100%|##########| 649/649 [04:59<00:00,  2.17it/s]

==> step 649 dev acc: 0.10989010989010989



training:  10%|9         | 63/649 [00:25<04:04,  2.40it/s]

==> step 64 dev acc: 0.09340659340659341


training:  20%|#9        | 128/649 [00:59<09:34,  1.10s/it]

==> step 128 dev acc: 0.09340659340659341


training:  29%|##9       | 191/649 [01:25<03:28,  2.20it/s]

==> step 192 dev acc: 0.0989010989010989


training:  39%|###9      | 256/649 [01:58<07:39,  1.17s/it]

==> step 256 dev acc: 0.08241758241758242


training:  49%|####9     | 320/649 [02:28<06:14,  1.14s/it]

==> step 320 dev acc: 0.08241758241758242


training:  59%|#####9    | 384/649 [02:57<05:06,  1.16s/it]

==> step 384 dev acc: 0.08791208791208792


training:  69%|######8   | 447/649 [03:24<01:10,  2.88it/s]

==> step 448 dev acc: 0.1043956043956044


training:  79%|#######8  | 512/649 [03:57<02:34,  1.13s/it]

==> step 512 dev acc: 0.08791208791208792


training:  89%|########8 | 575/649 [04:24<00:29,  2.49it/s]

==> step 576 dev acc: 0.11538461538461539


training:  98%|#########8| 639/649 [04:54<00:04,  2.49it/s]

==> step 640 dev acc: 0.12087912087912088


training: 100%|##########| 649/649 [05:06<00:00,  2.12it/s]

==> step 649 dev acc: 0.0989010989010989



training:  10%|9         | 63/649 [00:25<04:05,  2.38it/s]

==> step 64 dev acc: 0.0989010989010989


training:  20%|#9        | 128/649 [00:59<09:32,  1.10s/it]

==> step 128 dev acc: 0.09340659340659341


training:  30%|##9       | 192/649 [01:27<08:34,  1.13s/it]

==> step 192 dev acc: 0.09340659340659341


training:  39%|###9      | 256/649 [01:56<07:34,  1.16s/it]

==> step 256 dev acc: 0.08791208791208792


training:  49%|####9     | 320/649 [02:25<06:14,  1.14s/it]

==> step 320 dev acc: 0.08241758241758242


training:  59%|#####9    | 384/649 [02:55<05:06,  1.16s/it]

==> step 384 dev acc: 0.09340659340659341


training:  69%|######9   | 448/649 [03:24<03:35,  1.07s/it]

==> step 448 dev acc: 0.09340659340659341


training:  79%|#######8  | 512/649 [03:53<02:33,  1.12s/it]

==> step 512 dev acc: 0.08791208791208792


training:  89%|########8 | 576/649 [04:22<01:20,  1.10s/it]

==> step 576 dev acc: 0.07692307692307693


training:  99%|#########8| 640/649 [04:50<00:09,  1.07s/it]

==> step 640 dev acc: 0.08241758241758242


training: 100%|##########| 649/649 [04:57<00:00,  2.18it/s]

==> step 649 dev acc: 0.09340659340659341



training:  10%|9         | 63/649 [00:25<04:05,  2.39it/s]

==> step 64 dev acc: 0.11538461538461539


training:  20%|#9        | 128/649 [00:59<09:33,  1.10s/it]

==> step 128 dev acc: 0.11538461538461539


training:  30%|##9       | 192/649 [01:27<08:32,  1.12s/it]

==> step 192 dev acc: 0.0989010989010989


training:  39%|###9      | 256/649 [01:56<07:37,  1.16s/it]

==> step 256 dev acc: 0.0989010989010989


training:  49%|####9     | 320/649 [02:25<06:13,  1.14s/it]

==> step 320 dev acc: 0.07692307692307693


training:  59%|#####9    | 384/649 [02:55<05:06,  1.16s/it]

==> step 384 dev acc: 0.09340659340659341


training:  69%|######9   | 448/649 [03:24<03:34,  1.07s/it]

==> step 448 dev acc: 0.09340659340659341


training:  79%|#######8  | 512/649 [03:53<02:32,  1.11s/it]

==> step 512 dev acc: 0.09340659340659341


training:  89%|########8 | 576/649 [04:22<01:19,  1.10s/it]

==> step 576 dev acc: 0.07692307692307693


training:  99%|#########8| 640/649 [04:50<00:09,  1.07s/it]

==> step 640 dev acc: 0.1043956043956044


training: 100%|##########| 649/649 [04:57<00:00,  2.18it/s]

==> step 649 dev acc: 0.1043956043956044



training:  10%|9         | 63/649 [00:25<04:03,  2.40it/s]

==> step 64 dev acc: 0.08791208791208792


training:  20%|#9        | 127/649 [00:57<03:38,  2.39it/s]

==> step 128 dev acc: 0.09340659340659341


training:  30%|##9       | 192/649 [01:29<08:34,  1.13s/it]

==> step 192 dev acc: 0.07142857142857142


training:  39%|###9      | 256/649 [01:58<07:37,  1.16s/it]

==> step 256 dev acc: 0.07692307692307693


training:  49%|####9     | 320/649 [02:27<06:13,  1.13s/it]

==> step 320 dev acc: 0.08791208791208792


training:  59%|#####9    | 384/649 [02:57<05:08,  1.16s/it]

==> step 384 dev acc: 0.06593406593406594


training:  69%|######9   | 448/649 [03:26<03:36,  1.08s/it]

==> step 448 dev acc: 0.06593406593406594


training:  79%|#######8  | 512/649 [03:55<02:33,  1.12s/it]

==> step 512 dev acc: 0.09340659340659341


training:  89%|########8 | 575/649 [04:21<00:29,  2.51it/s]

==> step 576 dev acc: 0.11538461538461539


training:  99%|#########8| 640/649 [04:54<00:09,  1.08s/it]

==> step 640 dev acc: 0.08791208791208792


training: 100%|##########| 649/649 [05:01<00:00,  2.15it/s]

==> step 649 dev acc: 0.08241758241758242



training:  10%|9         | 63/649 [00:25<04:03,  2.41it/s]

==> step 64 dev acc: 0.0989010989010989


training:  16%|#6        | 105/649 [00:47<03:37,  2.50it/s]

In [None]:
evaluate(model, test_data, device, tokenizer, mute=False, batch_size=10)