In [1]:
!pip install transformers
!pip install datasets
from google.colab import drive
# drive.mount('/content/drive')
# % cd /content/drive/MyDrive/TranslationArtifact_CodesAndGraphs

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 31.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 2.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 18.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 40.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [2]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
from transformers import get_linear_schedule_with_warmup, get_constant_schedule
import torch
import torch.nn as nn
import torch.nn.functional as F 
from datasets import load_dataset
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
import math
from tqdm import tqdm
import copy
import pickle
import os
import random

In [3]:
def find_answer(question_text, answer, ref_text, tokenizer) -> bool:
    model_input = tokenizer(question_text, ref_text, truncation=True, padding=True, return_token_type_ids=True, add_special_tokens=True)['input_ids'] #model input has both question text and reference text
    answer = tokenizer(answer, truncation=True, padding=True, return_token_type_ids=True, add_special_tokens=False)['input_ids']
    if len(model_input) > 512:
        return -1, -1
    for i in range(0, len(model_input) - len(answer) + 1):
        if answer == model_input[i: i + len(answer)]:
            return (i, i + len(answer))
    return -1, -1
# prepair input
def prepare_inputs(indexes, data, tokenizer):
    contexts = []
    questions = []
    answer_starts = []
    answer_ends = []
    for i in indexes:
        t = data[i]
        question = t['question']
        answer = t['answers']['text'][0]
        context = t['context']
        s, e = find_answer(question, answer, context, tokenizer)
        if s == -1:
            continue
        contexts.append(context)
        questions.append(question)
        answer_starts.append(s)
        answer_ends.append(e)
    return contexts, questions, answer_starts, answer_ends

In [5]:
def get_scheduler(optimizer, scheduler: str, warmup_steps: int, t_total: int):
    """
    Returns the correct learning rate scheduler
    """
    scheduler = scheduler.lower()
    if scheduler=='constantlr':
        return get_constant_schedule(optimizer)
    elif scheduler=='warmupconstant':
        return get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
    elif scheduler=='warmuplinear':
        return get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
    elif scheduler=='warmupcosine':
        return get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
    elif scheduler=='warmupcosinewithhardrestarts':
        return get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
    else:
        raise ValueError("Unknown scheduler {}".format(scheduler))
        
def train(model, optimizer, scheduler, train_data, dev_data, batch_size, device, max_grad_norm, tokenizer, best_acc = -1):
    loss_fn = nn.CrossEntropyLoss()

    step_cnt = 0
    best_model_weights = None
    
    contexts, questions, answer_starts, answer_ends = train_data

    for pointer in tqdm(range(0, len(contexts), batch_size), desc='training',ascii = True,leave = True):
        model.train() # model was in eval mode in evaluate(); re-activate the train mode
        optimizer.zero_grad() # clear gradients first
        torch.cuda.empty_cache() # releases all unoccupied cached memory 
        # t = torch.cuda.get_device_properties(0).total_memory
        # r = torch.cuda.memory_reserved(0) 
        # a = torch.cuda.memory_allocated(0)
        # print(t,r,a)

        step_cnt += 1
        
        input = tokenizer(contexts[pointer:min(pointer + batch_size, len(contexts))], questions[pointer:min(pointer + batch_size, len(contexts))], return_tensors="pt",truncation=True, padding=True, return_token_type_ids=True, add_special_tokens=True)
        input.to(device)
        
        answer_start, answer_end = (answer_starts[pointer:min(pointer + batch_size, len(contexts))], answer_ends[pointer:min(pointer + batch_size, len(contexts))])
        true_labels1 = torch.LongTensor(np.array(answer_start)).to(device)
        true_labels2 = torch.LongTensor(np.array(answer_end)).to(device)
            
        output = model(**input)
        if output is None: continue
        pred_indicies1 = output['start_logits']
        pred_indicies2 = output['end_logits']
        # print(pred_indicies1.shape, true_labels1.shape)
        loss1 = loss_fn(pred_indicies1,true_labels1)
        loss2 = loss_fn(pred_indicies2,true_labels2)
        loss = loss1 + loss2

        # back propagate
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        # update weights 
        optimizer.step()

        # update training rate
        scheduler.step()

        if step_cnt%int(len(contexts)/batch_size/10) == 0 or step_cnt == math.ceil(len(contexts)*1./batch_size):
            acc = evaluate(model,dev_data,device,tokenizer,mute=True)
            print('==> step {} dev acc: {}'.format(step_cnt,acc))
            if acc > best_acc:
                best_acc = acc
                best_model_weights = copy.deepcopy(model.cpu().state_dict())
                model.to(device)

    return best_model_weights
    
def evaluate(model, test_data, device, tokenizer, mute=False, batch_size=10):
    model.eval()
    contexts, questions, answer_starts, answer_ends = test_data
    all_labels = []
    all_predict = np.array([])
    with torch.no_grad():
        for pointer in range(0, len(contexts), batch_size):            
            input = tokenizer(contexts[pointer:min(pointer + batch_size, len(contexts))], questions[pointer:min(pointer + batch_size, len(contexts))], return_tensors="pt",truncation=True, padding=True, return_token_type_ids=True, add_special_tokens=True)
            input.to(device)

            answer_start, answer_end = (answer_starts[pointer:min(pointer + batch_size, len(contexts))], answer_ends[pointer:min(pointer + batch_size, len(contexts))])
            all_labels = all_labels + answer_start 
            all_labels = all_labels + answer_end
            
            outputs = model(**input)
            
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits
            predict_start = [np.argmax(pp) for pp in start_logits.cpu()]
            predict_end = [np.argmax(pp) for pp in end_logits.cpu()]
            all_predict = np.concatenate((all_predict, predict_start), axis=None)
            all_predict = np.concatenate((all_predict, predict_end), axis=None)
    assert len(all_predict) == len(all_labels)


    acc = len([i for i in range(len(all_labels)) if all_predict[i]==all_labels[i]])*1./len(all_labels)

    if not mute:
        print('==>acc<==', acc)

    return acc

In [6]:
train_d = load_dataset('tydiqa', name = 'primary_task', split = 'train')
train_d[0]

Downloading builder script:   0%|          | 0.00/3.49k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading and preparing dataset tydiqa/primary_task (download: 1.82 GiB, generated: 5.62 GiB, post-processed: Unknown size, total: 7.44 GiB) to /root/.cache/huggingface/datasets/tydiqa/primary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/161M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/58.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.62M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/166916 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/18670 [00:00<?, ? examples/s]

Dataset tydiqa downloaded and prepared to /root/.cache/huggingface/datasets/tydiqa/primary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148. Subsequent calls will reuse this data.


{'annotations': {'minimal_answers_end_byte': [-1],
  'minimal_answers_start_byte': [-1],
  'passage_answer_candidate_index': [-1],
  'yes_no_answer': ['NONE']},
 'document_plaintext': '\ntransl.\n\nRas (dari bahasa Prancis race, yang sendirinya dari bahasa Latin radix, "akar") adalah suatu sistem klasifikasi yang digunakan untuk mengkategorikan manusia dalam populasi atau kelompok besar dan berbeda melalui ciri fenotipe, asal usul geografis, tampang jasmani dan kesukuan yang terwarisi. Di awal abad ke-20 istilah ini sering digunakan dalam arti biologis untuk menunjuk populasi manusia yang beraneka ragam dari segi genetik dengan anggota yang memiliki fenotipe (tampang luar) yang sama.[1] Arti "ras" ini masih digunakan dalam antropologi forensik (dalam menganalisa sisa tulang), penelitian biomedis dan kedokteran berdasarkan asal usul.[2]\nDi samping itu, di Amerika Serikat misalnya, penegak hukum menggunakan istilah "ras" dalam menentukan profil tersangka dan penggambaran kembali tampang

In [7]:
l = 'english'

train_d = load_dataset('tydiqa', name = 'secondary_task', split = 'train')
train_d = train_d.shuffle()
test_d = load_dataset('tydiqa', name = 'secondary_task', split = 'validation')

tokenizer=AutoTokenizer.from_pretrained('xlm-roberta-base')
languages = [(t['id'].split("-")[0]) for t in train_d]
# find target language indicies
indexes_train = [i for i,x in enumerate(languages) if x == l]
indexes_dev = indexes_train[-100:]
indexes_train = indexes_train[:-100]
languages = [(t['id'].split("-")[0]) for t in test_d]
indexes_test = [i for i,x in enumerate(languages) if x == l]

train_data = prepare_inputs(indexes_train, train_d, tokenizer)
dev_data = prepare_inputs(indexes_dev, train_d, tokenizer)
test_data = prepare_inputs(indexes_test, test_d, tokenizer)

Downloading and preparing dataset tydiqa/secondary_task (download: 1.82 GiB, generated: 55.27 MiB, post-processed: Unknown size, total: 1.87 GiB) to /root/.cache/huggingface/datasets/tydiqa/secondary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/49881 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5077 [00:00<?, ? examples/s]

Dataset tydiqa downloaded and prepared to /root/.cache/huggingface/datasets/tydiqa/secondary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148. Subsequent calls will reuse this data.


Reusing dataset tydiqa (/root/.cache/huggingface/datasets/tydiqa/secondary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148)


Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

In [8]:
epoch_num = 100
batch_size = 5
warmup_percent = 0.2
max_grad_norm = 1
scheduler_setting = 'WarmupLinear'
device = 'cuda'
total_steps = math.ceil(epoch_num*len(train_data[0])*1./batch_size)
warmup_steps = int(total_steps*warmup_percent)

In [None]:
model_path = 'ModelWeights'
iter = 0
# for i in range(0, epoch_num):
#     if os.path.exists('./model weights/' + model_path + str(i)):
#       iter = i
identifier = "xlm-roberta-base" if iter == 0 else './model weights/' + model_path + str(iter)
iter = iter + 1 if iter != 0 else iter

print(identifier)
model = AutoModelForQuestionAnswering.from_pretrained('xlm-roberta-base')
tokenizer=AutoTokenizer.from_pretrained('xlm-roberta-base')
model.to(device)
optimizer = AdamW(model.parameters(),lr=2e-5,eps=1e-6,correct_bias=False)
scheduler = get_scheduler(optimizer, scheduler_setting, warmup_steps=warmup_steps, t_total=total_steps) 
for i in range(0, epoch_num):
    best_weight = train(model, optimizer, scheduler, train_data, dev_data, batch_size, device, max_grad_norm, tokenizer, best_acc = -1)
    model.load_state_dict(best_weight)
    # model.save_pretrained(save_directory = './model weights/'+ model_path + str(i))

xlm-roberta-base


Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForQuestionAnswering: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream tas

==> step 64 dev acc: 0.0


training:  20%|#9        | 128/649 [01:54<15:58,  1.84s/it]

==> step 128 dev acc: 0.0


training:  29%|##9       | 191/649 [02:46<05:01,  1.52it/s]

==> step 192 dev acc: 0.005681818181818182


training:  39%|###9      | 256/649 [03:46<11:31,  1.76s/it]

==> step 256 dev acc: 0.005681818181818182


training:  49%|####9     | 320/649 [04:43<09:05,  1.66s/it]

==> step 320 dev acc: 0.005681818181818182


training:  59%|#####9    | 384/649 [05:39<07:51,  1.78s/it]

==> step 384 dev acc: 0.005681818181818182


training:  69%|######9   | 448/649 [06:34<05:40,  1.69s/it]

==> step 448 dev acc: 0.005681818181818182


training:  79%|#######8  | 512/649 [07:27<04:03,  1.78s/it]

==> step 512 dev acc: 0.005681818181818182


training:  89%|########8 | 576/649 [08:22<02:02,  1.68s/it]

==> step 576 dev acc: 0.005681818181818182


training:  99%|#########8| 640/649 [09:17<00:15,  1.76s/it]

==> step 640 dev acc: 0.0


training: 100%|##########| 649/649 [09:27<00:00,  1.14it/s]

==> step 649 dev acc: 0.0



training:  10%|9         | 63/649 [00:52<07:37,  1.28it/s]

==> step 64 dev acc: 0.0


training:  20%|#9        | 128/649 [01:55<16:02,  1.85s/it]

==> step 128 dev acc: 0.0


training:  30%|##9       | 192/649 [02:51<12:43,  1.67s/it]

==> step 192 dev acc: 0.0


training:  39%|###9      | 256/649 [03:45<11:32,  1.76s/it]

==> step 256 dev acc: 0.0


training:  49%|####9     | 320/649 [04:42<09:06,  1.66s/it]

==> step 320 dev acc: 0.0


training:  59%|#####9    | 384/649 [05:37<07:51,  1.78s/it]

==> step 384 dev acc: 0.0


training:  69%|######8   | 447/649 [06:28<02:31,  1.34it/s]

==> step 448 dev acc: 0.017045454545454544


training:  79%|#######8  | 512/649 [07:27<04:05,  1.79s/it]

==> step 512 dev acc: 0.017045454545454544


training:  89%|########8 | 576/649 [08:22<02:01,  1.67s/it]

==> step 576 dev acc: 0.017045454545454544


training:  99%|#########8| 640/649 [09:16<00:15,  1.75s/it]

==> step 640 dev acc: 0.017045454545454544


training: 100%|##########| 649/649 [09:26<00:00,  1.14it/s]

==> step 649 dev acc: 0.011363636363636364



training:  10%|9         | 63/649 [00:52<07:30,  1.30it/s]

==> step 64 dev acc: 0.011363636363636364


training:  20%|#9        | 127/649 [01:50<07:42,  1.13it/s]

==> step 128 dev acc: 0.017045454545454544


training:  30%|##9       | 192/649 [02:52<12:41,  1.67s/it]

==> step 192 dev acc: 0.017045454545454544


training:  39%|###9      | 256/649 [03:45<11:30,  1.76s/it]

==> step 256 dev acc: 0.017045454545454544


training:  49%|####9     | 320/649 [04:42<09:03,  1.65s/it]

==> step 320 dev acc: 0.011363636363636364


training:  59%|#####9    | 384/649 [05:37<07:50,  1.77s/it]

==> step 384 dev acc: 0.017045454545454544


training:  69%|######8   | 447/649 [06:27<02:29,  1.36it/s]

==> step 448 dev acc: 0.022727272727272728


training:  79%|#######8  | 512/649 [07:26<04:04,  1.79s/it]

==> step 512 dev acc: 0.017045454545454544


training:  89%|########8 | 576/649 [08:20<02:00,  1.65s/it]

==> step 576 dev acc: 0.011363636363636364


training:  98%|#########8| 639/649 [09:10<00:07,  1.32it/s]

==> step 640 dev acc: 0.028409090909090908


training: 100%|##########| 649/649 [09:26<00:00,  1.15it/s]

==> step 649 dev acc: 0.017045454545454544



training:  10%|9         | 63/649 [00:51<07:25,  1.31it/s]

==> step 64 dev acc: 0.022727272727272728


training:  20%|#9        | 128/649 [01:53<15:47,  1.82s/it]

==> step 128 dev acc: 0.005681818181818182


training:  29%|##9       | 191/649 [02:44<04:55,  1.55it/s]

==> step 192 dev acc: 0.028409090909090908


training:  39%|###9      | 255/649 [03:38<04:50,  1.35it/s]

==> step 256 dev acc: 0.03977272727272727


training:  49%|####9     | 320/649 [04:40<08:54,  1.62s/it]

==> step 320 dev acc: 0.017045454545454544


training:  59%|#####9    | 384/649 [05:35<07:44,  1.75s/it]

==> step 384 dev acc: 0.03977272727272727


training:  69%|######8   | 447/649 [06:24<02:27,  1.37it/s]

==> step 448 dev acc: 0.045454545454545456


training:  79%|#######8  | 512/649 [07:23<04:01,  1.76s/it]

==> step 512 dev acc: 0.028409090909090908


training:  89%|########8 | 575/649 [08:13<00:51,  1.43it/s]

==> step 576 dev acc: 0.056818181818181816


training:  99%|#########8| 640/649 [09:12<00:15,  1.73s/it]

==> step 640 dev acc: 0.045454545454545456


training: 100%|##########| 649/649 [09:22<00:00,  1.15it/s]

==> step 649 dev acc: 0.05113636363636364



training:  10%|9         | 63/649 [00:51<07:25,  1.32it/s]

==> step 64 dev acc: 0.045454545454545456


training:  20%|#9        | 128/649 [01:53<15:50,  1.82s/it]

==> step 128 dev acc: 0.03409090909090909


training:  29%|##9       | 191/649 [02:44<04:56,  1.54it/s]

==> step 192 dev acc: 0.06818181818181818


training:  39%|###9      | 256/649 [03:42<11:21,  1.73s/it]

==> step 256 dev acc: 0.03409090909090909


training:  49%|####9     | 320/649 [04:38<08:54,  1.63s/it]

==> step 320 dev acc: 0.03409090909090909


training:  59%|#####9    | 384/649 [05:33<07:44,  1.75s/it]

==> step 384 dev acc: 0.0625


training:  69%|######9   | 448/649 [06:26<05:33,  1.66s/it]

==> step 448 dev acc: 0.045454545454545456


training:  79%|#######8  | 512/649 [07:19<04:01,  1.76s/it]

==> step 512 dev acc: 0.022727272727272728


training:  89%|########8 | 576/649 [08:13<02:00,  1.65s/it]

==> step 576 dev acc: 0.05113636363636364


training:  99%|#########8| 640/649 [09:07<00:15,  1.73s/it]

==> step 640 dev acc: 0.06818181818181818


training: 100%|#########9| 648/649 [09:13<00:00,  1.28it/s]

==> step 649 dev acc: 0.07386363636363637


training: 100%|##########| 649/649 [09:19<00:00,  1.16it/s]
training:  10%|9         | 63/649 [00:51<07:27,  1.31it/s]

==> step 64 dev acc: 0.03977272727272727


training:  20%|#9        | 127/649 [01:49<07:37,  1.14it/s]

==> step 128 dev acc: 0.05113636363636364


training:  29%|##9       | 191/649 [02:45<04:56,  1.54it/s]

==> step 192 dev acc: 0.07386363636363637


training:  39%|###9      | 256/649 [03:44<11:20,  1.73s/it]

==> step 256 dev acc: 0.06818181818181818


training:  49%|####9     | 320/649 [04:40<08:55,  1.63s/it]

==> step 320 dev acc: 0.06818181818181818


training:  59%|#####9    | 384/649 [05:35<07:43,  1.75s/it]

==> step 384 dev acc: 0.0625


training:  69%|######9   | 448/649 [06:28<05:33,  1.66s/it]

==> step 448 dev acc: 0.07386363636363637


training:  79%|#######8  | 512/649 [07:21<04:00,  1.75s/it]

==> step 512 dev acc: 0.056818181818181816


training:  89%|########8 | 575/649 [08:11<00:51,  1.44it/s]

==> step 576 dev acc: 0.09659090909090909


training:  99%|#########8| 640/649 [09:10<00:15,  1.73s/it]

==> step 640 dev acc: 0.06818181818181818


training: 100%|##########| 649/649 [09:20<00:00,  1.16it/s]

==> step 649 dev acc: 0.03977272727272727



training:  10%|9         | 63/649 [00:51<07:26,  1.31it/s]

==> step 64 dev acc: 0.07954545454545454


training:  20%|#9        | 128/649 [01:53<15:50,  1.82s/it]

==> step 128 dev acc: 0.056818181818181816


training:  30%|##9       | 192/649 [02:48<12:30,  1.64s/it]

==> step 192 dev acc: 0.07386363636363637


training:  39%|###9      | 256/649 [03:41<11:22,  1.74s/it]

==> step 256 dev acc: 0.06818181818181818


training:  49%|####9     | 320/649 [04:37<08:56,  1.63s/it]

==> step 320 dev acc: 0.06818181818181818


training:  59%|#####9    | 383/649 [05:27<03:25,  1.30it/s]

==> step 384 dev acc: 0.09659090909090909


training:  69%|######9   | 448/649 [06:27<05:33,  1.66s/it]

==> step 448 dev acc: 0.0625


training:  79%|#######8  | 512/649 [07:19<04:00,  1.76s/it]

==> step 512 dev acc: 0.06818181818181818


training:  89%|########8 | 576/649 [08:13<02:00,  1.65s/it]

==> step 576 dev acc: 0.07386363636363637


training:  99%|#########8| 640/649 [09:07<00:15,  1.73s/it]

==> step 640 dev acc: 0.03977272727272727


training: 100%|##########| 649/649 [09:17<00:00,  1.16it/s]

==> step 649 dev acc: 0.03977272727272727



training:  10%|9         | 63/649 [00:51<07:26,  1.31it/s]

==> step 64 dev acc: 0.056818181818181816


training:  20%|#9        | 127/649 [01:49<07:45,  1.12it/s]

==> step 128 dev acc: 0.08522727272727272


training:  30%|##9       | 192/649 [02:50<12:33,  1.65s/it]

==> step 192 dev acc: 0.07386363636363637


training:  39%|###9      | 256/649 [03:44<11:28,  1.75s/it]

==> step 256 dev acc: 0.056818181818181816


training:  49%|####9     | 320/649 [04:40<09:02,  1.65s/it]

==> step 320 dev acc: 0.056818181818181816


training:  59%|#####9    | 384/649 [05:35<07:45,  1.76s/it]

==> step 384 dev acc: 0.08522727272727272


training:  69%|######9   | 448/649 [06:29<05:33,  1.66s/it]

==> step 448 dev acc: 0.08522727272727272


training:  79%|#######8  | 512/649 [07:22<04:00,  1.75s/it]

==> step 512 dev acc: 0.06818181818181818


training:  89%|########8 | 576/649 [08:16<02:00,  1.65s/it]

==> step 576 dev acc: 0.06818181818181818


training:  99%|#########8| 640/649 [09:09<00:15,  1.72s/it]

==> step 640 dev acc: 0.08522727272727272


training: 100%|##########| 649/649 [09:19<00:00,  1.16it/s]

==> step 649 dev acc: 0.07386363636363637



training:  10%|9         | 63/649 [00:51<07:24,  1.32it/s]

==> step 64 dev acc: 0.056818181818181816


training:  20%|#9        | 127/649 [01:49<07:35,  1.15it/s]

==> step 128 dev acc: 0.0625


training:  29%|##9       | 191/649 [02:46<04:56,  1.54it/s]

==> step 192 dev acc: 0.10227272727272728


training:  39%|###9      | 256/649 [03:45<11:22,  1.74s/it]

==> step 256 dev acc: 0.07386363636363637


training:  49%|####9     | 320/649 [04:40<08:56,  1.63s/it]

==> step 320 dev acc: 0.056818181818181816


training:  59%|#####9    | 384/649 [05:35<07:43,  1.75s/it]

==> step 384 dev acc: 0.09090909090909091


training:  69%|######9   | 448/649 [06:28<05:33,  1.66s/it]

==> step 448 dev acc: 0.06818181818181818


training:  79%|#######8  | 512/649 [07:21<04:01,  1.76s/it]

==> step 512 dev acc: 0.07954545454545454


training:  87%|########6 | 563/649 [08:02<01:02,  1.38it/s]

In [None]:
evaluate(model, test_data, device, tokenizer, mute=False, batch_size=10)