# BERT

## 设备设置

In [1]:
!nvidia-smi


Wed Apr 20 13:50:34 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [2]:
import json
import numpy as np
import random
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, BertForQuestionAnswering, BertTokenizerFast, get_linear_schedule_with_warmup 
import logging
import math
import re

from tqdm.auto import tqdm



In [3]:
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level="INFO", # "DEBUG" "WARNING" "ERROR"
)
proj = "hw7.BERT"
logger = logging.getLogger(proj)

logger.info("logger initalize")

2022-04-20 13:50:40 | INFO | hw7.BERT | logger initalize


In [4]:
import os
from pathlib import Path

device_num = 1
envir = "kaggle"
device = "cpu"

if envir == "kaggle":
    data_root = Path('/kaggle/input/ml2021-spring-hw7')
    device = "cuda" if torch.cuda.is_available() else "cpu"
    %pip install transformers
    %pip install accelerate
    output_dir = Path(".").absolute()
    
else:
    data_root = Path(".").absolute() / 'data'
    torch.cuda.set_device(device_num)
    device = f"cuda:{device_num}" if torch.cuda.is_available() else "cpu"
    output_dir = Path(".").absolute() / 'output'


print("data root =", data_root)
print("device =", device)

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
data root = /kaggle/input/ml2021-spring-hw7
device = cuda


In [5]:
# Fix random seed for reproducibility
def same_seeds(seed):
	torch.manual_seed(seed)
	if torch.cuda.is_available():
	    torch.cuda.manual_seed(seed)
	    torch.cuda.manual_seed_all(seed)
	np.random.seed(seed)
	random.seed(seed)
	torch.backends.cudnn.benchmark = False
	torch.backends.cudnn.deterministic = True

same_seeds(2022)

In [6]:
fp16_training = False

if fp16_training:
    from accelerate import Accelerator
    accelerator = Accelerator(fp16=True)
    device = accelerator.device

In [7]:
import transformers

model = BertForQuestionAnswering.from_pretrained("luhua/chinese_pretrain_mrc_roberta_wwm_ext_large").to(device)
print("---")
tokenizer = transformers.BertTokenizerFast.from_pretrained("luhua/chinese_pretrain_mrc_roberta_wwm_ext_large")

Downloading:   0%|          | 0.00/634 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

---


Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/263k [00:00<?, ?B/s]

## 数据流程

### 读取数据

In [8]:
def read_data(file):
    with open(file , 'r', encoding='utf-8') as reader:
        data = json.load(reader)
    return data["questions"], data["paragraphs"]

train_questions, train_paragraphs = read_data(data_root / "hw7_train.json")
dev_questions, dev_paragraphs = read_data(data_root / "hw7_dev.json")
test_questions, test_paragraphs = read_data(data_root / "hw7_test.json")

### Tokenize

In [9]:
train_questions_tokenized = tokenizer([train_question["question_text"] for train_question in train_questions], add_special_tokens=False)
dev_questions_tokenized = tokenizer([dev_question["question_text"] for dev_question in dev_questions], add_special_tokens=False)
test_questions_tokenized = tokenizer([test_question["question_text"] for test_question in test_questions], add_special_tokens=False) 

train_paragraphs_tokenized = tokenizer(train_paragraphs, add_special_tokens=False)
dev_paragraphs_tokenized = tokenizer(dev_paragraphs, add_special_tokens=False)
test_paragraphs_tokenized = tokenizer(test_paragraphs, add_special_tokens=False)

### Dataset and loader

In [10]:
print(train_questions[0])
print(train_paragraphs[3884])

{'id': 0, 'paragraph_id': 3884, 'question_text': '羅馬教皇利奧三世在800年正式加冕誰為羅馬人的皇帝?', 'answer_text': '查理大帝', 'answer_start': 141, 'answer_end': 144}
利奧三世開創的伊蘇里亞王朝在8世紀末期走上了末路，隨後統治帝國的一群無能皇帝進一步加深了災難局面。這其中最著名的是伊琳娜女皇，她弄瞎了作為法定繼承人的兒子君士坦丁六世的眼睛，將其關入修道院，自己成為第一個大權獨攬的東羅馬女皇。此舉影響重大，導致羅馬教皇利奧三世在800年把法蘭克國王查理大帝加冕為羅馬人的皇帝，使西方帝國有了與東羅馬帝國分庭抗禮的藉口。此外，皇帝尼基弗魯斯在與多瑙河下游平原的保加利亞第一帝國的普利斯卡戰役中被殺，頭蓋骨更被保加利亞酋長克魯姆做成了酒杯。馬其頓王朝的誕生開創了東羅馬帝國歷史上第二個最輝煌的時期。馬其頓王朝開國皇帝巴西爾一世生於亞美尼亞，幼時全家被多瑙河下游平原的保加利亞第一帝國俘虜，發配到馬其頓去開墾土地。長大後，他成為皇宮馬倌，貌美而多力，受到阿莫利王朝末代皇帝麥可三世的注意和寵愛。麥可任命他為宮廷侍衛長，並於866年把他立為自己的繼承人和共帝。867年，巴西爾發覺自己有失寵的跡象，於是在9月23日晚上發動了政變，他先用手擰彎了皇帝寢室的門閂，然後在半夜帶著親信殺入皇帝睡房，迅速制服衛兵，並殺掉了麥可三世。雖然皇位為篡奪而來，但巴西爾一世很快以自己的英明行為讓大家刮目相看。他在軍事上的勝利使其躋身於東羅馬帝國帝國最偉大的軍事家之列。他嚴格貫徹席哈克略王朝時開始的軍事制改革，在巴爾幹半島建立新軍事州，向這些地區遷入新移民，並憑藉不斷增強的君主制國的國力鞏固國防建設，不僅在巴爾幹半島的多瑙河沿岸北部設立邊境要塞，成功阻擋了斯拉夫人南下，而且在小亞細亞擴充軍隊並反擊了阿拉伯人的侵略，在義大利南部，也收復了原屬於東羅馬帝國的領地。


In [11]:
class QA_Dataset(Dataset):
    def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs, paragraphs=None):
        self.split = split
        self.questions = questions
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = 40
        self.max_paragraph_len = 150

        ##### TODO: Change value of doc_stride #####
        self.doc_stride = 75

        # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1

        if paragraphs is not None:
            self.paragraphs = paragraphs


    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]

        ##### TODO: Preprocessing #####
        # Hint: How to prevent model from learning something it should not learn


        if self.split == "train":
            # 找到答案所在语句
            answer_start_token = tokenized_paragraph.char_to_token(question["answer_start"])
            answer_end_token = tokenized_paragraph.char_to_token(question["answer_end"])

            # 在段落中找到答案，然后左右均匀扩展到上限长度
            # mid = (answer_start_token + answer_end_token) // 2
            # paragraph_start = max(0, min(mid - self.max_paragraph_len // 2, len(tokenized_paragraph) - self.max_paragraph_len))
            # paragraph_end = paragraph_start + self.max_paragraph_len

            # TODO 随机产生一个窗口
            paragraph_start = np.random.randint(low=max(0, answer_end_token - self.max_paragraph_len), high=min(len(tokenized_paragraph) - self.max_paragraph_len, answer_start_token) + 1)
            paragraph_end = paragraph_start + self.max_paragraph_len

            # 插入CLS/SEP (CLS:[101], SEP:[102])
            input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102] 
            input_ids_paragraph = tokenized_paragraph.ids[paragraph_start : paragraph_end] + [102]

            # 映射答案位置
            answer_start_token += len(input_ids_question) - paragraph_start
            answer_end_token += len(input_ids_question) - paragraph_start

            # 格式化成模型输入
            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start_token, answer_end_token

        
        # 无答案数据
        else:
            # 将段落切成多个片，逐个判断。这里可能有问题，有可能将答案切分了。 OK
            input_ids_list, token_type_ids_list, attention_mask_list = [], [], []
            

            for i in range(0, len(tokenized_paragraph), self.doc_stride):
                input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
                input_ids_paragraph = tokenized_paragraph.ids[i : i + self.max_paragraph_len] + [102]

                input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
                
                input_ids_list.append(input_ids)
                token_type_ids_list.append(token_type_ids)
                attention_mask_list.append(attention_mask)

            if self.split == "test":
                # 增加片段记录，优化[UNK]
                paragraph = self.paragraphs[question["paragraph_id"]]
                return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list), paragraph

            return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list)
    
    def padding(self, input_ids_question, input_ids_paragraph):
        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len # 后面补0
        
        # 标记段落位置
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len

        # 防止Attention学到些无用的padding信息
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len

        return input_ids, token_type_ids, attention_mask




In [12]:
# Note: Do NOT change batch size of dev_loader / test_loader !
# Although batch size=1, it is actually a batch consisting of several windows from the same QA pair
train_batch_size = 16

train_set = QA_Dataset("train", train_questions, train_questions_tokenized, train_paragraphs_tokenized)
dev_set = QA_Dataset("dev", dev_questions, dev_questions_tokenized, dev_paragraphs_tokenized)
test_set = QA_Dataset("test", test_questions, test_questions_tokenized, test_paragraphs_tokenized, test_paragraphs)

train_loader = DataLoader(train_set, batch_size=train_batch_size, shuffle=True, pin_memory=True)
dev_loader = DataLoader(dev_set, batch_size=1, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_set, batch_size=1, shuffle=False, pin_memory=True)

### 验证结果

In [13]:
def form(regex):
    regex = regex.replace(' ', '')
    regex = regex.replace('.', '\\.')
    regex = regex.replace('(', '\\(')
    regex = regex.replace(')', '\\)')
    regex = regex.replace('*', '\\*')
    regex = regex.replace('+', '\\+')
    regex = regex.replace('?', '\\?')
    regex = regex.replace('[UNK]', '.*')
    regex = regex.replace('#', '.*')

    return regex

def evaluate(data, output, paragraph = None):
    ##### TODO: Postprocessing #####
    # There is a bug and room for improvement in postprocessing 
    # Hint: Open your prediction file to see what is wrong 
    # start > end的仍然没改善 OK
    # 有些UNKNOWN的TOKEN没法被录入


    answer = ''
    max_prob = float('-inf')
    num_of_windows = data[0].shape[1]

    if paragraph is not None:
        paragraph = paragraph.replace(' ','')

    for k in range(num_of_windows):
        # 找到概率最大的起点和终点
        start_prob, start_index = torch.max(output.start_logits[k], dim=0)
        end_prob, end_index = torch.max(output.end_logits[k], dim=0)

        # 总概率
        prob = start_prob + end_prob
        sep_index = (data[0][0][k] == 102).nonzero(as_tuple=True)[0]

        if prob > max_prob and start_index <= end_index and start_index > sep_index[0].item() and end_index < sep_index[1].item():
            answer = tokenizer.decode(data[0][0][k][start_index : end_index + 1])
            if re.match(".*[。？?；;]+", answer):
                continue

            max_prob = prob
            # 对test加入判断
            if (paragraph is not None) and answer.find("[UNK]") > -1:
                # data[0][0][k] first 102
                
                regex = f".*{form(tokenizer.decode(data[0][0][k][sep_index[0].item() + 1 : start_index]))}({form(tokenizer.decode(data[0][0][k][start_index : end_index + 1]))}){form(tokenizer.decode(data[0][0][k][end_index + 1 : sep_index[1].item()]))}"
                regex = re.sub(r'[.*]+', '.*', regex)
                # print("regex = ", regex)
                # print("paragraph = ", paragraph)

                re_result = re.match(regex, paragraph)
                if re_result is not None:
                    answer = re.match(regex, paragraph).group(1)

    return answer.replace(' ','')


In [14]:
def model_save_condition(epoch, num_epoch):
    if epoch + 4 >= num_epoch:
        return True
    return False

In [15]:
from transformers import get_linear_schedule_with_warmup

num_epoch = 5
validation = True
logging_step = 100
learning_rate = 1e-4 / 4
warmup_rate = 0.1

total_step = math.ceil(len(train_set) // train_batch_size) * num_epoch
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps = total_step * warmup_rate, num_training_steps = total_step
)

if fp16_training:
    model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)

model.train()

for epoch in range(num_epoch):
    step = 1
    train_loss = train_acc = 0

    for data in tqdm(train_loader):

        data = [i.to(device) for i in data]

        output = model(input_ids=data[0], token_type_ids=data[1], attention_mask=data[2], start_positions=data[3], end_positions=data[4])
    
        start_index = torch.argmax(output.start_logits, dim=1)
        end_index = torch.argmax(output.end_logits, dim=1)

        train_acc += ((start_index == data[3]) & (end_index == data[4])).float().mean()
        train_loss += output.loss

        if fp16_training:
            accelerator.backward(output.loss)
        else:
            output.loss.backward()

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        step += 1

        ##### TODO: Apply linear learning rate decay #####
        # next_rate = optimizer.param_groups[0]["lr"] - learning_rate / total_step
        # if next_rate > 0:
        #     optimizer.param_groups[0]["lr"] = next_rate
        #     lr_record.append(next_rate)

        # Print training loss and accuracy over past logging step
        if step % logging_step == 0:
            print(f"Epoch {epoch + 1} | Step {step} | loss = {train_loss.item() / logging_step:.3f}, acc = {train_acc / logging_step:.3f}")
            train_loss = train_acc = 0

    if validation:
        print("Evaluating Dev Set ...")
        model.eval()
        with torch.no_grad():
            dev_acc = 0
            for i, data in enumerate(tqdm(dev_loader)):
                output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),
                       attention_mask=data[2].squeeze(dim=0).to(device))
                # prediction is correct only if answer text exactly matches
                dev_acc += evaluate(data, output) == dev_questions[i]["answer_text"]
                
            print(f"Validation | Epoch {epoch + 1} | acc = {dev_acc / len(dev_loader):.3f}")
        model.train()

        # 保存最后三个epoch的模型
    if model_save_condition(epoch, num_epoch):
        model_save_dir = Path(output_dir).absolute() / f"saved_model {epoch}"
        model.save_pretrained(model_save_dir)
        print(f"Saving Model {epoch}...")




  0%|          | 0/1684 [00:00<?, ?it/s]

Epoch 1 | Step 100 | loss = 1.098, acc = 0.631
Epoch 1 | Step 200 | loss = 0.703, acc = 0.739
Epoch 1 | Step 300 | loss = 0.562, acc = 0.767
Epoch 1 | Step 400 | loss = 0.460, acc = 0.794
Epoch 1 | Step 500 | loss = 0.431, acc = 0.799
Epoch 1 | Step 600 | loss = 0.465, acc = 0.794
Epoch 1 | Step 700 | loss = 0.453, acc = 0.811
Epoch 1 | Step 800 | loss = 0.463, acc = 0.806
Epoch 1 | Step 900 | loss = 0.468, acc = 0.797
Epoch 1 | Step 1000 | loss = 0.490, acc = 0.782
Epoch 1 | Step 1100 | loss = 0.447, acc = 0.801
Epoch 1 | Step 1200 | loss = 0.492, acc = 0.798
Epoch 1 | Step 1300 | loss = 0.414, acc = 0.803
Epoch 1 | Step 1400 | loss = 0.428, acc = 0.812
Epoch 1 | Step 1500 | loss = 0.431, acc = 0.812
Epoch 1 | Step 1600 | loss = 0.382, acc = 0.837
Evaluating Dev Set ...


  0%|          | 0/3524 [00:00<?, ?it/s]

Validation | Epoch 1 | acc = 0.822


  0%|          | 0/1684 [00:00<?, ?it/s]

Epoch 2 | Step 100 | loss = 0.251, acc = 0.861
Epoch 2 | Step 200 | loss = 0.266, acc = 0.881
Epoch 2 | Step 300 | loss = 0.247, acc = 0.885
Epoch 2 | Step 400 | loss = 0.251, acc = 0.874
Epoch 2 | Step 500 | loss = 0.272, acc = 0.873
Epoch 2 | Step 600 | loss = 0.282, acc = 0.862
Epoch 2 | Step 700 | loss = 0.247, acc = 0.877
Epoch 2 | Step 800 | loss = 0.271, acc = 0.853
Epoch 2 | Step 900 | loss = 0.259, acc = 0.879
Epoch 2 | Step 1000 | loss = 0.319, acc = 0.857
Epoch 2 | Step 1100 | loss = 0.295, acc = 0.866
Epoch 2 | Step 1200 | loss = 0.253, acc = 0.869
Epoch 2 | Step 1300 | loss = 0.277, acc = 0.876
Epoch 2 | Step 1400 | loss = 0.289, acc = 0.869
Epoch 2 | Step 1500 | loss = 0.252, acc = 0.875
Epoch 2 | Step 1600 | loss = 0.290, acc = 0.862
Evaluating Dev Set ...


  0%|          | 0/3524 [00:00<?, ?it/s]

Validation | Epoch 2 | acc = 0.822
Saving Model 1...


  0%|          | 0/1684 [00:00<?, ?it/s]

Epoch 3 | Step 100 | loss = 0.130, acc = 0.920
Epoch 3 | Step 200 | loss = 0.123, acc = 0.936
Epoch 3 | Step 300 | loss = 0.134, acc = 0.920
Epoch 3 | Step 400 | loss = 0.143, acc = 0.929
Epoch 3 | Step 500 | loss = 0.145, acc = 0.919
Epoch 3 | Step 600 | loss = 0.129, acc = 0.936
Epoch 3 | Step 700 | loss = 0.139, acc = 0.925
Epoch 3 | Step 800 | loss = 0.129, acc = 0.929
Epoch 3 | Step 900 | loss = 0.160, acc = 0.925
Epoch 3 | Step 1000 | loss = 0.138, acc = 0.929
Epoch 3 | Step 1100 | loss = 0.140, acc = 0.933
Epoch 3 | Step 1200 | loss = 0.167, acc = 0.909
Epoch 3 | Step 1300 | loss = 0.133, acc = 0.930
Epoch 3 | Step 1400 | loss = 0.132, acc = 0.932
Epoch 3 | Step 1500 | loss = 0.127, acc = 0.927
Epoch 3 | Step 1600 | loss = 0.133, acc = 0.928
Evaluating Dev Set ...


  0%|          | 0/3524 [00:00<?, ?it/s]

Validation | Epoch 3 | acc = 0.806
Saving Model 2...


  0%|          | 0/1684 [00:00<?, ?it/s]

Epoch 4 | Step 100 | loss = 0.060, acc = 0.957
Epoch 4 | Step 200 | loss = 0.077, acc = 0.959
Epoch 4 | Step 300 | loss = 0.080, acc = 0.961
Epoch 4 | Step 400 | loss = 0.073, acc = 0.961
Epoch 4 | Step 500 | loss = 0.104, acc = 0.954
Epoch 4 | Step 600 | loss = 0.079, acc = 0.959
Epoch 4 | Step 700 | loss = 0.079, acc = 0.965
Epoch 4 | Step 800 | loss = 0.066, acc = 0.969
Epoch 4 | Step 900 | loss = 0.070, acc = 0.969
Epoch 4 | Step 1000 | loss = 0.067, acc = 0.965
Epoch 4 | Step 1100 | loss = 0.082, acc = 0.953
Epoch 4 | Step 1200 | loss = 0.065, acc = 0.969
Epoch 4 | Step 1300 | loss = 0.063, acc = 0.967
Epoch 4 | Step 1400 | loss = 0.078, acc = 0.962
Epoch 4 | Step 1500 | loss = 0.076, acc = 0.952
Epoch 4 | Step 1600 | loss = 0.062, acc = 0.968
Evaluating Dev Set ...


  0%|          | 0/3524 [00:00<?, ?it/s]

Validation | Epoch 4 | acc = 0.827
Saving Model 3...


  0%|          | 0/1684 [00:00<?, ?it/s]

Epoch 5 | Step 100 | loss = 0.036, acc = 0.966
Epoch 5 | Step 200 | loss = 0.039, acc = 0.983
Epoch 5 | Step 300 | loss = 0.043, acc = 0.982
Epoch 5 | Step 400 | loss = 0.040, acc = 0.978
Epoch 5 | Step 500 | loss = 0.046, acc = 0.978
Epoch 5 | Step 600 | loss = 0.045, acc = 0.976
Epoch 5 | Step 700 | loss = 0.037, acc = 0.982
Epoch 5 | Step 800 | loss = 0.041, acc = 0.978
Epoch 5 | Step 900 | loss = 0.037, acc = 0.982
Epoch 5 | Step 1000 | loss = 0.034, acc = 0.984
Epoch 5 | Step 1100 | loss = 0.029, acc = 0.986
Epoch 5 | Step 1200 | loss = 0.037, acc = 0.978
Epoch 5 | Step 1300 | loss = 0.029, acc = 0.984
Epoch 5 | Step 1400 | loss = 0.046, acc = 0.977
Epoch 5 | Step 1500 | loss = 0.029, acc = 0.982
Epoch 5 | Step 1600 | loss = 0.045, acc = 0.978
Evaluating Dev Set ...


  0%|          | 0/3524 [00:00<?, ?it/s]

Validation | Epoch 5 | acc = 0.824
Saving Model 4...


## 测试数据输出

In [16]:
for epoch in range(num_epoch):
    if model_save_condition(epoch, num_epoch) == False:
        continue

    print(f"Evaluating Test Set for epoch {epoch} ...")

    model_save_dir = Path(output_dir).absolute() / f"saved_model {epoch}"
    model.from_pretrained(model_save_dir)

    result = []

    model.eval()
    with torch.no_grad():
        for i, data in enumerate(tqdm(test_loader)):
            output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),
                        attention_mask=data[2].squeeze(dim=0).to(device))
            paragraph = data[3][0]
            result.append(evaluate(data, output, paragraph))


    result_file = Path(output_dir).absolute() / f"result {epoch}.csv"
    with open(result_file, 'w') as f:	
        f.write("ID,Answer\n")
        for i, test_question in enumerate(test_questions):
            # Replace commas in answers with empty strings (since csv is separated by comma)
            # Answers in kaggle are processed in the same way
            f.write(f"{test_question['id']},{result[i].replace(',','')}\n")

    print(f"Completed! Result is in {result_file}")



Evaluating Test Set for epoch 1 ...


  0%|          | 0/3493 [00:00<?, ?it/s]

Completed! Result is in /kaggle/working/result 1.csv
Evaluating Test Set for epoch 2 ...


  0%|          | 0/3493 [00:00<?, ?it/s]

Completed! Result is in /kaggle/working/result 2.csv
Evaluating Test Set for epoch 3 ...


  0%|          | 0/3493 [00:00<?, ?it/s]

Completed! Result is in /kaggle/working/result 3.csv
Evaluating Test Set for epoch 4 ...


  0%|          | 0/3493 [00:00<?, ?it/s]

Completed! Result is in /kaggle/working/result 4.csv
