In [1]:

import os
import json

from model import BertForQuestionAnswering, RobertaForQuestionAnswering
from dataset import QADataset

from transformers import (
    AutoConfig,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)


  from .autonotebook import tqdm as notebook_tqdm
2023-07-24 01:10:47.772956: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-24 01:10:47.822259: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

# Path of the test-set JSON file; handed to QADataset as `data_dir`
# (a file path, despite the `_dir` suffix).
test_dir = "data/aihub_administration/test.json"
# Path of the JSON file loaded into `context`, which is later indexed by each
# sample's "context" value to obtain the passage text.
context_dir = "data/aihub_administration/context.json"

# Directory that is listed for checkpoints; entries whose name contains "runs"
# are ignored when the list is built.
pretrained_models_dir = "result/roberta/aihub_administration_1024/"

# Per-device batch size used while predicting.
batch_size = 64
# Maximum sequence length handed to QADataset.
max_length = 1024


In [3]:

# Run every checkpoint found under `pretrained_models_dir` on the test set and
# collect one prediction output per checkpoint in `preds` (same order as
# `pretrained_models`).

# Checkpoint entries in sorted name order; anything with "runs" in its name is skipped.
pretrained_models = sorted(
    name for name in os.listdir(pretrained_models_dir) if "runs" not in name
)
if not pretrained_models:
    # Fail with a clear message instead of an IndexError on pretrained_models[0].
    raise FileNotFoundError(
        "no checkpoints found in {!r}".format(pretrained_models_dir)
    )

# Close the file deterministically and decode it as UTF-8 regardless of the
# platform's default encoding.
with open(context_dir, "r", encoding="utf-8") as context_file:
    context = json.load(context_file)

preds = []

# The architecture is read once, from the first checkpoint, and used for all of them.
config = AutoConfig.from_pretrained(os.path.join(pretrained_models_dir, pretrained_models[0]))

model_classes = {
    "bert": BertForQuestionAnswering,
    "roberta": RobertaForQuestionAnswering,
}
if config.model_type not in model_classes:
    # Without this check an unknown type would leave `model` undefined, or
    # silently reuse the model of a previous iteration.
    raise ValueError(
        "unsupported model_type {!r}; expected one of {}".format(
            config.model_type, sorted(model_classes)
        )
    )
model_class = model_classes[config.model_type]

# Identical for every checkpoint, so built once outside the loop.
train_args = TrainingArguments(
    output_dir = "tmp/",
    overwrite_output_dir = True,
    do_predict = True,
    per_device_eval_batch_size = batch_size,
    logging_steps = 10,
    seed = 42,
    data_seed = 42,
)

for checkpoint_name in pretrained_models:

    # os.path.join works whether or not `pretrained_models_dir` ends with a slash.
    pretrained_model = os.path.join(pretrained_models_dir, checkpoint_name)

    # `tokenizer` and `test_set` are rebuilt per checkpoint; the objects from the
    # final iteration stay in scope for the scoring cells below.
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
    test_set = QADataset(data_dir=test_dir, context=context, tokenizer=tokenizer, max_length=max_length)

    model = model_class.from_pretrained(pretrained_model)

    trainer = Trainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer,
    )

    preds.append(trainer.predict(test_set))


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors


In [4]:

import re
import string
from collections import Counter

def normalize_answer(s):
    """Normalize an answer string before comparing prediction and reference.

    Steps, applied in this order:
      1. replace quote and bracket characters with a space,
      2. lower-case the text,
      3. delete every remaining ASCII punctuation character,
      4. collapse whitespace runs into single spaces and strip both ends.

    Args:
        s: the raw answer text.

    Returns:
        The normalized text.
    """
    def remove_(text):
        ''' Replace unneeded quote/bracket symbols with a space. '''
        # A single translate() call replaces the former chain of re.sub calls,
        # two of which used the invalid escape sequences "\(" and "\)" in
        # non-raw string literals. The symbols become spaces (they are not
        # deleted) so the text on either side stays in separate tokens.
        symbols = "'\"《》<>〈〉()‘’"
        return text.translate(str.maketrans(dict.fromkeys(symbols, " ")))

    def white_space_fix(text):
        ''' Collapse any run of whitespace into one space and strip the ends. '''
        return ' '.join(text.split())

    def remove_punc(text):
        ''' Drop every character listed in string.punctuation. '''
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        ''' Lower-case the text. '''
        return text.lower()

    return white_space_fix(remove_punc(lower(remove_(s))))

def f1_score(prediction, ground_truth):
    """Character-level F1 between a predicted answer and a reference answer.

    Both strings are normalized with normalize_answer, split on whitespace and
    flattened into lists of characters. The overlap is the multiset
    intersection of the two character lists.

    Args:
        prediction: predicted answer text.
        ground_truth: reference answer text.

    Returns:
        0 when no character is shared, otherwise the harmonic mean of
        character precision and recall as a float.
    """
    # F1 is computed over characters, not over whitespace tokens.
    prediction_Char = [ch for tok in normalize_answer(prediction).split() for ch in tok]
    ground_truth_Char = [ch for tok in normalize_answer(ground_truth).split() for ch in tok]

    overlap = Counter(prediction_Char) & Counter(ground_truth_Char)
    num_same = sum(overlap.values())
    if num_same == 0:
        # Also covers the case where either side is empty after normalization.
        return 0

    precision = 1.0 * num_same / len(prediction_Char)
    recall = 1.0 * num_same / len(ground_truth_Char)
    return (2 * precision * recall) / (precision + recall)

def exact_match_score(prediction, ground_truth):
    """Return True when both answers are equal after normalize_answer.

    Args:
        prediction: predicted answer text.
        ground_truth: reference answer text.

    Returns:
        bool: whether the two normalized strings are identical.
    """
    normalized_prediction = normalize_answer(prediction)
    normalized_reference = normalize_answer(ground_truth)
    return normalized_prediction == normalized_reference

def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score a prediction against every reference and keep the best score.

    Args:
        metric_fn: callable invoked as metric_fn(prediction, ground_truth).
        prediction: predicted answer, passed through to metric_fn unchanged.
        ground_truths: iterable of reference answers.

    Returns:
        The largest value metric_fn returned.

    Raises:
        ValueError: if ground_truths is empty (raised by max()).
    """
    # Every score is computed before max() is applied.
    return max([metric_fn(prediction, reference) for reference in ground_truths])


In [5]:

# Overall exact-match and character-level F1 over every test sample, printed
# once per checkpoint.
# NOTE: `test_set` and `tokenizer` are the objects left over from the last
# iteration of the prediction loop; this assumes all checkpoints share the same
# tokenizer and dataset encoding -- TODO confirm.
for i, pred in enumerate(preds):

    predictions = pred.predictions

    exact_match = 0
    f1 = 0
    total = 0

    for sample_idx, sample in enumerate(test_set.samples):

        # Arg-max index of the first / second prediction array for this sample,
        # used as the start / end of the predicted token span.
        pred_start = predictions[0][sample_idx].argmax(-1)
        pred_end = predictions[1][sample_idx].argmax(-1)

        # The slice leaves out the token at `pred_end`.
        # NOTE(review): confirm the end position is meant to be exclusive.
        pred_text = tokenizer.decode(test_set[sample_idx]["input_ids"][pred_start:pred_end], skip_special_tokens=True)

        # The reference answer is cut out of the passage text by its stored offsets.
        label_text = context[sample["context"]][sample["answer_start"]:sample["answer_end"]]

        exact_match += exact_match_score(pred_text, label_text)
        f1 += f1_score(pred_text, label_text)

        total += 1

    # Report percentages; an empty test set yields 0. instead of a ZeroDivisionError.
    exact_match = round(100.0 * exact_match / total, 4) if total != 0 else 0.
    f1 = round(100.0 * f1 / total, 4) if total != 0 else 0.

    print("[{}] em score: {}\tf1_score: {}".format(pretrained_models[i], exact_match, f1))


[checkpoint-147611] em score: 72.8826	f1_score: 90.1524
[checkpoint-163149] em score: 72.732	f1_score: 90.1227
[checkpoint-170918] em score: 72.6915	f1_score: 90.2541


In [6]:

# Exact-match and character-level F1 restricted to samples whose predicted
# start AND end positions are both at or beyond `min_position`, printed once
# per checkpoint together with the number of samples that qualified.
# NOTE(review): the filter uses the *predicted* positions, so the subset differs
# between checkpoints; it also tests `>=` while the printed label says "> 512".
# NOTE: `test_set` and `tokenizer` are the objects left over from the last
# iteration of the prediction loop -- TODO confirm they fit every checkpoint.
min_position = 512

for i, pred in enumerate(preds):

    predictions = pred.predictions

    exact_match = 0
    f1 = 0
    total = 0

    for sample_idx, sample in enumerate(test_set.samples):

        # Arg-max index of the first / second prediction array for this sample,
        # used as the start / end of the predicted token span.
        pred_start = predictions[0][sample_idx].argmax(-1)
        pred_end = predictions[1][sample_idx].argmax(-1)

        if pred_start < min_position or pred_end < min_position:
            continue

        # The slice leaves out the token at `pred_end`.
        # NOTE(review): confirm the end position is meant to be exclusive.
        pred_text = tokenizer.decode(test_set[sample_idx]["input_ids"][pred_start:pred_end], skip_special_tokens=True)

        # The reference answer is cut out of the passage text by its stored offsets.
        label_text = context[sample["context"]][sample["answer_start"]:sample["answer_end"]]

        exact_match += exact_match_score(pred_text, label_text)
        f1 += f1_score(pred_text, label_text)

        total += 1

    # Report percentages; 0. when no sample passed the filter.
    exact_match = round(100.0 * exact_match / total, 4) if total != 0 else 0.
    f1 = round(100.0 * f1 / total, 4) if total != 0 else 0.

    print("[{}] > 512 em score: {}\tf1_score: {}".format(pretrained_models[i], exact_match, f1))
    print("n samples: {}".format(total))


[checkpoint-147611] > 512 em score: 79.2324	f1_score: 94.0733
n samples: 1459
[checkpoint-163149] > 512 em score: 79.5751	f1_score: 94.6677
n samples: 1459
[checkpoint-170918] > 512 em score: 80.0556	f1_score: 95.0895
n samples: 1439
