QA Service (ko)
---

## Prepare library

Import the required libraries.

In [1]:
import torch
from torch.utils.data import Dataset
import numpy as np
import pandas as pd
from transformers import  TrainingArguments, Trainer
from torch import cuda

Set the model and configuration.

In [None]:
from transformers import ElectraModel, ElectraConfig

# Build an ELECTRA configuration with no overrides (library default values)
configuration = ElectraConfig()

# Build a bare ElectraModel from that configuration (no pretrained checkpoint is loaded here)
model = ElectraModel(configuration)

# Read the configuration back from the model instance
# NOTE(review): `model` is reassigned by the from_pretrained call in a later cell,
# so this instance is not the one that gets fine-tuned.
configuration = model.config

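Load the pretrained KoELECTRA question-answering model and its fast tokenizer. (The device is set to GPU, when available, in the cell that follows.)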

In [12]:
from transformers import ElectraForQuestionAnswering, ElectraTokenizerFast

# Single source of truth for the checkpoint id, so the model and the tokenizer
# cannot drift apart when the name is edited.
MODEL_CHECKPOINT = "monologg/koelectra-base-v3-discriminator"

# Pretrained KoELECTRA weights with a question-answering head on top, and the
# matching fast tokenizer (the fast variant provides char_to_token, used below).
model = ElectraForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)
tokenizer = ElectraTokenizerFast.from_pretrained(MODEL_CHECKPOINT)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['qa_outputs.bias', 

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Divide data set into train set and test set

Load the KLUE-MRC data set and split it into contexts, questions, and answers for both the training set and the validation (dev) set.

In [3]:
import json
from pathlib import Path

def read_klue(path):
    """Flatten a SQuAD-style JSON file into parallel lists.

    Args:
        path: Location of the JSON file (str or path-like). The file must hold
            a top-level ``'data'`` list whose items contain ``'paragraphs'``,
            each with a ``'context'`` and a list of ``'qas'``.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists with
        one entry per answer; a question that has several answers repeats its
        context and question once for each of them.
    """
    with Path(path).open('rb') as fp:
        payload = json.load(fp)

    # Collect (context, question, answer) triples first, then unzip them.
    triples = []
    for article in payload['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa_item in paragraph['qas']:
                query = qa_item['question']
                triples.extend(
                    (passage_text, query, ans) for ans in qa_item['answers']
                )

    contexts = [triple[0] for triple in triples]
    questions = [triple[1] for triple in triples]
    answers = [triple[2] for triple in triples]
    return contexts, questions, answers

# Training split: parallel lists of contexts, questions and answer dicts.
train_contexts, train_questions, train_answers = read_klue('klue-mrc-v1.1_train.json')
# Dev split, used as the validation set in the rest of the notebook.
val_contexts, val_questions, val_answers = read_klue('klue-mrc-v1.1_dev.json')

Add the end character index of each answer, so that the answer spans can later be mapped to token positions.

In [4]:
def add_end_idx(answers, contexts):
    """Add an ``'answer_end'`` character index to every answer, in place.

    The annotated ``'answer_start'`` is sometimes one or two characters too
    far to the right. The span is therefore tried at the annotated position
    and then shifted left by one and by two characters; on the first exact
    match with ``answer['text']`` both ``'answer_start'`` and ``'answer_end'``
    are written back.

    If no candidate matches, the annotated start is kept and ``'answer_end'``
    is set to ``answer_start + len(text)``, so every answer always carries an
    ``'answer_end'`` key and later steps do not fail with ``KeyError``.

    Args:
        answers: List of dicts with ``'text'`` and ``'answer_start'`` keys;
            mutated in place.
        contexts: List of context strings, aligned with ``answers``.

    Returns:
        None.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        for shift in (0, 1, 2):
            shifted_start = start_idx - shift
            # A negative start would be read as "count from the end" by the
            # slice below, so such candidates are skipped.
            if shifted_start < 0:
                continue
            if context[shifted_start:end_idx - shift] == gold_text:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - shift
                break
        else:
            # No alignment found: fall back to the annotated span.
            answer['answer_end'] = end_idx

# add_end_idx writes into the answer dicts in place, so nothing is captured here.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

Encode dataset

In [13]:
# Tokenize (context, question) pairs; contexts are passed first and questions second.
# truncation=True / padding=True ask the tokenizer to truncate over-long pairs and pad
# the rest -- NOTE(review): the exact strategy is the tokenizer's default; confirm
# against the transformers documentation.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

Add the start and end token positions of each answer to the encoded data.

In [14]:
def add_token_positions(encodings, answers):
    """Attach answer token positions to ``encodings`` in place.

    For each answer, the start character and the last character of the answer
    (``answer_end - 1``) are converted to token indices with
    ``encodings.char_to_token``. When that lookup returns ``None`` (the answer
    passage has been truncated), ``tokenizer.model_max_length`` is stored
    instead.

    Args:
        encodings: Tokenizer output providing ``char_to_token(i, char_idx)``
            and ``update``; receives the ``'start_positions'`` and
            ``'end_positions'`` lists.
        answers: List of dicts with ``'answer_start'`` and ``'answer_end'``,
            aligned with the encoded samples.

    Returns:
        None.
    """
    def _token_index(sample_idx, char_idx):
        # Map one character offset to a token index, with the truncation fallback.
        token_idx = encodings.char_to_token(sample_idx, char_idx)
        return tokenizer.model_max_length if token_idx is None else token_idx

    starts, ends = [], []
    for sample_idx, answer in enumerate(answers):
        starts.append(_token_index(sample_idx, answer['answer_start']))
        ends.append(_token_index(sample_idx, answer['answer_end'] - 1))

    encodings.update({'start_positions': starts, 'end_positions': ends})

# Attach 'start_positions' / 'end_positions' to both encodings (updated in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

Wrap the encodings in a PyTorch `Dataset` so that they can be passed to the `Trainer`.

In [15]:
class KlueDataset(Dataset):
    """Map-style dataset that serves one encoded sample at a time.

    Args:
        encodings: Mapping of field name to per-sample sequences that also
            exposes ``input_ids`` as an attribute (presumably the tokenizer
            output built earlier in this notebook).
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # Sample count is taken from the number of encoded input sequences.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with one tensor per encoded field."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample

# Wrap the encodings (which now include the answer token positions) for the Trainer.
train_dataset = KlueDataset(train_encodings)
val_dataset = KlueDataset(val_encodings)

Set training arguments and train.

In [16]:
# Hyperparameters for the fine-tuning run; checkpoints go to ./results, logs to ./logs.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=400,               # report the training loss every 400 steps
)

# No compute_metrics callback is passed, so evaluation is not given any custom metric function.
trainer = Trainer(
    model = model.to(device),            # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss
400,3.8992
800,1.4217
1200,1.0417
1600,0.7456
2000,0.5904
2400,0.5147


TrainOutput(global_step=2760, training_loss=1.2530095031295998, metrics={'train_runtime': 2735.503, 'train_samples_per_second': 32.285, 'train_steps_per_second': 1.009, 'total_flos': 3.04761636598272e+16, 'train_loss': 1.2530095031295998, 'epoch': 5.0})

Evaluate trained model

In [17]:
# Evaluate on the eval_dataset given to the Trainer (the dev split); the returned dict is the cell output.
trainer.evaluate()

{'eval_loss': 1.9084742069244385,
 'eval_runtime': 57.5785,
 'eval_samples_per_second': 100.923,
 'eval_steps_per_second': 3.161,
 'epoch': 5.0}

Save model

In [18]:
# Save the fine-tuned model and the tokenizer into the same directory,
# so both can be loaded from one path.
model_path = "klue-mrc-koelectra-base"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('klue-mrc-koelectra-base/tokenizer_config.json',
 'klue-mrc-koelectra-base/special_tokens_map.json',
 'klue-mrc-koelectra-base/vocab.txt',
 'klue-mrc-koelectra-base/added_tokens.json',
 'klue-mrc-koelectra-base/tokenizer.json')