# Generation-based MRC 문제를 풀어보기

In [None]:
# Report the GPU (model, memory, utilisation) visible to this runtime.
!nvidia-smi

Fri May 14 08:25:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P0    56W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Requirements
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install nltk

In [None]:
import nltk

# `nltk.sent_tokenize` (used when post-processing predictions) needs the Punkt
# sentence-tokenizer data. Older NLTK releases read the 'punkt' resource while
# newer ones look up 'punkt_tab'; because NLTK is installed unpinned, fetch both
# so sentence splitting works whichever release ends up installed.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

In [None]:
!pip install git+https://github.com/SKT-AI/KoBART#egg=kobart

## 데이터 및 평가 지표 불러오기

In [None]:
# Mount Google Drive at /content/drive; the dataset directories and the
# training output_dir used in later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from datasets import load_from_disk

# Locations of the two splits on the mounted Drive.
train_dir = '/content/drive/MyDrive/Colab Notebooks/mrc/train'
val_dir = '/content/drive/MyDrive/Colab Notebooks/mrc/val'

# Keep the untouched splits around: the raw validation examples (ids and
# answers) are needed again when scoring predictions.
train_dataset_origin = load_from_disk(train_dir)
eval_dataset_origin = load_from_disk(val_dir)

In [None]:
from datasets import load_metric

# SQuAD metric object; compute_metrics calls `metric.compute(...)` on it and the
# training arguments select the best checkpoint by its 'exact_match' score.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases -
# confirm the installed version still provides it.
metric = load_metric('squad')

## Pre-trained 모델 및 토크나이저 불러오기

In [None]:
from transformers import BartForConditionalGeneration
from kobart import get_pytorch_kobart_model, get_kobart_tokenizer

In [None]:
# Tokenizer supplied by the kobart package.
tokenizer = get_kobart_tokenizer()

# Resolve the pretrained KoBART checkpoint first (presumably a local path or
# model identifier - verify against the kobart package), then load it into a
# BART sequence-to-sequence model.
kobart_checkpoint = get_pytorch_kobart_model()
model = BartForConditionalGeneration.from_pretrained(kobart_checkpoint)

using cached model
using cached model


## 설정하기

In [None]:
# Tokenizer truncation limit for the "question + context" input text.
max_source_length = 1024
# Tokenizer truncation limit for the answer (target) text.
max_target_length = 32
# Passed as `padding=` to the tokenizer during preprocessing (False: no padding
# at tokenization time).
padding = False
# Number of processes used by `Dataset.map` during preprocessing.
preprocessing_num_workers = 8
# Per-device batch size, used for both training and evaluation.
batch_size = 8
# Number of training epochs passed to the training arguments.
num_train_epochs = 8

## 전처리하기

In [None]:
def preprocess_function(examples):
    """Turn a batch of raw MRC examples into tokenized seq2seq features.

    Args:
        examples: A batch mapping with the keys 'question', 'context',
            'answers' (each entry having a 'text' list) and 'id'.

    Returns:
        The tokenizer output for the source texts, extended with:
          * 'labels': token ids of the first answer text of every example,
          * 'example_id': the 'id' of every example, in batch order.
    """
    # Source text: question and document in one prompt, closed with '</s>'.
    sources = []
    for question, context in zip(examples['question'], examples['context']):
        sources.append(f'질문: {question}  문서: {context} </s>')

    # Target text: only the first listed answer is used, also closed with '</s>'.
    answer_texts = [f'{answer["text"][0]} </s>' for answer in examples['answers']]

    model_inputs = tokenizer(sources, max_length=max_source_length, padding=padding, truncation=True)

    # Targets are tokenized inside the tokenizer's target-mode context.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(answer_texts, max_length=max_target_length, padding=padding, truncation=True)

    model_inputs["labels"] = target_encoding["input_ids"]

    # One example id per produced label row.
    example_ids = examples["id"]
    model_inputs["example_id"] = [example_ids[row] for row in range(len(model_inputs["labels"]))]
    return model_inputs

In [None]:
# Column names of the raw training split. The preprocessing `.map` step passes
# this list as `remove_columns`, so the raw columns are dropped and only the
# features returned by preprocess_function remain.
column_names = train_dataset_origin.column_names

In [None]:
# Tokenize the training split in batches across several worker processes.
# The raw columns are dropped, and the datasets cache is bypassed so the
# features are rebuilt on every run.
train_map_options = dict(
    batched=True,
    num_proc=preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=False,
)
train_dataset = train_dataset_origin.map(preprocess_function, **train_map_options)

In [None]:
# Tokenize the validation split with the same preprocessing function.
# The columns to drop are read from the validation split itself rather than
# from the training split's column list, so a schema difference between the two
# splits can neither make `.map` fail nor leave stray raw columns behind.
eval_dataset = eval_dataset_origin.map(
    preprocess_function,
    batched=True,
    num_proc=preprocessing_num_workers,
    remove_columns=eval_dataset_origin.column_names,
    load_from_cache_file=False,
)

## Fine-tuning하기

In [None]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [None]:
# Pad label sequences with -100 instead of the tokenizer's pad id. -100 is the
# index PyTorch's cross-entropy loss ignores by default, so padded label
# positions stop contributing to the training loss (padding them with the real
# pad id trains the model to emit pad tokens and dilutes the answer loss).
label_pad_token_id = -100

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=None,
)

In [None]:
import numpy as np

def postprocess_text(preds, labels):
    """Normalise decoded predictions and labels.

    Every string is stripped of surrounding whitespace and then re-joined with
    one sentence per line, using `nltk.sent_tokenize` for sentence splitting.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded label strings.

    Returns:
        A `(preds, labels)` tuple of two new lists of normalised strings.
    """
    def _one_sentence_per_line(text):
        return "\n".join(nltk.sent_tokenize(text.strip()))

    cleaned_preds = [_one_sentence_per_line(pred) for pred in preds]
    cleaned_labels = [_one_sentence_per_line(label) for label in labels]
    return cleaned_preds, cleaned_labels

def compute_metrics(eval_preds):
    """Score generated answers against the raw validation set with `metric`.

    Relies on the module-level `tokenizer`, `metric`, `postprocess_text` and
    `eval_dataset_origin`. Predictions are paired with `eval_dataset_origin`
    by position, i.e. row i of the predictions is taken to be the answer for
    example i (assumes the evaluation order matches the raw split - confirm
    if the evaluation dataset is ever shuffled or filtered).

    Args:
        eval_preds: A `(predictions, labels)` pair. `predictions` is an array
            of generated token ids (or a tuple whose first element is that
            array). The labels are not needed for the SQuAD metric because
            the references come from `eval_dataset_origin`.

    Returns:
        The result of `metric.compute(...)` for the formatted predictions.

    Raises:
        ValueError: If the number of predictions differs from the number of
            examples in `eval_dataset_origin`.
    """
    preds, _labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding marker, not a token id; map it to the pad token so the
    # tokenizer can decode the row (pad tokens are then skipped as specials).
    preds = np.asarray(preds)
    preds = np.where(preds == -100, tokenizer.pad_token_id, preds)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Labels are not decoded: the metric reads its references straight from the
    # raw dataset, so only the predictions need post-processing.
    decoded_preds, _ = postprocess_text(decoded_preds, [])

    num_examples = len(eval_dataset_origin)
    if len(decoded_preds) != num_examples:
        raise ValueError(
            f"Got {len(decoded_preds)} predictions for {num_examples} evaluation examples; "
            "predictions cannot be aligned with eval_dataset_origin."
        )

    # Single pass over the raw dataset builds both metric inputs.
    formatted_predictions = []
    references = []
    for example, prediction_text in zip(eval_dataset_origin, decoded_preds):
        formatted_predictions.append({"id": example["id"], "prediction_text": prediction_text})
        references.append({"id": example["id"], "answers": example["answers"]})

    return metric.compute(predictions=formatted_predictions, references=references)

In [None]:
args = Seq2SeqTrainingArguments(
    # Checkpointing: written to Google Drive every 500 steps, keeping at most 3.
    output_dir='/content/drive/MyDrive/Colab Notebooks/outputs',
    save_steps=500,
    save_total_limit=3,
    # Run training and evaluation; evaluation produces answers with generate().
    do_train=True,
    do_eval=True,
    predict_with_generate=True,
    # Optimisation settings. Gradients are accumulated over 16 batches before
    # each update.
    num_train_epochs=num_train_epochs,
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=16,
    # Evaluate and log every 500 steps.
    # NOTE(review): with 16 accumulation steps, 500 steps presumably means
    # 500 * 16 * batch_size training examples per device; on a small training
    # set no evaluation or checkpoint may happen before training ends - confirm
    # against the dataset size.
    evaluation_strategy='steps',
    eval_steps=500,
    logging_steps=500,
    # Reload the checkpoint with the best 'exact_match' score once training ends.
    load_best_model_at_end=True,
    metric_for_best_model='exact_match',
)

In [None]:
trainer = Seq2SeqTrainer(
    # Model, its tokenizer and the training configuration.
    args=args,
    model=model,
    tokenizer=tokenizer,
    # Tokenized splits and the collator that batches them.
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Scores generated answers during evaluation.
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; the value returned by train() is kept so it can be
# inspected in a later cell.
train_result = trainer.train()

In [None]:
# return tuple(save_directory)

In [None]:
# Display the object returned by trainer.train().
train_result