## 基於截斷策略的機器閱讀理解任務實現

### Step1 載入相關套件

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, DefaultDataCollator

### Step2 數據集載入

In [7]:
# Load every split of the 'cmrc2018' dataset, using the local 'data' directory
# as the cache location, then display the resulting DatasetDict.
datasets = load_dataset(
    "cmrc2018",
    cache_dir="data",
)
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 10142
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 3219
    })
    test: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 1002
    })
})

### Step3 數據預處理

In [8]:
# Tokenizer for the 'hfl/chinese-macbert-base' checkpoint; process_func reads it
# as a module-level global.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

def process_func(examples):
    """Tokenize a batch of question/context pairs and label the answer token span.

    Only the context (second sequence) is truncated and no sliding window is
    used, so an answer that does not overlap the kept part of the context is
    labelled ``(0, 0)``, i.e. the first token of the sequence.

    Args:
        examples: Batch dict with parallel lists under ``'question'``,
            ``'context'`` and ``'answers'``. Each answer is a dict with list
            fields ``'text'`` and ``'answer_start'``; only the first entry of
            each list is used.

    Returns:
        The tokenizer output with ``offset_mapping`` removed and two added
        lists, ``start_positions`` and ``end_positions``, holding one token
        index per example.
    """
    # Pad to a fixed length: `padding=True` would only pad to the longest
    # sequence of each map batch, so rows produced by different batches could
    # differ in length and could not be stacked by a non-padding collator.
    tokenized_examples = tokenizer(text=examples['question'],
                                   text_pair=examples['context'],
                                   max_length=512,
                                   return_offsets_mapping=True,
                                   truncation="only_second",
                                   padding="max_length")
    offset_mapping = tokenized_examples.pop("offset_mapping")
    start_positions = []
    end_positions = []
    for idx, offset in enumerate(offset_mapping):
        answer = examples['answers'][idx]

        # Locate the context tokens: the first token with sequence id 1 up to
        # the token just before the next special token (sequence id None).
        sequence_ids = tokenized_examples.sequence_ids(idx)
        context_start = sequence_ids.index(1)
        context_end = sequence_ids.index(None, context_start) - 1

        # Default label, used when there is no answer or it is not in the context.
        start_token_pos = 0
        end_token_pos = 0

        if answer["answer_start"] and answer["text"]:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            # Offsets are half-open [start, end) character ranges, so the answer
            # overlaps the kept context only when both inequalities are strict.
            if offset[context_end][1] > start_char and offset[context_start][0] < end_char:
                # First context token that starts at or after the answer start.
                token_id = context_start
                while token_id <= context_end and offset[token_id][0] < start_char:
                    token_id += 1
                start_token_pos = token_id

                # Last context token that ends at or before the answer end.
                token_id = context_end
                while token_id >= context_start and offset[token_id][1] > end_char:
                    token_id -= 1
                end_token_pos = token_id

                # An inverted span means no whole token lies inside the answer
                # within the kept context; treat it as "answer not in context".
                if start_token_pos > end_token_pos:
                    start_token_pos = 0
                    end_token_pos = 0

        start_positions.append(start_token_pos)
        end_positions.append(end_token_pos)
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

In [9]:
# Run process_func over every split in batched mode and drop the original
# columns so only the fields returned by process_func remain.
tokenied_datasets = datasets.map(
    process_func,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

Map:   0%|          | 0/10142 [00:00<?, ? examples/s]

### Step4 配置模型

In [10]:
# Load the 'hfl/chinese-macbert-base' checkpoint with a question-answering head.
# The warning printed below reports that the qa_outputs weights are newly
# initialized, so the model has to be fine-tuned before use.
model = AutoModelForQuestionAnswering.from_pretrained("hfl/chinese-macbert-base")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Step5 配置TrainingArguments

In [11]:
# Training configuration: evaluate and save a checkpoint once per epoch,
# log every 50 steps, and train for 3 epochs with batches of 32 per device.
args = TrainingArguments(
    output_dir="models_for_qa",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
)

### Step6 建立訓練器

In [12]:
# Wire the model, training arguments and tokenized train/validation splits into
# a Trainer, batching examples with DefaultDataCollator.
trainer = Trainer(
    args=args,
    model=model,
    data_collator=DefaultDataCollator(),
    train_dataset=tokenied_datasets["train"],
    eval_dataset=tokenied_datasets["validation"],
)