<a href="https://colab.research.google.com/github/roberthsu2003/Transformer/blob/main/%E5%AF%A6%E6%88%B0%E9%81%8B%E7%94%A8/QuestionAnswering/%E6%BB%91%E5%8B%95%E7%AD%96%E7%95%A5%E5%AF%A6%E4%BD%9C/qa_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wget
# The cmrc_eval.py script needs this package
!pip install nltk



In [None]:
import wget
# Fetch the evaluation script that is imported later as `cmrc_eval`.
# Note: the contents of this cmrc_eval.py have been modified.
wget.download('https://raw.githubusercontent.com/roberthsu2003/Transformer/refs/heads/main/for_download/cmrc_eval.py')

'cmrc_eval.py'

## 基於視窗滑動策略的機器閱讀理解(MRC)
### 載入套件

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForQuestionAnswering,TrainingArguments,Trainer, DefaultDataCollator

### 下載資料集

In [None]:
# Load the "roberthsu2003/for_MRC_QA" dataset, using the "data" directory as cache.
datasets = load_dataset("roberthsu2003/for_MRC_QA", cache_dir='data')
# Show the loaded object as the cell output.
datasets

### 數據處理

In [None]:
# Tokenizer of the google-bert/bert-base-chinese checkpoint; used by process_func,
# the Trainer and the inference pipeline below.
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

In [None]:
# Tokenize question/context pairs with a sliding window and label the answer span
def process_func(examples):
    """Tokenize a batch of QA examples into overlapping windows and label each one.

    Every question/context pair is encoded with at most 384 tokens; contexts
    that do not fit are split into several features (windows) that share 128
    tokens with their neighbour. Only the first answer of each example is
    used for labelling.

    Args:
        examples: A batch (dict of lists) with "id", "question", "context"
            and "answers" columns. Each answers entry holds parallel
            "text" and "answer_start" lists.

    Returns:
        The tokenizer output extended with:
          * "example_ids": id of the example each feature was cut from,
          * "start_positions" / "end_positions": token indices of the answer
            inside the feature, or 0 / 0 when the window has no usable part
            of the answer,
        and with "offset_mapping" entries of non-context tokens replaced by
        None (needed when decoding predictions).
    """
    tokenized_example = tokenizer(
        text=examples["question"],
        text_pair=examples["context"],
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
        stride=128,  # number of tokens shared by consecutive windows
        max_length=384,
        truncation="only_second",
        padding="max_length",
    )
    sample_mapping = tokenized_example.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_example["offset_mapping"]
    start_positions = []
    end_positions = []
    example_ids = []

    for idx, sample_idx in enumerate(sample_mapping):
        answer = examples["answers"][sample_idx]

        # Computed once per feature; it is needed for both the context
        # boundaries and the offset masking below.
        sequence_ids = tokenized_example.sequence_ids(idx)
        offset = offset_mapping[idx]

        # First and last token that belong to the context (sequence id 1).
        context_start = sequence_ids.index(1)
        context_end = sequence_ids.index(None, context_start) - 1

        # Default label: position 0 means "no answer in this window".
        start_token_pos = 0
        end_token_pos = 0

        # Examples without any answer keep the default label instead of
        # raising IndexError.
        if answer["answer_start"] and answer["text"]:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            # Offsets are (start, end) with an exclusive end, so the window
            # and the answer overlap only if each one starts strictly before
            # the other one ends.
            window_overlaps_answer = (
                offset[context_end][1] > start_char
                and offset[context_start][0] < end_char
            )
            if window_overlaps_answer:
                # Scan left-to-right for the first token at/after the answer
                # start, then right-to-left for the last token ending at/before
                # the answer end. A window holding only part of the answer is
                # labelled with that part.
                token_id = context_start
                while token_id <= context_end and offset[token_id][0] < start_char:
                    token_id += 1
                first_token = token_id

                token_id = context_end
                while token_id >= context_start and offset[token_id][1] > end_char:
                    token_id -= 1
                last_token = token_id

                # Keep the span only when it is well-formed; otherwise fall
                # back to the "no answer" label.
                if first_token <= last_token:
                    start_token_pos = first_token
                    end_token_pos = last_token

        start_positions.append(start_token_pos)
        end_positions.append(end_token_pos)
        example_ids.append(examples["id"][sample_idx])

        # Used at prediction time: keep offsets only for context tokens so
        # that question/special/padding tokens can be skipped when decoding.
        offset_mapping[idx] = [
            (o if seq_id == 1 else None)
            for seq_id, o in zip(sequence_ids, offset)
        ]

    tokenized_example["example_ids"] = example_ids
    tokenized_example["start_positions"] = start_positions
    tokenized_example["end_positions"] = end_positions
    return tokenized_example



In [None]:
# Run process_func over the data in batches and drop the original columns
# (column names are taken from the train split), leaving only the fields
# that process_func returns.
tokenized_datasets = datasets.map(process_func, batched=True, remove_columns=datasets["train"].column_names)

### 獲取模型輸出

In [None]:
import numpy as np
import collections

def get_result(start_logits, end_logits, exmaples, features):
    """Turn per-feature start/end logits into one predicted answer per example.

    Args:
        start_logits: Start scores, indexed by feature position.
        end_logits: End scores, indexed the same way as ``start_logits``.
        exmaples: Iterable of original examples; each one provides "id",
            "context" and "answers" (with a "text" entry).
        features: Tokenized features. ``features["example_ids"]`` gives the
            owning example id of every feature, and
            ``features[i]["offset_mapping"]`` gives per-token character
            offsets, with None for tokens that cannot be part of an answer.

    Returns:
        A ``(predictions, references)`` pair of dicts keyed by example id:
        the best-scoring answer text ("" when no candidate span is valid)
        and the reference answer texts.
    """
    # Number of top start/end candidates considered per feature
    n_best = 20
    # Longest allowed answer, in tokens
    max_answer_length = 30

    # Map every example id to the positions of the features cut from it
    feature_positions = collections.defaultdict(list)
    for feature_pos, owner_id in enumerate(features["example_ids"]):
        feature_positions[owner_id].append(feature_pos)

    predictions = {}
    references = {}

    for example in exmaples:
        example_id = example["id"]
        context = example["context"]

        # (score, text) pairs collected over all windows of this example
        candidates = []
        for feature_pos in feature_positions[example_id]:
            start_scores = start_logits[feature_pos]
            end_scores = end_logits[feature_pos]
            offsets = features[feature_pos]["offset_mapping"]

            top_starts = np.argsort(start_scores)[::-1][:n_best].tolist()
            top_ends = np.argsort(end_scores)[::-1][:n_best].tolist()

            for start_tok in top_starts:
                if offsets[start_tok] is None:
                    continue
                for end_tok in top_ends:
                    if offsets[end_tok] is None:
                        continue
                    span_length = end_tok - start_tok + 1
                    # Reject reversed spans and spans that are too long
                    if span_length < 1 or span_length > max_answer_length:
                        continue
                    span_text = context[offsets[start_tok][0]: offsets[end_tok][1]]
                    span_score = start_scores[start_tok] + end_scores[end_tok]
                    candidates.append((span_score, span_text))

        if candidates:
            _, best_text = max(candidates, key=lambda pair: pair[0])
            predictions[example_id] = best_text
        else:
            predictions[example_id] = ""
        references[example_id] = example["answers"]["text"]

    return predictions, references

### 評估函數

In [None]:
from cmrc_eval import evaluate_cmrc

def metirc(pred):
    """Score a set of predictions with the CMRC evaluation script.

    The split being evaluated is inferred from the number of prediction
    rows: if it equals the number of validation features the validation
    split is used, otherwise the test split.

    Args:
        pred: Prediction object whose first item is the
            ``(start_logits, end_logits)`` pair.

    Returns:
        Whatever ``evaluate_cmrc`` returns for the decoded predictions and
        their references.
    """
    start_logits, end_logits = pred[0]
    is_validation = start_logits.shape[0] == len(tokenized_datasets["validation"])
    split = "validation" if is_validation else "test"
    predictions, references = get_result(
        start_logits,
        end_logits,
        datasets[split],
        tokenized_datasets[split],
    )
    return evaluate_cmrc(predictions, references)

### 下載模型

In [None]:
# Load the google-bert/bert-base-chinese checkpoint into a question-answering model.
model = AutoModelForQuestionAnswering.from_pretrained("google-bert/bert-base-chinese")

### 配置TrainingArguments

In [None]:
# Training configuration for the sliding-window QA fine-tuning run.
args = TrainingArguments(
    output_dir="models_for_qa_slide",  # output directory for this run
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="steps",  # evaluate on a step schedule ...
    eval_steps=200,  # ... every 200 steps
    save_strategy="epoch",  # save once per epoch
    logging_steps=50,  # log every 50 steps
    num_train_epochs=1
)

### 配置Trainer

In [None]:
# Wire the model, training arguments, data splits, collator and metric
# function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    processing_class=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=DefaultDataCollator(),
    compute_metrics=metirc  # decodes answers from the logits and scores them with evaluate_cmrc
)

### 模型訓練

In [None]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

### 模型預測

In [None]:
from transformers import pipeline

# Build a question-answering pipeline on the device the model already lives
# on. Hard-coding device 0 fails on runtimes without a GPU; model.device
# works for both CPU and GPU.
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer, device=model.device)
# Show the pipeline object as the cell output.
pipe

In [None]:
# Try the pipeline on a sample question with a one-sentence context.
pipe(question="小明在哪里上班？", context="小明在北京上班")