# 截断策略的机器阅读理解任务

## Step1 导包 
显示路径

In [4]:
# Show the current working directory; the relative paths used later in this
# notebook ("../data/...", "../hfl/...") are resolved against it.
# NOTE(review): `pwd` is not a Python builtin — this presumably relies on
# IPython automagic; confirm before running outside a notebook kernel.
pwd()

'd:\\code\\studyLLM\\project02'

In [5]:
from datasets import load_dataset, DatasetDict
import evaluate
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, DefaultDataCollator

## Step2 数据加载
使用的数据集是cmrc2018

In [6]:
# Online alternative: download the dataset (needs network access)
# datasets = load_dataset("cmrc2018", cache_dir="../data/mrc_data")
# Offline: load the copy that is already stored on disk
datasets = DatasetDict.load_from_disk("../data/mrc_data")
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 10142
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 3219
    })
    test: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 1002
    })
})

## Step3 数据预处理 
- 首先查看数据结构: 以train的第一条数据为例,可以看到数据集有4个字段，分别是id,context, question,answers
- 然后对其进行token化，并查看tokenizer的输出结构: 其主要有三个字段，分别是input_ids, attention_mask, token_type_ids，如果在tokenizer的时候加入return_offsets_mapping=True，那么tokenizer的输出结构中会多一个offset_mapping字段，其表示的是每个token在原始文本中的字符起止位置，方便后续的解码。
- 最后，对数据进行预处理，将数据转换为模型可以接受的格式，即input_ids, attention_mask, token_type_ids, start_positions, end_positions，其中start_positions和end_positions表示的是答案在token序列中的起始和结束位置（token下标），它们是借助offset_mapping把答案在原始文本中的字符位置映射到token上得到的。

In [7]:
# Inspect the first training record to see the raw fields of one example.
datasets["train"][0]

{'id': 'TRAIN_186_QUERY_0',
 'context': '范廷颂枢机（，），圣名保禄·若瑟（），是越南罗马天主教枢机。1963年被任为主教；1990年被擢升为天主教河内总教区宗座署理；1994年被擢升为总主教，同年年底被擢升为枢机；2009年2月离世。范廷颂于1919年6月15日在越南宁平省天主教发艳教区出生；童年时接受良好教育后，被一位越南神父带到河内继续其学业。范廷颂于1940年在河内大修道院完成神学学业。范廷颂于1949年6月6日在河内的主教座堂晋铎；及后被派到圣女小德兰孤儿院服务。1950年代，范廷颂在河内堂区创建移民接待中心以收容到河内避战的难民。1954年，法越战争结束，越南民主共和国建都河内，当时很多天主教神职人员逃至越南的南方，但范廷颂仍然留在河内。翌年管理圣若望小修院；惟在1960年因捍卫修院的自由、自治及拒绝政府在修院设政治课的要求而被捕。1963年4月5日，教宗任命范廷颂为天主教北宁教区主教，同年8月15日就任；其牧铭为「我信天主的爱」。由于范廷颂被越南政府软禁差不多30年，因此他无法到所属堂区进行牧灵工作而专注研读等工作。范廷颂除了面对战争、贫困、被当局迫害天主教会等问题外，也秘密恢复修院、创建女修会团体等。1990年，教宗若望保禄二世在同年6月18日擢升范廷颂为天主教河内总教区宗座署理以填补该教区总主教的空缺。1994年3月23日，范廷颂被教宗若望保禄二世擢升为天主教河内总教区总主教并兼天主教谅山教区宗座署理；同年11月26日，若望保禄二世擢升范廷颂为枢机。范廷颂在1995年至2001年期间出任天主教越南主教团主席。2003年4月26日，教宗若望保禄二世任命天主教谅山教区兼天主教高平教区吴光杰主教为天主教河内总教区署理主教；及至2005年2月19日，范廷颂因获批辞去总主教职务而荣休；吴光杰同日真除天主教河内总教区总主教职务。范廷颂于2009年2月22日清晨在河内离世，享年89岁；其葬礼于同月26日上午在天主教河内总教区总主教座堂举行。',
 'question': '范廷颂是什么时候被任为主教的？',
 'answers': {'text': ['1963年'], 'answer_start': [30]}}

In [8]:
# Load the tokenizer from the local checkpoint directory and take the first
# ten training records as a small sample to experiment with.
tokenizer = AutoTokenizer.from_pretrained("../hfl/chinese-macbert-base")
example_dataset = datasets["train"].select(range(10))

# Encode (question, context) pairs, truncated/padded to 512 tokens, and also
# request the character offsets of every token.
tokenized_examples = tokenizer(
    text=example_dataset["question"],
    text_pair=example_dataset["context"],
    max_length=512,
    truncation=True,
    padding="max_length",
    return_offsets_mapping=True,
)
tokenized_examples.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])

In [9]:
# Print the per-token (start, end) character offsets of the first example,
# followed by how many offset pairs that example has.
print(tokenized_examples["offset_mapping"][0], len(tokenized_examples["offset_mapping"][0]))

[(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (12, 13), (13, 14), (14, 15), (0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (12, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22), (22, 23), (23, 24), (24, 25), (25, 26), (26, 27), (27, 28), (28, 29), (29, 30), (30, 34), (34, 35), (35, 36), (36, 37), (37, 38), (38, 39), (39, 40), (40, 41), (41, 45), (45, 46), (46, 47), (47, 48), (48, 49), (49, 50), (50, 51), (51, 52), (52, 53), (53, 54), (54, 55), (55, 56), (56, 57), (57, 58), (58, 59), (59, 60), (60, 61), (61, 62), (62, 63), (63, 67), (67, 68), (68, 69), (69, 70), (70, 71), (71, 72), (72, 73), (73, 74), (74, 75), (75, 76), (76, 77), (77, 78), (78, 79), (79, 80), (80, 81), (81, 82), (82, 83), (83, 84), (84, 85), (85, 86), (86, 87), (87, 91), (91, 92), (92, 93), (93, 94), (94, 95), (95, 96), (96, 97), (97, 98), (98, 99), (

In [10]:
# Read the offsets by key rather than with pop(): pop() removes the entry from
# tokenized_examples, so re-running this cell (or the cell that prints
# tokenized_examples["offset_mapping"]) would raise KeyError.
offset_mapping = tokenized_examples["offset_mapping"]
print(offset_mapping)

[[(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (12, 13), (13, 14), (14, 15), (0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (12, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22), (22, 23), (23, 24), (24, 25), (25, 26), (26, 27), (27, 28), (28, 29), (29, 30), (30, 34), (34, 35), (35, 36), (36, 37), (37, 38), (38, 39), (39, 40), (40, 41), (41, 45), (45, 46), (46, 47), (47, 48), (48, 49), (49, 50), (50, 51), (51, 52), (52, 53), (53, 54), (54, 55), (55, 56), (56, 57), (57, 58), (58, 59), (59, 60), (60, 61), (61, 62), (62, 63), (63, 67), (67, 68), (68, 69), (69, 70), (70, 71), (71, 72), (72, 73), (73, 74), (74, 75), (75, 76), (76, 77), (77, 78), (78, 79), (79, 80), (80, 81), (81, 82), (82, 83), (83, 84), (84, 85), (85, 86), (86, 87), (87, 91), (91, 92), (92, 93), (93, 94), (94, 95), (95, 96), (96, 97), (97, 98), (98, 99), 

In [11]:
for idx, offset in enumerate(offset_mapping):
    answer = example_dataset[idx]["answers"]
    start_char = answer["answer_start"][0]  # first character of the answer in the context
    end_char = start_char + len(answer["text"][0])  # one past the last character of the answer

    # Find the token range that belongs to the context (sequence id 1).
    # Strategy: take the first and last context token, then walk inwards from
    # both sides until the answer boundaries are reached.
    sequence_ids = tokenized_examples.sequence_ids(idx)
    context_start = sequence_ids.index(1)
    # index(None, context_start) is the first special/padding token after the context
    context_end = sequence_ids.index(None, context_start) - 1

    # Offsets and end_char are end-exclusive, so the comparisons must include
    # equality: a context that ends exactly at start_char does not contain the
    # answer (with a strict comparison the walk below would yield start > end).
    if offset[context_end][1] <= start_char or offset[context_start][0] >= end_char:
        start_token_pos = 0
        end_token_pos = 0
    else:
        # Walk right to the first context token that starts at or after start_char.
        token_id = context_start
        while token_id <= context_end and offset[token_id][0] < start_char:
            token_id += 1
        start_token_pos = token_id
        # Walk left to the last context token that ends at or before end_char.
        token_id = context_end
        while token_id >= context_start and offset[token_id][1] > end_char:
            token_id -= 1
        end_token_pos = token_id

# These run once, after the loop, so they report the last example only.
print(answer, start_char, end_char, context_start, context_end, start_token_pos, end_token_pos)
print("token answer decode:", tokenizer.decode(tokenized_examples["input_ids"][idx][start_token_pos: end_token_pos + 1]))


{'text': ['大空翼'], 'answer_start': [84]} 84 87 21 486 105 107
token answer decode: 大 空 翼


将上面的实现整理成process_function

In [12]:
def _locate_answer_span(offsets, sequence_ids, start_char, end_char):
    """Map the character span [start_char, end_char) of the context to token indices.

    Args:
        offsets: per-token (start, end) character offsets of one encoded example.
        sequence_ids: per-token sequence ids of the same example (1 marks
            context tokens, None marks special/padding tokens).
        start_char: index of the first answer character in the context.
        end_char: index one past the last answer character.

    Returns:
        (start_token, end_token), or (0, 0) when no part of the answer lies
        inside the (possibly truncated) context.
    """
    context_start = sequence_ids.index(1)
    # First None after the context is the closing special token.
    context_end = sequence_ids.index(None, context_start) - 1

    # Offsets and end_char are end-exclusive, so equality already means
    # "no overlap"; a strict comparison here would let the walks below
    # produce start_token > end_token.
    if offsets[context_end][1] <= start_char or offsets[context_start][0] >= end_char:
        return 0, 0

    # Walk right to the first context token starting at or after start_char.
    start_token = context_start
    while start_token <= context_end and offsets[start_token][0] < start_char:
        start_token += 1

    # Walk left to the last context token ending at or before end_char.
    end_token = context_end
    while end_token >= context_start and offsets[end_token][1] > end_char:
        end_token -= 1

    return start_token, end_token


def process_function(examples, max_length=384):
    """Tokenize a batch of QA examples and attach answer token positions.

    The question/context pairs are encoded with the notebook-level
    ``tokenizer``; only the context is truncated. The character span of the
    first answer of every example is then converted into token indices.

    Args:
        examples: batch dict with "question", "context" and "answers" lists;
            each answers entry holds "text" and "answer_start" lists.
        max_length: length every encoded example is truncated/padded to.

    Returns:
        The tokenizer output without "offset_mapping" and with
        "start_positions" / "end_positions" added (one value per example).
        Examples with no answer, or whose answer lies entirely outside the
        truncated context, are labelled (0, 0).
    """
    tokenized_examples = tokenizer(text=examples["question"],
                                   text_pair=examples["context"],
                                   return_offsets_mapping=True,
                                   max_length=max_length,
                                   truncation="only_second",
                                   padding="max_length")
    offset_mapping = tokenized_examples.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for idx, offsets in enumerate(offset_mapping):
        answer = examples["answers"][idx]
        if not answer["answer_start"] or not answer["text"]:
            # No annotated answer: use the same (0, 0) label as "not in context".
            start_token_pos, end_token_pos = 0, 0
        else:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            start_token_pos, end_token_pos = _locate_answer_span(
                offsets, tokenized_examples.sequence_ids(idx), start_char, end_char
            )
        start_positions.append(start_token_pos)
        end_positions.append(end_token_pos)

    # Add the labels to the encoding as new key/value pairs.
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

In [13]:
# Apply process_function to every split in batches and drop the original
# columns, leaving only the fields produced by process_function.
tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets

Map:   0%|          | 0/3219 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 10142
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 3219
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 1002
    })
})

## Step4 模型加载

In [14]:
# Load the question-answering model from the same local checkpoint directory
# that the tokenizer was loaded from.
model = AutoModelForQuestionAnswering.from_pretrained("../hfl/chinese-macbert-base")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ../hfl/chinese-macbert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step5 配置TrainingArguments

In [15]:
# Training configuration: evaluate and save a checkpoint once per epoch,
# log every 50 steps, train for 3 epochs with batch size 32 per device.
# NOTE(review): in the recorded run eval_loss rises after the first epoch —
# consider whether the last checkpoint is really the one to keep.
args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    num_train_epochs=3
)

## Step6 配置Trainer

In [16]:
# Wire model, data and configuration together. The tokenizer is passed as
# `processing_class`: the `tokenizer=` keyword of Trainer is deprecated and
# emits a warning on construction.
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=DefaultDataCollator(),
)

  trainer = Trainer(


## Step7 训练模型

In [17]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

  0%|          | 0/951 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 3.2811, 'grad_norm': 14.984274864196777, 'learning_rate': 4.737118822292324e-05, 'epoch': 0.16}
{'loss': 1.8313, 'grad_norm': 10.826260566711426, 'learning_rate': 4.474237644584648e-05, 'epoch': 0.32}
{'loss': 1.6169, 'grad_norm': 13.979934692382812, 'learning_rate': 4.211356466876972e-05, 'epoch': 0.47}
{'loss': 1.4947, 'grad_norm': 12.979494094848633, 'learning_rate': 3.9484752891692956e-05, 'epoch': 0.63}
{'loss': 1.4075, 'grad_norm': 7.87099552154541, 'learning_rate': 3.6855941114616195e-05, 'epoch': 0.79}
{'loss': 1.4194, 'grad_norm': 11.20816421508789, 'learning_rate': 3.4227129337539433e-05, 'epoch': 0.95}


  0%|          | 0/101 [00:00<?, ?it/s]

{'eval_loss': 1.060685396194458, 'eval_runtime': 152.5452, 'eval_samples_per_second': 21.102, 'eval_steps_per_second': 0.662, 'epoch': 1.0}
{'loss': 1.1355, 'grad_norm': 11.002781867980957, 'learning_rate': 3.159831756046267e-05, 'epoch': 1.1}
{'loss': 1.0451, 'grad_norm': 10.497243881225586, 'learning_rate': 2.8969505783385907e-05, 'epoch': 1.26}
{'loss': 0.9966, 'grad_norm': 8.62635326385498, 'learning_rate': 2.6340694006309152e-05, 'epoch': 1.42}
{'loss': 0.9843, 'grad_norm': 11.151959419250488, 'learning_rate': 2.3711882229232387e-05, 'epoch': 1.58}
{'loss': 0.9599, 'grad_norm': 15.441923141479492, 'learning_rate': 2.1083070452155626e-05, 'epoch': 1.74}
{'loss': 0.9761, 'grad_norm': 10.44046401977539, 'learning_rate': 1.8454258675078864e-05, 'epoch': 1.89}


  0%|          | 0/101 [00:00<?, ?it/s]

{'eval_loss': 1.0858100652694702, 'eval_runtime': 152.5141, 'eval_samples_per_second': 21.106, 'eval_steps_per_second': 0.662, 'epoch': 2.0}
{'loss': 0.8754, 'grad_norm': 7.021555423736572, 'learning_rate': 1.5825446898002103e-05, 'epoch': 2.05}
{'loss': 0.6987, 'grad_norm': 10.680444717407227, 'learning_rate': 1.3196635120925343e-05, 'epoch': 2.21}
{'loss': 0.6793, 'grad_norm': 11.001843452453613, 'learning_rate': 1.056782334384858e-05, 'epoch': 2.37}
{'loss': 0.6203, 'grad_norm': 8.707664489746094, 'learning_rate': 7.93901156677182e-06, 'epoch': 2.52}
{'loss': 0.6577, 'grad_norm': 9.775185585021973, 'learning_rate': 5.310199789695059e-06, 'epoch': 2.68}
{'loss': 0.6339, 'grad_norm': 8.147686958312988, 'learning_rate': 2.6813880126182968e-06, 'epoch': 2.84}
{'loss': 0.6472, 'grad_norm': 10.046258926391602, 'learning_rate': 5.257623554153523e-08, 'epoch': 3.0}


  0%|          | 0/101 [00:00<?, ?it/s]

{'eval_loss': 1.2464760541915894, 'eval_runtime': 153.3009, 'eval_samples_per_second': 20.998, 'eval_steps_per_second': 0.659, 'epoch': 3.0}
{'train_runtime': 5402.5002, 'train_samples_per_second': 5.632, 'train_steps_per_second': 0.176, 'train_loss': 1.1552502022180648, 'epoch': 3.0}


TrainOutput(global_step=951, training_loss=1.1552502022180648, metrics={'train_runtime': 5402.5002, 'train_samples_per_second': 5.632, 'train_steps_per_second': 0.176, 'total_flos': 5962661340337152.0, 'train_loss': 1.1552502022180648, 'epoch': 3.0})

## Step8 模型预测

In [18]:
from transformers import pipeline

# Run the pipeline on whatever device the fine-tuned model already lives on,
# instead of hard-coding GPU 0 (which fails on a machine without CUDA).
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer, device=model.device)
context = "小明在北京上班。"
question = "小明在哪个城市工作？"
pipe(question=question, context=context)

{'score': 0.7233455181121826, 'start': 3, 'end': 5, 'answer': '北京'}