## 自建一個cmrc2018的2筆資料

In [98]:
# Two hand-written QA records laid out like CMRC2018 rows:
# 'answers' holds parallel lists of answer texts and their character
# start offsets inside 'context'.
source_json = [
    dict(
        id='lesson1',
        context='小英的生日是1997年10月9日,女性',
        question='小英的生日是?',
        answers=dict(text=['1997年10月9日'], answer_start=[6]),
    ),
    dict(
        id='lesson2',
        context='川普日前宣布課徵加拿大和墨西哥25%的關稅',
        question='加拿大和墨西哥被課徵的關稅是',
        answers=dict(text=['25%'], answer_start=[15]),
    ),
]

## 轉換為DatasetDict

In [99]:
from datasets import Dataset, DatasetDict
# Build a Dataset from the list of example dicts (one row per dict).
train_dataset = Dataset.from_list(source_json)
# Wrap it in a DatasetDict under the 'train' split key.
datasets = DatasetDict({
    'train':train_dataset
})
# Bare last expression so the notebook displays the DatasetDict summary.
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 2
    })
})

# 目標
![](./images/pic1.png)

## 使用AutoTokenizer建立input_ids,token_type_ids,attention_mask

In [100]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the 'google-bert/bert-base-chinese' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

In [101]:
# Encode every (question, context) pair of the train split in one call.
tokenized_dataset = tokenizer(
    text=datasets['train']['question'],
    text_pair=datasets['train']['context'],
    max_length = 512, # cap on the encoded length; BERT's limit is 512 tokens (not bytes)
    truncation="only_second", # when a pair is too long, truncate only the second sequence (the context)
    padding=True # pad shorter rows; the trailing zeros in the printed lists are this padding
)
# Show the type of the returned object
print(type(tokenized_dataset))
# Show the three encoded fields of the first example (index 0)
print(tokenized_dataset['input_ids'][0])
print(tokenized_dataset['token_type_ids'][0])
print(tokenized_dataset['attention_mask'][0])

<class 'transformers.tokenization_utils_base.BatchEncoding'>
[101, 2207, 5739, 4638, 4495, 3189, 3221, 136, 102, 2207, 5739, 4638, 4495, 3189, 3221, 8387, 2399, 8108, 3299, 130, 3189, 117, 1957, 2595, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


![](./images/pic2.png)

### 找出tokenize答案的起始index和答案的結束index
### 解決方案
### 請參考圖片說明

- 找出start_char
- 找出end_char
- 取出offset_mapping
- 取出tokenizer的sequence_ids
- 找出context_start
- 找出context_end
- 由前往後找,找出答案的起始index
- 由後往前找,找出答案的結束index

![](./images/pic3.png)

In [102]:
# Re-encode the same pairs, this time also asking for character offsets.
tokenized_dataset = tokenizer(
    text=datasets['train']['question'],
    text_pair=datasets['train']['context'],
    max_length = 512, # cap on the encoded length, counted in tokens
    truncation="only_second", # truncate only the context when a pair is too long
    padding=True,
    return_offsets_mapping=True # required so that 'offset_mapping' is part of the result
)

# List the fields of the encoding; 'offset_mapping' is now among them.
print(tokenized_dataset.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])


### 取出offset_mapping

In [103]:
# Remove 'offset_mapping' from the encoding and keep it in its own variable.
# NOTE(review): the key is gone after this runs, so re-running this cell alone
# (without re-running the tokenizer cell first) will presumably raise KeyError.
offset_mapping = tokenized_dataset.pop('offset_mapping')

In [104]:
# Take the offset mapping of the first example (index 0).
# As the printed list shows, each entry is a (start, end) character span with an
# exclusive end, and special/padding positions appear as (0, 0).
offset = offset_mapping[0]
print(offset)

[(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 10), (10, 11), (11, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]


## 找出start_char和end_char

In [105]:
# Answer record of the first example: parallel 'text' / 'answer_start' lists.
answer = datasets['train']['answers'][0]
answer

{'answer_start': [6], 'text': ['1997年10月9日']}

In [106]:
# Character index in the context where the first answer begins.
start_char = answer['answer_start'][0]
start_char

6

In [107]:
# Exclusive end: the character index just past the last character of the answer.
end_char = start_char + len(answer['text'][0])
end_char

16

## 要取得sequence_ids才可以找出下面的資料
## 找出context_start和context_end

In [108]:
# sequence_ids of example 0; in the printed list the question tokens are 0,
# the context tokens are 1, and special/padding positions are None.
context = tokenized_dataset.sequence_ids(0) # sequence ids of the first example
print(context) 

[None, 0, 0, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, None, None, None, None, None, None, None, None, None, None, None, None, None]


In [109]:
context_start = context.index(1) # index of the first entry equal to 1, i.e. the first context token
print(context_start) # token index where the context begins

9


In [110]:
context_end = context.index(None,context_start) - 1 # first None at/after context_start, minus one = last context token
print(context_end) # token index where the context ends (inclusive)


23


## 由前往後找到start_position
## 由後往前找到end_position

In [111]:
# Map the character span [start_char, end_char) of the answer onto token indices.
# Truncation can remove the answer from the tokenized context. Both the offsets
# and end_char use an exclusive end, so the two spans do not overlap when one ends
# exactly where the other begins; hence `<=` / `>=` instead of strict comparisons
# (with `<` / `>` that boundary case produced a start index past the end index).
if offset[context_end][1] <= start_char or offset[context_start][0] >= end_char:
    # The answer is not inside the tokenized context: label it (0, 0)
    start_token_pos = 0
    end_token_pos = 0
else:
    token_id = context_start
    # Scan forward to the first context token that starts at or after start_char
    while token_id <= context_end and offset[token_id][0] < start_char:
        token_id += 1
    start_token_pos = token_id
    token_id = context_end
    # Scan backward to the last context token that ends at or before end_char
    while token_id >= context_start and offset[token_id][1] > end_char:
        token_id -= 1
    end_token_pos = token_id



### 驗證是否正確

In [112]:
# Slice the token ids of example 0 between the two positions found above;
# end_token_pos is inclusive, hence the + 1.
answer_token = tokenized_dataset['input_ids'][0][start_token_pos:end_token_pos + 1]
answer_token

[8387, 2399, 8108, 3299, 130, 3189]

In [113]:
# Decode the sliced token ids back to text to check that they spell the answer.
tokenizer.decode(answer_token)

'1997 年 10 月 9 日'

### 建立function,一次處理一個batch
### 並得到訓練時所需要的資料


In [114]:
# Same checkpoint name as the tokenizer created earlier in this notebook.
# NOTE(review): presumably repeated so this cell can be run on its own; confirm.
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

def process_func(examples):
    """Tokenize a batch of QA examples and label each answer span in tokens.

    Args:
        examples: a batch in column form with the keys 'question' and
            'context' (lists of strings) and 'answers' (list of dicts holding
            parallel 'text' and 'answer_start' lists). Only the first answer
            of each example is used.

    Returns:
        The tokenizer output for the batch, without 'offset_mapping', extended
        with 'start_positions' and 'end_positions' (one token index per
        example, both inclusive). An example is labelled (0, 0) when it has no
        answer or when its answer does not overlap the tokenized context,
        e.g. because truncation cut it off.
    """
    tokenized_examples = tokenizer(text=examples['question'],
                                   text_pair=examples['context'],
                                   max_length=512,
                                   return_offsets_mapping=True,
                                   truncation="only_second",
                                   padding=True)
    # The offsets are only needed to build the labels, so take them out of the result.
    offset_mapping = tokenized_examples.pop("offset_mapping")
    start_positions = []
    end_positions = []
    for idx, offset in enumerate(offset_mapping):
        answer = examples['answers'][idx]
        if not answer['text'] or not answer['answer_start']:
            # No answer recorded for this example: nothing to locate.
            start_positions.append(0)
            end_positions.append(0)
            continue
        start_char = answer["answer_start"][0]
        # Exclusive end of the answer in context characters.
        end_char = start_char + len(answer['text'][0])

        # Look the sequence ids up once; entries equal to 1 mark context tokens.
        sequence_ids = tokenized_examples.sequence_ids(idx)
        context_start = sequence_ids.index(1)
        context_end = sequence_ids.index(None, context_start) - 1

        # Offsets and end_char are both end-exclusive, so the spans are disjoint
        # when one ends exactly where the other begins (`<=` / `>=`, not `<` / `>`).
        if offset[context_end][1] <= start_char or offset[context_start][0] >= end_char:
            # The answer is not inside the tokenized context.
            start_token_pos = 0
            end_token_pos = 0
        else:
            # Forward scan: first context token starting at or after start_char.
            token_id = context_start
            while token_id <= context_end and offset[token_id][0] < start_char:
                token_id += 1
            start_token_pos = token_id
            # Backward scan: last context token ending at or before end_char.
            token_id = context_end
            while token_id >= context_start and offset[token_id][1] > end_char:
                token_id -= 1
            end_token_pos = token_id
        start_positions.append(start_token_pos)
        end_positions.append(end_token_pos)
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

In [115]:
# Apply process_func batch-wise to every split and drop the original columns,
# leaving only the fields that process_func returns.
# NOTE(review): `tokenied_datasets` looks like a misspelling of `tokenized_datasets`;
# it is left as is because later cells may refer to this name.
tokenied_datasets = datasets.map(process_func, batched=True, remove_columns=datasets['train'].column_names)
tokenied_datasets


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 2
    })
})