## 自建 2 筆 cmrc2018 格式的資料

In [49]:
# Two hand-written question-answering records. In each record,
# answer['answer_start'] is the character index in 'context' at which the
# corresponding answer['text'] entry begins.
source_json = [
    {
        "id": "lesson1",
        "context": "小英的生日是1997年10月9日,女性",
        "question": "小英的生日是?",
        "answer": {"text": ["1997年10月9日"], "answer_start": [6]},
    },
    {
        "id": "lesson2",
        "context": "川普日前宣布課徵加拿大和墨西哥25%的關稅",
        "question": "加拿大和墨西哥被課徵的關稅是",
        "answer": {"text": ["25%"], "answer_start": [15]},
    },
]

## 轉換為DatasetDict

In [50]:
from datasets import Dataset, DatasetDict
# Build a Dataset from the in-memory records, then register it as the
# "train" split of a DatasetDict. The bare last line displays the result.
train_dataset = Dataset.from_list(source_json)
datasets = DatasetDict()
datasets["train"] = train_dataset
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answer'],
        num_rows: 2
    })
})

# 目標
![](./images/pic1.png)

## 使用AutoTokenizer建立input_ids,token_type_ids,attention_mask

In [51]:
from transformers import AutoTokenizer

# Load the tokenizer published with the 'google-bert/bert-base-chinese' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

In [52]:
# Encode every (question, context) pair of the train split in one batch call.
tokenized_dataset = tokenizer(
    text=datasets["train"]["question"],
    text_pair=datasets["train"]["context"],
    max_length=512,  # upper bound on sequence length, counted in tokens (not bytes)
    truncation="only_second",  # if a pair is too long, shorten only the context (second text)
    padding=True,
)
# Show what kind of object the tokenizer returned
print(type(tokenized_dataset))
# Show the three encodings of the first example, one per line
print(
    *(tokenized_dataset[field][0] for field in ("input_ids", "token_type_ids", "attention_mask")),
    sep="\n",
)

<class 'transformers.tokenization_utils_base.BatchEncoding'>
[101, 2207, 5739, 4638, 4495, 3189, 3221, 136, 102, 2207, 5739, 4638, 4495, 3189, 3221, 8387, 2399, 8108, 3299, 130, 3189, 117, 1957, 2595, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


![](./images/pic2.png)

### 找出答案在 token 序列中的起始 index 和結束 index
### 解決方案
### 請參考圖片說明

- 找出start_char
- 找出end_char
- 取出offset_mapping
- 取出tokenizer的sequence_ids
- 找出context_start
- 找出context_end
- 由前往後找,找出答案的起始index
- 由後往前找,找出答案的結束index

![](./images/pic3.png)

In [53]:
# Encode the (question, context) pairs again, this time also asking the
# tokenizer for the offsets of every token.
tokenized_dataset = tokenizer(
    text=datasets["train"]["question"],
    text_pair=datasets["train"]["context"],
    return_offsets_mapping=True,  # needed for 'offset_mapping' to be present in the result
    padding=True,
    truncation="only_second",
    max_length=512,
)

print(tokenized_dataset.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])


In [54]:
# Take 'offset_mapping' out of the encoding and keep it in its own variable.
# NOTE(review): pop() removes the key from tokenized_dataset, so re-running this
# cell without re-running the tokenizer cell first will presumably raise KeyError.
offset_mapping = tokenized_dataset.pop('offset_mapping')

In [55]:
# Offsets of the first example: one (start, end) pair per token position
print(offset_mapping[0])

[(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 10), (10, 11), (11, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]


In [56]:
# Answer of the first training record (select the row first, then the column)
answer = datasets["train"][0]["answer"]
answer

{'answer_start': [6], 'text': ['1997年10月9日']}

In [57]:
# Character index at which the first listed answer begins
start_char = answer['answer_start'][0]
start_char

6

In [58]:
# Exclusive end index of the answer: start index plus the answer text's length
end_char = start_char + len(answer['text'][0])
end_char

16

In [59]:
# NOTE(review): despite its name, `context` holds the sequence ids of example 0,
# not the context text. In the printed list, 0 marks positions of the first text
# (question), 1 the second text (context), and None the remaining positions.
context = tokenized_dataset.sequence_ids(0) # sequence_ids of example 0
print(context) 

[None, 0, 0, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, None, None, None, None, None, None, None, None, None, None, None, None, None]


In [60]:
context_start = context.index(1) # index of the first position whose sequence id is 1
print(context_start) # token index at which the run of 1s begins

9


In [61]:
context_end = context.index(None,context_start) - 1 # one before the first None found at or after context_start, i.e. the last position whose sequence id is 1
print(context_end) # token index at which the run of 1s ends


23


In [62]:
# Offset pairs of the first example, kept under a shorter name
offset = offset_mapping[0]
print(offset)

[(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 10), (10, 11), (11, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]


In [63]:
# Encode the first context on its own (no question). The text is read from the
# dataset instead of being retyped, so it cannot drift out of sync with the
# record defined at the top of the notebook.
tokenizer(datasets['train']['context'][0])

{'input_ids': [101, 2207, 5739, 4638, 4495, 3189, 3221, 8387, 2399, 8108, 3299, 130, 3189, 117, 1957, 2595, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [64]:
# End offset (second element of the pair) of the token at context_end
offset[context_end][1]

19