#### Tokenizer 基本使用

In [1]:
from pprint import pprint

from transformers import AutoTokenizer

In [2]:
# Load from the Hugging Face Hub: passing a model name loads the matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer
# The files are downloaded (cached) in advance under ~/.cache/huggingface/hub/

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [3]:
# Save the tokenizer to a local directory; the displayed result lists the files written.
tokenizer.save_pretrained("./roberta_tokenizer")

('./roberta_tokenizer/tokenizer_config.json',
 './roberta_tokenizer/special_tokens_map.json',
 './roberta_tokenizer/vocab.txt',
 './roberta_tokenizer/added_tokens.json',
 './roberta_tokenizer/tokenizer.json')

In [4]:
# Load the tokenizer from the local directory instead of the Hub.
# This rebinds `tokenizer`, so later cells use the locally loaded copy.
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer/")

#### 句子分詞

In [5]:
# Sample sentence reused by the tokenization / encoding cells that follow.
sen = "弱小的我也有大夢想!"

In [6]:
# Split the sentence into tokens (strings, not ids yet).
# For this sentence every character ends up as its own token.
tokens = tokenizer.tokenize(sen)
tokens

['弱', '小', '的', '我', '也', '有', '大', '夢', '想', '!']

#### 查看詞典

In [11]:
# Inspect the vocabulary. Only the last expression of a cell is rendered, so the
# recorded output is tokenizer.vocab_size; the first two expressions are
# evaluated but their values are not shown.
# NOTE(review): run each line in its own cell (or display() them) to see all three.
tokenizer.vocab
len(tokenizer.vocab)
tokenizer.vocab_size

21128

#### 索引轉換

In [12]:
# Convert the token sequence into a sequence of vocabulary ids.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[2483, 2207, 4638, 2769, 738, 3300, 1920, 1918, 2682, 106]

In [13]:
# Convert the id sequence back into a token sequence (round-trips to the same tokens).
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['弱', '小', '的', '我', '也', '有', '大', '夢', '想', '!']

#### 更便捷的實現方式

In [14]:
# encode() turns a string directly into an id sequence; this step is called "encoding".
# Compared with the manual tokenize + convert_tokens_to_ids route, the result has
# the special ids 101 ([CLS]) prepended and 102 ([SEP]) appended.
ids = tokenizer.encode(sen)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 1918, 2682, 106, 102]

In [19]:
# Passing add_special_tokens=True explicitly yields the same ids as the plain
# encode(sen) call: 101 at the front and 102 at the end are still present.
ids = tokenizer.encode(sen,add_special_tokens=True)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 1918, 2682, 106, 102]

In [20]:
# Convert the id sequence back into a string; this step is called "decoding".
# skip_special_tokens=False keeps [CLS] and [SEP] in the decoded text.
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen

'[CLS] 弱 小 的 我 也 有 大 夢 想! [SEP]'

In [21]:
# Padding: extend the sequence to max_length=15; the extra positions at the end
# are filled with 0, the id of [PAD].
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 1918, 2682, 106, 102, 0, 0, 0]

In [22]:
# Truncation: cut the sequence down to max_length=5 ids in total.
# The result still starts with 101 and ends with 102, so only 3 content ids remain.
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids

[101, 2483, 2207, 4638, 102]

#### 其他輸入部分

In [24]:
# Rebuild the padded ids (length 15) so that `ids` no longer holds the truncated
# sequence; the cells below derive the remaining model inputs from it by hand.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 1918, 2682, 106, 102, 0, 0, 0]

In [25]:
# Build the attention mask by hand: 1 for real tokens, 0 for padding positions.
# Compare against the tokenizer's own pad id instead of a literal 0 so the cell
# stays correct for tokenizers whose [PAD] id is not 0 (here it is 0, so the
# result is unchanged).
attention_mask = [1 if idx != tokenizer.pad_token_id else 0 for idx in ids]
attention_mask

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]

In [26]:
# Segment ids built by hand: one 0 per position of `ids`, which is what the
# tokenizer itself reports for this single-sentence input.
token_type_ids = [0 for _ in ids]
token_type_ids

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

#### 自動完成的方式

In [28]:
# encode_plus() returns input_ids, token_type_ids and attention_mask in one call,
# matching the three lists built by hand above.
# NOTE(review): the transformers docs mark encode_plus as deprecated in favour of
# calling the tokenizer directly — confirm for the installed version.
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 1918, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

In [29]:
# Calling the tokenizer object directly gives the same dict as encode_plus():
# input_ids, token_type_ids and attention_mask, padded to max_length=15.
inputs = tokenizer(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 1918, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

#### 處理batch數據


In [32]:
# Batch input: passing a list of sentences returns one entry per sentence under
# each key. No padding is requested, so the per-sentence lists keep their own lengths.
# `pprint` comes from the import cell at the top of the notebook.
sens = [
    '弱小的我也有大夢想',
    '有夢想誰都了不起',
    '追逐夢想的心，比夢想本身，更可貴',
]

res = tokenizer(sens)
pprint(res)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 1918, 2682, 102],
               [101, 3300, 1918, 2682, 6306, 6963, 749, 679, 6629, 102],
               [101,
                6841,
                6852,
                1918,
                2682,
                4638,
                2552,
                8024,
                3683,
                1918,
                2682,
                3315,
                6716,
                8024,
                3291,
                1377,
                6523,
                102]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}


In [35]:
%%time
# Timing baseline: tokenize one sentence per call, repeated 1000 times.
# The results are discarded; only the elapsed time reported by %%time matters.
for i in range(1000):
    tokenizer(sen)

CPU times: user 81.1 ms, sys: 0 ns, total: 81.1 ms
Wall time: 80.8 ms


In [36]:
%%time
# Timing comparison: tokenize the same sentence 1000 times as a single batch call.
# The recorded wall time is lower than for the one-call-per-sentence loop.
res = tokenizer([sen] * 1000)

CPU times: user 63.1 ms, sys: 3.42 ms, total: 66.5 ms
Wall time: 27.9 ms


#### Fast / Slow Tokenizer