每種模型使用的Tokenizer都不大一樣
- 使用AutoTokenizer,可以依據不同模型,自動傳回對應的Tokenizer

In [1]:
from transformers import AutoTokenizer

In [8]:
# Sample sentence reused by the tokenization cells that follow.
sentence = "職能發展學院"

In [21]:
# Resolve the tokenizer from the checkpoint name; the bare name on the last
# line makes the notebook display the loaded tokenizer's repr.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased")
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-multilingual-cased', vocab_size=119547, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [23]:
# Split the sentence into token strings (not ids yet) and print them.
sentence = "職能發展學院"
tokens = tokenizer.tokenize(sentence)
print(tokens)

['職', '能', '發', '展', '學', '院']


In [22]:
# Display the vocabulary mapping: token string -> integer id.
tokenizer.vocab

{'##жбы': 93001,
 'телескоп': 92627,
 'Fu': 17056,
 '##וק': 23732,
 'MG': 38668,
 'tango': 77049,
 'albo': 51240,
 'đổ': 48424,
 'faces': 48343,
 'moyenne': 26558,
 '##llning': 91227,
 '##द्य': 97110,
 'arról': 94677,
 'riječ': 103786,
 'ware': 88902,
 'scientific': 23301,
 'результатами': 93172,
 'יותר': 14371,
 'Denkmalschutz': 77418,
 'sends': 104838,
 '##ంటారు': 79350,
 '牆': 5394,
 '##ckt': 25028,
 '##рви': 93162,
 'ATP': 18316,
 'farm': 30568,
 '郷': 7836,
 'Vega': 26810,
 '##கா': 105076,
 '27': 10365,
 'کینیڈا': 106013,
 '##давно': 95227,
 'Henrik': 19965,
 '##քերի': 68806,
 'Mahalleye': 97530,
 'antic': 33687,
 'independência': 97924,
 '##にも': 100955,
 'komen': 23504,
 '1685': 36960,
 '宓': 3383,
 'trouver': 42633,
 '##řila': 74435,
 '##੪': 111263,
 'Герб': 92709,
 'Vila': 23117,
 'disk': 50169,
 'staatlichen': 82278,
 '##risë': 75776,
 '##нии': 17240,
 'solaire': 107743,
 'urriaren': 108446,
 '##יקר': 75888,
 '##cnica': 107616,
 'tránh': 92890,
 '##懶': 113728,
 'presenting': 1039

In [24]:
# Vocabulary size reported by the tokenizer.
tokenizer.vocab_size

119547

In [25]:
# Map each token string to its vocabulary id; no special tokens are added here.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[6492, 6546, 5714, 3495, 3370, 8222]

In [26]:
# Reverse lookup: ids back to their token strings.
tokenizer.convert_ids_to_tokens(ids)

['職', '能', '發', '展', '學', '院']

In [27]:
# Join the tokens into one string; for this input the pieces come back
# separated by spaces (see the output below).
tokenizer.convert_tokens_to_string(tokens)

'職 能 發 展 學 院'

In [28]:
# encode() goes straight from text to ids and, by default, wraps them in the
# special tokens [CLS] (101) and [SEP] (102).
ids = tokenizer.encode(sentence)
ids

[101, 6492, 6546, 5714, 3495, 3370, 8222, 102]

In [29]:
# Same encoding with the [CLS]/[SEP] wrapper switched off.
ids = tokenizer.encode(sentence, add_special_tokens=False)
ids

[6492, 6546, 5714, 3495, 3370, 8222]

In [30]:
# Turn ids back into text, dropping any special tokens.
str_sen = tokenizer.decode(ids,skip_special_tokens=True)
str_sen

'職 能 發 展 學 院'

In [None]:
# Padding: right-pad with [PAD] (id 0) until the sequence reaches max_length.
ids = tokenizer.encode(sentence, padding="max_length", max_length=15)
ids

[101, 6492, 6546, 5714, 3495, 3370, 8222, 102, 0, 0, 0, 0, 0, 0, 0]

In [32]:
# Truncation: cap the encoded sequence at max_length ids; the output below
# still starts with [CLS] (101) and ends with [SEP] (102).
tokenizer.encode(sentence, max_length=5, truncation=True)

[101, 6492, 6546, 5714, 102]

In [33]:
# Re-create the padded ids so the following cells can derive masks from them.
ids = tokenizer.encode(sentence, padding="max_length", max_length=15)
ids

[101, 6492, 6546, 5714, 3495, 3370, 8222, 102, 0, 0, 0, 0, 0, 0, 0]

In [None]:
# Manually build attention_mask: 1 for real tokens, 0 for padding positions.
# Compare against the tokenizer's own pad id instead of a literal 0 so the
# cell stays correct for tokenizers whose [PAD] id is not 0.
attention_mask = [int(token_id != tokenizer.pad_token_id) for token_id in ids]
attention_mask

[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]

In [37]:
# Manually build token_type_ids: one zero for every position in ids.
token_type_ids = [0 for _ in ids]
token_type_ids

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [38]:
# One-shot call: returns input_ids together with attention_mask in one result.
# NOTE(review): encode_plus is documented upstream as deprecated in favor of
# calling the tokenizer directly — confirm against the installed version.
inputs = tokenizer.encode_plus(sentence, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 6492, 6546, 5714, 3495, 3370, 8222, 102, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]}

In [39]:
# Calling the tokenizer object itself (__call__); with these arguments it
# yields the same input_ids / attention_mask as the encode_plus cell.
inputs = tokenizer(sentence, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 6492, 6546, 5714, 3495, 3370, 8222, 102, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]}

In [41]:
# Batch input: passing a list of sentences yields one id list and one mask
# per sentence; without padding the lists keep their differing lengths.
sentences = [
    '職能發展學院',
    '通訊等職類勞動力需求與職能基準研究',
    '職類職能培育機構督導考核及其他有關行業職能發展等事項']

res = tokenizer(sentences)
res

{'input_ids': [[101, 6492, 6546, 5714, 3495, 3370, 8222, 102], [101, 7735, 7172, 6069, 6492, 8408, 2628, 2621, 2594, 8301, 4875, 6631, 6492, 6546, 3099, 5111, 5832, 6003, 102], [101, 6492, 8408, 6492, 6546, 3098, 6518, 4741, 4698, 5785, 3458, 6456, 4578, 2730, 2460, 2196, 4461, 8160, 7069, 4671, 6492, 6546, 5714, 3495, 6069, 2149, 8375, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [45]:
%%time

# Timing baseline: tokenize the same sentence 1000 times, one call per item.
for i in range(1000):
    tokenizer(sentence)

CPU times: user 34.1 ms, sys: 2.03 ms, total: 36.1 ms
Wall time: 34.7 ms


In [46]:
%%time
# Same workload as a single batched call over 1000 copies of the sentence.
res = tokenizer([sentence] * 1000)

CPU times: user 22.8 ms, sys: 730 μs, total: 23.5 ms
Wall time: 9.12 ms


In [47]:
# Display the tokenizer's repr again.
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-multilingual-cased', vocab_size=119547, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)