In [23]:
"""
Raw text -> tokens -> Special tokens -> input IDs

Raw text: Let's try to tokenize!
tokens: ['let', "'", 's', 'try', 'to', 'token', '##ize', '!']
Special tokens: ['[CLS]', 'let', "'", 's', 'try', 'to', 'token', '##ize', '!', '[SEP]']
input IDs: [101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 999, 102]
"""

from transformers import AutoTokenizer

# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Manual pipeline, one step at a time:
#   raw text -> sub-word tokens -> vocabulary IDs -> IDs wrapped with special tokens.
tokens = tokenizer.tokenize("Let's try to tokenize!")
input_ids = tokenizer.convert_tokens_to_ids(tokens)
final_inputs = tokenizer.prepare_for_model(input_ids)

print(tokens)
print(input_ids)
print(final_inputs["input_ids"])

# Calling the tokenizer directly performs all of the steps above in one go.
inputs = tokenizer("Let's try to tokenize!")
# Use `inputs` (the encoding above), not the builtin `input` function, which
# is not subscriptable and would raise a TypeError on a fresh kernel.
print(inputs["input_ids"])

# Decoding the IDs gives back the (lower-cased) text including special tokens.
back_to_inputs = tokenizer.decode(inputs["input_ids"])
print(back_to_inputs)

print("----")
print(inputs)

['let', "'", 's', 'try', 'to', 'token', '##ize', '!']
[2292, 1005, 1055, 3046, 2000, 19204, 4697, 999]
[101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 999, 102]
[101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 999, 102]
[CLS] let's try to tokenize! [SEP]
----
{'input_ids': [101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [24]:
# A small batch of sentences with different lengths.
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]

# padding=True: shorter sentences are filled with padding (0) so that every
# sequence matches the length of the longest sentence in the batch.
encoded_input = tokenizer(
    batch_sentences,
    padding=True,
)
print(encoded_input)

{'input_ids': [[101, 2021, 2054, 2055, 2117, 6350, 1029, 102, 0, 0, 0, 0, 0, 0], [101, 2123, 1005, 1056, 2228, 2002, 4282, 2055, 2117, 6350, 1010, 28315, 1012, 102], [101, 2054, 2055, 5408, 14625, 1029, 102, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]]}


In [25]:
# truncation=True: cuts a sequence down when it is too long for the model
# to handle; padding=True still pads up to the longest sentence in the batch.
encoded_input = tokenizer(
    batch_sentences,
    padding=True,
    truncation=True,
)
print(encoded_input)

{'input_ids': [[101, 2021, 2054, 2055, 2117, 6350, 1029, 102, 0, 0, 0, 0, 0, 0], [101, 2123, 1005, 1056, 2228, 2002, 4282, 2055, 2117, 6350, 1010, 28315, 1012, 102], [101, 2054, 2055, 5408, 14625, 1029, 102, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]]}


In [26]:
# return_tensors="pt": returns actual tensors that can be used with PyTorch
# instead of plain Python lists.
encoded_input = tokenizer(
    batch_sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(encoded_input)

{'input_ids': tensor([[  101,  2021,  2054,  2055,  2117,  6350,  1029,   102,     0,     0,
             0,     0,     0,     0],
        [  101,  2123,  1005,  1056,  2228,  2002,  4282,  2055,  2117,  6350,
          1010, 28315,  1012,   102],
        [  101,  2054,  2055,  5408, 14625,  1029,   102,     0,     0,     0,
             0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])}
