In [1]:
# !pip install transformers

In [2]:
from transformers import AutoModel, AutoTokenizer, BertTokenizer

In [3]:
BERT_MODEL_NAME = 'bert-base-cased'

bert_model = AutoModel.from_pretrained(BERT_MODEL_NAME)

## Tokenizer

In [4]:
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

In [5]:
print(bert_tokenizer.vocab_size)

28996


In [6]:
for i, key in enumerate(bert_tokenizer.get_vocab()):
    print(key)
    if i > 10: break

dimensional
Regiment
##ING
##atus
##graphical
1910
else
hardened
##uku
Football
tickets
tiger


In [7]:
type(bert_tokenizer)

transformers.models.bert.tokenization_bert_fast.BertTokenizerFast

In [8]:
sample1 = 'I am interested in data science'
sample2 = 'Iaminterestedindatascience'

In [9]:
tokenized_input_text = bert_tokenizer(sample1, return_tensors='pt')
for key, value in tokenized_input_text.items():
    print(f'{key}: \n\t{value}')

input_ids: 
	tensor([[ 101,  146, 1821, 3888, 1107, 2233, 2598,  102]])
token_type_ids: 
	tensor([[0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask: 
	tensor([[1, 1, 1, 1, 1, 1, 1, 1]])


In [10]:
tokenized_input_text_merged = bert_tokenizer(sample2, return_tensors='pt')
for key, value in tokenized_input_text_merged.items():
    print(f'{key}: \n\t{value}')

input_ids: 
	tensor([[  101,   146, 11787, 22456, 24732, 20344, 10401, 25982,  3633,   102]])
token_type_ids: 
	tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask: 
	tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [11]:
# print(tokenized_input_text['input_ids'])
print(tokenized_input_text.input_ids)

# print(tokenized_input_text['token_type_ids'])
print(tokenized_input_text.token_type_ids)

# print(tokenized_input_text['attention_mask'])
print(tokenized_input_text.attention_mask)

tensor([[ 101,  146, 1821, 3888, 1107, 2233, 2598,  102]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1]])


In [12]:
tokenized_text = bert_tokenizer.tokenize(sample1)
print(tokenized_text)

input_ids = bert_tokenizer.encode(sample1)
print(input_ids)

decoded_ids = bert_tokenizer.decode(input_ids)
print(decoded_ids)

['I', 'am', 'interested', 'in', 'data', 'science']
[101, 146, 1821, 3888, 1107, 2233, 2598, 102]
[CLS] I am interested in data science [SEP]


In [13]:
tokenized_text = bert_tokenizer.tokenize(sample1, add_special_tokens=False)
print(tokenized_text)

input_ids = bert_tokenizer.encode(sample1, add_special_tokens=False)
print(input_ids)

decoded_ids = bert_tokenizer.decode(input_ids)
print(decoded_ids)

['I', 'am', 'interested', 'in', 'data', 'science']
[146, 1821, 3888, 1107, 2233, 2598]
I am interested in data science


In [14]:
tokenized_text = bert_tokenizer.tokenize(
    sample1,
    add_special_tokens=False,
    max_length=5,
    truncation=True
)
tokenized_text

['I', 'am', 'interested', 'in', 'data']

In [15]:
input_ids = bert_tokenizer.encode(
    sample1,
    add_special_tokens=False,
    max_length=5,
    truncation=True
)
decoded_ids = bert_tokenizer.decode(input_ids)
input_ids, decoded_ids

([146, 1821, 3888, 1107, 2233], 'I am interested in data')

In [16]:
print(bert_tokenizer.pad_token)
print(bert_tokenizer.pad_token_id)

[PAD]
0


In [17]:
tokenized_text = bert_tokenizer.tokenize(
    sample1,
    add_special_tokens=False,
    max_length=20,
    padding='max_length'
)
print(tokenized_text)

input_ids = bert_tokenizer.encode(
    sample1,
    add_special_tokens=False,
    max_length=20,
    padding='max_length'
)
print(input_ids)

decoded_ids = bert_tokenizer.decode(input_ids)
print(decoded_ids)

['I', 'am', 'interested', 'in', 'data', 'science', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
[146, 1821, 3888, 1107, 2233, 2598, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
I am interested in data science [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


## 한국어 Tokenizer

In [20]:
kor_text = '아버지 가방에 들어가신다'

In [22]:
tokenized_text = bert_tokenizer.tokenize(
    kor_text,
    add_special_tokens=False,
    max_length=20,
    padding='max_length'
)
print(tokenized_text)

['[UNK]', '[UNK]', '[UNK]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [23]:
tokenized_input_text = bert_tokenizer(kor_text, return_tensors='pt')
for key, value in tokenized_input_text.items():
    print(f'{key}: \n\t{value}')

input_ids: 
	tensor([[101, 100, 100, 100, 102]])
token_type_ids: 
	tensor([[0, 0, 0, 0, 0]])
attention_mask: 
	tensor([[1, 1, 1, 1, 1]])


In [24]:
tokenized_text = bert_tokenizer.tokenize(kor_text)
print(tokenized_text)

input_ids = bert_tokenizer.encode(kor_text)
print(input_ids)

decoded_ids = bert_tokenizer.decode(input_ids)
print(decoded_ids)

['[UNK]', '[UNK]', '[UNK]']
[101, 100, 100, 100, 102]
[CLS] [UNK] [UNK] [UNK] [SEP]


In [25]:
MULTI_BERT_MODEL_NAME = 'bert-base-multilingual-cased'

multi_bert_model = AutoModel.from_pretrained(MULTI_BERT_MODEL_NAME)
multi_bert_tokenizer = AutoTokenizer.from_pretrained(MULTI_BERT_MODEL_NAME)

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [26]:
unk_text = '한꾺인 뜰만 알아뽈 쑤 있꼐 짞썽하꼤씁니따'

In [27]:
tokenized_text = multi_bert_tokenizer.tokenize(unk_text, add_special_tokens=False)
print(tokenized_text)

input_ids = multi_bert_tokenizer.encode(unk_text, add_special_tokens=False)
print(input_ids)

decoded_ids = multi_bert_tokenizer.decode(input_ids)
print(decoded_ids)

['[UNK]', '[UNK]', '[UNK]', '쑤', '[UNK]', '[UNK]']
[100, 100, 100, 9510, 100, 100]
[UNK] [UNK] [UNK] 쑤 [UNK] [UNK]


In [28]:
added_token_num = multi_bert_tokenizer.add_tokens(['한꾺인', '뜰만', '알아뽈', '있꼐', '짞썽하꼤씁니따'])
print(added_token_num)

tokenized_text = multi_bert_tokenizer.tokenize(unk_text, add_special_tokens=False)
print(tokenized_text)

input_ids = multi_bert_tokenizer.encode(unk_text, add_special_tokens=False)
print(input_ids)

decoded_ids = multi_bert_tokenizer.decode(input_ids)
print(decoded_ids)

5
['한꾺인', '뜰만', '알아뽈', '쑤', '있꼐', '짞썽하꼤씁니따']
[119547, 119548, 119549, 9510, 119550, 119551]
한꾺인 뜰만 알아뽈 쑤 있꼐 짞썽하꼤씁니따


In [29]:
special_token_text = '[DAD]아빠[/DAD]가 방에 들어가신다'

tokenized_text = multi_bert_tokenizer.tokenize(special_token_text, add_special_tokens=False)
print(tokenized_text)

input_ids = multi_bert_tokenizer.encode(special_token_text, add_special_tokens=False)
print(input_ids)

decoded_ids = multi_bert_tokenizer.decode(input_ids)
print(decoded_ids)

['[', 'DA', '##D', ']', '아', '##빠', '[', '/', 'DA', '##D', ']', '가', '방', '##에', '들어', '##가', '##신', '##다']
[164, 47855, 11490, 166, 9519, 119008, 164, 120, 47855, 11490, 166, 8843, 9328, 10530, 71568, 11287, 25387, 11903]
[ DAD ] 아빠 [ / DAD ] 가 방에 들어가신다


In [30]:
special_token_text = '[DAD]아빠[/DAD]가 방에 들어가신다'

added_token_num = multi_bert_tokenizer.add_special_tokens({'additional_special_tokens': ['[DAD]', '[/DAD]']})

tokenized_text = multi_bert_tokenizer.tokenize(special_token_text, add_special_tokens=False)
print(tokenized_text)

input_ids = multi_bert_tokenizer.encode(special_token_text, add_special_tokens=False)
print(input_ids)

decoded_ids = multi_bert_tokenizer.decode(input_ids)
print(decoded_ids)

['[DAD]', '아', '##빠', '[/DAD]', '가', '방', '##에', '들어', '##가', '##신', '##다']
[119552, 9519, 119008, 119553, 8843, 9328, 10530, 71568, 11287, 25387, 11903]
[DAD] 아빠 [/DAD] 가 방에 들어가신다


In [31]:
decoded_ids = multi_bert_tokenizer.decode(input_ids, skip_special_tokens=True)
print(decoded_ids)

아빠 가 방에 들어가신다


In [32]:
sample_list = [
    '아빠가 방에 들어가신다',
    '[DAD]아빠[/DAD]가방에들어가신다'
]

tokens = multi_bert_tokenizer(
    sample_list,
    padding=True
)

for i in range(2):
    print(f'Tokens (int):       {tokens['input_ids'][i]}')
    print(f'Tokens (str):       {[multi_bert_tokenizer.convert_ids_to_tokens(s) for s in tokens['input_ids']]}')
    print(f'Tokens (attn_mask): {tokens['attention_mask'][i]}')
    print()

Tokens (int):       [101, 9519, 119008, 11287, 9328, 10530, 71568, 11287, 25387, 11903, 102, 0, 0]
Tokens (str):       [['[CLS]', '아', '##빠', '##가', '방', '##에', '들어', '##가', '##신', '##다', '[SEP]', '[PAD]', '[PAD]'], ['[CLS]', '[DAD]', '아', '##빠', '[/DAD]', '가', '##방', '##에', '##들어', '##가', '##신', '##다', '[SEP]']]
Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]

Tokens (int):       [101, 119552, 9519, 119008, 119553, 8843, 42337, 10530, 93200, 11287, 25387, 11903, 102]
Tokens (str):       [['[CLS]', '아', '##빠', '##가', '방', '##에', '들어', '##가', '##신', '##다', '[SEP]', '[PAD]', '[PAD]'], ['[CLS]', '[DAD]', '아', '##빠', '[/DAD]', '가', '##방', '##에', '##들어', '##가', '##신', '##다', '[SEP]']]
Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

