In [1]:
# !pip install transformers

In [2]:
from transformers import AutoModel, AutoTokenizer, BertTokenizer

In [3]:
BERT_MODEL_NAME = 'bert-base-cased'

bert_model = AutoModel.from_pretrained(BERT_MODEL_NAME)

## Tokenizer

In [4]:
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

In [5]:
print(bert_tokenizer.vocab_size)

28996


In [6]:
for i, key in enumerate(bert_tokenizer.get_vocab()):
    print(key)
    if i > 10: break

ṅ
glowing
surge
deals
recorded
bye
Cricket
##alaya
Beacon
outs
requirement
##used


In [7]:
type(bert_tokenizer)

transformers.models.bert.tokenization_bert_fast.BertTokenizerFast

In [8]:
sample1 = 'I am interested in data science'
sample2 = 'Iaminterestedindatascience'

In [9]:
tokenized_input_text = bert_tokenizer(sample1, return_tensors='pt')
for key, value in tokenized_input_text.items():
    print(f'{key}: \n\t{value}')

input_ids: 
	tensor([[ 101,  146, 1821, 3888, 1107, 2233, 2598,  102]])
token_type_ids: 
	tensor([[0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask: 
	tensor([[1, 1, 1, 1, 1, 1, 1, 1]])


In [10]:
tokenized_input_text_merged = bert_tokenizer(sample2, return_tensors='pt')
for key, value in tokenized_input_text_merged.items():
    print(f'{key}: \n\t{value}')

input_ids: 
	tensor([[  101,   146, 11787, 22456, 24732, 20344, 10401, 25982,  3633,   102]])
token_type_ids: 
	tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask: 
	tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [11]:
# print(tokenized_input_text['input_ids'])
print(tokenized_input_text.input_ids)

# print(tokenized_input_text['token_type_ids'])
print(tokenized_input_text.token_type_ids)

# print(tokenized_input_text['attention_mask'])
print(tokenized_input_text.attention_mask)

tensor([[ 101,  146, 1821, 3888, 1107, 2233, 2598,  102]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1]])


In [12]:
tokenized_text = bert_tokenizer.tokenize(sample1)
print(tokenized_text)

input_ids = bert_tokenizer.encode(sample1)
print(input_ids)

decoded_ids = bert_tokenizer.decode(input_ids)
print(decoded_ids)

['I', 'am', 'interested', 'in', 'data', 'science']
[101, 146, 1821, 3888, 1107, 2233, 2598, 102]
[CLS] I am interested in data science [SEP]


In [13]:
tokenized_text = bert_tokenizer.tokenize(sample1, add_special_tokens=False)
print(tokenized_text)

input_ids = bert_tokenizer.encode(sample1, add_special_tokens=False)
print(input_ids)

decoded_ids = bert_tokenizer.decode(input_ids)
print(decoded_ids)

['I', 'am', 'interested', 'in', 'data', 'science']
[146, 1821, 3888, 1107, 2233, 2598]
I am interested in data science


In [14]:
tokenized_text = bert_tokenizer.tokenize(
    sample1,
    add_special_tokens=False,
    max_length=5,
    truncation=True
)
tokenized_text

['I', 'am', 'interested', 'in', 'data']

In [15]:
input_ids = bert_tokenizer.encode(
    sample1,
    add_special_tokens=False,
    max_length=5,
    truncation=True
)
decoded_ids = bert_tokenizer.decode(input_ids)
input_ids, decoded_ids

([146, 1821, 3888, 1107, 2233], 'I am interested in data')

In [16]:
print(bert_tokenizer.pad_token)
print(bert_tokenizer.pad_token_id)

[PAD]
0


In [17]:
tokenized_text = bert_tokenizer.tokenize(
    sample1,
    add_special_tokens=False,
    max_length=20,
    padding='max_length'
)
print(tokenized_text)

input_ids = bert_tokenizer.encode(
    sample1,
    add_special_tokens=False,
    max_length=20,
    padding='max_length'
)
print(input_ids)

decoded_ids = bert_tokenizer.decode(input_ids)
print(decoded_ids)

['I', 'am', 'interested', 'in', 'data', 'science', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
[146, 1821, 3888, 1107, 2233, 2598, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
I am interested in data science [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


## 한국어 Tokenizer

In [18]:
kor_text = '아버지 가방에 들어가신다'

In [19]:
tokenized_text = bert_tokenizer.tokenize(
    kor_text,
    add_special_tokens=False,
    max_length=20,
    padding='max_length'
)
print(tokenized_text)

['[UNK]', '[UNK]', '[UNK]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [20]:
tokenized_input_text = bert_tokenizer(kor_text, return_tensors='pt')
for key, value in tokenized_input_text.items():
    print(f'{key}: \n\t{value}')

input_ids: 
	tensor([[101, 100, 100, 100, 102]])
token_type_ids: 
	tensor([[0, 0, 0, 0, 0]])
attention_mask: 
	tensor([[1, 1, 1, 1, 1]])


In [21]:
tokenized_text = bert_tokenizer.tokenize(kor_text)
print(tokenized_text)

input_ids = bert_tokenizer.encode(kor_text)
print(input_ids)

decoded_ids = bert_tokenizer.decode(input_ids)
print(decoded_ids)

['[UNK]', '[UNK]', '[UNK]']
[101, 100, 100, 100, 102]
[CLS] [UNK] [UNK] [UNK] [SEP]


In [22]:
MULTI_BERT_MODEL_NAME = 'bert-base-multilingual-cased'

multi_bert_model = AutoModel.from_pretrained(MULTI_BERT_MODEL_NAME)
multi_bert_tokenizer = AutoTokenizer.from_pretrained(MULTI_BERT_MODEL_NAME)

In [23]:
unk_text = '한꾺인 뜰만 알아뽈 쑤 있꼐 짞썽하꼤씁니따'

In [24]:
tokenized_text = multi_bert_tokenizer.tokenize(unk_text, add_special_tokens=False)
print(tokenized_text)

input_ids = multi_bert_tokenizer.encode(unk_text, add_special_tokens=False)
print(input_ids)

decoded_ids = multi_bert_tokenizer.decode(input_ids)
print(decoded_ids)

['[UNK]', '[UNK]', '[UNK]', '쑤', '[UNK]', '[UNK]']
[100, 100, 100, 9510, 100, 100]
[UNK] [UNK] [UNK] 쑤 [UNK] [UNK]


In [25]:
added_token_num = multi_bert_tokenizer.add_tokens(['한꾺인', '뜰만', '알아뽈', '있꼐', '짞썽하꼤씁니따'])
print(added_token_num)

tokenized_text = multi_bert_tokenizer.tokenize(unk_text, add_special_tokens=False)
print(tokenized_text)

input_ids = multi_bert_tokenizer.encode(unk_text, add_special_tokens=False)
print(input_ids)

decoded_ids = multi_bert_tokenizer.decode(input_ids)
print(decoded_ids)

5
['한꾺인', '뜰만', '알아뽈', '쑤', '있꼐', '짞썽하꼤씁니따']
[119547, 119548, 119549, 9510, 119550, 119551]
한꾺인 뜰만 알아뽈 쑤 있꼐 짞썽하꼤씁니따


In [26]:
special_token_text = '[DAD]아빠[/DAD]가 방에 들어가신다'

tokenized_text = multi_bert_tokenizer.tokenize(special_token_text, add_special_tokens=False)
print(tokenized_text)

input_ids = multi_bert_tokenizer.encode(special_token_text, add_special_tokens=False)
print(input_ids)

decoded_ids = multi_bert_tokenizer.decode(input_ids)
print(decoded_ids)

['[', 'DA', '##D', ']', '아', '##빠', '[', '/', 'DA', '##D', ']', '가', '방', '##에', '들어', '##가', '##신', '##다']
[164, 47855, 11490, 166, 9519, 119008, 164, 120, 47855, 11490, 166, 8843, 9328, 10530, 71568, 11287, 25387, 11903]
[ DAD ] 아빠 [ / DAD ] 가 방에 들어가신다


In [27]:
special_token_text = '[DAD]아빠[/DAD]가 방에 들어가신다'

added_token_num = multi_bert_tokenizer.add_special_tokens({'additional_special_tokens': ['[DAD]', '[/DAD]']})

tokenized_text = multi_bert_tokenizer.tokenize(special_token_text, add_special_tokens=False)
print(tokenized_text)

input_ids = multi_bert_tokenizer.encode(special_token_text, add_special_tokens=False)
print(input_ids)

decoded_ids = multi_bert_tokenizer.decode(input_ids)
print(decoded_ids)

['[DAD]', '아', '##빠', '[/DAD]', '가', '방', '##에', '들어', '##가', '##신', '##다']
[119552, 9519, 119008, 119553, 8843, 9328, 10530, 71568, 11287, 25387, 11903]
[DAD] 아빠 [/DAD] 가 방에 들어가신다


In [28]:
decoded_ids = multi_bert_tokenizer.decode(input_ids, skip_special_tokens=True)
print(decoded_ids)

아빠 가 방에 들어가신다


In [29]:
sample_list = [
    '아빠가 방에 들어가신다',
    '[DAD]아빠[/DAD]가방에들어가신다'
]

tokens = multi_bert_tokenizer(
    sample_list,
    padding=True
)

for i in range(2):
    print(f'Tokens (int):       {tokens['input_ids'][i]}')
    print(f'Tokens (str):       {[multi_bert_tokenizer.convert_ids_to_tokens(s) for s in tokens['input_ids']]}')
    print(f'Tokens (attn_mask): {tokens['attention_mask'][i]}')
    print()

Tokens (int):       [101, 9519, 119008, 11287, 9328, 10530, 71568, 11287, 25387, 11903, 102, 0, 0]
Tokens (str):       [['[CLS]', '아', '##빠', '##가', '방', '##에', '들어', '##가', '##신', '##다', '[SEP]', '[PAD]', '[PAD]'], ['[CLS]', '[DAD]', '아', '##빠', '[/DAD]', '가', '##방', '##에', '##들어', '##가', '##신', '##다', '[SEP]']]
Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]

Tokens (int):       [101, 119552, 9519, 119008, 119553, 8843, 42337, 10530, 93200, 11287, 25387, 11903, 102]
Tokens (str):       [['[CLS]', '아', '##빠', '##가', '방', '##에', '들어', '##가', '##신', '##다', '[SEP]', '[PAD]', '[PAD]'], ['[CLS]', '[DAD]', '아', '##빠', '[/DAD]', '가', '##방', '##에', '##들어', '##가', '##신', '##다', '[SEP]']]
Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]



## [MASK] 토큰 예측

In [30]:
masked_text = '아빠가 [MASK] 들어가신다'
tokenized_text = multi_bert_tokenizer.tokenize(masked_text)

print(tokenized_text)

['아', '##빠', '##가', '[MASK]', '들어', '##가', '##신', '##다']


In [31]:
from transformers import pipeline

nlp_fill = pipeline('fill-mask', model=MULTI_BERT_MODEL_NAME)
nlp_fill(masked_text)




BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with anot

[{'score': 0.05300595983862877,
  'token': 9654,
  'token_str': '잘',
  'sequence': '아빠가 잘 들어가신다'},
 {'score': 0.05062916874885559,
  'token': 11287,
  'token_str': '##가',
  'sequence': '아빠가가 들어가신다'},
 {'score': 0.047388385981321335,
  'token': 8982,
  'token_str': '나',
  'sequence': '아빠가 나 들어가신다'},
 {'score': 0.03328689560294151,
  'token': 9056,
  'token_str': '다',
  'sequence': '아빠가 다 들어가신다'},
 {'score': 0.026803359389305115,
  'token': 14867,
  'token_str': '##면',
  'sequence': '아빠가면 들어가신다'}]

###### 한국어 전용 모델이 아닌 multi-lingual BERT를 활용했더니 아쉬운 결과가 나타난다 ..

## 영어 문장에서 [MASK] 토큰 예측

In [32]:
masked_text = 'Leave before you [MASK] me'
tokenized_text = multi_bert_tokenizer.tokenize(masked_text)

print(tokenized_text)

['Leave', 'before', 'you', '[MASK]', 'me']


In [33]:
from transformers import pipeline

nlp_fill = pipeline('fill-mask', model=MULTI_BERT_MODEL_NAME)
nlp_fill(masked_text)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'score': 0.3139025568962097,
  'token': 11248,
  'token_str': 'Love',
  'sequence': 'Leave before you Love me'},
 {'score': 0.04809042811393738,
  'token': 16138,
  'token_str': 'love',
  'sequence': 'Leave before you love me'},
 {'score': 0.04090191423892975,
  'token': 10169,
  'token_str': 'with',
  'sequence': 'Leave before you with me'},
 {'score': 0.035010598599910736,
  'token': 99401,
  'token_str': 'Loved',
  'sequence': 'Leave before you Loved me'},
 {'score': 0.030007442459464073,
  'token': 10135,
  'token_str': 'on',
  'sequence': 'Leave before you on me'}]

In [35]:
tokens_pt = multi_bert_tokenizer('아빠가 방에 들어가신다', return_tensors='pt')
for key, value in tokens_pt.items():
    print(f'{key}:\n\t{value}')
    
outputs = multi_bert_model(**tokens_pt)
last_hidden_state = outputs.last_hidden_state
pooler_output = outputs.pooler_output

print(f'\nToken wise output: {last_hidden_state.shape}, Pooled output: {pooler_output.shape}')

input_ids:
	tensor([[   101,   9519, 119008,  11287,   9328,  10530,  71568,  11287,  25387,
          11903,    102]])
token_type_ids:
	tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask:
	tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

Token wise output: torch.Size([1, 11, 768]), Pooled output: torch.Size([1, 768])


In [36]:
print(pooler_output)

tensor([[ 1.9587e-01, -2.6240e-03,  2.1747e-01, -7.7761e-02, -1.7742e-03,
          4.5689e-01,  5.9940e-02,  2.5821e-01, -3.6000e-01,  2.5447e-01,
          1.8904e-02, -1.8748e-01, -1.4784e-01, -1.1042e-01,  1.4066e-01,
         -1.7055e-01,  4.9949e-01,  4.0194e-02, -6.1990e-02, -3.2384e-01,
         -9.9998e-01, -1.0523e-01, -1.9167e-01, -5.8405e-02, -2.5229e-01,
         -1.8232e-02, -1.0872e-01, -4.8203e-02,  7.6683e-02, -1.2361e-01,
         -7.1717e-02, -9.9999e-01,  3.4002e-01,  4.9053e-01,  1.4669e-01,
         -1.1793e-02,  5.8997e-02,  2.0018e-01,  1.8910e-01, -3.4949e-01,
         -1.8250e-01,  3.9466e-03, -9.3936e-02,  1.9231e-01, -1.2782e-01,
         -2.0957e-01, -1.0196e-01,  1.7126e-01, -2.4489e-01,  9.5668e-02,
          1.2766e-01,  1.9946e-01,  3.0436e-01,  2.3804e-01,  1.3247e-01,
          1.6288e-01,  1.1462e-01,  1.5527e-01,  2.3521e-01, -2.1541e-03,
         -2.7164e-02,  1.0165e-01, -2.4749e-02, -1.1822e-01, -1.8863e-01,
         -3.7550e-01, -9.1158e-02, -1.

In [39]:
print(multi_bert_model.get_input_embeddings())
multi_bert_model.resize_token_embeddings(multi_bert_tokenizer.vocab_size + added_token_num)
print(multi_bert_model.get_input_embeddings())

Embedding(119547, 768, padding_idx=0)


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(119549, 768, padding_idx=0)
