In [18]:
import os
import json
import re
import pandas as pd
from tqdm.notebook import tqdm
import collections

## Tokenizer

In [2]:
from transformers import BertTokenizer

In [3]:
# Load the pretrained BERT tokenizer identified by 'klue/bert-base'.
tokenizer = BertTokenizer.from_pretrained(
    'klue/bert-base',
)



In [4]:
# Display the tokenizer's repr (vocab size, max length, special tokens).
tokenizer

PreTrainedTokenizer(name_or_path='klue/bert-base', vocab_size=32000, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
# Number of tokens the tokenizer reports; add_unused uses this same value as the vocabulary size.
len(tokenizer)

32000

In [6]:
# Delete every .txt/.json file already in ../Tokenizer (presumably so stale files
# from an earlier save do not linger). NOTE(review): the glob is not limited to
# tokenizer files, and `rm` makes this cell POSIX-only.
!rm -rf ../Tokenizer/*.txt ../Tokenizer/*.json
# Write the tokenizer files (config, special-token map, vocab.txt, ...) into ../Tokenizer.
tokenizer.save_pretrained('../Tokenizer')

('../Tokenizer/tokenizer_config.json',
 '../Tokenizer/special_tokens_map.json',
 '../Tokenizer/vocab.txt',
 '../Tokenizer/added_tokens.json')

## Customize vocab.txt 

In [7]:
def load_vocab(dir_path, file_name):
    """Read a one-token-per-line vocabulary file into an ``{index: token}`` dict.

    Args:
        dir_path: Directory that contains the vocabulary file.
        file_name: Name of the vocabulary file; must end with ``.txt``.

    Returns:
        dict mapping the zero-based line number (int) to the token on that
        line (str), with the trailing newline removed.

    Raises:
        AssertionError: If ``file_name`` does not end with ``.txt``.
        OSError: If the file cannot be opened.
    """
    assert file_name.endswith('.txt')
    file_path = os.path.join(dir_path, file_name)

    vocab_map = {}
    # Explicit UTF-8: the vocabulary holds non-ASCII tokens, so relying on the
    # platform default encoding can fail or mis-decode them. The context
    # manager also guarantees the handle is closed if reading raises.
    with open(file_path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            # Strip only the newline terminator. Slicing off the last character
            # unconditionally would truncate a final line that has no '\n'.
            vocab_map[idx] = line.rstrip('\n')

    return vocab_map


In [8]:
# Read the vocab.txt saved above into an {index: token} dict.
vocab_map = load_vocab('../Tokenizer', 'vocab.txt')

In [9]:
# Spot-check two ids before customization; the recorded output shows them as
# [unused0] and [unused499].
print(vocab_map[31500])
print(vocab_map[31999])

[unused0]
[unused499]


In [10]:
def add_unused(vocab_map, tokenizer, unk_chacters):
    """Overwrite the ``[unusedN]`` slots of ``vocab_map`` with characters from a CSV.

    Starting at the id of ``'[unused0]'``, consecutive entries of ``vocab_map``
    are replaced, in row order, by the values of the CSV's ``Character``
    column. ``vocab_map`` is modified in place.

    Assumes every id from ``'[unused0]'`` up to ``len(tokenizer) - 1`` is an
    unused slot -- TODO confirm for tokenizers other than the one used here.

    Args:
        vocab_map: dict mapping token id (int) to token (str); mutated in place.
        tokenizer: Object supporting ``len()`` and
            ``convert_tokens_to_ids(token)``.
        unk_chacters: Path to a ``.csv`` file that has a ``Character`` column.

    Returns:
        None.

    Raises:
        AssertionError: If ``unk_chacters`` does not end with ``.csv``.
        ValueError: If the tokenizer has no ``'[unused0]'`` token.
        KeyError: If the CSV has no ``Character`` column.
    """
    assert unk_chacters.endswith('.csv')
    vocab_size = len(tokenizer)
    unused_start = tokenizer.convert_tokens_to_ids('[unused0]')

    # Hugging Face tokenizers resolve an out-of-vocabulary token to the [UNK]
    # id rather than raising. Without this guard a tokenizer that no longer
    # contains '[unused0]' (e.g. one reloaded from an already customized vocab)
    # would have real tokens overwritten starting at the [UNK] index.
    if unused_start is None or unused_start == getattr(tokenizer, 'unk_token_id', None):
        raise ValueError("tokenizer has no '[unused0]' token; nothing to overwrite")

    characters = pd.read_csv(unk_chacters)['Character']
    unused_size = vocab_size - unused_start

    # Fill only as many slots as there are CSV rows; leftover slots keep their
    # original placeholder instead of triggering an IndexError.
    fill_count = min(unused_size, len(characters))
    for offset in range(fill_count):
        vocab_map[unused_start + offset] = characters.iloc[offset]


In [11]:
# Overwrite the [unused*] entries of vocab_map in place with characters from the CSV.
add_unused(vocab_map, tokenizer, '../Tokenizer/unk_characters.csv')

In [12]:
# Same two ids as the earlier spot-check; they should now hold characters from the CSV.
print(vocab_map[31500])
print(vocab_map[31999])

李
贊


In [13]:
def write_vocab_txt(vocab_map, file_path):
    """Write the tokens of ``vocab_map`` to ``file_path``, one token per line.

    Tokens are written in the dict's iteration (insertion) order, each followed
    by a newline. An existing file at ``file_path`` is overwritten.

    Args:
        vocab_map: dict whose values are the token strings to write.
        file_path: Destination path; must end with ``.txt``.

    Returns:
        None.

    Raises:
        AssertionError: If ``file_path`` does not end with ``.txt``.
        TypeError: If a value in ``vocab_map`` is not a string.
    """
    assert file_path.endswith('.txt')

    # Explicit UTF-8 so non-ASCII tokens are written identically on every
    # platform; the context manager closes the file even if a write fails.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(token + '\n' for token in vocab_map.values())

In [14]:
# Overwrite the saved vocab.txt with the customized vocabulary.
write_vocab_txt(vocab_map, '../Tokenizer/vocab.txt')

## Check Result

In [15]:
# Reload the tokenizer from the local directory so it reads the rewritten vocab.txt.
# NOTE(review): this rebinds `tokenizer`. Re-running the add_unused cell after this
# one would look up '[unused0]' in the customized tokenizer, which presumably no
# longer contains it -- re-run the notebook from the top instead.
tokenizer = BertTokenizer.from_pretrained(
    '../Tokenizer'
)

In [16]:
# Print the reloaded tokenizer's size (expected to match the original) and display its repr.
print('Size of Tokenizer : %d' %len(tokenizer))
tokenizer

Size of Tokenizer : 32000


PreTrainedTokenizer(name_or_path='../Tokenizer', vocab_size=32000, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [17]:
# Resolve the same two ids through the reloaded tokenizer to confirm the
# customized characters are what it actually serves.
print(tokenizer.convert_ids_to_tokens(31500))
print(tokenizer.convert_ids_to_tokens(31999))

李
贊
