In [1]:
import os
import json
import re
import pandas as pd
from tqdm.notebook import tqdm
import collections

## Tokenizer

In [2]:
from transformers import BertTokenizer, BertTokenizerFast, AutoTokenizer

In [3]:
# Load the pretrained 'klue/bert-base' tokenizer, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    'klue/bert-base',
    use_fast=True,
)



In [4]:
# Display the tokenizer's repr (vocab size, max length, special tokens) as the cell output.
tokenizer

PreTrainedTokenizerFast(name_or_path='klue/bert-base', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
# Total token count reported by the tokenizer; add_unused() later uses this as the upper id bound.
len(tokenizer)

32000

In [6]:
# Clear previously saved *.txt / *.json files so ../Tokenizer holds a fresh copy of the
# pretrained tokenizer. Other file types in that directory (e.g. the unk_characters.csv
# read later in this notebook) are not matched by these globs.
!rm -rf ../Tokenizer/*.txt ../Tokenizer/*.json
tokenizer.save_pretrained('../Tokenizer')

('../Tokenizer/tokenizer_config.json',
 '../Tokenizer/special_tokens_map.json',
 '../Tokenizer/vocab.txt',
 '../Tokenizer/added_tokens.json',
 '../Tokenizer/tokenizer.json')

## Customize vocab.txt 

In [7]:
def load_vocab(file_path):
    """Read a ``vocab.txt`` file into an ``{index: token}`` dictionary.

    Each line of the file is one token; the zero-based line number is used as
    the token's index.

    Args:
        file_path: Path to the vocabulary file. Must end with ``.txt``.

    Returns:
        dict mapping ``int`` line index to the token string on that line
        (without its line terminator). Empty lines are kept as empty tokens.

    Raises:
        AssertionError: If ``file_path`` does not end with ``.txt``.
    """
    assert file_path.endswith('.txt')
    vocab_map = {}

    # Explicit UTF-8: the vocabulary contains non-ASCII tokens, so decoding must
    # not depend on the platform's default locale encoding. The context manager
    # closes the handle even if reading fails.
    with open(file_path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would truncate the final token when the file does
            # not end with a newline.
            vocab_map[idx] = line.rstrip('\n')

    return vocab_map


In [8]:
# Load the vocab.txt that save_pretrained() wrote into ../Tokenizer as {index: token}.
vocab_path = os.path.join('../Tokenizer', 'vocab.txt')
vocab_map = load_vocab(vocab_path)

In [9]:
# Spot-check two ids in the [unused*] range before they are replaced.
for token_id in (31500, 31999):
    print(vocab_map[token_id])

[unused0]
[unused499]


In [10]:
def add_unused(vocab_map, tokenizer, unk_token_path):
    """Overwrite the ``[unused*]`` slots of ``vocab_map`` with characters from a CSV.

    The slots start at the id of ``'[unused0]'`` and run up to ``len(tokenizer)``.
    Rows of the CSV's ``Character`` column are assigned to those slots in file
    order. If the CSV has fewer rows than there are slots, the remaining slots
    are left untouched instead of raising ``IndexError``; extra rows are ignored.

    Args:
        vocab_map: ``{index: token}`` dict; modified in place.
        tokenizer: Object supporting ``len()`` and
            ``convert_tokens_to_ids(token)``.
        unk_token_path: Path to a ``.csv`` file with a ``Character`` column.

    Returns:
        None. ``vocab_map`` is updated in place.

    Raises:
        AssertionError: If ``unk_token_path`` does not end with ``.csv``.
        KeyError: If the CSV has no ``Character`` column.
    """
    assert unk_token_path.endswith('.csv')
    vocab_size = len(tokenizer)
    unused_start = tokenizer.convert_tokens_to_ids('[unused0]')

    unk_ch_df = pd.read_csv(unk_token_path)
    unused_size = vocab_size - unused_start

    # Clamp to the rows actually available so a short CSV cannot index out of
    # range; clamp at zero so a non-positive slot count replaces nothing
    # (a negative slice bound would otherwise select rows from the front).
    replace_count = max(0, min(unused_size, len(unk_ch_df)))

    # Read the column once instead of materialising a full row per iteration.
    characters = unk_ch_df['Character'].iloc[:replace_count]
    for offset, unk_ch in enumerate(characters):
        vocab_map[unused_start + offset] = unk_ch


In [11]:
# Replace the [unused*] entries of vocab_map (in place) with the characters listed in
# ../Tokenizer/unk_characters.csv.
unk_vocab_path = os.path.join('../Tokenizer', 'unk_characters.csv')
add_unused(vocab_map, tokenizer, unk_vocab_path)

In [12]:
# Re-check the same two ids now that add_unused() has rewritten the unused range.
for token_id in (31500, 31999):
    print(vocab_map[token_id])

李
贊


In [13]:
def write_vocab(vocab_map, file_path):
    """Write the tokens of ``vocab_map`` to ``file_path``, one token per line.

    Tokens are written in the dictionary's iteration order (its values), each
    followed by a newline, so the resulting line number matches the position
    of the token in ``vocab_map``.

    Args:
        vocab_map: ``{index: token}`` dict whose values are strings.
        file_path: Destination path. Must end with ``.txt``. Overwritten if it
            already exists.

    Raises:
        AssertionError: If ``file_path`` does not end with ``.txt``.
        TypeError: If a value in ``vocab_map`` is not a string.
    """
    assert file_path.endswith('.txt')

    # Explicit UTF-8 so non-ASCII tokens are written identically on every
    # platform; the context manager closes the file even if a write fails.
    with open(file_path, 'w', encoding='utf-8') as f:
        for token in vocab_map.values():
            f.write(token + '\n')

In [14]:
# Overwrite ../Tokenizer/vocab.txt with the customized vocabulary.
write_vocab(vocab_map, vocab_path)

## Customize tokenizer.json 

In [15]:
def load_tokenizer_json(file_path):
    """Load a ``tokenizer.json`` file and return its parsed content.

    Args:
        file_path: Path to the JSON file. Must end with ``.json``.

    Returns:
        The deserialized JSON document (a ``dict`` for tokenizer files).

    Raises:
        AssertionError: If ``file_path`` does not end with ``.json``.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    assert file_path.endswith('.json')
    # JSON text is UTF-8 and the vocab holds non-ASCII tokens, so decode
    # explicitly rather than with the platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as json_data:
        tokenizer_data = json.load(json_data)
    return tokenizer_data

In [16]:
# Parse the tokenizer.json saved alongside vocab.txt.
tokenizer_data = load_tokenizer_json(os.path.join('../Tokenizer', 'tokenizer.json'))

In [17]:
# Show the top-level sections of tokenizer.json, then keep a reference to the
# model's vocab mapping (token string -> id) for inspection.
print('Tokenizer Data : {}'.format(tokenizer_data.keys()))
tokenizer_vocab_data = tokenizer_data['model']['vocab']

Tokenizer Data : dict_keys(['version', 'truncation', 'padding', 'added_tokens', 'normalizer', 'pre_tokenizer', 'post_processor', 'decoder', 'model'])


In [18]:
# Look up the ids that tokenizer.json currently assigns to two of the unused tokens.
for unused_token in ('[unused0]', '[unused200]'):
    print(tokenizer_vocab_data[unused_token])

31500
31700


In [19]:
# Confirm the container type of the vocab section before rewriting it.
print('Tokenizr Vocab Data : {}'.format(type(tokenizer_vocab_data)))

Tokenizr Vocab Data : <class 'dict'>


In [20]:
def write_tokenizer_json(tokenizer_data, vocab_data, file_path):
    """Install ``vocab_data`` as the model vocab and dump the tokenizer to JSON.

    ``vocab_data`` maps id -> token; the tokenizer file stores token -> id, so
    the mapping is inverted before being stored under
    ``tokenizer_data['model']['vocab']``. ``tokenizer_data`` is modified in
    place. If two ids share the same token, the id seen last is the one kept.

    Args:
        tokenizer_data: Parsed tokenizer JSON document (a dict with a
            ``'model'`` entry).
        vocab_data: ``{id: token}`` dict.
        file_path: Destination path. Must end with ``.json``.

    Raises:
        AssertionError: If ``file_path`` does not end with ``.json``.
    """
    assert file_path.endswith('.json')

    # Flip {id: token} into the {token: id} layout used by the JSON file.
    token_to_id = {}
    for token_id, token in vocab_data.items():
        token_to_id[token] = token_id
    tokenizer_data['model']['vocab'] = token_to_id

    with open(file_path, 'w') as out_file:
        json.dump(tokenizer_data, out_file)

In [21]:
# Overwrite ../Tokenizer/tokenizer.json with the customized vocab. vocab_map is
# {id: token}; write_tokenizer_json() inverts it to the file's {token: id} layout.
write_tokenizer_json(tokenizer_data, vocab_map, os.path.join('../Tokenizer', 'tokenizer.json'))

## Check Tokenizer Data

In [22]:
# Reload the file that was just written so the check below reads what is on disk.
tokenizer_data = load_tokenizer_json(os.path.join('../Tokenizer', 'tokenizer.json'))

In [23]:
# List the 100 vocab entries with the highest ids, in descending id order.
sorted(tokenizer_data['model']['vocab'].items(), key=lambda x : x[1], reverse=True)[:100]

[('贊', 31999),
 ('依', 31998),
 ('斗', 31997),
 ('雪', 31996),
 ('妃', 31995),
 ('吏', 31994),
 ('希', 31993),
 ('蘇', 31992),
 ('雜', 31991),
 ('惱', 31990),
 ('帶', 31989),
 ('苦', 31988),
 ('啓', 31987),
 ('始', 31986),
 ('ʰ', 31985),
 ('辛', 31984),
 ('綱', 31983),
 ('題', 31982),
 ('致', 31981),
 ('岳', 31980),
 ('奇', 31979),
 ('靖', 31978),
 ('믈', 31977),
 ('獻', 31976),
 ('狀', 31975),
 ('變', 31974),
 ('ú', 31973),
 ('盛', 31972),
 ('茂', 31971),
 ('象', 31970),
 ('ω', 31969),
 ('홋', 31968),
 ('誌', 31967),
 ('و', 31966),
 ('チ', 31965),
 ('貴', 31964),
 ('植', 31963),
 ('υ', 31962),
 ('ミ', 31961),
 ('虎', 31960),
 ('슌', 31959),
 ('望', 31958),
 ('け', 31957),
 ('퓌', 31956),
 ('比', 31955),
 ('澤', 31954),
 ('テ', 31953),
 ('デ', 31952),
 ('徳', 31951),
 ('劇', 31950),
 ('乙', 31949),
 ('觸', 31948),
 ('防', 31947),
 ('ノ', 31946),
 ('サ', 31945),
 ('許', 31944),
 ('え', 31943),
 ('布', 31942),
 ('羽', 31941),
 ('鳥', 31940),
 ('ナ', 31939),
 ('淵', 31938),
 ('б', 31937),
 ('ム', 31936),
 ('县', 31935),
 ('號', 31934),
 ('訓', 319

## Check Result

In [24]:
# Load a fast tokenizer from the customized files in ../Tokenizer.
tokenizer_custom = AutoTokenizer.from_pretrained(
    '../Tokenizer',
    use_fast=True
)

In [25]:
# Print the token count of the customized tokenizer, then display its repr as the cell output.
print('Size of Tokenizer : %d' %len(tokenizer_custom))
tokenizer_custom

Size of Tokenizer : 32000


PreTrainedTokenizerFast(name_or_path='../Tokenizer', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [26]:
# Confirm that the customized tokenizer maps the former unused ids to the new characters.
for token_id in (31500, 31999):
    print(tokenizer_custom.convert_ids_to_tokens(token_id))

李
贊
