In [None]:
# !pip install Korpora
# !pip install tokenizers

In [4]:
from Korpora import Korpora

In [None]:
# Load the 'nsmc' corpus through Korpora; later cells read nsmc.train / nsmc.test.
# NOTE(review): force_download=True presumably re-fetches the corpus on every run
# even when a local copy exists — confirm this is intended before a full re-run.
nsmc = Korpora.load('nsmc', force_download=True)

In [8]:
import os
import shutil

# Dump the NSMC train/test texts to plain-text files (one text per line) so the
# tokenizer trainers in later cells can read them from disk.
nsmc_dir = 'data-files/nsmc'

# Start from an empty directory so nothing from a previous run is left behind.
if os.path.exists(nsmc_dir):
    shutil.rmtree(nsmc_dir)

# makedirs (instead of mkdir) also creates the parent 'data-files' directory;
# plain mkdir raises FileNotFoundError when that parent does not exist yet,
# which broke this cell on a fresh checkout.
os.makedirs(nsmc_dir)

# Same write logic for both splits: <split>.txt gets one text per line.
for split_name, split in (('train', nsmc.train), ('test', nsmc.test)):
    with open(f'{nsmc_dir}/{split_name}.txt', 'w', encoding='utf-8') as f:
        for line in split.get_all_texts():
            f.write(f"{line}\n")

In [9]:
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer

In [None]:
# Train a case-preserving WordPiece vocabulary (10,000 entries) on the text
# files written earlier, then save it into its own directory.
wordpiece_corpus = ['data-files/nsmc/train.txt', 'data-files/nsmc/test.txt']
wordpiece_dir = 'data-files/nsmc/wordpiece'

wordpiece_tokenizer = BertWordPieceTokenizer(lowercase=False)
wordpiece_tokenizer.train(wordpiece_corpus, vocab_size=10000)

# Recreate the output directory from scratch before saving.
if os.path.exists(wordpiece_dir):
    shutil.rmtree(wordpiece_dir)
os.mkdir(wordpiece_dir)

# Last expression: the list of written file paths is shown as the cell output.
wordpiece_tokenizer.save_model(wordpiece_dir)

['data-files/nsmc/wordpiece\\vocab.txt']

In [None]:
# Wrap the saved WordPiece vocabulary in a transformers BertTokenizer,
# keeping the original casing (the vocabulary was trained with lowercase=False).
bert_tokenizer = BertTokenizer.from_pretrained(
    'data-files/nsmc/wordpiece',
    do_lower_case=False,
)

# Preview: token strings for the first ten training texts.
list(map(bert_tokenizer.tokenize, nsmc.train.texts[:10]))

In [16]:
# Encode the first five training texts as fixed-length inputs: every sequence
# is truncated or padded to exactly 12 token ids.
bert_samples = nsmc.train.texts[:5]
result = bert_tokenizer(bert_samples, max_length=12, truncation=True, padding='max_length')
result

{'input_ids': [[2, 620, 2631, 16, 16, 1993, 3678, 1990, 3323, 3, 0, 0], [2, 997, 16, 16, 16, 2609, 2045, 2796, 1981, 1112, 16, 3], [2, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 155, 1054, 1151, 2111, 9393, 16, 16, 2245, 2942, 2080, 3], [2, 3167, 1883, 1454, 1232, 1027, 711, 1333, 3326, 2342, 3153, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [13]:
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Tokenizer

In [23]:
# Train a byte-level BPE vocabulary (10,000 entries) on the same text files,
# reserving '[PAD]' as a special token, then save it into its own directory.
bpe_corpus = ['data-files/nsmc/train.txt', 'data-files/nsmc/test.txt']
bpe_dir = 'data-files/nsmc/bpe'

bpe_tokenizer = ByteLevelBPETokenizer()
bpe_tokenizer.train(bpe_corpus, vocab_size=10000, special_tokens=['[PAD]'])

# Recreate the output directory from scratch before saving.
if os.path.exists(bpe_dir):
    shutil.rmtree(bpe_dir)
os.mkdir(bpe_dir)

# Last expression: the list of written file paths is shown as the cell output.
bpe_tokenizer.save_model(bpe_dir)

['data-files/nsmc/bpe\\vocab.json', 'data-files/nsmc/bpe\\merges.txt']

In [None]:
# Wrap the saved BPE files in a transformers GPT2Tokenizer.
gpt_tokenizer = GPT2Tokenizer.from_pretrained('data-files/nsmc/bpe')
# Set '[PAD]' as the pad token so that padding='max_length' can be used below.
gpt_tokenizer.pad_token = '[PAD]'

# Preview: token strings for the first five training texts.
list(map(gpt_tokenizer.tokenize, nsmc.train.texts[:5]))

In [25]:
# Encode the first five training texts with the GPT-2 tokenizer: every sequence
# is truncated or padded to exactly 12 token ids.
gpt_samples = nsmc.train.texts[:5]
result = gpt_tokenizer(gpt_samples, max_length=12, truncation=True, padding='max_length')
result

{'input_ids': [[335, 2339, 264, 582, 4056, 465, 3809, 0, 0, 0, 0, 0], [3694, 337, 2877, 759, 2884, 357, 807, 423, 9876, 876, 2961, 7293], [695, 478, 109, 242, 644, 3966, 1068, 2286, 2717, 970, 0, 0], [936, 295, 573, 997, 9463, 1252, 1693, 3056, 898, 264, 849, 743], [4537, 5654, 2199, 7775, 5022, 1018, 3099, 1574, 2855, 3525, 302, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}