In [1]:
import torch
import transformers
import tokenizers
from tokenizers import SentencePieceBPETokenizer
from tokenizers import SentencePieceUnigramTokenizer
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
# Whitespace pre-tokenizer, assigned to the WordPiece tokenizer before training
from tokenizers.pre_tokenizers import Whitespace

# Token handed to the WordPiece model as its unknown-token marker.
unk_token = "<UNK>"
# Reserved special tokens; the unknown token comes first, matching the
# list registered with the WordPiece trainer.
spl_tokens = [unk_token, "<SEP>", "<MASK>", "<CLS>"]

In [20]:
def train_WordPieceTrainer(
    file_list,
    vocab_size=30_000,
    min_frequency=5,
    limit_alphabet=500,
    save_path="./WP_tok-trained.json",
):
    """
    Train a WordPiece tokenizer from text files, save it, and reload it.

    Args:
        file_list: Path(s) of the training text files. A single path given
            as a bare string is wrapped into a one-element list.
        vocab_size: Forwarded to ``WordPieceTrainer(vocab_size=...)``.
        min_frequency: Forwarded to ``WordPieceTrainer(min_frequency=...)``.
        limit_alphabet: Forwarded to ``WordPieceTrainer(limit_alphabet=...)``.
        save_path: File the trained tokenizer is written to and then
            reloaded from.

    Returns:
        The tokenizer obtained from ``Tokenizer.from_file(save_path)``.
    """
    # `Tokenizer.train` expects a list of paths, not a single string.
    if isinstance(file_list, str):
        file_list = [file_list]

    tokenizer = Tokenizer(WordPiece(unk_token=unk_token))
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=[
            "<UNK>",
            "<SEP>",
            "<MASK>",
            "<CLS>",
        ],
        show_progress=True,
        limit_alphabet=limit_alphabet,
    )
    tokenizer.train(file_list, trainer)

    # Save and reload through the SAME path. The previous version wrote
    # "./WPtok-trained.json" but read "./WP_tok-trained.json", so the reload
    # either failed or silently returned a stale tokenizer.
    tokenizer.save(save_path)
    return Tokenizer.from_file(save_path)

In [22]:
def train_SentencePieceBPETokenizer(
    data,
    vocab_size=30_000,
    min_frequency=5,
    limit_alphabet=500,
    save_path="./SP_BPE_tok-trained.json",
):
    """
    Train a SentencePiece-style BPE tokenizer from in-memory text.

    Args:
        data: Iterable of training texts. A bare string is treated as a
            single document and wrapped into a one-element list.
        vocab_size: Forwarded to ``train_from_iterator(vocab_size=...)``.
        min_frequency: Forwarded to ``train_from_iterator(min_frequency=...)``.
        limit_alphabet: Forwarded to ``train_from_iterator(limit_alphabet=...)``.
        save_path: File the trained tokenizer is written to and then
            reloaded from.

    Returns:
        The tokenizer obtained from ``Tokenizer.from_file(save_path)``.
    """
    # Iterating a str yields single characters, which would make every
    # character its own training "document"; wrap it so the whole text is used.
    if isinstance(data, str):
        data = [data]

    tokenizer = SentencePieceBPETokenizer()
    tokenizer.train_from_iterator(
        data,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        show_progress=True,
        limit_alphabet=limit_alphabet,
    )
    tokenizer.save(save_path)
    return Tokenizer.from_file(save_path)

In [23]:
def train_SentencePieceUGTokenizer(
    data,
    vocab_size=30_000,
    save_path="./SP_UG_tok-trained.json",
):
    """
    Train a SentencePiece-style Unigram tokenizer from in-memory text.

    Args:
        data: Iterable of training texts. A bare string is treated as a
            single document and wrapped into a one-element list.
        vocab_size: Forwarded to ``train_from_iterator(vocab_size=...)``.
        save_path: File the trained tokenizer is written to and then
            reloaded from.

    Returns:
        The tokenizer obtained from ``Tokenizer.from_file(save_path)``.
    """
    # Iterating a str yields single characters, which would make every
    # character its own training "document"; wrap it so the whole text is used.
    if isinstance(data, str):
        data = [data]

    tokenizer = SentencePieceUnigramTokenizer()
    tokenizer.train_from_iterator(
        data,
        vocab_size=vocab_size,
        show_progress=True,
    )
    tokenizer.save(save_path)
    return Tokenizer.from_file(save_path)

In [25]:
data = "lore ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est laborum"

# `train_from_iterator` consumes an iterable of texts. Passing the bare string
# would iterate it character by character, so wrap the single document in a list.
corpus = [data]

# Train from an in-memory iterator (for the unigram model, UnigramTrainer could be used instead).
train_SentencePieceBPETokenizer(corpus)
train_SentencePieceUGTokenizer(corpus)

# The WordPiece trainer reads from files, so it needs file paths rather than raw text.
# train_WordPieceTrainer(["path/to/corpus.txt"])