In [8]:
import torch
import transformers
import tokenizers
from tokenizers import SentencePieceBPETokenizer
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
# Whitespace pre-tokenizer, applied before WordPiece training.
from tokenizers.pre_tokenizers import Whitespace

# Marker substituted for out-of-vocabulary pieces by the WordPiece model.
unk_token = "<UNK>"
# Full special-token set; the first entry is the unknown-token marker itself.
spl_tokens = [unk_token, "<SEP>", "<MASK>", "<CLS>"]

In [9]:
def train_WordPieceTrainer(file_list, vocab_size=30_000, min_frequency=5, limit_alphabet=500):
    """
    Train a WordPiece tokenizer from a list of files.

    Args:
        file_list: Paths of the files handed to ``Tokenizer.train``.
        vocab_size: Vocabulary size forwarded to ``WordPieceTrainer``.
        min_frequency: Minimum frequency forwarded to ``WordPieceTrainer``.
        limit_alphabet: Alphabet limit forwarded to ``WordPieceTrainer``.

    Returns:
        The trained ``Tokenizer``, reloaded from ``./WPtok-trained.json``.

    Side effects:
        Writes the trained tokenizer to ``./WPtok-trained.json`` in the
        current working directory, overwriting any existing file.
    """
    # The WordPiece *model* alone cannot be trained or given a pre-tokenizer;
    # it has to be wrapped in a Tokenizer, which owns the full pipeline.
    tokenizer = Tokenizer(WordPiece(unk_token=unk_token))
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        # Reuse the module-level list so the trainer and the model's
        # unk_token cannot drift apart.
        special_tokens=spl_tokens,
        show_progress=True,
        limit_alphabet=limit_alphabet,
    )

    tokenizer.train(file_list, trainer)

    # Persist, then reload from disk so the returned object is exactly what a
    # later Tokenizer.from_file() call would produce.
    tokenizer.save("./WPtok-trained.json")
    tokenizer = Tokenizer.from_file("./WPtok-trained.json")
    return tokenizer

In [12]:
def train_SentencePieceBPETokenizer(data, vocab_size=30_000, min_frequency=5, limit_alphabet=500):
    """
    Train a SentencePiece-BPE tokenizer from an iterator.

    Args:
        data: Passed straight to ``train_from_iterator`` (presumably an
            iterable of text strings, not file paths -- TODO confirm with caller).
        vocab_size: Vocabulary size forwarded to the trainer.
        min_frequency: Minimum frequency forwarded to the trainer.
        limit_alphabet: Alphabet limit forwarded to the trainer.

    Returns:
        A ``Tokenizer`` loaded back from ``./SPtok-trained.json``.

    Side effects:
        Writes the trained tokenizer to ``./SPtok-trained.json``.
    """
    saved_path = "./SPtok-trained.json"
    training_options = dict(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        show_progress=True,
        limit_alphabet=limit_alphabet,
    )

    # NOTE(review): the module-level special tokens / unk_token are not passed
    # here, so the library defaults apply -- confirm this is intended.
    sp_tokenizer = SentencePieceBPETokenizer()
    sp_tokenizer.train_from_iterator(data, **training_options)
    sp_tokenizer.save(saved_path)

    # Hand back a plain Tokenizer reloaded from the file just written.
    return Tokenizer.from_file(saved_path)