In [1]:
import torch
import transformers
import tokenizers
from tokenizers import SentencePieceBPETokenizer
from tokenizers import SentencePieceUnigramTokenizer
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer, BpeTrainer, UnigramTrainer
# whitespace pretokenizer ?
from tokenizers.pre_tokenizers import Whitespace

# Unknown-token string handed to the tokenizer models below.
unk_token = "<UNK>"
# Special tokens passed to the trainers; the unknown token is listed first.
spl_tokens = [unk_token, "<SEP>", "<MASK>", "<CLS>"]

In [2]:
def train_WordPieceTrainer(file_list, vocab_size=30_000, min_frequency=5, limit_alphabet=500):
    """
    Fit a WordPiece tokenizer on the text files named in ``file_list``.

    The tokenizer uses the ``Whitespace`` pre-tokenizer and the module-level
    ``unk_token`` / ``spl_tokens``. After training it is written to
    ``./WP_tok-trained.json`` and the instance re-read from that file is
    returned.
    """
    out_path = "./WP_tok-trained.json"

    wp_tokenizer = Tokenizer(WordPiece(unk_token=unk_token))
    wp_tokenizer.pre_tokenizer = Whitespace()

    trainer_options = dict(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=spl_tokens,
        show_progress=True,
        limit_alphabet=limit_alphabet,
    )
    wp_tokenizer.train(file_list, WordPieceTrainer(**trainer_options))

    # Round-trip through disk: the caller receives the reloaded tokenizer.
    wp_tokenizer.save(out_path)
    return Tokenizer.from_file(out_path)

In [3]:
def train_SentencePieceBPETokenizer(data, vocab_size=30_000, min_frequency=5, limit_alphabet=500):
    """
    Train a SentencePiece BPE tokenizer from in-memory text.

    Args:
        data: Iterable of text samples (strings). A single ``str`` is
            accepted and treated as one sample.
        vocab_size: Forwarded to ``train_from_iterator``.
        min_frequency: Forwarded to ``train_from_iterator``.
        limit_alphabet: Forwarded to ``train_from_iterator``.

    Returns:
        The tokenizer re-read from ``./SP_BPE_tok-trained.json`` after the
        trained tokenizer has been saved there.
    """
    if isinstance(data, str):
        # Iterating a bare string yields single characters, so every
        # character would become its own training sample. Wrap it instead.
        data = [data]

    tokenizer = SentencePieceBPETokenizer()
    tokenizer.train_from_iterator(
        data,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        show_progress=True,
        limit_alphabet=limit_alphabet,
    )
    tokenizer.save("./SP_BPE_tok-trained.json")
    tokenizer = Tokenizer.from_file("./SP_BPE_tok-trained.json")
    return tokenizer

In [4]:
def train_SentencePieceUGTokenizer(data, vocab_size=30_000):
    """
    Train a SentencePiece Unigram tokenizer from in-memory text.

    Args:
        data: Iterable of text samples (strings). A single ``str`` is
            accepted and treated as one sample.
        vocab_size: Forwarded to ``train_from_iterator``.

    Returns:
        The tokenizer re-read from ``./SP_UG_tok-trained.json`` after the
        trained tokenizer has been saved there.
    """
    if isinstance(data, str):
        # Iterating a bare string yields single characters, so every
        # character would become its own training sample. Wrap it instead.
        data = [data]

    tokenizer = SentencePieceUnigramTokenizer()
    tokenizer.train_from_iterator(
        data,
        vocab_size=vocab_size,
        show_progress=True
    )
    tokenizer.save("./SP_UG_tok-trained.json")
    tokenizer = Tokenizer.from_file("./SP_UG_tok-trained.json")
    return tokenizer

In [5]:
# Text files used to train the WordPiece tokenizer.
file_list = ["./data.txt"]

# Small in-memory sample used for the iterator-based SentencePiece trainers.
data = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."

train_WordPieceTrainer(file_list)
# The iterator trainers expect an iterable of text samples. Passing the bare
# string would feed it one character at a time, so wrap it in a list.
train_SentencePieceBPETokenizer([data])
train_SentencePieceUGTokenizer([data])

<tokenizers.Tokenizer at 0x179ba641820>

In [None]:
# UnigramTrainer & BpeTrainer DON'T work as trainers for the SentencePiece tokenizers

def train_SentencePieceBPETokenizer(file_list, vocab_size=30_000, min_frequency=5, limit_alphabet=500):
    """
    Train a SentencePiece BPE tokenizer from a list of files.

    Args:
        file_list: Paths of the text files to train on.
        vocab_size: Forwarded to the tokenizer's ``train``.
        min_frequency: Forwarded to the tokenizer's ``train``.
        limit_alphabet: Forwarded to the tokenizer's ``train``.

    Returns:
        The tokenizer re-read from ``./SP_BP_tok-trained.json`` after the
        trained tokenizer has been saved there.
    """
    # One path for both save and load; the original reloaded from a
    # differently spelled file name that was never written.
    save_path = "./SP_BP_tok-trained.json"

    tokenizer = SentencePieceBPETokenizer(unk_token=unk_token)
    # The wrapper's train() takes training options as keywords rather than a
    # trainer object, so the options are passed directly.
    tokenizer.train(
        file_list,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=spl_tokens,
        limit_alphabet=limit_alphabet,
        show_progress=True,
    )

    tokenizer.save(save_path)
    tokenizer = Tokenizer.from_file(save_path)
    return tokenizer

def train_SentencePieceUGTokenizer(file_list, vocab_size=30_000, min_frequency=5, limit_alphabet=500):
    """
    Train a SentencePiece Unigram tokenizer from a list of files.

    Args:
        file_list: Paths of the text files to train on.
        vocab_size: Forwarded to the tokenizer's ``train``.
        min_frequency: Unused; kept so existing callers keep working.
        limit_alphabet: Unused; kept so existing callers keep working.

    Returns:
        The tokenizer re-read from ``./SP_UG_tok-trained.json`` after the
        trained tokenizer has been saved there.
    """
    save_path = "./SP_UG_tok-trained.json"

    tokenizer = SentencePieceUnigramTokenizer()
    # The wrapper's train() takes training options as keywords rather than a
    # trainer object, so the options are passed directly.
    tokenizer.train(
        file_list,
        vocab_size=vocab_size,
        show_progress=True,
        special_tokens=spl_tokens,
    )

    tokenizer.save(save_path)
    tokenizer = Tokenizer.from_file(save_path)
    return tokenizer