In [1]:
import torch
import transformers
import tokenizers
from tokenizers import SentencePieceBPETokenizer
from tokenizers import SentencePieceUnigramTokenizer
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer, BpeTrainer, UnigramTrainer
# Whitespace pre-tokenizer (set on the WordPiece tokenizer before training)
from tokenizers.pre_tokenizers import Whitespace

# Special-token vocabulary. `unk_token` is the unknown-token marker and is
# also the first entry of `spl_tokens`.
unk_token = "<UNK>"
spl_tokens = [unk_token, "<SEP>", "<MASK>", "<CLS>"]

In [2]:
def train_WordPieceTrainer(file_list, vocab_size=30_000, min_frequency=5, limit_alphabet=500,
                           save_path="./WP_tok-trained.json"):
    """
    Train a WordPiece tokenizer from a list of files.

    Args:
        file_list: paths of the text files handed to ``Tokenizer.train``.
        vocab_size: forwarded to ``WordPieceTrainer(vocab_size=...)``.
        min_frequency: forwarded to ``WordPieceTrainer(min_frequency=...)``.
        limit_alphabet: forwarded to ``WordPieceTrainer(limit_alphabet=...)``.
        save_path: where the trained tokenizer is written as JSON. Defaults
            to the location that used to be hard-coded, so existing callers
            see no change.

    Returns:
        The ``Tokenizer`` re-loaded from ``save_path``.
    """
    # Accept pathlib.Path as well as str; the save/load calls get a plain str.
    save_path = str(save_path)

    # The model uses the module-level unknown token; the trainer reserves the
    # module-level special tokens.
    tokenizer = Tokenizer(WordPiece(unk_token=unk_token))
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=spl_tokens,
        show_progress=True,
        limit_alphabet=limit_alphabet,
    )
    tokenizer.train(file_list, trainer)

    # Persist, then hand back the instance read from disk so the returned
    # object corresponds to what was serialized.
    tokenizer.save(save_path)
    return Tokenizer.from_file(save_path)

In [3]:
def train_iterator_mul_files(files, encoding="utf-8"):
    """
    Lazily yield every line of every file in ``files``, in the given order.

    Args:
        files: iterable of paths to text files.
        encoding: text encoding used to read each file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Yields:
        str: each line as read from the file (trailing newline included
        when the line has one).
    """
    for path in files:
        # An explicit encoding avoids locale-dependent decoding of the corpus.
        with open(path, "r", encoding=encoding) as f:
            yield from f

In [4]:
def train_SentencePieceBPETokenizer(files, vocab_size=30_000, min_frequency=5, limit_alphabet=500,
                                    save_path="./SP_BPE_tok-trained.json"):
    """
    Train a SentencePiece-BPE tokenizer from a list of files.

    Args:
        files: paths of the text files; their lines are streamed to the
            trainer through ``train_iterator_mul_files``.
        vocab_size: forwarded to ``train_from_iterator(vocab_size=...)``.
        min_frequency: forwarded to ``train_from_iterator(min_frequency=...)``.
        limit_alphabet: forwarded to ``train_from_iterator(limit_alphabet=...)``.
        save_path: where the trained tokenizer is written as JSON. Defaults
            to the location that used to be hard-coded, so existing callers
            see no change.

    Returns:
        A ``Tokenizer`` loaded from ``save_path``.
    """
    # Accept pathlib.Path as well as str; the save/load calls get a plain str.
    save_path = str(save_path)

    tokenizer = SentencePieceBPETokenizer()
    tokenizer.train_from_iterator(
        train_iterator_mul_files(files),
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        show_progress=True,
        limit_alphabet=limit_alphabet,
    )

    # Save the trained tokenizer, then return it re-loaded via
    # Tokenizer.from_file.
    tokenizer.save(save_path)
    return Tokenizer.from_file(save_path)

In [5]:
def train_SentencePieceUGTokenizer(filelist, vocab_size=30_000,
                                   save_path="./SP_UG_tok-trained.json"):
    """
    Train a SentencePiece-Unigram tokenizer from a list of files.

    Args:
        filelist: paths of the text files; their lines are streamed to the
            trainer through ``train_iterator_mul_files``.
        vocab_size: forwarded to ``train_from_iterator(vocab_size=...)``.
        save_path: where the trained tokenizer is written as JSON. Defaults
            to the location that used to be hard-coded, so existing callers
            see no change.

    Returns:
        A ``Tokenizer`` loaded from ``save_path``.
    """
    # Accept pathlib.Path as well as str; the save/load calls get a plain str.
    save_path = str(save_path)

    tokenizer = SentencePieceUnigramTokenizer()
    tokenizer.train_from_iterator(
        train_iterator_mul_files(filelist),
        vocab_size=vocab_size,
        show_progress=True,
    )

    # Save the trained tokenizer, then return it re-loaded via
    # Tokenizer.from_file.
    tokenizer.save(save_path)
    return Tokenizer.from_file(save_path)

In [6]:
# Corpus file(s) used to train all three tokenizers.
file_list = ["./data.txt"]

# Bind each trained tokenizer to a name so later cells can use it, instead of
# discarding the return values.
wp_tokenizer = train_WordPieceTrainer(file_list)
sp_bpe_tokenizer = train_SentencePieceBPETokenizer(file_list)
sp_ug_tokenizer = train_SentencePieceUGTokenizer(file_list)

# Last expression of the cell: still displays the Unigram tokenizer.
sp_ug_tokenizer

<tokenizers.Tokenizer at 0x1fb6fe70e60>