In [7]:
import torch
import transformers
import tokenizers
from tokenizers import SentencePieceBPETokenizer
from tokenizers import SentencePieceUnigramTokenizer
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer, BpeTrainer, UnigramTrainer
# Whitespace pre-tokenizer, used by the WordPiece trainer below.
from tokenizers.pre_tokenizers import Whitespace
# TODO: consider using the BERT pre-tokenizer instead of Whitespace.
from typing import List


# Unknown-token string handed to the WordPiece model.
unk_token = "<UNK>"
# Special tokens reserved at training time; the unknown token comes first.
spl_tokens = [unk_token, "<SEP>", "<MASK>", "<CLS>"]

In [9]:
def train_WordPieceTokenizer(
    file_list: List[str],
    vocab_size=30_000,
    min_frequency=5,
    limit_alphabet=500,
    save_path: str = "./WP_tok-trained.json",
):
    """
    Train a WordPiece tokenizer from a list of text files.

    The tokenizer uses the module-level ``unk_token`` as its unknown token,
    reserves the module-level ``spl_tokens`` as special tokens, and splits
    input with the ``Whitespace`` pre-tokenizer before training.

    Args:
        file_list: Paths of the text files to train on.
        vocab_size: Forwarded to ``WordPieceTrainer(vocab_size=...)``.
        min_frequency: Forwarded to ``WordPieceTrainer(min_frequency=...)``.
        limit_alphabet: Forwarded to ``WordPieceTrainer(limit_alphabet=...)``.
        save_path: Where the trained tokenizer JSON is written. Defaults to
            the path this function has always used.

    Returns:
        The tokenizer re-loaded from ``save_path`` via ``Tokenizer.from_file``.
    """
    tokenizer = Tokenizer(WordPiece(unk_token=unk_token))
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=spl_tokens,
        show_progress=True,
        limit_alphabet=limit_alphabet,
    )
    tokenizer.train(file_list, trainer)

    # Persist, then return the re-loaded copy so the caller gets exactly what
    # is on disk.
    tokenizer.save(save_path)
    return Tokenizer.from_file(save_path)

In [10]:
def train_iterator_mul_files(files, encoding="utf-8"):
    """
    Lazily yield every line of every file in ``files``, in order.

    Lines are yielded exactly as read, including their trailing newline.
    Each file is opened only when the iteration reaches it and is closed
    before the next one is opened.

    Args:
        files: Iterable of file paths to read.
        encoding: Text encoding used to decode every file. It is set
            explicitly so the result does not depend on the platform's
            default encoding.

    Yields:
        One line of text at a time.
    """
    for path in files:
        with open(path, "r", encoding=encoding) as f:
            yield from f

In [11]:
def train_SentencePieceBPETokenizer(
    files: List[str],
    vocab_size=30_000,
    min_frequency=5,
    limit_alphabet=500,
    save_path: str = "./SP_BPE_tok-trained.json",
):
    """
    Train a SentencePiece-BPE tokenizer from a list of text files.

    The files are streamed line by line through ``train_iterator_mul_files``.
    For consistency with ``train_WordPieceTokenizer``, the module-level
    ``unk_token`` and ``spl_tokens`` are used as the unknown token and the
    reserved special tokens.

    Args:
        files: Paths of the text files to train on.
        vocab_size: Forwarded to ``train_from_iterator(vocab_size=...)``.
        min_frequency: Forwarded to ``train_from_iterator(min_frequency=...)``.
        limit_alphabet: Forwarded to ``train_from_iterator(limit_alphabet=...)``.
        save_path: Where the trained tokenizer JSON is written. Defaults to
            the path this function has always used.

    Returns:
        The tokenizer re-loaded from ``save_path`` via ``Tokenizer.from_file``.
    """
    tokenizer = SentencePieceBPETokenizer(unk_token=unk_token)
    tokenizer.train_from_iterator(
        train_iterator_mul_files(files),
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        show_progress=True,
        # Reserve the same special tokens as the WordPiece tokenizer.
        special_tokens=spl_tokens,
        limit_alphabet=limit_alphabet,
    )

    # Persist, then return the re-loaded copy so the caller gets a plain
    # ``Tokenizer`` matching what is on disk.
    tokenizer.save(save_path)
    return Tokenizer.from_file(save_path)

In [12]:
def train_SentencePieceUGTokenizer(
    filelist: List[str],
    vocab_size=30_000,
    save_path: str = "./SP_UG_tok-trained.json",
):
    """
    Train a SentencePiece-Unigram tokenizer from a list of text files.

    The files are streamed line by line through ``train_iterator_mul_files``.
    For consistency with ``train_WordPieceTokenizer``, the module-level
    ``spl_tokens`` are reserved as special tokens and ``unk_token`` is
    registered as the unknown token.

    Args:
        filelist: Paths of the text files to train on.
        vocab_size: Forwarded to ``train_from_iterator(vocab_size=...)``.
        save_path: Where the trained tokenizer JSON is written. Defaults to
            the path this function has always used.

    Returns:
        The tokenizer re-loaded from ``save_path`` via ``Tokenizer.from_file``.
    """
    tokenizer = SentencePieceUnigramTokenizer()
    tokenizer.train_from_iterator(
        train_iterator_mul_files(filelist),
        vocab_size=vocab_size,
        show_progress=True,
        special_tokens=spl_tokens,
        # NOTE(review): the ``unk_token`` keyword requires a tokenizers
        # release whose Unigram trainer accepts it - confirm against the
        # pinned version.
        unk_token=unk_token,
    )

    # Persist, then return the re-loaded copy so the caller gets a plain
    # ``Tokenizer`` matching what is on disk.
    tokenizer.save(save_path)
    return Tokenizer.from_file(save_path)

In [20]:
import os

def convert_corpus_with_tokenizer(contents: List[str], tokenizer: Tokenizer):
    """
    Tokenize a list of documents and return their token strings.

    ``contents`` may hold either raw text strings or file paths. If the first
    element is the path of an existing file, every element is treated as a
    path: each file is read as UTF-8 and its newline characters are removed
    before encoding.

    Args:
        contents: Raw texts, or paths to text files.
        tokenizer: Object exposing ``encode_batch``; each returned encoding
            must expose a ``tokens`` attribute.

    Returns:
        One list of token strings per input document, in input order.
        An empty input yields an empty list.
    """
    # Guard the ``contents[0]`` probe below against an empty input.
    if not contents:
        return []

    if os.path.isfile(contents[0]):
        texts = []
        for path in contents:
            # The context manager closes each file deterministically.
            with open(path, "r", encoding="utf-8") as f:
                # NOTE(review): newlines are dropped without inserting a
                # space, so the last word of a line is fused with the first
                # word of the next line - confirm this is intended.
                texts.append(f.read().replace("\n", ''))
    else:
        texts = contents

    encodings = tokenizer.encode_batch(texts)
    return [encoding.tokens for encoding in encodings]

In [21]:
# Corpus files used to train all three tokenizers.
file_list = ["./data.txt"]

# Train each tokenizer on the same corpus. Each call saves its own JSON file
# as a side effect; the return value of the last call is what the cell displays.
train_WordPieceTokenizer(file_list)
train_SentencePieceBPETokenizer(file_list)
train_SentencePieceUGTokenizer(file_list)

<tokenizers.Tokenizer at 0x1fb6fe6db80>

In [25]:
# Load the SentencePiece-BPE tokenizer from the JSON file written during training.
tokenizer = Tokenizer.from_file("./SP_BPE_tok-trained.json")

# file_list holds file paths, so convert_corpus_with_tokenizer reads each
# file's text before encoding it.
corpus = convert_corpus_with_tokenizer(file_list, tokenizer)

# Display the first document's tokens as one space-separated string.
" ".join(corpus[0])

"▁L ore m ▁I p s um ▁ i s ▁s i m p l y ▁ d um m y ▁t e x t ▁ of ▁the ▁p r in t ing ▁a n d ▁t y p es et t ing ▁in d u s t r y . ▁L ore m ▁I p s um ▁ h as ▁ b e en ▁the ▁in d u s t r y ' s ▁s t a n d a r d ▁ d um m y ▁t e x t ▁ e v e r ▁s in c e ▁the ▁ 1 5 0 0 s , ▁ w he n ▁a n ▁ u n k n o w n ▁p r in t e r ▁t o o k ▁a ▁ g a l le y ▁ of ▁t y p e ▁a n d ▁s c r a m b le d ▁ i t ▁t o ▁ m a k e ▁a ▁t y p e ▁s p e c i m en ▁ b o o k . ▁I t ▁ h as ▁s u r v i v e d ▁ n o t ▁ o n l y ▁ f i v e ▁ c en t u r i es , ▁ b u t ▁a l s o ▁the ▁ le a p ▁in t o ▁ e le c t r o n i c ▁t y p es et t ing , ▁ re m a in ing ▁ es s en t i a l l y ▁ u n c h a n g e d . ▁I t ▁ w as ▁p o p u l a r i s e d ▁in ▁the ▁ 1 9 6 0 s ▁ w i t h ▁the ▁ re le as e ▁ of ▁L et r as et ▁s he et s ▁ c o n t a in ing ▁L ore m ▁I p s um ▁p as s a g es , ▁a n d ▁ m ore ▁ re c en t l y ▁ w i t h ▁ d es k t o p ▁p u b l i s h ing ▁s of t w a re ▁ l i k e ▁ A l d u s ▁ P a g e M a k e r ▁in c l u d ing ▁ v e r s i o n s ▁ of ▁L ore m ▁