In [2]:
import torch
import os
import sentencepiece as spm

- We will tokenize with BPE (Byte Pair Encoding), training our own tokenizer with SentencePiece below, and we will use the pre-trained model from Hugging Face Transformers.

In [4]:
# Train a BPE tokenizer with SentencePiece on the wiki corpus. The trainer
# writes its artifacts under the "data/my_wiki_tokenizer" prefix; the
# resulting ".model" file is loaded by a later cell.
# See the SentencePiece training-options reference for what each flag does.
vocab_size = 4096

spm.SentencePieceTrainer.train(
    # --- corpus in, model files out ---
    input="data/wiki.txt",
    input_format="text",
    model_prefix="data/my_wiki_tokenizer",
    # --- model definition ---
    model_type="bpe",
    vocab_size=vocab_size,
    character_coverage=0.995,  # 99.5% of most frequent characters are kept
    # --- text handling ---
    normalization_rule_name="identity",
    split_digits=True,
    allow_whitespace_only_pieces=True,
    byte_fallback=True,
    unk_surface=r" \342\201\207 ",
    # --- trainer runtime ---
    self_test_sample_size=0,
    num_threads=os.cpu_count(),
)

print("Tokenizer trained successfully.")

Tokenizer trained successfully.


### Tokenize the dataset

In [None]:
# Read the whole wiki corpus into memory as one string; `text` is reused by
# the cell that builds the encoded dataset.
with open("data/wiki.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Report the corpus length in characters and show a 300-character excerpt.
print(f"Dataset size: {len(text)} characters")
print(text[30000:30300])

In [5]:
# Load the tokenizer model written by the training cell and refresh
# `vocab_size` from the loaded model rather than from the requested value.
sp = spm.SentencePieceProcessor(model_file="data/my_wiki_tokenizer.model")
vocab_size = sp.GetPieceSize()
print(vocab_size)

4096


In [6]:
def encode(s):
    """Tokenize ``s`` with the notebook-level SentencePiece processor ``sp``.

    Thin wrapper around ``sp.Encode``; for a plain string it yields a list
    of integer token ids.
    """
    token_ids = sp.Encode(s)
    return token_ids


def decode(s):
    """Turn token ids ``s`` back into text using the processor ``sp``.

    Thin wrapper around ``sp.Decode``; it is the inverse of ``encode`` for
    the round-trip demonstrated in this notebook.
    """
    decoded_text = sp.Decode(s)
    return decoded_text


In [7]:
# Round-trip sanity check: show the token ids for a sample sentence, then
# decode those same ids back to text.
sample_sentence = "The quick brown fox jumps over the lazy dog."
sample_ids = encode(sample_sentence)
print(sample_ids)
print(decode(sample_ids))

[310, 4031, 116, 2897, 1090, 570, 285, 1172, 599, 1853, 4039, 751, 264, 314, 817, 4049, 3429, 4051]
The quick brown fox jumps over the lazy dog.


In [10]:
# Load the tokenized corpus from the on-disk cache when it exists; otherwise
# encode the raw corpus once and write the cache for later runs.
# Either way, `data` ends up bound to the 1-D tensor of token ids.
if os.path.exists("data/my_encoded_data.pt"):
    # NOTE: torch.load unpickles the file; only load caches you created.
    data = torch.load("data/my_encoded_data.pt")
else:  # shows how to create encoded_data.pt
    # `text` comes from the corpus-loading cell and `encode` from the
    # tokenizer helper cell; both must have been run before this one.
    encoded_data = torch.tensor(encode(text), dtype=torch.long)
    torch.save(encoded_data, "data/my_encoded_data.pt")
    # Previously `data` was left undefined on this branch, so the first run
    # (no cache yet) broke any later cell that used `data`.
    data = encoded_data