#### When calling Tokenizer.encode or Tokenizer.encode_batch, the input text(s) go through the following pipeline:

    ##### normalization
    ##### pre-tokenization
    ##### model
    ##### post-processing 

In [14]:
# from tokenizers import Tokenizer
# tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")

Exception: No such file or directory (os error 2)

In [1]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents

# Chain two normalization steps, applied in list order: NFD first, then
# StripAccents. The next cell shows the effect on an accented sentence.
normalizer = normalizers.Sequence(
    [
        NFD(),
        StripAccents(),
    ]
)

In [2]:
# Apply the normalizer directly to a raw string to inspect what it does;
# the recorded result is the same sentence with the accents removed.
normalizer.normalize_str(
    "Héllò hôw are ü?"
)

'Hello how are u?'

In [4]:
# When building a Tokenizer, you can customize its normalizer by just changing the corresponding attribute:

## pretokenization


Pre-tokenization is the act of splitting a text into smaller objects that give an upper bound to what your tokens will be at the end of training. A good way to think of this is that the pre-tokenizer will split your text into “words” and then, your final tokens will be parts of those words.

In [6]:
from tokenizers.pre_tokenizers import BertPreTokenizer, Whitespace

# Whitespace pre-tokenizer: each piece is returned together with its
# (start, end) character offsets in the input, and punctuation marks come
# back as pieces of their own.
pre_tokenizer = Whitespace()
pre_tokenizer.pre_tokenize_str(
    "Hello! How are you? I'm fine, thank you."
)
# Expected:
# [("Hello", (0, 5)), ("!", (5, 6)), ("How", (7, 10)), ("are", (11, 14)), ("you", (15, 18)),
#  ("?", (18, 19)), ("I", (20, 21)), ("'", (21, 22)), ('m', (22, 23)), ("fine", (24, 28)),
#  (",", (28, 29)), ("thank", (30, 35)), ("you", (36, 39)), (".", (39, 40))]

[('Hello', (0, 5)),
 ('!', (5, 6)),
 ('How', (7, 10)),
 ('are', (11, 14)),
 ('you', (15, 18)),
 ('?', (18, 19)),
 ('I', (20, 21)),
 ("'", (21, 22)),
 ('m', (22, 23)),
 ('fine', (24, 28)),
 (',', (28, 29)),
 ('thank', (30, 35)),
 ('you', (36, 39)),
 ('.', (39, 40))]

In [7]:
# BertPreTokenizer on the same sentence: for this input the recorded pieces
# and offsets are identical to what the Whitespace pre-tokenizer produced.
pre_tokenizer = BertPreTokenizer()
pre_tokenizer.pre_tokenize_str(
    "Hello! How are you? I'm fine, thank you."
)
# Expected:
# [("Hello", (0, 5)), ("!", (5, 6)), ("How", (7, 10)), ("are", (11, 14)), ("you", (15, 18)),
#  ("?", (18, 19)), ("I", (20, 21)), ("'", (21, 22)), ('m', (22, 23)), ("fine", (24, 28)),
#  (",", (28, 29)), ("thank", (30, 35)), ("you", (36, 39)), (".", (39, 40))]

[('Hello', (0, 5)),
 ('!', (5, 6)),
 ('How', (7, 10)),
 ('are', (11, 14)),
 ('you', (15, 18)),
 ('?', (18, 19)),
 ('I', (20, 21)),
 ("'", (21, 22)),
 ('m', (22, 23)),
 ('fine', (24, 28)),
 (',', (28, 29)),
 ('thank', (30, 35)),
 ('you', (36, 39)),
 ('.', (39, 40))]

In [8]:
from tokenizers.pre_tokenizers import WhitespaceSplit

# WhitespaceSplit cuts on whitespace only, so punctuation stays attached to
# the neighbouring word (e.g. "Hello!" and "you?" in the recorded output).
pre_tokenizer = WhitespaceSplit()
pre_tokenizer.pre_tokenize_str(
    "Hello! How are you? I'm fine, thank you."
)

[('Hello!', (0, 6)),
 ('How', (7, 10)),
 ('are', (11, 14)),
 ('you?', (15, 19)),
 ("I'm", (20, 23)),
 ('fine,', (24, 29)),
 ('thank', (30, 35)),
 ('you.', (36, 40))]

In [9]:
# Python's built-in whitespace split, for comparison: it yields the same
# pieces as WhitespaceSplit, only without the character offsets.
str.split("Hello! How are you? I'm fine, thank you.")

['Hello!', 'How', 'are', 'you?', "I'm", 'fine,', 'thank', 'you.']

In [10]:
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Digits

# Pre-tokenizers can be chained: Whitespace runs first, then Digits.
# With individual_digits=True every digit becomes its own piece
# ("911" -> "9", "1", "1").
pre_tokenizer = pre_tokenizers.Sequence(
    [
        Whitespace(),
        Digits(individual_digits=True),
    ]
)
pre_tokenizer.pre_tokenize_str("Call 911!")
# Expected:
# [("Call", (0, 4)), ("9", (5, 6)), ("1", (6, 7)), ("1", (7, 8)), ("!", (8, 9))]

[('Call', (0, 4)), ('9', (5, 6)), ('1', (6, 7)), ('1', (7, 8)), ('!', (8, 9))]

In [12]:
# Same chain, but with individual_digits=False the run of digits is kept
# together as a single piece ("911").
pre_tokenizer = pre_tokenizers.Sequence(
    [
        Whitespace(),
        Digits(individual_digits=False),
    ]
)
pre_tokenizer.pre_tokenize_str("Call 911!")
# Expected:
# [('Call', (0, 4)), ('911', (5, 8)), ('!', (8, 9))]

[('Call', (0, 4)), ('911', (5, 8)), ('!', (8, 9))]

# Model

In [13]:
# Once the input texts are normalized and pre-tokenized, the Tokenizer applies the model on the pre-tokens. This is the part of the pipeline that needs training on your corpus (or that has been trained if you are using a pretrained tokenizer).

# The role of the model is to split your “words” into tokens, using the rules it has learned. It’s also responsible for mapping those tokens to their corresponding IDs in the vocabulary of the model.

# This model is passed along when initializing the Tokenizer so you already know how to customize this part. Currently, the 🤗 Tokenizers library supports:

# models.BPE
# models.Unigram
# models.WordLevel
# models.WordPiece
# For more details about each model and its behavior, you can check here

# Post processing

In [None]:
# Post-processing is the last step of the tokenization pipeline, to perform any additional transformation to the Encoding before it’s returned, like adding potential special tokens.

# As we saw in the quick tour, we can customize the post processor of a Tokenizer by setting the corresponding attribute. For instance, here is how we can post-process to make the inputs suitable for the BERT model:

In [18]:
from tokenizers import Tokenizer
import tokenizers

In [20]:
# Wrap a WordPiece model in a Tokenizer. No vocabulary is passed to the
# model here, so this instance is only used to demonstrate configuration.
tokenizer = Tokenizer(
    tokenizers.models.WordPiece()
)

In [21]:
from tokenizers.processors import TemplateProcessing

# BERT-style post-processing templates:
#   single sentence -> [CLS] A [SEP]
#   sentence pair   -> [CLS] A [SEP] B [SEP]
# special_tokens pairs each special token with the id to emit for it.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
    ],
)

## All together: BERT tokenizer from scratch

In [24]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

In [25]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents

# Start from a WordPiece model created without a vocabulary (it is trained
# later in the notebook); "[UNK]" is declared as its unknown token.
bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

# Normalization: NFD, lower-casing and accent stripping, in that order.
bert_tokenizer.normalizer = normalizers.Sequence(
    [
        NFD(),
        Lowercase(),
        StripAccents(),
    ]
)

In [26]:
# Pre-tokenization: use the Whitespace pre-tokenizer demonstrated earlier
# in this notebook to split the normalized text into word-level pieces.
from tokenizers.pre_tokenizers import Whitespace

bert_tokenizer.pre_tokenizer = Whitespace()

In [27]:
from tokenizers.processors import TemplateProcessing

# Post-processing: wrap a single sentence as [CLS] A [SEP] and a pair as
# [CLS] A [SEP] B [SEP].
# NOTE(review): the ids 1 and 2 presumably have to match the ids that
# "[CLS]" and "[SEP]" receive in the trained vocabulary - confirm after training.
bert_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)

In [45]:
# !pip install pandas
# !pip install pyarrow

In [46]:
# import pandas as pd

# # Step 1: Read the Parquet file
# df = pd.read_parquet('res/validation-00000-of-00001.parquet', engine='pyarrow')  # or engine='fastparquet'

# # Step 2: Convert the DataFrame to a string
# df_string = df.to_string(index=False)

# # Step 3: Write the string to a text file
# with open('res/wiki.txt', 'w') as f:
#     f.write(df_string)

# print("Data written to output.txt")

Data written to output.txt


In [47]:
from tokenizers.trainers import WordPieceTrainer

# "[CLS]" and "[SEP]" are listed at positions 1 and 2, the ids used by the
# TemplateProcessing post-processor configured on bert_tokenizer.
# NOTE(review): this assumes the trainer assigns ids in list order - confirm.
trainer = WordPieceTrainer(
    vocab_size=30522,
    special_tokens=[
        "[UNK]",
        "[CLS]",
        "[SEP]",
        "[PAD]",
        "[MASK]",
    ],
)

# Training corpus. Alternatives that were tried previously:
#   files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
#   files = ["res/corpus.txt"]   (saved with bert_tokenizer.save("./bert-corpus.json"))
files = ["res/wiki.txt"]

# Train in place, then write the trained tokenizer to disk.
bert_tokenizer.train(files, trainer)
bert_tokenizer.save("./bert-wiki.json")






In [51]:
# Encode with the trained `bert_tokenizer`. The `tokenizer` object defined
# earlier wraps a WordPiece model that was created without a vocabulary, so
# it cannot produce ids for this text.
output = bert_tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.ids)

# The ids depend on the vocabulary learned from the training corpus, so
# decode the ids that were just produced rather than a hard-coded list.
# Without a decoder set, the pieces are expected to come back joined by
# spaces and lower-cased by the normalizer, roughly:
# "hello , y ' all ! how are you ?"  (exact pieces depend on the trained vocabulary)
bert_tokenizer.decode(output.ids)

In [None]:
from tokenizers import decoders

# Encode the sentence whose decoded form is shown below, so that this cell
# does not depend on an `output` left over from an earlier cell.
output = bert_tokenizer.encode("Welcome to the 🤗 Tokenizers library.")

# The WordPiece decoder merges "##"-prefixed continuation pieces back into
# whole words when decoding.
bert_tokenizer.decoder = decoders.WordPiece()
bert_tokenizer.decode(output.ids)
# Expected (exact result depends on the trained vocabulary):
# "welcome to the tokenizers library."

In [59]:
# Look up the id that the trained tokenizer assigns to the token "high".
print(
    bert_tokenizer.token_to_id("high")
)

408


In [55]:
# Print the string form of the whole tokenizer returned by to_str().
# NOTE(review): presumably the same serialized content that save() writes,
# so the output can be very long - confirm before sharing the notebook.
print(
    bert_tokenizer.to_str()
)

