## Byte-Pair Encoding (BPE)

In [None]:
# word_based_tokenizer recap

import spacy
import re
def create_vocab(text):
    """Build a token-to-id vocabulary from an iterable of tokens.

    The unique tokens are sorted and numbered consecutively from 0. The
    special tokens ``"unk"`` and ``"endoftext"`` are appended after the
    sorted tokens, unless the input already contains them, so every token
    appears exactly once and the ids stay contiguous.

    Args:
        text: Iterable of hashable, mutually comparable tokens
            (typically a list of strings).

    Returns:
        A dict mapping each token to its integer id.
    """
    unique_words = sorted(set(text))
    already_present = set(unique_words)
    # Skip special tokens that are already in the input; adding them again
    # would create duplicate entries and leave a gap in the id range.
    unique_words.extend(
        special for special in ("unk", "endoftext") if special not in already_present
    )
    return {word: idx for idx, word in enumerate(unique_words)}
class SimpleWordBasedTokenizer:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Attributes are kept as plain dicts:
        str_to_int: the vocabulary passed in (token -> id).
        int_to_str: the inverse mapping (id -> token), built in ``__init__``.
    """

    def __init__(self, vocab):
        """Store ``vocab`` (token -> id) and build the inverse lookup."""
        self.str_to_int = vocab
        self.int_to_str = {ids: word for word, ids in vocab.items()}
        # spaCy pipeline, loaded lazily on the first preprocess() call.
        self._nlp = None

    def preprocess(self, text):
        """Split ``text`` into a flat list of token strings using spaCy.

        The ``en_core_web_sm`` pipeline is loaded once per instance and
        reused, instead of being reloaded on every call.
        """
        # getattr keeps this working for instances created without __init__.
        nlp = getattr(self, "_nlp", None)
        if nlp is None:
            nlp = spacy.load("en_core_web_sm")
            self._nlp = nlp
        doc = nlp(text)
        return [token.text for sentence in doc.sents for token in sentence]

    def encode(self, tokens):
        """Map each token to its id, using the ``"unk"`` id for unknown tokens.

        Args:
            tokens: Iterable of token strings (e.g. the output of ``preprocess``).

        Returns:
            A list of integer ids, one per input token.

        Raises:
            KeyError: If a token is not in the vocabulary and the vocabulary
                has no ``"unk"`` entry to fall back on.
        """
        unk_id = self.str_to_int.get("unk")
        ids = []
        for token in tokens:
            if token in self.str_to_int:
                ids.append(self.str_to_int[token])
            elif unk_id is not None:
                ids.append(unk_id)
            else:
                raise KeyError(
                    f"token {token!r} is not in the vocabulary and no 'unk' entry exists"
                )
        return ids

    def decode(self, ids):
        """Turn a sequence of ids back into text.

        Tokens are joined with single spaces, then any whitespace directly
        before one of ``, : ; ! ? ( ) .`` is removed.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # The replacement must be a raw string: a plain "\1" is the control
        # character \x01, not a backreference to the captured punctuation.
        return re.sub(r'\s+([,:;!?().])', r"\1", text)

<br>
&nbsp;

## Using BPE from `tiktoken`

In [15]:
# Create the BPE tokenizer from tiktoken's "gpt2" encoding.
import tiktoken
print(f"Tiktoken version = {tiktoken.__version__}")
BPE_tokenizer = tiktoken.get_encoding("gpt2")
# Bare expression: as the last line of the cell, its repr is shown as the cell output.
BPE_tokenizer

Tiktoken version = 0.9.0


<Encoding 'gpt2'>

## Testing the tokenizer

In [23]:
# NOTE: the two adjacent string literals below are concatenated with no
# separator, so the resulting text contains "terracesof" (no space).
text = (
    "Hello do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace"
)
# "<|endoftext|>" appears literally in the text, so it is passed in
# allowed_special to permit encoding it as a special token.
token_ids = BPE_tokenizer.encode(text, allowed_special={"<|endoftext|>"})
# Show only the first five ids, then compare token count with character count.
print(token_ids[:5])
print(f"The length of token ids = {len(token_ids)}\nThe length of original text = {len(text)}")
# Round-trip: decode the ids back into a string.
decoded_text = BPE_tokenizer.decode(token_ids)
print(f"Decoded text:\n\t{decoded_text}")

[15496, 466, 345, 588, 8887]
The length of token ids = 18
The length of original text = 78
Decoded text:
	Hello do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace


In [26]:
# Test the tokenizer on a short made-up string to show how it handles
# text that is not made of ordinary words: encode it, then decode it back.
text = "AKwirw ier"
integers = BPE_tokenizer.encode(text)
print(integers)
# Decode the ids back into a string to check the round trip.
strings = BPE_tokenizer.decode(integers)
print(strings)

[10206, 86, 343, 86, 220, 959]
AKwirw ier
