<br>

# 2) Tokenizing text

Packages that are being used in this notebook:

In [None]:
from importlib.metadata import version


# Report the installed version of each package this notebook relies on.
for label, package in (
    ("torch version:", "torch"),
    ("tiktoken version:", "tiktoken"),
):
    print(label, version(package))

- Loading the raw text to work with
- [The Verdict by Edith Wharton](https://en.wikisource.org/wiki/The_Verdict) is a public domain short story

In [None]:
# Read the whole short story into memory as one string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Report the corpus size in characters and preview its first 99 characters.
num_characters = len(raw_text)
print("Total number of character:", num_characters)
print(raw_text[0:99])

- Using a regular expression to split the text on whitespace and punctuation

In [None]:
import re

# Split on punctuation, double dashes, and whitespace. The capturing group
# makes re.split keep the delimiters, so punctuation survives as tokens.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# Drop empty strings and whitespace-only pieces. Keeping the whitespace would
# inflate the token count and add vocabulary entries that
# SimpleTokenizerV1.encode never produces, because it strips whitespace too.
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:38])

In [None]:
# Count the entries produced by the regex split.
num_tokens = len(preprocessed)
print("Number of tokens:", num_tokens)

<br>

# 2.1 Converting tokens into token IDs

- Converting the text tokens into token IDs that we can process via embedding layers later
- First step: Building a vocabulary that contains the unique words from the input text.

In [None]:
# Deduplicate the tokens and sort them so every run assigns the same IDs.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)

print(vocab_size)

# Pair each token with its position in the sorted list: token -> integer ID.
vocab = dict(zip(all_words, range(vocab_size)))

- The first 50 entries in this vocabulary:

In [None]:
# Preview the first 50 (token, id) pairs of the vocabulary.
# The limit is checked before printing so that exactly 50 entries
# (indices 0-49) are shown; a shorter vocabulary is printed in full.
for i, item in enumerate(vocab.items()):
    if i >= 50:
        break
    print(item)

- The tokenizer class

In [None]:
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, double dashes, and a fixed set of punctuation
    characters. Every resulting token must already be present in the
    vocabulary; there is no handling of unknown tokens.
    """

    # Delimiters used for splitting. This must match the pattern that was used
    # to tokenize the raw text when the vocabulary was built (it includes ':'
    # and ';'); otherwise a piece like "word;" stays glued together and cannot
    # be found in the vocabulary.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace that " ".join() inserts in front of punctuation when decoding.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Store the token -> ID mapping and build the inverse ID -> token map.

        Args:
            vocab: dict mapping each token string to its integer ID.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token IDs.

        Args:
            text: the string to tokenize.

        Returns:
            A list with one integer ID per token, in order of appearance.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        pieces = self._SPLIT_PATTERN.split(text)
        # Whitespace only separates tokens; it is never a token itself.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        ids = [self.str_to_int[token] for token in tokens]
        return ids

    def decode(self, ids):
        """Convert a sequence of token IDs back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed. The original spacing is not recoverable: the
        space before opening quotes and parentheses is removed as well.

        Args:
            ids: iterable of integer token IDs.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: if an ID is not present in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # Re-attach punctuation to the preceding token.
        text = self._SPACE_BEFORE_PUNCT.sub(r'\1', text)
        return text

- The tokenizer encodes text to integers
- These integers can later be embedded and used as input to the LLM

In [None]:
# Build the tokenizer on top of the vocabulary derived from the short story.
tokenizer = SimpleTokenizerV1(vocab=vocab)

# Sample sentence to encode; each of its tokens has to exist in `vocab`,
# otherwise the dictionary lookup inside encode() fails.
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text=text)
print(ids)

- Decoding the integers back into text

In [None]:
# Jupyter only echoes a cell's final expression, so the first result is
# printed explicitly; as a bare expression it would be computed and discarded.
print(tokenizer.decode(ids))
# Round trip: encode the sample text and decode it again in a single step.
tokenizer.decode(tokenizer.encode(text))

<br>

# 2.2 BytePair encoding

- GPT-2 used BytePair encoding (BPE) as its tokenizer
- [tiktoken](https://github.com/openai/tiktoken) is OpenAI's open-source BPE tokenizer library.
- `tiktoken` is approx. 3x faster than the original tokenizer and 6x faster than an equivalent tokenizer in Hugging Face


In [None]:
# pip install tiktoken

In [None]:
# Import the submodule explicitly: a plain `import importlib` does not load
# `importlib.metadata`, so the attribute access below would only work if some
# earlier cell happened to import it already.
import importlib.metadata
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

In [None]:
# Rebind `tokenizer` (until now the SimpleTokenizerV1 instance) to the encoding
# that tiktoken provides under the name "gpt2".
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
# Adjacent string literals are concatenated with nothing in between, so the
# first piece ends with a space to keep "terraces" and "of" separate words.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces "
    "of someunknownPlace."
)

# The text contains the "<|endoftext|>" marker, so it is listed in
# allowed_special (see tiktoken's Encoding.encode documentation for how
# special tokens in the input are handled).
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

In [None]:
# Convert the token IDs in `integers` back into text and show the result.
strings = tokenizer.decode(integers)

print(strings)

In [None]:
# Encode a string that is presumably meant as an unknown, made-up word; the
# resulting ID list is displayed as the cell's output.
tokenizer.encode("Akwirw ier", allowed_special={"<|endoftext|>"})

<br>

# 2.3 Data sampling with a sliding window

- Creating the data loader for LLMs


In [None]:
from supplementary import create_dataloader_v1


# Build a data loader over the raw text. stride == max_length here;
# presumably that yields non-overlapping windows -- confirm against
# supplementary.create_dataloader_v1.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

# Pull only the first batch and unpack it into inputs and targets.
data_iter = iter(dataloader)
first_batch = next(data_iter)
inputs, targets = first_batch
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)