## -----------------------------WORD-BASED-TOKENIZATION-----------------------------------------

In [1]:
# Read the whole short story into memory as a single string.
# Use the canonical codec name "utf-8"; a misspelling such as "utf=8" is only
# accepted because Python normalizes punctuation in codec names during lookup.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Show a short preview and the total size (in characters) of the raw text.
print(f"Example of contents in our text file: \n {raw_text[:32]}\n")
print(f"Length of charachters in our raw text file = {len(raw_text)} charachters")

Example of contents in our text file: 
 I HAD always thought Jack Gisbur

Length of charachters in our raw text file = 20479 charachters


## STEP 1: Tokenization

- The goal is to tokenize and embed this text for an LLM
- Let's develop a simple tokenizer based on some simple sample text that we can then later apply to the text above
- The following regular expression splits the text on commas, periods, and whitespace, keeping the punctuation marks as separate tokens

In [2]:
# Split a sample sentence into word and punctuation tokens

import re

text = "Hello, world. This, is a test."
# The capture group keeps the delimiters (comma, period, whitespace) in the
# split output; empty and whitespace-only pieces are dropped afterwards.
result = [piece for piece in re.split(r"([,.]|\s)", text) if piece.strip()]

result

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']

In [3]:
# Widen the delimiter set with more punctuation marks (: ; _ !)

# str.strip returns an empty (falsy) string for whitespace-only pieces,
# so filter() discards them together with the empty strings.
result = list(filter(str.strip, re.split(r"([,.:;_!]|\s)", text)))
result

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']

In [4]:
# Tokenization using spaCy - simpler than hand-written regular expressions
from functools import lru_cache

import spacy


@lru_cache(maxsize=None)
def _load_pipeline(model_name="en_core_web_sm"):
    """Load a spaCy pipeline once and hand back the same object on later calls."""
    return spacy.load(model_name)


def tokens(text):
    """Split ``text`` into a list of token strings using spaCy.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list[str]
        The ``.text`` of every token, in document order.
    """
    # Loading the model is the expensive step, so the pipeline is cached
    # instead of being reloaded on every call.
    doc = _load_pipeline()(text)
    # Iterating the Doc yields every token in order, so grouping the tokens
    # by sentence first is not needed.
    return [token.text for token in doc]


print(tokens(text))



['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [5]:
# Tokenize the full raw text with the spaCy-based tokens() helper

preprocessed = tokens(raw_text)
# Number of tokens produced for the whole text
len(preprocessed)

4713

## STEP 2: Creating Unique Token IDs

 - This step builds a vocabulary for our tokens
 - Sort the unique tokens in alphabetical order, then assign each one an integer ID

In [52]:
# Collect the distinct tokens and put them in sorted order

all_unique_words = list(set(preprocessed))
all_unique_words.sort()
vocab_size = len(all_unique_words)
vocab_size

1143

In [53]:
# Build the vocabulary: each unique token maps to its position in the sorted list

vocab = dict(zip(all_unique_words, range(len(all_unique_words))))

In [54]:
# Peek at the first 11 (token, id) pairs of the vocabulary

for entry in list(vocab.items())[:11]:
    print(entry)

('\n\n', 0)
('!', 1)
('"', 2)
("'", 3)
("'d", 4)
("'re", 5)
("'s", 6)
("'ve", 7)
('(', 8)
(')', 9)
(',', 10)


## Step 3: Creating encode and decode methods

 - Encode = text >> tokens >> token id
 - Decode = token id >> tokens >> text

In [55]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary."""

    def __init__(self, vocab):
        """Store the token -> id mapping and derive the id -> token mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((token_id, token) for token, token_id in vocab.items())

    def encode(self, raw_text):
        """Tokenize ``raw_text`` and return the list of token ids.

        Raises ``KeyError`` if a token is not present in the vocabulary.
        """
        return [self.str_to_int[token] for token in tokens(raw_text)]

    def decode(self, ids):
        """Turn a sequence of token ids back into a single string."""
        spaced = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Joining with spaces puts a gap in front of punctuation; remove the
        # whitespace that precedes any of the listed punctuation marks.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

# Round trip with the first tokenizer: encode the full text to ids,
# then decode those ids back to text.
b = SimpleTokenizerV1(vocab)
encoded_text = b.encode(raw_text)
decoded_text = b.decode(encoded_text)

##  Step 4: Adding Special Case Tokens


- It's useful to add some "special" tokens for unknown words and to denote the end of a text
- Some tokenizers use special tokens to help the LLM with additional context
- Some of these special tokens are
  - `[BOS]` (beginning of sequence) marks the beginning of text
  - `[EOS]` (end of sequence) marks where the text ends (this is usually used to concatenate multiple unrelated texts, e.g., two different Wikipedia articles or two different books, and so on)
  - `[PAD]` (padding) if we train LLMs with a batch size greater than 1 (we may include multiple texts with different lengths; with the padding token we pad the shorter texts to the longest length so that all texts have an equal length)
  - `[UNK]` to represent words that are not included in the vocabulary

- Note that GPT-2 does not need any of these tokens mentioned above but only uses an `<|endoftext|>` token to reduce complexity
- The `<|endoftext|>` is analogous to the `[EOS]` token mentioned above
- GPT also uses the `<|endoftext|>` token for padding (since we typically use a mask when training on batched inputs, we would not attend to padded tokens anyway, so it does not matter what these tokens are)
- GPT-2 does not use an `[UNK]` token for out-of-vocabulary words; instead, GPT-2 uses a byte-pair encoding (BPE) tokenizer, which breaks down words into subword units, which we will discuss in a later section


In [56]:
# Sorted unique tokens, followed by the two special tokens:
# "EOT" (end of text) and "UNK" (unknown word)
unique_words = [*sorted(set(preprocessed)), "EOT", "UNK"]

In [57]:
# Sanity check: sorted() accepts any iterable, so converting the set to a
# list before sorting does not change the result.
sorted(set(preprocessed)) == sorted(list(set(preprocessed)))

True

In [58]:
# Rebuild the vocabulary so that it also contains the special tokens

vocab = {token: token_id for token_id, token in enumerate(unique_words)}
len(vocab)

1145

In [59]:
# Show the last five vocabulary entries

for pair in list(vocab.items())[-5:]:
    print(*pair)

younger 1140
your 1141
yourself 1142
EOT 1143
UNK 1144


In [60]:
# Tokenizer that falls back to the "UNK" token for out-of-vocabulary words

class SimpleTokenizerVersion2:
    """Word-level tokenizer that maps unknown tokens to the ``"UNK"`` entry."""

    def __init__(self, vocab):
        """Store the token -> id mapping and derive the id -> token mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((token_id, token) for token, token_id in vocab.items())

    def encode(self, Text):
        """Tokenize ``Text`` and return token ids.

        Tokens that are not keys of the vocabulary are replaced by ``"UNK"``
        before the id lookup.
        """
        known = self.str_to_int
        normalised = [token if token in known else "UNK" for token in tokens(Text)]
        return [known[token] for token in normalised]

    def decode(self, ids):
        """Turn a sequence of token ids back into a single string."""
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Remove the whitespace that joining inserted before punctuation marks.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [61]:
# Join two unrelated texts with the "EOT" marker between them

tokenizer = SimpleTokenizerVersion2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text12 = text1 + " EOT " + text2
text12

'Hello, do you like tea? EOT In the sunlit terraces of the palace.'

In [62]:
# Tokenize the joined text to see how the "EOT" marker is split out

tokens(text12)

['Hello',
 ',',
 'do',
 'you',
 'like',
 'tea',
 '?',
 'EOT',
 'In',
 'the',
 'sunlit',
 'terraces',
 'of',
 'the',
 'palace',
 '.']

In [63]:
# Locate the position(s) of the end-of-text marker in the token list

f = [position for position, token in enumerate(tokens(text12)) if token == "EOT"]
f

[7]

In [64]:
# Encode the joined text to check that both the end-of-text token and
# unknown words are mapped to token ids

tokenizer.encode(text12)

[1144,
 10,
 367,
 1139,
 636,
 989,
 16,
 1143,
 61,
 1003,
 971,
 998,
 733,
 1003,
 1144,
 13]

In [65]:
# Token stored at position 1144 of the vocabulary; ids were assigned with
# enumerate(), so the position equals the token id.
list(vocab.items())[1144][0]

'UNK'

In [66]:
# Decode: round-trip the joined text through encode() and decode()

tokenizer.decode(tokenizer.encode(text12))

'UNK, do you like tea? EOT In the sunlit terraces of the UNK.'