# Tokenization

## Import required packages

In [2]:
import numpy as np
import torch
import re
import tiktoken

# Load the data

In [3]:
# read the entire corpus into one string
# NOTE(review): no encoding is passed, so the platform default is used — confirm data.txt's encoding
with open("data.txt", mode="r") as corpus_file:
    raw_text = corpus_file.read()

In [4]:
# preview the first 100 characters of the loaded text
raw_text[:100]

'Alice was beginning to get very tired of sitting by her sister\non the bank, and of having nothing to'

### Create the tokenizer

In [5]:
class WordTokenizer:
    """Word-level tokenizer whose vocabulary is built from a reference text.

    The text is split on a delimiter pattern, empty pieces are discarded, and
    every distinct word is assigned an integer id following the sorted order
    of the vocabulary.

    Attributes:
        vocab: sorted list of the distinct words found in the text.
        stoi: mapping from word to token id.
        itos: mapping from token id to word.
    """

    # Default delimiters: comma, period, question mark, pipe, exclamation
    # mark, double quote, single quote and space.
    # NOTE(review): newlines and tabs are NOT delimiters here, so words that
    # are separated only by a line break stay fused in one token (for example
    # "sister\non"). Pass a pattern such as r'[,.?|!"\'\s]' to split on all
    # whitespace; the default is kept for backward compatibility.
    DEFAULT_SPLIT_PATTERN = r'[,.?|!"\' ]'

    def __init__(self, raw_text, split_pattern=DEFAULT_SPLIT_PATTERN):
        """Build the vocabulary and lookup tables from ``raw_text``.

        Args:
            raw_text: text the vocabulary is extracted from.
            split_pattern: regular expression matching the delimiters between
                words. Defaults to ``DEFAULT_SPLIT_PATTERN``.
        """
        # compile once so tokenize() can be reused on other texts
        self._splitter = re.compile(split_pattern)

        # unique words, sorted so that ids are deterministic
        self.vocab = sorted(set(self.tokenize(raw_text)))

        # build the lookup dictionaries in both directions
        self.stoi = {word: idx for idx, word in enumerate(self.vocab)}
        self.itos = dict(enumerate(self.vocab))

    def tokenize(self, text):
        """Split ``text`` into words using this tokenizer's delimiter pattern.

        Surrounding whitespace is stripped from every piece and pieces that
        end up empty are dropped.

        Returns:
            list of words, in the order they appear in ``text``.
        """
        stripped = (piece.strip() for piece in self._splitter.split(text))
        return [piece for piece in stripped if piece]

    def encode(self, words):
        """Convert an iterable of words to their token ids.

        Raises:
            KeyError: if a word is not part of the vocabulary.
        """
        return [self.stoi[word] for word in words]

    def decode(self, encoded_value):
        """Convert an iterable of token ids back to a space-joined string.

        Raises:
            KeyError: if an id is not part of the vocabulary.
        """
        return " ".join(self.itos[num] for num in encoded_value)

    def print_info(self):
        """Print the word-to-id and id-to-word dictionaries."""
        print(self.stoi)
        print(self.itos)

    def n_vocab(self):
        """Return the number of distinct words in the vocabulary."""
        return len(self.vocab)

In [6]:
# build the word-level tokenizer from the loaded corpus
tokenizer = WordTokenizer(raw_text=raw_text)

# report how many distinct words the vocabulary holds
print(f"size of vocab = {tokenizer.n_vocab()}")

size of vocab = 4677


In [7]:
# split the corpus on the delimiter set, strip each piece and drop the
# empty ones, then map every remaining word to its token id
pieces = (piece.strip() for piece in re.split(r'[,.?|!"\' ]', raw_text))
words = [piece for piece in pieces if piece]
tokens = tokenizer.encode(words)

In [8]:
# show the first ten token ids
tokens[:10]

[70, 4436, 1251, 4175, 2074, 4387, 4172, 2968, 3676, 1360]

In [9]:
# turn the first ten token ids back into text to check the round trip
tokenizer.decode(tokens[:10])

'Alice was beginning to get very tired of sitting by'

# Use tiktoken to create the tokens

## tiktoken with the GPT-2 encoding

In [18]:
# load tiktoken's "gpt2" encoding
# NOTE: this rebinds `tokenizer`, which held the WordTokenizer instance before
tokenizer = tiktoken.get_encoding("gpt2")

# report the size of its vocabulary
print(f"vocab size gpt2= {tokenizer.n_vocab}")

vocab size gpt2= 50257


In [19]:
# encode the full text with the current `tokenizer`
token = tokenizer.encode(raw_text)
# show the first ten token ids
token[:10]

[44484, 373, 3726, 284, 651, 845, 10032, 286, 5586, 416]

In [20]:
# decode the first ten token ids back into text
tokenizer.decode(token[:10])

'Alice was beginning to get very tired of sitting by'

## tiktoken with the GPT-4o encoding

In [25]:
# look up the tiktoken encoding associated with the "gpt-4o" model name
tokenizer2 = tiktoken.encoding_for_model("gpt-4o")
# print the vocab size
print(f" vocab size of gpt-4o = {tokenizer2.n_vocab}")

 vocab size of gpt-4o = 200019


In [26]:
# encode the full text with `tokenizer2`
token2 = tokenizer2.encode(raw_text)
# show the first ten token ids
token2[:10]

[100151, 673, 10526, 316, 717, 1869, 25920, 328, 17379, 656]

In [27]:
# decode the first ten token ids back into text
tokenizer2.decode(token2[:10])

'Alice was beginning to get very tired of sitting by'