In [5]:
from transformers import AutoTokenizer

# Load the two tokenizers under comparison from their Hugging Face repo ids,
# in the same order as the names on the left-hand side.
ice_breaker_tokenizer, jonfd_tokenizer = (
    AutoTokenizer.from_pretrained(repo_id)
    for repo_id in ("Sigurdur/icebreaker", "jonfd/gpt2-igc-is")
)

In [6]:
import tiktoken

# Look up the tiktoken encodings registered for the two OpenAI model names.
gpt4_tokenizer, gpt2_tokenizer = map(
    tiktoken.encoding_for_model, ("gpt-4", "gpt-2")
)

In [20]:
# Corpus statistics for the test file: total whitespace-separated words,
# total characters (newlines included) and number of distinct words.
n_words = 0
n_chars = 0

unique_words = set()

try:
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    # NOTE(review): assumes the corpus file is UTF-8 encoded - confirm.
    with open("./test_data/laxdaela.txt", "r", encoding="utf-8") as f:
        for line in f:
            words = line.split()
            # set.update ignores duplicates, so no membership test is needed.
            unique_words.update(words)
            n_chars += len(line)
            n_words += len(words)
except FileNotFoundError:
    print("Error: file not found")

n_words, n_chars, len(unique_words)

(29403, 160440, 6064)

In [7]:
# Measure tokens per word and tokens per character for a tokenizer on a test file


def test_tokenizer(
    tokenize_text, test_path="./test_data/laxdaela.txt",
):

    n_tokens = 0
    n_words = 0
    n_chars = 0

    try:
        with open(test_path, "r") as f:
            test = f.readline()
            while test:
                tokens = tokenize_text(test)
                n_tokens += len(tokens)
                n_chars += len(test)
                n_words += len(test.split())
                test = f.readline()
    except FileNotFoundError:
        print("Error: file not found")
    except AttributeError:
        print("Error: the tokenizer is not defined")


    tokens_per_word = n_tokens / n_words
    tokens_per_char = n_tokens / n_chars

    return tokens_per_word, tokens_per_char

In [18]:
# Each entry pairs a tokenize callable with the label printed in the report.
test_cases = [
    [ice_breaker_tokenizer.tokenize, "icebreaker"],
    [jonfd_tokenizer.tokenize, "jonfd"],
    [gpt4_tokenizer.encode, "gpt-4"],
    [gpt2_tokenizer.encode, "gpt-2"],
]

for tokenize_fn, label in test_cases:
    print(f"Testing {label} tokenizer")
    tokens_per_word, tokens_per_char = test_tokenizer(tokenize_fn)
    print(f"Tokens per word: {tokens_per_word:.2f}")
    print(f"Tokens per character: {tokens_per_char:.2f}")
    print("\n")

Testing icebreaker tokenizer
Tokens per word: 1.40
Tokens per character: 0.26


Testing jonfd tokenizer
Tokens per word: 1.34
Tokens per character: 0.25


Testing gpt-4 tokenizer
Tokens per word: 2.46
Tokens per character: 0.45


Testing gpt-2 tokenizer
Tokens per word: 2.82
Tokens per character: 0.52




In [25]:
# vocab size
print("Vocab size")
# tiktoken encodings expose `n_vocab`; the Hugging Face tokenizers expose
# `vocab_size`, so each size is looked up explicitly per tokenizer.
for label, size in (
    ("gpt-2", gpt2_tokenizer.n_vocab),
    ("gpt-4", gpt4_tokenizer.n_vocab),
    ("icebreaker", ice_breaker_tokenizer.vocab_size),
    ("jonfd", jonfd_tokenizer.vocab_size),
):
    print(label, size)

Vocab size
gpt-2 50257
gpt-4 100277
icebreaker 32000
jonfd 51000


In [9]:
# Generates a dictionary mapping unique texts/tokens with their corresponding color
def get_token_colors(tokens):
    """Map every distinct token to a colour string.

    Args:
        tokens: Iterable of hashable tokens; duplicates are allowed.

    Returns:
        A dict from token to the colour produced by ``number_to_color`` for
        the token's position among the distinct tokens, in order of first
        appearance.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    # Going through set() instead makes the index of each token (and so its
    # colour) depend on set iteration order, which is not stable across
    # interpreter runs for strings.
    unique_tokens = dict.fromkeys(tokens)
    token_colors = {
        token: number_to_color(index) for index, token in enumerate(unique_tokens)
    }
    return token_colors


# Displays the tokens with the unique color
def display_colored_tokens(tokens, token_colors):
    """Render tokens as coloured HTML spans in the notebook output.

    Args:
        tokens: Iterable of tokens to show, in display order.
        token_colors: Mapping from token to a CSS colour value; it must
            contain every token in ``tokens``.

    Raises:
        KeyError: If a token has no entry in ``token_colors``.
    """
    # Imported locally because another cell of this notebook rebinds the
    # module-level name `html` to a string.
    from html import escape

    # Escape token text so characters such as "<" or "&" are shown
    # literally instead of being interpreted as markup.
    spans = [
        f'<span style="background-color: {token_colors[token]}; padding: 2px; color: black">{escape(str(token))}</span> '
        for token in tokens
    ]

    display(HTML("".join(spans)))


# Generates a unique HSL color based on the index of the token
def number_to_color(number):
    """Derive a deterministic pastel-range HSL colour string from a number.

    The number is first scrambled with a multiply-add-modulo step, then the
    hue is taken from the fractional part of the scrambled value times the
    golden ratio conjugate. Saturation and lightness share the same offset
    in 0..20, giving saturation 60-80% and lightness 70-90%.

    Args:
        number: Index to convert (the callers in this notebook pass ints).

    Returns:
        A CSS colour of the form ``"hsl(<hue>, <s>%, <l>%)"``.
    """
    multiplier, increment, modulus = 1664525, 1013904223, 2**32
    golden_ratio_conjugate = 0.618033988749895

    scrambled = (multiplier * number + increment) % modulus

    # The fractional part is scaled to [0, 360) degrees.
    hue = ((scrambled * golden_ratio_conjugate) % 1) * 360

    # The same remainder drives both components, so lightness is always
    # saturation + 10.
    offset = scrambled % 21
    saturation = 60 + offset
    lightness = 70 + offset

    return "hsl({}, {}%, {}%)".format(hue, saturation, lightness)

In [15]:
# Collect whole lines from the start of the corpus until the sample is
# longer than `max_length` characters. Because lines are never cut, the
# final sample can exceed `max_length`.
text = ""
max_length = 1024
# Decode explicitly as UTF-8 instead of relying on the platform default.
# NOTE(review): assumes the corpus file is UTF-8 encoded - confirm.
with open("./test_data/laxdaela.txt", "r", encoding="utf-8") as f:
    for line in f:
        text += line

        if len(text) > max_length:
            break
    

In [16]:
from tokenizers.tools import EncodingVisualizer
from bs4 import BeautifulSoup as bs
from transformers import AutoTokenizer
from IPython.display import HTML, display

# Tokenizer to visualise: a local directory here; a Hugging Face repo id is
# passed to the same call elsewhere in this notebook.
model = "./new_tokenizer"
tokenizer = AutoTokenizer.from_pretrained(model)

# Use the public `backend_tokenizer` property rather than the private
# `_tokenizer` attribute to reach the underlying `tokenizers` object.
viz = EncodingVisualizer(tokenizer.backend_tokenizer)
html = viz(text, default_to_notebook=False)

# Parse the visualizer's HTML and take the text of every <span> element.
soup = bs(html, "html.parser")
spans = soup.find_all("span")

# extract text
tokens = [span.text for span in spans]
token_colors = get_token_colors(tokens)

display(HTML(f"<p ><b>Model:</b> {model}</p><p><b>Tokens:</b>"))
display_colored_tokens(tokens, token_colors)

# Calculate and display the number of tokens and characters
num_tokens = len(tokens)
num_chars = len(text)
print(f"Number of Tokens: {num_tokens}")
print(f"Number of Characters: {num_chars}")

print("=" * 50)

Number of Tokens: 366
Number of Characters: 1450
