In [1]:
# !pip install tiktoken

In [16]:
import os

# Every .txt file found anywhere beneath ./processed_data, in os.walk order.
all_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk("./processed_data")
    for name in names
    if name.endswith(".txt")
]

# Cell output: number of files collected.
len(all_paths)

242553

In [17]:
from datasets import load_dataset

# Build the "train" split from the collected .txt files with the 'text'
# loader, in streaming mode.
dataset = load_dataset(
    "text",
    data_files=all_paths,
    split="train",
    streaming=True,
)

Resolving data files:   0%|          | 0/242553 [00:00<?, ?it/s]

In [18]:
from tqdm.auto import tqdm

# Single shared iterator over the streaming dataset; batch_iterator() pulls
# its examples from here.
iter_dataset = iter(dataset)

# Upper bound on how many examples batch_iterator() requests (it also sets the
# tqdm total). Derived from the collected file list rather than a hard-coded
# copy of its size, so it cannot go stale when the corpus changes.
# NOTE(review): this counts *files*; the 'text' loader presumably yields one
# example per line, so confirm this cap covers as much of the corpus as intended.
length = len(all_paths)
def batch_iterator(batch_size=10):
    """Yield lists of up to ``batch_size`` text strings from ``iter_dataset``.

    Runs at most ``len(range(0, length, batch_size))`` steps, shown as a tqdm
    progress bar. Each step pulls examples from the module-level
    ``iter_dataset`` and collects their ``'text'`` field.

    If the underlying iterator runs out early, the final (possibly shorter)
    batch is yielded and the generator stops cleanly. A bare ``next()`` here
    would instead surface as ``RuntimeError`` inside a generator (PEP 479).

    Args:
        batch_size: Number of examples to put in each yielded list.

    Yields:
        list[str]: The ``'text'`` values of the next batch of examples.
    """
    exhausted = object()  # sentinel: distinguishes "no more data" from any real example
    for _ in tqdm(range(0, length, batch_size)):
        batch = []
        for _slot in range(batch_size):
            example = next(iter_dataset, exhausted)
            if example is exhausted:
                break
            batch.append(example['text'])
        if batch:
            yield batch
        if len(batch) < batch_size:
            # The source ran dry during this step; nothing left to request.
            return

In [19]:
from transformers import AutoTokenizer
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

# Invert the byte -> unicode-character table from the GPT-2 tokenizer module;
# the unicode characters (the keys of the inverted map) seed the new
# tokenizer's initial alphabet.
byte_to_unicode_map = bytes_to_unicode()
unicode_to_byte_map = {char: byte for byte, char in byte_to_unicode_map.items()}
base_vocab = list(unicode_to_byte_map)

# Start from the pretrained "gpt2" tokenizer and retrain it on our batches.
base_tokenizer = AutoTokenizer.from_pretrained("gpt2")
new_tokenizer = base_tokenizer.train_new_from_iterator(
    batch_iterator(),
    vocab_size=32000,
    min_frequency=2,
    initial_alphabet=base_vocab,
)

# Write the trained tokenizer to ./new_tokenizer; the saved paths are the cell output.
new_tokenizer.save_pretrained("new_tokenizer")

  0%|          | 0/24256 [00:00<?, ?it/s]






('new_tokenizer/tokenizer_config.json',
 'new_tokenizer/special_tokens_map.json',
 'new_tokenizer/vocab.json',
 'new_tokenizer/merges.txt',
 'new_tokenizer/added_tokens.json',
 'new_tokenizer/tokenizer.json')

In [None]:
import sentencepiece as spm
import os

# Comma-joined list of every training file, handed to the trainer as `input`.
data = ",".join(all_paths)

options = dict(
    input=data,
    model_prefix='tokenicer',
    model_type='bpe',
    vocab_size=5000,
    normalization_rule_name='identity',
    remove_extra_whitespaces=False,
    # Size options are integer-valued, so they are written as ints rather
    # than float expressions such as 2*1e6 or 1e5.
    input_sentence_size=2_000_000,
    max_sentence_length=4192,
    seed_sentencepiece_size=100_000,
    shuffle_input_sentence=True,
    # rare word treatment
    character_coverage=0.9995,
    byte_fallback=True,
    # merge rules
    split_digits=True,
    split_by_unicode_script=True,
    split_by_whitespace=True,
    split_by_number=True,
    max_sentencepiece_length=16,
    add_dummy_prefix=True,
    allow_whitespace_only_pieces=True,
    # special tokens
    unk_id=0,  # the unk token MUST exist
    bos_id=1,
    eos_id=2,
    pad_id=-1,
    # system
    num_threads=os.cpu_count(),  # use all cores
)

# Train the model; output files are named after model_prefix ('tokenicer').
spm.SentencePieceTrainer.train(**options)

In [6]:
def get_token_colors(tokens):
    """Map each distinct token to a color string.

    Distinct tokens are numbered in order of first appearance and each number
    is turned into a color with ``number_to_color``. First-occurrence order
    (rather than ``set`` iteration order, which depends on string hashing)
    keeps the token -> color assignment identical across kernel restarts.

    Args:
        tokens: Iterable of hashable tokens; duplicates are allowed.

    Returns:
        dict: ``{token: color}`` with one entry per distinct token.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_tokens = dict.fromkeys(tokens)
    return {token: number_to_color(index) for index, token in enumerate(unique_tokens)}


def display_colored_tokens(tokens, token_colors):
    """Render the tokens in order as colored HTML spans in the notebook.

    Each token becomes one ``<span>`` whose background is
    ``token_colors[token]``, followed by a single space. Token text is
    HTML-escaped so tokens containing ``<``, ``>`` or ``&`` are shown
    literally instead of being interpreted as markup.

    Args:
        tokens: Sequence of tokens to show.
        token_colors: Mapping from token to CSS color string; must contain
            every token in ``tokens``.

    Raises:
        KeyError: If a token has no entry in ``token_colors``.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    spans = [
        f'<span style="background-color: {token_colors[token]}; padding: 2px; color: black">{escape(str(token))}</span> '
        for token in tokens
    ]
    display(HTML("".join(spans)))

def number_to_color(number):
    """Turn a token index into an ``hsl(...)`` CSS color string.

    The index is scrambled with one linear-congruential step
    (``(1664525 * number + 1013904223) mod 2**32``). The hue is the fractional
    part of the scrambled value times 0.618033988749895, scaled to degrees.
    Saturation and lightness are 60 and 70 plus the scrambled value modulo 21.

    Args:
        number: Index to derive the color from.

    Returns:
        str: A string of the form ``"hsl(<hue>, <s>%, <l>%)"``.
    """
    scrambled = (1664525 * number + 1013904223) % (2 ** 32)

    hue = ((scrambled * 0.618033988749895) % 1) * 360
    offset = scrambled % 21  # shared by saturation and lightness
    saturation = 60 + offset
    lightness = 70 + offset

    return f"hsl({hue}, {saturation}%, {lightness}%)"

In [13]:
# Sample paragraph (Icelandic) that the tokenizer comparison cells in this
# notebook all encode, so their token counts are directly comparable.
text = "Leiðarvísir puttaferðalangsins um Vetrarbrautina (enska The Hitchhiker's Guide to the Galaxy skammstafað HHGTTG, eða H2G2) eftir Douglas Adams var upphaflega útvarpsleikrit sem sent var út af BBC í Bretlandi. Í dag hafa verið gefnar út fimm bækur, sjónvarpsþættir, tölvuleikur, hljómplata, tvö leikrit og kvikmynd var frumsýnd í maí 2005. Þó allar þessar útgáfur fjalli um sama söguþráðinn þá er mikill munur á sögunum og eru þær oft í algerri þversögn við aðrar útgáfur. Eina undantekningin á þessu er að upptaka af fyrstu útvarpsseríuna sem gefin var út sem hljómplata var eftir sama handriti og með næstum því sömu leikurum og útvarpsserían."

In [None]:
from IPython.display import HTML, display

# Load the SentencePiece model from tokenicer.model and split the sample
# text into pieces.
sp = spm.SentencePieceProcessor()
sp.load('tokenicer.model')

tokens = sp.encode_as_pieces(text)
token_colors = get_token_colors(tokens)

# Header, then the color-coded pieces.
display(HTML("<p ><b>Model:</b> tokenICEr</p><p><b>Tokens:</b>"))
display_colored_tokens(tokens, token_colors)

# Token count versus character count for the sample text.
num_tokens, num_chars = len(tokens), len(text)
print(f"Number of Tokens: {num_tokens}")
print(f"Number of Characters: {num_chars}")
print("=" * 50)

Number of Tokens: 139
Number of Characters: 399


In [23]:
from IPython.display import HTML, display
import tiktoken

model = "gpt-2"

# GPT-2 tokenizer. Looked up from `model` so the label displayed below and the
# encoding actually used cannot drift apart.
enc = tiktoken.encoding_for_model(model)

# Tokenize the text, then decode each token id on its own to recover the text
# of every individual token.
# NOTE(review): decoding ids one at a time may show replacement characters
# where a multi-byte character is split across tokens — confirm if the display
# looks garbled.
tokens = [enc.decode([token_id]) for token_id in enc.encode(text)]
token_colors = get_token_colors(tokens)

display(HTML(f"<p ><b>Model:</b> {model}</p><p><b>Tokens:</b>"))
display_colored_tokens(tokens, token_colors)

# Calculate and display the number of tokens and characters
num_tokens = len(tokens)
num_chars = len(text)
print(f"Number of Tokens: {num_tokens}")
print(f"Number of Characters: {num_chars}")
print("=" * 50)

# Additional code for raw tokens
# raw_tokens = [tokz.tokenize(text) for tokz in [AutoTokenizer.from_pretrained(model) for model in model_nm]]
# raw_tokens_json = json.dumps(raw_tokens, indent=2)
# print(f"Raw Tokens: {raw_tokens_json}")

Number of Tokens: 309
Number of Characters: 644


In [22]:
from IPython.display import HTML, display
import tiktoken

# Display label only; the tiktoken lookup below uses its own "gpt-4" name.
model = "gpt4"

# GPT-4 tokenizer.
enc = tiktoken.encoding_for_model("gpt-4")

# Tokenize the text, then decode each id individually to get per-token text.
token_ids = enc.encode(text)
tokens = [enc.decode([token_id]) for token_id in token_ids]
token_colors = get_token_colors(tokens)

# Header, then the color-coded tokens.
display(HTML(f"<p ><b>Model:</b> {model}</p><p><b>Tokens:</b>"))
display_colored_tokens(tokens, token_colors)

# Token count versus character count for the sample text.
num_tokens, num_chars = len(tokens), len(text)
print(f"Number of Tokens: {num_tokens}")
print(f"Number of Characters: {num_chars}")
print("=" * 50)

Number of Tokens: 273
Number of Characters: 644


In [21]:
from tokenizers.tools import EncodingVisualizer
from bs4 import BeautifulSoup as bs
from transformers import AutoTokenizer
from IPython.display import HTML, display

model = "jonfd/gpt2-igc-is"
tokenizer = AutoTokenizer.from_pretrained(model)

# Use the public `backend_tokenizer` property instead of the private
# `_tokenizer` attribute to hand the underlying tokenizer to the visualizer.
viz = EncodingVisualizer(tokenizer.backend_tokenizer)
html = viz(text, default_to_notebook=False)

# The visualizer returns HTML; the text of each <span> in it is used as one
# token below.
soup = bs(html, "html.parser")
spans = soup.find_all("span")

# extract text
tokens = [span.text for span in spans]
token_colors = get_token_colors(tokens)

display(HTML(f"<p ><b>Model:</b> {model}</p><p><b>Tokens:</b>"))
display_colored_tokens(tokens, token_colors)

# Calculate and display the number of tokens and characters
num_tokens = len(tokens)
num_chars = len(text)
print(f"Number of Tokens: {num_tokens}")
print(f"Number of Characters: {num_chars}")

print("=" * 50)

Number of Tokens: 152
Number of Characters: 644


In [20]:
from tokenizers.tools import EncodingVisualizer
from bs4 import BeautifulSoup as bs
from transformers import AutoTokenizer
from IPython.display import HTML, display

# Local directory loaded as the tokenizer to compare.
model = "./new_tokenizer"
tokenizer = AutoTokenizer.from_pretrained(model)

# Use the public `backend_tokenizer` property instead of the private
# `_tokenizer` attribute to hand the underlying tokenizer to the visualizer.
viz = EncodingVisualizer(tokenizer.backend_tokenizer)
html = viz(text, default_to_notebook=False)

# The visualizer returns HTML; the text of each <span> in it is used as one
# token below.
soup = bs(html, "html.parser")
spans = soup.find_all('span')

# extract text
tokens = [span.text for span in spans]
token_colors = get_token_colors(tokens)

display(HTML(f"<p ><b>Model:</b> {model}</p><p><b>Tokens:</b>"))
display_colored_tokens(tokens, token_colors)

# Calculate and display the number of tokens and characters
num_tokens = len(tokens)
num_chars = len(text)
print(f"Number of Tokens: {num_tokens}")
print(f"Number of Characters: {num_chars}")

print("=" * 50)

Number of Tokens: 155
Number of Characters: 644
