In [8]:
# !pip install tiktoken

In [1]:
import os

# Collect every .txt file found (recursively) under ./processed_data.
# os.walk yields directories and files in a filesystem-dependent order, so the
# result is sorted to keep the list identical from run to run and machine to
# machine.
all_paths = sorted(
    os.path.join(root, file)
    for root, _dirs, files in os.walk("./processed_data")
    for file in files
    if file.endswith(".txt")
)

In [2]:
import sentencepiece as spm
import os

# SentencePiece receives its training corpus as a single comma-separated
# string of file paths. Fail early with a clear message if there is nothing
# to train on instead of handing the trainer an empty input.
if not all_paths:
    raise FileNotFoundError(
        "No .txt files were collected in all_paths; nothing to train the tokenizer on."
    )
data = ",".join(all_paths)

options = dict(
    input=data,
    model_prefix='tokenicer',
    model_type='bpe',
    vocab_size=3000,
    normalization_rule_name='identity',
    remove_extra_whitespaces=False,
    # Count-like options are written as integer literals: floats such as 2*1e6
    # are turned into text like "2000000.0" (or "1e+16" for larger values)
    # before they reach the trainer, which expects integers here.
    input_sentence_size=2_000_000,
    max_sentence_length=4192,
    seed_sentencepiece_size=100_000,
    shuffle_input_sentence=True,
    # rare word treatment
    character_coverage=0.9995,
    byte_fallback=True,
    # merge rules
    split_digits=True,
    split_by_unicode_script=True,
    split_by_whitespace=True,
    split_by_number=True,
    max_sentencepiece_length=16,
    add_dummy_prefix=True,
    allow_whitespace_only_pieces=True,
    # special tokens
    unk_id=0,  # the unk token MUST exist
    bos_id=1,
    eos_id=2,
    pad_id=-1,
    # system
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to a single thread in that case.
    num_threads=os.cpu_count() or 1,  # use all cores
)

spm.SentencePieceTrainer.train(**options)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./processed_data/29.txt
  input: ./processed_data/15.txt
  input: ./processed_data/14.txt
  input: ./processed_data/28.txt
  input: ./processed_data/16.txt
  input: ./processed_data/17.txt
  input: ./processed_data/13.txt
  input: ./processed_data/12.txt
  input: ./processed_data/10.txt
  input: ./processed_data/38.txt
  input: ./processed_data/39.txt
  input: ./processed_data/11.txt
  input: ./processed_data/76.txt
  input: ./processed_data/62.txt
  input: ./processed_data/89.txt
  input: ./processed_data/88.txt
  input: ./processed_data/63.txt
  input: ./processed_data/77.txt
  input: ./processed_data/49.txt
  input: ./processed_data/61.txt
  input: ./processed_data/75.txt
  input: ./processed_data/74.txt
  input: ./processed_data/60.txt
  input: ./processed_data/48.txt
  input: ./processed_data/64.txt
  input: ./processed_data/70.txt
  input: ./processed_data/58.txt
  input: ./processed_data/59.tx

ainer.cc(268) LOG(INFO) Added: freq=10 size=1480 all=15697 active=1155 piece=▁þeirri
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=10 size=1500 all=15707 active=1165 piece=▁landsbyggðinni
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=10 min_freq=5
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=9 size=1520 all=15802 active=1096 piece=öfn
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=9 size=1540 all=15886 active=1180 piece=laus
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=9 size=1560 all=15955 active=1249 piece=▁ári
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=9 size=1580 all=15998 active=1292 piece=▁sann
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=9 size=1600 all=16017 active=1311 piece=▁meira
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=9 min_freq=5
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=9 size=1620 all=16014 active=998 piece=▁dagskrá
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=9 size=1640 all=16028 active=101

In [30]:
# Generates a dictionary mapping each distinct token to its display color
def get_token_colors(tokens):
    """Map every distinct token in ``tokens`` to a CSS color string.

    Tokens are numbered in the order they first appear, and that number is
    passed to ``number_to_color``. Numbering by first appearance (rather than
    by iterating a ``set``) keeps the token -> color assignment identical
    across kernel restarts, because set iteration order for strings changes
    with Python's per-process hash randomisation.

    Args:
        tokens: iterable of hashable tokens (duplicates allowed).

    Returns:
        dict mapping each distinct token to the string returned by
        ``number_to_color`` for its first-appearance index.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_tokens = dict.fromkeys(tokens)
    return {token: number_to_color(index) for index, token in enumerate(unique_tokens)}


# Displays the tokens, each highlighted with its assigned color
def display_colored_tokens(tokens, token_colors):
    """Render ``tokens`` as a row of colored HTML ``<span>`` elements.

    Args:
        tokens: iterable of tokens to show, in display order.
        token_colors: mapping from token to a CSS color string; every token
            in ``tokens`` must be a key.

    Raises:
        KeyError: if a token has no entry in ``token_colors``.
    """
    # Local imports so the function works no matter which notebook cell has
    # (or has not) been executed before it is called.
    import html
    from IPython.display import HTML, display

    # Token text is escaped so that pieces containing '<', '>' or '&' are
    # shown literally instead of being interpreted as markup.
    colored_text = "".join(
        f'<span style="background-color: {token_colors[token]}; padding: 2px; color: black">'
        f'{html.escape(str(token))}</span> '
        for token in tokens
    )

    display(HTML(colored_text))

# Derives an HSL color string from a token's index
def number_to_color(number):
    """Return an ``hsl(h, s%, l%)`` CSS color derived from ``number``.

    The index is scrambled with one linear-congruential step
    (multiplier 1664525, increment 1013904223, modulus 2**32). The hue comes
    from the fractional part of the scrambled value times the golden-ratio
    conjugate; saturation and lightness share one 0..20 offset and land in
    60..80 % and 70..90 % respectively. The same input always produces the
    same string; distinct inputs are not guaranteed distinct colors.
    """
    scrambled = (1664525 * number + 1013904223) % (2**32)
    offset = scrambled % 21

    hue = ((scrambled * 0.618033988749895) % 1) * 360
    saturation = 60 + offset
    lightness = 70 + offset

    return "hsl({}, {}%, {}%)".format(hue, saturation, lightness)

In [4]:
# Sample paragraph (Icelandic) that every tokenizer comparison cell below encodes.
text = "Nói albinói er fyrsta kvikmynd Dags Kára Péturssonar í fullri lengd. Hún fjallar um ungan dreng að nafni Nói, sem býr í afskekktum bæ á Íslandi. Nói á í erfiðleikum í skóla og fær litla virðingu heima hjá sér. Hann kynnist ungri stelpu frá Reykjavík og ákveður að strjúka í burtu með henni. En hún er ekki á sama máli. Kvikmyndin hlaut sex Edduverðlaun 2003 og var send í forval til Óskarsins 2004. "

In [35]:
from IPython.display import HTML, display

# Load the trained SentencePiece model and split the sample text into pieces.
sp = spm.SentencePieceProcessor(model_file='tokenicer.model')
tokens = sp.encode(text, out_type=str)
token_colors = get_token_colors(tokens)

# Header first, then the color-highlighted pieces.
display(HTML("<p ><b>Model:</b> tokenICEr</p><p><b>Tokens:</b>"))
display_colored_tokens(tokens, token_colors)

# Summary: how many tokens the text became versus its length in characters.
num_tokens, num_chars = len(tokens), len(text)
print("Number of Tokens: {}".format(num_tokens))
print("Number of Characters: {}".format(num_chars))
print(50 * "=")

Number of Tokens: 139
Number of Characters: 399


In [53]:
from IPython.display import HTML, display
import tiktoken

model = "gpt-2"

# Look the encoding up from `model` so the label displayed below and the
# tokenizer actually used cannot drift apart.
enc = tiktoken.encoding_for_model(model)  # GPT-2 tokenizer
# Tokenize the text, then decode every token id on its own to get its text.
# NOTE(review): a token holding only part of a multi-byte UTF-8 character
# presumably decodes to a replacement character here — confirm against
# tiktoken's decode behaviour if exact byte pieces matter.
tokens = [enc.decode([token_id]) for token_id in enc.encode(text)]
token_colors = get_token_colors(tokens)

display(HTML(f"<p ><b>Model:</b> {model}</p><p><b>Tokens:</b>"))
display_colored_tokens(tokens, token_colors)

# Calculate and display the number of tokens and characters
num_tokens = len(tokens)
num_chars = len(text)
print(f"Number of Tokens: {num_tokens}")
print(f"Number of Characters: {num_chars}")
print("=" * 50)

Number of Tokens: 203
Number of Characters: 399


In [55]:
from IPython.display import HTML, display
import tiktoken

# Label shown in the rendered header (kept separate from the lookup name below).
model = "gpt4"

# Fetch the GPT-4 encoding, tokenize the text, and decode each token id
# individually so every token can be shown as its own piece of text.
enc = tiktoken.encoding_for_model("gpt-4")
tokens = [enc.decode([token_id]) for token_id in enc.encode(text)]
token_colors = get_token_colors(tokens)

# Header first, then the color-highlighted tokens.
display(HTML("<p ><b>Model:</b> {}</p><p><b>Tokens:</b>".format(model)))
display_colored_tokens(tokens, token_colors)

# Summary: token count versus character count of the sample text.
num_tokens, num_chars = len(tokens), len(text)
print("Number of Tokens: {}".format(num_tokens))
print("Number of Characters: {}".format(num_chars))
print(50 * "=")

Number of Tokens: 182
Number of Characters: 399
