In [27]:
import sentencepiece as spm
import os

# Training configuration for a small BPE SentencePiece model. The keys are
# passed verbatim as keyword arguments to SentencePieceTrainer.train below.
options = dict(
    # input corpus and output file prefix
    input='processed_data/text.txt',
    model_prefix='tokenicer',
    # algorithm and vocabulary
    model_type='bpe',
    vocab_size=400,
    # normalization: 'identity' together with remove_extra_whitespaces=False
    # NOTE(review): presumably chosen to keep the text untouched - confirm
    normalization_rule_name='identity',
    remove_extra_whitespaces=False,
    # Integer-valued options are written as ints (previously 2*1e6 and 1e5,
    # i.e. floats); the trainer log reports them as 2000000 and 100000.
    input_sentence_size=2_000_000,
    max_sentence_length=4192,
    seed_sentencepiece_size=100_000,
    shuffle_input_sentence=True,
    # rare word treatment
    character_coverage=0.9995,
    byte_fallback=True,
    # merge rules
    split_digits=True,
    split_by_unicode_script=True,
    split_by_whitespace=True,
    split_by_number=True,
    max_sentencepiece_length=16,
    add_dummy_prefix=True,
    allow_whitespace_only_pieces=True,
    # special tokens
    unk_id=0,  # the unk token MUST exist
    bos_id=1,
    eos_id=2,
    pad_id=-1,  # NOTE(review): -1 presumably disables the pad token - confirm
    # system
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to a single thread instead of passing None to the trainer.
    num_threads=os.cpu_count() or 1,
)

spm.SentencePieceTrainer.train(**options)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: processed_data/text.txt
  input_format: 
  model_prefix: tokenicer
  model_type: BPE
  vocab_size: 400
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 2000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 100000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 8
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 1
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy

In [34]:
# Generates a dictionary mapping unique texts/tokens with their corresponding color
def get_token_colors(text, tokenizer):
    """Map every distinct token of ``text`` to a display color.

    Args:
        text: String to tokenize.
        tokenizer: Object exposing ``tokenize(text)`` that returns an iterable
            of hashable tokens.

    Returns:
        dict mapping each distinct token to ``number_to_color(i)``, where ``i``
        is the token's rank by first appearance in the tokenized text.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order. The
    # earlier list(set(...)) ordering depended on string hashing, so the same
    # text could get different colors after a kernel restart.
    unique_tokens = dict.fromkeys(tokenizer.tokenize(text))
    return {token: number_to_color(index) for index, token in enumerate(unique_tokens)}

def get_token_colors_sentencepiece(text, tokenizer):
    """Map every distinct SentencePiece piece of ``text`` to a display color.

    Args:
        text: String to encode.
        tokenizer: Object exposing ``encode_as_pieces(text)`` that returns an
            iterable of hashable pieces.

    Returns:
        dict mapping each distinct piece to ``number_to_color(i)``, where ``i``
        is the piece's rank by first appearance in the encoded text.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order. The
    # earlier list(set(...)) ordering depended on string hashing, so the same
    # text could get different colors after a kernel restart.
    unique_tokens = dict.fromkeys(tokenizer.encode_as_pieces(text))
    return {token: number_to_color(index) for index, token in enumerate(unique_tokens)}


# Displays the tokens with the unique color
def display_colored_tokens(tokens, token_colors):
    """Render tokens as colored HTML spans in the notebook output.

    Uses the notebook-level ``display`` and ``HTML`` names, so those must be
    imported before this function is called.

    Args:
        tokens: Iterable of token strings, in display order.
        token_colors: Mapping from token to a CSS color string; it must contain
            every token in ``tokens``.

    Raises:
        KeyError: If a token has no entry in ``token_colors``.
    """
    from html import escape  # local import keeps the function self-contained

    # Escape the token text so pieces containing '<', '>' or '&' are shown
    # literally instead of being interpreted as markup by the browser.
    colored_text = "".join(
        f'<span style="background-color: {token_colors[token]}; padding: 2px;">{escape(token)}</span> '
        for token in tokens
    )

    display(HTML(colored_text))

# Derives a deterministic HSL color string from a token index
def number_to_color(number):
    """Turn an index into an ``hsl(h, s%, l%)`` CSS color string.

    The index is scrambled with a linear congruential step
    ``(a * number + c) % 2**32``; the hue comes from the fractional part of
    the scrambled value times the golden-ratio conjugate, and saturation and
    lightness are offset by the same ``scrambled % 21`` amount.

    Args:
        number: Index to convert.

    Returns:
        A string of the form ``"hsl(<hue>, <s>%, <l>%)"``.
    """
    multiplier, increment, modulus = 1664525, 1013904223, 2**32
    scrambled = (multiplier * number + increment) % modulus

    # Fractional part of scrambled * golden-ratio conjugate, scaled to degrees.
    hue = ((scrambled * 0.618033988749895) % 1) * 360

    # Saturation and lightness share the same offset.
    offset = scrambled % 21
    saturation = 60 + offset
    lightness = 70 + offset

    return f"hsl({hue}, {saturation}%, {lightness}%)"

In [35]:
# Tokenizer identifiers handed to AutoTokenizer.from_pretrained:
# a local path and a Hugging Face model id.
model_nm = [
    './tokenicer',
    'openai-community/gpt2',
]

# Sample passage used for the tokenization comparison.
text = (
    "Öðru sinni barnaði Loftur vinnukonu á staðnum, og drap hann þá barnsmóður sína með gjörningum. "
    "Henni var ætlað að bera aska inn í eldhús og úr því; voru þeir til flýtis bornir á nokkurs konar "
    "trogmynduðu verkfæri, er hét askafloti og tók marga aska í einu"
)

In [36]:

# importing from huggingface transformers
from transformers import AutoTokenizer
from IPython.display import HTML, display
import json


# For every tokenizer in model_nm: tokenize the sample text, render the
# color-coded tokens, then print token/character counts.
for model in model_nm:
    tokz = AutoTokenizer.from_pretrained(model)
    tokens = tokz.tokenize(text)
    token_colors = get_token_colors(text, tokz)

    # Header naming the model, followed by the colored token spans
    header = f"<p><b>Model:</b> {model}</p><p><b>Tokens:</b>"
    display(HTML(header))
    display_colored_tokens(tokens, token_colors)

    # Counts and a separator line, emitted with a single print call
    num_tokens, num_chars = len(tokens), len(text)
    print(
        f"Number of Tokens: {num_tokens}",
        f"Number of Characters: {num_chars}",
        "=" * 50,
        sep="\n",
    )

# Additional code for raw tokens
#raw_tokens = [tokz.tokenize(text) for tokz in [AutoTokenizer.from_pretrained(model) for model in model_nm]]
#raw_tokens_json = json.dumps(raw_tokens, indent=2)
#print(f"Raw Tokens: {raw_tokens_json}")

Number of Tokens: 283
Number of Characters: 255


Number of Tokens: 135
Number of Characters: 255


In [43]:
from IPython.display import HTML, display
import json

# Load the SentencePiece model file written by the training cell
sp = spm.SentencePieceProcessor()
sp.load('tokenicer.model')

# Shorter sample for this cell (rebinds the notebook-level `text`)
text = "Öðru sinni barnaði Loftur vinnukonu á staðnum"

# Color table and the piece sequence for the sample
token_colors = get_token_colors_sentencepiece(text, sp)
tokens = sp.encode_as_pieces(text)

# Header, followed by the colored token spans
display(HTML("<p><b>Model:</b> tokenICEr</p><p><b>Tokens:</b>"))
display_colored_tokens(tokens, token_colors)

# Counts and a separator line, emitted with a single print call
num_tokens, num_chars = len(tokens), len(text)
print(
    f"Number of Tokens: {num_tokens}",
    f"Number of Characters: {num_chars}",
    "=" * 50,
    sep="\n",
)



Number of Tokens: 25
Number of Characters: 45
