Following along with:
* https://colab.research.google.com/drive/1JMLa53HDuA-i7ZBmqV7ZnA3c_fvtXnx-?usp=sharing
* https://www.youtube.com/watch?v=zduSFxRajkE

Tom Lehrer's songs: https://tomlehrersongs.com/

"The Elements" song (PDF): https://tomlehrersongs.com/wp-content/uploads/2018/12/the-elements.pdf

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell runs, so edits to the local package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Training corpus for the tokenizers fitted later in this notebook: the lyrics
# of Tom Lehrer's "The Elements".
# NOTE: "Ha'vard" and "discavard" in the final couplet are kept as-is; they
# appear to be the song's deliberate rhyme spelling, not typos.
# The stray underscore in "o_f" (a copy/extraction artefact) has been removed
# so that it does not add a spurious "_" byte and bogus pairs to the corpus.
lyrics = """THE ELEMENTS

There's antimony, arsenic, aluminum, selenium,
And hydrogen and oxygen and nitrogen and rhenium,
And nickel, neodymium, neptunium, germanium,
And iron, americium, ruthenium, uranium,

Europium, zirconium, lutetium, vanadium,
And lanthanum and osmium and astatine and radium,
And gold and protactinium and indium and gallium,
And iodine and thorium and thulium and thallium.

There's yttrium, ytterbium, actinium, rubidium,
And boron, gadolinium, niobium, iridium,
And strontium and silicon and silver and samarium,
And bismuth, bromine, lithium, beryllium, and barium.

There's holmium and helium and hafnium and erbium,
And phosphorus and francium and fluorine and terbium,
And manganese and mercury, molybdenum, magnesium,
Dysprosium and scandium and cerium and cesium.

And lead, praseodymium and platinum, plutonium,
Palladium, promethium, potassium, polonium,
And tantalum, technetium, titanium, tellurium,
And cadmium and calcium and chromium and curium.

There's sulfur, californium and fermium, berkelium,
And also mendelevium, einsteinium, nobelium,
And argon, krypton, neon, radon, xenon, zinc and rhodium,
And chlorine, carbon, cobalt, copper, tungsten, tin and sodium.

These are the only ones of which the news has come to Ha'vard,
And there may be many others but they haven't been discavard."""
import abc
import string
import typing as T
from collections import Counter

import regex
import tqdm

import random_neural_net_models.tokenization as rnnm_tok
import random_neural_net_models.utils as utils

# Short sample sentence reused by later cells as the text to tokenize,
# encode and decode.
phrase = "From adolescence to senility, bypassing maturity."

## basic byte pair encoding

In [None]:
# Turn the sample phrase into a sequence of integer ids
# (presumably its UTF-8 byte values -- confirm in rnnm_tok.text_to_ids).
tokens = rnnm_tok.text_to_ids(phrase)
# Display the first five ids, the last five ids, the sequence length and the
# number of distinct ids.
tokens[:5], tokens[-5:], len(tokens), len(set(tokens))

In [None]:
# Collect statistics over `tokens`; a later cell uses the keys as id pairs to
# merge (presumably adjacent-pair counts in a Counter -- confirm in rnnm_tok.get_stats).
stats = rnnm_tok.get_stats(tokens)
# Display the five most frequent entries.
stats.most_common(5)

In [None]:
# Toy example of a single merge: ask for the pair (6, 7) in the ids
# [5, 6, 6, 7, 9, 1] to be replaced by the new id 99 and display the result.
rnnm_tok.merge_token_ids(rnnm_tok.TokenIDs(ids=[5, 6, 6, 7, 9, 1]), (6, 7), 99)

In [None]:
# Base alphabet: ASCII letters (lower- then upper-case) followed by the
# digits 0-9. The next cell converts these to ids so they are part of the
# vocabulary even if the sample phrase does not contain them.
base_symbols = "".join((string.ascii_letters, string.digits))
base_symbols

In [None]:
# Convert the base alphabet to ids with the same helper used for `phrase`.
base_ids = rnnm_tok.text_to_ids(base_symbols)
# Display the first and last five ids.
base_ids[:5], base_ids[-5:]

In [None]:
# Choose a fresh id one above the largest id present in either the phrase or
# the base alphabet, so the new merged token cannot clash with an existing id.
replacement_id = max(tokens + base_ids) + 1
replacement_id

In [None]:
# Take the key of the first entry returned by most_common(), i.e. the most
# frequent entry in `stats` (assuming Counter semantics -- sorted by
# descending count).
pair_to_replace = stats.most_common()[0][0]
pair_to_replace

In [None]:
# Apply one merge step to the phrase ids: the most frequent pair is to be
# replaced by `replacement_id`. The result is displayed for inspection.
tokens2 = rnnm_tok.merge_token_ids(tokens, pair_to_replace, replacement_id)
tokens2

In [None]:
# Compare the largest id after the merge with the largest id before it.
max(tokens2), max(tokens)

In [None]:
# Compare sequence length and number of distinct ids before (tokens) and
# after (tokens2) the single merge.
len(tokens), len(tokens2), len(set(tokens)), len(set(tokens2))

In [None]:
# Target vocabulary size: the distinct ids already in the phrase plus 20 more
# (presumably allowing up to 20 merges -- confirm how repeated_merge
# interprets vocab_size).
vocab_size = len(set(tokens)) + 20
# Run the merge procedure repeatedly; with return_new_ids=True the call
# returns both the learned pair map and the merged id sequence.
pair_map, tokens3 = rnnm_tok.repeated_merge(
    tokens, vocab_size, return_new_ids=True, show_progress=True
)

In [None]:
# Inspect the `map` attribute of the learned pair map.
pair_map.map

In [None]:
# Display the id sequence returned by repeated_merge.
tokens3

In [None]:
# Size of the learned pair map (presumably the number of merges performed).
len(pair_map)

In [None]:
# Compare sequence length and number of distinct ids before (tokens) and
# after (tokens3) the repeated merges.
len(tokens), len(tokens3), len(set(tokens)), len(set(tokens3))

In [None]:
# Display the pair map object itself.
pair_map

In [None]:
# Build the id -> bytes vocabulary used for decoding.
# Step 1: every id seen in the phrase or the base alphabet maps to the single
# byte with that value.
vocab = {}
for single_id in set(tokens + base_ids):
    vocab[single_id] = bytes([single_id])
# Step 2: every merged id maps to the concatenation of its two parts' bytes.
# This relies on pair_map.items() yielding merges in an order where both
# parts are already present in `vocab`.
for pair, merged_id in pair_map.items():
    left_id, right_id = pair
    vocab[merged_id] = vocab[left_id] + vocab[right_id]
vocab

In [None]:
# Show the merged id sequence again, next to the vocabulary below.
tokens3

In [None]:
# Show the id -> bytes vocabulary.
vocab

In [None]:
# Decode the merged ids with the vocabulary; as a round-trip check the result
# should presumably equal `phrase` -- verify against the cell output.
rnnm_tok.decode(tokens3, vocab)

In [None]:
# Show the pair map again before using it to encode new text.
pair_map

In [None]:
# Encode a text that was not the source of the merges, using the pair map
# learned from `phrase`.
test_bpe_token_ids = rnnm_tok.encode("bla bla and bla", pair_map)
test_bpe_token_ids

In [None]:
# Decode the ids from the previous cell with the same vocabulary.
rnnm_tok.decode(test_bpe_token_ids, vocab)

https://github.com/openai/gpt-2

https://github.com/openai/tiktoken

https://github.com/google/sentencepiece

## Tokenizer classes

Basic

In [None]:
# Instantiate the "simple" tokenizer class from the project package.
tokenizer = rnnm_tok.TokenizerSimple()

In [None]:
# Fit the tokenizer on the song lyrics with a vocabulary size of 60.
# `vocab_size` stays at 60 until it is reassigned in the special-tokens
# section, so the regex tokenizer below reuses this value.
vocab_size = 60
tokenizer.fit(lyrics, vocab_size, verbose=True)

In [None]:
# Encode the sample phrase with the fitted simple tokenizer and show the
# first three ids.
simple_token_ids = tokenizer.encode(phrase)
simple_token_ids[:3]

In [None]:
# Decode the ids produced by the simple tokenizer back to text.
tokenizer.decode(simple_token_ids)

In [None]:
# Regex used to split text into chunks. The alternatives, tried left to right:
#   '(?i:[sdmt]|ll|ve|re)        apostrophe contractions ('s, 'd, 'm, 't, 'll, 've, 're), case-insensitive
#   [^\r\n\p{L}\p{N}]?+\p{L}+    a run of letters, optionally preceded by one char that is
#                                not a letter, digit, CR or LF (e.g. a leading space)
#   \p{N}{1,3}                   one to three digits
#    ?[^\s\p{L}\p{N}]++[\r\n]*   optional space, a run of non-space/non-letter/non-digit
#                                characters, then any trailing CR/LF
#   \s*[\r\n]                    whitespace ending in a CR or LF
#   \s+(?!\S)                    whitespace not followed by a non-whitespace character
#   \s+                          any remaining whitespace
# The \p{L} / \p{N} Unicode property classes need the third-party `regex`
# package; the stdlib `re` module does not support them.
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

pattern = regex.compile(GPT4_SPLIT_PATTERN)

# Show how the sample phrase is split into chunks by the pattern.
pattern.findall(phrase)

In [None]:
# Demonstrate that Counter.update() adds counts together instead of
# overwriting them: after folding c0 and c1 into c2, key 1 has a count of
# 3 + 2 = 5 and key 2 has a count of 1.
c0 = Counter([1] * 3)
c1 = Counter((1, 1, 2))
c2 = Counter(c0)
c2.update(c1)
c0, c1, c2

https://www.lesswrong.com/posts/aPeJE8bSo6rAFoLqg/solidgoldmagikarp-plus-prompt-generation

Regex

In [None]:
# Rebind `pattern` (previously the compiled regex from the cell above) to the
# split pattern constant shipped with the project package.
pattern = rnnm_tok.GPT4_SPLIT_PATTERN
tokenizer = rnnm_tok.TokenizerRegex()
# Fit on the lyrics; `vocab_size` is whatever was last assigned (60 when the
# cells are run top to bottom).
tokenizer.fit(lyrics, vocab_size, pattern)

In [None]:
# Show the vocabulary size that was used for the fit above.
vocab_size

In [None]:
# Encode the sample phrase with the fitted regex tokenizer and show the first
# five ids.
regex_token_ids = tokenizer.encode(phrase)
regex_token_ids[:5]

In [None]:
# Decode the ids produced by the regex tokenizer back to text.
tokenizer.decode(regex_token_ids)

Regex + special tokens

In [None]:
# Four sample "documents", one per line, each starting with <|endoftext|>.
# The third line also contains the three fill-in-the-middle markers and the
# fourth ends with <|endofprompt|>, so every registered special token is
# exercised. No leading or trailing whitespace surrounds the text.
special_strings = "\n".join(
    (
        "<|endoftext|>Hello world this is one document",
        "<|endoftext|>And this is another document",
        "<|endoftext|><|fim_prefix|>And this one has<|fim_suffix|> tokens.<|fim_middle|> FIM",
        "<|endoftext|>Last document!!! 👋<|endofprompt|>",
    )
)
print(special_strings)

In [None]:
# Special-token strings and the integer ids they should map to.
# NOTE(review): these ids look like the ones tiktoken assigns in its
# cl100k_base encoding -- confirm against the tiktoken source.
special_token2id_map = {
    "<|endoftext|>": 100257,
    "<|fim_prefix|>": 100258,
    "<|fim_middle|>": 100259,
    "<|fim_suffix|>": 100260,
    "<|endofprompt|>": 100276,
}
# Refit a fresh regex tokenizer on the lyrics with a larger vocabulary (200).
# All special-token ids above are far beyond 200.
vocab_size = 200
tokenizer = rnnm_tok.TokenizerRegex()
tokenizer.fit(
    lyrics, vocab_size=vocab_size, pattern=rnnm_tok.GPT4_SPLIT_PATTERN
)

In [None]:
# Register the special tokens on the fitted tokenizer, then display the
# tokenizer's special_token2id_map attribute to check what was stored.
tokenizer.register_special_tokens(special_token2id_map)
tokenizer.special_token2id_map

In [None]:
# Encode the sample text containing special tokens and display the ids.
encoded_ids = tokenizer.encode(special_strings)
encoded_ids

In [None]:
# Decode the ids back to text and print it for comparison with the original
# input printed in the next cell.
decoded_text = tokenizer.decode(encoded_ids)
print(decoded_text)

In [None]:
# Print the original input again as the reference for the decoded text above.
print(special_strings)