### Tokenization tutorial
(Reference : https://www.youtube.com/watch?v=zduSFxRajkE)

In [94]:
# ord() converts character to unicode code points
# Unicode is updated frequently, about 140000 characters supported
# Cannot use this nativaly since too many vocabs & and update frequently, thus not stable
str = "한국어 좋아해요 (⡳⡵⡷) ／人◕‿‿◕人＼"
print([ord(x) for x in str])

[54620, 44397, 50612, 32, 51339, 50500, 54644, 50836, 32, 40, 10355, 10357, 10359, 41, 32, 65295, 20154, 9685, 8255, 8255, 9685, 20154, 65340]


In [95]:
# Unicode can encoded using into binary data
# Types: utf-8 (mostly used), utf-16, utf-32
print("String: ", str)
print("String length: ", len(str))
tokens = list(str.encode("utf-8"))
print("Encoded: ", tokens)
print("Encoded length: ", len(tokens))


String:  한국어 좋아해요 (⡳⡵⡷) ／人◕‿‿◕人＼
String length:  23
Encoded:  [237, 149, 156, 234, 181, 173, 236, 150, 180, 32, 236, 162, 139, 236, 149, 132, 237, 149, 180, 236, 154, 148, 32, 40, 226, 161, 179, 226, 161, 181, 226, 161, 183, 41, 32, 239, 188, 143, 228, 186, 186, 226, 151, 149, 226, 128, 191, 226, 128, 191, 226, 151, 149, 228, 186, 186, 239, 188, 188]
Encoded length:  59


In [96]:
# Other utf types are not efficient in many cases
# Here utf-32 includes several "0" token, thus longer token length compare to utf-8 encoding
print("Encoded (utf-32): ", list(str.encode("utf-32")))
print("Encoded length (utf-32): ", len(list(str.encode("utf-32"))))

Encoded (utf-32):  [255, 254, 0, 0, 92, 213, 0, 0, 109, 173, 0, 0, 180, 197, 0, 0, 32, 0, 0, 0, 139, 200, 0, 0, 68, 197, 0, 0, 116, 213, 0, 0, 148, 198, 0, 0, 32, 0, 0, 0, 40, 0, 0, 0, 115, 40, 0, 0, 117, 40, 0, 0, 119, 40, 0, 0, 41, 0, 0, 0, 32, 0, 0, 0, 15, 255, 0, 0, 186, 78, 0, 0, 213, 37, 0, 0, 63, 32, 0, 0, 63, 32, 0, 0, 213, 37, 0, 0, 186, 78, 0, 0, 60, 255, 0, 0]
Encoded length (utf-32):  96


#### Byte Pair encoding

In [97]:
# Byte pair encoding replaces frequent byte pair into a single byte
# e.g. xyabxyxy => zabzz where z = xy
# This reduces the length of token sequences
# Transformers with fixed context length, it can effectively attend larger context (as multiple tokens' context are squeezed into single token)
str = """
페이커는 게임 내부적으로 현대 미드라이너의 개념을 정립한 선수로 평가된다. 페이커의 등장은 고전적 1세대 미드라이너 시대의 종언을 알렸다. 시즌 2까지 미드 라이너는 무리하지 않고 파밍을 하면서 성장하고 주로 필요한 순간에만 합류를 했는데, 2013 시즌부터 등장한 페이커는 상대 라이너를 끊임없이 압박하면서 이득을 취하는 플레이를 시작했다.
유명 아마추어 게이머인 도파는 "미드 라이너로서 최고의 플레이는 상대방을 극단적으로 압박해 적 정글을 불러들일 수밖에 없는 상황을 만들고 2:1 상황을 만든 후에 죽지 않는 것이다. 그리고 이 2:1 드래블링을 최초로 시작한 것이 페이커였고 이를 최적화시켜 완성한 사람이 루키였다."라고 평가했다.[101]
즉 상대하는 입장에서 적 정글을 불러들일 수밖에 없는 상황을 만든 뒤에 적 정글이 갱이나 견제를 해오면 본인은 어그로를 끌면서 팀적으로 상대 정글의 위치를 이용해 다른 라인에 힘을 실어주기 혹은 역갱을 성공시켜 그에 따른 스노우볼을 굴리거나 최적의 상황으로는 1대2 역관광을 해 게임을 터뜨리는 것이다. 반대로 본인에게 견제가 오지 않는다면, 라인 주도권을 이용해 한발 빠른 합류와 타 라인에 대한 견제를 넣거나 아예 미드 라인에서 솔킬을 내버리고 폭파시켜버리는 등, 다시 말해 적 미드 정글을 자기 마음대로 컨트롤하여 게임을 터뜨리는, 소위 스포츠에서 통칭되는 '크랙 플레이'의 정점에 가까웠던 선수였다.
실제로 이 2대1 드리블링이 가능한 지 여부는 이후 미드, 탑 선수 중 S급과 A급을 가르는 중요한 지표중 하나가 되었다. 페이커가 최초로 시작하고 루키가 최적화시킨 후 2015 시즌부터 본격적으로 마린, 스맵, 큐베, 칸, 더샤이, 비디디, 쵸비 등 속칭 "S급 선수"들이 대거 등장하기 시작했다. [102]
실제로 이러한 플레이의 과정에서 나오는 페이커 특유의 외줄타기, 솔로 킬, 넓은 챔프폭, 슈퍼 플레이는 사람들을 열광하게 해 경기 외적으로 막 인기를 얻기 시작하던 리그 오브 레전드와 롤챔스의 유명세를 더욱 증폭시키는 기폭제 역할을 하기도 했고 르블랑, 아리, 제드 등의 암살자 챔피언의 인기가 급상승하기도 했다.
이 패러다임은 2018 시즌까지 미드라이너의 유일한 크랙 플레이로 남아있었다. 하지만 2019 시즌 탁월한 운영과 로밍으로 승부를 보는 도인비와 캡스의 등장과 2020 시즌 육각형 플레이로 LCK 암흑기를 끝낸 쇼메이커 이후로 더 이상 상대 미드를 극한으로 압박하지 않고 맵을 넓게 쓰는 방식으로도 캐리할 수 있다는 해법이 나오면서 전체적인 미드 플레이 스타일은 크게 이 둘로 나뉘게 되었다. 그리고 미드 라이너들의 상향평준화와 2022 시즌 내구도 패치로 인해 더 이상 프로 레벨에서 상대방을 압도하기가 힘들어지자 도인비와 캡스, 쇼메이커 스타일이 유행하기 시작했다.[103][104]
그 외에도 단순히 내 눈앞에 서 있는 상대방을 도륙하는 것만이 아니라 와드를 박는 위치나 타이밍, 이를 이용한 상대방 미드와 정글 위치 찾기 등의 지능적 플레이 역시 솔로랭크 리플레이와 관전 등으로 알려진 것만 여럿 된다.[105][106] 종합적으로 페이커는 게임 내외에 엄청난 영향을 끼친 선수로, 단기적으로나마 페이커 급의 영향력을 끼친 선수는 서포터를 넘어 아예 롤판의 기본 운영을 정립한 마타, LPL 탑 라인에 급변을 불러와 슈퍼스타에 등극한 더샤이, 커리어는 앞선 선수들에 비해 부족하지만 팀의 1옵션으로 포지션의 인식을 뒤바꾼 롤판의 임요환 매드라이프정도 뿐이다. 즉, 리그와 게임을 바꾼 개인.
"""
tokens = list(str.encode("utf-8"))
tokens = list(map(int, tokens)) # for convinience

In [98]:
print("Length: ", len(tokens))
tokens[:10]

Length:  4112


[10, 237, 142, 152, 236, 157, 180, 236, 187, 164]

In [99]:
# Find byte (or int here) pair statistics
def byte_pair(tokens):
    pairs = {}
    for pair in zip(tokens, tokens[1:]):
        pairs[pair] = pairs.get(pair, 0) + 1
    return pairs

pairs = byte_pair(tokens)
pairs

{(10, 237): 1,
 (237, 142): 8,
 (142, 152): 8,
 (152, 236): 23,
 (236, 157): 141,
 (157, 180): 60,
 (180, 236): 29,
 (236, 187): 12,
 (187, 164): 11,
 (164, 235): 16,
 (235, 138): 33,
 (138, 148): 31,
 (148, 32): 30,
 (32, 234): 35,
 (234, 178): 21,
 (178, 140): 11,
 (140, 236): 14,
 (236, 158): 27,
 (158, 132): 8,
 (132, 32): 46,
 (32, 235): 112,
 (235, 130): 17,
 (130, 180): 6,
 (180, 235): 31,
 (235, 182): 9,
 (182, 128): 6,
 (128, 236): 18,
 (236, 160): 40,
 (160, 129): 17,
 (129, 236): 14,
 (236, 156): 23,
 (156, 188): 13,
 (188, 235): 17,
 (235, 161): 37,
 (161, 156): 33,
 (156, 32): 66,
 (32, 237): 60,
 (237, 152): 3,
 (152, 132): 1,
 (132, 235): 11,
 (235, 140): 17,
 (140, 128): 19,
 (128, 32): 35,
 (235, 175): 12,
 (175, 184): 12,
 (184, 235): 22,
 (235, 147): 39,
 (147, 156): 18,
 (156, 235): 25,
 (235, 157): 15,
 (157, 188): 20,
 (188, 236): 21,
 (235, 132): 11,
 (132, 136): 7,
 (136, 236): 17,
 (157, 152): 21,
 (152, 32): 31,
 (234, 176): 17,
 (176, 156): 3,
 (235, 133): 2,

In [100]:
# sort by the most frequent byte pair occurances
sorted(((v, k) for k,v in pairs.items()), reverse=True)[:10]

[(176, (32, 236)),
 (141, (236, 157)),
 (112, (32, 235)),
 (66, (156, 32)),
 (60, (157, 180)),
 (60, (32, 237)),
 (52, (237, 149)),
 (46, (132, 32)),
 (40, (236, 160)),
 (39, (235, 147))]

In [101]:
# merge byte pairs into new token
def merge_tokens(ids, pair, idx):
    # in the list of ints (ids), replace all consecutive occurrences of pair with the new token idx
    newids = []
    i = 0
    while i < len(ids): # iterate through all tokens
        # if we are not at the very last position AND the pair matches, replace it
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids

def merge_byte_pairs(ids, vocab_size):
    # Each merge iteration will increase total voacb size by 1
    vocab_idx = 256
    merge_iter = vocab_size - vocab_idx
    merges = {}  # (int, int) -> int
    for _ in range(merge_iter):
        stats = byte_pair(ids)
        pair = max(stats, key=stats.get)
        print(f"merging {pair} into a new token {vocab_idx}")
        ids = merge_tokens(ids, pair, vocab_idx)
        merges[pair] = vocab_idx
        vocab_idx += 1
        
    return ids, merges

# This method prevents calling byte_pair every iterations
def merge_byte_pairs_fast(ids, vocab_size):
    # Each merge iteration will increase total voacb size by 1
    vocab_idx = 256
    merge_iter = vocab_size - vocab_idx
    merges = {}  # (int, int) -> int
    stats = byte_pair(ids)
    pairs = sorted(stats.items(), key=lambda x: x[1], reverse=True)[:merge_iter]

    for pair, _ in pairs:
        print(f"merging {pair} into a new token {vocab_idx}")
        ids = merge_tokens(ids, pair, vocab_idx)
        merges[pair] = vocab_idx
        vocab_idx += 1
            
    return ids, merges

# Final vocab size is 267
vocab_size = 267

# Call the new function to perform the merging
ids, merges = merge_byte_pairs(tokens, vocab_size)
print("---------------------------")

# Call the fast version of merging by calling pair stats only once
fids, fmerges = merge_byte_pairs_fast(tokens, vocab_size)

merging (32, 236) into a new token 256
merging (236, 157) into a new token 257
merging (32, 235) into a new token 258
merging (32, 237) into a new token 259
merging (257, 180) into a new token 260
merging (237, 149) into a new token 261
merging (32, 234) into a new token 262
merging (235, 138) into a new token 263
merging (235, 161) into a new token 264
merging (156, 256) into a new token 265
merging (257, 132) into a new token 266
---------------------------
merging (32, 236) into a new token 256
merging (236, 157) into a new token 257
merging (32, 235) into a new token 258
merging (156, 32) into a new token 259
merging (157, 180) into a new token 260
merging (32, 237) into a new token 261
merging (237, 149) into a new token 262
merging (132, 32) into a new token 263
merging (236, 160) into a new token 264
merging (235, 147) into a new token 265
merging (235, 161) into a new token 266


In [102]:
print("tokens length:", len(tokens))
print("BPE encoded tokens length:", len(ids))
print(f"compression ratio: {len(tokens) / len(ids):.2f}X")

tokens length: 4112
BPE encoded tokens length: 3389
compression ratio: 1.21X


In [103]:
print("tokens length:", len(tokens))
print("Fast BPE encoded tokens length:", len(fids))
print(f"Fast compression ratio: {len(tokens) / len(fids):.2f}X")

tokens length: 4112
Fast BPE encoded tokens length: 3492
Fast compression ratio: 1.18X


In [104]:
# Note: tokenization step is independent to the pretraining LLM model
# raw text (unicode code point) ---- (tokenizer) ---> token sequence -> LLM

In [105]:
# decoding: given token sequence, convert into raw text

vocab = {i: bytes([i]) for i in range(256)} # original bytes
# merged tokens
for (i1, i2), tokenidx in merges.items(): 
    vocab[tokenidx] = vocab[i1] + vocab[i2]
list(vocab.items())[-5:]

[(262, b' \xea'),
 (263, b'\xeb\x8a'),
 (264, b'\xeb\xa1'),
 (265, b'\x9c \xec'),
 (266, b'\xec\x9d\x84')]

In [106]:
def decode(encodedtokens):
    tokens = b"".join(vocab[i] for i in encodedtokens) # 1) token -> unicode byte
    # default errors is "strict", replace prints invalid text instead without raising the error
    # not all bytes are utf-8 decodable because of utf encoding rules
    return tokens.decode("utf-8", errors="replace") # 2) unicode byte -> human readable text

# � is not decodable token
decode(ids[:20])

'\n페이커는 게임 내�'

In [107]:
# encoding: given raw text, convert into token sequences

def encode(text, merges):
  # given a string, return list of integers (the tokens)
  tokens = list(text.encode("utf-8"))

  # tokens less than length 2 cannot be merged
  while len(tokens) >= 2: 
    # find byte pairs to be merged dict((int, int): int)
    pairs = byte_pair(tokens)
    # Find the pair with the minimum value (=token) in the merges dictionary
    # Recall merges = ((token1, token2): newtoken above value 255)
    # Because early merges should be conducted first
    pair = min(pairs, key=lambda p: merges.get(p, float("inf")))
    if pair not in merges:
      break # nothing else can be merged
    idx = merges[pair]
    tokens = merge_tokens(tokens, pair, idx)
  return tokens

print(encode("페이커 기습숭배", merges))

[237, 142, 152, 260, 236, 187, 164, 262, 184, 176, 236, 138, 181, 236, 136, 173, 235, 176, 176]


In [108]:
# encode and decode are inverse process
decode(encode("자장면 짬뽕 탕수육 깐풍기"))

TypeError: encode() missing 1 required positional argument: 'merges'

#### Regex patterns used in GPT

In [44]:
import regex as re

# from gpt-2/src/encoder.py Encoder.pat
gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
print(re.findall(gpt2pat, "You're learning NLP, right? I've got 10 tips for you!   "))

['You', "'re", ' learning', ' NLP', ',', ' right', '?', ' I', "'ve", ' got', ' 10', ' tips', ' for', ' you', '!', '   ']


###### **1. Matches Contractions (`'s|'t|'re|'ve|'m|'ll|'d`)**
`'re` → Matches `"You're"`.
`'ve` → Matches `"I've"`.

###### **2. Matches Words (` ?\p{L}+`)**
`"You"`, `"learning"`, `"NLP"`, `"right"`, `"I"`, `"got"`, `"tips"`, `"for"`, `"you"`.

######  **3. Matches Numbers (` ?\p{N}+`)**
`"10"`.

###### **4. Matches Punctuation (` ?[^\s\p{L}\p{N}]+`)**
`","`, `"?"`, `"!"`.

###### **5. Matches Trailing Whitespace (`\s+(?!\S)`)**
`"   "` (at the end).

## tiktoken

In [48]:
# tiktoken library fast BPE tokeniser for openai models
%pip install -q tiktoken

import tiktoken

Note: you may need to restart the kernel to use updated packages.


In [56]:
# Pattens are in https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py

# GPT-2 (does not merge spaces)
enc = tiktoken.get_encoding("gpt2")
print(enc.encode("    This is a simple test to compare tokenization strategies between different GPT types"))

# GPT-4 (merges spaces)
enc = tiktoken.get_encoding("cl100k_base")
print(enc.encode("    This is a simple test to compare tokenization strategies between different GPT types"))

# GPT-4o (merges spaces, efficient token size)
enc = tiktoken.get_encoding("o200k_base")
print(enc.encode("    This is a simple test to compare tokenization strategies between different GPT types"))

[220, 220, 220, 770, 318, 257, 2829, 1332, 284, 8996, 11241, 1634, 10064, 1022, 1180, 402, 11571, 3858]
[262, 1115, 374, 264, 4382, 1296, 311, 9616, 4037, 2065, 15174, 1990, 2204, 480, 2898, 4595]
[271, 1328, 382, 261, 4705, 1746, 316, 12221, 6602, 2860, 15142, 2870, 2647, 174803, 6009]


In [None]:
# from gpt2 get_encoder func https://github.com/openai/gpt-2/blob/master/src/encoder.py
# vocab.bpe: merges ((token1, token2) -> merged token id)
# encoder.json: vocab (token integer -> byte)

!wget https://openaipublic.blob.core.windows.net/gpt-2/models/1558M/vocab.bpe
!wget https://openaipublic.blob.core.windows.net/gpt-2/models/1558M/encoder.json

--2024-12-19 14:53:44--  https://openaipublic.blob.core.windows.net/gpt-2/models/1558M/vocab.bpe
Resolving openaipublic.blob.core.windows.net (openaipublic.blob.core.windows.net)... 57.150.97.129
Connecting to openaipublic.blob.core.windows.net (openaipublic.blob.core.windows.net)|57.150.97.129|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 456318 (446K) [application/octet-stream]
Saving to: ‘vocab.bpe’


2024-12-19 14:53:45 (511 KB/s) - ‘vocab.bpe’ saved [456318/456318]

--2024-12-19 14:53:45--  https://openaipublic.blob.core.windows.net/gpt-2/models/1558M/encoder.json
Resolving openaipublic.blob.core.windows.net (openaipublic.blob.core.windows.net)... 57.150.97.129
Connecting to openaipublic.blob.core.windows.net (openaipublic.blob.core.windows.net)|57.150.97.129|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1042301 (1018K) [application/json]
Saving to: ‘encoder.json’


2024-12-19 14:53:48 (744 KB/s) - ‘encoder.json’ saved [10423

In [62]:
import json

with open('encoder.json', 'r') as f:
    encoder = json.load(f) 

with open('vocab.bpe', 'r', encoding="utf-8") as f:
    bpe_data = f.read()
bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]

In [80]:
print(f"Final 5 merges: {[(k, v) for k, v in encoder.items()][-5:]}")
print(f"Final 5 vocabs: {bpe_merges[-5:]}")

Final 5 merges: [('Ġregress', 50252), ('ĠCollider', 50253), ('Ġinformants', 50254), ('Ġgazed', 50255), ('<|endoftext|>', 50256)]
Final 5 vocabs: [('om', 'inated'), ('Ġreg', 'ress'), ('ĠColl', 'ider'), ('Ġinform', 'ants'), ('Ġg', 'azed')]


In [81]:
print(f"length of encoder: {len(encoder)}") # 256 raw byte tokens + 50000 merges + 1 special token
print(f"lenght of vocab: {len(bpe_merges)}") 

length of encoder: 50257
lenght of vocab: 50000


In [83]:
# special token
# token used to signal end of document
encoder["<|endoftext|>"]

50256

In [None]:
# Note: gpt-4o have 2 special tokens
# can be found at https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py#L100C5-L100C62
# special_tokens = {ENDOFTEXT: 199999, ENDOFPROMPT: 200018}

In [84]:
class BasicTokenizer:
    
    def __init__(self):
        self.merges = {}  # (int, int) -> int
        self.utf_size = 256 # utf encoding size (before merging)
    
    def train(self, text, vocab_size, verbose=False):
        tokens = list(text.encode("utf-8"))
        merge_iter = vocab_size - self.utf_size
        for i in range(merge_iter):
            stats = byte_pair(tokens)
            pair = max(stats, key=stats.get)
            if verbose:
                print(f"merging {pair} into a new token {self.utf_size + i}")
            tokens = merge_tokens(tokens, pair, self.utf_size + i)
            self.merges[pair] = self.utf_size + i

    def encode(self, text):
        tokens = list(text.encode("utf-8"))

        while len(tokens) >= 2: 
            pairs = byte_pair(tokens)
            pair = min(pairs, key=lambda p: merges.get(p, float("inf")))
            if pair not in self.merges:
                break
            idx = self.merges[pair]
            tokens = merge_tokens(tokens, pair, idx)
        return tokens
    
    def decode(self, ids):
        vocab = {i: bytes([i]) for i in range(256)} 
        for (i1, i2), tokenidx in self.merges.items(): 
            vocab[tokenidx] = vocab[i1] + vocab[i2]
        
        tokens = b"".join(vocab[i] for i in ids)  
        return tokens.decode("utf-8", errors="replace") 

In [85]:
basictokenizer = BasicTokenizer()

with open("taylor.txt", "r") as file:
    txt = file.read()

In [86]:
basictokenizer.train(txt, 285)

In [88]:
enc = basictokenizer.encode("Faker is the best 숭배할 시간이에요")
print(enc)

dec = basictokenizer.decode(enc)
print(dec)

[70, 97, 107, 101, 114, 32, 105, 115, 32, 116, 104, 101, 32, 98, 101, 115, 116, 32, 236, 136, 173, 235, 176, 176, 237, 149, 160, 32, 236, 139, 156, 234, 176, 132, 236, 157, 180, 236, 151, 144, 236, 154, 148]
Faker is the best 숭배할 시간이에요


## sentencepiece

In [90]:
%pip install -q sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [91]:
# sentencepiece is used in Llama and Mistral
# they run bpe on unicode code points directly
import sentencepiece as spm

In [111]:
with open("toy.txt", "w", encoding="utf-8") as f:
    f.write("In 2024, the world saw dramatic changes—politics, tech, and culture shifted. Social media platforms like X (formerly Twitter) & Meta thrived, while others faltered. Passwords like Pa$$w0rd! became outdated; biometrics took over. Emojis 🎉 & hashtags #Change2024 dominated conversations. At the heart of innovation, AI models (e.g., ChatGPT) continued their evolution. Companies discussed ethics: Should AIs say “yes” to every 🛠️? Or “no”? Meanwhile, everyday folks exclaimed, \"OMG! What a time to be alive! 😅.\" Data (1TB, 2TB) flowed rapidly, and files like report_v2_final.pdf defined workflows. In a fast-paced era, the question was simple: Where do we go next? 🌍")

In [112]:
import os

# due to long history, sentencepiece has a lot of configurations
options = dict(
  # input spec
  input="toy.txt",
  input_format="text",
  # output spec
  model_prefix="tok400", # output filename prefix
  # algorithm spec
  # BPE alg
  model_type="bpe",
  vocab_size=400,
  # normalization (popular config before LLM era, prefer not to touch in LLM era)
  normalization_rule_name="identity", # ew, turn off normalization
  remove_extra_whitespaces=False,
  input_sentence_size=200000000, # max number of training sentences
  max_sentence_length=4192, # max number of bytes per sentence
  seed_sentencepiece_size=1000000,
  shuffle_input_sentence=True,
  # rare word treatment (important configs)
  character_coverage=0.99995,
  byte_fallback=True,
  # merge rules (simiilar to regex rules in tiktoken)
  split_digits=True,
  split_by_unicode_script=True,
  split_by_whitespace=True,
  split_by_number=True,
  max_sentencepiece_length=16,
  add_dummy_prefix=True,
  allow_whitespace_only_pieces=True,
  # special tokens
  unk_id=0, # the UNK token MUST exist 
  bos_id=1, # the others are optional, set to -1 to turn off
  eos_id=2, # end of sentence
  pad_id=-1, # -1, thus not use pad id
  # systems
  num_threads=os.cpu_count(), # use ~all system resources
)

spm.SentencePieceTrainer.train(**options)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: toy.txt
  input_format: text
  model_prefix: tok400
  model_type: BPE
  vocab_size: 400
  self_test_sample_size: 0
  character_coverage: 0.99995
  input_sentence_size: 200000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 8
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 1
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  diffe

In [113]:
sp = spm.SentencePieceProcessor()
sp.load('tok400.model')
vocab = [[sp.id_to_piece(idx), idx] for idx in range(sp.get_piece_size())]
vocab # order: special token - byte tokens - merged tokens - raw codepoints tokens (in toy.txt)

[['<unk>', 0],
 ['<s>', 1],
 ['</s>', 2],
 ['<0x00>', 3],
 ['<0x01>', 4],
 ['<0x02>', 5],
 ['<0x03>', 6],
 ['<0x04>', 7],
 ['<0x05>', 8],
 ['<0x06>', 9],
 ['<0x07>', 10],
 ['<0x08>', 11],
 ['<0x09>', 12],
 ['<0x0A>', 13],
 ['<0x0B>', 14],
 ['<0x0C>', 15],
 ['<0x0D>', 16],
 ['<0x0E>', 17],
 ['<0x0F>', 18],
 ['<0x10>', 19],
 ['<0x11>', 20],
 ['<0x12>', 21],
 ['<0x13>', 22],
 ['<0x14>', 23],
 ['<0x15>', 24],
 ['<0x16>', 25],
 ['<0x17>', 26],
 ['<0x18>', 27],
 ['<0x19>', 28],
 ['<0x1A>', 29],
 ['<0x1B>', 30],
 ['<0x1C>', 31],
 ['<0x1D>', 32],
 ['<0x1E>', 33],
 ['<0x1F>', 34],
 ['<0x20>', 35],
 ['<0x21>', 36],
 ['<0x22>', 37],
 ['<0x23>', 38],
 ['<0x24>', 39],
 ['<0x25>', 40],
 ['<0x26>', 41],
 ['<0x27>', 42],
 ['<0x28>', 43],
 ['<0x29>', 44],
 ['<0x2A>', 45],
 ['<0x2B>', 46],
 ['<0x2C>', 47],
 ['<0x2D>', 48],
 ['<0x2E>', 49],
 ['<0x2F>', 50],
 ['<0x30>', 51],
 ['<0x31>', 52],
 ['<0x32>', 53],
 ['<0x33>', 54],
 ['<0x34>', 55],
 ['<0x35>', 56],
 ['<0x36>', 57],
 ['<0x37>', 58],
 ['<0x38>', 5

In [None]:
# indivisual codepoint tokens should have low frequency (= rare)
# character_coverage configs will let them not to be added to vocab

In [117]:
ids = sp.encode("hello 페이커 기습숭배 시간입니다")
print(ids)

[334, 267, 343, 305, 334, 240, 145, 155, 239, 160, 183, 239, 190, 167, 334, 237, 187, 179, 239, 141, 184, 239, 139, 176, 238, 179, 179, 334, 239, 142, 159, 237, 179, 135, 239, 161, 136, 238, 142, 139, 238, 142, 167]


In [118]:
# Since korean characters are not trained (not included in toy.txt)
# As byte_fallback=True, it encodes to utf-8 <0x..> tokens
# If byte_fallback=false, it maps to <unk> token 0

# Space switches spaces in to '_' (including first space - config add_dummy_prefix=True)
# Let world in "hello world" and "world" have same token ("world" becomes " world")
print([sp.id_to_piece(idx) for idx in ids])

['▁', 'he', 'l', 'lo', '▁', '<0xED>', '<0x8E>', '<0x98>', '<0xEC>', '<0x9D>', '<0xB4>', '<0xEC>', '<0xBB>', '<0xA4>', '▁', '<0xEA>', '<0xB8>', '<0xB0>', '<0xEC>', '<0x8A>', '<0xB5>', '<0xEC>', '<0x88>', '<0xAD>', '<0xEB>', '<0xB0>', '<0xB0>', '▁', '<0xEC>', '<0x8B>', '<0x9C>', '<0xEA>', '<0xB0>', '<0x84>', '<0xEC>', '<0x9E>', '<0x85>', '<0xEB>', '<0x8B>', '<0x88>', '<0xEB>', '<0x8B>', '<0xA4>']


## Finding appropriate vocab_size

In [None]:
# Increasing vocab_size will increase computations
# Because the final layers of LLM modles are logits/probs outputing layer nn.Linear(embed_size, vocab_size)
# Large vocab_size may under train Embedding vector of some vocabs in nn.Embedding(vocab_size, embed_size)
# When adding new tokens from existing model, initialize new vocabs' embedding vecotrs randomly

## Tokenization of other modalities

In [None]:
# The architecture stays the same!
# e.g. image - VQGAN, video - sora