# Tokenizer V1

## Imports

## Training our own tokenizer

### Step-1: Starting with a Training Corpus

Here we create a dummy corpus of four documents, represented as a Python list of strings.

In [32]:
# Toy training corpus: four short documents, one string per list entry.
corpus = [
  "The model’s performance improved slightly after fine-tuning on domain-specific data.",
  "A lightweight transformer can often outperform larger models when properly optimized.",
  "Experimental results show that scaling depth contributes more to coherence than width.",
  "Future work could explore hybrid architectures combining attention with retrieval mechanisms.",
]

# Echo the corpus, one document per line, under a header.
print("Training Corpus: ")
print(*corpus, sep="\n")


Training Corpus: 
The model’s performance improved slightly after fine-tuning on domain-specific data.
A lightweight transformer can often outperform larger models when properly optimized.
Experimental results show that scaling depth contributes more to coherence than width.
Future work could explore hybrid architectures combining attention with retrieval mechanisms.


In [33]:
# Collect every distinct character that occurs anywhere in the corpus.
unique_characters = {char for doc in corpus for char in doc}

# The base vocabulary is those characters in sorted (code point) order.
vocabulary = sorted(unique_characters)

vocabulary

[' ',
 '-',
 '.',
 'A',
 'E',
 'F',
 'T',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '’']

In [34]:
# Boundary markers that are placed around each word when the word splits are built.
end_of_word_token = "<|eos|>"
start_of_word_token = "<|sos|>"

# Register each special token only once: an unconditional append would add
# duplicates (and inflate len(vocabulary)) every time this cell is re-run.
for special_token in (end_of_word_token, start_of_word_token):
  if special_token not in vocabulary:
    vocabulary.append(special_token)

vocabulary

[' ',
 '-',
 '.',
 'A',
 'E',
 'F',
 'T',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '’',
 '<|eos|>',
 '<|sos|>']

In [35]:
# Size of the base vocabulary: the unique corpus characters plus the two special tokens.
len(vocabulary)

34

In [36]:
# Map each word, as a tuple of symbols wrapped in the start/end markers,
# to the number of times that word occurs in the corpus.
word_splits = {}
for doc in corpus:
  for word in doc.split(' '):
    # Consecutive spaces produce empty strings; skip them.
    if not word:
      continue
    word_tuple = (start_of_word_token, *word, end_of_word_token)
    word_splits[word_tuple] = word_splits.get(word_tuple, 0) + 1

word_splits

{('<|sos|>', 'T', 'h', 'e', '<|eos|>'): 1,
 ('<|sos|>', 'm', 'o', 'd', 'e', 'l', '’', 's', '<|eos|>'): 1,
 ('<|sos|>',
  'p',
  'e',
  'r',
  'f',
  'o',
  'r',
  'm',
  'a',
  'n',
  'c',
  'e',
  '<|eos|>'): 1,
 ('<|sos|>', 'i', 'm', 'p', 'r', 'o', 'v', 'e', 'd', '<|eos|>'): 1,
 ('<|sos|>', 's', 'l', 'i', 'g', 'h', 't', 'l', 'y', '<|eos|>'): 1,
 ('<|sos|>', 'a', 'f', 't', 'e', 'r', '<|eos|>'): 1,
 ('<|sos|>',
  'f',
  'i',
  'n',
  'e',
  '-',
  't',
  'u',
  'n',
  'i',
  'n',
  'g',
  '<|eos|>'): 1,
 ('<|sos|>', 'o', 'n', '<|eos|>'): 1,
 ('<|sos|>',
  'd',
  'o',
  'm',
  'a',
  'i',
  'n',
  '-',
  's',
  'p',
  'e',
  'c',
  'i',
  'f',
  'i',
  'c',
  '<|eos|>'): 1,
 ('<|sos|>', 'd', 'a', 't', 'a', '.', '<|eos|>'): 1,
 ('<|sos|>', 'A', '<|eos|>'): 1,
 ('<|sos|>',
  'l',
  'i',
  'g',
  'h',
  't',
  'w',
  'e',
  'i',
  'g',
  'h',
  't',
  '<|eos|>'): 1,
 ('<|sos|>',
  't',
  'r',
  'a',
  'n',
  's',
  'f',
  'o',
  'r',
  'm',
  'e',
  'r',
  '<|eos|>'): 1,
 ('<|sos|>', 'c', 

In [37]:
import collections
def get_pairs(splits):
  """Count how often each adjacent symbol pair occurs across all words.

  Args:
    splits: mapping from a word (a sequence of symbols) to its frequency.

  Returns:
    A ``defaultdict(int)`` mapping ``(left, right)`` symbol pairs to the sum
    of the frequencies of the words in which the pair appears, counted once
    per occurrence.
  """
  pair_counts = collections.defaultdict(int)
  for word, freq in splits.items():
    tokens = list(word)
    # Pair each symbol with its right-hand neighbour.
    for left, right in zip(tokens, tokens[1:]):
      pair_counts[(left, right)] += freq
  return pair_counts

# Inspect the adjacent-pair counts for the initial character-level splits.
get_pairs(word_splits)

defaultdict(int,
            {('<|sos|>', 'T'): 1,
             ('T', 'h'): 1,
             ('h', 'e'): 3,
             ('e', '<|eos|>'): 6,
             ('<|sos|>', 'm'): 4,
             ('m', 'o'): 3,
             ('o', 'd'): 2,
             ('d', 'e'): 3,
             ('e', 'l'): 2,
             ('l', '’'): 1,
             ('’', 's'): 1,
             ('s', '<|eos|>'): 5,
             ('<|sos|>', 'p'): 2,
             ('p', 'e'): 5,
             ('e', 'r'): 8,
             ('r', 'f'): 2,
             ('f', 'o'): 3,
             ('o', 'r'): 6,
             ('r', 'm'): 3,
             ('m', 'a'): 2,
             ('a', 'n'): 5,
             ('n', 'c'): 2,
             ('c', 'e'): 2,
             ('<|sos|>', 'i'): 1,
             ('i', 'm'): 3,
             ('m', 'p'): 1,
             ('p', 'r'): 2,
             ('r', 'o'): 2,
             ('o', 'v'): 1,
             ('v', 'e'): 1,
             ('e', 'd'): 2,
             ('d', '<|eos|>'): 3,
             ('<|sos|>', 's'): 3,
           

In [38]:
def merge_pair(pair, splits):
  """Replace every adjacent occurrence of `pair` with a single merged token.

  Args:
    pair: ``(first, second)`` symbols to merge; the merged token is their
      string concatenation ``first + second``.
    splits: mapping from a word (a sequence of symbols) to its frequency.

  Returns:
    A new dict with the same words re-tokenized after the merge. `splits`
    itself is not modified. Occurrences are matched left to right without
    overlap, so ``('a', 'a', 'a')`` merged on ``('a', 'a')`` becomes
    ``('aa', 'a')``.
  """
  first, second = pair
  merged_token = first + second
  new_splits = {}
  for word, freq in splits.items():
    symbols = list(word)
    new_symbols = []
    i = 0
    while i < len(symbols):
      if i < len(symbols) - 1 and symbols[i] == first and symbols[i + 1] == second:
        new_symbols.append(merged_token)
        i += 2  # skip both halves of the merged pair
      else:
        new_symbols.append(symbols[i])
        i += 1
    merged_word = tuple(new_symbols)
    # Accumulate instead of assigning: two different input keys can collapse
    # to the same tuple after the merge, and plain assignment would silently
    # drop the frequency of all but the last one.
    new_splits[merged_word] = new_splits.get(merged_word, 0) + freq

  return new_splits



In [39]:
# Number of merge rules to learn (upper bound; training stops early if no pairs remain).
num_merges = 15

# Learned merge rules in the order they were learned: (left, right) -> merged token.
merges = {}

# Train on a copy so the original `word_splits` dict is left untouched for re-runs.
current_splits = word_splits.copy()

print("Starting BPE Merges\n")
print(f"Initial Splits: {current_splits}")
print("*"*30)

for i in range(num_merges):
  print(f"\nIteration {i+1}/{num_merges}")
  pairs = get_pairs(current_splits)
  if not pairs:
    print("No more pairs to merge")
    break

  sorted_pairs = sorted(pairs.items(), key=lambda x: x[1], reverse=True)
  print(f"\nTop 5 Pair Frequencies: {sorted_pairs[:5]}")

  # The ranking is already computed, so take its head instead of scanning
  # `pairs` again. sorted() is stable (also with reverse=True), so ties are
  # resolved in favour of the first-seen pair, same as max(pairs, key=pairs.get).
  best_pair, best_freq = sorted_pairs[0]
  print(f"\nFound Best Pair: {best_pair}: best_freq: {best_freq}")

  current_splits = merge_pair(best_pair, current_splits)
  new_token = best_pair[0] + best_pair[1]
  print(f"\nMerging {best_pair} into {new_token}")
  print(f"\n Splits after merge: {current_splits}")

  # `vocabulary` is shared with earlier cells; guard the append so that
  # re-running this cell does not add the same merged tokens a second time.
  if new_token not in vocabulary:
    vocabulary.append(new_token)
  merges[best_pair] = new_token
  print("*"*30)

Starting BPE Merges

Initial Splits: {('<|sos|>', 'T', 'h', 'e', '<|eos|>'): 1, ('<|sos|>', 'm', 'o', 'd', 'e', 'l', '’', 's', '<|eos|>'): 1, ('<|sos|>', 'p', 'e', 'r', 'f', 'o', 'r', 'm', 'a', 'n', 'c', 'e', '<|eos|>'): 1, ('<|sos|>', 'i', 'm', 'p', 'r', 'o', 'v', 'e', 'd', '<|eos|>'): 1, ('<|sos|>', 's', 'l', 'i', 'g', 'h', 't', 'l', 'y', '<|eos|>'): 1, ('<|sos|>', 'a', 'f', 't', 'e', 'r', '<|eos|>'): 1, ('<|sos|>', 'f', 'i', 'n', 'e', '-', 't', 'u', 'n', 'i', 'n', 'g', '<|eos|>'): 1, ('<|sos|>', 'o', 'n', '<|eos|>'): 1, ('<|sos|>', 'd', 'o', 'm', 'a', 'i', 'n', '-', 's', 'p', 'e', 'c', 'i', 'f', 'i', 'c', '<|eos|>'): 1, ('<|sos|>', 'd', 'a', 't', 'a', '.', '<|eos|>'): 1, ('<|sos|>', 'A', '<|eos|>'): 1, ('<|sos|>', 'l', 'i', 'g', 'h', 't', 'w', 'e', 'i', 'g', 'h', 't', '<|eos|>'): 1, ('<|sos|>', 't', 'r', 'a', 'n', 's', 'f', 'o', 'r', 'm', 'e', 'r', '<|eos|>'): 1, ('<|sos|>', 'c', 'a', 'n', '<|eos|>'): 1, ('<|sos|>', 'o', 'f', 't', 'e', 'n', '<|eos|>'): 1, ('<|sos|>', 'o', 'u', 't', 

In [40]:
# Summarize training: final vocabulary size, merge rules in learned order,
# and the final tokenization of every corpus word.
print("\n---BPE Merges Complete---")
print(f"Final Vocab Size: {len(vocabulary)}")
print("Learned Merges: (Pair --> New Token)")

for pair in merges:
  print(f"{pair} --> {merges[pair]}")

print("*" * 37)
print(f"\n{current_splits}")


---BPE Merges Complete---
Final Vocab Size: 49
Learned Merges: (Pair --> New Token)
('e', 'r') --> er
('e', '<|eos|>') --> e<|eos|>
('o', 'r') --> or
('i', 'n') --> in
('n', '<|eos|>') --> n<|eos|>
('s', '<|eos|>') --> s<|eos|>
('<|sos|>', 'c') --> <|sos|>c
('t', 'h') --> th
('<|sos|>', 'm') --> <|sos|>m
('p', 'er') --> per
('<|sos|>', 'o') --> <|sos|>o
('.', '<|eos|>') --> .<|eos|>
('t', 'e') --> te
('<|sos|>', 'w') --> <|sos|>w
('<|sos|>c', 'o') --> <|sos|>co
*************************************

{('<|sos|>', 'T', 'h', 'e<|eos|>'): 1, ('<|sos|>m', 'o', 'd', 'e', 'l', '’', 's<|eos|>'): 1, ('<|sos|>', 'per', 'f', 'or', 'm', 'a', 'n', 'c', 'e<|eos|>'): 1, ('<|sos|>', 'i', 'm', 'p', 'r', 'o', 'v', 'e', 'd', '<|eos|>'): 1, ('<|sos|>', 's', 'l', 'i', 'g', 'h', 't', 'l', 'y', '<|eos|>'): 1, ('<|sos|>', 'a', 'f', 't', 'er', '<|eos|>'): 1, ('<|sos|>', 'f', 'in', 'e', '-', 't', 'u', 'n', 'in', 'g', '<|eos|>'): 1, ('<|sos|>o', 'n<|eos|>'): 1, ('<|sos|>', 'd', 'o', 'm', 'a', 'in', '-', 's', 'p