In [1]:
from transformers import AutoTokenizer

In [21]:
corpus=[
    "This is my google colab notebook for transformers.",
    "I am learning transformers now.",
    "huggingface include many transformers!",
    "There are dataset in huggingface also!"]
tokenizer=AutoTokenizer.from_pretrained("gpt2")

In [22]:
from collections import defaultdict

word_freq=defaultdict(int)

for text in corpus:
  words_with_offsets=tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
  new_words=[word for word, offset in words_with_offsets]
  for word in new_words:
    word_freq[word]+=1

word_freq

defaultdict(int,
            {'This': 1,
             'Ġis': 1,
             'Ġmy': 1,
             'Ġgoogle': 1,
             'Ġcolab': 1,
             'Ġnotebook': 1,
             'Ġfor': 1,
             'Ġtransformers': 3,
             '.': 2,
             'I': 1,
             'Ġam': 1,
             'Ġlearning': 1,
             'Ġnow': 1,
             'huggingface': 1,
             'Ġinclude': 1,
             'Ġmany': 1,
             '!': 2,
             'There': 1,
             'Ġare': 1,
             'Ġdataset': 1,
             'Ġin': 1,
             'Ġhuggingface': 1,
             'Ġalso': 1})

In [23]:
alphabets=[]

for word in word_freq.keys():
  for char in word:
    if char not in alphabets:
      alphabets.append(char)
print(alphabets)

['T', 'h', 'i', 's', 'Ġ', 'm', 'y', 'g', 'o', 'l', 'e', 'c', 'a', 'b', 'n', 't', 'k', 'f', 'r', '.', 'I', 'w', 'u', 'd', '!']


In [24]:
vocab = ["<|endoftext|>"] + alphabets.copy()

In [25]:
splits={word:[c for c in word] for word in word_freq}
splits

{'This': ['T', 'h', 'i', 's'],
 'Ġis': ['Ġ', 'i', 's'],
 'Ġmy': ['Ġ', 'm', 'y'],
 'Ġgoogle': ['Ġ', 'g', 'o', 'o', 'g', 'l', 'e'],
 'Ġcolab': ['Ġ', 'c', 'o', 'l', 'a', 'b'],
 'Ġnotebook': ['Ġ', 'n', 'o', 't', 'e', 'b', 'o', 'o', 'k'],
 'Ġfor': ['Ġ', 'f', 'o', 'r'],
 'Ġtransformers': ['Ġ',
  't',
  'r',
  'a',
  'n',
  's',
  'f',
  'o',
  'r',
  'm',
  'e',
  'r',
  's'],
 '.': ['.'],
 'I': ['I'],
 'Ġam': ['Ġ', 'a', 'm'],
 'Ġlearning': ['Ġ', 'l', 'e', 'a', 'r', 'n', 'i', 'n', 'g'],
 'Ġnow': ['Ġ', 'n', 'o', 'w'],
 'huggingface': ['h', 'u', 'g', 'g', 'i', 'n', 'g', 'f', 'a', 'c', 'e'],
 'Ġinclude': ['Ġ', 'i', 'n', 'c', 'l', 'u', 'd', 'e'],
 'Ġmany': ['Ġ', 'm', 'a', 'n', 'y'],
 '!': ['!'],
 'There': ['T', 'h', 'e', 'r', 'e'],
 'Ġare': ['Ġ', 'a', 'r', 'e'],
 'Ġdataset': ['Ġ', 'd', 'a', 't', 'a', 's', 'e', 't'],
 'Ġin': ['Ġ', 'i', 'n'],
 'Ġhuggingface': ['Ġ', 'h', 'u', 'g', 'g', 'i', 'n', 'g', 'f', 'a', 'c', 'e'],
 'Ġalso': ['Ġ', 'a', 'l', 's', 'o']}

In [26]:
def pair_freqs_compute(splits):
  pair_freqs = defaultdict(int)

  for word,freq in word_freq.items():
    split=splits[word]
    if len(split)==1:
      continue

    for i in range(len(split)-1):
      pair=(split[i],split[i+1])
      pair_freqs[pair]+=freq

  return pair_freqs
pair_freqs=pair_freqs_compute(splits)
pair_freqs

defaultdict(int,
            {('T', 'h'): 2,
             ('h', 'i'): 1,
             ('i', 's'): 2,
             ('Ġ', 'i'): 3,
             ('Ġ', 'm'): 2,
             ('m', 'y'): 1,
             ('Ġ', 'g'): 1,
             ('g', 'o'): 1,
             ('o', 'o'): 2,
             ('o', 'g'): 1,
             ('g', 'l'): 1,
             ('l', 'e'): 2,
             ('Ġ', 'c'): 1,
             ('c', 'o'): 1,
             ('o', 'l'): 1,
             ('l', 'a'): 1,
             ('a', 'b'): 1,
             ('Ġ', 'n'): 2,
             ('n', 'o'): 2,
             ('o', 't'): 1,
             ('t', 'e'): 1,
             ('e', 'b'): 1,
             ('b', 'o'): 1,
             ('o', 'k'): 1,
             ('Ġ', 'f'): 1,
             ('f', 'o'): 4,
             ('o', 'r'): 4,
             ('Ġ', 't'): 3,
             ('t', 'r'): 3,
             ('r', 'a'): 3,
             ('a', 'n'): 4,
             ('n', 's'): 3,
             ('s', 'f'): 3,
             ('r', 'm'): 3,
             ('m', 'e'): 3,
   

In [28]:
best_pair=""
max_freq=None

for pair,freq in pair_freqs.items():
  if max_freq is None or freq>max_freq:
    best_pair=pair
    max_freq=freq

print(max_freq,best_pair)

5 ('i', 'n')


In [30]:
merges = {("Ġ", "t"): "Ġt"}
vocab.append("Ġt")

In [37]:
def merge_pair(a, b, splits):
    for word in word_freq:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

In [38]:
splits=merge_pair("i", "n", splits)
splits['huggingface']

['h', 'u', 'g', 'g', 'in', 'g', 'f', 'a', 'c', 'e']

In [40]:
vocab_size = 50

while len(vocab) < vocab_size:
    pair_freqs = pair_freqs_compute(splits)
    best_pair = ""
    max_freq = None
    for pair, freq in pair_freqs.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])

In [41]:
print(merges)

{('Ġ', 't'): 'Ġt', ('f', 'o'): 'fo', ('fo', 'r'): 'for', ('a', 'n'): 'an', ('e', 'r'): 'er', ('Ġt', 'r'): 'Ġtr', ('Ġtr', 'an'): 'Ġtran', ('Ġtran', 's'): 'Ġtrans', ('Ġtrans', 'for'): 'Ġtransfor', ('Ġtransfor', 'm'): 'Ġtransform', ('Ġtransform', 'er'): 'Ġtransformer', ('Ġtransformer', 's'): 'Ġtransformers', ('Ġ', 'a'): 'Ġa', ('in', 'g'): 'ing', ('T', 'h'): 'Th', ('i', 's'): 'is', ('Ġ', 'm'): 'Ġm', ('o', 'o'): 'oo', ('l', 'e'): 'le', ('Ġ', 'n'): 'Ġn', ('Ġn', 'o'): 'Ġno', ('h', 'u'): 'hu', ('hu', 'g'): 'hug'}


In [42]:
print(vocab)

['<|endoftext|>', 'T', 'h', 'i', 's', 'Ġ', 'm', 'y', 'g', 'o', 'l', 'e', 'c', 'a', 'b', 'n', 't', 'k', 'f', 'r', '.', 'I', 'w', 'u', 'd', '!', 'Ġt', 'fo', 'for', 'an', 'er', 'Ġt', 'Ġtr', 'Ġtran', 'Ġtrans', 'Ġtransfor', 'Ġtransform', 'Ġtransformer', 'Ġtransformers', 'Ġa', 'ing', 'Th', 'is', 'Ġm', 'oo', 'le', 'Ġn', 'Ġno', 'hu', 'hug']


In [43]:
def tokenize(text):
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    splits = [[l for l in word] for word in pre_tokenized_text]
    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split

    return sum(splits, [])