In [None]:
# Environment setup for running this notebook on Google Colab.
# Each line below installs one third-party package into the runtime.
!pip install transformers
!pip install sentencepiece
!pip install einops
!pip install torchtyping
!pip install unidecode
import torch as t

# Mount Google Drive and make the course directory the working directory, so
# the relative paths opened in later cells ('bpe_tokens.json',
# 'shakespeare.txt') are looked up there.
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/mlab/days/w2d4

# NOTE: I had to modify the tokenizer.py file for this to work with my setup.
# `tok_tests` supplies the test helpers and the reference BPETokenizer that
# later cells call.
import days.w2d4.tokenizer as tok_tests

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/mlab/days/w2d4


# Part 0: string.split()

In [None]:
import re
def corpus_common_tokens(strs, topk=30000):
  """Return the `topk` most frequent tokens found in `strs`, most frequent first.

  Every string is split into runs of word characters and single characters
  that are neither word characters nor whitespace. Tokens with equal counts
  keep the order in which they were first seen (dict insertion order plus a
  stable sort).
  """
  pattern = re.compile(r"\w+|[^\w\s]")
  freq = {}
  for text in strs:
    for piece in pattern.findall(text):
      freq[piece] = freq.get(piece, 0) + 1
  ranked = sorted(freq, key=lambda piece: freq[piece], reverse=True)
  return ranked[:topk]

# Check corpus_common_tokens with the test helper from tok_tests.
tok_tests.test_tokenizer_from_corpus_fn(corpus_common_tokens)

In [None]:
class Tokenizer:
  """Word-level tokenizer backed by a fixed vocabulary.

  `token_list` is a sequence of dicts, each carrying a 'piece' (the token
  text) and an 'id'. Pieces that are not in the vocabulary are encoded as
  `UNK_ID`.
  """

  def __init__(self, token_list):
    self.token_list = token_list
    # Lookup tables in both directions; on duplicate keys the later entry wins.
    self.token_by_piece = {entry['piece']: entry for entry in token_list}
    self.token_by_id = {entry['id']: entry for entry in token_list}
    self.UNK_ID = 3

  def decode(self, ids):
    """Concatenate the pieces for `ids`, with no separator, into one string.

    Raises KeyError for any id that has no entry in the vocabulary (this
    includes `UNK_ID` when `token_list` has no entry with that id).
    """
    return "".join(self.token_by_id[token_id]['piece'] for token_id in ids)

  def tokenize(self, string):
    """Split `string` into word runs and single punctuation marks and return their ids."""
    encoded = []
    for piece in re.findall(r"\w+|[^\w\s]", string):
      if piece in self.token_by_piece:
        encoded.append(self.token_by_piece[piece]['id'])
      else:
        encoded.append(self.UNK_ID)
    return encoded

# Check the Tokenizer class with the test helper from tok_tests.
tok_tests.test_tokenizer(Tokenizer)

# Part 1: BPE

In [None]:
from collections import defaultdict

In [None]:
import json

# Load the vocabulary that is later passed to BPETokenizer (which expects a
# list of dicts with 'piece' and 'id' keys). The encoding is pinned to UTF-8 so
# the result does not depend on the platform's locale default.
with open('bpe_tokens.json', encoding='utf-8') as f:
  token_list = json.load(f)

In [None]:
class BPETokenizer(Tokenizer):
  """Byte-pair-encoding tokenizer built on `Tokenizer`'s vocabulary tables.

  Inherits `__init__` and `decode`, overrides `tokenize`, and adds
  `from_corpus` for learning a vocabulary from text.
  """

  def tokenize(self, string):
    """Encode `string` as a list of token ids.

    Starts from single characters and then, for each vocabulary entry in
    `token_list` order, replaces every adjacent pair of pieces whose
    concatenation equals that entry's piece. Pieces that end up outside the
    vocabulary are mapped to `UNK_ID`.
    """
    pcs = list(string)
    for tok in self.token_list:
      pc = tok['piece']
      i = 0
      while i < len(pcs) - 1:
        if pcs[i] + pcs[i+1] == pc:
          # Merge in place and stay at i: the merged piece plus its new right
          # neighbour is longer than pc, so the next check falls through to
          # the else branch and advances.
          pcs[i] = pc
          pcs.pop(i+1)
        else:
          i += 1
    return [self.token_by_piece[p]['id'] if p in self.token_by_piece
            else self.UNK_ID
            for p in pcs]

  @staticmethod
  def from_corpus(corpus, vocab_size=1000):
    """Learn a BPE vocabulary from `corpus` and return a `BPETokenizer`.

    Args:
      corpus: iterable of strings; each string is merged independently, so
        no learned token spans two strings.
      vocab_size: target number of learned pieces (single characters plus
        merges). Training stops earlier if no adjacent pair is left to merge.

    Returns:
      A `BPETokenizer` whose pieces are numbered from 4 upward in the order
      they were added (ids below 4 are left unused here; `UNK_ID` is 3).
    """
    corpus = [list(text) for text in corpus]
    # Seed the vocabulary with every character; sorted so that the assigned
    # ids are reproducible across runs (set iteration order is not).
    token_list = sorted({c for text in corpus for c in text})

    # add new tokens until arriving at desired vocab size
    while len(token_list) < vocab_size:
      # count adjacent pairs
      byte_pairs = defaultdict(int)
      for text in corpus:
        for pair in zip(text, text[1:]):
          byte_pairs[pair] += 1
      if not byte_pairs:
        # Every text is down to at most one piece: nothing left to merge.
        break
      # most common pair; ties go to the pair that was seen first
      bp = max(byte_pairs, key=byte_pairs.get)
      new_token = bp[0] + bp[1]
      token_list.append(new_token)
      # replace occurrences of bp with the new token
      for text in corpus:
        i = 0
        while i < len(text) - 1:
          if (text[i], text[i+1]) == bp:
            text[i:i+2] = [new_token]
          i += 1

    token_list = [{'piece': piece, 'id': idx + 4}
                  for idx, piece in enumerate(token_list)]
    return BPETokenizer(token_list)

In [None]:
# Sanity check: build a BPETokenizer from `token_list` (loaded from
# 'bpe_tokens.json' in an earlier cell) and round-trip one sentence through
# tokenize() and decode().
tokenizer = BPETokenizer(token_list)
token_ids = tokenizer.tokenize("Hello my name is Sam.")
print(token_ids)
print(tokenizer.decode(token_ids))

[15496, 616, 1438, 318, 3409, 764]
Hello my name is Sam.


In [None]:
# Use only the first 100 lines of the file as the training corpus. Reading
# lazily avoids loading the whole file just to discard most of it, and the
# explicit encoding keeps the result independent of the locale default.
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
  corpus = [line for _, line in zip(range(100), f)]

# Train my tokenizer and the reference one from tok_tests on the same corpus.
# The reference call uses its own default vocabulary size.
tokenizer = BPETokenizer.from_corpus(corpus, vocab_size=1000)
soln_tokenizer = tok_tests.BPETokenizer.from_corpus(corpus)

In [None]:
# Tokenize one sentence with both tokenizers and print every position where
# the decoded pieces differ.
test = "If I were a rich man I would eat a palm tree."
# Retrains the reference tokenizer, repeating the training done in the previous cell.
soln_tokenizer = tok_tests.BPETokenizer.from_corpus(corpus)
token_ids = tokenizer.tokenize(test)
soln_token_ids = soln_tokenizer.tokenize(test)
# zip() stops at the shorter sequence, so a difference in token count would
# otherwise go unnoticed; report it explicitly.
if len(token_ids) != len(soln_token_ids):
  print("token count differs:", len(token_ids), "vs", len(soln_token_ids))
for t1, t2 in zip(token_ids, soln_token_ids):
  s1, s2 = tokenizer.decode([t1]), soln_tokenizer.decode([t2])
  if s1 != s2: print(s1, s2)

In [None]:
# Print every piece in my learned vocabulary that is missing from the
# reference tokenizer's vocabulary.
tokens = [t['piece'] for t in tokenizer.token_list]
soln_tokens = [t['piece'] for t in soln_tokenizer.token_list]
for p in tokens:
  if p not in soln_tokens: 
    print(p)

THE 
IT
IONS 
ION 
INE 
COPI
COPIES 
ITIONS 
AT
<<THIS ELECTRONIC VERS
<<THIS ELECTRONIC VERSION OF THE 
<<THIS ELECTRONIC VERSION OF THE COMPLETE WORKS 
PROVIDED BY PROJECT GUTENBERG ETEXT OF ILLINOIS BENED
PROVIDED BY PROJECT GUTENBERG ETEXT OF ILLINOIS BENEDICTINE 
WITH PERMISSION.  ELECTRONIC AND MACHINE READABLE 
OF THE FUTURE COND
OF THE FUTURE CONDITIONS 
OF THE FUTURE CONDITIONS OF THIS PRESENTAT
TO GIVE IT AWAY TO ANYONE YOU LIKE, BUT NO CHARGES ARE AL
TO GIVE IT AWAY TO ANYONE YOU LIKE, BUT NO CHARGES ARE ALL
TO GIVE IT AWAY TO ANYONE YOU LIKE, BUT NO CHARGES ARE ALLOW
TO GIVE IT AWAY TO ANYONE YOU LIKE, BUT NO CHARGES ARE ALLOWED!!

**W


My `from_corpus` doesn't match the solution's, but I think mine looks better (e.g., it learns that "THE " should be a token), and I'm more confident that my code is error-free.

In [None]:
# Run the tok_tests check on BPETokenizer (presumably exercising from_corpus — confirm in tok_tests).
# NOTE(review): the recorded output of this cell is an AssertionError.
tok_tests.test_tokenizer_from_corpus(BPETokenizer)

AssertionError: ignored