In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to them take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
from typing import List
from collections import defaultdict, Counter
import tokenizer
import re
import json

In [None]:
def corpus_common_tokens(str_list: List[str], max_tokens: int = 30000) -> List[str]:
    """Return the most frequent word/punctuation tokens found in a corpus.

    Each text is split into runs of word characters and single characters
    that are neither word characters nor whitespace; whitespace itself is
    discarded. Token frequencies are accumulated over every text.

    Args:
        str_list: The texts that make up the corpus.
        max_tokens: Upper bound on the number of tokens returned. Defaults to
            30000, the limit that was previously hard-coded.

    Returns:
        Token strings ordered from most to least frequent. Tokens with equal
        counts keep the order in which they were first seen.
    """
    token_pattern = re.compile(r"\w+|[^\w\s]")

    # Count text by text so the whole corpus is never materialised as one
    # flat list of tokens.
    word_counts = Counter()
    for text in str_list:
        word_counts.update(token_pattern.findall(text))

    return [token for token, _ in word_counts.most_common(max_tokens)]
# Check corpus_common_tokens with the test helper from the imported
# `tokenizer` module.
tokenizer.test_tokenizer_from_corpus_fn(corpus_common_tokens)

Cool!


In [None]:
class Tokenizer:
    """Word-level tokenizer backed by a fixed vocabulary.

    The vocabulary is given as a sequence of dicts, each carrying a 'piece'
    (the token string) and an 'id' (its integer id). Text is split into runs
    of word characters and single punctuation characters; any piece missing
    from the vocabulary is mapped to ``unk_token_id``.
    """

    def __init__(self, token_list) -> None:
        """Build the lookup tables from ``token_list``.

        Args:
            token_list: Sequence of dicts with 'piece' and 'id' keys.
        """
        self.token_list = token_list
        self.piece_to_id = {token['piece']: token['id'] for token in token_list}
        self.vocab = {token['piece']: token for token in token_list}
        self.id_to_piece = {token['id']: token['piece'] for token in token_list}
        self.unk_token_id = 3 # [UNK]

    def decode(self, id_list: List[int]) -> str:
        """Turn a list of token ids back into a space-separated string.

        Pieces are joined with single spaces, so the original whitespace of a
        tokenized text is not reproduced.

        Args:
            id_list: Token ids to decode.

        Returns:
            The decoded pieces joined by single spaces.

        Raises:
            KeyError: If an id other than ``unk_token_id`` is not in the
                vocabulary.
        """
        pieces = []
        for token_id in id_list:
            if token_id in self.id_to_piece:
                pieces.append(self.id_to_piece[token_id])
            elif token_id == self.unk_token_id:
                # tokenize() emits unk_token_id for unknown pieces even when
                # the vocabulary has no entry for that id; render it rather
                # than failing on our own output.
                pieces.append("[UNK]")
            else:
                raise KeyError(token_id)
        return ' '.join(pieces)

    def tokenize(self, string: str) -> List[int]:
        """Split ``string`` into pieces and map each piece to its id.

        Args:
            string: Text to tokenize.

        Returns:
            One id per piece; pieces absent from the vocabulary are replaced
            by ``unk_token_id``.
        """
        token_split = re.findall(r"\w+|[^\w\s]", string)
        return [self.piece_to_id.get(token, self.unk_token_id) for token in token_split]
        
# Check the Tokenizer class with the test helper from the imported
# `tokenizer` module.
tokenizer.test_tokenizer(Tokenizer)

# Read the saved vocabulary inside a context manager so the file handle is
# closed as soon as the JSON has been parsed.
with open('bpe_tokens.json') as f:
    token_list = json.load(f)

# Round-trip a sample sentence through tokenize/decode; the cell displays the
# decoded string.
t = Tokenizer(token_list)
t.decode(t.tokenize("Hello there. POSIdfjpso $ $"))

class BPETokenizer(Tokenizer):
    """Character-level byte-pair-encoding tokenizer.

    The vocabulary is expected to list pieces in the order they were learned:
    ``tokenize`` replays the merges by walking the vocabulary in insertion
    order.

    NOTE(review): ``decode`` is inherited and joins pieces with single spaces,
    so it does not reproduce the original text for BPE pieces -- confirm
    whether an override is wanted.
    """

    # Ids handed out by from_corpus start here; lower ids are left unassigned
    # (the base class uses id 3 for [UNK]).
    _FIRST_CORPUS_ID = 5

    @staticmethod
    def _merge_pass(tokens, is_match, merged):
        """Replace each adjacent pair accepted by ``is_match`` with ``merged``.

        The scan runs left to right and never re-examines a token that was
        just consumed by a merge, so overlapping candidates are resolved in
        favour of the leftmost pair.
        """
        result = []
        i, n = 0, len(tokens)
        while i < n:
            if i + 1 < n and is_match(tokens[i], tokens[i + 1]):
                result.append(merged)
                i += 2
            else:
                result.append(tokens[i])
                i += 1
        return result

    def tokenize(self, string: str) -> List[int]:
        """Encode ``string`` by replaying the vocabulary's merges in order.

        The string starts as a list of single characters. For every
        multi-character piece in the vocabulary, in insertion order, each
        adjacent pair of tokens whose concatenation equals that piece is
        merged into it.

        Args:
            string: Text to tokenize.

        Returns:
            One id per resulting token; tokens that are not in the vocabulary
            (for example characters never seen when it was built) are mapped
            to ``unk_token_id``.
        """
        tokens = list(string)

        for piece in self.piece_to_id:
            if len(tokens) < 2:
                break  # nothing left to merge
            if len(piece) < 2:
                # Two non-empty tokens can never concatenate to a piece this
                # short, so the pass would be a no-op.
                continue
            tokens = self._merge_pass(
                tokens, lambda left, right: left + right == piece, piece
            )

        return [self.piece_to_id.get(token, self.unk_token_id) for token in tokens]

    @classmethod
    def from_corpus(cls, corpus, num_tokens=1000):
        """Learn a BPE vocabulary from ``corpus`` and build a tokenizer.

        Starts from the set of distinct characters in the corpus and then
        repeatedly merges the most frequent adjacent token pair until the
        vocabulary holds ``num_tokens`` pieces or no pairs are left to merge.

        Args:
            corpus: Iterable of strings to learn from.
            num_tokens: Target vocabulary size (special ids not counted).

        Returns:
            An instance of ``cls`` whose vocabulary lists the base characters
            first, followed by the merged pieces in the order learned.

        NOTE(review): for the corpus ["a   b   c"] the notebook output shows
        the `tokenizer` module's BPETokenizer choosing different merges
        ('a  ', 'a   ') than this method ('a   ', 'a   b'); the intended
        tie-breaking / merge strategy is unconfirmed.
        """
        first_id = cls._FIRST_CORPUS_ID

        # Base vocabulary: one entry per distinct character. Ids follow set
        # iteration order, which is not stable across interpreter runs when
        # string hashing is randomised.
        unique_tokens = {
            char: index + first_id
            for index, char in enumerate(set("".join(corpus)))
        }  # token: id

        corpus = [list(text) for text in corpus]
        while len(unique_tokens) < num_tokens:
            pair_count = Counter(
                (tokens[i], tokens[i + 1])
                for tokens in corpus
                for i in range(len(tokens) - 1)
            )
            if not pair_count:
                # Every document is down to at most one token: nothing left
                # to merge, so stop instead of indexing an empty result.
                break

            tok1, tok2 = pair_count.most_common(1)[0][0]
            merged = tok1 + tok2
            if merged not in unique_tokens:
                # Defensive: never re-assign an id to an existing piece, which
                # would leave the next new piece with a duplicate id.
                unique_tokens[merged] = len(unique_tokens) + first_id

            corpus = [
                cls._merge_pass(
                    tokens,
                    lambda left, right: left == tok1 and right == tok2,
                    merged,
                )
                for tokens in corpus
            ]

        return cls([{'piece': k, 'id': v} for k, v in unique_tokens.items()])

# tokenizer.test_bpe_tokenizer_from_corpus(BPETokenizer)

# Print the vocabulary learned from the same tiny corpus, first by the
# `tokenizer` module's BPETokenizer and then by the one defined in this
# notebook, so the two can be compared by eye.
for bpe_cls in (tokenizer.BPETokenizer, BPETokenizer):
    print(bpe_cls.from_corpus(["a   b   c"], num_tokens = 8).vocab.keys())

# Check tokenization with the test helper from the `tokenizer` module.
tokenizer.test_bpe_tokenizer(BPETokenizer)

Extra cool
dict_keys(['c', ' ', 'b', 'a', '  ', '   ', 'a  ', 'a   '])
dict_keys(['c', ' ', 'b', 'a', '  ', '   ', 'a   ', 'a   b'])
hello, my name is tom trundlewich
dict_keys([':', 'm', '1', 'V', 'D', 'l', 'v', '2', '5', 'K', 'o', 'B', 'C', 'X', 'b', 'I', 'R', ';', ')', 'Y', 'h', 'G', '(', 'A', '?', 'q', 'e', ' ', 'F', '.', ',', '!', ']', 'W', 'M', 'U', 's', 'a', 'r', '0', 'p', '-', 'i', 'S', 'E', 'O', 'N', 'g', '<', "'", 'x', 'J', '4', '\n', '3', '[', 'T', 'L', 'w', 'n', 'y', 'k', 't', 'H', 'u', 'j', '>', 'c', 'z', 'P', 'd', '9', 'f', '  ', '   ', 'e ', '    ', 'th', ' t', 't ', 's ', 'ou', ', ', 'er', 'in', 'd ', '. ', 'an', 'or', 'y ', 'o ', 'll', '.\n', 'ha', 'on', 'hi', 'you', ' th', 'en', 'I ', '        ', 'ar', 'ea', 'of', 'es', ' s', 'll ', 'no', 'er ', 'ing', 'and ', 've ', 'a ', 'wi', 'is ', ',\n', 'the ', 'r ', 'of ', 'he ', 'om', 'it', '; ', 're', 'for', ' to ', 'st', 'ER', 'se', 'ch', 'at ', 'at', ' the ', ' m', 'ir', 'ow', 'his ', 'OL', 'him', 'e, ', 'LO', "'s ", '     