In [3]:
import re
import collections
from typing import List
import tokenizer as tok_tests

In [4]:
def corpus_common_tokens(corpus: List[str], most_common=30_000):
    concated_corpus = " ".join(corpus)
    token_list = re.findall(r"\w+|[^\w\s]", concated_corpus)
    counts_dict = collections.Counter(token_list)
    most_common_items = counts_dict.most_common(most_common)
    return [k for k, v in most_common_items]

tok_tests.test_tokenizer_from_corpus_fn(corpus_common_tokens)

In [5]:
class Tokenizer:
    def __init__(self, token_list, unk_id=3) -> None:
        self.str_to_id = {d['piece']:d['id'] for d in token_list}
        #print(self.str_to_id)
        self.id_to_str = {v: k for k, v in self.str_to_id.items()}
        self.unk_id = unk_id
    
    def decode(self, input_ids) -> str:
        decoded = [self.id_to_str[i] for i in input_ids]
        return " ".join(decoded)
    
    def tokenize(self, string) -> List[int]:
        token_strs = re.findall(r"\w+|[^\w\s]", string)
        return [self.str_to_id.get(string, self.unk_id) for string in token_strs]

tok_tests.test_tokenizer(Tokenizer)

In [9]:
class BPETokenizer(Tokenizer):
    def __init__(self, token_list):
        super().__init__(token_list)
        print(self.str_to_id)

    def tokenize(self, string) -> List[int]:
        # "aabc" with tokens {"a": 0, "b": 1, "c":2, "aa": 4, "aab": 5}
        # ["a", "a", "b", "c"]
        # ["aa", "b", "c"]
        # ["aab", "c"]

        char_list = list(string)
        token_list = []
        have_merged = True

        while have_merged:
            i = 1
            have_merged = False
            while i < len(char_list):
                token_pair = "".join(char_list[i - 1 : i + 1])
                if token_pair in self.str_to_id:
                    token_list.append(
                        self.str_to_id[token_pair]
                    )
                    i += 2
                    have_merged = True
                elif i != len(char_list) - 1:
                    token_list.append(self.str_to_id[char_list[i - 1]])
                    i += 1
                else:
                    token_list.append(self.str_to_id[char_list[i]])
                    i += 1
        
        return token_list


tok_tests.test_bpe_tokenizer(BPETokenizer)

{'.': 5, ',': 6, 'w': 7, '?': 8, "'": 9, 'o': 10, 'J': 11, '[': 12, ')': 13, '>': 14, '0': 15, 'G': 16, 'a': 17, 'z': 18, '(': 19, 'l': 20, '4': 21, '1': 22, 'P': 23, 'c': 24, 'H': 25, '\n': 26, 'L': 27, 'n': 28, 'b': 29, 'C': 30, 'v': 31, 'u': 32, 'p': 33, 'F': 34, '!': 35, 'U': 36, 'j': 37, 'M': 38, 'I': 39, 'E': 40, 'e': 41, 'S': 42, 'i': 43, 'f': 44, 's': 45, 'x': 46, 'N': 47, 't': 48, 'T': 49, '<': 50, 'h': 51, '-': 52, 'Y': 53, 'V': 54, '9': 55, ';': 56, ' ': 57, 'q': 58, 'X': 59, '3': 60, 'g': 61, '2': 62, 'K': 63, 'd': 64, 'O': 65, 'r': 66, 'm': 67, ':': 68, 'y': 69, 'A': 70, 'k': 71, 'W': 72, ']': 73, 'R': 74, 'B': 75, '5': 76, 'D': 77, '  ': 78, '   ': 79, 'e ': 80, '    ': 81, 'th': 82, ' t': 83, 't ': 84, 's ': 85, 'ou': 86, ', ': 87, 'er': 88, 'in': 89, 'd ': 90, '. ': 91, 'an': 92, 'or': 93, 'y ': 94, 'o ': 95, 'll': 96, '.\n': 97, 'ha': 98, 'on': 99, 'hi': 100, 'you': 101, ' th': 102, 'en': 103, 'I ': 104, '        ': 105, 'ar': 106, 'ea': 107, 'of': 108, 'es': 109, ' s'