In [1]:
import re
import collections
from typing import List
import tokenizer as tok_tests
import time

In [2]:
def corpus_common_tokens(corpus: List[str], most_common=30_000):
    concated_corpus = " ".join(corpus)
    token_list = re.findall(r"\w+|[^\w\s]", concated_corpus)
    counts_dict = collections.Counter(token_list)
    most_common_items = counts_dict.most_common(most_common)
    return [k for k, v in most_common_items]

tok_tests.test_tokenizer_from_corpus_fn(corpus_common_tokens)

In [3]:
class Tokenizer:
    def __init__(self, token_list, unk_id=3) -> None:
        self.str_to_id = {d['piece']:d['id'] for d in token_list}
        self.id_to_str = {v: k for k, v in self.str_to_id.items()}
        self.unk_id = unk_id
    
    def decode(self, input_ids) -> str:
        decoded = [self.id_to_str[i] for i in input_ids]
        return " ".join(decoded)
    
    def tokenize(self, string) -> List[int]:
        token_strs = re.findall(r"\w+|[^\w\s]", string)
        return [self.str_to_id.get(string, self.unk_id) for string in token_strs]

tok_tests.test_tokenizer(Tokenizer)

running their tokenizer
running our tokenizer


In [4]:
example_dict = {"a": 0, "b": 1, "c":2, "aa": 4, "aab": 5}
example_str = 'aabc'
class BPETokenizer(Tokenizer):
    def __init__(self, token_list):
        super().__init__(token_list)

    def tokenize(self, string) -> List[int]:

        # ["a", "a", "b", "c"]
        # ["aa", "b", "c"]
        # ["aab", "c"]

        char_list = list(string)
        have_merged = True

        while have_merged:
            i = 1
            have_merged = False

            while i < len(char_list) :
                token_pair = "".join(char_list[i -1: i + 1])
                is_token = token_pair in self.str_to_id

                if is_token:
                    char_list[i-1] = token_pair
                    char_list.pop(i)
                    have_merged = True

                i += 1
        
        print(char_list)

        return [self.str_to_id[tok] for tok in char_list]


tokenizer=BPETokenizer([{'piece':'a', 'id':0}])
tokenizer.str_to_id=example_dict
print(tokenizer.tokenize('aabc'))
tok_tests.test_bpe_tokenizer(BPETokenizer)

['aab', 'c']
[5, 2]
['he', 'll', 'o', ', my ', 'nam', 'e ', 'is', ' t', 'om', ' tru', 'n', 'd', 'le', 'wi', 'ch']


AssertionError: got (671, 96, 53, 464, 807, 80, 168, 83, 125, 471, 46, 14, 725, 118, 134) instead of (27, 296, 53, 464, 807, 80, 168, 83, 125, 83, 17, 504, 725, 118, 134), theirs:hello, my name is tom trundlewich, ours:hello, my name is tom trundlewich, theirs:f , ell, ours:he, ll