In [1]:
import os
import regex
from tqdm import tqdm
from collections import defaultdict

def gen_word_count(files):
    """Count the pre-tokenized pieces found across a collection of text files.

    Each file is read in full and split with a regex that matches, in order
    of preference: common English contraction suffixes, runs of letters,
    runs of digits, runs of other non-space symbols (each optionally preceded
    by one space), and runs of whitespace. Every space character inside a
    piece is replaced by "Ġ" before it is counted.

    Args:
        files: iterable of paths to text files.

    Returns:
        A plain dict mapping each piece to the number of times it occurred.
    """
    pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
    count = defaultdict(int)
    for f in files:
        # Use a context manager so each handle is closed as soon as the file
        # has been read, instead of lingering until garbage collection.
        with open(f, 'r') as fh:
            data = fh.read()
        for w in regex.findall(pat_str, data):
            count[regex.sub(" ", "Ġ", w)] += 1
    return dict(count)
        
def get_substrings(counts):
    """Collect every contiguous, non-empty substring of the given words.

    Also prints the total number of distinct substrings, how many of them
    are single characters ("alphabet"), and how many are longer.

    Args:
        counts: iterable whose elements carry the word at index 0, e.g.
            ``(word, count)`` pairs.
            NOTE(review): iterating a plain dict yields its keys, so ``w[0]``
            would be only the first character of each key; pass
            ``some_dict.items()`` in that case.

    Returns:
        Set of all distinct substrings (single characters included).
    """
    T = set()
    for w in counts:
        word = w[0]
        n = len(word)
        for i in range(n):
            # j is the exclusive slice end, so it has to reach n; stopping at
            # n - 1 would drop every substring ending on the last character.
            for j in range(i+1, n+1):
                T.add(word[i:j])
    print('total', len(T))
    alphabet = [t for t in T if len(t)==1]
    print('alphabet', len(alphabet))
    print('T', len(T)-len(alphabet))
    print()
    return T

class TestTextIterator:
    """Single-pass iterator yielding batches of pre-tokenized files.

    Each call to ``next`` reads up to ``batch_size`` files and returns a list
    with one entry per file; each entry is the list of regex matches of
    ``pat_str`` in that file, with every space replaced by "Ġ".

    The read position is only initialised in ``__init__``, so an instance
    cannot be rewound: create a new one to iterate again.

    Args:
        files: sequence of paths to text files.
        pat_str: pattern passed to ``regex.findall`` to split each file.
        batch_size: number of files read per batch.
    """
    def __init__(self, files, pat_str, batch_size):
        self.files = files
        self.file_idx = 0
        self.pat_str = pat_str
        self.batch_size = batch_size
    def __len__(self):
        # Number of batches = ceil(len(files) / batch_size). The previous
        # "floor + 1" form reported one batch too many whenever the file
        # count was an exact multiple of batch_size (and 1 for no files).
        return -(-len(self.files) // self.batch_size)
    def __iter__(self):
        return self
    def __next__(self):
        if self.file_idx >= len(self.files):
            raise StopIteration
        current = self.file_idx
        self.file_idx += self.batch_size
        # The final batch may be shorter than batch_size.
        return [self._read_words(self.files[t])
               for t in range(current, min(self.file_idx,len(self.files)))]
    def _read_words(self, path):
        # Read one file, closing it promptly, and return its split pieces.
        with open(path, 'r') as fh:
            text = fh.read()
        return [regex.sub(" ", "Ġ", w) for w in regex.findall(self.pat_str, text)]
    
# Paths of every entry in ../project5/data/un/TXT/. os.listdir returns names in
# arbitrary order, so sort them to keep slices such as un_files[:350]
# reproducible between runs and machines.
un_files = [f"../project5/data/un/TXT/{f}" for f in sorted(os.listdir("../project5/data/un/TXT/"))]

In [2]:
from pcatt.hf.greedtok import GreedTok

# Pattern used to split each file into pieces before training.
pretokenize_pattern = r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

# Iterate over the first 350 files, 100 files per batch.
tti = TestTextIterator(un_files[:350], pretokenize_pattern, 100)

# The text iterator yields batches of lists of words, which is the input
# train_new_from_iterator is given here.
training_special_tokens = {
    "pad_token":"<pad>",
    "unk_token":"<unk>", 
    "eos_token":"<eos>"
}
GT_Train = GreedTok([], special_tokens_map={}).train_new_from_iterator(
    tti,
    vocab_size=100,
    special_tokens_map=training_special_tokens,
    min_word_count=1,
    max_token_size=1000,
)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Trie constructed
Word counts size: 32151
Token set size: 0
Empty token set size selected -> all possible substrings with...
Max token size: 1000
Min. word count: 1
len:  100
Final candidate token set size: 285402
Initial setup phase: 456 ms
0. |<pad> [3c 70 61 64 3e ] | 0
1. |<unk> [3c 75 6e 6b 3e ] | 0
2. |<eos> [3c 65 6f 73 3e ] | 0
Starting main routine...
4. |Ġ [c4 a0 ] | 308087 | 38 ms | 169 ms | shortlist: 274826
5. |Ġthe [c4 a0 74 68 65 ] | 76164 | 32 ms | 47 ms | shortlist: 1227
6. |tion [74 69 6f 6e ] | 45264 | 14 ms | 30 ms | shortlist: 37228
7. |Ġand [c4 a0 61 6e 64 ] | 31770 | 15 ms | 16 ms | shortlist: 299
8. |Ġof [c4 a0 6f 66 ] | 31192 | 10 ms | 11 ms | shortlist: 514
9. |in [69 6e ] | 30601 | 11 ms | 48 ms | shortlist: 84159
10. |re [72 65 ] | 25253 | 17 ms | 39 ms | shortlist: 49568
11. |Ġt [c4 a0 74 ] | 21088 | 15 ms | 20 ms | shortlist: 11004
12. |Ġa [c4 a0 61 ] | 20232 | 12 ms | 19 ms | shortlist: 21062
13. |en [65 6e ] | 20201 | 12 ms | 33 ms | shortlist: 44498
14. 

In [3]:
# Persist the trained tokenizer; the value returned by save_pretrained is
# displayed as this cell's output.
GT_Train.save_pretrained('pcatt/hf/examples/greedtok_test2')

tokenizer config file saved in pcatt/hf/examples/greedtok_test2/tokenizer_config.json
special_tokens_map file saved in pcatt/hf/examples/greedtok_test2/special_tokens_map.json
added tokens file saved in pcatt/hf/examples/greedtok_test2/added_tokens.txt


('pcatt/hf/examples/greedtok_test2/tokenizer_config.json',
 'pcatt/hf/examples/greedtok_test2/special_tokens_map.json',
 'pcatt/hf/examples/greedtok_test2/added_tokens.txt')

In [4]:
# A newly constructed iterator starts again at the first file.
tti = TestTextIterator(
    un_files[:350],
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
    100,
)

# First batch only: one list of words per file.
test_split = list(next(tti))

# Glue each file's words back together into a single string.
test = ["".join(file_words) for file_words in test_split]

# Length of the longest re-joined document.
print(max(len(document) for document in test))

33646


In [5]:
from pcatt.hf.greedtok import GreedTok
# Reload the tokenizer from the directory written by the earlier save_pretrained
# call, replacing the in-memory GT_Train with the deserialized copy.
GT_Train = GreedTok.from_pretrained("pcatt/hf/examples/greedtok_test2")

Trie constructed
unk_token <unk> 1
pad_token <pad> 0
eos_token <eos> 2


In [6]:
# Encode pre-split documents as pairs: the first 50 documents are paired
# with the remaining ones.
first_docs, second_docs = test_split[:50], test_split[50:]
test_encode = GT_Train(first_docs, second_docs, is_split_into_words=True)
print(test_encode['input_ids'][1][:100])

# Decode everything back and show the start of pair 1 with "Ġ" restored to spaces.
test_decode = GT_Train.batch_decode(test_encode['input_ids'])
decoded_pair = regex.sub("Ġ"," ", test_decode[1])
print(decoded_pair[:200])

[176, 201, 216, 40, 201, 63, 203, 214, 22, 82, 22, 201, 3, 221, 33, 146, 77, 92, 144, 6, 4, 3, 182, 201, 212, 217, 198, 208, 30, 3, 110, 211, 202, 3, 166, 82, 203, 31, 205, 197, 3, 217, 212, 21, 3, 221, 33, 214, 60, 38, 199, 5, 11, 215, 3, 180, 9, 74, 200, 72, 7, 4, 3, 171, 12, 13, 16, 93, 215, 81, 209, 198, 55, 11, 216, 98, 3, 110, 202, 17, 216, 221, 145, 81, 218, 12, 50, 23, 18, 74, 21, 146, 3, 189, 33, 214, 60, 220, 212, 13]
Let me congratulate you. Sir, and the Republic 
of Bulgaria upon your election as President of the General Assembly at its 
forty-seventh session. Your experience as a respected political leader and 



In [7]:
# Same pairing as the pre-split case, but passing the re-joined strings and
# letting the tokenizer handle them as unsplit text.
first_texts, second_texts = test[:50], test[50:]
test_encode = GT_Train(first_texts, second_texts, is_split_into_words=False)
print(test_encode['input_ids'][1][:100])

# Round-trip: decode all pairs, then display pair 1 with "Ġ" shown as spaces.
test_decode = GT_Train.batch_decode(test_encode['input_ids'])
decoded_pair_text = regex.sub("Ġ"," ", test_decode[1])
print(decoded_pair_text[:200])

[176, 201, 216, 40, 201, 63, 203, 214, 22, 82, 22, 201, 3, 221, 33, 146, 77, 92, 144, 6, 4, 3, 182, 201, 212, 217, 198, 208, 30, 3, 110, 211, 202, 3, 166, 82, 203, 31, 205, 197, 3, 217, 212, 21, 3, 221, 33, 214, 60, 38, 199, 5, 11, 215, 3, 180, 9, 74, 200, 72, 7, 4, 3, 171, 12, 13, 16, 93, 215, 81, 209, 198, 55, 11, 216, 98, 3, 110, 202, 17, 216, 221, 145, 81, 218, 12, 50, 23, 18, 74, 21, 146, 3, 189, 33, 214, 60, 220, 212, 13]
Let me congratulate you. Sir, and the Republic 
of Bulgaria upon your election as President of the General Assembly at its 
forty-seventh session. Your experience as a respected political leader and 



In [8]:
# Encode every re-joined document on its own (no pairing), as unsplit text.
test_encode = GT_Train(test, is_split_into_words=False)
print(test_encode['input_ids'][0][:100])

# Decode the whole batch and show the start of document 0 with "Ġ" as spaces.
test_decode = GT_Train.batch_decode(test_encode['input_ids'])
decoded_document = regex.sub("Ġ"," ", test_decode[0])
print(decoded_document[:200])

[173, 216, 97, 205, 218, 18, 40, 201, 26, 38, 56, 217, 9, 11, 216, 4, 3, 33, 216, 81, 216, 29, 3, 206, 211, 8, 4, 23, 212, 201, 197, 207, 13, 215, 25, 204, 211, 26, 9, 199, 24, 24, 3, 110, 209, 201, 27, 63, 203, 214, 22, 82, 22, 32, 3, 221, 33, 146, 77, 92, 144, 87, 3, 221, 33, 214, 60, 38, 199, 5, 29, 4, 26, 9, 74, 200, 12, 199, 221, 7, 4, 3, 110, 171, 12, 13, 16, 93, 215, 81, 209, 198, 55, 11, 216, 98, 66, 216, 221, 145]
It gives me pleasure at the outset to join the speakers who preceded 
me in congratulating you. Sir, on your election to the presidency of the 
General Assembly at its forty-seventh session. My delega


In [9]:
# Encode every document on its own, this time from the pre-split word lists.
test_encode = GT_Train(test_split, is_split_into_words=True)
print(test_encode['input_ids'][0][:100])

# Decode the whole batch and show the start of document 0 with "Ġ" as spaces.
test_decode = GT_Train.batch_decode(test_encode['input_ids'])
decoded_presplit_document = regex.sub("Ġ"," ", test_decode[0])
print(decoded_presplit_document[:200])

[173, 216, 97, 205, 218, 18, 40, 201, 26, 38, 56, 217, 9, 11, 216, 4, 3, 33, 216, 81, 216, 29, 3, 206, 211, 8, 4, 23, 212, 201, 197, 207, 13, 215, 25, 204, 211, 26, 9, 199, 24, 24, 3, 110, 209, 201, 27, 63, 203, 214, 22, 82, 22, 32, 3, 221, 33, 146, 77, 92, 144, 87, 3, 221, 33, 214, 60, 38, 199, 5, 29, 4, 26, 9, 74, 200, 12, 199, 221, 7, 4, 3, 110, 171, 12, 13, 16, 93, 215, 81, 209, 198, 55, 11, 216, 98, 66, 216, 221, 145]
It gives me pleasure at the outset to join the speakers who preceded 
me in congratulating you. Sir, on your election to the presidency of the 
General Assembly at its forty-seventh session. My delega


In [10]:
from pcatt.hf.greedtok import GreedTok
# Build a small tokenizer directly from a hand-written ranked token list.
# The special tokens are listed in ranked_tokens as well, and the cell output
# reports them at their list positions (<pad> -> 7, <eos> -> 8).
GT = GreedTok(ranked_tokens = ['aa', 'bb', 'abc', 'bc', '12', '123', '34', "<pad>", "<eos>"],
         special_tokens_map = {"pad_token":"<pad>", "eos_token":"<eos>"})
# Save it; the returned value is displayed as the cell output.
GT.save_pretrained("pcatt/hf/examples/greedtok_test1")

Trie constructed
eos_token <eos> 8
pad_token <pad> 7
tokenizer config file saved in pcatt/hf/examples/greedtok_test1/tokenizer_config.json
special_tokens_map file saved in pcatt/hf/examples/greedtok_test1/special_tokens_map.json
added tokens file saved in pcatt/hf/examples/greedtok_test1/added_tokens.txt


('pcatt/hf/examples/greedtok_test1/tokenizer_config.json',
 'pcatt/hf/examples/greedtok_test1/special_tokens_map.json',
 'pcatt/hf/examples/greedtok_test1/added_tokens.txt')

In [11]:
from pcatt.hf.greedtok import GreedTok
# Load the hand-built tokenizer back from disk into a separate object (GT2),
# which the following cells use for the encode/decode checks.
GT2 = GreedTok.from_pretrained("pcatt/hf/examples/greedtok_test1")

Trie constructed
pad_token <pad> 7
eos_token <eos> 8


In [12]:
# Basic decoding: small ids index into the ranked token list
# (0 -> 'aa', 1 -> 'bb', 3 -> 'bc', 4 -> '12', 7 -> '<pad>').
print(GT2.batch_decode([[0,1], [1,2], [3,4,7]]))
# With skip_special_tokens=True the <pad>/<eos> ids (7, 8) are dropped from the
# text. NOTE(review): ids 200 and 222 are outside the ranked list and show up
# as escaped bytes in the output -- presumably a byte fallback; confirm.
print(GT2.batch_decode([[3,4,7,8], [200,1,222]], skip_special_tokens=True))

['aabb', 'bbabc', 'bc12<pad>']
['bc12', '\\xbfbb\\xd5']


In [13]:
# Testing __call__ on raw, unsplit strings; the returned encoding is displayed
# as the cell output.
GT2(["aabc", "aa1234", "abv"], is_split_into_words=False)

{'input_ids': [[0, 3], [0, 5, 61], [106, 107, 127]]}

In [14]:
# Testing __call__ on input that is already split into words; the displayed
# ids match the ones shown for the corresponding unsplit strings.
GT2([["aa","bc"], ["aa", "123", "4"], ["ab","v"]], is_split_into_words=True)

{'input_ids': [[0, 3], [0, 5, 61], [106, 107, 127]]}

In [15]:
# Testing __call__ with padding disabled and no truncation requested.
# Two of the inputs embed the special-token strings directly in the text.
unpadded_texts = ["aabc", "<pad>aa1234<eos>", "abv<pad>"]
outputs = GT2(
    unpadded_texts,
    is_split_into_words=False,
    padding=False,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    max_length=10,
)

# Show each returned field, one line per sequence, prefixed by its length.
for field, rows in outputs.items():
    print(field)
    for row in rows:
        print('\t', len(row), ': ', row)

special_tokens_mask
	 2 :  [0, 0]
	 5 :  [1, 0, 0, 0, 1]
	 4 :  [0, 0, 0, 1]
input_ids
	 2 :  [0, 3]
	 5 :  [7, 0, 5, 61, 8]
	 4 :  [106, 107, 127, 7]


In [16]:
# Testing __call__ with padding to max_length and "longest_first" truncation;
# tokens cut off by truncation are requested back via return_overflowing_tokens.
padded_texts = [
    "aabc",
    "aa1234",
    "abv<pad>",
    "abv<pad>abv<pad>abv<pad>aa1234",
]
outputs = GT2(
    padded_texts,
    is_split_into_words=False,
    padding="max_length",
    truncation="longest_first",
    return_overflowing_tokens=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    max_length=10,
)

# Show each returned field, one line per sequence, prefixed by its length.
for field, rows in outputs.items():
    print(field)
    for row in rows:
        print('\t', len(row), ': ', row)

special_tokens_mask
	 10 :  [0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
	 10 :  [0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
	 10 :  [0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
	 10 :  [0, 0, 0, 1, 0, 0, 0, 1, 0, 0]
overflowing_tokens
	 0 :  []
	 0 :  []
	 0 :  []
	 5 :  [127, 7, 0, 5, 61]
attention_mask
	 10 :  [1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
	 10 :  [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
	 10 :  [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
	 10 :  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
input_ids
	 10 :  [0, 3, 7, 7, 7, 7, 7, 7, 7, 7]
	 10 :  [0, 5, 61, 7, 7, 7, 7, 7, 7, 7]
	 10 :  [106, 107, 127, 7, 7, 7, 7, 7, 7, 7]
	 10 :  [106, 107, 127, 7, 106, 107, 127, 7, 106, 107]


In [17]:
# Testing text pairs: element i of the first list is encoded together with
# element i of the second list.
pair_first = ["aabc", "aa1234"]
pair_second = ["abv<pad>", "abv<pad>abv<pad>abv<pad>aa1234"]
outputs = GT2(pair_first, pair_second)

# Show each returned field, one line per pair, prefixed by its length.
for field, rows in outputs.items():
    print(field)
    for row in rows:
        print('\t', len(row), ': ', row)

input_ids
	 6 :  [0, 3, 106, 107, 127, 7]
	 18 :  [0, 5, 61, 106, 107, 127, 7, 106, 107, 127, 7, 106, 107, 127, 7, 0, 5, 61]


In [18]:
# Testing text pairs where both sides are already split into words.
presplit_first = [["aa","bc"], ["aa","1234"]]
presplit_second = [["abv", "<pad>"], ["abv<pad>abv<pad>","abv<pad>","aa1234"]]
outputs = GT2(presplit_first, presplit_second, is_split_into_words=True)

# Show each returned field, one line per pair, prefixed by its length.
for field, rows in outputs.items():
    print(field)
    for row in rows:
        print('\t', len(row), ': ', row)

input_ids
	 6 :  [0, 3, 106, 107, 127, 7]
	 18 :  [0, 5, 61, 106, 107, 127, 7, 106, 107, 127, 7, 106, 107, 127, 7, 0, 5, 61]


In [19]:
# Pre-split text pairs with padding to max_length, truncation applied to the
# second sequence only, and all optional masks / token type ids requested.
truncated_first = [["aa","bc"], ["aa","1234"]]
truncated_second = [["abv", "<pad>"], ["abv<pad>abv<pad>","abv<pad>","aa1234"]]
outputs = GT2(
    truncated_first,
    truncated_second,
    is_split_into_words=True,
    padding="max_length",
    truncation="only_second",
    return_token_type_ids=True,
    return_overflowing_tokens=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    max_length=10,
)

# Show each returned field, one line per pair, prefixed by its length.
for field, rows in outputs.items():
    print(field)
    for row in rows:
        print('\t', len(row), ': ', row)

special_tokens_mask
	 10 :  [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
	 10 :  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
token_type_ids
	 10 :  [0, 0, 1, 1, 1, 1, 7, 7, 7, 7]
	 10 :  [0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
attention_mask
	 10 :  [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
	 10 :  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
input_ids
	 10 :  [0, 3, 106, 107, 127, 7, 7, 7, 7, 7]
	 10 :  [0, 5, 61, 106, 107, 127, 7, 106, 107, 127]


In [22]:
from pcatt.pco_tokenizer import build as build_pco

In [23]:
# Load the word list (space-separated) and the counts (one integer per line),
# closing each file as soon as it has been read.
with open('cpp_inputs/words/un.txt') as words_file:
    words = words_file.read().strip().split(" ")
with open('cpp_inputs/counts/un.txt') as counts_file:
    counts = [int(t.strip()) for t in counts_file.read().strip().split('\n')]
# Pair the i-th word with the i-th count.
un_counts = dict(zip(words, counts))

In [24]:
from pcatt.hf.greedtok import GreedTok

# We can use train_new_from_counts instead to get the same result as:
#
#     test = build_pco(un_counts)
#     test.initialize_graph(5, 1)
#     test_tokens, test_scores = test.solve_to_step(100)
#
greedtok = GreedTok().train_new_from_counts(
    un_counts,
    100,
    max_token_length=5,
    min_word_count=1,
)

Trie constructed
Word counts size: 105505
Token set size: 0
Empty token set size selected -> all possible substrings with...
Max token size: 5
Min. word count: 1
Final candidate token set size: 81136
Initial setup phase: 582 ms
Starting main routine...
1. |Ġ [c4 a0 ] | 30035114 | 42 ms | 243 ms | shortlist: 75764
2. |Ġth [c4 a0 74 68 ] | 7109102 | 24 ms | 28 ms | shortlist: 1864
3. |tion [74 69 6f 6e ] | 4043268 | 16 ms | 31 ms | shortlist: 7700
4. |Ġof [c4 a0 6f 66 ] | 3300812 | 13 ms | 13 ms | shortlist: 371
5. |Ġa [c4 a0 61 ] | 3259093 | 9 ms | 17 ms | shortlist: 7092
6. |in [69 6e ] | 2782359 | 15 ms | 65 ms | shortlist: 21307
7. |re [72 65 ] | 2384688 | 14 ms | 41 ms | shortlist: 13589
8. |Ġto [c4 a0 74 6f ] | 2228162 | 13 ms | 14 ms | shortlist: 1091
9. |er [65 72 ] | 1910725 | 9 ms | 39 ms | shortlist: 16660
10. |en [65 6e ] | 1831877 | 13 ms | 38 ms | shortlist: 13572
11. |Ġco [c4 a0 63 6f ] | 1782132 | 13 ms | 19 ms | shortlist: 4574
12. |it [69 74 ] | 1622191 | 11 ms | 26 ms 

In [25]:
# The saved tokenizer can also be loaded through transformers' AutoTokenizer.
# NOTE(review): `import pcatt.hf` is not referenced by name below, so it is
# presumably imported for a side effect such as registering GreedTok with
# AutoTokenizer -- confirm before removing it.
import pcatt.hf
from transformers import AutoTokenizer
AutoTokenizer.from_pretrained("pcatt/hf/examples/greedtok_test2")

Trie constructed
unk_token <unk> 1
pad_token <pad> 0
eos_token <eos> 2
