In [1]:
import os
from pcatt import greedy_builder
from tqdm import tqdm
import regex
import re
import time

def process_wiki_xml(f):
    """Collect document bodies from a tag-delimited text stream.

    Lines are accumulated until a line starting with ``"<"`` is seen. At that
    point the accumulated lines, minus the first one, are joined with single
    spaces; the result is kept only if splitting it on ``" "`` yields at
    least 25 pieces. Blank lines (after stripping) are ignored.

    Args:
        f: Any iterable of text lines (e.g. an open text file).

    Returns:
        list[str]: One joined string per retained document, in input order.

    Note:
        Text that follows the last ``"<"`` line is never flushed, so an
        unterminated trailing document is dropped.
    """
    containers = []
    pending_lines = []
    for line in f:
        # Tag check happens on the raw line, before stripping.
        if line.startswith("<"):
            # Drop the first accumulated line of the document.
            # NOTE(review): presumably the article title - confirm against the data.
            text = " ".join(pending_lines[1:])
            if len(text.split(" ")) >= 25:
                containers.append(text)
            pending_lines = []
            continue
        line = line.strip()
        if line:
            pending_lines.append(line)
    return containers

def tokenize_file(f, tokens, pat_str):
    """Tokenize every document extracted from the file at path ``f``.

    Args:
        f: Path of the text file to read; it is parsed with ``process_wiki_xml``.
        tokens: Vocabulary handed to ``greedy_builder.build``.
        pat_str: Pattern passed to ``regex.findall`` to pre-split each document.

    Returns:
        Whatever ``batch_tokenize`` returns for the list of pre-split documents.
    """
    # NOTE(review): other code in this notebook builds the tokenizer with
    # `greedy_builder.build_greedy_tokenizer` and replaces ' ' with 'Ġ' before
    # tokenizing; confirm that `build` exists and that skipping the
    # substitution here is intended.
    gb = greedy_builder.build(tokens)
    # Read eagerly inside `with` so the file handle is closed before tokenizing.
    with open(f, encoding="utf-8") as wiki_file:
        docs = process_wiki_xml(wiki_file)
    return gb.batch_tokenize([regex.findall(pat_str, doc) for doc in docs])

def read_cpp_res(domain):
    """Load the token list and merge counts stored under ``cpp_outputs/<domain>/``.

    Args:
        domain: Sub-directory name under ``cpp_outputs``.

    Returns:
        tuple: ``(tokens, merges_per_turn, totals)`` where
            * ``tokens`` is a list of ``bytes``, one per line of ``tokens.txt``,
              each decoded from its hexadecimal text form;
            * ``merges_per_turn`` is a list of ``int``, one per line of
              ``merges.txt``;
            * ``totals`` is the running (cumulative) sum of ``merges_per_turn``.
    """
    # Context managers make sure both files are closed after reading.
    with open(f'cpp_outputs/{domain}/tokens.txt', 'r') as tokens_file:
        tokens = [bytes.fromhex(t.strip()) for t in tokens_file.read().strip().split('\n')]
    with open(f'cpp_outputs/{domain}/merges.txt', 'r') as merges_file:
        merges_per_turn = [int(x) for x in merges_file.read().strip().split('\n')]

    total = 0
    totals = []
    for m in merges_per_turn:
        total += m
        totals.append(total)
    return tokens, merges_per_turn, totals

In [2]:
# Load the 'wiki' vocabulary and build a greedy tokenizer over it.
# Only the token list is needed; indexing avoids assigning to `_` / `__`,
# which IPython uses for its output history.
tokens = read_cpp_res('wiki')[0]
gb = greedy_builder.build_greedy_tokenizer(tokens)

# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
with open("/data/jiapeng/wiki/cleaned/AA/wiki_00", encoding="utf-8") as wiki_file:
    orig = process_wiki_xml(wiki_file)
print("Number of texts:", len(orig))

pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
# Pre-split each document with the pattern, then mark spaces with 'Ġ'.
processed = [[x.replace(' ', 'Ġ') for x in regex.findall(pat_str, doc)] for doc in orig]

start = time.time()
tokenized = gb.batch_tokenize(processed)
end = time.time() - start  # elapsed seconds, not an end timestamp
print(sum(len(x) for x in processed)/end, "words/second")
# Prepend the 256 single-byte tokens so that `tokens` can be indexed by the ids
# in `tokenized` (the lookups in later cells rely on this offset).
tokens = [bytes([i]) for i in range(256)] + tokens

Number of texts: 451
1190748.4580214329 words/second


In [3]:
# Show the first and the last 500 characters of the first extracted document.
print(orig[0][:500]) # original text, head
print(orig[0][-500:]) # original text, tail

Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions they claim maintain unnecessary coercion and hierarchy, typically including, though not necessarily limited to, the state and capitalism. Anarchism advocates for the replacement of the state with stateless societies or other forms of free associations. As a historically left-wing movement, usually placed on the farthest left of the political spectrum, it is
ideas. The Marxist criticism of anarchism is that it has a utopian character because all individuals should have anarchist views and values. According to the Marxist view, that a social idea would follow directly from this human ideal and out of the free will of every individual formed its essence. Marxists state that this contradiction was responsible for their inability to act. In the anarchist vision, the conflict between liberty and equality was resolved through coexistence and intertwining

In [4]:
# First 100 token ids produced for the first document.
print(tokenized[0][0:100]) # encoding of the original text, truncated to 100 tokens

[3138, 2127, 983, 335, 258, 1138, 6680, 263, 2943, 384, 335, 1785, 2878, 484, 261, 629, 1561, 6514, 324, 5266, 263, 2614, 716, 290, 597, 308, 427, 257, 5538, 689, 1140, 4241, 9659, 774, 3879, 289, 263, 286, 880, 8813, 44, 4452, 934, 44, 2286, 514, 6639, 3441, 290, 44, 257, 1176, 263, 2787, 983, 46, 633, 2127, 983, 6367, 628, 324, 257, 7340, 261, 257, 1176, 301, 1697, 6450, 7422, 423, 682, 3656, 261, 2382, 1956, 598, 46, 663, 258, 10230, 1411, 45, 6741, 2943, 44, 2753, 1679, 326, 257, 279, 2351, 371, 1411, 261, 257, 1138, 5516, 6637]


In [5]:
# Byte form of the first 10 token ids of the first document.
# b'\xc4\xa0' is the UTF-8 encoding of 'Ġ', the character substituted for spaces
# before tokenizing.
[tokens[t] for t in tokenized[0][0:10]]

[b'An',
 b'arch',
 b'ism',
 b'\xc4\xa0is',
 b'\xc4\xa0a',
 b'\xc4\xa0political',
 b'\xc4\xa0philosophy',
 b'\xc4\xa0and',
 b'\xc4\xa0movement',
 b'\xc4\xa0that']

In [6]:
# Round-trip check: map token ids back to bytes, concatenate, decode as UTF-8,
# and turn the 'Ġ' markers back into spaces. A plain str.replace is enough
# because 'Ġ' is a literal, not a pattern.
decoded_example = b''.join(tokens[c] for c in tokenized[0]).decode('utf-8').replace('Ġ', ' ')
print(decoded_example[:500]) # decoded text, head
print(decoded_example[-500:]) # decoded text, tail

Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions they claim maintain unnecessary coercion and hierarchy, typically including, though not necessarily limited to, the state and capitalism. Anarchism advocates for the replacement of the state with stateless societies or other forms of free associations. As a historically left-wing movement, usually placed on the farthest left of the political spectrum, it is
ideas. The Marxist criticism of anarchism is that it has a utopian character because all individuals should have anarchist views and values. According to the Marxist view, that a social idea would follow directly from this human ideal and out of the free will of every individual formed its essence. Marxists state that this contradiction was responsible for their inability to act. In the anarchist vision, the conflict between liberty and equality was resolved through coexistence and intertwining

In [7]:
def get_cpp_counts(domain):
    """Load a word -> count mapping for ``domain`` from the ``cpp_inputs`` files.

    Words are read from ``cpp_inputs/words/<domain>.txt`` (separated by single
    spaces) and counts from ``cpp_inputs/counts/<domain>.txt`` (one integer per
    line). The two sequences are paired positionally.

    Args:
        domain: Base name (without extension) of the two input files.

    Returns:
        dict[str, int]: Count for each word. If a word occurs more than once
        the last count wins; if the two files differ in length, the extra
        entries of the longer one are ignored (``zip`` semantics).
    """
    # The words file is read as UTF-8 explicitly because words may contain the
    # non-ASCII 'Ġ' marker; relying on the platform default can mis-decode it.
    with open(f'cpp_inputs/words/{domain}.txt', 'r', encoding='utf-8') as words_file:
        words = words_file.read().strip().split(' ')
    with open(f'cpp_inputs/counts/{domain}.txt', 'r') as counts_file:
        counts = [int(t) for t in counts_file.read().strip().split('\n')]
    return dict(zip(words, counts))

# Word -> count mapping for the 'un' domain; fed to the PCO tokenizer builder below.
un_counts = get_cpp_counts('un')

In [8]:
# Python wrapper for the PCO greedy algorithm.
# Leave token_candidates empty to consider all possible substrings as candidates;
# otherwise fill it with the exact token candidates to choose from.
token_candidates = set()
greedy_tokenizer = greedy_builder.build_greedy_pco_tokenizer(un_counts, token_candidates)
# Initialize the graph; this must run before any solve step below.
greedy_tokenizer.initialize_graph()

Word counts size: 105505
Token set size: 0
Empty token set size selected -> all possible substrings...
Final token set size: 884708
Initial setup phase: 1941 ms


In [9]:
# Solve for the first 50 steps.
# Note: this rebinds `tokens`, which earlier held the byte-level wiki vocabulary list.
tokens, scores = greedy_tokenizer.solve_to_step(50)

Starting main routine...
1. |Ġ [c4 a0 ] | 30035114 | 117 ms | 638 ms | shortlist: 858979
2. |Ġthe [c4 a0 74 68 65 ] | 8286735 | 166 ms | 213 ms | shortlist: 3378
3. |tion [74 69 6f 6e ] | 4043268 | 39 ms | 81 ms | shortlist: 88654
4. |Ġof [c4 a0 6f 66 ] | 3300812 | 45 ms | 47 ms | shortlist: 1045
5. |Ġand [c4 a0 61 6e 64 ] | 3262209 | 35 ms | 35 ms | shortlist: 717
6. |in [69 6e ] | 2782359 | 36 ms | 147 ms | shortlist: 228318
7. |re [72 65 ] | 2384688 | 51 ms | 112 ms | shortlist: 123799
8. |Ġt [c4 a0 74 ] | 2299618 | 47 ms | 61 ms | shortlist: 28821
9. |Ġa [c4 a0 61 ] | 2171690 | 43 ms | 62 ms | shortlist: 47226
10. |er [65 72 ] | 1910147 | 41 ms | 112 ms | shortlist: 149877
11. |en [65 6e ] | 1824071 | 45 ms | 103 ms | shortlist: 121023
12. |Ġco [c4 a0 63 6f ] | 1782132 | 48 ms | 67 ms | shortlist: 37757
13. |it [69 74 ] | 1622191 | 40 ms | 78 ms | shortlist: 79570
14. |Ġw [c4 a0 77 ] | 1404713 | 43 ms | 51 ms | shortlist: 13165
15. |es [65 73 ] | 1365110 | 41 ms | 107 ms | shortlis

In [10]:
# Number of tokens selected so far.
print(len(tokens))

50


In [11]:
# Add some manually chosen tokens in between solver runs.
# Useful for warm starts; note that the whole cache has to be recalculated afterwards.
tokens, scores = greedy_tokenizer.custom_steps(["scar", "edy"])

50. |scar [73 63 61 72 ] | 1602
51. |edy [65 64 79 ] | 5645


In [12]:
# Token count after appending the two manual tokens.
print(len(tokens))

52


In [13]:
# Continue solving until k=60 tokens in total (the manual tokens count towards k).
tokens, scores = greedy_tokenizer.solve_to_step(60) # resumes from the current token set

Starting main routine...
53. |Ġe [c4 a0 65 ] | 533333 | 77 ms | 97 ms | shortlist: 26039
54. |il [69 6c ] | 507782 | 39 ms | 59 ms | shortlist: 50519
55. |Ġc [c4 a0 63 ] | 483397 | 40 ms | 52 ms | shortlist: 30385
56. |Ġb [c4 a0 62 ] | 468378 | 38 ms | 48 ms | shortlist: 26087
57. |ly [6c 79 ] | 466512 | 38 ms | 67 ms | shortlist: 85549
58. |th [74 68 ] | 457339 | 43 ms | 55 ms | shortlist: 25726
59. |as [61 73 ] | 457261 | 37 ms | 53 ms | shortlist: 35662
60. |ec [65 63 ] | 453637 | 39 ms | 52 ms | shortlist: 27366
Total time taken: 0 seconds


In [14]:
# Final token count, then the tokens themselves in selection order.
print(len(tokens))
print(tokens)

60
['Ġ', 'Ġthe', 'tion', 'Ġof', 'Ġand', 'in', 're', 'Ġt', 'Ġa', 'er', 'en', 'Ġco', 'it', 'Ġw', 'es', 'Ġs', 'or', 'at', 'is', 'al', 'Ġp', 'on', 'an', 'Ġin', 'ed', 'Ġto', 'Ġf', 'Ġbe', 'ation', 'ic', 'ou', 'ar', 'ment', 'Ġthat', 'ing', 'Ġdevelop', 'Ġm', 'le', 'Ġh', 'Ġre', 'ĠUnited', 'Ġd', 'Ġcountr', 'st', 'Ġinternational', 'ro', 'ce', 've', 'Ġn', 'Ġwhich', 'scar', 'edy', 'Ġe', 'il', 'Ġc', 'Ġb', 'ly', 'th', 'as', 'ec']
