In [2]:
import os
import greedy_builder
from tqdm import tqdm
import regex
import re

def process_wiki_xml(f, min_words=25):
    """Split an iterable of text lines into documents delimited by tag lines.

    Any line that starts with ``<`` is treated as a document boundary and is
    itself discarded. Within a document, blank lines are skipped, the first
    non-blank line is dropped (presumably the article title -- confirm
    against the extractor's output format), and the remaining lines are
    stripped and joined with single spaces.

    Args:
        f: Iterable of ``str`` lines, e.g. an open text file.
        min_words: Minimum number of space-separated pieces a document body
            must contain to be kept. Defaults to 25.

    Returns:
        list[str]: The bodies of all documents that are long enough, in the
        order they appear in ``f``.
    """
    documents = []
    pending_lines = []

    def flush():
        # Skip the first collected line; only the rest forms the body.
        body = " ".join(pending_lines[1:])
        # Splitting on a single space mirrors how the body was joined.
        if body and len(body.split(" ")) >= min_words:
            documents.append(body)
        pending_lines.clear()

    for line in f:
        # The boundary test runs on the raw line, before stripping.
        if line.startswith("<"):
            flush()
            continue
        line = line.strip()
        if line:
            pending_lines.append(line)

    # Emit a trailing document that is not followed by a closing tag line;
    # without this it would be dropped silently.
    flush()
    return documents

def tokenize_file(f, tokens, pat_str):
    """Tokenize every document of one extracted wiki file.

    Args:
        f: Path of the file to read; it is parsed with ``process_wiki_xml``.
        tokens: Vocabulary passed to ``greedy_builder.build``.
        pat_str: ``regex`` pattern used to pre-split each document into
            pieces before tokenization.

    Returns:
        Whatever ``batch_tokenize`` returns for the list of pre-split
        documents (one entry per document).
    """
    gb = greedy_builder.build(tokens)
    # process_wiki_xml consumes the whole file before returning, so the
    # handle can be closed before tokenization starts.
    with open(f) as wiki_file:
        docs = process_wiki_xml(wiki_file)
    # NOTE(review): unlike the notebook cell that tokenizes wiki_00, this does
    # not map ' ' to 'Ġ' in the pieces -- confirm which form the vocabulary
    # expects.
    return gb.batch_tokenize([regex.findall(pat_str, doc) for doc in docs])

def read_cpp_res(domain):
    """Load the token and merge files stored under ``cpp_outputs/<domain>/``.

    ``tokens.txt`` is read as one hex-encoded token per line and
    ``merges.txt`` as one integer per line. Paths are relative to the
    current working directory.

    Args:
        domain: Name of the sub-directory of ``cpp_outputs`` to read from.

    Returns:
        tuple: ``(tokens, merges_per_turn, totals)`` where ``tokens`` is a
        list of ``bytes``, ``merges_per_turn`` is a list of ``int`` and
        ``totals`` is the running sum of ``merges_per_turn``.

    Raises:
        FileNotFoundError: If either file is missing.
        ValueError: If a line is not valid hex / not a valid integer.
    """
    # Context managers close the files deterministically; the bare
    # open(...).read() form leaves that to the garbage collector.
    with open(f'cpp_outputs/{domain}/tokens.txt', 'r') as tokens_file:
        tokens = [bytes.fromhex(t.strip()) for t in tokens_file.read().strip().split('\n')]
    with open(f'cpp_outputs/{domain}/merges.txt', 'r') as merges_file:
        merges_per_turn = [int(x) for x in merges_file.read().strip().split('\n')]

    # Cumulative number of merges after each turn.
    total = 0
    totals = []
    for m in merges_per_turn:
        total += m
        totals.append(total)
    return tokens, merges_per_turn, totals

In [4]:
# Load the learned vocabulary and build the greedy tokenizer from it.
# Only the token list is needed here; indexing the result avoids assigning to
# `_` / `__`, which IPython uses for its output history.
tokens = read_cpp_res('wiki')[0]
gb = greedy_builder.build(tokens)

# Read the documents of one extracted wiki file; `with` closes the handle.
with open("/data/jiapeng/wiki/cleaned/AA/wiki_00") as wiki_file:
    orig = process_wiki_xml(wiki_file)
print("Number of texts:", len(orig))
pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
# Pre-split each document with the pattern and mark spaces as 'Ġ' before tokenizing.
tokenized = gb.batch_tokenize([[x.replace(' ', 'Ġ') for x in regex.findall(pat_str, doc)] for doc in orig])

# Prepend the 256 single-byte tokens so the list can be indexed by the ids in `tokenized`.
tokens = [bytes([i]) for i in range(256)] + tokens
for i, t in enumerate(tokenized):
    # index, max token id, and whether decoding the ids reproduces the original text
    print(i, max(t), b''.join([tokens[c] for c in t]).decode('utf-8').replace('Ġ', ' ')==orig[i])

Number of texts: 451
0 10230 True
1 10248 True
2 10254 True
3 10244 True
4 10251 True
5 10253 True
6 10254 True
7 10232 True
8 9883 True
9 10239 True
10 10104 True
11 10249 True
12 9995 True
13 10242 True
14 10252 True
15 10182 True
16 10072 True
17 10253 True
18 10236 True
19 10255 True
20 9654 True
21 10233 True
22 10203 True
23 10240 True
24 10239 True
25 10252 True
26 10253 True
27 10251 True
28 10254 True
29 10255 True
30 10247 True
31 10232 True
32 10218 True
33 10253 True
34 10255 True
35 10163 True
36 10238 True
37 10213 True
38 10253 True
39 10224 True
40 10203 True
41 9853 True
42 10165 True
43 10221 True
44 10211 True
45 10248 True
46 10223 True
47 10221 True
48 10247 True
49 10205 True
50 9886 True
51 10253 True
52 10252 True
53 10243 True
54 10206 True
55 10248 True
56 10251 True
57 10178 True
58 10247 True
59 10128 True
60 10248 True
61 10254 True
62 10039 True
63 10251 True
64 10178 True
65 10150 True
66 10228 True
67 10230 True
68 10203 True
69 10252 True
70 9819 True
7

In [5]:
print(orig[0])  # original text of the first document returned by process_wiki_xml

Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions they claim maintain unnecessary coercion and hierarchy, typically including, though not necessarily limited to, the state and capitalism. Anarchism advocates for the replacement of the state with stateless societies or other forms of free associations. As a historically left-wing movement, usually placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement. Humans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latter half of the 19th a

In [6]:
print(tokenized[0])  # token ids produced by batch_tokenize for the first document

[3138, 2127, 983, 335, 258, 1138, 6680, 263, 2943, 384, 335, 1785, 2878, 484, 261, 629, 1561, 6514, 324, 5266, 263, 2614, 716, 290, 597, 308, 427, 257, 5538, 689, 1140, 4241, 9659, 774, 3879, 289, 263, 286, 880, 8813, 44, 4452, 934, 44, 2286, 514, 6639, 3441, 290, 44, 257, 1176, 263, 2787, 983, 46, 633, 2127, 983, 6367, 628, 324, 257, 7340, 261, 257, 1176, 301, 1697, 6450, 7422, 423, 682, 3656, 261, 2382, 1956, 598, 46, 663, 258, 10230, 1411, 45, 6741, 2943, 44, 2753, 1679, 326, 257, 279, 2351, 371, 1411, 261, 257, 1138, 5516, 6637, 44, 402, 335, 2753, 1884, 3945, 1989, 276, 983, 263, 309, 105, 1474, 2100, 677, 120, 983, 343, 257, 309, 105, 1474, 2100, 4520, 355, 4744, 1474, 2100, 1244, 983, 41, 261, 257, 1244, 361, 2943, 46, 325, 383, 752, 3154, 277, 7422, 1994, 5816, 286, 880, 2127, 411, 1125, 660, 257, 5808, 261, 5816, 2685, 44, 2090, 5461, 44, 423, 1038, 3402, 270, 46, 1842, 257, 5906, 261, 8606, 286, 880, 2127, 484, 3830, 44, 269, 1010, 287, 983, 6133, 5266, 359, 7408, 46, 2480, 7

In [7]:
# Decode the first document: look up each id's bytes, concatenate, decode as
# UTF-8 and turn the 'Ġ' markers back into spaces.
print(
    b''.join(tokens[token_id] for token_id in tokenized[0])
    .decode('utf-8')
    .replace('Ġ', ' ')
)

Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions they claim maintain unnecessary coercion and hierarchy, typically including, though not necessarily limited to, the state and capitalism. Anarchism advocates for the replacement of the state with stateless societies or other forms of free associations. As a historically left-wing movement, usually placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement. Humans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latter half of the 19th a