In [1]:
import cython

In [8]:
import datrie

In [8]:
def convert_line_old(line, lineno=None):
    """Turn every ':' + four-space separator in ``line`` into a tab.

    Args:
        line: One text record, e.g. ``'2615:    AccessibleComputing'``.
        lineno: Accepted but not used.

    Returns:
        ``line`` with each occurrence of the separator replaced by a tab.
    """
    separator = ':    '
    return line.replace(separator, '\t')

In [20]:
def convert_line(line, lineno=None):
    """Convert one ``b'<offset>:    <title>'`` record to tab-separated form.

    The line is split at its first colon.  The four spaces of padding that
    follow the colon are dropped; everything after them, including any
    trailing newline, is kept unchanged.  If the padding is absent the text
    after the colon is kept as-is instead of losing its first characters.

    Args:
        line: The raw record as ``bytes``.
        lineno: Optional line number, used only in the error message.  It
            is accepted so the function takes the same arguments as
            ``convert_line_old``.

    Returns:
        ``bytes`` holding the offset and the title joined by a single tab.

    Raises:
        AssertionError: If ``line`` contains no colon.
    """
    offset, colon, rest = line.partition(b':')
    if not colon:
        # Raised explicitly (not via ``assert``) so the check is still
        # performed when Python runs with -O; the exception type is unchanged.
        where = '' if lineno is None else ' (line %s)' % (lineno,)
        raise AssertionError('no colon separator in %r%s' % (line, where))

    padding = b'    '
    if rest.startswith(padding):
        rest = rest[len(padding):]
    return b'\t'.join((offset, rest))

In [21]:
def convert_file(path):
    """Convert a ``.txt`` file of ``offset:    title`` lines to a ``.tsv`` file.

    Each line of ``path`` is passed to ``convert_line`` and the result is
    written to the file named like ``path`` with ``'.txt'`` replaced by
    ``'.tsv'``.  Lines that ``convert_line`` rejects are printed together
    with their zero-based line number and skipped.

    Args:
        path: Path of the source file; it must contain ``'.txt'``.

    Raises:
        ValueError: If ``path`` contains no ``'.txt'``, because the output
            path would then be the input path itself.
    """
    target = path.replace('.txt', '.tsv')
    if target == path:
        # Opening the same path with 'wb' would truncate the source file
        # before a single line had been read.
        raise ValueError("path must contain '.txt': %r" % (path,))

    with open(path, 'rb') as src, open(target, 'wb') as dst:
        for (srclineno, srcline) in enumerate(src):
            try:
                # Only the line is passed; that is all convert_line needs.
                converted = convert_line(srcline)
            except Exception:
                # Best effort: report the offending line and carry on.
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                print("[%d]: %s" % (srclineno, srcline))
                continue
            # Both files are opened in binary mode, so the suffix must be
            # bytes; write errors are not line errors and propagate.
            # NOTE(review): srcline keeps its own line ending, so the output
            # record ends with that ending followed by '\r' -- confirm this
            # is the intended terminator.
            dst.write(converted + b'\r')
                

In [22]:
line = b'2615:    AccessibleComputing'

In [23]:
convert_line(line)

b'2615\tAccessibleComputing'

In [7]:
import string
import datrie

In [8]:
string.ascii_letters

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [9]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [10]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
# string.printable already contains every character of string.punctuation
# (plus the digits, ASCII letters and whitespace), so concatenating the two
# only repeated the punctuation characters.  Each character is now listed once.
allowed = string.printable

In [39]:
# Word -> sorted list of offsets; filled in by add_to_trie (via add_word).
# `allowed` is passed as the trie's alphabet argument.
words = datrie.Trie(allowed)
# Offset string -> sorted list of words; its alphabet argument is the ASCII digits.
offset_to_words = datrie.Trie(string.digits)

In [40]:
def convert_word(line):
    """Split one ``'<offset><TAB><word>'`` line into ``(word, offset)``.

    The line is split at its first tab.  A single trailing newline, if
    present, is removed from the word; nothing else is stripped.

    Args:
        line: One text line, with or without its trailing newline.

    Returns:
        A ``(word, offset)`` tuple of strings; the offset is not converted
        to a number here.

    Raises:
        AssertionError: If ``line`` contains no tab.
    """
    offset, tab, word = line.partition('\t')
    if not tab:
        # Raised explicitly (not via ``assert``) so the check is still
        # performed when Python runs with -O; the exception type is unchanged.
        raise AssertionError('no tab separator in %r' % (line,))
    if word.endswith('\n'):
        word = word[:-1]
    return (word, offset)

In [41]:
def _add_sorted_unique(mapping, key, item):
    """Record ``item`` in the sorted list stored at ``mapping[key]``.

    The list is created when ``key`` is new; an item already present is not
    added a second time.
    """
    if key in mapping:
        bucket = mapping[key]
        if item not in bucket:
            bucket.append(item)
            bucket.sort()
    else:
        mapping[key] = [item]


def add_to_trie(trie, offset_trie, key, value):
    """Index a word under its offset and the offset under its word.

    Args:
        trie: Mapping from word to a sorted list of integer offsets.
        offset_trie: Mapping from offset (as given in ``value``) to a sorted
            list of words.
        key: The word.
        value: The offset; anything ``int()`` accepts.  It is used unchanged
            as the key of ``offset_trie`` and as an ``int`` inside ``trie``.

    Raises:
        ValueError: If ``value`` cannot be converted by ``int()``.
    """
    offset = int(value)

    # Membership is tested with the integer offset: the lists in ``trie``
    # hold ints, so testing the raw ``value`` string never matched and the
    # same offset was appended again on every repeat.
    _add_sorted_unique(trie, key, offset)
    _add_sorted_unique(offset_trie, value, key)

    # Also make the word reachable under its lowercase spelling.  Nothing to
    # do when the key is already lowercase.
    lowered = key.lower()
    if lowered == key:
        return

    # NOTE(review): an existing lowercase entry is left untouched, so it only
    # carries the offset of whichever spelling created it -- confirm that
    # offsets of later case variants are meant to be dropped here.
    if lowered not in trie:
        trie[lowered] = [offset]

In [42]:
def add_word(line):
    """Parse one tab-separated line and index it in the module-level tries.

    The line is split by ``convert_word`` and the resulting word/offset pair
    is stored in ``words`` and ``offset_to_words`` through ``add_to_trie``.
    """
    parsed_word, parsed_offset = convert_word(line)
    add_to_trie(words, offset_to_words, parsed_word, parsed_offset)

In [43]:
# (exception, line) pairs for the input lines that add_word could not process.
failed = []

In [44]:
# Feed every line of words.tsv to add_word.  A line that raises does not
# abort the load: the exception and the line are recorded in `failed`.
with open('words.tsv', 'r') as f:
    for line in f:
        try:
            add_word(line)
        except Exception as e:
            failed.append((e, line))

In [45]:
words.save('words.trie')

In [46]:
offset_to_words.save('words_by_offset.trie')

In [47]:
%timeit datrie.Trie.load('words.trie')

1 loops, best of 3: 1.95 s per loop


In [48]:
words.keys()[:10]

['A',
 'Aani',
 'Aaron',
 'Aaronic',
 'Aaronical',
 'Aaronite',
 'Aaronitic',
 'Aaru',
 'Ab',
 'Ababdeh']

In [35]:
%timeit words.prefixes('aaron')

The slowest run took 18.40 times longer than the fastest. This could mean that an intermediate result is being cached 
1000000 loops, best of 3: 195 ns per loop


In [44]:
%timeit words['aaron']

The slowest run took 21.18 times longer than the fastest. This could mean that an intermediate result is being cached 
10000000 loops, best of 3: 109 ns per loop


In [47]:
words.suffixes('Aa')

['ron']

In [49]:
words.items('aa')

[('aaron', [44])]