In [51]:
import math
import functools
import operator
from lieutenant.major import encode, decode
from lieutenant.pronounce import load_cmudict

In [52]:
# Load the pronunciation dictionary that the encode/decode helpers below consume
# (presumably CMUdict, going by the loader's name — confirm).
dictionary = load_cmudict()

In [59]:
# Build the encoding table from the dictionary. Judging by how it is used in the
# cells below, it maps a digit string to the set of words stored under it.
table = encode.generate_encoding_table(dictionary)

In [75]:
%%timeit
dfa = encode.generate_encoding_automaton(table)

384 ms ± 24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [55]:
def prod(it):
    """Return the product of all items in ``it``.

    An empty iterable yields 1 (the multiplicative identity) instead of
    raising ``TypeError``, which a seedless ``functools.reduce`` would do.
    """
    return functools.reduce(operator.mul, it, 1)

In [56]:
def count_encodings(d: str) -> int:
    """Count the word sequences that encode the digit string ``d``.

    For every partition yielded by ``encode.generate_partitions(d, dfa)``,
    multiply the number of table words available for each part (an empty
    part counts as a single choice), then add the products together.
    """
    total = 0
    for partition in encode.generate_partitions(d, dfa):
        choices = (
            1 if part == '' else len(table[part])
            for part in partition
        )
        total += prod(choices)
    return total

In [60]:
s = '31415926535'
# Print the base-10 logarithm of the encoding count for every non-empty prefix of s.
for l in range(1, len(s) + 1):
    prefix = s[:l]
    # math.log(x, 10) is computed as log(x) / log(10).
    print(prefix, math.log(count_encodings(prefix), 10))

3 2.1903316981702914
31 4.846658692656829
314 7.298049652763437
3141 9.954248215790052
31415 12.50290809546097
314159 14.835974475129742
3141592 17.34786937092634
31415926 19.756485916000344
314159265 22.304927822678543
3141592653 24.49604350126202
31415926535 27.045139572562512


In [61]:
# Base-10 logarithm of the encoding count for the full digit string
# (math.log(x, 10) is computed as log(x) / log(10)).
math.log(count_encodings(s), 10)

27.045139572562512

In [62]:
# Materialise every partition that generate_partitions yields for this digit
# string under the automaton, to inspect how the string can be split.
list(encode.generate_partitions('1234567890', dfa))

[('1234', '5678', '90'),
 ('1234', '5678', '9', '0'),
 ('1234', '567', '890'),
 ('1234', '567', '89', '0'),
 ('1234', '567', '8', '90'),
 ('1234', '567', '8', '9', '0'),
 ('1234', '56', '78', '90'),
 ('1234', '56', '78', '9', '0'),
 ('1234', '56', '7', '890'),
 ('1234', '56', '7', '89', '0'),
 ('1234', '56', '7', '8', '90'),
 ('1234', '56', '7', '8', '9', '0'),
 ('1234', '5', '678', '90'),
 ('1234', '5', '678', '9', '0'),
 ('1234', '5', '67', '890'),
 ('1234', '5', '67', '89', '0'),
 ('1234', '5', '67', '8', '90'),
 ('1234', '5', '67', '8', '9', '0'),
 ('1234', '5', '6', '78', '90'),
 ('1234', '5', '6', '78', '9', '0'),
 ('1234', '5', '6', '7', '890'),
 ('1234', '5', '6', '7', '89', '0'),
 ('1234', '5', '6', '7', '8', '90'),
 ('1234', '5', '6', '7', '8', '9', '0'),
 ('123', '4567', '890'),
 ('123', '4567', '89', '0'),
 ('123', '4567', '8', '90'),
 ('123', '4567', '8', '9', '0'),
 ('123', '456', '78', '90'),
 ('123', '456', '78', '9', '0'),
 ('123', '456', '7', '890'),
 ('123', '456', '

In [63]:
# Zero-padded three-digit strings ('000'..'999') that have no entry in the table.
{'%03d' % n for n in range(1000)}.difference(table.keys())

{'063',
 '066',
 '068',
 '069',
 '083',
 '088',
 '099',
 '168',
 '206',
 '229',
 '266',
 '289',
 '333',
 '336',
 '368',
 '388',
 '398',
 '433',
 '455',
 '466',
 '489',
 '555',
 '558',
 '566',
 '568',
 '589',
 '663',
 '668',
 '683',
 '688',
 '689',
 '693',
 '699',
 '768',
 '789',
 '829',
 '833',
 '838',
 '866',
 '868',
 '879',
 '883',
 '887',
 '888',
 '889',
 '893',
 '898',
 '899',
 '933',
 '938',
 '966',
 '988',
 '989'}

In [64]:
# Table entries holding exactly one word, where that word is at most five characters long.
[
    (digits, words)
    for digits, words in table.items()
    if len(words) == 1 and len(next(iter(words))) <= 5
]

[('5970', {'alpex'}),
 ('387', {'amfac'}),
 ('347252', {'aol'}),
 ('1959', {'awb'}),
 ('9549', {'blurb'}),
 ('93195', {'bmw'}),
 ('9703', {'buxom'}),
 ('0018', {'cctv'}),
 ('02282', {'cnnfn'}),
 ('738', {'comfy'}),
 ('008', {'csv'}),
 ('138', {'demov'}),
 ('18195', {'dfw'}),
 ('1819', {'dfw'}),
 ('459', {'eralp'}),
 ('8000', {"fcc's"}),
 ('844284362', {'fyi'}),
 ('6396', {'gmbh'}),
 ('6818', {'hfdf'}),
 ('666', {'hgh'}),
 ('6090', {'hsbc'}),
 ('6135', {'html'}),
 ('6119', {'http'}),
 ('6570', {'jelks'}),
 ('7598', {'klopf'}),
 ('298', {'knopf'}),
 ('7936', {'kpmg'}),
 ('71956', {'kwh'}),
 ('5398', {'lampf'}),
 ('538', {'lymph'}),
 ('3927', {'mbank'}),
 ('3704', {'mixer'}),
 ('35094752', {'mpg'}),
 ('350944', {'mph'}),
 ('299', {'nabob'}),
 ('2900', {"nbc's"}),
 ('2270', {'nynex'}),
 ('7083', {'oxfam'}),
 ('9370', {'pemex'}),
 ('993', {'ppm'}),
 ('468', {'rajiv'}),
 ('443', {'rearm'}),
 ('4089', {'rsvp'}),
 ('4398', {'rumpf'}),
 ('0709', {'saxby'}),
 ('0881', {'sffed'}),
 ('0440', {'sir

In [65]:
# (digit, word) pairs for single-character keys, longest words first.
# reverse=True keeps the sort stable, so ties stay in table order.
sorted(
    (
        (digits, word)
        for digits, words in table.items()
        if len(digits) == 1
        for word in words
    ),
    key=lambda pair: len(pair[1]),
    reverse=True,
)

[('0', "highway's"),
 ('6', 'chihuahua'),
 ('2', 'ahwahnee'),
 ('2', 'hwang-ho'),
 ('2', 'hawaiian'),
 ('2', 'weighing'),
 ('2', 'know-how'),
 ('0', 'highways'),
 ('0', "hawaii's"),
 ('1', 'haddaway'),
 ('1', 'hattaway'),
 ('1', 'hiawatha'),
 ('1', 'outweigh'),
 ('1', 'hathaway'),
 ('1', 'hideaway'),
 ('1', 'hatheway'),
 ('1', 'hughette'),
 ('5', 'holloway'),
 ('5', 'alleyway'),
 ('5', 'hollaway'),
 ('5', 'halloway'),
 ('3', 'eighmey'),
 ('3', 'mayhugh'),
 ('3', 'moawiya'),
 ('3', 'mayeaux'),
 ('3', 'hougham'),
 ('7', 'caughey'),
 ('7', 'walkway'),
 ('7', 'haughey'),
 ('7', 'hawkeye'),
 ('2', 'hanaway'),
 ('2', 'hongwei'),
 ('2', 'one-way'),
 ('2', 'whinney'),
 ('2', 'auyeung'),
 ('0', "hughes'"),
 ('0', 'yessuey'),
 ('0', "waugh's"),
 ('0', "howie's"),
 ('1', 'ottaway'),
 ('1', 'heighth'),
 ('1', 'wedowee'),
 ('1', 'weighed'),
 ('1', 'hataway'),
 ('1', 'headway'),
 ('1', 'hadaway'),
 ('1', 'haywood'),
 ('1', 'hughett'),
 ('1', 'ayodhya'),
 ('1', 'two-way'),
 ('1', 'weighty'),
 ('1', '

In [66]:
# Decode a single word through the dictionary; the displayed result is a set of digit strings.
decode.decode_from_dictionary('zysk', dictionary)

{'007'}

In [67]:
# Inspect the set of words stored under the single-digit key '6'.
table['6']

{'achee',
 'achey',
 'achoa',
 'achoo',
 'age',
 'agee',
 'aichi',
 'aiesha',
 'aisha',
 'ajay',
 'asch',
 'asche',
 'ash',
 'asha',
 'ashe',
 'ashey',
 'ashy',
 'asia',
 'auch',
 'auge',
 'awash',
 'ayyash',
 'cha',
 'chae',
 'chai',
 'chao',
 'chau',
 'chaw',
 'che',
 'chea',
 'chee',
 'chew',
 'chewy',
 'chez',
 'chia',
 'chihuahua',
 'chiu',
 'cho',
 'choe',
 'choi',
 'choo',
 'chou',
 'chow',
 'choy',
 'chu',
 'chuah',
 'ciao',
 'czaja',
 'each',
 'eash',
 'edge',
 'edgeway',
 'edgy',
 'ege',
 'eiichi',
 'eiji',
 'eishi',
 'esch',
 'esche',
 'esh',
 'etch',
 'g',
 'g.',
 'ga',
 'gee',
 'geo',
 'gioia',
 'gway',
 'gyi',
 'h',
 'h.',
 'hach',
 'hachey',
 'hage',
 'hagee',
 'hagey',
 'hajj',
 'hao-chi',
 'hasch',
 'hash',
 'hashi',
 'hasz',
 'hatch',
 'hauch',
 'hauge',
 'hausch',
 'hayashi',
 'heagy',
 'hedge',
 'heesch',
 'hege',
 'hesch',
 'hetu',
 'hiaa',
 'hitch',
 'hodge',
 'hoesch',
 'hoge',
 'hooch',
 'hoochie',
 'hooge',
 'hosch',
 'hoshaw',
 'houge',
 'housh',
 'hsia',
 'hs

In [74]:
# Summarise the automaton size (states, total transition entries) and the number of table keys.
'states=%d, transitions=%d, words=%d' % (
    len(dfa.states),
    sum(len(entry) for entry in dfa.transitions.values()),
    len(table.keys()),
)

'states=5572, transitions=21438, words=31072'

In [79]:
# Single-element unpacking: raises ValueError unless decoding yields exactly one result.
(num,) = decode.decode_from_dictionary('antidisestablishmentarianism', dictionary)

In [87]:
# List the (prefix, remainder) pairs that accepted_prefixes yields for num under the automaton.
list(encode.accepted_prefixes(num, dfa))

[('', '2110019563214203'),
 ('2', '110019563214203'),
 ('21', '10019563214203'),
 ('211', '0019563214203'),
 ('2110', '019563214203'),
 ('21100', '19563214203'),
 ('2110019563214203', '')]

In [88]:
# Pair each accepted prefix of num with its table words and the remaining suffix.
# The pairs are recomputed explicitly rather than read from IPython's `_`
# (last-output variable), which only holds them when the previous cell was the
# one executed immediately before and so breaks on re-run.
[(p, table[p], s) for p, s in encode.accepted_prefixes(num, dfa)]

[('',
  {'a',
   'a.',
   'ae',
   'ah',
   'aha',
   'ahah',
   'ahh',
   'aho',
   'ahoy',
   'ahuja',
   'ai',
   'aiwa',
   'aja',
   'aoi',
   'au',
   'aue',
   'aux',
   'aw',
   'away',
   'awe',
   'ay',
   'aye',
   'ayo',
   'e',
   'e.',
   'eau',
   'eaux',
   'ee',
   'eeo',
   'eh',
   'eu',
   'ewe',
   'ewy',
   'eye',
   'ha',
   'ha-ha',
   'ha-ha-ha',
   'hah',
   'haigh',
   'hao',
   'hau',
   'haugh',
   'haw',
   'hawaii',
   'hawe',
   'hay',
   'haye',
   'he',
   'hee',
   'heehaw',
   'heh',
   'heiwa',
   'hew',
   'hewe',
   'hewey',
   'hey',
   'heye',
   'hi',
   'high',
   'highway',
   'hiway',
   'ho',
   'hoe',
   'hoey',
   'hoh',
   'hoho',
   'hoi',
   'hoo',
   'hooey',
   'hou',
   'houx',
   'houy',
   'how',
   'howe',
   'howey',
   'howie',
   'hoy',
   'hoye',
   'hu',
   'hua',
   'hue',
   'huey',
   'hugh',
   'hughey',
   'hughie',
   'hughy',
   'huh',
   'hui',
   'huie',
   'huwe',
   'hwa',
   'hwe',
   'hy',
   'hye',
   'i',
   '