/
breakwords.py
executable file
·53 lines (40 loc) · 1.39 KB
/
breakwords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python3
'''
Generate wordlist:
occurs --nocount 100-0.txt > wordlist.txt
Generate space-less Shakespeare:
cp 100-0.txt 100-0-nospace.txt
rpl --quiet ' ' '' 100-0-nospace.txt
'''
import sys, argparse
try:
import re2 as re
except ImportError:
import re
# Number of optional capture groups in the extraction regex, i.e. the most
# words that the base extraction pattern can pull out of one input string.
MAX_MATCHES = 30

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Add spaces between words in text.")
parser.add_argument(
    'wordlist_file',
    metavar='WORDLIST-FILE',
    help='word list to use',
)
args = parser.parse_args()

# Read word list, downcase and de-duplicate. Blank lines are dropped: an empty
# word would become an empty regex alternative, which matches everywhere and
# pollutes the extracted groups with '' entries.
with open(args.wordlist_file) as f:
    lexicon = list({word for word in (line.strip().lower() for line in f) if word})

# Sort by decreasing word length so the alternation tries longer words first;
# ties are broken alphabetically so the generated pattern is reproducible
# from run to run (set iteration order is not).
lexicon.sort(key=lambda word: (-len(word), word))

# Escape every entry so that regex metacharacters in the word list (e.g. '.',
# '(' or '*') are matched literally instead of corrupting the pattern.
# FIXME: allow punctuation
lexicon_regex = '|'.join(re.escape(word) for word in lexicon)

# It's faster to match with a one-or-more regex:
lexicon_match_regex = '({})+'.format(lexicon_regex)

# Then use a repeated regex to extract the words:
lexicon_extract_regex = '({})?'.format(lexicon_regex) * MAX_MATCHES
def breakwords(text, flags=0):
    '''Split `text` into a sequence of lexicon words.

    Args:
        text: the string to segment. A segmentation is only found when the
            whole string is a concatenation of lexicon words.
        flags: regex flags passed through to every matching call.

    Returns:
        A list of the matched substrings of `text`, in order, or the string
        '(no matches)' when `text` cannot be segmented.
    '''
    # Cheap feasibility check first: the one-or-more regex is faster than the
    # extraction regex but only tells us *whether* a segmentation exists.
    if re.fullmatch(lexicon_match_regex, text, flags=flags) is None:
        return '(no matches)'

    extract_regex = lexicon_extract_regex
    slots = MAX_MATCHES
    m = re.fullmatch(extract_regex, text, flags=flags)
    # The base extraction pattern has only MAX_MATCHES optional groups, so it
    # fails on text made of more words than that even though the check above
    # succeeded. Double the number of groups until it fits; every word
    # consumes at least one character, so len(text) groups always suffice.
    while m is None and 0 < slots < len(text):
        extract_regex += extract_regex
        slots *= 2
        m = re.fullmatch(extract_regex, text, flags=flags)
    if m is None:
        # Only reachable if the extraction pattern has no groups at all
        # (MAX_MATCHES <= 0); report it instead of crashing on m.groups().
        return '(no matches)'

    # Unused optional groups are None; keep only the groups that matched.
    return [match for match in m.groups() if match is not None]
# Read all of standard input, then report each whitespace-stripped line next
# to its case-insensitive segmentation.
for raw_line in sys.stdin.readlines():
    stripped = raw_line.strip()
    segmentation = breakwords(stripped, flags=re.IGNORECASE)
    print('{}: {}'.format(stripped, segmentation))