/
tokenizer.py
92 lines (90 loc) · 3.12 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python
# __author__ = "Ronie Martinez"
# __copyright__ = "Copyright 2018-2019, Ronie Martinez"
# __credits__ = ["Ronie Martinez"]
# __maintainer__ = "Ronie Martinez"
# __email__ = "ronmarti18@gmail.com"
def tokenize(data):
    """Lazily split a LaTeX expression into tokens.

    Yields, in order of appearance:
      * control sequences such as ``\\frac`` or ``\\alpha`` (backslash kept),
      * whole environment tokens such as ``\\begin{matrix}`` / ``\\end{matrix}``
        (and anything starting with ``\\math``) as a single token,
      * runs of digits and dots as a single number token,
      * every other non-space character as its own one-character token.
    Whitespace only separates tokens and is never emitted.
    """
    _END = object()  # sentinel: the character stream is exhausted
    stream = iter(data)
    pending = ''  # token assembled so far but not yet emitted
    while True:
        symbol = next(stream, _END)
        if symbol is _END:
            break
        if symbol == '\\':
            if pending == '\\':
                # a second backslash completes the row-separator token '\\'
                yield pending + symbol
                pending = ''
                continue
            if pending:
                yield pending
            follower = next(stream, _END)
            if follower is _END:
                # lone trailing backslash: emitted by the final flush below
                pending = symbol
                break
            pending = symbol + follower
        elif symbol.isalpha():
            if not pending:
                yield symbol
            elif pending.endswith('}'):
                # a complete \begin{...}/\end{...} token precedes this letter
                yield pending
                yield symbol
                pending = ''
            elif pending.startswith('\\'):
                pending += symbol  # extend the control-sequence name
            # any other pending value silently drops the letter (historical)
        elif symbol.isdigit():
            if pending:
                yield pending
            pending = ''
            number = symbol
            # greedily consume the rest of the (possibly decimal) number
            while True:
                follower = next(stream, _END)
                if follower is _END:
                    pending = number  # flushed after the main loop
                    break
                if follower.isspace():
                    # whitespace flushes the number as-is (trailing '.' kept)
                    yield number
                    break
                if follower.isdigit() or follower == '.':
                    number += follower
                    continue
                # any other symbol terminates the number; a trailing dot is
                # split off as its own token
                if number.endswith('.'):
                    yield number[:-1]
                    yield number[-1]
                else:
                    yield number
                if follower == '\\':
                    pending = follower  # start of the next control sequence
                else:
                    yield follower
                break
        elif symbol.isspace():
            if pending:
                yield pending
                pending = ''
        elif symbol in '{}*':
            # FIXME: Anything that starts with '\math' passes. There is a huge
            # list of math symbols in unimathsymbols.txt and hard-coding all of
            # them is inefficient.
            if pending.startswith((r'\begin', r'\end', r'\math')):
                if pending.endswith('}'):
                    # environment token already complete; emit both
                    yield pending
                    yield symbol
                    pending = ''
                else:
                    pending += symbol  # keep building e.g. '\begin{matrix}'
            else:
                if pending:
                    yield pending
                    pending = ''
                yield symbol
        else:
            if pending:
                yield pending
                pending = ''
            if len(symbol):
                yield symbol
    if pending:
        yield pending