-
-
Notifications
You must be signed in to change notification settings - Fork 17
/
utils.py
203 lines (166 loc) · 6.73 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
"""Just a bunch of utility functions for sphinxlint."""
import regex as re
from polib import pofile
from sphinxlint import rst
def match_size(re_match):
    """Return the number of characters covered by the match *re_match*."""
    begin, end = re_match.span()
    return end - begin
def _clean_heuristic(paragraph, regex):
    """Strip every match of *regex* from *paragraph*, shortest match first.

    Each pass collects all matches (overlapping ones included), deletes
    the shortest one and rescans the shortened text; the function returns
    once nothing matches any more.

    Preferring the shortest match is a heuristic: when removing `(.*)`
    from `(abc def ghi (jkl)`, deleting the whole string would swallow
    the unbalanced `(`, whereas deleting `(jkl)` leaves that lone `(` in
    place.
    """
    while True:
        matches = regex.finditer(paragraph, overlapped=True)
        shortest = min(matches, key=match_size, default=None)
        if shortest is None:
            return paragraph
        before = paragraph[: shortest.start()]
        after = paragraph[shortest.end() :]
        paragraph = before + after
def clean_paragraph(paragraph):
    """Drop the well-formed constructs so detectors only see suspicious ones.

    Escaping backslashes are first turned into null bytes, then inline
    literals, inline internal targets, hyperlink references, anonymous
    hyperlink references and normal roles are removed.  Finally every
    null byte is turned back into a backslash.
    """
    cleaned = escape2null(paragraph)
    heuristic_patterns = (
        rst.INLINE_LITERAL_RE,
        rst.INLINE_INTERNAL_TARGET_RE,
        rst.HYPERLINK_REFERENCES_RE,
        rst.ANONYMOUS_HYPERLINK_REFERENCES_RE,
    )
    for pattern in heuristic_patterns:
        cleaned = _clean_heuristic(cleaned, pattern)
    without_roles = rst.NORMAL_ROLE_RE.sub("", cleaned)
    return without_roles.replace("\x00", "\\")
def escape2null(text):
    r"""Return *text* with every escaping backslash replaced by a null byte.

    A backslash escapes the character that follows it, and that following
    character (even if it is itself a backslash) is copied unchanged.
    This makes it easy for a regex to tell an escaping backslash from a
    literal one.

    For example, in \\\\\\` it is hard to see which backslash escapes
    which, and whether the backtick is escaped.  After conversion
    (shown here with zeros instead of the real \x00 bytes) it reads:

        0\0\0\`

    so the backtick is clearly **not** escaped: what precedes it is a
    plain backslash, not an escaping one.
    """
    pieces = []
    cursor = 0
    backslash = text.find("\\")
    while backslash != -1:
        pieces.append(text[cursor:backslash])
        pieces.append("\x00")
        # Copy the escaped character as is (empty when the backslash is last).
        pieces.append(text[backslash + 1 : backslash + 2])
        cursor = backslash + 2
        backslash = text.find("\\", cursor)
    pieces.append(text[cursor:])
    return "".join(pieces)
def paragraphs(lines):
    """Group *lines* into paragraphs separated by empty lines.

    Yields (first_line_number, text) pairs, where the line number is
    1-based and the text is the concatenation of the paragraph's lines.
    Only a line equal to "\\n" counts as a separator.
    """
    first_lno = 1
    buffered = []
    for lno, line in enumerate(lines, start=1):
        if line == "\n":
            # A separator closes the paragraph in progress, if any.
            if buffered:
                yield first_lno, "".join(buffered)
                buffered = []
            continue
        if not buffered:
            first_lno = lno
        buffered.append(line)
    if buffered:
        yield first_lno, "".join(buffered)
def looks_like_glued(match):
    """Tell a role glued to the previous word from a role missing a colon.

    Returns True for something like the:issue:`123`, where a space is
    clearly missing before the role.

    Returns False for something like c:func:`foo`, where it is the
    leading colon that is missing, and also whenever the matched text
    holds a single colon (another one is necessarily missing).
    """
    matched_text = match.group(0)
    if matched_text.count(":") == 1:
        # With a single ":" there is no choice: a colon is missing.
        return False
    known_start_tag = {"c", "py"}
    domain_prefix = " *(" + "|".join(known_start_tag) + "):"
    # c:anything:` or py:anything:` is almost certainly a missing colon;
    # anything else is most likely a glued word.
    return re.match(domain_prefix, matched_text) is None
def is_multiline_non_rst_block(line):
    """Return True if *line* opens an indented block that is not reST.

    The checks are applied in order and the first one that matches wins:

    - a bare ``..`` line starts a comment block: True;
    - a directive matched by ``rst.DIRECTIVES_CONTAINING_RST_RE``: False;
    - a directive matched by
      ``rst.DIRECTIVES_CONTAINING_ARBITRARY_CONTENT_RE``: True;
    - a ``.. productionlist::`` directive: True;
    - an explicit markup start that ``type_of_explicit_markup``
      classifies as a comment: True;
    - a line ending with ``::`` introduces a literal block: True.

    Anything else returns False.
    """
    if re.match(r"^\s*\.\.$", line):  # it's the start of a comment block.
        return True
    if rst.DIRECTIVES_CONTAINING_RST_RE.match(line):
        return False
    if rst.DIRECTIVES_CONTAINING_ARBITRARY_CONTENT_RE.match(line):
        return True
    # The dots are escaped so that only a real ".. productionlist::"
    # directive matches, not any two characters before " productionlist::".
    if re.match(r"^ *\.\. productionlist::", line):
        return True
    if re.match(r"^ *\.\. ", line) and type_of_explicit_markup(line) == "comment":
        return True
    # A trailing "::" introduces a literal block.
    return line.endswith("::\n")
def hide_non_rst_blocks(lines, hidden_block_cb=None):
    """Blank out literal blocks, comments, code blocks and the like.

    Hidden lines are replaced by empty lines rather than dropped, so the
    returned list has one item per input line and line numbers stay
    meaningful.  A line opening a block is kept, unless it is an explicit
    markup comment, in which case it is blanked too.

    When *hidden_block_cb* is given, it is called each time a block ends
    with the line number of the line that opened the block and the
    hidden text (each non-empty line stripped of as many leading
    characters as the opening line's indentation).
    """
    visible = []
    hidden = []
    block_indent = None  # Indentation of the line that opened the block.
    block_line_start = None
    for lineno, line in enumerate(lines, start=1):
        if block_indent is not None:
            indent = len(re.match(" *", line).group(0))
            if line == "\n" or indent > block_indent:
                # Still inside the block: keep the content aside, blank the line.
                hidden.append("\n" if line == "\n" else line[block_indent:])
                line = "\n"
            else:
                # Back to the opener's indentation (or less): the block is over.
                block_indent = None
                if hidden_block_cb:
                    hidden_block_cb(block_line_start, "".join(hidden))
                hidden = []
        # Also checked for the line that just closed a block: it may open a new one.
        if block_indent is None and is_multiline_non_rst_block(line):
            block_indent = len(re.match(" *", line).group(0))
            block_line_start = lineno
            assert not hidden
            opens_comment = re.match(r" *\.\. ", line) and (
                type_of_explicit_markup(line) == "comment"
            )
            if opens_comment:
                line = "\n"
        visible.append(line)
    # Report a block still open at the end of the input.
    if hidden and hidden_block_cb:
        hidden_block_cb(block_line_start, "".join(hidden))
    return visible
def type_of_explicit_markup(line):
    """Classify the explicit markup block started by *line*.

    Leading whitespace is ignored.  The result is one of "directive",
    "footnote", "citation", "target", "substitution_definition" or, when
    none of those patterns match, "comment".
    """
    stripped = line.lstrip()
    # Order matters: the first matching pattern decides the kind.
    markup_kinds = (
        (rf"\.\. {rst.ALL_DIRECTIVES}::", "directive"),
        (r"\.\. \[[0-9]+\] ", "footnote"),
        (r"\.\. \[[^\]]+\] ", "citation"),
        (r"\.\. _.*[^_]: ", "target"),
        (r"\.\. \|[^\|]*\| ", "substitution_definition"),
    )
    for pattern, kind in markup_kinds:
        if re.match(pattern, stripped):
            return kind
    return "comment"
def po2rst(text):
    """Return the msgstr of the translated entries of a po content.

    Empty lines are inserted before an entry while the output is shorter
    than the entry's ``linenum``, so its msgstr starts on that line
    number.  Each msgstr line is emitted with a trailing newline.
    """
    rst_lines = []
    catalog = pofile(text, encoding="UTF-8")
    for entry in catalog.translated_entries():
        # The original msgid is not emitted: it is assumed to be checked directly.
        padding = entry.linenum - 1 - len(rst_lines)
        rst_lines.extend("\n" for _ in range(padding))
        rst_lines.extend(line + "\n" for line in entry.msgstr.splitlines())
    return "".join(rst_lines)