-
Notifications
You must be signed in to change notification settings - Fork 120
/
__init__.py
executable file
·316 lines (248 loc) · 8.8 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#!/usr/bin/env python3
"""
Defines various package-level constants and helper functions.
"""
import argparse
import os
import shutil
import sys
from pathlib import Path
from typing import Set, Union, List, Tuple
from rich.console import Console
from rich.text import Text
from rich.theme import Theme
from natsort import natsorted, ns
import regex
import se.easy_xml
# Version string of this package.
VERSION = "2.4.0"

# Prefix used to indent console messages printed in verbose mode.
MESSAGE_INDENT = " "

# Named Unicode characters. Each value is a single code point.
UNICODE_BOM = "\ufeff" # U+FEFF
NO_BREAK_SPACE = "\u00a0" # U+00A0
WORD_JOINER = "\u2060" # U+2060
HAIR_SPACE = "\u200a" # U+200A
# NOTE(review): U+FEFF is ZERO WIDTH NO-BREAK SPACE, the same code point as UNICODE_BOM above;
# Unicode's ZERO WIDTH SPACE is U+200B. Confirm which one is intended before changing this,
# since code elsewhere may depend on the current value.
ZERO_WIDTH_SPACE = "\ufeff"
SHY_HYPHEN = "\u00ad" # U+00AD
FUNCTION_APPLICATION = "\u2061" # U+2061
NO_BREAK_HYPHEN = "\u2011" # U+2011
COMBINING_VERTICAL_LINE_ABOVE = "\u030d" # U+030D
COMBINING_ACUTE_ACCENT = "\u0301" # U+0301
INVISIBLE_TIMES = "\u2062" # U+2062

# CSS structural pseudo-class selectors.
# Presumably these are the selectors that get simplified by code elsewhere; usage is not visible here.
SELECTORS_TO_SIMPLIFY = [":first-child", ":only-child", ":last-child", ":nth-child", ":nth-last-child", ":first-of-type", ":only-of-type", ":last-of-type", ":nth-of-type", ":nth-last-of-type"]

# Numeric identifiers for message severity.
MESSAGE_TYPE_WARNING = 1
MESSAGE_TYPE_ERROR = 2

# Image dimensions; presumably in pixels - TODO confirm against the code that uses them.
COVER_HEIGHT = 2100
COVER_WIDTH = 1400
TITLEPAGE_WIDTH = 1400

# Theme for rich consoles: maps each markup tag name usable in console messages to the color
# it is rendered in. Every tag currently renders as bright_blue.
RICH_THEME = Theme({
"xhtml": "bright_blue",
"xml": "bright_blue",
"val": "bright_blue",
"attr": "bright_blue",
"class": "bright_blue",
"path": "bright_blue",
"url": "bright_blue",
"text": "bright_blue",
"bash": "bright_blue",
"css": "bright_blue"
})
class SeException(Exception):
    """
    Base class for all SE exceptions.

    Each subclass sets `code` to its own distinct integer.
    """

    code = 0


# Note that we skip error codes 1 and 2 as they have special meanings:
# http://www.tldp.org/LDP/abs/html/exitcodes.html

class InvalidXhtmlException(SeException):
    """ Invalid XHTML """

    code = 3


class InvalidEncodingException(SeException):
    """ Invalid encoding """

    code = 4


class MissingDependencyException(SeException):
    """ Missing dependency """

    code = 5


class InvalidInputException(SeException):
    """ Invalid input """

    code = 6


class FileExistsException(SeException):
    """ File exists """

    code = 7


class InvalidFileException(SeException):
    """ Invalid file """

    code = 8


class InvalidLanguageException(SeException):
    """ Invalid language """

    code = 9


class InvalidSeEbookException(SeException):
    """ Invalid SE ebook """

    code = 10


class InvalidArgumentsException(SeException):
    """ Invalid arguments """

    code = 11


class RemoteCommandErrorException(SeException):
    """ Error in remote command """

    code = 12


class LintFailedException(SeException):
    """ Lint failed """

    code = 13


class InvalidCssException(SeException):
    """ Invalid CSS """

    code = 14


class InvalidSvgException(SeException):
    """ Invalid SVG """

    code = 15


class InvalidXmlException(SeException):
    """ Invalid XML """

    code = 16


class BuildFailedException(SeException):
    """
    Build failed.

    ATTRIBUTES
    messages: The list passed to the constructor, or a new empty list if none (or an empty one) was passed
    """

    code = 17

    def __init__(self, message, messages: Union[List, None] = None):
        """
        INPUTS
        message: The exception message, passed on to the base Exception
        messages: An optional list of additional messages to store on the exception
        """

        super().__init__(message)

        # Never store None, so that callers can always iterate over `messages`
        self.messages = messages if messages else []
def strip_bom(string: str) -> str:
    """
    Drop a leading Unicode Byte Order Mark from a string, if there is one.

    INPUTS
    string: A Unicode string

    OUTPUTS
    The string without its leading Byte Order Mark; a string that doesn't
    start with one is returned unchanged
    """

    # The BOM is a single character, so slicing off one character removes it
    return string[1:] if string.startswith(UNICODE_BOM) else string
def prep_output(message: str, plain_output: bool = False) -> str:
    """
    Format a message for the chosen output style, i.e., color or plain.

    INPUTS
    message: The message, possibly containing color markup tags like `[xml]` or `[/]`
    plain_output: True to convert the color markup to backticks, False to leave the message as-is

    OUTPUTS
    The message, ready to be printed in the chosen style
    """

    if not plain_output:
        # Color output keeps its markup untouched
        return message

    # Swap each markup tag (opening, closing, or with a `=value` part) for a backtick...
    without_markup = regex.sub(r"\[(?:/|xhtml|xml|val|attr|css|val|class|path|url|text|bash|link)(?:=[^\]]*?)*\]", "`", message)

    # ...then collapse the runs of backticks that adjacent tags leave behind
    return regex.sub(r"`+", "`", without_markup)
def print_error(message: Union[SeException, str], verbose: bool = False, is_warning: bool = False, plain_output: bool = False) -> None:
    """
    Helper function to print a colored error or warning message to the console.

    Allowed BBCode tags:
    [link=foo]bar[/] - Hyperlink
    [xml] - XML, usually a tag
    [xhtml] - XHTML, usually a tag
    [attr] - A lone XHTML attribute name (without `="foo"`)
    [val] - A lone XHTML attribute value (not a class)
    [class] - A lone XHTML class value
    [path] - Filesystem path or glob
    [url] - A URL
    [text] - Non-semantic text that requires color
    [bash] - A command or flag of a command
    [css] - CSS

    INPUTS
    message: The message to print, or an exception whose string form is the message
    verbose: True to indent the message, including every line after a newline, by MESSAGE_INDENT
    is_warning: True to label the message "Warning" instead of "Error"
    plain_output: True to print without color, with markup tags converted to backticks

    OUTPUTS
    None. Errors go to stderr; warnings, and everything when called from GNU Parallel, go to stdout.
    """

    label = "Warning" if is_warning else "Error"
    bg_color = "yellow" if is_warning else "red"

    # Determining this inspects the parent process, so only ask once per message
    from_parallel = is_called_from_parallel()

    # We have to print to stdout in case we're called from GNU Parallel, otherwise weird newline issues occur
    output_file = sys.stdout if is_warning or from_parallel else sys.stderr

    message = str(message)

    if verbose:
        # Keep continuation lines aligned with the indented first line
        message = message.replace("\n", f"\n{MESSAGE_INDENT}")

    # Syntax highlighting will do weird things when printing paths; force_terminal prints colors when called from GNU Parallel
    console = Console(file=output_file, highlight=False, theme=RICH_THEME, force_terminal=from_parallel)

    indent = MESSAGE_INDENT if verbose else ""

    if plain_output:
        # Replace color markup with `
        message = prep_output(message, True)
        console.print(f"{indent}[{label}] {message}")
    else:
        console.print(f"{indent}[white on {bg_color} bold] {label} [/] {message}")
def is_positive_integer(value: str) -> int:
    """
    Helper function for argparse.
    Convert value to an integer, raising an exception if it is not a positive integer.

    INPUTS
    value: The raw argument value, normally a string

    OUTPUTS
    The value as an int, guaranteed to be greater than zero

    RAISES
    argparse.ArgumentTypeError: value can't be parsed as an integer, or is zero or negative
    """

    error_message = f"{value} is not a positive integer"

    # Only the conversion can fail here; chain its error so the root cause isn't lost
    try:
        int_value = int(value)
    except (TypeError, ValueError, OverflowError) as ex:
        raise argparse.ArgumentTypeError(error_message) from ex

    # The range check is raised outside the try block so that it isn't caught and re-wrapped
    if int_value <= 0:
        raise argparse.ArgumentTypeError(error_message)

    return int_value
def quiet_remove(file: Path) -> None:
    """
    Helper function to delete a file without crashing if it doesn't exist.

    This has to remain until the SE server is on Ubuntu 22.04 or Python 3.8.5+

    INPUTS
    file: The path of the file to delete

    OUTPUTS
    None. Filesystem errors (missing file, insufficient permissions, the path
    being a directory, etc.) are deliberately ignored.
    """

    try:
        # Path() accepts both strings and Path objects, so a string argument is deleted too
        Path(file).unlink()
    except OSError:
        # Best effort: only filesystem failures are swallowed, so that
        # programming errors aren't silently hidden
        pass
def get_target_filenames(targets: list, allowed_extensions: Union[tuple, str]) -> list:
    """
    Helper function to convert a list of filenames or directories into a list of filenames based on some parameters.

    allowed_extensions is only applied on targets that are directories.

    INPUTS
    targets: A list of filenames or directories
    allowed_extensions: A tuple containing a series of allowed filename extensions, or a single extension as a string; extensions must begin with "."

    OUTPUTS
    A list of unique Path objects, naturally sorted by filename (not by full path).
    """

    if isinstance(allowed_extensions, str):
        allowed_extensions = (allowed_extensions,)

    # A set, so that overlapping targets don't produce duplicates
    target_paths = set()

    for target in targets:
        target = Path(target).resolve()

        if not target.is_dir():
            # If we're looking at an actual file, just add it regardless of its extension
            target_paths.add(target)
            continue

        # Entries found below the resolved directory are added as globbed; they are not resolved again.
        # With no allowed extensions, everything that the glob yields is added.
        for file_path in target.glob("**/*"):
            if not allowed_extensions or file_path.suffix in allowed_extensions:
                target_paths.add(file_path)

    return natsorted(target_paths, key=lambda x: str(x.name), alg=ns.PATH)
def is_called_from_parallel(return_none=True) -> Union[bool, None]:
    """
    Decide if we're being called from GNU parallel.
    This is good to know in case we want to tweak some output.

    This is almost always passed directly to the force_terminal option of rich.console(),
    meaning that `None` means "guess terminal status" and `False` means "no colors at all".
    We typically want to guess, so this returns None by default if not called from Parallel.
    To return false in that case, pass return_none=False

    INPUTS
    return_none: True to return None when not called from Parallel, False to return False instead

    OUTPUTS
    True if an argument of the parent process's command line ends in `parallel` preceded by
    a path separator; otherwise None or False, depending on return_none
    """

    import psutil # pylint: disable=import-outside-toplevel

    # os.sep is a backslash on Windows, which is a regex metacharacter, so it has to be escaped
    parallel_pattern = fr"{regex.escape(os.sep)}parallel$"

    try:
        parent_cmdline = psutil.Process(psutil.Process().ppid()).cmdline()

        if any(regex.search(parallel_pattern, line) for line in parent_cmdline):
            return True
    except Exception:
        # If we can't figure it out, don't worry about it.
        # Only Exception is caught, so that Ctrl-C and sys.exit() still work.
        pass

    return None if return_none else False
def get_dom_if_not_ignored(xhtml: str, ignored_types: Union[List[str], None] = None) -> Tuple[bool, Union[None, se.easy_xml.EasyXmlTree]]:
    """
    Given a string of XHTML, parse it into a dom tree and report whether the file should be ignored.

    A file is ignored if its root html element has an epub:prefix attribute declaring the `se:` prefix,
    and a section or nav element directly inside the body has an epub:type containing any of the passed semantics.

    Pass an empty list to ignored_types to ignore nothing.
    Pass None to ignored_types to ignore a default set of SE files.

    INPUTS
    xhtml: A string of XHTML
    ignored_types: A list of epub:type semantics to ignore; they are matched literally, not as regular expressions

    RETURNS
    A tuple of (is_ignored, dom)
    If the file is ignored, is_ignored will be True.
    If the dom couldn't be created (for example it is invalid XML) then is_ignored will be False
    and the dom part of the tuple will be None.
    """

    try:
        dom = se.easy_xml.EasyXmlTree(xhtml)
    except Exception:
        return (False, None)

    # Ignore some SE files
    if ignored_types is None:
        # Default ignore list
        ignored_regex = "(colophon|titlepage|imprint|copyright-page|halftitlepage|toc|loi)"
    elif ignored_types:
        # Joining the escaped items keeps the alternation intact even if an item ends in `|`
        ignored_regex = "(" + "|".join(regex.escape(item) for item in ignored_types) + ")"
    else:
        # An empty list means ignore nothing
        return (False, dom)

    is_ignored = bool(dom.xpath(f"/html[re:test(@epub:prefix, '[\\s\\b]se:[\\s\\b]')]/body/*[(name() = 'section' or name() = 'nav') and re:test(@epub:type, '\\b{ignored_regex}\\b')]"))

    return (is_ignored, dom)