/
se_epub_lint.py
1156 lines (930 loc) · 63.6 KB
/
se_epub_lint.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""
Contains the LintMessage class and the Lint function, which is broken out of
the SeEpub class for readability and maintainability.
Strictly speaking, the lint() function should be a class member of SeEpub. But
the function is very big and it makes editing easier to put in a separate file.
"""
import os
import glob
import filecmp
import html
import unicodedata
from pathlib import Path
import io
import regex
import roman
from pkg_resources import resource_filename
import lxml.cssselect
import lxml.etree as etree
from bs4 import BeautifulSoup, NavigableString
import se
import se.formatting
import se.easy_xml
import se.images
class LintMessage:
    """
    A single output message produced by the lint function.

    Each message carries its text, its severity (one of the se.MESSAGE_TYPE_*
    values), the epub filename that generated it, and a flag marking it as a
    submessage.
    """

    # Class-level defaults; __init__ overwrites every one of these per instance.
    is_submessage: bool = False
    message_type = se.MESSAGE_TYPE_WARNING
    filename: str = ""
    text: str = ""

    def __init__(self, text: str, message_type=se.MESSAGE_TYPE_WARNING, filename: str = "", is_submessage: bool = False):
        # Leading and trailing whitespace is stripped from the stored text.
        stripped_text = text.strip()

        self.is_submessage = is_submessage
        self.message_type = message_type
        self.filename = filename
        self.text = stripped_text
def _get_malformed_urls(xhtml: str) -> list:
    """
    Helper function used in self.lint()
    Get a list of lint messages for URLs in the XHTML that do not match SE standards.

    INPUTS
    xhtml: A string of XHTML to check

    OUTPUTS
    A list of LintMessage objects, one for each kind of malformed URL found in
    the XHTML string. The messages are created without a filename; the caller
    is expected to fill it in.
    """

    messages = []

    # Check for non-https URLs and wrong host names. Each entry is a tuple of
    # substrings (any one of which triggers the message) and the message text.
    # These are all reported as errors.
    error_checks = (
        (("http://gutenberg.org", "https://gutenberg.org"), "gutenberg.org URL missing leading www."),
        (("http://www.gutenberg.org",), "Non-https gutenberg.org URL."),
        (("http://www.pgdp.net",), "Non-https pgdp.net URL."),
        (("http://catalog.hathitrust.org",), "Non-https hathitrust.org URL."),
        (("http://archive.org",), "Non-https archive.org URL."),
        (("www.archive.org",), "archive.org URL should not have leading www."),
        (("http://en.wikipedia.org",), "Non-https en.wikipedia.org URL."),
    )

    for needles, text in error_checks:
        if any(needle in xhtml for needle in needles):
            messages.append(LintMessage(text, se.MESSAGE_TYPE_ERROR))

    # Check for malformed canonical URLs. These use LintMessage's default message type.

    # A Google Books URL is non-canonical if the book ID is followed by further
    # query parameters (&) or a fragment (#). The ID match is restricted to
    # characters that can appear inside the URL, so that an unrelated & or #
    # later on the same line (for example an &amp; in running text after the
    # link) does not cause a false positive.
    if regex.search(r"books\.google\.com/books\?id=[^\s\"'<>&#]+[&#]", xhtml):
        messages.append(LintMessage("Non-canonical Google Books URL. Google Books URLs must look exactly like https://books.google.com/books?id=<BOOK-ID>"))

    canonical_checks = (
        ("babel.hathitrust.org", "Non-canonical HathiTrust URL. HathiTrust URLs must look exactly like https://catalog.hathitrust.org/Record/<BOOK-ID>"),
        (".gutenberg.org/files/", "Non-canonical Project Gutenberg URL. Project Gutenberg URLs must look exactly like https://www.gutenberg.org/ebooks/<BOOK-ID>"),
        ("archive.org/stream", "Non-canonical archive.org URL. Internet Archive URLs must look exactly like https://archive.org/details/<BOOK-ID>"),
    )

    for needle, text in canonical_checks:
        if needle in xhtml:
            messages.append(LintMessage(text))

    return messages
def _get_unused_selectors(self) -> set:
    """
    Helper function used in self.lint(); merge directly into lint()?
    Get the set of CSS selectors in local.css that do not actually select HTML in the epub.

    INPUTS
    None

    OUTPUTS
    A set of strings representing CSS selectors that do not actually select HTML in the epub.

    RAISES
    FileNotFoundError: local.css could not be opened or read.
    se.InvalidCssException: A selector in local.css could not be parsed.
    se.InvalidXhtmlException: An XHTML file in the text directory could not be parsed.
    """

    local_css_path = self.path / "src" / "epub" / "css" / "local.css"

    try:
        with open(local_css_path, encoding="utf-8") as file:
            css = file.read()
    except Exception as ex:
        raise FileNotFoundError("Couldn't open {}".format(local_css_path)) from ex

    # Remove @supports directives, as the parser can't handle them
    css = regex.sub(r"^@supports\(.+?\){(.+?)}\s*}", "\\1}", css, flags=regex.MULTILINE | regex.DOTALL)

    # Remove actual content of css selectors
    css = regex.sub(r"{[^}]+}", "", css)

    # Remove commas. NOTE(review): this removes every comma, not only trailing
    # ones; presumably each selector of a group sits on its own line - confirm.
    css = regex.sub(r",", "", css)

    # Remove comments
    css = regex.sub(r"/\*.+?\*/", "", css, flags=regex.DOTALL)

    # Remove @ defines
    css = regex.sub(r"^@.+", "", css, flags=regex.MULTILINE)

    # Construct a set of selectors, one per remaining non-empty line
    selectors = {line for line in css.splitlines() if line != ""}
    unused_selectors = set(selectors)

    # Get a list of .xhtml files to search, skipping the files we never check selectors against
    ignored_suffixes = ("titlepage.xhtml", "imprint.xhtml", "uncopyright.xhtml")
    filenames = [filename for filename in glob.glob(str(self.path / "src" / "epub" / "text" / "*.xhtml")) if not filename.endswith(ignored_suffixes)]

    # Parsed documents, keyed by filename. Files are parsed lazily, the first time a
    # selector needs them, and then reused for every later selector instead of being
    # re-read and re-parsed once per selector.
    trees = {}

    def _get_tree(filename):
        if filename not in trees:
            # We have to remove the default namespace declaration from our document, otherwise
            # xpath won't find anything at all. See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python
            with open(filename, "r", encoding="utf-8") as file:
                xhtml = file.read().replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "")

            try:
                trees[filename] = etree.fromstring(str.encode(xhtml))
            except etree.XMLSyntaxError as ex:
                raise se.InvalidXhtmlException("Couldn't parse XHTML in file: {}, error: {}".format(filename, str(ex))) from ex
            except Exception as ex:
                raise se.InvalidXhtmlException("Couldn't parse XHTML in file: {}".format(filename)) from ex

        return trees[filename]

    # Now iterate over each CSS selector and see if it's used in any of the files we found
    for selector in selectors:
        try:
            sel = lxml.cssselect.CSSSelector(selector, translator="html", namespaces=se.XHTML_NAMESPACES)
        except lxml.cssselect.ExpressionError:
            # This gets thrown if we use pseudo-elements, which lxml doesn't support.
            # We can't test such a selector, so don't report it as unused.
            unused_selectors.remove(selector)
            continue
        except lxml.cssselect.SelectorSyntaxError as ex:
            raise se.InvalidCssException("Couldn't parse CSS in or near this line: {}\n{}".format(selector, ex)) from ex

        for filename in filenames:
            if _get_tree(filename).xpath(sel.path, namespaces=se.XHTML_NAMESPACES):
                # One match anywhere is enough to consider the selector used
                unused_selectors.remove(selector)
                break

    return unused_selectors
def lint(self, metadata_xhtml) -> list:
"""
Check this ebook for some common SE style errors.
INPUTS
None
OUTPUTS
A list of LintMessage objects.
"""
messages = []
license_file_path = resource_filename("se", str(Path("data") / "templates" / "LICENSE.md"))
gitignore_file_path = resource_filename("se", str(Path("data") / "templates" / "gitignore"))
core_css_file_path = resource_filename("se", str(Path("data") / "templates" / "core.css"))
logo_svg_file_path = resource_filename("se", str(Path("data") / "templates" / "logo.svg"))
uncopyright_file_path = resource_filename("se", str(Path("data") / "templates" / "uncopyright.xhtml"))
has_halftitle = False
has_frontmatter = False
has_cover_source = False
cover_svg_title = ""
titlepage_svg_title = ""
xhtml_css_classes = {}
headings = []
# Get the ebook language, for later use
language = regex.search(r"<dc:language>([^>]+?)</dc:language>", metadata_xhtml).group(1)
# Check local.css for various items, for later use
abbr_elements = []
css = ""
with open(self.path / "src" / "epub" / "css" / "local.css", "r", encoding="utf-8") as file:
css = file.read()
local_css_has_subtitle_style = "span[epub|type~=\"subtitle\"]" in css
abbr_styles = regex.findall(r"abbr\.[a-z]+", css)
matches = regex.findall(r"^h[0-6]\s*,?{?", css, flags=regex.MULTILINE)
if matches:
messages.append(LintMessage("Do not directly select h[0-6] elements, as they are used in template files; use more specific selectors.", se.MESSAGE_TYPE_ERROR, "local.css"))
# Check for presence of ./dist/ folder
if (self.path / "dist").exists():
messages.append(LintMessage("Illegal ./dist/ folder. Do not commit compiled versions of the source.", se.MESSAGE_TYPE_ERROR, "./dist/"))
# Check if there are non-typogrified quotes or em-dashes in metadata descriptions
if regex.search(r"#description\">[^<]+?(['\"]|\-\-)[^<]+?</meta>", metadata_xhtml.replace("\">", "").replace("=\"", "")) is not None:
messages.append(LintMessage("Non-typogrified \", ', or -- detected in metadata long description", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check if there are non-typogrified quotes or em-dashes in the title.
# The open-ended start and end of the regex also catches title-sort
if regex.search(r"title\">[^<]+?(['\"]|\-\-)[^<]+?<", metadata_xhtml) is not None:
messages.append(LintMessage("Non-typogrified \", ', or -- detected in metadata title", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for malformed long description HTML
long_description = regex.findall(r"<meta id=\"long-description\".+?>(.+?)</meta>", metadata_xhtml, flags=regex.DOTALL)
if long_description:
long_description = "<?xml version=\"1.0\"?><html xmlns=\"http://www.w3.org/1999/xhtml\">" + html.unescape(long_description[0]) + "</html>"
try:
etree.parse(io.StringIO(long_description))
except lxml.etree.XMLSyntaxError as ex:
messages.append(LintMessage("Metadata long description is not valid HTML. LXML says: " + str(ex), se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for double spacing
regex_string = r"[{}{} ]{{2,}}".format(se.NO_BREAK_SPACE, se.HAIR_SPACE)
matches = regex.findall(regex_string, metadata_xhtml)
if matches:
messages.append(LintMessage("Double spacing detected in file. Sentences should be single-spaced.", se.MESSAGE_TYPE_ERROR, "content.opf"))
if regex.search(r"<dc:description id=\"description\">[^<]+?(['\"]|\-\-)[^<]+?</dc:description>", metadata_xhtml) is not None:
messages.append(LintMessage("Non-typogrified \", ', or -- detected in metadata dc:description.", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for punctuation outside quotes. We don't check single quotes because contractions are too common.
matches = regex.findall(r"[a-zA-Z][”][,.]", metadata_xhtml)
if matches:
messages.append(LintMessage("Comma or period outside of double quote. Generally punctuation should go within single and double quotes.", se.MESSAGE_TYPE_WARNING, "content.opf"))
# Make sure long-description is escaped HTML
if "<meta id=\"long-description\" property=\"se:long-description\" refines=\"#description\">\n\t\t\t<p>" not in metadata_xhtml:
messages.append(LintMessage("Long description must be escaped HTML.", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for HTML entities in long-description, but allow &amp;
if regex.search(r"&[a-z]+?;", metadata_xhtml.replace("&amp;", "")):
messages.append(LintMessage("HTML entites detected in metadata. Use Unicode equivalents instead.", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for illegal em-dashes in <dc:subject>
if regex.search(r"<dc:subject id=\"[^\"]+?\">[^<]+?—[^<]+?</dc:subject>", metadata_xhtml) is not None:
messages.append(LintMessage("Illegal em-dash detected in dc:subject; use --", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for empty production notes
if "<meta property=\"se:production-notes\">Any special notes about the production of this ebook for future editors/producers? Remove this element if not.</meta>" in metadata_xhtml:
messages.append(LintMessage("Empty production-notes element in metadata.", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for illegal VCS URLs
matches = regex.findall(r"<meta property=\"se:url\.vcs\.github\">([^<]+?)</meta>", metadata_xhtml)
if matches:
for match in matches:
if not match.startswith("https://github.com/standardebooks/"):
messages.append(LintMessage("Illegal se:url.vcs.github. VCS URLs must begin with https://github.com/standardebooks/: {}".format(match), se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for HathiTrust scan URLs instead of actual record URLs
if "babel.hathitrust.org" in metadata_xhtml or "hdl.handle.net" in metadata_xhtml:
messages.append(LintMessage("Use HathiTrust record URLs, not page scan URLs, in metadata, imprint, and colophon. Record URLs look like: https://catalog.hathitrust.org/Record/<RECORD-ID>", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for illegal se:subject tags
matches = regex.findall(r"<meta property=\"se:subject\">([^<]+?)</meta>", metadata_xhtml)
if matches:
for match in matches:
if match not in se.SE_GENRES:
messages.append(LintMessage("Illegal se:subject: {}".format(match), se.MESSAGE_TYPE_ERROR, "content.opf"))
else:
messages.append(LintMessage("No se:subject <meta> tag found.", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for CDATA tags
if "<![CDATA[" in metadata_xhtml:
messages.append(LintMessage("<![CDATA[ detected. Run `clean` to canonicalize <![CDATA[ sections.", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check that our provided identifier matches the generated identifier
identifier = regex.sub(r"<.+?>", "", regex.findall(r"<dc:identifier id=\"uid\">.+?</dc:identifier>", metadata_xhtml)[0])
if identifier != self.generated_identifier:
messages.append(LintMessage("<dc:identifier> does not match expected: {}".format(self.generated_identifier), se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check that the GitHub repo URL is as expected
if ("<meta property=\"se:url.vcs.github\">" + self.generated_github_repo_url + "</meta>") not in metadata_xhtml:
messages.append(LintMessage("GitHub repo URL does not match expected: {}".format(self.generated_github_repo_url), se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check if se:name.person.full-name matches their titlepage name
matches = regex.findall(r"<meta property=\"se:name\.person\.full-name\" refines=\"#([^\"]+?)\">([^<]*?)</meta>", metadata_xhtml)
duplicate_names = []
for match in matches:
name_matches = regex.findall(r"<([a-z:]+)[^<]+?id=\"{}\"[^<]*?>([^<]*?)</\1>".format(match[0]), metadata_xhtml)
for name_match in name_matches:
if name_match[1] == match[1]:
duplicate_names.append(name_match[1])
if duplicate_names:
messages.append(LintMessage("se:name.person.full-name property identical to regular name. If the two are identical the full name <meta> element must be removed.", se.MESSAGE_TYPE_ERROR, "content.opf"))
for duplicate_name in duplicate_names:
messages.append(LintMessage(duplicate_name, se.MESSAGE_TYPE_ERROR, "", True))
# Check for malformed URLs
for message in _get_malformed_urls(metadata_xhtml):
message.filename = "content.opf"
messages.append(message)
if regex.search(r"id\.loc\.gov/authorities/names/[^\.]+\.html", metadata_xhtml):
messages.append(LintMessage("id.loc.gov URL ending with illegal .html", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Does the manifest match the generated manifest?
for manifest in regex.findall(r"<manifest>.*?</manifest>", metadata_xhtml, flags=regex.DOTALL):
manifest = regex.sub(r"[\n\t]", "", manifest)
expected_manifest = regex.sub(r"[\n\t]", "", self.generate_manifest())
if manifest != expected_manifest:
messages.append(LintMessage("<manifest> does not match expected structure.", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Make sure some static files are unchanged
try:
if not filecmp.cmp(license_file_path, self.path / "LICENSE.md"):
messages.append(LintMessage("LICENSE.md does not match {}".format(license_file_path), se.MESSAGE_TYPE_ERROR, "LICENSE.md"))
except Exception:
messages.append(LintMessage("Missing ./LICENSE.md", se.MESSAGE_TYPE_ERROR, "LICENSE.md"))
if not filecmp.cmp(core_css_file_path, self.path / "src" / "epub" / "css" / "core.css"):
messages.append(LintMessage("core.css does not match {}".format(core_css_file_path), se.MESSAGE_TYPE_ERROR, "core.css"))
if not filecmp.cmp(logo_svg_file_path, self.path / "src" / "epub" / "images" / "logo.svg"):
messages.append(LintMessage("logo.svg does not match {}".format(logo_svg_file_path), se.MESSAGE_TYPE_ERROR, "logo.svg"))
if not filecmp.cmp(uncopyright_file_path, self.path / "src" / "epub" / "text" / "uncopyright.xhtml"):
messages.append(LintMessage("uncopyright.xhtml does not match {}".format(uncopyright_file_path), se.MESSAGE_TYPE_ERROR, "uncopyright.xhtml"))
# Check for unused selectors
unused_selectors = _get_unused_selectors(self)
if unused_selectors:
messages.append(LintMessage("Unused CSS selectors:", se.MESSAGE_TYPE_ERROR, "local.css"))
for selector in unused_selectors:
messages.append(LintMessage(selector, se.MESSAGE_TYPE_ERROR, "", True))
# Now iterate over individual files for some checks
for root, _, filenames in os.walk(self.path):
for filename in sorted(filenames, key=se.natural_sort_key):
if ".git" in str(Path(root) / filename):
continue
if filename.startswith("cover.source."):
has_cover_source = True
if filename != "LICENSE.md" and regex.findall(r"[A-Z]", filename):
messages.append(LintMessage("Illegal uppercase letter in filename", se.MESSAGE_TYPE_ERROR, filename))
if "-0" in filename:
messages.append(LintMessage("Illegal leading 0 in filename", se.MESSAGE_TYPE_ERROR, filename))
if filename.endswith(tuple(se.BINARY_EXTENSIONS)) or filename.endswith("core.css"):
continue
if filename.startswith(".") or filename.startswith("README"):
if filename == ".gitignore":
# .gitignore is optional, because our standard gitignore ignores itself.
# So if it's present, it must match our template.
if not filecmp.cmp(gitignore_file_path, str(self.path / ".gitignore")):
messages.append(LintMessage(".gitignore does not match {}".format(gitignore_file_path), se.MESSAGE_TYPE_ERROR, ".gitignore"))
continue
else:
messages.append(LintMessage("Illegal {} file detected in {}".format(filename, root), se.MESSAGE_TYPE_ERROR))
continue
with open(Path(root) / filename, "r", encoding="utf-8") as file:
try:
file_contents = file.read()
except UnicodeDecodeError:
# This is more to help developers find weird files that might choke 'lint', hopefully unnecessary for end users
messages.append(LintMessage("Problem decoding file as utf-8", se.MESSAGE_TYPE_ERROR, filename))
continue
if "http://standardebooks.org" in file_contents:
messages.append(LintMessage("Non-HTTPS Standard Ebooks URL detected.", se.MESSAGE_TYPE_ERROR, filename))
if "UTF-8" in file_contents:
messages.append(LintMessage("String \"UTF-8\" must always be lowercase.", se.MESSAGE_TYPE_ERROR, filename))
if filename == "halftitle.xhtml":
has_halftitle = True
if "<title>Half Title</title>" not in file_contents:
messages.append(LintMessage("Half title <title> tag must contain exactly: \"Half Title\".", se.MESSAGE_TYPE_ERROR, filename))
if filename == "colophon.xhtml":
if "<a href=\"{}\">{}</a>".format(self.generated_identifier.replace("url:", ""), self.generated_identifier.replace("url:https://", "")) not in file_contents:
messages.append(LintMessage("Unexpected SE identifier in colophon. Expected: {}".format(self.generated_identifier), se.MESSAGE_TYPE_ERROR, filename))
if ">trl<" in metadata_xhtml and "translated from" not in file_contents:
messages.append(LintMessage("Translator detected in metadata, but no 'translated from LANG' block in colophon", se.MESSAGE_TYPE_ERROR, filename))
# Check if we forgot to fill any variable slots
matches = regex.findall(r"(TITLE|YEAR|AUTHOR|PRODUCER|PG_[A-Z]+|TRANSCRIBER_[0-9]+|[A-Z]+_URL|PAINTING|ARTIST|SE_[A-Z]+)", file_contents)
for match in matches:
messages.append(LintMessage("Missing data in colophon: {}".format(match), se.MESSAGE_TYPE_ERROR, filename))
# Are the sources represented correctly?
# We don't have a standard yet for more than two sources (transcription and scan) so just ignore that case for now.
matches = regex.findall(r"<dc:source>([^<]+?)</dc:source>", metadata_xhtml)
if len(matches) <= 2:
for link in matches:
if "gutenberg.org" in link and "<a href=\"{}\">Project Gutenberg</a>".format(link) not in file_contents:
messages.append(LintMessage("Source not represented in colophon.xhtml. It should read: <a href=\"{}\">Project Gutenberg</a>".format(link), se.MESSAGE_TYPE_WARNING, filename))
if "hathitrust.org" in link and "the<br/>\n\t\t\t<a href=\"{}\">HathiTrust Digital Library</a>".format(link) not in file_contents:
messages.append(LintMessage("Source not represented in colophon.xhtml. It should read: the<br/> <a href=\"{}\">HathiTrust Digital Library</a>".format(link), se.MESSAGE_TYPE_WARNING, filename))
if "archive.org" in link and "the<br/>\n\t\t\t<a href=\"{}\">Internet Archive</a>".format(link) not in file_contents:
messages.append(LintMessage("Source not represented in colophon.xhtml. It should read: the<br/> <a href=\"{}\">Internet Archive</a>".format(link), se.MESSAGE_TYPE_WARNING, filename))
if "books.google.com" in link and "<a href=\"{}\">Google Books</a>".format(link) not in file_contents:
messages.append(LintMessage("Source not represented in colophon.xhtml. It should read: <a href=\"{}\">Google Books</a>".format(link), se.MESSAGE_TYPE_WARNING, filename))
if filename == "titlepage.xhtml":
if "<title>Titlepage</title>" not in file_contents:
messages.append(LintMessage("Titlepage <title> tag must contain exactly: \"Titlepage\".", se.MESSAGE_TYPE_ERROR, filename))
if filename.endswith(".svg"):
# Check for fill: #000 which should simply be removed
matches = regex.findall(r"fill=\"\s*#000", file_contents) + regex.findall(r"style=\"[^\"]*?fill:\s*#000", file_contents)
if matches:
messages.append(LintMessage("Illegal style=\"fill: #000\" or fill=\"#000\".", se.MESSAGE_TYPE_ERROR, filename))
# Check for illegal height or width on root <svg> element
if filename != "logo.svg": # Do as I say, not as I do...
matches = regex.findall(r"<svg[^>]*?(height|width)=[^>]*?>", file_contents)
if matches:
messages.append(LintMessage("Illegal height or width on root <svg> element. Size SVGs using the viewbox attribute only.", se.MESSAGE_TYPE_ERROR, filename))
# Check for illegal transform attribute
matches = regex.findall(r"<[a-z]+[^>]*?transform=[^>]*?>", file_contents)
if matches:
messages.append(LintMessage("Illegal transform attribute. SVGs should be optimized to remove use of transform. Try using Inkscape to save as an \"optimized SVG\".", se.MESSAGE_TYPE_ERROR, filename))
if os.sep + "src" + os.sep not in root:
# Check that cover and titlepage images are in all caps
if filename == "cover.svg":
matches = regex.findall(r"<text[^>]+?>.*[a-z].*</text>", file_contents)
if matches:
messages.append(LintMessage("Lowercase letters in cover. Cover text must be all uppercase.", se.MESSAGE_TYPE_ERROR, filename))
# Save for later comparison with titlepage
matches = regex.findall(r"<title>(.*?)</title>", file_contents)
for match in matches:
cover_svg_title = match.replace("The cover for ", "")
if filename == "titlepage.svg":
matches = regex.findall(r"<text[^>]+?>(.*[a-z].*)</text>", html.unescape(file_contents))
for match in matches:
if match not in ("translated by", "illustrated by", "and"):
messages.append(LintMessage("Lowercase letters in titlepage. Titlepage text must be all uppercase except \"translated by\" and \"illustrated by\".", se.MESSAGE_TYPE_ERROR, filename))
# For later comparison with cover
matches = regex.findall(r"<title>(.*?)</title>", file_contents)
for match in matches:
titlepage_svg_title = match.replace("The titlepage for ", "")
if filename.endswith(".css"):
# Check CSS style
# First remove @supports selectors and normalize indentation within them
matches = regex.findall(r"^@supports\(.+?\){.+?}\s*}", file_contents, flags=regex.MULTILINE | regex.DOTALL)
for match in matches:
processed_match = regex.sub(r"^@supports\(.+?\){\s*(.+?)\s*}\s*}", "\\1", match.replace("\n\t", "\n") + "\n}", flags=regex.MULTILINE | regex.DOTALL)
file_contents = file_contents.replace(match, processed_match)
# Remove comments that are on their own line
file_contents = regex.sub(r"^/\*.+?\*/\n", "", file_contents, flags=regex.MULTILINE | regex.DOTALL)
# Check for unneeded white-space nowrap in abbr selectors
matches = regex.findall(r"abbr.+?{[^}]*?white-space:\s*nowrap;[^}]*?}", css)
if matches:
messages.append(LintMessage("abbr selector does not need white-space: nowrap; as it inherits it from core.css.", se.MESSAGE_TYPE_ERROR, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_ERROR, filename, True))
# Don't specify border color
matches = regex.findall(r"(?:border|color).+?(?:#[a-f0-9]{0,6}|black|white|red)", file_contents, flags=regex.IGNORECASE)
if matches:
messages.append(LintMessage("Don't specify border colors, so that reading systems can adjust for night mode.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
# If we select on the xml namespace, make sure we define the namespace in the CSS, otherwise the selector won't work
matches = regex.findall(r"\[\s*xml\s*\|", file_contents)
if matches and "@namespace xml \"http://www.w3.org/XML/1998/namespace\";" not in file_contents:
messages.append(LintMessage("[xml|attr] selector in CSS, but no XML namespace declared (@namespace xml \"http://www.w3.org/XML/1998/namespace\";).", se.MESSAGE_TYPE_ERROR, filename))
if filename.endswith(".xhtml"):
for message in _get_malformed_urls(file_contents):
message.filename = filename
messages.append(message)
# Check if this is a frontmatter file
if filename not in ("titlepage.xhtml", "imprint.xhtml", "toc.xhtml"):
matches = regex.findall(r"epub:type=\"[^\"]*?frontmatter[^\"]*?\"", file_contents)
if matches:
has_frontmatter = True
# Tally every CSS class used in this file into the running per-class usage counts
if filename not in se.IGNORED_FILENAMES:
    for class_attribute in regex.findall(r"(?:class=\")[^\"]+?(?:\")", file_contents):
        # Strip the attribute wrapper so only the whitespace-separated class names remain
        class_names = class_attribute.replace("class=", "").replace("\"", "").split()
        for css_class in class_names:
            xhtml_css_classes[css_class] = xhtml_css_classes.get(css_class, 0) + 1
# Read file contents into a DOM for querying
dom = BeautifulSoup(file_contents, "lxml")
# Store all headings to check for ToC references later
if filename != "toc.xhtml":
    for match in dom.select("h1,h2,h3,h4,h5,h6"):
        # Remove any links to the endnotes
        endnote_ref = match.find("a", attrs={"epub:type": regex.compile("^.*noteref.*$")})
        if endnote_ref:
            endnote_ref.extract()

        # Decide whether to remove subheadings based on the following logic:
        # If the closest parent <section> is a part or division, then keep subtitle
        # Else, if the closest parent <section> is a halftitlepage, then discard subtitle
        # Else, if the first child of the heading is not z3998:roman, then also discard subtitle
        # Else, keep the subtitle.
        heading_subtitle = match.find(attrs={"epub:type": regex.compile("^.*subtitle.*$")})

        if heading_subtitle:
            # If an <h#> tag has a subtitle, the non-subtitle text must also be wrapped in a <span>.
            # find() would return only the first direct text node (or None when there is none), so
            # inspect every direct text node instead: any that is not pure whitespace means a
            # <span> is missing somewhere.
            if any(text.strip() for text in match.find_all(text=True, recursive=False)):
                messages.append(LintMessage("<{}> tag has subtitle <span>, but first line is not wrapped in a <span>. See semantics manual for structure of headers with subtitles.".format(match.name), se.MESSAGE_TYPE_ERROR, filename))

            # OK, move on with processing headers.
            parent_section = match.find_parents("section")

            # Sometimes we might not have a parent <section>, like in Keats' Poetry
            if not parent_section:
                parent_section = match.find_parents("body")

            closest_section_epub_type = parent_section[0].get("epub:type") or ""

            # The heading may have no direct <span> child at all (for example when the subtitle
            # is nested deeper); treat that the same as a first child without an epub:type.
            first_span = match.find("span", recursive=False)
            heading_first_child_epub_type = (first_span.get("epub:type") if first_span else None) or ""

            if regex.findall(r"^.*(part|division|volume).*$", closest_section_epub_type) and not regex.findall(r"^.*se:short-story.*$", closest_section_epub_type):
                remove_subtitle = False
            elif regex.findall(r"^.*halftitlepage.*$", closest_section_epub_type):
                remove_subtitle = True
            elif not regex.findall(r"^.*z3998:roman.*$", heading_first_child_epub_type):
                remove_subtitle = True
            else:
                remove_subtitle = False

            if remove_subtitle:
                heading_subtitle.extract()

        # Collapse all runs of whitespace so the text can be compared against the ToC entry
        normalized_text = " ".join(match.get_text().split())
        headings = headings + [(normalized_text, filename)]
# Check for direct z3998:roman spans that should have their semantic pulled into the parent element
matches = regex.findall(r"<([a-z0-9]+)[^>]*?>\s*(<span epub:type=\"z3998:roman\">[^<]+?</span>)\s*</\1>", file_contents, flags=regex.DOTALL)
if matches:
messages.append(LintMessage("If <span> exists only for the z3998:roman semantic, then z3998:roman should be pulled into parent tag instead.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match[1], se.MESSAGE_TYPE_WARNING, filename, True))
# Check for "Hathi Trust" instead of "HathiTrust"
if "Hathi Trust" in file_contents:
messages.append(LintMessage("\"Hathi Trust\" should be \"HathiTrust\"", se.MESSAGE_TYPE_ERROR, filename))
# Check for uppercase letters in IDs or classes
matches = dom.select("[id],[class]")
for match in matches:
    if match.has_attr("id"):
        # NFKD-decompose first so accented capitals expose their base A-Z letter
        normalized_id = unicodedata.normalize("NFKD", match["id"])

        # Report each offending ID once, no matter how many uppercase letters it contains
        if regex.search(r"[A-Z]", normalized_id):
            messages.append(LintMessage("Uppercase ID attribute: {}. Attribute values must be all lowercase.".format(match["id"]), se.MESSAGE_TYPE_ERROR, filename))

        if regex.search(r"^[0-9]", normalized_id):
            messages.append(LintMessage("ID starting with a number is illegal XHTML: {}".format(match["id"]), se.MESSAGE_TYPE_ERROR, filename))

    if match.has_attr("class"):
        for css_class in match["class"]:
            # Likewise, one message per offending class name rather than one per uppercase letter
            if regex.search(r"[A-Z]", unicodedata.normalize("NFKD", css_class)):
                messages.append(LintMessage("Uppercase class attribute: {}. Attribute values must be all lowercase.".format(css_class), se.MESSAGE_TYPE_ERROR, filename))
matches = [x for x in dom.select("section") if not x.has_attr("id")]
if matches:
messages.append(LintMessage("<section> element without id attribute.", se.MESSAGE_TYPE_ERROR, filename))
# Check for empty title tags
if "<title/>" in file_contents or "<title></title>" in file_contents:
messages.append(LintMessage("Empty <title> tag.", se.MESSAGE_TYPE_ERROR, filename))
# Check for decimal numeric entities; the literal Unicode character should be used instead
matches = regex.findall(r"&#[0-9]+?;", file_contents)
if matches:
    # The example must be spelled as an entity, not as the character it decodes to
    messages.append(LintMessage("Illegal numeric entity (like &#913;) in file.", se.MESSAGE_TYPE_ERROR, filename))
# Check for <hr> tags before the end of a section, which is a common PG artifact
matches = regex.findall(r"<hr[^>]*?/?>\s*</section>", file_contents, flags=regex.DOTALL)
if matches:
messages.append(LintMessage("Illegal <hr/> before the end of a section.", se.MESSAGE_TYPE_ERROR, filename))
# Check for double greater-than at the end of a tag, either literal (>>) or entity-escaped (&gt;&gt;)
matches = regex.findall(r"(>>|&gt;&gt;)", file_contents)
if matches:
    messages.append(LintMessage("Tags should end with a single >.", se.MESSAGE_TYPE_WARNING, filename))
# Check for nbsp before ampersand (&amp;)
# In XHTML source a literal ampersand is written as the &amp; entity; matching a bare "&" would
# fire on the start of every other entity reference as well.
matches = regex.findall(r"[^{}]\&amp;".format(se.NO_BREAK_SPACE), file_contents)
if matches:
    messages.append(LintMessage("Required nbsp not found before &", se.MESSAGE_TYPE_WARNING, filename))

# Check for nbsp after ampersand (&amp;)
matches = regex.findall(r"\&amp;[^{}]".format(se.NO_BREAK_SPACE), file_contents)
if matches:
    messages.append(LintMessage("Required nbsp not found after &", se.MESSAGE_TYPE_WARNING, filename))
# Check for nbsp before times
matches = regex.findall(r"[0-9]+[^{}]<abbr class=\"time".format(se.NO_BREAK_SPACE), file_contents)
if matches:
messages.append(LintMessage("Required nbsp not found before <abbr class=\"time\">", se.MESSAGE_TYPE_WARNING, filename))
# Check for low-hanging misquoted fruit
matches = regex.findall(r"[A-Za-z]+[“‘]", file_contents)
if matches:
messages.append(LintMessage("Possible mis-curled quotation mark.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
# Check that times have colons and not periods
matches = regex.findall(r"[0-9]\.[0-9]+\s<abbr class=\"time", file_contents) + regex.findall(r"at [0-9]\.[0-9]+", file_contents)
if matches:
messages.append(LintMessage("Times must be separated by colons (:) not periods (.)", se.MESSAGE_TYPE_ERROR, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_ERROR, filename, True))
# Check for leading 0 in IDs
matches = regex.findall(r"id=\"[^\"]+?\-0[0-9]+[^\"]*?\"", file_contents)
if matches:
messages.append(LintMessage("Illegal leading 0 in ID attribute", se.MESSAGE_TYPE_ERROR, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_ERROR, filename, True))
# Check for stage direction that ends in ?! but also has a trailing period
matches = regex.findall(r"<i epub:type=\"z3998:stage-direction\">(?:(?!<i).)*?\.</i>[,:;!?]", file_contents)
if matches:
messages.append(LintMessage("Stage direction ending in period next to other punctuation. Remove trailing periods in stage direction.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
# Check for ending punctuation inside italics
matches = regex.findall(r"(<([ib]) epub:type=\"se:[^\"]+?\">[^<]+?[\.,\!\?]</\2>)", file_contents)
if matches:
messages.append(LintMessage("Ending punctuation inside italics.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match[0], se.MESSAGE_TYPE_WARNING, filename, True))
# Check for money not separated by commas
matches = regex.findall(r"[£\$][0-9]{4,}", file_contents)
if matches:
messages.append(LintMessage("Numbers not grouped by commas. Separate numbers greater than 1,000 with commas at every three numerals.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
# Check for trailing commas inside <i> tags at the close of dialog
if ",</i>”" in file_contents:
messages.append(LintMessage("Comma inside <i> tag before closing dialog. (Search for ,</i>”)", se.MESSAGE_TYPE_WARNING, filename))
# Check for period following Roman numeral, which is an old-timey style we must fix
# But ignore the numeral if it's the first item in a <p> tag, as that suggests it might be a kind of list item.
matches = regex.findall(r"(?<!<p[^>]*?>)<span epub:type=\"z3998:roman\">[^<]+?</span>\.\s+[a-z]", file_contents)
if matches:
messages.append(LintMessage("Roman numeral followed by a period. When in mid-sentence Roman numerals must not be followed by a period.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
# Check for two em dashes in a row
matches = regex.findall(r"—{}*—+".format(se.WORD_JOINER), file_contents)
if matches:
messages.append(LintMessage("Two or more em-dashes in a row detected. Elided words should use the two- or three-em-dash Unicode character, and dialog ending in em-dashes should only end in a single em-dash.", se.MESSAGE_TYPE_ERROR, filename))
# Check for <abbr class="name"> that does not contain spaces
matches = regex.findall(r"<abbr class=\"name\">[^<]*?[A-Z]\.[A-Z]\.[^<]*?</abbr>", file_contents)
if matches:
messages.append(LintMessage("Initials in <abbr class=\"name\"> not separated by spaces.", se.MESSAGE_TYPE_ERROR, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_ERROR, filename, True))
# Check for empty <h2> missing epub:type="title" attribute
if "<h2>" in file_contents:
messages.append(LintMessage("<h2> tag without epub:type=\"title\" attribute.", se.MESSAGE_TYPE_WARNING, filename))
# Check for a common typo
if "z3998:nonfiction" in file_contents:
messages.append(LintMessage("Typo: z3998:nonfiction should be z3998:non-fiction", se.MESSAGE_TYPE_ERROR, filename))
# Check for empty <p> tags
matches = regex.findall(r"<p>\s*</p>", file_contents)
if "<p/>" in file_contents or matches:
messages.append(LintMessage("Empty <p> tag. Use <hr/> for scene breaks if appropriate.", se.MESSAGE_TYPE_ERROR, filename))
# Check for <p> tags that end with <br/>
matches = regex.findall(r"(\s*<br/?>\s*)+</p>", file_contents)
if matches:
messages.append(LintMessage("<br/> tag found before closing </p> tag.", se.MESSAGE_TYPE_ERROR, filename))
# Check for single words that are in italics, but that have closing punctuation outside italics
# Outer wrapping match is so that .findall returns the entire match and not the subgroup
# The first regex also matches the first few characters before the first double quote; we use those for more sophisticated
# checks below, to give fewer false positives like `with its downy red hairs and its “<i xml:lang="fr">doigts de faune</i>.”`
matches = regex.findall(r"((?:.{1,2}\s)?“<(i|em)[^>]*?>[^<]+?</\2>[\!\?\.])", file_contents) + regex.findall(r"([\.\!\?] <(i|em)[^>]*?>[^<]+?</\2>[\!\?\.])", file_contents)
# But, if we've matched a name of something, don't include that as an error. For example, `He said, “<i epub:type="se:name.publication.book">The Decameron</i>.”`
# We also exclude the match from the list if:
# 1. The double quote is directly preceded by a lowercase letter and a space: `with its downy red hairs and its “<i xml:lang="fr">doigts de faune</i>.”`
# 2. The double quote is directly preceded by a lowercase letter, a comma, and a space, and the first letter within the double quote is lowercase: In the original, “<i xml:lang="es">que era un Conde de Irlos</i>.”
matches = [x for x in matches if "epub:type=\"se:name." not in x[0] and "epub:type=\"z3998:taxonomy" not in x[0] and not regex.match(r"^[a-z’]+\s“", x[0]) and not regex.match(r"^[a-z’]+,\s“[a-z]", se.formatting.remove_tags(x[0]))]
if matches:
messages.append(LintMessage("When a complete clause is italicized, ending punctuation EXCEPT commas must be within containing italics.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match[0], se.MESSAGE_TYPE_WARNING, filename, True))
# Check for foreign phrases with italics going *outside* quotes
matches = regex.findall(r"<i[^>]*?>“.+?\b", file_contents) + regex.findall(r"”</i>", file_contents)
if matches:
messages.append(LintMessage("When italicizing language in dialog, italics go INSIDE quotation marks.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
# Check for style attributes
matches = regex.findall(r"<.+?style=\"", file_contents)
if matches:
messages.append(LintMessage("Illegal style attribute. Do not use inline styles, any element can be targeted with a clever enough selector.", se.MESSAGE_TYPE_ERROR, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_ERROR, filename, True))
# Check for uppercase HTML tags
if regex.findall(r"<[A-Z]+", file_contents):
messages.append(LintMessage("One or more uppercase HTML tags.", se.MESSAGE_TYPE_ERROR, filename))
# Check for nbsp within <abbr class="name">, which is redundant
matches = regex.findall(r"<abbr[^>]+?class=\"name\"[^>]*?>[^<]*?{}[^<]*?</abbr>".format(se.NO_BREAK_SPACE), file_contents)
if matches:
messages.append(LintMessage("No-break space detected in <abbr class=\"name\">. This is redundant.", se.MESSAGE_TYPE_ERROR, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_ERROR, filename, True))
# Check for Roman numerals in <title> tag
if regex.findall(r"<title>[Cc]hapter [XxIiVv]+", file_contents):
messages.append(LintMessage("No Roman numerals allowed in <title> tag; use decimal numbers.", se.MESSAGE_TYPE_ERROR, filename))
# If the chapter has a number and no subtitle, check the <title> tag...
matches = regex.findall(r"<h([0-6]) epub:type=\"title z3998:roman\">([^<]+)</h\1>", file_contents, flags=regex.DOTALL)
# ...But only make the correction if there's one <h#> tag. If there's more than one, then the xhtml file probably requires an overarching title
if matches and len(regex.findall(r"<h(?:[0-6])", file_contents)) == 1:
try:
chapter_number = roman.fromRoman(matches[0][1].upper())
regex_string = r"<title>(Chapter|Section|Part) {}".format(chapter_number)
if not regex.findall(regex_string, file_contents):
messages.append(LintMessage("<title> tag doesn't match expected value; should be \"Chapter {}\". (Beware hidden Unicode characters!)".format(chapter_number), se.MESSAGE_TYPE_ERROR, filename))
except Exception:
messages.append(LintMessage("<h#> tag is marked with z3998:roman, but is not a Roman numeral", se.MESSAGE_TYPE_ERROR, filename))
# If the chapter has a number and subtitle, check the <title> tag...
matches = regex.findall(r"<h([0-6]) epub:type=\"title\">\s*<span epub:type=\"z3998:roman\">([^<]+)</span>\s*<span epub:type=\"subtitle\">(.+?)</span>\s*</h\1>", file_contents, flags=regex.DOTALL)

# ...But only make the correction if there's one <h#> tag. If there's more than one, then the xhtml file probably requires an overarching title
if matches and len(regex.findall(r"<h(?:[0-6])", file_contents)) == 1:
    try:
        chapter_number = roman.fromRoman(matches[0][1].upper())
    except roman.InvalidRomanNumeralError:
        # Report the bad numeral as a lint error instead of aborting the whole lint run
        messages.append(LintMessage("<h#> tag contains a <span> marked with z3998:roman, but it is not a Roman numeral", se.MESSAGE_TYPE_ERROR, filename))
    else:
        # First, remove endnotes in the subtitle, then remove all other tags (but not tag contents)
        chapter_title = regex.sub(r"<a[^<]+?epub:type=\"noteref\"[^<]*?>[^<]+?</a>", "", matches[0][2]).strip()
        chapter_title = regex.sub(r"<[^<]+?>", "", chapter_title)

        # The subtitle is arbitrary text, so escape it before embedding it in the pattern
        regex_string = r"<title>(Chapter|Section|Part) {}: {}".format(chapter_number, regex.escape(chapter_title))
        if not regex.findall(regex_string, file_contents):
            messages.append(LintMessage("<title> tag doesn't match expected value; should be \"Chapter {}: {}\". (Beware hidden Unicode characters!)".format(chapter_number, chapter_title), se.MESSAGE_TYPE_ERROR, filename))
# Check for missing subtitle styling
if "epub:type=\"subtitle\"" in file_contents and not local_css_has_subtitle_style:
messages.append(LintMessage("Subtitles detected, but no subtitle style detected in local.css.", se.MESSAGE_TYPE_ERROR, filename))
# Check for whitespace before noteref
matches = regex.findall(r"\s+<a href=\"endnotes\.xhtml#note-[0-9]+?\" id=\"noteref-[0-9]+?\" epub:type=\"noteref\">[0-9]+?</a>", file_contents)
if matches:
    messages.append(LintMessage("Illegal white space before noteref.", se.MESSAGE_TYPE_ERROR, filename))
    # Detail lines carry the same severity as the message they belong to
    for match in matches:
        messages.append(LintMessage(match, se.MESSAGE_TYPE_ERROR, filename, True))
# Check for <li> elements that don't have a direct block child
if filename != "toc.xhtml":
matches = regex.findall(r"<li(?:\s[^>]*?>|>)\s*[^\s<]", file_contents)
if matches:
messages.append(LintMessage("<li> without direct block-level child.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
# Check for IDs on <h#> tags
matches = regex.findall(r"<h[0-6][^>]*?id=[^>]*?>", file_contents, flags=regex.DOTALL)
if matches:
messages.append(LintMessage("<h#> tag with id attribute. <h#> tags should be wrapped in <section> tags, which should hold the id attribute.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
# Check to see if <h#> tags are correctly titlecased
matches = regex.finditer(r"<h([0-6])([^>]*?)>(.*?)</h\1>", file_contents, flags=regex.DOTALL)
for match in matches:
if "z3998:roman" not in match.group(2):
title = match.group(3).strip()
# Remove leading roman numerals first
title = regex.sub(r"^<span epub:type=\"[^\"]*?z3998:roman[^\"]*?\">(.*?)</span>", "", title, flags=regex.DOTALL)
# Remove leading leftover spacing and punctuation
title = regex.sub(r"^[\s\.\,\!\?\:\;]*", "", title)
# Remove endnotes
title = regex.sub(r"<a[^>]*?epub:type=\"noteref\"[^>]*?>[0-9]+</a>", "", title)
# Normalize whitespace
title = regex.sub(r"\s+", " ", title, flags=regex.DOTALL).strip()
# Remove nested <span>s in subtitles, which might trip up the next regex block
title = regex.sub(r"(<span epub:type=\"subtitle\">[^<]*?)<span[^>]*?>([^<]*?</span>)", r"\1\2", title, flags=regex.DOTALL)
title = regex.sub(r"(<span epub:type=\"subtitle\">[^<]*?)</span>([^<]*?</span>)", r"\1\2", title, flags=regex.DOTALL)
# Do we have a subtitle? If so the first letter of that must be capitalized, so we pull that out
subtitle_matches = regex.findall(r"(.*?)<span epub:type=\"subtitle\">(.*?)</span>(.*?)", title, flags=regex.DOTALL)
if subtitle_matches:
for title_header, subtitle, title_footer in subtitle_matches:
title_header = se.formatting.titlecase(se.formatting.remove_tags(title_header).strip())
subtitle = se.formatting.titlecase(se.formatting.remove_tags(subtitle).strip())
title_footer = se.formatting.titlecase(se.formatting.remove_tags(title_footer).strip())
titlecased_title = title_header + " " + subtitle + " " + title_footer
titlecased_title = titlecased_title.strip()
title = se.formatting.remove_tags(title).strip()
if title != titlecased_title:
messages.append(LintMessage("Title \"{}\" not correctly titlecased. Expected: {}".format(title, titlecased_title), se.MESSAGE_TYPE_WARNING, filename))
# No subtitle? Much more straightforward
else:
titlecased_title = se.formatting.remove_tags(se.formatting.titlecase(title))
title = se.formatting.remove_tags(title)
if title != titlecased_title:
messages.append(LintMessage("Title \"{}\" not correctly titlecased. Expected: {}".format(title, titlecased_title), se.MESSAGE_TYPE_WARNING, filename))
# Check for <img> tags that carry an id attribute; the id belongs on the parent <figure> tag instead
matches = regex.findall(r"<img[^>]*?id=\"[^>]+?>", file_contents)
if matches:
messages.append(LintMessage("<img> tag with ID attribute. ID attributes go on parent <figure> tags.", se.MESSAGE_TYPE_ERROR, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_ERROR, filename, True))
# Check for closing dialog without comma
matches = regex.findall(r"[a-z]+?” [a-zA-Z]+? said", file_contents)
if matches:
messages.append(LintMessage("Dialog without ending comma.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
# Check for non-typogrified img alt attributes
# A straight double quote inside an attribute value appears in the source as the &quot; entity
matches = regex.findall(r"alt=\"[^\"]*?('|--|&quot;)[^\"]*?\"", file_contents)
if matches:
    messages.append(LintMessage("Non-typogrified ', \" (as &quot;), or -- in image alt attribute.", se.MESSAGE_TYPE_ERROR, filename))
# Check alt attributes not ending in punctuation
if filename not in se.IGNORED_FILENAMES:
matches = regex.findall(r"alt=\"[^\"]*?[a-zA-Z]\"", file_contents)
if matches:
messages.append(LintMessage("Alt attribute doesn't appear to end with punctuation. Alt attributes must be composed of complete sentences ending in appropriate punctuation.", se.MESSAGE_TYPE_ERROR, filename))
# Check alt attributes match image titles
images = dom.select("img[src$=svg]")
for image in images:
    # An <img> without an alt attribute is treated like an empty alt: there is nothing to compare
    alt_text = image.get("alt", "")
    title_text = ""
    # Only the file name is used; the image is looked up in the ebook's images directory
    image_ref = image["src"].split("/").pop()
    try:
        with open(self.path / "src" / "epub" / "images" / image_ref, "r", encoding="utf-8") as image_source:
            try:
                title_text = BeautifulSoup(image_source, "lxml").title.get_text()
            except Exception:
                messages.append(LintMessage("{} missing <title> element.".format(image_ref), se.MESSAGE_TYPE_ERROR, image_ref))
        # Compare only when both texts are present; a missing title was already reported above
        if title_text != "" and alt_text != "" and title_text != alt_text:
            messages.append(LintMessage("The <title> of {} doesn’t match the alt text in {}".format(image_ref, filename), se.MESSAGE_TYPE_ERROR, filename))
    except FileNotFoundError:
        messages.append(LintMessage("The image {} doesn’t exist".format(image_ref), se.MESSAGE_TYPE_ERROR, filename))
# Check for punctuation after endnotes
# A noteref may be followed only by whitespace, another tag, an en or em dash, a closing bracket or parenthesis, or a word joiner
regex_string = r"<a[^>]*?epub:type=\"noteref\"[^>]*?>[0-9]+</a>[^\s<–\]\)—{}]".format(se.WORD_JOINER)
matches = regex.findall(regex_string, file_contents)
if matches:
    messages.append(LintMessage("Endnote links must be outside of punctuation, including quotation marks.", se.MESSAGE_TYPE_WARNING, filename))
    # Detail lines carry the same severity as the message they belong to
    for match in matches:
        messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
# Check for nbsp in measurements, for example: 90 mm
matches = regex.findall(r"[0-9]+[\- ][mck][mgl]\b", file_contents)
if matches:
messages.append(LintMessage("Measurements must be separated by a no-break space, not a dash or regular space.", se.MESSAGE_TYPE_ERROR, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_ERROR, filename, True))
# Check for line breaks after <br/> tags
matches = regex.findall(r"<br\s*?/>[^\n]", file_contents)
if matches:
messages.append(LintMessage("<br/> tags must be followed by a newline, and subsequent content must be indented to the same level.", se.MESSAGE_TYPE_ERROR, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_ERROR, filename, True))
# Check for <pre> tags
if "<pre" in file_contents:
messages.append(LintMessage("Illegal <pre> tag.", se.MESSAGE_TYPE_ERROR, filename))
# Check for double spacing
regex_string = r"[{}{} ]{{2,}}".format(se.NO_BREAK_SPACE, se.HAIR_SPACE)
matches = regex.findall(regex_string, file_contents)
if matches:
messages.append(LintMessage("Double spacing detected in file. Sentences should be single-spaced. (Note that double spaces might include Unicode no-break spaces!)", se.MESSAGE_TYPE_ERROR, filename))
# Check for punctuation outside quotes. We don't check single quotes because contractions are too common.
matches = regex.findall(r"[a-zA-Z][”][,.]", file_contents)
if matches:
messages.append(LintMessage("Comma or period outside of double quote. Generally punctuation should go within single and double quotes.", se.MESSAGE_TYPE_WARNING, filename))
# Did someone use colons instead of dots for SE identifiers? e.g. se:name:vessel:ship
matches = regex.findall(r"\bse:[a-z]+:(?:[a-z]+:?)*", file_contents)
if matches:
messages.append(LintMessage("Illegal colon (:) detected in SE identifier. SE identifiers are separated by dots (.) not colons (:). Identifier: {}".format(matches), se.MESSAGE_TYPE_ERROR, filename))
# Check for leftover asterisms
matches = regex.findall(r"\*\s*(\*\s*)+", file_contents)
if matches:
messages.append(LintMessage("Illegal asterism (***) detected. Section/scene breaks must be defined by an <hr/> tag.", se.MESSAGE_TYPE_ERROR, filename))
# The checks below apply only to the endnotes file
if filename == "endnotes.xhtml":
# Do we have to replace Ibid.?
matches = regex.findall(r"\bibid\b", file_contents, flags=regex.IGNORECASE)
if matches:
messages.append(LintMessage("Illegal \"Ibid\" in endnotes. \"Ibid\" means \"The previous reference\" which is meaningless with popup endnotes, and must be replaced by the actual thing \"Ibid\" refers to.", se.MESSAGE_TYPE_ERROR, filename))
# Check for space before endnote backlinks: collect every backlink that is not preceded by
# exactly one space, or by a newline when all of its previous siblings are elements
endnote_referrers = dom.select("li[id^=note-] a")
bad_referrers = []
for referrer in endnote_referrers:
# We check against the attr value here because I couldn't figure out how to select an XML-namespaced attribute using BS4
if "epub:type" in referrer.attrs and referrer.attrs["epub:type"] == "backlink":
# is_first_sib is True only for the first sibling yielded by previous_siblings
is_first_sib = True
for sib in referrer.previous_siblings:
if is_first_sib:
is_first_sib = False
if isinstance(sib, NavigableString):
if sib == "\n": # Referrer preceded by newline. Check if all previous sibs are tags.
continue
elif sib == " " or str(sib) == se.NO_BREAK_SPACE or regex.search(r"[^\s] $", str(sib)): # Referrer preceded by a single space; we're OK
break
else: # Referrer preceded by a string that is not a newline and does not end with a single space
bad_referrers.append(referrer)
break
else:
# We got here because the first sib was a newline, or not a string. So, check all previous sibs.
if isinstance(sib, NavigableString) and sib != "\n":
bad_referrers.append(referrer)
break
# Report all offending backlinks under a single warning
if bad_referrers:
messages.append(LintMessage("Endnote referrer link not preceded by exactly one space, or a newline if all previous siblings are elements.", se.MESSAGE_TYPE_WARNING, filename))
for referrer in bad_referrers:
messages.append(LintMessage(str(referrer), se.MESSAGE_TYPE_WARNING, filename, True))
# If we're in the imprint, are the sources represented correctly?
# We don't have a standard yet for more than two sources (transcription and scan) so just ignore that case for now.
if filename == "imprint.xhtml":