-
Notifications
You must be signed in to change notification settings - Fork 0
/
anemone.py
766 lines (726 loc) · 57.3 KB
/
anemone.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
#!/usr/bin/env python3
"""
Anemone 1.42 (http://ssb22.user.srcf.net/anemone)
(c) 2023-24 Silas S. Brown. License: Apache 2
Run program with --help for usage instructions.
"""
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Where to find history:
# on GitHub at https://github.com/ssb22/indexer
# and on GitLab at https://gitlab.com/ssb22/indexer
# and on BitBucket https://bitbucket.org/ssb22/indexer
# and at https://gitlab.developers.cam.ac.uk/ssb22/indexer
# and in China: https://gitee.com/ssb22/indexer
def anemone(*files,**options):
    """Entry point for scripts that import anemone:
    pass the equivalent of the command line in
    'files' and 'options'.  A JSON dictionary may
    be given in place of a JSON filename, and an
    HTML string in place of an HTML filename (this
    can also be done on the command line with
    careful quoting).  With no arguments at all,
    the system command line is read instead.
    Returns a list of any warnings generated."""
    # Serialise dict arguments so Run sees strings only
    fileArgs = [json.dumps(f) if type(f)==dict else f for f in files]
    R = Run(*fileArgs,**options)
    if R.mp3_recode:
        check_we_got_LAME() # fail early if LAME is absent
    write_all(R,get_texts(R))
    R.cleanup()
    return R.warnings
def populate_argument_parser(args): # INTERNAL
    """Calls add_argument on args, with the names
    of all the command-line options, which are
    also options for anemone(), and help text."""
    # TODO: could also run this with an object that takes the help text we give it and puts it into the module documentation?
    # Positional arguments: a mixed bag of input files (and the single output zip)
    args.add_argument("files",metavar="file",nargs="+",help="file name of: an MP3 recording, a text file containing its title (if no full text), a JSON file containing its time markers, an XHTML file containing its full text, or the output ZIP file. Only one output file may be specified, but any number of the other files can be included; URLs may be given if they are to be fetched (HTML assumed if no extension). If only MP3 files are given then titles are taken from their filenames. You may also specify @filename where filename contains a list of files one per line.")
    # Publication metadata
    args.add_argument("--lang",default="en",help="the ISO 639 language code of the publication (defaults to en for English)")
    args.add_argument("--title",default="",help="the title of the publication")
    args.add_argument("--url",default="",help="the URL or ISBN of the publication")
    args.add_argument("--creator",default="",help="the creator name, if known")
    args.add_argument("--publisher",default="",help="the publisher name, if known")
    args.add_argument("--reader",default="",help="the name of the reader who voiced the recordings, if known")
    args.add_argument("--date",help="the publication date as YYYY-MM-DD, default is current date")
    # How to interpret the input HTML
    args.add_argument("--marker-attribute",default="data-pid",help="the attribute used in the HTML to indicate a segment number corresponding to a JSON time marker entry, default is data-pid")
    args.add_argument("--page-attribute",default="data-no",help="the attribute used in the HTML to indicate a page number, default is data-no")
    args.add_argument("--image-attribute",default="data-zoom",help="the attribute used in the HTML to indicate an absolute image URL to be included in the DAISY file, default is data-zoom")
    # URL-fetching behaviour
    args.add_argument("--refresh",action="store_true",help="if images etc have already been fetched from URLs, ask the server if they should be fetched again (use If-Modified-Since)")
    args.add_argument("--cache",default="cache",help="path name for the URL-fetching cache (default 'cache' in the current directory; set to empty string if you don't want to save anything)")
    args.add_argument("--reload",dest="refetch",action="store_true",help="if images etc have already been fetched from URLs, fetch them again without If-Modified-Since")
    args.add_argument("--delay",default=0,help="minimum number of seconds between URL fetches (default none)")
    args.add_argument("--user-agent",default=f"Mozilla/5.0 (compatible, {' '.join(generator.split()[:2])})",help="User-Agent string to send for URL fetches")
    # Output format options
    args.add_argument("--daisy3",action="store_true",help="Use the Daisy 3 format (ANSI/NISO Z39.86) instead of the Daisy 2.02 format. This may require more modern reader software, and Anemone does not yet support Daisy 3 only features like tables in the text.")
    args.add_argument("--mp3-recode",action="store_true",help="re-code the MP3 files to ensure they are constant bitrate and more likely to work with the more limited DAISY-reading programs like FSReader 3 (this option requires LAME)")
    args.add_argument("--allow-jumps",action="store_true",help="Allow jumps in heading levels e.g. h1 to h3 if the input HTML does it. This seems OK on modern readers but might cause older reading devices to give an error. Without this option, headings are promoted where necessary to ensure only incremental depth increase.") # might cause older reading devices to give an error: and is also flagged up by the validator
    args.add_argument("--strict-ncc-divs",action="store_true",help="When generating Daisy 2, avoid using a heading in the navigation control centre when there isn't a heading in the text. This currently applies when spans with verse numbering are detected. Turning on this option will make the DAISY more conformant to the specification, but some readers (EasyReader 10, Thorium) won't show these headings in the navigation in Daisy 2 (but will show them anyway in Daisy 3, so this option is applied automatically in Daisy 3). On the other hand, when using verse-numbered spans without this option, EasyReader 10 may not show any text at all in Daisy 2 (Anemone will warn if this is the case). This setting cannot stop EasyReader promoting all verses to headings (losing paragraph formatting) in Daisy 3, which is the least bad option if you want these navigation points to work.")
    args.add_argument("--merge-books",default="",help="Combine multiple books into one, for saving media on CD-based DAISY players that cannot handle more than one book. The format of this option is book1/N1,book2/N2,etc where book1 is the book title and N1 is the number of MP3 files to group into it. All headings are pushed down one level and book name headings are added at top level.")
    args.add_argument("--chapter-titles",default="",help="Comma-separated list of titles to use for chapters that don't have titles, e.g. 'Chapter N' in the language of the book (this can help for search-based navigation)")
    args.add_argument("--dry-run",action="store_true",help="Don't actually output DAISY, just check the input and parameters")
generator=__doc__.strip().split('\n')[0] # string we use to identify ourselves in HTTP requests and in Daisy files
def get_argument_parser(): # INTERNAL
    """Creates and populates our argument parser"""
    from argparse import ArgumentParser
    parser = ArgumentParser(
        prog="anemone",
        description=generator,
        fromfile_prefix_chars='@') # '@file' reads extra args from file
    populate_argument_parser(parser)
    return parser
import time, sys, os, re, json, math, tempfile
from collections import namedtuple as NT
from functools import reduce
from subprocess import run, PIPE
from zipfile import ZipFile, ZIP_DEFLATED
from html.parser import HTMLParser
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count
from urllib.request import urlopen,Request
from urllib.error import HTTPError
from urllib.parse import unquote
from pathlib import Path # Python 3.5+
from shutil import which
def error(m): # INTERNAL
    """Main error handler.  When running as an
    application, print the message and exit with
    status 1; when imported as a module, raise
    AnemoneError instead so callers can catch it."""
    if __name__=="__main__":
        sys.stderr.write(f"Error: {m}\n")
        sys.exit(1)
    raise AnemoneError(str(m))
class AnemoneError(Exception): pass
try: from mutagen.mp3 import MP3
except ImportError: error("Anemone needs the Mutagen library to determine MP3 play lengths.\nPlease do: pip install mutagen")
from mutagen.wave import WAVE
class Run(): # INTERNAL
    """The parameters we need for an Anemone run.
    Constructor can either parse args from the
    command line, or from anemone() caller."""
    def __init__(R,*inFiles,**kwargs):
        # I know the convention is "self" but I am
        # working in giant print so my screen area
        # is important, so 1 letter please...
        R.recordingFiles,R.jsonData = [],[]
        R.textFiles,R.htmlData = [],[]
        R.imageFiles,R.outputFile = [],None
        R.filesToDelete = [] ; R.warnings = []
        # If called from anemone(), turn the kwargs into command-line
        # flags so argparse can validate them: flag=True becomes --flag,
        # anything else becomes "--option value".
        if inFiles: R.__dict__.update(get_argument_parser().parse_args(list(inFiles)+['--'+k.replace('_','-') for k,v in kwargs.items() if v==True]+[a for k,v in kwargs.items() for a in ['--'+k.replace('_','-'),str(v)] if not v==True and not v==False and not v==None]).__dict__) # so flag=False is ignored (as we default to False), and option=None means use the default
        else: R.__dict__.update(get_argument_parser().parse_args().__dict__)
        # Classify each input by extension / shape:
        for f in R.files:
            f = f.strip()
            if f.lower().endswith(f"{os.extsep}zip"):
                if R.outputFile: error(f"Only one {os.extsep}zip output file may be specified")
                R.outputFile = f ; continue
            if re.match("https?://",f):
                f=fetch(f,True,R.cache,R.refresh,R.refetch,R.delay,R.user_agent)
                if not R.cache:
                    R.filesToDelete.append(f)
            if f.startswith('{') and f.endswith('}'):
                R.jsonData.append(json.loads(f))
                continue # don't treat as a file
            elif f.startswith('<') and f.endswith('>'):
                R.htmlData.append(f) ; continue
            elif not os.path.exists(f): error(f"File not found: {f}")
            if f.lower().endswith(f"{os.extsep}mp3") or f.lower().endswith(f"{os.extsep}wav"):
                if f.endswith(f"{os.extsep}wav") and not R.mp3_recode: error("wav input requires mp3 recode to be set")
                R.recordingFiles.append(f)
            elif f.lower().endswith(f"{os.extsep}json"): R.jsonData.append(json.load(open(f,encoding="utf-8")))
            elif f.lower().endswith(f"{os.extsep}txt"):
                R.textFiles.append(f)
            elif f.lower().endswith(f"{os.extsep}html") or not os.extsep in f.rsplit(os.sep,1)[-1]:
                R.htmlData.append(open(f,encoding="utf-8").read())
            else: error(f"Can't handle '{f}'")
        # Sanity-check the combination of inputs we ended up with.
        if not R.recordingFiles: error("Creating DAISY files without audio is not yet implemented")
        if R.htmlData and not R.jsonData: error("Full text without time markers is not yet implemented")
        if R.jsonData and not R.htmlData: error("Time markers without full text is not implemented")
        if R.htmlData and R.textFiles: error("Combining full text with title-only text files is not yet implemented. Please specify full text for everything or just titles for everything, not both.")
        # (bug fix: the next three messages used to contain a stray literal
        # 'f' before each brace, printing e.g. "We got f3 JSON files")
        if R.jsonData and not len(R.recordingFiles)==len(R.jsonData): error(f"If JSON marker files are specified, there must be exactly one JSON file for each recording file. We got {len(R.jsonData)} JSON files and {len(R.recordingFiles)} recording files.")
        if R.textFiles and not len(R.recordingFiles)==len(R.textFiles): error(f"If text files are specified, there must be exactly one text file for each recording file. We got {len(R.textFiles)} text files and {len(R.recordingFiles)} recording files.")
        if R.htmlData and not len(R.recordingFiles)==len(R.htmlData): error(f"If HTML documents are specified, there must be exactly one HTML document for each recording. We got {len(R.htmlData)} HTML documents and {len(R.recordingFiles)} recordings.")
        if not R.outputFile: R.outputFile=f"output_daisy{os.extsep}zip"
        if not R.title: R.title=R.outputFile.replace(f"{os.extsep}zip","").replace("_daisy","")
    def warning(R,warningText):
        """Record a warning (returned by anemone()) and echo it to stderr."""
        R.warnings.append(warningText) ; sys.stderr.write(f"WARNING: {warningText}\n")
    def cleanup(R):
        """Delete temporary files left behind by URL fetches when caching is off."""
        for f in R.filesToDelete:
            try: Path(f).unlink()
            except: R.warning(f"couldn't delete {f}\n")
def check_we_got_LAME(): # INTERNAL
    """Ensure the LAME MP3 encoder can be found on
    PATH (also trying the usual 'Lame for Audacity'
    install folders on Windows); error if absent."""
    if not which('lame') and sys.platform=='win32':
        # the Audacity LAME installer doesn't add itself to PATH
        os.environ["PATH"] += r";C:\Program Files (x86)\Lame for Audacity;C:\Program Files\Lame for Audacity"
    if which('lame'): return
    error(f"Anemone requires the LAME program to recode MP3s.\nPlease {'run the exe installer from lame.buanzo.org' if sys.platform=='win32' else 'install lame'} and try again.")
# Small immutable record types used throughout:
PageInfo = NT('PageInfo',['duringId','pageNo']) # a page break: the paragraph index it occurs during, and its page number
TagAndText = NT('TagAndText',['tag','text']) # one text item: HTML tag name ('p', 'h1', 'span', ...) plus its markup text
TextsAndTimesWithPages = NT('TextsAndTimesWithPages',['textsAndTimes','pageInfos']) # per recording: alternating [TagAndText,time,TagAndText,...] plus its PageInfo list
ChapterTOCInfo = NT('ChapterTOCInfo',['hTag','hLine','itemNo']) # TOC entry within a chapter: heading tag, heading text, item index
BookTOCInfo = NT('BookTOCInfo',['hTag','hLine','recNo','itemNo']) # TOC entry across the whole book: adds the recording number
def get_texts(R): # INTERNAL
    """Gets the text markup required for the run,
    extracting it from HTML (guided by JSON IDs)
    if we need to do that.  Returns one entry per
    recording: a plain title string, or a
    TextsAndTimesWithPages for full-text input."""
    if R.textFiles: return [open(f,encoding="utf-8").read().strip() for f in R.textFiles] # section titles only, from text files
    elif not R.htmlData: return [r[:-len(f"{os.extsep}mp3")] for r in R.recordingFiles] # section titles only, from MP3 filenames
    recordingTexts = []
    for h,j in zip(R.htmlData,R.jsonData):
        markers = j['markers']
        want_pids = [jsonAttr(m,"id") for m in markers]
        id_to_content = {} # marker id -> (tag, [markup fragments])
        pageNos = []
        allowedInlineTags=['br'] # Dolphin EasyReader does not render <strong> and <em>, and constructs like "(<em>Publication name</em>" result in incorrect space after "(" so best leave it out
        assert not 'rt' in allowedInlineTags, "if allowing this, need to revise rt suppression logic" # and would have to rely on rp parens for most readers, so if a text has a LOT of ruby it could get quite unreadable
        class PidsExtractor(HTMLParser):
            def __init__(self):
                HTMLParser.__init__(self)
                self.addTo = None # fragment list of the id'd element we're inside, if any
                self.suppress = 0 # depth of <rt> (ruby text) we're skipping
                self.imgsMaybeAdd = None
                self.pageNoGoesAfter = 0
                self.theStartTag = None
            def handle_starttag(self,tag,attrs):
                tag = tagRewrite.get(tag,tag)
                attrs = dict(attrs)
                imgURL = attrs.get(R.image_attribute,None)
                if imgURL and re.match("https?://.*[.][^/]*$",imgURL) and not (self.addTo==None and self.imgsMaybeAdd==None):
                    # TODO: might want to check attrs.get("alt",""), but DAISY3 standard does not list alt as a valid attribute for img, so we'd have to put it after with br etc (changing text_htm) and we don't know if it's in the audio or not: probably best just to leave it and rely on there being a separate caption with ID if it's in the audio
                    img = f'<img src="{(R.imageFiles.index(imgURL) if imgURL in R.imageFiles else len(R.imageFiles))+1}{imgURL[imgURL.rindex("."):]}" {f"""id="i{R.imageFiles.index(imgURL) if imgURL in R.imageFiles else len(R.imageFiles)}" """ if R.daisy3 else ""}/>' # will be moved after paragraph by text_htm
                    if not imgURL in R.imageFiles:
                        R.imageFiles.append(imgURL)
                    if self.addTo==None: self.imgsMaybeAdd.append(img)
                    else: self.addTo.append(img)
                pageNo = attrs.get(R.page_attribute,None)
                if pageNo:
                    pageNos.append(PageInfo(self.pageNoGoesAfter,pageNo))
                if attrs.get(R.marker_attribute,None) in want_pids:
                    self.theStartTag = tag
                    self.tagDepth = 0
                    a = attrs[R.marker_attribute]
                    self.pageNoGoesAfter = want_pids.index(a)
                    id_to_content[a] = ((tag if re.match('h[1-6]$',tag) or tag=='span' else 'p'),[])
                    if self.imgsMaybeAdd: self.imgsMaybeAddTo += self.imgsMaybeAdd # and imgsMaybeAdd will be reset to [] when this element is closed
                    self.addTo = id_to_content[a][1]
                    return
                if tag==self.theStartTag and not tag=="p": # can nest
                    self.tagDepth += 1
                if not self.addTo==None and tag in allowedInlineTags: self.addTo.append(f'<{tag}>')
                elif not self.addTo==None and tag=='a': self.lastAStart = len(self.addTo)
                elif tag=='rt': self.suppress += 1
            def handle_endtag(self,tag):
                tag = tagRewrite.get(tag,tag)
                if self.suppress and tag=='rt': self.suppress -= 1
                elif not self.addTo==None:
                    if tag==self.theStartTag and self.tagDepth == 0:
                        self.highestImage,self.imgsMaybeAddTo, self.imgsMaybeAdd = len(R.imageFiles),self.addTo,[] # if we find any images (not in an id'd element) after the end of the id'd element, we might want to add them in with any inside it, but only if there's another id'd element after them i.e. not if they're just random decoration at the bottom of the page
                        self.addTo = None
                    elif tag in allowedInlineTags: self.addTo.append(f'</{tag}>')
                    elif tag=="a" and re.match('[!-.:-~]$',"".join(self.addTo[self.lastAStart:]).strip()): del self.addTo[self.lastAStart:] # remove single-character link, probably to footnote (we won't know if it's in the audio or not, we're not supporting jumps and the symbols might need normalising) but do allow numbers (might be paragraph numbers etc) and non-ASCII (might be single-character CJK word)
                if tag==self.theStartTag and self.tagDepth: self.tagDepth -= 1
                if tag=='html' and self.imgsMaybeAdd and hasattr(self,'highestImage'): del R.imageFiles[self.highestImage:] # do not include ones that were only in imgsMaybeAdd at the end of the page (and not also elsewhere)
            def handle_data(self,data):
                if not self.addTo==None and not self.suppress:
                    # bug fix: re-escape for XML/XHTML output (HTMLParser
                    # hands us unescaped text; the no-op replace('&','&')
                    # here was an artifact of the page-scraped source)
                    self.addTo.append(data.replace('&','&amp;').replace('<','&lt;'))
        PidsExtractor().feed(h)
        rTxt = []
        for i in range(len(markers)):
            if i: rTxt.append(parseTime(jsonAttr(markers[i],"time"))) # assume marker 0 is 0
            if want_pids[i] in id_to_content:
                tag,content = id_to_content[want_pids[i]]
                content = ''.join(content).strip()
                rTxt.append(TagAndText(tag,re.sub('( *</?br> *)+','<br />',content))) # (allow line breaks inside paragraphs, in case any in mid-"sentence", but collapse them because readers typically add extra space to each)
            else:
                R.warning(f"JSON {len(recordingTexts)+1} marker {i+1} marks paragraph ID {want_pids[i]} which is not present in HTML {len(recordingTexts)+1}. Anemone will make this a blank paragraph.")
                rTxt.append(TagAndText('p',''))
        recordingTexts.append(TextsAndTimesWithPages(rTxt,pageNos))
    return recordingTexts
# Map an incoming HTML tag name to the tag Anemone should treat it as
# (applied to both start and end tags while parsing):
tagRewrite = { # used by get_texts
    'legend':'h3', # used in fieldset
}
def jsonAttr(d,suffix):
    """Returns the value (as a string) of a
    dictionary key whose name ends with the given
    lower-case suffix (after converting names to
    lower case), after checking exactly one key
    does this.  Used for checking JSON for things
    like paragraphId if you know only that it ends
    with 'Id'.  Calls error() if zero or more than
    one key matches."""
    keys = [k for k in d.keys() if k.lower().endswith(suffix)]
    # bug fix: report the dict's actual keys, not the (always empty) match list
    if not keys: error(f"No *{suffix} in {repr(list(d.keys()))}")
    if len(keys)>1: error(f"More than one *{suffix} in {repr(keys)}")
    return str(d[keys[0]])
def parseTime(t):
    """Convert a time string to floating-point
    seconds.  Accepts plain seconds,
    minutes:seconds, or hours:minutes:seconds,
    with decimal fractions of seconds allowed."""
    fields = t.split(':')
    # rightmost field is seconds, each field left of it is worth 60x more
    return sum(float(f) * 60**i for i,f in enumerate(reversed(fields)))
def write_all(R,recordingTexts): # INTERNAL
    "each item is either 1 text for section title of whole recording, or a TextsAndTimesWithPages i.e. ([TagAndText,time,TagAndText,time,TagAndText],[PageInfo,...])"
    assert len(R.recordingFiles) == len(recordingTexts)
    headings = getHeadings(R,recordingTexts)
    # --dry-run stops here, after input validation but before any output:
    if R.dry_run: return sys.stderr.write(f"Dry run: {len(R.warnings) if R.warnings else 'no'} warning{'' if len(R.warnings)==1 else 's'} for {R.outputFile}\n")
    merge0lenSpans(recordingTexts,headings)
    hasFullText = any(type(t)==TextsAndTimesWithPages for t in recordingTexts)
    if R.mp3_recode: # parallelise lame if possible
        if not __name__=="__main__": sys.stderr.write(f"Making {R.outputFile}...\n"),sys.stderr.flush() # especially if repeatedly called, print which outputFile we're working on BEFORE the mp3s also
        executor = ThreadPoolExecutor(max_workers=cpu_count())
        recordings=[executor.submit(recodeMP3,f) for f in R.recordingFiles]
    z = ZipFile(R.outputFile,"w",ZIP_DEFLATED,False)
    def D(s): return s.replace("\n","\r\n") # in case old readers require DOS line endings
    # Explanatory note for users whose OS auto-unpacks the ZIP:
    if hasFullText: z.writestr("0000.txt",D(f"""
If you're reading this, it likely means your
operating system has unpacked the ZIP file
and is showing you its contents.  While it
is possible to extract recordings and text
this way, it is better to send the whole ZIP
to a DAISY reader so that its recordings and
text can be connected with each other.  If
you are using EasyReader on a mobile device,
close this file and navigate up a level to
find the original ZIP file so it can be sent
to EasyReader as a whole.  Some other DAISY
readers need to be pointed at the {'OPF' if R.daisy3 else 'NCC'} file
instead, or at the whole directory/folder.
""")) # TODO: message in other languages?
    # (it's iOS users that need the above, apparently. Can't DAISY have a non-ZIP extension so Apple systems don't automatically unpack it? but we do need to manually unpack if writing to a CD-ROM for old devices. Can't Apple look at some kind of embedded "don't auto-unpack this zip" request?)
    secsSoFar = 0
    durations = [] ; curP = 1 # curP = running paragraph/item counter across recordings
    for recNo in range(1,len(recordingTexts)+1):
        rTxt = recordingTexts[recNo-1]
        f = R.recordingFiles[recNo-1]
        secsThisRecording = (MP3(f) if f.lower().endswith(f"{os.extsep}mp3") else WAVE(f)).info.length
        durations.append(secsThisRecording)
        if R.mp3_recode: sys.stderr.write(f"Adding {recNo:04d}.mp3..."),sys.stderr.flush()
        # .result() blocks until the parallel LAME re-encode finishes:
        z.writestr(f"{recNo:04d}.mp3",recordings[recNo-1].result() if R.mp3_recode else open(R.recordingFiles[recNo-1],'rb').read())
        if R.mp3_recode: sys.stderr.write(" done\n")
        z.writestr(f'{recNo:04d}.smil',D(section_smil(R,recNo,secsSoFar,secsThisRecording,curP,rTxt.textsAndTimes if type(rTxt)==TextsAndTimesWithPages else rTxt)))
        z.writestr(f'{recNo:04d}.{"xml" if R.daisy3 else "htm"}',D(text_htm(R,(rTxt.textsAndTimes[::2] if type(rTxt)==TextsAndTimesWithPages else [TagAndText('h1',rTxt)]),curP)))
        secsSoFar += secsThisRecording
        curP += (1+len(rTxt.textsAndTimes)//2 if type(rTxt)==TextsAndTimesWithPages else 1)
    # Images referenced by the text, numbered from 1 with original extensions:
    for n,u in enumerate(R.imageFiles): z.writestr(f'{n+1}{u[u.rindex("."):]}',fetch(u,False,R.cache,R.refresh,R.refetch,R.delay,R.user_agent))
    if not R.date: R.date = "%d-%02d-%02d" % time.localtime()[:3]
    # Format-specific control files:
    if R.daisy3:
        z.writestr('dtbook.2005.basic.css',D(d3css))
        z.writestr('package.opf',D(package_opf(R,hasFullText,len(recordingTexts),secsSoFar)))
        z.writestr('text.res',D(textres))
    else: z.writestr('master.smil',D(master_smil(R,headings,secsSoFar)))
    z.writestr('navigation.ncx' if R.daisy3 else 'ncc.html',D(ncc_html(R,headings,hasFullText,secsSoFar,[[0]+(t.textsAndTimes if type(t)==TextsAndTimesWithPages else [t])+[durations[i]] for i,t in enumerate(recordingTexts)],[(t.pageInfos if type(t)==TextsAndTimesWithPages else []) for t in recordingTexts])))
    if not R.daisy3: z.writestr('er_book_info.xml',D(er_book_info(durations))) # not DAISY standard but EasyReader can use this
    z.close()
    sys.stderr.write(f"Wrote {R.outputFile}\n")
def getHeadings(R,recordingTexts): # INTERNAL
    """Build the navigation headings for each recording:
    returns one entry per recording, either the bare title
    string (title-only input) or a list of ChapterTOCInfo.
    May mutate recordingTexts in place to insert synthetic
    headings where a chapter has none.  Also implements the
    --merge-books and --chapter-titles options."""
    ret = []
    cvChapCount = chapNo = 0 # cvChapCount = how many chapters got chapter:verse navigation
    try: bookTitlesAndNumChaps = [(n,int(v)) for n,v in [b.split('/') for b in R.merge_books.split(',') if b]]
    except: error(f"Unable to parse merge-books={R.merge_books}")
    for t in recordingTexts:
        chapNo += 1
        if bookTitlesAndNumChaps and chapNo==bookTitlesAndNumChaps[0][1]+1:
            # current book's chapter quota used up: move on to the next book
            del bookTitlesAndNumChaps[0]
            if not bookTitlesAndNumChaps: error("merge-books did not account for all files (check the counts)")
            chapNo = 1
        if not type(t)==TextsAndTimesWithPages:
            if bookTitlesAndNumChaps and chapNo==1: error("merge-books with non-HTML not yet implemented")
            ret.append(t) ; continue # title only
        textsAndTimes,pages = t ; first = None # first = index of the first text item in textsAndTimes
        chapHeadings = []
        for v,u in enumerate(textsAndTimes):
            if type(u)==float: continue # time
            tag,text = u
            if first==None: first = v
            if not tag.startswith('h'):
                continue
            if v//2 - 1 == first//2 and not textsAndTimes[first].tag.startswith('h'): # chapter starts with non-heading followed by heading: check the non-heading for "Chapter N" etc
                nums=re.findall("[1-9][0-9]*",textsAndTimes[first].text)
                if len(nums)==1:
                    text=f"{nums[0]}: {text}" # for TOC
                    textsAndTimes[v-1] = (textsAndTimes[first-1] if first else 0) + 0.001 # for audio jump-navigation to include the "Chapter N" (TODO: option to merge the in-chapter text instead, so "Chapter N" appears as part of the heading, not scrolled past quickly? merge0lenSpans will now do this if the chapter paragraph is promoted to heading, but beware we might not want the whole of the 'chapter N' text to be part of the TOC, just the number. Thorium actually stops playing when it hits the 0-length paragraph before the heading, so promoting it might be better; trying the +0.001 for now to make timestamps not exactly equal)
            chapHeadings.append(ChapterTOCInfo(tag,re.sub('<img src.*?/>','',text),v//2))
        if not chapHeadings:
            # This'll be a problem, as master_smil and ncc_html need headings to refer to the chapter at all. (Well, ncc_html can also do it by page number if we have them, but we haven't tested all DAISY readers with page number only navigation, and what if we don't even have page numbers?)
            # So let's see if we can at least get a chapter number.
            if not first==None: nums=re.findall("[1-9][0-9]*",textsAndTimes[first].text)
            else:
                R.warning(f"Chapter {chapNo} is completely blank! (Is {'--marker-attribute' if __name__=='__main__' else 'marker_attribute'} set correctly?)")
                nums = [] ; first = 0 ; textsAndTimes.append(TagAndText('p',''))
            chapterNumberTextFull = chapterNumberText = nums[0] if len(nums)==1 and not nums[0]=="1" else str(chapNo)
            # --chapter-titles supplies replacement titles, consumed one per call:
            if R.chapter_titles:
                if ',' in R.chapter_titles: chapterNumberTextFull,R.chapter_titles = R.chapter_titles.split(',',1)
                else: chapterNumberTextFull,R.chapter_titles = R.chapter_titles, ""
            if not chapterNumberText in chapterNumberTextFull: R.warning(f"Title for chapter {chapNo} is '{chapterNumberTextFull}' which does not contain the expected '{chapterNumberText}'")
            # In EasyReader 10 on Android, unless there is at least one HEADING (not just div), navigation display is non-functional. And every heading must point to a 'real' heading in the text, otherwise EasyReader 10 will delete all the text in Daisy 2, or promote something to a heading in Daisy 3 (this is not done by Thorium Reader)
            # So let's add a "real" start-of-chapter heading before the text, with time 0.001 second (don't set it to 0 or Thorium can have issues)
            textsAndTimes.insert(first,(textsAndTimes[first-1] if first else 0)+0.001)
            textsAndTimes.insert(first,TagAndText('h1',chapterNumberTextFull)) # we'll ref this
            chapHeadings=[ChapterTOCInfo('h1',chapterNumberTextFull,first//2)] # points to our extra heading
            if textsAndTimes[first+2].text.startswith(chapterNumberText): textsAndTimes[first+2]=TagAndText(textsAndTimes[first+2].tag,textsAndTimes[first+2].text[len(chapterNumberText):].strip()) # because we just had the number as a heading, so we don't also need it repeated as 1st thing in text
            first += 2 # past the heading we added
            if first+2<len(textsAndTimes) and re.search("[1-9][0-9]*",textsAndTimes[first+2].text):
                v2 = int(re.findall("[1-9][0-9]*",textsAndTimes[first+2].text)[0]) # might not start at 2, might start at 13 or something, but does it then increase incrementally:
                if [re.findall("[1-9][0-9]*",textsAndTimes[f].text)[:1] for f in range(first+4,len(textsAndTimes),2)]==[[str(n)] for n in range(v2+1,v2+(len(textsAndTimes)-first)//2)]: # looks like we're dealing with consecutive chapter and verse numbers with no other headings, so index the verse numbers
                    v = 1
                    while v < (len(textsAndTimes)-first)//2+2:
                        lastV = v
                        while lastV < (len(textsAndTimes)-first)//2+1 and (0 if v==1 else textsAndTimes[first+2*v-3])==textsAndTimes[first+2*lastV-1]: lastV += 1 # check for a span of them sharing a time
                        chapHeadings.append(ChapterTOCInfo('div' if R.daisy3 or R.strict_ncc_divs else 'h2',f"{chapterNumberText}:{v}{'' if v==lastV else f'-{lastV}'}",first//2+v-1))
                        v = lastV + 1
                    cvChapCount += 1
        if bookTitlesAndNumChaps:
            chapHeadings=[ChapterTOCInfo(f'h{int(i.hTag[1:])+1}' if i.hTag.startswith('h') else i.hTag,i.hLine,i.itemNo) for i in chapHeadings] # add 1 to each heading level
            if chapNo==1: chapHeadings.insert(0,ChapterTOCInfo('h1',bookTitlesAndNumChaps[0][0],chapHeadings[0].itemNo)) # the book title (must point to a real heading for similar reason as above, TODO: if there's substantial text before 1st heading, we'll need to insert a heading in the text with 0.001s audio or something instead of doing this; may also need to in-place change recordingTexts adding 1 to all headings: don't do this unless inserting h1)
        ret.append(chapHeadings)
    if len(bookTitlesAndNumChaps)>1 or bookTitlesAndNumChaps and not chapNo==bookTitlesAndNumChaps[0][1]: R.warning("merge-books specified more files than given")
    if cvChapCount not in [0,len(ret)]: R.warning(f"Verse-indexed only {cvChapCount} of {len(ret)} chapters")
    if cvChapCount and not R.daisy3 and not R.strict_ncc_divs: R.warning("Verse-indexing in Daisy 2 can prevent EasyReader 10 from displaying the text: try Daisy 3 instead") # (and with strict_ncc_divs, verses are not shown in Book navigation in Daisy 2)
    return ret
def merge0lenSpans(recordingTexts,headings): # INTERNAL
    """Merge adjacent text items whose audio clip would be zero-length
    (identical start and end time) and whose tag matches the following
    item's tag, so we don't emit empty <audio> clips in the SMIL.

    Operates in place: recordingTexts is a list of per-chapter items
    (only TextsAndTimesWithPages chapters are processed); headings is
    the parallel list of ChapterTOCInfo lists, whose itemNo indices
    (and the pages' duringId indices) are shifted down to account for
    each removed item."""
    for cT,cH in zip(recordingTexts,headings):
        if not type(cT)==TextsAndTimesWithPages:
            continue # unstructured chapter: nothing to merge
        textsAndTimes,pages = cT
        # textsAndTimes appears to alternate text items (even indices) and
        # clip times (odd indices); item i's clip runs from textsAndTimes[i-1]
        # to textsAndTimes[i+1], with the start of the list counting as time 0
        i=0
        while i < len(textsAndTimes)-2:
            while i < len(textsAndTimes)-2 and type(textsAndTimes[i])==TagAndText and (0 if i==0 else textsAndTimes[i-1])==textsAndTimes[i+1] and textsAndTimes[i].tag==textsAndTimes[i+2].tag: # tag identical and 0-length
                textsAndTimes[i] = TagAndText(textsAndTimes[i].tag, f"{textsAndTimes[i].text}{' ' if textsAndTimes[i].tag=='span' else '<br>'}{textsAndTimes[i+2].text}") # new combined item
                del textsAndTimes[i+1:i+3] # old time + old item
                # One item was removed, so renumber every heading and page
                # reference that pointed beyond the merge position:
                for hI,hV in enumerate(cH):
                    if hV.itemNo > i//2: cH[hI]=ChapterTOCInfo(hV.hTag,hV.hLine,hV.itemNo-1)
                for pI,pInfo in enumerate(pages):
                    if pInfo.duringId > i//2: pages[pI]=PageInfo(pInfo.duringId-1,pInfo.pageNo)
            i += 1
def recodeMP3(f):
    """Takes an MP3 or WAV filename, re-codes it
    as suitable for DAISY (44.1kHz 64kbps mono via the
    external 'lame' binary, which must be on the PATH),
    and returns the bytes of new MP3 data for putting
    into the DAISY ZIP.  For MP3 input, a temporary
    .pcm file is written beside the input and removed
    afterwards."""
    if f.endswith("wav"): return run(["lame","--quiet",f,"-m","m","--resample","44.1","-b","64","-q","0","-o","-"],check=True,stdout=PIPE).stdout # TODO: ensure lame doesn't take any headers or images embedded in the wav (if it does, we might need first to convert to headerless pcm as below)
    # If that didn't return, we have MP3 input.
    # It seems broken players like FSReader can get timing wrong if mp3 contains
    # too many tags at the start (e.g. images).
    # eyed3 clear() won't help: it zeros out bytes without changing indices.
    # To ensure everything is removed, better decode to raw PCM and re-encode :-(
    pcm = f[:-3]+"pcm" # temporary headerless-PCM file, instead of .mp3
    # Parse frequency/channels/bits out of lame's own decode report, so the
    # re-encode below can be told the raw PCM format it is reading:
    m = re.search(b'(?s)([0-9]+) kHz, ([0-9]+).*?([0-9]+) bit',run(["lame","-t","--decode",f,"-o",pcm],check=True,stdout=PIPE,stderr=PIPE).stderr) # hope nobody disabled --decode when building LAME (is OK on lame.buanzo.org EXEs)
    if not m: error("lame did not give expected format for frequency, channels and bits output")
    mp3bytes = run(["lame","--quiet","-r","-s",m.group(1).decode('latin1')]+(['-a'] if m.group(2)==b'2' else [])+['-m','m','--bitwidth',m.group(3).decode('latin1'),pcm,"--resample","44.1","-b","64","-q","0","-o","-"],check=True,stdout=PIPE).stdout
    os.remove(pcm) ; return mp3bytes
def fetch(url,
          returnFilename=False, # if True, returns the cached filename we saved it in, if False, return the actual data
          cache = "cache", # the cache directory (None = don't save, unless returnFilename=True in which case we write a temporary file which the caller should remove)
          refresh = False, # if True, send If-Modified-Since request if we have a cached item
          refetch = False, # if True, reloads
          delay = 0, # between fetches (tracked globally)
          user_agent = None):
    """Fetches a URL, with delay and cache options
    (see comments on parameters).  Raises HTTPError on
    a non-304 HTTP failure (and remembers the code in a
    .exception file so the error repeats without a
    network round-trip on the next run)."""
    ifModSince = None
    if cache:
        # Derive a filesystem-safe cache path from the URL:
        fn = re.sub('[%&?@*#{}<>!:+`=|$]','',cache+os.sep+unquote(re.sub('.*?://','',url)).replace('/',os.sep)) # these characters need to be removed on Windows's filesystem; TODO: store original URL somewhere just in case some misguided webmaster puts two identical URLs modulo those characters??
        if fn.endswith(os.sep): fn += "index.html"
        fn = os.sep.join(f.replace('.',os.extsep) for f in fn.split(os.sep)) # just in case we're on RISC OS (not tested)
        fnExc = fn+os.extsep+"exception" # records the HTTP code of a previous failed fetch
        if os.path.exists(fn):
            if refetch: pass # ignore already dl'd
            elif refresh:
                ifModSince=os.stat(fn).st_mtime
            elif returnFilename: return fn
            else: return open(fn,'rb').read()
        elif os.path.exists(fnExc) and not refetch and not refresh: raise HTTPError("",int(open(fnExc).read()),"HTTP error on last fetch",{},None) # useful especially if a wrapper script is using our fetch() for multiple chapters and stopping on a 404
        Path(fn[:fn.rindex(os.sep)]).mkdir(parents=True,exist_ok=True)
    sys.stderr.write("Fetching "+url+"...")
    sys.stderr.flush()
    global _last_urlopen_time
    try: _last_urlopen_time
    except NameError: _last_urlopen_time = 0 # first fetch in this process
    # BUGFIX: this was min(0,...), which (a) never actually waited when part
    # of the requested delay remained, and (b) passed a negative value to
    # time.sleep (ValueError) once enough time had elapsed.  max(0,...) waits
    # for exactly the remaining portion of the inter-fetch delay.
    if delay: time.sleep(max(0,_last_urlopen_time+delay-time.time()))
    headers = {"User-Agent":user_agent} if user_agent else {}
    if ifModSince:
        t = time.gmtime(ifModSince)
        headers["If-Modified-Since"]=f"{'Mon Tue Wed Thu Fri Sat Sun'.split()[t.tm_wday]}, {t.tm_mday} {'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()[t.tm_mon-1]} {t.tm_year} {t.tm_hour:02d}:{t.tm_min:02d}:{t.tm_sec:02d} GMT"
    try: dat = urlopen(Request(url,headers=headers)).read()
    except HTTPError as e:
        _last_urlopen_time = time.time()
        if e.getcode()==304 and cache:
            sys.stderr.write(" no new data\n") # cached copy is still current
            if returnFilename: return fn
            else: return open(fn,'rb').read()
        else:
            sys.stderr.write(f"error {e.getcode()}\n")
            if cache: open(fnExc,"w").write(str(e.getcode()))
            raise
    _last_urlopen_time = time.time()
    if cache:
        open(fn,'wb').write(dat)
        sys.stderr.write(" saved\n")
    else: sys.stderr.write(" fetched\n")
    if not returnFilename: return dat
    elif cache: return fn
    # No cache directory but caller wants a filename: write a temp file
    # (caller is responsible for removing it), guessing the suffix from the URL
    ext = url.rsplit('/',1)[-1]
    if not '.' in ext: ext="html" # ../index.html
    else: ext=ext.rsplit('.',1)[-1]
    t = tempfile.NamedTemporaryFile('wb',delete=False,suffix=os.extsep+ext)
    t.write(dat) ; return t.name
def ncc_html(R, headings = [],
             hasFullText = False,
             totalSecs = 0,
             recTimeTxts = [], # including 0,tot
             pageNos=[]): # INTERNAL
    """Returns the Navigation Control Centre document:
    a Daisy 3 NCX file if R.daisy3, otherwise a Daisy
    2.02 ncc.html (both produced from the same template
    below, switching on R.daisy3 per line).
    pageNos is [[PageInfo,...],...], one list per recording.
    recTimeTxts gives per-recording clip times, used to
    suppress <audio> in navLabels whose clip would be
    zero-length."""
    numPages = sum(len(l) for l in pageNos)
    maxPageNo = max((max((int(i.pageNo) for i in PNs),default=0) for PNs in pageNos),default=0)
    # TODO: we assume all pages are 'normal' pages
    # (not 'front' pages in Roman letters etc)
    headingsR = normaliseDepth(R,HReduce(headings)) # (hType,hText,recNo,textNo)
    # One big template: the header metadata, then per-heading navPoints
    # (Daisy 3, with numDaisy3NavpointsToClose deciding how many to close)
    # or per-heading anchors interleaved with page spans (Daisy 2), then a
    # Daisy 3 pageList.  deBlank drops the lines blanked by the conditionals.
    return deBlank(f"""<?xml version="1.0" encoding="utf-8"?>
{'<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' if R.daisy3 else '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'}
<{'ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1"' if R.daisy3 else f'html lang="{R.lang}" xmlns="http://www.w3.org/1999/xhtml"'} xml:lang="{R.lang}">
<head>
{'<meta name="dtb:uid" content=""/>' if R.daisy3 else '<meta content="text/html; charset=utf-8" http-equiv="Content-type" />'}
{f'<meta name="dtb:totalPageCount" content="{numPages}" />' if R.daisy3 else ''}
{f'<meta name="dtb:maxPageNumber" content="{maxPageNo}" />' if R.daisy3 else ''}
{'' if R.daisy3 else f'<title>{R.title}</title>'}
<meta name="dc:creator" content="{R.creator}" />
<meta name="dc:date" content="{R.date}" scheme="yyyy-mm-dd" />
<meta name="dc:language" content="{R.lang}" scheme="ISO 639" />
<meta name="dc:publisher" content="{deHTML(R.publisher)}" />
<meta name="dc:title" content="{deHTML(R.title)}" />
<meta name="dc:type" content="text" />
<meta name="dc:identifier" content="{R.url}" />
<meta name="dc:format" content="{'ANSI/NISO Z39.86-2005' if R.daisy3 else 'Daisy 2.02'}" />
<meta name="ncc:narrator" content="{R.reader}" />
<meta name="ncc:producedDate" content="{R.date}" />
<meta name="{'dtb' if R.daisy3 else 'ncc'}:generator" content="{generator}" />
<meta name="ncc:charset" content="utf-8" />
<meta name="ncc:pageFront" content="0" />
<meta name="ncc:maxPageNormal" content="{maxPageNo}" />
<meta name="ncc:pageNormal" content="{numPages}" />
<meta name="ncc:pageSpecial" content="0" />
<meta name="ncc:tocItems" content="{len(headingsR)+sum(len(PNs) for PNs in pageNos)}" />
<meta name="ncc:totalTime" content="{hmsTime(totalSecs)}" />
<meta name="ncc:multimediaType" content="{"audioFullText" if hasFullText else "audioNcc"}" />
<meta name="{'dtb' if R.daisy3 else 'ncc'}:depth" content="{max(int(h.hTag[1:]) for h in headingsR if h.hTag.startswith('h'))+(1 if any(h.hTag=='div' for h in headingsR) else 0)}" />
<meta name="ncc:files" content="{2+len(headings)*(3 if hasFullText else 2)+len(R.imageFiles)}" />
</head>
{f'<docTitle><text>{R.title}</text></docTitle>' if R.daisy3 else ''}
{f'<docAuthor><text>{R.creator}</text></docAuthor>' if R.daisy3 else ''}
<{'navMap id="navMap"' if R.daisy3 else 'body'}>"""+''.join((f"""
<navPoint id="s{s+1}" class="{t.hTag}" playOrder="{s+1}">
<navLabel><text>{t.hLine}</text>{'' if recTimeTxts[t.recNo][2*t.itemNo]==recTimeTxts[t.recNo][2*t.itemNo+2] else f'''<audio src="{t.recNo+1:04d}.mp3" clipBegin="{hmsTime(recTimeTxts[t.recNo][2*t.itemNo])}" clipEnd="{hmsTime(recTimeTxts[t.recNo][2*t.itemNo+2])}"/>'''}</navLabel>
<content src="{t.recNo+1:04d}.smil#pr{t.recNo+1}.{t.itemNo}"/>
{'</navPoint>'*numDaisy3NavpointsToClose(s,headingsR)}""" if R.daisy3 else ''.join(f"""
<span class="page-normal" id="page{N}"><a href="{r+1:04d}.smil#t{r+1}.{after}">{N}</a></span>""" for r,PNs in enumerate(pageNos) for (PO,(after,N)) in enumerate(PNs) if (r,after)<=t[2:4] and (not s or (r,after)>headingsR[s-1][2:4]))+f"""
<{t.hTag} class="{'section' if s or R.allow_jumps else 'title'}" id="s{s+1}">
<a href="{t.recNo+1:04d}.smil#t{t.recNo+1}.{t.itemNo}">{t.hLine}</a>
</{t.hTag}>""") for s,t in enumerate(headingsR))+('</navMap><pageList id="page">'+''.join(f"""
<pageTarget class="pagenum" type="normal" value="{N}" id="page{N}" playOrder="{len(headingsR)+sum(len(P) for P in pageNos[:r])+PO+1}">
<navLabel><text>{N}</text></navLabel>
<content src="{r+1:04d}.smil#pr{r+1}.{after}"/>
</pageTarget>""" for r,PNs in enumerate(pageNos) for (PO,(after,N)) in enumerate(PNs))+f"""
</pageList>
</ncx>""" if R.daisy3 else """
</body>
</html>"""))
def numDaisy3NavpointsToClose(s,headingsR):
    """How many </navPoint> tags to emit after heading s of
    headingsR when writing a Daisy 3 NCX navMap.  Depth is
    taken from hN tags; 'div' navpoints (used for verse
    numbers) have no depth of their own."""
    def _depth(entry):
        # h1..h6 -> 1..6; anything else (a div navpoint) -> None
        return int(entry.hTag[1]) if entry.hTag.startswith('h') else None
    cur = _depth(headingsR[s])
    nxt = 1 if s+1 == len(headingsR) else _depth(headingsR[s+1])
    if cur == nxt: return 1 # sibling at same level (or div after div): close just this one
    if nxt is None: return 0 # never close when followed by a div
    # Collect depth digits of the headings still open at or below level nxt,
    # scanning backwards and stopping at the first shallower heading:
    openDigits = []
    for entry in reversed(headingsR[:s+1]):
        if not entry.hTag.startswith('h'): continue # divs contribute nothing
        if int(entry.hTag[1]) < nxt: break
        openDigits.append(entry.hTag[1])
    openRun = ''.join(openDigits)
    baseDepth = cur if cur else int(openRun[0]) # for a div, use the heading before it (TODO: this code assumes there will be one, which is currently true as these divs are generated only by verse numbering)
    closes = sum(1 for lvl in range(nxt,baseDepth+1) if str(lvl) in openRun)
    return closes if cur else closes+1 # a div navpoint also closes itself
def HReduce(headings): # INTERNAL
    """Convert a list of ChapterTOCInfo lists (or plain text
    strings for unstructured chapters) into one flat
    BookTOCInfo list, tagging each entry with its recording
    number.  A bare string becomes a single h1 at item 0."""
    flat = []
    for recNo, chap in enumerate(headings):
        if type(chap) == list:
            for (hType, hText, textNo) in chap:
                flat.append(BookTOCInfo(hType, hText, recNo, textNo))
        else:
            flat.append(BookTOCInfo('h1', chap, recNo, 0))
    return flat
def normaliseDepth(R,items):
    """Ensure that heading items' depth conforms to the DAISY
    standard (a heading may be at most one level deeper than
    the heading before it), in a BookTOCInfo or TagAndText
    list.  Modifies items in place and also returns it.
    No-op when R.allow_jumps is set."""
    if R.allow_jumps: return items
    curDepth = 0
    for i in range(len(items)):
        ii = items[i] # TagAndText or BookTOCInfo
        if ii[0].lower().startswith('h'):
            depth = int(ii[0][1:])
            if depth > curDepth+1:
                # Too deep a jump: clamp to one level below the previous heading.
                # BUGFIX: also track the *clamped* depth (previously curDepth was
                # set to the original depth, so e.g. h1,h3,h4 became h1,h2,h4 --
                # still a 2-level jump; now it becomes h1,h2,h3).
                depth = curDepth+1
                if type(ii)==BookTOCInfo: items[i]=BookTOCInfo(f'h{depth}',ii.hLine,ii.recNo,ii.itemNo)
                else: items[i]=TagAndText(f'h{depth}',ii.text)
            curDepth = depth
    return items
def master_smil(R,headings = [],
                totalSecs = 0): # INTERNAL
    """Compile the master smil for a DAISY file: one <ref>
    per heading, pointing into the corresponding section
    SMIL.  (Daisy 2.02 format metadata is hard-coded here;
    this file is produced for the Daisy 2 layout.)"""
    headings = HReduce(headings) # flatten per-chapter lists to one BookTOCInfo list
    return f"""<?xml version="1.0"?>
<!DOCTYPE smil PUBLIC "-//W3C//DTD SMIL 1.0//EN" "http://www.w3.org/TR/REC-smil/SMIL10.dtd">
<smil>
<head>
<meta name="dc:title" content="{deHTML(R.title)}" />
<meta name="dc:format" content="Daisy 2.02" />
<meta name="ncc:generator" content="{generator}" />
<meta name="ncc:timeInThisSmil" content="{hmsTime(totalSecs)}" />
<layout>
<region id="textView" />
</layout>
</head>
<body>"""+''.join(f"""
<ref title="{deHTML(t.hLine)}" src="{t.recNo+1:04d}.smil#t{t.recNo+1}.{t.itemNo}" id="ms_{s+1:04d}" />""" for s,t in enumerate(headings))+"""
</body>
</smil>
"""
def section_smil(R, recNo=1,
                 totalSecsSoFar=0,
                 secsThisRecording=0,
                 startP=0,
                 textsAndTimes=[]): # INTERNAL
    """Compile a section SMIL for a DAISY file (Daisy 3 or
    Daisy 2.02 flavour depending on R.daisy3): one <par>
    per text item pairing a text reference with its audio
    clip.  textsAndTimes alternates items and clip times
    (the leading 0 and trailing total are added below);
    startP offsets this recording's paragraph ids in the
    text document."""
    if not type(textsAndTimes)==list: textsAndTimes=[textsAndTimes] # allow a single item
    textsAndTimes = [0]+textsAndTimes+[secsThisRecording] # bracket with start/end times
    # Template notes: audio is omitted when a clip is 0-length
    # (textsAndTimes[i-1]==textsAndTimes[i+1]); in Daisy 3, extra text-only
    # <par>s are appended for any images referenced from the item's text.
    return deBlank(f"""<?xml version="1.0" encoding="utf-8"?>
{'<!DOCTYPE smil PUBLIC "-//NISO//DTD dtbsmil 2005-2//EN" "http://www.daisy.org/z3986/2005/dtbsmil-2005-2.dtd">' if R.daisy3 else '<!DOCTYPE smil PUBLIC "-//W3C//DTD SMIL 1.0//EN" "http://www.w3.org/TR/REC-smil/SMIL10.dtd">'}
{'<smil xmlns="http://www.w3.org/2001/SMIL20/">' if R.daisy3 else '<smil>'}
<head>
{'<meta name="dtb:uid" content=""/>' if R.daisy3 else '<meta name="dc:format" content="Daisy 2.02" />'}
<meta name="{'dtb' if R.daisy3 else 'ncc'}:generator" content="{generator}" />
<meta name="{'dtb' if R.daisy3 else 'ncc'}:totalElapsedTime" content="{hmsTime(totalSecsSoFar)}" />""" + ("" if R.daisy3 else f"""
<meta name="ncc:timeInThisSmil" content="{hmsTime(secsThisRecording)}" />
<meta name="title" content="{deHTML(textsAndTimes[1][1])}" />
<meta name="dc:title" content="{deHTML(textsAndTimes[1][1])}" />
<layout>
<region id="textView" />
</layout>""")+f"""
</head>
<body>
<seq id="sq{recNo}" dur="{hmsTime(secsThisRecording) if R.daisy3 else f'{secsThisRecording:.3f}s'}" fill="remove">"""+"".join(f"""
<par {'' if R.daisy3 else 'endsync="last" '}id="pr{recNo}.{i//2}">
<text id="t{recNo}.{i//2}" src="{recNo:04d}.{'xml' if R.daisy3 else 'htm'}#p{startP+i//2}" />
{'' if R.daisy3 or textsAndTimes[i-1]==textsAndTimes[i+1] else f'<seq id="sq{recNo}.{i//2}a">'}
{'' if textsAndTimes[i-1]==textsAndTimes[i+1] else f'''<audio src="{recNo:04d}.mp3" clip{'B' if R.daisy3 else '-b'}egin="{hmsTime(textsAndTimes[i-1]) if R.daisy3 else f'npt={textsAndTimes[i-1]:.3f}s'}" clip{'E' if R.daisy3 else '-e'}nd="{hmsTime(textsAndTimes[i+1]) if R.daisy3 else f'npt={textsAndTimes[i+1]:.3f}s'}" id="aud{recNo}.{i//2}" />'''}
{'' if R.daisy3 or textsAndTimes[i-1]==textsAndTimes[i+1] else '</seq>'}
</par>{''.join(f'<par><text id="t{recNo}.{i//2}.{j}" src="{recNo:04d}.xml#{re.sub(".*"+chr(34)+" id=.","",imageID)}"/></par>' for j,imageID in enumerate(re.findall('<img src="[^"]*" id="[^"]*',textsAndTimes[i][1]))) if R.daisy3 else ''}""" for i in range(1,len(textsAndTimes),2))+"""
</seq>
</body>
</smil>
""")
# (do not omit text with 0-length audio altogether, even in Daisy 2: unlike image tags after paragraphs, it might end up not being displayed by EasyReader etc. Omitting audio does NOT save being stopped at the beginning of the chapter when rewinding by paragraph: is this a bug or a feature?)
def deBlank(s): return re.sub("\n *\n","\n",s) # INTERNAL (see use above)
def hmsTime(secs): return f"{int(secs/3600)}:{int(secs/60)%60:02d}:{secs%60:06.3f}"
def deHTML(t):
    "Remove HTML tags from t, collapse whitespace and escape quotes so it can be included in an XML attribute"
    # BUGFIX: the quote 'escape' had degenerated into replace('"','"') (a
    # no-op, most likely an &quot; entity lost in transit), which let raw
    # double-quotes through into attribute values, producing malformed XML.
    return re.sub(r'\s+',' ',re.sub('<[^>]*>','',t)).replace('"','&quot;').strip()
def package_opf(R,hasFullText,numRecs,totalSecs): # INTERNAL
    """Make the package OPF for a DAISY 3 file: Dublin Core
    and dtb metadata, a manifest listing every MP3 / image /
    CSS / dtbook XML / SMIL file plus the NCX and resource
    file, and a spine of the SMIL files in order."""
    # Manifest id numbering: opf-1..numRecs are the MP3s, the next
    # len(R.imageFiles) ids are images, then the CSS, then the dtbook XMLs;
    # SMIL items use their zero-padded filename stem as id (matched by the
    # <itemref>s in the spine).
    return f"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE package
PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.2 Package//EN" "http://openebook.org/dtds/oeb-1.2/oebpkg12.dtd">
<package xmlns="http://openebook.org/namespaces/oeb-package/1.0/" unique-identifier="{R.url}">
<metadata>
<dc-metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:Format>ANSI/NISO Z39.86-2005</dc:Format>
<dc:Language>{R.lang}</dc:Language>
<dc:Date>{R.date}</dc:Date>
<dc:Publisher>{R.publisher}</dc:Publisher>
<dc:Title>{R.title}</dc:Title>
<dc:Identifier id="{R.url}"/>
<dc:Creator>{R.creator}</dc:Creator>
<dc:Type>text</dc:Type>
</dc-metadata>
<x-metadata>
<meta name="dtb:multimediaType" content="{"audioFullText" if hasFullText else "audioNcc"}"/>
<meta name="dtb:totalTime" content="{hmsTime(totalSecs)}"/>
<meta name="dtb:multimediaContent" content="audio,text{',image' if R.imageFiles else ''}"/>
<meta name="dtb:narrator" content="{deHTML(R.reader)}"/>
<meta name="dtb:producedDate" content="{R.date}"/>
</x-metadata>
</metadata>
<manifest>
<item href="package.opf" id="opf" media-type="text/xml"/>"""+''.join(f"""
<item href="{i:04d}.mp3" id="opf-{i}" media-type="audio/mpeg"/>""" for i in range(1,numRecs+1))+''.join(f"""
<item href="{i+1}{u[u.rindex("."):]}" id="opf-{i+numRecs+1}" media-type="image/{u[u.rindex(".")+1:].lower().replace("jpg","jpeg")}"/>""" for i,u in enumerate(R.imageFiles))+f"""
<item href="dtbook.2005.basic.css" id="opf-{len(R.imageFiles)+numRecs+1}" media-type="text/css"/>"""+''.join(f"""
<item href="{i:04d}.xml" id="opf-{i+len(R.imageFiles)+numRecs+1}" media-type="application/x-dtbook+xml"/>""" for i in range(1,numRecs+1))+''.join(f"""
<item href="{i:04d}.smil" id="{i:04d}" media-type="application/smil+xml"/>""" for i in range(1,numRecs+1))+f"""
<item href="navigation.ncx" id="ncx" media-type="application/x-dtbncx+xml"/>
<item href="text.res" id="resource" media-type="application/x-dtbresource+xml"/>
</manifest>
<spine>"""+"".join(f"""
<itemref idref="{i:04d}"/>""" for i in range(1,numRecs+1))+"""
</spine>
</package>
"""
def text_htm(R,paras,offset=0): # INTERNAL
    """Build the text document for one recording:
    Daisy 3 dtbook XML if R.daisy3, else Daisy 2 XHTML.
    paras = TagAndText list, text is xhtml i.e. & must
    already be written as &amp; etc.  offset is added to
    the per-paragraph id numbers (p0, p1, ...) so ids are
    unique across the whole book."""
    # The single expression below emits, per paragraph: Daisy 3 <levelN>
    # opening tags before headings (nesting derived from the previous
    # heading's level), <p> wrappers around runs of 'span' items, the item
    # itself with its id and (for spans) word/sentence class, any extracted
    # <img> tags re-wrapped in <imggroup> for Daisy 3, and the matching
    # </levelN> closing tags before the next heading or at the end.
    return deBlank(f"""<?xml version="1.0"{' encoding="utf-8"' if R.daisy3 else ''}?>{'<?xml-stylesheet type="text/css" href="dtbook.2005.basic.css"?>' if R.daisy3 else ''}
{'<!DOCTYPE dtbook PUBLIC "-//NISO//DTD dtbook 2005-3//EN" "http://www.daisy.org/z3986/2005/dtbook-2005-3.dtd">' if R.daisy3 else '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'}
<{'dtbook xmlns="http://www.daisy.org/z3986/2005/dtbook/" version="2005-2"' if R.daisy3 else f'html lang="{R.lang}" xmlns="http://www.w3.org/1999/xhtml"'} xml:lang="{R.lang}">
<head>
{'<meta name="dt:version" content="1.0" />' if R.daisy3 else ''}
{f'<meta name="dc:Title" content="{deHTML(R.title)}"/>' if R.daisy3 else f'<title>{R.title}</title>'}
{f'<meta name="dc:Creator" content="{deHTML(R.creator)}"/>' if R.daisy3 else ''}
{f'<meta name="dc:Publisher" content="{deHTML(R.publisher)}"/>' if R.daisy3 else ''}
{f'<meta name="dc:Date" content="{R.date}"/>' if R.daisy3 else ''}
{f'<meta name="dc:Language" content="{R.lang}" />' if R.daisy3 else ''}
{f'<meta name="dc:identifier" content="{R.url}" />' if R.daisy3 else ''}
{f'<meta name="dtb:uid" content="{R.url}"/>' if R.daisy3 else '<meta content="text/html; charset=utf-8" http-equiv="content-type"/>'}
<meta name="generator" content="{generator}"/>
</head>
<{'book' if R.daisy3 else 'body'}>
{f'<frontmatter><doctitle>{R.title}</doctitle><docauthor>{R.creator}</docauthor></frontmatter><bodymatter>' if R.daisy3 else ''}
"""+"\n".join(f"""{''.join(f'<level{n}>' for n in range(min(int(tag[1:]),next(int(paras[p].tag[1:]) for p in range(num-1,-1,-1) if paras[p].tag.startswith('h'))+1) if any(paras[P].tag.startswith('h') for P in range(num-1,-1,-1)) else 1,int(tag[1:])+1)) if R.daisy3 and tag.startswith('h') else ''}{'<level1>' if R.daisy3 and not num and not tag.startswith('h') else ''}{'<p>' if tag=='span' and (num==0 or not paras[num-1].tag=="span" or paras[num-1].text.endswith("<br />")) else ''}<{tag} id=\"p{num+offset}\"{(' class="word"' if len(text.split())==1 else ' class="sentence"') if tag=='span' else ''}>{re.sub("<br />$","",re.sub('<img src="[^"]*" [^/]*/>','',text))}</{tag}>{'</p>' if tag=='span' and (text.endswith("<br />") or num+1==len(paras) or not paras[num+1].tag=='span') else ''}{'<p><imggroup>' if R.daisy3 and re.search('<img src="',text) else ''}{''.join(re.findall('<img src="[^"]*" [^/]*/>',text))}{'</imggroup></p>' if R.daisy3 and re.search('<img src="',text) else ''}{''.join(f'</level{n}>' for n in range(next(int(paras[p].tag[1:]) for p in range(num,-1,-1) if paras[p].tag.startswith('h')) if any(paras[P].tag.startswith('h') for P in range(num,-1,-1)) else 1,0 if num+1==len(paras) else int(paras[num+1].tag[1:])-1,-1)) if R.daisy3 and (num+1==len(paras) or paras[num+1].tag.startswith('h')) else ''}""" for num,(tag,text) in enumerate(normaliseDepth(R,paras)))+f"""
</{'bodymatter></book' if R.daisy3 else 'body'}>
</{'dtbook' if R.daisy3 else 'html'}>
""")
def er_book_info(durations): # INTERNAL
"durations = list of secsThisRecording"
return """<?xml version="1.0" encoding="utf-8"?>
<book_info>
<smil_info>"""+"".join(f"""
<smil nr="{s}" Name="{s+1:04d}.smil" dur="{d:f}"/>""" for s,d in enumerate(durations))+"""
</smil_info>
</book_info>
"""
# Stylesheet shipped as dtbook.2005.basic.css in Daisy 3 output
# (referenced from the manifest in package_opf and the xml-stylesheet
# processing instruction in text_htm):
d3css = """/* Simplified from Z39.86 committee CSS, removed non-dark background (allow custom) */
dtbook { display:block; width: 100% }
head { display: none }
book {
display: block;
font-family: arial, verdana, sans-serif;
line-height: 1.5em;
margin-top: 4em;
margin-bottom: 2em;
margin-left: 6em;
margin-right: 6em;
}
bodymatter {
display: block; margin-top: 1em; margin-bottom: 1em }
h1, h2, h3, h4, h5, h6 {
display: block; font-weight: bold; margin-bottom: 0.5em }
h1 { font-size: 1.7em; margin-top: 1.5em }
h2 { font-size: 1.5em; margin-top: 1.2em }
h3 { font-size: 1.4em; margin-top: 1.0em }
h4 { font-size: 1.3em; margin-top: 1.0em }
h5 { font-size: 1.2em; margin-top: 1.0em }
h6 { margin-top: 1.0em }
p { display: block; margin-top: 0.7em }
a { display: inline; text-decoration: underline }
em { display: inline; font-style: italic }
strong { display: inline; font-weight: bold }
span { display: inline; }
"""
# Fixed resource file shipped as text.res in Daisy 3 output: maps the
# standard skippable/escapable structures (notes, page numbers, sidebars,
# tables etc) to spoken English labels, as required by Z39.86-2005:
textres = """<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE resources
PUBLIC "-//NISO//DTD resource 2005-1//EN" "http://www.daisy.org/z3986/2005/resource-2005-1.dtd">
<resources xmlns="http://www.daisy.org/z3986/2005/resource/" version="2005-1"><!-- SKIPPABLE NCX --><scope nsuri="http://www.daisy.org/z3986/2005/ncx/"><nodeSet id="ns001" select="//smilCustomTest[@bookStruct='LINE_NUMBER']"><resource xml:lang="en" id="r001"><text>Row</text></resource></nodeSet><nodeSet id="ns002" select="//smilCustomTest[@bookStruct='NOTE']"><resource xml:lang="en" id="r002"><text>Note</text></resource></nodeSet><nodeSet id="ns003" select="//smilCustomTest[@bookStruct='NOTE_REFERENCE']"><resource xml:lang="en" id="r003"><text>Note reference</text></resource></nodeSet><nodeSet id="ns004" select="//smilCustomTest[@bookStruct='ANNOTATION']"><resource xml:lang="en" id="r004"><text>Annotation</text></resource></nodeSet><nodeSet id="ns005" select="//smilCustomTest[@id='annoref']"><resource xml:lang="en" id="r005"><text>Annotation reference</text></resource></nodeSet><nodeSet id="ns006" select="//smilCustomTest[@bookStruct='PAGE_NUMBER']"><resource xml:lang="en" id="r006"><text>Page</text></resource></nodeSet><nodeSet id="ns007" select="//smilCustomTest[@bookStruct='OPTIONAL_SIDEBAR']"><resource xml:lang="en" id="r007"><text>Optional sidebar</text></resource></nodeSet><nodeSet id="ns008" select="//smilCustomTest[@bookStruct='OPTIONAL_PRODUCER_NOTE']"><resource xml:lang="en" id="r008"><text>Optional producer note</text></resource></nodeSet></scope><!-- ESCAPABLE SMIL --><scope nsuri="http://www.w3.org/2001/SMIL20/"><nodeSet id="esns001" select="//seq[@bookStruct='line']"><resource xml:lang="en" id="esr001"><text>Row</text></resource></nodeSet><nodeSet id="esns002" select="//seq[@class='note']"><resource xml:lang="en" id="esr002"><text>Note</text></resource></nodeSet><nodeSet id="esns003" select="//seq[@class='noteref']"><resource xml:lang="en" id="esr003"><text>Note reference</text></resource></nodeSet><nodeSet id="esns004" select="//seq[@class='annotation']"><resource xml:lang="en" id="esr004"><text>Annotation</text></resource></nodeSet><nodeSet 
id="esns005" select="//seq[@class='annoref']"><resource xml:lang="en" id="esr005"><text>Annotation reference</text></resource></nodeSet><nodeSet id="esns006" select="//seq[@class='pagenum']"><resource xml:lang="en" id="esr006"><text>Page</text></resource></nodeSet><nodeSet id="esns007" select="//seq[@class='sidebar']"><resource xml:lang="en" id="esr007"><text>Optional sidebar</text></resource></nodeSet><nodeSet id="esns008" select="//seq[@class='prodnote']"><resource xml:lang="en" id="esr008"><text>Optional producer note</text></resource></nodeSet></scope><!-- ESCAPABLE DTBOOK --><scope nsuri="http://www.daisy.org/z3986/2005/dtbook/"><nodeSet id="ns009" select="//annotation"><resource xml:lang="en" id="r009"><text>Annotation</text></resource></nodeSet><nodeSet id="ns010" select="//blockquote"><resource xml:lang="en" id="r010"><text>Quote</text></resource></nodeSet><nodeSet id="ns011" select="//code"><resource xml:lang="en" id="r011"><text>Code</text></resource></nodeSet><nodeSet id="ns012" select="//list"><resource xml:lang="en" id="r012"><text>List</text></resource></nodeSet><nodeSet id="ns018" select="//note"><resource xml:lang="en" id="r018"><text>Note</text></resource></nodeSet><nodeSet id="ns013" select="//poem"><resource xml:lang="en" id="r013"><text>Poem</text></resource></nodeSet><nodeSet id="ns0014" select="//prodnote[@render='optional']"><resource xml:lang="en" id="r014"><text>Optional producer note</text></resource></nodeSet><nodeSet id="ns015" select="//sidebar[@render='optional']"><resource xml:lang="en" id="r015"><text>Optional sidebar</text></resource></nodeSet><nodeSet id="ns016" select="//table"><resource xml:lang="en" id="r016"><text>Table</text></resource></nodeSet><nodeSet id="ns017" select="//tr"><resource xml:lang="en" id="r017"><text>Table row</text></resource></nodeSet></scope></resources>"""
if __name__ == "__main__": anemone()