Skip to content

Commit 1f80544

Browse files
robinwhittleton authored and acabal committed
Parallelise endnote processing
Previously we were loading each file sequentially. As each file is independent of the rest, working on them in parallel can speed up the process. Using concurrent.futures (Python 3.2+) we can use an executor to schedule the tasks. This led to the following stats on my machine (MacBook Air 2012, dual i5) running against Pepys and incrementing every endnote by 1:

Existing code:
65.47 real 64.20 user 0.47 sys
61.36 real 61.03 user 0.23 sys
69.37 real 67.36 user 0.74 sys

With ThreadPoolExecutor:
29.19 real 48.03 user 2.85 sys
28.06 real 46.90 user 2.75 sys
28.26 real 47.19 user 2.74 sys

With ProcessPoolExecutor:
27.89 real 101.88 user 0.61 sys
28.03 real 101.86 user 0.58 sys
27.77 real 101.56 user 0.56 sys

Given that, I went with ProcessPoolExecutor, but either one roughly halves the wall-clock time of the original.
1 parent afb92f7 commit 1f80544

File tree

1 file changed

+20
-16
lines changed

1 file changed

+20
-16
lines changed

reorder-endnotes

Lines changed: 20 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
#!/usr/bin/env python3
22

3+
import concurrent.futures
34
import argparse
45
import os
56
import fnmatch
@@ -65,26 +66,29 @@ def main():
6566
se.print_error("Couldn’t open endnotes file: {}".format(endnotes_filename))
6667
exit(1)
6768

68-
for root, _, filenames in os.walk(source_directory):
69-
for filename in fnmatch.filter(filenames, "*.xhtml"):
70-
# Skip endnotes.xhtml since we already processed it
71-
if filename == "endnotes.xhtml":
72-
continue
69+
with concurrent.futures.ProcessPoolExecutor() as executor:
70+
for root, _, filenames in os.walk(source_directory):
71+
for filename in fnmatch.filter(filenames, "*.xhtml"):
72+
# Skip endnotes.xhtml since we already processed it
73+
if filename == "endnotes.xhtml":
74+
continue
7375

74-
with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
75-
xhtml = file.read()
76-
processed_xhtml = xhtml
76+
executor.submit(process_endnotes_in_file, filename, root, note_range, step)
7777

78-
for endnote_number in note_range:
79-
processed_xhtml = regex.sub(r"(<a[^>]*?>){}</a>".format(endnote_number), r"\g<1>{}</a>".format(endnote_number + step), processed_xhtml, flags=regex.DOTALL)
80-
processed_xhtml = processed_xhtml.replace("id=\"noteref-{}\"".format(endnote_number), "id=\"noteref-{}\"".format(endnote_number + step), 1)
81-
processed_xhtml = processed_xhtml.replace("#note-{}\"".format(endnote_number), "#note-{}\"".format(endnote_number + step), 1)
78+
def process_endnotes_in_file(filename: str, root: str, note_range: range, step: int):
79+
with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
80+
xhtml = file.read()
81+
processed_xhtml = xhtml
8282

83-
if processed_xhtml != xhtml:
84-
file.seek(0)
85-
file.write(processed_xhtml)
86-
file.truncate()
83+
for endnote_number in note_range:
84+
processed_xhtml = regex.sub(r"(<a[^>]*?>){}</a>".format(endnote_number), r"\g<1>{}</a>".format(endnote_number + step), processed_xhtml, flags=regex.DOTALL)
85+
processed_xhtml = processed_xhtml.replace("id=\"noteref-{}\"".format(endnote_number), "id=\"noteref-{}\"".format(endnote_number + step), 1)
86+
processed_xhtml = processed_xhtml.replace("#note-{}\"".format(endnote_number), "#note-{}\"".format(endnote_number + step), 1)
8787

88+
if processed_xhtml != xhtml:
89+
file.seek(0)
90+
file.write(processed_xhtml)
91+
file.truncate()
8892

8993
if __name__ == "__main__":
9094
main()

0 commit comments

Comments
 (0)