Skip to content

Commit

Permalink
Parallelise endnote processing
Browse files Browse the repository at this point in the history
Previously we were loading each file sequentially. As each file is independent of the rest, working on them in parallel can speed up the process. Using concurrent.futures (Python 3.2+) we can use an executor to schedule the tasks. This led to the following stats on my machine (MacBook Air 2012, dual-core i5) running against Pepys and incrementing every endnote by 1:

Existing code:
65.47 real        64.20 user         0.47 sys
61.36 real        61.03 user         0.23 sys
69.37 real        67.36 user         0.74 sys

With ThreadPoolExecutor:
29.19 real        48.03 user         2.85 sys
28.06 real        46.90 user         2.75 sys
28.26 real        47.19 user         2.74 sys

With ProcessPoolExecutor:
27.89 real       101.88 user         0.61 sys
28.03 real       101.86 user         0.58 sys
27.77 real       101.56 user         0.56 sys

Given that, I went with ProcessPoolExecutor, but either option roughly halves the wall-clock ("real") time compared with the original.
  • Loading branch information
robinwhittleton authored and acabal committed Oct 1, 2018
1 parent afb92f7 commit 1f80544
Showing 1 changed file with 20 additions and 16 deletions.
36 changes: 20 additions & 16 deletions reorder-endnotes
@@ -1,5 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3


import concurrent.futures
import argparse import argparse
import os import os
import fnmatch import fnmatch
Expand Down Expand Up @@ -65,26 +66,29 @@ def main():
se.print_error("Couldn’t open endnotes file: {}".format(endnotes_filename)) se.print_error("Couldn’t open endnotes file: {}".format(endnotes_filename))
exit(1) exit(1)


for root, _, filenames in os.walk(source_directory): with concurrent.futures.ProcessPoolExecutor() as executor:
for filename in fnmatch.filter(filenames, "*.xhtml"): for root, _, filenames in os.walk(source_directory):
# Skip endnotes.xhtml since we already processed it for filename in fnmatch.filter(filenames, "*.xhtml"):
if filename == "endnotes.xhtml": # Skip endnotes.xhtml since we already processed it
continue if filename == "endnotes.xhtml":
continue


with open(os.path.join(root, filename), "r+", encoding="utf-8") as file: executor.submit(process_endnotes_in_file, filename, root, note_range, step)
xhtml = file.read()
processed_xhtml = xhtml


for endnote_number in note_range: def process_endnotes_in_file(filename: str, root: str, note_range: range, step: int):
processed_xhtml = regex.sub(r"(<a[^>]*?>){}</a>".format(endnote_number), r"\g<1>{}</a>".format(endnote_number + step), processed_xhtml, flags=regex.DOTALL) with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
processed_xhtml = processed_xhtml.replace("id=\"noteref-{}\"".format(endnote_number), "id=\"noteref-{}\"".format(endnote_number + step), 1) xhtml = file.read()
processed_xhtml = processed_xhtml.replace("#note-{}\"".format(endnote_number), "#note-{}\"".format(endnote_number + step), 1) processed_xhtml = xhtml


if processed_xhtml != xhtml: for endnote_number in note_range:
file.seek(0) processed_xhtml = regex.sub(r"(<a[^>]*?>){}</a>".format(endnote_number), r"\g<1>{}</a>".format(endnote_number + step), processed_xhtml, flags=regex.DOTALL)
file.write(processed_xhtml) processed_xhtml = processed_xhtml.replace("id=\"noteref-{}\"".format(endnote_number), "id=\"noteref-{}\"".format(endnote_number + step), 1)
file.truncate() processed_xhtml = processed_xhtml.replace("#note-{}\"".format(endnote_number), "#note-{}\"".format(endnote_number + step), 1)


if processed_xhtml != xhtml:
file.seek(0)
file.write(processed_xhtml)
file.truncate()


if __name__ == "__main__": if __name__ == "__main__":
main() main()

0 comments on commit 1f80544

Please sign in to comment.