Permalink
Browse files

Parallelise endnote processing

Previously we were loading each file sequentially. As each file is independent of the rest, if we can work on them in parallel we could speed up the process. Using concurrent.futures (Python 3.2+) we can use an executor to schedule the tasks. This led to the following stats on my machine (MacBook Air 2012, dual i5) running against Pepys and incrementing every endnote by 1:

Existing code:
65.47 real        64.20 user         0.47 sys
61.36 real        61.03 user         0.23 sys
69.37 real        67.36 user         0.74 sys

With ThreadPoolExecutor:
29.19 real        48.03 user         2.85 sys
28.06 real        46.90 user         2.75 sys
28.26 real        47.19 user         2.74 sys

With ProcessPoolExecutor:
27.89 real       101.88 user         0.61 sys
28.03 real       101.86 user         0.58 sys
27.77 real       101.56 user         0.56 sys

Given that, I went with ProcessPoolExecutor, but either one roughly halves the wall-clock time compared with the original.
  • Loading branch information...
robinwhittleton authored and acabal committed Sep 29, 2018
1 parent afb92f7 commit 1f80544c6fe45ab3e444c223b114c0bf3a4969bf
Showing with 20 additions and 16 deletions.
  1. +20 −16 reorder-endnotes
@@ -1,5 +1,6 @@
#!/usr/bin/env python3

import concurrent.futures
import argparse
import os
import fnmatch
@@ -65,26 +66,29 @@ def main():
se.print_error("Couldn’t open endnotes file: {}".format(endnotes_filename))
exit(1)

for root, _, filenames in os.walk(source_directory):
for filename in fnmatch.filter(filenames, "*.xhtml"):
# Skip endnotes.xhtml since we already processed it
if filename == "endnotes.xhtml":
continue
with concurrent.futures.ProcessPoolExecutor() as executor:
for root, _, filenames in os.walk(source_directory):
for filename in fnmatch.filter(filenames, "*.xhtml"):
# Skip endnotes.xhtml since we already processed it
if filename == "endnotes.xhtml":
continue

with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
xhtml = file.read()
processed_xhtml = xhtml
executor.submit(process_endnotes_in_file, filename, root, note_range, step)

for endnote_number in note_range:
processed_xhtml = regex.sub(r"(<a[^>]*?>){}</a>".format(endnote_number), r"\g<1>{}</a>".format(endnote_number + step), processed_xhtml, flags=regex.DOTALL)
processed_xhtml = processed_xhtml.replace("id=\"noteref-{}\"".format(endnote_number), "id=\"noteref-{}\"".format(endnote_number + step), 1)
processed_xhtml = processed_xhtml.replace("#note-{}\"".format(endnote_number), "#note-{}\"".format(endnote_number + step), 1)
def process_endnotes_in_file(filename: str, root: str, note_range: range, step: int):
with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
xhtml = file.read()
processed_xhtml = xhtml

if processed_xhtml != xhtml:
file.seek(0)
file.write(processed_xhtml)
file.truncate()
for endnote_number in note_range:
processed_xhtml = regex.sub(r"(<a[^>]*?>){}</a>".format(endnote_number), r"\g<1>{}</a>".format(endnote_number + step), processed_xhtml, flags=regex.DOTALL)
processed_xhtml = processed_xhtml.replace("id=\"noteref-{}\"".format(endnote_number), "id=\"noteref-{}\"".format(endnote_number + step), 1)
processed_xhtml = processed_xhtml.replace("#note-{}\"".format(endnote_number), "#note-{}\"".format(endnote_number + step), 1)

if processed_xhtml != xhtml:
file.seek(0)
file.write(processed_xhtml)
file.truncate()

if __name__ == "__main__":
main()

0 comments on commit 1f80544

Please sign in to comment.