Parallelise endnote processing

Previously we were loading each file sequentially. As each file is independent of the rest, working on them in parallel can speed up the process. Using concurrent.futures (Python 3.2+) we can use an executor to schedule the tasks. This led to the following stats on my machine (MacBook Air 2012, dual-core i5) running against Pepys and incrementing every endnote by 1:

Existing code:
65.47 real 64.20 user 0.47 sys
61.36 real 61.03 user 0.23 sys
69.37 real 67.36 user 0.74 sys

With ThreadPoolExecutor:
29.19 real 48.03 user 2.85 sys
28.06 real 46.90 user 2.75 sys
28.26 real 47.19 user 2.74 sys

With ProcessPoolExecutor:
27.89 real 101.88 user 0.61 sys
28.03 real 101.86 user 0.58 sys
27.77 real 101.56 user 0.56 sys

Given that, I went with ProcessPoolExecutor, but either one halves the time taken by the original.
standardebooks · Oct 1, 2018 · 1f80544 · 1f80544
1 parent afb92f7
commit 1f80544
Showing 1 changed file with 20 additions and 16 deletions.
diff --git a/reorder-endnotes b/reorder-endnotes
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+import concurrent.futures
 import argparse
 import os
 import fnmatch
@@ -65,26 +66,29 @@ def main():
 		se.print_error("Couldn’t open endnotes file: {}".format(endnotes_filename))
 		exit(1)
 
-	for root, _, filenames in os.walk(source_directory):
+	with concurrent.futures.ProcessPoolExecutor() as executor:
-		for filename in fnmatch.filter(filenames, "*.xhtml"):
+		for root, _, filenames in os.walk(source_directory):
-			# Skip endnotes.xhtml since we already processed it
+			for filename in fnmatch.filter(filenames, "*.xhtml"):
-			if filename == "endnotes.xhtml":
+				# Skip endnotes.xhtml since we already processed it
-				continue
+				if filename == "endnotes.xhtml":
+					continue
 
-			with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
+				executor.submit(process_endnotes_in_file, filename, root, note_range, step)
-				xhtml = file.read()
-				processed_xhtml = xhtml
 
-				for endnote_number in note_range:
+def process_endnotes_in_file(filename: str, root: str, note_range: range, step: int):
-					processed_xhtml = regex.sub(r"(<a[^>]*?>){}</a>".format(endnote_number), r"\g<1>{}</a>".format(endnote_number + step), processed_xhtml, flags=regex.DOTALL)
+	with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
-					processed_xhtml = processed_xhtml.replace("id=\"noteref-{}\"".format(endnote_number), "id=\"noteref-{}\"".format(endnote_number + step), 1)
+		xhtml = file.read()
-					processed_xhtml = processed_xhtml.replace("#note-{}\"".format(endnote_number), "#note-{}\"".format(endnote_number + step), 1)
+		processed_xhtml = xhtml
 
-				if processed_xhtml != xhtml:
+		for endnote_number in note_range:
-					file.seek(0)
+			processed_xhtml = regex.sub(r"(<a[^>]*?>){}</a>".format(endnote_number), r"\g<1>{}</a>".format(endnote_number + step), processed_xhtml, flags=regex.DOTALL)
-					file.write(processed_xhtml)
+			processed_xhtml = processed_xhtml.replace("id=\"noteref-{}\"".format(endnote_number), "id=\"noteref-{}\"".format(endnote_number + step), 1)
-					file.truncate()
+			processed_xhtml = processed_xhtml.replace("#note-{}\"".format(endnote_number), "#note-{}\"".format(endnote_number + step), 1)
 
+		if processed_xhtml != xhtml:
+			file.seek(0)
+			file.write(processed_xhtml)
+			file.truncate()
 
 if __name__ == "__main__":
 	main()