Merge pull request #8702 from tk0miya/4304_linkcheck_same_url

linkcheck: Do not check the availability of the same URL repeatedly
sphinx-doc · Jan 20, 2021 · 82ef497 · 82ef497
2 parents 42de5e2 + cead0f6
commit 82ef497
Show file tree

Hide file tree

Showing 6 changed files with 63 additions and 73 deletions.
diff --git a/CHANGES b/CHANGES
@@ -10,6 +10,7 @@ Incompatible changes
 Deprecated
 ----------
 
+* ``sphinx.builders.linkcheck.node_line_or_0()``
 * ``sphinx.ext.autodoc.AttributeDocumenter.isinstanceattribute()``
 * ``sphinx.ext.autodoc.directive.DocumenterBridge.reporter``
 * ``sphinx.ext.autodoc.importer.get_module_members()``
@@ -60,6 +61,8 @@ Bugs fixed
   + or ^) are used as keystrokes
 * #8629: html: A type warning for html_use_opensearch is shown twice
 * #8665: html theme: Could not override globaltoc_maxdepth in theme.conf
+* #4304: linkcheck: Fix race condition that could lead to checking the
+  availability of the same URL twice
 * #8094: texinfo: image files on the different directory with document are not
   copied
 * #8671: :confval:`highlight_options` is not working

diff --git a/doc/extdev/deprecated.rst b/doc/extdev/deprecated.rst
@@ -26,6 +26,11 @@ The following is a list of deprecated interfaces.
      - (will be) Removed
      - Alternatives
 
+   * - ``sphinx.builders.linkcheck.node_line_or_0()``
+     - 3.5
+     - 5.0
+     - ``sphinx.util.nodes.get_node_line()``
+
    * - ``sphinx.ext.autodoc.AttributeDocumenter.isinstanceattribute()``
      - 3.5
      - 5.0

diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py
@@ -14,11 +14,12 @@
 import socket
 import threading
 import time
+import warnings
 from datetime import datetime, timezone
 from email.utils import parsedate_to_datetime
 from html.parser import HTMLParser
 from os import path
-from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple
+from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, cast
 from urllib.parse import unquote, urlparse
 
 from docutils import nodes
@@ -28,7 +29,9 @@
 
 from sphinx.application import Sphinx
 from sphinx.builders import Builder
+from sphinx.deprecation import RemovedInSphinx50Warning
 from sphinx.locale import __
+from sphinx.transforms.post_transforms import SphinxPostTransform
 from sphinx.util import encode_uri, logging, requests
 from sphinx.util.console import darkgray, darkgreen, purple, red, turquoise  # type: ignore
 from sphinx.util.nodes import get_node_line
@@ -37,6 +40,10 @@
 
 uri_re = re.compile('([a-z]+:)?//')  # matches to foo:// and // (a protocol relative URL)
 
+Hyperlink = NamedTuple('Hyperlink', (('next_check', float),
+                                     ('uri', Optional[str]),
+                                     ('docname', Optional[str]),
+                                     ('lineno', Optional[int])))
 RateLimit = NamedTuple('RateLimit', (('delay', float), ('next_check', float)))
 
 DEFAULT_REQUEST_HEADERS = {
@@ -52,6 +59,8 @@ def node_line_or_0(node: Element) -> int:
     PriorityQueue items must be comparable. The line number is part of the
     tuple used by the PriorityQueue, keep an homogeneous type for comparison.
     """
+    warnings.warn('node_line_or_0() is deprecated.',
+                  RemovedInSphinx50Warning, stacklevel=2)
     return get_node_line(node) or 0
 
 
@@ -98,6 +107,7 @@ class CheckExternalLinksBuilder(Builder):
                 '%(outdir)s/output.txt')
 
     def init(self) -> None:
+        self.hyperlinks = {}    # type: Dict[str, Hyperlink]
         self.to_ignore = [re.compile(x) for x in self.app.config.linkcheck_ignore]
         self.anchors_ignore = [re.compile(x)
                                for x in self.app.config.linkcheck_anchors_ignore]
@@ -406,28 +416,26 @@ def prepare_writing(self, docnames: Set[str]) -> None:
         return
 
     def write_doc(self, docname: str, doctree: Node) -> None:
+        pass
+
+    def write_entry(self, what: str, docname: str, filename: str, line: int,
+                    uri: str) -> None:
+        with open(path.join(self.outdir, 'output.txt'), 'a') as output:
+            output.write("%s:%s: [%s] %s\n" % (filename, line, what, uri))
+
+    def write_linkstat(self, data: dict) -> None:
+        with open(path.join(self.outdir, 'output.json'), 'a') as output:
+            output.write(json.dumps(data))
+            output.write('\n')
+
+    def finish(self) -> None:
         logger.info('')
         n = 0
 
-        # reference nodes
-        for refnode in doctree.traverse(nodes.reference):
-            if 'refuri' not in refnode:
-                continue
-            uri = refnode['refuri']
-            lineno = node_line_or_0(refnode)
-            uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
-            self.wqueue.put(uri_info, False)
+        for hyperlink in self.hyperlinks.values():
+            self.wqueue.put(hyperlink, False)
             n += 1
 
-        # image nodes
-        for imgnode in doctree.traverse(nodes.image):
-            uri = imgnode['candidates'].get('?')
-            if uri and '://' in uri:
-                lineno = node_line_or_0(imgnode)
-                uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
-                self.wqueue.put(uri_info, False)
-                n += 1
-
         done = 0
         while done < n:
             self.process_result(self.rqueue.get())
@@ -436,25 +444,43 @@ def write_doc(self, docname: str, doctree: Node) -> None:
         if self.broken:
             self.app.statuscode = 1
 
-    def write_entry(self, what: str, docname: str, filename: str, line: int,
-                    uri: str) -> None:
-        with open(path.join(self.outdir, 'output.txt'), 'a') as output:
-            output.write("%s:%s: [%s] %s\n" % (filename, line, what, uri))
-
-    def write_linkstat(self, data: dict) -> None:
-        with open(path.join(self.outdir, 'output.json'), 'a') as output:
-            output.write(json.dumps(data))
-            output.write('\n')
-
-    def finish(self) -> None:
         self.wqueue.join()
         # Shutdown threads.
         for worker in self.workers:
             self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False)
 
 
+class HyperlinkCollector(SphinxPostTransform):
+    builders = ('linkcheck',)
+    default_priority = 800
+
+    def run(self, **kwargs: Any) -> None:
+        builder = cast(CheckExternalLinksBuilder, self.app.builder)
+        hyperlinks = builder.hyperlinks
+
+        # reference nodes
+        for refnode in self.document.traverse(nodes.reference):
+            if 'refuri' not in refnode:
+                continue
+            uri = refnode['refuri']
+            lineno = get_node_line(refnode)
+            uri_info = Hyperlink(CHECK_IMMEDIATELY, uri, self.env.docname, lineno)
+            if uri not in hyperlinks:
+                hyperlinks[uri] = uri_info
+
+        # image nodes
+        for imgnode in self.document.traverse(nodes.image):
+            uri = imgnode['candidates'].get('?')
+            if uri and '://' in uri:
+                lineno = get_node_line(imgnode)
+                uri_info = Hyperlink(CHECK_IMMEDIATELY, uri, self.env.docname, lineno)
+                if uri not in hyperlinks:
+                    hyperlinks[uri] = uri_info
+
+
 def setup(app: Sphinx) -> Dict[str, Any]:
     app.add_builder(CheckExternalLinksBuilder)
+    app.add_post_transform(HyperlinkCollector)
 
     app.add_config_value('linkcheck_ignore', [], None)
     app.add_config_value('linkcheck_auth', [], None)

diff --git a/tests/roots/test-linkcheck-localserver-two-links/conf.py b/tests/roots/test-linkcheck-localserver-two-links/conf.py
diff --git a/tests/roots/test-linkcheck-localserver-two-links/index.rst b/tests/roots/test-linkcheck-localserver-two-links/index.rst
diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py
@@ -573,40 +573,3 @@ def test_limit_rate_bails_out_after_waiting_max_time(app):
     checker.rate_limits = {"localhost": RateLimit(90.0, 0.0)}
     next_check = checker.limit_rate(FakeResponse())
     assert next_check is None
-
-
-@pytest.mark.sphinx(
-    'linkcheck', testroot='linkcheck-localserver-two-links', freshenv=True,
-)
-def test_priorityqueue_items_are_comparable(app):
-    with http_server(OKHandler):
-        app.builder.build_all()
-    content = (app.outdir / 'output.json').read_text()
-    rows = [json.loads(x) for x in sorted(content.splitlines())]
-    assert rows == [
-        {
-            'filename': 'index.rst',
-            # Should not be None.
-            'lineno': 0,
-            'status': 'working',
-            'code': 0,
-            'uri': 'http://localhost:7777/',
-            'info': '',
-        },
-        {
-            'filename': 'index.rst',
-            'lineno': 0,
-            'status': 'working',
-            'code': 0,
-            'uri': 'http://localhost:7777/',
-            'info': '',
-        },
-        {
-            'filename': 'index.rst',
-            'lineno': 4,
-            'status': 'working',
-            'code': 0,
-            'uri': 'http://localhost:7777/',
-            'info': '',
-        }
-    ]