Skip to content

Commit

Permalink
Merge pull request #8702 from tk0miya/4304_linkcheck_same_url
Browse files Browse the repository at this point in the history
linkcheck: Do not check the availability of the same URL repeatedly
  • Loading branch information
tk0miya committed Jan 20, 2021
2 parents 42de5e2 + cead0f6 commit 82ef497
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 73 deletions.
3 changes: 3 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Incompatible changes
Deprecated
----------

* ``sphinx.builders.linkcheck.node_line_or_0()``
* ``sphinx.ext.autodoc.AttributeDocumenter.isinstanceattribute()``
* ``sphinx.ext.autodoc.directive.DocumenterBridge.reporter``
* ``sphinx.ext.autodoc.importer.get_module_members()``
Expand Down Expand Up @@ -60,6 +61,8 @@ Bugs fixed
+ or ^) are used as keystrokes
* #8629: html: A type warning for html_use_opensearch is shown twice
* #8665: html theme: Could not override globaltoc_maxdepth in theme.conf
* #4304: linkcheck: Fix race condition that could lead to checking the
availability of the same URL twice
* #8094: texinfo: image files in a different directory from the document are
not copied
* #8671: :confval:`highlight_options` is not working
Expand Down
5 changes: 5 additions & 0 deletions doc/extdev/deprecated.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ The following is a list of deprecated interfaces.
- (will be) Removed
- Alternatives

* - ``sphinx.builders.linkcheck.node_line_or_0()``
- 3.5
- 5.0
- ``sphinx.util.nodes.get_node_line()``

* - ``sphinx.ext.autodoc.AttributeDocumenter.isinstanceattribute()``
- 3.5
- 5.0
Expand Down
84 changes: 55 additions & 29 deletions sphinx/builders/linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@
import socket
import threading
import time
import warnings
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from html.parser import HTMLParser
from os import path
from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple
from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, cast
from urllib.parse import unquote, urlparse

from docutils import nodes
Expand All @@ -28,7 +29,9 @@

from sphinx.application import Sphinx
from sphinx.builders import Builder
from sphinx.deprecation import RemovedInSphinx40Warning
from sphinx.locale import __
from sphinx.transforms.post_transforms import SphinxPostTransform
from sphinx.util import encode_uri, logging, requests
from sphinx.util.console import darkgray, darkgreen, purple, red, turquoise # type: ignore
from sphinx.util.nodes import get_node_line
Expand All @@ -37,6 +40,10 @@

uri_re = re.compile('([a-z]+:)?//') # matches to foo:// and // (a protocol relative URL)

# One link to be checked; the builder's finish() puts these on its work queue.
# ``next_check`` is the first field, so it is the first thing compared when
# two entries are ordered as tuples.
# NOTE(review): presumably ``next_check`` is the earliest time the check may
# run -- confirm against the worker code.
# ``uri``, ``docname`` and ``lineno`` are Optional; ``lineno`` is filled from
# get_node_line(), which apparently may yield None.
Hyperlink = NamedTuple('Hyperlink', (('next_check', float),
('uri', Optional[str]),
('docname', Optional[str]),
('lineno', Optional[int])))
# Rate-limit record holding a ``delay`` and a ``next_check`` value (floats).
# NOTE(review): units look like seconds / timestamps -- confirm with limit_rate().
RateLimit = NamedTuple('RateLimit', (('delay', float), ('next_check', float)))

DEFAULT_REQUEST_HEADERS = {
Expand All @@ -52,6 +59,8 @@ def node_line_or_0(node: Element) -> int:
PriorityQueue items must be comparable. The line number is part of the
tuple used by the PriorityQueue, keep an homogeneous type for comparison.
"""
warnings.warn('node_line_or_0() is deprecated.',
RemovedInSphinx40Warning, stacklevel=2)
return get_node_line(node) or 0


Expand Down Expand Up @@ -98,6 +107,7 @@ class CheckExternalLinksBuilder(Builder):
'%(outdir)s/output.txt')

def init(self) -> None:
self.hyperlinks = {} # type: Dict[str, Hyperlink]
self.to_ignore = [re.compile(x) for x in self.app.config.linkcheck_ignore]
self.anchors_ignore = [re.compile(x)
for x in self.app.config.linkcheck_anchors_ignore]
Expand Down Expand Up @@ -406,28 +416,26 @@ def prepare_writing(self, docnames: Set[str]) -> None:
return

def write_doc(self, docname: str, doctree: Node) -> None:
pass

def write_entry(self, what: str, docname: str, filename: str, line: int,
uri: str) -> None:
with open(path.join(self.outdir, 'output.txt'), 'a') as output:
output.write("%s:%s: [%s] %s\n" % (filename, line, what, uri))

def write_linkstat(self, data: dict) -> None:
with open(path.join(self.outdir, 'output.json'), 'a') as output:
output.write(json.dumps(data))
output.write('\n')

def finish(self) -> None:
logger.info('')
n = 0

# reference nodes
for refnode in doctree.traverse(nodes.reference):
if 'refuri' not in refnode:
continue
uri = refnode['refuri']
lineno = node_line_or_0(refnode)
uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
self.wqueue.put(uri_info, False)
for hyperlink in self.hyperlinks.values():
self.wqueue.put(hyperlink, False)
n += 1

# image nodes
for imgnode in doctree.traverse(nodes.image):
uri = imgnode['candidates'].get('?')
if uri and '://' in uri:
lineno = node_line_or_0(imgnode)
uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
self.wqueue.put(uri_info, False)
n += 1

done = 0
while done < n:
self.process_result(self.rqueue.get())
Expand All @@ -436,25 +444,43 @@ def write_doc(self, docname: str, doctree: Node) -> None:
if self.broken:
self.app.statuscode = 1

def write_entry(self, what: str, docname: str, filename: str, line: int,
uri: str) -> None:
with open(path.join(self.outdir, 'output.txt'), 'a') as output:
output.write("%s:%s: [%s] %s\n" % (filename, line, what, uri))

def write_linkstat(self, data: dict) -> None:
with open(path.join(self.outdir, 'output.json'), 'a') as output:
output.write(json.dumps(data))
output.write('\n')

def finish(self) -> None:
self.wqueue.join()
# Shutdown threads.
for worker in self.workers:
self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False)


class HyperlinkCollector(SphinxPostTransform):
    """Gather the URIs of a document for the ``linkcheck`` builder.

    Every distinct URI is stored once in the builder's ``hyperlinks``
    mapping, keyed by the URI itself.  When the same URI is met again, the
    entry that was recorded first is kept.
    """

    builders = ('linkcheck',)
    default_priority = 800

    def run(self, **kwargs: Any) -> None:
        """Record the reference and image URIs found in ``self.document``."""
        registry = cast(CheckExternalLinksBuilder, self.app.builder).hyperlinks

        # reference nodes: those without a ``refuri`` attribute are skipped
        for ref in self.document.traverse(nodes.reference):
            if 'refuri' in ref:
                target = ref['refuri']
                line = get_node_line(ref)
                entry = Hyperlink(CHECK_IMMEDIATELY, target, self.env.docname, line)
                # setdefault() leaves an already registered URI untouched
                registry.setdefault(target, entry)

        # image nodes: only the '?' candidate is looked at, and only when it
        # contains a scheme separator
        # NOTE(review): '?' is presumably the key used for remote images -- confirm
        for img in self.document.traverse(nodes.image):
            target = img['candidates'].get('?')
            if not target or '://' not in target:
                continue
            line = get_node_line(img)
            entry = Hyperlink(CHECK_IMMEDIATELY, target, self.env.docname, line)
            registry.setdefault(target, entry)


def setup(app: Sphinx) -> Dict[str, Any]:
app.add_builder(CheckExternalLinksBuilder)
app.add_post_transform(HyperlinkCollector)

app.add_config_value('linkcheck_ignore', [], None)
app.add_config_value('linkcheck_auth', [], None)
Expand Down
1 change: 0 additions & 1 deletion tests/roots/test-linkcheck-localserver-two-links/conf.py

This file was deleted.

6 changes: 0 additions & 6 deletions tests/roots/test-linkcheck-localserver-two-links/index.rst

This file was deleted.

37 changes: 0 additions & 37 deletions tests/test_build_linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,40 +573,3 @@ def test_limit_rate_bails_out_after_waiting_max_time(app):
checker.rate_limits = {"localhost": RateLimit(90.0, 0.0)}
next_check = checker.limit_rate(FakeResponse())
assert next_check is None


@pytest.mark.sphinx(
    'linkcheck', testroot='linkcheck-localserver-two-links', freshenv=True,
)
def test_priorityqueue_items_are_comparable(app):
    """Build the two-links test root and compare the rows of output.json.

    The expected result is three "working" rows for the same URL, with line
    numbers 0, 0 and 4.
    """
    def working_row(lineno):
        # Expected output.json row for the local test server URL.
        return {
            'filename': 'index.rst',
            'lineno': lineno,
            'status': 'working',
            'code': 0,
            'uri': 'http://localhost:7777/',
            'info': '',
        }

    with http_server(OKHandler):
        app.builder.build_all()

    raw_output = (app.outdir / 'output.json').read_text()
    ordered_lines = sorted(raw_output.splitlines())
    rows = [json.loads(line) for line in ordered_lines]

    # The first two line numbers should not be None: they are reported as 0.
    assert rows == [working_row(0), working_row(0), working_row(4)]

0 comments on commit 82ef497

Please sign in to comment.