Skip to content

Commit

Permalink
Gracefully fallback to html5lib for parsing non-compliant index pages
Browse files Browse the repository at this point in the history
This reworks the HTML parsing logic, to gracefully use `html5lib` on
non-compliant HTML 5 documents. This warning softens the failure mode
for users who are using commercial package index solutions that do not
follow the requisite standards and serve malformed HTML documents.
  • Loading branch information
pradyunsg committed Jan 30, 2022
1 parent cc35c93 commit c01b0b2
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 3 deletions.
25 changes: 24 additions & 1 deletion src/pip/_internal/index/collector.py
Expand Up @@ -38,6 +38,7 @@
from pip._internal.models.search_scope import SearchScope
from pip._internal.network.session import PipSession
from pip._internal.network.utils import raise_for_status
from pip._internal.utils.deprecation import deprecated
from pip._internal.utils.filetypes import is_archive_file
from pip._internal.utils.misc import pairwise, redact_auth_from_url
from pip._internal.vcs import vcs
Expand Down Expand Up @@ -342,12 +343,34 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin
"""
Parse an HTML document, and yield its anchor elements as Link objects.
"""
encoding = page.encoding or "utf-8"

# Check if the page starts with a valid doctype, to decide whether to use
# http.parser or (deprecated) html5lib for parsing -- unless explicitly
# requested to use html5lib.
if not use_deprecated_html5lib:
expected_doctype = "<!doctype html>".encode(encoding)
actual_start = page.content[: len(expected_doctype)]
if actual_start.decode(encoding).lower() != "<!doctype html>":

This comment has been minimized.

Copy link
@ppena-LiveData

ppena-LiveData Jan 31, 2022

These three lines are not a correct check. The HTML5 spec says an HTML5 doc can start with an optional BOM, "Any number of comments and ASCII whitespace," and then the DOCTYPE, which can have "One or more ASCII whitespace," whereas this check is WAY more strict so will incorrectly say things are not valid that are valid. Why not just use a try..except around the parser.feed(page.content.decode(encoding)) call to let html.parser.HTMLParser get the declaration for you and only fallback on exception?

BTW, HTMLLinkParser.handle_decl() in this file is also way too strict. I think this is the correct code to use:

re.match(r'DOCTYPE\s+html\s*(?:SYSTEM\s+["\']about:legacy-compat["\'])?\s*$', decl, re.IGNORECASE)

If you agree, I can create a pull request with those suggested changes.

This comment has been minimized.

Copy link
@pfmoore

pfmoore Jan 31, 2022

Member

-1. We shouldn't make this code any more complex at this point, for anything short of a critical real-world case that can't be handled any other way.

We can make the code more standards-compliant later, once the current urgency has died down and we can design something robust and maintainable.

This comment has been minimized.

Copy link
@pradyunsg

pradyunsg Feb 1, 2022

Author Member

Let's move the discussion to #10868?

deprecated(
reason=(
f"The HTML index page being used ({page.url}) is not a proper "
"HTML 5 document. This is in violation of PEP 503 which requires "
"these pages to be well-formed HTML 5 documents. Please reach out "
"to the owners of this index page, and ask them to update this "
"index page to a valid HTML 5 document."
),
replacement=None,
gone_in="22.2",
issue=10825,
)
use_deprecated_html5lib = True

if use_deprecated_html5lib:
yield from _parse_links_html5lib(page)
return

parser = HTMLLinkParser()
encoding = page.encoding or "utf-8"
parser.feed(page.content.decode(encoding))

url = page.url
Expand Down
21 changes: 19 additions & 2 deletions tests/unit/test_collector.py
Expand Up @@ -540,9 +540,9 @@ def test_parse_links_caches_same_page_by_url() -> None:


def test_parse_link_handles_deprecated_usage_properly() -> None:
html = b'<a href="/pkg1-1.0.tar.gz"><a href="/pkg1-2.0.tar.gz">'
html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
url = "https://example.com/simple/"
page = HTMLPage(html, encoding=None, url=url)
page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

parsed_links = list(parse_links(page, use_deprecated_html5lib=True))

Expand All @@ -551,6 +551,23 @@ def test_parse_link_handles_deprecated_usage_properly() -> None:
assert "pkg1-2.0" in parsed_links[1].url


@mock.patch("pip._internal.index.collector.deprecated")
def test_parse_links_presents_deprecation_warning_on_non_html5_page(
mock_deprecated: mock.Mock,
) -> None:
html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
url = "https://example.com/simple/"
page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

assert len(parsed_links) == 2, parsed_links
assert "pkg1-1.0" in parsed_links[0].url
assert "pkg1-2.0" in parsed_links[1].url

mock_deprecated.assert_called_once()


@mock.patch("pip._internal.index.collector.raise_for_status")
def test_request_http_error(
mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture
Expand Down

0 comments on commit c01b0b2

Please sign in to comment.