Skip to content

Commit

Permalink
Warn on bad/missing doctype declarations instead of erroring out
Browse files Browse the repository at this point in the history
This is a less disruptive mode of operation: it ensures that users whose
index pages don't declare a doctype are presented with a warning instead
of getting a hard failure.
  • Loading branch information
pradyunsg committed Feb 1, 2022
1 parent 193259d commit 3bb5a61
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 18 deletions.
24 changes: 24 additions & 0 deletions src/pip/_internal/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,30 @@ class UninstallationError(PipError):
"""General exception during uninstallation"""


class BadHTMLDoctypeDeclaration(DiagnosticPipError):
    """Diagnostic for a package index page without a proper HTML doctype.

    The diagnostic is constructed with ``kind="warning"`` and carries the
    offending URL as its context, together with a note, a hint, and a link
    to the pip issue tracking this check.
    """

    reference = "bad-index-doctype"

    def __init__(self, *, url: str) -> None:
        """Build the diagnostic for the index page located at *url*.

        :param url: Address of the index page that triggered the diagnostic.
        """
        summary = (
            "The package index page being used does not have a proper HTML "
            "doctype declaration."
        )
        # The URL is passed through `escape` before interpolation; presumably
        # this keeps its characters from being read as markup -- confirm
        # against the helper's definition elsewhere in this module.
        offending_page = f"Problematic URL: {escape(url)}"
        blame_note = "This is an issue with the page at the URL mentioned above."
        suggestion = (
            "You might need to reach out to the owner of that package index, "
            "to get this fixed."
        )
        tracking_issue = "https://github.com/pypa/pip/issues/10825"

        super().__init__(
            kind="warning",
            message=summary,
            context=offending_page,
            note_stmt=blame_note,
            hint_stmt=suggestion,
            link=tracking_issue,
        )


class MissingHTMLDoctypeDeclaration(BadHTMLDoctypeDeclaration):
    """Variant of BadHTMLDoctypeDeclaration with its own reference.

    Everything except the ``reference`` identifier is inherited unchanged
    from the parent class.
    """

    reference = "missing-index-doctype"


class MissingPyProjectBuildRequires(DiagnosticPipError):
"""Raised when pyproject.toml has `build-system`, but no `build-system.requires`."""

Expand Down
45 changes: 27 additions & 18 deletions src/pip/_internal/index/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from optparse import Values
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
Expand All @@ -33,7 +32,11 @@
from pip._vendor.requests import Response
from pip._vendor.requests.exceptions import RetryError, SSLError

from pip._internal.exceptions import NetworkConnectionError
from pip._internal.exceptions import (
BadHTMLDoctypeDeclaration,
MissingHTMLDoctypeDeclaration,
NetworkConnectionError,
)
from pip._internal.models.link import Link
from pip._internal.models.search_scope import SearchScope
from pip._internal.network.session import PipSession
Expand Down Expand Up @@ -418,20 +421,34 @@ class HTMLLinkParser(HTMLParser):
elements' attributes.
"""

def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self._seen_decl = False
def __init__(self, url: str) -> None:
super().__init__(convert_charrefs=True)
self._dealt_with_doctype_issues = False

self.url: str = url
self.base_url: Optional[str] = None
self.anchors: List[Dict[str, Optional[str]]] = []

def handle_decl(self, decl: str) -> None:
if decl.lower() != "doctype html":
self._raise_error()
self._seen_decl = True
self._dealt_with_doctype_issues = True
match = re.match(
r"""doctype\s+html\s*(?:SYSTEM\s+(["'])about:legacy-compat\1)?\s*$""",
decl,
re.IGNORECASE,
)
if match is None:
logger.warn(
"[present-diagnostic] %s",
BadHTMLDoctypeDeclaration(url=self.url),
)

def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
if not self._seen_decl:
self._raise_error()
if not self._dealt_with_doctype_issues:
logger.warn(
"[present-diagnostic] %s",
MissingHTMLDoctypeDeclaration(url=self.url),
)
self._dealt_with_doctype_issues = True

if tag == "base" and self.base_url is None:
href = self.get_href(attrs)
Expand All @@ -446,14 +463,6 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
return value
return None

def _raise_error(self) -> None:
raise ValueError(
"HTML doctype missing or incorrect. Expected <!DOCTYPE html>.\n\n"
"If you believe this error to be incorrect, try passing the "
"command line option --use-deprecated=html5lib and please leave "
"a comment on the pip issue at https://github.com/pypa/pip/issues/10825."
)


def _handle_get_page_fail(
link: Link,
Expand Down

0 comments on commit 3bb5a61

Please sign in to comment.