Skip to content

Commit

Permalink
Merge pull request #5826 from uranusjr/htmlpage-extract-localize
Browse files Browse the repository at this point in the history
Turn attributes on HTMLPage into local variables and standalone functions
  • Loading branch information
pradyunsg committed Sep 29, 2018
2 parents 4cc5f37 + 6f85fcd commit a5d4f88
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 44 deletions.
1 change: 1 addition & 0 deletions news/5826.trivial
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Refactor HTMLPage to reduce attributes on it.
91 changes: 50 additions & 41 deletions src/pip/_internal/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from pip._internal.utils.deprecation import deprecated
from pip._internal.utils.logging import indent_log
from pip._internal.utils.misc import (
ARCHIVE_EXTENSIONS, SUPPORTED_EXTENSIONS, cached_property, normalize_path,
ARCHIVE_EXTENSIONS, SUPPORTED_EXTENSIONS, normalize_path,
remove_auth_from_url,
)
from pip._internal.utils.packaging import check_requires_python
Expand Down Expand Up @@ -415,7 +415,7 @@ def find_all_candidates(self, project_name):
logger.debug('Analyzing links from page %s', page.url)
with indent_log():
page_versions.extend(
self._package_versions(page.links, search)
self._package_versions(page.iter_links(), search)
)

dependency_versions = self._package_versions(
Expand Down Expand Up @@ -706,24 +706,50 @@ def egg_info_matches(
return None


def _determine_base_url(document, page_url):
"""Determine the HTML document's base URL.
This looks for a ``<base>`` tag in the HTML document. If present, its href
attribute denotes the base URL of anchor tags in the document. If there is
no such tag (or if it does not have a valid href attribute), the HTML
file's URL is used as the base URL.
:param document: An HTML document representation. The current
implementation expects the result of ``html5lib.parse()``.
:param page_url: The URL of the HTML document.
"""
for base in document.findall(".//base"):
href = base.get("href")
if href is not None:
return href
return page_url


def _get_encoding_from_headers(headers):
"""Determine if we have any encoding information in our headers.
"""
if headers and "Content-Type" in headers:
content_type, params = cgi.parse_header(headers["Content-Type"])
if "charset" in params:
return params['charset']
return None


_CLEAN_LINK_RE = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)


def _clean_link(url):
"""Makes sure a link is fully encoded. That is, if a ' ' shows up in
the link, it will be rewritten to %20 (while not over-quoting
% or other characters)."""
return _CLEAN_LINK_RE.sub(lambda match: '%%%2x' % ord(match.group(0)), url)


class HTMLPage(object):
"""Represents one page, along with its URL"""

def __init__(self, content, url, headers=None):
    """Hold one fetched HTML page and eagerly parse its content.

    :param content: Raw page body as fetched — whatever
        ``html5lib.parse`` accepts (presumably bytes; TODO confirm
        against callers).
    :param url: The URL this page was fetched from.
    :param headers: Optional mapping of HTTP response headers; only
        Content-Type is consulted here, for its charset parameter.
    """
    # Determine if we have any encoding information in our headers
    encoding = None
    if headers and "Content-Type" in headers:
        content_type, params = cgi.parse_header(headers["Content-Type"])

        if "charset" in params:
            encoding = params['charset']

    self.content = content
    # Parse once up front; the declared charset (if any) guides decoding.
    # namespaceHTMLElements=False keeps element names plain (e.g. "a",
    # "base") so later .findall(".//a") queries need no namespace prefix.
    self.parsed = html5lib.parse(
        self.content,
        transport_encoding=encoding,
        namespaceHTMLElements=False,
    )
    self.url = url
    self.headers = headers

Expand Down Expand Up @@ -849,39 +875,22 @@ def _get_content_type(url, session):

return resp.headers.get("Content-Type", "")

@cached_property
def base_url(self):
    """Base URL for resolving relative anchors on this page.

    Returns the href of the first ``<base>`` tag that has one;
    otherwise falls back to the page's own URL. Cached because the
    parsed document never changes after construction.
    """
    bases = [
        x for x in self.parsed.findall(".//base")
        if x.get("href") is not None
    ]
    # NOTE(review): an empty href="" is falsy and falls through to
    # self.url — presumably intentional; confirm before relying on it.
    if bases and bases[0].get("href"):
        return bases[0].get("href")
    else:
        return self.url

def iter_links(self):
    """Yields all links in the page.

    Parses ``self.content`` on each call (using any charset declared in
    the response headers), resolves each anchor's href against the
    document's base URL, percent-encodes disallowed characters, and
    yields a :class:`Link` for every ``<a>`` that has a non-empty href.
    The optional ``data-requires-python`` attribute is HTML-unescaped
    and attached as the link's requires_python.
    """
    document = html5lib.parse(
        self.content,
        transport_encoding=_get_encoding_from_headers(self.headers),
        namespaceHTMLElements=False,
    )
    base_url = _determine_base_url(document, self.url)
    for anchor in document.findall(".//a"):
        if anchor.get("href"):
            href = anchor.get("href")
            # Resolve relative to <base> (or the page URL) and escape
            # characters that are not valid in URLs.
            url = _clean_link(urllib_parse.urljoin(base_url, href))
            pyrequire = anchor.get('data-requires-python')
            pyrequire = unescape(pyrequire) if pyrequire else None
            yield Link(url, self, requires_python=pyrequire)

_clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)

def clean_link(self, url):
"""Makes sure a link is fully encoded. That is, if a ' ' shows up in
the link, it will be rewritten to %20 (while not over-quoting
% or other characters)."""
return self._clean_re.sub(
lambda match: '%%%2x' % ord(match.group(0)), url)


Search = namedtuple('Search', 'supplied canonical formats')
"""Capture key aspects of a search.
Expand Down
10 changes: 7 additions & 3 deletions tests/unit/test_index.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os.path

import pytest
from pip._vendor import html5lib

from pip._internal.download import PipSession
from pip._internal.index import HTMLPage, Link, PackageFinder
from pip._internal.index import Link, PackageFinder, _determine_base_url


def test_sort_locations_file_expand_dir(data):
Expand Down Expand Up @@ -107,8 +108,11 @@ def test_fragments(self):
),
],
)
def test_determine_base_url(html, url, expected):
    """Parametrized check that _determine_base_url picks the first
    <base href> when present and otherwise falls back to the page URL.
    (html, url, expected come from the @pytest.mark.parametrize above.)
    """
    document = html5lib.parse(
        html, transport_encoding=None, namespaceHTMLElements=False,
    )
    assert _determine_base_url(document, url) == expected


class MockLogger(object):
Expand Down

0 comments on commit a5d4f88

Please sign in to comment.