Skip to content

Commit

Permalink
Merge pull request #5826 from uranusjr/htmlpage-extract-localize
Browse files Browse the repository at this point in the history
Turn attributes on HTMLPage into local variables and standalone functions
  • Loading branch information
pradyunsg committed Sep 29, 2018
2 parents 4cc5f37 + 6f85fcd commit a5d4f88
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 44 deletions.
1 change: 1 addition & 0 deletions news/5826.trivial
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Refactor HTMLPage to reduce attributes on it.
91 changes: 50 additions & 41 deletions src/pip/_internal/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from pip._internal.utils.deprecation import deprecated
from pip._internal.utils.logging import indent_log
from pip._internal.utils.misc import (
ARCHIVE_EXTENSIONS, SUPPORTED_EXTENSIONS, cached_property, normalize_path,
ARCHIVE_EXTENSIONS, SUPPORTED_EXTENSIONS, normalize_path,
remove_auth_from_url,
)
from pip._internal.utils.packaging import check_requires_python
Expand Down Expand Up @@ -415,7 +415,7 @@ def find_all_candidates(self, project_name):
logger.debug('Analyzing links from page %s', page.url)
with indent_log():
page_versions.extend(
self._package_versions(page.links, search)
self._package_versions(page.iter_links(), search)
)

dependency_versions = self._package_versions(
Expand Down Expand Up @@ -706,24 +706,50 @@ def egg_info_matches(
return None


def _determine_base_url(document, page_url):
"""Determine the HTML document's base URL.
This looks for a ``<base>`` tag in the HTML document. If present, its href
attribute denotes the base URL of anchor tags in the document. If there is
no such tag (or if it does not have a valid href attribute), the HTML
file's URL is used as the base URL.
:param document: An HTML document representation. The current
implementation expects the result of ``html5lib.parse()``.
:param page_url: The URL of the HTML document.
"""
for base in document.findall(".//base"):
href = base.get("href")
if href is not None:
return href
return page_url


def _get_encoding_from_headers(headers):
"""Determine if we have any encoding information in our headers.
"""
if headers and "Content-Type" in headers:
content_type, params = cgi.parse_header(headers["Content-Type"])
if "charset" in params:
return params['charset']
return None


_CLEAN_LINK_RE = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)


def _clean_link(url):
"""Makes sure a link is fully encoded. That is, if a ' ' shows up in
the link, it will be rewritten to %20 (while not over-quoting
% or other characters)."""
return _CLEAN_LINK_RE.sub(lambda match: '%%%2x' % ord(match.group(0)), url)


class HTMLPage(object):
"""Represents one page, along with its URL"""

def __init__(self, content, url, headers=None):
    """Hold one fetched HTML page and eagerly parse its content.

    :param content: Raw page body as fetched — whatever
        ``html5lib.parse`` accepts (presumably bytes; TODO confirm
        against callers).
    :param url: The URL this page was fetched from.
    :param headers: Optional mapping of HTTP response headers; only
        Content-Type is consulted here, for its charset parameter.
    """
    # Determine if we have any encoding information in our headers
    encoding = None
    if headers and "Content-Type" in headers:
        content_type, params = cgi.parse_header(headers["Content-Type"])

        if "charset" in params:
            encoding = params['charset']

    self.content = content
    # Parse once up front; the declared charset (if any) guides decoding.
    # namespaceHTMLElements=False keeps element names plain (e.g. "a",
    # "base") so later .findall(".//a") queries need no namespace prefix.
    self.parsed = html5lib.parse(
        self.content,
        transport_encoding=encoding,
        namespaceHTMLElements=False,
    )
    self.url = url
    self.headers = headers

Expand Down Expand Up @@ -849,39 +875,22 @@ def _get_content_type(url, session):

return resp.headers.get("Content-Type", "")

@cached_property
def base_url(self):
    """Base URL for resolving relative anchors on this page.

    Returns the href of the first ``<base>`` tag that has one;
    otherwise falls back to the page's own URL. Cached because the
    parsed document never changes after construction.
    """
    bases = [
        x for x in self.parsed.findall(".//base")
        if x.get("href") is not None
    ]
    # NOTE(review): an empty href="" is falsy and falls through to
    # self.url — presumably intentional; confirm before relying on it.
    if bases and bases[0].get("href"):
        return bases[0].get("href")
    else:
        return self.url

def iter_links(self):
    """Yields all links in the page.

    Parses ``self.content`` on each call (using any charset declared in
    the response headers), resolves each anchor's href against the
    document's base URL, percent-encodes disallowed characters, and
    yields a :class:`Link` for every ``<a>`` that has a non-empty href.
    The optional ``data-requires-python`` attribute is HTML-unescaped
    and attached as the link's requires_python.
    """
    document = html5lib.parse(
        self.content,
        transport_encoding=_get_encoding_from_headers(self.headers),
        namespaceHTMLElements=False,
    )
    base_url = _determine_base_url(document, self.url)
    for anchor in document.findall(".//a"):
        if anchor.get("href"):
            href = anchor.get("href")
            # Resolve relative to <base> (or the page URL) and escape
            # characters that are not valid in URLs.
            url = _clean_link(urllib_parse.urljoin(base_url, href))
            pyrequire = anchor.get('data-requires-python')
            pyrequire = unescape(pyrequire) if pyrequire else None
            yield Link(url, self, requires_python=pyrequire)

_clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)

def clean_link(self, url):
"""Makes sure a link is fully encoded. That is, if a ' ' shows up in
the link, it will be rewritten to %20 (while not over-quoting
% or other characters)."""
return self._clean_re.sub(
lambda match: '%%%2x' % ord(match.group(0)), url)


Search = namedtuple('Search', 'supplied canonical formats')
"""Capture key aspects of a search.
Expand Down
10 changes: 7 additions & 3 deletions tests/unit/test_index.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os.path

import pytest
from pip._vendor import html5lib

from pip._internal.download import PipSession
from pip._internal.index import HTMLPage, Link, PackageFinder
from pip._internal.index import Link, PackageFinder, _determine_base_url


def test_sort_locations_file_expand_dir(data):
Expand Down Expand Up @@ -107,8 +108,11 @@ def test_fragments(self):
),
],
)
def test_determine_base_url(html, url, expected):
    """Parametrized check that _determine_base_url picks the first
    <base href> when present and otherwise falls back to the page URL.
    (html, url, expected come from the @pytest.mark.parametrize above.)
    """
    document = html5lib.parse(
        html, transport_encoding=None, namespaceHTMLElements=False,
    )
    assert _determine_base_url(document, url) == expected


class MockLogger(object):
Expand Down

0 comments on commit a5d4f88

Please sign in to comment.