diff --git a/tests/test_url.py b/tests/test_url.py
index 7ac5d092..bfdd5bd6 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -4,7 +4,10 @@ import unittest
 from w3lib.url import (is_url, safe_url_string, safe_download_url,
     url_query_parameter, add_or_replace_parameter, url_query_cleaner,
-    file_uri_to_path, path_to_file_uri, any_to_uri, urljoin_rfc)
+    file_uri_to_path, path_to_file_uri, any_to_uri, urljoin_rfc,
+    canonicalize_url, parse_url)
+from six.moves.urllib.parse import urlparse
+
 
 
 class UrlTests(unittest.TestCase):
 
@@ -347,6 +350,209 @@ def test_urljoin_rfc_deprecated(self):
         self.assertEqual(jurl, b"http://www.example.com/test")
 
 
+class CanonicalizeUrlTest(unittest.TestCase):
+
+    def test_canonicalize_url(self):
+        # simplest case
+        self.assertEqual(canonicalize_url("http://www.example.com/"),
+                         "http://www.example.com/")
+
+    def test_return_str(self):
+        assert isinstance(canonicalize_url(u"http://www.example.com"), str)
+        assert isinstance(canonicalize_url(b"http://www.example.com"), str)
+
+    def test_append_missing_path(self):
+        self.assertEqual(canonicalize_url("http://www.example.com"),
+                         "http://www.example.com/")
+
+    def test_typical_usage(self):
+        self.assertEqual(canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
+                         "http://www.example.com/do?a=1&b=2&c=3")
+        self.assertEqual(canonicalize_url("http://www.example.com/do?c=1&b=2&a=3"),
+                         "http://www.example.com/do?a=3&b=2&c=1")
+        self.assertEqual(canonicalize_url("http://www.example.com/do?&a=1"),
+                         "http://www.example.com/do?a=1")
+
+    def test_sorting(self):
+        self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
+                         "http://www.example.com/do?a=50&b=2&b=5&c=3")
+
+    def test_keep_blank_values(self):
+        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2", keep_blank_values=False),
+                         "http://www.example.com/do?a=2")
+        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2"),
+                         "http://www.example.com/do?a=2&b=")
+        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2", keep_blank_values=False),
+                         "http://www.example.com/do?a=2")
+        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2"),
+                         "http://www.example.com/do?a=2&b=&c=")
+
+        self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'),
+                         'http://www.example.com/do?1750%2C4=')
+
+    def test_spaces(self):
+        self.assertEqual(canonicalize_url("http://www.example.com/do?q=a space&a=1"),
+                         "http://www.example.com/do?a=1&q=a+space")
+        self.assertEqual(canonicalize_url("http://www.example.com/do?q=a+space&a=1"),
+                         "http://www.example.com/do?a=1&q=a+space")
+        self.assertEqual(canonicalize_url("http://www.example.com/do?q=a%20space&a=1"),
+                         "http://www.example.com/do?a=1&q=a+space")
+
+    def test_canonicalize_url_unicode_path(self):
+        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé"),
+                         "http://www.example.com/r%C3%A9sum%C3%A9")
+
+    def test_canonicalize_url_unicode_query_string(self):
+        # default encoding for path and query is UTF-8
+        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé"),
+                         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
+
+        # the passed encoding only affects the query string
+        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé", encoding='latin1'),
+                         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%E9sum%E9")
+
+        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='cp1251'),
+                         "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%EE%F1%F1%E8%FF")
+
+    def test_canonicalize_url_unicode_query_string_wrong_encoding(self):
+        # when encoding with the passed (wrong) encoding fails,
+        # fall back to UTF-8
+        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?currency=€", encoding='latin1'),
+                         "http://www.example.com/r%C3%A9sum%C3%A9?currency=%E2%82%AC")
+
+        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='latin1'),
+                         "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F")
+
+    def test_normalize_percent_encoding_in_paths(self):
+        self.assertEqual(canonicalize_url("http://www.example.com/r%c3%a9sum%c3%a9"),
+                         "http://www.example.com/r%C3%A9sum%C3%A9")
+
+        # non-UTF-8 encoded sequences should be kept untouched, only upper-cased
+        # 'latin1'-encoded sequence in path
+        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do"),
+                         "http://www.example.com/a%A3do")
+
+        # 'latin1'-encoded path, UTF-8 encoded query string
+        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9"),
+                         "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
+
+        # 'latin1'-encoded path and query string
+        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%e9sum%e9"),
+                         "http://www.example.com/a%A3do?q=r%E9sum%E9")
+
+    def test_normalize_percent_encoding_in_query_arguments(self):
+        self.assertEqual(canonicalize_url("http://www.example.com/do?k=b%a3"),
+                         "http://www.example.com/do?k=b%A3")
+
+        self.assertEqual(canonicalize_url("http://www.example.com/do?k=r%c3%a9sum%c3%a9"),
+                         "http://www.example.com/do?k=r%C3%A9sum%C3%A9")
+
+    def test_non_ascii_percent_encoding_in_paths(self):
+        self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
+                         "http://www.example.com/a%20do?a=1")
+        self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"),
+                         "http://www.example.com/a%20%20do?a=1")
+        self.assertEqual(canonicalize_url(u"http://www.example.com/a do£.html?a=1"),
+                         "http://www.example.com/a%20do%C2%A3.html?a=1")
+        self.assertEqual(canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"),
+                         "http://www.example.com/a%20do%C2%A3.html?a=1")
+
+    def test_non_ascii_percent_encoding_in_query_arguments(self):
+        self.assertEqual(canonicalize_url(u"http://www.example.com/do?price=£500&a=5&z=3"),
+                         u"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
+        self.assertEqual(canonicalize_url(b"http://www.example.com/do?price=\xc2\xa3500&a=5&z=3"),
+                         "http://www.example.com/do?a=5&price=%C2%A3500&z=3")
+        self.assertEqual(canonicalize_url(b"http://www.example.com/do?price(\xc2\xa3)=500&a=1"),
+                         "http://www.example.com/do?a=1&price%28%C2%A3%29=500")
+
+    def test_urls_with_auth_and_ports(self):
+        self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com:81/do?now=1"),
+                         u"http://user:pass@www.example.com:81/do?now=1")
+
+    def test_remove_fragments(self):
+        self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag"),
+                         u"http://user:pass@www.example.com/do?a=1")
+        self.assertEqual(canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag", keep_fragments=True),
+                         u"http://user:pass@www.example.com/do?a=1#frag")
+
+    def test_dont_convert_safe_characters(self):
+        # don't convert safe characters to their percent-encoded representation
+        self.assertEqual(canonicalize_url(
+            "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
+            "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")
+
+    def test_safe_characters_unicode(self):
+        # urllib.quote uses a mapping cache of encoded characters. When
+        # quoting an already percent-encoded URL, it would fail if that URL
+        # was not percent-encoded as UTF-8; that is why canonicalize_url
+        # must always convert the URLs to string first. The following test
+        # asserts that functionality.
+        self.assertEqual(canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
+                         'http://www.example.com/caf%E9-con-leche.htm')
+
+    def test_domains_are_case_insensitive(self):
+        self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
+                         "http://www.example.com/")
+
+    def test_canonicalize_idns(self):
+        self.assertEqual(canonicalize_url(u'http://www.bücher.de?q=bücher'),
+                         'http://www.xn--bcher-kva.de/?q=b%C3%BCcher')
+        # Japanese (+ reordering query parameters)
+        self.assertEqual(canonicalize_url(u'http://はじめよう.みんな/?query=サ&maxResults=5'),
+                         'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5')
+
+    def test_quoted_slash_and_question_sign(self):
+        self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
+                         "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
+        self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
+                         "http://foo.com/AC%2FDC/")
+
+    def test_canonicalize_urlparsed(self):
+        # canonicalize_url() can be passed an already urlparse'd URL
+        self.assertEqual(canonicalize_url(urlparse(u"http://www.example.com/résumé?q=résumé")),
+                         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
+        self.assertEqual(canonicalize_url(urlparse('http://www.example.com/caf%e9-con-leche.htm')),
+                         'http://www.example.com/caf%E9-con-leche.htm')
+        self.assertEqual(canonicalize_url(urlparse("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
+                         "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
+
+    def test_canonicalize_parse_url(self):
+        # parse_url() wraps urlparse and is used in link extractors
+        self.assertEqual(canonicalize_url(parse_url(u"http://www.example.com/résumé?q=résumé")),
+                         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
+        self.assertEqual(canonicalize_url(parse_url('http://www.example.com/caf%e9-con-leche.htm')),
+                         'http://www.example.com/caf%E9-con-leche.htm')
+        self.assertEqual(canonicalize_url(parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
+                         "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
+
+    def test_canonicalize_url_idempotence(self):
+        for url, enc in [(u'http://www.bücher.de/résumé?q=résumé', 'utf8'),
+                         (u'http://www.example.com/résumé?q=résumé', 'latin1'),
+                         (u'http://www.example.com/résumé?country=Россия', 'cp1251'),
+                         (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'iso2022jp')]:
+            canonicalized = canonicalize_url(url, encoding=enc)
+
+            # if we canonicalize again, we get the same result
+            self.assertEqual(canonicalize_url(canonicalized, encoding=enc), canonicalized)
+
+            # without an encoding, an already canonicalized URL is canonicalized identically
+            self.assertEqual(canonicalize_url(canonicalized), canonicalized)
+
+    def test_canonicalize_url_idna_exceptions(self):
+        # missing DNS label
+        self.assertEqual(
+            canonicalize_url(u"http://.example.com/résumé?q=résumé"),
+            "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
+
+        # DNS label too long
+        self.assertEqual(
+            canonicalize_url(
+                u"http://www.{label}.com/résumé?q=résumé".format(
+                    label=u"example"*11)),
+            "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
+                label=u"example"*11))
+
+
 if __name__ == "__main__":
     unittest.main()
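A minimal usage sketch of the API the tests above pin down; the expected outputs are copied from the assertions, and only the `canonicalize_url` added by this patch is assumed:

    from w3lib.url import canonicalize_url

    # query arguments are sorted by key, then by value
    canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50")
    # -> 'http://www.example.com/do?a=50&b=2&b=5&c=3'

    # blank values are kept by default, dropped with keep_blank_values=False
    canonicalize_url("http://www.example.com/do?b=&a=2")
    # -> 'http://www.example.com/do?a=2&b='
    canonicalize_url("http://www.example.com/do?b=&a=2", keep_blank_values=False)
    # -> 'http://www.example.com/do?a=2'

    # fragments are removed unless keep_fragments=True is passed
    canonicalize_url(u"http://user:pass@www.example.com/do?a=1#frag")
    # -> 'http://user:pass@www.example.com/do?a=1'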
diff --git a/w3lib/url.py b/w3lib/url.py
index c3d84664..1bd8c7e5 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -10,7 +10,8 @@ import six
 from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit, urldefrag,
                                     urlencode, urlparse,
-                                    quote, parse_qs, parse_qsl)
+                                    quote, parse_qs, parse_qsl,
+                                    ParseResult, unquote, urlunparse)
 from six.moves.urllib.request import pathname2url, url2pathname
 
 from w3lib.util import to_bytes, to_native_str, to_unicode
 
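An aside on the idna codec used below: Python's built-in implementation raises UnicodeError for empty DNS labels (e.g. http://.example.com) and for labels longer than 63 characters, which is why the `_safe_ParseResult()` helper added in the next hunk guards `.encode('idna')` with try/except. A quick standard-library sketch of both failure modes:

    # works: each label is non-empty and at most 63 characters
    u"www.bücher.de".encode("idna")             # -> b'www.xn--bcher-kva.de'

    # missing first label: raises UnicodeError ("label empty or too long")
    u".example.com".encode("idna")

    # single label longer than 63 characters: same UnicodeError
    (u"example" * 11 + u".com").encode("idna")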
@@ -279,3 +280,174 @@ def any_to_uri(uri_or_path):
 
            # this last one is deprecated ; include it to be on the safe side
            "urljoin_rfc"]
+
+
+def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
+    # IDNA encoding can fail for too long labels (>63 characters)
+    # or missing labels (e.g. http://.example.com)
+    try:
+        netloc = parts.netloc.encode('idna')
+    except UnicodeError:
+        netloc = parts.netloc
+
+    return (
+        to_native_str(parts.scheme),
+        to_native_str(netloc),
+
+        # default encoding for path component SHOULD be UTF-8
+        quote(to_bytes(parts.path, path_encoding), _safe_chars),
+        quote(to_bytes(parts.params, path_encoding), _safe_chars),
+
+        # encoding of query and fragment follows page encoding
+        # or form-charset (if known and passed)
+        quote(to_bytes(parts.query, encoding), _safe_chars),
+        quote(to_bytes(parts.fragment, encoding), _safe_chars)
+    )
+
+
+def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
+                     encoding=None):
+    """Canonicalize the given url by applying the following procedures:
+
+    - sort query arguments, first by key, then by value
+    - percent encode paths ; non-ASCII characters are percent-encoded
+      using UTF-8 (RFC-3986)
+    - percent encode query arguments ; non-ASCII characters are percent-encoded
+      using the passed `encoding` (UTF-8 by default)
+    - normalize all spaces (in query arguments) to '+' (plus symbol)
+    - normalize percent encodings case (%2f -> %2F)
+    - remove query arguments with blank values (unless `keep_blank_values` is True)
+    - remove fragments (unless `keep_fragments` is True)
+
+    The url passed can be bytes or unicode, while the url returned is
+    always a native str (bytes in Python 2, unicode in Python 3).
+
+    For examples see the tests in tests/test_url.py.
+    """
+    # If the supplied `encoding` is not compatible with all characters in `url`,
+    # fall back to UTF-8 as a safety net.
+    # UTF-8 can handle all Unicode characters,
+    # so we should be covered regarding URL normalization,
+    # even if the result is not the exact URL the remote website expects.
+    try:
+        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
+            parse_url(url), encoding=encoding)
+    except UnicodeEncodeError:
+        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
+            parse_url(url), encoding='utf8')
+
+    # 1. decode query-string as UTF-8 (or keep raw bytes),
+    #    sort values,
+    #    and percent-encode them back
+    if six.PY2:
+        keyvals = parse_qsl(query, keep_blank_values)
+    else:
+        # Python 3's urllib.parse.parse_qsl does not work as wanted
+        # for percent-encoded characters that do not match the passed
+        # encoding: they get lost.
+        #
+        # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
+        # (i.e. with 'REPLACEMENT CHARACTER' (U+FFFD),
+        # instead of the \xa3 that you get with Python 2's parse_qsl)
+        #
+        # what we want here is to keep the raw bytes, and percent-encode
+        # them back, so as to preserve whatever encoding was originally used.
+        #
+        # See https://tools.ietf.org/html/rfc3987#section-6.4:
+        #
+        # For example, it is possible to have a URI reference of
+        # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
+        # document name is encoded in iso-8859-1 based on server settings, but
+        # where the fragment identifier is encoded in UTF-8 according to
+        # [XPointer]. The IRI corresponding to the above URI would be (in XML
+        # notation)
+        # "http://www.example.org/r%E9sum%E9.xml#résumé".
+        # Similar considerations apply to query parts.  The functionality of
+        # IRIs (namely, to be able to include non-ASCII characters) can only be
+        # used if the query part is encoded in UTF-8.
+        keyvals = parse_qsl_to_bytes(query, keep_blank_values)
+    keyvals.sort()
+    query = urlencode(keyvals)
+
+    # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
+    #    and percent-encode path again (this normalizes to upper-case %XX)
+    uqp = _unquotepath(path)
+    path = quote(uqp, _safe_chars) or '/'
+
+    fragment = '' if not keep_fragments else fragment
+
+    # every part should be safe already
+    return urlunparse((scheme, netloc.lower(), path, params, query, fragment))
+
+
+def _unquotepath(path):
+    for reserved in ('2f', '2F', '3f', '3F'):
+        path = path.replace('%' + reserved, '%25' + reserved.upper())
+
+    if six.PY2:
+        # in Python 2, '%a3' becomes '\xa3', which is what we want
+        return unquote(path)
+    else:
+        # in Python 3, the standard lib's unquote() does not work for
+        # non-UTF-8 percent-escaped characters: they get lost.
+        # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
+        #
+        # unquote_to_bytes() returns raw bytes instead
+        return unquote_to_bytes(path)
+
+
+def parse_url(url, encoding=None):
+    """Return urlparsed url from the given argument (which could be an already
+    parsed url)
+    """
+    if isinstance(url, ParseResult):
+        return url
+    return urlparse(to_unicode(url, encoding))
+
+
+if not six.PY2:
+    from urllib.parse import _coerce_args, unquote_to_bytes
+
+    def parse_qsl_to_bytes(qs, keep_blank_values=False):
+        """Parse a query given as a string argument.
+
+        Data are returned as a list of name, value pairs as bytes.
+
+        Arguments:
+
+        qs: percent-encoded query string to be parsed
+
+        keep_blank_values: flag indicating whether blank values in
+            percent-encoded queries should be treated as blank strings.  A
+            true value indicates that blanks should be retained as blank
+            strings.  The default false value indicates that blank values
+            are to be ignored and treated as if they were not included.
+
+        """
+        # This code is the same as Python 3's parse_qsl()
+        # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
+        # except that the unquote(s, encoding, errors) calls are
+        # replaced with unquote_to_bytes(s)
+        qs, _coerce_result = _coerce_args(qs)
+        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
+        r = []
+        for name_value in pairs:
+            if not name_value:
+                continue
+            nv = name_value.split('=', 1)
+            if len(nv) != 2:
+                # Handle case of a control-name with no equal sign
+                if keep_blank_values:
+                    nv.append('')
+                else:
+                    continue
+            if len(nv[1]) or keep_blank_values:
+                name = nv[0].replace('+', ' ')
+                name = unquote_to_bytes(name)
+                name = _coerce_result(name)
+                value = nv[1].replace('+', ' ')
+                value = unquote_to_bytes(value)
+                value = _coerce_result(value)
+                r.append((name, value))
+        return r
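An aside illustrating the Python 3 behaviour that motivates `parse_qsl_to_bytes()` and the `unquote_to_bytes()` call in `_unquotepath()`; the 'q=b%a3' case is the one cited in the comments above, standard library only:

    from urllib.parse import parse_qsl, unquote, unquote_to_bytes

    # parse_qsl()/unquote() decode percent-escapes as UTF-8 and substitute
    # U+FFFD for byte sequences that are not valid UTF-8: the raw byte is lost
    parse_qsl('q=b%a3')        # -> [('q', 'b\ufffd')]
    unquote('%a3')             # -> '\ufffd'

    # unquote_to_bytes() keeps the raw byte, so whatever encoding was
    # originally used can be preserved and percent-encoded back as-is
    unquote_to_bytes('b%a3')   # -> b'b\xa3'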