From 24dbe887e2c27146cd39327761302486dc06f859 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Fri, 26 Feb 2016 21:14:59 +0300 Subject: [PATCH 1/5] Add tests by @redapple, do urljoin on unicode strings. --- tests/test_html.py | 22 ++++++++++++++++++++++ w3lib/html.py | 6 +++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/tests/test_html.py b/tests/test_html.py index 1c9a0035..5906f59c 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -286,6 +286,28 @@ def test_tag_name(self): """ self.assertEqual(get_base_url(text, baseurl), 'https://example.org') + def test_get_base_url_utf8(self): + baseurl = u'https://example.org' + + text = u""" + + Dummy + blahablsdfsal& + """ + self.assertEqual(get_base_url(text, baseurl), + 'http://example.org/snowman%E2%8D%A8') + + def test_get_base_url_latin1(self): + baseurl = u'https://example.org' + + text = u""" + + Dummy + blahablsdfsal& + """ + self.assertEqual(get_base_url(text, baseurl, encoding='latin-1'), + 'http://example.org/sterling%A3') + class GetMetaRefreshTest(unittest.TestCase): def test_get_meta_refresh(self): diff --git a/w3lib/html.py b/w3lib/html.py index fbfc1ad9..23d99b16 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -281,11 +281,11 @@ def get_base_url(text, baseurl='', encoding='utf-8'): """ text = str_to_unicode(text, encoding) - baseurl = unicode_to_str(baseurl, encoding) m = _baseurl_re.search(text) if m: - baseurl = moves.urllib.parse.urljoin(baseurl, m.group(1).encode(encoding)) - return safe_url_string(baseurl) + baseurl = str_to_unicode(baseurl, encoding) + baseurl = moves.urllib.parse.urljoin(baseurl, m.group(1)) + return safe_url_string(unicode_to_str(baseurl, encoding)) def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')): """Return the http-equiv parameter of the HTML meta element from the given From 2c00a144c8a392ac14920f9ccb8abcf7b9044b36 Mon Sep 17 00:00:00 2001 From: Paul Tremberth Date: Tue, 22 Mar 2016 22:52:45 +0100 Subject: [PATCH 2/5] Fix tests on non-ASCII characters in URL + new safe_url_string() --- tests/test_html.py | 29 ++++++++++++++++-- tests/test_url.py | 75 ++++++++++++++++++++++++++++++++++++++++++---- w3lib/html.py | 35 ++++++++++++---------- w3lib/url.py | 65 +++++++++++++++++++++++++++------------- w3lib/util.py | 32 ++++++++++++++++++++ 5 files changed, 192 insertions(+), 44 deletions(-) diff --git a/tests/test_html.py b/tests/test_html.py index 5906f59c..68133cb5 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -236,6 +236,7 @@ def test_unquote_markup(self): class GetBaseUrlTest(unittest.TestCase): + def test_get_base_url(self): baseurl = u'https://example.org' @@ -298,6 +299,8 @@ def test_get_base_url_utf8(self): 'http://example.org/snowman%E2%8D%A8') def test_get_base_url_latin1(self): + # page encoding does not affect URL path encoding before percent-escaping + # we should still use UTF-8 by default baseurl = u'https://example.org' text = u""" @@ -306,7 +309,19 @@ def test_get_base_url_latin1(self): blahablsdfsal& """ self.assertEqual(get_base_url(text, baseurl, encoding='latin-1'), - 'http://example.org/sterling%A3') + 'http://example.org/sterling%C2%A3') + + def test_get_base_url_latin1_percent(self): + # non-UTF-8 percent-encoded characters sequence are left untouched + baseurl = u'https://example.org' + + text = u""" + + Dummy + blahablsdfsal& + """ + self.assertEqual(get_base_url(text, baseurl), + 'http://example.org/sterling%a3') class GetMetaRefreshTest(unittest.TestCase): @@ -357,10 +372,18 @@ def test_nonascii_url_utf8(self): self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://example.com/to%C2%A3')) def test_nonascii_url_latin1(self): - # non-ascii chars in the url (latin1) + # non-ascii chars in the url path (latin1) + # should end up UTF-8 encoded anyway baseurl = 'http://example.com' body = b"""""" - self.assertEqual(get_meta_refresh(body, baseurl, 'latin1'), (3, 'http://example.com/to%A3')) + self.assertEqual(get_meta_refresh(body, baseurl, 'latin1'), (3, 'http://example.com/to%C2%A3')) + + def test_nonascii_url_latin1_query(self): + # non-ascii chars in the url path and query (latin1) + # only query part should be kept latin1 encoded before percent escaping + baseurl = 'http://example.com' + body = b"""""" + self.assertEqual(get_meta_refresh(body, baseurl, 'latin1'), (3, 'http://example.com/to%C2%A3?unit=%B5')) def test_commented_meta_refresh(self): # html commented meta refresh header must not directed diff --git a/tests/test_url.py b/tests/test_url.py index 77641203..f0807534 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from __future__ import absolute_import import os import unittest @@ -14,9 +15,13 @@ def test_safe_url_string(self): '%E8%8D%89%E8%96%99%20%E7%B4%A0%E5%AD%90') self.assertEqual(safe_url_string(motoko), safe_url_string(safe_url_string(motoko))) - self.assertEqual(safe_url_string(u'\xa9'), # copyright symbol + self.assertEqual(safe_url_string(u'©'), # copyright symbol '%C2%A9') - self.assertEqual(safe_url_string(u'\xa9', 'iso-8859-1'), + # page-encoding does not affect URL path + self.assertEqual(safe_url_string(u'©', 'iso-8859-1'), + '%C2%A9') + # path_encoding does + self.assertEqual(safe_url_string(u'©', path_encoding='iso-8859-1'), '%A9') self.assertEqual(safe_url_string("http://www.example.org/"), 'http://www.example.org/') @@ -31,16 +36,76 @@ def test_safe_url_string(self): self.assertEqual(safe_url_string("http://www.example.com/Brochures_&_Paint_Cards&PageSize=200"), "http://www.example.com/Brochures_&_Paint_Cards&PageSize=200") - safeurl = safe_url_string(u"http://www.example.com/\xa3", encoding='latin-1') + # page-encoding does not affect URL path + # we still end up UTF-8 encoding characters before percent-escaping + safeurl = safe_url_string(u"http://www.example.com/£") self.assertTrue(isinstance(safeurl, str)) - self.assertEqual(safeurl, "http://www.example.com/%A3") + self.assertEqual(safeurl, "http://www.example.com/%C2%A3") + + safeurl = safe_url_string(u"http://www.example.com/£", encoding='utf-8') + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%C2%A3") - safeurl = safe_url_string(u"http://www.example.com/\xa3", encoding='utf-8') + safeurl = safe_url_string(u"http://www.example.com/£", encoding='latin-1') self.assertTrue(isinstance(safeurl, str)) self.assertEqual(safeurl, "http://www.example.com/%C2%A3") + safeurl = safe_url_string(u"http://www.example.com/£", path_encoding='latin-1') + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%A3") + self.assertTrue(isinstance(safe_url_string(b'http://example.com/'), str)) + def test_safe_url_string_with_query(self): + safeurl = safe_url_string(u"http://www.example.com/£?unit=µ") + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5") + + safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", encoding='utf-8') + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5") + + safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", encoding='latin-1') + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5") + + safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", path_encoding='latin-1') + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%C2%B5") + + safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", encoding='latin-1', path_encoding='latin-1') + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%B5") + + def test_safe_url_string_misc(self): + # mixing Unicode and percent-escaped sequences + safeurl = safe_url_string(u"http://www.example.com/£?unit=%C2%B5") + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5") + + safeurl = safe_url_string(u"http://www.example.com/%C2%A3?unit=µ") + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5") + + def test_safe_url_string_bytes_input(self): + safeurl = safe_url_string(b"http://www.example.com/") + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/") + + # bytes input is assumed to be UTF-8 + safeurl = safe_url_string(b"http://www.example.com/\xc2\xb5") + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%C2%B5") + + # page-encoding encoded bytes still end up as UTF-8 sequences in path + safeurl = safe_url_string(b"http://www.example.com/\xb5", encoding='latin1') + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%C2%B5") + + safeurl = safe_url_string(b"http://www.example.com/\xa3?unit=\xb5", encoding='latin1') + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5") + def test_safe_download_url(self): self.assertEqual(safe_download_url('http://www.example.org/../'), 'http://www.example.org/') diff --git a/w3lib/html.py b/w3lib/html.py index 23d99b16..24d01a55 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -8,7 +8,7 @@ import six from six import moves -from w3lib.util import str_to_unicode, unicode_to_str +from w3lib.util import to_bytes, to_unicode from w3lib.url import safe_url_string _ent_re = re.compile(r'&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)', re.IGNORECASE) @@ -91,10 +91,10 @@ def convert_entity(m): return u'' if remove_illegal and groups.get('semicolon') else m.group(0) - return _ent_re.sub(convert_entity, str_to_unicode(text, encoding)) + return _ent_re.sub(convert_entity, to_unicode(text, encoding)) def has_entities(text, encoding=None): - return bool(_ent_re.search(str_to_unicode(text, encoding))) + return bool(_ent_re.search(to_unicode(text, encoding))) def replace_tags(text, token='', encoding=None): """Replace all markup tags found in the given `text` by the given token. @@ -116,7 +116,7 @@ def replace_tags(text, token='', encoding=None): """ - return _tag_re.sub(token, str_to_unicode(text, encoding)) + return _tag_re.sub(token, to_unicode(text, encoding)) _REMOVECOMMENTS_RE = re.compile(u'', re.DOTALL) @@ -130,7 +130,7 @@ def remove_comments(text, encoding=None): """ - text = str_to_unicode(text, encoding) + text = to_unicode(text, encoding) return _REMOVECOMMENTS_RE.sub(u'', text) def remove_tags(text, which_ones=(), keep=(), encoding=None): @@ -199,7 +199,7 @@ def remove_tag(m): regex = '/]+).*?>' retags = re.compile(regex, re.DOTALL | re.IGNORECASE) - return retags.sub(remove_tag, str_to_unicode(text, encoding)) + return retags.sub(remove_tag, to_unicode(text, encoding)) def remove_tags_with_content(text, which_ones=(), encoding=None): """Remove tags and their content. @@ -215,7 +215,7 @@ def remove_tags_with_content(text, which_ones=(), encoding=None): """ - text = str_to_unicode(text, encoding) + text = to_unicode(text, encoding) if which_ones: tags = '|'.join([r'<%s.*?|<%s\s*/>' % (tag, tag, tag) for tag in which_ones]) retags = re.compile(tags, re.DOTALL | re.IGNORECASE) @@ -235,9 +235,9 @@ def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \ """ - text = str_to_unicode(text, encoding) + text = to_unicode(text, encoding) for ec in which_ones: - text = text.replace(ec, str_to_unicode(replace_by, encoding)) + text = text.replace(ec, to_unicode(replace_by, encoding)) return text def unquote_markup(text, keep=(), remove_illegal=True, encoding=None): @@ -261,7 +261,7 @@ def _get_fragments(txt, pattern): offset = match_e yield txt[offset:] - text = str_to_unicode(text, encoding) + text = to_unicode(text, encoding) ret_text = u'' for fragment in _get_fragments(text, _cdata_re): if isinstance(fragment, six.string_types): @@ -280,12 +280,15 @@ def get_base_url(text, baseurl='', encoding='utf-8'): """ - text = str_to_unicode(text, encoding) + text = to_unicode(text, encoding) m = _baseurl_re.search(text) if m: - baseurl = str_to_unicode(baseurl, encoding) - baseurl = moves.urllib.parse.urljoin(baseurl, m.group(1)) - return safe_url_string(unicode_to_str(baseurl, encoding)) + return moves.urllib.parse.urljoin( + safe_url_string(baseurl), + safe_url_string(m.group(1), encoding=encoding) + ) + else: + return safe_url_string(baseurl) def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')): """Return the http-equiv parameter of the HTML meta element from the given @@ -298,9 +301,9 @@ def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', """ if six.PY2: - baseurl = unicode_to_str(baseurl, encoding) + baseurl = to_bytes(baseurl, encoding) try: - text = str_to_unicode(text, encoding) + text = to_unicode(text, encoding) except UnicodeDecodeError: print(text) raise diff --git a/w3lib/url.py b/w3lib/url.py index 43932405..c6af1530 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -7,8 +7,11 @@ import posixpath import warnings import six -from six import moves -from w3lib.util import unicode_to_str +from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit, + urldefrag, urlencode, urlparse, + quote, parse_qs, parse_qsl) +from six.moves.urllib.request import pathname2url, url2pathname +from w3lib.util import to_bytes, to_native_str, to_unicode # Python 2.x urllib.always_safe become private in Python 3.x; # its content is copied here @@ -47,13 +50,13 @@ def urljoin_rfc(base, ref, encoding='utf-8'): str_base = unicode_to_str(base, encoding) str_ref = unicode_to_str(ref, encoding) - return moves.urllib.parse.urljoin(str_base, str_ref) + return urljoin(str_base, str_ref) _reserved = b';/?:@&=+$|,#' # RFC 3986 (Generic Syntax) _unreserved_marks = b"-_.!~*'()" # RFC 3986 sec 2.3 _safe_chars = _ALWAYS_SAFE_BYTES + b'%' + _reserved + _unreserved_marks -def safe_url_string(url, encoding='utf8'): +def safe_url_string(url, encoding='utf8', path_encoding='utf8'): """Convert the given url into a legal URL by escaping unsafe characters according to RFC-3986. @@ -67,9 +70,31 @@ def safe_url_string(url, encoding='utf8'): Always returns a str. """ - s = unicode_to_str(url, encoding) - return moves.urllib.parse.quote(s, _safe_chars) - + # Python3's urlsplit() chokes on bytes input with non-ASCII chars, + # so let's decode (to Unicode) using page encoding. + # + # it is assumed that a raw bytes input comes from the page + # corresponding to the encoding + # + # Note: if this assumption is wrong, this will fail; + # in the general case, users are required to use Unicode + # or safe ASCII bytes input + parts = urlsplit(to_unicode(url, encoding=encoding)) + + # quote() in Python2 return type follows input type; + # quote() in Python3 always returns Unicode (native str) + return urlunsplit(( + to_native_str(parts.scheme), + to_native_str(parts.netloc), + + # default encoding for path component SHOULD be UTF-8 + quote(to_bytes(parts.path, path_encoding), _safe_chars), + + # encoding of query and fragment follows page encoding + # or form-charset (if known and passed) + quote(to_bytes(parts.query, encoding), _safe_chars), + quote(to_bytes(parts.fragment, encoding), _safe_chars), + )) _parent_dirs = re.compile(r'/?(\.\./)+') @@ -82,14 +107,14 @@ def safe_download_url(url): to be within the document root. """ safe_url = safe_url_string(url) - scheme, netloc, path, query, _ = moves.urllib.parse.urlsplit(safe_url) + scheme, netloc, path, query, _ = urlsplit(safe_url) if path: path = _parent_dirs.sub('', posixpath.normpath(path)) if url.endswith('/') and not path.endswith('/'): path += '/' else: path = '/' - return moves.urllib.parse.urlunsplit((scheme, netloc, path, query, '')) + return urlunsplit((scheme, netloc, path, query, '')) def is_url(text): return text.partition("://")[0] in ('file', 'http', 'https') @@ -123,8 +148,8 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0): """ - queryparams = moves.urllib.parse.parse_qs( - moves.urllib.parse.urlsplit(str(url))[3], + queryparams = parse_qs( + urlsplit(str(url))[3], keep_blank_values=keep_blank_values ) return queryparams.get(parameter, [default])[0] @@ -157,7 +182,7 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u if isinstance(parameterlist, (six.text_type, bytes)): parameterlist = [parameterlist] - url = moves.urllib.parse.urldefrag(url)[0] + url = urldefrag(url)[0] base, _, query = url.partition('?') seen = set() querylist = [] @@ -187,8 +212,8 @@ def add_or_replace_parameter(url, name, new_value): >>> """ - parsed = moves.urllib.parse.urlsplit(url) - args = moves.urllib.parse.parse_qsl(parsed.query, keep_blank_values=True) + parsed = urlsplit(url) + args = parse_qsl(parsed.query, keep_blank_values=True) new_args = [] found = False @@ -202,15 +227,15 @@ def add_or_replace_parameter(url, name, new_value): if not found: new_args.append((name, new_value)) - query = moves.urllib.parse.urlencode(new_args) - return moves.urllib.parse.urlunsplit(parsed._replace(query=query)) + query = urlencode(new_args) + return urlunsplit(parsed._replace(query=query)) def path_to_file_uri(path): """Convert local filesystem path to legal File URIs as described in: http://en.wikipedia.org/wiki/File_URI_scheme """ - x = moves.urllib.request.pathname2url(os.path.abspath(path)) + x = pathname2url(os.path.abspath(path)) if os.name == 'nt': x = x.replace('|', ':') # http://bugs.python.org/issue5861 return 'file:///%s' % x.lstrip('/') @@ -219,8 +244,8 @@ def file_uri_to_path(uri): """Convert File URI to local filesystem path according to: http://en.wikipedia.org/wiki/File_URI_scheme """ - uri_path = moves.urllib.parse.urlparse(uri).path - return moves.urllib.request.url2pathname(uri_path) + uri_path = urlparse(uri).path + return url2pathname(uri_path) def any_to_uri(uri_or_path): """If given a path name, return its File URI, otherwise return it @@ -228,5 +253,5 @@ def any_to_uri(uri_or_path): """ if os.path.splitdrive(uri_or_path)[0]: return path_to_file_uri(uri_or_path) - u = moves.urllib.parse.urlparse(uri_or_path) + u = urlparse(uri_or_path) return uri_or_path if u.scheme else path_to_file_uri(uri_or_path) diff --git a/w3lib/util.py b/w3lib/util.py index 09035b80..d8513eef 100644 --- a/w3lib/util.py +++ b/w3lib/util.py @@ -13,3 +13,35 @@ def unicode_to_str(text, encoding=None, errors='strict'): if isinstance(text, six.text_type): return text.encode(encoding, errors) return text + +def to_unicode(text, encoding=None, errors='strict'): + """Return the unicode representation of a bytes object `text`. If `text` + is already an unicode object, return it as-is.""" + if isinstance(text, six.text_type): + return text + if not isinstance(text, (bytes, six.text_type)): + raise TypeError('to_unicode must receive a bytes, str or unicode ' + 'object, got %s' % type(text).__name__) + if encoding is None: + encoding = 'utf-8' + return text.decode(encoding, errors) + +def to_bytes(text, encoding=None, errors='strict'): + """Return the binary representation of `text`. If `text` + is already a bytes object, return it as-is.""" + if isinstance(text, bytes): + return text + if not isinstance(text, six.string_types): + raise TypeError('to_bytes must receive a unicode, str or bytes ' + 'object, got %s' % type(text).__name__) + if encoding is None: + encoding = 'utf-8' + return text.encode(encoding, errors) + +def to_native_str(text, encoding=None, errors='strict'): + """ Return str representation of `text` + (bytes in Python 2.x and unicode in Python 3.x). """ + if six.PY2: + return to_bytes(text, encoding, errors) + else: + return to_unicode(text, encoding, errors) From 5daebcda3f91a3ec5404c3c38d4855ad4c92d619 Mon Sep 17 00:00:00 2001 From: Paul Tremberth Date: Wed, 23 Mar 2016 10:21:26 +0100 Subject: [PATCH 3/5] Update safe_url_string() docstring --- w3lib/url.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/w3lib/url.py b/w3lib/url.py index c6af1530..3eae56b0 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -57,18 +57,20 @@ def urljoin_rfc(base, ref, encoding='utf-8'): _safe_chars = _ALWAYS_SAFE_BYTES + b'%' + _reserved + _unreserved_marks def safe_url_string(url, encoding='utf8', path_encoding='utf8'): - """Convert the given url into a legal URL by escaping unsafe characters + """Convert the given URL into a legal URL by escaping unsafe characters according to RFC-3986. - If a unicode url is given, it is first converted to str using the given - encoding (which defaults to 'utf-8'). When passing a encoding, you should - use the encoding of the original page (the page from which the url was - extracted from). + If a bytes URL is given, it is first converted to `str` using the given + encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for + URL path component (unless overriden by path_encoding), and given + encoding is used for query string or form data. + When passing a encoding, you should use the encoding of the + original page (the page from which the url was extracted from). - Calling this function on an already "safe" url will return the url + Calling this function on an already "safe" URL will return the URL unmodified. - Always returns a str. + Always returns a native `str` (bytes in Python2, unicode in Python3). """ # Python3's urlsplit() chokes on bytes input with non-ASCII chars, # so let's decode (to Unicode) using page encoding. From 3253677571b23d627dbaf7499723cd118963b6ab Mon Sep 17 00:00:00 2001 From: Paul Tremberth Date: Wed, 23 Mar 2016 16:45:37 +0100 Subject: [PATCH 4/5] Remove deprecated urljoin_rfc() Was deprecated since v1.1: https://github.com/scrapy/w3lib/blob/v1.1/w3lib/url.py --- w3lib/url.py | 36 +----------------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/w3lib/url.py b/w3lib/url.py index 3eae56b0..ac5de7e3 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -7,7 +7,7 @@ import posixpath import warnings import six -from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit, +from six.moves.urllib.parse import (urlsplit, urlunsplit, urldefrag, urlencode, urlparse, quote, parse_qs, parse_qsl) from six.moves.urllib.request import pathname2url, url2pathname @@ -18,40 +18,6 @@ _ALWAYS_SAFE_BYTES = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' b'abcdefghijklmnopqrstuvwxyz' b'0123456789' b'_.-') - - -def urljoin_rfc(base, ref, encoding='utf-8'): - r""" - .. warning:: - - This function is deprecated and will be removed in future. - Please use ``urlparse.urljoin`` instead. - - Same as urlparse.urljoin but supports unicode values in base and ref - parameters (in which case they will be converted to str using the given - encoding). - - Always returns a str. - - >>> import w3lib.url - >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html') - 'http://www.example.com/otherpath/index2.html' - >>> - - >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm') - 'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm' - >>> - - - """ - - warnings.warn("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead", - DeprecationWarning) - - str_base = unicode_to_str(base, encoding) - str_ref = unicode_to_str(ref, encoding) - return urljoin(str_base, str_ref) - _reserved = b';/?:@&=+$|,#' # RFC 3986 (Generic Syntax) _unreserved_marks = b"-_.!~*'()" # RFC 3986 sec 2.3 _safe_chars = _ALWAYS_SAFE_BYTES + b'%' + _reserved + _unreserved_marks From b8b90555a92ba7d8a54495a5d2b3447539c57b0e Mon Sep 17 00:00:00 2001 From: Paul Tremberth Date: Fri, 25 Mar 2016 18:59:19 +0100 Subject: [PATCH 5/5] Support Internationalized Domain Names with safe_url_string() --- tests/test_url.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ w3lib/url.py | 2 +- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/tests/test_url.py b/tests/test_url.py index f0807534..9d73d1ee 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -106,6 +106,52 @@ def test_safe_url_string_bytes_input(self): self.assertTrue(isinstance(safeurl, str)) self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5") + def test_safe_url_idna(self): + # adapted from: + # https://ssl.icu-project.org/icu-bin/idnbrowser + # http://unicode.org/faq/idn.html + # + various others + websites = ( + (u'http://www.färgbolaget.nu/färgbolaget', 'http://www.xn--frgbolaget-q5a.nu/f%C3%A4rgbolaget'), + (u'http://www.räksmörgås.se/?räksmörgås=yes', 'http://www.xn--rksmrgs-5wao1o.se/?r%C3%A4ksm%C3%B6rg%C3%A5s=yes'), + (u'http://www.brændendekærlighed.com/brændende/kærlighed', 'http://www.xn--brndendekrlighed-vobh.com/br%C3%A6ndende/k%C3%A6rlighed'), + (u'http://www.예비교사.com', 'http://www.xn--9d0bm53a3xbzui.com'), + (u'http://理容ナカムラ.com', 'http://xn--lck1c3crb1723bpq4a.com'), + (u'http://あーるいん.com', 'http://xn--l8je6s7a45b.com'), + + # --- real websites --- + + # in practice, this redirect (301) to http://www.buecher.de/?q=b%C3%BCcher + (u'http://www.bücher.de/?q=bücher', 'http://www.xn--bcher-kva.de/?q=b%C3%BCcher'), + + # Japanese + (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?query=%E3%82%B5&maxResults=5'), + + # Russian + (u'http://кто.рф/', 'http://xn--j1ail.xn--p1ai/'), + (u'http://кто.рф/index.php?domain=Что', 'http://xn--j1ail.xn--p1ai/index.php?domain=%D0%A7%D1%82%D0%BE'), + + # Korean + (u'http://내도메인.한국/', 'http://xn--220b31d95hq8o.xn--3e0b707e/'), + (u'http://맨체스터시티축구단.한국/', 'http://xn--2e0b17htvgtvj9haj53ccob62ni8d.xn--3e0b707e/'), + + # Arabic + (u'http://nic.شبكة', 'http://nic.xn--ngbc5azd'), + + # Chinese + (u'https://www.贷款.在线', 'https://www.xn--0kwr83e.xn--3ds443g'), + (u'https://www2.xn--0kwr83e.在线', 'https://www2.xn--0kwr83e.xn--3ds443g'), + (u'https://www3.贷款.xn--3ds443g', 'https://www3.xn--0kwr83e.xn--3ds443g'), + ) + for idn_input, safe_result in websites: + safeurl = safe_url_string(idn_input) + self.assertEqual(safeurl, safe_result) + + # make sure the safe URL is unchanged when made safe a 2nd time + for _, safe_result in websites: + safeurl = safe_url_string(safe_result) + self.assertEqual(safeurl, safe_result) + def test_safe_download_url(self): self.assertEqual(safe_download_url('http://www.example.org/../'), 'http://www.example.org/') diff --git a/w3lib/url.py b/w3lib/url.py index ac5de7e3..3bac3d35 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -53,7 +53,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'): # quote() in Python3 always returns Unicode (native str) return urlunsplit(( to_native_str(parts.scheme), - to_native_str(parts.netloc), + to_native_str(parts.netloc.encode('idna')), # default encoding for path component SHOULD be UTF-8 quote(to_bytes(parts.path, path_encoding), _safe_chars),