From 3267c27e66e3e439fef5a9be079f539f8819cab4 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sat, 24 Feb 2018 15:34:29 -0800 Subject: [PATCH 1/3] add 'encode_stray_percents', a new argument to _percent_decode(), its variants, and URL.normalize(). This option (disabled by default, and enabled for normalize), exists to turn unmatched % characters to their encoded equivalent, correcting an otherwise improperly encoded URL. Major browsers don't do this by default, so it's currently limited to the explicit normalize step. Fixes #61. --- hyperlink/_url.py | 82 +++++++++++++++++++++----------------- hyperlink/test/test_url.py | 8 +++- 2 files changed, 53 insertions(+), 37 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 81c992a0..1a3300e3 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -474,17 +474,19 @@ def iter_pairs(iterable): return iter(iterable) -def _decode_unreserved(text, normalize_case=False): +def _decode_unreserved(text, normalize_case=False, encode_stray_percents=False): return _percent_decode(text, normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, _decode_map=_UNRESERVED_DECODE_MAP) -def _decode_userinfo_part(text, normalize_case=False): +def _decode_userinfo_part(text, normalize_case=False, encode_stray_percents=False): return _percent_decode(text, normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, _decode_map=_USERINFO_DECODE_MAP) -def _decode_path_part(text, normalize_case=False): +def _decode_path_part(text, normalize_case=False, encode_stray_percents=False): """ >>> _decode_path_part(u'%61%77%2f%7a') u'aw%2fz' @@ -492,21 +494,25 @@ def _decode_path_part(text, normalize_case=False): u'aw%2Fz' """ return _percent_decode(text, normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, _decode_map=_PATH_DECODE_MAP) -def _decode_query_part(text, normalize_case=False): +def _decode_query_part(text, normalize_case=False, encode_stray_percents=False): return _percent_decode(text, normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, _decode_map=_QUERY_DECODE_MAP) -def _decode_fragment_part(text, normalize_case=False): +def _decode_fragment_part(text, normalize_case=False, encode_stray_percents=False): return _percent_decode(text, normalize_case=normalize_case, + encode_stray_percents=encode_stray_percents, _decode_map=_FRAGMENT_DECODE_MAP) def _percent_decode(text, normalize_case=False, subencoding='utf-8', - raise_subencoding_exc=False, _decode_map=_HEX_CHAR_MAP): + raise_subencoding_exc=False, encode_stray_percents=False, + _decode_map=_HEX_CHAR_MAP): """Convert percent-encoded text characters to their normal, human-readable equivalents. @@ -550,26 +556,26 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8', res = [bits[0]] append = res.append - if not normalize_case: - for item in bits[1:]: - try: - append(_decode_map[item[:2]]) - append(item[2:]) - except KeyError: + for item in bits[1:]: + hexpair, rest = item[:2], item[2:] + try: + append(_decode_map[hexpair]) + append(rest) + except KeyError: + pair_is_hex = hexpair in _HEX_CHAR_MAP + if pair_is_hex or not encode_stray_percents: append(b'%') + else: + # if it's undecodable, treat as a real percent sign, + # which is reserved (because it wasn't in the + # context-aware _decode_map passed in), and should + # stay in an encoded state. + append(b'%25') + if normalize_case and pair_is_hex: + append(hexpair.upper()) + append(rest) + else: append(item) - else: - for item in bits[1:]: - try: - append(_decode_map[item[:2]]) - append(item[2:]) - except KeyError: - append(b'%') - if item[:2] in _HEX_CHAR_MAP: - append(item[:2].upper()) - append(item[2:]) - else: - append(item) unquoted_bytes = b''.join(res) @@ -1140,7 +1146,7 @@ def from_text(cls, text): rooted, userinfo, uses_netloc) def normalize(self, scheme=True, host=True, path=True, query=True, - fragment=True, userinfo=True): + fragment=True, userinfo=True, encode_stray_percents=True): """Return a new URL object with several standard normalizations applied: @@ -1149,6 +1155,8 @@ def normalize(self, scheme=True, host=True, path=True, query=True, * Convert scheme and host casing to lowercase (`RFC 3986 3.2.2`_) * Resolve any "." and ".." references in the path (`RFC 3986 6.2.2.3`_) * Ensure an ending slash on URLs with an empty path (`RFC 3986 6.2.3`_) + * Encode any stray percent signs (`%`) in percent-encoded + fields (path, query, fragment, userinfo) All are applied by default, but normalizations can be disabled per-part by passing `False` for that part's corresponding @@ -1160,10 +1168,13 @@ def normalize(self, scheme=True, host=True, path=True, query=True, path (bool): Normalize the path (see above for details) query (bool): Normalize the query string fragment (bool): Normalize the fragment + encode_stray_percents (bool): Encode isolated percent signs + for any percent-encoded fields which are being + normalized (defaults to True). - >>> url = URL.from_text(u'Http://example.COM/a/../b/./c%2f?%61') + >>> url = URL.from_text(u'Http://example.COM/a/../b/./c%2f?%61%') >>> print(url.normalize().to_text()) - http://example.com/b/c%2F?a + http://example.com/b/c%2F?a%25 .. _RFC 3986 3.2.2: https://tools.ietf.org/html/rfc3986#section-3.2.2 .. _RFC 3986 2.3: https://tools.ietf.org/html/rfc3986#section-2.3 @@ -1178,22 +1189,21 @@ def normalize(self, scheme=True, host=True, path=True, query=True, kw['scheme'] = self.scheme.lower() if host: kw['host'] = self.host.lower() + def _dec_unres(target): + return _decode_unreserved(target, normalize_case=True, + encode_stray_percents=encode_stray_percents) if path: if self.path: - kw['path'] = [_decode_unreserved(p, normalize_case=True) - for p in _resolve_dot_segments(self.path)] + kw['path'] = [_dec_unres(p) for p in _resolve_dot_segments(self.path)] else: kw['path'] = (u'',) if query: - kw['query'] = [(_decode_unreserved(k, normalize_case=True), - _decode_unreserved(v, normalize_case=True) - if v else v) for k, v in self.query] + kw['query'] = [(_dec_unres(k), _dec_unres(v) if v else v) + for k, v in self.query] if fragment: - kw['fragment'] = _decode_unreserved(self.fragment, - normalize_case=True) + kw['fragment'] = _dec_unres(self.fragment) if userinfo: - kw['userinfo'] = u':'.join([_decode_unreserved(p, - normalize_case=True) + kw['userinfo'] = u':'.join([_dec_unres(p) for p in self.userinfo.split(':', 1)]) return self.replace(**kw) diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py index 1e777648..3987eec5 100644 --- a/hyperlink/test/test_url.py +++ b/hyperlink/test/test_url.py @@ -1152,7 +1152,13 @@ def test_normalize(self): assert norm_delimited_url.to_text() == '/a%2Fb/cd%3F?k%3D=v%23#test' # test invalid percent encoding during normalize - assert URL(path=('', '%te%sts')).normalize().to_text() == '/%te%sts' + assert URL(path=('', '%te%sts')).normalize(encode_stray_percents=False).to_text() == '/%te%sts' + assert URL(path=('', '%te%sts')).normalize().to_text() == '/%25te%25sts' + + percenty_url = URL(scheme='ftp', path=['%%%', '%a%b'], query=[('%', '%%')], fragment='%', userinfo='%:%') + + assert percenty_url.to_text(with_password=True) == 'ftp://%:%@/%%%/%a%b?%=%%#%' + assert percenty_url.normalize().to_text(with_password=True) == 'ftp://%25:%25@/%25%25%25/%25a%25b?%25=%25%25#%25' def test_str(self): # see also issue #49 From 8de4016a3414c55158c5d317a6264419c195e02f Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sun, 25 Feb 2018 18:49:50 -0800 Subject: [PATCH 2/3] URL.normalize() docs: add cite, argument doc, remove todo --- hyperlink/_url.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 1a3300e3..3190267d 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -1156,7 +1156,7 @@ def normalize(self, scheme=True, host=True, path=True, query=True, * Resolve any "." and ".." references in the path (`RFC 3986 6.2.2.3`_) * Ensure an ending slash on URLs with an empty path (`RFC 3986 6.2.3`_) * Encode any stray percent signs (`%`) in percent-encoded - fields (path, query, fragment, userinfo) + fields (path, query, fragment, userinfo) (`RFC 3986 2.4`_) All are applied by default, but normalizations can be disabled per-part by passing `False` for that part's corresponding @@ -1168,6 +1168,7 @@ def normalize(self, scheme=True, host=True, path=True, query=True, path (bool): Normalize the path (see above for details) query (bool): Normalize the query string fragment (bool): Normalize the fragment + userinfo (bool): Normalize the userinfo encode_stray_percents (bool): Encode isolated percent signs for any percent-encoded fields which are being normalized (defaults to True). @@ -1181,9 +1182,9 @@ def normalize(self, scheme=True, host=True, path=True, query=True, .. _RFC 3986 2.1: https://tools.ietf.org/html/rfc3986#section-2.1 .. _RFC 3986 6.2.2.3: https://tools.ietf.org/html/rfc3986#section-6.2.2.3 .. _RFC 3986 6.2.3: https://tools.ietf.org/html/rfc3986#section-6.2.3 + .. _RFC 3986 2.4: https://tools.ietf.org/html/rfc3986#section-2.4 """ - # TODO: userinfo? kw = {} if scheme: kw['scheme'] = self.scheme.lower() From 4cbf71f57f8ba34b1d1364383f70113c8f3e66ba Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sun, 25 Feb 2018 21:01:40 -0800 Subject: [PATCH 3/3] shorten up encode_stray_percents to just percents, per @glyph review --- hyperlink/_url.py | 6 +++--- hyperlink/test/test_url.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 3190267d..b3ad03d2 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -1146,7 +1146,7 @@ def from_text(cls, text): rooted, userinfo, uses_netloc) def normalize(self, scheme=True, host=True, path=True, query=True, - fragment=True, userinfo=True, encode_stray_percents=True): + fragment=True, userinfo=True, percents=True): """Return a new URL object with several standard normalizations applied: @@ -1169,7 +1169,7 @@ def normalize(self, scheme=True, host=True, path=True, query=True, query (bool): Normalize the query string fragment (bool): Normalize the fragment userinfo (bool): Normalize the userinfo - encode_stray_percents (bool): Encode isolated percent signs + percents (bool): Encode isolated percent signs for any percent-encoded fields which are being normalized (defaults to True). @@ -1192,7 +1192,7 @@ def normalize(self, scheme=True, host=True, path=True, query=True, kw['host'] = self.host.lower() def _dec_unres(target): return _decode_unreserved(target, normalize_case=True, - encode_stray_percents=encode_stray_percents) + encode_stray_percents=percents) if path: if self.path: kw['path'] = [_dec_unres(p) for p in _resolve_dot_segments(self.path)] diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py index 3987eec5..b522c35a 100644 --- a/hyperlink/test/test_url.py +++ b/hyperlink/test/test_url.py @@ -1152,7 +1152,7 @@ def test_normalize(self): assert norm_delimited_url.to_text() == '/a%2Fb/cd%3F?k%3D=v%23#test' # test invalid percent encoding during normalize - assert URL(path=('', '%te%sts')).normalize(encode_stray_percents=False).to_text() == '/%te%sts' + assert URL(path=('', '%te%sts')).normalize(percents=False).to_text() == '/%te%sts' assert URL(path=('', '%te%sts')).normalize().to_text() == '/%25te%25sts' percenty_url = URL(scheme='ftp', path=['%%%', '%a%b'], query=[('%', '%%')], fragment='%', userinfo='%:%')