From 5d4b542c4d8f0b4bc8f77aa809487e6724a487cd Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Wed, 10 Jan 2018 18:48:33 -0800 Subject: [PATCH 1/3] enable _percent_decode to decode percent encodings within unicode text --- hyperlink/_url.py | 12 ++++++++---- hyperlink/test/test_decoded_url.py | 10 ++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 04ac4b71..746a6884 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -523,18 +523,22 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8', u'abc def' Args: - text (unicode): The ASCII text with percent-encoding present. + text (unicode): Text with percent-encoding present. normalize_case (bool): Whether undecoded percent segments, such as encoded delimiters, should be uppercased, per RFC 3986 Section 2.1. See :func:`_decode_path_part` for an example. + subencoding (unicode): The name of the encoding underlying the + percent-encoding. Pass `False` to get back bytes. + raise_subencoding_exc (bool): Whether an error in decoding the bytes + underlying the percent-decoding should be raised. Returns: - unicode: The percent-decoded version of *text*, with UTF-8 - decoding applied. + unicode: The percent-decoded version of *text*, with decoding + applied, unless `subencoding=False` which returns bytes. """ try: - quoted_bytes = text.encode("ascii") + quoted_bytes = text.encode(subencoding or 'utf-8') except UnicodeEncodeError: return text diff --git a/hyperlink/test/test_decoded_url.py b/hyperlink/test/test_decoded_url.py index 53fef34e..faac1de3 100644 --- a/hyperlink/test/test_decoded_url.py +++ b/hyperlink/test/test_decoded_url.py @@ -145,3 +145,13 @@ def test_twisted_compat(self): def test_percent_decode_bytes(self): assert _percent_decode('%00', subencoding=False) == b'\0' + + def test_percent_decode_mixed(self): + assert _percent_decode('abcdé%C3%A9éfg') == 'abcdéééfg' + + # still allow percent encoding in the case of an error + assert _percent_decode('abcdé%C3éfg') == 'abcdé%C3éfg' + + # ...unless explicitly told otherwise + with self.assertRaises(UnicodeDecodeError): + _percent_decode('abcdé%C3éfg', raise_subencoding_exc=True) From a0cf6d5fef39e960f241f6dc41af98c2a3c9283e Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Wed, 10 Jan 2018 18:51:02 -0800 Subject: [PATCH 2/3] remove excessive _encode_* from DecodedURL now that _percent_decode() supports mixed decoding --- hyperlink/_url.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 746a6884..8f1708bd 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -1677,8 +1677,7 @@ def path(self): return self._path except AttributeError: pass - self._path = tuple([_percent_decode(_encode_path_part(p), - raise_subencoding_exc=True) + self._path = tuple([_percent_decode(p, raise_subencoding_exc=True) for p in self._url.path]) return self._path @@ -1688,8 +1687,7 @@ def query(self): return self._query except AttributeError: pass - _q = [tuple(_percent_decode(_encode_query_part(x), - raise_subencoding_exc=True) + _q = [tuple(_percent_decode(x, raise_subencoding_exc=True) if x is not None else None for x in (k, v)) for k, v in self._url.query] @@ -1703,8 +1701,7 @@ def fragment(self): except AttributeError: pass frag = self._url.fragment - self._fragment = _percent_decode(_encode_fragment_part(frag), - raise_subencoding_exc=True) + self._fragment = _percent_decode(frag, raise_subencoding_exc=True) return self._fragment @property @@ -1713,8 +1710,7 @@ def userinfo(self): return self._userinfo except AttributeError: pass - self._userinfo = tuple([_percent_decode(_encode_userinfo_part(p), - raise_subencoding_exc=True) + self._userinfo = tuple([_percent_decode(p, raise_subencoding_exc=True) for p in self._url.userinfo.split(':', 1)]) return self._userinfo From 4dd846dea1c75df3ddc2c15d3988a43576627b7f Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sat, 24 Feb 2018 12:40:37 -0800 Subject: [PATCH 3/3] add a couple more tests around mixed percent decoding and fix docstring for _percent_decode, per @markrwilliams review --- hyperlink/_url.py | 17 +++++++++-------- hyperlink/test/test_decoded_url.py | 8 ++++++++ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 8f1708bd..81c992a0 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -510,10 +510,11 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8', """Convert percent-encoded text characters to their normal, human-readable equivalents. - All characters in the input text must be valid ASCII. All special - characters underlying the values in the percent-encoding must be - valid UTF-8. If a non-UTF8-valid string is passed, the original - text is returned with no changes applied. + All characters in the input text must be encodable by + *subencoding*. All special characters underlying the values in the + percent-encoding must be decodable as *subencoding*. If a + non-*subencoding*-valid string is passed, the original text is + returned with no changes applied. Only called by field-tailored variants, e.g., :func:`_decode_path_part`, as every percent-encodable part of the @@ -528,17 +529,17 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8', as encoded delimiters, should be uppercased, per RFC 3986 Section 2.1. See :func:`_decode_path_part` for an example. subencoding (unicode): The name of the encoding underlying the - percent-encoding. Pass `False` to get back bytes. + percent-encoding. Pass `False` to get back raw bytes. raise_subencoding_exc (bool): Whether an error in decoding the bytes underlying the percent-decoding should be raised. Returns: - unicode: The percent-decoded version of *text*, with decoding - applied, unless `subencoding=False` which returns bytes. + unicode: The percent-decoded version of *text*, decoded by + *subencoding*, unless `subencoding=False` which returns bytes. """ try: - quoted_bytes = text.encode(subencoding or 'utf-8') + quoted_bytes = text.encode('utf-8' if subencoding is False else subencoding) except UnicodeEncodeError: return text diff --git a/hyperlink/test/test_decoded_url.py b/hyperlink/test/test_decoded_url.py index faac1de3..5515fce7 100644 --- a/hyperlink/test/test_decoded_url.py +++ b/hyperlink/test/test_decoded_url.py @@ -147,6 +147,8 @@ def test_percent_decode_bytes(self): assert _percent_decode('%00', subencoding=False) == b'\0' def test_percent_decode_mixed(self): + # See https://github.com/python-hyper/hyperlink/pull/59 for a + # nice discussion of the possibilities assert _percent_decode('abcdé%C3%A9éfg') == 'abcdéééfg' # still allow percent encoding in the case of an error @@ -155,3 +157,9 @@ def test_percent_decode_mixed(self): # ...unless explicitly told otherwise with self.assertRaises(UnicodeDecodeError): _percent_decode('abcdé%C3éfg', raise_subencoding_exc=True) + + # check that getting raw bytes works ok + assert _percent_decode('a%00b', subencoding=False) == b'a\x00b' + + # when not encodable as subencoding + assert _percent_decode('é%25é', subencoding='ascii') == 'é%25é'