From 3267c27e66e3e439fef5a9be079f539f8819cab4 Mon Sep 17 00:00:00 2001
From: Mahmoud Hashemi <mahmoud@hatnote.com>
Date: Sat, 24 Feb 2018 15:34:29 -0800
Subject: [PATCH 1/3] add 'encode_stray_percents', a new argument to
 _percent_decode(), its variants, and URL.normalize(). This option (disabled
 by default, but enabled for normalize) exists to turn unmatched % characters
 into their encoded equivalent, correcting an otherwise improperly encoded URL.
 Major browsers don't do this by default, so it's currently limited to the
 explicit normalize step. Fixes #61.

---
 hyperlink/_url.py          | 82 +++++++++++++++++++++-----------------
 hyperlink/test/test_url.py |  8 +++-
 2 files changed, 53 insertions(+), 37 deletions(-)

diff --git a/hyperlink/_url.py b/hyperlink/_url.py
index 81c992a0..1a3300e3 100644
--- a/hyperlink/_url.py
+++ b/hyperlink/_url.py
@@ -474,17 +474,19 @@ def iter_pairs(iterable):
     return iter(iterable)
 
 
-def _decode_unreserved(text, normalize_case=False):
+def _decode_unreserved(text, normalize_case=False, encode_stray_percents=False):
     return _percent_decode(text, normalize_case=normalize_case,
+                           encode_stray_percents=encode_stray_percents,
                            _decode_map=_UNRESERVED_DECODE_MAP)
 
 
-def _decode_userinfo_part(text, normalize_case=False):
+def _decode_userinfo_part(text, normalize_case=False, encode_stray_percents=False):
     return _percent_decode(text, normalize_case=normalize_case,
+                           encode_stray_percents=encode_stray_percents,
                            _decode_map=_USERINFO_DECODE_MAP)
 
 
-def _decode_path_part(text, normalize_case=False):
+def _decode_path_part(text, normalize_case=False, encode_stray_percents=False):
     """
     >>> _decode_path_part(u'%61%77%2f%7a')
     u'aw%2fz'
@@ -492,21 +494,25 @@ def _decode_path_part(text, normalize_case=False):
     u'aw%2Fz'
     """
     return _percent_decode(text, normalize_case=normalize_case,
+                           encode_stray_percents=encode_stray_percents,
                            _decode_map=_PATH_DECODE_MAP)
 
 
-def _decode_query_part(text, normalize_case=False):
+def _decode_query_part(text, normalize_case=False, encode_stray_percents=False):
     return _percent_decode(text, normalize_case=normalize_case,
+                           encode_stray_percents=encode_stray_percents,
                            _decode_map=_QUERY_DECODE_MAP)
 
 
-def _decode_fragment_part(text, normalize_case=False):
+def _decode_fragment_part(text, normalize_case=False, encode_stray_percents=False):
     return _percent_decode(text, normalize_case=normalize_case,
+                           encode_stray_percents=encode_stray_percents,
                            _decode_map=_FRAGMENT_DECODE_MAP)
 
 
 def _percent_decode(text, normalize_case=False, subencoding='utf-8',
-                    raise_subencoding_exc=False, _decode_map=_HEX_CHAR_MAP):
+                    raise_subencoding_exc=False, encode_stray_percents=False,
+                    _decode_map=_HEX_CHAR_MAP):
     """Convert percent-encoded text characters to their normal,
     human-readable equivalents.
 
@@ -550,26 +556,26 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8',
     res = [bits[0]]
     append = res.append
 
-    if not normalize_case:
-        for item in bits[1:]:
-            try:
-                append(_decode_map[item[:2]])
-                append(item[2:])
-            except KeyError:
+    for item in bits[1:]:
+        hexpair, rest = item[:2], item[2:]
+        try:
+            append(_decode_map[hexpair])
+            append(rest)
+        except KeyError:
+            pair_is_hex = hexpair in _HEX_CHAR_MAP
+            if pair_is_hex or not encode_stray_percents:
                 append(b'%')
+            else:
+                # if it's undecodable, treat as a real percent sign,
+                # which is reserved (because it wasn't in the
+                # context-aware _decode_map passed in), and should
+                # stay in an encoded state.
+                append(b'%25')
+            if normalize_case and pair_is_hex:
+                append(hexpair.upper())
+                append(rest)
+            else:
                 append(item)
-    else:
-        for item in bits[1:]:
-            try:
-                append(_decode_map[item[:2]])
-                append(item[2:])
-            except KeyError:
-                append(b'%')
-                if item[:2] in _HEX_CHAR_MAP:
-                    append(item[:2].upper())
-                    append(item[2:])
-                else:
-                    append(item)
 
     unquoted_bytes = b''.join(res)
 
@@ -1140,7 +1146,7 @@ def from_text(cls, text):
                    rooted, userinfo, uses_netloc)
 
     def normalize(self, scheme=True, host=True, path=True, query=True,
-                  fragment=True, userinfo=True):
+                  fragment=True, userinfo=True, encode_stray_percents=True):
         """Return a new URL object with several standard normalizations
         applied:
 
@@ -1149,6 +1155,8 @@ def normalize(self, scheme=True, host=True, path=True, query=True,
         * Convert scheme and host casing to lowercase (`RFC 3986 3.2.2`_)
         * Resolve any "." and ".." references in the path (`RFC 3986 6.2.2.3`_)
         * Ensure an ending slash on URLs with an empty path (`RFC 3986 6.2.3`_)
+        * Encode any stray percent signs (`%`) in percent-encoded
+          fields (path, query, fragment, userinfo)
 
         All are applied by default, but normalizations can be disabled
         per-part by passing `False` for that part's corresponding
@@ -1160,10 +1168,13 @@ def normalize(self, scheme=True, host=True, path=True, query=True,
            path (bool): Normalize the path (see above for details)
            query (bool): Normalize the query string
            fragment (bool): Normalize the fragment
+           encode_stray_percents (bool): Encode isolated percent signs
+              for any percent-encoded fields which are being
+              normalized (defaults to True).
 
-        >>> url = URL.from_text(u'Http://example.COM/a/../b/./c%2f?%61')
+        >>> url = URL.from_text(u'Http://example.COM/a/../b/./c%2f?%61%')
         >>> print(url.normalize().to_text())
-        http://example.com/b/c%2F?a
+        http://example.com/b/c%2F?a%25
 
         .. _RFC 3986 3.2.2: https://tools.ietf.org/html/rfc3986#section-3.2.2
         .. _RFC 3986 2.3: https://tools.ietf.org/html/rfc3986#section-2.3
@@ -1178,22 +1189,21 @@ def normalize(self, scheme=True, host=True, path=True, query=True,
             kw['scheme'] = self.scheme.lower()
         if host:
             kw['host'] = self.host.lower()
+        def _dec_unres(target):
+            return _decode_unreserved(target, normalize_case=True,
+                                      encode_stray_percents=encode_stray_percents)
         if path:
             if self.path:
-                kw['path'] = [_decode_unreserved(p, normalize_case=True)
-                              for p in _resolve_dot_segments(self.path)]
+                kw['path'] = [_dec_unres(p) for p in _resolve_dot_segments(self.path)]
             else:
                 kw['path'] = (u'',)
         if query:
-            kw['query'] = [(_decode_unreserved(k, normalize_case=True),
-                            _decode_unreserved(v, normalize_case=True)
-                            if v else v) for k, v in self.query]
+            kw['query'] = [(_dec_unres(k), _dec_unres(v) if v else v)
+                           for k, v in self.query]
         if fragment:
-            kw['fragment'] = _decode_unreserved(self.fragment,
-                                                normalize_case=True)
+            kw['fragment'] = _dec_unres(self.fragment)
         if userinfo:
-            kw['userinfo'] = u':'.join([_decode_unreserved(p,
-                                                           normalize_case=True)
+            kw['userinfo'] = u':'.join([_dec_unres(p)
                                         for p in self.userinfo.split(':', 1)])
 
         return self.replace(**kw)
diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py
index 1e777648..3987eec5 100644
--- a/hyperlink/test/test_url.py
+++ b/hyperlink/test/test_url.py
@@ -1152,7 +1152,13 @@ def test_normalize(self):
         assert norm_delimited_url.to_text() == '/a%2Fb/cd%3F?k%3D=v%23#test'
 
         # test invalid percent encoding during normalize
-        assert URL(path=('', '%te%sts')).normalize().to_text() == '/%te%sts'
+        assert URL(path=('', '%te%sts')).normalize(encode_stray_percents=False).to_text() == '/%te%sts'
+        assert URL(path=('', '%te%sts')).normalize().to_text() == '/%25te%25sts'
+
+        percenty_url = URL(scheme='ftp', path=['%%%', '%a%b'], query=[('%', '%%')], fragment='%', userinfo='%:%')
+
+        assert percenty_url.to_text(with_password=True) == 'ftp://%:%@/%%%/%a%b?%=%%#%'
+        assert percenty_url.normalize().to_text(with_password=True) == 'ftp://%25:%25@/%25%25%25/%25a%25b?%25=%25%25#%25'
 
     def test_str(self):
         # see also issue #49

From 8de4016a3414c55158c5d317a6264419c195e02f Mon Sep 17 00:00:00 2001
From: Mahmoud Hashemi <mahmoud@hatnote.com>
Date: Sun, 25 Feb 2018 18:49:50 -0800
Subject: [PATCH 2/3] URL.normalize() docs: add cite, argument doc, remove todo

---
 hyperlink/_url.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/hyperlink/_url.py b/hyperlink/_url.py
index 1a3300e3..3190267d 100644
--- a/hyperlink/_url.py
+++ b/hyperlink/_url.py
@@ -1156,7 +1156,7 @@ def normalize(self, scheme=True, host=True, path=True, query=True,
         * Resolve any "." and ".." references in the path (`RFC 3986 6.2.2.3`_)
         * Ensure an ending slash on URLs with an empty path (`RFC 3986 6.2.3`_)
         * Encode any stray percent signs (`%`) in percent-encoded
-          fields (path, query, fragment, userinfo)
+          fields (path, query, fragment, userinfo) (`RFC 3986 2.4`_)
 
         All are applied by default, but normalizations can be disabled
         per-part by passing `False` for that part's corresponding
@@ -1168,6 +1168,7 @@ def normalize(self, scheme=True, host=True, path=True, query=True,
            path (bool): Normalize the path (see above for details)
            query (bool): Normalize the query string
            fragment (bool): Normalize the fragment
+           userinfo (bool): Normalize the userinfo
            encode_stray_percents (bool): Encode isolated percent signs
               for any percent-encoded fields which are being
               normalized (defaults to True).
@@ -1181,9 +1182,9 @@ def normalize(self, scheme=True, host=True, path=True, query=True,
         .. _RFC 3986 2.1: https://tools.ietf.org/html/rfc3986#section-2.1
         .. _RFC 3986 6.2.2.3: https://tools.ietf.org/html/rfc3986#section-6.2.2.3
         .. _RFC 3986 6.2.3: https://tools.ietf.org/html/rfc3986#section-6.2.3
+        .. _RFC 3986 2.4: https://tools.ietf.org/html/rfc3986#section-2.4
 
         """
-        # TODO: userinfo?
         kw = {}
         if scheme:
             kw['scheme'] = self.scheme.lower()

From 4cbf71f57f8ba34b1d1364383f70113c8f3e66ba Mon Sep 17 00:00:00 2001
From: Mahmoud Hashemi <mahmoud@hatnote.com>
Date: Sun, 25 Feb 2018 21:01:40 -0800
Subject: [PATCH 3/3] shorten up encode_stray_percents to just percents, per
 @glyph review

---
 hyperlink/_url.py          | 6 +++---
 hyperlink/test/test_url.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hyperlink/_url.py b/hyperlink/_url.py
index 3190267d..b3ad03d2 100644
--- a/hyperlink/_url.py
+++ b/hyperlink/_url.py
@@ -1146,7 +1146,7 @@ def from_text(cls, text):
                    rooted, userinfo, uses_netloc)
 
     def normalize(self, scheme=True, host=True, path=True, query=True,
-                  fragment=True, userinfo=True, encode_stray_percents=True):
+                  fragment=True, userinfo=True, percents=True):
         """Return a new URL object with several standard normalizations
         applied:
 
@@ -1169,7 +1169,7 @@ def normalize(self, scheme=True, host=True, path=True, query=True,
            query (bool): Normalize the query string
            fragment (bool): Normalize the fragment
            userinfo (bool): Normalize the userinfo
-           encode_stray_percents (bool): Encode isolated percent signs
+           percents (bool): Encode isolated percent signs
               for any percent-encoded fields which are being
               normalized (defaults to True).
 
@@ -1192,7 +1192,7 @@ def normalize(self, scheme=True, host=True, path=True, query=True,
             kw['host'] = self.host.lower()
         def _dec_unres(target):
             return _decode_unreserved(target, normalize_case=True,
-                                      encode_stray_percents=encode_stray_percents)
+                                      encode_stray_percents=percents)
         if path:
             if self.path:
                 kw['path'] = [_dec_unres(p) for p in _resolve_dot_segments(self.path)]
diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py
index 3987eec5..b522c35a 100644
--- a/hyperlink/test/test_url.py
+++ b/hyperlink/test/test_url.py
@@ -1152,7 +1152,7 @@ def test_normalize(self):
         assert norm_delimited_url.to_text() == '/a%2Fb/cd%3F?k%3D=v%23#test'
 
         # test invalid percent encoding during normalize
-        assert URL(path=('', '%te%sts')).normalize(encode_stray_percents=False).to_text() == '/%te%sts'
+        assert URL(path=('', '%te%sts')).normalize(percents=False).to_text() == '/%te%sts'
         assert URL(path=('', '%te%sts')).normalize().to_text() == '/%25te%25sts'
 
         percenty_url = URL(scheme='ftp', path=['%%%', '%a%b'], query=[('%', '%%')], fragment='%', userinfo='%:%')