python · nascheme · Jun 16, 2022 · Jun 16, 2022 · Jun 16, 2022 · Jun 16, 2022
diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst
@@ -339,6 +339,18 @@ or on combining URL components into a URL string.
 
 .. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser
 
+.. function:: pathsplit(path)
+
+   Parse a path that includes an optional query and fragment. Like
+   :func:`urlsplit`, this function returns a 5-item :term:`named tuple`::
+
+      (addressing scheme, network location, path, query, fragment identifier).
+
+   The scheme and network location components will always be empty.
+
+   .. versionadded:: 3.11
+
+
 .. function:: urlunsplit(parts)
 
    Combine the elements of a tuple as returned by :func:`urlsplit` into a

diff --git a/Lib/http/server.py b/Lib/http/server.py
@@ -678,13 +678,10 @@ def send_head(self):
         path = self.translate_path(self.path)
         f = None
         if os.path.isdir(path):
-            parts = urllib.parse.urlsplit(self.path)
-            if not parts.path.endswith('/'):
+            new_url = _get_redirect_url(self.path)
+            if new_url:
                 # redirect browser - doing basically what apache does
                 self.send_response(HTTPStatus.MOVED_PERMANENTLY)
-                new_parts = (parts[0], parts[1], parts[2] + '/',
-                             parts[3], parts[4])
-                new_url = urllib.parse.urlunsplit(new_parts)
                 self.send_header("Location", new_url)
                 self.send_header("Content-Length", "0")
                 self.end_headers()
@@ -881,6 +878,23 @@ def guess_type(self, path):
         return 'application/octet-stream'
 
 
+def _get_redirect_url(path):
+    """Returns URL with trailing slash on path, if required.  If not required,
+    returns None.
+    """
+    # Previous versions of this module used urllib.parse.urlsplit() here.
+    # However, the 'path' is not truly a URI in that it can't have a scheme or
+    # netloc.  We need to avoid parsing it incorrectly.  For example, as
+    # reported in gh-87389, a path starting with a double slash should not be
+    # treated as a relative URI.  Also, a path with a colon in the first
+    # component could be misparsed as having a scheme.
+    parts = urllib.parse.pathsplit(path)
+    if parts.path.endswith('/'):
+        return None  # already has slash, no redirect needed
+    return urllib.parse.urlunsplit(('', '', parts.path + '/', parts.query,
+                                    parts.fragment))
+
+
 # Utilities for CGIHTTPRequestHandler
 
 def _url_collapse_path(path):

diff --git a/Lib/test/test_httpservers.py b/Lib/test/test_httpservers.py
@@ -334,7 +334,7 @@ class request_handler(NoLogRequestHandler, SimpleHTTPRequestHandler):
         pass
 
     def setUp(self):
-        BaseTestCase.setUp(self)
+        super().setUp()
         self.cwd = os.getcwd()
         basetempdir = tempfile.gettempdir()
         os.chdir(basetempdir)
@@ -362,7 +362,7 @@ def tearDown(self):
             except:
                 pass
         finally:
-            BaseTestCase.tearDown(self)
+            super().tearDown()
 
     def check_status_and_reason(self, response, status, data=None):
         def close_conn():
@@ -418,6 +418,26 @@ def test_undecodable_filename(self):
         self.check_status_and_reason(response, HTTPStatus.OK,
                                      data=os_helper.TESTFN_UNDECODABLE)
 
+    def test_get_dir_redirect_location_domain_injection_bug(self):
+        """Ensure //evil.co/..%2f../../X does not put //evil.co/ in Location.
+
+        //domain/ in a Location header is a redirect to a new domain name.
+        https://github.com/python/cpython/issues/87389
+
+        This checks that a path resolving to a directory on our server cannot
+        resolve into a redirect to another server telling it that the
+        directory in question exists on the Referrer server.
+        """
+        os.mkdir(os.path.join(self.tempdir, 'existing_directory'))
+        attack_url = f'//python.org/..%2f..%2f..%2f..%2f..%2f../%0a%0d/../{self.tempdir_name}/existing_directory'
+        response = self.request(attack_url)
+        self.check_status_and_reason(response, HTTPStatus.MOVED_PERMANENTLY)
+        location = response.getheader('Location')
+        self.assertFalse(location.startswith('//'), msg=location)
+        self.assertEqual(location, attack_url[1:] + '/',
+                msg='Expected Location: to start with a single / and '
+                'end with a / as this is a directory redirect.')
+
     def test_get(self):
         #constructs the path relative to the root directory of the HTTPServer
         response = self.request(self.base_url + '/test')

diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
@@ -1101,6 +1101,35 @@ def test_urlsplit_normalization(self):
                         with self.assertRaises(ValueError):
                             urllib.parse.urlsplit(url)
 
+    def test_urlunsplit_relative(self):
+        cases = [
+            # expected result is a relative URL without netloc and scheme
+            (('', 'a', '', '', ''), '//a'),
+            # extra leading slashes need to be stripped to avoid confusion
+            # with a relative URL
+            (('', '', '//a', '', ''), '/a'),
+            (('', '', '///a', '', ''), '/a'),
+            # not relative so extra leading slashes don't need stripping since
+            # they don't cause confusion
+            (('http', 'x.y', '//a', '', ''), 'http://x.y//a'),
+            # avoid confusion with path containing colon
+            (('', '', 'a:b', '', ''), './a:b'),
+        ]
+        for parts, result in cases:
+            self.assertEqual(urllib.parse.urlunsplit(parts), result)
+
+    def test_pathsplit(self):
+        cases = [
+            ('//a', ('', '', '//a', '', '')),
+            ('a:b', ('', '', 'a:b', '', '')),
+            ('/a/b?x#y', ('', '', '/a/b', 'x', 'y')),
+            ('/a/b#y', ('', '', '/a/b', '', 'y')),
+            ('/a/b?x', ('', '', '/a/b', 'x', '')),
+        ]
+        for uri, result in cases:
+            self.assertEqual(urllib.parse.pathsplit(uri), result)
+
+
 class Utility_Tests(unittest.TestCase):
     """Testcase to test the various utility functions in the urllib."""
     # In Python 2 this test class was in test_urllib.

diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
@@ -36,7 +36,7 @@
 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
            "urlsplit", "urlunsplit", "urlencode", "parse_qs",
            "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
-           "unquote", "unquote_plus", "unquote_to_bytes",
+           "unquote", "unquote_plus", "unquote_to_bytes", "pathsplit",
            "DefragResult", "ParseResult", "SplitResult",
            "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
 
@@ -480,6 +480,29 @@ def urlsplit(url, scheme='', allow_fragments=True):
     v = SplitResult(scheme, netloc, url, query, fragment)
     return _coerce_result(v)
 
+# typed=True avoids BytesWarnings being emitted during cache key
+# comparison since this API supports both bytes and str input.
+@functools.lru_cache(typed=True)
+def pathsplit(path):
+    """Parse a path that includes an optional query and fragment.
+    The full syntax is:
+
+    <path>?<query>#<fragment>
+
+    The result is a named 5-tuple (scheme, netloc, path, query, fragment) in
+    which scheme and netloc are always empty.  It is either a SplitResult or
+    SplitResultBytes object, depending on the type of the path parameter.
+
+    Note that % escapes are not expanded.
+    """
+    path, _coerce_result = _coerce_args(path)
+    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
+        path = path.replace(b, "")
+    path, _, fragment = path.partition('#')
+    path, _, query = path.partition('?')
+    v = SplitResult('', '', path, query, fragment)
+    return _coerce_result(v)
+
 def urlunparse(components):
     """Put a parsed URL back together again.  This may result in a
     slightly different, but equivalent URL, if the URL that was parsed
@@ -491,14 +514,32 @@ def urlunparse(components):
         url = "%s;%s" % (url, params)
     return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))
 
+# Matches if path could be confused with a scheme, i.e. a relative path
+# without a leading dot that includes a colon in the first component.
+_is_scheme_like = re.compile(r'[^/.][^/]*:').match
+
 def urlunsplit(components):
     """Combine the elements of a tuple as returned by urlsplit() into a
     complete URL as a string. The data argument can be any five-item iterable.
     This may result in a slightly different, but equivalent URL, if the URL that
     was parsed originally had unnecessary delimiters (for example, a ? with an
     empty query; the RFC states that these are equivalent)."""
-    scheme, netloc, url, query, fragment, _coerce_result = (
+    scheme, netloc, path, query, fragment, _coerce_result = (
                                           _coerce_args(*components))
+    if not scheme and not netloc:
+        # Building a relative URI.  Need to be careful that path is not
+        # confused with scheme or netloc.
+        if path.startswith('//'):
+            # gh-87389: don't treat first component of path as netloc
+            url = '/' + path.lstrip('/')
+        elif _is_scheme_like(path):
+            # first component has colon, ensure it will not be parsed as the
+            # scheme
+            url = './' + path
+        else:
+            url = path
+    else:
+        url = path
     if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
         if url and url[:1] != '/': url = '/' + url
         url = '//' + (netloc or '') + url

diff --git a/Misc/NEWS.d/next/Security/2022-06-16-12-13-55.gh-issue-87389.MS9wAR.rst b/Misc/NEWS.d/next/Security/2022-06-16-12-13-55.gh-issue-87389.MS9wAR.rst
@@ -0,0 +1,8 @@
+:mod:`http.server`: Fix an open redirection vulnerability in the HTTP server
+when a URI path starts with ``//``.  Vulnerability discovered, and initial
+fix proposed, by Hamza Avvan.  Change :func:`urllib.parse.urlunsplit` to
+sanitize the ``path`` argument in order to avoid mistaking the first component
+of the path for a net location or scheme.  Add :func:`urllib.parse.pathsplit`
+function.
+
+Co-authored-by: Gregory P. Smith <gps@google.com>