diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index ce82552a3ae4be..52f011e01fb085 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -147,18 +147,34 @@ The :mod:`urllib.request` module defines the following functions: attribute to modify its position in the handlers list. -.. function:: pathname2url(path) +.. function:: pathname2url(path, include_scheme=False) - Convert the pathname *path* from the local syntax for a path to the form used in - the path component of a URL. This does not produce a complete URL. The return - value will already be quoted using the :func:`~urllib.parse.quote` function. + Convert the local pathname *path* to a percent-encoded URL. If + *include_scheme* is false (the default), the URL is returned without a + ``file:`` scheme prefix; set this argument to true to generate a complete + URL. + .. versionchanged:: 3.14 + The *include_scheme* argument was added. -.. function:: url2pathname(path) + .. versionchanged:: 3.14 + Generates :rfc:`8089`-compliant file URLs for absolute paths. URLs for + UNC paths on Windows systems begin with two slashes (previously four.) + URLs for absolute paths on non-Windows systems begin with three slashes + (previously one.) + + +.. function:: url2pathname(url) + + Convert the percent-encoded *url* to a local pathname. + + .. versionchanged:: 3.14 + Supports :rfc:`8089`-compliant file URLs. Raises + :exc:`~urllib.error.URLError` if a scheme other than ``file:`` is used. + If the URL uses a non-local authority, then on Windows a UNC path is + returned, and on other platforms a :exc:`~urllib.error.URLError` + exception is raised. - Convert the path component *path* from a percent-encoded URL to the local syntax for a - path. This does not accept a complete URL. This function uses - :func:`~urllib.parse.unquote` to decode *path*. .. 
function:: getproxies() diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 7f9e3107a6e1a0..5b56433f0bd0f5 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -447,6 +447,28 @@ unittest (Contributed by Jacob Walls in :gh:`80958`.) +urllib.request +-------------- + +* Improve support for ``file:`` URIs in :mod:`urllib.request`: + + * :func:`~urllib.request.pathname2url` accepts an *include_scheme* + argument, which defaults to false. When set to true, a complete URL + with a ``file:`` prefix is returned. + * :func:`~urllib.request.url2pathname` discards a ``file:`` prefix if given. + * On Windows, :func:`~urllib.request.pathname2url` generates URIs that + begin with two slashes (rather than four) when given a UNC path. + * On non-Windows platforms, :func:`~urllib.request.pathname2url` generates + URIs that begin with three slashes (rather than one) when given an + absolute path. :func:`~urllib.request.url2pathname` performs the opposite + transformation, so ``file:///etc/hosts`` becomes ``/etc/hosts``. + * On non-Windows platforms, :func:`~urllib.request.url2pathname` raises + :exc:`urllib.error.URLError` if the URI includes a non-local authority, + like ``file://other-machine/etc/hosts``. + + (Contributed by Barney Gale in :gh:`125866`.) + + .. Add improved modules above alphabetically, not here at the end. Optimizations diff --git a/Lib/nturl2path.py b/Lib/nturl2path.py index 6453f202c26d14..6165b65babc4be 100644 --- a/Lib/nturl2path.py +++ b/Lib/nturl2path.py @@ -1,9 +1,9 @@ """Convert a NT pathname to a file URL and vice versa. -This module only exists to provide OS-specific code +This module previously provided OS-specific code for urllib.requests, thus do not use directly. """ -# Testing is done through test_urllib. +# Testing is done through test_nturl2path. 
def url2pathname(url): """OS-specific conversion from a relative URL of the 'file' scheme diff --git a/Lib/test/test_nturl2path.py b/Lib/test/test_nturl2path.py new file mode 100644 index 00000000000000..28e550f4b357a5 --- /dev/null +++ b/Lib/test/test_nturl2path.py @@ -0,0 +1,111 @@ +import nturl2path +import unittest +import urllib.parse + + +class nturl2path_Tests(unittest.TestCase): + """Test pathname2url() and url2pathname()""" + + def test_basic(self): + # Make sure simple tests pass + expected_path = "parts\\of\\a\\path" + expected_url = "parts/of/a/path" + result = nturl2path.pathname2url(expected_path) + self.assertEqual(expected_url, result, + "pathname2url() failed; %s != %s" % + (result, expected_url)) + result = nturl2path.url2pathname(expected_url) + self.assertEqual(expected_path, result, + "url2pathame() failed; %s != %s" % + (result, expected_path)) + + def test_quoting(self): + # Test automatic quoting and unquoting works for pathnam2url() and + # url2pathname() respectively + given = "needs\\quot=ing\\here" + expect = "needs/%s/here" % urllib.parse.quote("quot=ing") + result = nturl2path.pathname2url(given) + self.assertEqual(expect, result, + "pathname2url() failed; %s != %s" % + (expect, result)) + expect = given + result = nturl2path.url2pathname(result) + self.assertEqual(expect, result, + "url2pathname() failed; %s != %s" % + (expect, result)) + given = "make sure\\using_quote" + expect = "%s/using_quote" % urllib.parse.quote("make sure") + result = nturl2path.pathname2url(given) + self.assertEqual(expect, result, + "pathname2url() failed; %s != %s" % + (expect, result)) + given = "make+sure/using_unquote" + expect = "make+sure\\using_unquote" + result = nturl2path.url2pathname(given) + self.assertEqual(expect, result, + "url2pathname() failed; %s != %s" % + (expect, result)) + + def test_pathname2url(self): + # Test special prefixes are correctly handled in pathname2url() + fn = nturl2path.pathname2url + 
self.assertEqual(fn('\\\\?\\C:\\dir'), '///C:/dir') + self.assertEqual(fn('\\\\?\\unc\\server\\share\\dir'), '/server/share/dir') + self.assertEqual(fn("C:"), '///C:') + self.assertEqual(fn("C:\\"), '///C:') + self.assertEqual(fn('C:\\a\\b.c'), '///C:/a/b.c') + self.assertEqual(fn('C:\\a\\b%#c'), '///C:/a/b%25%23c') + self.assertEqual(fn('C:\\a\\b\xe9'), '///C:/a/b%C3%A9') + self.assertEqual(fn('C:\\foo\\bar\\spam.foo'), "///C:/foo/bar/spam.foo") + # Long drive letter + self.assertRaises(IOError, fn, "XX:\\") + # No drive letter + self.assertEqual(fn("\\folder\\test\\"), '/folder/test/') + self.assertEqual(fn("\\\\folder\\test\\"), '////folder/test/') + self.assertEqual(fn("\\\\\\folder\\test\\"), '/////folder/test/') + self.assertEqual(fn('\\\\some\\share\\'), '////some/share/') + self.assertEqual(fn('\\\\some\\share\\a\\b.c'), '////some/share/a/b.c') + self.assertEqual(fn('\\\\some\\share\\a\\b%#c\xe9'), '////some/share/a/b%25%23c%C3%A9') + # Round-tripping + urls = ['///C:', + '/////folder/test/', + '///C:/foo/bar/spam.foo'] + for url in urls: + self.assertEqual(fn(nturl2path.url2pathname(url)), url) + + def test_url2pathname(self): + fn = nturl2path.url2pathname + self.assertEqual(fn('/C:/'), 'C:\\') + self.assertEqual(fn("///C|"), 'C:') + self.assertEqual(fn("///C:"), 'C:') + self.assertEqual(fn('///C:/'), 'C:\\') + self.assertEqual(fn('/C|//'), 'C:\\') + self.assertEqual(fn('///C|/path'), 'C:\\path') + # No DOS drive + self.assertEqual(fn("///C/test/"), '\\\\\\C\\test\\') + self.assertEqual(fn("////C/test/"), '\\\\C\\test\\') + # DOS drive paths + self.assertEqual(fn('C:/path/to/file'), 'C:\\path\\to\\file') + self.assertEqual(fn('C|/path/to/file'), 'C:\\path\\to\\file') + self.assertEqual(fn('/C|/path/to/file'), 'C:\\path\\to\\file') + self.assertEqual(fn('///C|/path/to/file'), 'C:\\path\\to\\file') + self.assertEqual(fn("///C|/foo/bar/spam.foo"), 'C:\\foo\\bar\\spam.foo') + # Non-ASCII drive letter + self.assertRaises(IOError, fn, "///\u00e8|/") + # UNC 
paths + self.assertEqual(fn('//server/path/to/file'), '\\\\server\\path\\to\\file') + self.assertEqual(fn('////server/path/to/file'), '\\\\server\\path\\to\\file') + self.assertEqual(fn('/////server/path/to/file'), '\\\\\\server\\path\\to\\file') + # Localhost paths + self.assertEqual(fn('//localhost/C:/path/to/file'), 'C:\\path\\to\\file') + self.assertEqual(fn('//localhost/C|/path/to/file'), 'C:\\path\\to\\file') + # Round-tripping + paths = ['C:', + r'\\\C\test\\', + r'C:\foo\bar\spam.foo'] + for path in paths: + self.assertEqual(fn(nturl2path.pathname2url(path)), path) + + +if __name__ == '__main__': + unittest.main() diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 3ee17f96b817e1..1c0ad86f8b3a85 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -713,7 +713,7 @@ def constructLocalFileUrl(self, filePath): filePath.encode("utf-8") except UnicodeEncodeError: raise unittest.SkipTest("filePath is not encodable to utf8") - return "file://%s" % urllib.request.pathname2url(filePath) + return urllib.request.pathname2url(filePath, include_scheme=True) def createNewTempFile(self, data=b""): """Creates a new temporary file containing the specified data, @@ -1526,15 +1526,17 @@ def test_pathname2url_win(self): self.assertEqual(fn('\\\\?\\C:\\dir'), '///C:/dir') self.assertEqual(fn('\\\\?\\unc\\server\\share\\dir'), '//server/share/dir') self.assertEqual(fn("C:"), '///C:') - self.assertEqual(fn("C:\\"), '///C:') + # Path root is meaningful and should be preserved. 
+ self.assertEqual(fn("C:\\"), '///C:/') self.assertEqual(fn('C:\\a\\b.c'), '///C:/a/b.c') self.assertEqual(fn('C:\\a\\b%#c'), '///C:/a/b%25%23c') self.assertEqual(fn('C:\\a\\b\xe9'), '///C:/a/b%C3%A9') self.assertEqual(fn('C:\\foo\\bar\\spam.foo'), "///C:/foo/bar/spam.foo") - # Long drive letter - self.assertRaises(IOError, fn, "XX:\\") - # No drive letter - self.assertEqual(fn("\\folder\\test\\"), '/folder/test/') + # Long drive letter: treat as relative path, like ntpath.isabs()/splitroot() + self.assertEqual(fn("XX:\\"), "XX%3A/") + # No drive letter: use empty authority + self.assertEqual(fn("\\folder\\test\\"), '///folder/test/') + # UNC paths: UNC server becomes URL authority self.assertEqual(fn("\\\\folder\\test\\"), '//folder/test/') self.assertEqual(fn("\\\\\\folder\\test\\"), '///folder/test/') self.assertEqual(fn('\\\\some\\share\\'), '//some/share/') @@ -1551,9 +1553,10 @@ def test_pathname2url_win(self): 'test specific to POSIX pathnames') def test_pathname2url_posix(self): fn = urllib.request.pathname2url - self.assertEqual(fn('/'), '/') - self.assertEqual(fn('/a/b.c'), '/a/b.c') - self.assertEqual(fn('/a/b%#c'), '/a/b%25%23c') + # Absolute paths: use zero-length authority. 
+ self.assertEqual(fn('/'), '///') + self.assertEqual(fn('/a/b.c'), '///a/b.c') + self.assertEqual(fn('/a/b%#c'), '///a/b%25%23c') @unittest.skipUnless(sys.platform == 'win32', 'test specific to Windows pathnames.') @@ -1563,29 +1566,29 @@ def test_url2pathname_win(self): self.assertEqual(fn("///C|"), 'C:') self.assertEqual(fn("///C:"), 'C:') self.assertEqual(fn('///C:/'), 'C:\\') - self.assertEqual(fn('/C|//'), 'C:\\') + self.assertEqual(fn('/C|//'), 'C:\\\\') self.assertEqual(fn('///C|/path'), 'C:\\path') # No DOS drive - self.assertEqual(fn("///C/test/"), '\\\\\\C\\test\\') + self.assertEqual(fn("///C/test/"), '\\C\\test\\') self.assertEqual(fn("////C/test/"), '\\\\C\\test\\') - # DOS drive paths - self.assertEqual(fn('C:/path/to/file'), 'C:\\path\\to\\file') + # DOS drive paths: see RFC 8089 (D.2.) + self.assertEqual(fn('file:C:/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn('C|/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn('/C|/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn('///C|/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn("///C|/foo/bar/spam.foo"), 'C:\\foo\\bar\\spam.foo') - # Non-ASCII drive letter - self.assertRaises(IOError, fn, "///\u00e8|/") - # UNC paths + # Non-ASCII drive letter: treat as real DOS drive, like ntpath.isabs()/splitroot() + self.assertEqual(fn("///\u00e8|/"), "\u00e8:\\") + # UNC paths: see RFC 8089 (E.3.) self.assertEqual(fn('//server/path/to/file'), '\\\\server\\path\\to\\file') self.assertEqual(fn('////server/path/to/file'), '\\\\server\\path\\to\\file') - self.assertEqual(fn('/////server/path/to/file'), '\\\\\\server\\path\\to\\file') - # Localhost paths + self.assertEqual(fn('/////server/path/to/file'), '\\\\server\\path\\to\\file') + # Localhost paths: see RFC 8089 (2.) 
self.assertEqual(fn('//localhost/C:/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn('//localhost/C|/path/to/file'), 'C:\\path\\to\\file') # Round-tripping paths = ['C:', - r'\\\C\test\\', + r'\C\test\\', r'C:\foo\bar\spam.foo'] for path in paths: self.assertEqual(fn(urllib.request.pathname2url(path)), path) @@ -1595,10 +1598,12 @@ def test_url2pathname_win(self): def test_url2pathname_posix(self): fn = urllib.request.url2pathname self.assertEqual(fn('/foo/bar'), '/foo/bar') - self.assertEqual(fn('//foo/bar'), '//foo/bar') - self.assertEqual(fn('///foo/bar'), '///foo/bar') - self.assertEqual(fn('////foo/bar'), '////foo/bar') - self.assertEqual(fn('//localhost/foo/bar'), '//localhost/foo/bar') + # URI from a machine called 'foo': should raise URLError + self.assertRaises(urllib.error.URLError, fn, '//foo/bar') + # URI with empty or local authority: discard authority section + self.assertEqual(fn('///foo/bar'), '/foo/bar') + self.assertEqual(fn('////foo/bar'), '//foo/bar') + self.assertEqual(fn('//localhost/foo/bar'), '/foo/bar') class Utility_Tests(unittest.TestCase): """Testcase to test the various utility functions in the urllib.""" diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py index b90ccc2f125b93..312478d7afc0a4 100644 --- a/Lib/test/test_urllib2.py +++ b/Lib/test/test_urllib2.py @@ -43,10 +43,6 @@ def test___all__(self): context = {} exec('from urllib.%s import *' % module, context) del context['__builtins__'] - if module == 'request' and os.name == 'nt': - u, p = context.pop('url2pathname'), context.pop('pathname2url') - self.assertEqual(u.__module__, 'nturl2path') - self.assertEqual(p.__module__, 'nturl2path') for k, v in context.items(): self.assertEqual(v.__module__, 'urllib.%s' % module, "%r is exposed in 'urllib.%s' but defined in %r" % @@ -827,14 +823,15 @@ def test_file(self): urls = [ "file://localhost%s" % urlpath, "file://%s" % urlpath, - "file://%s%s" % (socket.gethostbyname('localhost'), urlpath), ] - try: - localaddr = 
socket.gethostbyname(socket.gethostname()) - except socket.gaierror: - localaddr = '' - if localaddr: - urls.append("file://%s%s" % (localaddr, urlpath)) + if os.name != 'nt': + urls.append("file://%s%s" % (socket.gethostbyname('localhost'), urlpath)) + try: + localaddr = socket.gethostbyname(socket.gethostname()) + except socket.gaierror: + localaddr = '' + if localaddr: + urls.append("file://%s%s" % (localaddr, urlpath)) for url in urls: f = open(TESTFN, "wb") diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index bc35d8a80e5d03..5c8bceaa4b4be7 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -1448,16 +1448,6 @@ def parse_http_list(s): return [part.strip() for part in res] class FileHandler(BaseHandler): - # Use local file or FTP depending on form of URL - def file_open(self, req): - url = req.selector - if url[:2] == '//' and url[2:3] != '/' and (req.host and - req.host != 'localhost'): - if not req.host in self.get_names(): - raise URLError("file:// scheme is supported only on localhost") - else: - return self.open_local_file(req) - # names for the localhost names = None def get_names(self): @@ -1474,8 +1464,7 @@ def get_names(self): def open_local_file(self, req): import email.utils import mimetypes - host = req.host - filename = req.selector + filename = req.full_url localfile = url2pathname(filename) try: stats = os.stat(localfile) @@ -1485,24 +1474,22 @@ def open_local_file(self, req): headers = email.message_from_string( 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % (mtype or 'text/plain', size, modified)) - if host: - host, port = _splitport(host) - if not host or \ - (not port and _safe_gethostbyname(host) in self.get_names()): - if host: - origurl = 'file://' + host + filename - else: - origurl = 'file://' + filename - return addinfourl(open(localfile, 'rb'), headers, origurl) + return addinfourl(open(localfile, 'rb'), headers, filename) except OSError as exp: raise URLError(exp) - raise URLError('file not on 
local host') -def _safe_gethostbyname(host): + file_open = open_local_file + + +def _is_local_host(host): + if not host or host == 'localhost': + return True try: - return socket.gethostbyname(host) + name = socket.gethostbyname(host) except socket.gaierror: - return None + return False + return name in FileHandler().get_names() + class FTPHandler(BaseHandler): def ftp_open(self, req): @@ -1649,19 +1636,53 @@ def data_open(self, req): MAXFTPCACHE = 10 # Trim the ftp cache beyond this size -# Helper for non-unix systems -if os.name == 'nt': - from nturl2path import url2pathname, pathname2url -else: - def url2pathname(pathname): - """OS-specific conversion from a relative URL of the 'file' scheme - to a file system path; not recommended for general use.""" - return unquote(pathname) - - def pathname2url(pathname): - """OS-specific conversion from a file system path to a relative URL - of the 'file' scheme; not recommended for general use.""" - return quote(pathname) +def pathname2url(path, include_scheme=False): + """Convert the local pathname *path* to a percent-encoded URL.""" + prefix = 'file:' if include_scheme else '' + if os.name == 'nt': + path = path.replace('\\', '/') + drive, root, tail = os.path.splitroot(path) + if drive: + # Handle special UNC prefixes + if drive[:4] == '//?/': + drive = drive[4:] + if drive[:4].upper() == 'UNC/': + drive = '//' + drive[4:] + # DOS drives are preceded by three slashes + if drive[1:2] == ':': + prefix += '///' + elif root: + # Rooted paths are preceded by two slashes + prefix += '//' + tail = quote(tail) + return prefix + drive + root + tail + +def url2pathname(url): + """Convert the percent-encoded URL *url* to a local pathname.""" + scheme, authority, path = urlsplit(url, scheme='file')[:3] + if scheme != 'file': + raise URLError(f'URL {url!r} uses non-`file` scheme {scheme!r}') + if os.name == 'nt': + path = unquote(path) + if authority and authority != 'localhost': + # e.g. 
file://server/share/path + path = f'//{authority}{path}' + elif path.startswith('///'): + # e.g. file://///server/share/path + path = path[1:] + else: + if path[0:1] == '/' and path[2:3] in (':', '|'): + # e.g. file:///c:/path + path = path[1:] + if path[1:2] == '|': + # e.g. file:///c|path + path = path[:1] + ':' + path[2:] + path = path.replace('/', '\\') + else: + if not _is_local_host(authority): + raise URLError(f'URL {url!r} uses non-local authority {authority!r}') + path = unquote(path) + return path ftpcache = {} diff --git a/Misc/NEWS.d/next/Library/2024-10-30-01-00-24.gh-issue-125866.hj0R4P.rst b/Misc/NEWS.d/next/Library/2024-10-30-01-00-24.gh-issue-125866.hj0R4P.rst new file mode 100644 index 00000000000000..cb0cdaa8fc0962 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-10-30-01-00-24.gh-issue-125866.hj0R4P.rst @@ -0,0 +1,15 @@ +Improve support for ``file:`` URIs in :mod:`urllib.request`: + +* :func:`~urllib.request.pathname2url` accepts an *include_scheme* + argument, which defaults to false. When set to true, a complete URL + with a ``file:`` prefix is returned. +* :func:`~urllib.request.url2pathname` discards a ``file:`` prefix if given. +* On Windows, :func:`~urllib.request.pathname2url` generates URIs that + begin with two slashes (rather than four) when given a UNC path. +* On non-Windows platforms, :func:`~urllib.request.pathname2url` generates + URIs that begin with three slashes (rather than one) when given an + absolute path. :func:`~urllib.request.url2pathname` performs the opposite + transformation, so ``file:///etc/hosts`` becomes ``/etc/hosts``. +* On non-Windows platforms, :func:`~urllib.request.url2pathname` raises + :exc:`urllib.error.URLError` if the URI includes a non-local authority, + like ``file://other-machine/etc/hosts``.