From 0a4f60a77e85af697d536717454a101cadc870d3 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 1 Feb 2024 08:08:35 +0000 Subject: [PATCH 01/17] GH-114847: Speed up `posixpath.realpath()` Apply the following optimizations to `posixpath.realpath()`: - Remove use of recursion - Directly construct child paths rather than using `join()` - Use `os.getcwd[b]()` rather than `abspath()` - Use `startswith(sep)` rather than `isabs()` --- Lib/posixpath.py | 83 +++++++++++-------- ...-02-01-08-09-20.gh-issue-114847.-JrWrR.rst | 1 + 2 files changed, 49 insertions(+), 35 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-02-01-08-09-20.gh-issue-114847.-JrWrR.rst diff --git a/Lib/posixpath.py b/Lib/posixpath.py index e4f155e41a3221..91e256f72a1103 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -432,49 +432,46 @@ def realpath(filename, *, strict=False): """Return the canonical path of the specified filename, eliminating any symbolic links encountered in the path.""" filename = os.fspath(filename) - path, ok = _joinrealpath(filename[:0], filename, strict, {}) - return abspath(path) - -# Join two paths, normalizing and eliminating any symbolic links -# encountered in the second path. -def _joinrealpath(path, rest, strict, seen): - if isinstance(path, bytes): + if isinstance(filename, bytes): sep = b'/' curdir = b'.' pardir = b'..' + getcwd = os.getcwdb else: sep = '/' curdir = '.' pardir = '..' - - if isabs(rest): - rest = rest[1:] - path = sep - - while rest: - name, _, rest = rest.partition(sep) + getcwd = os.getcwd + + seen = {} + stack = [] + querying = True + path = sep if filename.startswith(sep) else getcwd() + for part in reversed(filename.split(sep)): + stack.append((False, part)) + + while stack: + is_symlink, name = stack.pop() + if is_symlink: + # resolved symlink + seen[name] = path + continue if not name or name == curdir: # current dir continue if name == pardir: # parent dir - if path: - path, name = split(path) - if name == pardir: - path = join(path, pardir, pardir) + newpath, name = split(path) + if name == pardir: + path = path + sep + pardir else: - path = pardir + path = newpath continue - newpath = join(path, name) - try: - st = os.lstat(newpath) - except OSError: - if strict: - raise - is_link = False + if len(path) == 1: + newpath = path + name else: - is_link = stat.S_ISLNK(st.st_mode) - if not is_link: + newpath = path + sep + name + if not querying: path = newpath continue # Resolve the symbolic link @@ -490,14 +487,30 @@ def _joinrealpath(path, rest, strict, seen): os.stat(newpath) else: # Return already resolved part + rest of the path unchanged. - return join(newpath, rest), False - seen[newpath] = None # not resolved symlink - path, ok = _joinrealpath(path, os.readlink(newpath), strict, seen) - if not ok: - return join(path, rest), False - seen[newpath] = path # resolved symlink + path = newpath + querying = False + continue + try: + st = os.lstat(newpath) + if not stat.S_ISLNK(st.st_mode): + path = newpath + continue + target = os.readlink(newpath) + except OSError: + if strict: + raise + else: + path = newpath + querying = False + continue - return path, True + seen[newpath] = None # not resolved symlink + if target.startswith(sep): + path = sep + stack.append((True, newpath)) + for part in reversed(target.split(sep)): + stack.append((False, part)) + return path supports_unicode_filenames = (sys.platform == 'darwin') diff --git a/Misc/NEWS.d/next/Library/2024-02-01-08-09-20.gh-issue-114847.-JrWrR.rst b/Misc/NEWS.d/next/Library/2024-02-01-08-09-20.gh-issue-114847.-JrWrR.rst new file mode 100644 index 00000000000000..bf011fed3efdbc --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-02-01-08-09-20.gh-issue-114847.-JrWrR.rst @@ -0,0 +1 @@ +Speed up :func:`os.path.realpath` on non-Windows platforms. From 5aa9c51dc5ed69ea3c624839a8ed7430cef7299e Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 1 Feb 2024 08:31:23 +0000 Subject: [PATCH 02/17] Tiny tweak --- Lib/posixpath.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index 91e256f72a1103..63d95fc0f6948d 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -467,7 +467,7 @@ def realpath(filename, *, strict=False): else: path = newpath continue - if len(path) == 1: + if path == sep: newpath = path + name else: newpath = path + sep + name From b7587109db4d1c2e47e4b75901c01477a1427f94 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 1 Feb 2024 08:35:43 +0000 Subject: [PATCH 03/17] Slightly speed up handling of empty/dot parts --- Lib/posixpath.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index 63d95fc0f6948d..504a05ee9987ab 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -448,7 +448,8 @@ def realpath(filename, *, strict=False): querying = True path = sep if filename.startswith(sep) else getcwd() for part in reversed(filename.split(sep)): - stack.append((False, part)) + if part and part != curdir: + stack.append((False, part)) while stack: is_symlink, name = stack.pop() @@ -456,9 +457,6 @@ def realpath(filename, *, strict=False): # resolved symlink seen[name] = path continue - if not name or name == curdir: - # current dir - continue if name == pardir: # parent dir newpath, name = split(path) @@ -509,7 +507,8 @@ def realpath(filename, *, strict=False): path = sep stack.append((True, newpath)) for part in reversed(target.split(sep)): - stack.append((False, part)) + if part and part != curdir: + stack.append((False, part)) return path From 60d3bcb0d7af66efe64bde861b712f6e411a8147 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 31 Mar 2024 19:38:16 +0100 Subject: [PATCH 04/17] Simplify '..' handling. --- Lib/posixpath.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index 186acc3a999a8c..29d50cd46169e8 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -439,11 +439,7 @@ def realpath(filename, *, strict=False): continue if name == pardir: # parent dir - newpath, name = split(path) - if name == pardir: - path = path + sep + pardir - else: - path = newpath + path = dirname(path) continue if path == sep: newpath = path + name From b8493081b9fef2e066c1829daae1ef2a33213bcc Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 31 Mar 2024 19:55:06 +0100 Subject: [PATCH 05/17] A few more perf improvements and simplifications. --- Lib/posixpath.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index 29d50cd46169e8..1fcd7c2c384704 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -424,12 +424,9 @@ def realpath(filename, *, strict=False): getcwd = os.getcwd seen = {} - stack = [] - querying = True path = sep if filename.startswith(sep) else getcwd() - for part in reversed(filename.split(sep)): - if part and part != curdir: - stack.append((False, part)) + stack = [(False, part) for part in reversed(filename.split(sep))] + querying = True while stack: is_symlink, name = stack.pop() @@ -437,17 +434,22 @@ def realpath(filename, *, strict=False): # resolved symlink seen[name] = path continue + if not name or name == curdir: + # current dir + continue if name == pardir: # parent dir - path = dirname(path) + path = path[:path.rfind(sep)] or sep continue if path == sep: newpath = path + name else: newpath = path + sep + name + if not querying: path = newpath continue + # Resolve the symbolic link if newpath in seen: # Already seen this path @@ -482,9 +484,7 @@ def realpath(filename, *, strict=False): if target.startswith(sep): path = sep stack.append((True, newpath)) - for part in reversed(target.split(sep)): - if part and part != curdir: - stack.append((False, part)) + stack.extend((False, part) for part in reversed(target.split(sep))) return path From 085e1050ccaa0c0ab2dd02ca88794f727a2c90d4 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 31 Mar 2024 20:08:11 +0100 Subject: [PATCH 06/17] Simplify symlink resolution signalling --- Lib/posixpath.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index 1fcd7c2c384704..795a2f5e60f9ac 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -423,16 +423,16 @@ def realpath(filename, *, strict=False): pardir = '..' getcwd = os.getcwd - seen = {} path = sep if filename.startswith(sep) else getcwd() - stack = [(False, part) for part in reversed(filename.split(sep))] + rest = filename.split(sep)[::-1] + seen = {} querying = True - while stack: - is_symlink, name = stack.pop() - if is_symlink: + while rest: + name = rest.pop() + if name is None: # resolved symlink - seen[name] = path + seen[rest.pop()] = path continue if not name or name == curdir: # current dir @@ -445,11 +445,9 @@ def realpath(filename, *, strict=False): newpath = path + name else: newpath = path + sep + name - if not querying: path = newpath continue - # Resolve the symbolic link if newpath in seen: # Already seen this path @@ -483,8 +481,9 @@ def realpath(filename, *, strict=False): seen[newpath] = None # not resolved symlink if target.startswith(sep): path = sep - stack.append((True, newpath)) - stack.extend((False, part) for part in reversed(target.split(sep))) + rest.append(newpath) + rest.append(None) + rest.extend(target.split(sep)[::-1]) return path From 69dabd2aa0860476e27f3a899e7f4e633dae0784 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 31 Mar 2024 20:39:20 +0100 Subject: [PATCH 07/17] Tiny tweaks --- Lib/posixpath.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index 795a2f5e60f9ac..f02f83766fc48d 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -423,7 +423,7 @@ def realpath(filename, *, strict=False): pardir = '..' getcwd = os.getcwd - path = sep if filename.startswith(sep) else getcwd() + path = sep if filename[:1] == sep else getcwd() rest = filename.split(sep)[::-1] seen = {} querying = True @@ -463,7 +463,7 @@ def realpath(filename, *, strict=False): # Return already resolved part + rest of the path unchanged. path = newpath querying = False - continue + continue try: st = os.lstat(newpath) if not stat.S_ISLNK(st.st_mode): @@ -479,7 +479,7 @@ def realpath(filename, *, strict=False): continue seen[newpath] = None # not resolved symlink - if target.startswith(sep): + if target[:1] == sep: path = sep rest.append(newpath) rest.append(None) From 197c871edeec082fca9c152a8ddadf1c91bb8d6e Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 31 Mar 2024 20:40:44 +0100 Subject: [PATCH 08/17] Spacing --- Lib/posixpath.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index f02f83766fc48d..06c88a0062ad57 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -477,13 +477,13 @@ def realpath(filename, *, strict=False): path = newpath querying = False continue - seen[newpath] = None # not resolved symlink if target[:1] == sep: path = sep rest.append(newpath) rest.append(None) rest.extend(target.split(sep)[::-1]) + return path From ecd1fe7157e729a84d91a5bb5bd81fd413235a5c Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 1 Apr 2024 22:26:25 +0100 Subject: [PATCH 09/17] Comments --- Lib/posixpath.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index 06c88a0062ad57..1bfe95c245573a 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -423,15 +423,27 @@ def realpath(filename, *, strict=False): pardir = '..' getcwd = os.getcwd - path = sep if filename[:1] == sep else getcwd() + # The stack of unresolved path parts. rest = filename.split(sep)[::-1] + + # The resolved path, which is absolute throughout this function. + # Note: getcwd() returns a normalized and symlink-free path. + path = sep if filename[:1] == sep else getcwd() + + # Mapping from symlink paths to *fully resolved* symlink targets. If a + # symlink is encountered but not yet resolved, the value is None. This is + # used both to detect symlink loops and to speed up repeated traversals of + # the same links. seen = {} + + # Whether we're calling lstat() and readlink() to resolve symlinks. If we + # encounter an OSError in non-strict mode, this is switched off. querying = True while rest: name = rest.pop() if name is None: - # resolved symlink + # resolved symlink target seen[rest.pop()] = path continue if not name or name == curdir: @@ -474,14 +486,20 @@ def realpath(filename, *, strict=False): if strict: raise else: + # Return already resolved part + rest of the path unchanged. path = newpath querying = False continue - seen[newpath] = None # not resolved symlink if target[:1] == sep: + # Symlink target is absolute; reset resolved path. path = sep + seen[newpath] = None # not resolved symlink + # Push the symlink path onto the stack, and signal its specialness by + # also pushing None. When these entries are popped, we'll record the + # fully-resolved symlink target in the 'seen' mapping . rest.append(newpath) rest.append(None) + # Push the unresolved symlink target parts onto the stack. rest.extend(target.split(sep)[::-1]) return path From 4b83c97fa32fed7ce292b043f9fefdf67410c8dc Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 1 Apr 2024 22:39:32 +0100 Subject: [PATCH 10/17] Final tweaks --- Lib/posixpath.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index 1bfe95c245573a..343cb1e5ff2aa0 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -428,7 +428,7 @@ def realpath(filename, *, strict=False): # The resolved path, which is absolute throughout this function. # Note: getcwd() returns a normalized and symlink-free path. - path = sep if filename[:1] == sep else getcwd() + path = sep if filename.startswith(sep) else getcwd() # Mapping from symlink paths to *fully resolved* symlink targets. If a # symlink is encountered but not yet resolved, the value is None. This is @@ -490,13 +490,13 @@ def realpath(filename, *, strict=False): path = newpath querying = False continue - if target[:1] == sep: + if target.startswith(sep): # Symlink target is absolute; reset resolved path. path = sep seen[newpath] = None # not resolved symlink # Push the symlink path onto the stack, and signal its specialness by # also pushing None. When these entries are popped, we'll record the - # fully-resolved symlink target in the 'seen' mapping . + # fully-resolved symlink target in the 'seen' mapping. rest.append(newpath) rest.append(None) # Push the unresolved symlink target parts onto the stack. From 77712d2fccd83f0f748ea76c77ddd2a8005c7fb5 Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 2 Apr 2024 00:13:41 +0100 Subject: [PATCH 11/17] Further explain 'rest'. --- Lib/posixpath.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index 343cb1e5ff2aa0..4bdcbfa23f9586 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -423,7 +423,10 @@ def realpath(filename, *, strict=False): pardir = '..' getcwd = os.getcwd - # The stack of unresolved path parts. + # The stack of unresolved path parts. When popped, a special value of None + # indicates that a symlink target has been resolved, and that the original + # symlink path can be retrieved by popping again. The [::-1] slice is a + # very fast way of spelling list(reversed(...)). rest = filename.split(sep)[::-1] # The resolved path, which is absolute throughout this function. From 15199a87c55cf22d2b88f3068ec1d079a0357d05 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Tue, 2 Apr 2024 20:42:07 +0100 Subject: [PATCH 12/17] Update Lib/posixpath.py Co-authored-by: Petr Viktorin --- Lib/posixpath.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index 4bdcbfa23f9586..ef9038d23a1d6c 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -454,7 +454,7 @@ def realpath(filename, *, strict=False): continue if name == pardir: # parent dir - path = path[:path.rfind(sep)] or sep + path = path[:path.rindex(sep)] or sep continue if path == sep: newpath = path + name From 31955eead612107c8ec350fac8a5f8a03349ffe6 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 3 Apr 2024 20:21:56 +0100 Subject: [PATCH 13/17] Undo re-ordering of lstat() and seen lookup --- Lib/posixpath.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index 89136afa4e5816..14c552158a2250 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -454,6 +454,17 @@ def realpath(filename, *, strict=False): if not querying: path = newpath continue + try: + st = os.lstat(newpath) + except OSError: + if strict: + raise + is_link = False + else: + is_link = stat.S_ISLNK(st.st_mode) + if not is_link: + path = newpath + continue # Resolve the symbolic link if newpath in seen: # Already seen this path @@ -470,24 +481,11 @@ def realpath(filename, *, strict=False): path = newpath querying = False continue - try: - st = os.lstat(newpath) - if not stat.S_ISLNK(st.st_mode): - path = newpath - continue - target = os.readlink(newpath) - except OSError: - if strict: - raise - else: - # Return already resolved part + rest of the path unchanged. - path = newpath - querying = False - continue + seen[newpath] = None # not resolved symlink + target = os.readlink(newpath) if target.startswith(sep): # Symlink target is absolute; reset resolved path. path = sep - seen[newpath] = None # not resolved symlink # Push the symlink path onto the stack, and signal its specialness by # also pushing None. When these entries are popped, we'll record the # fully-resolved symlink target in the 'seen' mapping. From fe62fef0050638c6de40475fd0d353f10bd981e6 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 4 Apr 2024 01:37:44 +0100 Subject: [PATCH 14/17] Re-optimise link check --- Lib/posixpath.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index 14c552158a2250..ad895de3935b27 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -456,13 +456,12 @@ def realpath(filename, *, strict=False): continue try: st = os.lstat(newpath) + if not stat.S_ISLNK(st.st_mode): + path = newpath + continue except OSError: if strict: raise - is_link = False - else: - is_link = stat.S_ISLNK(st.st_mode) - if not is_link: path = newpath continue # Resolve the symbolic link From ebccd2bbd8d8c349c8a098e729b3e76653e59f3f Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 4 Apr 2024 01:43:19 +0100 Subject: [PATCH 15/17] Stop querying when `lstat()` fails. --- Lib/posixpath.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index ad895de3935b27..d9005ecfeb5786 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -463,6 +463,7 @@ def realpath(filename, *, strict=False): if strict: raise path = newpath + querying = False continue # Resolve the symbolic link if newpath in seen: From d2a024803aac420755071ba90e6e597af2065082 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 4 Apr 2024 14:11:28 +0100 Subject: [PATCH 16/17] Keep querying if we hit an OSError from lstat() --- Lib/posixpath.py | 1 - Lib/test/test_posixpath.py | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index d9005ecfeb5786..ad895de3935b27 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -463,7 +463,6 @@ def realpath(filename, *, strict=False): if strict: raise path = newpath - querying = False continue # Resolve the symbolic link if newpath in seen: diff --git a/Lib/test/test_posixpath.py b/Lib/test/test_posixpath.py index cbb7c4c52d9697..807f985f7f4df7 100644 --- a/Lib/test/test_posixpath.py +++ b/Lib/test/test_posixpath.py @@ -456,6 +456,15 @@ def test_realpath_relative(self): finally: os_helper.unlink(ABSTFN) + @os_helper.skip_unless_symlink + @skip_if_ABSTFN_contains_backslash + def test_realpath_missing_pardir(self): + try: + os.symlink(os_helper.TESTFN + "1", os_helper.TESTFN) + self.assertEqual(realpath("nonexistent/../" + os_helper.TESTFN), ABSTFN + "1") + finally: + os_helper.unlink(os_helper.TESTFN) + @os_helper.skip_unless_symlink @skip_if_ABSTFN_contains_backslash def test_realpath_symlink_loops(self): From eed739a6d26fa71b05a6ecd50a1f65dbb1e05c06 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Fri, 5 Apr 2024 13:02:19 +0100 Subject: [PATCH 17/17] Update Lib/posixpath.py Co-authored-by: Petr Viktorin --- Lib/posixpath.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/posixpath.py b/Lib/posixpath.py index ad895de3935b27..0e8bb5ab10d916 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -431,7 +431,8 @@ def realpath(filename, *, strict=False): seen = {} # Whether we're calling lstat() and readlink() to resolve symlinks. If we - # encounter an OSError in non-strict mode, this is switched off. + # encounter an OSError for a symlink loop in non-strict mode, this is + # switched off. querying = True while rest: