Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-117586: Speed up pathlib.Path.walk() by working with strings #117726

Merged
merged 1 commit into from Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
37 changes: 37 additions & 0 deletions Lib/glob.py
Expand Up @@ -498,3 +498,40 @@ def select_exists(self, path, exists=False):
yield path
except OSError:
pass

@classmethod
def walk(cls, root, top_down, on_error, follow_symlinks):
    """Walk the directory tree from the given root, similar to os.walk().

    Yields ``(path, dirnames, filenames)`` tuples. *path* is either *root*
    or a value produced by ``cls.concat_path()`` (top-down) or
    ``cls.parse_entry()`` (bottom-up); the two lists hold entry names.

    If *top_down* is true, a directory is yielded before its
    subdirectories, and the subdirectories visited are whatever names are
    in *dirnames* after the consumer resumes the generator. Otherwise a
    directory is yielded after all of its subdirectories.

    If scanning a directory raises OSError, *on_error* (when not None) is
    called with the exception and that directory is skipped entirely: no
    tuple is yielded for it and its children are not visited, in either
    traversal order.

    *follow_symlinks* is forwarded to ``entry.is_dir()``. Entries for which
    that check raises OSError are reported as files.
    """
    paths = [root]
    while paths:
        path = paths.pop()
        if isinstance(path, tuple):
            # A deferred bottom-up result: everything pushed after it has
            # been processed, so it can be yielded now.
            yield path
            continue
        dirnames = []
        filenames = []
        subdirs = []
        try:
            with cls.scandir(path) as scandir_it:
                for entry in scandir_it:
                    name = entry.name
                    try:
                        if entry.is_dir(follow_symlinks=follow_symlinks):
                            if not top_down:
                                subdirs.append(cls.parse_entry(entry))
                            dirnames.append(name)
                        else:
                            filenames.append(name)
                    except OSError:
                        filenames.append(name)
        except OSError as error:
            # Nothing has been pushed onto the stack yet, so a failed scan
            # leaves no partial result behind.
            if on_error is not None:
                on_error(error)
            continue
        if top_down:
            yield path, dirnames, filenames
            # Read dirnames only after the yield so that in-place edits
            # made by the consumer take effect.
            if dirnames:
                prefix = cls.add_slash(path)
                paths += [cls.concat_path(prefix, d) for d in reversed(dirnames)]
        else:
            # Push the result first so it is popped (and yielded) only
            # after every subdirectory pushed above it has been handled.
            paths.append((path, dirnames, filenames))
            paths += subdirs
20 changes: 6 additions & 14 deletions Lib/pathlib/__init__.py
Expand Up @@ -586,18 +586,6 @@ def iterdir(self):
"""
return (self._make_child_relpath(name) for name in os.listdir(self))

def _scandir(self):
return os.scandir(self)

def _make_child_direntry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
path_str = entry.name if str(self) == '.' else entry.path
path = self.with_segments(path_str)
path._str = path_str
path._drv = self.drive
path._root = self.root
path._tail_cached = self._tail + [entry.name]
return path

def _make_child_relpath(self, name):
if not name:
Expand Down Expand Up @@ -663,8 +651,12 @@ def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
def walk(self, top_down=True, on_error=None, follow_symlinks=False):
    """Walk the directory tree from this directory, similar to os.walk().

    Yields ``(dirpath, dirnames, filenames)`` tuples, where *dirpath* is a
    path object built with ``self._from_parsed_string()`` and the two lists
    are passed through unchanged from ``self._globber.walk()``.

    The traversal itself runs on strings: ``str(self)`` is handed to the
    globber together with *top_down*, *on_error* and *follow_symlinks*.
    """
    sys.audit("pathlib.Path.walk", self, on_error, follow_symlinks)
    root_dir = str(self)
    # When the root is '.', the first two characters of every result string
    # are dropped before building the path object (presumably the './'
    # prefix added by the string walker; the root '.' itself becomes '').
    strip_prefix = root_dir == '.'
    results = self._globber.walk(root_dir, top_down, on_error, follow_symlinks)
    for path_str, dirnames, filenames in results:
        if strip_prefix:
            path_str = path_str[2:]
        yield self._from_parsed_string(path_str), dirnames, filenames

def absolute(self):
"""Return an absolute version of this path
Expand Down
65 changes: 8 additions & 57 deletions Lib/pathlib/_abc.py
Expand Up @@ -45,9 +45,15 @@ def _is_case_sensitive(parser):

class Globber(glob._Globber):
lstat = operator.methodcaller('lstat')
scandir = operator.methodcaller('_scandir')
add_slash = operator.methodcaller('joinpath', '')

@staticmethod
def scandir(path):
    """Emulate os.scandir() for a path object.

    os.scandir() returns an object usable as a context manager, so the
    iterator from ``path.iterdir()`` is wrapped in a nullcontext. This
    method is called by walk() and glob().
    """
    from contextlib import nullcontext
    children = path.iterdir()
    return nullcontext(children)

@staticmethod
def concat_path(path, text):
"""Appends text to the given path.
Expand Down Expand Up @@ -677,20 +683,6 @@ def iterdir(self):
"""
raise UnsupportedOperation(self._unsupported_msg('iterdir()'))

def _scandir(self):
# Emulate os.scandir(), which returns an object that can be used as a
# context manager. This method is called by walk() and glob().
from contextlib import nullcontext
return nullcontext(self.iterdir())

def _make_child_direntry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
# PathBase._scandir() yields PathBase objects, so this is a no-op.
return entry

def _make_child_relpath(self, name):
return self.joinpath(name)

def _glob_selector(self, parts, case_sensitive, recurse_symlinks):
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self.parser)
Expand Down Expand Up @@ -724,48 +716,7 @@ def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):

def walk(self, top_down=True, on_error=None, follow_symlinks=False):
    """Walk the directory tree from this directory, similar to os.walk().

    The traversal is delegated to ``self._globber.walk()``, which receives
    this path object as the root together with *top_down*, *on_error* and
    *follow_symlinks*; whatever it returns is returned unchanged.
    """
    # All traversal logic lives in the globber's walk(); keeping a second
    # copy of the loop here would turn this method into a generator that
    # ignores the delegated result.
    return self._globber.walk(self, top_down, on_error, follow_symlinks)

def absolute(self):
"""Return an absolute version of this path
Expand Down
@@ -0,0 +1 @@
Speed up :meth:`pathlib.Path.walk` by working with strings internally.