Skip to content

Commit

Permalink
Much faster implementation of FileList, for big egg_info speedups
Browse files Browse the repository at this point in the history
  • Loading branch information
mx-moth committed Oct 14, 2016
1 parent 7edbffc commit 44630f1
Show file tree
Hide file tree
Showing 3 changed files with 451 additions and 44 deletions.
262 changes: 232 additions & 30 deletions setuptools/command/egg_info.py
Expand Up @@ -3,6 +3,7 @@
Create a distribution's .egg-info directory and contents"""

from distutils.filelist import FileList as _FileList
from distutils.errors import DistutilsInternalError
from distutils.util import convert_path
from distutils import log
import distutils.errors
Expand All @@ -27,6 +28,7 @@
parse_requirements, safe_name, parse_version,
safe_version, yield_lines, EntryPoint, iter_entry_points, to_filename)
import setuptools.unicode_utils as unicode_utils
from setuptools.glob import glob

from pkg_resources.extern import packaging

Expand All @@ -36,6 +38,88 @@
pass


def translate_pattern(glob):
"""
Translate a file path glob like '*.txt' in to a regular expression.
This differs from fnmatch.translate which allows wildcards to match
directory separators. It also knows about '**/' which matches any number of
directories.
"""
pat = ''

# This will split on '/' within [character classes]. This is deliberate.
chunks = glob.split(os.path.sep)

sep = re.escape(os.sep)
valid_char = '[^%s]' % (sep,)

for c, chunk in enumerate(chunks):
last_chunk = c == len(chunks) - 1

# Chunks that are a literal ** are globstars. They match anything.
if chunk == '**':
if last_chunk:
# Match anything if this is the last component
pat += '.*'
else:
# Match '(name/)*'
pat += '(?:%s+%s)*' % (valid_char, sep)
continue # Break here as the whole path component has been handled

# Find any special characters in the remainder
i = 0
chunk_len = len(chunk)
while i < chunk_len:
char = chunk[i]
if char == '*':
# Match any number of name characters
pat += valid_char + '*'
elif char == '?':
# Match a name character
pat += valid_char
elif char == '[':
# Character class
inner_i = i + 1
# Skip initial !/] chars
if inner_i < chunk_len and chunk[inner_i] == '!':
inner_i = inner_i + 1
if inner_i < chunk_len and chunk[inner_i] == ']':
inner_i = inner_i + 1

# Loop till the closing ] is found
while inner_i < chunk_len and chunk[inner_i] != ']':
inner_i = inner_i + 1

if inner_i >= chunk_len:
# Got to the end of the string without finding a closing ]
# Do not treat this as a matching group, but as a literal [
pat += re.escape(char)
else:
# Grab the insides of the [brackets]
inner = chunk[i + 1:inner_i]
char_class = ''

# Class negation
if inner[0] == '!':
char_class = '^'
inner = inner[1:]

char_class += re.escape(inner)
pat += '[%s]' % (char_class,)

# Skip to the end ]
i = inner_i
else:
pat += re.escape(char)
i += 1

# Join each chunk with the dir separator
if not last_chunk:
pat += sep

return re.compile(pat + r'\Z(?ms)')


class egg_info(Command):
description = "create a distribution's .egg-info directory"

Expand Down Expand Up @@ -239,7 +323,151 @@ def check_broken_egg_info(self):


class FileList(_FileList):
"""File list that accepts only existing, platform-independent paths"""
# Implementations of the various MANIFEST.in commands

def process_template_line(self, line):
# Parse the line: split it up, make sure the right number of words
# is there, and return the relevant words. 'action' is always
# defined: it's the first word of the line. Which of the other
# three are defined depends on the action; it'll be either
# patterns, (dir and patterns), or (dir_pattern).
(action, patterns, dir, dir_pattern) = self._parse_template_line(line)

# OK, now we know that the action is valid and we have the
# right number of words on the line for that action -- so we
# can proceed with minimal error-checking.
if action == 'include':
self.debug_print("include " + ' '.join(patterns))
for pattern in patterns:
if not self.include(pattern):
log.warn("warning: no files found matching '%s'", pattern)

elif action == 'exclude':
self.debug_print("exclude " + ' '.join(patterns))
for pattern in patterns:
if not self.exclude(pattern):
log.warn(("warning: no previously-included files "
"found matching '%s'"), pattern)

elif action == 'global-include':
self.debug_print("global-include " + ' '.join(patterns))
for pattern in patterns:
if not self.global_include(pattern):
log.warn(("warning: no files found matching '%s' "
"anywhere in distribution"), pattern)

elif action == 'global-exclude':
self.debug_print("global-exclude " + ' '.join(patterns))
for pattern in patterns:
if not self.global_exclude(pattern):
log.warn(("warning: no previously-included files matching "
"'%s' found anywhere in distribution"),
pattern)

elif action == 'recursive-include':
self.debug_print("recursive-include %s %s" %
(dir, ' '.join(patterns)))
for pattern in patterns:
if not self.recursive_include(dir, pattern):
log.warn(("warning: no files found matching '%s' "
"under directory '%s'"),
pattern, dir)

elif action == 'recursive-exclude':
self.debug_print("recursive-exclude %s %s" %
(dir, ' '.join(patterns)))
for pattern in patterns:
if not self.recursive_exclude(dir, pattern):
log.warn(("warning: no previously-included files matching "
"'%s' found under directory '%s'"),
pattern, dir)

elif action == 'graft':
self.debug_print("graft " + dir_pattern)
if not self.graft(dir_pattern):
log.warn("warning: no directories found matching '%s'",
dir_pattern)

elif action == 'prune':
self.debug_print("prune " + dir_pattern)
if not self.prune(dir_pattern):
log.warn(("no previously-included directories found "
"matching '%s'"), dir_pattern)

else:
raise DistutilsInternalError(
"this cannot happen: invalid action '%s'" % action)

def _remove_files(self, predicate):
"""
Remove all files from the file list that match the predicate.
Return True if any matching files were removed
"""
found = False
for i in range(len(self.files) - 1, -1, -1):
if predicate(self.files[i]):
self.debug_print(" removing " + self.files[i])
del self.files[i]
found = True
return found

def include(self, pattern):
"""Include files that match 'pattern'."""
found = [f for f in glob(pattern) if not os.path.isdir(f)]
self.extend(found)
return bool(found)

def exclude(self, pattern):
"""Exclude files that match 'pattern'."""
match = translate_pattern(pattern)
return self._remove_files(match.match)

def recursive_include(self, dir, pattern):
"""
Include all files anywhere in 'dir/' that match the pattern.
"""
full_pattern = os.path.join(dir, '**', pattern)
found = [f for f in glob(full_pattern, recursive=True)
if not os.path.isdir(f)]
self.extend(found)
return bool(found)

def recursive_exclude(self, dir, pattern):
"""
Exclude any file anywhere in 'dir/' that match the pattern.
"""
match = translate_pattern(os.path.join(dir, '**', pattern))
return self._remove_files(match.match)

def graft(self, dir):
"""Include all files from 'dir/'."""
found = distutils.filelist.findall(dir)
self.extend(found)
return bool(found)

def prune(self, dir):
"""Filter out files from 'dir/'."""
match = translate_pattern(os.path.join(dir, '**'))
return self._remove_files(match.match)

def global_include(self, pattern):
"""
Include all files anywhere in the current directory that match the
pattern. This is very inefficient on large file trees.
"""
if self.allfiles is None:
self.findall()
match = translate_pattern(os.path.join('**', pattern))
found = [f for f in self.allfiles if match.match(f)]
self.extend(found)
return bool(found)

def global_exclude(self, pattern):
"""
Exclude all files anywhere that match the pattern.
"""
match = translate_pattern(os.path.join('**', pattern))
return self._remove_files(match.match)

def append(self, item):
if item.endswith('\r'): # Fix older sdists built on Windows
Expand Down Expand Up @@ -302,7 +530,6 @@ def run(self):
self.filelist = FileList()
if not os.path.exists(self.manifest):
self.write_manifest() # it must exist so it'll get in the list
self.filelist.findall()
self.add_defaults()
if os.path.exists(self.template):
self.read_template()
Expand Down Expand Up @@ -341,38 +568,13 @@ def add_defaults(self):
elif os.path.exists(self.manifest):
self.read_manifest()
ei_cmd = self.get_finalized_command('egg_info')
self._add_egg_info(cmd=ei_cmd)
self.filelist.include_pattern("*", prefix=ei_cmd.egg_info)

def _add_egg_info(self, cmd):
"""
Add paths for egg-info files for an external egg-base.
The egg-info files are written to egg-base. If egg-base is
outside the current working directory, this method
searchs the egg-base directory for files to include
in the manifest. Uses distutils.filelist.findall (which is
really the version monkeypatched in by setuptools/__init__.py)
to perform the search.
Since findall records relative paths, prefix the returned
paths with cmd.egg_base, so add_default's include_pattern call
(which is looking for the absolute cmd.egg_info) will match
them.
"""
if cmd.egg_base == os.curdir:
# egg-info files were already added by something else
return

discovered = distutils.filelist.findall(cmd.egg_base)
resolved = (os.path.join(cmd.egg_base, path) for path in discovered)
self.filelist.allfiles.extend(resolved)
self.filelist.graft(ei_cmd.egg_info)

def prune_file_list(self):
build = self.get_finalized_command('build')
base_dir = self.distribution.get_fullname()
self.filelist.exclude_pattern(None, prefix=build.build_base)
self.filelist.exclude_pattern(None, prefix=base_dir)
self.filelist.prune(build.build_base)
self.filelist.prune(base_dir)
sep = re.escape(os.sep)
self.filelist.exclude_pattern(r'(^|' + sep + r')(RCS|CVS|\.svn)' + sep,
is_regex=1)
Expand Down

0 comments on commit 44630f1

Please sign in to comment.