Skip to content

Commit

Permalink
Do not scan whole file tree when making MANIFEST
Browse files Browse the repository at this point in the history
When building a MANIFEST from a MANIFEST.in, setuptools previously
scanned the whole directory tree into a list, and then picked matching
files from this list based on the MANIFEST.in commands.

Now, files are found using the `glob` library from Python 3.5. This only
explores directories that need to be scanned, resulting in a large speed
improvement for projects with large file trees. A modified `glob`
module has been included. It has been changed to support Python versions
back to 2.6, and to include `.hidden` files in its matches, because the
previous implementation included `.hidden` files in its glob matches. It is
unclear whether this behaviour is desired and required, or accidental and
not required, but for strict backwards-compatibility, this behaviour is
kept.

Each command in the MANIFEST.in is now represented by its own function
on the FileList (`include`, `exclude`, `graft`, etc.) to allow for an
efficient implementation. The previous commands
`FileList.include_pattern` and `FileList.exclude_pattern` still
exist for backwards compatibility, but these use the slow 'scan all
files' method, so are discouraged. `global_include` by its nature must
scan all directories in the project to work, so this does not receive
any speed improvements.

The changes will speed up creating packages for the vast majority of
users. There are a few unusual corner cases, such as multiple `graft`
commands operating on the same set of directories, that will be slower.
These can be solved by consolidating the overlapping `graft` commands
into one command.
  • Loading branch information
mx-moth committed Aug 29, 2016
1 parent 1aa7190 commit 5c655a8
Show file tree
Hide file tree
Showing 3 changed files with 472 additions and 62 deletions.
297 changes: 249 additions & 48 deletions setuptools/command/egg_info.py
Expand Up @@ -2,40 +2,123 @@
Create a distribution's .egg-info directory and contents"""

from distutils.filelist import FileList as _FileList
from distutils.util import convert_path
from distutils import log
import collections
import distutils.errors
import distutils.filelist
import io
import os
import re
import sys
import io
import warnings
import time
import collections

from setuptools.extern import six
from setuptools.extern.six.moves import map
import warnings
from distutils import log
from distutils.filelist import FileList as _FileList
from distutils.filelist import translate_pattern
from distutils.util import convert_path
from fnmatch import translate

from setuptools import Command
from setuptools.command.sdist import sdist
from setuptools.command.sdist import walk_revctrl
from setuptools.command.setopt import edit_config
from setuptools.command import bdist_egg
from pkg_resources import (
parse_requirements, safe_name, parse_version,
safe_version, yield_lines, EntryPoint, iter_entry_points, to_filename)
import setuptools.unicode_utils as unicode_utils

from pkg_resources import (
EntryPoint, iter_entry_points, parse_requirements, parse_version, safe_name,
safe_version, to_filename, yield_lines)
from pkg_resources.extern import packaging
from setuptools import Command
from setuptools.command import bdist_egg
from setuptools.command.sdist import sdist, walk_revctrl
from setuptools.command.setopt import edit_config
from setuptools.extern import six
from setuptools.extern.six.moves import map
from setuptools.glob import glob

try:
from setuptools_svn import svn_utils
except ImportError:
pass



def translate_pattern(glob):
    """
    Translate a file path glob like '*.txt' into a compiled regular expression.

    This differs from fnmatch.translate, which allows wildcards to match
    directory separators. It also knows about '**/', which matches any number
    of directories.

    :param glob: glob pattern written with the platform's path separator.
    :returns: a compiled regular expression object; call its ``match``
        method with a path to test whether the whole path matches.
    """
    pat = ''

    # This will split on '/' within [character classes]. This is deliberate.
    chunks = glob.split(os.path.sep)

    sep = re.escape(os.sep)
    valid_char = '[^%s]' % (sep,)
    last_index = len(chunks) - 1

    for c, chunk in enumerate(chunks):
        last_chunk = c == last_index

        # Chunks that are a literal ** are globstars. They match anything.
        if chunk == '**':
            if last_chunk:
                # Match anything if this is the last component
                pat += '.*'
            else:
                # Match '(name/)*'
                pat += '(?:%s+%s)*' % (valid_char, sep)
            continue  # The whole path component has been handled

        # Find any special characters in the remainder
        i = 0
        chunk_len = len(chunk)
        while i < chunk_len:
            char = chunk[i]
            if char == '*':
                # Match any number of name characters
                pat += valid_char + '*'
            elif char == '?':
                # Match a name character
                pat += valid_char
            elif char == '[':
                # Character class
                inner_i = i + 1
                # Skip initial !/] chars
                if inner_i < chunk_len and chunk[inner_i] == '!':
                    inner_i = inner_i + 1
                if inner_i < chunk_len and chunk[inner_i] == ']':
                    inner_i = inner_i + 1

                # Loop till the closing ] is found
                while inner_i < chunk_len and chunk[inner_i] != ']':
                    inner_i = inner_i + 1

                if inner_i >= chunk_len:
                    # Got to the end of the string without finding a closing ]
                    # Do not treat this as a matching group, but as a literal [
                    pat += re.escape(char)
                else:
                    # Grab the insides of the [brackets]
                    inner = chunk[i + 1:inner_i]
                    char_class = ''

                    # Class negation
                    if inner[0] == '!':
                        char_class = '^'
                        inner = inner[1:]

                    char_class += re.escape(inner)
                    pat += '[%s]' % (char_class,)

                    # Skip to the end ]
                    i = inner_i
            else:
                pat += re.escape(char)
            i += 1

        # Join each chunk with the dir separator
        if not last_chunk:
            pat += sep

    # Pass the flags as an argument instead of a trailing inline '(?ms)':
    # global inline flags that are not at the start of the expression are
    # deprecated since Python 3.6 and rejected by Python 3.11+.
    return re.compile(pat + r'\Z', flags=re.MULTILINE | re.DOTALL)


class egg_info(Command):
description = "create a distribution's .egg-info directory"

Expand Down Expand Up @@ -239,7 +322,151 @@ def check_broken_egg_info(self):


class FileList(_FileList):
"""File list that accepts only existing, platform-independent paths"""
# Implementations of the various MANIFEST.in commands

def process_template_line(self, line):
    """
    Apply a single MANIFEST.in template line to this file list.

    The line is parsed by ``self._parse_template_line`` and dispatched to
    the method implementing its action (``include``, ``exclude``,
    ``global_include``, ``global_exclude``, ``recursive_include``,
    ``recursive_exclude``, ``graft`` or ``prune``). A warning is logged
    when an action matches nothing.

    :param line: one line of template text.
    :raises distutils.errors.DistutilsInternalError: if the parsed action
        is not one of the known actions.
    """
    # Parse the line: split it up, make sure the right number of words
    # is there, and return the relevant words.  'action' is always
    # defined: it's the first word of the line.  Which of the other
    # three are defined depends on the action; it'll be either
    # patterns, (dir and patterns), or (dir_pattern).
    (action, patterns, dir, dir_pattern) = self._parse_template_line(line)

    # OK, now we know that the action is valid and we have the
    # right number of words on the line for that action -- so we
    # can proceed with minimal error-checking.
    if action == 'include':
        self.debug_print("include " + ' '.join(patterns))
        for pattern in patterns:
            if not self.include(pattern):
                log.warn("warning: no files found matching '%s'", pattern)

    elif action == 'exclude':
        self.debug_print("exclude " + ' '.join(patterns))
        for pattern in patterns:
            if not self.exclude(pattern):
                log.warn(("warning: no previously-included files "
                          "found matching '%s'"), pattern)

    elif action == 'global-include':
        self.debug_print("global-include " + ' '.join(patterns))
        for pattern in patterns:
            if not self.global_include(pattern):
                log.warn(("warning: no files found matching '%s' "
                          "anywhere in distribution"), pattern)

    elif action == 'global-exclude':
        self.debug_print("global-exclude " + ' '.join(patterns))
        for pattern in patterns:
            if not self.global_exclude(pattern):
                log.warn(("warning: no previously-included files matching "
                          "'%s' found anywhere in distribution"),
                         pattern)

    elif action == 'recursive-include':
        self.debug_print("recursive-include %s %s" %
                         (dir, ' '.join(patterns)))
        for pattern in patterns:
            if not self.recursive_include(dir, pattern):
                log.warn(("warning: no files found matching '%s' "
                          "under directory '%s'"),
                         pattern, dir)

    elif action == 'recursive-exclude':
        self.debug_print("recursive-exclude %s %s" %
                         (dir, ' '.join(patterns)))
        for pattern in patterns:
            if not self.recursive_exclude(dir, pattern):
                log.warn(("warning: no previously-included files matching "
                          "'%s' found under directory '%s'"),
                         pattern, dir)

    elif action == 'graft':
        self.debug_print("graft " + dir_pattern)
        if not self.graft(dir_pattern):
            log.warn("warning: no directories found matching '%s'",
                     dir_pattern)

    elif action == 'prune':
        self.debug_print("prune " + dir_pattern)
        if not self.prune(dir_pattern):
            log.warn(("no previously-included directories found "
                      "matching '%s'"), dir_pattern)

    else:
        # Use the fully qualified name: only the 'distutils.errors' module
        # is imported by this file, so the bare exception name would fail
        # with NameError instead of raising the intended error.
        raise distutils.errors.DistutilsInternalError(
            "this cannot happen: invalid action '%s'" % action)

def _remove_files(self, predicate):
"""
Remove all files from the file list that match the predicate.
Return True if any matching files were removed
"""
found = False
for i in range(len(self.files) - 1, -1, -1):
if predicate(self.files[i]):
self.debug_print(" removing " + self.files[i])
del self.files[i]
found = True
return found

def include(self, pattern):
    """
    Add every non-directory path matched by the glob 'pattern'.

    Return True if at least one file was added, False otherwise.
    """
    matches = []
    for path in glob(pattern):
        # Directories matched by the glob are skipped.
        if os.path.isdir(path):
            continue
        matches.append(path)
    self.extend(matches)
    return len(matches) > 0

def exclude(self, pattern):
    """
    Remove previously added files whose path matches the glob 'pattern'.

    Return True if any file was removed.
    """
    return self._remove_files(translate_pattern(pattern).match)

def recursive_include(self, dir, pattern):
    """
    Add every file at any depth below 'dir/' that matches 'pattern'.

    Return True if at least one file was added, False otherwise.
    """
    search = os.path.join(dir, '**', pattern)
    matches = []
    for path in glob(search, recursive=True):
        # Directories matched by the glob are skipped.
        if os.path.isdir(path):
            continue
        matches.append(path)
    self.extend(matches)
    return len(matches) > 0

def recursive_exclude(self, dir, pattern):
    """
    Remove previously added files at any depth below 'dir/' that match
    'pattern'.

    Return True if any file was removed.
    """
    search = os.path.join(dir, '**', pattern)
    matcher = translate_pattern(search)
    return self._remove_files(matcher.match)

def graft(self, dir):
    """
    Add everything that distutils.filelist.findall reports for 'dir/'.

    Return True if anything was found, False otherwise.
    """
    discovered = distutils.filelist.findall(dir)
    self.extend(discovered)
    if discovered:
        return True
    return False

def prune(self, dir):
    """
    Remove previously added files located under 'dir/'.

    Return True if any file was removed.
    """
    everything_below = os.path.join(dir, '**')
    return self._remove_files(translate_pattern(everything_below).match)

def global_include(self, pattern):
    """
    Add every file in ``self.allfiles`` that matches 'pattern' at any depth.

    ``self.allfiles`` is populated through ``self.findall()`` first if it
    is still None. This is very inefficient on large file trees.
    Return True if at least one file was added, False otherwise.
    """
    if self.allfiles is None:
        self.findall()
    matcher = translate_pattern(os.path.join('**', pattern))
    hits = []
    for candidate in self.allfiles:
        if matcher.match(candidate):
            hits.append(candidate)
    self.extend(hits)
    return len(hits) > 0

def global_exclude(self, pattern):
    """
    Remove previously added files that match 'pattern' at any depth.

    Return True if any file was removed.
    """
    anywhere = os.path.join('**', pattern)
    matcher = translate_pattern(anywhere)
    return self._remove_files(matcher.match)

def append(self, item):
if item.endswith('\r'): # Fix older sdists built on Windows
Expand Down Expand Up @@ -302,7 +529,6 @@ def run(self):
self.filelist = FileList()
if not os.path.exists(self.manifest):
self.write_manifest() # it must exist so it'll get in the list
self.filelist.findall()
self.add_defaults()
if os.path.exists(self.template):
self.read_template()
Expand Down Expand Up @@ -341,38 +567,13 @@ def add_defaults(self):
elif os.path.exists(self.manifest):
self.read_manifest()
ei_cmd = self.get_finalized_command('egg_info')
self._add_egg_info(cmd=ei_cmd)
self.filelist.include_pattern("*", prefix=ei_cmd.egg_info)

def _add_egg_info(self, cmd):
    """
    Register egg-info paths that live under an external egg-base.

    Nothing is done when ``cmd.egg_base`` is the current directory.
    Otherwise ``cmd.egg_base`` is searched with distutils.filelist.findall
    (per the original note, really the version monkeypatched in by
    setuptools/__init__.py), and every returned path is joined onto
    ``cmd.egg_base`` before being appended to ``self.filelist.allfiles``.
    The original note explains the join: findall records relative paths,
    and the include_pattern call in add_defaults looks for the absolute
    ``cmd.egg_info``.
    """
    base = cmd.egg_base
    if base == os.curdir:
        # egg-info files were already added by something else
        return

    self.filelist.allfiles.extend(
        [os.path.join(base, path)
         for path in distutils.filelist.findall(base)]
    )
self.filelist.graft(ei_cmd.egg_info)

def prune_file_list(self):
build = self.get_finalized_command('build')
base_dir = self.distribution.get_fullname()
self.filelist.exclude_pattern(None, prefix=build.build_base)
self.filelist.exclude_pattern(None, prefix=base_dir)
self.filelist.prune(build.build_base)
self.filelist.prune(base_dir)
sep = re.escape(os.sep)
self.filelist.exclude_pattern(r'(^|' + sep + r')(RCS|CVS|\.svn)' + sep,
is_regex=1)
Expand Down

0 comments on commit 5c655a8

Please sign in to comment.