From bff3efcc87c41b48d001622a33350e341a19489a Mon Sep 17 00:00:00 2001 From: "Jason R. Coombs" Date: Fri, 7 Feb 2020 18:02:52 -0500 Subject: [PATCH 1/7] Improve zipfile.Path performance on zipfiles with a large number of entries. --- Lib/test/test_zipfile.py | 142 ++++++++++++++++++++++++++++----------- Lib/zipfile.py | 115 ++++++++++++++++++++++++------- 2 files changed, 195 insertions(+), 62 deletions(-) diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index c334715f3d81b1..09fc8506006100 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -2724,16 +2724,71 @@ def test_extract_command(self): self.assertEqual(f.read(), zf.read(zi)) +class TestExecutablePrependedZip(unittest.TestCase): + """Test our ability to open zip files with an executable prepended.""" + + def setUp(self): + self.exe_zip = findfile('exe_with_zip', subdir='ziptestdata') + self.exe_zip64 = findfile('exe_with_z64', subdir='ziptestdata') + + def _test_zip_works(self, name): + # bpo28494 sanity check: ensure is_zipfile works on these. + self.assertTrue(zipfile.is_zipfile(name), + f'is_zipfile failed on {name}') + # Ensure we can operate on these via ZipFile. + with zipfile.ZipFile(name) as zipfp: + for n in zipfp.namelist(): + data = zipfp.read(n) + self.assertIn(b'FAVORITE_NUMBER', data) + + def test_read_zip_with_exe_prepended(self): + self._test_zip_works(self.exe_zip) + + def test_read_zip64_with_exe_prepended(self): + self._test_zip_works(self.exe_zip64) + + @unittest.skipUnless(sys.executable, 'sys.executable required.') + @unittest.skipUnless(os.access('/bin/bash', os.X_OK), + 'Test relies on #!/bin/bash working.') + def test_execute_zip2(self): + output = subprocess.check_output([self.exe_zip, sys.executable]) + self.assertIn(b'number in executable: 5', output) + + @unittest.skipUnless(sys.executable, 'sys.executable required.') + @unittest.skipUnless(os.access('/bin/bash', os.X_OK), + 'Test relies on #!/bin/bash working.') + def test_execute_zip64(self): + output = subprocess.check_output([self.exe_zip64, sys.executable]) + self.assertIn(b'number in executable: 5', output) + + # Poor man's technique to consume a (smallish) iterable. consume = tuple +# from jaraco.itertools 5.0 +class jaraco: + class itertools: + class Counter: + def __init__(self, i): + self.count = 0 + self._orig_iter = iter(i) + + def __iter__(self): + return self + + def __next__(self): + result = next(self._orig_iter) + self.count += 1 + return result + + def add_dirs(zf): """ Given a writable zip file zf, inject directory entries for any directories implied by the presence of children. """ - for name in zipfile.Path._implied_dirs(zf.namelist()): + for name in zipfile.CompleteDirs._implied_dirs(zf.namelist()): zf.writestr(name, b"") return zf @@ -2774,44 +2829,6 @@ def build_alpharep_fixture(): return zf -class TestExecutablePrependedZip(unittest.TestCase): - """Test our ability to open zip files with an executable prepended.""" - - def setUp(self): - self.exe_zip = findfile('exe_with_zip', subdir='ziptestdata') - self.exe_zip64 = findfile('exe_with_z64', subdir='ziptestdata') - - def _test_zip_works(self, name): - # bpo-28494 sanity check: ensure is_zipfile works on these. - self.assertTrue(zipfile.is_zipfile(name), - f'is_zipfile failed on {name}') - # Ensure we can operate on these via ZipFile. - with zipfile.ZipFile(name) as zipfp: - for n in zipfp.namelist(): - data = zipfp.read(n) - self.assertIn(b'FAVORITE_NUMBER', data) - - def test_read_zip_with_exe_prepended(self): - self._test_zip_works(self.exe_zip) - - def test_read_zip64_with_exe_prepended(self): - self._test_zip_works(self.exe_zip64) - - @unittest.skipUnless(sys.executable, 'sys.executable required.') - @unittest.skipUnless(os.access('/bin/bash', os.X_OK), - 'Test relies on #!/bin/bash working.') - def test_execute_zip2(self): - output = subprocess.check_output([self.exe_zip, sys.executable]) - self.assertIn(b'number in executable: 5', output) - - @unittest.skipUnless(sys.executable, 'sys.executable required.') - @unittest.skipUnless(os.access('/bin/bash', os.X_OK), - 'Test relies on #!/bin/bash working.') - def test_execute_zip64(self): - output = subprocess.check_output([self.exe_zip64, sys.executable]) - self.assertIn(b'number in executable: 5', output) - - class TestPath(unittest.TestCase): def setUp(self): self.fixtures = contextlib.ExitStack() @@ -2849,6 +2866,14 @@ def test_iterdir_and_types(self): i, = h.iterdir() assert i.is_file() + def test_subdir_is_dir(self): + for alpharep in self.zipfile_alpharep(): + root = zipfile.Path(alpharep) + assert (root / 'b').is_dir() + assert (root / 'b/').is_dir() + assert (root / 'g').is_dir() + assert (root / 'g/').is_dir() + def test_open(self): for alpharep in self.zipfile_alpharep(): root = zipfile.Path(alpharep) @@ -2910,6 +2935,45 @@ def test_missing_dir_parent(self): root = zipfile.Path(alpharep) assert (root / 'missing dir/').parent.at == '' + def test_mutability(self): + """ + If the underlying zipfile is changed, the Path object should + reflect that change. + """ + for alpharep in self.zipfile_alpharep(): + root = zipfile.Path(alpharep) + a, b, g = root.iterdir() + alpharep.writestr('foo.txt', 'foo') + alpharep.writestr('bar/baz.txt', 'baz') + assert any( + child.name == 'foo.txt' + for child in root.iterdir()) + assert (root / 'foo.txt').read_text() == 'foo' + baz, = (root / 'bar').iterdir() + assert baz.read_text() == 'baz' + + HUGE_ZIPFILE_NUM_ENTRIES = 2 ** 13 + + def huge_zipfile(self): + """Create a read-only zipfile with a huge number of entries entries.""" + strm = io.BytesIO() + zf = zipfile.ZipFile(strm, "w") + for entry in map(str, range(self.HUGE_ZIPFILE_NUM_ENTRIES)): + zf.writestr(entry, entry) + zf.mode = 'r' + return zf + + def test_joinpath_constant_time(self): + """ + Ensure joinpath on items in zipfile is linear time. + """ + root = zipfile.Path(self.huge_zipfile()) + entries = jaraco.itertools.Counter(root.iterdir()) + for entry in entries: + entry.joinpath('suffix') + # Check the file iterated all items + assert entries.count == self.HUGE_ZIPFILE_NUM_ENTRIES + if __name__ == "__main__": unittest.main() diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 2da87ef505e6ec..75fcfac8e58440 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -16,6 +16,8 @@ import sys import threading import time +import contextlib +from collections import OrderedDict try: import zlib # We may need its compression method @@ -2159,6 +2161,90 @@ def _ancestry(path): path, tail = posixpath.split(path) +class CompleteDirs(ZipFile): + """ + A ZipFile subclass that ensures that implied directories + are always included in the namelist. + """ + + @staticmethod + def _implied_dirs(names): + parents = itertools.chain.from_iterable(map(_parents, names)) + # Deduplicate entries in original order + implied_dirs = OrderedDict.fromkeys( + p + posixpath.sep for p in parents + # Cast names to a set for O(1) lookups + if p + posixpath.sep not in set(names) + ) + return implied_dirs + + def namelist(self): + names = super(CompleteDirs, self).namelist() + return names + list(self._implied_dirs(names)) + + def _name_set(self): + return set(self.namelist()) + + def resolve_dir(self, name): + """ + If the name represents a directory, return that name + as a directory (with the trailing slash). + """ + names = self._name_set() + dirname = name + '/' + dir_match = name not in names and dirname in names + return dirname if dir_match else name + + @classmethod + def make(cls, source): + """ + Given a source (filename or zipfile), return an + appropriate CompleteDirs subclass. + """ + if isinstance(source, CompleteDirs): + return source + + if not isinstance(source, ZipFile): + return cls(_pathlib_compat(source)) + + # Only allow for FastPath when supplied zipfile is read-only + if 'r' not in source.mode: + cls = CompleteDirs + + res = cls.__new__(cls) + vars(res).update(vars(source)) + return res + + +class FastLookup(CompleteDirs): + """ + ZipFile subclass to ensure implicit + dirs exist and are resolved rapidly. + """ + def namelist(self): + with contextlib.suppress(AttributeError): + return self.__names + self.__names = super(FastLookup, self).namelist() + return self.__names + + def _name_set(self): + with contextlib.suppress(AttributeError): + return self.__lookup + self.__lookup = super(FastLookup, self)._name_set() + return self.__lookup + + +def _pathlib_compat(path): + """ + For path-like objects, convert to a filename for compatibility + on Python 3.6.1 and earlier. + """ + try: + return path.__fspath__() + except AttributeError: + return str(path) + + class Path: """ A pathlib-compatible interface for zip files. @@ -2227,7 +2313,7 @@ class Path: __repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})" def __init__(self, root, at=""): - self.root = root if isinstance(root, ZipFile) else ZipFile(root) + self.root = FastLookup.make(root) self.at = at @property @@ -2259,12 +2345,12 @@ def is_file(self): return not self.is_dir() def exists(self): - return self.at in self._names() + return self.at in self.root._name_set() def iterdir(self): if not self.is_dir(): raise ValueError("Can't listdir a file") - subs = map(self._next, self._names()) + subs = map(self._next, self.root.namelist()) return filter(self._is_child, subs) def __str__(self): @@ -2274,26 +2360,11 @@ def __repr__(self): return self.__repr.format(self=self) def joinpath(self, add): - next = posixpath.join(self.at, add) - next_dir = posixpath.join(self.at, add, "") - names = self._names() - return self._next(next_dir if next not in names and next_dir in names else next) + next = posixpath.join(self.at, _pathlib_compat(add)) + return self._next(self.root.resolve_dir(next)) __truediv__ = joinpath - @staticmethod - def _implied_dirs(names): - return _unique_everseen( - parent + "/" - for name in names - for parent in _parents(name) - if parent + "/" not in names - ) - - @classmethod - def _add_implied_dirs(cls, names): - return names + list(cls._implied_dirs(names)) - @property def parent(self): parent_at = posixpath.dirname(self.at.rstrip('/')) @@ -2301,9 +2372,6 @@ def parent(self): parent_at += '/' return self._next(parent_at) - def _names(self): - return self._add_implied_dirs(self.root.namelist()) - def main(args=None): import argparse @@ -2365,5 +2433,6 @@ def addToZip(zf, path, zippath): zippath = '' addToZip(zf, path, zippath) + if __name__ == "__main__": main() From c9b89aea5ba4d8da932eec000c3e5d6523c36dc8 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Fri, 7 Feb 2020 23:10:18 +0000 Subject: [PATCH 2/7] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Misc/NEWS.d/next/Library/2020-02-07-23-10-14.bpo-0.DHwddE.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2020-02-07-23-10-14.bpo-0.DHwddE.rst diff --git a/Misc/NEWS.d/next/Library/2020-02-07-23-10-14.bpo-0.DHwddE.rst b/Misc/NEWS.d/next/Library/2020-02-07-23-10-14.bpo-0.DHwddE.rst new file mode 100644 index 00000000000000..40c6d1e25d1b8a --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-02-07-23-10-14.bpo-0.DHwddE.rst @@ -0,0 +1 @@ +Improved performance of zipfile.Path for files with a large number of entries. \ No newline at end of file From 05b52f7107574c98877b18f9262a479d53cff181 Mon Sep 17 00:00:00 2001 From: "Jason R. Coombs" Date: Sun, 9 Feb 2020 16:05:46 -0500 Subject: [PATCH 3/7] Add bpo to blurb --- ....bpo-0.DHwddE.rst => 2020-02-07-23-14-14.bpo-39595.DHwddE.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Misc/NEWS.d/next/Library/{2020-02-07-23-10-14.bpo-0.DHwddE.rst => 2020-02-07-23-14-14.bpo-39595.DHwddE.rst} (100%) diff --git a/Misc/NEWS.d/next/Library/2020-02-07-23-10-14.bpo-0.DHwddE.rst b/Misc/NEWS.d/next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst similarity index 100% rename from Misc/NEWS.d/next/Library/2020-02-07-23-10-14.bpo-0.DHwddE.rst rename to Misc/NEWS.d/next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst From 9c6d05106dac5eb8e2fb23000f4f13fc28d23f88 Mon Sep 17 00:00:00 2001 From: "Jason R. Coombs" Date: Sun, 9 Feb 2020 16:09:20 -0500 Subject: [PATCH 4/7] Sync with importlib_metadata 1.5 (6fe70ca) --- Lib/importlib/metadata.py | 12 ++++++----- Lib/test/test_importlib/fixtures.py | 23 ++++++++++++++++++-- Lib/test/test_importlib/test_main.py | 32 ++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/Lib/importlib/metadata.py b/Lib/importlib/metadata.py index ae8ecf9b8500cc..831f593277ccd4 100644 --- a/Lib/importlib/metadata.py +++ b/Lib/importlib/metadata.py @@ -391,6 +391,7 @@ class FastPath: def __init__(self, root): self.root = root + self.base = os.path.basename(root).lower() def joinpath(self, child): return pathlib.Path(self.root, child) @@ -413,12 +414,11 @@ def zip_children(self): ) def is_egg(self, search): - root_n_low = os.path.split(self.root)[1].lower() - + base = self.base return ( - root_n_low == search.normalized + '.egg' - or root_n_low.startswith(search.prefix) - and root_n_low.endswith('.egg')) + base == search.versionless_egg_name + or base.startswith(search.prefix) + and base.endswith('.egg')) def search(self, name): for child in self.children(): @@ -439,6 +439,7 @@ class Prepared: prefix = '' suffixes = '.dist-info', '.egg-info' exact_matches = [''][:0] + versionless_egg_name = '' def __init__(self, name): self.name = name @@ -448,6 +449,7 @@ def __init__(self, name): self.prefix = self.normalized + '-' self.exact_matches = [ self.normalized + suffix for suffix in self.suffixes] + self.versionless_egg_name = self.normalized + '.egg' class MetadataPathFinder(DistributionFinder): diff --git a/Lib/test/test_importlib/fixtures.py b/Lib/test/test_importlib/fixtures.py index 0b4ce18d5a6cd7..695c92a786cb0b 100644 --- a/Lib/test/test_importlib/fixtures.py +++ b/Lib/test/test_importlib/fixtures.py @@ -47,14 +47,28 @@ def tempdir_as_cwd(): yield tmp -class SiteDir: +@contextlib.contextmanager +def install_finder(finder): + sys.meta_path.append(finder) + try: + yield + finally: + sys.meta_path.remove(finder) + + +class Fixtures: def setUp(self): self.fixtures = ExitStack() self.addCleanup(self.fixtures.close) + + +class SiteDir(Fixtures): + def setUp(self): + super(SiteDir, self).setUp() self.site_dir = self.fixtures.enter_context(tempdir()) -class OnSysPath: +class OnSysPath(Fixtures): @staticmethod @contextlib.contextmanager def add_sys_path(dir): @@ -198,3 +212,8 @@ def build_files(file_defs, prefix=pathlib.Path()): def DALS(str): "Dedent and left-strip" return textwrap.dedent(str).lstrip() + + +class NullFinder: + def find_module(self, name): + pass diff --git a/Lib/test/test_importlib/test_main.py b/Lib/test/test_importlib/test_main.py index c5f1dbbae325ed..42a79992ecc8c0 100644 --- a/Lib/test/test_importlib/test_main.py +++ b/Lib/test/test_importlib/test_main.py @@ -7,6 +7,11 @@ import unittest import importlib.metadata +try: + import pyfakefs.fake_filesystem_unittest as ffs +except ImportError: + from .stubs import fake_filesystem_unittest as ffs + from . import fixtures from importlib.metadata import ( Distribution, EntryPoint, @@ -185,6 +190,33 @@ def test_egg(self): version('foo') +class MissingSysPath(fixtures.OnSysPath, unittest.TestCase): + site_dir = '/does-not-exist' + + def test_discovery(self): + """ + Discovering distributions should succeed even if + there is an invalid path on sys.path. + """ + importlib.metadata.distributions() + + +class InaccessibleSysPath(fixtures.OnSysPath, ffs.TestCase): + site_dir = '/access-denied' + + def setUp(self): + super(InaccessibleSysPath, self).setUp() + self.setUpPyfakefs() + self.fs.create_dir(self.site_dir, perm_bits=000) + + def test_discovery(self): + """ + Discovering distributions should succeed even if + there is an invalid path on sys.path. + """ + list(importlib.metadata.distributions()) + + class TestEntryPoints(unittest.TestCase): def __init__(self, *args): super(TestEntryPoints, self).__init__(*args) From db760ba55c62f37dfb84481994cd56cb6d04357b Mon Sep 17 00:00:00 2001 From: "Jason R. Coombs" Date: Sun, 9 Feb 2020 16:14:26 -0500 Subject: [PATCH 5/7] Update blurb. --- .../next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst b/Misc/NEWS.d/next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst index 40c6d1e25d1b8a..3a461389af7d18 100644 --- a/Misc/NEWS.d/next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst +++ b/Misc/NEWS.d/next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst @@ -1 +1 @@ -Improved performance of zipfile.Path for files with a large number of entries. \ No newline at end of file +Improved performance of zipfile.Path for files with a large number of entries. Also improved performance and fixed minor issue as published with `importlib_metadata 1.5 `_. From d5b9fe501775f72c2bf39b42141c003e891ec8bf Mon Sep 17 00:00:00 2001 From: "Jason R. Coombs" Date: Sun, 9 Feb 2020 16:24:29 -0500 Subject: [PATCH 6/7] Remove compatibility code --- Lib/zipfile.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 75fcfac8e58440..4510fac250b979 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -2205,7 +2205,7 @@ def make(cls, source): return source if not isinstance(source, ZipFile): - return cls(_pathlib_compat(source)) + return cls(source) # Only allow for FastPath when supplied zipfile is read-only if 'r' not in source.mode: @@ -2234,17 +2234,6 @@ def _name_set(self): return self.__lookup -def _pathlib_compat(path): - """ - For path-like objects, convert to a filename for compatibility - on Python 3.6.1 and earlier. - """ - try: - return path.__fspath__() - except AttributeError: - return str(path) - - class Path: """ A pathlib-compatible interface for zip files. @@ -2360,7 +2349,7 @@ def __repr__(self): return self.__repr.format(self=self) def joinpath(self, add): - next = posixpath.join(self.at, _pathlib_compat(add)) + next = posixpath.join(self.at, add) return self._next(self.root.resolve_dir(next)) __truediv__ = joinpath From 093b637faebd327a3aa2316dd289590c77e6a5ac Mon Sep 17 00:00:00 2001 From: "Jason R. Coombs" Date: Mon, 10 Feb 2020 23:00:45 -0500 Subject: [PATCH 7/7] Add stubs module, omitted from earlier commit --- Lib/test/test_importlib/stubs.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 Lib/test/test_importlib/stubs.py diff --git a/Lib/test/test_importlib/stubs.py b/Lib/test/test_importlib/stubs.py new file mode 100644 index 00000000000000..e5b011c399fa96 --- /dev/null +++ b/Lib/test/test_importlib/stubs.py @@ -0,0 +1,10 @@ +import unittest + + +class fake_filesystem_unittest: + """ + Stubbed version of the pyfakefs module + """ + class TestCase(unittest.TestCase): + def setUpPyfakefs(self): + self.skipTest("pyfakefs not available")