From 609d9bad76cb25a8b9d46027241a8e49aaeb6075 Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Fri, 5 May 2023 22:06:39 +0200 Subject: [PATCH 01/18] depend: imphookapi: rework TOC-style tuple detection Rework TOC-style 3-tuple detection in `add_datas` and `binaries`; instead of checking for instance of `TOC` class, check if the length of the first entry is three. This will allow us to replace the `TOC` class with regular list. --- PyInstaller/depend/imphookapi.py | 35 +++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/PyInstaller/depend/imphookapi.py b/PyInstaller/depend/imphookapi.py index 68e48a9d80..38a35efba2 100644 --- a/PyInstaller/depend/imphookapi.py +++ b/PyInstaller/depend/imphookapi.py @@ -17,7 +17,6 @@ modules will be frozen into the executable. """ -from PyInstaller.building.datastruct import TOC from PyInstaller.building.utils import format_binaries_and_datas from PyInstaller.lib.modulegraph.modulegraph import (RuntimeModule, RuntimePackage) @@ -430,29 +429,37 @@ def del_imports(self, *module_names): """ self._deleted_imports.extend(module_names) - def add_binaries(self, list_of_tuples): + def add_binaries(self, binaries): """ - Add all external dynamic libraries in the passed list of `(name, path)` 2-tuples as dependencies of the + Add all external dynamic libraries in the passed list of `(src_name, dest_name)` 2-tuples as dependencies of the current module. This is equivalent to adding to the global `binaries` hook attribute. - For convenience, the `list_of_tuples` may also be a single TOC or TREE instance. + For convenience, the `binaries` may also be a list of TOC-style 3-tuples `(dest_name, src_name, typecode)`. 
""" - if isinstance(list_of_tuples, TOC): - self._added_binaries.extend(i[:2] for i in list_of_tuples) + + # Detect TOC 3-tuple list by checking the length of the first entry + if binaries and len(binaries[0]) == 3: + self._added_binaries.extend(entry[:2] for entry in binaries) else: - self._added_binaries.extend(format_binaries_and_datas(list_of_tuples)) + # NOTE: `format_binaries_and_datas` changes tuples from input format `(src_name, dest_name)` to output + # format `(dest_name, src_name)`. + self._added_binaries.extend(format_binaries_and_datas(binaries)) - def add_datas(self, list_of_tuples): + def add_datas(self, datas): """ - Add all external data files in the passed list of `(name, path)` 2-tuples as dependencies of the current - module. This is equivalent to adding to the global `datas` hook attribute. + Add all external data files in the passed list of `(src_name, dest_name)` 2-tuples as dependencies of the + current module. This is equivalent to adding to the global `datas` hook attribute. - For convenience, the `list_of_tuples` may also be a single TOC or TREE instance. + For convenience, the `datas` may also be a list of TOC-style 3-tuples `(dest_name, src_name, typecode)`. """ - if isinstance(list_of_tuples, TOC): - self._added_datas.extend(i[:2] for i in list_of_tuples) + + # Detect TOC 3-tuple list by checking the length of the first entry + if datas and len(datas[0]) == 3: + self._added_datas.extend(entry[:2] for entry in datas) else: - self._added_datas.extend(format_binaries_and_datas(list_of_tuples)) + # NOTE: `format_binaries_and_datas` changes tuples from input format `(src_name, dest_name)` to output + # format `(dest_name, src_name)`. 
+ self._added_datas.extend(format_binaries_and_datas(datas)) def set_module_collection_mode(self, name, mode): """" From 132f6736c6ccef847fce0c1f0fc8f97c01735a4f Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Fri, 5 May 2023 22:26:02 +0200 Subject: [PATCH 02/18] building: splash: remove reference to TOC class Instantiate `self.binaries` as a regular list instead of an instance of the `TOC` class; we do not need any of its functionality here. --- PyInstaller/building/splash.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/PyInstaller/building/splash.py b/PyInstaller/building/splash.py index 75ed7e9319..4336bc0485 100644 --- a/PyInstaller/building/splash.py +++ b/PyInstaller/building/splash.py @@ -16,7 +16,7 @@ from PyInstaller import log as logging from PyInstaller.archive.writers import SplashWriter from PyInstaller.building import splash_templates -from PyInstaller.building.datastruct import TOC, Target +from PyInstaller.building.datastruct import Target from PyInstaller.building.utils import _check_guts_eq, _check_guts_toc, misc from PyInstaller.compat import is_darwin, is_win, is_cygwin from PyInstaller.utils.hooks import tcl_tk as tcltk_utils @@ -66,11 +66,11 @@ def __init__(self, image_file, binaries, datas, **kwargs): .. note:: If PIL (Pillow) is installed and the image is bigger than max_img_size, the image will be resized to fit into the specified area. - :param TOC binaries: - The TOC of binaries the Analysis build target found. This TOC includes all extensionmodules and their - dependencies. This is required to figure out, if the users program uses tkinter. - :param TOC datas: - The TOC of data the Analysis build target found. This TOC includes all data-file dependencies of the + :param list binaries: + The TOC list of binaries the Analysis build target found. This TOC includes all extension modules and their + binary dependencies. This is required to determine whether the user's program uses `tkinter`. 
+ :param list datas: + The TOC list of data the Analysis build target found. This TOC includes all data-file dependencies of the modules. This is required to check if all splash screen requirements can be bundled. :keyword text_pos: @@ -207,7 +207,7 @@ def __init__(self, image_file, binaries, datas, **kwargs): # The user wants a full copy of tk, so make all tk files a requirement. self.splash_requirements.update(toc[0] for toc in tcltk_tree) - self.binaries = TOC() + self.binaries = [] if not self.uses_tkinter: # The user's script does not use tkinter, so we need to provide a TOC of all necessary files add the shared # libraries to the binaries. From 24e57a73aecb0b88eb6bfa71f270b332a9205c05 Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Fri, 5 May 2023 22:27:46 +0200 Subject: [PATCH 03/18] building: splash: clean up string formatting in logger method calls Use the lazy string formatting offered by logger method calls instead of directly formatting strings using the % syntax. --- PyInstaller/building/splash.py | 38 ++++++++++++++++------------------ 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/PyInstaller/building/splash.py b/PyInstaller/building/splash.py index 4336bc0485..b70d54a916 100644 --- a/PyInstaller/building/splash.py +++ b/PyInstaller/building/splash.py @@ -198,14 +198,14 @@ def __init__(self, image_file, binaries, datas, **kwargs): # Check if tcl/tk was found assert all(self.tcl_lib) assert all(self.tk_lib) - logger.debug("Use Tcl Library from %s and Tk From %s" % (self.tcl_lib, self.tk_lib)) + logger.debug("Use Tcl Library from %s and Tk From %s", self.tcl_lib, self.tk_lib) self.splash_requirements = set([self.tcl_lib[0], self.tk_lib[0]] + splash_requirements) logger.info("Collect tcl/tk binaries for the splash screen") tcltk_tree = tcltk_utils.collect_tcl_tk_files(self._tkinter_file) if self.full_tk: # The user wants a full copy of tk, so make all tk files a requirement. 
- self.splash_requirements.update(toc[0] for toc in tcltk_tree) + self.splash_requirements.update(entry[0] for entry in tcltk_tree) self.binaries = [] if not self.uses_tkinter: @@ -216,7 +216,7 @@ def __init__(self, image_file, binaries, datas, **kwargs): # Only add the intersection of the required and the collected resources, or add all entries if full_tk is # true. - self.binaries.extend(toc for toc in tcltk_tree if toc[0] in self.splash_requirements) + self.binaries.extend(entry for entry in tcltk_tree if entry[0] in self.splash_requirements) # Handle extra requirements of Tcl/Tk shared libraries (e.g., vcruntime140.dll on Windows - see issue #6284). # These need to be added to splash requirements, so they are extracted into the initial runtime directory in @@ -237,26 +237,26 @@ def __init__(self, image_file, binaries, datas, **kwargs): self.splash_requirements.update([name for name, *_ in binaries if name.lower() in EXTRA_REQUIREMENTS]) # Check if all requirements were found. - fnames = [toc[0] for toc in (binaries + datas + self.binaries)] + collected_files = set(entry[0] for entry in (binaries + datas + self.binaries)) - def _filter(_item): - if _item not in fnames: + def _filter_requirement(filename): + if filename not in collected_files: # Item is not bundled, so warn the user about it. This actually may happen on some tkinter installations # that are missing the license.terms file. logger.warning( "The local Tcl/Tk installation is missing the file %s. The behavior of the splash screen is " - "therefore undefined and may be unsupported." % _item + "therefore undefined and may be unsupported.", filename ) return False return True # Remove all files which were not found. - self.splash_requirements = set(filter(_filter, self.splash_requirements)) + self.splash_requirements = set(filter(_filter_requirement, self.splash_requirements)) # Test if the tcl/tk version is supported by the bootloader. 
self.test_tk_version() - logger.debug("Splash Requirements: %s" % self.splash_requirements) + logger.debug("Splash Requirements: %s", self.splash_requirements) self.__postinit__() @@ -300,7 +300,7 @@ def _check_guts(self, data, last_build): return False def assemble(self): - logger.info("Building Splash %s" % self.name) + logger.info("Building Splash %s", self.name) # Function to resize a given image to fit into the area defined by max_img_size. def _resize_image(_image, _orig_size): @@ -331,16 +331,14 @@ def _resize_image(_image, _orig_size): _img.close() _img_resized.close() _image_data = _image_stream.getvalue() - logger.info( - "Resized image %s from dimensions %s to (%d, %d)" % (self.image_file, str(_orig_size), _w, _h) - ) + logger.info("Resized image %s from dimensions %s to (%d, %d)", self.image_file, str(_orig_size), _w, _h) return _image_data else: raise ValueError( "The splash image dimensions (w: %d, h: %d) exceed max_img_size (w: %d, h:%d), but the image " "cannot be resized due to missing PIL.Image! Either install the Pillow package, adjust the " - "max_img_size, or use an image of compatible dimensions." % - (_orig_size[0], _orig_size[1], self.max_img_size[0], self.max_img_size[1]) + "max_img_size, or use an image of compatible dimensions.", _orig_size[0], _orig_size[1], + self.max_img_size[0], self.max_img_size[1] ) # Open image file @@ -368,11 +366,11 @@ def _resize_image(_image, _orig_size): img.save(image_data, format='PNG') img.close() image = image_data.getvalue() - logger.info("Converted image %s to PNG format" % self.image_file) + logger.info("Converted image %s to PNG format", self.image_file) else: raise ValueError( "The image %s needs to be converted to a PNG file, but PIL.Image is not available! Either install the " - "Pillow package, or use a PNG image for you splash screen." 
% self.image_file + "Pillow package, or use a PNG image for you splash screen.", self.image_file ) image_file.close() @@ -396,15 +394,15 @@ def test_tk_version(self): if tcl_version < 8.6 or tk_version < 8.6: logger.warning( "The installed Tcl/Tk (%s/%s) version might not work with the splash screen feature of the bootloader. " - "The bootloader is tested against Tcl/Tk 8.6" % - (self._tkinter_module.TCL_VERSION, self._tkinter_module.TK_VERSION) + "The bootloader is tested against Tcl/Tk 8.6", self._tkinter_module.TCL_VERSION, + self._tkinter_module.TK_VERSION ) # This should be impossible, since tcl/tk is released together with the same version number, but just in case if tcl_version != tk_version: logger.warning( "The installed version of Tcl (%s) and Tk (%s) do not match. PyInstaller is tested against matching " - "versions" % (self._tkinter_module.TCL_VERSION, self._tkinter_module.TK_VERSION) + "versions", self._tkinter_module.TCL_VERSION, self._tkinter_module.TK_VERSION ) # Ensure that Tcl is built with multi-threading support. From e0e0ae05364d72e0ae7095c289ea511673aee3a9 Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Sat, 6 May 2023 21:03:25 +0200 Subject: [PATCH 04/18] depend: remove use of TOC class in get_bootstrap_modules We are returning a simple list of modules, so there's no need for TOC class there as we know the list has no duplicates. --- PyInstaller/depend/analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyInstaller/depend/analysis.py b/PyInstaller/depend/analysis.py index 4ea44c5241..50f96d6951 100644 --- a/PyInstaller/depend/analysis.py +++ b/PyInstaller/depend/analysis.py @@ -921,7 +921,7 @@ def get_bootstrap_modules(): # Import 'struct' modules to get real paths to module file names. mod_struct = __import__('struct') # Basic modules necessary for the bootstrap process. 
- loader_mods = TOC() + loader_mods = list() loaderpath = os.path.join(HOMEPATH, 'PyInstaller', 'loader') # On some platforms (Windows, Debian/Ubuntu) '_struct' and zlib modules are built-in modules (linked statically) # and thus does not have attribute __file__. 'struct' module is required for reading Python bytecode from From ec13909283666d93011fe644aef801ed57fc6e45 Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Sat, 6 May 2023 21:16:00 +0200 Subject: [PATCH 05/18] depend: remove optional TOC argument to _make_toc() helper There is exactly one call that passes the existing TOC in our codebase, and even there we can simply append the output to the existing TOC. The returned toc is now a list as opposed to instance of the TOC class. This removes implicit deduplication functionality, but should not be an issue at this step, because entries are generated from the module graph (and so there should not be any duplicates here). --- PyInstaller/building/build_main.py | 2 +- PyInstaller/depend/analysis.py | 25 +++++++++++-------------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/PyInstaller/building/build_main.py b/PyInstaller/building/build_main.py index a9514810d8..1a74084de6 100644 --- a/PyInstaller/building/build_main.py +++ b/PyInstaller/building/build_main.py @@ -619,7 +619,7 @@ def assemble(self): self.scripts = self.graph.nodes_to_toc(priority_scripts) # Extend the binaries list with all the Extensions modulegraph has found. 
- self.binaries = self.graph.make_binaries_toc(self.binaries) + self.binaries += self.graph.make_binaries_toc() # Post-process GLib schemas self.datas = compile_glib_schema_files(self.datas, os.path.join(CONF['workpath'], "_pyi_gschema_compilation")) diff --git a/PyInstaller/depend/analysis.py b/PyInstaller/depend/analysis.py index 50f96d6951..5d68cee302 100644 --- a/PyInstaller/depend/analysis.py +++ b/PyInstaller/depend/analysis.py @@ -43,7 +43,6 @@ from PyInstaller import HOMEPATH, PACKAGEPATH from PyInstaller import log as logging -from PyInstaller.building.datastruct import TOC from PyInstaller.building.utils import add_suffix_to_extension from PyInstaller.compat import ( BAD_MODULE_TYPES, BINARY_MODULE_TYPES, MODULE_TYPES_TO_TOC_DICT, PURE_PYTHON_MODULE_TYPES, PY3_BASE_MODULES, @@ -541,9 +540,9 @@ def get_code_objects(self): code_dict[node.identifier] = node.code return code_dict - def _make_toc(self, typecode=None, existing_TOC=None): + def _make_toc(self, typecode=None): """ - Return the name, path and type of selected nodes as a TOC, or appended to a TOC. The selection is via a list + Return the name, path and type of selected nodes as a TOC. The selection is determined by the given list of PyInstaller TOC typecodes. If that list is empty we return the complete flattened graph as a TOC with the ModuleGraph note types in place of typecodes -- meant for debugging only. Normally we return ModuleGraph nodes whose types map to the requested PyInstaller typecode(s) as indicated in the MODULE_TYPES_TO_TOC_DICT. @@ -559,16 +558,17 @@ def _make_toc(self, typecode=None, existing_TOC=None): regex_str = '(' + '|'.join(PY3_BASE_MODULES) + r')(\.|$)' module_filter = re.compile(regex_str) - result = existing_TOC or TOC() + toc = list() for node in self.iter_graph(start=self._top_script_node): # Skip modules that are in base_library.zip. if module_filter.match(node.identifier): continue entry = self._node_to_toc(node, typecode) + # Append the entry. 
We do not check for duplicates here; the TOC normalization is left to caller. + # However, as entries are obtained from modulegraph, there should not be any duplicates at this stage. if entry is not None: - # TOC.append the data. This checks for a pre-existing name and skips it if it exists. - result.append(entry) - return result + toc.append(entry) + return toc def make_pure_toc(self): """ @@ -577,11 +577,11 @@ def make_pure_toc(self): # PyInstaller should handle special module types without code object. return self._make_toc(PURE_PYTHON_MODULE_TYPES) - def make_binaries_toc(self, existing_toc): + def make_binaries_toc(self): """ Return all binary Python modules formatted as TOC. """ - return self._make_toc(BINARY_MODULE_TYPES, existing_toc) + return self._make_toc(BINARY_MODULE_TYPES) def make_missing_toc(self): """ @@ -624,16 +624,13 @@ def _node_to_toc(node, typecode=None): toc_type = MODULE_TYPES_TO_TOC_DICT[mg_type] return name, path, toc_type - def nodes_to_toc(self, node_list, existing_TOC=None): + def nodes_to_toc(self, nodes): """ Given a list of nodes, create a TOC representing those nodes. This is mainly used to initialize a TOC of scripts with the ones that are runtime hooks. The process is almost the same as _make_toc(), but the caller guarantees the nodes are valid, so minimal checking. """ - result = existing_TOC or TOC() - for node in node_list: - result.append(self._node_to_toc(node)) - return result + return [self._node_to_toc(node) for node in nodes] # Return true if the named item is in the graph as a BuiltinModule node. The passed name is a basename. def is_a_builtin(self, name): From 73e202e67a95218df2b16ab443872bda06f76a9d Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Sun, 7 May 2023 17:52:30 +0200 Subject: [PATCH 06/18] building: have Tree inherit from list instead of TOC Turn `Tree` class into child of regular `list` instead of `TOC` class, as part of on-going `TOC` class deprecation. 
As `Tree` describes the contents of the filesystem, implicit TOC de-duplication should not be required in the first place. Also initialize `Target.dependencies` with plain `list`. --- PyInstaller/building/datastruct.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PyInstaller/building/datastruct.py b/PyInstaller/building/datastruct.py index 64368946c9..b83e0adb35 100644 --- a/PyInstaller/building/datastruct.py +++ b/PyInstaller/building/datastruct.py @@ -146,7 +146,7 @@ def __init__(self): self.__class__.invcnum += 1 self.tocfilename = os.path.join(CONF['workpath'], '%s-%02d.toc' % (self.__class__.__name__, self.invcnum)) self.tocbasename = os.path.basename(self.tocfilename) - self.dependencies = TOC() + self.dependencies = [] def __postinit__(self): """ @@ -198,9 +198,9 @@ def _save_guts(self): misc.save_py_data_struct(self.tocfilename, data) -class Tree(Target, TOC): +class Tree(Target, list): """ - This class is a way of creating a TOC (Table of Contents) that describes some or all of the files within a + This class is a way of creating a TOC (Table of Contents) list that describes some or all of the files within a directory. """ def __init__(self, root=None, prefix=None, excludes=None, typecode='DATA'): @@ -221,7 +221,7 @@ def __init__(self, root=None, prefix=None, excludes=None, typecode='DATA'): the typcodes. """ Target.__init__(self) - TOC.__init__(self) + list.__init__(self) self.root = root self.prefix = prefix self.excludes = excludes From 2792fc23ac8224b0eedf18316779443322841de2 Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Sun, 7 May 2023 22:06:54 +0200 Subject: [PATCH 07/18] building: begin replacing TOC class with normalization helper function For now, the normalization helper function is a stub that internally uses the TOC class; this allows us to gradually limit/encapsulate the use of TOC class within the codebase. 
--- PyInstaller/building/api.py | 37 ++++++++++++++++---------- PyInstaller/building/datastruct.py | 12 +++++++++ PyInstaller/building/osx.py | 7 +++-- PyInstaller/building/toc_conversion.py | 16 +++++------ 4 files changed, 48 insertions(+), 24 deletions(-) diff --git a/PyInstaller/building/api.py b/PyInstaller/building/api.py index 21b1649fe0..3664df0907 100644 --- a/PyInstaller/building/api.py +++ b/PyInstaller/building/api.py @@ -24,7 +24,7 @@ from PyInstaller import HOMEPATH, PLATFORM from PyInstaller import log as logging from PyInstaller.archive.writers import CArchiveWriter, ZlibArchiveWriter -from PyInstaller.building.datastruct import TOC, Target, _check_guts_eq +from PyInstaller.building.datastruct import Target, _check_guts_eq, normalize_pyz_toc, normalize_toc from PyInstaller.building.utils import ( _check_guts_toc, _make_clean_directory, _rmtree, checkCache, get_code_object, strip_paths_in_code, compile_pymodule ) @@ -51,7 +51,7 @@ class PYZ(Target): def __init__(self, *tocs, **kwargs): """ tocs - One or more TOCs (Tables of Contents), usually an `Analysis.pure` and an `Analysis.zipped_data`. + One or more TOC (Table of Contents) lists, usually an `Analysis.pure` and an `Analysis.zipped_data`. If the passed TOC has an attribute `_code_cache`, it is expected to be a dictionary of module code objects from ModuleGraph. @@ -103,7 +103,7 @@ def __init__(self, *tocs, **kwargs): # Merge input TOC(s) and their code object dictionaries (if available). Skip the bootstrap modules, which will # be passed on to CArchive. bootstrap_module_names = set(name for name, _, typecode in self.dependencies if typecode == 'PYMODULE') - self.toc = TOC() + self.toc = [] self.code_dict = {} for toc in tocs: self.code_dict.update(getattr(toc, '_code_cache', {})) @@ -116,6 +116,9 @@ def __init__(self, *tocs, **kwargs): continue self.toc.append(entry) + # Normalize TOC + self.toc = normalize_pyz_toc(self.toc) + # Alphabetically sort the TOC to enable reproducible builds. 
self.toc.sort() @@ -187,7 +190,7 @@ def __init__( ): """ toc - A TOC (Table of Contents) + A TOC (Table of Contents) list. name An optional filename for the PKG. cdict @@ -203,7 +206,7 @@ def __init__( """ super().__init__() - self.toc = toc + self.toc = normalize_toc(toc) # Ensure guts contain normalized TOC self.cdict = cdict self.name = name if name is None: @@ -322,7 +325,7 @@ class EXE(Target): def __init__(self, *args, **kwargs): """ args - One or more arguments that are either instances of TOC or Target. + One or more arguments that are either an instance of `Target` or an iterable representing TOC list. kwargs Possible keyword arguments: @@ -469,12 +472,12 @@ def __init__(self, *args, **kwargs): # file already exists. self.pkgname = os.path.join(CONF['workpath'], base_name + '.pkg') - self.toc = TOC() + self.toc = [] - _deps_toc = TOC() # See the note below + _deps_toc = [] # See the note below for arg in args: - # Valid arguments: PYZ object, Splash object, and TOC-like iterables + # Valid arguments: PYZ object, Splash object, and TOC-list iterables if isinstance(arg, (PYZ, Splash)): # Add object as an entry to the TOC, and merge its dependencies TOC if isinstance(arg, PYZ): @@ -573,6 +576,9 @@ def makeabs(path): else: raise TypeError(f"Unsupported type for version info argument: {type(self.versrsrc)!r}") + # Normalize TOC + self.toc = normalize_toc(self.toc) + self.pkg = PKG( self.toc, name=self.pkgname, @@ -589,7 +595,7 @@ def makeabs(path): # Get the path of the bootloader and store it in a TOC, so it can be checked for being changed. exe = self._bootloader_file('run', '.exe' if is_win or is_cygwin else '') - self.exefiles = TOC([(os.path.basename(exe), exe, 'EXECUTABLE')]) + self.exefiles = [(os.path.basename(exe), exe, 'EXECUTABLE')] self.__postinit__() @@ -879,7 +885,7 @@ class COLLECT(Target): def __init__(self, *args, **kwargs): """ args - One or more arguments that are either TOCs or Targets. 
+ One or more arguments that are either an instance of `Target` or an iterable representing TOC list. kwargs Possible keyword arguments: @@ -906,7 +912,7 @@ def __init__(self, *args, **kwargs): # DISTPATH). Old .spec formats included parent path, so strip it away. self.name = os.path.join(CONF['distpath'], os.path.basename(kwargs.get('name'))) - self.toc = TOC() + self.toc = [] for arg in args: # Valid arguments: EXE object and TOC-like iterables if isinstance(arg, EXE): @@ -931,6 +937,9 @@ def __init__(self, *args, **kwargs): else: raise TypeError(f"Invalid argument type for COLLECT: {type(arg)!r}") + # Normalize TOC + self.toc = normalize_toc(self.toc) + self.__postinit__() _GUTS = ( @@ -1035,8 +1044,8 @@ def __init__(self, *args): # onedir, `a.datas` and `a.binaries` need to be passed to `COLLECT` (as they were before the MERGE), while # `a.dependencies` needs to be passed to `EXE`. This split requires DEPENDENCY entries to be in a separate # TOC. - analysis.binaries = TOC(binaries) - analysis.datas = TOC(datas) + analysis.binaries = normalize_toc(binaries) + analysis.datas = normalize_toc(datas) analysis.dependencies += binaries_refs + datas_refs def _process_toc(self, toc, path_to_exe): diff --git a/PyInstaller/building/datastruct.py b/PyInstaller/building/datastruct.py index b83e0adb35..1e34609e66 100644 --- a/PyInstaller/building/datastruct.py +++ b/PyInstaller/building/datastruct.py @@ -293,3 +293,15 @@ def assemble(self): else: result.append((resfilename, fullfilename, self.typecode)) self[:] = result + + +def normalize_toc(toc): + # TODO: for now, this is a stub using TOC class. Replace it with priority-based de-duplication. + normalized_toc = TOC(toc) + return list(normalized_toc) + + +def normalize_pyz_toc(toc): + # TODO: for now, this is a stub using TOC class. Replace it with priority-based de-duplication. 
+ normalized_toc = TOC(toc) + return list(normalized_toc) diff --git a/PyInstaller/building/osx.py b/PyInstaller/building/osx.py index 7209a30011..f804f2e523 100644 --- a/PyInstaller/building/osx.py +++ b/PyInstaller/building/osx.py @@ -14,7 +14,7 @@ import shutil from PyInstaller.building.api import COLLECT, EXE -from PyInstaller.building.datastruct import TOC, Target, logger +from PyInstaller.building.datastruct import Target, logger, normalize_toc from PyInstaller.building.utils import _check_path_overlap, _rmtree, checkCache from PyInstaller.compat import is_darwin from PyInstaller.building.icon import normalize_icon_type @@ -53,7 +53,7 @@ def __init__(self, *args, **kwargs): self.appname = os.path.splitext(base_name)[0] self.version = kwargs.get("version", "0.0.0") - self.toc = TOC() + self.toc = [] self.strip = False self.upx = False self.console = True @@ -109,6 +109,9 @@ def __init__(self, *args, **kwargs): else: raise ValueError("No EXECUTABLE entry found in the TOC!") + # Normalize TOC + self.toc = normalize_toc(self.toc) + self.__postinit__() _GUTS = ( diff --git a/PyInstaller/building/toc_conversion.py b/PyInstaller/building/toc_conversion.py index 0a372aff7b..ce43f7aa15 100644 --- a/PyInstaller/building/toc_conversion.py +++ b/PyInstaller/building/toc_conversion.py @@ -15,7 +15,7 @@ import pkg_resources from PyInstaller import log as logging -from PyInstaller.building.datastruct import TOC, Tree +from PyInstaller.building.datastruct import Tree, normalize_toc from PyInstaller.compat import ALL_SUFFIXES from PyInstaller.depend.utils import get_path_to_egg @@ -97,11 +97,11 @@ def _get_distribution_for_node(self, node): # Public methods. def make_binaries_toc(self): - # TODO create a real TOC when handling of more files is added. 
- return [(x, y, 'BINARY') for x, y in self._binaries] + toc = [(x, y, 'BINARY') for x, y in self._binaries] + return normalize_toc(toc) def make_datas_toc(self): - toc = TOC((x, y, 'DATA') for x, y in self._datas) + toc = [(x, y, 'DATA') for x, y in self._datas] for dist in self._distributions: if ( dist._pyinstaller_info['egg'] and not dist._pyinstaller_info['zipped'] @@ -110,7 +110,7 @@ def make_datas_toc(self): # this is a un-zipped, not-zip-safe egg tree = Tree(dist.location, excludes=PY_IGNORE_EXTENSIONS) toc.extend(tree) - return toc + return normalize_toc(toc) def make_zipfiles_toc(self): # TODO create a real TOC when handling of more files is added. @@ -119,7 +119,7 @@ def make_zipfiles_toc(self): if dist._pyinstaller_info['zipped'] and not dist._pyinstaller_info['egg']: # Hmm, this should never happen as normal zip-files are not associated with a distribution, are they? toc.append(("eggs/" + os.path.basename(dist.location), dist.location, 'ZIPFILE')) - return toc + return normalize_toc(toc) @staticmethod def __collect_data_files_from_zip(zipfilename): @@ -138,7 +138,7 @@ def __collect_data_files_from_zip(zipfilename): return Tree(workpath, excludes=PY_IGNORE_EXTENSIONS) def make_zipped_data_toc(self): - toc = TOC() + toc = [] logger.debug('Looking for egg data files...') for dist in self._distributions: if dist._pyinstaller_info['egg']: @@ -153,4 +153,4 @@ def make_zipped_data_toc(self): else: # this is an un-zipped, not-zip-safe egg, handled in make_datas_toc() pass - return toc + return normalize_toc(toc) From 7634f8545e19b4ac5f3741905e6b18aed10cdc51 Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Sun, 7 May 2023 22:10:13 +0200 Subject: [PATCH 08/18] building: main: move extension processing further up in workflow Process extensions (rename them to full filenames) immediately after we obtain the list of extensions from the modulegraph. 
--- PyInstaller/building/build_main.py | 36 +++++++++++++++--------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/PyInstaller/building/build_main.py b/PyInstaller/building/build_main.py index 1a74084de6..25b43a2dfe 100644 --- a/PyInstaller/building/build_main.py +++ b/PyInstaller/building/build_main.py @@ -621,6 +621,24 @@ def assemble(self): # Extend the binaries list with all the Extensions modulegraph has found. self.binaries += self.graph.make_binaries_toc() + # Convert extension module names into full filenames, and append suffix. Ensure that extensions that come from + # the lib-dynload are collected into _MEIPASS/lib-dynload instead of directly into _MEIPASS. + for idx, (dest, source, typecode) in enumerate(self.binaries): + if typecode != 'EXTENSION': + continue + + # Convert to full filename and append suffix + dest, source, typecode = add_suffix_to_extension(dest, source, typecode) + + # Divert into lib-dyload, if necessary (i.e., if file comes from lib-dynload directory) and its destination + # path does not already have a directory prefix. + src_parent = os.path.basename(os.path.dirname(source)) + if src_parent == 'lib-dynload' and not os.path.dirname(os.path.normpath(dest)): + dest = os.path.join('lib-dynload', dest) + + # Update + self.binaries[idx] = (dest, source, typecode) + # Post-process GLib schemas self.datas = compile_glib_schema_files(self.datas, os.path.join(CONF['workpath'], "_pyi_gschema_compilation")) self.datas = TOC(self.datas) @@ -708,24 +726,6 @@ def assemble(self): self.binding_redirects[:] = list(set(self.binding_redirects)) logger.info("Found binding redirects: \n%s", self.binding_redirects) - # Convert extension module names into full filenames, and append suffix. Ensure that extensions that come from - # the lib-dynload are collected into _MEIPASS/lib-dynload instead of directly into _MEIPASS. 
- for idx, (dest, source, typecode) in enumerate(self.binaries): - if typecode != 'EXTENSION': - continue - - # Convert to full filename and append suffix - dest, source, typecode = add_suffix_to_extension(dest, source, typecode) - - # Divert into lib-dyload, if necessary (i.e., if file comes from lib-dynload directory) and its destination - # path does not already have a directory prefix. - src_parent = os.path.basename(os.path.dirname(source)) - if src_parent == 'lib-dynload' and not os.path.dirname(os.path.normpath(dest)): - dest = os.path.join('lib-dynload', dest) - - # Update - self.binaries[idx] = (dest, source, typecode) - # Write warnings about missing modules. self._write_warnings() # Write debug information about the graph From 1f2bed88d182d2ab6bf324e54505891fb671f945 Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Sun, 7 May 2023 22:35:38 +0200 Subject: [PATCH 09/18] building: main: remove direct uses of TOC class Instead of using TOC class, perform explicit TOC list normalization at certain build stages. --- PyInstaller/building/build_main.py | 65 +++++++++++++++++++----------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/PyInstaller/building/build_main.py b/PyInstaller/building/build_main.py index 25b43a2dfe..ecbf41b27d 100644 --- a/PyInstaller/building/build_main.py +++ b/PyInstaller/building/build_main.py @@ -26,7 +26,7 @@ from PyInstaller import log as logging from PyInstaller.archive import pyz_crypto from PyInstaller.building.api import COLLECT, EXE, MERGE, PYZ -from PyInstaller.building.datastruct import TOC, Target, Tree, _check_guts_eq +from PyInstaller.building.datastruct import TOC, Target, Tree, _check_guts_eq, normalize_toc, normalize_pyz_toc from PyInstaller.building.osx import BUNDLE from PyInstaller.building.splash import Splash from PyInstaller.building.toc_conversion import DependencyProcessor @@ -255,7 +255,7 @@ class Analysis(Target): """ Class that performs analysis of the user's main Python scripts. 
- An Analysis has five outputs, all TOCs (Table of Contents) accessed as attributes of the analysis. + An Analysis has five outputs, all TOC (Table of Contents) lists accessed as attributes of the analysis. scripts The scripts you gave Analysis as input, with any runtime hook scripts prepended. @@ -397,13 +397,13 @@ def __init__( self.hiddenimports.append('tinyaes') self.excludes = excludes or [] - self.scripts = TOC() - self.pure = TOC() - self.binaries = TOC() - self.zipfiles = TOC() - self.zipped_data = TOC() - self.datas = TOC() - self.dependencies = TOC() + self.scripts = [] + self.pure = [] + self.binaries = [] + self.zipfiles = [] + self.zipped_data = [] + self.datas = [] + self.dependencies = [] self.binding_redirects = CONF['binding_redirects'] = [] self.win_no_prefer_redirects = win_no_prefer_redirects self.win_private_assemblies = win_private_assemblies @@ -412,14 +412,18 @@ def __init__( self.module_collection_mode = module_collection_mode or {} # Initialize 'binaries' and 'datas' with lists specified in .spec file. + # Ensure the lists are normalized before guts comparison. if binaries: logger.info("Appending 'binaries' from .spec") for name, pth in format_binaries_and_datas(binaries, workingdir=spec_dir): self.binaries.append((name, pth, 'BINARY')) + self.binaries = normalize_toc(self.binaries) + if datas: logger.info("Appending 'datas' from .spec") for name, pth in format_binaries_and_datas(datas, workingdir=spec_dir): self.datas.append((name, pth, 'DATA')) + self.datas = normalize_toc(self.datas) self.__postinit__() @@ -486,13 +490,14 @@ def _check_guts(self, data, last_build): logger.info("Building because %s changed", filename) return True # Now we know that none of the input parameters and none of the input files has changed. So take the values - # calculated resp. analysed in the last run and store them in `self`. 
- self.scripts = TOC(data['scripts']) - self.pure = TOC(data['pure']) - self.binaries = TOC(data['binaries']) - self.zipfiles = TOC(data['zipfiles']) - self.zipped_data = TOC(data['zipped_data']) - self.datas = TOC(data['datas']) + # that were calculated / analyzed in the last run and store them in `self`. These TOC lists should already + # be normalized. + self.scripts = data['scripts'] + self.pure = data['pure'] + self.binaries = data['binaries'] + self.zipfiles = data['zipfiles'] + self.zipped_data = data['zipped_data'] + self.datas = data['datas'] # Store previously found binding redirects in CONF for later use by PKG/COLLECT from PyInstaller.config import CONF @@ -517,7 +522,7 @@ def assemble(self): libzip_filename = os.path.join(CONF['workpath'], 'base_library.zip') create_py3_base_library(libzip_filename, graph=self.graph) # Bundle base_library.zip as data file. - # Data format of TOC item: ('relative_path_in_dist_dir', 'absolute_path_on_disk', 'DATA') + # Data format of TOC item: ('relative_path_in_dist_dir', 'absolute_path_on_disk', 'DATA') self.datas.append((os.path.basename(libzip_filename), libzip_filename, 'DATA')) # Expand sys.path of module graph. The attribute is the set of paths to use for imports: sys.path, plus our @@ -583,9 +588,11 @@ def assemble(self): # Update 'binaries' TOC and 'datas' TOC. deps_proc = DependencyProcessor(self.graph, self.graph._additional_files_cache) + self.binaries.extend(deps_proc.make_binaries_toc()) self.datas.extend(deps_proc.make_datas_toc()) - self.zipped_data.extend(deps_proc.make_zipped_data_toc()) + + self.zipped_data = deps_proc.make_zipped_data_toc() # Already normalized # Note: zipped eggs are collected below # -- Look for dlls that are imported by Python 'ctypes' module. -- @@ -617,6 +624,7 @@ def assemble(self): # Initialize the scripts list with priority scripts in the proper order. 
self.scripts = self.graph.nodes_to_toc(priority_scripts) + self.scripts = normalize_toc(self.scripts) # Should not really contain duplicates, but just in case... # Extend the binaries list with all the Extensions modulegraph has found. self.binaries += self.graph.make_binaries_toc() @@ -639,9 +647,13 @@ def assemble(self): # Update self.binaries[idx] = (dest, source, typecode) + # Perform initial normalization of `datas` and `binaries` + self.datas = normalize_toc(self.datas) + self.binaries = normalize_toc(self.binaries) + # Post-process GLib schemas self.datas = compile_glib_schema_files(self.datas, os.path.join(CONF['workpath'], "_pyi_gschema_compilation")) - self.datas = TOC(self.datas) + self.datas = normalize_toc(self.datas) # Process the pure-python modules list. Depending on the collection mode, these entries end up either in "pure" # list for collection into the PYZ archive, or in the "datas" list for collection as external data files. @@ -696,6 +708,9 @@ def assemble(self): self.datas.append((dest_path, obj_path, "DATA")) + # Normalize list of pure-python modules (these will end up in PYZ archive, so use specific normalization). + self.pure = normalize_pyz_toc(self.pure) + # And get references to module code objects constructed by ModuleGraph to avoid writing .pyc files to hdd. self.pure._code_cache = code_cache @@ -711,19 +726,23 @@ def assemble(self): collected_packages = self.graph.get_collected_packages() self.binaries.extend( - isolated.call(find_binary_dependencies, list(self.binaries), self.binding_redirects, collected_packages) + isolated.call(find_binary_dependencies, self.binaries, self.binding_redirects, collected_packages) ) # Include zipped Python eggs. 
logger.info('Looking for eggs') - self.zipfiles.extend(deps_proc.make_zipfiles_toc()) + self.zipfiles = deps_proc.make_zipfiles_toc() # Already normalized + + # Final normalization of datas and binaries + self.datas = normalize_toc(self.datas) + self.binaries = normalize_toc(self.binaries) # Verify that Python dynamic library can be found. Without dynamic Python library PyInstaller cannot continue. self._check_python_library(self.binaries) if is_win: # Remove duplicate redirects - self.binding_redirects[:] = list(set(self.binding_redirects)) + self.binding_redirects = list(set(self.binding_redirects)) logger.info("Found binding redirects: \n%s", self.binding_redirects) # Write warnings about missing modules. @@ -876,7 +895,7 @@ def build(spec, distpath, workpath, clean_build): 'WARNFILE': CONF['warnfile'], 'workpath': CONF['workpath'], # PyInstaller classes for .spec. - 'TOC': TOC, + 'TOC': TOC, # Kept for backward compatibility even though `TOC` class is deprecated. 'Analysis': Analysis, 'BUNDLE': BUNDLE, 'COLLECT': COLLECT, From 5d9cab58792ac3b09b6fc23fd169526142d96ab3 Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Mon, 8 May 2023 00:08:03 +0200 Subject: [PATCH 10/18] building: work around the issue with passing code cache around When `Analysis.pure` TOC list was an instance of the `TOC` class, we could dynamically add an attribute `_code_cache` to it, and use that to pass the code cache dictionary around. The `PYZ` writer would then check for the presence of the attribute on the received arguments and extend its internal code cache accordingly. Now that the TOC list is a plain `list`, the custom attribute cannot be added anymore. To work around this, we store new dictionary in global `CONF['code_cache']`, where we associate the list's `id()` with its code cache dictionary. This gives us the same semantics as the old approach. 
--- PyInstaller/building/api.py | 6 +++++- PyInstaller/building/build_main.py | 13 +++++++++++-- PyInstaller/config.py | 2 ++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/PyInstaller/building/api.py b/PyInstaller/building/api.py index 3664df0907..ed984c1489 100644 --- a/PyInstaller/building/api.py +++ b/PyInstaller/building/api.py @@ -106,7 +106,11 @@ def __init__(self, *tocs, **kwargs): self.toc = [] self.code_dict = {} for toc in tocs: - self.code_dict.update(getattr(toc, '_code_cache', {})) + # Check if code cache association exists for the given TOC list + code_cache = CONF['code_cache'].get(id(toc)) + if code_cache is not None: + self.code_dict.update(code_cache) + for entry in toc: name, _, typecode = entry # PYZ expects PYMODULE entries (python code objects) and DATA entries (data collected from zipped eggs). diff --git a/PyInstaller/building/build_main.py b/PyInstaller/building/build_main.py index ecbf41b27d..48f526c614 100644 --- a/PyInstaller/building/build_main.py +++ b/PyInstaller/building/build_main.py @@ -711,8 +711,15 @@ def assemble(self): # Normalize list of pure-python modules (these will end up in PYZ archive, so use specific normalization). self.pure = normalize_pyz_toc(self.pure) - # And get references to module code objects constructed by ModuleGraph to avoid writing .pyc files to hdd. - self.pure._code_cache = code_cache + # Associate the `pure` TOC list instance with code cache in the global `CONF`; this is used by `PYZ` writer + # to obtain modules' code from cache instead + # + # (NOTE: back when `pure` was an instance of `TOC` class, the code object was passed by adding an attribute + # to the `pure` itself; now that `pure` is plain `list`, we cannot do that anymore. But the association via + # object ID should have the same semantics as the added attribute). 
+ from PyInstaller.config import CONF + global_code_cache_map = CONF['code_cache'] + global_code_cache_map[id(self.pure)] = code_cache # Add remaining binary dependencies - analyze Python C-extensions and what DLLs they depend on. # @@ -861,6 +868,8 @@ def build(spec, distpath, workpath, clean_build): CONF['dot-file'] = os.path.join(workpath, 'graph-%s.dot' % CONF['specnm']) CONF['xref-file'] = os.path.join(workpath, 'xref-%s.html' % CONF['specnm']) + CONF['code_cache'] = dict() + # Clean PyInstaller cache (CONF['cachedir']) and temporary files (workpath) to be able start a clean build. if clean_build: logger.info('Removing temporary files and cleaning cache in %s', CONF['cachedir']) diff --git a/PyInstaller/config.py b/PyInstaller/config.py index 80b18186ef..18db9d8a75 100644 --- a/PyInstaller/config.py +++ b/PyInstaller/config.py @@ -44,6 +44,8 @@ workpath tests_modgraph - cached PyiModuleGraph object to speed up tests + +code_cache - dictionary associating `Analysis.pure` list instances with code cache dictionaries. Used by PYZ writer. """ # NOTE: Do not import other PyInstaller modules here. Just define constants here. From b59792f729419f27b0b46cb27508de5052c1d0ff Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Mon, 8 May 2023 02:16:08 +0200 Subject: [PATCH 11/18] tests: fix CONF override in test_issue_2492 and test_issue_5131 The CONF override in `test_issue_2492` and `test_issue_5131` should initialize the newly-introduced `code_cache` dictionary. 
--- tests/functional/test_regression.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/functional/test_regression.py b/tests/functional/test_regression.py index 3b184626b5..96aa3a7d28 100644 --- a/tests/functional/test_regression.py +++ b/tests/functional/test_regression.py @@ -33,7 +33,8 @@ def test_issue_2492(monkeypatch, tmpdir): 'dot-file': str(tmpdir.join('imports.dot')), 'xref-file': str(tmpdir.join('imports.xref')), 'hiddenimports': [], - 'specnm': 'issue_2492_script' + 'specnm': 'issue_2492_script', + 'code_cache': dict(), } ) # Speedup: avoid analyzing base_library.zip @@ -87,7 +88,8 @@ def getImports(*args, **kwargs): 'dot-file': str(tmpdir.join('imports.dot')), 'xref-file': str(tmpdir.join('imports.xref')), 'hiddenimports': [], - 'specnm': 'issue_5131_script' + 'specnm': 'issue_5131_script', + 'code_cache': dict(), } ) # Speedup: avoid analyzing base_library.zip From 7527eaf9514c42424117655fe8a5ac9d78ba431b Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Mon, 8 May 2023 13:48:44 +0200 Subject: [PATCH 12/18] building: splash: fix detection of tkinter usage The `Splash` target is trying to detect whether user's code already uses `_tkinter` by looking for extension in the `binaries` TOC. However, it uses the `filenames` property of the `TOC` class, which is not available anymore now that the TOC lists have been switched to plain `list`. Furthermore, as it is trying to look up `_tkinter` extension by module name, this means that the detection has been effectively broken since #7273 where we pushed conversion of extension module names to full extension filenames into `Analysis`. So the search in `binaries` needs to be done with full extension filename rather than module name. 
--- PyInstaller/building/splash.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/PyInstaller/building/splash.py b/PyInstaller/building/splash.py index b70d54a916..76728c3b5d 100644 --- a/PyInstaller/building/splash.py +++ b/PyInstaller/building/splash.py @@ -12,6 +12,7 @@ import os import re import struct +import pathlib from PyInstaller import log as logging from PyInstaller.archive.writers import SplashWriter @@ -186,7 +187,8 @@ def __init__(self, image_file, binaries, datas, **kwargs): ) # Calculated / analysed values - self.uses_tkinter = self._uses_tkinter(binaries) + self.uses_tkinter = self._uses_tkinter(self._tkinter_file, binaries) + logger.debug("Program uses tkinter: %r", self.uses_tkinter) self.script = self.generate_script() self.tcl_lib, self.tk_lib = tcltk_utils.find_tcl_tk_shared_libs(self._tkinter_file) if is_darwin: @@ -448,9 +450,14 @@ def generate_script(self): return script @staticmethod - def _uses_tkinter(binaries): - # Test for _tkinter instead of tkinter, because a user might use a different wrapping library for tk. - return '_tkinter' in binaries.filenames + def _uses_tkinter(tkinter_file, binaries): + # Test for _tkinter extension instead of tkinter module, because user might use a different wrapping library for + # Tk. Use `pathlib.PurePath` in comparisons to account for case normalization and separator normalization. + tkinter_file = pathlib.PurePath(tkinter_file) + for dest_name, src_name, typecode in binaries: + if pathlib.PurePath(src_name) == tkinter_file: + return True + return False @staticmethod def _find_rundir(structure): From f83dad649fb4bd7595a5eb87e6bcb23c307a06a4 Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Mon, 8 May 2023 16:40:38 +0200 Subject: [PATCH 13/18] building: implement TOC list normalization helpers Replace the TOC list normalization stubs with proper implementation that encodes priority information. 
This for example ensures that if a file is collected both as a DATA and BINARY, we always treat it as BINARY (and hence subject it to additional binary processing). --- PyInstaller/building/datastruct.py | 56 ++++++++++++++++++++++++++---- news/7615.core.rst | 9 +++++ 2 files changed, 59 insertions(+), 6 deletions(-) create mode 100644 news/7615.core.rst diff --git a/PyInstaller/building/datastruct.py b/PyInstaller/building/datastruct.py index 1e34609e66..b2b3fee83e 100644 --- a/PyInstaller/building/datastruct.py +++ b/PyInstaller/building/datastruct.py @@ -10,6 +10,7 @@ #----------------------------------------------------------------------------- import os +import pathlib from PyInstaller import log as logging from PyInstaller.building.utils import _check_guts_eq @@ -296,12 +297,55 @@ def assemble(self): def normalize_toc(toc): - # TODO: for now, this is a stub using TOC class. Replace it with priority-based de-duplication. - normalized_toc = TOC(toc) - return list(normalized_toc) + # Default priority: 0 + _TOC_TYPE_PRIORITIES = { + # DEPENDENCY entries need to replace original entries, so they need the highest priority. + 'DEPENDENCY': 2, + # BINARY/EXTENSION entries undergo additional processing, so give them precedence over DATA and other entries. + 'BINARY': 1, + 'EXTENSION': 1, + } + + def _type_case_normalization_fcn(typecode): + # Case-normalize all entries except OPTION. + return typecode not in { + "OPTION", + } + + return _normalize_toc(toc, _TOC_TYPE_PRIORITIES, _type_case_normalization_fcn) def normalize_pyz_toc(toc): - # TODO: for now, this is a stub using TOC class. Replace it with priority-based de-duplication. - normalized_toc = TOC(toc) - return list(normalized_toc) + # Default priority: 0 + _TOC_TYPE_PRIORITIES = { + # Ensure that modules are never shadowed by PYZ-embedded data files. 
+ 'PYMODULE': 1, + } + + return _normalize_toc(toc, _TOC_TYPE_PRIORITIES) + + +def _normalize_toc(toc, toc_type_priorities, type_case_normalization_fcn=lambda typecode: False): + tmp_toc = dict() + for entry in toc: + dest_name, src_name, typecode = entry + + # Normalize the destination name for uniqueness. Use `pathlib.PurePath` to ensure that keys are both + # case-normalized (on OSes where applicable) and directory-separator normalized (just in case). + if type_case_normalization_fcn(typecode): + entry_key = pathlib.PurePath(dest_name) + else: + entry_key = dest_name + + existing_entry = tmp_toc.get(entry_key) + if existing_entry is None: + # Entry does not exist - insert + tmp_toc[entry_key] = entry + else: + # Entry already exists - replace if its typecode has higher priority + _, _, existing_typecode = existing_entry + if toc_type_priorities.get(typecode, 0) > toc_type_priorities.get(existing_typecode, 0): + tmp_toc[entry_key] = entry + + # Return the items as list. The order matches the original order due to python dict maintaining the insertion order. + return list(tmp_toc.values()) diff --git a/news/7615.core.rst b/news/7615.core.rst new file mode 100644 index 0000000000..b421f7a0d6 --- /dev/null +++ b/news/7615.core.rst @@ -0,0 +1,9 @@ +Remove the use of the ``TOC`` class in the analysis / build process, +and use plain ``list`` instances instead. The implicit normalization +(de-duplication) of TOC entries performed by the ``TOC`` class has been +replaced with explicit normalization. The TOC lists produced by ``Analysis`` +are explicitly normalized at the end of Analysis instantiation, before +they are stored in the Analysis properties (e.g., ``Analysis.pure``, +``Analysis.binaries``, ``Analysis.datas``). Similarly, the TOC lists +passed to the build targets (e.g., ``PYZ``, ``EXE``, ``COLLECT``) are +explicitly normalized as part of the targets' instantiation process. 
From 0603e01ee6fdc732332a4bc21e17840ceef0c5e6 Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Mon, 8 May 2023 17:01:37 +0200 Subject: [PATCH 14/18] building: EXE: remove the work-around for merging PYZ.dependencies We now have TOC normalization with priority system in place, so we can directly extend the EXE's TOC with entries from all passed targets' dependencies. --- PyInstaller/building/api.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/PyInstaller/building/api.py b/PyInstaller/building/api.py index ed984c1489..1e477d86d8 100644 --- a/PyInstaller/building/api.py +++ b/PyInstaller/building/api.py @@ -478,8 +478,6 @@ def __init__(self, *args, **kwargs): self.toc = [] - _deps_toc = [] # See the note below - for arg in args: # Valid arguments: PYZ object, Splash object, and TOC-list iterables if isinstance(arg, (PYZ, Splash)): @@ -488,31 +486,13 @@ def __init__(self, *args, **kwargs): self.toc.append((os.path.basename(arg.name), arg.name, "PYZ")) else: self.toc.append((os.path.basename(arg.name), arg.name, "SPLASH")) - # See the note below (and directly extend self.toc once this workaround is not necessary anymore). - # self.toc.extend(arg.dependencies) - for entry in arg.dependencies: - _, _, typecode = entry - if typecode in ('EXTENSION', 'BINARY', 'DATA'): - _deps_toc.append(entry) - else: - self.toc.append(entry) + self.toc.extend(arg.dependencies) elif miscutils.is_iterable(arg): # TOC-like iterable self.toc.extend(arg) else: raise TypeError(f"Invalid argument type for EXE: {type(arg)!r}") - # NOTE: this is an ugly work-around that ensures that when MERGE is used, the EXE's TOC is first populated with - # MERGE'd `binaries` and `datas` entries (which should be DEPENDENCY references for shared resources, and BINARY - # or DATA entries for non-shared resources), and that `PYZ.dependencies` is merged last. 
The latter may contain - # entries for `_struct` and `zlib` extensions, and if they end up in the TOC first, they will block the - # corresponding DEPENDENCY entries (if they are available) from being added to TOC. Which will in turn result in - # missing extensions with certain onefile/onedir referencing combinations. And even if not, the result would be - # but sub-optimal, as the extensions could be shared via DEPENDENCY mechanism. This work-around can be removed - # once we replace the TOC class with mechanism that implements a typecode-based priority system for the entries. - self.toc.extend(_deps_toc) - del _deps_toc - if self.runtime_tmpdir is not None: self.toc.append(("pyi-runtime-tmpdir " + self.runtime_tmpdir, "", "OPTION")) From e6f6f1f18278a15522a357d390537d4723573acb Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Tue, 9 May 2023 00:45:54 +0200 Subject: [PATCH 15/18] tests: add basic tests for the new TOC normalization helpers --- tests/unit/test_toc_normalization.py | 186 +++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 tests/unit/test_toc_normalization.py diff --git a/tests/unit/test_toc_normalization.py b/tests/unit/test_toc_normalization.py new file mode 100644 index 0000000000..0a090bfb87 --- /dev/null +++ b/tests/unit/test_toc_normalization.py @@ -0,0 +1,186 @@ +#----------------------------------------------------------------------------- +# Copyright (c) 2023, PyInstaller Development Team. +# +# Distributed under the terms of the GNU General Public License (version 2 +# or later) with exception for distributing the bootloader. +# +# The full license is in the file COPYING.txt, distributed with this software. +# +# SPDX-License-Identifier: (GPL-2.0-or-later WITH Bootloader-exception) +#----------------------------------------------------------------------------- + +# Tests for explicit TOC list normalization that replaced the implicit normalization with class:``TOC``. 
+import copy + +from PyInstaller import compat +from PyInstaller.building.datastruct import normalize_pyz_toc, normalize_toc + +# Tests for regular TOC normalization. + +_BASE_TOC = [ + ('libpython3.10.so', '/usr/lib64/libpython3.10.so', 'BINARY'), + ('libsomething.so', '/usr/local/lib64/libsomething.so', 'BINARY'), + ('README', '/home/user/tmp/README', 'DATA'), + ('data/data.csv', '/home/user/tmp/data/data.csv', 'DATA'), + ('dependency.bin', 'other_multipackage:dependency.bin', 'DEPENDENCY'), + ('myextension.so', 'myextension.so', 'EXTENSION'), +] + + +def test_normalize_toc_no_duplicates(): + # No duplicates. We expect the output list to match the input list. + toc = copy.copy(_BASE_TOC) + expected_toc = _BASE_TOC + + normalized_toc = normalize_toc(toc) + assert normalized_toc == expected_toc + + +def test_normalize_toc_duplicate_binary(): + # A duplicated BINARY entry. We expect the (second) duplicate to be removed. + toc = copy.copy(_BASE_TOC) + toc.insert(2, ('libsomething.so', '/opt/something/lib/libsomething.so', 'BINARY')) + expected_toc = _BASE_TOC + + normalized_toc = normalize_toc(toc) + assert normalized_toc == expected_toc + + +def test_normalize_toc_duplicate_binary_case_sensitive(): + # A BINARY entry that is a duplicate only on case-insensitive OSes. + toc = copy.copy(_BASE_TOC) + toc.insert(2, ('libSoMeThInG.so', '/opt/something/lib/libSoMeThInG.so', 'BINARY')) + expected_toc = _BASE_TOC + + if compat.is_win: + expected_toc = _BASE_TOC + else: + expected_toc = toc + + normalized_toc = normalize_toc(toc) + assert normalized_toc == expected_toc + + +def test_normalize_toc_duplicate_data(): + # A duplicated DATA entry. We expect the (second) duplicate to be removed. 
+ toc = copy.copy(_BASE_TOC) + toc.insert(3, ('README', '/home/user/tmp/README', 'DATA')) + expected_toc = _BASE_TOC + + normalized_toc = normalize_toc(toc) + assert normalized_toc == expected_toc + + +def test_normalize_toc_duplicate_data_case_sensitive(): + # A DATA entry that is a duplicate on case-insensitive OSes. + toc = copy.copy(_BASE_TOC) + toc.insert(-1, ('readme', '/home/user/tmp-other/readme', 'DATA')) + expected_toc = _BASE_TOC + + if compat.is_win: + expected_toc = _BASE_TOC + else: + expected_toc = toc + + normalized_toc = normalize_toc(toc) + assert normalized_toc == expected_toc + + +def test_normalize_toc_conflicting_binary_and_data1(): + # An entry that's duplicated as both BINARY and DATA. + # BINARY entry should be kept. + toc = copy.copy(_BASE_TOC) + toc.insert(2, ('libsomething.so', '/usr/local/lib64/libsomething.so', 'DATA')) # Insert after BINARY entry + expected_toc = _BASE_TOC + + normalized_toc = normalize_toc(toc) + assert normalized_toc == expected_toc + + +def test_normalize_toc_conflicting_binary_and_data2(): + # An entry that's duplicated as both BINARY and DATA, in reverse order. + # BINARY entry should be kept. + toc = copy.copy(_BASE_TOC) + toc.insert(1, ('libsomething.so', '/usr/local/lib64/libsomething.so', 'DATA')) # Insert before BINARY entry + expected_toc = _BASE_TOC + + normalized_toc = normalize_toc(toc) + assert normalized_toc == expected_toc + + +def test_normalize_toc_multipackage_dependency(): + # An entry that's duplicated as both BINARY, DATA, EXTENSION, and DEPENDENCY. + # DEPENDENCY should have the highest priority of the four. + # The priority-based replacement during normalization might not preserve the order, so we need to sort the + # resulting and expected TOC before comparing them. In this particular case, we insert duplicates at the + # start of the list, so de-duplication effectively moves the DEPENDENCY entry to the first place in the + # output list. 
+ toc = copy.copy(_BASE_TOC) + toc.insert(0, ('dependency.bin', '/mnt/somewhere/dependency.bin', 'EXTENSION')) + toc.insert(0, ('dependency.bin', '/mnt/somewhere/dependency.bin', 'BINARY')) + toc.insert(0, ('dependency.bin', '/mnt/somewhere/dependency.bin', 'DATA')) + expected_toc = _BASE_TOC + + normalized_toc = normalize_toc(toc) + assert sorted(normalized_toc) == sorted(expected_toc) + + +# Tests for PYZ TOC normalization. +_BASE_PYZ_TOC = [ + ('copy', '/usr/lib64/python3.11/copy.py', 'PYMODULE'), + ('csv', '/usr/lib64/python3.11/csv.py', 'PYMODULE'), + ('dataclasses', '/usr/lib64/python3.11/dataclasses.py', 'PYMODULE'), + ('datetime', '/usr/lib64/python3.11/datetime.py', 'PYMODULE'), + ('decimal', '/usr/lib64/python3.11/decimal.py', 'PYMODULE'), + ('mymodule1', 'mymodule1.py', 'PYMODULE'), + ('mymodule2', 'mymodule2.py', 'PYMODULE'), +] + + +def test_normalize_pyz_toc_no_duplicates(): + # No duplicates. We expect the output list to match the input list. + toc = copy.copy(_BASE_PYZ_TOC) + expected_toc = _BASE_PYZ_TOC + + normalized_toc = normalize_pyz_toc(toc) + assert normalized_toc == expected_toc + + +def test_normalize_pyz_toc_duplicates(): + # Duplicated entry. We expect the (second) duplicate to be removed. + toc = copy.copy(_BASE_PYZ_TOC) + toc.insert(6, ('mymodule1', 'some-other-path/mymodule1.py', 'PYMODULE')) + expected_toc = _BASE_PYZ_TOC + + normalized_toc = normalize_pyz_toc(toc) + assert normalized_toc == expected_toc + + +def test_normalize_pyz_toc_case_sensitivity(): + # Duplicated entry with different case. In PYZ, the entries are case-sensitive, so the list is not modified. + toc = copy.copy(_BASE_PYZ_TOC) + toc.insert(6, ('MyMoDuLe1', 'some-other-path/MyMoDuLe1.py', 'PYMODULE')) + expected_toc = toc + + normalized_toc = normalize_pyz_toc(toc) + assert normalized_toc == expected_toc + + +def test_normalize_pyz_toc_module_and_data1(): + # In PYZ TOC, a DATA entry should not mask a PYMODULE one. 
+ toc = copy.copy(_BASE_PYZ_TOC) + toc.insert(5, ('mymodule1', 'data-dir/mymodule1', 'DATA')) + expected_toc = _BASE_PYZ_TOC + + normalized_toc = normalize_pyz_toc(toc) + assert normalized_toc == expected_toc + + +def test_normalize_pyz_toc_module_and_data2(): + # In PYZ TOC, a DATA entry should not mask a PYMODULE one. Variant with switched order. + toc = copy.copy(_BASE_PYZ_TOC) + toc.insert(6, ('mymodule1', 'data-dir/mymodule1', 'DATA')) + expected_toc = _BASE_PYZ_TOC + + normalized_toc = normalize_pyz_toc(toc) + assert normalized_toc == expected_toc From 69e6130dfbd638f85144212491f7e702e7517853 Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Tue, 9 May 2023 12:42:07 +0200 Subject: [PATCH 16/18] building: add deprecation warning to TOC class And ignore it in the TOC class unit tests. --- PyInstaller/building/datastruct.py | 12 +++++++++++- news/7615.deprecation.rst | 5 +++++ tests/unit/test_TOC.py | 3 +++ 3 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 news/7615.deprecation.rst diff --git a/PyInstaller/building/datastruct.py b/PyInstaller/building/datastruct.py index b2b3fee83e..c5b961d3d6 100644 --- a/PyInstaller/building/datastruct.py +++ b/PyInstaller/building/datastruct.py @@ -11,6 +11,7 @@ import os import pathlib +import warnings from PyInstaller import log as logging from PyInstaller.building.utils import _check_guts_eq @@ -38,8 +39,9 @@ def unique_name(entry): return name +# This class is deprecated and has been replaced by plain lists with explicit normalization (de-duplication) via +# `normalize_toc` and `normalize_pyz_toc` helper functions. class TOC(list): - # TODO: simplify the representation and use directly Modulegraph objects. """ TOC (Table of Contents) class is a list of tuples of the form (name, path, typecode). @@ -59,6 +61,14 @@ class TOC(list): """ def __init__(self, initlist=None): super().__init__() + + # Deprecation warning + warnings.warn( + "TOC class is deprecated. 
Use a plain list of 3-element tuples instead.", + DeprecationWarning, + stacklevel=2, + ) + self.filenames = set() if initlist: for entry in initlist: diff --git a/news/7615.deprecation.rst b/news/7615.deprecation.rst new file mode 100644 index 0000000000..691ef3cca2 --- /dev/null +++ b/news/7615.deprecation.rst @@ -0,0 +1,5 @@ +The ``TOC`` class is now deprecated; use a plain ``list`` with the same +three-element tuples instead. PyInstaller now performs explicit +normalization (i.e., entry de-duplication) of the TOC lists passed +to the build targets (e.g., ``PYZ``, ``EXE``, ``COLLECT``) during their +instantiation. diff --git a/tests/unit/test_TOC.py b/tests/unit/test_TOC.py index 15f32e9d5c..10c658af3f 100644 --- a/tests/unit/test_TOC.py +++ b/tests/unit/test_TOC.py @@ -29,6 +29,9 @@ ELEMS3 = (('PIL.Image.py', '/usr/lib/python2.7/encodings/__init__.py', 'PYMODULE'),) +# Ignore deprecation warnings for the TOC class +pytestmark = pytest.mark.filterwarnings("ignore:TOC class is deprecated.") + def test_init_empty(): toc = TOC() From cf95497ca8972cdc0fa8f0a833f44c5a148d85f1 Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Tue, 9 May 2023 14:33:49 +0200 Subject: [PATCH 17/18] docs: update documentation on TOC lists Update documentation on TOC lists now that the TOC class has been officially deprecated. --- doc/advanced-topics.rst | 221 +++++++++++++++++++++------------------- news/7615.doc.rst | 2 + 2 files changed, 120 insertions(+), 103 deletions(-) create mode 100644 news/7615.doc.rst diff --git a/doc/advanced-topics.rst b/doc/advanced-topics.rst index 2a0a2b99a4..70c7ce17b2 100644 --- a/doc/advanced-topics.rst +++ b/doc/advanced-topics.rst @@ -267,144 +267,159 @@ Functions .. _the toc and tree classes: -The TOC and Tree Classes -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -PyInstaller manages lists of files using the ``TOC`` -(Table Of Contents) class. -It provides the ``Tree`` class as a convenient way to build a ``TOC`` -from a folder path. 
- -TOC Class (Table of Contents) ---------------------------------- - -Objects of the ``TOC`` class are used as input to the classes created in -a spec file. -For example, the ``scripts`` member of an Analysis object is a TOC -containing a list of scripts. -The ``pure`` member is a TOC with a list of modules, and so on. - -Basically a ``TOC`` object contains a list of tuples of the form - - ``(``\ *name*\ ``,``\ *path*\ ``,``\ *typecode*\ ``)`` - -In fact, it acts as an ordered set of tuples; -that is, it contains no duplicates -(where uniqueness is based on the *name* element of each tuple). -Within this constraint, a TOC preserves the order of tuples added to it. - -A TOC behaves like a list and supports the same methods -such as appending, indexing, etc. -A TOC also behaves like a set, and supports taking differences and intersections. -In all of these operations a list of tuples can be used as one argument. -For example, the following expressions are equivalent ways to -add a file to the ``a.datas`` member:: - - a.datas.append( [ ('README', 'src/README.txt', 'DATA' ) ] ) - a.datas += [ ('README', 'src/README.txt', 'DATA' ) ] - -Set-difference makes excluding modules quite easy. For example:: - - a.binaries - [('badmodule', None, None)] - -is an expression that produces a new ``TOC`` that is a copy of -``a.binaries`` from which any tuple named ``badmodule`` has been removed. -The right-hand argument to the subtraction operator -is a list that contains one tuple -in which *name* is ``badmodule`` and the *path* and *typecode* elements -are ``None``. -Because set membership is based on the *name* element of a tuple only, -it is not necessary to give accurate *path* and *typecode* elements when subtracting. - -In order to add files to a TOC, you need to know the *typecode* values -and their related *path* values. -A *typecode* is a one-word string. 
-PyInstaller uses a number of *typecode* values internally, -but for the normal case you need to know only these: - - -+---------------+--------------------------------------+-----------------------+--------------------------------------+ -| **typecode** | **description** | **name** | **path** | -+===============+======================================+=======================+======================================+ -| 'DATA' | Arbitrary files. | Run-time name. | Full path name in build. | -+---------------+--------------------------------------+-----------------------+--------------------------------------+ -| 'BINARY' | A shared library. | Run-time name. | Full path name in build. | -+---------------+--------------------------------------+-----------------------+--------------------------------------+ -| 'EXTENSION' | A binary extension to Python. | Run-time name. | Full path name in build. | -+---------------+--------------------------------------+-----------------------+--------------------------------------+ -| 'OPTION' | A Python run-time option. | Option code | ignored. | -+---------------+--------------------------------------+-----------------------+--------------------------------------+ - -The run-time name of a file will be used in the final bundle. +The Table of Contents (TOC) lists and the Tree Class +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +PyInstaller manages lists of files that are to be collected in the +so-called Table of Contents (TOC) list format. These lists contain +three-element tuples that encapsulate information about a file's +destination name, the file's full source path, and its type. + +As part of utilities for managing the TOC lists, PyInstaller provides +a ``Tree`` class as a convenient way to build a TOC list from the +contents of the given directory. This utility class can be used either +in the :ref:`spec file <using spec files>` or from custom hooks.
+ + +Table of Contents (TOC) lists +----------------------------- + +The ``Analysis`` object produces several TOC lists that provide information +about files to be collected. The files are grouped into distinct lists +based on their type or function, for example: +- ``Analysis.scripts``: program script(s) +- ``Analysis.pure``: pure-python modules +- ``Analysis.binaries``: binary extension modules and shared libraries +- ``Analysis.datas``: data files + +The generated TOC lists are passed to various build targets within the +:ref:`spec file <using spec files>`, such as ``PYZ``, ``EXE``, and +``COLLECT``. + +Each TOC list contains three-element tuples, + + ``(dest_name, src_name, typecode)`` + +where ``dest_name`` is the destination file name (i.e., file name within +the frozen application; as such, it must always be a relative name), +``src_name`` is the source file name (the path from where the file is +collected), and ``typecode`` is a string that denotes the type of the +file (or entry). + +Internally, PyInstaller uses a number of *typecode* values, but for the +normal case you need to know only these: + ++---------------+---------------------------------------+----------------------------------+---------------------------------------------+ +| **typecode** | **description** | **dest_name** | **src_name** | ++===============+=======================================+==================================+=============================================+ +| 'DATA' | Arbitrary (data) files. | Name in the frozen application. | Full path to the file on the build system. | ++---------------+---------------------------------------+----------------------------------+---------------------------------------------+ +| 'BINARY' | A shared library. | Name in the frozen application. | Full path to the file on the build system.
| ++---------------+---------------------------------------+----------------------------------+---------------------------------------------+ +| 'EXTENSION' | A Python binary extension. | Name in the frozen application. | Full path to the file on the build system. | ++---------------+---------------------------------------+----------------------------------+---------------------------------------------+ +| 'OPTION' | A PyInstaller/Python run-time option. | Option name (and optional value, | Ignored. | +| | | separated by a whitespace). | | ++---------------+---------------------------------------+----------------------------------+---------------------------------------------+ + +The destination name corresponds to the name of the final file in the +frozen application, relative to the top-level application directory. It may include path elements, for example :file:`extras/mydata.txt`. -A ``BINARY`` file or an ``EXTENSION`` file is assumed to be loadable, executable code, -for example a dynamic library. -The types are treated the same. -``EXTENSION`` is generally used for a Python extension module, -for example a module compiled by Cython_. -PyInstaller will examine either type of file for dependencies, -and if any are found, they are also included. +Entries of type ``BINARY`` and ``EXTENSION`` are assumed to represent a +file containing loadable executable code, such as a dynamic library. +Generally, ``EXTENSION`` is used to denote Python extension modules, +such as modules compiled by Cython_. The two file types are treated in +the same way; PyInstaller scans them for additional link-time +dependencies and collects any dependencies that are discovered. On some +operating systems, binaries and extensions undergo additional processing +(such as path rewriting for link-time dependencies and code-signing +on macOS).
+ +The TOC lists produced by ``Analysis`` can be modified in the +:ref:`spec file <using spec files>` before they are passed on to +the build targets to either include additional entries (although it is +preferable to pass extra files to be included via ``binaries`` or ``datas`` +arguments of ``Analysis``) or remove unwanted entries. + +.. versionchanged:: 5.11 + + In PyInstaller versions prior to 5.11, the TOC lists were in fact + instances of the :class:`TOC` class, which internally performed + implicit entry de-duplication; i.e., trying to insert an entry with an + existing target name would result in no changes to the list. + + However, due to the shortcomings of the ``TOC`` class that resulted from + loosely-defined and conflicting semantics, the use of the ``TOC`` class + has been deprecated. The TOC lists are now instances of plain ``list``, + and PyInstaller performs explicit list normalization (entry de-duplication). + The explicit normalization is performed at the end of ``Analysis`` + instantiation, when the lists are stored in the class' properties (such + as ``Analysis.datas`` and ``Analysis.binaries``). Similarly, explicit + list normalization is also performed once the build targets (``EXE``, + ``PYZ``, ``PKG``, ``COLLECT``, ``BUNDLE``) consolidate the input TOC + lists into the final list. + The Tree Class ------------------- +-------------- -The Tree class is a way of creating a TOC that describes some or all of the -files within a directory: +The ``Tree`` class offers a convenient way of creating a TOC list that +describes contents of the given directory: ``Tree(``\ *root*\ ``, prefix=``\ *run-time-folder*\ ``, excludes=``\ *string_list*\ ``, typecode=``\ *code* | ``'DATA' )`` -* The *root* argument is a path string to a directory. +* The *root* argument is a string denoting the path to the directory. It may be absolute or relative to the spec file directory.
-* The *prefix* argument, if given, is a name for a subfolder - within the run-time folder to contain the tree files. - If you omit *prefix* or give ``None``, - the tree files will be at - the top level of the run-time folder. +* The optional *prefix* argument is a name for a sub-directory + in the application directory into which files are to be collected. + If not specified or set to ``None``, the files will be collected + into the top-level application directory. -* The *excludes* argument, if given, is a list of one or more +* The optional *excludes* argument is a list of one or more strings that match files in the *root* that should be omitted from the Tree. An item in the list can be either: - a name, which causes files or folders with this basename to be excluded - - ``*.ext``, which causes files with this extension to be excluded + - a glob pattern (e.g., ``*.ext``), which causes matching files to be excluded -* The *typecode* argument, if given, specifies the TOC typecode string - that applies to all items in the Tree. - If omitted, the default is ``DATA``, which is appropriate for most cases. +* The optional *typecode* argument specifies the TOC typecode string + that is assigned to all entries in the TOC list. + The default value is ``DATA``, which is appropriate for most cases. For example:: - extras_toc = Tree('../src/extras', prefix='extras', excludes=['tmp','*.pyc']) + extras_toc = Tree('../src/extras', prefix='extras', excludes=['tmp', '*.pyc']) -This creates ``extras_toc`` as a TOC object that lists +This creates ``extras_toc`` as a TOC list that contains entries for all files from the relative path :file:`../src/extras`, omitting those that have the basename (or are in a folder named) ``tmp`` -or that have the type ``.pyc``. +or have the ``.pyc`` extension. Each tuple in this TOC has: -* A *name* composed of :file:`extras/{filename}`. +* A *dest_name* in the form of :file:`extras/{filename}`.
-* A *path* consisting of a complete, absolute path to that file in the +* A *src_name* that corresponds to the full absolute path to that file in the :file:`../src/extras` folder (relative to the location of the spec file). -* A *typecode* of ``DATA`` (by default). +* A *typecode* of ``DATA`` (the default). An example of creating a TOC listing some binary modules:: - cython_mods = Tree( '..src/cy_mods', excludes=['*.pyx','*.py','*.pyc'], typecode='EXTENSION' ) + cython_mods = Tree('../src/cy_mods', excludes=['*.pyx', '*.py', '*.pyc'], typecode='EXTENSION') -This creates a TOC with a tuple for every file in the :file:`cy_mods` folder, -excluding any with the ``.pyx``, ``.py`` or ``.pyc`` suffixes -(so presumably collecting the ``.pyd`` or ``.so`` modules created by Cython). +This creates a TOC list with entries for each file in the :file:`cy_mods` directory, +excluding files with the ``.pyx``, ``.py``, or ``.pyc`` extension +(so presumably collecting only the ``.pyd`` or ``.so`` modules created by Cython). Each tuple in this TOC has: -* Its own filename as *name* (no prefix; the file will be at the top level of the bundle). +* A *dest_name* that corresponds to the file's basename (all files are collected + in the top-level application directory). -* A *path* as an absolute path to that file in :file:`../src/cy_mods` - relative to the spec file. +* A *src_name* that corresponds to the full absolute path to that file in + :file:`../src/cy_mods` relative to the spec file. * A *typecode* of ``EXTENSION`` (``BINARY`` could be used as well). diff --git a/news/7615.doc.rst b/news/7615.doc.rst new file mode 100644 index 0000000000..38ac071635 --- /dev/null +++ b/news/7615.doc.rst @@ -0,0 +1,2 @@ +Update the documentation on TOC lists and the ``Tree`` class to reflect the +deprecation of the ``TOC`` class.
From 97bc98eb5d21eecbaaed21c81c8cf5213532eaef Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Wed, 10 May 2023 16:23:09 +0200 Subject: [PATCH 18/18] building: ensure TOC de-duplication when dest_name contains pardir loops Ensure that TOC is properly de-duplicated even if dest_name contains loops with parent directory path components. For example, `numpy/core/../../numpy.libs/libquadmath-2d0c479f.so.0.0.0` and `numpy/linalg/../../numpy.libs/libquadmath-2d0c479f.so.0.0.0` should be considered duplicates, as they are both normalized to `numpy.libs/libquadmath-2d0c479f.so.0.0.0`. Therefore, we now have the TOC normalization helpers to always sanitize the `dest_name` using `os.path.normpath` (with `pathlib` lacking the equivalent functionality), so that the entries are properly de-duplicated and that destination name is always in its compact/normalized form. We should probably also look into path normalization in the `bindepend.getImports` function, but at the end of the day, the TOC normalization serves as the last guard against problematic entries. --- PyInstaller/building/datastruct.py | 10 +++++---- tests/unit/test_toc_normalization.py | 31 +++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/PyInstaller/building/datastruct.py b/PyInstaller/building/datastruct.py index c5b961d3d6..078b592101 100644 --- a/PyInstaller/building/datastruct.py +++ b/PyInstaller/building/datastruct.py @@ -337,8 +337,10 @@ def normalize_pyz_toc(toc): def _normalize_toc(toc, toc_type_priorities, type_case_normalization_fcn=lambda typecode: False): tmp_toc = dict() - for entry in toc: - dest_name, src_name, typecode = entry + for dest_name, src_name, typecode in toc: + # Always sanitize the dest_name with `os.path.normpath` to remove any local loops with parent directory path + # components. `pathlib` does not seem to offer equivalent functionality. + dest_name = os.path.normpath(dest_name) # Normalize the destination name for uniqueness. 
Use `pathlib.PurePath` to ensure that keys are both # case-normalized (on OSes where applicable) and directory-separator normalized (just in case). @@ -350,12 +352,12 @@ def _normalize_toc(toc, toc_type_priorities, type_case_normalization_fcn=lambda existing_entry = tmp_toc.get(entry_key) if existing_entry is None: # Entry does not exist - insert - tmp_toc[entry_key] = entry + tmp_toc[entry_key] = (dest_name, src_name, typecode) else: # Entry already exists - replace if its typecode has higher priority _, _, existing_typecode = existing_entry if toc_type_priorities.get(typecode, 0) > toc_type_priorities.get(existing_typecode, 0): - tmp_toc[entry_key] = entry + tmp_toc[entry_key] = (dest_name, src_name, typecode) # Return the items as list. The order matches the original order due to python dict maintaining the insertion order. return list(tmp_toc.values()) diff --git a/tests/unit/test_toc_normalization.py b/tests/unit/test_toc_normalization.py index 0a090bfb87..4e861a6263 100644 --- a/tests/unit/test_toc_normalization.py +++ b/tests/unit/test_toc_normalization.py @@ -11,6 +11,7 @@ # Tests for explicit TOC list normalization that replaced the implicit normalization with class:``TOC``. 
import copy +import pathlib from PyInstaller import compat from PyInstaller.building.datastruct import normalize_pyz_toc, normalize_toc @@ -21,7 +22,7 @@ ('libpython3.10.so', '/usr/lib64/libpython3.10.so', 'BINARY'), ('libsomething.so', '/usr/local/lib64/libsomething.so', 'BINARY'), ('README', '/home/user/tmp/README', 'DATA'), - ('data/data.csv', '/home/user/tmp/data/data.csv', 'DATA'), + (str(pathlib.PurePath('data/data.csv')), '/home/user/tmp/data/data.csv', 'DATA'), ('dependency.bin', 'other_multipackage:dependency.bin', 'DEPENDENCY'), ('myextension.so', 'myextension.so', 'EXTENSION'), ] @@ -125,6 +126,34 @@ def test_normalize_toc_multipackage_dependency(): assert sorted(normalized_toc) == sorted(expected_toc) +def test_normalize_toc_with_parent_pardir_loops(): + # Check that de-duplication works even if destination paths contain local loop with parent directory (..) components + # but can be normalized to the same path. Furthermore, we expect TOC normalization to sanitize the dest_name with + # normalized version. + toc = [ + ( + str(pathlib.PurePath('numpy/core/../../numpy.libs/libquadmath-2d0c479f.so.0.0.0')), + '/path/to/venv/lib/python3.11/site-packages/numpy/core/../../numpy.libs/libquadmath-2d0c479f.so.0.0.0', + 'BINARY', + ), + ( + str(pathlib.PurePath('numpy/linalg/../../numpy.libs/libquadmath-2d0c479f.so.0.0.0')), + '/path/to/venv/lib/python3.11/site-packages/numpy/linalg/../../numpy.libs/libquadmath-2d0c479f.so.0.0.0', + 'BINARY', + ), + ] + expected_toc = [ + ( + str(pathlib.PurePath('numpy.libs/libquadmath-2d0c479f.so.0.0.0')), + '/path/to/venv/lib/python3.11/site-packages/numpy/core/../../numpy.libs/libquadmath-2d0c479f.so.0.0.0', + 'BINARY', + ), + ] + + normalized_toc = normalize_toc(toc) + assert sorted(normalized_toc) == sorted(expected_toc) + + # Tests for PYZ TOC normalization. _BASE_PYZ_TOC = [ ('copy', '/usr/lib64/python3.11/copy.py', 'PYMODULE'),