Skip to content

Commit

Permalink
FindModuleCache: leverage BuildSourceSet
Browse files Browse the repository at this point in the history
On large codebases, the time in `load_graph` is dominated
by `find_module` because this operation is itself `O(n)`
where `n` is the number of input files, which ends up
being `O(n**2)` because it is called for every import
statement in the codebase.

Introduce a fast path that leverages the fact that for
imports within the code being typechecked, we already
have a mapping of module import path to file path in
`BuildSourceSet`

In a real-world codebase with ~13k files split across
dozens of disjoint folders, this brings `load_graph`
from ~180s down to ~48s, with profiling showing that
`parse` is now taking the vast majority of the time,
as expected.
  • Loading branch information
huguesb committed Sep 24, 2020
1 parent 835b427 commit 04bc7f6
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 32 deletions.
32 changes: 3 additions & 29 deletions mypy/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from mypy import moduleinfo
from mypy.fixup import fixup_module
from mypy.modulefinder import (
BuildSource, compute_search_paths, FindModuleCache, SearchPaths, ModuleSearchResult,
BuildSource, BuildSourceSet, compute_search_paths, FindModuleCache, SearchPaths, ModuleSearchResult,
ModuleNotFoundReason
)
from mypy.nodes import Expression
Expand Down Expand Up @@ -106,33 +106,6 @@ def __init__(self, manager: 'BuildManager', graph: Graph) -> None:
self.errors = [] # type: List[str] # Filled in by build if desired


class BuildSourceSet:
    """Fast membership test for files against the set of build sources."""

    def __init__(self, sources: List[BuildSource]) -> None:
        """Index `sources` so that `is_source` needs no linear scan.

        A source that carries in-memory text only flips `source_text_present`.
        Any other source is recorded by its path when it has one, and by its
        module name otherwise.
        """
        self.source_text_present = False
        self.source_modules = set()  # type: Set[str]
        self.source_paths = set()  # type: Set[str]

        for src in sources:
            if src.text is not None:
                self.source_text_present = True
                continue
            if src.path:
                self.source_paths.add(src.path)
                continue
            self.source_modules.add(src.module)

    def is_source(self, file: MypyFile) -> bool:
        """Return True if `file` counts as one of the build sources.

        A file matches by path first, then by fully qualified module name.
        When any source was supplied as text, every file is treated as a
        source.
        """
        if file.path and file.path in self.source_paths:
            return True
        if file._fullname in self.source_modules:
            return True
        return self.source_text_present


def build(sources: List[BuildSource],
options: Options,
alt_lib_path: Optional[str] = None,
Expand Down Expand Up @@ -621,7 +594,8 @@ def __init__(self, data_dir: str,
or options.use_fine_grained_cache)
and not has_reporters)
self.fscache = fscache
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options)
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options,
source_set=self.source_set)
self.metastore = create_metastore(options)

# a mapping from source files to their corresponding shadow files
Expand Down
100 changes: 97 additions & 3 deletions mypy/modulefinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from mypy.defaults import PYTHON3_VERSION_MIN
from mypy.fscache import FileSystemCache
from mypy.nodes import MypyFile
from mypy.options import Options
from mypy import sitepkgs

Expand Down Expand Up @@ -92,6 +93,33 @@ def __repr__(self) -> str:
self.base_dir)


class BuildSourceSet:
    """Fast membership test for files against the set of build sources."""

    def __init__(self, sources: List[BuildSource]) -> None:
        """Index `sources` by path and by module name.

        `source_modules` maps each module name to the path of its source, or
        to '' when the source has no path. The three checks below are
        independent: one source may set the text flag and also be recorded by
        path and by module.
        """
        self.source_text_present = False
        self.source_modules = {}  # type: Dict[str, str]
        self.source_paths = set()  # type: Set[str]

        for src in sources:
            if src.text is not None:
                self.source_text_present = True
            if src.path:
                self.source_paths.add(src.path)
            if src.module:
                self.source_modules[src.module] = src.path or ''

    def is_source(self, file: MypyFile) -> bool:
        """Return True if `file` counts as one of the build sources.

        A file matches by path first, then by fully qualified module name.
        When any source was supplied as text, every file is treated as a
        source.
        """
        if file.path and file.path in self.source_paths:
            return True
        if file._fullname in self.source_modules:
            return True
        return self.source_text_present


class FindModuleCache:
"""Module finder with integrated cache.
Expand All @@ -107,8 +135,10 @@ def __init__(self,
search_paths: SearchPaths,
fscache: Optional[FileSystemCache] = None,
options: Optional[Options] = None,
ns_packages: Optional[List[str]] = None) -> None:
ns_packages: Optional[List[str]] = None,
source_set: Optional[BuildSourceSet] = None) -> None:
self.search_paths = search_paths
self.source_set = source_set
self.fscache = fscache or FileSystemCache()
# Cache for get_toplevel_possibilities:
# search_paths -> (toplevel_id -> list(package_dirs))
Expand All @@ -124,6 +154,39 @@ def clear(self) -> None:
self.initial_components.clear()
self.ns_ancestors.clear()

def find_module_via_source_set(self, id: str) -> Optional[ModuleSearchResult]:
    """Fast-path lookup of module `id` through the build source set.

    Args:
        id: fully qualified module name, e.g. 'foo.bar.baz'.

    Returns:
        * The path recorded for `id` in the source set, when that file exists
          and each of the `id.count('.')` directories walked upwards from it
          contains an `__init__.py` or `__init__.pyi`.
        * `ModuleNotFoundReason.NOT_FOUND` when the immediate parent of `id`
          resolves (via this same fast path) to a plain module file rather
          than a package, and no directory of the same name exists.
        * None when there is no source set or the fast path cannot decide;
          the caller is expected to fall back to the regular search.
    """
    if not self.source_set:
        return None

    # Both regular and stub-only packages mark a directory as a package.
    init_files = ('__init__.py', '__init__.pyi')

    p = self.source_set.source_modules.get(id, None)
    if p and self.fscache.isfile(p):
        # NB: need to make sure we still have a package marker all the way up
        # otherwise we might have false positives compared to slow path
        d = os.path.dirname(p)
        for _ in range(id.count('.')):
            if not any(self.fscache.isfile(os.path.join(d, f)) for f in init_files):
                return None
            d = os.path.dirname(d)
        return p

    idx = id.rfind('.')
    if idx != -1:
        parent = self.find_module_via_source_set(id[:idx])
        if (
            isinstance(parent, str)
            and not parent.endswith(init_files)
            and not self.fscache.isdir(os.path.splitext(parent)[0])
        ):
            # if
            #  1. we're looking for foo.bar.baz
            #  2. foo.bar.py[i] is in the source set
            #  3. foo.bar is not a directory
            # then we don't want to go spelunking in other search paths to find
            # another 'bar' module, because it's a waste of time and even in the
            # unlikely event that we did find one that matched, it probably would
            # be completely unrelated and undesirable
            return ModuleNotFoundReason.NOT_FOUND
    return None

def find_lib_path_dirs(self, id: str, lib_path: Tuple[str, ...]) -> PackageDirs:
"""Find which elements of a lib_path have the directory a module needs to exist.
Expand Down Expand Up @@ -209,8 +272,8 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool:
"""
working_dir = os.getcwd()
parent_search = FindModuleCache(SearchPaths((), (), (), ()))
while any(file.endswith(("__init__.py", "__init__.pyi"))
for file in os.listdir(working_dir)):
while any(os.path.exists(os.path.join(working_dir, f))
for f in ["__init__.py", "__init__.pyi"]):
working_dir = os.path.dirname(working_dir)
parent_search.search_paths = SearchPaths((working_dir,), (), (), ())
if not isinstance(parent_search._find_module(id), ModuleNotFoundReason):
Expand All @@ -220,6 +283,37 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool:
def _find_module(self, id: str) -> ModuleSearchResult:
fscache = self.fscache

# fast path for any modules in the current source set
# this is particularly important when there are a large number of search
# paths which share the first (few) component(s) due to the use of namespace
# packages, for instance
# foo/
# company/
# __init__.py
# foo/
# bar/
# company/
# __init__.py
# bar/
# baz/
# company/
# __init__.py
# baz/
#
# mypy gets [foo/company/foo, foo/company/bar, foo/company/baz, ...] as input
# and computes [foo, bar, baz, ...] as the module search path
#
# This would result in an O(n) search for every import of company.*, leading to
# O(n**2) behavior in load_graph, as such imports are unsurprisingly present
# at least once, and usually many more times than that, in each and every
# file being parsed
#
# Thankfully, such cases are efficiently handled by looking up the module path
# via BuildSourceSet
p = self.find_module_via_source_set(id)
if p:
return p

# If we're looking for a module like 'foo.bar.baz', it's likely that most of the
# many elements of lib_path don't even have a subdirectory 'foo/bar'. Discover
# that only once and cache it for when we look for modules like 'foo.bar.blah'
Expand Down

0 comments on commit 04bc7f6

Please sign in to comment.