From e3c45e297697a4dc5d00f6772f3b3c6fbfbbdbb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Janne=20R=C3=B6nkk=C3=B6?= Date: Fri, 19 Jul 2019 23:46:21 +0300 Subject: [PATCH] Refactor file checking for a simpler parallel API This change prepares the code for enabling Prospector to take advantage of running PyLint in parallel. Iterating files is moved into a generator (_iterate_file_descrs) so that parallel checking can use the same implementation (_check_file) just by providing a different kind of generator that reads the files from the parent process. The refactoring removes code duplication that existed in the PyLinter._do_check method; checking module content from stdin had identical implementation to checking content from a source file. Made PyLinter.expand_files a private method. The previous implementation of parallel linting created new PyLinter objects in the worker (child) process causing failure when running under Prospector because Prospector uses a custom PyLinter class (a class inherited from PyLinter) and PyLint naturally just creates a PyLinter object. This caused linting to fail because there are options for Prospector's IndentChecker which were not created in the worker process. The new implementation passes the original PyLinter object into workers when the workers are created. See https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods Note that as Windows uses the spawn method by default, the PyLinter object (and its members) needs to be pickleable from now on with the exception being PyLinter.reporter which is not passed to child processes. The performance has remained about the same based on quick tests done with a Django project containing about 30 000 lines of code; with the old implementation linting took 26-28 seconds with 8 jobs on quad core i7 and 24-27 seconds with the new implementation. 
--- CONTRIBUTORS.txt | 1 + ChangeLog | 12 +- pylint/checkers/classes.py | 6 +- pylint/config.py | 11 +- pylint/lint.py | 484 ++++++++++--------- pylint/message/message_definition.py | 4 +- tests/message/unittest_message_definition.py | 6 +- tests/unittest_lint.py | 22 +- 8 files changed, 290 insertions(+), 256 deletions(-) diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index 81e6b318b81..ed5dc411085 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -339,3 +339,4 @@ contributors: * laike9m: contributor +* Janne Rönkkö: contributor diff --git a/ChangeLog b/ChangeLog index d02e14984ca..603d1a96afb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -31,11 +31,21 @@ Release date: TBA Closes #2729 +* Allow parallel linting when run under Prospector + What's New in Pylint 2.4.3? =========================== -Release date: TBA + Pass the actual PyLinter object to sub processes to allow using custom + PyLinter classes. + + PyLinter object (and all its members except reporter) needs to support + pickling so the PyLinter object can be passed to worker processes. + +* Refactor file checking + + Remove code duplication from file checking. * Fix an issue with ``unnecessary-comprehension`` in comprehensions with additional repacking of elements. 
diff --git a/pylint/checkers/classes.py b/pylint/checkers/classes.py index 9f5d0994389..38a0d1075d2 100644 --- a/pylint/checkers/classes.py +++ b/pylint/checkers/classes.py @@ -634,11 +634,15 @@ def _has_same_layout_slots(slots, assigned_value): } +def _scope_default(): + return collections.defaultdict(list) + + class ScopeAccessMap: """Store the accessed variables per scope.""" def __init__(self): - self._scopes = collections.defaultdict(lambda: collections.defaultdict(list)) + self._scopes = collections.defaultdict(_scope_default) def set_accessed(self, node): """Set the given node as accessed.""" diff --git a/pylint/config.py b/pylint/config.py index 219bc6b78e6..419a5a93477 100644 --- a/pylint/config.py +++ b/pylint/config.py @@ -37,6 +37,7 @@ import configparser import contextlib import copy +import functools import io import optparse import os @@ -719,10 +720,8 @@ def read_config_file(self, config_file=None, verbose=None): opt = "-".join(["long"] * helplevel) + "-help" if opt in self._all_options: break # already processed - # pylint: disable=unused-argument - def helpfunc(option, opt, val, p, level=helplevel): - print(self.help(level)) - sys.exit(0) + + helpfunc = functools.partial(self.helpfunc, level=helplevel) helpmsg = "%s verbose help." 
% " ".join(["more"] * helplevel) optdict = {"action": "callback", "callback": helpfunc, "help": helpmsg} @@ -830,6 +829,10 @@ def help(self, level=0): with _patch_optparse(): return self.cmdline_parser.format_help() + def helpfunc(self, option, opt, val, p, level): # pylint: disable=unused-argument + print(self.help(level)) + sys.exit(0) + class OptionsProviderMixIn: """Mixin to provide options to an OptionsManager""" diff --git a/pylint/lint.py b/pylint/lint.py index 095eb0e2b5f..7bd6901b1ff 100644 --- a/pylint/lint.py +++ b/pylint/lint.py @@ -60,6 +60,7 @@ """ import collections import contextlib +import functools import operator import os import sys @@ -240,68 +241,6 @@ def _cpu_count() -> int: return 1 -if multiprocessing is not None: - - class ChildLinter(multiprocessing.Process): - def run(self): - # pylint: disable=no-member, unbalanced-tuple-unpacking - tasks_queue, results_queue, self._config = self._args - - self._config["jobs"] = 1 # Child does not parallelize any further. - self._python3_porting_mode = self._config.pop("python3_porting_mode", None) - self._plugins = self._config.pop("plugins", None) - - # Run linter for received files/modules. - for file_or_module in iter(tasks_queue.get, "STOP"): - try: - result = self._run_linter(file_or_module[0]) - results_queue.put(result) - except Exception as ex: - print( - "internal error with sending report for module %s" - % file_or_module, - file=sys.stderr, - ) - print(ex, file=sys.stderr) - results_queue.put({}) - - def _run_linter(self, file_or_module): - linter = PyLinter() - - # Register standard checkers. - linter.load_default_plugins() - # Load command line plugins. - if self._plugins: - linter.load_plugin_modules(self._plugins) - - linter.load_configuration_from_config(self._config) - - # Load plugin specific configuration - linter.load_plugin_configuration() - - linter.set_reporter(reporters.CollectingReporter()) - - # Enable the Python 3 checker mode. 
This option is - # passed down from the parent linter up to here, since - # the Python 3 porting flag belongs to the Run class, - # instead of the Linter class. - if self._python3_porting_mode: - linter.python3_porting_mode() - - # Run the checks. - linter.check(file_or_module) - - msgs = [_get_new_args(m) for m in linter.reporter.messages] - return ( - file_or_module, - linter.file_state.base_name, - linter.current_name, - msgs, - linter.stats, - linter.msg_status, - ) - - # pylint: disable=too-many-instance-attributes,too-many-public-methods class PyLinter( config.OptionsManagerMixIn, @@ -320,6 +259,9 @@ class PyLinter( IDE plugin developers: you may have to call `astroid.builder.MANAGER.astroid_cache.clear()` across runs if you want to ensure the latest code version is actually checked. + + This class needs to support pickling for parallel linting to work. The exception + is reporter member; see check_parallel function for more details. """ __implements__ = (interfaces.ITokenChecker,) @@ -989,9 +931,10 @@ def should_analyze_file(modname, path, is_argument=False): # pylint: enable=unused-argument - def check(self, files_or_modules): - """main checking entry: check a list of files or modules from their - name. + def initialize(self): + """Initialize linter for linting + + This method is called before any linting is done. 
""" # initialize msgs_state now that all messages have been registered into # the store @@ -999,124 +942,17 @@ def check(self, files_or_modules): if not msg.may_be_emitted(): self._msgs_state[msg.msgid] = False - if not isinstance(files_or_modules, (list, tuple)): - files_or_modules = (files_or_modules,) - - if self.config.jobs == 1: - self._do_check(files_or_modules) - else: - self._parallel_check(files_or_modules) - - def _get_jobs_config(self): - child_config = collections.OrderedDict() - filter_options = {"long-help"} - filter_options.update((opt_name for opt_name, _ in self._external_opts)) - for opt_providers in self._all_options.values(): - for optname, optdict, val in opt_providers.options_and_values(): - if optdict.get("deprecated"): - continue - - if optname not in filter_options: - child_config[optname] = utils._format_option_value(optdict, val) - child_config["python3_porting_mode"] = self._python3_porting_mode - child_config["plugins"] = self._dynamic_plugins - return child_config - - def _parallel_task(self, files_or_modules): - # Prepare configuration for child linters. - child_config = self._get_jobs_config() - - children = [] - manager = multiprocessing.Manager() - tasks_queue = manager.Queue() - results_queue = manager.Queue() - - # Send files to child linters. 
- expanded_files = [] - for descr in self.expand_files(files_or_modules): - modname, filepath, is_arg = descr["name"], descr["path"], descr["isarg"] - if self.should_analyze_file(modname, filepath, is_argument=is_arg): - expanded_files.append(descr) - - # do not start more jobs than needed - for _ in range(min(self.config.jobs, len(expanded_files))): - child_linter = ChildLinter(args=(tasks_queue, results_queue, child_config)) - child_linter.start() - children.append(child_linter) - - for files_or_module in expanded_files: - path = files_or_module["path"] - tasks_queue.put([path]) - - # collect results from child linters - failed = False - for _ in expanded_files: - try: - result = results_queue.get() - except Exception as ex: - print( - "internal error while receiving results from child linter", - file=sys.stderr, - ) - print(ex, file=sys.stderr) - failed = True - break - yield result - - # Stop child linters and wait for their completion. - for _ in range(self.config.jobs): - tasks_queue.put("STOP") - for child in children: - child.join() - - if failed: - print("Error occurred, stopping the linter.", file=sys.stderr) - sys.exit(32) - - def _parallel_check(self, files_or_modules): - # Reset stats. - self.open() - - all_stats = [] - module = None - for result in self._parallel_task(files_or_modules): - if not result: - continue - (_, self.file_state.base_name, module, messages, stats, msg_status) = result - - for msg in messages: - msg = Message(*msg) - self.set_current_module(module) - self.reporter.handle_message(msg) + def check(self, files_or_modules): + """main checking entry: check a list of files or modules from their name. - all_stats.append(stats) - self.msg_status |= msg_status + files_or_modules is either a string or list of strings presenting modules to check. + """ - self.stats = _merge_stats(all_stats) - self.current_name = module + self.initialize() - # Insert stats data to local checkers. 
- for checker in self.get_checkers(): - if checker is not self: - checker.stats = self.stats + if not isinstance(files_or_modules, (list, tuple)): + files_or_modules = (files_or_modules,) - def _do_check(self, files_or_modules): - walker = ASTWalker(self) - _checkers = self.prepare_checkers() - tokencheckers = [ - c - for c in _checkers - if interfaces.implements(c, interfaces.ITokenChecker) and c is not self - ] - rawcheckers = [ - c for c in _checkers if interfaces.implements(c, interfaces.IRawChecker) - ] - # notify global begin - for checker in _checkers: - checker.open() - if interfaces.implements(checker, interfaces.IAstroidChecker): - walker.add_checker(checker) - # build ast and check modules or packages if self.config.from_stdin: if len(files_or_modules) != 1: raise exceptions.InvalidArgsError( @@ -1124,62 +960,102 @@ def _do_check(self, files_or_modules): ) filepath = files_or_modules[0] - try: - # Note that this function does not really perform an - # __import__ but may raise an ImportError exception, which - # we want to catch here. 
- modname = ".".join(modutils.modpath_from_file(filepath)) - except ImportError: - modname = os.path.splitext(os.path.basename(filepath))[0] - - self.set_current_module(modname, filepath) - - # get the module representation - ast_node = _ast_from_string(_read_stdin(), filepath, modname) - - if ast_node is not None: - self.file_state = FileState(filepath) - self.check_astroid_module(ast_node, walker, rawcheckers, tokencheckers) - # warn about spurious inline messages handling - spurious_messages = self.file_state.iter_spurious_suppression_messages( - self.msgs_store - ) - for msgid, line, args in spurious_messages: - self.add_message(msgid, line, None, args) + self._check_files( + functools.partial(_ast_from_string, _read_stdin()), + [self._get_file_descr_from_stdin(filepath)], + ) + elif self.config.jobs == 1: + self._check_files(self.get_ast, self._iterate_file_descrs(files_or_modules)) else: - for descr in self.expand_files(files_or_modules): - modname, filepath, is_arg = descr["name"], descr["path"], descr["isarg"] - if not self.should_analyze_file(modname, filepath, is_argument=is_arg): - continue - - self.set_current_module(modname, filepath) - # get the module representation - ast_node = self.get_ast(filepath, modname) - if ast_node is None: - continue - - self.file_state = FileState(descr["basename"]) - self._ignore_file = False - # fix the current file (if the source file was not available or - # if it's actually a c extension) - self.current_file = ast_node.file # pylint: disable=maybe-no-member - before_check_statements = walker.nbstatements - self.check_astroid_module(ast_node, walker, rawcheckers, tokencheckers) - self.stats["by_module"][modname]["statement"] = ( - walker.nbstatements - before_check_statements - ) - # warn about spurious inline messages handling - spurious_messages = self.file_state.iter_spurious_suppression_messages( - self.msgs_store - ) - for msgid, line, args in spurious_messages: - self.add_message(msgid, line, None, args) - # 
notify global end - self.stats["statement"] = walker.nbstatements - for checker in reversed(_checkers): - checker.close() + check_parallel( + self, self.config.jobs, self._iterate_file_descrs(files_or_modules) + ) + + def check_single_file(self, name, filepath, modname): + """Check single file + + The arguments are the same that are documented in _check_files + + The initialize() method should be called before calling this method + """ + with self._astroid_module_checker() as check_astroid_module: + self._check_file( + self.get_ast, check_astroid_module, name, filepath, modname + ) + + def _check_files(self, get_ast, file_descrs): + """Check all files from file_descrs + + The file_descrs should be iterable of tuple (name, filepath, modname) + where + - name: full name of the module + - filepath: path of the file + - modname: module name + """ + with self._astroid_module_checker() as check_astroid_module: + for name, filepath, modname in file_descrs: + self._check_file(get_ast, check_astroid_module, name, filepath, modname) + + def _check_file(self, get_ast, check_astroid_module, name, filepath, modname): + """Check a file using the passed utility functions (get_ast and check_astroid_module) + + :param get_ast: callable returning AST from defined file taking the following arguments + - filepath: path to the file to check + - name: Python module name + :param check_astroid_module: callable checking an AST taking the following arguments + - ast: AST of the module + :param name: full name of the module + :param filepath: path to checked file + :param modname: name of the checked Python module + """ + self.set_current_module(name, filepath) + # get the module representation + ast_node = get_ast(filepath, name) + if ast_node is None: + return + + self._ignore_file = False + + self.file_state = FileState(modname) + # fix the current file (if the source file was not available or + # if it's actually a c extension) + self.current_file = ast_node.file # pylint: 
disable=maybe-no-member + check_astroid_module(ast_node) + # warn about spurious inline messages handling + spurious_messages = self.file_state.iter_spurious_suppression_messages( + self.msgs_store + ) + for msgid, line, args in spurious_messages: + self.add_message(msgid, line, None, args) - def expand_files(self, modules): + @staticmethod + def _get_file_descr_from_stdin(filepath): + """Return file description (tuple of module name, file path, base name) from given file path + + This method is used for creating suitable file description for _check_files when the + source is standard input. + """ + try: + # Note that this function does not really perform an + # __import__ but may raise an ImportError exception, which + # we want to catch here. + modname = ".".join(modutils.modpath_from_file(filepath)) + except ImportError: + modname = os.path.splitext(os.path.basename(filepath))[0] + + return (modname, filepath, filepath) + + def _iterate_file_descrs(self, files_or_modules): + """Return generator yielding file descriptions (tuples of module name, file path, base name) + + The returned generator yield one item for each Python module that should be linted. + """ + for descr in self._expand_files(files_or_modules): + name, filepath, is_arg = descr["name"], descr["path"], descr["isarg"] + if self.should_analyze_file(name, filepath, is_argument=is_arg): + yield (name, filepath, descr["basename"]) + + def _expand_files(self, modules): """get modules and errors from a list of modules and handle errors """ result, errors = utils.expand_modules( @@ -1208,6 +1084,40 @@ def set_current_module(self, modname, filepath=None): for msg_cat in MSG_TYPES.values(): self.stats["by_module"][modname][msg_cat] = 0 + @contextlib.contextmanager + def _astroid_module_checker(self): + """Context manager for checking ASTs + + The value in the context is callable accepting AST as its only argument. 
+ """ + walker = ASTWalker(self) + _checkers = self.prepare_checkers() + tokencheckers = [ + c + for c in _checkers + if interfaces.implements(c, interfaces.ITokenChecker) and c is not self + ] + rawcheckers = [ + c for c in _checkers if interfaces.implements(c, interfaces.IRawChecker) + ] + # notify global begin + for checker in _checkers: + checker.open() + if interfaces.implements(checker, interfaces.IAstroidChecker): + walker.add_checker(checker) + + yield functools.partial( + self.check_astroid_module, + walker=walker, + tokencheckers=tokencheckers, + rawcheckers=rawcheckers, + ) + + # notify global end + self.stats["statement"] = walker.nbstatements + for checker in reversed(_checkers): + checker.close() + def get_ast(self, filepath, modname): """return an ast(roid) representation for a module""" try: @@ -1227,7 +1137,34 @@ def get_ast(self, filepath, modname): self.add_message("astroid-error", args=(ex.__class__, ex)) def check_astroid_module(self, ast_node, walker, rawcheckers, tokencheckers): - """Check a module from its astroid representation.""" + """Check a module from its astroid representation. 
+ + For return value see _check_astroid_module + """ + before_check_statements = walker.nbstatements + + retval = self._check_astroid_module( + ast_node, walker, rawcheckers, tokencheckers + ) + + self.stats["by_module"][self.current_name]["statement"] = ( + walker.nbstatements - before_check_statements + ) + + return retval + + def _check_astroid_module(self, ast_node, walker, rawcheckers, tokencheckers): + """Check given AST node with given walker and checkers + + :param ast_node: AST node of the module to check + :param walker: pylint.utils.ast_walker.ASTWalker instance + :param rawcheckers: List of raw checkers to use + :param tokencheckers: List of token checkers to use + + :returns: True if the module was checked, False if ignored, + None if the module contents could not be parsed + :rtype: bool + """ try: tokens = utils.tokenize_module(ast_node) except tokenize.TokenError as ex: @@ -1317,6 +1254,77 @@ def _report_evaluation(self): self.reporter.display_reports(sect) +def check_parallel(linter, jobs, files): + """Use the given linter to lint the files with the given number of workers (jobs) + """ + # The reporter does not need to be passed to worker processes, i.e. the reporter does + # not need to be pickleable + original_reporter = linter.reporter + linter.reporter = None + + # The linter is inherited by all the pool's workers, i.e. the linter + # is identical to the linter object here. This is required so that + # a custom PyLinter object can be used. 
+ with multiprocessing.Pool( + jobs, initializer=_worker_initialize, initargs=[linter] + ) as pool: + # ..and now when the workers have inherited the linter, the actual reporter + # can be set back here on the parent process so that results get stored into + # the correct reporter + linter.set_reporter(original_reporter) + linter.open() + + all_stats = [] + + for module, messages, stats, msg_status in pool.imap_unordered( + _worker_check_single_file, files + ): + linter.set_current_module(module) + for msg in messages: + msg = Message(*msg) + linter.reporter.handle_message(msg) + + all_stats.append(stats) + linter.msg_status |= msg_status + + linter.stats = _merge_stats(all_stats) + + # Insert stats data to local checkers. + for checker in linter.get_checkers(): + if checker is not linter: + checker.stats = linter.stats + + +# PyLinter object used by worker processes when checking files using multiprocessing +# should only be used by the worker processes +_worker_linter = None + + +def _worker_initialize(linter): + global _worker_linter # pylint: disable=global-statement + _worker_linter = linter + + # On the worker process side the messages are just collected and passed back to + # parent process as _worker_check_single_file function's return value + _worker_linter.set_reporter(reporters.CollectingReporter()) + _worker_linter.open() + + +def _worker_check_single_file(file_item): + name, filepath, modname = file_item + + _worker_linter.open() + _worker_linter.check_single_file(name, filepath, modname) + + msgs = [_get_new_args(m) for m in _worker_linter.reporter.messages] + return ( + _worker_linter.current_name, + msgs, + _worker_linter.stats, + _worker_linter.msg_status, + ) + + # some reporting functions #################################################### @@ -1468,6 +1476,10 @@ class Run: ), ) + @staticmethod + def _return_one(*args): # pylint: disable=unused-argument + return 1 + def __init__(self, args, reporter=None, do_exit=True): self._rcfile = None self._plugins = 
[] @@ -1493,7 +1505,7 @@ def __init__(self, args, reporter=None, do_exit=True): "rcfile", { "action": "callback", - "callback": lambda *args: 1, + "callback": Run._return_one, "type": "string", "metavar": "", "help": "Specify a configuration file.", @@ -1503,7 +1515,7 @@ def __init__(self, args, reporter=None, do_exit=True): "init-hook", { "action": "callback", - "callback": lambda *args: 1, + "callback": Run._return_one, "type": "string", "metavar": "", "level": 1, diff --git a/pylint/message/message_definition.py b/pylint/message/message_definition.py index e54c15aaec4..43c2196318f 100644 --- a/pylint/message/message_definition.py +++ b/pylint/message/message_definition.py @@ -23,7 +23,7 @@ def __init__( maxversion=None, old_names=None, ): - self.checker = checker + self.checker_name = checker.name self.check_msgid(msgid) self.msgid = msgid self.symbol = symbol @@ -63,7 +63,7 @@ def format_help(self, checkerref=False): """return the help string for the given message id""" desc = self.description if checkerref: - desc += " This message belongs to the %s checker." % self.checker.name + desc += " This message belongs to the %s checker." 
% self.checker_name title = self.msg if self.minversion or self.maxversion: restr = [] diff --git a/tests/message/unittest_message_definition.py b/tests/message/unittest_message_definition.py index 6bf7c8b53c6..d3cbdbb8879 100644 --- a/tests/message/unittest_message_definition.py +++ b/tests/message/unittest_message_definition.py @@ -4,6 +4,7 @@ # For details: https://github.com/PyCQA/pylint/blob/master/COPYING import sys +from unittest import mock import pytest @@ -21,10 +22,13 @@ ], ) def test_create_invalid_message_type(msgid, expected): + checker_mock = mock.Mock(name='Checker') + checker_mock.name = 'checker' + with pytest.raises(InvalidMessageError) as invalid_message_error: MessageDefinition.check_msgid(msgid) with pytest.raises(InvalidMessageError) as other_invalid_message_error: - MessageDefinition("checker", msgid, "msg", "descr", "symbol", "scope") + MessageDefinition(checker_mock, msgid, "msg", "descr", "symbol", "scope") assert str(invalid_message_error.value) == expected assert str(other_invalid_message_error.value) == expected diff --git a/tests/unittest_lint.py b/tests/unittest_lint.py index 4f942cd02f5..3252a5e4375 100644 --- a/tests/unittest_lint.py +++ b/tests/unittest_lint.py @@ -743,26 +743,26 @@ def test_error_unexpected_value(self): ) +class _CustomPyLinter(PyLinter): + def should_analyze_file(self, modname, path, is_argument=False): + if os.path.basename(path) == "wrong.py": + return False + + return super(_CustomPyLinter, self).should_analyze_file( + modname, path, is_argument=is_argument + ) + + def test_custom_should_analyze_file(): """Check that we can write custom should_analyze_file that work even for arguments. 
""" - - class CustomPyLinter(PyLinter): - def should_analyze_file(self, modname, path, is_argument=False): - if os.path.basename(path) == "wrong.py": - return False - - return super(CustomPyLinter, self).should_analyze_file( - modname, path, is_argument=is_argument - ) - package_dir = os.path.join(HERE, "regrtest_data", "bad_package") wrong_file = os.path.join(package_dir, "wrong.py") for jobs in [1, 2]: reporter = testutils.TestReporter() - linter = CustomPyLinter() + linter = _CustomPyLinter() linter.config.jobs = jobs linter.config.persistent = 0 linter.open()