Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Similar check: Pass min_lines to recombined. #4175

Merged
merged 1 commit into from
Jul 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ Release date: TBA

Closes #3826

* Improved the Similarity checker performance.
* Improved the Similarity checker performance. Fix issue with ``--min-similarity-lines`` used with ``--jobs``.

Closes #4120
Closes #4118

* The default for ``PYLINTHOME`` is now the standard ``XDG_CACHE_HOME``, and pylint now uses ``appdirs``.

Expand Down
2 changes: 1 addition & 1 deletion pylint/lint/parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def check_parallel(linter, jobs, files, arguments=None):
pool.join()

_merge_mapreduce_data(linter, all_mapreduce_data)
linter.stats = _merge_stats(all_stats)
linter.stats = _merge_stats([linter.stats] + all_stats)

# Insert stats data to local checkers.
for checker in linter.get_checkers():
Expand Down
142 changes: 142 additions & 0 deletions tests/test_check_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,20 +67,96 @@ def process_module(self, _astroid):
self.data.append(record)


class ParallelTestChecker(BaseChecker):
    """A checker that does need to consolidate data.

    To simulate the need to consolidate data, this checker only
    reports a message for pairs of files.

    On non-parallel builds: it works on all the files in a single run.

    On parallel builds: lint.parallel calls ``open`` once per file.

    So if files are treated by separate processes, no messages will be
    raised from the individual process, all messages will be raised
    from reduce_map_data.
    """

    __implements__ = (pylint.interfaces.IRawChecker,)

    name = "parallel-checker"
    test_data = "parallel"
    msgs = {
        "R9999": (
            "Test %s",
            "parallel-test-check",
            "Some helpful text.",
        )
    }

    def __init__(self, linter, *args, **kwargs):
        super().__init__(linter, *args, **kwargs)
        # Per-run record of processed files; also serves as this checker's
        # map-data (see get_map_data/reduce_map_data).
        self.data = []
        self.linter = linter
        self.stats = None

    def open(self):
        """init the checkers: reset statistics information"""
        self.stats = self.linter.add_stats()
        self.data = []

    def close(self):
        # Emit one message per *pair* of files seen by this process; with an
        # odd or single file count, the dangling file produces no message here.
        for _ in self.data[1::2]:  # Work on pairs of files, see class docstring.
            self.add_message("R9999", args=("From process_module, two files seen.",))

    def get_map_data(self):
        """Return this process's accumulated data for the reduce step."""
        return self.data

    def reduce_map_data(self, linter, data):
        """Combine per-process data lists and emit one message per file pair."""
        recombined = type(self)(linter)
        recombined.open()
        aggregated = []
        for d in data:
            aggregated.extend(d)
        for _ in aggregated[1::2]:  # Work on pairs of files, see class docstring.
            self.add_message("R9999", args=("From reduce_map_data",))
        recombined.close()

    def process_module(self, _astroid):
        """Called once per stream/file/astroid object"""
        # record the number of invocations with the data object
        record = self.test_data + str(len(self.data))
        self.data.append(record)


class ExtraSequentialTestChecker(SequentialTestChecker):
    """Second sequential checker; needs no cross-run data consolidation."""

    name = "extra-sequential-checker"
    test_data = "extra-sequential"


class ExtraParallelTestChecker(ParallelTestChecker):
    """Second parallel checker; consolidates data across run invocations."""

    name = "extra-parallel-checker"
    test_data = "extra-parallel"


class ThirdSequentialTestChecker(SequentialTestChecker):
    """Third sequential checker; needs no cross-run data consolidation."""

    name = "third-sequential-checker"
    test_data = "third-sequential"


class ThirdParallelTestChecker(ParallelTestChecker):
    """Third parallel checker; consolidates data across run invocations."""

    name = "third-parallel-checker"
    test_data = "third-parallel"


class TestCheckParallelFramework:
"""Tests the check_parallel() function's framework"""

Expand Down Expand Up @@ -402,3 +478,69 @@ def test_compare_workers_to_single_proc(self, num_files, num_jobs, num_checkers)
assert (
stats_check_parallel == expected_stats
), "The lint is returning unexpected results, has something changed?"

@pytest.mark.parametrize(
    "num_files,num_jobs,num_checkers",
    [
        (2, 2, 1),
        (2, 2, 2),
        (2, 2, 3),
        (3, 2, 1),
        (3, 2, 2),
        (3, 2, 3),
        (3, 1, 1),
        (3, 1, 2),
        (3, 1, 3),
        (3, 5, 1),
        (3, 5, 2),
        (3, 5, 3),
        (10, 2, 1),
        (10, 2, 2),
        (10, 2, 3),
        (2, 10, 1),
        (2, 10, 2),
        (2, 10, 3),
    ],
)
def test_map_reduce(self, num_files, num_jobs, num_checkers):
    """Compares the 3 key parameters for check_parallel() produces the same results

    The intent here is to validate the reduce step: no stats should be lost.

    Checks regression of https://github.com/PyCQA/pylint/issues/4118
    """
    # define the stats we expect to get back from the runs, these should only vary
    # with the number of files.
    file_infos = _gen_file_datas(num_files)

    # Loop for single-proc and multi-proc so we can ensure the same linter-config
    for do_single_proc in range(2):
        linter = PyLinter(reporter=Reporter())

        # Assign between 1 and 3 checkers to the linter, they should not change the
        # results of the lint
        linter.register_checker(ParallelTestChecker(linter))
        if num_checkers > 1:
            linter.register_checker(ExtraParallelTestChecker(linter))
        if num_checkers > 2:
            linter.register_checker(ThirdParallelTestChecker(linter))

        if do_single_proc:
            # establish the baseline
            assert (
                linter.config.jobs == 1
            ), "jobs>1 are ignored when calling _check_files"
            linter._check_files(linter.get_ast, file_infos)
            stats_single_proc = linter.stats
        else:
            check_parallel(
                linter,
                jobs=num_jobs,
                files=file_infos,
                arguments=None,
            )
            stats_check_parallel = linter.stats
    # Both runs completed with identical config; the reduce step must not
    # lose or duplicate any messages.
    assert (
        stats_single_proc["by_msg"] == stats_check_parallel["by_msg"]
    ), "Single-proc and check_parallel() should return the same thing"