Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Similar check: Pass min_lines to recombined. #4175

Merged
merged 1 commit into from
Jul 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ Release date: TBA

Closes #3826

* Improved the Similarity checker performance.
* Improved the Similarity checker performance. Fix issue with ``--min-similarity-lines`` used with ``--jobs``.

Closes #4120
Closes #4118

* The default for ``PYLINTHOME`` is now the standard ``XDG_CACHE_HOME``, and pylint now uses ``appdirs``.

Expand Down
2 changes: 1 addition & 1 deletion pylint/lint/parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def check_parallel(linter, jobs, files, arguments=None):
pool.join()

_merge_mapreduce_data(linter, all_mapreduce_data)
linter.stats = _merge_stats(all_stats)
linter.stats = _merge_stats([linter.stats] + all_stats)

# Insert stats data to local checkers.
for checker in linter.get_checkers():
Expand Down
142 changes: 142 additions & 0 deletions tests/test_check_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,20 +67,96 @@ def process_module(self, _astroid):
self.data.append(record)


class ParallelTestChecker(BaseChecker):
    """A checker that does need to consolidate data.

    To simulate the need to consolidate data, this checker only
    reports a message for pairs of files.

    On non-parallel builds: it works on all the files in a single run.

    On parallel builds: lint.parallel calls ``open`` once per file.

    So if files are treated by separate processes, no messages will be
    raised from the individual process, all messages will be raised
    from reduce_map_data.
    """

    __implements__ = (pylint.interfaces.IRawChecker,)

    name = "parallel-checker"
    test_data = "parallel"
    msgs = {
        "R9999": (
            "Test %s",
            "parallel-test-check",
            "Some helpful text.",
        )
    }

    def __init__(self, linter, *args, **kwargs):
        super().__init__(linter, *args, **kwargs)
        # Per-run record of processed files; also serves as this checker's
        # map-data (see get_map_data/reduce_map_data).
        self.data = []
        self.linter = linter
        self.stats = None

    def open(self):
        """init the checkers: reset statistics information"""
        self.stats = self.linter.add_stats()
        self.data = []

    def close(self):
        # Emit one message per *pair* of files seen by this process; with an
        # odd or single file count, the dangling file produces no message here.
        for _ in self.data[1::2]:  # Work on pairs of files, see class docstring.
            self.add_message("R9999", args=("From process_module, two files seen.",))

    def get_map_data(self):
        """Return this process's accumulated data for the reduce step."""
        return self.data

    def reduce_map_data(self, linter, data):
        """Combine per-process data lists and emit one message per file pair."""
        recombined = type(self)(linter)
        recombined.open()
        aggregated = []
        for d in data:
            aggregated.extend(d)
        for _ in aggregated[1::2]:  # Work on pairs of files, see class docstring.
            self.add_message("R9999", args=("From reduce_map_data",))
        recombined.close()

    def process_module(self, _astroid):
        """Called once per stream/file/astroid object"""
        # record the number of invocations with the data object
        record = self.test_data + str(len(self.data))
        self.data.append(record)


class ExtraSequentialTestChecker(SequentialTestChecker):
    """Second sequential checker; needs no cross-run data consolidation."""

    name = "extra-sequential-checker"
    test_data = "extra-sequential"


class ExtraParallelTestChecker(ParallelTestChecker):
    """Second parallel checker; consolidates data across run invocations."""

    name = "extra-parallel-checker"
    test_data = "extra-parallel"


class ThirdSequentialTestChecker(SequentialTestChecker):
    """Third sequential checker; needs no cross-run data consolidation."""

    name = "third-sequential-checker"
    test_data = "third-sequential"


class ThirdParallelTestChecker(ParallelTestChecker):
    """Third parallel checker; consolidates data across run invocations."""

    name = "third-parallel-checker"
    test_data = "third-parallel"


class TestCheckParallelFramework:
"""Tests the check_parallel() function's framework"""

Expand Down Expand Up @@ -402,3 +478,69 @@ def test_compare_workers_to_single_proc(self, num_files, num_jobs, num_checkers)
assert (
stats_check_parallel == expected_stats
), "The lint is returning unexpected results, has something changed?"

@pytest.mark.parametrize(
    "num_files,num_jobs,num_checkers",
    [
        (2, 2, 1),
        (2, 2, 2),
        (2, 2, 3),
        (3, 2, 1),
        (3, 2, 2),
        (3, 2, 3),
        (3, 1, 1),
        (3, 1, 2),
        (3, 1, 3),
        (3, 5, 1),
        (3, 5, 2),
        (3, 5, 3),
        (10, 2, 1),
        (10, 2, 2),
        (10, 2, 3),
        (2, 10, 1),
        (2, 10, 2),
        (2, 10, 3),
    ],
)
def test_map_reduce(self, num_files, num_jobs, num_checkers):
    """Compares the 3 key parameters for check_parallel() produces the same results

    The intent here is to validate the reduce step: no stats should be lost.

    Checks regression of https://github.com/PyCQA/pylint/issues/4118
    """
    # define the stats we expect to get back from the runs, these should only vary
    # with the number of files.
    file_infos = _gen_file_datas(num_files)

    # Loop for single-proc and multi-proc so we can ensure the same linter-config
    for do_single_proc in range(2):
        linter = PyLinter(reporter=Reporter())

        # Assign between 1 and 3 checkers to the linter, they should not change the
        # results of the lint
        linter.register_checker(ParallelTestChecker(linter))
        if num_checkers > 1:
            linter.register_checker(ExtraParallelTestChecker(linter))
        if num_checkers > 2:
            linter.register_checker(ThirdParallelTestChecker(linter))

        if do_single_proc:
            # establish the baseline
            assert (
                linter.config.jobs == 1
            ), "jobs>1 are ignored when calling _check_files"
            linter._check_files(linter.get_ast, file_infos)
            stats_single_proc = linter.stats
        else:
            check_parallel(
                linter,
                jobs=num_jobs,
                files=file_infos,
                arguments=None,
            )
            stats_check_parallel = linter.stats
    # Both runs completed with identical config; the reduce step must not
    # lose or duplicate any messages.
    assert (
        stats_single_proc["by_msg"] == stats_check_parallel["by_msg"]
    ), "Single-proc and check_parallel() should return the same thing"