New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Prevent rules incorrectly returning conflicting fixes to same position #2830
Changes from 23 commits
4de2882
65a95b0
e51aa4b
9e8a5f0
89dbc95
bf87ff9
12a9383
119400d
c75c1bd
b5548b4
2e5561e
a2b5424
c248845
b43748e
255a5d0
6c4b0fa
dfd16b0
bc2b727
b6435e7
49db007
d736c60
4fc8f8d
8aa9170
76f8ba1
6548269
dfdd131
ffd746c
38a0b0e
808d92d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,10 +8,12 @@ | |
analysis. | ||
""" | ||
|
||
from collections import defaultdict | ||
from collections.abc import MutableSet | ||
from copy import deepcopy | ||
from dataclasses import replace | ||
from dataclasses import dataclass, field, replace | ||
from io import StringIO | ||
from typing import Any, Callable, Optional, List, Tuple, NamedTuple, Iterator | ||
from typing import Any, Callable, Dict, Optional, List, Tuple, NamedTuple, Iterator | ||
import logging | ||
|
||
from tqdm import tqdm | ||
|
@@ -50,6 +52,44 @@ class FixPatch(NamedTuple): | |
patch_category: str | ||
|
||
|
||
@dataclass | ||
class AnchorEditInfo: | ||
"""For a given fix anchor, count of the fix edit types and fixes for it.""" | ||
|
||
delete: int = field(default=0) | ||
replace: int = field(default=0) | ||
create_before: int = field(default=0) | ||
create_after: int = field(default=0) | ||
fixes: List = field(default_factory=list) | ||
|
||
def add(self, fix): | ||
"""Adds the fix and updates stats.""" | ||
self.fixes.append(fix) | ||
setattr(self, fix.edit_type, getattr(self, fix.edit_type) + 1) | ||
|
||
@property | ||
def total(self): | ||
"""Returns total count of fixes.""" | ||
return len(self.fixes) | ||
|
||
@property | ||
def is_valid(self): | ||
"""Returns True if valid combination of fixes for anchor. | ||
|
||
Cases: | ||
* 1 fix of any type: Valid | ||
* 2 fixes: Valid if and only if types are create_before and create_after | ||
""" | ||
if self.total <= 1: | ||
# Definitely no duplicates if <= 1. | ||
return True | ||
if self.total != 2: | ||
# Definitely duplicates if > 2. | ||
return False | ||
# Special case: Ok to create before and after same segment. | ||
return self.create_before == 1 and self.create_after == 1 | ||
|
||
|
||
class BaseSegment: | ||
"""The base segment element. | ||
|
||
|
@@ -1008,13 +1048,21 @@ def apply_fixes(self, dialect, rule_code, fixes): | |
else: | ||
seg = todo_buffer.pop(0) | ||
|
||
fix_buff = fixes.copy() | ||
unused_fixes = [] | ||
while fix_buff: | ||
f = fix_buff.pop() | ||
# Look for identity not just equality. | ||
# This handles potential positioning ambiguity. | ||
if f.anchor is seg: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Below, I reworked the core
Bonus: The new logic is more straightforward (dictionary lookup/removal versus making copies of lists and moving things between them). It's probably more efficient as well -- the old logic was scanning all the unused fixes each time, to try and match it against the current segment. |
||
# Look for identity not just equality. | ||
# This handles potential positioning ambiguity. | ||
anchor_info: Optional[AnchorEditInfo] = fixes.pop(id(seg), None) | ||
if anchor_info is not None: | ||
seg_fixes = anchor_info.fixes | ||
if ( | ||
len(seg_fixes) == 2 | ||
and seg_fixes[0].edit_type == "create_after" | ||
): # pragma: no cover | ||
# Must be create_before & create_after. Swap so the | ||
# "before" comes first. | ||
seg_fixes.reverse() | ||
|
||
for f in anchor_info.fixes: | ||
assert f.anchor is seg | ||
fixes_applied.append(f) | ||
linter_logger.debug( | ||
"Matched fix against segment: %s -> %s", f, seg | ||
|
@@ -1027,9 +1075,13 @@ def apply_fixes(self, dialect, rule_code, fixes): | |
"create_before", | ||
"create_after", | ||
): | ||
if f.edit_type == "create_after": | ||
# in the case of a creation after, also add this | ||
# segment before the edit. | ||
if ( | ||
f.edit_type == "create_after" | ||
and len(anchor_info.fixes) == 1 | ||
): | ||
# in the case of a creation after that is not part | ||
# of a create_before/create_after pair, also add | ||
# this segment before the edit. | ||
seg_buffer.append(seg) | ||
|
||
# We're doing a replacement (it could be a single | ||
|
@@ -1051,18 +1103,8 @@ def apply_fixes(self, dialect, rule_code, fixes): | |
f.edit_type, f | ||
) | ||
) | ||
# We've applied a fix here. Move on, this also consumes the | ||
# fix | ||
# TODO: Maybe deal with overlapping fixes later. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Delete this comment now that we're dealing with them (by warning and discarding) |
||
break | ||
else: | ||
# We've not used the fix so we should keep it in the list | ||
# for later. | ||
unused_fixes.append(f) | ||
else: | ||
seg_buffer.append(seg) | ||
# Switch over the the unused list | ||
fixes = unused_fixes + fix_buff | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note how this tricky "end of loop" logic all goes away now that we're using a dictionary instead. |
||
# Invalidate any caches | ||
self.invalidate_caches() | ||
|
||
|
@@ -1090,6 +1132,17 @@ def apply_fixes(self, dialect, rule_code, fixes): | |
else: | ||
return self, fixes | ||
|
||
@classmethod | ||
def compute_anchor_edit_info(cls, fixes) -> Dict[int, AnchorEditInfo]: | ||
"""Group and count fixes by anchor, return dictionary.""" | ||
anchor_info = defaultdict(AnchorEditInfo) # type: ignore | ||
for fix in fixes: | ||
# :TRICKY: Use segment id() as the dictionary key since | ||
# different segments may compare as equal. | ||
anchor_id = id(fix.anchor) | ||
anchor_info[anchor_id].add(fix) | ||
return dict(anchor_info) | ||
|
||
def _validate_segment_after_fixes(self, rule_code, dialect, fixes_applied, segment): | ||
"""Checks correctness of new segment against match or parse grammar.""" | ||
root_parse_context = RootParseContext(dialect=dialect) | ||
|
@@ -1117,7 +1170,7 @@ def _validate_segment_after_fixes(self, rule_code, dialect, fixes_applied, segme | |
|
||
@staticmethod | ||
def _log_apply_fixes_check_issue(message, *args): # pragma: no cover | ||
linter_logger.warning(message, *args) | ||
linter_logger.critical(message, *args) | ||
|
||
def iter_patches(self, templated_str: str) -> Iterator[FixPatch]: | ||
"""Iterate through the segments generating fix patches. | ||
|
@@ -1336,3 +1389,41 @@ def get_table_references(self): | |
for stmt in self.get_children("statement"): | ||
references |= stmt.get_table_references() | ||
return references | ||
|
||
|
||
class IdentitySet(MutableSet): | ||
"""Similar to built-in set(), but based on object IDENTITY. | ||
|
||
This is often important when working with BaseSegment and other types, | ||
where different object instances may compare as equal. | ||
|
||
Copied from: https://stackoverflow.com/questions/16994307/identityset-in-python | ||
""" | ||
|
||
key = id # should return a hashable object | ||
|
||
def __init__(self, iterable=()): | ||
self.map = {} # id -> object | ||
self |= iterable # add elements from iterable to the set (union) | ||
|
||
def __len__(self): # Sized | ||
return len(self.map) | ||
|
||
def __iter__(self): # Iterable | ||
return self.map.values().__iter__() # pragma: no cover | ||
|
||
def __contains__(self, x): # Container | ||
return self.key(x) in self.map | ||
|
||
def add(self, value): # MutableSet | ||
"""Add an element.""" | ||
self.map[self.key(value)] = value | ||
|
||
def discard(self, value): # MutableSet | ||
"""Remove an element. Do not raise an exception if absent.""" | ||
self.map.pop(self.key(value), None) # pragma: no cover | ||
|
||
def __repr__(self): # pragma: no cover | ||
if not self: | ||
return "%s()" % (self.__class__.__name__,) | ||
return "%s(%r)" % (self.__class__.__name__, list(self)) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
from sqlfluff.core.parser import WhitespaceSegment | ||
|
||
from sqlfluff.core.parser import BaseSegment, NewlineSegment | ||
from sqlfluff.core.parser.segments.base import IdentitySet | ||
from sqlfluff.core.rules.base import BaseRule, LintFix, LintResult, RuleContext | ||
from sqlfluff.core.rules.doc_decorators import document_fix_compatible | ||
from sqlfluff.core.rules.functional import Segments | ||
|
@@ -254,6 +255,7 @@ def _eval_single_select_target_element( | |
|
||
def _fixes_for_move_after_select_clause( | ||
stop_seg: BaseSegment, | ||
delete_segments: Optional[Segments] = None, | ||
add_newline: bool = True, | ||
) -> List[LintFix]: | ||
"""Cleans up by moving leftover select_clause segments. | ||
|
@@ -277,10 +279,25 @@ def _fixes_for_move_after_select_clause( | |
start_seg=start_seg, | ||
stop_seg=stop_seg, | ||
) | ||
fixes = [ | ||
LintFix.delete(seg) for seg in move_after_select_clause | ||
# :TRICKY: Below, we have a couple places where we | ||
# filter to guard against deleting the same segment | ||
# multiple times -- this is illegal. | ||
# :TRICKY: Use IdentitySet rather than set() since | ||
# different segments may compare as equal. | ||
all_deletes = IdentitySet( | ||
fix.anchor for fix in fixes if fix.edit_type == "delete" | ||
) | ||
fixes_ = [] | ||
for seg in delete_segments or []: | ||
if seg not in all_deletes: | ||
fixes.append(LintFix.delete(seg)) | ||
all_deletes.add(seg) | ||
fixes_ += [ | ||
LintFix.delete(seg) | ||
for seg in move_after_select_clause | ||
if seg not in all_deletes | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @tunetheweb: I'd like your thoughts on a question. This PR adds a new feature that prohibits fixes with the same anchor with one exception: It's okay to have 2 fixes, one Should we also allow multiple deletes of the same segment? There's no ambiguity there, and it would avoid needing to "fix" this rule as well as L053. It's kind of a philosophical question. If we decide to allow multiple deletes, we make it easier on rule writers. On the other hand, we're letting them be a bit sloppy. 🤷♂️ I could be convinced either way. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmmm.... I think safer to not allow that. In theory it shouldn't be needed so the developer ease is not a strong enough argument in my mind. And curious why L036 currently does it? Looked at the code but wasn't immediately apparent to me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's not a "good" reason, and I am not that familiar with L036 details, but basically, there are a couple places that delete unnecessary whitespace, and there was no bookkeeping, so in some cases, the same whitespace gets deleted twice. Happy to leave the fix checker "as is" for now (i.e. not allow multiple deletes). |
||
] | ||
fixes.append( | ||
fixes_.append( | ||
LintFix.create_after( | ||
self._choose_anchor_segment( | ||
context, | ||
|
@@ -292,7 +309,7 @@ def _fixes_for_move_after_select_clause( | |
+ list(move_after_select_clause), | ||
) | ||
) | ||
return fixes | ||
return fixes_ | ||
|
||
if select_stmt.segments[after_select_clause_idx].is_type("newline"): | ||
# Since we're deleting the newline, we should also delete all | ||
|
@@ -304,10 +321,7 @@ def _fixes_for_move_after_select_clause( | |
loop_while=sp.is_type("whitespace"), | ||
start_seg=select_children[start_idx], | ||
) | ||
fixes += [LintFix.delete(seg) for seg in to_delete] | ||
|
||
if to_delete: | ||
|
||
# The select_clause is immediately followed by a | ||
# newline. Delete the newline in order to avoid leaving | ||
# behind an empty line after fix, *unless* we stopped | ||
|
@@ -325,15 +339,15 @@ def _fixes_for_move_after_select_clause( | |
) | ||
|
||
fixes += _fixes_for_move_after_select_clause( | ||
to_delete[-1], | ||
to_delete[-1], to_delete | ||
) | ||
elif select_stmt.segments[after_select_clause_idx].is_type( | ||
"whitespace" | ||
): | ||
# The select_clause has stuff after (most likely a comment) | ||
# Delete the whitespace immediately after the select clause | ||
# so the other stuff aligns nicely based on where the select | ||
# clause started | ||
# clause started. | ||
fixes += [ | ||
LintFix.delete( | ||
select_stmt.segments[after_select_clause_idx], | ||
|
@@ -354,11 +368,10 @@ def _fixes_for_move_after_select_clause( | |
loop_while=sp.is_type("whitespace"), | ||
start_seg=select_children[select_clause_idx - 1], | ||
) | ||
fixes += [LintFix.delete(seg) for seg in to_delete] | ||
|
||
if to_delete: | ||
fixes += _fixes_for_move_after_select_clause( | ||
to_delete[-1], | ||
to_delete, | ||
# If we deleted a newline, create a newline. | ||
any(seg for seg in to_delete if seg.is_type("newline")), | ||
) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ | |
from typing import List, Optional | ||
|
||
from sqlfluff.core.parser import WhitespaceSegment | ||
|
||
from sqlfluff.core.parser.segments.base import IdentitySet | ||
from sqlfluff.core.rules.base import BaseRule, LintFix, LintResult, RuleContext | ||
from sqlfluff.core.rules.doc_decorators import document_fix_compatible | ||
from sqlfluff.core.rules.functional import sp | ||
|
@@ -38,6 +38,11 @@ def _eval(self, context: RuleContext) -> Optional[List[LintResult]]: | |
prev_newline = True | ||
prev_whitespace = None | ||
violations = [] | ||
memory = context.memory | ||
if not memory: | ||
# Use memory to avoid returning multiple fixes with the same anchor. | ||
# (This is illegal.) | ||
memory = dict(fix_anchors=IdentitySet()) | ||
for seg in context.segment.segments: | ||
if seg.is_meta: | ||
continue | ||
|
@@ -119,4 +124,24 @@ def _eval(self, context: RuleContext) -> Optional[List[LintResult]]: | |
) | ||
) | ||
|
||
return violations or None | ||
final_violations = [] | ||
if violations: | ||
# Check each violation. If any of its fixes uses the same anchor | ||
# as a previously returned fix, discard it. The linter can't handle | ||
# applying fixes like this. Skipping this issue is okay because it | ||
# will be detected and fixed during the next linter pass. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this code in L039? Feels like core code that should be in BaseRule. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't want L039 had duplicate anchors on this new test case (extracted from one of the
L039 wants to make two fixes to the line
Thus, it's trying to both replace and delete the same whitespace. If we return both fixes, the core linter will (appropriately) complain and discard both fixes. This bookkeeping ensures that both get fixed, but it's split across two passes through the linter loop. There may be a smarter way to do this, but this approach seems reasonable. I'm trying really hard not to do big rewrites of existing rule code during these PRs -- the goal is to eliminate the critical errors but try and avoid going down a 🐰 hole. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks makes sense. Maybe add a comment saying, something like: This rule works in two steps to remove unnecessary white space:
This can result in two delete being applied to same segment so area so check for that and replace with single delete. |
||
for violation in violations: | ||
if not any( | ||
[ | ||
fix | ||
for fix in violation.fixes | ||
if fix.anchor in memory["fix_anchors"] | ||
] | ||
): | ||
final_violations.append(violation) | ||
for fix in violation.fixes: | ||
memory["fix_anchors"].add(fix.anchor) | ||
if not final_violations: | ||
final_violations.append(LintResult()) | ||
final_violations[-1].memory = memory | ||
return final_violations |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -69,6 +69,12 @@ class Rule_L050(BaseRule): | |
|
||
def _eval(self, context: RuleContext) -> Optional[LintResult]: | ||
"""Files must not begin with newlines or whitespace.""" | ||
# Only check raw segments. This ensures we don't try and delete the same | ||
# whitespace multiple times (i.e. for non-raw segments higher in the | ||
# tree). | ||
if not context.segment.is_raw(): | ||
return None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we allowed multiple deletions, we wouldn't need this change. |
||
|
||
# If parent_stack is empty we are currently at FileSegment. | ||
if len(context.parent_stack) == 0: | ||
return None | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we add a comment explaining what each of these three parts are for?
First one I think it covered by the error message.
Second is what? When there are errors but no fixes?
Third is what the good case? We have fixes and they look valid?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will do.