New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve Jinja whitespace handling in rules #1647
Changes from 36 commits
019b68d
925dfe8
2ca72e5
d212a7f
542a1c0
66c1141
ebc9e29
b2b3f42
5d00b0d
a19d393
b38e7fa
f0a6d83
c31b4cd
8abfda4
352e419
6f18a25
c8f63c9
6212dcf
42de56b
abe5e2a
651450c
52c88e5
b370d08
39cedc9
996d19b
332aba7
168b7d3
1928ad6
07b0532
c353999
72917dd
d1cd2bc
9845311
16371cc
d5c92c8
2709aab
e06b50e
31ea23e
7ad47e2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{% macro echo(colname) %} | ||
{{colname}} | ||
{% endmacro %} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
{{ config(materialized='view') }} | ||
|
||
with cte_example as ( | ||
select 1 as col_name | ||
), | ||
|
||
final as | ||
( | ||
select | ||
col_name, | ||
{{- echo('col_name') -}} as col_name2 | ||
from | ||
cte_example | ||
) | ||
|
||
select * from final |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
{{ config(materialized='view') }} | ||
|
||
with cte_example as ( | ||
select 1 as col_name | ||
), | ||
|
||
final as | ||
( | ||
select | ||
col_name, | ||
{{- echo('col_name') -}} as col_name2 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This line looks strange after fixing because the templated file itself is pretty strange -- here's the file after rendering, before any fixes. Note that future issues & PRs may improve the appearance when fixing templated files that use whitespace control.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Interestingly, the extra whitespace is EXACTLY the length of the stripped token. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I looked into this. The amount of whitespace is not affected by changing
This results in |
||
from | ||
cte_example | ||
) | ||
|
||
select * from final |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -382,31 +382,31 @@ def fix_string(self) -> Tuple[Any, bool]: | |
" - Skipping edit patch on non-unique templated content: %s", | ||
enriched_patch, | ||
) | ||
continue | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. After my changes, this section of code was no longer covered by tests. I am not familiar with this code and had not touched it in this PR, so for now I added `# pragma: no cover`. I ran the full test suite in |
||
# We have a single occurrence of the thing we want to patch. This | ||
# means we can use its position to place our patch. | ||
new_source_slice = slice( | ||
enriched_patch.source_slice.start + positions[0], | ||
enriched_patch.source_slice.start | ||
+ positions[0] | ||
+ len(enriched_patch.templated_str), | ||
) | ||
enriched_patch = EnrichedFixPatch( | ||
source_slice=new_source_slice, | ||
templated_slice=enriched_patch.templated_slice, | ||
patch_category=enriched_patch.patch_category, | ||
fixed_raw=enriched_patch.fixed_raw, | ||
templated_str=enriched_patch.templated_str, | ||
source_str=enriched_patch.source_str, | ||
) | ||
linter_logger.debug( | ||
" * Keeping Tricky Case. Positions: %s, New Slice: %s, Patch: %s", | ||
positions, | ||
new_source_slice, | ||
enriched_patch, | ||
) | ||
filtered_source_patches.append(enriched_patch) | ||
dedupe_buffer.append(enriched_patch.dedupe_tuple()) | ||
else: # pragma: no cover | ||
# We have a single occurrence of the thing we want to patch. This | ||
# means we can use its position to place our patch. | ||
new_source_slice = slice( | ||
enriched_patch.source_slice.start + positions[0], | ||
enriched_patch.source_slice.start | ||
+ positions[0] | ||
+ len(enriched_patch.templated_str), | ||
) | ||
enriched_patch = EnrichedFixPatch( | ||
source_slice=new_source_slice, | ||
templated_slice=enriched_patch.templated_slice, | ||
patch_category=enriched_patch.patch_category, | ||
fixed_raw=enriched_patch.fixed_raw, | ||
templated_str=enriched_patch.templated_str, | ||
source_str=enriched_patch.source_str, | ||
) | ||
linter_logger.debug( | ||
" * Keeping Tricky Case. Positions: %s, New Slice: %s, Patch: %s", | ||
positions, | ||
new_source_slice, | ||
enriched_patch, | ||
) | ||
filtered_source_patches.append(enriched_patch) | ||
dedupe_buffer.append(enriched_patch.dedupe_tuple()) | ||
continue | ||
|
||
# Sort the patches before building up the file. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -310,7 +310,7 @@ def templated_slice_to_source_slice( | |
if ts_start_sf_start < len(self.sliced_file): | ||
return self.sliced_file[1].source_slice | ||
else: | ||
return self.sliced_file[-1].source_slice | ||
return self.sliced_file[-1].source_slice # pragma: no cover | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why has this gone out of cover? Is this still the right return value? Or should the first "We should never get here" clause include this (i.e. a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know why -- I think this is one of those areas where there are no explicit tests, it's just tested implicitly by higher-level code. Presumably, updating the raw / templated mapping caused it to stop hitting this code. I can't say if it's no longer necessary or if there's just no test case that's hitting it anymore. |
||
else: | ||
start_slices = self.sliced_file[ts_start_sf_start:ts_start_sf_stop] | ||
if ts_stop_sf_start == ts_stop_sf_stop: # pragma: no cover TODO? | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -330,12 +330,52 @@ def _slice_template(cls, in_str: str) -> Iterator[RawFileSlice]: | |
} | ||
|
||
# https://jinja.palletsprojects.com/en/2.11.x/api/#jinja2.Environment.lex | ||
for _, elem_type, raw in env.lex(cls._preprocess_template(in_str)): | ||
for _, elem_type, raw in env.lex(in_str): | ||
if elem_type == "data": | ||
yield RawFileSlice(raw, "literal", idx) | ||
idx += len(raw) | ||
continue | ||
str_buff += raw | ||
|
||
if elem_type.endswith("_begin"): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need a corresponding tag for skipped characters after the tag too? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Surprisingly, no. To confirm this, I added two more test cases to
I confirmed that test doesn't hit the Jinja lexer "strip" code. AFAICT, adding one is not needed. Also note that in the lexer, there's a token type There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure that's the case. If you put this SQL in a
And run So it definitely does something. So confused why it's not needed. Or is the fact you've not coded that the real reason for the strange "after" in above test case? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I confirmed your point, @tunetheweb, and confirmed this in detail by logging No right strip:
Right strip:
Very sloppy work on my part! I'll dig into There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It does seem, though, that with right stripping, no whitespace is missing from There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Partial answer: right stripping is handled entirely in the tokenizer regexes, e.g.
Note the left vs right hand side of the I'm unsure what the "correct" behavior is here, but if right stripping needs to behave the same as left stripping, we'd want to detect when this happens and split the end token into two slices: the end token itself and a second token with the trailing whitespace. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, I added handling for right stripping. As noted above, no whitespace was missing from I also updated a couple of right stripping test cases. |
||
# When a "begin" tag (whether block, comment, or data) uses | ||
# whitespace stripping | ||
# (https://jinja.palletsprojects.com/en/3.0.x/templates/#whitespace-control), | ||
# the Jinja lex() function handles this by discarding adjacent | ||
# whitespace from in_str. For more insight, see the tokeniter() | ||
# function in this file: | ||
# https://github.com/pallets/jinja/blob/main/src/jinja2/lexer.py | ||
# We want to detect and correct for this in order to: | ||
# - Correctly update "idx" (if this is wrong, that's a | ||
# potential DISASTER because lint fixes use this info to | ||
# update the source file, and incorrect values often result in | ||
# CORRUPTING the user's file so it's no longer valid SQL. :-O | ||
# - Guarantee that the slices we return fully "cover" the | ||
# contents of in_str. | ||
# | ||
# We detect skipped characters by looking ahead in in_str for | ||
# the token just returned from lex(). The token text will either | ||
# be at the current 'idx' position (if whitespace stripping did | ||
# not occur) OR it'll be farther along in in_str, but we're | ||
# GUARANTEED that lex() only skips over WHITESPACE; nothing else. | ||
|
||
# Find the token returned. Did lex() skip over any characters? | ||
num_chars_skipped = in_str.index(raw, idx) - idx | ||
if num_chars_skipped: | ||
# Yes. It skipped over some characters. Compute a string | ||
# containing the skipped characters. | ||
skipped_str = in_str[idx : idx + num_chars_skipped] | ||
|
||
# Sanity check: Verify that Jinja only skips over | ||
# WHITESPACE, never anything else. | ||
if not skipped_str.isspace(): # pragma: no cover | ||
templater_logger.warning( | ||
"Jinja lex() skipped non-whitespace: %s", skipped_str | ||
) | ||
# Treat the skipped whitespace as a literal. | ||
yield RawFileSlice(skipped_str, "literal", idx) | ||
idx += num_chars_skipped | ||
Comment on lines
+363
to
+377
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is amazing!!! Seems so simple. |
||
|
||
# raw_end and raw_begin behave a little differently in | ||
# that the whole tag shows up in one go rather than getting | ||
# parts of the tag at a time. | ||
|
@@ -361,22 +401,23 @@ def _slice_template(cls, in_str: str) -> Iterator[RawFileSlice]: | |
block_type = "block_start" | ||
if trimmed_content.split()[0] == "for": | ||
block_subtype = "loop" | ||
yield RawFileSlice(str_buff, block_type, idx, block_subtype) | ||
idx += len(str_buff) | ||
m = re.search(r"\s+$", raw, re.MULTILINE | re.DOTALL) | ||
if raw.startswith("-") and m: | ||
# Right whitespace was stripped. Split off the trailing | ||
# whitespace into a separate slice. The desired behavior is | ||
# to behave similarly as the left stripping case above. | ||
# Note that the stakes are a bit different, because lex() | ||
# hasn't *omitted* any characters from the strings it | ||
# returns, it has simply grouped them differently than we | ||
# want. | ||
trailing_chars = len(m.group(0)) | ||
yield RawFileSlice( | ||
str_buff[:-trailing_chars], block_type, idx, block_subtype | ||
) | ||
idx += len(str_buff) - trailing_chars | ||
yield RawFileSlice(str_buff[-trailing_chars:], "literal", idx) | ||
idx += trailing_chars | ||
else: | ||
yield RawFileSlice(str_buff, block_type, idx, block_subtype) | ||
idx += len(str_buff) | ||
str_buff = "" | ||
|
||
@classmethod | ||
def _preprocess_template(cls, in_str: str) -> str: | ||
"""Does any preprocessing of the template required before expansion.""" | ||
# Using Jinja whitespace stripping (e.g. `{%-` or `-%}`) breaks the | ||
# position markers between unlexed and lexed file. So let's ignore any | ||
# request to do that before lexing, by replacing '-' with '+' | ||
# | ||
# Note: '+' is the default, so shouldn't really be needed but we | ||
# explicitly state that to preserve the space for the missing '-' character | ||
# so it looks the same. | ||
in_str = in_str.replace("{%-", "{%+") | ||
in_str = in_str.replace("-%}", "+%}") | ||
in_str = in_str.replace("{#-", "{#+") | ||
in_str = in_str.replace("-#}", "+#}") | ||
return in_str |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Previously,
sqlfluff fix
was corrupting this templated line of code.