From c0b8dee95a5412f395486a9bcb4959f93509cecb Mon Sep 17 00:00:00 2001 From: Koichi ITO Date: Fri, 15 Mar 2024 01:22:14 +0900 Subject: [PATCH] [ruby/prism] Fix an AST and token incompatibility for `Prism::Translation::Parser` This PR fixes an AST and token incompatibility between Parser gem and `Prism::Translation::Parser` for dstring literal: ```ruby "foo #{bar}" ``` ## Parser gem (Expected) ```console $ bundle exec ruby -Ilib -rparser/ruby33 -ve \ 'buf = Parser::Source::Buffer.new("example.rb"); buf.source = File.read("example.rb"); p Parser::Ruby33.new.tokenize(buf)' ruby 3.3.0 (2023-12-25 revision https://github.com/ruby/prism/commit/5124f9ac75) [x86_64-darwin22] [s(:dstr, s(:str, "foo\n"), s(:str, " "), s(:begin, s(:send, nil, :bar))), [], [[:tSTRING_BEG, ["\"", #]], [:tSTRING_CONTENT, ["foo\n", #]], [:tSTRING_CONTENT, [" ", #]], [:tSTRING_DBEG, ["\#{", #]], [:tIDENTIFIER, ["bar", #]], [:tSTRING_DEND, ["}", #]], [:tSTRING_END, ["\"", #]], [:tNL, [nil, #]]]] ``` ## `Prism::Translation::Parser` (Actual) Previously, the AST and tokens returned by the Parser gem were different. In this case, `dstr` node should not be nested: ```console $ bundle exec ruby -Ilib -rprism -rprism/translation/parser33 -ve \ 'buf = Parser::Source::Buffer.new("example.rb"); buf.source = File.read("example.rb"); p Prism::Translation::Parser33.new.tokenize(buf)' ruby 3.3.0 (2023-12-25 revision https://github.com/ruby/prism/commit/5124f9ac75) [x86_64-darwin22] [s(:dstr, s(:dstr, s(:str, "foo\n"), s(:str, " ")), s(:begin, s(:send, nil, :bar))), [], [[:tSTRING_BEG, ["\"", #]], [:tSTRING_CONTENT, ["foo\n", #]], [:tSTRING_CONTENT, [" ", #]], [:tSTRING_DBEG, ["\#{", #]], [:tIDENTIFIER, ["bar", #]], [:tSTRING_DEND, ["}", #]], [:tSTRING_END, ["\"", #]], [:tNL, [nil, #]]]] ``` After this correction, the AST and tokens returned by the Parser gem are the same: ```console $ bundle exec ruby -Ilib -rprism -rprism/translation/parser33 -ve \ 'buf = Parser::Source::Buffer.new("example.rb"); buf.source = File.read("example.rb"); p Prism::Translation::Parser33.new.tokenize(buf)' ruby 3.3.0 (2023-12-25 revision https://github.com/ruby/prism/commit/5124f9ac75) [x86_64-darwin22] [s(:dstr, s(:str, "foo\n"), s(:str, " "), s(:begin, s(:send, nil, :bar))), [], [[:tSTRING_BEG, ["\"", #]], [:tSTRING_CONTENT, ["foo\n", #]], [:tSTRING_CONTENT, [" ", #]], [:tSTRING_DBEG, ["\#{", #]], [:tIDENTIFIER, ["bar", #]], [:tSTRING_DEND, ["}", #]], [:tSTRING_END, ["\"", #]], [:tNL, [nil, #]]]] ``` https://github.com/ruby/prism/commit/c1652a9ee7 --- lib/prism/translation/parser/compiler.rb | 41 ++++++++--- lib/prism/translation/parser/lexer.rb | 20 +++++- test/prism/fixtures/dstring.txt | 27 ++++++++ test/prism/snapshots/dstring.txt | 88 +++++++++++++++++++++--- 4 files changed, 156 insertions(+), 20 deletions(-) diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb index 4eb2c4a8da9002..0503d003f52305 100644 --- a/lib/prism/translation/parser/compiler.rb +++ b/lib/prism/translation/parser/compiler.rb @@ -953,14 +953,35 @@ def visit_interpolated_regular_expression_node(node) def visit_interpolated_string_node(node) if node.heredoc? children, closing = visit_heredoc(node) - builder.string_compose(token(node.opening_loc), children, closing) + + return builder.string_compose(token(node.opening_loc), children, closing) + end + + parts = if node.parts.one? { |part| part.type == :string_node } + node.parts.flat_map do |node| + if node.type == :string_node && node.unescaped.lines.count >= 2 + start_offset = node.content_loc.start_offset + + node.unescaped.lines.map do |line| + end_offset = start_offset + line.length + offsets = srange_offsets(start_offset, end_offset) + start_offset = end_offset + + builder.string_internal([line, offsets]) + end + else + visit(node) + end + end else - builder.string_compose( - token(node.opening_loc), - visit_all(node.parts), - token(node.closing_loc) - ) + visit_all(node.parts) end + + builder.string_compose( + token(node.opening_loc), + parts, + token(node.closing_loc) + ) end # :"foo #{bar}" @@ -1492,17 +1513,17 @@ def visit_string_node(node) elsif node.opening == "?" builder.character([node.unescaped, srange(node.location)]) else - parts = if node.unescaped.lines.count <= 1 + parts = if node.content.lines.count <= 1 || node.unescaped.lines.count <= 1 [builder.string_internal([node.unescaped, srange(node.content_loc)])] else start_offset = node.content_loc.start_offset - node.unescaped.lines.map do |line| - end_offset = start_offset + line.length + [node.content.lines, node.unescaped.lines].transpose.map do |content_line, unescaped_line| + end_offset = start_offset + content_line.length offsets = srange_offsets(start_offset, end_offset) start_offset = end_offset - builder.string_internal([line, offsets]) + builder.string_internal([unescaped_line, offsets]) end end diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb index 9cf86476ba6c37..7febca449e8aaa 100644 --- a/lib/prism/translation/parser/lexer.rb +++ b/lib/prism/translation/parser/lexer.rb @@ -295,8 +295,24 @@ def to_a unless (lines = token.value.lines).one? start_offset = offset_cache[token.location.start_offset] lines.map do |line| - end_offset = start_offset + line.length - tokens << [:tSTRING_CONTENT, [line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]] + newline = line.end_with?("\r\n") ? "\r\n" : "\n" + chomped_line = line.chomp + if match = chomped_line.match(/(?\\+)\z/) + adjustment = match[:backslashes].size / 2 + adjusted_line = chomped_line.delete_suffix("\\" * adjustment) + if match[:backslashes].size.odd? + adjusted_line.delete_suffix!("\\") + adjustment += 2 + else + adjusted_line << newline + end + else + adjusted_line = line + adjustment = 0 + end + + end_offset = start_offset + adjusted_line.length + adjustment + tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]] start_offset = end_offset end next diff --git a/test/prism/fixtures/dstring.txt b/test/prism/fixtures/dstring.txt index b7a0958d3f8dee..085e0c6852a1bf 100644 --- a/test/prism/fixtures/dstring.txt +++ b/test/prism/fixtures/dstring.txt @@ -1,2 +1,29 @@ "foo bar" + +"foo + #{bar}" + +"fo +o" "ba +r" + +" +foo\ +" + +" +foo\\ +" + +" +foo\\\ +" + +" +foo\\\\ +" + +" +foo\\\\\ +" diff --git a/test/prism/snapshots/dstring.txt b/test/prism/snapshots/dstring.txt index a24eaf193af61b..ad395f8a8eb94b 100644 --- a/test/prism/snapshots/dstring.txt +++ b/test/prism/snapshots/dstring.txt @@ -1,11 +1,83 @@ -@ ProgramNode (location: (1,0)-(2,6)) +@ ProgramNode (location: (1,0)-(29,1)) ├── locals: [] └── statements: - @ StatementsNode (location: (1,0)-(2,6)) - └── body: (length: 1) - └── @ StringNode (location: (1,0)-(2,6)) + @ StatementsNode (location: (1,0)-(29,1)) + └── body: (length: 8) + ├── @ StringNode (location: (1,0)-(2,6)) + │ ├── flags: ∅ + │ ├── opening_loc: (1,0)-(1,1) = "\"" + │ ├── content_loc: (1,1)-(2,5) = "foo\n bar" + │ ├── closing_loc: (2,5)-(2,6) = "\"" + │ └── unescaped: "foo\n bar" + ├── @ InterpolatedStringNode (location: (4,0)-(5,9)) + │ ├── opening_loc: (4,0)-(4,1) = "\"" + │ ├── parts: (length: 2) + │ │ ├── @ StringNode (location: (4,1)-(5,2)) + │ │ │ ├── flags: ∅ + │ │ │ ├── opening_loc: ∅ + │ │ │ ├── content_loc: (4,1)-(5,2) = "foo\n " + │ │ │ ├── closing_loc: ∅ + │ │ │ └── unescaped: "foo\n " + │ │ └── @ EmbeddedStatementsNode (location: (5,2)-(5,8)) + │ │ ├── opening_loc: (5,2)-(5,4) = "\#{" + │ │ ├── statements: + │ │ │ @ StatementsNode (location: (5,4)-(5,7)) + │ │ │ └── body: (length: 1) + │ │ │ └── @ CallNode (location: (5,4)-(5,7)) + │ │ │ ├── flags: variable_call, ignore_visibility + │ │ │ ├── receiver: ∅ + │ │ │ ├── call_operator_loc: ∅ + │ │ │ ├── name: :bar + │ │ │ ├── message_loc: (5,4)-(5,7) = "bar" + │ │ │ ├── opening_loc: ∅ + │ │ │ ├── arguments: ∅ + │ │ │ ├── closing_loc: ∅ + │ │ │ └── block: ∅ + │ │ └── closing_loc: (5,7)-(5,8) = "}" + │ └── closing_loc: (5,8)-(5,9) = "\"" + ├── @ InterpolatedStringNode (location: (7,0)-(9,2)) + │ ├── opening_loc: ∅ + │ ├── parts: (length: 2) + │ │ ├── @ StringNode (location: (7,0)-(8,2)) + │ │ │ ├── flags: ∅ + │ │ │ ├── opening_loc: (7,0)-(7,1) = "\"" + │ │ │ ├── content_loc: (7,1)-(8,1) = "fo\no" + │ │ │ ├── closing_loc: (8,1)-(8,2) = "\"" + │ │ │ └── unescaped: "fo\no" + │ │ └── @ StringNode (location: (8,3)-(9,2)) + │ │ ├── flags: ∅ + │ │ ├── opening_loc: (8,3)-(8,4) = "\"" + │ │ ├── content_loc: (8,4)-(9,1) = "ba\nr" + │ │ ├── closing_loc: (9,1)-(9,2) = "\"" + │ │ └── unescaped: "ba\nr" + │ └── closing_loc: ∅ + ├── @ StringNode (location: (11,0)-(13,1)) + │ ├── flags: ∅ + │ ├── opening_loc: (11,0)-(11,1) = "\"" + │ ├── content_loc: (11,1)-(13,0) = "\nfoo\\\n" + │ ├── closing_loc: (13,0)-(13,1) = "\"" + │ └── unescaped: "\nfoo" + ├── @ StringNode (location: (15,0)-(17,1)) + │ ├── flags: ∅ + │ ├── opening_loc: (15,0)-(15,1) = "\"" + │ ├── content_loc: (15,1)-(17,0) = "\nfoo\\\\\n" + │ ├── closing_loc: (17,0)-(17,1) = "\"" + │ └── unescaped: "\nfoo\\\n" + ├── @ StringNode (location: (19,0)-(21,1)) + │ ├── flags: ∅ + │ ├── opening_loc: (19,0)-(19,1) = "\"" + │ ├── content_loc: (19,1)-(21,0) = "\nfoo\\\\\\\n" + │ ├── closing_loc: (21,0)-(21,1) = "\"" + │ └── unescaped: "\nfoo\\" + ├── @ StringNode (location: (23,0)-(25,1)) + │ ├── flags: ∅ + │ ├── opening_loc: (23,0)-(23,1) = "\"" + │ ├── content_loc: (23,1)-(25,0) = "\nfoo\\\\\\\\\n" + │ ├── closing_loc: (25,0)-(25,1) = "\"" + │ └── unescaped: "\nfoo\\\\\n" + └── @ StringNode (location: (27,0)-(29,1)) ├── flags: ∅ - ├── opening_loc: (1,0)-(1,1) = "\"" - ├── content_loc: (1,1)-(2,5) = "foo\n bar" - ├── closing_loc: (2,5)-(2,6) = "\"" - └── unescaped: "foo\n bar" + ├── opening_loc: (27,0)-(27,1) = "\"" + ├── content_loc: (27,1)-(29,0) = "\nfoo\\\\\\\\\\\n" + ├── closing_loc: (29,0)-(29,1) = "\"" + └── unescaped: "\nfoo\\\\"