From c0b8dee95a5412f395486a9bcb4959f93509cecb Mon Sep 17 00:00:00 2001
From: Koichi ITO <koic.ito@gmail.com>
Date: Fri, 15 Mar 2024 01:22:14 +0900
Subject: [PATCH] [ruby/prism] Fix an AST and token incompatibility for
 `Prism::Translation::Parser`

This PR fixes an AST and token incompatibility between Parser gem and `Prism::Translation::Parser`
for dstring literal:

```ruby
"foo
  #{bar}"
```

## Parser gem (Expected)

```console
$ bundle exec ruby -Ilib -rparser/ruby33 -ve \
'buf = Parser::Source::Buffer.new("example.rb"); buf.source = File.read("example.rb"); p Parser::Ruby33.new.tokenize(buf)'
ruby 3.3.0 (2023-12-25 revision https://github.com/ruby/prism/commit/5124f9ac75) [x86_64-darwin22]
[s(:dstr,
  s(:str, "foo\n"),
  s(:str, "  "),
  s(:begin,
    s(:send, nil, :bar))), [], [[:tSTRING_BEG, ["\"", #<Parser::Source::Range example.rb 0...1>]],
[:tSTRING_CONTENT, ["foo\n", #<Parser::Source::Range example.rb 1...5>]],
[:tSTRING_CONTENT, ["  ", #<Parser::Source::Range example.rb 5...7>]],
[:tSTRING_DBEG, ["\#{", #<Parser::Source::Range example.rb 7...9>]],
[:tIDENTIFIER, ["bar", #<Parser::Source::Range example.rb 9...12>]],
[:tSTRING_DEND, ["}", #<Parser::Source::Range example.rb 12...13>]],
[:tSTRING_END, ["\"", #<Parser::Source::Range example.rb 13...14>]], [:tNL, [nil, #<Parser::Source::Range example.rb 14...15>]]]]
```

## `Prism::Translation::Parser` (Actual)

Previously, the AST and tokens returned by the Parser gem were different. In this case, `dstr` node should not be nested:

```console
$ bundle exec ruby -Ilib -rprism -rprism/translation/parser33 -ve \
'buf = Parser::Source::Buffer.new("example.rb"); buf.source = File.read("example.rb"); p Prism::Translation::Parser33.new.tokenize(buf)'
ruby 3.3.0 (2023-12-25 revision https://github.com/ruby/prism/commit/5124f9ac75) [x86_64-darwin22]
[s(:dstr,
  s(:dstr,
    s(:str, "foo\n"),
    s(:str, "  ")),
  s(:begin,
    s(:send, nil, :bar))), [], [[:tSTRING_BEG, ["\"", #<Parser::Source::Range example.rb 0...1>]],
[:tSTRING_CONTENT, ["foo\n", #<Parser::Source::Range example.rb 1...5>]],
[:tSTRING_CONTENT, ["  ", #<Parser::Source::Range example.rb 5...7>]],
[:tSTRING_DBEG, ["\#{", #<Parser::Source::Range example.rb 7...9>]],
[:tIDENTIFIER, ["bar", #<Parser::Source::Range example.rb 9...12>]],
[:tSTRING_DEND, ["}", #<Parser::Source::Range example.rb 12...13>]],
[:tSTRING_END, ["\"", #<Parser::Source::Range example.rb 13...14>]], [:tNL, [nil, #<Parser::Source::Range example.rb 14...15>]]]]
```

After this correction, the AST and tokens returned by the Parser gem are the same:

```console
$ bundle exec ruby -Ilib -rprism -rprism/translation/parser33 -ve \
'buf = Parser::Source::Buffer.new("example.rb"); buf.source = File.read("example.rb"); p Prism::Translation::Parser33.new.tokenize(buf)'
ruby 3.3.0 (2023-12-25 revision https://github.com/ruby/prism/commit/5124f9ac75) [x86_64-darwin22]
[s(:dstr,
  s(:str, "foo\n"),
  s(:str, "  "),
  s(:begin,
    s(:send, nil, :bar))), [], [[:tSTRING_BEG, ["\"", #<Parser::Source::Range example.rb 0...1>]],
[:tSTRING_CONTENT, ["foo\n", #<Parser::Source::Range example.rb 1...5>]],
[:tSTRING_CONTENT, ["  ", #<Parser::Source::Range example.rb 5...7>]],
[:tSTRING_DBEG, ["\#{", #<Parser::Source::Range example.rb 7...9>]],
[:tIDENTIFIER, ["bar", #<Parser::Source::Range example.rb 9...12>]],
[:tSTRING_DEND, ["}", #<Parser::Source::Range example.rb 12...13>]],
[:tSTRING_END, ["\"", #<Parser::Source::Range example.rb 13...14>]], [:tNL, [nil, #<Parser::Source::Range example.rb 14...15>]]]]
```

https://github.com/ruby/prism/commit/c1652a9ee7
---
 lib/prism/translation/parser/compiler.rb | 41 ++++++++---
 lib/prism/translation/parser/lexer.rb    | 20 +++++-
 test/prism/fixtures/dstring.txt          | 27 ++++++++
 test/prism/snapshots/dstring.txt         | 88 +++++++++++++++++++++---
 4 files changed, 156 insertions(+), 20 deletions(-)

diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb
index 4eb2c4a8da9002..0503d003f52305 100644
--- a/lib/prism/translation/parser/compiler.rb
+++ b/lib/prism/translation/parser/compiler.rb
@@ -953,14 +953,35 @@ def visit_interpolated_regular_expression_node(node)
         def visit_interpolated_string_node(node)
           if node.heredoc?
             children, closing = visit_heredoc(node)
-            builder.string_compose(token(node.opening_loc), children, closing)
+
+            return builder.string_compose(token(node.opening_loc), children, closing)
+          end
+
+          parts = if node.parts.one? { |part| part.type == :string_node }
+            node.parts.flat_map do |node|
+              if node.type == :string_node && node.unescaped.lines.count >= 2
+                start_offset = node.content_loc.start_offset
+
+                node.unescaped.lines.map do |line|
+                  end_offset = start_offset + line.length
+                  offsets = srange_offsets(start_offset, end_offset)
+                  start_offset = end_offset
+
+                  builder.string_internal([line, offsets])
+                end
+              else
+                visit(node)
+              end
+            end
           else
-            builder.string_compose(
-              token(node.opening_loc),
-              visit_all(node.parts),
-              token(node.closing_loc)
-            )
+            visit_all(node.parts)
           end
+
+          builder.string_compose(
+            token(node.opening_loc),
+            parts,
+            token(node.closing_loc)
+          )
         end
 
         # :"foo #{bar}"
@@ -1492,17 +1513,17 @@ def visit_string_node(node)
           elsif node.opening == "?"
             builder.character([node.unescaped, srange(node.location)])
           else
-            parts = if node.unescaped.lines.count <= 1
+            parts = if node.content.lines.count <= 1 || node.unescaped.lines.count <= 1
               [builder.string_internal([node.unescaped, srange(node.content_loc)])]
             else
               start_offset = node.content_loc.start_offset
 
-              node.unescaped.lines.map do |line|
-                end_offset = start_offset + line.length
+              [node.content.lines, node.unescaped.lines].transpose.map do |content_line, unescaped_line|
+                end_offset = start_offset + content_line.length
                 offsets = srange_offsets(start_offset, end_offset)
                 start_offset = end_offset
 
-                builder.string_internal([line, offsets])
+                builder.string_internal([unescaped_line, offsets])
               end
             end
 
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
index 9cf86476ba6c37..7febca449e8aaa 100644
--- a/lib/prism/translation/parser/lexer.rb
+++ b/lib/prism/translation/parser/lexer.rb
@@ -295,8 +295,24 @@ def to_a
               unless (lines = token.value.lines).one?
                 start_offset = offset_cache[token.location.start_offset]
                 lines.map do |line|
-                  end_offset = start_offset + line.length
-                  tokens << [:tSTRING_CONTENT, [line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
+                  newline = line.end_with?("\r\n") ? "\r\n" : "\n"
+                  chomped_line = line.chomp
+                  if match = chomped_line.match(/(?<backslashes>\\+)\z/)
+                    adjustment = match[:backslashes].size / 2
+                    adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
+                    if match[:backslashes].size.odd?
+                      adjusted_line.delete_suffix!("\\")
+                      adjustment += 2
+                    else
+                      adjusted_line << newline
+                    end
+                  else
+                    adjusted_line = line
+                    adjustment = 0
+                  end
+
+                  end_offset = start_offset + adjusted_line.length + adjustment
+                  tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
                   start_offset = end_offset
                 end
                 next
diff --git a/test/prism/fixtures/dstring.txt b/test/prism/fixtures/dstring.txt
index b7a0958d3f8dee..085e0c6852a1bf 100644
--- a/test/prism/fixtures/dstring.txt
+++ b/test/prism/fixtures/dstring.txt
@@ -1,2 +1,29 @@
 "foo
   bar"
+
+"foo
+  #{bar}"
+
+"fo
+o" "ba
+r"
+
+"
+foo\
+"
+
+"
+foo\\
+"
+
+"
+foo\\\
+"
+
+"
+foo\\\\
+"
+
+"
+foo\\\\\
+"
diff --git a/test/prism/snapshots/dstring.txt b/test/prism/snapshots/dstring.txt
index a24eaf193af61b..ad395f8a8eb94b 100644
--- a/test/prism/snapshots/dstring.txt
+++ b/test/prism/snapshots/dstring.txt
@@ -1,11 +1,83 @@
-@ ProgramNode (location: (1,0)-(2,6))
+@ ProgramNode (location: (1,0)-(29,1))
 ├── locals: []
 └── statements:
-    @ StatementsNode (location: (1,0)-(2,6))
-    └── body: (length: 1)
-        └── @ StringNode (location: (1,0)-(2,6))
+    @ StatementsNode (location: (1,0)-(29,1))
+    └── body: (length: 8)
+        ├── @ StringNode (location: (1,0)-(2,6))
+        │   ├── flags: ∅
+        │   ├── opening_loc: (1,0)-(1,1) = "\""
+        │   ├── content_loc: (1,1)-(2,5) = "foo\n  bar"
+        │   ├── closing_loc: (2,5)-(2,6) = "\""
+        │   └── unescaped: "foo\n  bar"
+        ├── @ InterpolatedStringNode (location: (4,0)-(5,9))
+        │   ├── opening_loc: (4,0)-(4,1) = "\""
+        │   ├── parts: (length: 2)
+        │   │   ├── @ StringNode (location: (4,1)-(5,2))
+        │   │   │   ├── flags: ∅
+        │   │   │   ├── opening_loc: ∅
+        │   │   │   ├── content_loc: (4,1)-(5,2) = "foo\n  "
+        │   │   │   ├── closing_loc: ∅
+        │   │   │   └── unescaped: "foo\n  "
+        │   │   └── @ EmbeddedStatementsNode (location: (5,2)-(5,8))
+        │   │       ├── opening_loc: (5,2)-(5,4) = "\#{"
+        │   │       ├── statements:
+        │   │       │   @ StatementsNode (location: (5,4)-(5,7))
+        │   │       │   └── body: (length: 1)
+        │   │       │       └── @ CallNode (location: (5,4)-(5,7))
+        │   │       │           ├── flags: variable_call, ignore_visibility
+        │   │       │           ├── receiver: ∅
+        │   │       │           ├── call_operator_loc: ∅
+        │   │       │           ├── name: :bar
+        │   │       │           ├── message_loc: (5,4)-(5,7) = "bar"
+        │   │       │           ├── opening_loc: ∅
+        │   │       │           ├── arguments: ∅
+        │   │       │           ├── closing_loc: ∅
+        │   │       │           └── block: ∅
+        │   │       └── closing_loc: (5,7)-(5,8) = "}"
+        │   └── closing_loc: (5,8)-(5,9) = "\""
+        ├── @ InterpolatedStringNode (location: (7,0)-(9,2))
+        │   ├── opening_loc: ∅
+        │   ├── parts: (length: 2)
+        │   │   ├── @ StringNode (location: (7,0)-(8,2))
+        │   │   │   ├── flags: ∅
+        │   │   │   ├── opening_loc: (7,0)-(7,1) = "\""
+        │   │   │   ├── content_loc: (7,1)-(8,1) = "fo\no"
+        │   │   │   ├── closing_loc: (8,1)-(8,2) = "\""
+        │   │   │   └── unescaped: "fo\no"
+        │   │   └── @ StringNode (location: (8,3)-(9,2))
+        │   │       ├── flags: ∅
+        │   │       ├── opening_loc: (8,3)-(8,4) = "\""
+        │   │       ├── content_loc: (8,4)-(9,1) = "ba\nr"
+        │   │       ├── closing_loc: (9,1)-(9,2) = "\""
+        │   │       └── unescaped: "ba\nr"
+        │   └── closing_loc: ∅
+        ├── @ StringNode (location: (11,0)-(13,1))
+        │   ├── flags: ∅
+        │   ├── opening_loc: (11,0)-(11,1) = "\""
+        │   ├── content_loc: (11,1)-(13,0) = "\nfoo\\\n"
+        │   ├── closing_loc: (13,0)-(13,1) = "\""
+        │   └── unescaped: "\nfoo"
+        ├── @ StringNode (location: (15,0)-(17,1))
+        │   ├── flags: ∅
+        │   ├── opening_loc: (15,0)-(15,1) = "\""
+        │   ├── content_loc: (15,1)-(17,0) = "\nfoo\\\\\n"
+        │   ├── closing_loc: (17,0)-(17,1) = "\""
+        │   └── unescaped: "\nfoo\\\n"
+        ├── @ StringNode (location: (19,0)-(21,1))
+        │   ├── flags: ∅
+        │   ├── opening_loc: (19,0)-(19,1) = "\""
+        │   ├── content_loc: (19,1)-(21,0) = "\nfoo\\\\\\\n"
+        │   ├── closing_loc: (21,0)-(21,1) = "\""
+        │   └── unescaped: "\nfoo\\"
+        ├── @ StringNode (location: (23,0)-(25,1))
+        │   ├── flags: ∅
+        │   ├── opening_loc: (23,0)-(23,1) = "\""
+        │   ├── content_loc: (23,1)-(25,0) = "\nfoo\\\\\\\\\n"
+        │   ├── closing_loc: (25,0)-(25,1) = "\""
+        │   └── unescaped: "\nfoo\\\\\n"
+        └── @ StringNode (location: (27,0)-(29,1))
             ├── flags: ∅
-            ├── opening_loc: (1,0)-(1,1) = "\""
-            ├── content_loc: (1,1)-(2,5) = "foo\n  bar"
-            ├── closing_loc: (2,5)-(2,6) = "\""
-            └── unescaped: "foo\n  bar"
+            ├── opening_loc: (27,0)-(27,1) = "\""
+            ├── content_loc: (27,1)-(29,0) = "\nfoo\\\\\\\\\\\n"
+            ├── closing_loc: (29,0)-(29,1) = "\""
+            └── unescaped: "\nfoo\\\\"