[ruby/prism] Track both the unescaped bytes and source string for a regular expression so we can accurately set its encoding flags.

ruby/prism@dc6dd3a926
ruby · Mar 8, 2024 · 82fb6a9 · 82fb6a9
1 parent 2d80b60
commit 82fb6a9
Show file tree

Hide file tree

Showing 31 changed files with 259 additions and 126 deletions.
diff --git a/prism/encoding.h b/prism/encoding.h
@@ -248,7 +248,7 @@ extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
 /**
  * This is the ASCII-8BIT encoding. We need a reference to it so that pm_strpbrk
  * can compare against it because invalid multibyte characters are not a thing
- * in this encoding.
+ * in this encoding. It is also needed for handling Regexp encoding flags.
  */
 #define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])
 

diff --git a/prism/parser.h b/prism/parser.h
@@ -663,6 +663,17 @@ struct pm_parser {
      */
     pm_string_t current_string;
 
+    /**
+     * This string is used to pass information from the lexer to the parser. When
+     * processing regular expressions we must track the string source for the expression
+     * as well as its unescaped representation. In that case, `current_string` will hold
+     * the unescaped value while this field will hold the translated source value. There
+     * are some escape sequences in regular expressions that will cause the associated
+     * source string to have a different value than the content of the expression so we
+     * must track this state separately.
+     */
+    pm_string_t current_regular_expression_source;
+
     /**
      * The line number at the start of the parse. This will be used to offset
      * the line numbers of all of the locations.

diff --git a/prism/prism.c b/prism/prism.c
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb
@@ -149,6 +149,7 @@ class EncodingTest < TestCase
     escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"]
     escapes = escapes.concat(escapes.product(escapes).map(&:join))
     symbols = [:a, :ą, :+]
+    regexps = [/a/, /ą/, //]
 
     encodings.each_key do |encoding|
       define_method(:"test_encoding_flags_#{encoding.name}") do
@@ -168,6 +169,18 @@ class EncodingTest < TestCase
       end
     end
 
+    encodings.each_key do |encoding|
+      define_method(:"test_regular_expression_encoding_flags_#{encoding.name}") do
+        assert_regular_expression_encoding_flags(encoding, regexps.map(&:inspect))
+      end
+    end
+
+    encodings.each_key do |encoding|
+      define_method(:"test_regular_expression_escape_encoding_flags_#{encoding.name}") do
+        assert_regular_expression_encoding_flags(encoding, escapes.map { |e| "/#{e}/" })
+      end
+    end
+
     def test_coding
       result = Prism.parse("# coding: utf-8\n'string'")
       actual = result.value.statements.body.first.unescaped.encoding
@@ -454,5 +467,50 @@ def assert_symbol_character_escape_encoding_flags(encoding, escapes)
         assert_equal expected, actual
       end
     end
+
+    def assert_regular_expression_encoding_flags(encoding, regexps) # For each regexp source, asserts that the encoding implied by Prism's forced_* flags equals what eval reports.
+      regexps.each do |regexp| # each entry is evaluated and parsed beneath an encoding magic comment
+        source = "# encoding: #{encoding.name}\n#{regexp}"
+
+        expected = # the encoding reported by eval, or text extracted from a SyntaxError message
+          begin
+            eval(source).encoding
+          rescue SyntaxError => error
+            if error.message.include?("UTF-8 character in non UTF-8 regexp") || error.message.include?("escaped non ASCII character in UTF-8 regexp")
+              error.message[/: (.+?)\n/, 1] # capture group 1: the text following ": " up to the next newline
+            elsif error.message.include?("invalid multibyte char")
+              # TODO (nirvdrum 26-Jan-2024): Bail out early of the rest of the test due to https://github.com/ruby/prism/issues/2104.
+              next
+            else
+              raise # re-raise any SyntaxError not matched above
+            end
+          end
+
+        actual = # the encoding derived from Prism's flags, or nil when Prism reports the tolerated error
+          Prism.parse(source).then do |result|
+            if result.success?
+              regexp = result.value.statements.body.first # NOTE(review): reassigns the block parameter `regexp` to the first parsed statement
+
+              if regexp.forced_utf8_encoding?
+                Encoding::UTF_8
+              elsif regexp.forced_binary_encoding?
+                Encoding::ASCII_8BIT
+              elsif regexp.forced_us_ascii_encoding?
+                Encoding::US_ASCII
+              else
+                encoding # no forced_* flag is set: use the encoding under test
+              end
+            else
+              error = result.errors.last # only the last reported error is inspected
+
+              unless error.message.include?("UTF-8 mixed within") # this message is tolerated; the branch then evaluates to nil
+                raise error.message
+              end # NOTE(review): actual is nil here while expected may be the String extracted above - confirm this is intended
+            end
+          end
+
+        assert_equal expected, actual
+      end
+    end
   end
 end
diff --git a/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt b/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt
@@ -21,7 +21,7 @@
         │   │   ├── flags: ∅
         │   │   └── arguments: (length: 2)
         │   │       ├── @ RegularExpressionNode (location: (1,15)-(1,21))
-        │   │       │   ├── flags: ∅
+        │   │       │   ├── flags: forced_us_ascii_encoding
         │   │       │   ├── opening_loc: (1,15)-(1,16) = "/"
         │   │       │   ├── content_loc: (1,16)-(1,20) = "^\\s{"
         │   │       │   ├── closing_loc: (1,20)-(1,21) = "/"
@@ -52,7 +52,7 @@
             │   ├── flags: ∅
             │   └── arguments: (length: 2)
             │       ├── @ RegularExpressionNode (location: (5,15)-(5,21))
-            │       │   ├── flags: ∅
+            │       │   ├── flags: forced_us_ascii_encoding
             │       │   ├── opening_loc: (5,15)-(5,16) = "/"
             │       │   ├── content_loc: (5,16)-(5,20) = "^\\s{"
             │       │   ├── closing_loc: (5,20)-(5,21) = "/"

diff --git a/test/prism/snapshots/newline_terminated.txt b/test/prism/snapshots/newline_terminated.txt
@@ -100,7 +100,7 @@
         │   ├── closing_loc: (37,3)-(38,0) = "\n"
         │   └── unescaped: "foo"
         └── @ RegularExpressionNode (location: (39,0)-(41,0))
-            ├── flags: ∅
+            ├── flags: forced_us_ascii_encoding
             ├── opening_loc: (39,0)-(40,0) = "%r\n"
             ├── content_loc: (40,0)-(40,3) = "foo"
             ├── closing_loc: (40,3)-(41,0) = "\n"

diff --git a/test/prism/snapshots/patterns.txt b/test/prism/snapshots/patterns.txt
@@ -165,7 +165,7 @@
         │   │   └── block: ∅
         │   ├── pattern:
         │   │   @ RegularExpressionNode (location: (9,7)-(9,12))
-        │   │   ├── flags: ∅
+        │   │   ├── flags: forced_us_ascii_encoding
         │   │   ├── opening_loc: (9,7)-(9,8) = "/"
         │   │   ├── content_loc: (9,8)-(9,11) = "foo"
         │   │   ├── closing_loc: (9,11)-(9,12) = "/"
@@ -719,14 +719,14 @@
         │   │   ├── flags: ∅
         │   │   ├── left:
         │   │   │   @ RegularExpressionNode (location: (35,7)-(35,12))
-        │   │   │   ├── flags: ∅
+        │   │   │   ├── flags: forced_us_ascii_encoding
         │   │   │   ├── opening_loc: (35,7)-(35,8) = "/"
         │   │   │   ├── content_loc: (35,8)-(35,11) = "foo"
         │   │   │   ├── closing_loc: (35,11)-(35,12) = "/"
         │   │   │   └── unescaped: "foo"
         │   │   ├── right:
         │   │   │   @ RegularExpressionNode (location: (35,16)-(35,21))
-        │   │   │   ├── flags: ∅
+        │   │   │   ├── flags: forced_us_ascii_encoding
         │   │   │   ├── opening_loc: (35,16)-(35,17) = "/"
         │   │   │   ├── content_loc: (35,17)-(35,20) = "foo"
         │   │   │   ├── closing_loc: (35,20)-(35,21) = "/"
@@ -2543,7 +2543,7 @@
         │   │   └── block: ∅
         │   ├── pattern:
         │   │   @ RegularExpressionNode (location: (112,7)-(112,12))
-        │   │   ├── flags: ∅
+        │   │   ├── flags: forced_us_ascii_encoding
         │   │   ├── opening_loc: (112,7)-(112,8) = "/"
         │   │   ├── content_loc: (112,8)-(112,11) = "foo"
         │   │   ├── closing_loc: (112,11)-(112,12) = "/"
@@ -3126,7 +3126,7 @@
         │   │   └── @ InNode (location: (143,10)-(143,23))
         │   │       ├── pattern:
         │   │       │   @ RegularExpressionNode (location: (143,13)-(143,18))
-        │   │       │   ├── flags: ∅
+        │   │       │   ├── flags: forced_us_ascii_encoding
         │   │       │   ├── opening_loc: (143,13)-(143,14) = "/"
         │   │       │   ├── content_loc: (143,14)-(143,17) = "foo"
         │   │       │   ├── closing_loc: (143,17)-(143,18) = "/"
@@ -3914,7 +3914,7 @@
         │   │       │   │   @ StatementsNode (location: (170,13)-(170,18))
         │   │       │   │   └── body: (length: 1)
         │   │       │   │       └── @ RegularExpressionNode (location: (170,13)-(170,18))
-        │   │       │   │           ├── flags: ∅
+        │   │       │   │           ├── flags: forced_us_ascii_encoding
         │   │       │   │           ├── opening_loc: (170,13)-(170,14) = "/"
         │   │       │   │           ├── content_loc: (170,14)-(170,17) = "foo"
         │   │       │   │           ├── closing_loc: (170,17)-(170,18) = "/"

diff --git a/test/prism/snapshots/regex.txt b/test/prism/snapshots/regex.txt
@@ -15,21 +15,21 @@
         │   │   ├── flags: ∅
         │   │   └── arguments: (length: 1)
         │   │       └── @ RegularExpressionNode (location: (1,4)-(1,9))
-        │   │           ├── flags: ∅
+        │   │           ├── flags: forced_us_ascii_encoding
         │   │           ├── opening_loc: (1,4)-(1,5) = "/"
         │   │           ├── content_loc: (1,5)-(1,8) = "bar"
         │   │           ├── closing_loc: (1,8)-(1,9) = "/"
         │   │           └── unescaped: "bar"
         │   ├── closing_loc: ∅
         │   └── block: ∅
         ├── @ RegularExpressionNode (location: (3,0)-(3,8))
-        │   ├── flags: ignore_case
+        │   ├── flags: ignore_case, forced_us_ascii_encoding
         │   ├── opening_loc: (3,0)-(3,3) = "%r{"
         │   ├── content_loc: (3,3)-(3,6) = "abc"
         │   ├── closing_loc: (3,6)-(3,8) = "}i"
         │   └── unescaped: "abc"
         ├── @ RegularExpressionNode (location: (5,0)-(5,5))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
         │   ├── opening_loc: (5,0)-(5,1) = "/"
         │   ├── content_loc: (5,1)-(5,4) = "a\\b"
         │   ├── closing_loc: (5,4)-(5,5) = "/"
@@ -92,7 +92,7 @@
         │   │   │   │   ├── flags: ∅
         │   │   │   │   ├── receiver:
         │   │   │   │   │   @ RegularExpressionNode (location: (11,1)-(11,14))
-        │   │   │   │   │   ├── flags: ∅
+        │   │   │   │   │   ├── flags: forced_us_ascii_encoding
         │   │   │   │   │   ├── opening_loc: (11,1)-(11,2) = "/"
         │   │   │   │   │   ├── content_loc: (11,2)-(11,13) = "(?<foo>bar)"
         │   │   │   │   │   ├── closing_loc: (11,13)-(11,14) = "/"
@@ -127,31 +127,31 @@
         │   ├── opening_loc: (11,0)-(11,1) = "["
         │   └── closing_loc: (11,26)-(11,27) = "]"
         ├── @ RegularExpressionNode (location: (13,0)-(13,6))
-        │   ├── flags: ignore_case
+        │   ├── flags: ignore_case, forced_us_ascii_encoding
         │   ├── opening_loc: (13,0)-(13,1) = "/"
         │   ├── content_loc: (13,1)-(13,4) = "abc"
         │   ├── closing_loc: (13,4)-(13,6) = "/i"
         │   └── unescaped: "abc"
         ├── @ RegularExpressionNode (location: (15,0)-(15,26))
-        │   ├── flags: ignore_case
+        │   ├── flags: ignore_case, forced_us_ascii_encoding
         │   ├── opening_loc: (15,0)-(15,3) = "%r/"
         │   ├── content_loc: (15,3)-(15,24) = "[a-z$._?][\\w$.?\#@~]*:"
         │   ├── closing_loc: (15,24)-(15,26) = "/i"
         │   └── unescaped: "[a-z$._?][\\w$.?\#@~]*:"
         ├── @ RegularExpressionNode (location: (17,0)-(17,37))
-        │   ├── flags: ignore_case
+        │   ├── flags: ignore_case, forced_us_ascii_encoding
         │   ├── opening_loc: (17,0)-(17,3) = "%r/"
         │   ├── content_loc: (17,3)-(17,35) = "([a-z$._?][\\w$.?\#@~]*)(\\s+)(equ)"
         │   ├── closing_loc: (17,35)-(17,37) = "/i"
         │   └── unescaped: "([a-z$._?][\\w$.?\#@~]*)(\\s+)(equ)"
         ├── @ RegularExpressionNode (location: (19,0)-(19,25))
-        │   ├── flags: ignore_case
+        │   ├── flags: ignore_case, forced_us_ascii_encoding
         │   ├── opening_loc: (19,0)-(19,3) = "%r/"
         │   ├── content_loc: (19,3)-(19,23) = "[a-z$._?][\\w$.?\#@~]*"
         │   ├── closing_loc: (19,23)-(19,25) = "/i"
         │   └── unescaped: "[a-z$._?][\\w$.?\#@~]*"
         ├── @ RegularExpressionNode (location: (21,0)-(24,1))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
         │   ├── opening_loc: (21,0)-(21,3) = "%r("
         │   ├── content_loc: (21,3)-(24,0) = "\n(?:[\\w\#$%_']|\\(\\)|\\(,\\)|\\[\\]|[0-9])*\n  (?:[\\w\#$%_']+)\n"
         │   ├── closing_loc: (24,0)-(24,1) = ")"
@@ -160,7 +160,7 @@
         │   ├── flags: ∅
         │   ├── receiver:
         │   │   @ RegularExpressionNode (location: (26,0)-(26,8))
-        │   │   ├── flags: ∅
+        │   │   ├── flags: forced_us_ascii_encoding
         │   │   ├── opening_loc: (26,0)-(26,1) = "/"
         │   │   ├── content_loc: (26,1)-(26,7) = "(?#\\))"
         │   │   ├── closing_loc: (26,7)-(26,8) = "/"
@@ -182,7 +182,7 @@
         │   ├── closing_loc: ∅
         │   └── block: ∅
         ├── @ RegularExpressionNode (location: (28,0)-(28,9))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
         │   ├── opening_loc: (28,0)-(28,3) = "%r#"
         │   ├── content_loc: (28,3)-(28,8) = "pound"
         │   ├── closing_loc: (28,8)-(28,9) = "#"
@@ -220,7 +220,7 @@
         │   │   ├── flags: ∅
         │   │   ├── receiver:
         │   │   │   @ RegularExpressionNode (location: (32,0)-(33,4))
-        │   │   │   ├── flags: ∅
+        │   │   │   ├── flags: forced_us_ascii_encoding
         │   │   │   ├── opening_loc: (32,0)-(32,1) = "/"
         │   │   │   ├── content_loc: (32,1)-(33,3) = "(?<a\\\nb>)"
         │   │   │   ├── closing_loc: (33,3)-(33,4) = "/"
@@ -254,7 +254,7 @@
         │   │   ├── flags: ∅
         │   │   ├── receiver:
         │   │   │   @ RegularExpressionNode (location: (35,0)-(35,18))
-        │   │   │   ├── flags: ∅
+        │   │   │   ├── flags: forced_us_ascii_encoding
         │   │   │   ├── opening_loc: (35,0)-(35,1) = "/"
         │   │   │   ├── content_loc: (35,1)-(35,17) = "(?<abc>)(?<abc>)"
         │   │   │   ├── closing_loc: (35,17)-(35,18) = "/"
@@ -286,7 +286,7 @@
         │   ├── flags: ∅
         │   ├── receiver:
         │   │   @ RegularExpressionNode (location: (37,0)-(37,10))
-        │   │   ├── flags: ∅
+        │   │   ├── flags: forced_us_ascii_encoding
         │   │   ├── opening_loc: (37,0)-(37,1) = "/"
         │   │   ├── content_loc: (37,1)-(37,9) = "(?<a b>)"
         │   │   ├── closing_loc: (37,9)-(37,10) = "/"
@@ -338,7 +338,7 @@
                 │           │   ├── flags: ∅
                 │           │   ├── receiver:
                 │           │   │   @ RegularExpressionNode (location: (40,6)-(40,14))
-                │           │   │   ├── flags: ∅
+                │           │   │   ├── flags: forced_us_ascii_encoding
                 │           │   │   ├── opening_loc: (40,6)-(40,7) = "/"
                 │           │   │   ├── content_loc: (40,7)-(40,13) = "(?<a>)"
                 │           │   │   ├── closing_loc: (40,13)-(40,14) = "/"

diff --git a/test/prism/snapshots/seattlerb/TestRubyParserShared.txt b/test/prism/snapshots/seattlerb/TestRubyParserShared.txt
@@ -70,7 +70,7 @@
         │   ├── opening_loc: (26,0)-(26,3) = "%i["
         │   └── closing_loc: (29,0)-(29,1) = "]"
         ├── @ RegularExpressionNode (location: (31,0)-(34,1))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
         │   ├── opening_loc: (31,0)-(31,3) = "%r["
         │   ├── content_loc: (31,3)-(34,0) = "\n\n\n"
         │   ├── closing_loc: (34,0)-(34,1) = "]"

diff --git a/test/prism/snapshots/seattlerb/bug190.txt b/test/prism/snapshots/seattlerb/bug190.txt
@@ -4,7 +4,7 @@
     @ StatementsNode (location: (1,0)-(1,6))
     └── body: (length: 1)
         └── @ RegularExpressionNode (location: (1,0)-(1,6))
-            ├── flags: ∅
+            ├── flags: forced_us_ascii_encoding
             ├── opening_loc: (1,0)-(1,3) = "%r'"
             ├── content_loc: (1,3)-(1,5) = "\\'"
             ├── closing_loc: (1,5)-(1,6) = "'"

diff --git a/test/prism/snapshots/seattlerb/bug_case_when_regexp.txt b/test/prism/snapshots/seattlerb/bug_case_when_regexp.txt
@@ -16,7 +16,7 @@
             │       ├── keyword_loc: (1,9)-(1,13) = "when"
             │       ├── conditions: (length: 1)
             │       │   └── @ RegularExpressionNode (location: (1,14)-(1,17))
-            │       │       ├── flags: ∅
+            │       │       ├── flags: forced_us_ascii_encoding
             │       │       ├── opening_loc: (1,14)-(1,15) = "/"
             │       │       ├── content_loc: (1,15)-(1,16) = "x"
             │       │       ├── closing_loc: (1,16)-(1,17) = "/"

diff --git a/test/prism/snapshots/seattlerb/bug_cond_pct.txt b/test/prism/snapshots/seattlerb/bug_cond_pct.txt
@@ -10,7 +10,7 @@
             │       ├── keyword_loc: (1,6)-(1,10) = "when"
             │       ├── conditions: (length: 1)
             │       │   └── @ RegularExpressionNode (location: (1,11)-(1,23))
-            │       │       ├── flags: ∅
+            │       │       ├── flags: forced_us_ascii_encoding
             │       │       ├── opening_loc: (1,11)-(1,14) = "%r%"
             │       │       ├── content_loc: (1,14)-(1,22) = "blahblah"
             │       │       ├── closing_loc: (1,22)-(1,23) = "%"

diff --git a/test/prism/snapshots/seattlerb/case_in.txt b/test/prism/snapshots/seattlerb/case_in.txt
@@ -338,7 +338,7 @@
         │   │   └── @ InNode (location: (46,0)-(46,11))
         │   │       ├── pattern:
         │   │       │   @ RegularExpressionNode (location: (46,3)-(46,11))
-        │   │       │   ├── flags: ∅
+        │   │       │   ├── flags: forced_us_ascii_encoding
         │   │       │   ├── opening_loc: (46,3)-(46,4) = "/"
         │   │       │   ├── content_loc: (46,4)-(46,10) = "regexp"
         │   │       │   ├── closing_loc: (46,10)-(46,11) = "/"