Skip to content

Commit

Permalink
[ruby/prism] Track both the unescaped bytes and source string for a r…
Browse files Browse the repository at this point in the history
…egular expression so we can accurately set its encoding flags.

ruby/prism@dc6dd3a926
  • Loading branch information
nirvdrum authored and matzbot committed Mar 8, 2024
1 parent 2d80b60 commit 82fb6a9
Show file tree
Hide file tree
Showing 31 changed files with 259 additions and 126 deletions.
2 changes: 1 addition & 1 deletion prism/encoding.h
Expand Up @@ -248,7 +248,7 @@ extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
/**
* This is the ASCII-8BIT encoding. We need a reference to it so that pm_strpbrk
* can compare against it because invalid multibyte characters are not a thing
* in this encoding.
* in this encoding. It is also needed for handling Regexp encoding flags.
*/
#define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])

Expand Down
11 changes: 11 additions & 0 deletions prism/parser.h
Expand Up @@ -663,6 +663,17 @@ struct pm_parser {
*/
pm_string_t current_string;

/**
* This string is used to pass information from the lexer to the parser. When
* processing regular expressions we must track the string source for the expression
* as well as its unescaped representation. In that case, `current_string` will hold
* the unescaped value while this field will hold the translated source value. There
* are some escape sequences in regular expressions that will cause the associated
* source string to have a different value than the content of the expression so we
* must track this state separately.
*/
pm_string_t current_regular_expression_source;

/**
* The line number at the start of the parse. This will be used to offset
* the line numbers of all of the locations.
Expand Down
164 changes: 114 additions & 50 deletions prism/prism.c

Large diffs are not rendered by default.

58 changes: 58 additions & 0 deletions test/prism/encoding_test.rb
Expand Up @@ -149,6 +149,7 @@ class EncodingTest < TestCase
# Escape sequences exercised by the encoding-flag tests: each one on its own,
# followed by every ordered pairing of two of them joined together.
escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"]
escapes = escapes.concat(escapes.product(escapes).map(&:join))
# Symbol fixtures: an ASCII name, a non-ASCII name and an operator.
symbols = [:a, :ą, :+]
# Regexp fixtures: an ASCII body, a non-ASCII body and an empty body.
regexps = [/a/, /ą/, //]

encodings.each_key do |encoding|
define_method(:"test_encoding_flags_#{encoding.name}") do
Expand All @@ -168,6 +169,18 @@ class EncodingTest < TestCase
end
end

# Define one test per encoding that checks the encoding flags of the plain
# regular expression fixtures, passed along as their `inspect` source text.
encodings.each_key do |enc|
  test_name = :"test_regular_expression_encoding_flags_#{enc.name}"

  define_method(test_name) { assert_regular_expression_encoding_flags(enc, regexps.map { |re| re.inspect }) }
end

# Define one test per encoding that checks the encoding flags of regular
# expressions whose body is one of the escape-sequence fixtures.
encodings.each_key do |enc|
  test_name = :"test_regular_expression_escape_encoding_flags_#{enc.name}"

  define_method(test_name) do
    escaped_literals = escapes.map { |e| "/#{e}/" }
    assert_regular_expression_encoding_flags(enc, escaped_literals)
  end
end

def test_coding
result = Prism.parse("# coding: utf-8\n'string'")
actual = result.value.statements.body.first.unescaped.encoding
Expand Down Expand Up @@ -454,5 +467,50 @@ def assert_symbol_character_escape_encoding_flags(encoding, escapes)
assert_equal expected, actual
end
end

# Asserts, for each regular expression source, that the encoding implied by
# Prism's forced-encoding flags matches what `eval` reports for the same code.
#
# encoding - the Encoding whose name is written into the magic comment that is
#            prepended to every source; also the result when no flag is set.
# regexps  - source strings, each holding one regular expression literal.
#
# When `eval` raises one of the two known UTF-8 SyntaxErrors, the expected
# value becomes the text captured from the error message instead of an
# Encoding. An "invalid multibyte char" error skips the literal entirely, and
# any other SyntaxError is re-raised.
def assert_regular_expression_encoding_flags(encoding, regexps)
  known_eval_errors = [
    "UTF-8 character in non UTF-8 regexp",
    "escaped non ASCII character in UTF-8 regexp"
  ]

  # Checked in this order; the first predicate that holds decides the encoding.
  forced_encodings = [
    [:forced_utf8_encoding?, Encoding::UTF_8],
    [:forced_binary_encoding?, Encoding::ASCII_8BIT],
    [:forced_us_ascii_encoding?, Encoding::US_ASCII]
  ]

  regexps.each do |literal|
    source = "# encoding: #{encoding.name}\n#{literal}"

    expected =
      begin
        eval(source).encoding
      rescue SyntaxError => syntax_error
        message = syntax_error.message

        if known_eval_errors.any? { |fragment| message.include?(fragment) }
          message[/: (.+?)\n/, 1]
        elsif message.include?("invalid multibyte char")
          # TODO (nirvdrum 26-Jan-2024): Bail out early of the rest of the test due to https://github.com/ruby/prism/issues/2104.
          next
        else
          raise
        end
      end

    parse_result = Prism.parse(source)

    actual =
      if parse_result.success?
        node = parse_result.value.statements.body.first
        _, forced = forced_encodings.find { |predicate, _| node.public_send(predicate) }

        forced || encoding
      else
        failure = parse_result.errors.last.message

        # A "UTF-8 mixed within" parse error is tolerated and leaves `actual`
        # as nil; every other parse error aborts the test.
        raise failure unless failure.include?("UTF-8 mixed within")
      end

    assert_equal expected, actual
  end
end
end
end
Expand Up @@ -21,7 +21,7 @@
│ │ ├── flags: ∅
│ │ └── arguments: (length: 2)
│ │ ├── @ RegularExpressionNode (location: (1,15)-(1,21))
│ │ │ ├── flags:
│ │ │ ├── flags: forced_us_ascii_encoding
│ │ │ ├── opening_loc: (1,15)-(1,16) = "/"
│ │ │ ├── content_loc: (1,16)-(1,20) = "^\\s{"
│ │ │ ├── closing_loc: (1,20)-(1,21) = "/"
Expand Down Expand Up @@ -52,7 +52,7 @@
│ ├── flags: ∅
│ └── arguments: (length: 2)
│ ├── @ RegularExpressionNode (location: (5,15)-(5,21))
│ │ ├── flags:
│ │ ├── flags: forced_us_ascii_encoding
│ │ ├── opening_loc: (5,15)-(5,16) = "/"
│ │ ├── content_loc: (5,16)-(5,20) = "^\\s{"
│ │ ├── closing_loc: (5,20)-(5,21) = "/"
Expand Down
2 changes: 1 addition & 1 deletion test/prism/snapshots/newline_terminated.txt
Expand Up @@ -100,7 +100,7 @@
│ ├── closing_loc: (37,3)-(38,0) = "\n"
│ └── unescaped: "foo"
└── @ RegularExpressionNode (location: (39,0)-(41,0))
├── flags:
├── flags: forced_us_ascii_encoding
├── opening_loc: (39,0)-(40,0) = "%r\n"
├── content_loc: (40,0)-(40,3) = "foo"
├── closing_loc: (40,3)-(41,0) = "\n"
Expand Down
12 changes: 6 additions & 6 deletions test/prism/snapshots/patterns.txt
Expand Up @@ -165,7 +165,7 @@
│ │ └── block: ∅
│ ├── pattern:
│ │ @ RegularExpressionNode (location: (9,7)-(9,12))
│ │ ├── flags:
│ │ ├── flags: forced_us_ascii_encoding
│ │ ├── opening_loc: (9,7)-(9,8) = "/"
│ │ ├── content_loc: (9,8)-(9,11) = "foo"
│ │ ├── closing_loc: (9,11)-(9,12) = "/"
Expand Down Expand Up @@ -719,14 +719,14 @@
│ │ ├── flags: ∅
│ │ ├── left:
│ │ │ @ RegularExpressionNode (location: (35,7)-(35,12))
│ │ │ ├── flags:
│ │ │ ├── flags: forced_us_ascii_encoding
│ │ │ ├── opening_loc: (35,7)-(35,8) = "/"
│ │ │ ├── content_loc: (35,8)-(35,11) = "foo"
│ │ │ ├── closing_loc: (35,11)-(35,12) = "/"
│ │ │ └── unescaped: "foo"
│ │ ├── right:
│ │ │ @ RegularExpressionNode (location: (35,16)-(35,21))
│ │ │ ├── flags:
│ │ │ ├── flags: forced_us_ascii_encoding
│ │ │ ├── opening_loc: (35,16)-(35,17) = "/"
│ │ │ ├── content_loc: (35,17)-(35,20) = "foo"
│ │ │ ├── closing_loc: (35,20)-(35,21) = "/"
Expand Down Expand Up @@ -2543,7 +2543,7 @@
│ │ └── block: ∅
│ ├── pattern:
│ │ @ RegularExpressionNode (location: (112,7)-(112,12))
│ │ ├── flags:
│ │ ├── flags: forced_us_ascii_encoding
│ │ ├── opening_loc: (112,7)-(112,8) = "/"
│ │ ├── content_loc: (112,8)-(112,11) = "foo"
│ │ ├── closing_loc: (112,11)-(112,12) = "/"
Expand Down Expand Up @@ -3126,7 +3126,7 @@
│ │ └── @ InNode (location: (143,10)-(143,23))
│ │ ├── pattern:
│ │ │ @ RegularExpressionNode (location: (143,13)-(143,18))
│ │ │ ├── flags:
│ │ │ ├── flags: forced_us_ascii_encoding
│ │ │ ├── opening_loc: (143,13)-(143,14) = "/"
│ │ │ ├── content_loc: (143,14)-(143,17) = "foo"
│ │ │ ├── closing_loc: (143,17)-(143,18) = "/"
Expand Down Expand Up @@ -3914,7 +3914,7 @@
│ │ │ │ @ StatementsNode (location: (170,13)-(170,18))
│ │ │ │ └── body: (length: 1)
│ │ │ │ └── @ RegularExpressionNode (location: (170,13)-(170,18))
│ │ │ │ ├── flags:
│ │ │ │ ├── flags: forced_us_ascii_encoding
│ │ │ │ ├── opening_loc: (170,13)-(170,14) = "/"
│ │ │ │ ├── content_loc: (170,14)-(170,17) = "foo"
│ │ │ │ ├── closing_loc: (170,17)-(170,18) = "/"
Expand Down
30 changes: 15 additions & 15 deletions test/prism/snapshots/regex.txt
Expand Up @@ -15,21 +15,21 @@
│ │ ├── flags: ∅
│ │ └── arguments: (length: 1)
│ │ └── @ RegularExpressionNode (location: (1,4)-(1,9))
│ │ ├── flags:
│ │ ├── flags: forced_us_ascii_encoding
│ │ ├── opening_loc: (1,4)-(1,5) = "/"
│ │ ├── content_loc: (1,5)-(1,8) = "bar"
│ │ ├── closing_loc: (1,8)-(1,9) = "/"
│ │ └── unescaped: "bar"
│ ├── closing_loc: ∅
│ └── block: ∅
├── @ RegularExpressionNode (location: (3,0)-(3,8))
│ ├── flags: ignore_case
│ ├── flags: ignore_case, forced_us_ascii_encoding
│ ├── opening_loc: (3,0)-(3,3) = "%r{"
│ ├── content_loc: (3,3)-(3,6) = "abc"
│ ├── closing_loc: (3,6)-(3,8) = "}i"
│ └── unescaped: "abc"
├── @ RegularExpressionNode (location: (5,0)-(5,5))
│ ├── flags:
│ ├── flags: forced_us_ascii_encoding
│ ├── opening_loc: (5,0)-(5,1) = "/"
│ ├── content_loc: (5,1)-(5,4) = "a\\b"
│ ├── closing_loc: (5,4)-(5,5) = "/"
Expand Down Expand Up @@ -92,7 +92,7 @@
│ │ │ │ ├── flags: ∅
│ │ │ │ ├── receiver:
│ │ │ │ │ @ RegularExpressionNode (location: (11,1)-(11,14))
│ │ │ │ │ ├── flags:
│ │ │ │ │ ├── flags: forced_us_ascii_encoding
│ │ │ │ │ ├── opening_loc: (11,1)-(11,2) = "/"
│ │ │ │ │ ├── content_loc: (11,2)-(11,13) = "(?<foo>bar)"
│ │ │ │ │ ├── closing_loc: (11,13)-(11,14) = "/"
Expand Down Expand Up @@ -127,31 +127,31 @@
│ ├── opening_loc: (11,0)-(11,1) = "["
│ └── closing_loc: (11,26)-(11,27) = "]"
├── @ RegularExpressionNode (location: (13,0)-(13,6))
│ ├── flags: ignore_case
│ ├── flags: ignore_case, forced_us_ascii_encoding
│ ├── opening_loc: (13,0)-(13,1) = "/"
│ ├── content_loc: (13,1)-(13,4) = "abc"
│ ├── closing_loc: (13,4)-(13,6) = "/i"
│ └── unescaped: "abc"
├── @ RegularExpressionNode (location: (15,0)-(15,26))
│ ├── flags: ignore_case
│ ├── flags: ignore_case, forced_us_ascii_encoding
│ ├── opening_loc: (15,0)-(15,3) = "%r/"
│ ├── content_loc: (15,3)-(15,24) = "[a-z$._?][\\w$.?\#@~]*:"
│ ├── closing_loc: (15,24)-(15,26) = "/i"
│ └── unescaped: "[a-z$._?][\\w$.?\#@~]*:"
├── @ RegularExpressionNode (location: (17,0)-(17,37))
│ ├── flags: ignore_case
│ ├── flags: ignore_case, forced_us_ascii_encoding
│ ├── opening_loc: (17,0)-(17,3) = "%r/"
│ ├── content_loc: (17,3)-(17,35) = "([a-z$._?][\\w$.?\#@~]*)(\\s+)(equ)"
│ ├── closing_loc: (17,35)-(17,37) = "/i"
│ └── unescaped: "([a-z$._?][\\w$.?\#@~]*)(\\s+)(equ)"
├── @ RegularExpressionNode (location: (19,0)-(19,25))
│ ├── flags: ignore_case
│ ├── flags: ignore_case, forced_us_ascii_encoding
│ ├── opening_loc: (19,0)-(19,3) = "%r/"
│ ├── content_loc: (19,3)-(19,23) = "[a-z$._?][\\w$.?\#@~]*"
│ ├── closing_loc: (19,23)-(19,25) = "/i"
│ └── unescaped: "[a-z$._?][\\w$.?\#@~]*"
├── @ RegularExpressionNode (location: (21,0)-(24,1))
│ ├── flags:
│ ├── flags: forced_us_ascii_encoding
│ ├── opening_loc: (21,0)-(21,3) = "%r("
│ ├── content_loc: (21,3)-(24,0) = "\n(?:[\\w\#$%_']|\\(\\)|\\(,\\)|\\[\\]|[0-9])*\n (?:[\\w\#$%_']+)\n"
│ ├── closing_loc: (24,0)-(24,1) = ")"
Expand All @@ -160,7 +160,7 @@
│ ├── flags: ∅
│ ├── receiver:
│ │ @ RegularExpressionNode (location: (26,0)-(26,8))
│ │ ├── flags:
│ │ ├── flags: forced_us_ascii_encoding
│ │ ├── opening_loc: (26,0)-(26,1) = "/"
│ │ ├── content_loc: (26,1)-(26,7) = "(?#\\))"
│ │ ├── closing_loc: (26,7)-(26,8) = "/"
Expand All @@ -182,7 +182,7 @@
│ ├── closing_loc: ∅
│ └── block: ∅
├── @ RegularExpressionNode (location: (28,0)-(28,9))
│ ├── flags:
│ ├── flags: forced_us_ascii_encoding
│ ├── opening_loc: (28,0)-(28,3) = "%r#"
│ ├── content_loc: (28,3)-(28,8) = "pound"
│ ├── closing_loc: (28,8)-(28,9) = "#"
Expand Down Expand Up @@ -220,7 +220,7 @@
│ │ ├── flags: ∅
│ │ ├── receiver:
│ │ │ @ RegularExpressionNode (location: (32,0)-(33,4))
│ │ │ ├── flags:
│ │ │ ├── flags: forced_us_ascii_encoding
│ │ │ ├── opening_loc: (32,0)-(32,1) = "/"
│ │ │ ├── content_loc: (32,1)-(33,3) = "(?<a\\\nb>)"
│ │ │ ├── closing_loc: (33,3)-(33,4) = "/"
Expand Down Expand Up @@ -254,7 +254,7 @@
│ │ ├── flags: ∅
│ │ ├── receiver:
│ │ │ @ RegularExpressionNode (location: (35,0)-(35,18))
│ │ │ ├── flags:
│ │ │ ├── flags: forced_us_ascii_encoding
│ │ │ ├── opening_loc: (35,0)-(35,1) = "/"
│ │ │ ├── content_loc: (35,1)-(35,17) = "(?<abc>)(?<abc>)"
│ │ │ ├── closing_loc: (35,17)-(35,18) = "/"
Expand Down Expand Up @@ -286,7 +286,7 @@
│ ├── flags: ∅
│ ├── receiver:
│ │ @ RegularExpressionNode (location: (37,0)-(37,10))
│ │ ├── flags:
│ │ ├── flags: forced_us_ascii_encoding
│ │ ├── opening_loc: (37,0)-(37,1) = "/"
│ │ ├── content_loc: (37,1)-(37,9) = "(?<a b>)"
│ │ ├── closing_loc: (37,9)-(37,10) = "/"
Expand Down Expand Up @@ -338,7 +338,7 @@
│ │ ├── flags: ∅
│ │ ├── receiver:
│ │ │ @ RegularExpressionNode (location: (40,6)-(40,14))
│ │ │ ├── flags:
│ │ │ ├── flags: forced_us_ascii_encoding
│ │ │ ├── opening_loc: (40,6)-(40,7) = "/"
│ │ │ ├── content_loc: (40,7)-(40,13) = "(?<a>)"
│ │ │ ├── closing_loc: (40,13)-(40,14) = "/"
Expand Down
2 changes: 1 addition & 1 deletion test/prism/snapshots/seattlerb/TestRubyParserShared.txt
Expand Up @@ -70,7 +70,7 @@
│ ├── opening_loc: (26,0)-(26,3) = "%i["
│ └── closing_loc: (29,0)-(29,1) = "]"
├── @ RegularExpressionNode (location: (31,0)-(34,1))
│ ├── flags:
│ ├── flags: forced_us_ascii_encoding
│ ├── opening_loc: (31,0)-(31,3) = "%r["
│ ├── content_loc: (31,3)-(34,0) = "\n\n\n"
│ ├── closing_loc: (34,0)-(34,1) = "]"
Expand Down
2 changes: 1 addition & 1 deletion test/prism/snapshots/seattlerb/bug190.txt
Expand Up @@ -4,7 +4,7 @@
@ StatementsNode (location: (1,0)-(1,6))
└── body: (length: 1)
└── @ RegularExpressionNode (location: (1,0)-(1,6))
├── flags:
├── flags: forced_us_ascii_encoding
├── opening_loc: (1,0)-(1,3) = "%r'"
├── content_loc: (1,3)-(1,5) = "\\'"
├── closing_loc: (1,5)-(1,6) = "'"
Expand Down
2 changes: 1 addition & 1 deletion test/prism/snapshots/seattlerb/bug_case_when_regexp.txt
Expand Up @@ -16,7 +16,7 @@
│ ├── keyword_loc: (1,9)-(1,13) = "when"
│ ├── conditions: (length: 1)
│ │ └── @ RegularExpressionNode (location: (1,14)-(1,17))
│ │ ├── flags:
│ │ ├── flags: forced_us_ascii_encoding
│ │ ├── opening_loc: (1,14)-(1,15) = "/"
│ │ ├── content_loc: (1,15)-(1,16) = "x"
│ │ ├── closing_loc: (1,16)-(1,17) = "/"
Expand Down
2 changes: 1 addition & 1 deletion test/prism/snapshots/seattlerb/bug_cond_pct.txt
Expand Up @@ -10,7 +10,7 @@
│ ├── keyword_loc: (1,6)-(1,10) = "when"
│ ├── conditions: (length: 1)
│ │ └── @ RegularExpressionNode (location: (1,11)-(1,23))
│ │ ├── flags:
│ │ ├── flags: forced_us_ascii_encoding
│ │ ├── opening_loc: (1,11)-(1,14) = "%r%"
│ │ ├── content_loc: (1,14)-(1,22) = "blahblah"
│ │ ├── closing_loc: (1,22)-(1,23) = "%"
Expand Down
2 changes: 1 addition & 1 deletion test/prism/snapshots/seattlerb/case_in.txt
Expand Up @@ -338,7 +338,7 @@
│ │ └── @ InNode (location: (46,0)-(46,11))
│ │ ├── pattern:
│ │ │ @ RegularExpressionNode (location: (46,3)-(46,11))
│ │ │ ├── flags:
│ │ │ ├── flags: forced_us_ascii_encoding
│ │ │ ├── opening_loc: (46,3)-(46,4) = "/"
│ │ │ ├── content_loc: (46,4)-(46,10) = "regexp"
│ │ │ ├── closing_loc: (46,10)-(46,11) = "/"
Expand Down

0 comments on commit 82fb6a9

Please sign in to comment.