Skip to content

Commit 442bd90

Browse files
authored
Fix UTF-8 code units to match the number of bytes (#4098)
1 parent 5ec3981 commit 442bd90

4 files changed

Lines changed: 62 additions & 28 deletions

File tree

lib/prism/parse_result.rb

Lines changed: 21 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -189,6 +189,8 @@ def character_column(byte_offset)
189189
#--
190190
#: (Integer byte_offset, Encoding encoding) -> Integer
191191
def code_units_offset(byte_offset, encoding)
192+
return byte_offset if encoding == Encoding::UTF_8
193+
192194
byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)
193195

194196
if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
@@ -250,6 +252,14 @@ def find_line(byte_offset) # :nodoc:
250252
# has not yet been implemented.
251253
#
252254
class CodeUnitsCache
255+
# Counter used for UTF-8, where one code unit equals one byte.
256+
class UTF8Counter # :nodoc:
257+
#: (Integer byte_offset, Integer byte_length) -> Integer
258+
def count(byte_offset, byte_length)
259+
byte_length
260+
end
261+
end
262+
253263
class UTF16Counter # :nodoc:
254264
# @rbs @source: String
255265
# @rbs @encoding: Encoding
@@ -266,7 +276,10 @@ def count(byte_offset, byte_length)
266276
end
267277
end
268278

269-
class LengthCounter # :nodoc:
279+
# Counter used for UTF-32, where one code unit equals one code point and
280+
# matches String#length. Also used as a best-effort fallback for any other
281+
# encoding that does not have a dedicated counter.
282+
class UTF32Counter # :nodoc:
270283
# @rbs @source: String
271284
# @rbs @encoding: Encoding
272285

@@ -282,10 +295,10 @@ def count(byte_offset, byte_length)
282295
end
283296
end
284297

285-
private_constant :UTF16Counter, :LengthCounter
298+
private_constant :UTF8Counter, :UTF16Counter, :UTF32Counter
286299

287300
# @rbs @source: String
288-
# @rbs @counter: UTF16Counter | LengthCounter
301+
# @rbs @counter: UTF8Counter | UTF16Counter | UTF32Counter
289302
# @rbs @cache: Hash[Integer, Integer]
290303
# @rbs @offsets: Array[Integer]
291304

@@ -295,10 +308,13 @@ def count(byte_offset, byte_length)
295308
def initialize(source, encoding)
296309
@source = source
297310
@counter =
298-
if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
311+
case encoding
312+
when Encoding::UTF_8
313+
UTF8Counter.new
314+
when Encoding::UTF_16LE, Encoding::UTF_16BE
299315
UTF16Counter.new(source, encoding)
300316
else
301-
LengthCounter.new(source, encoding)
317+
UTF32Counter.new(source, encoding)
302318
end
303319

304320
@cache = {} #: Hash[Integer, Integer]

rbi/generated/prism/parse_result.rbi

Lines changed: 10 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

sig/generated/prism/parse_result.rbs

Lines changed: 11 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

test/prism/ruby/location_test.rb

Lines changed: 20 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -73,72 +73,72 @@ def test_code_units
7373
assert_equal 0, location.start_code_units_offset(Encoding::UTF_16LE)
7474
assert_equal 0, location.start_code_units_offset(Encoding::UTF_32LE)
7575

76-
assert_equal 1, location.end_code_units_offset(Encoding::UTF_8)
76+
assert_equal 4, location.end_code_units_offset(Encoding::UTF_8)
7777
assert_equal 2, location.end_code_units_offset(Encoding::UTF_16LE)
7878
assert_equal 1, location.end_code_units_offset(Encoding::UTF_32LE)
7979

8080
assert_equal 0, location.start_code_units_column(Encoding::UTF_8)
8181
assert_equal 0, location.start_code_units_column(Encoding::UTF_16LE)
8282
assert_equal 0, location.start_code_units_column(Encoding::UTF_32LE)
8383

84-
assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
84+
assert_equal 4, location.end_code_units_column(Encoding::UTF_8)
8585
assert_equal 2, location.end_code_units_column(Encoding::UTF_16LE)
8686
assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)
8787

8888
# second 😀
8989
location = program.statements.body.first.arguments.arguments.first.location
9090

91-
assert_equal 4, location.start_code_units_offset(Encoding::UTF_8)
91+
assert_equal 7, location.start_code_units_offset(Encoding::UTF_8)
9292
assert_equal 5, location.start_code_units_offset(Encoding::UTF_16LE)
9393
assert_equal 4, location.start_code_units_offset(Encoding::UTF_32LE)
9494

95-
assert_equal 5, location.end_code_units_offset(Encoding::UTF_8)
95+
assert_equal 11, location.end_code_units_offset(Encoding::UTF_8)
9696
assert_equal 7, location.end_code_units_offset(Encoding::UTF_16LE)
9797
assert_equal 5, location.end_code_units_offset(Encoding::UTF_32LE)
9898

99-
assert_equal 4, location.start_code_units_column(Encoding::UTF_8)
99+
assert_equal 7, location.start_code_units_column(Encoding::UTF_8)
100100
assert_equal 5, location.start_code_units_column(Encoding::UTF_16LE)
101101
assert_equal 4, location.start_code_units_column(Encoding::UTF_32LE)
102102

103-
assert_equal 5, location.end_code_units_column(Encoding::UTF_8)
103+
assert_equal 11, location.end_code_units_column(Encoding::UTF_8)
104104
assert_equal 7, location.end_code_units_column(Encoding::UTF_16LE)
105105
assert_equal 5, location.end_code_units_column(Encoding::UTF_32LE)
106106

107107
# first 😍
108108
location = program.statements.body.last.name_loc
109109

110-
assert_equal 6, location.start_code_units_offset(Encoding::UTF_8)
110+
assert_equal 12, location.start_code_units_offset(Encoding::UTF_8)
111111
assert_equal 8, location.start_code_units_offset(Encoding::UTF_16LE)
112112
assert_equal 6, location.start_code_units_offset(Encoding::UTF_32LE)
113113

114-
assert_equal 7, location.end_code_units_offset(Encoding::UTF_8)
114+
assert_equal 16, location.end_code_units_offset(Encoding::UTF_8)
115115
assert_equal 10, location.end_code_units_offset(Encoding::UTF_16LE)
116116
assert_equal 7, location.end_code_units_offset(Encoding::UTF_32LE)
117117

118118
assert_equal 0, location.start_code_units_column(Encoding::UTF_8)
119119
assert_equal 0, location.start_code_units_column(Encoding::UTF_16LE)
120120
assert_equal 0, location.start_code_units_column(Encoding::UTF_32LE)
121121

122-
assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
122+
assert_equal 4, location.end_code_units_column(Encoding::UTF_8)
123123
assert_equal 2, location.end_code_units_column(Encoding::UTF_16LE)
124124
assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)
125125

126126
# second 😍
127127
location = program.statements.body.last.value.location
128128

129-
assert_equal 12, location.start_code_units_offset(Encoding::UTF_8)
129+
assert_equal 21, location.start_code_units_offset(Encoding::UTF_8)
130130
assert_equal 15, location.start_code_units_offset(Encoding::UTF_16LE)
131131
assert_equal 12, location.start_code_units_offset(Encoding::UTF_32LE)
132132

133-
assert_equal 13, location.end_code_units_offset(Encoding::UTF_8)
133+
assert_equal 25, location.end_code_units_offset(Encoding::UTF_8)
134134
assert_equal 17, location.end_code_units_offset(Encoding::UTF_16LE)
135135
assert_equal 13, location.end_code_units_offset(Encoding::UTF_32LE)
136136

137-
assert_equal 6, location.start_code_units_column(Encoding::UTF_8)
137+
assert_equal 9, location.start_code_units_column(Encoding::UTF_8)
138138
assert_equal 7, location.start_code_units_column(Encoding::UTF_16LE)
139139
assert_equal 6, location.start_code_units_column(Encoding::UTF_32LE)
140140

141-
assert_equal 7, location.end_code_units_column(Encoding::UTF_8)
141+
assert_equal 13, location.end_code_units_column(Encoding::UTF_8)
142142
assert_equal 9, location.end_code_units_column(Encoding::UTF_16LE)
143143
assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE)
144144
end
@@ -157,34 +157,34 @@ def test_cached_code_units
157157
assert_equal 0, location.cached_start_code_units_offset(utf16_cache)
158158
assert_equal 0, location.cached_start_code_units_offset(utf32_cache)
159159

160-
assert_equal 1, location.cached_end_code_units_offset(utf8_cache)
160+
assert_equal 4, location.cached_end_code_units_offset(utf8_cache)
161161
assert_equal 2, location.cached_end_code_units_offset(utf16_cache)
162162
assert_equal 1, location.cached_end_code_units_offset(utf32_cache)
163163

164164
assert_equal 0, location.cached_start_code_units_column(utf8_cache)
165165
assert_equal 0, location.cached_start_code_units_column(utf16_cache)
166166
assert_equal 0, location.cached_start_code_units_column(utf32_cache)
167167

168-
assert_equal 1, location.cached_end_code_units_column(utf8_cache)
168+
assert_equal 4, location.cached_end_code_units_column(utf8_cache)
169169
assert_equal 2, location.cached_end_code_units_column(utf16_cache)
170170
assert_equal 1, location.cached_end_code_units_column(utf32_cache)
171171

172172
# second 😀
173173
location = result.value.statements.body.first.arguments.arguments.first.location
174174

175-
assert_equal 4, location.cached_start_code_units_offset(utf8_cache)
175+
assert_equal 7, location.cached_start_code_units_offset(utf8_cache)
176176
assert_equal 5, location.cached_start_code_units_offset(utf16_cache)
177177
assert_equal 4, location.cached_start_code_units_offset(utf32_cache)
178178

179-
assert_equal 5, location.cached_end_code_units_offset(utf8_cache)
179+
assert_equal 11, location.cached_end_code_units_offset(utf8_cache)
180180
assert_equal 7, location.cached_end_code_units_offset(utf16_cache)
181181
assert_equal 5, location.cached_end_code_units_offset(utf32_cache)
182182

183-
assert_equal 4, location.cached_start_code_units_column(utf8_cache)
183+
assert_equal 7, location.cached_start_code_units_column(utf8_cache)
184184
assert_equal 5, location.cached_start_code_units_column(utf16_cache)
185185
assert_equal 4, location.cached_start_code_units_column(utf32_cache)
186186

187-
assert_equal 5, location.cached_end_code_units_column(utf8_cache)
187+
assert_equal 11, location.cached_end_code_units_column(utf8_cache)
188188
assert_equal 7, location.cached_end_code_units_column(utf16_cache)
189189
assert_equal 5, location.cached_end_code_units_column(utf32_cache)
190190
end
@@ -200,7 +200,7 @@ def test_code_units_binary_valid_utf8
200200
assert_equal "😀".b.to_sym, receiver.name
201201

202202
location = receiver.location
203-
assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
203+
assert_equal 4, location.end_code_units_column(Encoding::UTF_8)
204204
assert_equal 2, location.end_code_units_column(Encoding::UTF_16LE)
205205
assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)
206206
end

0 commit comments

Comments
 (0)