Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add code unit APIs to location #2406

Merged
merged 1 commit into from
Feb 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions lib/prism/parse_result.rb
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,23 @@ def character_column(byte_offset)
character_offset(byte_offset) - character_offset(line_start(byte_offset))
end

# Returns the offset from the start of the file for the given byte offset
# counting in code units for the given encoding.
#
# This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
# concept of code units that differs from the number of characters in other
# encodings, it is not captured here.
def code_units_offset(byte_offset, encoding)
byteslice = source.byteslice(0, byte_offset).encode(encoding)
(encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE) ? (byteslice.bytesize / 2) : byteslice.length
end

# Returns the column number in code units for the given encoding for the
# given byte offset.
def code_units_column(byte_offset, encoding)
code_units_offset(byte_offset, encoding) - code_units_offset(line_start(byte_offset), encoding)
end

private

# Binary search through the offsets to find the line number for the given
Expand Down Expand Up @@ -138,6 +155,11 @@ def start_character_offset
source.character_offset(start_offset)
end

# The offset from the start of the file in code units of the given encoding.
def start_code_units_offset(encoding = Encoding::UTF_16LE)
source.code_units_offset(start_offset, encoding)
end

# The byte offset from the beginning of the source where this location ends.
def end_offset
start_offset + length
Expand All @@ -149,6 +171,11 @@ def end_character_offset
source.character_offset(end_offset)
end

# The offset from the start of the file in code units of the given encoding.
def end_code_units_offset(encoding = Encoding::UTF_16LE)
source.code_units_offset(end_offset, encoding)
end

# The line number where this location starts.
def start_line
source.line(start_offset)
Expand Down Expand Up @@ -177,6 +204,12 @@ def start_character_column
source.character_column(start_offset)
end

# The column number in code units of the given encoding where this location
# starts from the start of the line.
def start_code_units_column(encoding = Encoding::UTF_16LE)
source.code_units_column(start_offset, encoding)
end

# The column number in bytes where this location ends from the start of the
# line.
def end_column
Expand All @@ -189,6 +222,12 @@ def end_character_column
source.character_column(end_offset)
end

# The column number in code units of the given encoding where this location
# ends from the start of the line.
def end_code_units_column(encoding = Encoding::UTF_16LE)
source.code_units_column(end_offset, encoding)
end

# Implement the hash pattern matching interface for Location.
def deconstruct_keys(keys)
{ start_offset: start_offset, end_offset: end_offset }
Expand Down
80 changes: 80 additions & 0 deletions test/prism/ruby_api_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,86 @@ def test_location_character_offsets
assert_equal 7, location.end_character_column
end

def test_location_code_units
program = Prism.parse("😀 + 😀\n😍 ||= 😍").value

# first 😀
location = program.statements.body.first.receiver.location

assert_equal 0, location.start_code_units_offset(Encoding::UTF_8)
assert_equal 0, location.start_code_units_offset(Encoding::UTF_16LE)
assert_equal 0, location.start_code_units_offset(Encoding::UTF_32LE)

assert_equal 1, location.end_code_units_offset(Encoding::UTF_8)
assert_equal 2, location.end_code_units_offset(Encoding::UTF_16LE)
assert_equal 1, location.end_code_units_offset(Encoding::UTF_32LE)

assert_equal 0, location.start_code_units_column(Encoding::UTF_8)
assert_equal 0, location.start_code_units_column(Encoding::UTF_16LE)
assert_equal 0, location.start_code_units_column(Encoding::UTF_32LE)

assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
assert_equal 2, location.end_code_units_column(Encoding::UTF_16LE)
assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)

# second 😀
location = program.statements.body.first.arguments.arguments.first.location

assert_equal 4, location.start_code_units_offset(Encoding::UTF_8)
assert_equal 5, location.start_code_units_offset(Encoding::UTF_16LE)
assert_equal 4, location.start_code_units_offset(Encoding::UTF_32LE)

assert_equal 5, location.end_code_units_offset(Encoding::UTF_8)
assert_equal 7, location.end_code_units_offset(Encoding::UTF_16LE)
assert_equal 5, location.end_code_units_offset(Encoding::UTF_32LE)

assert_equal 4, location.start_code_units_column(Encoding::UTF_8)
assert_equal 5, location.start_code_units_column(Encoding::UTF_16LE)
assert_equal 4, location.start_code_units_column(Encoding::UTF_32LE)

assert_equal 5, location.end_code_units_column(Encoding::UTF_8)
assert_equal 7, location.end_code_units_column(Encoding::UTF_16LE)
assert_equal 5, location.end_code_units_column(Encoding::UTF_32LE)

# first 😍
location = program.statements.body.last.name_loc

assert_equal 6, location.start_code_units_offset(Encoding::UTF_8)
assert_equal 8, location.start_code_units_offset(Encoding::UTF_16LE)
assert_equal 6, location.start_code_units_offset(Encoding::UTF_32LE)

assert_equal 7, location.end_code_units_offset(Encoding::UTF_8)
assert_equal 10, location.end_code_units_offset(Encoding::UTF_16LE)
assert_equal 7, location.end_code_units_offset(Encoding::UTF_32LE)

assert_equal 0, location.start_code_units_column(Encoding::UTF_8)
assert_equal 0, location.start_code_units_column(Encoding::UTF_16LE)
assert_equal 0, location.start_code_units_column(Encoding::UTF_32LE)

assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
assert_equal 2, location.end_code_units_column(Encoding::UTF_16LE)
assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)

# second 😍
location = program.statements.body.last.value.location

assert_equal 12, location.start_code_units_offset(Encoding::UTF_8)
assert_equal 15, location.start_code_units_offset(Encoding::UTF_16LE)
assert_equal 12, location.start_code_units_offset(Encoding::UTF_32LE)

assert_equal 13, location.end_code_units_offset(Encoding::UTF_8)
assert_equal 17, location.end_code_units_offset(Encoding::UTF_16LE)
assert_equal 13, location.end_code_units_offset(Encoding::UTF_32LE)

assert_equal 6, location.start_code_units_column(Encoding::UTF_8)
assert_equal 7, location.start_code_units_column(Encoding::UTF_16LE)
assert_equal 6, location.start_code_units_column(Encoding::UTF_32LE)

assert_equal 7, location.end_code_units_column(Encoding::UTF_8)
assert_equal 9, location.end_code_units_column(Encoding::UTF_16LE)
assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE)
end

def test_heredoc?
refute parse_expression("\"foo\"").heredoc?
refute parse_expression("\"foo \#{1}\"").heredoc?
Expand Down
Loading