Skip to content

Commit

Permalink
Merge pull request #915 from rodjek/gh-912
Browse files Browse the repository at this point in the history
(#912) Count consumed chars not bytes when slurping strings
  • Loading branch information
rodjek committed Feb 15, 2020
2 parents 8b21954 + 7dace38 commit a9654ae
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 5 deletions.
6 changes: 3 additions & 3 deletions lib/puppet-lint/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ def tokenise(code)
begin
string_segments = slurper.parse
process_string_segments(string_segments)
length = slurper.consumed_bytes + 1
length = slurper.consumed_chars + 1
rescue PuppetLint::Lexer::StringSlurper::UnterminatedStringError
raise PuppetLint::LexerError.new(@line_no, @column, 'unterminated string')
end
Expand Down Expand Up @@ -287,7 +287,7 @@ def tokenise(code)
slurper = PuppetLint::Lexer::StringSlurper.new(code[i + length..-1])
heredoc_segments = slurper.parse_heredoc(heredoc_tag)
process_heredoc_segments(heredoc_segments)
length += slurper.consumed_bytes
length += slurper.consumed_chars
end

elsif eol = chunk[%r{\A(#{LINE_END_RE})}, 1]
Expand All @@ -299,7 +299,7 @@ def tokenise(code)
slurper = PuppetLint::Lexer::StringSlurper.new(code[i + length..-1])
heredoc_segments = slurper.parse_heredoc(heredoc_tag)
process_heredoc_segments(heredoc_segments)
length += slurper.consumed_bytes
length += slurper.consumed_chars
end

elsif chunk.start_with?('/')
Expand Down
15 changes: 13 additions & 2 deletions lib/puppet-lint/lexer/string_slurper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,19 @@ def read_char
end
end

def consumed_bytes
scanner.pos
# Report how many characters (not bytes) the StringSlurper has consumed.
#
# On Ruby 2.0 and later, StringScanner provides #charpos, which is aware
# of multibyte characters, so it is used whenever it is available.
#
# Older Rubies have no #charpos, and there neither String#length nor
# String#split can be relied on to count multibyte characters. For those
# the consumed text is rebuilt from the scanner's last match and then
# broken into characters with a UTF-8 aware regex; the number of pieces
# is the character count.
def consumed_chars
  if scanner.respond_to?(:charpos)
    scanner.charpos
  else
    consumed_text = scanner.pre_match + scanner.matched
    consumed_text.scan(%r{.}mu).length
  end
end

def start_interp
Expand Down
22 changes: 22 additions & 0 deletions spec/puppet-lint/lexer/string_slurper_spec.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# encoding: utf-8

require 'spec_helper'

describe PuppetLint::Lexer::StringSlurper do
Expand Down Expand Up @@ -448,4 +450,24 @@
end
end
end

# Checks that #consumed_chars counts consumed characters rather than bytes.
describe '#consumed_chars' do
# Build a slurper over `string`, run #parse on it, then read the resulting
# consumed character count.
subject { described_class.new(string).tap(&:parse).consumed_chars }

context 'when slurping a string containing multibyte characters' do
# Ten characters including the closing quote; the "é" takes two bytes in
# UTF-8, so a byte-based count would be 11 instead.
let(:string) { 'accentués"' }

it 'counts the multibyte character as a single consumed character' do
is_expected.to eq(10)
end
end

context 'when slurping an empty string' do
# The input is just the closing quote, with no string content before it.
let(:string) { '"' }

it 'consumes only the closing quote' do
is_expected.to eq(1)
end
end
end
end

0 comments on commit a9654ae

Please sign in to comment.