Merge pull request #915 from rodjek/gh-912

(#912) Count consumed chars not bytes when slurping strings
rodjek · Feb 15, 2020 · a9654ae · a9654ae
2 parents 8b21954 + 7dace38
commit a9654ae
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 5 deletions.
diff --git a/lib/puppet-lint/lexer.rb b/lib/puppet-lint/lexer.rb
@@ -240,7 +240,7 @@ def tokenise(code)
           begin
             string_segments = slurper.parse
             process_string_segments(string_segments)
-            length = slurper.consumed_bytes + 1
+            length = slurper.consumed_chars + 1
           rescue PuppetLint::Lexer::StringSlurper::UnterminatedStringError
             raise PuppetLint::LexerError.new(@line_no, @column, 'unterminated string')
           end
@@ -287,7 +287,7 @@ def tokenise(code)
             slurper = PuppetLint::Lexer::StringSlurper.new(code[i + length..-1])
             heredoc_segments = slurper.parse_heredoc(heredoc_tag)
             process_heredoc_segments(heredoc_segments)
-            length += slurper.consumed_bytes
+            length += slurper.consumed_chars
           end
 
         elsif eol = chunk[%r{\A(#{LINE_END_RE})}, 1]
@@ -299,7 +299,7 @@ def tokenise(code)
             slurper = PuppetLint::Lexer::StringSlurper.new(code[i + length..-1])
             heredoc_segments = slurper.parse_heredoc(heredoc_tag)
             process_heredoc_segments(heredoc_segments)
-            length += slurper.consumed_bytes
+            length += slurper.consumed_chars
           end
 
         elsif chunk.start_with?('/')

diff --git a/lib/puppet-lint/lexer/string_slurper.rb b/lib/puppet-lint/lexer/string_slurper.rb
@@ -98,8 +98,19 @@ def read_char
         end
       end
 
-      def consumed_bytes
-        scanner.pos
+      # Get the number of characters consumed by the StringSlurper.
+      #
+      # StringScanner from Ruby 2.0 onwards supports #charpos, which returns
+      # the scan position in characters and is multibyte character aware.
+      #
+      # Prior to this, Ruby's multibyte character support in Strings was a
+      # bit unusual and neither String#length nor String#split behaves as
+      # expected, so we use String#scan to split all the consumed text using
+      # a UTF-8 aware regex and use the length of the result.
+      def consumed_chars
+        return scanner.charpos if scanner.respond_to?(:charpos)
+
+        (scanner.pre_match + scanner.matched).scan(%r{.}mu).length
       end
 
       def start_interp

diff --git a/spec/puppet-lint/lexer/string_slurper_spec.rb b/spec/puppet-lint/lexer/string_slurper_spec.rb
@@ -1,3 +1,5 @@
+# encoding: utf-8
+
 require 'spec_helper'
 
 describe PuppetLint::Lexer::StringSlurper do
@@ -448,4 +450,24 @@
       end
     end
   end
+
+  describe '#consumed_chars' do
+    subject { described_class.new(string).tap(&:parse).consumed_chars }
+
+    context 'when slurping a string containing multibyte characters' do
+      let(:string) { 'accentués"' }
+
+      it 'counts the multibyte character as a single consumed character' do
+        is_expected.to eq(10)
+      end
+    end
+
+    context 'when slurping an empty string' do
+      let(:string) { '"' }
+
+      it 'consumes only the closing quote' do
+        is_expected.to eq(1)
+      end
+    end
+  end
 end