lib/puppet/pops/parser/lexer2.rb

# frozen_string_literal: true
# The Lexer is responsible for turning source text into tokens.
# This version is a performance enhanced lexer (in comparison to the 3.x and earlier "future parser" lexer).
#
# Old returns tokens [:KEY, value, { locator = }
# Could return [[token], locator]
# or Token.new([token], locator) with the same API x[0] = token_symbol, x[1] = self, x[:key] = (:value, :file, :line, :pos) etc

require 'strscan'
require 'puppet/pops/parser/lexer_support'
require 'puppet/pops/parser/heredoc_support'
require 'puppet/pops/parser/interpolation_support'
require 'puppet/pops/parser/epp_support'
require 'puppet/pops/parser/slurp_support'

module Puppet::Pops
module Parser
class Lexer2
  include LexerSupport
  include HeredocSupport
  include InterpolationSupport
  include SlurpSupport
  include EppSupport

  # All tokens have three slots: the token name (a Symbol), the token text (a String), and the
  # token text length. All operator and punctuation tokens reuse the frozen singleton arrays below.
  # Tokens that require unique values create a unique array per token.
  #
  # PERFORMANCE NOTES:
  # This construct reduces the number of objects that need to be created for operators and punctuation.
  # The length is pre-calculated for all singleton tokens. The length is used both to signal the length of
  # the token, and to advance the scanner position (without having to advance it with a scan(regexp)).
  #
  TOKEN_LBRACK       = [:LBRACK,       '[',   1].freeze
  TOKEN_LISTSTART    = [:LISTSTART,    '[',   1].freeze
  TOKEN_RBRACK       = [:RBRACK,       ']',   1].freeze
  TOKEN_LBRACE       = [:LBRACE,       '{',   1].freeze
  TOKEN_RBRACE       = [:RBRACE,       '}',   1].freeze
  TOKEN_SELBRACE     = [:SELBRACE,     '{',   1].freeze
  TOKEN_LPAREN       = [:LPAREN,       '(',   1].freeze
  TOKEN_WSLPAREN     = [:WSLPAREN,     '(',   1].freeze
  TOKEN_RPAREN       = [:RPAREN,       ')',   1].freeze

  TOKEN_EQUALS       = [:EQUALS,       '=',   1].freeze
  TOKEN_APPENDS      = [:APPENDS,      '+=',  2].freeze
  TOKEN_DELETES      = [:DELETES,      '-=',  2].freeze

  TOKEN_ISEQUAL      = [:ISEQUAL,      '==',  2].freeze
  TOKEN_NOTEQUAL     = [:NOTEQUAL,     '!=',  2].freeze
  TOKEN_MATCH        = [:MATCH,        '=~',  2].freeze
  TOKEN_NOMATCH      = [:NOMATCH,      '!~',  2].freeze
  TOKEN_GREATEREQUAL = [:GREATEREQUAL, '>=',  2].freeze
  TOKEN_GREATERTHAN  = [:GREATERTHAN,  '>',   1].freeze
  TOKEN_LESSEQUAL    = [:LESSEQUAL,    '<=',  2].freeze
  TOKEN_LESSTHAN     = [:LESSTHAN,     '<',   1].freeze

  TOKEN_FARROW       = [:FARROW,       '=>',  2].freeze
  TOKEN_PARROW       = [:PARROW,       '+>',  2].freeze

  TOKEN_LSHIFT       = [:LSHIFT,       '<<',  2].freeze
  TOKEN_LLCOLLECT    = [:LLCOLLECT,    '<<|', 3].freeze
  TOKEN_LCOLLECT     = [:LCOLLECT,     '<|',  2].freeze

  TOKEN_RSHIFT       = [:RSHIFT,       '>>',  2].freeze
  TOKEN_RRCOLLECT    = [:RRCOLLECT,    '|>>', 3].freeze
  TOKEN_RCOLLECT     = [:RCOLLECT,     '|>',  2].freeze

  TOKEN_PLUS         = [:PLUS,         '+',   1].freeze
  TOKEN_MINUS        = [:MINUS,        '-',   1].freeze
  TOKEN_DIV          = [:DIV,          '/',   1].freeze
  TOKEN_TIMES        = [:TIMES,        '*',   1].freeze
  TOKEN_MODULO       = [:MODULO,       '%',   1].freeze

  TOKEN_NOT          = [:NOT,          '!',   1].freeze
  TOKEN_DOT          = [:DOT,          '.',   1].freeze
  TOKEN_PIPE         = [:PIPE,         '|',   1].freeze
  TOKEN_AT           = [:AT,           '@',   1].freeze
  TOKEN_ATAT         = [:ATAT,         '@@',  2].freeze
  TOKEN_COLON        = [:COLON,        ':',   1].freeze
  TOKEN_COMMA        = [:COMMA,        ',',   1].freeze
  TOKEN_SEMIC        = [:SEMIC,        ';',   1].freeze
  TOKEN_QMARK        = [:QMARK,        '?',   1].freeze
  TOKEN_TILDE        = [:TILDE,        '~',   1].freeze # lexed but not an operator in Puppet

  # Template only: a lexed regular expression gets its own array with the Regexp as value.
  # The symbol is :REGEX, which is what the lexer emits for a regular expression and what
  # regexp_acceptable? checks for.
  TOKEN_REGEXP       = [:REGEX,        nil,   0].freeze

  TOKEN_IN_EDGE      = [:IN_EDGE,      '->',  2].freeze
  TOKEN_IN_EDGE_SUB  = [:IN_EDGE_SUB,  '~>',  2].freeze
  TOKEN_OUT_EDGE     = [:OUT_EDGE,     '<-',  2].freeze
  TOKEN_OUT_EDGE_SUB = [:OUT_EDGE_SUB, '<~',  2].freeze

  # Tokens that are always unique to what has been lexed. These are templates that document the
  # shape of such tokens; the lexer builds a fresh array carrying the lexed text and its length
  # for each occurrence.
  TOKEN_STRING         = [:STRING,      nil,  0].freeze
  TOKEN_WORD           = [:WORD,        nil,  0].freeze
  # The three parts of an interpolated double quoted string. Each template carries its own
  # symbol; :DQMID and :DQPOST are the names regexp_acceptable? checks for.
  TOKEN_DQPRE          = [:DQPRE,       nil,  0].freeze
  TOKEN_DQMID          = [:DQMID,       nil,  0].freeze
  TOKEN_DQPOS          = [:DQPOST,      nil,  0].freeze
  TOKEN_NUMBER         = [:NUMBER,      nil,  0].freeze
  TOKEN_VARIABLE       = [:VARIABLE,    nil,  1].freeze
  # Emitted for a '$' that is not followed by a variable name (only the '$' is consumed).
  TOKEN_VARIABLE_EMPTY = [:VARIABLE,    '',   1].freeze

  # HEREDOC has syntax as an argument.
  TOKEN_HEREDOC        = [:HEREDOC,     nil,  0].freeze

  # EPP_START is currently a marker token, may later get syntax
  TOKEN_EPPSTART    = [:EPP_START,      nil,  0].freeze
  TOKEN_EPPEND      = [:EPP_END,       '%>',  2].freeze
  TOKEN_EPPEND_TRIM = [:EPP_END_TRIM, '-%>',  3].freeze

  # This is used for unrecognized tokens, will always be a single character. This particular instance
  # is not used, but is kept here for documentation purposes. Frozen like every other token constant.
  TOKEN_OTHER = [:OTHER, nil, 0].freeze

  # Every keyword maps to one shared, frozen token array whose length is known up front.
  # The two boolean keywords carry real true/false values, so the text "true"/"false" never
  # has to be evaluated when lexing.
  #
  KEYWORDS = {
    'case'        => [:CASE,        'case',         4],
    'class'       => [:CLASS,       'class',        5],
    'default'     => [:DEFAULT,     'default',      7],
    'define'      => [:DEFINE,      'define',       6],
    'if'          => [:IF,          'if',           2],
    'elsif'       => [:ELSIF,       'elsif',        5],
    'else'        => [:ELSE,        'else',         4],
    'inherits'    => [:INHERITS,    'inherits',     8],
    'node'        => [:NODE,        'node',         4],
    'and'         => [:AND,         'and',          3],
    'or'          => [:OR,          'or',           2],
    'undef'       => [:UNDEF,       'undef',        5],
    'false'       => [:BOOLEAN,     false,          5],
    'true'        => [:BOOLEAN,     true,           4],
    'in'          => [:IN,          'in',           2],
    'unless'      => [:UNLESS,      'unless',       6],
    'function'    => [:FUNCTION,    'function',     8],
    'type'        => [:TYPE,        'type',         4],
    'attr'        => [:ATTR,        'attr',         4],
    'private'     => [:PRIVATE,     'private',      7],
    'application' => [:APPLICATION, 'application', 11],
    'consumes'    => [:CONSUMES,    'consumes',     8],
    'produces'    => [:PRODUCES,    'produces',     8],
    'site'        => [:SITE,        'site',         4],
  }

  # Freeze each token array and the value it carries, then the table itself.
  KEYWORDS.each_value do |token|
    token[1].freeze
    token.freeze
  end
  KEYWORDS.freeze

  # Reverse lookup from token symbol to keyword text. Both boolean keywords share :BOOLEAN,
  # so the one listed last in KEYWORDS is the one recorded for that symbol.
  KEYWORD_NAMES = KEYWORDS.each_with_object({}) { |(word, token), names| names[token[0]] = word }.freeze

  # Horizontal whitespace to skip between tokens; a carriage return counts as whitespace as well.
  PATTERN_WS        = %r{[[:blank:]\r]+}
  # A run of word characters; used with scan_until to pick up the text of a malformed number
  # for error reporting.
  PATTERN_NON_WS    = %r{\w+\b?}

  # A single line comment runs to the end of the line. The terminating "\n" is never part of
  # the match (`.` does not match a newline here); a "\r" in front of it is consumed.
  PATTERN_COMMENT   = %r{#.*\r?}
  # A /* ... */ comment; multiline so that the body may span lines.
  PATTERN_MLCOMMENT = %r{/\*(.*?)\*/}m

  # A regular expression literal up to the next '/'. That '/' may turn out to be escaped,
  # in which case the lexer keeps scanning with PATTERN_REGEX_END.
  PATTERN_REGEX     = %r{/[^/]*/}
  PATTERN_REGEX_END = %r{/}
  PATTERN_REGEX_A   = %r{\A/} # for replacement to ""
  PATTERN_REGEX_Z   = %r{/\Z} # for replacement to ""
  PATTERN_REGEX_ESC = %r{\\/} # for replacement to "/"

  # The 3x patterns:
  # PATTERN_CLASSREF       = %r{((::){0,1}[A-Z][-\w]*)+}
  # PATTERN_NAME           = %r{((::)?[a-z0-9][-\w]*)(::[a-z0-9][-\w]*)*}

  # The NAME and CLASSREF in 4x are strict. Each segment must start with
  # a letter a-z and may not contain dashes (\w includes letters, digits and _).
  #
  # Note that PATTERN_NAME is anchored with ^ and $ (line anchors). Within this class it is only
  # matched against text that was already scanned with PATTERN_BARE_WORD, which cannot contain
  # a line break.
  #
  PATTERN_CLASSREF       = %r{((::){0,1}[A-Z][\w]*)+}
  PATTERN_NAME           = %r{^((::)?[a-z][\w]*)(::[a-z][\w]*)*$}

  # A bare word is more lenient than a NAME: segments may start with '_' and may contain
  # (but not end with) dashes.
  PATTERN_BARE_WORD     = %r{((?:::){0,1}(?:[a-z_](?:[\w-]*[\w])?))+}

  # A '$' followed by an optionally '::' prefixed, '::' separated variable name.
  PATTERN_DOLLAR_VAR     = %r{\$(::)?(\w+::)*\w+}
  # Hexadecimal, or decimal with optional fraction and exponent.
  PATTERN_NUMBER         = %r{\b(?:0[xX][0-9A-Fa-f]+|0?\d+(?:\.\d+)?(?:[eE]-?\d+)?)\b}

  # PERFORMANCE NOTE:
  # Comparison against a frozen string is faster (than unfrozen).
  #
  # The two characters backslash + slash; used by escaped_end to detect an escaped regexp terminator.
  STRING_BSLASH_SLASH = '\/'.freeze

  # The locator for the text currently being lexed. It is set by lex_string, lex_file and
  # lex_unquoted_string, and reset to nil by clear.
  attr_reader :locator

  # Creates the lexer and builds its dispatch table, @selector.
  #
  # The table maps the character at the scanner's current position (the result of peek(1)) to a
  # lambda that lexes whatever starts with that character. A lambda returns either a token on the
  # form produced by emit / emit_completed, or nil when there is nothing to emit (whitespace,
  # comments, newline, end of input). Problems are reported by calling lex_error. Characters that
  # have no entry of their own are handled by the table's default lambda.
  #
  # The lambdas read @scanner, @lexing_context, @token_queue and @taskm_keywords at the time they
  # are called. Those are not set here; they are set up by lex_string, lex_file and
  # lex_unquoted_string. The table itself is frozen once built.
  #
  def initialize()
    @selector = {
      '.' =>  lambda { emit(TOKEN_DOT, @scanner.pos) },
      ',' => lambda {  emit(TOKEN_COMMA, @scanner.pos) },
      '[' => lambda do
        before = @scanner.pos
        # Must check the preceding character to see if it is whitespace.
        # The fastest thing to do is to simply byteslice to get the string ending at the offset before
        # and then check what the last character is. (This is the same as  what an locator.char_offset needs
        # to compute, but with less overhead of trying to find out the global offset from a local offset in the
        # case when this is sublocated in a heredoc).
        if before == 0 || @scanner.string.byteslice(0, before)[-1] =~ /[[:blank:]\r\n]+/
          emit(TOKEN_LISTSTART, before)
        else
          emit(TOKEN_LBRACK, before)
        end
      end,
      ']' => lambda { emit(TOKEN_RBRACK, @scanner.pos) },
      '(' => lambda do
        before = @scanner.pos
        # If first on a line, or only whitespace between start of line and '('
        # then the token is special to avoid being taken as start of a call.
        line_start = @lexing_context[:line_lexical_start]
        if before == line_start || @scanner.string.byteslice(line_start, before - line_start) =~ /\A[[:blank:]\r]+\Z/
          emit(TOKEN_WSLPAREN, before)
        else
          emit(TOKEN_LPAREN, before)
        end
      end,
      ')' => lambda { emit(TOKEN_RPAREN, @scanner.pos) },
      ';' => lambda { emit(TOKEN_SEMIC, @scanner.pos) },
      '?' => lambda { emit(TOKEN_QMARK, @scanner.pos) },
      '*' => lambda { emit(TOKEN_TIMES, @scanner.pos) },

      # TOKENS %, and epp '%>' (end of interpolation)
      '%' => lambda do
        scn = @scanner
        before = scn.pos
        la = scn.peek(2)
        if la[1] == '>' && @lexing_context[:epp_mode]
          scn.pos += 2
          # The end token is only enqueued when leaving an expression; in either case lexing
          # continues with the template text that follows.
          if @lexing_context[:epp_mode] == :expr
            enqueue_completed(TOKEN_EPPEND, before)
          end
          @lexing_context[:epp_mode] = :text
          interpolate_epp
        else
          emit(TOKEN_MODULO, before)
        end
      end,
      '{' => lambda do
        # The lexer needs to help the parser since the technology used cannot deal with
        # lookahead of same token with different precedence. This is solved by making left brace
        # after ? into a separate token.
        #
        @lexing_context[:brace_count] += 1
        emit(if @lexing_context[:after] == :QMARK
               TOKEN_SELBRACE
             else
               TOKEN_LBRACE
             end, @scanner.pos)
      end,
      '}' => lambda do
        @lexing_context[:brace_count] -= 1
        emit(TOKEN_RBRACE, @scanner.pos)
      end,


      # TOKENS @, @@, @(
      '@' => lambda do
        scn = @scanner
        la = scn.peek(2)
        if la[1] == '@'
          emit(TOKEN_ATAT, scn.pos) # TODO; Check if this is good for the grammar
        elsif la[1] == '('
          heredoc
        else
          emit(TOKEN_AT, scn.pos)
        end
      end,

      # TOKENS |, |>, |>>
      '|' => lambda do
        scn = @scanner
        la = scn.peek(3)
        emit(la[1] == '>' ? (la[2] == '>' ? TOKEN_RRCOLLECT : TOKEN_RCOLLECT) : TOKEN_PIPE, scn.pos)
      end,

      # TOKENS =, =>, ==, =~
      '=' => lambda do
        scn = @scanner
        la = scn.peek(2)
        emit(case la[1]
             when '='
               TOKEN_ISEQUAL
             when '>'
               TOKEN_FARROW
             when '~'
               TOKEN_MATCH
             else
               TOKEN_EQUALS
             end, scn.pos)
      end,

      # TOKENS '+', '+=', and '+>'
      '+' => lambda do
        scn = @scanner
        la = scn.peek(2)
        emit(case la[1]
             when '='
               TOKEN_APPENDS
             when '>'
               TOKEN_PARROW
             else
               TOKEN_PLUS
             end, scn.pos)
      end,

      # TOKENS '-', '->', '-=', and epp '-%>' (end of interpolation with trim)
      '-' => lambda do
        scn = @scanner
        la = scn.peek(3)
        before = scn.pos
        if @lexing_context[:epp_mode] && la[1] == '%' && la[2] == '>'
          scn.pos += 3
          if @lexing_context[:epp_mode] == :expr
            enqueue_completed(TOKEN_EPPEND_TRIM, before)
          end
          interpolate_epp(:with_trim)
        else
          emit(case la[1]
               when '>'
                 TOKEN_IN_EDGE
               when '='
                 TOKEN_DELETES
               else
                 TOKEN_MINUS
               end, before)
        end
      end,

      # TOKENS !, !=, !~
      '!' => lambda do
        scn = @scanner
        la = scn.peek(2)
        emit(case la[1]
             when '='
               TOKEN_NOTEQUAL
             when '~'
               TOKEN_NOMATCH
             else
               TOKEN_NOT
             end, scn.pos)
      end,

      # TOKENS ~>, ~
      '~' => lambda do
        scn = @scanner
        la = scn.peek(2)
        emit(la[1] == '>' ? TOKEN_IN_EDGE_SUB : TOKEN_TILDE, scn.pos)
      end,

      # A single line comment is skipped; nothing is emitted.
      '#' => lambda { @scanner.skip(PATTERN_COMMENT); nil },

      # TOKENS '/', '/*' and '/ regexp /'
      '/' => lambda do
        scn = @scanner
        la = scn.peek(2)
        if la[1] == '*'
          lex_error(Issues::UNCLOSED_MLCOMMENT) if scn.skip(PATTERN_MLCOMMENT).nil?
          nil
        else
          before = scn.pos
          # regexp position is a regexp, else a div
          value = scn.scan(PATTERN_REGEX) if regexp_acceptable?
          if value
            # Ensure an escaped / was not matched
            while escaped_end(value)
              more = scn.scan_until(PATTERN_REGEX_END)
              # No unescaped terminator found; treat the leading '/' as a division instead
              # (emit repositions the scanner right after it).
              return emit(TOKEN_DIV, before) unless more
              value << more
            end
            regex = value.sub(PATTERN_REGEX_A, '').sub(PATTERN_REGEX_Z, '').gsub(PATTERN_REGEX_ESC, '/')
            emit_completed([:REGEX, Regexp.new(regex), scn.pos-before], before)
          else
            emit(TOKEN_DIV, before)
          end
        end
      end,

      # TOKENS <, <=, <|, <<|, <<, <-, <~
      '<' => lambda do
        scn = @scanner
        la = scn.peek(3)
        emit(case la[1]
             when '<'
               if la[2] == '|'
                 TOKEN_LLCOLLECT
               else
                 TOKEN_LSHIFT
               end
             when '='
               TOKEN_LESSEQUAL
             when '|'
               TOKEN_LCOLLECT
             when '-'
               TOKEN_OUT_EDGE
             when '~'
               TOKEN_OUT_EDGE_SUB
             else
               TOKEN_LESSTHAN
             end, scn.pos)
      end,

      # TOKENS >, >=, >>
      '>' => lambda do
        scn = @scanner
        la = scn.peek(2)
        emit(case la[1]
             when '>'
               TOKEN_RSHIFT
             when '='
               TOKEN_GREATEREQUAL
             else
               TOKEN_GREATERTHAN
             end, scn.pos)
      end,

      # TOKENS :, ::CLASSREF, ::NAME
      ':' => lambda do
        scn = @scanner
        la = scn.peek(3)
        before = scn.pos
        if la[1] == ':'
          # PERFORMANCE NOTE: This could potentially be speeded up by using a case/when listing all
          # upper case letters. Alternatively, the 'A', and 'Z' comparisons may be faster if they are
          # frozen.
          #
          # la2 is nil when the input ends right after '::'. That case must fall through to the
          # name branch (which reports an illegal fully qualified name) instead of failing on
          # a comparison against nil.
          la2 = la[2]
          if la2 && la2 >= 'A' && la2 <= 'Z'
            # CLASSREF or error
            value = scn.scan(PATTERN_CLASSREF)
            if value && scn.peek(2) != '::'
              after = scn.pos
              emit_completed([:CLASSREF, value.freeze, after-before], before)
            else
              # move to faulty position ('::<uc-letter>' was ok)
              scn.pos = scn.pos + 3
              lex_error(Issues::ILLEGAL_FULLY_QUALIFIED_CLASS_REFERENCE)
            end
          else
            value = scn.scan(PATTERN_BARE_WORD)
            if value
              if value =~ PATTERN_NAME
                emit_completed([:NAME, value.freeze, scn.pos - before], before)
              else
                emit_completed([:WORD, value.freeze, scn.pos - before], before)
              end
            else
              # move to faulty position ('::' was ok)
              scn.pos = scn.pos + 2
              lex_error(Issues::ILLEGAL_FULLY_QUALIFIED_NAME)
            end
          end
        else
          emit(TOKEN_COLON, before)
        end
      end,

      '$' => lambda do
        scn = @scanner
        before = scn.pos
        value = scn.scan(PATTERN_DOLLAR_VAR)
        if value
          # The token text is the variable name without the leading '$'
          emit_completed([:VARIABLE, value[1..-1].freeze, scn.pos - before], before)
        else
          # consume the $ and let higher layer complain about the error instead of getting a syntax error
          emit(TOKEN_VARIABLE_EMPTY, before)
        end
      end,

      '"' => lambda do
        # Recursive string interpolation, 'interpolate' either returns a STRING token, or
        # a DQPRE with the rest of the string's tokens placed in the @token_queue
        interpolate_dq
      end,

      "'" => lambda do
        scn = @scanner
        before = scn.pos
        emit_completed([:STRING, slurp_sqstring.freeze, scn.pos - before], before)
      end,

      "\n" => lambda do
        # If heredoc_cont is in effect there are heredoc text lines to skip over
        # otherwise just skip the newline.
        #
        ctx = @lexing_context
        if ctx[:newline_jump]
          @scanner.pos = ctx[:newline_jump]
          ctx[:newline_jump] = nil
        else
          @scanner.pos += 1
        end
        # Remember where the new line starts; the '(' lambda uses it to detect a '(' that is
        # first on its line.
        ctx[:line_lexical_start] = @scanner.pos
        nil
      end,
      '' => lambda { nil } # when the peek(1) returns empty
    }

    # Whitespace is skipped; nothing is emitted.
    [ ' ', "\t", "\r" ].each { |c| @selector[c] = lambda { @scanner.skip(PATTERN_WS); nil } }

    # NUMBER, or an error describing the malformed number
    [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'].each do |c|
      @selector[c] = lambda do
        scn = @scanner
        before = scn.pos
        value = scn.scan(PATTERN_NUMBER)
        if value
          length = scn.pos - before
          assert_numeric(value, before)
          emit_completed([:NUMBER, value.freeze, length], before)
        else
          invalid_number = scn.scan_until(PATTERN_NON_WS)
          if before > 1
            after = scn.pos
            scn.pos = before - 1
            if scn.peek(1) == '.'
              # preceded by a dot. Is this a bad decimal number then?
              scn.pos = before - 2
              # Walk backwards over the digits in front of the dot so that the reported text
              # starts where the malformed number starts.
              while scn.peek(1) =~ /^\d$/
                invalid_number = nil
                before = scn.pos
                break if before == 0
                scn.pos = scn.pos - 1
              end
            end
            scn.pos = before
            invalid_number = scn.peek(after - before) unless invalid_number
          end
          assert_numeric(invalid_number, before)
          scn.pos = before + 1
          lex_error(Issues::ILLEGAL_NUMBER, {:value => invalid_number})
        end
      end
    end

    # Keyword, NAME, or bare WORD
    ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '_'].each do |c|
      @selector[c] = lambda do
        scn = @scanner
        before = scn.pos
        value = scn.scan(PATTERN_BARE_WORD)
        if value && value =~ PATTERN_NAME
          # Keywords reuse their shared token array; anything else becomes a unique NAME token.
          emit_completed(KEYWORDS[value] || @taskm_keywords[value] || [:NAME, value.freeze, scn.pos - before], before)
        elsif value
          emit_completed([:WORD, value.freeze, scn.pos - before], before)
        else
          # move to faulty position ([a-z_] was ok)
          scn.pos = scn.pos + 1
          fully_qualified = scn.match?(/::/)
          if fully_qualified
            lex_error(Issues::ILLEGAL_FULLY_QUALIFIED_NAME)
          else
            lex_error(Issues::ILLEGAL_NAME_OR_BARE_WORD)
          end
        end
      end
    end

    # CLASSREF
    ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'].each do |c|
      @selector[c] = lambda do
        scn = @scanner
        before = scn.pos
        value = @scanner.scan(PATTERN_CLASSREF)
        # A trailing '::' means the reference continues with something that is not a valid segment.
        if value && @scanner.peek(2) != '::'
          emit_completed([:CLASSREF, value.freeze, scn.pos - before], before)
        else
          # move to faulty position ([A-Z] was ok)
          scn.pos = scn.pos + 1
          lex_error(Issues::ILLEGAL_CLASS_REFERENCE)
        end
      end
    end

    @selector.default = lambda do
      # In case of unicode spaces of various kinds that are captured by a regexp, but not by the
      # simpler case expression above (not worth handling those special cases with better performance).
      scn = @scanner
      if scn.skip(PATTERN_WS)
        nil
      else
        # "unrecognized char"
        # NOTE(review): peek(0) yields an empty string, so the OTHER token carries no text, and
        # the fixed length 1 is a byte count - confirm this is intended for multibyte characters.
        emit([:OTHER, scn.peek(0), 1], scn.pos)
      end
    end
    @selector.each { |k,v| k.freeze }
    @selector.freeze
  end

  # Answers whether the '/' that ends +value+ is escaped, i.e. preceded by an odd number of
  # consecutive backslashes. The first and last characters of +value+ (the delimiting slashes)
  # are not part of the examined text.
  #
  # @param value [String] the regexp text scanned so far, including its delimiters
  # @return [Boolean] true if the closing '/' is escaped
  #
  def escaped_end(value)
    return false unless value.end_with?(STRING_BSLASH_SLASH)

    # A backslash flips the pending state, any other character resets it; what is left after
    # the last character tells whether the closing slash is escaped.
    value[1...-1].each_codepoint.inject(false) do |pending, codepoint|
      codepoint == 0x5c && !pending
    end
  end

  # Drops the references to the scanner, the locator and the lexing context.
  # Calling this is optional: each lex_string / lex_file call sets up fresh state anyway. It only
  # serves callers that want the lexed text to become collectable as early as possible.
  #
  # @return [nil]
  #
  def clear
    @scanner = @locator = @lexing_context = nil
  end

  # Compatibility shim for the older lexer API: prepares lexing of +string+ with no path.
  # Prefer lex_string, which also accepts the path to report; an assignment operator cannot
  # take a second argument and is a poor fit for an operation that is not really an assignment.
  #
  # @param string [String] the source text to lex
  #
  def string=(string)
    lex_string(string)
  end

  # Prepares the lexer for lexing the given string. The lexer state is reset first, then the
  # string is checked with assert_not_bom, and finally a scanner and a locator are created for it.
  # Nothing is lexed until scan (or fullscan) is called.
  #
  # @param string [String] the source text to lex
  # @param path [String, nil] the path passed on to the locator for the text
  # @return the locator created for the text (the value of the last assignment)
  #
  def lex_string(string, path=nil)
    initvars
    assert_not_bom(string)
    @scanner = StringScanner.new(string)
    @locator = Locator.locator(string, path)
  end

  # Prepares the lexer for lexing an unquoted string (text without surrounding quotes).
  #
  # The set of escapes and the pattern used when slurping the text are recorded in the lexing
  # context under :escapes and :uq_slurp_pattern:
  # * without interpolation the entire text is slurped (SLURP_ALL_PATTERN)
  # * with interpolation, SLURP_UQ_PATTERN is used when '$' is among the escapes in effect,
  #   and SLURP_UQNE_PATTERN otherwise
  #
  # @param string [String] the string to lex
  # @param locator [Locator] the locator to use (a default is used if nil is given)
  # @param escapes [Array<String>] array of character strings representing the escape sequences to transform
  #   (UQ_ESCAPES is used if nil is given)
  # @param interpolate [Boolean] whether interpolation of expressions should be made or not.
  # @return the slurp pattern that was selected
  #
  def lex_unquoted_string(string, locator, escapes, interpolate)
    initvars
    assert_not_bom(string)
    @scanner = StringScanner.new(string)
    @locator = locator || Locator.locator(string, '')

    # Resolve the default once, so that the pattern selection below sees the escapes that are
    # actually in effect (and never calls include? on nil).
    effective_escapes = escapes || UQ_ESCAPES
    @lexing_context[:escapes] = effective_escapes
    @lexing_context[:uq_slurp_pattern] =
      if !interpolate
        SLURP_ALL_PATTERN
      elsif effective_escapes.include?('$')
        SLURP_UQ_PATTERN
      else
        SLURP_UQNE_PATTERN
      end
  end

  # Convenience method, and for compatibility with older lexer. Use lex_file instead.
  # (It is bad form to overload the assignment operator for something that is not really an assignment).
  #
  # @param file [String] the path of the file to lex; simply passed on to lex_file
  #
  def file=(file)
    lex_file(file)
  end

  # Answers the file reported by the current locator, or nil when there is no locator
  # (nothing has been set up for lexing, or clear has been called).
  #
  # TODO: This method should not be used, callers should get the locator since it is most likely required to
  # compute line, position etc given offsets.
  #
  def file
    return nil unless @locator

    @locator.file
  end

  # Prepares the lexer for lexing the content of the given file. The file is read in binary
  # mode as UTF-8; a file that does not exist is treated as having empty content. The content
  # is frozen before it is handed to the scanner.
  #
  # @param file [String] the path of the file to lex
  # @return the locator created for the content (the value of the last assignment)
  #
  def lex_file(file)
    initvars
    contents =
      if Puppet::FileSystem.exist?(file)
        Puppet::FileSystem.read(file, :mode => 'rb', :encoding => 'utf-8')
      else
        ''
      end
    assert_not_bom(contents)
    @scanner = StringScanner.new(contents.freeze)
    @locator = Locator.locator(contents, file)
  end

  # Resets the per-run lexer state: an empty token queue, a fresh lexing context, and the
  # keywords that depend on the :tasks setting.
  #
  def initvars
    @token_queue = []

    # NOTE: additional keys are used; :escapes, :uq_slurp_pattern, :newline_jump, :epp_*
    @lexing_context = { :brace_count => 0, :after => nil, :line_lexical_start => 0 }

    # Use of --tasks introduces the additional keywords 'plan' and 'apply'
    @taskm_keywords =
      if Puppet[:tasks]
        { 'plan' => [:PLAN, 'plan',  4], 'apply' => [:APPLY, 'apply', 5] }.freeze
      else
        EMPTY_HASH
      end
  end

  # Lexes everything and answers all the tokens as one array, in the order they were produced.
  # The terminating [false, false] token is the last element of the result.
  #
  # @return [Array] every token yielded by scan
  #
  def fullscan
    [].tap do |tokens|
      scan { |token| tokens << token }
    end
  end

  # Lexes all of the content and yields each token to the given block (a block must be passed).
  # Each token is yielded as one two-element array holding the token symbol and an instance of
  # LexerSupport::TokenValue, so a block that takes two parameters receives them separately.
  # When all input is consumed, [false, false] is yielded to signal the end of input.
  #
  # PERFORMANCE NOTE: The TokenValue is designed to reduce the amount of garbage / temporary data
  # and to only convert the lexer's internal tokens on demand. It is slightly more costly to create an
  # instance of a class defined in Ruby than an Array or Hash, but the gain is much bigger since transformation
  # logic is avoided for many of its members (most are never used, e.g. line/pos information which is only of
  # value in general for error messages, and for some expressions which the lexer does not know about).
  #
  def scan
    # PERFORMANCE note: it is faster to access local variables than instance variables.
    # This makes a small but notable difference since instance member access is avoided for
    # every token in the lexed content.
    #
    scn   = @scanner
    # Without a scanner nothing has been set up for lexing (see lex_string / lex_file).
    lex_error_without_pos(Issues::NO_INPUT_TO_LEXER) unless scn

    ctx   = @lexing_context
    queue = @token_queue
    selector = @selector

    scn.skip(PATTERN_WS)

    # This is the lexer's main loop
    until queue.empty? && scn.eos? do
      # Tokens that were enqueued ahead of time are delivered before anything more is lexed.
      # The selected lambda answers nil when there is nothing to emit.
      token = queue.shift || selector[scn.peek(1)].call
      if token
        # Remember the kind of token just produced; the '{' lambda and regexp_acceptable?
        # decide based on it.
        ctx[:after] = token[0]
        yield token
      end
    end

    # Signals end of input
    yield [false, false]
  end

  # This lexes one token at the current position of the scanner by calling the selector lambda
  # registered for the character at that position. The result is whatever that lambda answers:
  # a token, or nil when there was nothing to emit. In contrast to scan, neither the token queue
  # nor the :after entry of the lexing context is touched by this method itself.
  # PERFORMANCE NOTE: Any change to this logic should be performance measured.
  #
  def lex_token
    @selector[@scanner.peek(1)].call
  end

  # Produces a token on the form [:tokensymbol, TokenValue] and positions the scanner right
  # after the token, i.e. at the given offset plus the token's pre-calculated length.
  #
  # @param token [Array] the token array (symbol, text, length)
  # @param byte_offset [Integer] the offset where the token starts
  # @return [Array] the token symbol and a TokenValue for the token
  #
  def emit(token, byte_offset)
    symbol, _text, length = token
    @scanner.pos = byte_offset + length
    [symbol, TokenValue.new(token, byte_offset, @locator)]
  end

  # Produces a token on the form [:tokensymbol, TokenValue] for text the scanner has already
  # moved past. Unlike emit, this method leaves the scanner's position alone.
  #
  # @param token [Array] the token array (symbol, text, length)
  # @param byte_offset [Integer] the offset where the token starts
  # @return [Array] the token symbol and a TokenValue for the token
  #
  def emit_completed(token, byte_offset)
    value = TokenValue.new(token, byte_offset, @locator)
    [token[0], value]
  end

  # Builds a completed token for the given offset (see emit_completed) and appends it to the
  # token queue, from where scan delivers it before lexing anything further.
  #
  # @return [Array] the token queue
  #
  def enqueue_completed(token, byte_offset)
    @token_queue.push(emit_completed(token, byte_offset))
  end

  # Appends an already emitted token to the token queue. This lets sub processors (heredoc etc.)
  # hand over tokens that were produced by a different lexer instance.
  #
  # @param emitted_token [Array] a token on the form [:tokensymbol, TokenValue]
  # @return [Array] the token queue
  #
  def enqueue(emitted_token)
    @token_queue.push(emitted_token)
  end

  # Answers whether a '/' at the current position may start a regular expression, judged by the
  # kind of token that was produced last (the :after entry of the lexing context).
  #
  # A regular expression is refused after tokens that end a (potential) R-value, and after
  # operands, since a '/' following those is a division (even if that is illegal in the grammar).
  # It is accepted after everything else. Note that this includes :RBRACE, even though a right
  # brace may end an R-value; that is deliberate (needed because of case expressions) and not a bug.
  #
  # PERFORMANCE NOTE:
  # It may be beneficial to turn this into a hash with default value of true for missing entries.
  # A case expression with literal values will however create a hash internally. Since a reference is
  # always needed to the hash, this access is almost as costly as a method call.
  #
  # @return [Boolean] true if a regular expression is acceptable at this point
  #
  def regexp_acceptable?
    case @lexing_context[:after]
    when :RPAREN, :RBRACK, :RRCOLLECT, :RCOLLECT,
         :NAME, :CLASSREF, :NUMBER, :STRING, :BOOLEAN, :DQPRE, :DQMID, :DQPOST,
         :HEREDOC, :REGEX, :VARIABLE, :WORD
      false
    else
      true
    end
  end

end
end
end