diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 595669c..68684f1 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -48,29 +48,15 @@ class BaseParser REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)" REFERENCE_RE = /#{REFERENCE}/ - DOCTYPE_START = /\A\s*/um ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um - COMMENT_START = /\A/um - CDATA_START = /\A/um - CDATA_PATTERN = //um - XMLDECL_START = /\A<\?xml\s/u; - XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um - INSTRUCTION_START = /\A<\?/u - INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um - TAG_MATCH = /\A<((?>#{QNAME_STR}))/um - CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um + INSTRUCTION_PATTERN = /#{NAME}(\s+.*?)?\?>/um + TAG_MATCH = /((?>#{QNAME_STR}))/um + CLOSE_MATCH = /(#{QNAME_STR})\s*>/um VERSION = /\bversion\s*=\s*["'](.*?)['"]/um ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um - ENTITY_START = /\A\s*/um - SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)" NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)" ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))" @@ -79,10 +65,7 @@ class BaseParser DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))" ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}" ATTDEF_RE = /#{ATTDEF}/ - ATTLISTDECL_START = /\A\s*/um - - TEXT_PATTERN = /\A([^<]*)/um + ATTLISTDECL_PATTERN = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um # Entity constants PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" @@ -94,11 +77,10 @@ class BaseParser ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" - PEDECL = "" - GEDECL = "" - ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um + PEDECL = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" + GEDECL = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" + ENTITYDECL = /(?:#{GEDECL})|(?:#{PEDECL})/um - NOTATIONDECL_START = /\A\s* [/'/, "'", "'", /'/] } + QUESTION_MARK_TAG_START = "" + SLASH = "/" + EXCLAMATION_MARK = "!" + QUESTION_MARK = "?" + DOUBLE_DASH = "--" + def initialize( source ) self.stream = source @listeners = [] @@ -198,65 +189,67 @@ def pull_event #STDERR.puts @source.encoding #STDERR.puts "BUFFER = #{@source.buffer.inspect}" if @document_status == nil - word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um ) - word = word[1] unless word.nil? - #STDERR.puts "WORD = #{word.inspect}" - case word - when COMMENT_START - return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] - when XMLDECL_START - #STDERR.puts "XMLDECL" - results = @source.match( XMLDECL_PATTERN, true )[1] - version = VERSION.match( results ) - version = version[1] unless version.nil? - encoding = ENCODING.match(results) - encoding = encoding[1] unless encoding.nil? - if need_source_encoding_update?(encoding) - @source.encoding = encoding - end - if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding - encoding = "UTF-16" - end - standalone = STANDALONE.match(results) - standalone = standalone[1] unless standalone.nil? - return [ :xmldecl, version, encoding, standalone ] - when INSTRUCTION_START - return process_instruction - when DOCTYPE_START - base_error_message = "Malformed DOCTYPE" - @source.match(DOCTYPE_START, true) - @nsstack.unshift(curr_ns=Set.new) - name = parse_name(base_error_message) - if @source.match(/\A\s*\[/um, true) - id = [nil, nil, nil] - @document_status = :in_doctype - elsif @source.match(/\A\s*>/um, true) - id = [nil, nil, nil] - @document_status = :after_doctype - else - id = parse_id(base_error_message, - accept_external_id: true, - accept_public_id: false) - if id[0] == "SYSTEM" - # For backward compatibility - id[1], id[2] = id[2], nil + @source.read + if @source.match(QUESTION_MARK_TAG_START, true, false) + if results = @source.match(/xml\s+(.*?)\?>/um, true, false) + results = results[1] + version = VERSION.match( results ) + version = version[1] unless version.nil? + encoding = ENCODING.match(results) + encoding = encoding[1] unless encoding.nil? + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding + encoding = "UTF-16" end - if @source.match(/\A\s*\[/um, true) + standalone = STANDALONE.match(results) + standalone = standalone[1] unless standalone.nil? + return [ :xmldecl, version, encoding, standalone ] + else # instruction + return process_instruction + end + elsif @source.match(EXCLAMATION_MARK_TAG_START, true, false) + if @source.match(DOUBLE_DASH, true, false) + return [ :comment, @source.match( /(.*?)-->/um, true )[1] ] + elsif @source.match(/DOCTYPE\s/um, true, false) + base_error_message = "Malformed DOCTYPE" + @nsstack.unshift(curr_ns=Set.new) + name = parse_name(base_error_message) + if @source.match(/\s*\[/um, true) + id = [nil, nil, nil] @document_status = :in_doctype - elsif @source.match(/\A\s*>/um, true) + elsif @source.match(/\s*>/um, true) + id = [nil, nil, nil] @document_status = :after_doctype else - message = "#{base_error_message}: garbage after external ID" - raise REXML::ParseException.new(message, @source) + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: false) + if id[0] == "SYSTEM" + # For backward compatibility + id[1], id[2] = id[2], nil + end + if @source.match(/\s*\[/um, true) + @document_status = :in_doctype + elsif @source.match(/\s*>/um, true) + @document_status = :after_doctype + else + message = "#{base_error_message}: garbage after external ID" + raise REXML::ParseException.new(message, @source) + end end + args = [:start_doctype, name, *id] + if @document_status == :after_doctype + @source.match(/\s*/um, true) + @stack << [ :end_doctype ] + end + return args + else + message = "Invalid XML" + raise REXML::ParseException.new(message, @source) end - args = [:start_doctype, name, *id] - if @document_status == :after_doctype - @source.match(/\A\s*/um, true) - @stack << [ :end_doctype ] - end - return args - when /\A\s+/ + elsif @source.match( /\s+/, false, false ) else @document_status = :after_doctype if @source.encoding == "UTF-8" @@ -265,16 +258,13 @@ def pull_event end end if @document_status == :in_doctype - md = @source.match(/\A\s*(.*?>)/um) - case md[1] - when SYSTEMENTITY - match = @source.match( SYSTEMENTITY, true )[1] - return [ :externalentity, match ] - - when ELEMENTDECL_START - return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] - - when ENTITY_START + @source.read + @source.match(/\s*/um, true, false) # skip spaces + if match = @source.match( /(%.*?;)\s*$/um, true, false) + return [ :externalentity, match[1] ] + elsif match = @source.match(/(/um, true, false) + return [ :elementdecl, match[1] ] + elsif @source.match( //um) + unless @source.match(/\s+/um, true) + if @source.match(/\s*>/um) message = "#{base_error_message}: name is missing" else message = "#{base_error_message}: invalid declaration name" end + @source.string = " /um, true) + unless @source.match(/\s*>/um, true) message = "#{base_error_message}: garbage before end >" raise REXML::ParseException.new(message, @source) end return [:notationdecl, name, *id] - when DOCTYPE_END + elsif @source.match( /\]\s*>/um, true, false) @document_status = :after_doctype - @source.match( DOCTYPE_END, true ) return [ :end_doctype ] end end if @document_status == :after_doctype - @source.match(/\A\s*/um, true) + @source.match(/\s*/um, true) end begin next_data = @source.buffer if next_data.size < 2 @source.read - next_data = @source.buffer end - if next_data[0] == ?< - if next_data[1] == ?/ + if @source.match(TAG_START, true, false) + if @source.match(SLASH, true, false) @nsstack.shift last_tag = @tags.pop md = @source.match( CLOSE_MATCH, true ) @@ -366,15 +355,16 @@ def pull_event if md.nil? or last_tag != md[1] message = "Missing end tag for '#{last_tag}'" message << " (got '#{md[1]}')" if md + @source.string = "]*>)/um) + elsif @source.match(EXCLAMATION_MARK, true, false) + md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md - if md[0][2] == ?- - md = @source.match( COMMENT_PATTERN, true ) + if md[0][0] == ?- + md = @source.match( /--(.*?)-->/um, true ) case md[1] when /--/, /-\z/ @@ -383,17 +373,18 @@ def pull_event return [ :comment, md[1] ] if md else - md = @source.match( CDATA_PATTERN, true ) + md = @source.match( /\[CDATA\[(.*?)\]\]>/um, true ) return [ :cdata, md[1] ] if md end raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) - elsif next_data[1] == ?? + elsif @source.match(QUESTION_MARK, true, false) return process_instruction else # Get the next tag md = @source.match(TAG_MATCH, true) unless md + @source.string = TAG_START + @source.buffer raise REXML::ParseException.new("malformed XML: missing tag start", @source) end tag = md[1] @@ -418,7 +409,7 @@ def pull_event return [ :start_element, tag, attributes ] end else - md = @source.match( TEXT_PATTERN, true ) + md = @source.match( /([^<]*)/um, true ) text = md[1] return [ :text, text ] end @@ -580,6 +571,7 @@ def process_instruction match_data = @source.match(INSTRUCTION_PATTERN, true) unless match_data message = "Invalid processing instruction node" + @source.string = QUESTION_MARK_TAG_START + @source.buffer raise REXML::ParseException.new(message, @source) end [:processing_instruction, match_data[1], match_data[2]] diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index db78a12..034acf1 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -68,7 +68,7 @@ def encoding=(enc) def read end - def match(pattern, cons=false) + def match(pattern, cons=false, read_source=false) if cons @scanner.scan(pattern).nil? ? nil : @scanner else @@ -76,6 +76,10 @@ def match(pattern, cons=false) end end + def string=(string) + @scanner.string = string + end + # @return true if the Source is exhausted def empty? @scanner.eos? @@ -155,13 +159,13 @@ def read end end - def match( pattern, cons=false ) + def match( pattern, cons=false, read_source=true ) if cons md = @scanner.scan(pattern) else md = @scanner.check(pattern) end - while md.nil? and @source + while read_source && md.nil? && @source begin @scanner << readline if cons