Skip to content

Commit

Permalink
Changed processing in REXML::Parsers::BaseParser#pull_event from regu…
Browse files Browse the repository at this point in the history
…lar expression to processing using StringScanner.

## Why
Improve maintainability by optimizing the process so that the parsing process proceeds using StringScanner#scan.

# Changed
- Added read_source option to IOSource#match to suppress read from @source.
- Added Source#string= method for error message output.

## Benchmark

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     11.289      11.358        17.714       17.949 i/s -     100.000 times in 8.858505s 8.804219s 5.645189s 5.571484s
                 sax     31.403      31.528        49.524       51.521 i/s -     100.000 times in 3.184448s 3.171815s 2.019218s 1.940954s
                pull     36.580      36.984        58.444       60.713 i/s -     100.000 times in 2.733724s 2.703908s 1.711054s 1.647084s
              stream     34.225      34.187        49.746       50.839 i/s -     100.000 times in 2.921876s 2.925118s 2.010215s 1.966987s

Comparison:
                              dom
         after(YJIT):        17.9 i/s
        before(YJIT):        17.7 i/s - 1.01x  slower
               after:        11.4 i/s - 1.58x  slower
              before:        11.3 i/s - 1.59x  slower

                              sax
         after(YJIT):        51.5 i/s
        before(YJIT):        49.5 i/s - 1.04x  slower
               after:        31.5 i/s - 1.63x  slower
              before:        31.4 i/s - 1.64x  slower

                             pull
         after(YJIT):        60.7 i/s
        before(YJIT):        58.4 i/s - 1.04x  slower
               after:        37.0 i/s - 1.64x  slower
              before:        36.6 i/s - 1.66x  slower

                           stream
         after(YJIT):        50.8 i/s
        before(YJIT):        49.7 i/s - 1.02x  slower
              before:        34.2 i/s - 1.49x  slower
               after:        34.2 i/s - 1.49x  slower

```

- YJIT=ON : 1.01x - 1.04x faster
- YJIT=OFF : 1.00x - 1.01x faster
  • Loading branch information
naitoh committed Feb 23, 2024
1 parent fb7ba27 commit c9a3bf2
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 109 deletions.
204 changes: 98 additions & 106 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,29 +48,15 @@ class BaseParser
REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
REFERENCE_RE = /#{REFERENCE}/

DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
DOCTYPE_END = /\A\s*\]\s*>/um
ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
COMMENT_START = /\A<!--/u
COMMENT_PATTERN = /<!--(.*?)-->/um
CDATA_START = /\A<!\[CDATA\[/u
CDATA_END = /\A\s*\]\s*>/um
CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
XMLDECL_START = /\A<\?xml\s/u;
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
INSTRUCTION_START = /\A<\?/u
INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um
INSTRUCTION_PATTERN = /#{NAME}(\s+.*?)?\?>/um
TAG_MATCH = /((?>#{QNAME_STR}))/um
CLOSE_MATCH = /(#{QNAME_STR})\s*>/um

VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

ENTITY_START = /\A\s*<!ENTITY/
ELEMENTDECL_START = /\A\s*<!ELEMENT/um
ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um
SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
Expand All @@ -79,10 +65,7 @@ class BaseParser
DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
ATTDEF_RE = /#{ATTDEF}/
ATTLISTDECL_START = /\A\s*<!ATTLIST/um
ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um

TEXT_PATTERN = /\A([^<]*)/um
ATTLISTDECL_PATTERN = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um

# Entity constants
PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
Expand All @@ -94,11 +77,10 @@ class BaseParser
ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
PEDECL = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL = /(?:#{GEDECL})|(?:#{PEDECL})/um

NOTATIONDECL_START = /\A\s*<!NOTATION/um
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um
Expand All @@ -112,6 +94,15 @@ class BaseParser
"apos" => [/&apos;/, "&apos;", "'", /'/]
}

QUESTION_MARK_TAG_START = "<?"
EXCLAMATION_MARK_TAG_START = "<!"
TAG_START = "<"
TAG_END = ">"
SLASH = "/"
EXCLAMATION_MARK = "!"
QUESTION_MARK = "?"
DOUBLE_DASH = "--"

def initialize( source )
self.stream = source
@listeners = []
Expand Down Expand Up @@ -198,65 +189,67 @@ def pull_event
#STDERR.puts @source.encoding
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
if @document_status == nil
word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
word = word[1] unless word.nil?
#STDERR.puts "WORD = #{word.inspect}"
case word
when COMMENT_START
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
when XMLDECL_START
#STDERR.puts "XMLDECL"
results = @source.match( XMLDECL_PATTERN, true )[1]
version = VERSION.match( results )
version = version[1] unless version.nil?
encoding = ENCODING.match(results)
encoding = encoding[1] unless encoding.nil?
if need_source_encoding_update?(encoding)
@source.encoding = encoding
end
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
encoding = "UTF-16"
end
standalone = STANDALONE.match(results)
standalone = standalone[1] unless standalone.nil?
return [ :xmldecl, version, encoding, standalone ]
when INSTRUCTION_START
return process_instruction
when DOCTYPE_START
base_error_message = "Malformed DOCTYPE"
@source.match(DOCTYPE_START, true)
@nsstack.unshift(curr_ns=Set.new)
name = parse_name(base_error_message)
if @source.match(/\A\s*\[/um, true)
id = [nil, nil, nil]
@document_status = :in_doctype
elsif @source.match(/\A\s*>/um, true)
id = [nil, nil, nil]
@document_status = :after_doctype
else
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: false)
if id[0] == "SYSTEM"
# For backward compatibility
id[1], id[2] = id[2], nil
@source.read
if @source.match(QUESTION_MARK_TAG_START, true, false)
if results = @source.match(/xml\s+(.*?)\?>/um, true, false)
results = results[1]
version = VERSION.match( results )
version = version[1] unless version.nil?
encoding = ENCODING.match(results)
encoding = encoding[1] unless encoding.nil?
if need_source_encoding_update?(encoding)
@source.encoding = encoding
end
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
encoding = "UTF-16"
end
if @source.match(/\A\s*\[/um, true)
standalone = STANDALONE.match(results)
standalone = standalone[1] unless standalone.nil?
return [ :xmldecl, version, encoding, standalone ]
else # instruction
return process_instruction
end
elsif @source.match(EXCLAMATION_MARK_TAG_START, true, false)
if @source.match(DOUBLE_DASH, true, false)
return [ :comment, @source.match( /(.*?)-->/um, true )[1] ]
elsif @source.match(/DOCTYPE\s/um, true, false)
base_error_message = "Malformed DOCTYPE"
@nsstack.unshift(curr_ns=Set.new)
name = parse_name(base_error_message)
if @source.match(/\s*\[/um, true)
id = [nil, nil, nil]
@document_status = :in_doctype
elsif @source.match(/\A\s*>/um, true)
elsif @source.match(/\s*>/um, true)
id = [nil, nil, nil]
@document_status = :after_doctype
else
message = "#{base_error_message}: garbage after external ID"
raise REXML::ParseException.new(message, @source)
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: false)
if id[0] == "SYSTEM"
# For backward compatibility
id[1], id[2] = id[2], nil
end
if @source.match(/\s*\[/um, true)
@document_status = :in_doctype
elsif @source.match(/\s*>/um, true)
@document_status = :after_doctype
else
message = "#{base_error_message}: garbage after external ID"
raise REXML::ParseException.new(message, @source)
end
end
args = [:start_doctype, name, *id]
if @document_status == :after_doctype
@source.match(/\s*/um, true)
@stack << [ :end_doctype ]
end
return args
else
message = "Invalid XML"
raise REXML::ParseException.new(message, @source)
end
args = [:start_doctype, name, *id]
if @document_status == :after_doctype
@source.match(/\A\s*/um, true)
@stack << [ :end_doctype ]
end
return args
when /\A\s+/
elsif @source.match( /\s+/, false, false )
else
@document_status = :after_doctype
if @source.encoding == "UTF-8"
Expand All @@ -265,16 +258,13 @@ def pull_event
end
end
if @document_status == :in_doctype
md = @source.match(/\A\s*(.*?>)/um)
case md[1]
when SYSTEMENTITY
match = @source.match( SYSTEMENTITY, true )[1]
return [ :externalentity, match ]

when ELEMENTDECL_START
return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

when ENTITY_START
@source.read
@source.match(/\s*/um, true, false) # skip spaces
if match = @source.match( /(%.*?;)\s*$/um, true, false)
return [ :externalentity, match[1] ]
elsif match = @source.match(/(<!ELEMENT.*?)>/um, true, false)
return [ :elementdecl, match[1] ]
elsif @source.match( /<!ENTITY/, true, false)
match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
ref = false
if match[1] == '%'
Expand All @@ -300,7 +290,7 @@ def pull_event
end
match << '%' if ref
return match
when ATTLISTDECL_START
elsif @source.match( /<!ATTLIST/um, true, false)
md = @source.match( ATTLISTDECL_PATTERN, true )
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
element = md[1]
Expand All @@ -320,42 +310,41 @@ def pull_event
end
end
return [ :attlistdecl, element, pairs, contents ]
when NOTATIONDECL_START
elsif @source.match( /<!NOTATION/um, true, false)
base_error_message = "Malformed notation declaration"
unless @source.match(/\A\s*<!NOTATION\s+/um, true)
if @source.match(/\A\s*<!NOTATION\s*>/um)
unless @source.match(/\s+/um, true)
if @source.match(/\s*>/um)
message = "#{base_error_message}: name is missing"
else
message = "#{base_error_message}: invalid declaration name"
end
@source.string = " <!NOTATION" + @source.buffer
raise REXML::ParseException.new(message, @source)
end
name = parse_name(base_error_message)
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: true)
unless @source.match(/\A\s*>/um, true)
unless @source.match(/\s*>/um, true)
message = "#{base_error_message}: garbage before end >"
raise REXML::ParseException.new(message, @source)
end
return [:notationdecl, name, *id]
when DOCTYPE_END
elsif @source.match( /\]\s*>/um, true, false)
@document_status = :after_doctype
@source.match( DOCTYPE_END, true )
return [ :end_doctype ]
end
end
if @document_status == :after_doctype
@source.match(/\A\s*/um, true)
@source.match(/\s*/um, true)
end
begin
next_data = @source.buffer
if next_data.size < 2
@source.read
next_data = @source.buffer
end
if next_data[0] == ?<
if next_data[1] == ?/
if @source.match(TAG_START, true, false)
if @source.match(SLASH, true, false)
@nsstack.shift
last_tag = @tags.pop
md = @source.match( CLOSE_MATCH, true )
Expand All @@ -366,15 +355,16 @@ def pull_event
if md.nil? or last_tag != md[1]
message = "Missing end tag for '#{last_tag}'"
message << " (got '#{md[1]}')" if md
@source.string = "</" + @source.buffer if md.nil?
raise REXML::ParseException.new(message, @source)
end
return [ :end_element, last_tag ]
elsif next_data[1] == ?!
md = @source.match(/\A(\s*[^>]*>)/um)
elsif @source.match(EXCLAMATION_MARK, true, false)
md = @source.match(/([^>]*>)/um)
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
raise REXML::ParseException.new("Malformed node", @source) unless md
if md[0][2] == ?-
md = @source.match( COMMENT_PATTERN, true )
if md[0][0] == ?-
md = @source.match( /--(.*?)-->/um, true )

case md[1]
when /--/, /-\z/
Expand All @@ -383,17 +373,18 @@ def pull_event

return [ :comment, md[1] ] if md
else
md = @source.match( CDATA_PATTERN, true )
md = @source.match( /\[CDATA\[(.*?)\]\]>/um, true )
return [ :cdata, md[1] ] if md
end
raise REXML::ParseException.new( "Declarations can only occur "+
"in the doctype declaration.", @source)
elsif next_data[1] == ??
elsif @source.match(QUESTION_MARK, true, false)
return process_instruction
else
# Get the next tag
md = @source.match(TAG_MATCH, true)
unless md
@source.string = TAG_START + @source.buffer
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
end
tag = md[1]
Expand All @@ -418,7 +409,7 @@ def pull_event
return [ :start_element, tag, attributes ]
end
else
md = @source.match( TEXT_PATTERN, true )
md = @source.match( /([^<]*)/um, true )
text = md[1]
return [ :text, text ]
end
Expand Down Expand Up @@ -580,6 +571,7 @@ def process_instruction
match_data = @source.match(INSTRUCTION_PATTERN, true)
unless match_data
message = "Invalid processing instruction node"
@source.string = QUESTION_MARK_TAG_START + @source.buffer
raise REXML::ParseException.new(message, @source)
end
[:processing_instruction, match_data[1], match_data[2]]
Expand Down
10 changes: 7 additions & 3 deletions lib/rexml/source.rb
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,18 @@ def encoding=(enc)
def read
end

def match(pattern, cons=false)
def match(pattern, cons=false, read_source=false)
if cons
@scanner.scan(pattern).nil? ? nil : @scanner
else
@scanner.check(pattern).nil? ? nil : @scanner
end
end

def string=(string)
@scanner.string = string
end

# @return true if the Source is exhausted
def empty?
@scanner.eos?
Expand Down Expand Up @@ -155,13 +159,13 @@ def read
end
end

def match( pattern, cons=false )
def match( pattern, cons=false, read_source=true )
if cons
md = @scanner.scan(pattern)
else
md = @scanner.check(pattern)
end
while md.nil? and @source
while read_source && md.nil? && @source
begin
@scanner << readline
if cons
Expand Down

0 comments on commit c9a3bf2

Please sign in to comment.