Skip to content

Commit

Permalink
Changed processing in REXML::Parsers::BaseParser#pull_event from regu…
Browse files Browse the repository at this point in the history
…lar expression to processing using StringScanner.

## Why
Improve maintainability by optimizing the process so that the parsing process proceeds using StringScanner#scan.

# Changed
- Added read_source option to IOSource#match to suppress read from @source.
- Added Source#string= method for error message output.

## Benchmark

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     11.278      11.371        17.346       18.248 i/s -     100.000 times in 8.866784s 8.794635s 5.764884s 5.479904s
                 sax     31.845      32.282        48.621       53.235 i/s -     100.000 times in 3.140243s 3.097687s 2.056704s 1.878479s
                pull     37.033      37.840        59.401       64.111 i/s -     100.000 times in 2.700312s 2.642690s 1.683465s 1.559796s
              stream     34.799      36.514        49.432       56.885 i/s -     100.000 times in 2.873632s 2.738665s 2.022967s 1.757948s

Comparison:
                              dom
         after(YJIT):        18.2 i/s
        before(YJIT):        17.3 i/s - 1.05x  slower
               after:        11.4 i/s - 1.60x  slower
              before:        11.3 i/s - 1.62x  slower

                              sax
         after(YJIT):        53.2 i/s
        before(YJIT):        48.6 i/s - 1.09x  slower
               after:        32.3 i/s - 1.65x  slower
              before:        31.8 i/s - 1.67x  slower

                             pull
         after(YJIT):        64.1 i/s
        before(YJIT):        59.4 i/s - 1.08x  slower
               after:        37.8 i/s - 1.69x  slower
              before:        37.0 i/s - 1.73x  slower

                           stream
         after(YJIT):        56.9 i/s
        before(YJIT):        49.4 i/s - 1.15x  slower
               after:        36.5 i/s - 1.56x  slower
              before:        34.8 i/s - 1.63x  slower

```

- YJIT=ON : 1.05x - 1.15x faster
- YJIT=OFF : 1.00x - 1.05x faster
  • Loading branch information
naitoh committed Feb 24, 2024
1 parent fd8ef89 commit a7c9491
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 109 deletions.
195 changes: 89 additions & 106 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,29 +48,15 @@ class BaseParser
REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
REFERENCE_RE = /#{REFERENCE}/

DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
DOCTYPE_END = /\A\s*\]\s*>/um
ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
COMMENT_START = /\A<!--/u
COMMENT_PATTERN = /<!--(.*?)-->/um
CDATA_START = /\A<!\[CDATA\[/u
CDATA_END = /\A\s*\]\s*>/um
CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
XMLDECL_START = /\A<\?xml\s/u;
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
INSTRUCTION_START = /\A<\?/u
INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um
INSTRUCTION_PATTERN = /#{NAME}(\s+.*?)?\?>/um
TAG_MATCH = /((?>#{QNAME_STR}))/um
CLOSE_MATCH = /(#{QNAME_STR})\s*>/um

VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

ENTITY_START = /\A\s*<!ENTITY/
ELEMENTDECL_START = /\A\s*<!ELEMENT/um
ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um
SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
Expand All @@ -79,10 +65,7 @@ class BaseParser
DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
ATTDEF_RE = /#{ATTDEF}/
ATTLISTDECL_START = /\A\s*<!ATTLIST/um
ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um

TEXT_PATTERN = /\A([^<]*)/um
ATTLISTDECL_PATTERN = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um

# Entity constants
PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
Expand All @@ -94,11 +77,10 @@ class BaseParser
ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
PEDECL = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL = /(?:#{GEDECL})|(?:#{PEDECL})/um

NOTATIONDECL_START = /\A\s*<!NOTATION/um
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um
Expand Down Expand Up @@ -198,65 +180,67 @@ def pull_event
#STDERR.puts @source.encoding
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
if @document_status == nil
word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
word = word[1] unless word.nil?
#STDERR.puts "WORD = #{word.inspect}"
case word
when COMMENT_START
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
when XMLDECL_START
#STDERR.puts "XMLDECL"
results = @source.match( XMLDECL_PATTERN, true )[1]
version = VERSION.match( results )
version = version[1] unless version.nil?
encoding = ENCODING.match(results)
encoding = encoding[1] unless encoding.nil?
if need_source_encoding_update?(encoding)
@source.encoding = encoding
end
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
encoding = "UTF-16"
end
standalone = STANDALONE.match(results)
standalone = standalone[1] unless standalone.nil?
return [ :xmldecl, version, encoding, standalone ]
when INSTRUCTION_START
return process_instruction
when DOCTYPE_START
base_error_message = "Malformed DOCTYPE"
@source.match(DOCTYPE_START, true)
@nsstack.unshift(curr_ns=Set.new)
name = parse_name(base_error_message)
if @source.match(/\A\s*\[/um, true)
id = [nil, nil, nil]
@document_status = :in_doctype
elsif @source.match(/\A\s*>/um, true)
id = [nil, nil, nil]
@document_status = :after_doctype
else
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: false)
if id[0] == "SYSTEM"
# For backward compatibility
id[1], id[2] = id[2], nil
@source.read
if @source.match("<?", true, false)
if results = @source.match(/xml\s+(.*?)\?>/um, true, false)
results = results[1]
version = VERSION.match( results )
version = version[1] unless version.nil?
encoding = ENCODING.match(results)
encoding = encoding[1] unless encoding.nil?
if need_source_encoding_update?(encoding)
@source.encoding = encoding
end
if @source.match(/\A\s*\[/um, true)
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
encoding = "UTF-16"
end
standalone = STANDALONE.match(results)
standalone = standalone[1] unless standalone.nil?
return [ :xmldecl, version, encoding, standalone ]
else # instruction
return process_instruction
end
elsif @source.match("<!", true, false)
if @source.match("--", true, false)
return [ :comment, @source.match( /(.*?)-->/um, true )[1] ]
elsif @source.match(/DOCTYPE\s/um, true, false)
base_error_message = "Malformed DOCTYPE"
@nsstack.unshift(curr_ns=Set.new)
name = parse_name(base_error_message)
if @source.match(/\s*\[/um, true)
id = [nil, nil, nil]
@document_status = :in_doctype
elsif @source.match(/\A\s*>/um, true)
elsif @source.match(/\s*>/um, true)
id = [nil, nil, nil]
@document_status = :after_doctype
else
message = "#{base_error_message}: garbage after external ID"
raise REXML::ParseException.new(message, @source)
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: false)
if id[0] == "SYSTEM"
# For backward compatibility
id[1], id[2] = id[2], nil
end
if @source.match(/\s*\[/um, true)
@document_status = :in_doctype
elsif @source.match(/\s*>/um, true)
@document_status = :after_doctype
else
message = "#{base_error_message}: garbage after external ID"
raise REXML::ParseException.new(message, @source)
end
end
args = [:start_doctype, name, *id]
if @document_status == :after_doctype
@source.match(/\s*/um, true)
@stack << [ :end_doctype ]
end
return args
else
message = "Invalid XML"
raise REXML::ParseException.new(message, @source)
end
args = [:start_doctype, name, *id]
if @document_status == :after_doctype
@source.match(/\A\s*/um, true)
@stack << [ :end_doctype ]
end
return args
when /\A\s+/
elsif @source.match( /\s+/, false, false )
else
@document_status = :after_doctype
if @source.encoding == "UTF-8"
Expand All @@ -265,16 +249,13 @@ def pull_event
end
end
if @document_status == :in_doctype
md = @source.match(/\A\s*(.*?>)/um)
case md[1]
when SYSTEMENTITY
match = @source.match( SYSTEMENTITY, true )[1]
return [ :externalentity, match ]

when ELEMENTDECL_START
return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

when ENTITY_START
@source.read
@source.match(/\s*/um, true, false) # skip spaces
if match = @source.match( /(%.*?;)\s*$/um, true, false)
return [ :externalentity, match[1] ]
elsif match = @source.match(/(<!ELEMENT.*?)>/um, true, false)
return [ :elementdecl, match[1] ]
elsif @source.match( "<!ENTITY", true, false)
match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
ref = false
if match[1] == '%'
Expand All @@ -300,7 +281,7 @@ def pull_event
end
match << '%' if ref
return match
when ATTLISTDECL_START
elsif @source.match( "<!ATTLIST", true, false)
md = @source.match( ATTLISTDECL_PATTERN, true )
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
element = md[1]
Expand All @@ -320,42 +301,41 @@ def pull_event
end
end
return [ :attlistdecl, element, pairs, contents ]
when NOTATIONDECL_START
elsif @source.match( "<!NOTATION", true, false)
base_error_message = "Malformed notation declaration"
unless @source.match(/\A\s*<!NOTATION\s+/um, true)
if @source.match(/\A\s*<!NOTATION\s*>/um)
unless @source.match(/\s+/um, true)
if @source.match(/\s*>/um)
message = "#{base_error_message}: name is missing"
else
message = "#{base_error_message}: invalid declaration name"
end
@source.string = " <!NOTATION" + @source.buffer
raise REXML::ParseException.new(message, @source)
end
name = parse_name(base_error_message)
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: true)
unless @source.match(/\A\s*>/um, true)
unless @source.match(/\s*>/um, true)
message = "#{base_error_message}: garbage before end >"
raise REXML::ParseException.new(message, @source)
end
return [:notationdecl, name, *id]
when DOCTYPE_END
elsif @source.match( /\]\s*>/um, true, false)
@document_status = :after_doctype
@source.match( DOCTYPE_END, true )
return [ :end_doctype ]
end
end
if @document_status == :after_doctype
@source.match(/\A\s*/um, true)
@source.match(/\s*/um, true)
end
begin
next_data = @source.buffer
if next_data.size < 2
@source.read
next_data = @source.buffer
end
if next_data[0] == ?<
if next_data[1] == ?/
if @source.match("<", true, false)
if @source.match("/", true, false)
@nsstack.shift
last_tag = @tags.pop
md = @source.match( CLOSE_MATCH, true )
Expand All @@ -366,15 +346,16 @@ def pull_event
if md.nil? or last_tag != md[1]
message = "Missing end tag for '#{last_tag}'"
message << " (got '#{md[1]}')" if md
@source.string = "</" + @source.buffer if md.nil?
raise REXML::ParseException.new(message, @source)
end
return [ :end_element, last_tag ]
elsif next_data[1] == ?!
md = @source.match(/\A(\s*[^>]*>)/um)
elsif @source.match("!", true, false)
md = @source.match(/([^>]*>)/um)
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
raise REXML::ParseException.new("Malformed node", @source) unless md
if md[0][2] == ?-
md = @source.match( COMMENT_PATTERN, true )
if md[0][0] == ?-
md = @source.match( /--(.*?)-->/um, true )

case md[1]
when /--/, /-\z/
Expand All @@ -383,17 +364,18 @@ def pull_event

return [ :comment, md[1] ] if md
else
md = @source.match( CDATA_PATTERN, true )
md = @source.match( /\[CDATA\[(.*?)\]\]>/um, true )
return [ :cdata, md[1] ] if md
end
raise REXML::ParseException.new( "Declarations can only occur "+
"in the doctype declaration.", @source)
elsif next_data[1] == ??
elsif @source.match("?", true, false)
return process_instruction
else
# Get the next tag
md = @source.match(TAG_MATCH, true)
unless md
@source.string = "<" + @source.buffer
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
end
tag = md[1]
Expand All @@ -418,7 +400,7 @@ def pull_event
return [ :start_element, tag, attributes ]
end
else
md = @source.match( TEXT_PATTERN, true )
md = @source.match( /([^<]*)/um, true )
text = md[1]
return [ :text, text ]
end
Expand Down Expand Up @@ -579,6 +561,7 @@ def process_instruction
match_data = @source.match(INSTRUCTION_PATTERN, true)
unless match_data
message = "Invalid processing instruction node"
@source.string = "<?" + @source.buffer
raise REXML::ParseException.new(message, @source)
end
[:processing_instruction, match_data[1], match_data[2]]
Expand Down
10 changes: 7 additions & 3 deletions lib/rexml/source.rb
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,18 @@ def encoding=(enc)
def read
end

def match(pattern, cons=false)
def match(pattern, cons=false, read_source=false)
if cons
@scanner.scan(pattern).nil? ? nil : @scanner
else
@scanner.check(pattern).nil? ? nil : @scanner
end
end

def string=(string)
@scanner.string = string
end

# @return true if the Source is exhausted
def empty?
@scanner.eos?
Expand Down Expand Up @@ -155,13 +159,13 @@ def read
end
end

def match( pattern, cons=false )
def match( pattern, cons=false, read_source=true )
if cons
md = @scanner.scan(pattern)
else
md = @scanner.check(pattern)
end
while md.nil? and @source
while read_source && md.nil? && @source
begin
@scanner << readline
if cons
Expand Down

0 comments on commit a7c9491

Please sign in to comment.