Skip to content

Commit

Permalink
Update processing of @document_status == :in_doctype to StringScanner style
Browse files Browse the repository at this point in the history

[Changed]
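Within the `intSubset` of a DOCTYPE, markup declarations are now matched on their common "<!" prefix first and then on the keyword that follows; this also adds handling for `Comment`s, which begin with "<!" as well.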

[Spec]
https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl

> [28] 	doctypedecl   ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-intSubset

> [28b] intSubset   ::=  (markupdecl | DeclSep)*

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-markupdecl

> [29]  markupdecl   ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-elementdecl

> [45]  elementdecl   ::=   '<!ELEMENT' S Name S contentspec S? '>'

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttlistDecl

> [52] 	AttlistDecl   ::=   '<!ATTLIST' S Name AttDef* S? '>'

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityDecl

> [70] 	EntityDecl   ::=   GEDecl | PEDecl
> [71] 	GEDecl	   ::=   '<!ENTITY' S Name S EntityDef S? '>'
> [72] 	PEDecl	   ::=   '<!ENTITY' S '%' S Name S PEDef S? '>'

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NotationDecl

> [82] 	NotationDecl   ::=   '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI

> [16] 	PI	   ::=   '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Comment

> [15] 	Comment	   ::=   '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-DeclSep

> [28a] DeclSep	   ::=   PEReference | S

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEReference

> [69]  PEReference   ::=   '%' Name ';'

[Benchmark]
```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= N_ELEMENTS=500 N_ATTRIBUTES=1 /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom    118.316     121.880       182.080      192.754 i/s -     100.000 times in 0.845194s 0.820481s 0.549208s 0.518795s
                 sax    362.797     370.424       557.308      598.347 i/s -     100.000 times in 0.275636s 0.269961s 0.179434s 0.167127s
                pull    436.479     445.266       689.066      755.002 i/s -     100.000 times in 0.229106s 0.224585s 0.145124s 0.132450s
              stream    449.029     465.134       643.141      727.357 i/s -     100.000 times in 0.222703s 0.214992s 0.155487s 0.137484s

Comparison:
                              dom
         after(YJIT):       192.8 i/s
        before(YJIT):       182.1 i/s - 1.06x  slower
               after:       121.9 i/s - 1.58x  slower
              before:       118.3 i/s - 1.63x  slower

                              sax
         after(YJIT):       598.3 i/s
        before(YJIT):       557.3 i/s - 1.07x  slower
               after:       370.4 i/s - 1.62x  slower
              before:       362.8 i/s - 1.65x  slower

                             pull
         after(YJIT):       755.0 i/s
        before(YJIT):       689.1 i/s - 1.10x  slower
               after:       445.3 i/s - 1.70x  slower
              before:       436.5 i/s - 1.73x  slower

                           stream
         after(YJIT):       727.4 i/s
        before(YJIT):       643.1 i/s - 1.13x  slower
               after:       465.1 i/s - 1.56x  slower
              before:       449.0 i/s - 1.62x  slower

```

- YJIT=ON : 1.06x - 1.13x faster
- YJIT=OFF : 1.02x - 1.04x faster
  • Loading branch information
naitoh committed Feb 25, 2024
1 parent 9a075bf commit 8d7fc13
Showing 1 changed file with 74 additions and 66 deletions.
140 changes: 74 additions & 66 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -251,76 +251,84 @@ def pull_event
if @document_status == :in_doctype
@source.read
@source.match(/\s*/um, true, false) # skip spaces
if match = @source.match( /(%.*?;)\s*$/um, true, false)
return [ :externalentity, match[1] ]
elsif match = @source.match(/(<!ELEMENT.*?)>/um, true, false)
return [ :elementdecl, match[1] ]
elsif @source.match( "<!ENTITY", true, false)
match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
ref = false
if match[1] == '%'
ref = true
match.delete_at 1
end
# Now we have to sort out what kind of entity reference this is
if match[2] == 'SYSTEM'
# External reference
match[3] = match[3][1..-2] # PUBID
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
elsif match[2] == 'PUBLIC'
# External reference
match[3] = match[3][1..-2] # PUBID
match[4] = match[4][1..-2] # HREF
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
else
match[2] = match[2][1..-2]
match.pop if match.size == 4
# match is [ :entity, name, value ]
end
match << '%' if ref
return match
elsif @source.match( "<!ATTLIST", true, false)
md = @source.match( ATTLISTDECL_PATTERN, true )
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
element = md[1]
contents = md[0]

pairs = {}
values = md[0].scan( ATTDEF_RE )
values.each do |attdef|
unless attdef[3] == "#IMPLIED"
attdef.compact!
val = attdef[3]
val = attdef[4] if val == "#FIXED "
pairs[attdef[0]] = val
if attdef[0] =~ /^xmlns:(.*)/
@nsstack[0] << $1
end
if match = @source.match("<!", true, false)
if match = @source.match(/(ELEMENT.*?)>/um, true, false)
return [ :elementdecl, "<!" + match[1] ]
elsif @source.match( "ENTITY", true, false)
match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
ref = false
if match[1] == '%'
ref = true
match.delete_at 1
end
end
return [ :attlistdecl, element, pairs, contents ]
elsif @source.match( "<!NOTATION", true, false)
base_error_message = "Malformed notation declaration"
unless @source.match(/\s+/um, true)
if @source.match(/\s*>/um)
message = "#{base_error_message}: name is missing"
# Now we have to sort out what kind of entity reference this is
if match[2] == 'SYSTEM'
# External reference
match[3] = match[3][1..-2] # PUBID
match.delete_at(4) if match.size > 4 # Chop out NDATA decl
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
elsif match[2] == 'PUBLIC'
# External reference
match[3] = match[3][1..-2] # PUBID
match[4] = match[4][1..-2] # HREF
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
else
message = "#{base_error_message}: invalid declaration name"
match[2] = match[2][1..-2]
match.pop if match.size == 4
# match is [ :entity, name, value ]
end
@source.string = " <!NOTATION" + @source.buffer
raise REXML::ParseException.new(message, @source)
end
name = parse_name(base_error_message)
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: true)
unless @source.match(/\s*>/um, true)
message = "#{base_error_message}: garbage before end >"
raise REXML::ParseException.new(message, @source)
match << '%' if ref
return match
elsif @source.match( "ATTLIST", true, false)
md = @source.match( ATTLISTDECL_PATTERN, true )
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
element = md[1]
contents = md[0]

pairs = {}
values = md[0].scan( ATTDEF_RE )
values.each do |attdef|
unless attdef[3] == "#IMPLIED"
attdef.compact!
val = attdef[3]
val = attdef[4] if val == "#FIXED "
pairs[attdef[0]] = val
if attdef[0] =~ /^xmlns:(.*)/
@nsstack[0] << $1
end
end
end
return [ :attlistdecl, element, pairs, contents ]
elsif @source.match( "NOTATION", true, false)
base_error_message = "Malformed notation declaration"
unless @source.match(/\s+/um, true)
if @source.match(/\s*>/um)
message = "#{base_error_message}: name is missing"
else
message = "#{base_error_message}: invalid declaration name"
end
@source.string = " <!NOTATION" + @source.buffer
raise REXML::ParseException.new(message, @source)
end
name = parse_name(base_error_message)
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: true)
unless @source.match(/\s*>/um, true)
message = "#{base_error_message}: garbage before end >"
raise REXML::ParseException.new(message, @source)
end
return [:notationdecl, name, *id]
elsif md = @source.match( /--(.*?)-->/um, true, false)
case md[1]
when /--/, /-\z/
raise REXML::ParseException.new("Malformed comment", @source)
end
return [ :comment, md[1] ] if md
end
return [:notationdecl, name, *id]
elsif match = @source.match( /(%.*?;)\s*$/um, true, false)
return [ :externalentity, match[1] ]
elsif @source.match( /\]\s*>/um, true, false)
@document_status = :after_doctype
return [ :end_doctype ]
Expand Down

0 comments on commit 8d7fc13

Please sign in to comment.