Skip to content

Commit

Permalink
adding parser context and replace_entities accessor. closes #76
Browse files Browse the repository at this point in the history
  • Loading branch information
tenderlove committed Oct 6, 2009
1 parent fbe7217 commit 795cc5a
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rdoc
Expand Up @@ -17,6 +17,7 @@
* Added XML::Node#create_external_subset
* Added XML::Node#create_internal_subset
* XML Builder can append raw strings (GH #141, patch from dudleyf)
* XML::SAX::ParserContext added

* Bugfixes

Expand Down
64 changes: 64 additions & 0 deletions ext/nokogiri/xml_sax_parser_context.c
Expand Up @@ -13,6 +13,12 @@ static void deallocate(xmlParserCtxtPtr ctxt)
NOKOGIRI_DEBUG_END(handler);
}

/*
* call-seq:
* parse_io(io, encoding)
*
* Parse +io+ object with +encoding+
*/
static VALUE parse_io(VALUE klass, VALUE io, VALUE encoding)
{
xmlCharEncoding enc = (xmlCharEncoding)NUM2INT(encoding);
Expand All @@ -29,12 +35,24 @@ static VALUE parse_io(VALUE klass, VALUE io, VALUE encoding)
return Data_Wrap_Struct(klass, NULL, deallocate, ctxt);
}

/*
 * call-seq:
 *  parse_file(filename)
 *
 * Parse file given +filename+
 *
 * Raises a RuntimeError when no parser context can be created for
 * +filename+.
 */
static VALUE parse_file(VALUE klass, VALUE filename)
{
  xmlParserCtxtPtr ctxt = xmlCreateFileParserCtxt(StringValuePtr(filename));

  /* xmlCreateFileParserCtxt returns NULL on failure.  Wrapping a NULL pointer
   * would hand Ruby an object whose accessors dereference NULL, so fail
   * loudly here instead. */
  if(NULL == ctxt)
    rb_raise(rb_eRuntimeError, "could not create a parser context for file");

  return Data_Wrap_Struct(klass, NULL, deallocate, ctxt);
}

/*
* call-seq:
* parse_memory(data)
*
* Parse the XML stored in memory in +data+
*/
static VALUE parse_memory(VALUE klass, VALUE data)
{
if(Qnil == data) rb_raise(rb_eArgError, "data cannot be nil");
Expand All @@ -49,6 +67,12 @@ static VALUE parse_memory(VALUE klass, VALUE data)
return Data_Wrap_Struct(klass, NULL, deallocate, ctxt);
}

/*
* call-seq:
* parse_with(sax_handler)
*
* Use +sax_handler+ and parse the current document
*/
static VALUE parse_with(VALUE self, VALUE sax_handler)
{
if(!rb_obj_is_kind_of(sax_handler, cNokogiriXmlSaxParser))
Expand All @@ -74,6 +98,44 @@ static VALUE parse_with(VALUE self, VALUE sax_handler)
NOKOGIRI_SAX_TUPLE_DESTROY(ctxt->userData);
}

/*
 * call-seq:
 *  replace_entities=(boolean)
 *
 * Should this parser replace entities?  &amp; will get converted to '&' if
 * set to true.
 *
 * Any truthy +boolean+ turns replacement on; +false+ and +nil+ turn it off.
 * Returns +boolean+ unchanged.
 */
static VALUE set_replace_entities(VALUE self, VALUE value)
{
  xmlParserCtxtPtr ctxt;
  Data_Get_Struct(self, xmlParserCtxt, ctxt);

  /* RTEST follows Ruby truthiness, so nil disables replacement just like
   * false does (a plain comparison against Qfalse treated nil as true). */
  ctxt->replaceEntities = RTEST(value) ? 1 : 0;

  return value;
}

/*
 * call-seq:
 *  replace_entities
 *
 * Reports whether this parser replaces entities.  When true, &amp; will get
 * converted to '&'.
 */
static VALUE get_replace_entities(VALUE self)
{
  xmlParserCtxtPtr ctxt;
  Data_Get_Struct(self, xmlParserCtxt, ctxt);

  /* Any non-zero replaceEntities flag is reported as true. */
  return ctxt->replaceEntities ? Qtrue : Qfalse;
}

void init_xml_sax_parser_context()
{
VALUE nokogiri = rb_define_module("Nokogiri");
Expand All @@ -88,4 +150,6 @@ void init_xml_sax_parser_context()
rb_define_singleton_method(klass, "file", parse_file, 1);

rb_define_method(klass, "parse_with", parse_with, 1);
rb_define_method(klass, "replace_entities=", set_replace_entities, 1);
rb_define_method(klass, "replace_entities", get_replace_entities, 0);
}
18 changes: 12 additions & 6 deletions lib/nokogiri/xml/sax/parser.rb
Expand Up @@ -76,19 +76,21 @@ def initialize doc = Nokogiri::XML::SAX::Document.new, encoding = 'UTF-8'
###
# Parse given +thing+ which may be a string containing xml, or an
# IO object.
def parse thing
def parse thing, &block
if thing.respond_to?(:read) && thing.respond_to?(:close)
parse_io(thing)
parse_io(thing, &block)
else
parse_memory(thing)
parse_memory(thing, &block)
end
end

###
# Parse given +io+
def parse_io io, encoding = 'ASCII'
@encoding = encoding
ParserContext.io(io, ENCODINGS[encoding]).parse_with self
ctx = ParserContext.io(io, ENCODINGS[encoding])
yield ctx if block_given?
ctx.parse_with self
end

###
Expand All @@ -97,11 +99,15 @@ def parse_file filename
raise ArgumentError unless filename
raise Errno::ENOENT unless File.exists?(filename)
raise Errno::EISDIR if File.directory?(filename)
ParserContext.file(filename).parse_with self
ctx = ParserContext.file filename
yield ctx if block_given?
ctx.parse_with self
end

def parse_memory data
ParserContext.memory(data).parse_with(self)
ctx = ParserContext.memory data
yield ctx if block_given?
ctx.parse_with self
end
end
end
Expand Down
32 changes: 32 additions & 0 deletions test/xml/sax/test_parser.rb
Expand Up @@ -11,6 +11,38 @@ def setup
@parser = XML::SAX::Parser.new(Doc.new)
end

def test_parser_context_yielded_io
doc = Doc.new
parser = XML::SAX::Parser.new doc
xml = "<foo a='&amp;b'/>"

block_called = false
parser.parse(StringIO.new(xml)) { |ctx|
block_called = true
ctx.replace_entities = true

This comment has been minimized.

Copy link
@rosenfeld

rosenfeld Jun 30, 2016

Contributor

Try setting this to false and see if the array would be ['a', '&amp;b'].

}

assert block_called

assert_equal ['a', '&b'], doc.start_elements.first.last
end

# The block given to Parser#parse receives the parser context when the
# input is a String; enabling replace_entities on it is expected to make
# the attribute value arrive with the entity already substituted.
def test_parser_context_yielded_in_memory
  handler = Doc.new
  yielded = false

  XML::SAX::Parser.new(handler).parse("<foo a='&amp;b'/>") do |context|
    yielded = true
    context.replace_entities = true
  end

  assert yielded
  assert_equal ['a', '&b'], handler.start_elements.first.last
end

def test_xml_decl
{
'' => nil,
Expand Down
9 changes: 9 additions & 0 deletions test/xml/sax/test_parser_context.rb
Expand Up @@ -6,6 +6,15 @@ module Nokogiri
module XML
module SAX
class TestParserContext < Nokogiri::SAX::TestCase
# The replace_entities accessor must read back whatever was last written,
# checked first for false and then for true.
def test_replace_entities
  context = ParserContext.new(StringIO.new('<root />'), 'UTF-8')

  [false, true].each do |flag|
    context.replace_entities = flag
    assert_equal flag, context.replace_entities
  end
end

def test_from_io
assert_nothing_raised do
ParserContext.new StringIO.new('fo'), 'UTF-8'
Expand Down

9 comments on commit 795cc5a

@rosenfeld
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @tenderlove, this commit seems to be broken. There's only test cases for replace_entities = true. You should have also tested the output with replace_entities = false. You'll notice the result is the same. Could you please make replace_entities = false work as expected, by sending raw "&" to the sax parser with no substitutions?

@tenderlove
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rosenfeld patches welcome

@rosenfeld
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wish I had good understanding of libxml2 :(

@tenderlove
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rosenfeld telling me what "I should have done" on a 7 year old commit isn't a great way to ask for help.

@rosenfeld
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can contribute with a failing test case though if that helps. Sorry, I didn't mean to take you down. I'm not a native English speaker so please forgive me if something sound rude to you.

@rosenfeld
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I spent a few hours today trying to understand what this replace_entities really does. After reading lots of PR and issues I ended up watching this commit and decided to comment on what is the root cause of the main problem I'm experiencing and noticed yesterday. I didn't mean to harsh you though.

@rosenfeld
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I even tried to read nokogiri's source and I ran the test suite and changed that param in the test and could confirm the result is the same, that's why I wrote about this, but intending to explain that it doesn't really does what you probably think it does... I tried changing the source to set the default behavior of entity substitutions and the result was the same. I'm quite lost actually, that's why I'm asking for help. Sorry again if this is not what you understood from my previous comments.

@tenderlove
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I didn't mean to take you down. I'm not a native English speaker so please forgive me if something sound rude to you.

No problem. I can understand that! Can you send a failing test case? That would help me debug. It could be that libxml2 is broken and we'll have to fix that.

@rosenfeld
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I'll work on that after having lunch and create a PR. Thanks a lot!

Please sign in to comment.