Skip to content

Commit

Permalink
Improve impl when noblanks option is set. Previous fix didn't work when data was given as a file. Plus, stop weird text processing by changing test data to fit Xerces.
Browse files Browse the repository at this point in the history
  • Loading branch information
yokolet committed Apr 10, 2010
1 parent 583f6f0 commit 0b7928c
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 72 deletions.
88 changes: 17 additions & 71 deletions ext/java/nokogiri/internals/XmlDomParserContext.java
Expand Up @@ -2,14 +2,8 @@

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import nokogiri.XmlDocument;
import nokogiri.XmlSyntaxError;
Expand All @@ -22,6 +16,8 @@
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
Expand Down Expand Up @@ -214,80 +210,30 @@ public XmlDocument parse(ThreadContext context,
}

protected Document do_parse() throws SAXException, IOException {
if (noBlanks() || getPublicId(getInputSource()) != null) {
String adjustedContent = tweakPublicId(asString(getInputSource()));
if (noBlanks()) {
adjustedContent = adjustedContent.replaceAll("(>\\n)", ">")
.replaceAll("\\s{1,}<", "<")
.replaceAll(">\\s{1,}", ">");
parser.parse(getInputSource());
if (noBlanks()) {
List<Node> emptyNodes = new ArrayList<Node>();
findEmptyTexts(parser.getDocument(), emptyNodes);
if (emptyNodes.size() > 0) {
for (Node node : emptyNodes) {
node.getParentNode().removeChild(node);
}
}
StringReader sr = new StringReader((new String(adjustedContent)));
parser.parse(new InputSource(sr));
} else {
parser.parse(getInputSource());
}
return parser.getDocument();
}

private String asString(InputSource input) throws IOException {
Reader reader = input.getCharacterStream();
/*
* when this block is used, XmlSchema.from_document raises ClassCastException since
* the given document is null.
InputStream istream = input.getByteStream();
if (reader == null) {
if (istream != null) {
reader = new InputStreamReader(istream);
private void findEmptyTexts(Node node, List<Node> emptyNodes) {
if (node.getNodeType() == Node.TEXT_NODE && "".equals(node.getTextContent().trim())) {
emptyNodes.add(node);
} else {
NodeList children = node.getChildNodes();
for (int i=0; i < children.getLength(); i++) {
findEmptyTexts(children.item(i), emptyNodes);
}
}*/
if (reader == null) return "";
StringBuffer content = new StringBuffer();
char[] cbuf = new char[2048];
int length;
while ((length = reader.read(cbuf)) != -1) {
content.append(cbuf, 0, length);
}
//if (istream != null) istream.reset();
reader.reset();
return new String(content);
}

// Matches text that starts with "<!DOCTYPE", is followed by at least one character and
// the keyword "PUBLIC", and ends at a closing '>' (the part after PUBLIC may be empty).
private static Pattern doctype = Pattern.compile("<!DOCTYPE(.)+PUBLIC(()|[^>]*)>");
// Matches a double-quoted run of one or more characters drawn from: '-', '/', digits,
// ':', '_', '.', ASCII letters, ',' and whitespace.
private static Pattern publicId = Pattern.compile("\"(-|/|\\d|:|-|_|\\.|[a-zA-Z]|,|\\s)+\"");

/**
 * Searches the text of the given input for a DOCTYPE declaration that uses PUBLIC.
 *
 * @param input source whose content is obtained through asString
 * @return the first substring matched by the doctype pattern, or null when there is no match
 * @throws IOException propagated from asString
 */
private String getPublicId(InputSource input) throws IOException {
    final String content = asString(input);
    final Matcher declaration = doctype.matcher(content);
    // Only the first match is of interest.
    return declaration.find() ? declaration.group() : null;
}

/**
 * Returns the adjusted form of the given text as produced by tweak, or the text itself
 * when tweak reports that no adjustment applies.
 *
 * @param str text to adjust
 * @return the result of tweak(str) when it is non-null, otherwise str unchanged
 */
private String tweakPublicId(String str) {
    // tweak() is evaluated once; replacing the whole of str with the tweaked text
    // (as the earlier code did) yields exactly the tweaked text.
    final String adjusted = tweak(str);
    return adjusted == null ? str : adjusted;
}

/**
 * Appends an empty quoted literal after the only quoted identifier found in the text.
 *
 * @param str text to inspect
 * @return str with {@code  ""} inserted after every occurrence of its single quoted
 *         identifier, or null when getIds does not return exactly one identifier
 */
private String tweak(String str) {
    final List<String> quoted = getIds(str);
    if (quoted.size() != 1) {
        // Zero, two or more identifiers: nothing to adjust.
        return null;
    }
    final String onlyId = quoted.get(0);
    return str.replace(onlyId, onlyId + " \"\"");
}

/**
 * Collects every substring of the text that matches the publicId pattern.
 *
 * @param str text to scan
 * @return the matches in the order they occur; empty when there are none
 */
private List<String> getIds(String str) {
    final Matcher quoted = publicId.matcher(str);
    final List<String> ids = new ArrayList<String>();
    for (boolean found = quoted.find(); found; found = quoted.find()) {
        ids.add(quoted.group());
    }
    return ids;
}


/**
 * @return the current value of the dtdAttr field
 */
public boolean dtdAttr() {
    return dtdAttr;
}

/**
 * @return the current value of the dtdLoad field
 */
public boolean dtdLoad() {
    return dtdLoad;
}
Expand Down
6 changes: 5 additions & 1 deletion test/xml/test_dtd.rb
Expand Up @@ -14,7 +14,11 @@ def test_system_id
end

# Checks that the PUBLIC identifier of a DOCTYPE is exposed as external_id
# on the document's internal subset.
def test_external_id
# NOTE(review): this result is reassigned in both branches below and never
# inspected; it looks like the pre-change line of the diff -- confirm.
xml = Nokogiri::XML('<!DOCTYPE foo PUBLIC "bar" ><foo />')
if Nokogiri.uses_libxml?
# With libxml the DOCTYPE carries the public identifier only.
xml = Nokogiri::XML('<!DOCTYPE foo PUBLIC "bar"><foo />')
else
# Otherwise an empty system literal ("") follows the public identifier.
xml = Nokogiri::XML('<!DOCTYPE foo PUBLIC "bar" ""><foo />')
end
assert dtd = xml.internal_subset
assert_equal 'bar', dtd.external_id
end
Expand Down

0 comments on commit 0b7928c

Please sign in to comment.