Skip to content

Commit

Permalink
Fixes a bug that occurs only when the charset is given by an HTML5-style meta tag.
Browse files Browse the repository at this point in the history
  • Loading branch information
yokolet committed Aug 13, 2011
1 parent 05c5db4 commit f8c3369
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 0 deletions.
37 changes: 37 additions & 0 deletions ext/java/nokogiri/internals/HtmlDomParserContext.java
Expand Up @@ -34,6 +34,7 @@

import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
import static nokogiri.internals.NokogiriHelpers.isNamespace;
import static nokogiri.internals.NokogiriHelpers.stringOrNil;
import nokogiri.HtmlDocument;
import nokogiri.NokogiriService;
import nokogiri.XmlDocument;
Expand All @@ -52,6 +53,8 @@
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.NodeList;

/**
* Parser for HtmlDocument. This class actually parses HtmlDocument using NekoHtml.
Expand Down Expand Up @@ -120,9 +123,43 @@ protected XmlDocument wrapDocument(ThreadContext context,
Document document) {
HtmlDocument htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(context.getRuntime(), klazz);
htmlDocument.setNode(context, document);
if (ruby_encoding.isNil()) {
// ruby_encoding might have been detected by HtmlDocument::EncodingReader
if (detected_encoding != null && !detected_encoding.isNil()) {
ruby_encoding = detected_encoding;
} else {
// no encoding given & no encoding detected, then try to get it
String charset = tryGetCharsetFromHtml5MetaTag(document);
ruby_encoding = stringOrNil(context.getRuntime(), charset);
}
}
htmlDocument.setEncoding(ruby_encoding);
return htmlDocument;
}

/**
 * Looks for an HTML5 style {@code <meta charset="...">} declaration in the parsed DOM tree.
 *
 * NekoHtml doesn't understand the HTML5 meta tag format, so it fails to detect a charset
 * declared that way. The meta element and its charset attribute still exist in the DOM
 * tree, so this method walks html -> head -> meta and reads the attribute directly.
 * Element and attribute names are compared case-insensitively.
 *
 * @param document the parsed document; may be null or have no root element
 * @return the value of the first charset attribute found on a meta element that is a
 *         direct child of head, or null when the document is null, has no root element,
 *         is not rooted at an html element, or declares no such attribute
 */
private String tryGetCharsetFromHtml5MetaTag(Document document) {
    // A document without a root element has nothing to inspect; the unguarded
    // dereference used to throw NullPointerException here.
    if (document == null || document.getDocumentElement() == null) return null;
    if (!"html".equalsIgnoreCase(document.getDocumentElement().getNodeName())) return null;

    NodeList rootChildren = document.getDocumentElement().getChildNodes();
    for (int i = 0; i < rootChildren.getLength(); i++) {
        if (!"head".equalsIgnoreCase(rootChildren.item(i).getNodeName())) continue;

        NodeList headChildren = rootChildren.item(i).getChildNodes();
        for (int j = 0; j < headChildren.getLength(); j++) {
            if (!"meta".equalsIgnoreCase(headChildren.item(j).getNodeName())) continue;

            // getAttributes() returns null for nodes that are not elements.
            NamedNodeMap attributes = headChildren.item(j).getAttributes();
            if (attributes == null) continue;

            for (int k = 0; k < attributes.getLength(); k++) {
                if ("charset".equalsIgnoreCase(attributes.item(k).getNodeName())) {
                    return attributes.item(k).getNodeValue();
                }
            }
        }
    }
    return null;
}

/**
* Filter to strip out attributes that pertain to XML namespaces.
Expand Down
12 changes: 12 additions & 0 deletions ext/java/nokogiri/internals/ParserContext.java
Expand Up @@ -43,6 +43,7 @@

import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyFixnum;
import org.jruby.RubyIO;
import org.jruby.RubyObject;
import org.jruby.RubyString;
Expand All @@ -65,6 +66,7 @@
*/
public class ParserContext extends RubyObject {
protected InputSource source = null;
protected IRubyObject detected_encoding = null;

/**
* Create a file base input source taking into account the current
Expand Down Expand Up @@ -102,8 +104,18 @@ protected InputSource getInputSource() {
public void setInputSource(ThreadContext context, IRubyObject data, IRubyObject url) {
Ruby ruby = context.getRuntime();
String path = (String) url.toJava(String.class);
if (data.getType().respondsTo("detect_encoding")) {
// data is an EncodingReader
try {
data.callMethod(context, "read", RubyFixnum.newFixnum(context.getRuntime(), 1024));
} catch (RaiseException e) {
detected_encoding = e.getException().getInstanceVariable("@encoding");
}
}

if (isAbsolutePath(path)) {
source = new InputSource();
if (detected_encoding != null) source.setEncoding((String) detected_encoding.toJava(String.class));
source.setSystemId(path);
return;
}
Expand Down

0 comments on commit f8c3369

Please sign in to comment.