Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Loading…

Encoding and baseUrl can be specified for html #167

Merged
merged 1 commit into from

2 participants

Steven Kabbes Roman Shtylman
Steven Kabbes

Some HTML documents may have incorrect meta tags which put libxml out
of its default utf-8 parsing mode. Allow these optional parameters to
parseHtml so users can have control of this in these situations.

Steven Kabbes skabbes Encoding and baseUrl can be specified for html
Some HTML documents may have incorrect meta tags which put libxml out
of its default utf-8 parsing mode.  Allow these optional parameters to
parseHtml so users can have control of this in these situations.
a055efc
Steven Kabbes

Cleaned up version of #161

Roman Shtylman defunctzombie merged commit 95c614c into from
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Sep 12, 2012
  1. Steven Kabbes

    Encoding and baseUrl can be specified for html

    skabbes authored
    Some HTML documents may have incorrect meta tags which put libxml out
    of its default utf-8 parsing mode.  Allow these optional parameters to
    parseHtml so users can have control of this in these situations.
This page is out of date. Refresh to see the latest.
24 lib/document.js
View
@@ -56,22 +56,22 @@ Document.prototype.childNodes = function() {
/// @return a string representation of the document
Document.prototype.toString = function() {
return this._toString();
-}
+};
/// @return the document version
Document.prototype.version = function() {
return this._version();
-}
+};
/// @return the document encoding
Document.prototype.encoding = function(encoding) {
return this._encoding(encoding);
-}
+};
/// @return whether the XmlDocument is valid
Document.prototype.validate = function(xsd) {
return this._validate(xsd);
-}
+};
/// @return array of namespaces in document
Document.prototype.namespaces = function() {
@@ -82,15 +82,23 @@ module.exports = Document;
/// parse a string into a html document
/// @param string html string to parse
+/// @param {encoding:string, baseUrl:string} opts optional parsing options
/// @return a Document
-module.exports.fromHtml = function(string) {
- return bindings.fromHtml(string);
-}
+module.exports.fromHtml = function(string, opts) {
+ opts = opts || {};
+
+ // if for some reason user did not specify an object for the options
+ if (typeof(opts) !== 'object') {
+ throw new Error('fromHtml options must be an object');
+ }
+
+ return bindings.fromHtml(string, opts);
+};
/// parse a string into a xml document
/// @param string xml string to parse
/// @return a Document
module.exports.fromXml = function(string) {
return bindings.fromXml(string);
-}
+};
36 src/xml_document.cc
View
@@ -135,6 +135,28 @@ XmlDocument::FromHtml(const v8::Arguments& args)
{
v8::HandleScope scope;
+ v8::Local<v8::Object> options = args[1]->ToObject();
+ v8::Local<v8::Value> baseUrlOpt = options->Get(
+ v8::String::NewSymbol("baseUrl"));
+ v8::Local<v8::Value> encodingOpt = options->Get(
+ v8::String::NewSymbol("encoding"));
+
+ // the base URL that will be used for this HTML parsed document
+ v8::String::Utf8Value baseUrl_(baseUrlOpt->ToString());
+ const char * baseUrl = *baseUrl_;
+ if (!baseUrlOpt->IsString()) {
+ baseUrl = NULL;
+ }
+
+ // the encoding to be used for this document
+ // (leave NULL for libxml to autodetect)
+ v8::String::Utf8Value encoding_(encodingOpt->ToString());
+ const char * encoding = *encoding_;
+
+ if (!encodingOpt->IsString()) {
+ encoding = NULL;
+ }
+
v8::Local<v8::Array> errors = v8::Array::New();
xmlResetLastError();
xmlSetStructuredErrorFunc(reinterpret_cast<void *>(*errors),
@@ -142,15 +164,15 @@ XmlDocument::FromHtml(const v8::Arguments& args)
htmlDocPtr doc;
if (!node::Buffer::HasInstance(args[0])) {
- // Parse a string
- v8::String::Utf8Value str(args[0]->ToString());
- doc = htmlReadMemory(*str, str.length(), NULL, NULL, 0);
+ // Parse a string
+ v8::String::Utf8Value str(args[0]->ToString());
+ doc = htmlReadMemory(*str, str.length(), baseUrl, encoding, 0);
}
else {
- // Parse a buffer
- v8::Local<v8::Object> buf = args[0]->ToObject();
- doc = htmlReadMemory(node::Buffer::Data(buf), node::Buffer::Length(buf),
- NULL, NULL, 0);
+ // Parse a buffer
+ v8::Local<v8::Object> buf = args[0]->ToObject();
+ doc = htmlReadMemory(node::Buffer::Data(buf), node::Buffer::Length(buf),
+ baseUrl, encoding, 0);
}
xmlSetStructuredErrorFunc(NULL, NULL);
10 test/fixtures/parser.euc_jp.html
View
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=euc-jp" />
+ <title>テスト</title>
+ </head>
+ <body>
+ <div>テスト</div>
+ </body>
+</html>
33 test/html_parser.js
View
@@ -32,6 +32,39 @@ module.exports.parse = function(assert) {
assert.done();
};
+// Although libxml defaults to a utf-8 encoding, if no encoding is explicitly
+// specified it will guess the encoding based on the meta http-equiv tags
+// available. This test shows that the "guessed" encoding can be overridden.
+module.exports.parse_force_encoding = function(assert) {
+ var filename = __dirname + '/fixtures/parser.euc_jp.html';
+
+ function attempt_parse(encoding, opts) {
+ var str = fs.readFileSync(filename, encoding);
+
+ var doc = libxml.parseHtml(str, opts);
+ assert.equal('html', doc.root().name());
+
+ // make sure libxml rewrites the meta charset of this document
+
+ // calling toString on the document ensures that it is converted to the
+ // correct internal format and the meta tag is replaced with the new one
+ doc.root().toString();
+ var fixedCharset = doc.find('/html/head/meta/@content')[0].value();
+ assert.ok( fixedCharset.indexOf(opts.encoding.toUpperCase() ) !== -1);
+
+ assert.equal('テスト', doc.get('head/title').text());
+ assert.equal('テスト', doc.get('body/div').text());
+ }
+
+ // Parse via a string
+ attempt_parse('utf-8', {encoding: 'utf-8'});
+
+ // Parse via a Buffer
+ attempt_parse(null, {encoding: 'utf-8'});
+
+ assert.done();
+};
+
module.exports.parse_synonym = function(assert) {
assert.strictEqual(libxml.parseHtml, libxml.parseHtmlString);
assert.done();
Something went wrong with that request. Please try again.