Skip to content

Commit

Permalink
fix: empty fragment encoding
Browse files Browse the repository at this point in the history
and improve test coverage around fragment encoding

Closes #2649
  • Loading branch information
flavorjones committed Sep 19, 2022
1 parent 75af7e8 commit a582143
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 36 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ This version of Nokogiri uses [`jar-dependencies`](https://github.com/mkristian/
### Fixed

* `SAX::Parser`'s `encoding` attribute will not be clobbered when an alternative encoding is passed into `SAX::Parser#parse_io`. [[#1942](https://github.com/sparklemotion/nokogiri/issues/1942)] (Thanks, [@kp666](https://github.com/kp666)!)
* Serializing an empty `HTML4::DocumentFragment` now returns a string in the requested encoding, falling back to the document's encoding. Previously the resulting empty string was always encoded as `US-ASCII`. [[#2649](https://github.com/sparklemotion/nokogiri/issues/2649)]
* [CRuby] UTF-16-encoded documents longer than ~4000 code points now serialize properly. Previously the serialized document was corrupted when it exceeded the length of libxml2's internal string buffer. [[#752](https://github.com/sparklemotion/nokogiri/issues/752)]
* [CRuby] The HTML5 parser now correctly handles text at the end of `form` elements.
* [CRuby] `HTML5::Document#fragment` now always uses `body` as the parsing context. Previously, fragments were parsed in the context of the associated document's root node, which allowed for inconsistent parsing. [[#2553](https://github.com/sparklemotion/nokogiri/issues/2553)]
Expand Down
7 changes: 6 additions & 1 deletion lib/nokogiri/xml/node_set.rb
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,12 @@ def to_html(*args)
options[:save_with] ||= Node::SaveOptions::DEFAULT_HTML
args.insert(0, options)
end
map { |x| x.to_html(*args) }.join
if empty?
encoding = (args.first.is_a?(Hash) ? args.first[:encoding] : nil) || document.encoding
"".encode(encoding)
else
map { |x| x.to_html(*args) }.join
end
end

###
Expand Down
91 changes: 56 additions & 35 deletions test/html4/test_document_fragment.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,6 @@ def test_ascii_8bit_encoding
assert_equal("hello", Nokogiri::HTML4::DocumentFragment.parse(s).to_html)
end

# Parsing an EUC-JP encoded string yields the expected text content.
def test_inspect_encoding
  euc_jp_markup = "<div>こんにちは!</div>".encode("EUC-JP")
  parsed = Nokogiri::HTML4::DocumentFragment.parse(euc_jp_markup)

  assert_equal("こんにちは!", parsed.content)
end

# Nokogiri::HTML4.fragment, given an EUC-JP string and no explicit encoding,
# reports EUC-JP as the document encoding and preserves the text content.
def test_html_parse_encoding
  euc_jp_markup = "<div>こんにちは!</div>".encode("EUC-JP")
  parsed = Nokogiri::HTML4.fragment(euc_jp_markup)

  assert_equal("EUC-JP", parsed.document.encoding)
  assert_equal("こんにちは!", parsed.content)
end

def test_unlink_empty_document
frag = Nokogiri::HTML4::DocumentFragment.parse("").unlink # must_not_raise
assert_nil(frag.parent)
Expand All @@ -38,20 +25,6 @@ def test_colons_are_not_removed
assert_match(/3:30/, doc.to_s)
end

# DocumentFragment.parse with an explicit encoding argument sets that
# encoding on the fragment's document.
def test_parse_encoding
  markup = "<div>hello world</div>"
  parsed = Nokogiri::HTML4::DocumentFragment.parse(markup, "ISO-8859-1")

  assert_equal("ISO-8859-1", parsed.document.encoding)
  assert_equal("hello world", parsed.content)
end

# Nokogiri::HTML4.fragment with an explicit encoding argument sets that
# encoding on the fragment's document.
def test_html_parse_with_encoding
  markup = "<div>hello world</div>"
  parsed = Nokogiri::HTML4.fragment(markup, "ISO-8859-1")

  assert_equal("ISO-8859-1", parsed.document.encoding)
  assert_equal("hello world", parsed.content)
end

# Parsing "<br />" in the context of the root node serializes back as "<br>".
def test_parse_in_context
  parsed_in_context = html.root.parse("<br />")

  assert_equal("<br>", parsed_in_context.to_s)
end
Expand All @@ -76,14 +49,6 @@ def test_ancestors_search
assert(li.matches?("li"))
end

# A fragment containing multibyte text round-trips unchanged when serialized
# with an explicit UTF-8 encoding.
def test_fun_encoding
  markup = %(<body>こんにちは</body>)
  fragment = Nokogiri::HTML4::DocumentFragment.parse(markup)
  serialized = fragment.to_html(encoding: "UTF-8")

  assert_equal(markup, serialized)
end

# DocumentFragment.new returns a truthy object when given a document.
def test_new
  fragment = Nokogiri::HTML4::DocumentFragment.new(html)

  assert(fragment)
end
Expand Down Expand Up @@ -306,6 +271,62 @@ def test_dup_should_create_an_html_document_fragment
assert_instance_of(Nokogiri::HTML4::DocumentFragment, duplicate)
end

# Encoding behavior of HTML4 fragments: encoding taken from the input string,
# explicit encoding arguments, and the encoding of serialized empty fragments.
describe "encoding" do
  describe "#fragment" do
    it "parses an encoded string" do
      input = "<div>こんにちは!</div>".encode("EUC-JP")

      fragment = Nokogiri::HTML4.fragment(input)
      assert_equal("EUC-JP", fragment.document.encoding)
      assert_equal("こんにちは!", fragment.content)
    end

    # The assertions check the document's encoding, not a returned string.
    it "sets the document encoding to the passed encoding" do
      input = "<div>hello world</div>"

      fragment = Nokogiri::HTML4.fragment(input, "ISO-8859-1")
      assert_equal("ISO-8859-1", fragment.document.encoding)
      assert_equal("hello world", fragment.content)
    end
  end

  describe "#parse" do
    it "parses an encoded string" do
      input = "<div>こんにちは!</div>".encode("EUC-JP")

      fragment = Nokogiri::HTML4::DocumentFragment.parse(input)
      assert_equal("EUC-JP", fragment.document.encoding)
      assert_equal("こんにちは!", fragment.content)
    end

    # The assertions check the document's encoding, not a returned string.
    it "sets the document encoding to the passed encoding" do
      input = "<div>hello world</div>"

      fragment = Nokogiri::HTML4::DocumentFragment.parse(input, "ISO-8859-1")
      assert_equal("ISO-8859-1", fragment.document.encoding)
      assert_equal("hello world", fragment.content)
    end

    # An empty fragment serializes to an empty string; that string should
    # carry the encoding the fragment was parsed with.
    it "respects encoding for empty strings" do
      ["UTF-8", "US-ASCII", "ISO-8859-1"].each do |encoding|
        fragment = Nokogiri::HTML4::DocumentFragment.parse("", encoding)
        assert_equal(encoding, fragment.to_html.encoding.to_s)
      end
    end
  end

  describe "#to_html" do
    # An explicit :encoding option overrides the encoding used at parse time.
    it "serializes empty strings with the passed encoding" do
      fragment = Nokogiri::HTML4::DocumentFragment.parse("", "UTF-8")

      assert_equal("ISO-8859-1", fragment.to_html(encoding: "ISO-8859-1").encoding.to_s)
      assert_equal("US-ASCII", fragment.to_html(encoding: "US-ASCII").encoding.to_s)
    end
  end
end

describe "parse options" do
let(:html4_default) do
Nokogiri::XML::ParseOptions.new(Nokogiri::XML::ParseOptions::DEFAULT_HTML)
Expand Down

0 comments on commit a582143

Please sign in to comment.