Skip to content

Commit

Permalink
Gumbo common options (#3205)
Browse files Browse the repository at this point in the history
**What problem is this PR intended to solve?**

See discussion at #3199 

cc @stevecheckoway 

(Recreation of #3204 with additional commits)
  • Loading branch information
flavorjones committed May 24, 2024
2 parents 926b0b9 + 9fab4cb commit bb8bc2b
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 16 deletions.
47 changes: 35 additions & 12 deletions ext/nokogiri/gumbo.c
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,35 @@ parse_cleanup(VALUE parse_args)
return Qnil;
}

// Build the GumboOptions shared by the document parse and the fragment parse
// from the keyword-argument hash.
//
// kwargs must supply max_attributes, max_errors and max_tree_depth; all three
// are treated as required, so rb_get_kwargs raises if any of them is missing.
// A negative max_tree_depth is stored as UINT_MAX.
static GumboOptions
common_options(VALUE kwargs) {
  // Slot of each keyword in the lookup tables below. The name table and the
  // value table are both indexed by these constants, so the two cannot drift
  // out of step with each other.
  enum {
    KW_MAX_ATTRIBUTES,
    KW_MAX_ERRORS,
    KW_MAX_TREE_DEPTH,
    KW_REQUIRED_COUNT
  };

  ID names[KW_REQUIRED_COUNT];
  VALUE given[KW_REQUIRED_COUNT];

  names[KW_MAX_ATTRIBUTES] = rb_intern_const("max_attributes");
  names[KW_MAX_ERRORS] = rb_intern_const("max_errors");
  names[KW_MAX_TREE_DEPTH] = rb_intern_const("max_tree_depth");

  // Pull out the values for the required keywords (no optional ones); an
  // error is raised here when a required keyword is absent.
  rb_get_kwargs(kwargs, names, KW_REQUIRED_COUNT, 0, given);

  // Start from the library defaults and override only the common limits.
  GumboOptions result = kGumboDefaultOptions;
  result.max_attributes = NUM2INT(given[KW_MAX_ATTRIBUTES]);
  result.max_errors = NUM2INT(given[KW_MAX_ERRORS]);

  // max_tree_depth is compared against UINT_MAX by callers, so a negative
  // request is mapped onto UINT_MAX rather than converted directly.
  int requested_depth = NUM2INT(given[KW_MAX_TREE_DEPTH]);
  if (requested_depth < 0) {
    result.max_tree_depth = UINT_MAX;
  } else {
    result.max_tree_depth = (unsigned int)requested_depth;
  }

  return result;
}

static VALUE parse_continue(VALUE parse_args);

/*
Expand All @@ -331,10 +360,7 @@ rb_gumbo_s_parse(int argc, VALUE *argv, VALUE _self)
kwargs = rb_hash_new();
}

GumboOptions options = kGumboDefaultOptions;
options.max_attributes = NUM2INT(rb_hash_aref(kwargs, ID2SYM(rb_intern_const("max_attributes"))));
options.max_errors = NUM2INT(rb_hash_aref(kwargs, ID2SYM(rb_intern_const("max_errors"))));
options.max_tree_depth = NUM2INT(rb_hash_aref(kwargs, ID2SYM(rb_intern_const("max_tree_depth"))));
GumboOptions options = common_options(kwargs);

GumboOutput *output = perform_parse(&options, input);
ParseArgs args = {
Expand Down Expand Up @@ -440,6 +466,8 @@ rb_gumbo_s_fragment(int argc, VALUE *argv, VALUE _self)
kwargs = rb_hash_new();
}

GumboOptions options = common_options(kwargs);

if (NIL_P(ctx)) {
ctx_tag = "body";
ctx_ns = GUMBO_NAMESPACE_HTML;
Expand Down Expand Up @@ -543,20 +571,15 @@ rb_gumbo_s_fragment(int argc, VALUE *argv, VALUE _self)
}

// Perform a fragment parse.
GumboOptions options = kGumboDefaultOptions;
options.max_attributes = NUM2INT(rb_hash_aref(kwargs, ID2SYM(rb_intern_const("max_attributes"))));
options.max_errors = NUM2INT(rb_hash_aref(kwargs, ID2SYM(rb_intern_const("max_errors"))));

// Add one to account for the HTML element.
int depth = NUM2INT(rb_hash_aref(kwargs, ID2SYM(rb_intern_const("max_tree_depth"))));
options.max_tree_depth = depth < 0 ? -1 : (depth + 1);

options.fragment_context = ctx_tag;
options.fragment_namespace = ctx_ns;
options.fragment_encoding = encoding;
options.quirks_mode = quirks_mode;
options.fragment_context_has_form_ancestor = form;

// Add one to the max tree depth to account for the HTML element.
if (options.max_tree_depth < UINT_MAX) { options.max_tree_depth++; }

GumboOutput *output = perform_parse(&options, tags);
ParseArgs args = {
.output = output,
Expand Down
2 changes: 1 addition & 1 deletion lib/nokogiri/html5/document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def do_parse(string_or_io, url, encoding, **options)
string = HTML5.read_and_encode(string_or_io, encoding)

options[:max_attributes] ||= Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
options[:max_errors] ||= options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
options[:max_errors] ||= options.delete(:max_parse_errors) || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
options[:max_tree_depth] ||= Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH

doc = Nokogiri::Gumbo.parse(string, url, self, **options)
Expand Down
2 changes: 1 addition & 1 deletion lib/nokogiri/html5/document_fragment.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def initialize(doc, tags = nil, ctx = nil, options = {}) # rubocop:disable Lint/
tags = Nokogiri::HTML5.read_and_encode(tags, nil)

options[:max_attributes] ||= Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
options[:max_errors] ||= options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
options[:max_errors] ||= options.delete(:max_parse_errors) || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
options[:max_tree_depth] ||= Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH

Nokogiri::Gumbo.fragment(self, tags, ctx, **options)
Expand Down
4 changes: 2 additions & 2 deletions test/html5/test_encoding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def round_trip_through(str, enc)
define_method("test_parse_encoded_#{enc[0]}".to_sym) do
html = "<!DOCTYPE html><span>#{enc[1]}</span>"
encoded_html = round_trip_through(html, enc[0])
doc = Nokogiri::HTML5(encoded_html, encoding: enc[0])
doc = Nokogiri::HTML5(encoded_html, enc[0])
span = doc.at("/html/body/span")
refute_nil span
assert_equal enc[1], span.content
Expand All @@ -210,7 +210,7 @@ def round_trip_through(str, enc)
skip "https://bugs.ruby-lang.org/issues/15033" if enc[0] == "ISO-2022-JP"
round_trip_through(enc[1], enc[0])
encoded = encodings_doc.serialize(encoding: enc[0])
doc = Nokogiri::HTML5(encoded, encoding: enc[0])
doc = Nokogiri::HTML5(encoded, enc[0])
assert_equal encodings_html, doc.serialize
end
end
Expand Down
2 changes: 2 additions & 0 deletions test/html5/test_nokogumbo.rb
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ def test_max_depth_parse
end

assert(Nokogiri::HTML5(html, max_tree_depth: depth))
assert(Nokogiri::HTML5(html, max_tree_depth: -1))
end

def test_max_depth_fragment
Expand All @@ -278,6 +279,7 @@ def test_max_depth_fragment
end

assert(Nokogiri::HTML5.fragment(html, max_tree_depth: depth))
assert(Nokogiri::HTML5.fragment(html, max_tree_depth: -1))
end

def test_document_encoding
Expand Down

0 comments on commit bb8bc2b

Please sign in to comment.