Skip to content
This repository has been archived by the owner on Aug 26, 2023. It is now read-only.

Commit

Permalink
Limit number of attributes per element to prevent DoS
Browse files Browse the repository at this point in the history
  • Loading branch information
rafbm authored and stevecheckoway committed May 5, 2020
1 parent b41e791 commit b903745
Show file tree
Hide file tree
Showing 8 changed files with 139 additions and 6 deletions.
9 changes: 6 additions & 3 deletions ext/nokogumbo/nokogumbo.c
Expand Up @@ -490,8 +490,9 @@ static VALUE parse_cleanup(ParseArgs *args) {
static VALUE parse_continue(ParseArgs *args);

// Parse a string using gumbo_parse into a Nokogiri document
static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_errors, VALUE max_depth) {
static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) {
GumboOptions options = kGumboDefaultOptions;
options.max_attributes = NUM2INT(max_attributes);
options.max_errors = NUM2INT(max_errors);
options.max_tree_depth = NUM2INT(max_depth);

Expand Down Expand Up @@ -570,6 +571,7 @@ static VALUE fragment (
VALUE doc_fragment,
VALUE tags,
VALUE ctx,
VALUE max_attributes,
VALUE max_errors,
VALUE max_depth
) {
Expand Down Expand Up @@ -676,6 +678,7 @@ static VALUE fragment (
// Perform a fragment parse.
int depth = NUM2INT(max_depth);
GumboOptions options = kGumboDefaultOptions;
options.max_attributes = NUM2INT(max_attributes);
options.max_errors = NUM2INT(max_errors);
// Add one to account for the HTML element.
options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
Expand Down Expand Up @@ -743,8 +746,8 @@ void Init_nokogumbo() {

// Define Nokogumbo module with parse and fragment methods.
VALUE Gumbo = rb_define_module("Nokogumbo");
rb_define_singleton_method(Gumbo, "parse", parse, 4);
rb_define_singleton_method(Gumbo, "fragment", fragment, 5);
rb_define_singleton_method(Gumbo, "parse", parse, 5);
rb_define_singleton_method(Gumbo, "fragment", fragment, 6);

// Add private constant for testing.
rb_define_const(Gumbo, "LINE_SUPPORTED", line_supported);
Expand Down
8 changes: 8 additions & 0 deletions gumbo-parser/src/gumbo.h
Expand Up @@ -706,6 +706,14 @@ typedef struct GumboInternalOptions {
*/
bool stop_on_first_error;

/**
* Maximum allowed number of attributes per element. If this limit is
* reached, the parser will start ignoring attributes until the end of
* the current opening tag. Set to `-1` to disable the limit.
* Default: `400`.
*/
int max_attributes;

/**
* Maximum allowed depth for the parse tree. If this limit is exceeded,
* the parser will return early with a partial document and the returned
Expand Down
1 change: 1 addition & 0 deletions gumbo-parser/src/parser.c
Expand Up @@ -48,6 +48,7 @@ typedef uint8_t TagSet[GUMBO_TAG_LAST + 1];
const GumboOptions kGumboDefaultOptions = {
.tab_stop = 8,
.stop_on_first_error = false,
.max_attributes = 400,
.max_tree_depth = 400,
.max_errors = -1,
.fragment_context = NULL,
Expand Down
10 changes: 9 additions & 1 deletion gumbo-parser/src/tokenizer.c
Expand Up @@ -784,12 +784,20 @@ static void add_duplicate_attr_error(GumboParser* parser) {
static void finish_attribute_name(GumboParser* parser) {
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
GumboTagState* tag_state = &tokenizer->_tag_state;
GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;

int max_attributes = parser->_options->max_attributes;
if (max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes) {
reinitialize_tag_buffer(parser);
tag_state->_drop_next_attr_value = true;
return;
}

// May've been set by a previous attribute without a value; reset it here.
tag_state->_drop_next_attr_value = false;
assert(tag_state->_attributes.data);
assert(tag_state->_attributes.capacity);

GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
for (unsigned int i = 0; i < attributes->length; ++i) {
GumboAttribute* attr = attributes->data[i];
if (
Expand Down
3 changes: 3 additions & 0 deletions lib/nokogumbo.rb
Expand Up @@ -5,6 +5,9 @@
require 'nokogumbo/nokogumbo'

module Nokogumbo
# The default maximum number of attributes per element.
DEFAULT_MAX_ATTRIBUTES = 400

# The default maximum number of errors for parsing a document or a fragment.
DEFAULT_MAX_ERRORS = 0

Expand Down
3 changes: 2 additions & 1 deletion lib/nokogumbo/html5/document.rb
Expand Up @@ -41,9 +41,10 @@ def to_xml(options = {}, &block)
private
def self.do_parse(string_or_io, url, encoding, options)
string = HTML5.read_and_encode(string_or_io, encoding)
max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
doc = Nokogumbo.parse(string, url, max_errors, max_depth)
doc = Nokogumbo.parse(string, url, max_attributes, max_errors, max_depth)
doc.encoding = 'UTF-8'
doc
end
Expand Down
3 changes: 2 additions & 1 deletion lib/nokogumbo/html5/document_fragment.rb
Expand Up @@ -12,10 +12,11 @@ def initialize(doc, tags = nil, ctx = nil, options = {})
self.errors = []
return self unless tags

max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
tags = Nokogiri::HTML5.read_and_encode(tags, nil)
Nokogumbo.fragment(self, tags, ctx, max_errors, max_depth)
Nokogumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
end

def serialize(options = {}, &block)
Expand Down
108 changes: 108 additions & 0 deletions test/test_nokogumbo.rb
Expand Up @@ -120,6 +120,111 @@ def test_root_comments
assert_equal ["html", "comment", "html", "comment"], doc.children.map(&:name)
end

def test_max_attributes
html = '<div id="i" class="c" title="t"><img src="s" alt="a"></div>'

doc = Nokogiri::HTML5(html, max_attributes: 2)
assert_equal({ 'id' => 'i', 'class' => 'c' }, attributes(doc.at_css('div')))
assert_equal({ 'src' => 's', 'alt' => 'a' }, attributes(doc.at_css('img')))

doc = Nokogiri::HTML5(html, max_attributes: 1)
assert_equal({ 'id' => 'i' }, attributes(doc.at_css('div')))
assert_equal({ 'src' => 's' }, attributes(doc.at_css('img')))

doc = Nokogiri::HTML5(html, max_attributes: 0)
assert_equal({}, attributes(doc.at_css('div')))
assert_equal({}, attributes(doc.at_css('img')))

# -1 disables limit
doc = Nokogiri::HTML5(html, max_attributes: -1)
assert_equal({ 'id' => 'i', 'class' => 'c', 'title' => 't' }, attributes(doc.at_css('div')))
assert_equal({ 'src' => 's', 'alt' => 'a' }, attributes(doc.at_css('img')))
end

def test_max_attributes_boolean
html = '<label><input checked type="checkbox" disabled name="cheese"> Cheese</label>'

doc = Nokogiri::HTML5(html, max_attributes: 4)
assert_equal({ 'checked' => '', 'type' => 'checkbox', 'disabled' => '', 'name' => 'cheese' }, attributes(doc.at_css('input')))

doc = Nokogiri::HTML5(html, max_attributes: 3)
assert_equal({ 'checked' => '', 'type' => 'checkbox', 'disabled' => '' }, attributes(doc.at_css('input')))

doc = Nokogiri::HTML5(html, max_attributes: 2)
assert_equal({ 'checked' => '', 'type' => 'checkbox' }, attributes(doc.at_css('input')))

doc = Nokogiri::HTML5(html, max_attributes: 1)
assert_equal({ 'checked' => '' }, attributes(doc.at_css('input')))

doc = Nokogiri::HTML5(html, max_attributes: 0)
assert_equal({}, attributes(doc.at_css('input')))
end

def test_default_max_attributes
a = 'a'
attrs = 50_000.times.map { x = a.dup; a.succ!; x }

# <div> contains 50,000 attributes, but default limit is 400. Parsing this would take ages if
# we were not enforcing any limit on attributes. All attributes are duplicated to make sure
# this doesn’t alter performance or end result.
html = "<div #{attrs.map.with_index { |x, i| "data-#{x}=#{i} data-#{x}=#{i}" }.join(' ')}>hello</div>"

doc = Nokogiri::HTML5(html)
div = doc.at_css('div')
assert_equal 'hello', div.text
assert_equal '0', div.attribute("data-#{attrs[0]}").value
assert_equal '1', div.attribute("data-#{attrs[1]}").value
assert_equal '2', div.attribute("data-#{attrs[2]}").value
assert_equal '398', div.attribute("data-#{attrs[398]}").value
assert_equal '399', div.attribute("data-#{attrs[399]}").value
assert_nil div.attribute("data-#{attrs[400]}")
assert_nil div.attribute("data-#{attrs[401]}")
assert_nil div.attribute("data-#{attrs[402]}")
end

def test_fragment_max_attributes
html = '<div id="i" class="c" title="t"><img src="s" alt="a"></div>'

doc = Nokogiri::HTML5.fragment(html, max_attributes: 2)
assert_equal({ 'id' => 'i', 'class' => 'c' }, attributes(doc.at_css('div')))
assert_equal({ 'src' => 's', 'alt' => 'a' }, attributes(doc.at_css('img')))

doc = Nokogiri::HTML5.fragment(html, max_attributes: 1)
assert_equal({ 'id' => 'i' }, attributes(doc.at_css('div')))
assert_equal({ 'src' => 's' }, attributes(doc.at_css('img')))

doc = Nokogiri::HTML5.fragment(html, max_attributes: 0)
assert_equal({}, attributes(doc.at_css('div')))
assert_equal({}, attributes(doc.at_css('img')))

# -1 disables limit
doc = Nokogiri::HTML5.fragment(html, max_attributes: -1)
assert_equal({ 'id' => 'i', 'class' => 'c', 'title' => 't' }, attributes(doc.at_css('div')))
assert_equal({ 'src' => 's', 'alt' => 'a' }, attributes(doc.at_css('img')))
end

def test_fragment_default_max_attributes
a = 'a'
attrs = 50_000.times.map { x = a.dup; a.succ!; x }

# <div> contains 50,000 attributes, but default limit is 400. Parsing this would take ages if
# we were not enforcing any limit on attributes. All attributes are duplicated to make sure
# this doesn’t alter performance or end result.
html = "<div #{attrs.map.with_index { |x, i| "data-#{x}=#{i} data-#{x}=#{i}" }.join(' ')}>hello</div>"

doc = Nokogiri::HTML5.fragment(html)
div = doc.at_css('div')
assert_equal 'hello', div.text
assert_equal '0', div.attribute("data-#{attrs[0]}").value
assert_equal '1', div.attribute("data-#{attrs[1]}").value
assert_equal '2', div.attribute("data-#{attrs[2]}").value
assert_equal '398', div.attribute("data-#{attrs[398]}").value
assert_equal '399', div.attribute("data-#{attrs[399]}").value
assert_nil div.attribute("data-#{attrs[400]}")
assert_nil div.attribute("data-#{attrs[401]}")
assert_nil div.attribute("data-#{attrs[402]}")
end

def test_parse_errors
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- <!-- --></a>", max_errors: 10)
assert_equal doc.errors.length, 2
Expand Down Expand Up @@ -279,4 +384,7 @@ def buffer
EOF
end

def attributes(element)
element.attributes.map { |name, attribute| [name, attribute.value] }.to_h
end
end

0 comments on commit b903745

Please sign in to comment.