Skip to content

Commit

Permalink
Merge pull request #2303 from sparklemotion/2302-c99-fixes
Browse files Browse the repository at this point in the history
c99 fixes

---

**What problem is this PR intended to solve?**

#2302 reports a compilation failure on RHEL6, RHEL7, and SLES12. These are older distros (RHEL7 is running gcc 4.8) which seem to compile in a C90-ish mode by default that gives errors on C99-isms.

This PR removed the C99isms (which are errors) as well as some C90isms (which generate warnings). Specifically:

- don't declare a variable in a `for` statement (C99-ism)
- don't mix declarations and code (C90-ism)

This PR also cleans up some C casting related to how we're handling `const` qualifiers. Specifically:

- `gumbo.c` is now using `(const xmlChar *)` in place of `BAD_CAST` (which is an alias for `(xmlChar *)`)
- extracted the const-discard hack `(xmlChar *)(uintptr_t)ptr` into a macro `DISCARD_CONST_QUAL`
- introduce use of `DISCARD_CONST_QUAL` in one additional place in `xml_node.c`

As a result, even when using `-Wcast-qual` on older Rubies that support it, we now see no compiler warnings.

**Have you included adequate test coverage?**

I considered setting up CI using a comparable Centos docker image, but I didn't feel like there was much regression value to that test, given the nature of this failure. I could be argued into doing it, if anyone feels strongly.
  • Loading branch information
flavorjones committed Aug 6, 2021
2 parents 5ce2014 + 4add278 commit efe8c0a
Show file tree
Hide file tree
Showing 9 changed files with 119 additions and 69 deletions.
11 changes: 9 additions & 2 deletions CHANGELOG.md
Expand Up @@ -4,18 +4,25 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA

---

## 1.12.3 / unreleased

### Fixed

* [CRuby] Fix compilation of libgumbo on older systems with versions of GCC that give errors on C99-isms. Affected systems include RHEL6, RHEL7, and SLES12. [[#2302](https://github.com/sparklemotion/nokogiri/issues/2302)]


## 1.12.2 / 2021-08-04

### Fixed

* Ensure that C extension files in non-native gem installations are loaded using `require` and rely on `$LOAD_PATH` instead of using `require_relative`. This issue only exists when deleting shared libraries that exist outside the extensions directory, something users occasionally do to conserve disk space. [[#2300](https://github.com/sparklemotion/nokogiri/issues/2300)]
* [CRuby] Ensure that C extension files in non-native gem installations are loaded using `require` and rely on `$LOAD_PATH` instead of using `require_relative`. This issue only exists when deleting shared libraries that exist outside the extensions directory, something users occasionally do to conserve disk space. [[#2300](https://github.com/sparklemotion/nokogiri/issues/2300)]


## 1.12.1 / 2021-08-03

### Fixed

* Fix compilation of libgumbo on BSD systems by avoiding GNU-isms. [[#2298](https://github.com/sparklemotion/nokogiri/issues/2298)]
* [CRuby] Fix compilation of libgumbo on BSD systems by avoiding GNU-isms. [[#2298](https://github.com/sparklemotion/nokogiri/issues/2298)]


## 1.12.0 / 2021-08-02
Expand Down
157 changes: 98 additions & 59 deletions ext/nokogiri/gumbo.c
Expand Up @@ -75,7 +75,7 @@ new_html_doc(const char *dtd_name, const char *system, const char *public)
htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
assert(doc);
if (dtd_name) {
xmlCreateIntSubset(doc, BAD_CAST dtd_name, BAD_CAST public, BAD_CAST system);
xmlCreateIntSubset(doc, (const xmlChar *)dtd_name, (const xmlChar *)public, (const xmlChar *)system);
}
return doc;
}
Expand All @@ -89,15 +89,19 @@ get_parent(xmlNodePtr node)
static GumboOutput *
perform_parse(const GumboOptions *options, VALUE input)
{
GumboOutput *output;
const char *status_string;

assert(RTEST(input));
Check_Type(input, T_STRING);
GumboOutput *output = gumbo_parse_with_options(
options,
RSTRING_PTR(input),
RSTRING_LEN(input)
);

const char *status_string = gumbo_status_to_string(output->status);
output = gumbo_parse_with_options(
options,
RSTRING_PTR(input),
RSTRING_LEN(input)
);

status_string = gumbo_status_to_string(output->status);
switch (output->status) {
case GUMBO_STATUS_OK:
break;
Expand All @@ -120,11 +124,11 @@ lookup_or_add_ns(
const char *prefix
)
{
xmlNsPtr ns = xmlSearchNs(doc, root, BAD_CAST prefix);
xmlNsPtr ns = xmlSearchNs(doc, root, (const xmlChar *)prefix);
if (ns) {
return ns;
}
return xmlNewNs(root, BAD_CAST href, BAD_CAST prefix);
return xmlNewNs(root, (const xmlChar *)href, (const xmlChar *)prefix);
}

static void
Expand All @@ -151,9 +155,13 @@ build_tree(
size_t child_index = 0;

while (true) {
const GumboVector *children;
const GumboNode *gumbo_child;
xmlNodePtr xml_child;

assert(gumbo_node != NULL);
const GumboVector *children = gumbo_node->type == GUMBO_NODE_DOCUMENT ?
&gumbo_node->v.document.children : &gumbo_node->v.element.children;
children = gumbo_node->type == GUMBO_NODE_DOCUMENT ?
&gumbo_node->v.document.children : &gumbo_node->v.element.children;
if (child_index >= children->length) {
// Move up the tree and to the next child.
if (xml_node == xml_output_node) {
Expand All @@ -172,42 +180,44 @@ build_tree(
}
continue;
}
const GumboNode *gumbo_child = children->data[child_index++];
xmlNodePtr xml_child;

gumbo_child = children->data[child_index++];
switch (gumbo_child->type) {
case GUMBO_NODE_DOCUMENT:
abort(); // Bug in Gumbo.

case GUMBO_NODE_TEXT:
case GUMBO_NODE_WHITESPACE:
xml_child = xmlNewDocText(doc, BAD_CAST gumbo_child->v.text.text);
xml_child = xmlNewDocText(doc, (const xmlChar *)gumbo_child->v.text.text);
set_line(xml_child, gumbo_child->v.text.start_pos.line);
xmlAddChild(xml_node, xml_child);
break;

case GUMBO_NODE_CDATA:
xml_child = xmlNewCDataBlock(doc, BAD_CAST gumbo_child->v.text.text,
xml_child = xmlNewCDataBlock(doc, (const xmlChar *)gumbo_child->v.text.text,
(int) strlen(gumbo_child->v.text.text));
set_line(xml_child, gumbo_child->v.text.start_pos.line);
xmlAddChild(xml_node, xml_child);
break;

case GUMBO_NODE_COMMENT:
xml_child = xmlNewDocComment(doc, BAD_CAST gumbo_child->v.text.text);
xml_child = xmlNewDocComment(doc, (const xmlChar *)gumbo_child->v.text.text);
set_line(xml_child, gumbo_child->v.text.start_pos.line);
xmlAddChild(xml_node, xml_child);
break;

case GUMBO_NODE_TEMPLATE:
// XXX: Should create a template element and a new DocumentFragment
case GUMBO_NODE_ELEMENT: {
xml_child = xmlNewDocNode(doc, NULL, BAD_CAST gumbo_child->v.element.name, NULL);
xmlNsPtr ns = NULL;
const GumboVector *attrs;
size_t i;

xml_child = xmlNewDocNode(doc, NULL, (const xmlChar *)gumbo_child->v.element.name, NULL);
set_line(xml_child, gumbo_child->v.element.start_pos.line);
if (xml_root == NULL) {
xml_root = xml_child;
}
xmlNsPtr ns = NULL;
switch (gumbo_child->v.element.tag_namespace) {
case GUMBO_NAMESPACE_HTML:
break;
Expand All @@ -224,8 +234,8 @@ build_tree(
xmlAddChild(xml_node, xml_child);

// Add the attributes.
const GumboVector *attrs = &gumbo_child->v.element.attributes;
for (size_t i = 0; i < attrs->length; i++) {
attrs = &gumbo_child->v.element.attributes;
for (i = 0; i < attrs->length; i++) {
const GumboAttribute *attr = attrs->data[i];

switch (attr->attr_namespace) {
Expand All @@ -244,7 +254,7 @@ build_tree(
default:
ns = NULL;
}
xmlNewNsProp(xml_child, ns, BAD_CAST attr->name, BAD_CAST attr->value);
xmlNewNsProp(xml_child, ns, (const xmlChar *)attr->name, (const xmlChar *)attr->value);
}

// Add children for this element.
Expand All @@ -264,19 +274,29 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)

// Add parse errors to rdoc.
if (output->errors.length) {
size_t i;
const GumboVector *errors = &output->errors;
VALUE rerrors = rb_ary_new2(errors->length);

for (size_t i = 0; i < errors->length; i++) {
GumboError *err = errors->data[i];
GumboSourcePosition position = gumbo_error_position(err);
for (i = 0; i < errors->length; i++) {
GumboError *err;
GumboSourcePosition position;
char *msg;
size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
VALUE err_str = rb_utf8_str_new(msg, size);
size_t size;
VALUE err_str;
VALUE syntax_error;
const char *error_code;
VALUE str1;

err = errors->data[i];
position = gumbo_error_position(err);
size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
err_str = rb_utf8_str_new(msg, size);
free(msg);
VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
const char *error_code = gumbo_error_code(err);
VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil;
syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
error_code = gumbo_error_code(err);
str1 = error_code ? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil;

rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
Expand All @@ -303,7 +323,7 @@ typedef struct {
static VALUE
parse_cleanup(VALUE parse_args)
{
ParseArgs *args = (ParseArgs*)parse_args;
ParseArgs *args = (ParseArgs *)parse_args;
gumbo_destroy_output(args->output);
// Make sure garbage collection doesn't mark the objects as being live based
// on references from the ParseArgs. This may be unnecessary.
Expand All @@ -323,28 +343,32 @@ static VALUE parse_continue(VALUE parse_args);
static VALUE
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth)
{
GumboOutput *output;
GumboOptions options = kGumboDefaultOptions;
ParseArgs args;

options.max_attributes = NUM2INT(max_attributes);
options.max_errors = NUM2INT(max_errors);
options.max_tree_depth = NUM2INT(max_depth);

GumboOutput *output = perform_parse(&options, input);
ParseArgs args = {
.output = output,
.input = input,
.url_or_frag = url,
.doc = NULL,
};
output = perform_parse(&options, input);

args.output = output;
args.input = input;
args.url_or_frag = url;
args.doc = NULL;

return rb_ensure(parse_continue, (VALUE)(&args), parse_cleanup, (VALUE)(&args));
}

static VALUE
parse_continue(VALUE parse_args)
{
ParseArgs *args = (ParseArgs*)parse_args;
ParseArgs *args = (ParseArgs *)parse_args;
GumboOutput *output = args->output;
xmlDocPtr doc;
VALUE rdoc;

if (output->document->v.document.has_doctype) {
const char *name = output->document->v.document.name;
const char *public = output->document->v.document.public_identifier;
Expand All @@ -357,7 +381,7 @@ parse_continue(VALUE parse_args)
}
args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
build_tree(doc, (xmlNodePtr)doc, output->document);
VALUE rdoc = Nokogiri_wrap_xml_document(cNokogiriHtml5Document, doc);
rdoc = Nokogiri_wrap_xml_document(cNokogiriHtml5Document, doc);
args->doc = NULL; // The Ruby runtime now owns doc so don't delete it.
add_errors(output, rdoc, args->input, args->url_or_frag);
return rdoc;
Expand All @@ -366,10 +390,15 @@ parse_continue(VALUE parse_args)
static int
lookup_namespace(VALUE node, bool require_known_ns)
{
VALUE ns;
ID namespace, href;
const char *href_ptr;
size_t href_len;

CONST_ID(namespace, "namespace");
CONST_ID(href, "href");
VALUE ns = rb_funcall(node, namespace, 0);

ns = rb_funcall(node, namespace, 0);

if (NIL_P(ns)) {
return GUMBO_NAMESPACE_HTML;
Expand All @@ -378,8 +407,8 @@ lookup_namespace(VALUE node, bool require_known_ns)
assert(RTEST(ns));
Check_Type(ns, T_STRING);

const char *href_ptr = RSTRING_PTR(ns);
size_t href_len = RSTRING_LEN(ns);
href_ptr = RSTRING_PTR(ns);
href_len = RSTRING_LEN(ns);
#define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len))
if (NAMESPACE_P("http://www.w3.org/1999/xhtml")) {
return GUMBO_NAMESPACE_HTML;
Expand Down Expand Up @@ -427,15 +456,23 @@ fragment(
GumboQuirksModeEnum quirks_mode;
bool form = false;
const char *encoding = NULL;
VALUE doc, dtd;
int depth;
GumboOptions options = kGumboDefaultOptions;
GumboOutput *output;
ParseArgs args;

if (NIL_P(ctx)) {
ctx_tag = "body";
ctx_ns = GUMBO_NAMESPACE_HTML;
} else if (TYPE(ctx) == T_STRING) {
size_t len;
const char *colon;

ctx_tag = StringValueCStr(ctx);
ctx_ns = GUMBO_NAMESPACE_HTML;
size_t len = RSTRING_LEN(ctx);
const char *colon = memchr(ctx_tag, ':', len);
len = RSTRING_LEN(ctx);
colon = memchr(ctx_tag, ':', len);
if (colon) {
switch (colon - ctx_tag) {
case 3:
Expand Down Expand Up @@ -470,6 +507,7 @@ fragment(
// Check if it's a form.
form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0;
} else {
VALUE node, element_name;
ID element_ = rb_intern_const("element?");

// Context fragment name.
Expand All @@ -482,13 +520,13 @@ fragment(
ctx_ns = lookup_namespace(ctx, true);

// Check for a form ancestor, including self.
for (VALUE node = ctx;
for (node = ctx;
!NIL_P(node);
node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) {
if (!RTEST(rb_funcall(node, element_, 0))) {
continue;
}
VALUE element_name = rb_funcall(node, name, 0);
element_name = rb_funcall(node, name, 0);
if (RSTRING_LEN(element_name) == 4
&& !st_strcasecmp(RSTRING_PTR(element_name), "form")
&& lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) {
Expand All @@ -510,8 +548,8 @@ fragment(
}

// Quirks mode.
VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
VALUE dtd = rb_funcall(doc, internal_subset, 0);
doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
dtd = rb_funcall(doc, internal_subset, 0);
if (NIL_P(dtd)) {
quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
} else {
Expand All @@ -526,8 +564,8 @@ fragment(
}

// Perform a fragment parse.
int depth = NUM2INT(max_depth);
GumboOptions options = kGumboDefaultOptions;
depth = NUM2INT(max_depth);
options = kGumboDefaultOptions;
options.max_attributes = NUM2INT(max_attributes);
options.max_errors = NUM2INT(max_errors);
// Add one to account for the HTML element.
Expand All @@ -538,27 +576,28 @@ fragment(
options.quirks_mode = quirks_mode;
options.fragment_context_has_form_ancestor = form;

GumboOutput *output = perform_parse(&options, tags);
ParseArgs args = {
.output = output,
.input = tags,
.url_or_frag = doc_fragment,
.doc = (xmlDocPtr)extract_xml_node(doc),
};
output = perform_parse(&options, tags);

args.output = output;
args.input = tags;
args.url_or_frag = doc_fragment;
args.doc = (xmlDocPtr)extract_xml_node(doc);

rb_ensure(fragment_continue, (VALUE)(&args), parse_cleanup, (VALUE)(&args));
return Qnil;
}

static VALUE
fragment_continue(VALUE parse_args)
{
ParseArgs *args = (ParseArgs*)parse_args;
ParseArgs *args = (ParseArgs *)parse_args;
GumboOutput *output = args->output;
VALUE doc_fragment = args->url_or_frag;
xmlDocPtr xml_doc = args->doc;
xmlNodePtr xml_frag;

args->doc = NULL; // The Ruby runtime owns doc so make sure we don't delete it.
xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
xml_frag = extract_xml_node(doc_fragment);
build_tree(xml_doc, xml_frag, output->root);
add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
return Qnil;
Expand Down

0 comments on commit efe8c0a

Please sign in to comment.