diff --git a/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java b/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java
index 60af0a09fd..5713c6aeef 100644
--- a/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java
+++ b/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java
@@ -61,11 +61,16 @@ public void fatalError(SAXParseException ex) throws SAXException {
         // found in the prolog, instead it will keep calling this method and we'll
         // keep inserting the error in the document errors array until we run
         // out of memory
+        errors.add(ex);
         String message = ex.getMessage();
-        if (message != null && message.toLowerCase().contains("in prolog")) {
+
+        // The problem with Xerces is that some errors will cause the
+        // parser not to advance the reader and it will keep reporting
+        // the same error over and over, which will cause the parser
+        // to enter an infinite loop unless we throw the exception.
+        if (message != null && isFatal(message)) {
             throw ex;
         }
-        errors.add(ex);
     }
 
     public void error(String domain, String key, XMLParseException e) {
@@ -80,4 +85,17 @@ public void warning(String domain, String key, XMLParseException e) {
         errors.add(e);
     }
 
+    /*
+     * Determine whether this is a fatal error that should cause
+     * the parsing to stop, or an error that can be ignored.
+     *
+     * The check is a case-insensitive substring match on the message.
+     * Lowercasing uses Locale.ROOT so the outcome does not depend on
+     * the JVM's default locale (e.g. the Turkish dotless-i mapping).
+     */
+    private static boolean isFatal(String msg) {
+        String lowered = msg.toLowerCase(java.util.Locale.ROOT);
+        return lowered.contains("in prolog") ||
+            lowered.contains("preceding the root element must be well-formed");
+    }
 }
diff --git a/test/xml/test_document.rb b/test/xml/test_document.rb
index 120b3b3c46..97e7f40027 100644
--- a/test/xml/test_document.rb
+++ b/test/xml/test_document.rb
@@ -519,6 +519,12 @@ def test_encoding
         end
       end
 
+      def test_memory_explosion_on_invalid_xml
+        doc = Nokogiri::XML("<<<")
+        refute_nil doc
+        refute_empty doc.errors
+      end
+
       def test_document_has_errors
         doc = Nokogiri::XML(<<-eoxml)