Skip to content

Commit

Permalink
fix(lib): improve head and body regex in domparser.js
Browse files Browse the repository at this point in the history
Because the head and body regexes test for the closing tag,
HTML with an unclosed head or body element is not parsed
correctly.

For example, given the following:

```js
parse('<html><body>');
```

The expected output is:

```
[ { type: 'tag',
    name: 'html',
    attribs: {},
    children:
     [ { type: 'tag',
         name: 'body',
         attribs: {},
         children: [],
         next: null,
         prev: null,
         parent: [Circular] } ],
    next: null,
    prev: null,
    parent: null } ]
```

But the actual output is:

```
[
  {
    "next": null,
    "prev": null,
    "parent": null,
    "name": "html",
    "attribs": {},
    "type": "tag",
    "children": []
  }
]
```

The fix is to update both regexes to match the opening tag instead
of the closing tag.

Add test cases for unclosed html, head, and body tags.

Fixes #18
  • Loading branch information
remarkablemark committed Nov 4, 2019
1 parent c2665b4 commit 457bb58
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 4 deletions.
9 changes: 5 additions & 4 deletions lib/domparser.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ var HTML_TAG_NAME = 'html';
var BODY_TAG_NAME = 'body';
var HEAD_TAG_NAME = 'head';
var FIRST_TAG_REGEX = /<([a-zA-Z]+[0-9]?)/; // e.g., <h1>
var HEAD_REGEX = /<\/head>/i;
var BODY_REGEX = /<\/body>/i;
var HEAD_REGEX = /<head.*>/i;
var BODY_REGEX = /<body.*>/i;
// http://www.w3.org/TR/html/syntax.html#void-elements
var VOID_ELEMENTS_REGEX = /<(area|base|br|col|embed|hr|img|input|keygen|link|menuitem|meta|param|source|track|wbr)(.*?)\/?>/gi;

Expand Down Expand Up @@ -36,7 +36,7 @@ if (typeof window.DOMParser === 'function') {
*/
parseFromString = function domStringParser(html, tagName) {
if (tagName) {
html = ['<', tagName, '>', html, '</', tagName, '>'].join('');
html = '<' + tagName + '>' + html + '</' + tagName + '>';
}
// because IE9 only supports MIME type 'text/xml', void elements need to be self-closed
if (isIE9) {
Expand Down Expand Up @@ -117,6 +117,7 @@ function domparser(html) {
// try to match first tag
var tagName;
var match = html.match(FIRST_TAG_REGEX);

if (match && match[1]) {
tagName = match[1].toLowerCase();
}
Expand All @@ -131,7 +132,7 @@ function domparser(html) {
doc = parseFromString(html);

// the created document may come with filler head/body elements,
// so ake sure to remove them if they don't actually exist
// so make sure to remove them if they don't actually exist
if (!HEAD_REGEX.test(html)) {
element = doc.getElementsByTagName(HEAD_TAG_NAME)[0];
if (element) element.parentNode.removeChild(element);
Expand Down
15 changes: 15 additions & 0 deletions test/cases/html.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,21 @@ module.exports = [
data: '<html><body></body></html>',
skip: isPhantomJS
},
{
name: 'unclosed html and head tags',
data: '<html><head>',
skip: isPhantomJS
},
{
name: 'unclosed html and body tags',
data: '<html><body>',
skip: isPhantomJS
},
{
name: 'unclosed html, head, and body tags',
data: '<html><head><body>',
skip: isPhantomJS
},

{
name: 'empty head',
Expand Down

0 comments on commit 457bb58

Please sign in to comment.