Skip to content

Commit

Permalink
Merge pull request matthewmueller#2 from siddMahen/multi-selector
Browse files Browse the repository at this point in the history
* Adds support for multiple attribute selectors
  Example: soupselect.select(dom, "[color][type='thingy']");

* Adds support for multiple selectors
  Example: soupselect.select(dom, "tag.class, div, #main");
  • Loading branch information
matthewmueller committed Dec 24, 2011
2 parents fd62f57 + 6617a60 commit 9fd8461
Show file tree
Hide file tree
Showing 8 changed files with 347 additions and 117 deletions.
6 changes: 6 additions & 0 deletions README.md
Expand Up @@ -6,6 +6,12 @@ A fork of harryf's [node-soupselect](http://github.com/harryf/node-soupselect),

$ npm install cheerio-soupselect

## Testing

From the root folder:

$ mocha -u tdd -R list

## Why not just use node-soupselect?

Since soupselect hasn't been updated for 9 months, I decided to fork it and continue development in the context of cheerio. I plan to keep it stand-alone from cheerio, so you will be able to use it without using cheerio.
Expand Down
193 changes: 107 additions & 86 deletions lib/soupselect.js
@@ -1,30 +1,39 @@
/**
Port of Simon Willison's Soup Select http://code.google.com/p/soupselect/
http://www.opensource.org/licenses/mit-license.php
MIT licensed http://www.opensource.org/licenses/mit-license.php
*/
/*
* Port of Simon Willison's Soup Select http://code.google.com/p/soupselect/
* http://www.opensource.org/licenses/mit-license.php
*
* MIT licensed http://www.opensource.org/licenses/mit-license.php
*/

var domUtils = require("htmlparser2").DomUtils;

var tagRe = /^[a-z0-9]+$/;

/*
/^(\w+)?\[(\w+)([=~\|\^\$\*]?)=?["']?([^\]"']*)["']?\]$/
\---/ \---/ \-------------/ \--------/
| | | |
| | | The value
| | ~,|,^,$,* or =
| Attribute
Tag
*/
var attrSelectRe = /^(\w+)?\[(\w+)([=~\|\^\$\*]?)=?["']?([^\]"']*)["']?\]$/;
Selecting the tag and attribute(s):
/^([a-z0-9_]+)?((?:\[(?:[a-z0-9_-]+)(?:[=~\|\^\$\*]?)=?["']?(?:[^\]"']*)["']?\])+)?$/;
\------/
|
Tag
/**
Takes an operator and a value and returns a function which can be used to
test other values against test provided value using the given operation
Used to checking attribute values for attribute selectors
Parsing the attribute(s):
/\[([a-z0-9_-]+)([=~\|\^\$\*]?)=?["']?([^\]"']*)["']?\]/g;
\--------/ \----------/ \------/
| | |
| | Value
| ~,|,^,$,* or =
Attribute
*/

// Matches one whole selector token, anchored at both ends:
//   capture 1 - optional tag name (lowercase letters, digits, underscore)
//   capture 2 - optional run of one or more "[attr<op>value]" groups, kept as
//               a single string to be taken apart by attrParseRe
// Both captures are optional, so a bare tag name (and the empty string) also
// match. There is no "i" flag, so uppercase tag/attribute names do not match.
var attrSelectRe = /^([a-z0-9_]+)?((?:\[(?:[a-z0-9_-]+)(?:[=~\|\^\$\*]?)=?["']?(?:[^\]"']*)["']?\])+)?$/;
// Matches a single "[attr<op>value]" group:
//   capture 1 - attribute name
//   capture 2 - optional operator prefix: one of = ~ | ^ $ *
//   capture 3 - value, with surrounding quotes (if any) excluded
// The "g" flag makes exec() stateful via lastIndex: it is meant to be called
// in a loop until it returns null, which also resets lastIndex to 0.
var attrParseRe = /\[([a-z0-9_-]+)([=~\|\^\$\*]?)=?["']?([^\]"']*)["']?\]/g;

/*
* Takes an operator and a value and returns a function which can be used to
* test other values against the provided value using the given operation.
* Used for checking attribute values for attribute selectors.
*/

function makeValueChecker(operator, value) {
value = typeof(value) === 'string' ? value : '';

Expand All @@ -46,40 +55,60 @@ function makeValueChecker(operator, value) {

}

/**
Takes a dom tree or part of one from htmlparser and applies
the provided selector against. The returned value is also
a valid dom tree, so can be passed by into
htmlparser.DomUtil.* calls
*/
exports.select = function(dom, selector) {
var currentContext = [dom];
/*
* select()'s real implementation
*/

var _select = function(dom, selector) {
var currentContext = Array.isArray(dom) ? dom : [dom];
var found, tag, options;

var tokens = selector.split(/\s+/);


// This allows requests like "#main [class='main post']" without splitting on
// the space between 'main' and 'post'
var tokens = selector.split(/(\[.*?\]|\S*)/).filter(function(val){
return val.replace(/\s*/, "").length ? true : false;
});

for ( var i = 0; i < tokens.length; i++ ) {

// Attribute selectors
// Attribute and Tag selectors
var match = attrSelectRe.exec(tokens[i]);
if ( match ) {
var attribute = match[2], operator = match[3], value = match[4];
tag = match[1];
options = {};
options[attribute] = makeValueChecker(operator, value);

var tag = match[1], attributes = match[2];

found = [];
for (var j = 0; j < currentContext.length; j++ ) {
found = found.concat(domUtils.getElements(options, currentContext[j]));
};

if ( tag ) {
// Filter to only those matching the tag name
found = domUtils.getElements({ 'tag_name': tag }, found, false);
currentContext.forEach(function(ctx){
found = found.concat(domUtils.getElements({ 'tag_name': tag }, ctx, true));
});

currentContext = found;
}


if ( attributes ) {
// Further refine based on attributes
var attrmatch;
while(attrmatch = attrParseRe.exec(attributes)){
var attr = attrmatch[1],
operator = attrmatch[2],
value = attrmatch[3];

options = {};
options[attr] = makeValueChecker(operator, value);

found = [];
currentContext.forEach(function(ctx){
// Don't want any recursion if we're already in the set of tags which have
// the desired tag name
found = found.concat(domUtils.getElements(options, ctx, (tag ? false : true)));
});

currentContext = found;
}
}

currentContext = found;

}

// ID selector
Expand All @@ -89,22 +118,7 @@ exports.select = function(dom, selector) {
var id_selector = tokens[i].split('#', 2)[1];

// need to stop on the first id found (in bad HTML)...
var el = null;
for ( var k = 0; k < currentContext.length; k++ ) {

// the document has no child elements but tags do so we search children to avoid
// returning the current element via a false positive
if ( typeof currentContext[k].children !== 'undefined' ) {
el = domUtils.getElementById(id_selector, currentContext[k].children);
} else {
el = domUtils.getElementById(id_selector, currentContext[k]);
}

if ( el ) {
found.push(el);
break;
}
}
found.push(domUtils.getElementById(id_selector, currentContext, true));

if (!found[0]) {
currentContext = [];
Expand All @@ -119,26 +133,28 @@ exports.select = function(dom, selector) {
var parts = tokens[i].split('.');
tag = parts[0];
options = {};

options['class'] = function (value) {
if (!value) return false;

var classes = value.split(/\s+/);
for (var i = 1, len = parts.length; i < len; i++) {
if (!~classes.indexOf(parts[i])) return false;
if (classes.indexOf(parts[i]) == -1) return false;
}

return true;
};

found = [];
for ( var l = 0; l < currentContext.length; l++ ) {
var context = currentContext[l];
if ( tag.length > 0 ) {
context = domUtils.getElementsByTagName(tag, context);
context = domUtils.getElementsByTagName(tag, context, true);
// don't recurse in the case we have a tag or we get children we might not want
found = found.concat(domUtils.getElements(options, context, false));
} else {
found = found.concat(domUtils.getElements(options, context));
}

found = found.concat(domUtils.getElements(options, context, true));
}
};

currentContext = found;
Expand All @@ -148,28 +164,33 @@ exports.select = function(dom, selector) {
else if ( tokens[i] === '*' ) {
// nothing to do right?
}

// Tag selector
else {
if (!tagRe.test(tokens[i])) {
currentContext = [];
break;
}

found = [];
for ( var m = 0; m < currentContext.length; m++ ) {
// htmlparsers document itself has no child property - only nodes do...
if ( typeof currentContext[m].children !== 'undefined' ) {
found = found.concat(domUtils.getElementsByTagName(tokens[i], currentContext[m].children));
} else if (i === 0) {
found = found.concat(domUtils.getElementsByTagName(tokens[i], currentContext[m]));
}

};
// Nothing matches
else{

currentContext = found;
currentContext = [];
break;
}
};

return currentContext;

return currentContext;
};

/*
* Takes a dom tree or part of one from htmlparser and applies
* the provided selector against. The returned value is also
* a valid dom tree, so can be passed by into
* htmlparser.DomUtil.* calls
*/

exports.select = function(dom, selector){

var subselects = selector.split(/(?:\s*)?,(?:\s*)?/),
ctxs = [];

subselects.forEach(function(sub){
ctxs = ctxs.concat(_select(dom, sub));
});

return ctxs;
};
8 changes: 7 additions & 1 deletion package.json
Expand Up @@ -3,16 +3,22 @@
"version": "0.0.3",
"engines": {
"node": ">=0.2.0"
},
},
"author": "Matt Mueller <mattmuelle@gmail.com>",
"url": "http://github.com/harryf/node-soupselect",
"dependencies": {
"htmlparser2": "2.x"
},
"devDependencies": {
"mocha": "0.x"
},
"repository" : [
{ "type":"git", "url":"git://github.com/harryf/node-soupselect.git" }
],
"main": "./lib/soupselect",
"scripts": {
"test": "mocha -u tdd -R list"
},
"license": "MIT",
"description": "Adds CSS selector support to htmlparser for scraping activities - port of soupselect (python)"
}

0 comments on commit 9fd8461

Please sign in to comment.