Permalink
Browse files

Merge pull request #2 from siddMahen/multi-selector

* Adds support for multiple attribute selectors
  Example: soupselect.select(dom, "[color][type='thingy']");

* Adds support for multiple selectors
  Example: soupselect.select(dom, "tag.class, div, #main");
  • Loading branch information...
2 parents fd62f57 + 6617a60 commit 9fd84617755f38960e2ee8ad03b196d889e9ab9a @matthewmueller matthewmueller committed Dec 24, 2011
Showing with 347 additions and 117 deletions.
  1. +6 −0 README.md
  2. +107 −86 lib/soupselect.js
  3. +7 −1 package.json
  4. +109 −0 test/dom.js
  5. +39 −11 test/index.html
  6. +56 −0 test/test-basic.js
  7. +23 −0 test/test-regress-01.js
  8. +0 −19 test/test.coffee
View
@@ -6,6 +6,12 @@ A fork of harryf's [node-soupselect](http://github.com/harryf/node-soupselect),
$ npm install cheerio-soupselect
+## Testing
+
+From the root folder:
+
+ $ mocha -u tdd -R list
+
## Why not just use node-soupselect?
Since soupselect hasn't been updated for 9 months, I decided to fork it and continue development in the context of cheerio. I plan to keep it stand-alone from cheerio, so you will be able to use it without using cheerio.
View
@@ -1,30 +1,39 @@
-/**
-Port of Simon Willison's Soup Select http://code.google.com/p/soupselect/
-http://www.opensource.org/licenses/mit-license.php
-
-MIT licensed http://www.opensource.org/licenses/mit-license.php
-*/
+/*
+ * Port of Simon Willison's Soup Select http://code.google.com/p/soupselect/
+ * http://www.opensource.org/licenses/mit-license.php
+ *
+ * MIT licensed http://www.opensource.org/licenses/mit-license.php
+ */
var domUtils = require("htmlparser2").DomUtils;
-var tagRe = /^[a-z0-9]+$/;
-
/*
- /^(\w+)?\[(\w+)([=~\|\^\$\*]?)=?["']?([^\]"']*)["']?\]$/
- \---/ \---/ \-------------/ \--------/
- | | | |
- | | | The value
- | | ~,|,^,$,* or =
- | Attribute
- Tag
-*/
-var attrSelectRe = /^(\w+)?\[(\w+)([=~\|\^\$\*]?)=?["']?([^\]"']*)["']?\]$/;
+ Selecting the tag and attribute(s):
+
+   /^([a-z0-9_]+)?((?:\[(?:[a-z0-9_-]+)(?:[=~\|\^\$\*]?)=?["']?(?:[^\]"']*)["']?\])+)?$/;
+      \------/
+          |
+         Tag
-/**
-Takes an operator and a value and returns a function which can be used to
-test other values against test provided value using the given operation
-Used to checking attribute values for attribute selectors
+ Parsing the attribute(s):
+
+   /\[([a-z0-9_-]+)([=~\|\^\$\*]?)=?["']?([^\]"']*)["']?\]/g;
+       \--------/   \----------/          \------/
+           |             |                   |
+           |             |                 Value
+           |       ~,|,^,$,* or =
+       Attribute
*/
+
+var attrSelectRe = /^([a-z0-9_]+)?((?:\[(?:[a-z0-9_-]+)(?:[=~\|\^\$\*]?)=?["']?(?:[^\]"']*)["']?\])+)?$/;
+var attrParseRe = /\[([a-z0-9_-]+)([=~\|\^\$\*]?)=?["']?([^\]"']*)["']?\]/g;
+
+/*
+ * Takes an operator and a value and returns a function which can be used to
+ * test other values against the provided value using the given operation.
+ * Used for checking attribute values in attribute selectors.
+ */
+
function makeValueChecker(operator, value) {
value = typeof(value) === 'string' ? value : '';
@@ -46,40 +55,60 @@ function makeValueChecker(operator, value) {
}
-/**
-Takes a dom tree or part of one from htmlparser and applies
-the provided selector against. The returned value is also
-a valid dom tree, so can be passed by into
-htmlparser.DomUtil.* calls
-*/
-exports.select = function(dom, selector) {
- var currentContext = [dom];
+/*
+ * select()'s real implementation
+ */
+
+var _select = function(dom, selector) {
+ var currentContext = Array.isArray(dom) ? dom : [dom];
var found, tag, options;
-
- var tokens = selector.split(/\s+/);
-
+
+ // This allows requests like "#main [class='main post']" without splitting on
+ // the space between 'main' and 'post'
+ var tokens = selector.split(/(\[.*?\]|\S*)/).filter(function(val){
+ return val.replace(/\s*/, "").length ? true : false;
+ });
+
for ( var i = 0; i < tokens.length; i++ ) {
- // Attribute selectors
+ // Attribute and Tag selectors
var match = attrSelectRe.exec(tokens[i]);
if ( match ) {
- var attribute = match[2], operator = match[3], value = match[4];
- tag = match[1];
- options = {};
- options[attribute] = makeValueChecker(operator, value);
-
+ var tag = match[1], attributes = match[2];
+
found = [];
- for (var j = 0; j < currentContext.length; j++ ) {
- found = found.concat(domUtils.getElements(options, currentContext[j]));
- };
-
if ( tag ) {
// Filter to only those matching the tag name
- found = domUtils.getElements({ 'tag_name': tag }, found, false);
+ currentContext.forEach(function(ctx){
+ found = found.concat(domUtils.getElements({ 'tag_name': tag }, ctx, true));
+ });
+
+ currentContext = found;
}
-
+
+ if ( attributes ) {
+ // Further refine based on attributes
+ var attrmatch;
+ while(attrmatch = attrParseRe.exec(attributes)){
+ var attr = attrmatch[1],
+ operator = attrmatch[2],
+ value = attrmatch[3];
+
+ options = {};
+ options[attr] = makeValueChecker(operator, value);
+
+ found = [];
+ currentContext.forEach(function(ctx){
+ // Don't want any recursion if we're already in the set of tags which have
+ // the desired tag name
+ found = found.concat(domUtils.getElements(options, ctx, (tag ? false : true)));
+ });
+
+ currentContext = found;
+ }
+ }
+
currentContext = found;
-
}
// ID selector
@@ -89,22 +118,7 @@ exports.select = function(dom, selector) {
var id_selector = tokens[i].split('#', 2)[1];
// need to stop on the first id found (in bad HTML)...
- var el = null;
- for ( var k = 0; k < currentContext.length; k++ ) {
-
- // the document has no child elements but tags do so we search children to avoid
- // returning the current element via a false positive
- if ( typeof currentContext[k].children !== 'undefined' ) {
- el = domUtils.getElementById(id_selector, currentContext[k].children);
- } else {
- el = domUtils.getElementById(id_selector, currentContext[k]);
- }
-
- if ( el ) {
- found.push(el);
- break;
- }
- }
+ found.push(domUtils.getElementById(id_selector, currentContext, true));
if (!found[0]) {
currentContext = [];
@@ -119,26 +133,28 @@ exports.select = function(dom, selector) {
var parts = tokens[i].split('.');
tag = parts[0];
options = {};
+
options['class'] = function (value) {
if (!value) return false;
+
var classes = value.split(/\s+/);
for (var i = 1, len = parts.length; i < len; i++) {
- if (!~classes.indexOf(parts[i])) return false;
+ if (classes.indexOf(parts[i]) == -1) return false;
}
+
return true;
};
found = [];
for ( var l = 0; l < currentContext.length; l++ ) {
var context = currentContext[l];
if ( tag.length > 0 ) {
- context = domUtils.getElementsByTagName(tag, context);
+ context = domUtils.getElementsByTagName(tag, context, true);
// don't recurse in the case we have a tag or we get children we might not want
found = found.concat(domUtils.getElements(options, context, false));
} else {
- found = found.concat(domUtils.getElements(options, context));
- }
-
+ found = found.concat(domUtils.getElements(options, context, true));
+ }
};
currentContext = found;
@@ -148,28 +164,33 @@ exports.select = function(dom, selector) {
else if ( tokens[i] === '*' ) {
// nothing to do right?
}
-
- // Tag selector
- else {
- if (!tagRe.test(tokens[i])) {
- currentContext = [];
- break;
- }
-
- found = [];
- for ( var m = 0; m < currentContext.length; m++ ) {
- // htmlparsers document itself has no child property - only nodes do...
- if ( typeof currentContext[m].children !== 'undefined' ) {
- found = found.concat(domUtils.getElementsByTagName(tokens[i], currentContext[m].children));
- } else if (i === 0) {
- found = found.concat(domUtils.getElementsByTagName(tokens[i], currentContext[m]));
- }
- };
+ // Nothing matches
+ else{
- currentContext = found;
+ currentContext = [];
+ break;
}
};
-
- return currentContext;
+
+ return currentContext;
+};
+
+/*
+ * Takes a dom tree or part of one from htmlparser and applies
+ * the provided selector against it. The returned value is also
+ * a valid dom tree, so it can be passed back into
+ * htmlparser.DomUtils.* calls
+ */
+
+exports.select = function(dom, selector){
+
+ var subselects = selector.split(/(?:\s*)?,(?:\s*)?/),
+ ctxs = [];
+
+ subselects.forEach(function(sub){
+ ctxs = ctxs.concat(_select(dom, sub));
+ });
+
+ return ctxs;
};
View
@@ -3,16 +3,22 @@
"version": "0.0.3",
"engines": {
"node": ">=0.2.0"
- },
+ },
"author": "Matt Mueller <mattmuelle@gmail.com>",
"url": "http://github.com/harryf/node-soupselect",
"dependencies": {
"htmlparser2": "2.x"
},
+ "devDependencies": {
+ "mocha": "0.x"
+ },
"repository" : [
{ "type":"git", "url":"git://github.com/harryf/node-soupselect.git" }
],
"main": "./lib/soupselect",
+ "scripts": {
+ "test": "mocha -u tdd -R list"
+ },
"license": "MIT",
"description": "Adds CSS selector support to htmlparser for scraping activities - port of soupselect (python)"
}
Oops, something went wrong.

0 comments on commit 9fd8461

Please sign in to comment.