Commit db36ad6

updated sitecrawl to take options instead of params and noted open issue on jquery extension with jsdom
Roger Castillo committed Jan 17, 2012
1 parent 930f8f5 commit db36ad6
Showing 7 changed files with 37 additions and 25 deletions.
2 changes: 1 addition & 1 deletion examples/hello-crawl.js
@@ -8,7 +8,7 @@ var SiteCrawl = require('../lib/sitecrawl').SiteCrawl;
// create a new SiteCrawl of depth 2 with a delay of 1s between next page
// Note: Webcrawling is delayed and will not be executed
// until Subscription
- var siteCrawl = new SiteCrawl('http://loku.com', 2, 1000);
+ var siteCrawl = new SiteCrawl({url:'http://loku.com'});

// ask for the observable sequence and subscribe for the CrawlResult(s)
siteCrawl.toObservable().Subscribe(function(crawlResult) {
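For reference, a sketch (not from this commit) of the same call with the defaults documented in lib/sitecrawl.js further down in this diff spelled out; with only url supplied, every other option falls back to these values.

var SiteCrawl = require('../lib/sitecrawl').SiteCrawl;

// Sketch only - the one-option call above with the documented defaults written out:
// maxDepth 2, delay 1000 ms, pageTimeout = delay * 5, maxCrawlTime 10000 ms.
var siteCrawl = new SiteCrawl({
  url: 'http://loku.com',
  maxDepth: 2,
  delay: 1000,
  pageTimeout: 5000,
  maxCrawlTime: 10000
});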
17 changes: 12 additions & 5 deletions examples/hello-query-ext.js
@@ -10,11 +10,18 @@ var fs = require('fs');
var ext = fs.readFileSync('./sample-jquery-ext.js').toString();


- // create a new SiteQuery of depth 2 with a delay of 1s between next page crawl
- // selecting for `img` elements on each page
- // Note: Webcrawling is delayed and will not be executed
- // until Subscription
- var siteQuery = new SiteQuery('http://loku.com', 2, 1000, 'img:regex(src,s3)', ext);
+ // jQuery extension example broken due (I think) to jsdom
+ // change...ToDo: Need to fix
+ //######  ######  ####### #    #  ####### #     #
+ //#     # #     # #     # #   #   #       ##    #
+ //#     # #     # #     # #  #    #       # #   #
+ //######  ######  #     # ###     #####   #  #  #
+ //#     # #   #   #     # #  #    #       #   # #
+ //#     # #    #  #     # #   #   #       #    ##
+ //######  #     # ####### #    #  ####### #     #
+
+
+ var siteQuery = new SiteQuery({url:'http://loku.com'}, 'img:regex(src,s3)', ext);

// ask for the observable sequence and subscribe for selected jQuery element(s)
siteQuery.toObservable().Subscribe(function(result) {
4 changes: 3 additions & 1 deletion examples/hello-query.js
@@ -9,7 +9,9 @@ var SiteQuery = require('../lib/sitequery').SiteQuery;
// selecting for `img` elements on each page
// Note: Webcrawling is delayed and will not be executed
// until Subscription
- var siteQuery = new SiteQuery('http://loku.com', 2, 1000, 'img');
+
+ var crawlOpts = {url:'http://loku.com', maxDepth:2, delay:1000, maxCrawlTime: 100000};
+ var siteQuery = new SiteQuery(crawlOpts, 'img');

// ask for the observable sequence and subscribe for selected jQuery element(s)
siteQuery.toObservable().Subscribe(function(result) {
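The Subscribe callback body is collapsed in this diff view; a minimal handler sketch, assuming (per the comment above) that each result is a jQuery-wrapped element matched by the 'img' selector:

// Sketch only - assumes `result` is a jQuery-wrapped <img> element.
siteQuery.toObservable().Subscribe(function (result) {
  console.log(result.attr('src'));
});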
1 change: 1 addition & 0 deletions examples/sample-jquery-ext.js
@@ -1,3 +1,4 @@

window.jQuery.extend(window.jQuery.expr[':'].regex = function(elem, index, match) {
var matchParams = match[3].split(','),
validLabels = /^(data|css):/,
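The extension source is truncated above; for orientation, a sketch of the kind of jQuery 1.7-era ':regex' pseudo-selector it appears to implement, simplified to attribute matching only (the actual body in sample-jquery-ext.js may differ):

// Sketch of a custom :regex pseudo-selector using the pre-1.8 expr[':'] API.
// match[3] carries the selector argument, e.g. 'src,s3' from 'img:regex(src,s3)'.
window.jQuery.expr[':'].regex = function (elem, index, match) {
  var matchParams = match[3].split(','),
      attr = matchParams.shift(),
      regex = new RegExp(matchParams.join(',').replace(/^\s+|\s+$/g, ''));
  return regex.test(window.jQuery(elem).attr(attr));
};
// Once loaded, 'img:regex(src,s3)' selects <img> elements whose src attribute matches /s3/.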
1 change: 1 addition & 0 deletions lib/rxjquery.js
@@ -10,6 +10,7 @@ var _ = require('underscore');
var URL = require('url');
var fs = require('fs');


var jquery = fs.readFileSync(__dirname + '/jquery-1.7.1.min.js').toString();
/**
* Creates an observable sequence of jQuery elements generated by selector on
27 changes: 14 additions & 13 deletions lib/sitecrawl.js
@@ -87,27 +87,28 @@ exports.crawlStep = crawlStep;
* Object that represents a breadth-first traversal of a site, filtered
* for robots.txt, up to n levels deep
* @constructor
- * @param {String} url
- * @param {Integer} maxDepth
- * @param {Integer} delay time between request to the site
- * @param {Integer} pageTimeout [default 5x delay] maximum time the crawl will wait for new pages
- * @param {Integer} [maxCrawlTime] Length of SiteCrawl run in milliseconds, default 60s
+ * @param {Object} crawlOpts
+ url, root URL
+ [maxDepth], depth from root to crawl - default 2
+ [delay], (ms) between hits to site - default 1s,
+ [pageTimeout], (ms) max time to wait for new page default 5s,
+ [maxCrawlTime], (ms) - default 10s
*/
- var SiteCrawl = function (url, maxDepth, delay, pageTimeout, maxCrawlTime) {
- self = this;
- this.url = URL.parse(url);
+ var SiteCrawl = function (crawlOpts) {
+ var self = this;
+ this.url = URL.parse(crawlOpts.url);

// setup the robots parser
- robotsParser.setUrl(URL.resolve(url, '/robots.txt'), function(parser, success) {
+ robotsParser.setUrl(URL.resolve(crawlOpts.url, '/robots.txt'), function(parser, success) {
if (success) {
self.robotsParser = parser;
}
});

- this.maxDepth = maxDepth
- this.delay = delay;
- this.pageTimeout = pageTimeout || (delay * 5);
- this.maxCrawTime = maxCrawlTime || 60000;
+ this.maxDepth = crawlOpts.maxDepth || 2;
+ this.delay = crawlOpts.delay || 1000;
+ this.pageTimeout = crawlOpts.pageTimeout || (crawlOpts.delay * 5);
+ this.maxCrawTime = crawlOpts.maxCrawlTime || 10000;
this.instanceKey = 'sitecrawl-' + Math.floor(Math.random() * 100000);
this.crawling = false;
this.crawlQueue = [];
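One detail of the new defaults: pageTimeout falls back on crawlOpts.delay rather than on the already-resolved this.delay, so omitting both delay and pageTimeout yields NaN instead of the documented 5 s. A sketch (not from this commit) of how the fallback behaves when delay is resolved first:

// Sketch only - resolving delay first keeps the "delay * 5" fallback meaningful.
var crawlOpts = {url: 'http://loku.com'};                // delay and pageTimeout omitted
var delay = crawlOpts.delay || 1000;                     // 1000
var pageTimeout = crawlOpts.pageTimeout || (delay * 5);  // 5000 rather than NaN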
10 changes: 5 additions & 5 deletions lib/sitequery.js
@@ -18,14 +18,14 @@ var SiteCrawl = require('./sitecrawl').SiteCrawl;
/**
* SiteQuery creates an observable sequence of jQuery element(s)
* @constructor
- * @param {String} url
- * @param {Integer} maxDepth
- * @param {Integer} delay
+ *
+ * @param {Object} crawlOpts url, maxDepth, delay, pageTimeout, maxCrawlTime
* @param {String} selector
* @param {Buffer} ext loaded js src of jQuery ext
* @todo Make crawl options and opt struct
*/
- var SiteQuery = function (url, maxDepth, delay, selector, ext) {
- this.siteCrawl = new SiteCrawl(url, maxDepth, delay);
+ var SiteQuery = function (crawlOpts, selector, ext) {
+ this.siteCrawl = new SiteCrawl(crawlOpts);
this.ext = ext;
this.selector = selector;
}
