Skip to content

Commit

Permalink
generalized outlink sequences, improved polite crawling semantics and …
Browse files Browse the repository at this point in the history
…cleaned-up examples
  • Loading branch information
Roger Castillo authored and Roger Castillo committed Mar 23, 2012
1 parent 1340eb0 commit 4f8b1b2
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 62 deletions.
13 changes: 11 additions & 2 deletions examples/crawl-sequence.js
Expand Up @@ -59,7 +59,15 @@ var hrefArray = ['http://google.com',
'http://bing.com',
'http://loku.com',
'http://bing.com',
'http://loku.com'];
'http://loku.com',
'http://loku.com',
'http://loku.com',
'http://loku.com',
'http://loku.com',
'http://loku.com',
'http://loku.com'

];


// convert the array of hrefs to URL objects
Expand All @@ -68,13 +76,14 @@ var urlStream = Rx.Observable.FromArray(hrefArray).Select(function(r){
});

// politeness delay of 3s between crawls of the same host
var cs = createCrawlSequence(urlStream, 1000, 60000);
var cs = createCrawlSequence(urlStream, 3000);

cs.Subscribe(function(r){
console.log('createCrawlSequence result', r.crawlLink.url.href);
},
function(err){},
function(){
console.log('crawl completed');
process.exit(0);
});

3 changes: 2 additions & 1 deletion examples/hello-crawl.js
Expand Up @@ -8,7 +8,7 @@ var SiteCrawl = require('../lib/sitecrawl').SiteCrawl;
// create a new SiteCrawl of depth 1 with a delay of 1s between next page
// Note: Webcrawling is delayed and will not be executed
// until Subscription
var siteCrawl = new SiteCrawl({url:'http://loku.com', maxDepth:0});
var siteCrawl = new SiteCrawl({url:'http://loku.com', maxDepth:1});

// ask for the observable sequence and subscribe for the CrawlResult(s)
siteCrawl.toObservable().Subscribe(function(crawlResult) {
Expand All @@ -21,4 +21,5 @@ function(exn){
// on crawl complete
function(){
console.log('SiteCrawl complete');
process.exit(0);
});
5 changes: 3 additions & 2 deletions examples/hello-query.js
Expand Up @@ -10,12 +10,12 @@ var SiteQuery = require('../lib/sitequery').SiteQuery;
// Note: Webcrawling is delayed and will not be executed
// until Subscription

var crawlOpts = {url:'http://loku.com', depth:2, delay:1000, maxCrawlTime: 100000}
var crawlOpts = {url:'http://loku.com', maxDepth:2, delay:1000, maxCrawlTime: 100000}
var siteQuery = new SiteQuery(crawlOpts, 'img');

// ask for the observable sequence and subscribe for selected jQuery element(s)
siteQuery.toObservable().Subscribe(function(result) {
// output the img src
// output the img src
console.log(result.sourceUrl, result.elem.attr('src'));
},
// on err
Expand All @@ -25,4 +25,5 @@ function(exn) {
// on crawl complete
function() {
console.log('SiteQuery complete');
process.exit(0);
});
27 changes: 22 additions & 5 deletions examples/pager-filter.js
Expand Up @@ -3,7 +3,19 @@
* @author roger.castillo@loku.com (Roger Castillo)
*/

var SiteQuery = require('../lib/sitequery').SiteQuery;
var SiteQuery = require('../lib/sitequery').SiteQuery,
createObservablejQuery = require('../lib/rxjquery').createObservablejQuery;



/**
 * Builds an observable sequence of pager out-links from a crawled page body.
 * Selects anchors inside table cells ('td > a'), keeps only the pagination
 * anchor whose text is exactly 'Next', and projects it onto its href, so the
 * crawl only follows the bottom pager forward.
 * @param {String} body - raw HTML body of a crawled page
 * @return {Rx.Observable} observable sequence of href strings
 */
var pagerLinks = function(body) {
  return createObservablejQuery(body, 'td > a')
    .Where(function(r) {
      // only the 'Next' pager anchor advances the crawl
      return r.elem.text() === 'Next';
    })
    .Select(function(r) {
      return r.elem.attr('href');
    });
};

// create a new SiteQuery of depth 2 with a delay of 1s between next page crawl
// selecting for `img` elements on each page
Expand All @@ -12,22 +24,26 @@ var SiteQuery = require('../lib/sitequery').SiteQuery;

// sample google crawl filter filtering for the outLinks where
// out links are in the bottom pager
var crawlOpts = {url:'http://www.google.com/search?as_epq=derp&num=25&hl=en&as_qdr=m3',
maxDepth:2,
var crawlOpts = {url:'http://www.google.com/search?as_epq=derp&num=10 &hl=en&as_qdr=m3',
maxDepth:4,
delay:1000,
maxCrawlTime: 100000,
// only crawl anchors in the pager
outLinkQuery:'td > a'}
//outLinkQuery:'td > a'
createOutLinkSeq:pagerLinks
}


var siteQuery = new SiteQuery(crawlOpts, 'h3 > a');

console.log('Searching for DERP.');

var rank = 1;

// ask for the observable sequence and subscribe for selected jQuery element(s)
siteQuery.toObservable().Subscribe(function(result) {
// output the img src
console.log('Search Result Title', result.elem.text());
console.log('Rank', rank++, 'Search Result Title', result.elem.text());
},
// on err
function(exn) {
Expand All @@ -36,4 +52,5 @@ function(exn) {
// on crawl complete
function() {
console.log('SiteQuery complete');
process.exit(0);
});
3 changes: 2 additions & 1 deletion examples/timed-crawl.js
Expand Up @@ -10,7 +10,7 @@ var SiteCrawl = require('../lib/sitecrawl').SiteCrawl;
var siteCrawl = new SiteCrawl({url:'http://loku.com', maxCrawlTime:30000});

// ask for the observable sequence and subscribe for the CrawlResult(s)
siteCrawl.toObservable().Subscribe(function(crawlResult) {
siteCrawl.toObservable().Subscribe(function(crawlResult) {
console.log(crawlResult.crawlLink.url.href);
},
// on err
Expand All @@ -20,5 +20,6 @@ function(exn){
// on crawl complete
function(){
console.log('SiteCrawl complete');
process.exit(0);
});

57 changes: 36 additions & 21 deletions lib/crawlsequence.js
Expand Up @@ -2,7 +2,9 @@ var request = require('request'),
URL = require('url'),
Rx = require('rx').Rx,
CrawlLink = require('./sitecrawl').CrawlLink,
CrawlResult = require('./sitecrawl').CrawlResult;
CrawlResult = require('./sitecrawl').CrawlResult,
_ = require('underscore');



/**
Expand All @@ -17,9 +19,13 @@ function createCrawlSequence(urlStream, politenessDelay, maxCrawlTime){
politenessDelay = politenessDelay || 1000;
maxCrawlTime = maxCrawlTime || Number.MAX_VALUE;

// last crawl timestamp (ms since epoch) per hostname; written whenever a
// url is dispatched to crawl()
var lastCrawlTimes = {};
// Returns true when at least politenessDelay ms have elapsed since the
// last recorded crawl of url's host.
// NOTE(review): for a host never crawled, lastCrawlTimes[url.hostname] is
// undefined, the subtraction yields NaN, and NaN >= delay is false — so
// callers must special-case the never-crawled host themselves (the
// `lastCrawlTimes[url.hostname] == undefined ||` guard below does this).
// NOTE(review): one call site invokes politelyCrawled(url.hostname) with a
// string; `url.hostname` is then undefined inside this function, making
// the check a no-op — confirm whether the url object was intended.
function politelyCrawled(url){
return (Date.now() - lastCrawlTimes[url.hostname]) >= politenessDelay
}

var urlsInFlight = 0;
function crawl(url, obs){
urlsInFlight++;
request(url.href, function (error, response, body) {
// in a sequence all items have null roots
Expand All @@ -31,37 +37,46 @@ function createCrawlSequence(urlStream, politenessDelay, maxCrawlTime){
// a null result
obs.OnNext(new CrawlResult(crawlLink, null));
}
urlsInFlight--;
urlsInFlight--;
});
}

return Rx.Observable.Create(function(obs){
// a domain will be crawled at a speed no greater than
// the configured politeness interval
var politenessInterval = Rx.Observable.Interval(politenessDelay);

// queue of urls waiting in memory to be crawled
var waitingUrls = [];
// by hostname
var lastCrawlTimes = {};

var intSubs = null;
var urlStreamComplete = false;
var urlSubs = urlStream.Subscribe(function(url){
// check all waiting URLs to see if any are ready
// on a delayed asynchronous loop
var crawlStartTime = Date.now();

intSubs = politenessInterval.Subscribe(function(_){
//console.log('politenes internval check');
//console.log('waiting url count', waitingUrls.length, 'politeness delay', politenessDelay);
for (var i in waitingUrls) {
var curUrl = waitingUrls[i];
//console.log('curUrl', curUrl, 'lastCrawlTime', lastCrawlTimes[curUrl.hostname], 'diff', Date.now() - lastCrawlTimes[curUrl.hostname]);
if ((Date.now() - lastCrawlTimes[curUrl.hostname]) >= politenessDelay){
lastCrawlTimes[curUrl.hostname] = Date.now();
// remove the current item
waitingUrls.splice(i);
//console.log('crawling delayed url', url.href);
crawl(url, obs);
}
}
intSubs = politenessInterval.Subscribe(function(tick){
// filter for the first of each host ready to crawl
var crawlUrls = _.filter(waitingUrls, function(url){
var filtered = politelyCrawled(url) && !this[url.hostname];
if (filtered) this[url.hostname] = true;
return filtered;
},{});

_.map(crawlUrls,function(url){
lastCrawlTimes[url.hostname] = Date.now();
})

// crawlUrls
_.map(crawlUrls, function(url){
urlsInFlight++;
crawl(url, obs);
});
// set waiting to diff
waitingUrls = _.difference(waitingUrls, crawlUrls);

// check to see if we are completed
// if we have recvd all urls and everything is finished
// OR maxCrawlTime exceeded
Expand All @@ -72,15 +87,15 @@ function createCrawlSequence(urlStream, politenessDelay, maxCrawlTime){
obs.OnCompleted();
}
});

if (lastCrawlTimes[url.hostname] == undefined ||
((Date.now() - lastCrawlTimes[url.hostname]) >= politenessDelay)){
// add to lastCrawlTime
politelyCrawled(url.hostname)){
lastCrawlTimes[url.hostname] = Date.now();
// crawl url
urlsInFlight++;
crawl(url, obs);
} else {
// url goes into waiting queue
lastCrawlTimes[url.hostname] = Date.now();
waitingUrls.push(url);
}
},
Expand Down
72 changes: 44 additions & 28 deletions lib/sitecrawl.js
Expand Up @@ -16,7 +16,6 @@ var RobotsParser = require('robots').RobotsParser;
var robotsParser = new RobotsParser();



/**
* Link parsed from crawl result
* @constructor
Expand Down Expand Up @@ -112,15 +111,44 @@ var SiteCrawl = function (crawlOpts) {
this.maxDepth = (crawlOpts.maxDepth == undefined) ? 0 : crawlOpts.maxDepth;
this.delay = crawlOpts.delay || 1000;
this.pageTimeout = crawlOpts.pageTimeout || (crawlOpts.delay * 5);
this.maxCrawlTime = crawlOpts.maxCrawlTime || 10000;
this.maxCrawlTime = crawlOpts.maxCrawlTime || 30000;
this.instanceKey = 'sitecrawl-' + Math.floor(Math.random() * 100000);
this.crawling = false;

this.crawlQueue = [];

// jQuery selector applied to body element, used to select
// outlinks
this.outLinkQuery = crawlOpts.outLinkQuery || "a";
this.crawlQueue = [];
this.createOutLinkSeq = crawlOpts.createOutLinkSeq;

// createOutLinkSeq maps a body onto a sequence of urls
if (this.createOutLinkSeq == undefined){
if (this.outLinkQuery != 'a') {
this.createOutLinkSeq = function(body){
return createObservablejQuery(body, self.outLinkQuery)
.Select(function(r){
return r.elem.attr('href');
});
};
} else {
console.log('default sax crawl')
// use SAX, which is faster for straight anchor queries
this.createOutLinkSeq = function(body){
//console.log('SAX query for link', body)
return createObservableSAX(body, 'a')
.Select(function(elem){
return elem ? elem.attributes.href : '';
});
};
}
} else {
console.log('outLinkSeqDefined');
}
}



SiteCrawl.prototype = {

stopCrawler:function() {
Expand Down Expand Up @@ -184,31 +212,16 @@ SiteCrawl.prototype = {
selectForCrawlLinks: function(cs){
var self = this;
return cs.Select(function (crawlResult) {
var outLinkObs;
// if a straight anchor query, use SAX for optimization
// project the result onto a.hrefs through both mechanisms
if (self.outLinkQuery != 'a') {
outLinkObs = createObservablejQuery(crawlResult.body, self.outLinkQuery)
.Select(function(r){
return r.elem.attr('href');
});
} else {
outLinkObs = createObservableSAX(crawlResult.body, self.outLinkQuery)
.Select(function(elem){
return elem ? elem.attributes.href : '';
})
}

outLinkObs.Where(function(href) {
// filter non-nul and anchor elem with hrefs
console.log('href', href);
var outLinkSeq = self.createOutLinkSeq(crawlResult.body);
outLinkSeq.Where(function(href) {
// filter out null, blank, or in-page ('#') links
return href != null && href != '' && href != '#';
})
.Select(function(href) {
// project next crawl links
return new CrawlLink(URL.parse(URL.resolve(crawlResult.crawlLink.url, href)),
crawlResult.crawlLink.depth + 1,
crawlResult.crawlLink.url);
.Select(function(href) {
// project next crawl links
return new CrawlLink(URL.parse(URL.resolve(crawlResult.crawlLink.url, href)),
crawlResult.crawlLink.depth + 1,
crawlResult.crawlLink.url);
})
.Where(function(crawlLink){
// filter the crawl link stream for this domain
Expand All @@ -220,6 +233,7 @@ SiteCrawl.prototype = {
return (crawlLink.url.host == self.url.host) && self.robotsParser.canFetchSync(crawlLink.url.pathname);
})
.Subscribe(function(crawlLink){
//console.log('crawl link', crawlLink.url, 'depth', crawlLink.depth);
if (crawlLink.depth <= self.maxDepth){
// queue the next crawl step
self.crawlQueue.push(crawlStep(self.instanceKey, crawlLink));
Expand All @@ -232,8 +246,8 @@ SiteCrawl.prototype = {
// parsing complete
});
return crawlResult;
})
},
});
},//

toObservable: function () {
var self = this;
Expand All @@ -250,4 +264,6 @@ SiteCrawl.prototype = {
});
}
}


exports.SiteCrawl = SiteCrawl;
4 changes: 2 additions & 2 deletions package.json
Expand Up @@ -2,7 +2,7 @@
"name": "sitequery",
"description": "A reactive framework for asynchronous web crawling.",
"author": "Roger H. Castillo <roger.castillo@loku.com> (http://tech.loku.com)",
"version": "0.2.6",
"version": "0.2.7",
"repository": {
"type": "git",
"url": "git://github.com/rcastillo/sitequery.git"
Expand All @@ -13,7 +13,7 @@
},
"dependencies": {
"request": "2.1.1",
"underscore": "1.1.7",
"underscore": "1.3.1",
"jsdom": "~0.2.10",
"jquery": "~1.6.3",
"robots": "~0.6.0",
Expand Down

0 comments on commit 4f8b1b2

Please sign in to comment.