Skip to content
This repository has been archived by the owner on Aug 4, 2020. It is now read-only.

Commit

Permalink
Added support for restrict path, implemented opt processing. Fixed bu…
Browse files Browse the repository at this point in the history
…g in handling https links.
  • Loading branch information
R. S. Doiel committed Jan 6, 2012
1 parent b9bb740 commit 71d5a22
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 48 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
extractor-js
============
revision 0.0.7f
revision 0.0.7g
---------------

# Overview
Expand Down
110 changes: 71 additions & 39 deletions examples/clustered-spider.js
Expand Up @@ -9,28 +9,28 @@
* See: http://opensource.org/licenses/bsd-license.php
*
*/
var //util = require('util'),
var util = require('util'),
fs = require('fs'),
url = require('url'),
path = require('path'),
opt = require('opt'),
extractor = require('extractor'),
extractor = require('../extractor'),
dirty = require('dirty'), db,
cluster = require('cluster'),
numCPUs = require('os').cpus().length, stat,
START_URLS = [],
START_URLS = [], restrictPath = false,
// Functions
USAGE, setStartURLs,
onMessageToChild, onDeathOfChild, onExitOfChild;
USAGE, setStartURLs, setNumCPUs, setRestrictPath,
onMessageToChild, onDeathOfChild;

USAGE = function (msg, error_level) {
var heading = " USAGE: node " + process.argv[1] + " --urls=STARTING_URL\n\n SYNOPSIS: Spider urls and build a database of links.\n",
var heading = "\n USAGE: node " + process.argv[1] + " --urls=STARTING_URL\n\n SYNOPSIS: Spider urls and build a database of links.\n",
help = opt.help(), ky;

if (error_level !== undefined) {
console.error(heading);
if (msg !== undefined) {
console.error(msg);
console.error(" " + msg + "\n");
} else {
console.error("ERROR: process exited with an error " + error_level);
}
Expand All @@ -41,7 +41,9 @@ USAGE = function (msg, error_level) {
console.log("\t" + ky + "\t\t" + help[ky]);
}
console.log("\n\n");
console.log(msg);
if (msg !== undefined) {
console.log(" " + msg + "\n");
}
process.exit(0);
};

Expand All @@ -55,7 +57,19 @@ setStartURLs = function (param) {
}
};

// Option handler for -t/--thread-count: override the default worker
// count (one per CPU). Only accepts a plain non-negative integer
// string; anything else leaves numCPUs at its current value.
setNumCPUs = function (param) {
    // Anchor the regex: the unanchored /[0-9]+/ matched any string that
    // merely contained a digit (e.g. "3x"), and assigning param directly
    // left numCPUs as a string rather than a number.
    if (/^[0-9]+$/.test(param)) {
        numCPUs = parseInt(param, 10);
    }
};

// Option handler for -r/--restrict-path: remember the path prefix that
// discovered URLs must start with. Stays false (no restriction) unless
// the option is supplied on the command line.
setRestrictPath = function (param) {
    restrictPath = param;
};

onMessageToChild = function(m) {
var work_parts, work_url;

if (m.processed_url !== undefined && m.found_urls !== undefined) {
if (m.found_urls.join && m.found_urls.length > 0) {
m.found_urls.forEach(function(work_url) {
Expand All @@ -64,71 +78,82 @@ onMessageToChild = function(m) {
if (work_url.indexOf('://') < 0) {
work_url = work_url.replace(/\:\//,'://');
}
row = db.get(work_url);
if (row === undefined) {
console.log("Discovered: " + work_url);
db.set(work_url, { url: work_url, processed: false });
// Remove # from url safely.
work_parts = url.parse(work_url);
if (work_parts.hash) {
delete work_parts.hash;
}
if (restrictPath !== false) {
if (work_parts.pathname.indexOf(restrictPath) === 0) {
work_url = url.format(work_parts);
} else {
work_url = false;
}
} else {
work_url = url.format(work_parts);
}
if (work_url !== false) {
row = db.get(work_url);
if (row === undefined) {
console.log("Discovered: " + work_url);
db.set(work_url, { url: work_url, processed: false });
}
}
});
}
if (m.processed_url) {
console.log("Processed: " + m.processed_url);
db.set(m.processed_url, {url: m.processed_url, processed: true});
work_parts = url.parse(m.processed_url);
if (work_parts.hash) {
delete work_parts.hash;
}
work_url = url.format(work_parts);
console.log("Processed: " + work_url);
db.set(work_url, {url: work_url, processed: true});
}
}
};

// 'death' event handler for forked cluster workers: report the dead
// worker's pid on stderr so the operator can see the loss.
onDeathOfChild = function (worker) {
    var dead_pid = worker.pid;
    console.error("ERROR: worker died: " + dead_pid);
};

// 'exit' event handler for forked cluster workers: log the exit code on
// stdout, but only when the worker terminated abnormally (non-zero /
// truthy err_no). A clean exit stays silent.
onExitOfChild = function (err_no) {
    if (!err_no) {
        return;
    }
    console.log("worker " + process.pid + " exited with error no: " + err_no);
};

if (cluster.isMaster) {
opt.set(['-h','--help'], function () {
}, "Help message");
opt.set(['-u','--urls'], setStartURLs, "The starting url(s) to run the spider over.");
opt.set(['-u', '--urls'], setStartURLs, "The starting url(s) to run the spider over.");
opt.set(['-t', '--thread-count'], setNumCPUs, "Set the number of threads used by spider. Default is the number of CPUs available.");
opt.set(['-r', '--restrict-path'], setRestrictPath, "Only spider for a specific path. E.g. -r /my/stuff would only spider folders that start with /my/stuff.");
opt.set(['-h', '--help'], USAGE, "Help message");
opt.parse(process.argv);

if (process.argv.length <= 2) {
// If there is no spider.db then display USAGE
try {
stat = fs.statSync('spider.db');
} catch (err) {
console.error("USAGE: node " + process.argv[1] + " STARTING_URL\n");
process.exit(1);
USAGE("Missing spider.db, must provide STARTING_URL.", 1);
}
if (stat.isFile() !== true) {
console.error("USAGE: node " + process.argv[1] + " STARTING_URL\n");
process.exit(1);
USAGE("spider.db is not a file.", 1);
}
}
db = dirty('spider.db');

console.log("PARENT numCPUs: " + numCPUs);
console.log("PARENT No. of threads: " + numCPUs);
console.log("PARENT pid: " + process.pid);
console.log("PARENT loading db ...");
db.on('load', function() {
var n = [], i, count_down;
var n = [], i, count_down, interval_id;

// Seed the DB
for (i = 2; i < process.argv.length; i++) {
db.set(process.argv[i], { url: process.argv[i], processed: false });
}
START_URLS.forEach(function(start_url) {
db.set(start_url, { url: start_url, processed: false });
});

// Fork and setup the children
for (i = 0; i < numCPUs; i++ ) {
n.push(cluster.fork());
console.log("PARENT Forked child with pid: " + n[i].pid);

n[i].on('message', onMessageToChild);

n[i].on('death', onDeathOfChild);

n[i].on('exit', onExitOfChild);
}

i = 0;
Expand All @@ -148,10 +173,10 @@ if (cluster.isMaster) {

// Setup an service for sending message to child
count_down = 3;
var interval_id = setInterval(function() {
interval_id = setInterval(function() {
var i = 0, j = 0, k = 0;
db.forEach(function(ky, val) {
if (val.processed) {
if (ky && val.processed) {
k += 1;
} else {
if (i < numCPUs) {
Expand Down Expand Up @@ -234,6 +259,9 @@ if (cluster.isMaster) {
}
}
});
if (new_parts.hash) {
delete new_parts.hash;
}
return url.format(new_parts);
}
return false;
Expand Down Expand Up @@ -271,7 +299,11 @@ if (cluster.isMaster) {
}
}
// Send the URLs found to the master process
process.send({processed_url: cur_url, found_urls: urls, statusCode: res.statusCode, headers: res.headers});
if (res) {
process.send({processed_url: cur_url, found_urls: urls, statusCode: res.statusCode, headers: res.headers});
} else {
process.send({processed_url: cur_url, found_urls: urls});
}
}, {response:true});
}); // End process.on("message", ...);
}
13 changes: 7 additions & 6 deletions extractor.js
Expand Up @@ -11,7 +11,7 @@
 * Released under the New BSD License.
* See: http://opensource.org/licenses/bsd-license.php
*
* revision 0.0.7f
* revision 0.0.7g
*/
var url = require('url'),
fs = require('fs'),
Expand Down Expand Up @@ -150,15 +150,15 @@ var FetchPage = function(pathname, callback, options) {
if (timer_id) { clearTimeout(timer_id); }
if (options.response) {
// FIXME Need to handle buf if array or string
if (buf === null) {
if (buf === undefined || buf === null) {
return callback(err, null, pathname, res);
}
else if (buf.join === undefined && buf.length > 0) {
return callback(null, buf.toString(), pathname, res);
}
else if (buf.join && buf.length) {
else if (buf.join !== undefined && buf.length) {
return callback(null, buf.join(""), pathname, res);
}
else if (buf.length > 0) {
return callback(null, buf.toString(), pathname, res);
}
else {
return callback(err, null, pathname, res);
}
Expand Down Expand Up @@ -206,6 +206,7 @@ var FetchPage = function(pathname, callback, options) {
}
break;
case 'https:':
protocol_method = https;
if (options.port === undefined) {
options.port = 443;
}
Expand Down
2 changes: 1 addition & 1 deletion extractor_test.js
Expand Up @@ -8,7 +8,7 @@
 * Released under the New BSD License.
* See: http://opensource.org/licenses/bsd-license.php
*
* revision 0.0.7f
* revision 0.0.7g
*/

var TIMEOUT = 10,
Expand Down
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name" : "extractor",
"version" : "0.0.7f",
"version" : "0.0.7g",
"description" : "A small utility library for retrieving and scraping web content. It targets scraping content with a unique attribute id, class or tag.",
"main" : "./extractor.js",
"repository" : {
Expand Down

0 comments on commit 71d5a22

Please sign in to comment.