Skip to content
This repository has been archived by the owner on Aug 4, 2020. It is now read-only.

Commit

Permalink
Added option to set the name of the database and to use at least two …
Browse files Browse the repository at this point in the history
…threads.
  • Loading branch information
rsdoiel committed Jan 7, 2012
1 parent a776ca5 commit 7d56c60
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 60 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
extractor-js
============
revision 0.0.7g
revision 0.0.7h
---------------

# Overview
Expand Down
171 changes: 115 additions & 56 deletions examples/clustered-spider.js
Expand Up @@ -14,47 +14,51 @@ var util = require('util'),
url = require('url'),
path = require('path'),
opt = require('opt'),
extractor = require('../extractor'),
extractor = require('extractor'),
dirty = require('dirty'), db,
cluster = require('cluster'),
numCPUs = require('os').cpus().length, stat,
START_URLS = [], restrictPath = false,
// Functions
USAGE, setStartURLs, setNumCPUs, setRestrictPath,
onMessageToChild, onDeathOfChild;
START_URLS = [], DatabaseName = "spider.db", restrictPath = false,
// Functions
USAGE, setDatabase, setStartURLs, setNumCPUs, setRestrictPath,
makeRecord, onMessageToChild, onDeathOfChild;

USAGE = function (msg, error_level) {
var heading = "\n USAGE: node " + process.argv[1] + " --urls=STARTING_URL\n\n SYNOPSIS: Spider urls and build a database of links.\n",
help = opt.help(), ky;
var heading = "\n USAGE:\n\n\tnode " + path.basename(process.argv[1]) + " --urls=STARTING_URL\n\tnode " + path.basename(process.argv[1]) + "\n\n SYNOPSIS:\n\n\t Spider urls and build a data set of link relationships.",
help = opt.help(), ky;

if (error_level !== undefined) {
console.error(heading);
if (msg !== undefined) {
console.error(" " + msg + "\n");
} else {
console.error("ERROR: process exited with an error " + error_level);
}
process.exit(error_level);
}
console.log(heading + "\n\n OPTIONS\n");
for (ky in help) {
console.log("\t" + ky + "\t\t" + help[ky]);
}
console.log("\n\n");
if (msg !== undefined) {
console.log(" " + msg + "\n");
if (error_level !== undefined) {
console.error(heading);
if (msg !== undefined) {
console.error("\n ERROR MSG:\n\n \t" + msg + "\n");
} else {
console.error("ERROR: process exited with an error " + error_level);
}
process.exit(error_level);
}
console.log(heading + "\n\n OPTIONS:\n");
for (ky in help) {
console.log("\t" + ky + "\t\t" + help[ky]);
}
console.log("\n\n");
if (msg !== undefined) {
console.log(" " + msg + "\n");
}
process.exit(0);
process.exit(0);
};

setDatabase = function (param) {
DatabaseName = param;
};

setStartURLs = function (param) {
if (param.indexOf(',')) {
param.split(',').forEach(function(start_url) {
START_URLS.push(start_url.trim());
});
} else {
START_URLS.push(param.trim());
}
if (param.indexOf(',')) {
param.split(',').forEach(function(start_url) {
START_URLS.push(start_url.trim());
});
} else {
START_URLS.push(param.trim());
}
};

setNumCPUs = function (param) {
Expand All @@ -67,22 +71,44 @@ setRestrictPath = function (param) {
restrictPath = param;
};

makeRecord = function (url, rec) {
var defaults = { processed: false, linked_from: [], linking_to: [] };

if (url !== undefined) {
defaults.url = url;
}
if (rec !== undefined) {
Object.keys(rec).forEach(function(ky) {
defaults[ky] = rec[ky];
});
}
return defaults;
};

onMessageToChild = function(m) {
var work_parts, work_url;

if (m.processed_url !== undefined && m.found_urls !== undefined) {
var rec, processed_parts, processed_url;

if (m.processed_url !== undefined) {
if (m.processed_url.indexOf('://') < 0 && m.processed_url.indexOf(':/') > 0) {
m.processed_url = m.processed_url.replace(/\:\//,'://');
}
}

if (m.processed_url !== undefined && m.found_urls !== undefined) {
if (m.found_urls.join && m.found_urls.length > 0) {
m.found_urls.forEach(function(work_url) {
var row;
var row, work_parts;

if (work_url.indexOf('://') < 0 && work_url.indexOf(':/') > 0) {
work_url = work_url.replace(/\:\//,'://');
}

// Remove # from url safely.
work_parts = url.parse(work_url);
if (work_parts.hash) {
delete work_parts.hash;
}

if (restrictPath !== false) {
if (work_parts.pathname.indexOf(restrictPath) === 0) {
work_url = url.format(work_parts);
Expand All @@ -92,63 +118,96 @@ onMessageToChild = function(m) {
} else {
work_url = url.format(work_parts);
}

if (work_url !== false) {
row = db.get(work_url);
if (row === undefined) {
console.log("Discovered: " + work_url);
db.set(work_url, { url: work_url, processed: false });
db.set(work_url, makeRecord(work_url));
}
}
});
}
if (m.processed_url) {
work_parts = url.parse(m.processed_url);
if (work_parts.hash) {
delete work_parts.hash;
processed_parts = url.parse(m.processed_url);
if (processed_parts.hash) {
delete processed_parts.hash;
}
processed_url = url.format(processed_parts);

console.log("Processed: " + processed_url);
rec = db.get(processed_url);
if (rec === undefined) {
rec = makeRecord(processed_url);
} else {
rec.processed = true;
rec = makeRecord(processed_url, rec);
}
work_url = url.format(work_parts);
console.log("Processed: " + work_url);
db.set(work_url, {url: work_url, processed: true});
db.set(processed_url, rec);
}
}
};

onDeathOfChild = function (worker) {
console.error("ERROR: worker died: " + worker.pid);
console.error("ERROR: worker died: " + worker.pid);
};

if (cluster.isMaster) {
opt.set(['-u', '--urls'], setStartURLs, "The starting url(s) to run the spider over.");
opt.set(['-t', '--thread-count'], setNumCPUs, "Set the number of threads used by spider. Default is the number of CPUs available.");
opt.set(['-r', '--restrict-path'], setRestrictPath, "Only spider for a specific path. E.g. -r /my/stuff would only spider folders that start with /my/stuff.");
opt.set(['-h', '--help'], USAGE, "Help message");
opt.parse(process.argv);

opt.set(['-d', '--database'], setDatabase, "Set the database filename to ");
opt.set(['-u', '--urls'], setStartURLs, "The starting url(s) to run the spider over.");
opt.set(['-t', '--threads'], setNumCPUs, "Set the number of threads used by spider. Default is the number of CPUs available. The minimum is two.");
opt.set(['-r', '--restrict'], setRestrictPath, "Only spider for a specific path. E.g. -r /my/stuff would only spider folders that start with /my/stuff.");
opt.set(['-h', '--help'], USAGE, "Help message");
opt.parse(process.argv);

if (process.argv.length <= 2) {
// If there is no spider.db then display USAGE
try {
stat = fs.statSync('spider.db');
stat = fs.statSync('spider.db');
} catch (err) {
USAGE("Missing spider.db, must provide STARTING_URL.", 1);
USAGE("Missing spider.db, must provide STARTING_URL.", 1);
}
if (stat.isFile() !== true) {
USAGE("spider.db is not a file.", 1);
USAGE("spider.db is not a file.", 1);
}
}
db = dirty('spider.db');
db = dirty(DatabaseName);

console.log("PARENT No. of threads: " + numCPUs);
console.log("PARENT pid: " + process.pid);
console.log("PARENT loading db ...");
console.log("PARENT loading " + DatabaseName + " ...");
if (START_URLS.length > 0) {
console.log("PARENT Starting URL(s):\n\t" + START_URLS.join("\n\t") + "\n");
}
db.on('load', function() {
var n = [], i, count_down, interval_id;

// Seed the DB
START_URLS.forEach(function(start_url) {
db.set(start_url, { url: start_url, processed: false });
var rec, start_parts;

if (start_url.indexOf('://') < 0) {
start_url = start_url.replace(/\:\//,'://');
}
start_parts = url.parse(start_url);
if (start_parts.hash) {
delete start_parts.hash;
}
start_url = url.format(start_parts);
rec = db.get(start_url);
if (rec) {
rec.processed = false;
} else {
rec = makeRecord(start_url);
rec.processed = false;
}
db.set(start_url, rec);
});

// Fork and setup the children
if (numCPUs < 2) {
numCPUs = 2;
}
for (i = 0; i < numCPUs; i++ ) {
n.push(cluster.fork());
console.log("PARENT Forked child with pid: " + n[i].pid);
Expand Down
2 changes: 1 addition & 1 deletion extractor.js
Expand Up @@ -11,7 +11,7 @@
* Released under New the BSD License.
* See: http://opensource.org/licenses/bsd-license.php
*
* revision 0.0.7g
* revision 0.0.7h
*/
var url = require('url'),
fs = require('fs'),
Expand Down
2 changes: 1 addition & 1 deletion extractor_test.js
Expand Up @@ -8,7 +8,7 @@
* Released under New the BSD License.
* See: http://opensource.org/licenses/bsd-license.php
*
* revision 0.0.7g
* revision 0.0.7h
*/

var TIMEOUT = 10,
Expand Down
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name" : "extractor",
"version" : "0.0.7g",
"version" : "0.0.7h",
"description" : "A small utility library for retrieving and scraping web content. It targets scraping content with a unique attribute id, class or tag.",
"main" : "./extractor.js",
"repository" : {
Expand Down

0 comments on commit 7d56c60

Please sign in to comment.