Skip to content
This repository has been archived by the owner on Aug 4, 2020. It is now read-only.

Commit

Permalink
Fixed version references. Updated example code and docs.
Browse files Browse the repository at this point in the history
  • Loading branch information
rsdoiel committed May 18, 2012
1 parent d59850f commit 8cc37fb
Show file tree
Hide file tree
Showing 16 changed files with 39 additions and 39 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Original file line Diff line number Diff line change
@@ -1,7 +1,7 @@
extractor-js extractor-js
============ ============
revision 0.0.9d revision 0.1.0
--------------- --------------


# Overview # Overview


Expand Down Expand Up @@ -33,7 +33,7 @@ selector = {
}; };


pages.forEach(function(page) { pages.forEach(function(page) {
extractor.Scrape(page, selector, function (err, data, env) { extractor.scrape(page, selector, function (err, data, env) {
if (err) throw err; if (err) throw err;


console.log("Processed " + env.pathname); console.log("Processed " + env.pathname);
Expand All @@ -54,7 +54,7 @@ In this example we spider the homepage of the NodeJS website and list of the lin
```javascript ```javascript
var extractor = require('extractor'); var extractor = require('extractor');


extractor.Spider('http://nodejs.org', function(err, data, env) { extractor.spider('http://nodejs.org', function(err, data, env) {
var i; var i;
if (err) { if (err) {
console.error("ERROR: " + err); console.error("ERROR: " + err);
Expand Down
12 changes: 6 additions & 6 deletions docs/ClusteredSpider.md
Original file line number Original file line Diff line number Diff line change
@@ -1,13 +1,13 @@
ClusteredSpider clusteredSpider
=============== ===============


# Overview # Overview


ClusteredSpider is designed as a high level object which can spider one or clusteredSpider is designed as a high level object which can spider one or
more websites and preserve its state across multiple invocations. It collects more websites and preserve its state across multiple invocations. It collects
what individual page results from extractor.Spider() maintain a list of what individual page results from extractor.Spider() maintain a list of
links contained in a specific page as well as a record of pages that are pointing links contained in a specific page as well as a record of pages that are pointing
at the page spidered. It relies on the NodeJS's dirty module for persistance. at the page spidered. It relies on the NodeJS's dirty module for persistence.
It is suitable for creating custom stand-alone spiders with it. It doesn't support It is suitable for creating custom stand-alone spiders with it. It doesn't support
robot.txt processing at this time. robot.txt processing at this time.


Expand All @@ -18,9 +18,9 @@ Create a spider that will crawl http://example.com.
```javascript ```javascript
var util = require("util"), var util = require("util"),
extractor = require('extractor'), extractor = require('extractor'),
ClusteredSpider = extractor.ClusteredSpider({ url: "http://example.com" }); clusteredSpider = extractor.clusteredSpider({ url: "http://example.com" });


ClusteredSpider.on("message", function (m) { clusteredSpider.on("message", function (m) {
if (m.error) { if (m.error) {
console.error(m.messenger + ": " + m.error); console.error(m.messenger + ": " + m.error);
} }
Expand All @@ -29,7 +29,7 @@ Create a spider that will crawl http://example.com.
} }
}); });


ClusteredSpider.on("data", function (m) { clusteredSpider.on("data", function (m) {
if (m.error) { if (m.error) {
console.error(m.messenger + ": " + m.error); console.error(m.messenger + ": " + m.error);
} }
Expand Down
10 changes: 5 additions & 5 deletions docs/FetchPage.md
Original file line number Original file line Diff line number Diff line change
@@ -1,11 +1,11 @@
FetchPage fetchPage
========= =========
revision 0.0.8 revision 0.1.0
-------------- --------------


# FetchPage(pathname, options, callback) # fetchPage(pathname, options, callback)


This is a method to simplify reading HTML documents from either local disc or via http/https connections. FetchPage() is used by Scrape() to retrieve HTML content if a URL or path is provided. This is a method to simplify reading HTML documents from either local disc or via http/https connections. fetchPage() is used by Scrape() to retrieve HTML content if a URL or path is provided.




## parameters ## parameters
Expand All @@ -19,7 +19,7 @@ This is a method to simplify reading HTML documents from either local disc or vi
```javascript ```javascript
var extractor = require('extractor'); var extractor = require('extractor');


extractor.FetchPage("http://nodejs.org",{ response: true}, function (err, data, env) { extractor.fetchPage("http://nodejs.org",{ response: true}, function (err, data, env) {
if (err) { if (err) {
console.error('ERROR: ' + err); console.error('ERROR: ' + err);
} }
Expand Down
8 changes: 4 additions & 4 deletions docs/Scrape.md
Original file line number Original file line Diff line number Diff line change
@@ -1,9 +1,9 @@
Scrape scrape
====== ======
revision 0.0.8 revision 0.1.0
-------------- --------------


# Scrape(document_or_path, map, options, callback) # scrape(document_or_path, map, options, callback)


The scrape method is used to extract content from HTML markup. It The scrape method is used to extract content from HTML markup. It
has three required parameters - document_or_path, map, and callback. It has two has three required parameters - document_or_path, map, and callback. It has two
Expand Down Expand Up @@ -91,7 +91,7 @@ font, spacing tags).
return val; return val;
}; };


extractor.Scrape("http://nodejs.org", { title: "title", div_h2: "div > h2" }, { response: true, extractor.scrape("http://nodejs.org", { title: "title", div_h2: "div > h2" }, { response: true,
cleaner:clean, transformer: transform}, function (err, data, env) { cleaner:clean, transformer: transform}, function (err, data, env) {
if (err) { if (err) {
console.error('ERROR: ' + err); console.error('ERROR: ' + err);
Expand Down
6 changes: 3 additions & 3 deletions docs/Spider.md
Original file line number Original file line Diff line number Diff line change
@@ -1,6 +1,6 @@
Spider spider
====== ======
revision 0.0.8 revision 0.1.0
-------------- --------------


# Overview # Overview
Expand All @@ -14,7 +14,7 @@ markup spidered.
```javascript ```javascript
var extractor = require('extractor'), util = require('util'); var extractor = require('extractor'), util = require('util');


extractor.Spider("http://nodejs.org", { response: true }, function (err, data, env) { extractor.spider("http://nodejs.org", { response: true }, function (err, data, env) {
if (err) { if (err) {
console.error('ERROR: ' + err); console.error('ERROR: ' + err);
} }
Expand Down
4 changes: 2 additions & 2 deletions docs/SubmitForm.md
Original file line number Original file line Diff line number Diff line change
@@ -1,8 +1,8 @@
SubmitForm SubmitForm
============ ============
revision 0.0.8 revision 0.0.9
-------------- --------------


# SubmitForm # submitForm


Provide a means to submit forms easily and return results. This is buggy and not well developed yet. Provide a means to submit forms easily and return results. This is buggy and not well developed yet.
2 changes: 1 addition & 1 deletion examples/example-1.js
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ var extractor = require('../extractor');


var map = {title:'title', intro : '#introduction' }; var map = {title:'title', intro : '#introduction' };


extractor.Scrape('http://nodejs.org', map, function(err, data, env) { extractor.scrape('http://nodejs.org', map, function(err, data, env) {
if (err) { if (err) {
console.error("ERROR: " + err); console.error("ERROR: " + err);
} }
Expand Down
2 changes: 1 addition & 1 deletion examples/example-2.js
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ var extractor = require('../extractor');


var map = {title:'title', links: 'a' }; var map = {title:'title', links: 'a' };


extractor.Scrape('http://nodejs.org', map, function(err, data, env) { extractor.scrape('http://nodejs.org', map, function(err, data, env) {
var i; var i;
if (err) { if (err) {
console.error("ERROR: " + err); console.error("ERROR: " + err);
Expand Down
2 changes: 1 addition & 1 deletion examples/example-3.js
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ if (cluster.isMaster) {
} else { // End of Parent process } else { // End of Parent process
process.on('message', function(m) { process.on('message', function(m) {
console.log('CHILD ' + process.pid + ' spider:', m.url); console.log('CHILD ' + process.pid + ' spider:', m.url);
extractor.Spider(m.url, function(err, data, env) { extractor.spider(m.url, function(err, data, env) {
var i, base_path = path.dirname(url), cut_pos = base_path.length, new_url, var i, base_path = path.dirname(url), cut_pos = base_path.length, new_url,
urls = [], url = env.pathmame; urls = [], url = env.pathmame;


Expand Down
2 changes: 1 addition & 1 deletion examples/spider.js
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ runChild = function (options) {


process.on('message', function (m) { process.on('message', function (m) {
console.log('CHILD (' + process.pid + ') spidering:', m.url); console.log('CHILD (' + process.pid + ') spidering:', m.url);
extractor.Spider(m.url, function (err, data, env) { extractor.spider(m.url, function (err, data, env) {
var i, new_url, urls = [], base_parts, base_path, cur_url = env.pathname, res = env.response; var i, new_url, urls = [], base_parts, base_path, cur_url = env.pathname, res = env.response;


base_parts = url.parse(cur_url); base_parts = url.parse(cur_url);
Expand Down
4 changes: 2 additions & 2 deletions examples/spider_test.js
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -77,9 +77,9 @@ TESTS.formatRecord = function () {
test_completed += 1; test_completed += 1;
}; };


TESTS.Spider = function () { TESTS.spider = function () {
test_expected += 1; test_expected += 1;
assert.fail("Spider(), tests not implemented for runMaster(),runChild()."); assert.fail("spider(), tests not implemented for runMaster(),runChild().");
test_completed += 0; test_completed += 0;
}; };


Expand Down
2 changes: 1 addition & 1 deletion extractor.js
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* Released under New the BSD License. * Released under New the BSD License.
* See: http://opensource.org/licenses/bsd-license.php * See: http://opensource.org/licenses/bsd-license.php
* *
* revision 0.0.9d * revision 0.1.0
*/ */


/*jslint devel: true, node: true, maxerr: 50, indent: 4, vars: true, sloppy: true, stupid: false */ /*jslint devel: true, node: true, maxerr: 50, indent: 4, vars: true, sloppy: true, stupid: false */
Expand Down
2 changes: 1 addition & 1 deletion extractor_test.js
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
* Released under New the BSD License. * Released under New the BSD License.
* See: http://opensource.org/licenses/bsd-license.php * See: http://opensource.org/licenses/bsd-license.php
* *
* revision 0.0.9d * revision 0.1.0
*/ */


var TIMEOUT = 10, var TIMEOUT = 10,
Expand Down
10 changes: 5 additions & 5 deletions lib/SubmitForm.js
Original file line number Original file line Diff line number Diff line change
@@ -1,13 +1,13 @@
/** /**
* SubmitForm - send a get/post and pass the results to the callback. * submitForm - send a get/post and pass the results to the callback.
* @param action - the url hosting the form processor (e.g. 'http://example.com/form-processor.php') * @param action - the url hosting the form processor (e.g. 'http://example.com/form-processor.php')
* @param form_data - the form field name/values to submit * @param form_data - the form field name/values to submit
* @param options - a set of properties to modify SubmitForm behavior (e.g. options.method defaults to POST, * @param options - a set of properties to modify submitForm behavior (e.g. options.method defaults to POST,
* optional.timeout defaults to 30000 milliseconds). * optional.timeout defaults to 30000 milliseconds).
* @param callback - the callback to use when you get a response from the form submission. Args past * @param callback - the callback to use when you get a response from the form submission. Args past
* to the callback function are err, data and environment. * to the callback function are err, data and environment.
*/ */
var SubmitForm = function (action, form_data, options, callback) { var submitForm = function (action, form_data, options, callback) {
var defaults = { method:'POST', timeout:30000, protocol: "http:" }, var defaults = { method:'POST', timeout:30000, protocol: "http:" },
parts, req, timer_id, protocol_method = http; parts, req, timer_id, protocol_method = http;


Expand Down Expand Up @@ -95,6 +95,6 @@ var SubmitForm = function (action, form_data, options, callback) {
timer_id = setTimeout(function () { timer_id = setTimeout(function () {
return callback("ERROR: timeout", null, {options: options}); return callback("ERROR: timeout", null, {options: options});
}, options.timeout); }, options.timeout);
}; /* END SubmitForm(action, form_data, options, callback) */ }; /* END submitForm(action, form_data, options, callback) */


exports.SubmitForm = SubmitForm; exports.submitForm = submitForm;
2 changes: 1 addition & 1 deletion lib/SubmitForm_test.js
Original file line number Original file line Diff line number Diff line change
@@ -1,5 +1,5 @@
// Tests of SubmitForm() // Tests of SubmitForm()
TESTS.SubmitForm = function () { TESTS.submitForm = function () {
// http GET // http GET
test_expected += 1; test_expected += 1;
(function () { (function () {
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Original file line Diff line number Diff line change
@@ -1,6 +1,6 @@
{ {
"name" : "extractor", "name" : "extractor",
"version" : "0.0.9d", "version" : "0.1.0",
"description" : "A small utility library for retrieving and scraping web content. It targets scraping content with a unique attribute id, class or tag.", "description" : "A small utility library for retrieving and scraping web content. It targets scraping content with a unique attribute id, class or tag.",
"main" : "./extractor.js", "main" : "./extractor.js",
"repository" : { "repository" : {
Expand Down

0 comments on commit 8cc37fb

Please sign in to comment.