Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Fixed version references. Updated example code and docs.

  • Loading branch information...
commit 8cc37fb71e9d49e3ab1940e045461c102327c90c 1 parent d59850f
@rsdoiel authored
View
8 README.md
@@ -1,7 +1,7 @@
extractor-js
============
-revision 0.0.9d
----------------
+revision 0.1.0
+--------------
# Overview
@@ -33,7 +33,7 @@ selector = {
};
pages.forEach(function(page) {
- extractor.Scrape(page, selector, function (err, data, env) {
+ extractor.scrape(page, selector, function (err, data, env) {
if (err) throw err;
console.log("Processed " + env.pathname);
@@ -54,7 +54,7 @@ In this example we spider the homepage of the NodeJS website and list the links it contains
```javascript
var extractor = require('extractor');
-extractor.Spider('http://nodejs.org', function(err, data, env) {
+extractor.spider('http://nodejs.org', function(err, data, env) {
var i;
if (err) {
console.error("ERROR: " + err);
View
12 docs/ClusteredSpider.md
@@ -1,13 +1,13 @@
-ClusteredSpider
+clusteredSpider
===============
# Overview
-ClusteredSpider is designed as a high level object which can spider one or
+clusteredSpider is designed as a high level object which can spider one or
more websites and preserve its state across multiple invocations. It collects
the individual page results from extractor.spider() and maintains a list of
links contained in a specific page as well as a record of pages that are pointing
-at the page spidered. It relies on the NodeJS's dirty module for persistance.
+at the page spidered. It relies on the NodeJS's dirty module for persistence.
It is suitable for creating custom stand-alone spiders. It doesn't support
robots.txt processing at this time.
@@ -18,9 +18,9 @@ Create a spider that will crawl http://example.com.
```javascript
var util = require("util"),
extractor = require('extractor'),
- ClusteredSpider = extractor.ClusteredSpider({ url: "http://example.com" });
+ clusteredSpider = extractor.clusteredSpider({ url: "http://example.com" });
- ClusteredSpider.on("message", function (m) {
+ clusteredSpider.on("message", function (m) {
if (m.error) {
console.error(m.messenger + ": " + m.error);
}
@@ -29,7 +29,7 @@ Create a spider that will crawl http://example.com.
}
});
- ClusteredSpider.on("data", function (m) {
+ clusteredSpider.on("data", function (m) {
if (m.error) {
console.error(m.messenger + ": " + m.error);
}
View
10 docs/FetchPage.md
@@ -1,11 +1,11 @@
-FetchPage
+fetchPage
=========
-revision 0.0.8
+revision 0.1.0
--------------
-# FetchPage(pathname, options, callback)
+# fetchPage(pathname, options, callback)
-This is a method to simplify reading HTML documents from either local disc or via http/https connects. FetchPage() is used by Scrape() to retrieve HTML content if a URL or path is provided.
+This is a method to simplify reading HTML documents from either local disk or via http/https connections. fetchPage() is used by scrape() to retrieve HTML content if a URL or path is provided.
## parameters
@@ -19,7 +19,7 @@ This is a method to simplify reading HTML documents from either local disc or vi
```javascript
var extractor = require('extractor');
-extractor.FetchPage("http://nodejs.org",{ response: true}, function (err, data, env) {
+extractor.fetchPage("http://nodejs.org",{ response: true}, function (err, data, env) {
if (err) {
console.error('ERROR: ' + err);
}
View
8 docs/Scrape.md
@@ -1,9 +1,9 @@
-Scrape
+scrape
======
-revision 0.0.8
+revision 0.1.0
--------------
-# Scrape(document_or_path, map, options, callback)
+# scrape(document_or_path, map, options, callback)
The scrape method is used to extract content from HTML markup. It
has three required parameters - document_or_path, map, and callback. It has two
@@ -91,7 +91,7 @@ font, spacing tags).
return val;
};
- extractor.Scrape("http://nodejs.org", { title: "title", div_h2: "div > h2" }, { response: true,
+ extractor.scrape("http://nodejs.org", { title: "title", div_h2: "div > h2" }, { response: true,
cleaner:clean, transformer: transform}, function (err, data, env) {
if (err) {
console.error('ERROR: ' + err);
View
6 docs/Spider.md
@@ -1,6 +1,6 @@
-Spider
+spider
======
-revision 0.0.8
+revision 0.1.0
--------------
# Overview
@@ -14,7 +14,7 @@ markup spidered.
```javascript
var extractor = require('extractor'), util = require('util');
- extractor.Spider("http://nodejs.org", { response: true }, function (err, data, env) {
+ extractor.spider("http://nodejs.org", { response: true }, function (err, data, env) {
if (err) {
console.error('ERROR: ' + err);
}
View
4 docs/SubmitForm.md
@@ -1,8 +1,8 @@
SubmitForm
============
-revision 0.0.8
+revision 0.0.9
--------------
-# SubmitForm
+# submitForm
Provide a means to submit forms easily and return results. This is buggy and not well developed yet.
View
2  examples/example-1.js
@@ -2,7 +2,7 @@ var extractor = require('../extractor');
var map = {title:'title', intro : '#introduction' };
-extractor.Scrape('http://nodejs.org', map, function(err, data, env) {
+extractor.scrape('http://nodejs.org', map, function(err, data, env) {
if (err) {
console.error("ERROR: " + err);
}
View
2  examples/example-2.js
@@ -2,7 +2,7 @@ var extractor = require('../extractor');
var map = {title:'title', links: 'a' };
-extractor.Scrape('http://nodejs.org', map, function(err, data, env) {
+extractor.scrape('http://nodejs.org', map, function(err, data, env) {
var i;
if (err) {
console.error("ERROR: " + err);
View
2  examples/example-3.js
@@ -106,7 +106,7 @@ if (cluster.isMaster) {
} else { // End of Parent process
process.on('message', function(m) {
console.log('CHILD ' + process.pid + ' spider:', m.url);
- extractor.Spider(m.url, function(err, data, env) {
+ extractor.spider(m.url, function(err, data, env) {
var i, base_path = path.dirname(url), cut_pos = base_path.length, new_url,
urls = [], url = env.pathmame;
View
2  examples/spider.js
@@ -316,7 +316,7 @@ runChild = function (options) {
process.on('message', function (m) {
console.log('CHILD (' + process.pid + ') spidering:', m.url);
- extractor.Spider(m.url, function (err, data, env) {
+ extractor.spider(m.url, function (err, data, env) {
var i, new_url, urls = [], base_parts, base_path, cur_url = env.pathname, res = env.response;
base_parts = url.parse(cur_url);
View
4 examples/spider_test.js
@@ -77,9 +77,9 @@ TESTS.formatRecord = function () {
test_completed += 1;
};
-TESTS.Spider = function () {
+TESTS.spider = function () {
test_expected += 1;
- assert.fail("Spider(), tests not implemented for runMaster(),runChild().");
+ assert.fail("spider(), tests not implemented for runMaster(),runChild().");
test_completed += 0;
};
View
2  extractor.js
@@ -11,7 +11,7 @@
* Released under the New BSD License.
* See: http://opensource.org/licenses/bsd-license.php
*
- * revision 0.0.9d
+ * revision 0.1.0
*/
/*jslint devel: true, node: true, maxerr: 50, indent: 4, vars: true, sloppy: true, stupid: false */
View
2  extractor_test.js
@@ -8,7 +8,7 @@
* Released under the New BSD License.
* See: http://opensource.org/licenses/bsd-license.php
*
- * revision 0.0.9d
+ * revision 0.1.0
*/
var TIMEOUT = 10,
View
10 lib/SubmitForm.js
@@ -1,13 +1,13 @@
/**
- * SubmitForm - send a get/post and pass the results to the callback.
+ * submitForm - send a get/post and pass the results to the callback.
* @param action - the url hosting the form processor (e.g. 'http://example.com/form-processor.php')
* @param form_data - the form field name/values to submit
- * @param options - a set of properties to modify SubmitForm behavior (e.g. options.method defaults to POST,
+ * @param options - a set of properties to modify submitForm behavior (e.g. options.method defaults to POST,
* optional.timeout defaults to 30000 milliseconds).
* @param callback - the callback to use when you get a response from the form submission. Args past
* to the callback function are err, data and environment.
*/
-var SubmitForm = function (action, form_data, options, callback) {
+var submitForm = function (action, form_data, options, callback) {
var defaults = { method:'POST', timeout:30000, protocol: "http:" },
parts, req, timer_id, protocol_method = http;
@@ -95,6 +95,6 @@ var SubmitForm = function (action, form_data, options, callback) {
timer_id = setTimeout(function () {
return callback("ERROR: timeout", null, {options: options});
}, options.timeout);
-}; /* END SubmitForm(action, form_data, options, callback) */
+}; /* END submitForm(action, form_data, options, callback) */
-exports.SubmitForm = SubmitForm;
+exports.submitForm = submitForm;
View
2  lib/SubmitForm_test.js
@@ -1,5 +1,5 @@
// Tests of SubmitForm()
-TESTS.SubmitForm = function () {
+TESTS.submitForm = function () {
// http GET
test_expected += 1;
(function () {
View
2  package.json
@@ -1,6 +1,6 @@
{
"name" : "extractor",
- "version" : "0.0.9d",
+ "version" : "0.1.0",
"description" : "A small utility library for retrieving and scraping web content. It targets scraping content with a unique attribute id, class or tag.",
"main" : "./extractor.js",
"repository" : {
Please sign in to comment.
Something went wrong with that request. Please try again.