Permalink
Browse files

initial commit

  • Loading branch information...
0 parents commit 51e7c10fd883cba8b7eddb295508994b7ee2bb48 @sentientwaffle committed Mar 19, 2012
Showing with 2,218 additions and 0 deletions.
  1. +1 −0 .gitignore
  2. 0 .npmignore
  3. +20 −0 LICENSE
  4. +47 −0 README.md
  5. +295 −0 index.js
  6. +38 −0 package.json
  7. +63 −0 test/fixtures/atom.xml
  8. +99 −0 test/fixtures/google-news.rss
  9. +457 −0 test/fixtures/rss.xml
  10. +971 −0 test/fixtures/techcrunch.rss
  11. +227 −0 test/index.test.js
@@ -0,0 +1 @@
+node_modules/
No changes.
20 LICENSE
@@ -0,0 +1,20 @@
+Copyright (c) 2012 [DJG](https://github.com/sentientwaffle)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files
+(the "Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,47 @@
+# Feed-Read
+[Node.js](http://nodejs.org/) module for parsing RSS and ATOM feeds into
+a common article object.
+
+# Installation
+
+ $ npm install feed-read
+
+# Usage
+
+ var feed = require("feed-read");
+
+## `feed(url, callback)`
+Fetch a feed.
+
+ feed("http://craphound.com/?feed=rss2", function(err, articles) {
+ if (err) throw err;
+ // Each article has the following properties:
+ //
+ // * "title" - The article title (String).
+ // * "author" - The author's name (String).
+ // * "link" - The original article link (String).
+ // * "content" - The HTML content of the article (String).
+ // * "published" - The date that the article was published (Date).
+ // * "feed" - {name, source, link}
+ //
+ });
+
+## `feed.rss(rss_string, callback)`
+Parse a string of XML as RSS.
+
+The callback receives `(err, articles)`.
+
+## `feed.atom(atom_string, callback)`
+Parse a string of XML as ATOM.
+
+The callback receives `(err, articles)`.
+
+## `feed.identify(xml_string)` // => "atom", "rss", or false
+Identify what type of feed the XML represents.
+
+Returns `false` when it is neither RSS nor ATOM.
+
+
+# License
+See LICENSE.
+
295 index.js
@@ -0,0 +1,295 @@
+var request = require('request')
+ , sax = require('sax')
+ , _ = require('underscore');
+
+
// Public: Fetch the articles from one or more RSS or ATOM feeds.
//
// feed_url - The String feed url, or an Array of urls. An Array is fetched
//            one url at a time, in order; falsy entries are skipped.
// callback - Receives `(err, articles)`. For an Array, `articles` is the
//            concatenation of the articles of every feed, and the first
//            feed that fails ends the run with `callback(err)`.
//            Each article has properties:
//
//   * "title"
//   * "author"
//   * "link"
//   * "content"
//   * "published"
//   * "feed" - {name, source, link}
//
// Returns nothing.
var FeedRead = module.exports = function(feed_url, callback) {
  // Array.isArray also recognises arrays created in another context,
  // which `instanceof Array` does not.
  if (!Array.isArray(feed_url)) {
    FeedRead.get(feed_url, callback);
    return;
  }

  var feed_urls = feed_url;
  var articles = [];

  var next = function(i) {
    // Terminate on the index rather than on the value: a falsy entry in the
    // middle of the list must not silently drop the feeds that follow it.
    if (i >= feed_urls.length) return callback(null, articles);
    if (!feed_urls[i]) return next(i + 1);

    FeedRead.get(feed_urls[i], function(err, fetched) {
      if (err) return callback(err);
      articles = articles.concat(fetched);
      next(i + 1);
    });
  };
  next(0);
};
+
+
// Public: Check if the XML is RSS, ATOM, or neither.
//
// xml - A String of XML.
//
// The check is a case-insensitive search for an opening `<rss` or `<feed`
// tag; RSS is tested first. The tag name may be followed by any whitespace
// (space, tab, newline) or directly by `>`, so attribute-less roots such as
// `<feed>` are recognised too.
//
// Returns "rss", "atom", or false when it is neither.
FeedRead.identify = function(xml) {
  if (/<rss[\s>]/i.test(xml)) return "rss";
  if (/<feed[\s>]/i.test(xml)) return "atom";
  return false;
};
+
+
+
// Internal: Get a single feed.
//
// Requests `feed_url`, identifies the response body with
// `FeedRead.identify`, and hands it to `FeedRead.atom` or `FeedRead.rss`
// with `feed_url` as the source.
//
// feed_url - String url.
// callback - Receives `(err, articles)`. `err` is the request error, or an
//            Error whose message includes the first 30 characters of the
//            body when the body is neither RSS nor ATOM.
//
// Returns nothing.
FeedRead.get = function(feed_url, callback) {
  request(feed_url, function(err, res, body) {
    if (err) return callback(err);

    var type = FeedRead.identify(body);
    if (type === "atom") {
      FeedRead.atom(body, feed_url, callback);
    } else if (type === "rss") {
      FeedRead.rss(body, feed_url, callback);
    } else {
      // Error takes a single message argument, so the excerpt has to be
      // concatenated into it. The body may be missing altogether.
      var excerpt = String(body == null ? "" : body).substr(0, 30);
      callback(new Error("Body is not RSS or ATOM: " + excerpt + "..."));
    }
  });
};
+
+
+
// Public: Parse the articles from some ATOM.
//
// xml      - A XML String.
// source   - (optional) Stored as `feed.source` on every article.
// callback - Receives `(err, articles)`. It is called at most once; an
//            exception raised while parsing is passed as `err`.
//
// Returns nothing; the articles are delivered through `callback`.
FeedRead.atom = function(xml, source, callback) {
  // `source` is optional: with two arguments the second one is the callback.
  if (!callback) return FeedRead.atom(xml, "", source);

  var parser = new FeedParser();
  var articles = [];
  // Info about the feed itself, not an article.
  var meta = {source: source};
  // The entry currently being parsed, or null/undefined outside an entry.
  var article;
  // The author for when no author is specified for the post.
  var default_author;
  var finished = false;

  // Deliver the result exactly once.
  function finish(err, result) {
    if (finished) return;
    finished = true;
    callback(err, result);
  }

  // Pick the entry's article link: the first <link> with no `rel` or with
  // rel="alternate", falling back to the first <link> of any kind, and to
  // "" when the entry has no <link> at all.
  function entry_link(entry) {
    var children = entry.children || [];
    var fallback = null;
    for (var i = 0; i < children.length; i++) {
      var node = children[i];
      if (!node || node.name !== "link") continue;
      var attrs = node.attributes || {};
      if (!attrs.rel || attrs.rel === "alternate") return attrs.href;
      if (!fallback) fallback = attrs;
    }
    return fallback ? fallback.href : "";
  }

  parser.onopentag = function(tag) {
    if (tag.name === "entry") article = tag;
  };

  parser.onclosetag = function(tagname, current_tag) {
    if (tagname === "entry") {
      articles.push(article);
      article = null;
    } else if (tagname === "author" && !article) {
      // An <author> outside of any entry is the feed-wide default.
      default_author = child_data(current_tag, "name");
    } else if (tagname === "link" && current_tag.attributes.rel !== "self") {
      // The first non-"self" link seen wins.
      if (!meta.link) meta.link = current_tag.attributes.href;
    } else if (tagname === "title" && current_tag.parent &&
               !current_tag.parent.parent) {
      // A <title> whose parent is the root element is the feed's title.
      meta.name = current_tag.children[0];
    }
  };

  parser.onend = function() {
    finish(null, articles.map(function(art) {
      var author = child_by_name(art, "author");
      var obj = {
        title: child_data(art, "title"),
        // Scrubbed like the RSS content so both formats agree.
        content: scrub_html(child_data(art, "content")),
        published: child_data(art, "published") || child_data(art, "updated"),
        author: (author && child_data(author, "name")) || default_author,
        link: entry_link(art),
        feed: meta
      };
      if (obj.published) obj.published = new Date(obj.published);
      return obj;
    }));
  };

  try {
    parser.write(xml);
  } catch (err) {
    // NOTE(review): sax is expected to rethrow a recorded parse error when
    // the parser is closed - confirm for the pinned sax version.
    // An exception coming out of the caller's own callback is not ours to
    // handle, so it is rethrown untouched.
    if (finished) throw err;
    finish(err);
  }
};
+
+
// Public: Parse the articles from some RSS.
//
// xml      - A XML String.
// source   - (optional) Stored as `feed.source` on every article.
// callback - Receives `(err, articles)`. It is called at most once; an
//            exception raised while parsing is passed as `err`.
//
// Returns nothing; the articles are delivered through `callback`.
FeedRead.rss = function(xml, source, callback) {
  // `source` is optional: with two arguments the second one is the callback.
  if (!callback) return FeedRead.rss(xml, "", source);

  var parser = new FeedParser();
  var articles = [];
  // Info about the feed itself, not an article.
  var meta = {source: source};
  // The item currently being parsed.
  var article;
  var finished = false;

  // Deliver the result exactly once.
  function finish(err, result) {
    if (finished) return;
    finished = true;
    callback(err, result);
  }

  parser.onopentag = function(tag) {
    if (tag.name === "item") article = tag;
  };

  parser.onclosetag = function(tagname, current_tag) {
    if (tagname === "item") {
      articles.push(article);
      article = null;
    } else if (tagname === "channel") {
      // The channel's own <link> and <title> describe the feed.
      if (!meta.link) meta.link = child_data(current_tag, "link");
      meta.name = child_data(current_tag, "title");
    }
  };

  parser.onend = function() {
    finish(null, articles.map(function(art) {
      var obj = {
        title: child_data(art, "title"),
        // Prefer the full <content:encoded> body over the <description>.
        content: scrub_html(child_data(art, "content:encoded")) ||
                 scrub_html(child_data(art, "description")),
        published: child_data(art, "pubDate"),
        author: child_data(art, "author") || child_data(art, "dc:creator"),
        link: child_data(art, "link"),
        feed: meta
      };
      if (obj.published) obj.published = new Date(obj.published);
      return obj;
    }));
  };

  try {
    parser.write(xml);
  } catch (err) {
    // NOTE(review): sax is expected to rethrow a recorded parse error when
    // the parser is closed - confirm for the pinned sax version.
    // An exception coming out of the caller's own callback is not ours to
    // handle, so it is rethrown untouched.
    if (finished) throw err;
    finish(err);
  }
};
+
+
// Internal: Wraps a sax parser and links the tags it reports into a tree:
// every open tag gets a `children` Array and, while it is open, a `parent`
// reference. Text and CDATA are appended to the children of the tag that is
// currently open.
//
// Methods to override (each defaults to a no-op):
//
//   * onopentag(tag)
//   * onclosetag(tagname, current_tag)
//   * onend()
//
var FeedParser = (function() {
  function noop() {}

  // Internal: Create the underlying sax parser and route its events to
  // this object. Takes no arguments.
  function FeedParser() {
    this.current_tag = null;
    var _this = this;
    var parser = this.parser = sax.parser(true, {trim: true, normalize: true});

    parser.onopentag = function(tag) { _this.open(tag); };
    parser.onclosetag = function(tag) { _this.close(tag); };

    // Text and CDATA are handled the same way.
    parser.ontext = function(text) { _this.ontext(text); };
    parser.oncdata = function(text) { _this.ontext(text); };
    parser.onend = function() { _this.onend(); };

    parser.onerror = console.error;
  }

  // Default hooks, so that a hook the user did not override is skipped
  // instead of failing with "is not a function".
  FeedParser.prototype.onopentag = noop;
  FeedParser.prototype.onclosetag = noop;
  FeedParser.prototype.onend = noop;

  // Public: Parse the XML. Writes the whole string and closes the parser.
  FeedParser.prototype.write = function(xml) {
    this.parser.write(xml).close();
  };

  // Internal: Open a tag. Attaches it to the tag that is currently open,
  // makes it the current tag, then calls the `onopentag` hook.
  FeedParser.prototype.open = function(tag) {
    tag.parent = this.current_tag;
    tag.children = [];
    if (tag.parent) tag.parent.children.push(tag);
    this.current_tag = tag;
    this.onopentag(tag);
  };

  // Internal: Close a tag. The `onclosetag` hook runs first, while the tag
  // still has its `parent` link; afterwards the link is removed and the
  // parent becomes the current tag again. A tag without a parent stays the
  // current tag.
  FeedParser.prototype.close = function(tagname) {
    this.onclosetag(tagname, this.current_tag);
    var closing = this.current_tag;
    if (closing && closing.parent) {
      var parent = closing.parent;
      delete closing.parent;
      this.current_tag = parent;
    }
  };

  // Internal: Add the text as a child of the current tag. Text seen while
  // no tag is open is dropped.
  FeedParser.prototype.ontext = function(text) {
    if (this.current_tag) {
      this.current_tag.children.push(text);
    }
  };

  return FeedParser;
})();
+
+
// Internal: Remove <script> elements from the HTML.
//
// html - An HTML String. null and undefined are treated as "".
//
// Each <script ...> ... </script> pair is removed on its own, including
// scripts that span several lines; text between two separate scripts is
// kept. This is a regex-based best effort, not a sanitizer.
//
// TODO: Do actual HTML parsing!!
//
// Returns the HTML String without the script elements.
function scrub_html(html) {
  if (html == null) return "";
  // [\s\S] matches newlines as well; the lazy quantifier stops at the first
  // closing tag instead of swallowing everything up to the last one.
  return String(html).replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, "");
}
+
+
// Internal: Find the first node from the parent node's children that has
// the given name.
//
// parent - A node Object whose `children` Array holds child nodes and
//          text Strings. May be null or lack `children`.
// name   - String node name.
//
// Returns a node Object or null.
function child_by_name(parent, name) {
  var children = (parent && parent.children) || [];
  for (var i = 0; i < children.length; i++) {
    var child = children[i];
    // Text children are plain Strings and can never match.
    if (child && typeof child === "object" && child.name === name) {
      return child;
    }
  }
  return null;
}
+
// Internal: Get the first child of `parent` with `name`,
// and return the text of its children.
//
// parent - A node Object with a `children` Array.
// name   - String node name.
//
// String children are concatenated in order. A child that is itself a node
// contributes the text found inside it, rather than being stringified as
// "[object Object]".
//
// Returns a String; "" when there is no such child or it holds no text.
function child_data(parent, name) {
  // Concatenate every String found in `children`, descending into nodes.
  function text_of(children) {
    var text = "";
    for (var i = 0; i < children.length; i++) {
      var child = children[i];
      if (typeof child === "string") {
        text += child;
      } else if (child && child.children) {
        text += text_of(child.children);
      }
    }
    return text;
  }

  var node = child_by_name(parent, name);
  if (!node || !node.children) return "";
  return text_of(node.children);
}
Oops, something went wrong.

0 comments on commit 51e7c10

Please sign in to comment.