Permalink
Browse files

Initial commit of indexer which uses https://github.com/cgiffard/node…

  • Loading branch information...
sirkitree committed Mar 5, 2012
0 parents commit 98e541dd9d49d51cd582e5d3407d38d0ae77bfae
Showing with 30 additions and 0 deletions.
  1. +30 −0 indexer.js
@@ -0,0 +1,30 @@
+var Crawler = require("simplecrawler").Crawler;
+// var myCrawler = new Crawler("jeradbitner.com");
+var myCrawler = new Crawler("engineering.mit.edu");
+myCrawler.domain = "engineering.mit.edu";
+myCrawler.supportedMimeTypes = [
+ /^text\//i
+];
+myCrawler.scanSubdomains = false;
+myCrawler.ignoreWWWDomain = true;
+// myCrawler.discoverResources = false;
+
+var items = new Array;
+
+myCrawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
+ // Only want html pages &&
+ // only want stuff on this domain. (Seems to pull in other things sometimes
+ // even though simplcrawler claims it should not).
+ if (queueItem.stateData.contentType.indexOf("text/html") != -1 &&
+ (queueItem.domain == myCrawler.domain)) {
+ items.push(queueItem.path);
+ console.log(queueItem.url);
+ }
+});
+
+myCrawler.start();
+
+myCrawler.on("complete", function() {
+ console.log(items);
+ console.log(items.length);
+});

0 comments on commit 98e541d

Please sign in to comment.