Added caching
fhemberger committed Aug 8, 2011
1 parent eaff925 commit 71a84b1
Showing 5 changed files with 54 additions and 27 deletions.
9 changes: 4 additions & 5 deletions README.md
@@ -1,10 +1,9 @@
# Google+ Scraper
### Retrieve data from Google+ profiles with [NodeJS](http://nodejs.org/) and [CoffeeScript](http://jashkenas.github.com/coffee-script/).

-The technique used is called “[web scraping](http://en.wikipedia.org/wiki/Web_scraping)”.
-That means: If Google+ changes anything on their HTML, the script is going to fail and needs to be adjusted.
+The technique used is called “[web scraping](http://en.wikipedia.org/wiki/Web_scraping)”. Instead of scraping the HTML code itself, this script fights its way through `OZ_initData`, a big, mean and ugly inline JavaScript array containing the profile information.

-Note: This script is still beta. Of course you're very welcome to contribute. ;-)
+That means: If Google+ changes anything on their side, the script may fail and needs to be adjusted.
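
The extraction described here boils down to pulling the inline `OZ_initData` assignment out of the fetched profile HTML. A rough sketch of the idea, using the same `request` module the scraper itself depends on; the regex and the user ID are illustrative stand-ins, not the script's actual parser:

    # Illustrative only: locate the inline OZ_initData blob in a profile page.
    request = require 'request'

    userId = '0123456789'  # placeholder Google+ user ID
    request {uri: "https://plus.google.com/#{userId}"}, (err, res, body) ->
      throw err if err
      # OZ_initData is assigned in an inline <script> block; grab its raw source.
      match = body.match /OZ_initData\s*=\s*([\s\S]+?);\s*<\/script>/
      console.log if match then 'found OZ_initData' else 'markup changed, scraper needs adjusting'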


## Installation
@@ -17,7 +16,7 @@ First of all, you need to install NodeJS and npm:
Then you can either install the package via npm:

    $ npm install -g googleplus-scraper
-    $ googleplus-scraper _[portnumber]_
+    $ googleplus-scraper [portnumber]

Or install all dependencies manually and run `app.coffee`:

@@ -39,7 +38,7 @@ At the moment, profile information and posts are supported:
    /_[Google+ User ID]_/posts._[format]_
    where _[format]_ is either _json_, _rss_ or _atom_

-Instead of scraping the HTML code itself, this script fights its way through `OZ_initData`, a big, mean and ugly inline JavaScript array containing the profile information.
+Results for each Google+ user are cached for 60 seconds. This can be adjusted by setting the cache time in milliseconds (app.coffee:7), e.g. `require('./lib/pico.coffee').Pico(30000)` would reduce the cache time to 30 seconds.
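
The note above corresponds to a single line near the top of `app.coffee` (line 7 in this commit). A minimal sketch of that adjustment, assuming the `Pico` export shown in `lib/pico.coffee` further down:

    # app.coffee: shorten the cache from the 60 second default to 30 seconds.
    # Pico's constructor takes the caching time in milliseconds.
    cache = require('./lib/pico.coffee').Pico(30000)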


## License
48 changes: 30 additions & 18 deletions app.coffee
@@ -4,6 +4,7 @@ path = require 'path'
url = require 'url'
util = require 'util'

+cache = require('./lib/pico.coffee').Pico()
googleplus = require './lib/googleplus-scraper.coffee'
view = require './lib/view.coffee'

@@ -32,6 +33,21 @@ process.addListener 'uncaughtException', (err) ->
  views.error.render(httpResponse, {error: err.message}) if httpResponse?


+renderGooglePlusData = (res, [err, data]) ->
+  if err
+    gpResponse =
+      error: err
+  else
+    if res.format is 'rss' or res.format is 'atom'
+      gpResponse =
+        profile: googleplus.getProfile(data)
+        posts: googleplus.getPosts(data)
+    else
+      gpResponse = if res.view is 'posts' then googleplus.getPosts(data) else googleplus.getProfile(data)
+
+  views[res.view][res.format].render(res, gpResponse)
+
+
server = require('http').createServer (req, res) ->
  httpResponse = res
  uri = url.parse(req.url).pathname
@@ -46,24 +62,20 @@ server = require('http').createServer (req, res) ->
  ///)

  if route and route[1]
-    [userId, view, format] = route[1..3]
-    view ||= 'profile'
-    format ||= 'json'
-    format = 'json' if view is 'profile'
-
-    gp = googleplus.GooglePlusScraper userId, (err, data) =>
-      if err
-        gpResponse =
-          error: err
-      else
-        if format is 'rss' or format is 'atom'
-          gpResponse =
-            profile: gp.getProfile(data)
-            posts: gp.getPosts(data)
-        else
-          gpResponse = if view is 'posts' then gp.getPosts(data) else gp.getProfile(data)
-
-      views[view][format].render(res, gpResponse)
+    [res.userId, res.view, res.format] = route[1..3]
+    res.view ||= 'profile'
+    res.format ||= 'json'
+    res.format = 'json' if res.view is 'profile'
+
+    googleplusResponse = cache.get res.userId
+    if googleplusResponse
+      console.log 'cache hit'
+      renderGooglePlusData res, googleplusResponse
+    else
+      console.log 'cache miss'
+      googleplus.scrape res.userId, (err, data) =>
+        renderGooglePlusData res, [err, data]
+        cache.set res.userId, [err, data]

  else
    views.index.render(res)
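
The rewritten request handler is a look-aside cache: check the cache by user ID, render on a hit, otherwise scrape, answer the request, and store the result (including any error) for the next request. A minimal standalone sketch of the same flow; `fetchData` is a hypothetical stand-in for `googleplus.scrape`:

    # Look-aside caching, as in the request handler above.
    cache = require('./lib/pico.coffee').Pico()

    getCached = (key, fetch, callback) ->
      hit = cache.get key
      if hit
        # Cache hit: replay the stored [err, data] pair.
        callback hit...
      else
        # Cache miss: fetch fresh data, answer the caller, then cache it.
        fetch key, (err, data) ->
          callback err, data
          cache.set key, [err, data]

    # Usage, with a hypothetical fetchData(key, callback):
    # getCached userId, fetchData, (err, data) -> ...
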
9 changes: 6 additions & 3 deletions lib/googleplus-scraper.coffee
@@ -19,10 +19,10 @@ Date::toRFCString = ->
"#{pad(@getFullYear(), 4)}-#{pad(@getMonth()+1, 2)}-#{pad(@getDate(), 2)}T#{pad(@getHours(), 2)}:#{pad(@getMinutes(), 2)}:#{pad(@getSeconds(), 2)}Z"


-class exports.GooglePlusScraper
+class GooglePlusScraper
  gpBaseURL = 'https://plus.google.com/'

-  constructor: (@user, callback) ->
+  scrape: (@user, callback) ->
    return new GooglePlusScraper(@user, callback) if !(this instanceof GooglePlusScraper)

    request {uri: "#{gpBaseURL}#{@user}"}, (err, res, body) ->
@@ -152,4 +152,7 @@ class exports.GooglePlusScraper
sharedBy: post[25]
latestComments: post[7]
)
-    return posts
+    return posts
+
+
+module.exports = new GooglePlusScraper()
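
With `module.exports = new GooglePlusScraper()`, callers now share a single instance and call `scrape` directly instead of constructing a scraper per request, which is what lets `app.coffee` cache results between calls. A minimal usage sketch; the user ID is a placeholder:

    # Consume the scraper as a module-level singleton.
    googleplus = require './lib/googleplus-scraper.coffee'

    googleplus.scrape '0123456789', (err, data) ->  # placeholder user ID
      return console.error err if err
      console.log googleplus.getProfile(data)  # profile fields
      console.log googleplus.getPosts(data)    # array of posts
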
13 changes: 13 additions & 0 deletions lib/pico.coffee
@@ -0,0 +1,13 @@
+# pico: plain in-memory cache object
+
+class exports.Pico
+  constructor: (@cachingTime = 60 * 1000) ->
+    # Allow calling Pico() without `new`; pass the caching time through.
+    return new Pico(@cachingTime) unless this instanceof Pico
+    @cache = {}
+
+  set: (key, value) ->
+    @cache[key] = value
+    # Expire the entry after the caching time has elapsed.
+    setTimeout (=> @cache[key] = null), @cachingTime
+
+  get: (key) ->
+    @cache[key]
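
Pico keeps entries in a plain in-memory object and expires each one with its own timer; there is no size limit or eviction beyond the timeout. A minimal usage sketch with a five-second cache:

    # A five-second cache: set a value, read it back, watch it expire.
    cache = require('./lib/pico.coffee').Pico(5000)

    cache.set 'greeting', 'hello'
    console.log cache.get('greeting')  # -> 'hello'

    # After the caching time elapses, the entry is nulled out.
    setTimeout (-> console.log cache.get('greeting')), 6000  # -> null
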
2 changes: 1 addition & 1 deletion package.json
@@ -2,7 +2,7 @@
"name": "googleplus-scraper",
"description": "Retrieve profile infos and posts from Google+ users",
"keywords": "google+, google plus",
-"version": "0.0.1",
+"version": "0.0.2",
"author": "Frederic Hemberger (http://frederic-hemberger.de/)",
"homepage": "https://github.com/fhemberger/googleplus-scraper/",
"bin" : "./bin/googleplus-scraper",
