Added caching
fhemberger committed Aug 8, 2011
1 parent eaff925 commit 71a84b1
Showing 5 changed files with 54 additions and 27 deletions.
9 changes: 4 additions & 5 deletions README.md
@@ -1,10 +1,9 @@
# Google+ Scraper
### Retrieve data from Google+ profiles with [NodeJS](http://nodejs.org/) and [CoffeeScript](http://jashkenas.github.com/coffee-script/).

-The technique used is called “[web scraping](http://en.wikipedia.org/wiki/Web_scraping)”.
-That means: If Google+ changes anything on their HTML, the script is going to fail and needs to be adjusted.
+The technique used is called “[web scraping](http://en.wikipedia.org/wiki/Web_scraping)”. Instead of scraping the HTML code itself, this script fights its way through `OZ_initData`, a big, mean and ugly inline JavaScript array containing the profile information.

-Note: This script is still beta. Of course you're very welcome to contribute. ;-)
+That means: If Google+ changes anything on their side, the script may fail and needs to be adjusted.
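
The extraction described here boils down to pulling the inline `OZ_initData` assignment out of the fetched profile HTML. A rough sketch of the idea, using the same `request` module the scraper itself depends on; the regex and the user ID are illustrative stand-ins, not the script's actual parser:

    # Illustrative only: locate the inline OZ_initData blob in a profile page.
    request = require 'request'

    userId = '0123456789'  # placeholder Google+ user ID
    request {uri: "https://plus.google.com/#{userId}"}, (err, res, body) ->
      throw err if err
      # OZ_initData is assigned in an inline <script> block; grab its raw source.
      match = body.match /OZ_initData\s*=\s*([\s\S]+?);\s*<\/script>/
      console.log if match then 'found OZ_initData' else 'markup changed, scraper needs adjusting'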


## Installation
@@ -17,7 +16,7 @@ First of all, you need to install NodeJS and npm:
Then you can either install the package via npm:

    $ npm install -g googleplus-scraper
-    $ googleplus-scraper _[portnumber]_
+    $ googleplus-scraper [portnumber]

Or install all dependencies manually and run `app.coffee`:

@@ -39,7 +38,7 @@ At the moment, profile information and posts are supported:
    /_[Google+ User ID]_/posts._[format]_
    where _[format]_ is either _json_, _rss_ or _atom_

-Instead of scraping the HTML code itself, this script fights its way through `OZ_initData`, a big, mean and ugly inline JavaScript array containing the profile information.
+Results for each Google+ user are cached for 60 seconds. This can be adjusted by setting the cache time in milliseconds (app.coffee:7), e.g. `require('./lib/pico.coffee').Pico(30000)` would reduce the cache time to 30 seconds.
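
The note above corresponds to a single line near the top of `app.coffee` (line 7 in this commit). A minimal sketch of that adjustment, assuming the `Pico` export shown in `lib/pico.coffee` further down:

    # app.coffee: shorten the cache from the 60 second default to 30 seconds.
    # Pico's constructor takes the caching time in milliseconds.
    cache = require('./lib/pico.coffee').Pico(30000)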


## License
48 changes: 30 additions & 18 deletions app.coffee
@@ -4,6 +4,7 @@ path = require 'path'
url = require 'url'
util = require 'util'

+cache = require('./lib/pico.coffee').Pico()
googleplus = require './lib/googleplus-scraper.coffee'
view = require './lib/view.coffee'

@@ -32,6 +33,21 @@ process.addListener 'uncaughtException', (err) ->
  views.error.render(httpResponse, {error: err.message}) if httpResponse?


+renderGooglePlusData = (res, [err, data]) ->
+  if err
+    gpResponse =
+      error: err
+  else
+    if res.format is 'rss' or res.format is 'atom'
+      gpResponse =
+        profile: googleplus.getProfile(data)
+        posts: googleplus.getPosts(data)
+    else
+      gpResponse = if res.view is 'posts' then googleplus.getPosts(data) else googleplus.getProfile(data)
+
+  views[res.view][res.format].render(res, gpResponse)
+
+
server = require('http').createServer (req, res) ->
  httpResponse = res
  uri = url.parse(req.url).pathname
@@ -46,24 +62,20 @@ server = require('http').createServer (req, res) ->
  ///)

  if route and route[1]
-    [userId, view, format] = route[1..3]
-    view ||= 'profile'
-    format ||= 'json'
-    format = 'json' if view is 'profile'
-
-    gp = googleplus.GooglePlusScraper userId, (err, data) =>
-      if err
-        gpResponse =
-          error: err
-      else
-        if format is 'rss' or format is 'atom'
-          gpResponse =
-            profile: gp.getProfile(data)
-            posts: gp.getPosts(data)
-        else
-          gpResponse = if view is 'posts' then gp.getPosts(data) else gp.getProfile(data)
-
-      views[view][format].render(res, gpResponse)
+    [res.userId, res.view, res.format] = route[1..3]
+    res.view ||= 'profile'
+    res.format ||= 'json'
+    res.format = 'json' if res.view is 'profile'
+
+    googleplusResponse = cache.get res.userId
+    if googleplusResponse
+      console.log 'cache hit'
+      renderGooglePlusData res, googleplusResponse
+    else
+      console.log 'cache miss'
+      googleplus.scrape res.userId, (err, data) =>
+        renderGooglePlusData res, [err, data]
+        cache.set res.userId, [err, data]

  else
    views.index.render(res)
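
The rewritten request handler is a look-aside cache: check the cache by user ID, render on a hit, otherwise scrape, answer the request, and store the result (including any error) for the next request. A minimal standalone sketch of the same flow; `fetchData` is a hypothetical stand-in for `googleplus.scrape`:

    # Look-aside caching, as in the request handler above.
    cache = require('./lib/pico.coffee').Pico()

    getCached = (key, fetch, callback) ->
      hit = cache.get key
      if hit
        # Cache hit: replay the stored [err, data] pair.
        callback hit...
      else
        # Cache miss: fetch fresh data, answer the caller, then cache it.
        fetch key, (err, data) ->
          callback err, data
          cache.set key, [err, data]

    # Usage, with a hypothetical fetchData(key, callback):
    # getCached userId, fetchData, (err, data) -> ...
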
9 changes: 6 additions & 3 deletions lib/googleplus-scraper.coffee
@@ -19,10 +19,10 @@ Date::toRFCString = ->
"#{pad(@getFullYear(), 4)}-#{pad(@getMonth()+1, 2)}-#{pad(@getDate(), 2)}T#{pad(@getHours(), 2)}:#{pad(@getMinutes(), 2)}:#{pad(@getSeconds(), 2)}Z"


-class exports.GooglePlusScraper
+class GooglePlusScraper
  gpBaseURL = 'https://plus.google.com/'

-  constructor: (@user, callback) ->
+  scrape: (@user, callback) ->
    return new GooglePlusScraper(@user, callback) if !(this instanceof GooglePlusScraper)

    request {uri: "#{gpBaseURL}#{@user}"}, (err, res, body) ->
@@ -152,4 +152,7 @@ class exports.GooglePlusScraper
sharedBy: post[25]
latestComments: post[7]
)
-    return posts
+    return posts
+
+
+module.exports = new GooglePlusScraper()
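
With `module.exports = new GooglePlusScraper()`, callers now share a single instance and call `scrape` directly instead of constructing a scraper per request, which is what lets `app.coffee` cache results between calls. A minimal usage sketch; the user ID is a placeholder:

    # Consume the scraper as a module-level singleton.
    googleplus = require './lib/googleplus-scraper.coffee'

    googleplus.scrape '0123456789', (err, data) ->  # placeholder user ID
      return console.error err if err
      console.log googleplus.getProfile(data)  # profile fields
      console.log googleplus.getPosts(data)    # array of posts
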
13 changes: 13 additions & 0 deletions lib/pico.coffee
@@ -0,0 +1,13 @@
+# pico: plain in-memory cache object
+
+class exports.Pico
+  constructor: (@cachingTime = 60 * 1000) ->
+    # Allow calling Pico() without `new`; pass the caching time through.
+    return new Pico(@cachingTime) unless this instanceof Pico
+    @cache = {}
+
+  set: (key, value) ->
+    @cache[key] = value
+    # Expire the entry after the caching time has elapsed.
+    setTimeout (=> @cache[key] = null), @cachingTime
+
+  get: (key) ->
+    @cache[key]
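
Pico keeps entries in a plain in-memory object and expires each one with its own timer; there is no size limit or eviction beyond the timeout. A minimal usage sketch with a five-second cache:

    # A five-second cache: set a value, read it back, watch it expire.
    cache = require('./lib/pico.coffee').Pico(5000)

    cache.set 'greeting', 'hello'
    console.log cache.get('greeting')  # -> 'hello'

    # After the caching time elapses, the entry is nulled out.
    setTimeout (-> console.log cache.get('greeting')), 6000  # -> null
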
2 changes: 1 addition & 1 deletion package.json
@@ -2,7 +2,7 @@
"name": "googleplus-scraper",
"description": "Retrieve profile infos and posts from Google+ users",
"keywords": "google+, google plus",
-"version": "0.0.1",
+"version": "0.0.2",
"author": "Frederic Hemberger (http://frederic-hemberger.de/)",
"homepage": "https://github.com/fhemberger/googleplus-scraper/",
"bin" : "./bin/googleplus-scraper",
