Skip to content

Commit

Permalink
Merge pull request #21 from git2samus/parser
Browse files Browse the repository at this point in the history
modular architecture for feed parsers by samus
  • Loading branch information
pote committed May 21, 2012
2 parents 85db784 + f84c033 commit 45d74df
Show file tree
Hide file tree
Showing 7 changed files with 242 additions and 146 deletions.
17 changes: 4 additions & 13 deletions bin/planet
Expand Up @@ -50,19 +50,10 @@ command :generate do |c|
c.action do |global_options,options,args| c.action do |global_options,options,args|
conf = YAML.load_file('planet.yml') conf = YAML.load_file('planet.yml')


@planet = Planet.new(config: conf.fetch('planet', {})) @planet = Planet.new(

config: conf.fetch('planet', {}),
conf['blogs'].each do |blog| blogs: conf.fetch('blogs', [])
@planet.blogs << Planet::Blog.new( )
feed: blog['feed'],
url: blog['url'],
author: blog['author'],
image: blog['image'],
posts: [],
planet: @planet,
twitter: blog['twitter']
)
end


@planet.aggregate @planet.aggregate


Expand Down
140 changes: 12 additions & 128 deletions lib/planet.rb
@@ -1,14 +1,23 @@
require 'feedzirra'
require 'mustache'
require 'planet/version' require 'planet/version'
require 'planet/blog'


class Planet class Planet


attr_accessor :config, :blogs attr_accessor :config, :blogs


def initialize(attributes = {}) def initialize(attributes = {})
self.config = attributes[:config] self.config = attributes[:config]
self.blogs = attributes.fetch(:blogs, []) self.blogs = attributes.fetch(:blogs, []).map do |blog|
Blog.new(
feed: blog['feed'],
url: blog['url'],
author: blog['author'],
image: blog['image'],
posts: [],
planet: self,
twitter: blog['twitter']
)
end
end end


def posts def posts
Expand All @@ -33,129 +42,4 @@ def write_posts
File.open(file_name + '.markdown', "w+") { |f| f.write(post.to_s) } File.open(file_name + '.markdown', "w+") { |f| f.write(post.to_s) }
end end
end end

# A single post pulled from a blog feed, able to render itself as a
# markdown document wrapped in the planet's header/footer templates.
class Post

  attr_accessor :title, :content, :date, :url, :blog

  # attributes - Hash with the optional keys :title, :content, :date,
  #              :url and :blog. Missing keys leave the attribute nil.
  def initialize(attributes = {})
    self.title = attributes[:title]
    self.content = attributes[:content]
    self.date = attributes[:date]
    self.url = attributes[:url]
    self.blog = attributes[:blog]
  end

  # Full document: rendered header, raw content, rendered footer.
  def to_s
    "#{ header }#{ content }#{ footer }"
  end

  # Values handed to Mustache when rendering the templates.
  def to_hash
    {
      post_content: self.content,
      post_title: self.title,
      post_date: self.date,
      image_url: self.blog.image,
      author: self.blog.author,
      blog_url: self.blog.url,
      blog_name: self.blog.name,
      post_url: self.url,
      twitter: self.blog.twitter,
      twitter_url: "http://twitter.com/#{ self.blog.twitter }"
    }
  end

  # Renders 'header.md' from the templates directory.
  def header
    ## TODO: We need categories/tags
    render_template('header.md')
  end

  # Renders 'author.html' from the templates directory.
  def footer
    render_template('author.html')
  end

  # Base name (no extension) for the generated file: the date as
  # YYYY-MM-DD followed by the slugged title. A missing date or title is
  # simply left out, so no stray leading/trailing dash is produced.
  def file_name
    name_date = date ? date.strftime('%Y-%m-%d') : nil
    name_title = title.to_s.downcase.scan(/\w+/).join('-')
    name_title = nil if name_title.empty?

    [name_date, name_title].compact.join('-')
  end

  private

  # Reads template_name from the configured 'templates_directory'
  # (default '_layouts/') and renders it with this post's values.
  # File.join is used so the configured directory works with or without
  # a trailing slash.
  def render_template(template_name)
    directory = self.blog.planet.config.fetch('templates_directory', '_layouts/')
    file_contents = File.read(File.join(directory, template_name))

    Mustache.render(file_contents, self.to_hash)
  end

end

# One aggregated blog: its metadata plus the posts read from its feed.
class Blog

  # Group 1: an <img> tag up to and including the opening quote of its
  # src attribute. Group 2: the src value itself.
  IMG_SRC = /(<img\s+(?:[^>]*?\s)?src=")([^"]+)/i

  # A reference that already carries a scheme ("http:", "data:", ...) or
  # is protocol-relative ("//host/...") and so must not be prefixed.
  ABSOLUTE_URL = %r{\A(?:[a-z][a-z0-9+.\-]*:|//)}i

  attr_accessor :url, :feed, :name, :author, :image, :twitter, :posts, :planet

  # attributes - Hash with the optional keys :url, :feed, :name, :author,
  #              :image, :twitter, :posts (default []) and :planet.
  def initialize(attributes = {})
    self.url = attributes[:url]
    self.feed = attributes[:feed]
    self.name = attributes[:name]
    self.author = attributes[:author]
    self.image = attributes[:image]
    self.twitter = attributes[:twitter]
    self.posts = attributes.fetch(:posts, [])
    self.planet = attributes[:planet]
  end

  # Downloads and parses the feed, fills in name/url when they were not
  # configured, and appends one Post per feed entry to #posts.
  # Aborts the process when no blog url can be determined or an entry has
  # neither content nor summary.
  def fetch
    parsed = Feedzirra::Feed.fetch_and_parse(self.feed)

    self.name ||= parsed.title || 'the source'
    self.url ||= parsed.url

    if self.url.nil?
      abort "#{ self.author }'s blog does not have a url field on it's feed, you will need to specify it on planet.yml"
    end

    parsed.entries.each do |entry|
      ## TODO: I should probably consider using feed 'adapters' for specific
      ## blog engine feeds that don't have their stuff on the standard fields.
      ## Example: blogspot has the content on "summary" instead of content.
      body = entry.content.nil? ? entry.summary : entry.content
      abort "=> No content found on entry" if body.nil?

      post = Post.new(
        title: entry.title.nil? ? self.name : entry.title.sanitize,
        content: self.sanitize_images(body.strip),
        date: entry.published,
        # Entry links that are already absolute are kept as they are;
        # only relative ones get the blog url in front.
        url: absolutize(entry.url),
        blog: self
      )
      self.posts << post

      puts "=> Found post titled #{ post.title } - by #{ post.blog.author }"
    end
  end

  # Rewrites, in place, every <img> whose src is relative so that it
  # points at this blog's url. Absolute and protocol-relative sources are
  # left untouched. Each tag is rewritten exactly once, so a source that
  # appears several times is not prefixed repeatedly.
  #
  # html - String of markup (modified in place).
  #
  # Returns the same String.
  def sanitize_images(html)
    html.gsub!(IMG_SRC) do
      "#{ Regexp.last_match(1) }#{ absolutize(Regexp.last_match(2)) }"
    end

    html
  end

  private

  # Returns path unchanged when it is already absolute, otherwise the
  # blog url joined to it with exactly one slash. A blank path yields the
  # blog url itself.
  def absolutize(path)
    path = path.to_s
    return path if path =~ ABSOLUTE_URL

    base = self.url.to_s.chomp('/')
    return base if path.empty?

    path.start_with?('/') ? "#{ base }#{ path }" : "#{ base }/#{ path }"
  end
end
end end
79 changes: 79 additions & 0 deletions lib/planet/blog.rb
@@ -0,0 +1,79 @@
require 'planet/post'
require 'planet/parsers'

class Planet
  # One aggregated blog: its metadata plus the posts read from its feed.
  # The feed is read through a parser chosen either by the explicit :type
  # or by the feed's domain.
  class Blog

    # Group 1: an <img> tag up to and including the opening quote of its
    # src attribute. Group 2: the src value itself.
    IMG_SRC = /(<img\s+(?:[^>]*?\s)?src=")([^"]+)/i

    # A reference that already carries a scheme ("http:", "data:", ...) or
    # is protocol-relative ("//host/...") and so must not be prefixed.
    ABSOLUTE_URL = %r{\A(?:[a-z][a-z0-9+.\-]*:|//)}i

    attr_accessor :url, :feed, :type, :name, :author, :image, :twitter, :posts, :planet

    # attributes - Hash with the optional keys :url, :feed, :type, :name,
    #              :author, :image, :twitter, :posts (default []) and
    #              :planet.
    def initialize(attributes = {})
      self.url = attributes[:url]
      self.feed = attributes[:feed]
      self.type = attributes[:type]
      self.name = attributes[:name]
      self.author = attributes[:author]
      self.image = attributes[:image]
      self.twitter = attributes[:twitter]
      self.posts = attributes.fetch(:posts, [])
      self.planet = attributes[:planet]

      # get parser-manager instance
      @parsers = Parsers.new
    end

    # Downloads and parses the feed, fills in name/url when they were not
    # configured, and appends one Post per feed entry to #posts.
    # Aborts the process when no blog url can be determined or an entry
    # has neither content nor summary.
    def fetch
      # given parser can be set arbitrarily with :type or inferred from the domain
      parser = self.type ? @parsers.get_parser(self.type) : @parsers.get_parser_for(self.feed)

      # parser instances should mimic the Feedzirra interface
      parsed = parser.fetch_and_parse(self.feed)

      self.name ||= parsed.title || 'the source'
      self.url ||= parsed.url

      if self.url.nil?
        abort "#{ self.author }'s blog does not have a url field on it's feed, you will need to specify it on planet.yml"
      end

      parsed.entries.each do |entry|
        body = entry.content.nil? ? entry.summary : entry.content
        abort "=> No content found on entry" if body.nil?

        post = Post.new(
          title: entry.title.nil? ? self.name : entry.title.sanitize,
          content: self.sanitize_images(body.strip),
          date: entry.published,
          # Entry links that are already absolute are kept as they are;
          # only relative ones get the blog url in front.
          url: absolutize(entry.url),
          blog: self
        )
        self.posts << post

        puts "=> Found post titled #{ post.title } - by #{ post.blog.author }"
      end
    end

    # Rewrites, in place, every <img> whose src is relative so that it
    # points at this blog's url. Absolute and protocol-relative sources
    # are left untouched. Each tag is rewritten exactly once, so a source
    # that appears several times is not prefixed repeatedly.
    #
    # html - String of markup (modified in place).
    #
    # Returns the same String.
    def sanitize_images(html)
      html.gsub!(IMG_SRC) do
        "#{ Regexp.last_match(1) }#{ absolutize(Regexp.last_match(2)) }"
      end

      html
    end

    private

    # Returns path unchanged when it is already absolute, otherwise the
    # blog url joined to it with exactly one slash. A blank path yields
    # the blog url itself.
    def absolutize(path)
      path = path.to_s
      return path if path =~ ABSOLUTE_URL

      base = self.url.to_s.chomp('/')
      return base if path.empty?

      path.start_with?('/') ? "#{ base }#{ path }" : "#{ base }/#{ path }"
    end
  end
end
64 changes: 64 additions & 0 deletions lib/planet/parsers.rb
@@ -0,0 +1,64 @@
require 'feedzirra'
require 'set'

class Planet
  # Parsers class - manager for the feed parsers
  #
  # parser classes inherit from Planet::Parsers::BaseParser
  # and are added automatically to the list of available parsers.
  # files located on planet/parsers are automatically loaded.
  class Parsers
    @@parsers = Set.new

    # Registers a parser class so later Parsers instances can index it.
    def self.add_parser(parser)
      @@parsers << parser
    end

    # Parser instances keep indexes of the available parsers and
    # check for duplicate definitions (need to use an instance
    # because #inherited gets called as soon as the class is seen
    # but before it is fully defined).
    #
    # Raises RuntimeError when two parsers declare the same type or share
    # a domain.
    def initialize
      @types, @domains = {}, {}

      @@parsers.each do |parser|
        new_type, new_domains = parser.type, parser.domains

        fail("duplicate type") if new_type && @types.key?(new_type)
        fail("overlapping domains") unless (@domains.keys & new_domains).empty?

        @types[new_type] = parser if new_type
        new_domains.each { |new_domain| @domains[new_domain] = parser }
      end
    end

    # returns the appropriate parser based on the type
    #
    # Raises ArgumentError when no parser declares that type.
    def get_parser(type)
      return @types[type] if @types.key?(type)

      raise(ArgumentError, "No parser for type '#{ type }'", caller)
    end

    # returns any parser that can handle this feed's domain,
    # defaults to Feedzirra if none available.
    #
    # A parser domain matches the feed host itself and any of its
    # subdomains, but not an unrelated host that merely ends with the
    # same letters. A feed without a host (for instance a url given
    # without a scheme) gets the default parser.
    def get_parser_for(feed)
      feed_domain = URI(feed).host
      return Feedzirra::Feed if feed_domain.nil? # default generic parser

      feed_domain = feed_domain.downcase
      @domains.each do |domain, parser|
        return parser if domain_match?(feed_domain, domain.to_s.downcase)
      end

      Feedzirra::Feed # default generic parser
    end

    private

    # True when host is domain or one of its subdomains. A domain written
    # with a leading dot (".example.com") is treated as a plain suffix.
    def domain_match?(host, domain)
      return host.end_with?(domain) if domain.start_with?('.')

      host == domain || host.end_with?(".#{ domain }")
    end
  end
end

# load parsers
#
# Every .rb file in the 'parsers' directory next to this file is
# required. Dir.entries is used so no directory handle is left open, and
# the files are loaded in a fixed order: base_parser.rb first (the other
# parsers inherit from it), then the rest alphabetically, instead of
# whatever order the filesystem happens to return.
dirname = File.join(File.dirname(__FILE__), 'parsers')
parser_files = Dir.entries(dirname).select { |filename| filename.end_with?('.rb') }
parser_files.sort_by { |filename| [filename == 'base_parser.rb' ? 0 : 1, filename] }.each do |filename|
  require File.join(dirname, filename)
end
24 changes: 24 additions & 0 deletions lib/planet/parsers/base_parser.rb
@@ -0,0 +1,24 @@
class Planet
  class Parsers
    # base class for feed parsers
    # subclasses should declare @type and @domains
    # and also mimic the Feedzirra interface.
    class BaseParser
      # The value of the class-level @type set by the subclass,
      # or nil when none was declared.
      def self.type
        @type
      end

      # The value of the class-level @domains set by the subclass,
      # or an empty Array when none was declared.
      def self.domains
        @domains || []
      end

      # Hook run by Ruby whenever a class inherits from this one:
      # registers the new parser class with the Parsers manager.
      def self.inherited(parser)
        super
        Parsers.add_parser parser
      end

      # Placeholder that subclasses must override.
      #
      # Raises NotImplementedError (still an Exception, so existing
      # `rescue Exception` handlers keep working).
      def self.fetch_and_parse(feed)
        raise(NotImplementedError, "Not implemented", caller)
      end
    end
  end
end

0 comments on commit 45d74df

Please sign in to comment.