Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #21 from git2samus/parser
modular architecture for feed parsers by samus
- Loading branch information
Showing
7 changed files
with
242 additions
and
146 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,79 @@ | |||
require 'planet/post' | |||
require 'planet/parsers' | |||
|
|||
class Planet
  # One blog aggregated by the planet.
  #
  # Holds the blog's metadata (url, feed, author, ...) and the list of posts
  # collected from its feed by #fetch.
  class Blog
    # Opening of an <img> tag whose first attribute is src; captures the src value.
    IMG_SRC = /<img src="([^"]+)"/

    # src values that must be left alone: anything carrying a URI scheme
    # (http:, https:, data:, ...) or a protocol-relative reference (//host/...).
    ABSOLUTE_SRC = %r{\A(?:[a-z][a-z0-9+.\-]*:|//)}i

    attr_accessor :url, :feed, :type, :name, :author, :image, :twitter, :posts, :planet

    # attributes - Hash with any of the keys :url, :feed, :type, :name,
    #              :author, :image, :twitter, :posts and :planet.
    #              :posts defaults to an empty Array, everything else to nil.
    def initialize(attributes = {})
      self.url = attributes[:url]
      self.feed = attributes[:feed]
      self.type = attributes[:type]
      self.name = attributes[:name]
      self.author = attributes[:author]
      self.image = attributes[:image]
      self.twitter = attributes[:twitter]
      self.posts = attributes.fetch(:posts, [])
      self.planet = attributes[:planet]

      # get parser-manager instance
      @parsers = Parsers.new
    end

    # Fetches and parses the feed, then appends one Post per feed entry to
    # #posts. #name and #url are taken from the feed when they were not given.
    #
    # Terminates the process (Kernel#abort) when no url is available or when
    # an entry has neither content nor summary.
    def fetch
      # given parser can be set arbitrarily with :type or inferred from the domain
      parser = self.type ? @parsers.get_parser(self.type) : @parsers.get_parser_for(self.feed)

      # parser instances should mimic Feedzirra interface
      feed = parser.fetch_and_parse(self.feed)

      self.name ||= feed.title || 'the source'
      self.url ||= feed.url

      if self.url.nil?
        abort "#{ self.author }'s blog does not have a url field on it's feed, you will need to specify it on planet.yml"
      end

      feed.entries.each do |entry|
        # prefer the full content, fall back to the summary
        raw = entry.content.nil? ? entry.summary : entry.content
        abort "=> No content found on entry" if raw.nil?

        content = self.sanitize_images(raw.strip)

        # String#sanitize is not core Ruby; presumably supplied by the feed
        # library -- TODO confirm.
        title = entry.title.nil? ? self.name : entry.title.sanitize

        # NOTE(review): this concatenation assumes entry.url is a path relative
        # to the blog url; an absolute entry.url would produce a doubled URL.
        # Confirm against what the parsers actually return.
        post = Post.new(
          title: title,
          content: content,
          date: entry.published,
          url: self.url + entry.url,
          blog: self
        )
        self.posts << post

        puts "=> Found post titled #{ post.title } - by #{ post.blog.author }"
      end
    end

    # Rewrites relative image sources so they point at the original blog.
    #
    # Only the src attribute of `<img src="...">` tags is touched; any other
    # occurrence of the same path in the document is left as it is. Sources
    # that already carry a scheme or are protocol-relative are not modified.
    #
    # html - String with the entry markup; it is modified in place.
    #
    # Returns the same String instance.
    def sanitize_images(html)
      html.gsub!(IMG_SRC) do |tag|
        # grab the capture before running another match, which resets it
        src = Regexp.last_match(1)
        next tag if src =~ ABSOLUTE_SRC

        separator = src.start_with?('/') ? '' : '/'
        %(<img src="#{ self.url }#{ separator }#{ src }")
      end

      # gsub! returns nil when nothing matched, so return the string explicitly
      html
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,64 @@ | |||
require 'feedzirra' | |||
require 'set' | |||
|
|||
class Planet
  # Parsers class - manager for the feed parsers
  #
  # parser classes inherit from Planet::Parsers::BaseParser
  # and are added automatically to the list of available parsers.
  # files located on planet/parsers are automatically loaded.
  class Parsers
    @@parsers = Set.new

    # Registers a parser class so that later Parsers instances index it.
    def self.add_parser(parser)
      @@parsers << parser
    end

    # Parser instances keep indexes of the available parsers and
    # check for duplicate definitions (need to use an instance
    # because #inherited gets called as soon as the class is seen
    # but before it is fully defined).
    #
    # Raises RuntimeError when two parsers declare the same type or
    # share a domain.
    def initialize
      @types, @domains = {}, {}

      @@parsers.each do |parser|
        new_type, new_domains = parser.type, parser.domains

        fail("duplicate type") if new_type && @types.key?(new_type)
        fail("overlapping domains") if new_domains.any? { |domain| @domains.key?(domain) }

        @types[new_type] = parser if new_type
        new_domains.each do |new_domain|
          @domains[new_domain] = parser
        end
      end
    end

    # returns the appropriate parser based on the type
    #
    # Raises ArgumentError when no parser declares that type.
    def get_parser(type)
      return @types[type] if @types.key?(type)

      raise(ArgumentError, "No parser for type '#{ type }'", caller)
    end

    # returns any parser that can handle this feed's domain,
    # defaults to Feedzirra if none available.
    #
    # A parser domain matches the feed host itself and any of its
    # subdomains; the comparison ignores case. Feeds without a host
    # (e.g. a bare path) get the default parser.
    def get_parser_for(feed)
      feed_host = URI(feed).host
      return Feedzirra::Feed if feed_host.nil? # nothing to match against

      feed_host = feed_host.downcase
      @domains.each do |domain, parser|
        return parser if host_in_domain?(feed_host, domain)
      end

      Feedzirra::Feed # default generic parser
    end

    private

    # true when host is the domain itself or one of its subdomains.
    # Matching on whole labels keeps "notexample.com" from being
    # claimed by a parser registered for "example.com".
    def host_in_domain?(host, domain)
      suffix = domain.to_s.downcase.sub(/\A\./, '')
      host == suffix || host.end_with?(".#{ suffix }")
    end
  end
end
|
|||
# load parsers
# Requires every .rb file found directly inside the "parsers" directory that
# sits next to this file. Dir.entries reads the listing without leaving a
# directory handle open, and sorting makes the load order independent of the
# filesystem. The path is expanded so the require never depends on the
# current working directory.
parsers_dir = File.expand_path('parsers', File.dirname(__FILE__))
Dir.entries(parsers_dir).sort.each do |filename|
  require File.join(parsers_dir, filename) if filename.end_with?('.rb')
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,24 @@ | |||
class Planet
  class Parsers
    # base class for feed parsers
    # subclasses should declare @type and @domains
    # and also mimic Feedzirra interface.
    class BaseParser
      # Identifier declared by the subclass through @type, nil when none was set.
      def self.type
        @type
      end

      # Domains declared by the subclass through @domains, an empty Array
      # when none were set.
      def self.domains
        @domains || []
      end

      # Hook run when a subclass is defined: registers it with the manager.
      def self.inherited(parser)
        super # keep any other inherited hooks in the chain working
        Parsers.add_parser(parser)
      end

      # Placeholder that concrete parsers must override.
      #
      # Raises NotImplementedError, which is still an Exception, so callers
      # rescuing Exception keep working.
      def self.fetch_and_parse(feed)
        raise(NotImplementedError, "Not implemented", caller)
      end
    end
  end
end
Oops, something went wrong.