Merge pull request #21 from git2samus/parser

modular architecture for feed parsers by samus
pote · May 21, 2012 · 45d74df · 45d74df
2 parents 85db784 + f84c033
commit 45d74df
Show file tree

Hide file tree

Showing 7 changed files with 242 additions and 146 deletions.
diff --git a/bin/planet b/bin/planet
@@ -50,19 +50,10 @@ command :generate do |c|
   c.action do |global_options,options,args|
     conf = YAML.load_file('planet.yml')
 
-    @planet = Planet.new(config: conf.fetch('planet', {}))
+    @planet = Planet.new(
-
+      config: conf.fetch('planet', {}),
-    conf['blogs'].each do |blog|
+      blogs:  conf.fetch('blogs',  [])
-      @planet.blogs << Planet::Blog.new(
+    )
-        feed: blog['feed'],
-        url: blog['url'],
-        author: blog['author'],
-        image: blog['image'],
-        posts: [],
-        planet: @planet,
-        twitter: blog['twitter']
-      )
-    end
 
     @planet.aggregate
 

diff --git a/lib/planet.rb b/lib/planet.rb
@@ -1,14 +1,23 @@
-require 'feedzirra'
-require 'mustache'
 require 'planet/version'
+require 'planet/blog'
 
 class Planet
 
   attr_accessor :config, :blogs
 
   def initialize(attributes = {})
+    # attributes[:config] is stored as-is; attributes[:blogs] is the list of
+    # blog definitions (hashes with string keys, as loaded from planet.yml)
+    # and defaults to an empty list. Each definition becomes a Blog that
+    # points back to this planet.
     self.config = attributes[:config]
-    self.blogs = attributes.fetch(:blogs, [])
+    self.blogs  = attributes.fetch(:blogs, []).map do |blog|
+      Blog.new(
+        feed:    blog['feed'],
+        url:     blog['url'],
+        # optional parser type; when absent (nil) Blog#fetch infers the
+        # parser from the feed's domain instead
+        type:    blog['type'],
+        author:  blog['author'],
+        image:   blog['image'],
+        posts:   [],
+        planet:  self,
+        twitter: blog['twitter']
+      )
+    end
   end
 
   def posts
@@ -33,129 +42,4 @@ def write_posts
       File.open(file_name + '.markdown', "w+") { |f| f.write(post.to_s) }
     end
   end
-
-  class Post
-
-    attr_accessor :title, :content, :date, :url, :blog
-
-    def initialize(attributes = {})
-      self.title = attributes[:title]
-      self.content = attributes[:content]
-      self.date = attributes[:date]
-      self.url = attributes[:url]
-      self.blog = attributes[:blog]
-    end
-
-    def to_s
-      "#{ header }#{ content }#{ footer }"
-    end
-
-    def to_hash
-      {
-        post_content: self.content,
-        post_title: self.title,
-        post_date: self.date,
-        image_url: self.blog.image,
-        author: self.blog.author,
-        blog_url: self.blog.url,
-        blog_name: self.blog.name,
-        post_url: self.url,
-        twitter: self.blog.twitter,
-        twitter_url: "http://twitter.com/#{ self.blog.twitter }"
-      }
-    end
-
-    def header
-      ## TODO: We need categories/tags
-      file = self.blog.planet.config.fetch('templates_directory', '_layouts/') + 'header.md'
-      file_contents = File.read(file)
-
-      Mustache.render(file_contents, self.to_hash)
-    end
-
-    def footer
-      file = self.blog.planet.config.fetch('templates_directory', '_layouts/') + 'author.html'
-      file_contents = File.read(file)
-
-      Mustache.render(file_contents, self.to_hash)
-    end
-
-    def file_name
-      name_date = date ? date.strftime('%Y-%m-%d') : nil
-      name_title = title.downcase.scan(/\w+/).join('-')
-
-      [name_date, name_title].join('-')
-    end
-
-  end
-
-  class Blog
-
-    attr_accessor :url, :feed, :name, :author, :image, :twitter, :posts, :planet
-
-    def initialize(attributes = {})
-      self.url = attributes[:url]
-      self.feed = attributes[:feed]
-      self.name = attributes[:name]
-      self.author = attributes[:author]
-      self.image = attributes[:image]
-      self.twitter = attributes[:twitter]
-      self.posts = attributes.fetch(:posts, [])
-      self.planet = attributes[:planet]
-    end
-
-    def fetch
-      feed = Feedzirra::Feed.fetch_and_parse(self.feed)
-
-      self.name ||= feed.title || 'the source'
-      self.url ||= feed.url
-
-      if self.url.nil?
-        abort "#{ self.author }'s blog does not have a url field on it's feed, you will need to specify it on planet.yml"
-      end
-
-      feed.entries.each do |entry|
-        ## TODO: I should probably consider using feed 'adapters' for specific
-        ## blog engine feeds that don't have their stuff on the standard fields.
-        ## Example: blogspot has the content on "summary" instead of content ¬¬.
-        content = if !entry.content.nil?
-                    self.sanitize_images(entry.content.strip)
-                  elsif !entry.summary.nil?
-                    self.sanitize_images(entry.summary.strip)
-                  else
-                    abort "=> No content found on entry"
-                  end
-
-        title = if !entry.title.nil?
-                  entry.title.sanitize
-                else
-                  self.name
-                end
-
-        self.posts << @post = Post.new(
-          title: title,
-          content: content,
-          date: entry.published,
-          url: self.url + entry.url,
-          blog: self
-        )
-
-        puts "=> Found post titled #{ @post.title } - by #{ @post.blog.author }"
-      end
-    end
-
-    def sanitize_images(html)
-      ## We take all images with src not matching http refs and append
-      ## the original blog to them.
-      html.scan(/<img src="([^h"]+)"/).flatten.each do |img|
-        if img[0] == '/'
-          html.gsub!(img, "#{ self.url }#{ img }")
-        else
-          html.gsub!(img, "#{ self.url }/#{ img }")
-        end
-      end
-
-      html
-    end
-  end
 end
diff --git a/lib/planet/blog.rb b/lib/planet/blog.rb
@@ -0,0 +1,79 @@
+require 'planet/post'
+require 'planet/parsers'
+
+class Planet
+  # A single blog aggregated by the planet: holds the blog's metadata and
+  # the posts built from its feed.
+  class Blog
+
+    attr_accessor :url, :feed, :type, :name, :author, :image, :twitter, :posts, :planet
+
+    # attributes - Hash with optional Symbol keys :url, :feed, :type, :name,
+    #              :author, :image, :twitter, :posts (defaults to []) and
+    #              :planet.
+    def initialize(attributes = {})
+      self.url = attributes[:url]
+      self.feed = attributes[:feed]
+      self.type = attributes[:type]
+      self.name = attributes[:name]
+      self.author = attributes[:author]
+      self.image = attributes[:image]
+      self.twitter = attributes[:twitter]
+      self.posts = attributes.fetch(:posts, [])
+      self.planet = attributes[:planet]
+
+      # get parser-manager instance
+      @parsers = Parsers.new
+    end
+
+    # Fetches the feed, fills in #name and #url when they were not
+    # configured, and appends one Post per feed entry to #posts.
+    #
+    # Aborts the process when no url can be determined or when an entry has
+    # neither content nor summary.
+    def fetch
+      # the parser can be set explicitly with :type or inferred from the feed's domain
+      parser = self.type ? @parsers.get_parser(self.type) : @parsers.get_parser_for(self.feed)
+
+      # parser classes should mimic the Feedzirra interface
+      # NOTE(review): presumably fetch_and_parse can return a non-feed value
+      # on failure, which would blow up on feed.title below - confirm.
+      feed = parser.fetch_and_parse(self.feed)
+
+      self.name ||= feed.title || 'the source'
+      self.url ||= feed.url
+
+      if self.url.nil?
+        abort "#{ self.author }'s blog does not have a url field on it's feed, you will need to specify it on planet.yml"
+      end
+
+      feed.entries.each do |entry|
+        # prefer the entry's content and fall back to its summary
+        content = if !entry.content.nil?
+                    self.sanitize_images(entry.content.strip)
+                  elsif !entry.summary.nil?
+                    self.sanitize_images(entry.summary.strip)
+                  else
+                    abort "=> No content found on entry"
+                  end
+
+        # untitled entries take the blog's name as their title
+        title = if !entry.title.nil?
+                  entry.title.sanitize
+                else
+                  self.name
+                end
+
+        # NOTE(review): the entry url is appended to the blog url, which
+        # presumes entries expose relative links - confirm for feeds that
+        # publish absolute entry links.
+        post = Post.new(
+          title: title,
+          content: content,
+          date: entry.published,
+          url: self.url + entry.url,
+          blog: self
+        )
+        self.posts << post
+
+        puts "=> Found post titled #{ post.title } - by #{ post.blog.author }"
+      end
+    end
+
+    # Prefixes the blog url to every <img src="..."> that is not an
+    # http(s) reference, so relative images keep working once the post is
+    # published elsewhere.
+    #
+    # html - String with the post markup; it is modified in place.
+    #
+    # Returns the same String.
+    def sanitize_images(html)
+      # Rewrite each src attribute where it appears. Replacing the bare path
+      # everywhere in the document would also touch unrelated text and
+      # already-prefixed urls, and excluding every path that merely contains
+      # an "h" would leave paths such as "/photos/a.png" untouched.
+      html.gsub!(/<img src="(?!https?:\/\/)([^"]+)"/) do
+        src = Regexp.last_match(1)
+        separator = src.start_with?('/') ? '' : '/'
+
+        "<img src=\"#{ self.url }#{ separator }#{ src }\""
+      end
+
+      html
+    end
+  end
+end
diff --git a/lib/planet/parsers.rb b/lib/planet/parsers.rb
@@ -0,0 +1,64 @@
+require 'feedzirra'
+require 'set'
+
+class Planet
+  # Parsers class - manager for the feed parsers
+  #
+  # parser classes inherit from Planet::Parsers::BaseParser
+  # and are added automatically to the list of available parsers.
+  # files located on planet/parsers are automatically loaded.
+  class Parsers
+    # every registered parser class, shared by all Parsers instances
+    @@parsers = Set.new
+
+    # Registers a parser class. The registry is a Set, so adding the same
+    # class more than once has no further effect.
+    def self.add_parser(parser)
+      @@parsers << parser
+    end
+
+    # Parser instances keep indexes of the available parsers and
+    # check for duplicate definitions (need to use an instance
+    # because #inherited gets called as soon as the class is seen
+    # but before it is fully defined).
+    #
+    # Raises RuntimeError when two parsers declare the same type or
+    # share a domain.
+    def initialize
+      @types, @domains = {}, {}
+
+      @@parsers.each do |parser|
+        new_type, new_domains = parser.type, parser.domains
+
+        fail("duplicate type") if new_type && @types.key?(new_type)
+        fail("overlapping domains") unless (@domains.keys & new_domains).empty?
+
+        @types[new_type] = parser if new_type
+        new_domains.each do |new_domain|
+          @domains[new_domain] = parser
+        end
+      end
+    end
+
+    # returns the appropriate parser based on the type
+    #
+    # Raises ArgumentError when no parser declares that type.
+    def get_parser(type)
+      @types.fetch(type)
+    rescue KeyError
+      raise(ArgumentError, "No parser for type '#{ type }'", caller)
+    end
+
+    # returns any parser that can handle this feed's domain,
+    # defaults to Feedzirra if none available.
+    #
+    # A parser domain matches the feed's host itself and any of its
+    # subdomains ("example.com" handles "blog.example.com" but not
+    # "notexample.com"). Feeds whose URI has no host use the default.
+    def get_parser_for(feed)
+      feed_domain = URI(feed).host
+      return Feedzirra::Feed if feed_domain.nil? # host-less feed, nothing to match
+
+      feed_domain = feed_domain.downcase # host names are case-insensitive
+
+      @domains.each do |domain, parser|
+        # tolerate domains declared with a leading dot (".example.com")
+        suffix = domain.downcase.sub(/\A\./, '')
+
+        return parser if feed_domain == suffix || feed_domain.end_with?(".#{ suffix }")
+      end
+
+      Feedzirra::Feed # default generic parser
+    end
+  end
+end
+
+# load parsers
+#
+# base_parser.rb is required first because parser classes inherit from
+# BaseParser; the remaining files are required in sorted order because the
+# order of a directory listing is not guaranteed. Requiring base_parser.rb
+# again inside the loop is a no-op.
+dirname = File.expand_path('parsers', File.dirname(__FILE__))
+require File.join(dirname, 'base_parser.rb')
+Dir.entries(dirname).sort.each do |filename|
+  require File.join(dirname, filename) if filename.end_with? '.rb'
+end
diff --git a/lib/planet/parsers/base_parser.rb b/lib/planet/parsers/base_parser.rb
@@ -0,0 +1,24 @@
+class Planet
+  class Parsers
+    # Base class for feed parsers.
+    #
+    # Subclasses should declare the class-level instance variables @type
+    # and @domains and also mimic the Feedzirra interface
+    # (a fetch_and_parse class method). Defining a subclass registers it
+    # with Parsers.
+    class BaseParser
+      # Returns the type declared by the parser, or nil when none was set.
+      def self.type
+        @type
+      end
+
+      # Returns the domains declared by the parser, or [] when none were set.
+      def self.domains
+        @domains || []
+      end
+
+      # Hook invoked by Ruby whenever a subclass is defined; adds the new
+      # parser class to the Parsers registry.
+      def self.inherited(parser)
+        super # keep any other inherited hooks in the chain working
+        Parsers.add_parser parser
+      end
+
+      # Placeholder that subclasses are expected to override.
+      #
+      # Raises NotImplementedError (still an Exception, so existing
+      # `rescue Exception` handlers keep catching it).
+      def self.fetch_and_parse(feed)
+        raise(NotImplementedError, "Not implemented", caller)
+      end
+    end
+  end
+end