merge in 0.3.1, set up fancy index/instances methods
jeremybmerrill committed Feb 16, 2014
2 parents 1074e2c + 24cb65e commit a25e84e
Showing 8 changed files with 187 additions and 65 deletions.
15 changes: 12 additions & 3 deletions README.md
@@ -8,18 +8,27 @@ Documentation
With Upton, you can scrape complex sites to a CSV in just a few lines of code:

```ruby
scraper = Upton::Scraper.new("http://www.propublica.org", "section#river h1 a")
scraper = Upton::Scraper.index("http://www.propublica.org", "section#river h1 a")
scraper.scrape_to_csv "output.csv" do |html|
Nokogiri::HTML(html).search("#comments h2.title-link").map &:text
end
```

Just specify a URL to a list of links -- or simply a list of links --, an XPath expression or CSS selector for the links and a block of what to do with the content of the pages you've scraped. Upton comes with some pre-written blocks (Procs, technically) for scraping simple lists and tables, like the `list` function above.
Just specify a URL to a list of links (an "index"), an XPath expression or CSS selector for the links, and a block describing what to do with the content of the pages you've scraped. Upton comes with some pre-written blocks (Procs, technically) for scraping simple lists and tables, like the `list` function above.
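
For instance, a minimal sketch using one of those pre-written blocks, assuming `Upton::Utils.list` takes a CSS selector and returns such a Proc:

```ruby
# Hypothetical example: use the pre-written list Proc instead of a
# hand-rolled Nokogiri block to pull comment titles from each instance page.
scraper = Upton::Scraper.index("http://www.propublica.org", "section#river h1 a")
scraper.scrape_to_csv("output.csv", &Upton::Utils.list("#comments h2.title-link"))
```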

Upton operates on the theory that, for most scraping projects, you need to scrape two types of pages:

1. Instance pages, which are the goal of your scraping, e.g. job listings or news articles.
1. Index pages, which list instance pages. For example, a job search site's search page or a newspaper's homepage.
2. Index pages, which list instance pages. For example, a job search site's search page or a newspaper's homepage.

You can also directly specify the list of instance page URLs to scrape:

```ruby
scraper = Upton::Scraper.instances(["http://www.propublica.org/article_1.html", "http://www.propublica.org/article_2.html"])
scraper.scrape_to_csv "output.csv" do |html|
Nokogiri::HTML(html).search("#comments h2.title-link").map &:text
end
```

For more complex use cases, subclass `Upton::Scraper` and override the relevant methods. If you're scraping links from an API, you would override `get_index`; if you need to log in before scraping a site or do something special with the scraped instance page, you would override `get_instance`.
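
For example, a minimal sketch of a login-before-scraping subclass might look like this (the class name, cookie, and environment variable are illustrative, not part of Upton):

```ruby
require "upton"
require "restclient"

class AuthedScraper < Upton::Scraper
  # Fetch each instance page with an authentication cookie before it is
  # handed to your scraping block. SESSION_COOKIE is a hypothetical
  # environment variable holding a logged-in session token.
  def get_instance(url, pagination_index=0, options={})
    RestClient.get(url, cookies: { session: ENV["SESSION_COOKIE"] })
  end
end
```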

5 changes: 4 additions & 1 deletion lib/upton.rb
@@ -3,6 +3,8 @@
require_relative 'upton/scraper'
require_relative 'upton/utils'
require_relative 'upton/version'
require_relative 'upton/downloader'
require_relative 'upton/scraper'

##
# This module contains a scraper called Upton
@@ -20,5 +22,6 @@ module Upton
# site's search page or a newspaper's homepage.
# 2. Instance pages, which represent the goal of your scraping, e.g.
# job listings or news articles.
#
##

end
22 changes: 21 additions & 1 deletion lib/upton/downloader.rb
@@ -2,6 +2,7 @@
require "open-uri"
require "tmpdir"
require "restclient"
require_relative "./version"

module Upton

@@ -88,11 +89,30 @@ def download_from_cache!
puts "Writing #{uri} data to the cache"
end
end
open(cached_file, 'w'){|f| f << resp}
commented_resp = add_comment(resp)
open(cached_file, 'w'){|f| f << commented_resp}
end
{:resp => resp, :from_resource => from_resource }
end

def add_comment(resp)
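# Records provenance in the stashed copy: parses the response and inserts an
# HTML comment (Upton version, source URI, retrieval time) before the root
# element's first child; responses that aren't parseable HTML (no root) are
# returned unchanged.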
# n = Nokogiri::HTML("<html></html>")
# c = Nokogiri::XML::Comment.new(n, "asdfasdf")
# n.root.add_child(c)
# <!----Retrieved by Upton from http://www.somesite.com on January 15 at 4:28 p.m.-->
msg = "Stashed file retrieved by Upton #{Upton::VERSION} from #{@uri} at #{Time.now}"
resp_html = Nokogiri::HTML(resp)
comment = Nokogiri::XML::Comment.new(resp_html, msg)
if resp_html.root.nil?
return resp
elsif resp_html.root.children.empty?
resp_html.root.add_child(comment)
else
resp_html.root.children.before(comment)
end
resp_html.to_html
end

def cache_enabled?
!!@cache
end
122 changes: 79 additions & 43 deletions lib/upton/scraper.rb
@@ -15,8 +15,9 @@ module Upton
class Scraper
EMPTY_STRING = ''

attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
:paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames
attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests,
:stash_folder, :paginated, :pagination_param, :pagination_max_pages,
:pagination_start_index, :readable_filenames, :pagination_interval

##
# This is the main user-facing method for a basic scraper.
@@ -25,8 +26,8 @@ class Scraper
# in the list of instance URLs returned by +get_index+).
##
def scrape(&blk)
self.url_array = self.get_index unless self.url_array
self.scrape_from_list(self.url_array, blk)
get_indexes!
self.scrape_from_list(@instance_urls, blk)
end

##
@@ -41,20 +42,7 @@ def scrape(&blk)
# If you don't specify a selector, the first argument will be treated as a
# list of URLs.
##
def initialize(index_url_or_array, selector="")

#if first arg is a valid URL, do already-written stuff;
#if it's not (or if it's a list?) don't bother with get_index, etc.
#e.g. Scraper.new(["http://jeremybmerrill.com"])

#TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
if index_url_or_array.respond_to? :each_with_index
@url_array = index_url_or_array
else
@index_url = index_url_or_array
@index_selector = selector
end

def initialize(options={})
# If true, then Upton prints information about when it gets
# files from the internet and when it gets them from its stash.
@verbose = false
@@ -82,12 +70,57 @@ def initialize(index_url_or_array, selector="")
@pagination_max_pages = 2
# Default starting number for pagination (second page is this plus 1).
@pagination_start_index = 1
# Default value to increment page number by
@pagination_interval = 1

# Folder name for stashes, if you want them to be stored somewhere else,
# e.g. under /tmp.
if @stash_folder
FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
end

@indexes = []
@instance_urls = []
end

def index(index_url, selector)
# for future:
@indexes ||= []
@indexes << [index_url, selector]
# and actually go scrape the index page, populate @instances
self
end

def self.index(index_url, selector, options={})
scraper = self.new
scraper.index(index_url, selector)
scraper
end

def self.instances(instances, options={})
s = self.new
s.instance_variable_set(:@instance_urls, instances)
s
end

# does
# def add_instances(urls)
# #for future:
# # @instances += urls
# # @instances.uniq!
# @instance_urls ||= []
# @instance_urls += urls
# @instance_urls.uniq!
# end

def instances(urls=nil)
if urls.nil?
@instance_urls
else
@instance_urls ||= []
@instance_urls += urls
self
end
end
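
# Illustrative usage (not part of this commit): the index and instances
# builders can be combined on one scraper before calling +scrape+. The URL,
# selector, and block here are hypothetical.
#
#   scraper = Upton::Scraper.index("http://example.com/", "a.story")
#   scraper.instances(["http://example.com/one-off-article.html"])
#   scraper.scrape { |html| Nokogiri::HTML(html).search("h1").map(&:text) }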

##
Expand Down Expand Up @@ -145,33 +178,33 @@ def next_index_page_url(url, pagination_index)
##
def scrape_to_csv filename, &blk
require 'csv'
self.url_array = self.get_index unless self.url_array
@instance_urls = self.get_index unless @instance_urls
CSV.open filename, 'wb' do |csv|
#this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
self.scrape_from_list(self.url_array, blk).compact.each do |document|
self.scrape_from_list(@instance_urls, blk).compact.each do |document|
if document[0].respond_to? :map
document.each{|row| csv << row }
else
csv << document
end
end
#self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
#self.scrape_from_list(@instance_urls, blk).compact.each{|document| csv << document }
end
end

def scrape_to_tsv filename, &blk
require 'csv'
self.url_array = self.get_index unless self.url_array
@instance_urls = self.get_index unless @instance_urls
CSV.open filename, 'wb', :col_sep => "\t" do |csv|
#this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
self.scrape_from_list(self.url_array, blk).compact.each do |document|
self.scrape_from_list(@instance_urls, blk).compact.each do |document|
if document[0].respond_to? :map
document.each{|row| csv << row }
else
csv << document
end
end
#self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
#self.scrape_from_list(@instance_urls, blk).compact.each{|document| csv << document }
end
end

@@ -215,6 +248,8 @@ def resolve_url(href_str, absolute_url_str)
absolute_url = URI(absolute_url_str).dup
rescue URI::InvalidURIError
raise ArgumentError, "#{absolute_url_str} must represent a valid relative or absolute URI"
rescue ArgumentError
raise ArgumentError, "#{absolute_url_str} must represent a valid relative or absolute URI"
end
end
raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
@@ -235,45 +270,33 @@ def resolve_url(href_str, absolute_url_str)
URI.join(absolute_url.to_s, href.to_s).to_s
end

##
# Return a list of URLs for the instances you want to scrape.
# This can optionally be overridden if, for example, the list of instances
# comes from an API.
##
def get_index
index_pages = get_index_pages(@index_url, @pagination_start_index).map{|page| parse_index(page, @index_selector) }.flatten
end

# TODO: Not sure the best way to handle this
# Currently, #parse_index is called upon #get_index_pages,
# which itself is dependent on @index_url
# Does @index_url stay unaltered for the lifetime of the Upton instance?
# It seems to at this point, but that may be something that gets
# deprecated later
#
# So for now, @index_url is used in conjunction with resolve_url
# So for now, index_url is used in conjunction with resolve_url
# to make sure that this method returns absolute urls
# i.e. this method expects @index_url to always have an absolute address
# for the lifetime of an Upton instance
def parse_index(text, selector)
Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
href = a_element["href"]
resolved_url = resolve_url( href, @index_url) unless href.nil?
def parse_index(text, selector, index_url)
Nokogiri::HTML(text).search(selector).to_a.map do |anchor|
href = anchor["href"]
resolved_url = resolve_url( href, index_url) unless href.nil?
puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
resolved_url
end
end


##
# Returns the concatenated output of each member of a paginated index,
# e.g. a site listing links with 2+ pages.
##
def get_index_pages(url, pagination_index, options={})
def get_index_pages(url, pagination_index, pagination_interval, options={})
resps = [self.get_page(url, @index_debug, options)]
prev_url = url
while !resps.last.empty?
pagination_index += 1
pagination_index += pagination_interval
next_url = self.next_index_page_url(url, pagination_index)
next_url = resolve_url(next_url, url)
break if next_url == prev_url || next_url.empty?
@@ -308,6 +331,19 @@ def get_instance(url, pagination_index=0, options={})
resps
end

##
# Return a list of URLs for the instances you want to scrape.
# This can optionally be overridden if, for example, the list of instances
# comes from an API.
##
def get_indexes!
@indexes.each do |index_url, index_selector|
#TODO: cope with pagination stuff per URL
@instance_urls += get_index_pages(index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, index_selector, index_url) }.flatten
end
end
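
# Illustrative override (not part of this commit) for the API case the doc
# comment above mentions: a subclass could populate @instance_urls directly
# instead of scraping index pages. The endpoint and JSON shape are
# hypothetical; assumes the json and rest-client libraries are loaded.
#
#   class ApiScraper < Upton::Scraper
#     def get_indexes!
#       listing = JSON.parse(RestClient.get("http://api.example.com/articles.json"))
#       @instance_urls += listing.map { |article| article["url"] }
#     end
#   end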


# Just a helper for +scrape+.
def scrape_from_list(list, blk)
puts "Scraping #{list.size} instances" if @verbose
2 changes: 1 addition & 1 deletion spec/unit/pagination_spec.rb
@@ -7,7 +7,7 @@
describe '#next_index_page_url' do

let(:page_url){ 'http://www.propublica.org/search.php?q=test' }
let(:u){ Upton::Scraper.new(@page_url, "a") }
let(:u){ Upton::Scraper.index(@page_url, "a") }

it "should return an empty string by default" do
expect(u.next_index_page_url(page_url, 1)).to be_empty
11 changes: 6 additions & 5 deletions spec/unit/parsing_spec.rb
@@ -4,7 +4,8 @@
context 'parsing' do

before(:all) do
@scraper = Upton::Scraper.new 'http://an.absolute.url.com/', ''
@index_url = 'http://an.absolute.url.com/'
@scraper = Upton::Scraper.index @index_url, ''
@html = %q{
<!doctype html><html lang="en"><head><meta charset="UTF-8"><title>Document</title></head>
<body>
@@ -17,23 +18,23 @@

describe '#parse_index' do
it 'should return an array' do
expect(@scraper.send :parse_index, @html, 'h1#not-actually-existing-element').to be_an Array
expect(@scraper.send :parse_index, @html, 'h1#not-actually-existing-element', @index_url).to be_an Array
end

it 'should use Nokogiri to find element(s) within @html::selector' do
arr = @scraper.send :parse_index, @html, 'h1.item a'
arr = @scraper.send :parse_index, @html, 'h1.item a', @index_url
expect(arr.count).to eq 2
end

it 'should return hrefs in found elements' do
href = @scraper.send( :parse_index, @html, 'h1.item a').first
href = @scraper.send( :parse_index, @html, 'h1.item a', @index_url).first
expect(href).to eq 'http://example.com/thing/1'
end

it 'should currently not allow user to specify actual attribute' do
# in cases where links are not inside href attribute, Upton doesn't allow user
# to specify the attribute
data_href = @scraper.send( :parse_index, @html, 'h2.bad-item a').first
data_href = @scraper.send( :parse_index, @html, 'h2.bad-item a', @index_url).first
expect(data_href).to be_nil
end
end
2 changes: 1 addition & 1 deletion spec/unit/resolve_url_spec.rb
@@ -6,7 +6,7 @@

before(:each) do
@page_url = 'http://www.propublica.org/'
@u = Upton::Scraper.new(@page_url, "a")
@u = Upton::Scraper.index(@page_url, "a")
end

context 'arguments' do
