Added the ability to scrape, click_link, then scrape again. (Only for the firefox agent)
scrubber · Apr 6, 2009 · 2a82536 · 2a82536
1 parent 17de69f
commit 2a82536
Show file tree

Hide file tree

Showing 8 changed files with 94 additions and 5 deletions.
diff --git a/examples/misc/on_click_next/next_page_link.rb b/examples/misc/on_click_next/next_page_link.rb
@@ -0,0 +1,42 @@
+$:.unshift File.join(File.dirname(__FILE__), '../../../lib')
+
+require 'scrubyt'
+require 'scrubyt/output/result_dumper'
+
+# Example: using the next_page command when the 'Next' link has href="#" and navigates via an onclick handler.
+
+### Doesn't work, as next_page doesn't click links
+data = Scrubyt::Extractor.define(:agent => :firefox) do
+  fetch("file://"+File.expand_path(File.join(File.dirname(__FILE__), "page_1.html")))
+
+  entry '//div'
+
+  next_page 'Next'
+end
+
+puts "First results :"
+puts data.to_xml
+
+puts "---------------"
+
+### Doesn't work, all the results are the same :(
+
+data = Scrubyt::Extractor.define(:agent => :firefox) do
+  fetch("file://"+File.expand_path(File.join(File.dirname(__FILE__), "page_1.html")))
+
+  while(true)
+    entry '//div'
+
+    begin
+      click_link 'Next'
+    rescue Watir::Exception::UnknownObjectException
+      puts "Reached the end.  Breaking"
+      break
+    end
+  end
+end
+puts "Second results:"
+puts data.to_xml
+
+
+
diff --git a/examples/misc/on_click_next/page_1.html b/examples/misc/on_click_next/page_1.html
@@ -0,0 +1,10 @@
+<html>
+  <body>
+    <div>1</div>
+    <div>2</div>
+    <div>3</div>
+    <form id='nextPage' action='page_2.html'>
+      <a onclick='document.getElementById("nextPage").submit(); return false' href="#">Next</a>
+    </form>
+  </body>
+</html>
diff --git a/examples/misc/on_click_next/page_2.html b/examples/misc/on_click_next/page_2.html
@@ -0,0 +1,10 @@
+<html>
+  <body>
+    <div>4</div>
+    <div>5</div>
+    <div>6</div>
+    <form id='nextPage' action='page_3.html'>
+      <a onclick='document.getElementById("nextPage").submit(); return false' href="#">Next</a>
+    </form>
+  </body>
+</html>
diff --git a/examples/misc/on_click_next/page_3.html b/examples/misc/on_click_next/page_3.html
@@ -0,0 +1,7 @@
+<html>
+  <body>
+    <div>7</div>
+    <div>8</div>
+    <div>9</div>
+  </body>
+</html>
diff --git a/lib/scrubyt/core/navigation/agents/firewatir.rb b/lib/scrubyt/core/navigation/agents/firewatir.rb
@@ -103,6 +103,10 @@ def self.click_link(link_spec,index = 0,wait_secs=0)
             end            
             sleep(wait_secs) if wait_secs > 0
             @@agent.wait
+
+            # evaluate the results
+            extractor.evaluate_extractor
+
             @@current_doc_url = @@agent.url
             @@mechanize_doc = "<html>#{@@agent.html}</html>"
             @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))

diff --git a/lib/scrubyt/core/navigation/fetch_action.rb b/lib/scrubyt/core/navigation/fetch_action.rb
@@ -14,6 +14,15 @@ module FetchAction
     @@host_name = nil
     @@history = []
     @@current_form = nil
+    @@extractor = nil
+
+    def self.extractor=(extractor)
+      @@extractor = extractor
+    end
+
+    def self.extractor
+      return @@extractor
+    end
 
     ##
     # At any given point, the current document can be queried with this method; Typically used

diff --git a/lib/scrubyt/core/shared/extractor.rb b/lib/scrubyt/core/shared/extractor.rb
@@ -71,6 +71,7 @@ def method_missing(method_name, *args, &block)
           root_pattern
         end
       end
+      FetchAction.extractor = self
       context.extractor = self
       context.instance_eval(&extractor_definition)
       @evaluating_extractor_definition = false
@@ -82,10 +83,10 @@ def method_missing(method_name, *args, &block)
       end
 
       #Once all is set up, evaluate the extractor from the root pattern!
-      root_results = evaluate_extractor
+      evaluate_extractor
 
       @result = ScrubytResult.new('root')
-      @result.push(*root_results)
+      @result.push(*@root_results)
       @result.root_patterns = @root_patterns
       @result.source_file = source_file
       @result.source_proc = extractor_definition
@@ -127,14 +128,14 @@ def add_to_next_page_list(result_node)
     end
 
     def evaluate_extractor
-      root_results = []
+      @root_results ||= []
       current_page_count = 1
       catch :quit_next_page_loop do
         loop do
           url = get_current_doc_url #TODO need absolute address here 2/4
           @processed_pages << url
           @root_patterns.each do |root_pattern|
-            root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
+            @root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
           end
 
           while @processed_pages.include? url #TODO need absolute address here 3/4
@@ -160,7 +161,8 @@ def evaluate_extractor
           current_page_count += 1
         end
       end
-      root_results
+      @root_patterns = []
+      @root_results
     end
 
   end

diff --git a/lib/scrubyt/output/result_node.rb b/lib/scrubyt/output/result_node.rb
@@ -20,6 +20,7 @@ def has_content?
     end
 
     def to_s
+      return "" if result.nil?
       text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
       text = SharedUtils.unescape_entities(text)
       text.strip!
@@ -29,6 +30,10 @@ def to_s
         text
       end
     end
+
+    def inspect
+      to_s
+    end
 
     def to_libxml
       libxml_node = XML::Node.new(name)