Skip to content

Commit

Permalink
Added the ability to scrape, click_link, then scrape again. (Only for…
Browse files Browse the repository at this point in the history
… firefox agent)
  • Loading branch information
Mikkel Garcia committed Apr 6, 2009
1 parent 17de69f commit 2a82536
Show file tree
Hide file tree
Showing 8 changed files with 94 additions and 5 deletions.
42 changes: 42 additions & 0 deletions examples/misc/on_click_next/next_page_link.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
$:.unshift File.join(File.dirname(__FILE__), '../../../lib')

require 'scrubyt'
require 'scrubyt/output/result_dumper'

# Example: paging through results when the "Next" link is driven by an
# onclick='' handler instead of a plain href.

# Absolute file:// URL of the first fixture page; both attempts start here.
fixture_dir = File.dirname(__FILE__)
start_url = "file://#{File.expand_path(File.join(fixture_dir, "page_1.html"))}"

# Attempt 1: declare the paging with next_page.
# Author's note: doesn't work, as next_page doesn't click links.
next_page_results = Scrubyt::Extractor.define(:agent => :firefox) do
  fetch start_url

  entry '//div'

  next_page 'Next'
end

puts "First results :"
puts next_page_results.to_xml

puts "---------------"

# Attempt 2: scrape, click the 'Next' link by hand, and scrape again until
# the link can no longer be found.
# Author's note: doesn't work, all the results are the same :(
click_link_results = Scrubyt::Extractor.define(:agent => :firefox) do
  fetch start_url

  while true
    entry '//div'

    begin
      click_link 'Next'
    rescue Watir::Exception::UnknownObjectException
      # No 'Next' link was found on the current page, so stop paging.
      puts "Reached the end. Breaking"
      break
    end
  end
end

puts "Second results:"
puts click_link_results.to_xml



10 changes: 10 additions & 0 deletions examples/misc/on_click_next/page_1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<!-- Fixture page 1 for the on_click_next example: three <div> entries (1, 2, 3)
     and a "Next" link whose href is only "#"; its onclick handler submits the
     nextPage form, whose action is page_2.html. -->
<html>
<body>
<div>1</div>
<div>2</div>
<div>3</div>
<form id='nextPage' action='page_2.html'>
<a onclick='document.getElementById("nextPage").submit(); return false' href="#">Next</a>
</form>
</body>
</html>
10 changes: 10 additions & 0 deletions examples/misc/on_click_next/page_2.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<!-- Fixture page 2 for the on_click_next example: three <div> entries (4, 5, 6)
     and a "Next" link whose href is only "#"; its onclick handler submits the
     nextPage form, whose action is page_3.html. -->
<html>
<body>
<div>4</div>
<div>5</div>
<div>6</div>
<form id='nextPage' action='page_3.html'>
<a onclick='document.getElementById("nextPage").submit(); return false' href="#">Next</a>
</form>
</body>
</html>
7 changes: 7 additions & 0 deletions examples/misc/on_click_next/page_3.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<!-- Fixture page 3 for the on_click_next example: three <div> entries (7, 8, 9)
     and no "Next" link or form, so it is the last page of the sequence. -->
<html>
<body>
<div>7</div>
<div>8</div>
<div>9</div>
</body>
</html>
4 changes: 4 additions & 0 deletions lib/scrubyt/core/navigation/agents/firewatir.rb
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ def self.click_link(link_spec,index = 0,wait_secs=0)
end
sleep(wait_secs) if wait_secs > 0
@@agent.wait

# evaluate the results
extractor.evaluate_extractor

@@current_doc_url = @@agent.url
@@mechanize_doc = "<html>#{@@agent.html}</html>"
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
Expand Down
9 changes: 9 additions & 0 deletions lib/scrubyt/core/navigation/fetch_action.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ module FetchAction
@@host_name = nil
@@history = []
@@current_form = nil
@@extractor = nil

##
# Stores the given extractor in the @@extractor class variable, from where it
# can be read back with FetchAction.extractor.
def self.extractor=(extractor)
@@extractor = extractor
end

##
# Returns the extractor last stored with FetchAction.extractor=, or nil when
# none has been stored yet (@@extractor starts out as nil).
def self.extractor
  @@extractor
end

##
# At any given point, the current document can be queried with this method; Typically used
Expand Down
12 changes: 7 additions & 5 deletions lib/scrubyt/core/shared/extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def method_missing(method_name, *args, &block)
root_pattern
end
end
FetchAction.extractor = self
context.extractor = self
context.instance_eval(&extractor_definition)
@evaluating_extractor_definition = false
Expand All @@ -82,10 +83,10 @@ def method_missing(method_name, *args, &block)
end

#Once all is set up, evaluate the extractor from the root pattern!
root_results = evaluate_extractor
evaluate_extractor

@result = ScrubytResult.new('root')
@result.push(*root_results)
@result.push(*@root_results)
@result.root_patterns = @root_patterns
@result.source_file = source_file
@result.source_proc = extractor_definition
Expand Down Expand Up @@ -127,14 +128,14 @@ def add_to_next_page_list(result_node)
end

def evaluate_extractor
root_results = []
@root_results ||= []
current_page_count = 1
catch :quit_next_page_loop do
loop do
url = get_current_doc_url #TODO need absolute address here 2/4
@processed_pages << url
@root_patterns.each do |root_pattern|
root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
@root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
end

while @processed_pages.include? url #TODO need absolute address here 3/4
Expand All @@ -160,7 +161,8 @@ def evaluate_extractor
current_page_count += 1
end
end
root_results
@root_patterns = []
@root_results
end

end
Expand Down
5 changes: 5 additions & 0 deletions lib/scrubyt/output/result_node.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def has_content?
end

def to_s
return "" if result.nil?
text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
text = SharedUtils.unescape_entities(text)
text.strip!
Expand All @@ -29,6 +30,10 @@ def to_s
text
end
end

# Makes inspect return the same string as to_s for this node.
def inspect
to_s
end

def to_libxml
libxml_node = XML::Node.new(name)
Expand Down

0 comments on commit 2a82536

Please sign in to comment.