Skip to content
Browse files

follow links to subdomains if :crawl_subdomains => true

  • Loading branch information...
1 parent c0f75cc commit 91559bde052956cfc40ae62678ec2a61574cf928 @runix runix committed Mar 10, 2011
Showing with 26 additions and 10 deletions.
  1. +13 −1 lib/anemone/core.rb
  2. +1 −9 lib/anemone/page.rb
  3. +12 −0 spec/core_spec.rb
View
14 lib/anemone/core.rb
@@ -54,6 +54,8 @@ class Core
:link_queue => Queue.new,
# Manager for the page processing queue
:page_queue => Queue.new,
+ # Crawl subdomains?
+ :crawl_subdomains => false,
}
# Create setter methods for all options to be called from the crawl block
@@ -70,6 +72,7 @@ class Core
def initialize(urls, opts = {})
@urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
@urls.each{ |url| url.path = '/' if url.path.empty? }
+ @valid_domains = @urls.map{|u| [u.host,u.host.gsub(/^www\./,'.')]}.flatten.compact.uniq
@tentacles = []
@on_every_page_blocks = []
@@ -254,7 +257,16 @@ def visit_link?(link, from_page = nil)
!skip_link?(link) &&
!skip_query_string?(link) &&
allowed(link) &&
- !too_deep?(from_page)
+ !too_deep?(from_page) &&
+ (in_allowed_domain?(link) or in_allowed_subdomain?(link))
+ end
+
#
# Returns +true+ if the link's host exactly matches one of the
# domains derived from the seed URLs (@valid_domains), +false+
# otherwise.
#
# Uses Array#include? instead of Array#index: include? is the
# idiomatic membership predicate and returns a real boolean,
# whereas index returned an Integer (or nil).
#
def in_allowed_domain?(link)
  @valid_domains.include?(link.host)
end
+
#
# Returns +true+ when the :crawl_subdomains option is enabled and the
# link's host is one of the allowed domains or a true subdomain of
# one, +false+ otherwise.
#
# Fixes three issues with the original implementation:
# * `and` replaced by a guard clause (`and` has precedence traps and
#   should not be used for boolean logic);
# * `find` returned the matched domain String — a predicate method
#   should return a boolean, so `any?` is used instead;
# * the bare end_with?(domain) suffix test matched unrelated hosts
#   such as "notexample.com" for the domain "example.com"; the suffix
#   is now anchored at a dot boundary.
#
def in_allowed_subdomain?(link)
  return false unless opts[:crawl_subdomains]
  host = link.host.to_s
  @valid_domains.any? do |domain|
    # Domains produced by the www-stripping in #initialize may already
    # start with '.', in which case they are safe suffix anchors.
    suffix = domain.start_with?('.') ? domain : ".#{domain}"
    host == domain || host.end_with?(suffix)
  end
end
#
View
10 lib/anemone/page.rb
@@ -63,7 +63,7 @@ def links
u = a['href']
next if u.nil? or u.empty?
abs = to_absolute(URI(u)) rescue next
- @links << abs if in_domain?(abs)
+ @links << abs
end
@links.uniq!
@links
@@ -158,14 +158,6 @@ def to_absolute(link)
return absolute
end
- #
- # Returns +true+ if *uri* is in the same domain as the page, returns
- # +false+ otherwise
- #
- def in_domain?(uri)
- uri.host == @url.host
- end
-
def marshal_dump
[@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
end
View
12 spec/core_spec.rb
@@ -41,6 +41,18 @@ module Anemone
core.pages.keys.should_not include('http://www.other.com/')
end
it "should follow links to subdomains" do
  # Seed page links to an off-site host and to a subdomain of the
  # seed's domain; with :crawl_subdomains enabled only the subdomain
  # link should be followed.
  root = FakePage.new('0', :links => ['1'],
                           :hrefs => ['http://www.other.com/',
                                      'http://subdomain.example.com/'])
  FakePage.new('1')

  core = Anemone.crawl(root.url, @opts.merge(:crawl_subdomains => true))

  core.should have(3).pages
  core.pages.keys.should include('http://subdomain.example.com/')
  core.pages.keys.should_not include('http://www.other.com/')
end
+
it "should follow http redirects" do
pages = []
pages << FakePage.new('0', :links => ['1'])

0 comments on commit 91559bd

Please sign in to comment.
Something went wrong with that request. Please try again.