Skip to content

Commit

Permalink
follow links to subdomains if :crawl_subdomains => true
Browse files Browse the repository at this point in the history
  • Loading branch information
runix committed Mar 10, 2011
1 parent c0f75cc commit 91559bd
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 10 deletions.
14 changes: 13 additions & 1 deletion lib/anemone/core.rb
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ class Core
:link_queue => Queue.new,
# Manager for the page processing queue
:page_queue => Queue.new,
# Crawl subdomains?
:crawl_subdomains => false,
}

# Create setter methods for all options to be called from the crawl block
Expand All @@ -70,6 +72,7 @@ class Core
def initialize(urls, opts = {})
@urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
@urls.each{ |url| url.path = '/' if url.path.empty? }
@valid_domains = @urls.map{|u| [u.host,u.host.gsub(/^www\./,'.')]}.flatten.compact.uniq

@tentacles = []
@on_every_page_blocks = []
Expand Down Expand Up @@ -254,7 +257,16 @@ def visit_link?(link, from_page = nil)
!skip_link?(link) &&
!skip_query_string?(link) &&
allowed(link) &&
!too_deep?(from_page)
!too_deep?(from_page) &&
(in_allowed_domain?(link) or in_allowed_subdomain?(link))
end

#
# Returns a truthy value (the position of the host within the list of
# valid domains) if the host of *link* is one of the valid domains,
# returns +nil+ otherwise
#
def in_allowed_domain?(link)
  host = link.host
  @valid_domains.find_index(host)
end

#
# Returns +true+ if the :crawl_subdomains option is enabled and the host
# of *link* is a subdomain of one of the valid domains, returns +false+
# otherwise (including when *link* has no host at all)
#
def in_allowed_subdomain?(link)
  return false unless opts[:crawl_subdomains]

  host = link.host
  return false if host.nil?

  @valid_domains.any? do |domain|
    # Match on a label boundary only: without the leading dot a host such
    # as "evilexample.com" would pass as a subdomain of "example.com"
    suffix = domain.start_with?('.') ? domain : ".#{domain}"
    host.end_with?(suffix)
  end
end

#
Expand Down
10 changes: 1 addition & 9 deletions lib/anemone/page.rb
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def links
u = a['href']
next if u.nil? or u.empty?
abs = to_absolute(URI(u)) rescue next
@links << abs if in_domain?(abs)
@links << abs
end
@links.uniq!
@links
Expand Down Expand Up @@ -158,14 +158,6 @@ def to_absolute(link)
return absolute
end

#
# Returns +true+ if *uri* is in the same domain as the page, returns
# +false+ otherwise
#
def in_domain?(uri)
  # Hosts must match exactly; a different subdomain counts as another domain
  candidate_host = uri.host
  candidate_host == @url.host
end

#
# Returns the page state as an Array of instance variable values, in a
# fixed order
#
def marshal_dump
  # NOTE(review): the order presumably has to match marshal_load - confirm
  names = %w[url headers data body links code visited depth referer redirect_to response_time fetched]
  names.map { |name| instance_variable_get("@#{name}") }
end
Expand Down
12 changes: 12 additions & 0 deletions spec/core_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,18 @@ module Anemone
core.pages.keys.should_not include('http://www.other.com/')
end

it "should follow links to subdomains" do
  # The start page links to page '1' and carries two absolute hrefs: one on
  # another domain and one on subdomain.example.com
  # NOTE(review): presumably FakePage urls live on www.example.com - confirm
  absolute_hrefs = ['http://www.other.com/', 'http://subdomain.example.com/']
  start_page = FakePage.new('0', :links => ['1'], :hrefs => absolute_hrefs)
  FakePage.new('1')

  crawl_opts = @opts.merge({:crawl_subdomains => true})
  core = Anemone.crawl(start_page.url, crawl_opts)

  # Expect three pages crawled, with the subdomain url among them and the
  # other domain left out
  core.should have(3).pages
  core.pages.keys.should_not include('http://www.other.com/')
  core.pages.keys.should include('http://subdomain.example.com/')
end

it "should follow http redirects" do
pages = []
pages << FakePage.new('0', :links => ['1'])
Expand Down

0 comments on commit 91559bd

Please sign in to comment.