From 57da8c1cc528883fca48456ac7eb7a077db0e0c2 Mon Sep 17 00:00:00 2001
From: Rob Dawson
Date: Tue, 17 Jul 2012 10:05:31 +1000
Subject: [PATCH] Added the target of redirected urls to the list of crawled
 urls.

---
 lib/crawl_job.rb | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/crawl_job.rb b/lib/crawl_job.rb
index c43d1f2..c264754 100644
--- a/lib/crawl_job.rb
+++ b/lib/crawl_job.rb
@@ -35,8 +35,11 @@ def self.perform(content_request)
     # if there is no limit or we're still under it lets get the url
     if within_crawl_limits?(content_request[:crawl_limit])
       begin
+        # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
         @redis.srem "queued", content_request[:url]
         @redis.sadd "crawled", content_request[:url]
+        @redis.srem "queued", content[:url]
+        @redis.sadd "crawled", content[:url]
         # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
         if content_request[:crawl_limit_by_page]
           if content[:mime_type].match("text/html")