From 57da8c1cc528883fca48456ac7eb7a077db0e0c2 Mon Sep 17 00:00:00 2001
From: Rob Dawson
Date: Tue, 17 Jul 2012 10:05:31 +1000
Subject: [PATCH] Added the target of redirected urls to the list of crawled
 urls.

---
 lib/crawl_job.rb | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/crawl_job.rb b/lib/crawl_job.rb
index c43d1f2..c264754 100644
--- a/lib/crawl_job.rb
+++ b/lib/crawl_job.rb
@@ -35,8 +35,11 @@ def self.perform(content_request)
     # if there is no limit or we're still under it lets get the url
     if within_crawl_limits?(content_request[:crawl_limit])
       begin
+        # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
         @redis.srem "queued", content_request[:url]
         @redis.sadd "crawled", content_request[:url]
+        @redis.srem "queued", content[:url]
+        @redis.sadd "crawled", content[:url]
         # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
         if content_request[:crawl_limit_by_page]
           if content[:mime_type].match("text/html")