Skip to content

Commit

Permalink
Updated crawl destroy to work with Sidekiq
Browse files Browse the repository at this point in the history
  • Loading branch information
stewartmckee committed Mar 15, 2015
1 parent 4528753 commit 219e388
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 45 deletions.
2 changes: 1 addition & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ group :test do
gem 'coveralls', :require => false
# gem 'thin'
# gem 'sidekiq'
end
end
58 changes: 40 additions & 18 deletions lib/cobweb_crawl_helper.rb
Original file line number Diff line number Diff line change
@@ -1,40 +1,54 @@
# The crawl class gives easy access to information about the crawl, and gives the ability to stop a crawl
class CobwebCrawlHelper

attr_accessor :id

BATCH_SIZE = 200
FINISHED = "Finished"
STARTING = "Starting"
CANCELLED = "Cancelled"

def initialize(data)
  # data: the crawl options hash. It must carry :crawl_id (read by #id) and
  # may carry queue configuration (:queue_system, :processing_queue, etc.)
  # consumed later by #destroy.
  @data = data

  # TAKING A LONG TIME TO RUN ON PRODUCTION BOX
  # NOTE(review): Stats is built eagerly on every construction; per the note
  # above this is slow in production — consider lazy-building it inside
  # #statistics instead. TODO confirm Stats.new has no required side effects.
  @stats = Stats.new(data)
end

def destroy
options = @data
options[:queue_name] = "cobweb_crawl_job" unless options.has_key?(:queue_name)
if RESQUE_INSTALLED
options[:finished_resque_queue] = CobwebFinishedJob unless options.has_key?(:finished_resque_queue)
options[:processing_queue] = "CobwebJob" unless options.has_key?(:processing_queue)
options[:crawl_finished_queue] = "CobwebFinishedJob" unless options.has_key?(:crawl_finished_queue)
end
if SIDEKIQ_INSTALLED
options[:processing_queue] = "CrawlWorker" unless options.has_key?(:processing_queue)
options[:crawl_finished_queue] = "CrawlFinishedWorker" unless options.has_key?(:crawl_finished_queue)
end

# set status as cancelled now so that we don't enqueue any further pages
self.statistics.end_crawl(@data, true)


if options[:finished_resque_queue] && options[:queue_system] == :resque && RESQUE_INSTALLED


if options[:crawl_finished_queue] && options[:queue_system] == :resque && RESQUE_INSTALLED

additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
Resque.enqueue(options[:finished_resque_queue], @stats.get_statistics.merge(additional_stats))

Resque.enqueue(options[:crawl_finished_queue], @stats.get_statistics.merge(additional_stats))
end


if options[:crawl_finished_queue] && options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED

additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?

Kernel.const_get(options[:crawl_finished_queue]).perform_async(@stats.get_statistics.merge(additional_stats))
end

counter = 0
while(counter < 200) do
break if self.statistics.get_status == CANCELLED
Expand All @@ -55,19 +69,27 @@ def destroy
end
end
end

if options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
queue_name = Kernel.const_get(options[:processing_queue]).sidekiq_options_hash["queue"]
queue = Sidekiq::Queue.new(queue_name)
queue.each do |job|
job.args # => [1, 2, 3]
job.delete if job.args[0]["crawl_id"] == id
end
end

end

# Reader for the Stats instance created in #initialize; callers use it to
# query crawl state (e.g. #status, and the cancellation flow in #destroy).
def statistics
  @stats
end

# Current status string of the crawl as reported by the Stats object
# (e.g. "Starting", "Finished", "Cancelled" — see the class constants).
def status
  statistics.get_status
end

# The unique identifier of this crawl, read from the :crawl_id key of the
# data hash supplied to #initialize. Note this definition shadows the reader
# generated by `attr_accessor :id` on the class.
def id
  @data[:crawl_id]
end

end
71 changes: 53 additions & 18 deletions spec/cobweb/cobweb_crawl_helper_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@
describe CobwebCrawlHelper do
include HttpStubs
before(:each) do
pending("not enabled for non resque installs") unless RESQUE_INSTALLED
pending("requires resque or sidekiq") unless RESQUE_INSTALLED || SIDEKIQ_INSTALLED

setup_stubs
end
# this spec tests the crawl object

describe "initialize" do
describe "without data" do
it "should raise an exception" do
lambda {CobwebCrawlHelper.new}.should raise_exception
end
end

describe "with data" do
before(:each) do
data = {:crawl_id => "asdf"}
Expand All @@ -30,41 +30,76 @@
it "should return a status" do
@crawl.should respond_to "status"
end
describe "the destroy method" do

describe "the destroy method " do
before(:each) do
if Resque.size("cobweb_crawl_job") > 0
raise "cobweb_crawl_job is not empty, do not run specs until it is!"
if SIDEKIQ_INSTALLED
if Sidekiq::Queue.new("crawl_worker").size > 0
raise "cobweb_crawl_job is not empty, do not run specs until it is!"
end
elsif RESQUE_INSTALLED
if Resque.size("cobweb_crawl_job") > 0
raise "cobweb_crawl_job is not empty, do not run specs until it is!"
end
end

105.times do |item_count|
2.times do |crawl_count|
item_data = {:crawl_id => "crawl_#{crawl_count}_id", :url => "http://crawl#{crawl_count}.com/page#{item_count}.html"}
Resque.enqueue(CrawlJob, item_data)
if SIDEKIQ_INSTALLED
item_data = {:crawl_id => "crawl_#{crawl_count}_id", :url => "http://crawl#{crawl_count}.com/page#{item_count}.html"}
CrawlWorker.perform_async(item_data)
elsif RESQUE_INSTALLED
item_data = {:crawl_id => "crawl_#{crawl_count}_id", :url => "http://crawl#{crawl_count}.com/page#{item_count}.html"}
Resque.enqueue(CrawlJob, item_data)
end
end
end
end
after(:each) do
Sidekiq::Queue.new("crawl_worker").clear if SIDEKIQ_INSTALLED
Resque.remove_queue("cobweb_crawl_job") if RESQUE_INSTALLED
end
it "should have a queue length of 210" do
Resque.size("cobweb_crawl_job").should == 210
Sidekiq::Queue.new("crawl_worker").size.should == 210 if SIDEKIQ_INSTALLED
Resque.size("cobweb_crawl_job").should == 210 if RESQUE_INSTALLED
end
describe "after called" do
before(:each) do
@crawl = CobwebCrawlHelper.new({:crawl_id => "crawl_0_id", :queue_system => :resque})
if SIDEKIQ_INSTALLED
@crawl = CobwebCrawlHelper.new({:crawl_id => "crawl_0_id", :queue_system => :sidekiq}) if SIDEKIQ_INSTALLED
elsif RESQUE_INSTALLED
@crawl = CobwebCrawlHelper.new({:crawl_id => "crawl_0_id", :queue_system => :resque}) if RESQUE_INSTALLED
end
@crawl.destroy
end
it "should delete only the crawl specified" do
Resque.size("cobweb_crawl_job").should == 105
if SIDEKIQ_INSTALLED
Sidekiq::Queue.new("crawl_worker").size.should == 105
elsif RESQUE_INSTALLED
Resque.size("cobweb_crawl_job").should == 105
end

end
it "should not contain any crawl_0_id" do
Resque.peek("cobweb_crawl_job", 0, 200).map{|i| i["args"][0]}.each do |item|
item["crawl_id"].should_not == "crawl_0_id"
if SIDEKIQ_INSTALLED
Sidekiq::Queue.new("crawl_job").each do |item|
item.args[0]["crawl_id"].should_not == "crawl_0_id"
end
elsif RESQUE_INSTALLED
Resque.peek("cobweb_crawl_job", 0, 200).map{|i| i["args"][0]}.each do |item|
item["crawl_id"].should_not == "crawl_0_id"
end
end
end
it "should only contain crawl_1_id" do
Resque.peek("cobweb_crawl_job", 0, 200).map{|i| i["args"][0]}.each do |item|
item["crawl_id"].should == "crawl_1_id"
if SIDEKIQ_INSTALLED
Sidekiq::Queue.new("crawl_job").each do |item|
item.args[0]["crawl_id"].should == "crawl_1_id"
end
elsif RESQUE_INSTALLED
Resque.peek("cobweb_crawl_job", 0, 200).map{|i| i["args"][0]}.each do |item|
item["crawl_id"].should == "crawl_1_id"
end
end
end
it "should set status to 'Cancelled'" do
Expand All @@ -74,6 +109,6 @@
end
end
end


end
15 changes: 7 additions & 8 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
require File.expand_path(File.dirname(__FILE__) + '/../spec/http_stubs')
require 'mock_redis'


require 'coveralls'
Coveralls.wear!

Expand All @@ -13,11 +12,11 @@
APP_ROOT = File.expand_path(File.dirname(__FILE__) + '/../')

RSpec.configure do |config|

if ENV["TRAVIS_RUBY_VERSION"] || ENV['CI']
config.filter_run_excluding :local_only => true
end

THIN_INSTALLED = false
if Gem::Specification.find_all_by_name("thin", ">=1.0.0").count >= 1
require 'thin'
Expand All @@ -30,18 +29,18 @@
# WAIT FOR START TO COMPLETE
sleep 1


config.before(:all) {
# START THIN SERVER TO HOST THE SAMPLE SITE FOR CRAWLING
}

config.before(:each) {

#redis_mock = double("redis")
#redis_mock.stub(:new).and_return(@redis_mock_object)

#redis_mock.flushdb

}

end

0 comments on commit 219e388

Please sign in to comment.