diff --git a/.rspec b/.rspec
new file mode 100644
index 0000000..16f9cdb
--- /dev/null
+++ b/.rspec
@@ -0,0 +1,2 @@
+--color
+--format documentation
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..e2a82a2
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,9 @@
+source 'http://rubygems.org'
+
+gem 'rspec'
+gem 'redis'
+gem 'resque'
+gem 'addressable'
+gem 'awesome_print'
+gem 'nokogiri'
+gem 'absolutize'
\ No newline at end of file
diff --git a/Gemfile.lock b/Gemfile.lock
new file mode 100644
index 0000000..ba75d79
--- /dev/null
+++ b/Gemfile.lock
@@ -0,0 +1,47 @@
+GEM
+  remote: http://rubygems.org/
+  specs:
+    absolutize (0.0.12)
+    addressable (2.2.6)
+    awesome_print (1.0.2)
+    diff-lcs (1.1.3)
+    multi_json (1.0.4)
+    nokogiri (1.5.0)
+    rack (1.4.1)
+    rack-protection (1.2.0)
+      rack
+    redis (2.2.2)
+    redis-namespace (1.0.3)
+      redis (< 3.0.0)
+    resque (1.19.0)
+      multi_json (~> 1.0)
+      redis-namespace (~> 1.0.2)
+      sinatra (>= 0.9.2)
+      vegas (~> 0.1.2)
+    rspec (2.8.0)
+      rspec-core (~> 2.8.0)
+      rspec-expectations (~> 2.8.0)
+      rspec-mocks (~> 2.8.0)
+    rspec-core (2.8.0)
+    rspec-expectations (2.8.0)
+      diff-lcs (~> 1.1.2)
+    rspec-mocks (2.8.0)
+    sinatra (1.3.2)
+      rack (~> 1.3, >= 1.3.6)
+      rack-protection (~> 1.2)
+      tilt (~> 1.3, >= 1.3.3)
+    tilt (1.3.3)
+    vegas (0.1.11)
+      rack (>= 1.0.0)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  absolutize
+  addressable
+  awesome_print
+  nokogiri
+  redis
+  resque
+  rspec
diff --git a/cobweb.gemspec b/cobweb.gemspec
index 178554d..e324394 100644
--- a/cobweb.gemspec
+++ b/cobweb.gemspec
@@ -15,5 +15,6 @@ spec = Gem::Specification.new do |s|
   s.add_dependency('absolutize')
   s.add_dependency('nokogiri')
   s.add_dependency('addressable')
+  s.add_dependency('rspec')
 
 end
diff --git a/lib/cobweb_crawler.rb b/lib/cobweb_crawler.rb
new file mode 100644
index 0000000..99c4958
--- /dev/null
+++ b/lib/cobweb_crawler.rb
@@ -0,0 +1,126 @@
+# Crawls a site breadth-first from a starting URL, following links that fall
+# under the base URL and accumulating response statistics along the way.
+class CobwebCrawler
+
+  # options - Hash handed on to CobWeb.new for every request. Keys read here:
+  #           :base_url    - URL prefix a link must start with to be followed
+  #                          (defaults to the first URL passed to #crawl)
+  #           :crawl_limit - maximum number of URLs to fetch (optional)
+  #           :debug       - when truthy, progress is written with puts
+  def initialize(options={})
+    @options = options
+
+    @queue = []
+    @crawled = []
+    @statistics = {}
+  end
+
+  # Fetches url and every not-yet-crawled internal link reachable from it,
+  # until the queue is empty or :crawl_limit URLs have been fetched.
+  #
+  # The queue is worked through iteratively rather than recursively so that
+  # a large site cannot exhaust the stack.
+  #
+  # Returns the statistics Hash accumulated so far (:average_response_time,
+  # :maximum_response_time, :minimum_response_time, :average_length,
+  # :maximum_length, :minimum_length, :page_count, :asset_count,
+  # :mime_counts, :status_counts).
+  def crawl(url)
+    @options[:base_url] = url unless @options.has_key? :base_url
+
+    @absolutize = Absolutize.new(@options[:base_url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
+
+    if @crawled.include? url
+      puts "Already crawled #{url}" if @options[:debug]
+      return @statistics
+    end
+
+    @queue << url unless @queue.include? url
+
+    until @queue.empty?
+      if crawl_limit_reached?
+        puts "Crawl Limit Exceeded by #{@crawled.count - @options[:crawl_limit].to_i} objects" if @options[:debug]
+        break
+      end
+
+      current_url = @queue.shift
+      content = CobWeb.new(@options).get(current_url)
+
+      update_statistics(content)
+      @crawled << current_url
+      queue_internal_links(content)
+
+      puts "Crawled: #{@crawled.count} Limit: #{@options[:crawl_limit]} Queued: #{@queue.count}" if @options[:debug]
+    end
+
+    @statistics
+  end
+
+  private
+
+  # True once :crawl_limit is set and that many URLs have been fetched.
+  def crawl_limit_reached?
+    @options.has_key?(:crawl_limit) && @crawled.count >= @options[:crawl_limit].to_i
+  end
+
+  # Folds one fetched document into the running statistics.
+  def update_statistics(content)
+    crawl_counter = @crawled.count # documents already counted
+    response_time = content[:response_time].to_f
+    length = content[:length].to_i
+
+    if @statistics[:average_response_time].nil?
+      @statistics[:average_response_time] = response_time
+    else
+      @statistics[:average_response_time] = ((@statistics[:average_response_time] * crawl_counter) + response_time) / (crawl_counter + 1)
+    end
+
+    @statistics[:maximum_response_time] = response_time if @statistics[:maximum_response_time].nil? || @statistics[:maximum_response_time] < response_time
+    @statistics[:minimum_response_time] = response_time if @statistics[:minimum_response_time].nil? || @statistics[:minimum_response_time] > response_time
+
+    if @statistics[:average_length]
+      # integer division: lengths are kept as whole numbers
+      @statistics[:average_length] = ((@statistics[:average_length].to_i * crawl_counter) + length) / (crawl_counter + 1)
+    else
+      @statistics[:average_length] = length
+    end
+
+    @statistics[:maximum_length] = length if @statistics[:maximum_length].nil? || length > @statistics[:maximum_length].to_i
+    @statistics[:minimum_length] = length if @statistics[:minimum_length].nil? || length < @statistics[:minimum_length].to_i
+
+    mime_type = content[:mime_type].to_s
+    if mime_type.include?("text/html") || mime_type.include?("application/xhtml+xml")
+      @statistics[:page_count] = @statistics[:page_count].to_i + 1
+    else
+      @statistics[:asset_count] = @statistics[:asset_count].to_i + 1
+    end
+
+    increment_count(:mime_counts, content[:mime_type])
+    increment_count(:status_counts, content[:status_code].to_i)
+  end
+
+  # Adds one to the tally for key in the named statistics sub-hash.
+  def increment_count(statistic, key)
+    counts = (@statistics[statistic] ||= {})
+    counts[key] = counts[key].to_i + 1
+  end
+
+  # Queues every link in content that starts with the base URL and has not
+  # been crawled or queued yet.
+  def queue_internal_links(content)
+    base_url = @options[:base_url].to_s
+
+    (content[:links] || {}).values.flatten.each do |link|
+      link = link.to_s
+      next if @crawled.include?(link) || @queue.include?(link)
+
+      puts "Checking if #{link} matches #{@options[:base_url]} as internal?" if @options[:debug]
+      # plain prefix test: the URL must not be interpreted as a regexp
+      if link.start_with?(base_url)
+        puts "Matched as #{link} as internal" if @options[:debug]
+        @queue << link
+      end
+    end
+  end
+
+end
diff --git a/spec/cobweb/cobweb_spec.rb b/spec/cobweb/cobweb_spec.rb
index ba539b2..3574eb3 100644
--- a/spec/cobweb/cobweb_spec.rb
+++ b/spec/cobweb/cobweb_spec.rb
@@ -1,5 +1,4 @@
 require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
-require "ap"
 
 describe CobWeb do
 
@@ -28,6 +27,7 @@
 
     @mock_http_response = mock(Net::HTTPResponse)
     @mock_http_redirect_response = mock(Net::HTTPRedirection)
+    @mock_http_redirect_response2 = mock(Net::HTTPRedirection)
     @mock_http_get = mock(Net::HTTP::Get)
 
     Net::HTTP.stub!(:new).and_return(@mock_http_client)
@@ -172,7 +172,7 @@
 
   describe "without mock" do
     it "should throw invalid url exception for an invalid url" do
-      lambda {@cobweb.get("asdgas asv\"£%\"^%&*%")}.should raise_error URI::InvalidURIError
+      lambda {@cobweb.get('asdgas as%%&*%')}.should raise_error URI::InvalidURIError
     end
 
     it "should throw exception when server is unavailable" #do