Skip to content

Commit

Permalink
added rspec and bundler
Browse files Browse the repository at this point in the history
  • Loading branch information
stewartmckee committed Feb 9, 2012
1 parent 906142e commit 2309330
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 2 deletions.
2 changes: 2 additions & 0 deletions .rspec
@@ -0,0 +1,2 @@
--color
--format documentation
9 changes: 9 additions & 0 deletions Gemfile
@@ -0,0 +1,9 @@
# Gem dependencies for the project, resolved by Bundler.
# Use the HTTPS endpoint so gem metadata and downloads are not fetched over
# an unauthenticated plain-HTTP connection.
source 'https://rubygems.org'

gem 'rspec'
gem 'redis'
gem 'resque'
gem 'addressable'
gem 'awesome_print'
gem 'nokogiri'
gem 'absolutize'
47 changes: 47 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,47 @@
GEM
remote: http://rubygems.org/
specs:
absolutize (0.0.12)
addressable (2.2.6)
awesome_print (1.0.2)
diff-lcs (1.1.3)
multi_json (1.0.4)
nokogiri (1.5.0)
rack (1.4.1)
rack-protection (1.2.0)
rack
redis (2.2.2)
redis-namespace (1.0.3)
redis (< 3.0.0)
resque (1.19.0)
multi_json (~> 1.0)
redis-namespace (~> 1.0.2)
sinatra (>= 0.9.2)
vegas (~> 0.1.2)
rspec (2.8.0)
rspec-core (~> 2.8.0)
rspec-expectations (~> 2.8.0)
rspec-mocks (~> 2.8.0)
rspec-core (2.8.0)
rspec-expectations (2.8.0)
diff-lcs (~> 1.1.2)
rspec-mocks (2.8.0)
sinatra (1.3.2)
rack (~> 1.3, >= 1.3.6)
rack-protection (~> 1.2)
tilt (~> 1.3, >= 1.3.3)
tilt (1.3.3)
vegas (0.1.11)
rack (>= 1.0.0)

PLATFORMS
ruby

DEPENDENCIES
absolutize
addressable
awesome_print
nokogiri
redis
resque
rspec
1 change: 1 addition & 0 deletions cobweb.gemspec
Expand Up @@ -15,5 +15,6 @@ spec = Gem::Specification.new do |s|
s.add_dependency('absolutize')
s.add_dependency('nokogiri')
s.add_dependency('addressable')
s.add_dependency('rspec')

end
102 changes: 102 additions & 0 deletions lib/cobweb_crawler.rb
@@ -0,0 +1,102 @@
# In-process crawler: fetches a URL through CobWeb, records aggregate
# statistics about each response and recursively follows every link that
# starts with the configured base URL.
#
# The hash returned by CobWeb#get is read with the keys :response_time,
# :length, :mime_type, :status_code and :links (:links is treated as a hash
# whose values are arrays of URLs).
class CobwebCrawler

  # options - Hash that is also handed to CobWeb.new. Keys read here:
  #   :base_url    - prefix a link must start with to be followed
  #                  (defaults to the first URL passed to #crawl)
  #   :crawl_limit - maximum number of URLs to fetch (no limit when absent)
  #   :debug       - print progress messages when truthy
  def initialize(options={})
    @options = options

    @statistics = {}
    @queue = []
    @crawled = []
  end

  # Fetches url (unless it was already crawled or the crawl limit has been
  # reached), updates the statistics and recurses into internal links.
  #
  # Returns the statistics hash accumulated so far.
  def crawl(url)
    @options[:base_url] = url unless @options.has_key? :base_url

    # Built once and reused across the recursive calls.
    @absolutize ||= Absolutize.new(@options[:base_url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)

    if @crawled.include? url
      puts "Already crawled #{url}" if @options[:debug]
      return @statistics
    end

    # Number of URLs fetched before this one; also the sample count behind
    # the running averages.
    crawl_counter = @crawled.count

    unless within_crawl_limit?(crawl_counter)
      puts "Crawl Limit Exceeded by #{crawl_counter - @options[:crawl_limit].to_i} objects" if @options[:debug]
      return @statistics
    end

    content = CobWeb.new(@options).get(url)
    record_statistics(content, crawl_counter)

    @queue.delete(url)
    @crawled << url

    follow_internal_links(content)

    puts "Crawled: #{crawl_counter} Limit: #{@options[:crawl_limit]} Queued: #{@queue.count}" if @options[:debug]

    @statistics
  end

  private

  # True while fewer than :crawl_limit URLs have been fetched, or when no
  # limit was configured.
  def within_crawl_limit?(crawl_counter)
    !@options.has_key?(:crawl_limit) || crawl_counter < @options[:crawl_limit].to_i
  end

  # Folds one fetched response into every statistic.
  def record_statistics(content, crawl_counter)
    record_response_time(content[:response_time].to_f, crawl_counter)
    record_length(content[:length].to_i, crawl_counter)
    record_content_type(content[:mime_type])
    record_status(content[:status_code].to_i)
  end

  # Updates the average / maximum / minimum response time.
  def record_response_time(response_time, crawl_counter)
    if @statistics[:average_response_time].nil?
      @statistics[:average_response_time] = response_time
    else
      # Running mean: previous total plus the new sample over the new count.
      @statistics[:average_response_time] = ((@statistics[:average_response_time] * crawl_counter) + response_time) / (crawl_counter + 1)
    end

    @statistics[:maximum_response_time] = response_time if @statistics[:maximum_response_time].nil? || @statistics[:maximum_response_time] < response_time
    @statistics[:minimum_response_time] = response_time if @statistics[:minimum_response_time].nil? || @statistics[:minimum_response_time] > response_time
  end

  # Updates the average / maximum / minimum content length (integer maths).
  def record_length(length, crawl_counter)
    if @statistics[:average_length]
      @statistics[:average_length] = ((@statistics[:average_length].to_i * crawl_counter) + length) / (crawl_counter + 1)
    else
      @statistics[:average_length] = length
    end

    @statistics[:maximum_length] = length if @statistics[:maximum_length].nil? || length > @statistics[:maximum_length].to_i
    @statistics[:minimum_length] = length if @statistics[:minimum_length].nil? || length < @statistics[:minimum_length].to_i
  end

  # Counts the response as a page (HTML / XHTML) or an asset and tallies its
  # mime type.
  def record_content_type(mime_type)
    mime = mime_type.to_s # tolerate a missing mime type
    if mime.include?("text/html") || mime.include?("application/xhtml+xml")
      @statistics[:page_count] = @statistics[:page_count].to_i + 1
    else
      @statistics[:asset_count] = @statistics[:asset_count].to_i + 1
    end

    mime_counts = (@statistics[:mime_counts] ||= {})
    mime_counts[mime_type] = mime_counts[mime_type].to_i + 1
  end

  # Tallies the HTTP status code.
  def record_status(status_code)
    status_counts = (@statistics[:status_counts] ||= {})
    status_counts[status_code] = status_counts[status_code].to_i + 1
  end

  # Queues and crawls every link of content that starts with the base URL and
  # has not been crawled or queued yet.
  def follow_internal_links(content)
    base_url = @options[:base_url].to_s

    (content[:links] || {}).values.flatten.each do |link|
      next if @crawled.include? link

      puts "Checking if #{link} matches #{@options[:base_url]} as internal?" if @options[:debug]
      # Literal prefix comparison: the base URL must not be interpreted as a
      # regular expression ("." would otherwise match any character).
      next unless link.to_s.start_with?(base_url)

      puts "Matched as #{link} as internal" if @options[:debug]
      next if @crawled.include?(link) || @queue.include?(link)

      @queue << link
      crawl(link)
    end
  end

end
4 changes: 2 additions & 2 deletions spec/cobweb/cobweb_spec.rb
@@ -1,5 +1,4 @@
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
require "ap"

describe CobWeb do

Expand Down Expand Up @@ -28,6 +27,7 @@

@mock_http_response = mock(Net::HTTPResponse)
@mock_http_redirect_response = mock(Net::HTTPRedirection)
@mock_http_redirect_response2 = mock(Net::HTTPRedirection)
@mock_http_get = mock(Net::HTTP::Get)

Net::HTTP.stub!(:new).and_return(@mock_http_client)
Expand Down Expand Up @@ -172,7 +172,7 @@

describe "without mock" do
it "should throw invalid url exception for an invalid url" do
lambda {@cobweb.get("asdgas asv\"£%\"^%&*%")}.should raise_error URI::InvalidURIError
lambda {@cobweb.get('asdgas as%%&*%')}.should raise_error URI::InvalidURIError
end

it "should throw exception when server is unavailable" #do
Expand Down

0 comments on commit 2309330

Please sign in to comment.