Permalink
Browse files

Merge pull request #140 from themgt/master

expose bots hash for editing from config block
  • Loading branch information...
andrew committed Mar 7, 2013
2 parents b95e0b5 + 4de2bca commit 563d9e49f43cfeaed40185b7fb7973145797c52c
Showing with 64 additions and 50 deletions.
  1. +3 −1 README.mdown
  2. +56 −49 lib/split/configuration.rb
  3. +5 −0 spec/configuration_spec.rb
View
@@ -248,7 +248,9 @@ You can override the default configuration options of Split like so:
```ruby
Split.configure do |config|
- config.robot_regex = /my_custom_robot_regex/
+ config.robot_regex = /my_custom_robot_regex/ # or
+ config.bots['newbot'] = "Description for bot with 'newbot' user agent, which will be added to config.robot_regex for exclusion"
+
config.ignore_ip_addresses << '81.19.48.130' # or regex: /81\.19\.48\.[0-9]+/
config.db_failover = true # handle redis errors gracefully
config.db_failover_on_db_error = proc{|error| Rails.logger.error(error.message) }
View
@@ -1,52 +1,6 @@
module Split
class Configuration
- BOTS = {
- # Indexers
- "AdsBot-Google" => 'Google Adwords',
- 'Baidu' => 'Chinese search engine',
- 'Gigabot' => 'Gigabot spider',
- 'Googlebot' => 'Google spider',
- 'msnbot' => 'Microsoft bot',
- 'bingbot' => 'Microsoft bing bot',
- 'rogerbot' => 'SeoMoz spider',
- 'Slurp' => 'Yahoo spider',
- 'Sogou' => 'Chinese search engine',
- "spider" => 'generic web spider',
- 'WordPress' => 'WordPress spider',
- 'ZIBB' => 'ZIBB spider',
- 'YandexBot' => 'Yandex spider',
- # HTTP libraries
- 'Apache-HttpClient' => 'Java http library',
- 'AppEngine-Google' => 'Google App Engine',
- "curl" => 'curl unix CLI http client',
- 'ColdFusion' => 'ColdFusion http library',
- "EventMachine HttpClient" => 'Ruby http library',
- "Go http package" => 'Go http library',
- 'Java' => 'Generic Java http library',
- 'libwww-perl' => 'Perl client-server library loved by script kids',
- 'lwp-trivial' => 'Another Perl library loved by script kids',
- "Python-urllib" => 'Python http library',
- "PycURL" => 'Python http library',
- "Test Certificate Info" => 'C http library?',
- "Wget" => 'wget unix CLI http client',
- # URL expanders / previewers
- 'awe.sm' => 'Awe.sm URL expander',
- "bitlybot" => 'bit.ly bot',
- "facebookexternalhit" => 'facebook bot',
- 'LongURL' => 'URL expander service',
- 'Twitterbot' => 'Twitter URL expander',
- 'UnwindFetch' => 'Gnip URL expander',
- # Uptime monitoring
- 'check_http' => 'Nagios monitor',
- 'NewRelicPinger' => 'NewRelic monitor',
- 'Panopta' => 'Monitoring service',
- "Pingdom" => 'Pingdom monitoring',
- 'SiteUptime' => 'Site monitoring services',
- # ???
- "DigitalPersona Fingerprint Software" => 'HP Fingerprint scanner',
- "ShowyouBot" => 'Showyou iOS app spider',
- 'ZyBorg' => 'Zyborg? Hmmm....',
- }
+ attr_accessor :bots
attr_accessor :robot_regex
attr_accessor :ignore_ip_addresses
attr_accessor :db_failover
@@ -58,6 +12,56 @@ class Configuration
attr_accessor :persistence
attr_accessor :algorithm
+ def bots
+ @bots ||= {
+ # Indexers
+ "AdsBot-Google" => 'Google Adwords',
+ 'Baidu' => 'Chinese search engine',
+ 'Gigabot' => 'Gigabot spider',
+ 'Googlebot' => 'Google spider',
+ 'msnbot' => 'Microsoft bot',
+ 'bingbot' => 'Microsoft bing bot',
+ 'rogerbot' => 'SeoMoz spider',
+ 'Slurp' => 'Yahoo spider',
+ 'Sogou' => 'Chinese search engine',
+ "spider" => 'generic web spider',
+ 'WordPress' => 'WordPress spider',
+ 'ZIBB' => 'ZIBB spider',
+ 'YandexBot' => 'Yandex spider',
+ # HTTP libraries
+ 'Apache-HttpClient' => 'Java http library',
+ 'AppEngine-Google' => 'Google App Engine',
+ "curl" => 'curl unix CLI http client',
+ 'ColdFusion' => 'ColdFusion http library',
+ "EventMachine HttpClient" => 'Ruby http library',
+ "Go http package" => 'Go http library',
+ 'Java' => 'Generic Java http library',
+ 'libwww-perl' => 'Perl client-server library loved by script kids',
+ 'lwp-trivial' => 'Another Perl library loved by script kids',
+ "Python-urllib" => 'Python http library',
+ "PycURL" => 'Python http library',
+ "Test Certificate Info" => 'C http library?',
+ "Wget" => 'wget unix CLI http client',
+ # URL expanders / previewers
+ 'awe.sm' => 'Awe.sm URL expander',
+ "bitlybot" => 'bit.ly bot',
+ "facebookexternalhit" => 'facebook bot',
+ 'LongURL' => 'URL expander service',
+ 'Twitterbot' => 'Twitter URL expander',
+ 'UnwindFetch' => 'Gnip URL expander',
+ # Uptime monitoring
+ 'check_http' => 'Nagios monitor',
+ 'NewRelicPinger' => 'NewRelic monitor',
+ 'Panopta' => 'Monitoring service',
+ "Pingdom" => 'Pingdom monitoring',
+ 'SiteUptime' => 'Site monitoring services',
+ # ???
+ "DigitalPersona Fingerprint Software" => 'HP Fingerprint scanner',
+ "ShowyouBot" => 'Showyou iOS app spider',
+ 'ZyBorg' => 'Zyborg? Hmmm....',
+ }
+ end
+
def disabled?
!enabled
end
@@ -138,8 +142,11 @@ def normalize_alternatives(alternatives)
end
end
+ def robot_regex
+ @robot_regex ||= /\b(?:#{escaped_bots.join('|')})\b|\A\W*\z/i
+ end
+
def initialize
- @robot_regex = /\b(?:#{escaped_bots.join('|')})\b|\A\W*\z/i
@ignore_ip_addresses = []
@db_failover = false
@db_failover_on_db_error = proc{|error|} # e.g. use Rails logger here
@@ -160,7 +167,7 @@ def value_for(hash, key)
end
def escaped_bots
- BOTS.map { |key, _| Regexp.escape(key) }
+ bots.map { |key, _| Regexp.escape(key) }
end
end
end
@@ -43,6 +43,11 @@
@config.robot_regex.should_not =~ "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; F-6.0SP2-20041109; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET CLR 1.1.4322; InfoPath.3)"
end
+ it "should allow adding a bot to the bot list" do
+ @config.bots["newbot"] = "An amazing test bot"
+ @config.robot_regex.should =~ "newbot"
+ end
+
it "should use the session adapter for persistence by default" do
@config.persistence.should eq(Split::Persistence::SessionAdapter)
end

0 comments on commit 563d9e4

Please sign in to comment.