Permalink
Browse files

adding robots model and also clickstream archive

  • Loading branch information...
1 parent fbec5f3 commit 2d10a2d54714df0d8b0c6443351e3d37e83cb8c3 erik committed Jan 18, 2012
@@ -3,8 +3,6 @@ class ApplicationController < ActionController::Base
filter_parameter_logging :password, :password_confirmation, :credit_card_number
helper :all # include all helpers, all the time
- BOT_FILTER = /(?:Googlebot|Slurp|Apache|msnbot|wget|libwww|nutch|ia_archiver|heretrix|cuil|google|yandex)/i
-
include AuthenticatedSystem
include SslRequirement
@@ -0,0 +1,2 @@
+class ClickstreamArchive < ActiveRecord::Base # Model for rows the cleanup script moves out of clickstreams; presumably backed by the clickstream_archives table (Rails naming convention).
+end
View
@@ -0,0 +1,2 @@
+class Robot < ActiveRecord::Base # One crawler user-agent fragment per row (name column); the names are joined to build BOT_FILTER.
+end
@@ -40,4 +40,6 @@
TEXT[:headline] = "Write a short, enticing and descriptive headline"
TEXT[:title] = "Write a short, enticing and descriptive title"
-SHOW_PREPEND_FOR_STORY_UPDATES = false
+SHOW_PREPEND_FOR_STORY_UPDATES = false
+
+# Case-insensitive pattern matching any known crawler name from the robots table.
+# * Names are escaped via Regexp.union, so entries such as "envolk[its]spider",
+#   "boitho.com-dc" or "yahoo!" match literally instead of being read as regex syntax.
+# * Blank names are dropped: an empty alternative would match every user agent.
+# * With no usable names (table missing because migrations have not run yet, or
+#   table empty) Regexp.union returns /(?!)/, which matches nothing, rather than
+#   the match-everything /(?:)/ that an empty join used to produce.
+robot_names = Robot.table_exists? ? Robot.all.map(&:name).reject(&:blank?) : []
+BOT_FILTER = Regexp.new(Regexp.union(*robot_names).source, Regexp::IGNORECASE)
@@ -0,0 +1,23 @@
+class CreateClickstreamArchives < ActiveRecord::Migration
+ def self.up
+ create_table :clickstream_archives do |t|
+ t.string "ip"
+ t.string "user_agent"
+ t.string "url"
+ t.string "referer"
+ t.string "session_id"
+ t.integer "user_id"
+ t.string "clickstreamable_type"
+ t.integer "clickstreamable_id"
+ t.datetime "created_at"
+ t.datetime "updated_at"
+ t.integer "status", :default => 0
+ end
+
+ add_index "clickstreams", ["session_id", "created_at"], :name => "clickstream_session_id_created_at_index"
+ end
+
+ def self.down
+ drop_table :clickstream_archives
+ end
+end
@@ -0,0 +1,159 @@
+class CreateRobots < ActiveRecord::Migration
+ def self.up
+ create_table :robots do |t|
+ t.string :name
+ t.timestamps
+ end
+
+ Robot.create({:name => 'abachobot'})
+ Robot.create({:name => 'accoona-ai-agent'})
+ Robot.create({:name => 'anyapexbot'})
+ Robot.create({:name => 'arachmo'})
+ Robot.create({:name => 'b-l-i-t-z-b-o-t'})
+ Robot.create({:name => 'baiduspider'})
+ Robot.create({:name => 'becomebot'})
+ Robot.create({:name => 'bimbot'})
+ Robot.create({:name => 'blitzbot'})
+ Robot.create({:name => 'boitho.com-dc'})
+ Robot.create({:name => 'boitho.com-robot'})
+ Robot.create({:name => 'bot'})
+ Robot.create({:name => 'btbot'})
+ Robot.create({:name => 'cerberian'})
+ Robot.create({:name => 'drtrs'})
+ Robot.create({:name => 'converacrawler'})
+ Robot.create({:name => 'cosmos'})
+ Robot.create({:name => 'dataparksearch'})
+ Robot.create({:name => 'diamondbot'})
+ Robot.create({:name => 'discobot'})
+ Robot.create({:name => 'emeraldshield.com'})
+ Robot.create({:name => 'webbot'})
+ Robot.create({:name => 'envolk[its]spider'})
+ Robot.create({:name => 'esperanzabot'})
+ Robot.create({:name => 'exabot'})
+ Robot.create({:name => 'fast'})
+ Robot.create({:name => 'enterprise'})
+ Robot.create({:name => 'crawler'})
+ Robot.create({:name => 'fast-webcrawler'})
+ Robot.create({:name => 'fdse'})
+ Robot.create({:name => 'robot'})
+ Robot.create({:name => 'findlinks'})
+ Robot.create({:name => 'furlbot'})
+ Robot.create({:name => 'fyberspider'})
+ Robot.create({:name => 'gcrawler'})
+ Robot.create({:name => 'gaisbot'})
+ Robot.create({:name => 'geniebot'})
+ Robot.create({:name => 'gigabot'})
+ Robot.create({:name => 'girafabot'})
+ Robot.create({:name => 'googlebot'})
+ Robot.create({:name => 'googlebot-image'})
+ Robot.create({:name => 'hl_ftien_spider'})
+ Robot.create({:name => 'htdig'})
+ Robot.create({:name => 'ia_archiver'})
+ Robot.create({:name => 'ichiro'})
+ Robot.create({:name => 'irlbot'})
+ Robot.create({:name => 'issuecrawler'})
+ Robot.create({:name => 'jyxobot'})
+ Robot.create({:name => 'lapozzbot'})
+ Robot.create({:name => 'larbin'})
+ Robot.create({:name => 'linkwalker'})
+ Robot.create({:name => 'lmspider'})
+ Robot.create({:name => 'lwp-trivial'})
+ Robot.create({:name => 'mabontland'})
+ Robot.create({:name => 'mediapartners-google'})
+ Robot.create({:name => 'mjbot'})
+ Robot.create({:name => 'mnogosearch'})
+ Robot.create({:name => 'mogimogi'})
+ Robot.create({:name => 'mojeekbot'})
+ Robot.create({:name => 'morning'})
+ Robot.create({:name => 'paper'})
+ Robot.create({:name => 'msnbot'})
+ Robot.create({:name => 'msrbot'})
+ Robot.create({:name => 'mvaclient'})
+ Robot.create({:name => 'netresearchserver'})
+ Robot.create({:name => 'ng-search'})
+ Robot.create({:name => 'nicebot'})
+ Robot.create({:name => 'noxtrumbot'})
+ Robot.create({:name => 'nusearch'})
+ Robot.create({:name => 'spider'})
+ Robot.create({:name => 'nutchcvs'})
+ Robot.create({:name => 'obot'})
+ Robot.create({:name => 'oegp'})
+ Robot.create({:name => 'omniexplorer_bot'})
+ Robot.create({:name => 'oozbot'})
+ Robot.create({:name => 'orbiter'})
+ Robot.create({:name => 'pagebiteshyperbot'})
+ Robot.create({:name => 'polybot'})
+ Robot.create({:name => 'pompos'})
+ Robot.create({:name => 'psbot'})
+ Robot.create({:name => 'pycurl'})
+ Robot.create({:name => 'rampybot'})
+ Robot.create({:name => 'rufusbot'})
+ Robot.create({:name => 'sandcrawler'})
+ Robot.create({:name => 'sbider'})
+ Robot.create({:name => 'scoutjet'})
+ Robot.create({:name => 'scrubby'})
+ Robot.create({:name => 'searchsight'})
+ Robot.create({:name => 'seekbot'})
+ Robot.create({:name => 'semanticdiscovery'})
+ Robot.create({:name => 'sensis'})
+ Robot.create({:name => 'web'})
+ Robot.create({:name => 'crawler'})
+ Robot.create({:name => 'seochat::bot'})
+ Robot.create({:name => 'shim-crawler'})
+ Robot.create({:name => 'shopwiki'})
+ Robot.create({:name => 'shoula'})
+ Robot.create({:name => 'robot'})
+ Robot.create({:name => 'silk'})
+ Robot.create({:name => 'snappy'})
+ Robot.create({:name => 'sogou'})
+ Robot.create({:name => 'spider'})
+ Robot.create({:name => 'speedy'})
+ Robot.create({:name => 'spider'})
+ Robot.create({:name => 'sqworm'})
+ Robot.create({:name => 'stackrambler'})
+ Robot.create({:name => 'surveybot'})
+ Robot.create({:name => 'synoobot'})
+ Robot.create({:name => 'teoma'})
+ Robot.create({:name => 'terrawizbot'})
+ Robot.create({:name => 'thesubot'})
+ Robot.create({:name => 'thumbnail.cz'})
+ Robot.create({:name => 'robot'})
+ Robot.create({:name => 'tineye'})
+ Robot.create({:name => 'turnitinbot'})
+ Robot.create({:name => 'updated'})
+ Robot.create({:name => 'vagabondo'})
+ Robot.create({:name => 'voilabot'})
+ Robot.create({:name => 'vortex'})
+ Robot.create({:name => 'voyager'})
+ Robot.create({:name => 'vyu'})
+ Robot.create({:name => 'webcollage'})
+ Robot.create({:name => 'websquash.com'})
+ Robot.create({:name => 'wf'})
+ Robot.create({:name => 'wofindeich'})
+ Robot.create({:name => 'robot'})
+ Robot.create({:name => 'xaldon_webspider'})
+ Robot.create({:name => 'yacy'})
+ Robot.create({:name => 'yahoo!'})
+ Robot.create({:name => 'slurp'})
+ Robot.create({:name => 'yahoo!'})
+ Robot.create({:name => 'slurp'})
+ Robot.create({:name => 'china'})
+ Robot.create({:name => 'yahooseeker'})
+ Robot.create({:name => 'yahooseeker-testing'})
+ Robot.create({:name => 'yooglifetchagent'})
+ Robot.create({:name => 'zao'})
+ Robot.create({:name => 'zealbot'})
+ Robot.create({:name => 'zspider'})
+ Robot.create({:name => 'zyborg'})
+ Robot.create({:name => 'Apache'})
+ Robot.create({:name => 'wget'})
+ Robot.create({:name => 'libwww'})
+ Robot.create({:name => 'nutch'})
+ Robot.create({:name => 'cuil'})
+ Robot.create({:name => 'libcurl'})
+ end
+
+ def self.down
+ drop_table :robots
+ end
+end
View
@@ -23,8 +23,23 @@ clickstreams.each do |c|
end
end
-log.info " Updating the table as processed below id #{last_clickstream.id}..."
+# clean up processed clickstreams ...
+if last_clickstream
+ log.info " Cleaning the clickstream table from processed entries below id #{last_clickstream.id}..."
-Clickstream.update_all('status=1', "clickstreamable_id is not null and clickstreamable_type is not null and status=0 and id<=#{last_clickstream.id}")
+ Clickstream.update_all('status=1', "clickstreamable_id is not null and clickstreamable_type is not null and status=0 and id<=#{last_clickstream.id}")
-log.info Time.now.to_s + " Done processing clickstream and going to sleep..."
+ sql = "INSERT INTO clickstream_archives (SELECT * from clickstreams where status = 1 and id<=#{last_clickstream.id})"
+ ActiveRecord::Base.connection.execute(sql)
+
+ Clickstream.delete_all("status=1 and id<=#{last_clickstream.id}")
+end
+
+# clean up the rest of the table...
+log.info " Cleaning table from invalid data..."
+sql = "INSERT INTO clickstream_archives (SELECT * from clickstreams where clickstreamable_id is null and clickstreamable_type is null)"
+ActiveRecord::Base.connection.execute(sql)
+
+Clickstream.delete_all("clickstreamable_id is null and clickstreamable_type is null")
+
+log.info Time.now.to_s + " Done and going to sleep..."
@@ -0,0 +1,11 @@
+# Read about fixtures at http://ar.rubyonrails.org/classes/Fixtures.html
+
+# This model initially had no columns defined. If you add columns to the
+# model remove the '{}' from the fixture names and add the columns immediately
+# below each fixture, per the syntax in the comments below
+#
+one: {}
+# column: value
+#
+two: {}
+# column: value
View
@@ -0,0 +1,11 @@
+# Read about fixtures at http://ar.rubyonrails.org/classes/Fixtures.html
+
+# This model initially had no columns defined. If you add columns to the
+# model remove the '{}' from the fixture names and add the columns immediately
+# below each fixture, per the syntax in the comments below
+#
+one: {}
+# column: value
+#
+two: {}
+# column: value
@@ -0,0 +1,8 @@
+require 'test_helper'
+
+class ClickstreamArchiveTest < ActiveSupport::TestCase # Unit-test case for the ClickstreamArchive model.
+ # Generated placeholder: asserts nothing about ClickstreamArchive. Replace with real tests.
+ test "the truth" do
+ assert true
+ end
+end
View
@@ -0,0 +1,8 @@
+require 'test_helper'
+
+class RobotTest < ActiveSupport::TestCase # Unit-test case for the Robot model.
+ # Generated placeholder: asserts nothing about Robot. Replace with real tests.
+ test "the truth" do
+ assert true
+ end
+end

0 comments on commit 2d10a2d

Please sign in to comment.