diff --git a/.gitignore b/.gitignore
index 2cc4c0b..e422932 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,5 @@
 public/stylesheets/all*.css
 config/mail.yml
 db/top-1m.csv*
 tmp/*
+*.lock
diff --git a/app/controllers/scrapings_controller.rb b/app/controllers/scrapings_controller.rb
index 34217d6..8c8d7bb 100644
--- a/app/controllers/scrapings_controller.rb
+++ b/app/controllers/scrapings_controller.rb
@@ -4,7 +4,9 @@ def new
   end

   def create
-    if params[:cookie].blank?
+    if File.exist?(File.join(RAILS_ROOT, 'update.lock'))
+      @error_msg = "Our sites database is currently being updated. Please wait a few minutes and try again."
+    elsif params[:cookie].blank?
       @error_msg = "Please enter a unique code."
     else
       @current_user = User.find_by_cookie(params[:cookie])
@@ -77,7 +79,7 @@ def results
       end
     else
       render :update do |page|
-        page['status'].replace_html "Processing... #{@scraping.found_visitations_count} hits found. #{@scraping.visitations_count} processed so far of #{@scraping.served_urls} scraped. \
+        page['status'].replace_html "Processing ##{@scraping.id}... #{@scraping.found_visitations_count} hits found. #{@scraping.visitations_count} processed so far of #{@scraping.served_urls} scraped. \
           #{WORKLING_CLIENT.stats.first[1]['curr_items']} jobs in queue."
       end
     end
diff --git a/app/models/probability_vector.rb b/app/models/probability_vector.rb
new file mode 100644
index 0000000..8aa4415
--- /dev/null
+++ b/app/models/probability_vector.rb
@@ -0,0 +1,79 @@
+class ProbabilityVector < ActiveRecord::Base
+  belongs_to :user
+  belongs_to :site
+  # hits, tests, avg
+
+  # on create:
+  #   0. abort if negative and no other users have hit the site
+  #   1a. if visited_users_count > 0: update aggregate a = (a * [n-1]/n) + (value/n), users_count += 1; visited_users_count += 1; add pv
+  #   1b. if not: create PVs for all other users' negative tests; update aggregate a = 1/n, users_count = n; visited_users_count = 1; add pv
+
+  # on update:
+  #   1. increment tests; increment hits if true
+  #   2. update aggregate a = a + (new avg - old avg)/n
+
+  # site_results should be a hash of {siteid => visited, ...}
+  def self.report user_id, site_results
+    self.transaction do
+      prior_pvs = self.find(:all, :lock => true, :conditions => ["site_id in (?) and user_id = ?", site_results.keys, user_id]).inject({}){|m,v| m[v.site_id] = v; m }
+      new_hits = [] # site_id
+      new_pvs = []  # [site_id, user_id, tests, hits, avg]
+      sites = Site.find(:all, :lock => true, :conditions => ["id in (?)", site_results.keys], :select => "id, visited_users_count, users_count, avg_visited").inject([]) do |m,s|
+        visited_now = site_results[s.id]
+        new_avg_visited, new_users_count, new_visited_users_count = -s.avg_visited, s.users_count, s.visited_users_count
+        group = if prior_pvs[s.id] # already have a PV, just update it
+          pv = prior_pvs[s.id]
+          new_pv_hits = (visited_now ? pv.hits + 1 : pv.hits)
+          new_avg = (new_pv_hits.to_f / (pv.tests + 1))
+          new_avg_visited += ((new_avg - pv.avg) / new_users_count) # change the aggregate by the weighted delta of this user's PV
+          new_pvs << [s.id, user_id, pv.tests + 1, new_pv_hits, new_avg]
+          new_visited_users_count += 1 if visited_now and pv.hits == 0 # up the aggregated visited_users_count if this is our first *hit*
+          :old
+        else # new for this user
+          if new_visited_users_count > 0 # other users have hit this, no need to lazy-add their stuff
+            new_avg_visited = (new_avg_visited * (new_users_count - 1) / new_users_count) + ((visited_now ? 1 : 0).to_f / new_users_count)
+            new_visited_users_count += 1 if visited_now
+            new_users_count += 1
+            new_pvs << [s.id, user_id, 1, (visited_now ? 1 : 0), 1.0]
+            :new_existing
+          else
+            if visited_now # lazy-add the other users' PVs
+              new_hits << s.id
+              new_pvs << [s.id, user_id, 1, 1, 1.0] # will also add ones for the others' down below
+              :new_hit
+            else # don't create new PVs unless it's a hit or someone else had a hit
+              :noop
+            end
+          end
+        end
+        m << {:id => s.id, :group => group, :avg_visited => -new_avg_visited, :users_count => new_users_count, :visited_users_count => new_visited_users_count}
+        m
+      end
+
+      if !new_hits.empty?
+        # Get all the users who "ought" to have PVs for these sites
+        nohit_counts = Hash.new(0)
+        Visitation.find(:all, :conditions => ['site_id IN (?)', new_hits], :joins => :scraping, :group => "site_id",
+          :select => "group_concat(user_id) as user_ids, site_id").map{|v|
+            users = v.user_ids.split(',').map(&:to_i).inject(Hash.new(0)){|mm,vv| mm[vv] += 1; mm } # hash {user_id => count, ...}
+            users.each{|user, count| new_pvs << [v.site_id, user, count, 0, 0.0]
+              nohit_counts[v.site_id] = users.count } }
+        sites.each do |s|
+          if s[:group] == :new_hit
+            n = nohit_counts[s[:id]] + 1
+            s[:users_count] = n
+            s[:visited_users_count] = 1
+            s[:avg_visited] = -1.0 / n
+          end
+        end
+      end
+
+      # Update everything, releasing their locks
+      Site.import [:id, :users_count, :visited_users_count, :avg_visited], sites.map{|v| [v[:id], v[:users_count], v[:visited_users_count], v[:avg_visited]] },
+        :validate => false, :on_duplicate_key_update => [:users_count, :visited_users_count, :avg_visited]
+      self.import [:site_id, :user_id, :tests, :hits, :avg], new_pvs, :validate => false, :on_duplicate_key_update => [:tests, :hits, :avg] if !new_pvs.empty?
+    end
+  end


+end
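The bookkeeping comments at the top of ProbabilityVector describe two incremental updates rather than full recomputations. As a reading aid only (not part of the diff), the same arithmetic in straight-line Ruby with hypothetical local names (pv_hits, pv_tests, old_avg, site_avg, n); ProbabilityVector.report does the equivalent work in bulk against locked rows and mass imports:

  # An existing vector for this user/site receives a new test result:
  pv_tests += 1
  pv_hits  += 1 if visited_now
  new_avg   = pv_hits.to_f / pv_tests

  # Fold only the delta into the site-wide aggregate (n = the site's users_count):
  site_avg += (new_avg - old_avg) / n

  # A first vector for this user on an already-hit site reweights the aggregate instead:
  # site_avg = site_avg * (n - 1) / n + (visited_now ? 1.0 : 0.0) / n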
diff --git a/app/models/site.rb b/app/models/site.rb
index cb53c7b..67e788e 100644
--- a/app/models/site.rb
+++ b/app/models/site.rb
@@ -4,5 +4,56 @@ class Site < ActiveRecord::Base
   has_many :found_visitations, :class_name => 'Visitation', :conditions => 'visited = 1'
   has_many :found_scrapings, :class_name => 'Scraping', :through => :found_visitations, :source => :scraping

+  # NOTE: avg_visited is stored NEGATIVE to the real value, because mysql cannot use an index if ORDER BY has mixed ascendency. It's a kludge.
+
   validates_presence_of :url, :alexa_rank, :users_count
+
+  def self.avg_probability_vector site_ids = nil
+    if site_ids
+      self.find(:all, :conditions => ['id IN (?)', site_ids], :select => 'id, avg_visited').inject({}){|m,x| m[x.id] = -x.avg_visited; m}
+    else
+      self.find(:all, :select => 'id, avg_visited').inject({}){|m,x| m[x.id] = -x.avg_visited; m}
+    end
+  end
+
+  def self.avg_url_probabilities site_ids = nil
+    if site_ids
+      self.find(:all, :conditions => ['id IN (?)', site_ids], :select => 'url, avg_visited').inject({}){|m,x| m[x.url] = -x.avg_visited; m}
+    else
+      self.find(:all, :select => 'url, avg_visited').inject({}){|m,x| m[x.url] = -x.avg_visited; m}
+    end
+  end
+
+  def self.update_user_counts sites = nil
+    sites = sites.map(&:to_i) if sites
+    return if sites.empty?
+    ActiveRecord::Base.connection.execute "UPDATE sites INNER JOIN ( \
+        SELECT v.site_id, count(DISTINCT user_id) AS count_distinct_user_id FROM `scrapings` INNER JOIN ( \
+          SELECT site_id, scraping_id, visited FROM `visitations` WHERE visited = 1 #{ 'AND site_id IN (' + sites.join(',') + ')' if sites }) \
+        as v ON scrapings.id = v.scraping_id group by site_id) \
+      as s on s.site_id = sites.id \
+      SET users_count = count_distinct_user_id;"
+  end
+
+  def self.version
+    unless v = Rails.cache.increment('sites_version', 0)
+      Rails.cache.write 'sites_version', 0
+      self.version! # increment seems to mess with the cache format
+      v = 1
+    end
+    v
+  end
+
+  def self.version!
+    Rails.cache.increment 'sites_version', 1
+  end
+
+  def self.get offset, batch_size = 500
+    key = "sites_#{offset}_#{batch_size}_#{self.version}"
+    unless r = Rails.cache.read(key)
+      r = Site.find(:all, :limit => batch_size, :offset => offset, :order => 'avg_visited, alexa_rank', :select => 'id, url, alexa_rank, avg_visited')
+      Rails.cache.write key, r
+    end
+    r
+  end
 end
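The NOTE above explains the negation trick the rest of this diff leans on: MySQL will only walk a composite index when every ORDER BY key runs in the same direction, so "most-visited first, then best Alexa rank" is expressed by storing avg_visited negated and sorting both keys ascending. A rough sketch of the query Site.get builds (column and index names match the schema and migration later in this diff); callers just have to flip the sign back, as avg_probability_vector does:

  # ORDER BY avg_visited DESC, alexa_rank ASC could not use one index;
  # with the stored value negated, both keys ascend and the
  # (avg_visited, alexa_rank, id, url) index can serve the whole query.
  Site.find(:all, :order => 'avg_visited, alexa_rank',
            :select => 'id, url, alexa_rank, avg_visited',
            :limit => 500, :offset => offset)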
diff --git a/app/models/user.rb b/app/models/user.rb
index 24285c4..995d3bc 100644
--- a/app/models/user.rb
+++ b/app/models/user.rb
@@ -4,6 +4,7 @@
   has_many :visitations, :through => :successful_scrapings
   has_many :found_visitations, :through => :successful_scrapings
   has_many :unfound_visitations, :through => :successful_scrapings
+  has_many :probability_vectors

   validates_presence_of :cookie
   validates_uniqueness_of :cookie
@@ -15,10 +16,12 @@ def wipe_blanks
     self.email = nil if email.blank?
   end

-  def probability_vector
-    found_site_ids = found_visitations.find(:all, :select => 'site_id').map(&:site_id)
-    visitations.find(:all, :group => 'site_id', :select => 'site_id, AVG(visited) as prob',
-      :conditions => ["site_id IN (?)", found_site_ids]).inject({}){|m, x| m[x.site_id] = x.prob.to_f; m }
+  def probability_vector site_ids = nil
+    if site_ids
+      probability_vectors.find(:all, :conditions => ['site_id IN (?)', site_ids]).inject({}){|m,x| m[x.site_id] = x.avg; m}
+    else
+      probability_vectors.inject({}){|m,x| m[x.site_id] = x.avg; m}
+    end
   end

   def url_probabilities prob = nil
@@ -26,17 +29,4 @@ def url_probabilities prob = nil
     Site.find(prob.keys).inject({}){|m,x| m[x.url] = prob[x.id]; m }
   end

-  # FIXME: Make this make each user weight 1, i.e. each visitation weighted (1 / # scrapings for this user-site)
-  def self.avg_probability_vector site_ids = nil
-    site_ids ||= Visitation.find(:all, :select => 'DISTINCT site_id', :conditions => 'visited = 1').map(&:site_id)
-    successful_scraping_ids = Scraping.find(:all, :select => 'id', :conditions => 'found_visitations_count > 0').map(&:id)
-    Visitation.find(:all, :group => 'site_id', :select => 'site_id, AVG(visited) as prob',
-      :conditions => ["site_id IN (?) AND scraping_id IN (?)", site_ids, successful_scraping_ids]).inject({}){|m, x|
-        m[x.site_id] = x.prob.to_f; m }
-  end
-
-  def self.avg_url_probabilities site_ids = nil
-    prob = avg_probability_vector(site_ids)
-    Site.find(prob.keys).inject({}){|m,x| m[x.url] = prob[x.id]; m }
-  end
 end
diff --git a/app/views/scrapings/error.js.erb b/app/views/scrapings/error.js.erb
index d664507..f80e481 100644
--- a/app/views/scrapings/error.js.erb
+++ b/app/views/scrapings/error.js.erb
@@ -1 +1 @@
-<%= update_page {|page| page['status'].replace_html @error_msg; page['cookie_form'].show; } %>
\ No newline at end of file
+<%= update_page {|page| page['status_0'].replace_html @error_msg; page['cookie_form'].show; } %>
\ No newline at end of file
diff --git a/app/views/scrapings/new.html.erb b/app/views/scrapings/new.html.erb
index 56104df..32c2c3f 100644
--- a/app/views/scrapings/new.html.erb
+++ b/app/views/scrapings/new.html.erb
@@ -60,9 +60,10 @@

 Are you a geek? Please read the <%= link_to 'geektastic about page', about_url %> for technical details.

 CSS Fingerprint is a research project inspired by the EFF's Panopticlick.

-
-Its intent is to see how well the CSS history hack can be used with "fuzzy" AI techniques to uniquely fingerprint users - despite changes in their browsing history, even on new computers or new browsers, and to tell how socially/culturally similar any two users are.
+
+Its intent is to see how well the CSS history hack can be used with "fuzzy" artificial intelligence
+techniques to uniquely fingerprint users despite changes in their browsing history, even on new computers or new browsers, and to tell how socially/culturally
+similar any two users are.

 At the moment, the AI component is not yet active. In order to write it, I need data.

diff --git a/app/workers/scraping_worker.rb b/app/workers/scraping_worker.rb
index a9defa4..db377c6 100644
--- a/app/workers/scraping_worker.rb
+++ b/app/workers/scraping_worker.rb
@@ -5,24 +5,25 @@ class ScrapingWorker < Workling::Base
   BG_LOGGER = Logger.new(logfile)
   BG_LOGGER.debug "#{Time.now.to_s}: Loading ScrapingWorker. Return store: #{Workling.return.inspect}"

-  def process_results(options)
-    Workling.return.set options[:uid], "Starting results calculation..."
-    scraping = Scraping.find(options[:scraping_id])
-    BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Starting results for scraping #{scraping.id}"
-    sites = scraping.found_sites.find(:all, :select => :url).map(&:url)
-    Workling.return.set options[:uid], "Calculating results... 1/5"
-    unfound_sites = scraping.unfound_sites.find(:all, :select => :url).map(&:url)
-    Workling.return.set options[:uid], "Calculating results... 2/5"
-    pv = scraping.user.probability_vector
-    Workling.return.set options[:uid], "Calculating results... 3/5"
-    probabilities = scraping.user.url_probabilities(pv)
-    Workling.return.set options[:uid], "Calculating results... 4/5"
-    avg_up = User.avg_url_probabilities pv.keys
-    Workling.return.set options[:uid], "Calculating results... 5/5"
-    BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Returning results for scraping #{scraping.id}"
+  def version_sites_once_idle!(options)
+    if Rails.cache.read 'version_sites_once_idle_lock'
+      BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: version_sites_once_idle already in queue"
+      return
+    else
+      Rails.cache.write 'version_sites_once_idle_lock', true
+    end

-    Workling.return.set options[:uid], :sites => sites, :unfound_sites => unfound_sites, :probabilities => probabilities, :avg_up => avg_up
-    BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Processed results for scraping #{scraping.id}"
+    while Scraping.last.created_at > 2.minutes.ago
+      BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Not idle..."
+      sleep 10
+    end
+
+    Site.version!
+    BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Versioned!"
+    300.times{|i| Site.get 500 * i }
+    BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Warmed up!"
+
+    Rails.cache.delete 'version_sites_once_idle_lock'
   rescue => e
     BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: ERROR #{e}"
   end
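version_sites_once_idle! treats a memcached key as a cheap mutex: read it to see whether a versioning job is already queued, write it before the slow work, delete it afterwards; in between it polls until no scraping has arrived for two minutes, bumps Site.version, and pre-warms the batch cache. A condensed sketch of that lock pattern, with a hypothetical helper name (the read-then-write pair is not atomic, which is tolerable here because a duplicate warm-up is merely wasted work):

  def with_cache_lock(key)
    return if Rails.cache.read(key)   # a job already holds the lock
    Rails.cache.write(key, true)
    begin
      yield
    ensure
      Rails.cache.delete(key)         # release even if the block raises
    end
  end

  # with_cache_lock('version_sites_once_idle_lock') { Site.version! }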
diff --git a/app/workers/visitation_worker.rb b/app/workers/visitation_worker.rb
index 44d0c99..eab15db 100644
--- a/app/workers/visitation_worker.rb
+++ b/app/workers/visitation_worker.rb
@@ -11,26 +11,27 @@ def process_results(options)
     scraping_id, results = options[:scraping_id], options[:results]
     results = JSON.parse(results)
     sites = Site.find(:all, :conditions => ['url IN (?)', results.keys.map{|x| URI.decode x}], :select => 'id, url').map{|s| [s.id, s.url]}
+    site_results = results.inject({}){|m,v| m[sites.rassoc(URI.decode v[0])[0]] = v[1]; m}
+    found_site_ids = site_results.reject{|k,v| !v}.keys
     Visitation.import [:scraping_id, :site_id, :visited], results.map{|key,value| [scraping_id, sites.rassoc(URI.decode key)[0], value]}, :validate => false # save a bit of RAM

     # because we're using mass import, this isn't getting updated automagically
-    found_count = results.map{|k,v| v}.count(true)
+    found_count = found_site_ids.count
     Scraping.update_counters scraping_id, :visitations_count => results.size, :found_visitations_count => found_count
     scraping = Scraping.find(scraping_id) # AFTER the update

     # (almost) all done. Sometimes batches seem to get lost over the wire.
     # FIXME: why are they getting lost? Why are some threads not finishing?
-    if scraping.finished_threads <= THREADS - 1 and scraping.served_urls <= scraping.visitations_count + scraping.batch_size * THREADS
+    if scraping.created_at < 60.seconds.ago and scraping.served_urls <= scraping.visitations_count + scraping.batch_size * THREADS # and finished_threads <= THREADS - 1
+      ScrapingWorker.asynch_version_sites_once_idle!
       Workling.return.set options[:uid], "done"
       scraping.update_attribute :job_id, options[:uid]
       # ScrapingWorker.asynch_process_results(:scraping_id => scraping_id)
     end
+    ProbabilityVector.report scraping.user_id, site_results
+    # Site.update_user_counts found_site_ids if !found_site_ids.empty?

-    # BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Updating scrapings count..."
-    # there should be a faster way of doing this
-    # sites.map{|s| s.update_attribute :users_count, x.found_scrapings.count('DISTINCT user_id')}
-
-    BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Processed scraping #{scraping_id} offset #{sites.first[0]}; found #{found_count} / #{results.size}"
+    BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Processed scraping #{scraping_id} offset #{sites.first[0]}; found #{found_count} / #{results.size}: #{found_site_ids.join(', ')}"
   rescue => e
     BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: ERROR #{e}"
   end
diff --git a/db/migrate/20100303004045_add_probability_vectors.rb b/db/migrate/20100303004045_add_probability_vectors.rb
new file mode 100644
index 0000000..bdc25a9
--- /dev/null
+++ b/db/migrate/20100303004045_add_probability_vectors.rb
@@ -0,0 +1,23 @@
+class AddProbabilityVectors < ActiveRecord::Migration
+  def self.up
+    add_column :sites, :avg_visited, :float, :default => 0
+    add_column :sites, :visited_users_count, :integer, :default => 0
+    create_table :probability_vectors do |t|
+      t.references :user, :site, :default => nil, :null => false
+      t.integer :hits, :tests, :default => 0
+      t.float :avg, :default => 0
+
+      t.timestamps
+    end
+
+    add_index :probability_vectors, [:user_id, :site_id], :unique => true
+    remove_index :sites, [:alexa_rank, :id, :url]
+    add_index :sites, [:avg_visited, :alexa_rank, :id, :url], :name => :by_popularity
+  end
+
+  def self.down
+    remove_column :sites, :avg_visited
+    remove_column :sites, :visited_users_count
+    drop_table :probability_vectors
+  end
+end
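The migration's index swap is what makes the cached batches cheap: [:alexa_rank, :id, :url] is replaced by the wider [:avg_visited, :alexa_rank, :id, :url] (named by_popularity), which contains every column Site.get selects as well as both of its sort keys. Approximately, assuming the default batch size:

  Site.get(1000)   # third batch of 500
  # roughly: SELECT id, url, alexa_rank, avg_visited FROM sites
  #          ORDER BY avg_visited, alexa_rank LIMIT 500 OFFSET 1000
  # every referenced column lives in by_popularity, so the scan
  # never needs to touch the row data itself.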
end desc "Warm up the database" task :warm_db => :environment do - 300.times{|i| Site.find(:all, :order => 'alexa_rank', :limit => 500, :offset => 500 * i, :select => 'alexa_rank, id, url') } # This needs to be kept in sync with VisitationsController#create - true + 300.times{|i| Site.get 500 * i } end end diff --git a/lib/tasks/technorati.rake b/lib/tasks/technorati.rake index 1716763..d9d4ce8 100644 --- a/lib/tasks/technorati.rake +++ b/lib/tasks/technorati.rake @@ -4,6 +4,10 @@ require 'scrubyt' namespace :technorati do desc "Import top 100 Technorati blogs" task :update => :environment do + raise "Lockfile found" if File.exist?(File.join(RAILS_ROOT, 'update.lock')) + f = File.new(File.join(RAILS_ROOT, 'update.lock'), 'w') + f.close + technorati = Scrubyt::Extractor.define do fetch 'http://technorati.com/blogs/top100/' @@ -16,5 +20,8 @@ namespace :technorati do Site.import [:alexa_rank, :url], technorati.to_hash.map{|x| [0, x[:link_url].sub('http://www.', '').sub('http://','').sub(/\/$/, '')]}, :validate => false, :on_duplicate_key_update => [:alexa_rank] + + Site.version! + File.delete(File.join(RAILS_ROOT, 'update.lock')) end end \ No newline at end of file diff --git a/public/index_offline.html b/public/index_offline.html new file mode 100644 index 0000000..503bb71 --- /dev/null +++ b/public/index_offline.html @@ -0,0 +1 @@ +Scraping temporarily disabled pending a database renovation. See about page or results page for now. Back up within a few hours diff --git a/schema.sql b/schema.sql new file mode 100644 index 0000000..5bd8285 --- /dev/null +++ b/schema.sql @@ -0,0 +1,197 @@ +-- MySQL dump 10.11 +-- +-- Host: localhost Database: historyprint_production +-- ------------------------------------------------------ +-- Server version 5.1.30 + +/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; +/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; +/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; +/*!40101 SET NAMES utf8 */; +/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; +/*!40103 SET TIME_ZONE='+00:00' */; +/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; +/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; +/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; +/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; + +-- +-- Table structure for table `browser_tests` +-- + +DROP TABLE IF EXISTS `browser_tests`; +SET @saved_cs_client = @@character_set_client; +SET character_set_client = utf8; +CREATE TABLE `browser_tests` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `bogus` tinyint(1) DEFAULT NULL, + `result` tinyint(1) DEFAULT NULL, + `method` varchar(255) DEFAULT NULL, + `url` varchar(255) DEFAULT NULL, + `os` varchar(255) DEFAULT NULL, + `browser` varchar(255) DEFAULT NULL, + `version` varchar(255) DEFAULT NULL, + `created_at` datetime DEFAULT NULL, + `updated_at` datetime DEFAULT NULL, + `user_agent` varchar(255) DEFAULT NULL, + PRIMARY KEY (`id`) +) ENGINE=InnoDB AUTO_INCREMENT=15748 DEFAULT CHARSET=latin1; +SET character_set_client = @saved_cs_client; + +-- +-- Table structure for table `method_timings` +-- + +DROP TABLE IF EXISTS `method_timings`; +SET @saved_cs_client = @@character_set_client; +SET character_set_client = utf8; +CREATE TABLE `method_timings` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `method` varchar(255) DEFAULT NULL, + `with_variants` tinyint(1) DEFAULT '1', + `batch_size` int(11) DEFAULT NULL, + `timing` int(11) DEFAULT NULL, + `os` 
varchar(255) DEFAULT NULL, + `browser` varchar(255) DEFAULT NULL, + `version` varchar(255) DEFAULT NULL, + `user_agent` varchar(255) DEFAULT NULL, + `created_at` datetime DEFAULT NULL, + `updated_at` datetime DEFAULT NULL, + PRIMARY KEY (`id`) +) ENGINE=InnoDB AUTO_INCREMENT=14415 DEFAULT CHARSET=latin1; +SET character_set_client = @saved_cs_client; + +-- +-- Table structure for table `probability_vectors` +-- + +DROP TABLE IF EXISTS `probability_vectors`; +SET @saved_cs_client = @@character_set_client; +SET character_set_client = utf8; +CREATE TABLE `probability_vectors` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `user_id` int(11) NOT NULL, + `site_id` int(11) NOT NULL, + `hits` int(11) DEFAULT '0', + `tests` int(11) DEFAULT '0', + `avg` float DEFAULT '0', + `created_at` datetime DEFAULT NULL, + `updated_at` datetime DEFAULT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `index_probability_vectors_on_user_id_and_site_id` (`user_id`,`site_id`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; +SET character_set_client = @saved_cs_client; + +-- +-- Table structure for table `schema_migrations` +-- + +DROP TABLE IF EXISTS `schema_migrations`; +SET @saved_cs_client = @@character_set_client; +SET character_set_client = utf8; +CREATE TABLE `schema_migrations` ( + `version` varchar(255) NOT NULL, + UNIQUE KEY `unique_schema_migrations` (`version`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; +SET character_set_client = @saved_cs_client; + +-- +-- Table structure for table `scrapings` +-- + +DROP TABLE IF EXISTS `scrapings`; +SET @saved_cs_client = @@character_set_client; +SET character_set_client = utf8; +CREATE TABLE `scrapings` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `user_id` int(11) DEFAULT NULL, + `batch_size` int(11) DEFAULT NULL, + `finished_threads` int(11) DEFAULT '0', + `served_urls` int(11) DEFAULT '0', + `visitations_count` int(11) NOT NULL DEFAULT '0', + `found_visitations_count` int(11) NOT NULL DEFAULT '0', + `job_id` varchar(255) DEFAULT NULL, + `user_agent` varchar(255) DEFAULT NULL, + `os` varchar(255) DEFAULT NULL, + `browser` varchar(255) DEFAULT NULL, + `version` varchar(255) DEFAULT NULL, + `created_at` datetime DEFAULT NULL, + PRIMARY KEY (`id`), + KEY `index_scrapings_on_user_id` (`user_id`), + KEY `index_scrapings_on_visitations_count` (`visitations_count`) +) ENGINE=InnoDB AUTO_INCREMENT=584 DEFAULT CHARSET=latin1; +SET character_set_client = @saved_cs_client; + +-- +-- Table structure for table `sites` +-- + +DROP TABLE IF EXISTS `sites`; +SET @saved_cs_client = @@character_set_client; +SET character_set_client = utf8; +CREATE TABLE `sites` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `url` varchar(255) NOT NULL, + `alexa_rank` int(11) DEFAULT NULL, + `users_count` int(11) DEFAULT '0', + `avg_visited` float DEFAULT '0', + `created_at` datetime DEFAULT NULL, + `updated_at` datetime DEFAULT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `index_sites_on_url` (`url`), + KEY `index_sites_on_alexa_rank` (`alexa_rank`), + KEY `index_sites_on_users_count` (`users_count`), + KEY `alexa_rank_id_url` (`alexa_rank`,`id`,`url`) +) ENGINE=InnoDB AUTO_INCREMENT=11583520 DEFAULT CHARSET=latin1; +SET character_set_client = @saved_cs_client; + +-- +-- Table structure for table `users` +-- + +DROP TABLE IF EXISTS `users`; +SET @saved_cs_client = @@character_set_client; +SET character_set_client = utf8; +CREATE TABLE `users` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `cookie` varchar(255) NOT NULL, + `scrapings_count` int(11) NOT NULL DEFAULT '0', + `name` varchar(255) DEFAULT NULL, + `email` varchar(255) 
DEFAULT NULL, + `release_name` tinyint(1) DEFAULT '0', + `job_id` varchar(255) DEFAULT NULL, + `created_at` datetime DEFAULT NULL, + `updated_at` datetime DEFAULT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `index_users_on_cookie` (`cookie`) +) ENGINE=InnoDB AUTO_INCREMENT=162 DEFAULT CHARSET=latin1; +SET character_set_client = @saved_cs_client; + +-- +-- Table structure for table `visitations` +-- + +DROP TABLE IF EXISTS `visitations`; +SET @saved_cs_client = @@character_set_client; +SET character_set_client = utf8; +CREATE TABLE `visitations` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `scraping_id` int(11) NOT NULL, + `site_id` int(11) NOT NULL, + `visited` tinyint(1) NOT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `index_visitations_on_scraping_id_and_site_id` (`scraping_id`,`site_id`), + KEY `index_visitations_on_site_id` (`site_id`) +) ENGINE=InnoDB AUTO_INCREMENT=19539903 DEFAULT CHARSET=latin1; +SET character_set_client = @saved_cs_client; +/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; + +/*!40101 SET SQL_MODE=@OLD_SQL_MODE */; +/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; +/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; +/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; +/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; +/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; +/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; + +-- Dump completed on 2010-03-04 1:31:21 diff --git a/test/fixtures/probability_vectors.yml b/test/fixtures/probability_vectors.yml new file mode 100644 index 0000000..5bf0293 --- /dev/null +++ b/test/fixtures/probability_vectors.yml @@ -0,0 +1,7 @@ +# Read about fixtures at http://ar.rubyonrails.org/classes/Fixtures.html + +# one: +# column: value +# +# two: +# column: value diff --git a/test/unit/probability_vector_test.rb b/test/unit/probability_vector_test.rb new file mode 100644 index 0000000..b972230 --- /dev/null +++ b/test/unit/probability_vector_test.rb @@ -0,0 +1,8 @@ +require 'test_helper' + +class ProbabilityVectorTest < ActiveSupport::TestCase + # Replace this with your real tests. + test "the truth" do + assert true + end +end
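One operational note on the update.lock protocol these rake tasks share with ScrapingsController#create: the lock is created before the download and only deleted on the happy path, so a failed import leaves scraping disabled until someone removes the file by hand. A hedged sketch of how the tasks could guard against that (not part of the diff; FileUtils.touch is standard library):

  lockfile = File.join(RAILS_ROOT, 'update.lock')
  raise "Lockfile found" if File.exist?(lockfile)
  FileUtils.touch(lockfile)
  begin
    # ...download / scrape, Site.import, Site.version!...
  ensure
    File.delete(lockfile)   # release the lock even if the import fails
  end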