add auto selftest w/ fallthrough to scraper js; move scraping info into scrapings out of session/params; overhaul threads sync method to be properly atomic; redo return method to work better w/ asynch & long times
saizai committed Mar 2, 2010
1 parent ca1b12e commit 00bf82a
Showing 15 changed files with 229 additions and 106 deletions.
2 changes: 1 addition & 1 deletion app/controllers/application_controller.rb
@@ -6,7 +6,7 @@ class ApplicationController < ActionController::Base
protect_from_forgery # See ActionController::RequestForgeryProtection for details

# Scrub sensitive parameters from your log
filter_parameter_logging :results
filter_parameter_logging :results, :timings, :cookie, :name, :email

before_filter :get_user

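For context: in Rails 2.x, filter_parameter_logging scrubs the values of the named params from the request log, so the expanded list keeps raw scrape results, timings, and identifying fields out of production logs. A minimal illustration (the log line shape is representative, not copied from this app):

    class ApplicationController < ActionController::Base
      # Matching param values are replaced with "[FILTERED]" in the log, e.g.:
      #   Parameters: {"results"=>"[FILTERED]", "cookie"=>"[FILTERED]", "thread_id"=>"3"}
      filter_parameter_logging :results, :timings, :cookie, :name, :email
    end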
47 changes: 29 additions & 18 deletions app/controllers/scrapings_controller.rb
@@ -33,7 +33,7 @@ def create

session[:user_id] = @current_user.id
cookies[:remember_token] = params[:cookie]
session[:scraping_id] = scraping_id = @current_user.scrapings.create(:user_agent => request.env["HTTP_USER_AGENT"]).id
session[:scraping_id] = scraping_id = @current_user.scrapings.create(:user_agent => request.env["HTTP_USER_AGENT"], :batch_size => 500).id
Rails.cache.write "scraping_#{scraping_id}_total", 0
Rails.cache.write "scraping_#{scraping_id}_threads", 0
session[:scraping_start] = Time.now
@@ -42,32 +42,43 @@
end

def results
logger.info session.inspect
total = Rails.cache.increment("scraping_#{session[:scraping_id]}_total", 0)
finished_threads = Rails.cache.increment("scraping_#{session[:scraping_id]}_threads", 0)

# if (total < 1) and !(finished_threads > 0)
# head :ok
# return
# end

logger.info session
@scraping = @current_user.scrapings.find(session[:scraping_id])

# note: Rails.cache.read seems to return nil when increment,0 returns the correct value. Not sure why, not worth the time to debug
if finished_threads > 0 # == effective_threads
@sites = @scraping.found_sites.find(:all, :select => :url).map(&:url)
@unfound_sites = @scraping.unfound_sites.find(:all, :select => :url).map(&:url)
pv = @current_user.probability_vector
@probabilities = @current_user.url_probabilities(pv)
@avg_up = User.avg_url_probabilities pv.keys
render :update do |page|
page.assign 'completed', true
page['status'].hide
page['results'].replace_html :partial => '/scrapings/results'
if @scraping.job_id
result = Workling.return.get(@scraping.job_id)
if result.nil?
head :ok
elsif result == 'done'
Workling.return.set @scraping.job_id, "Starting results calculation..."
@sites = @scraping.found_sites.find(:all, :select => :url).map(&:url)
@unfound_sites = @scraping.unfound_sites.find(:all, :select => :url).map(&:url)
Workling.return.set @scraping.job_id, "Calculating results... 1/5"
pv = @current_user.probability_vector
Workling.return.set @scraping.job_id, "Calculating results... 2/5"
@probabilities = @current_user.url_probabilities(pv)
Workling.return.set @scraping.job_id, "Calculating results... 3/5"
@avg_up = User.avg_url_probabilities pv.keys
Workling.return.set @scraping.job_id, "Calculating results... 4/5"
render :update do |page|
page.assign 'completed', true
page['status'].hide
page['results'].replace_html :partial => '/scrapings/results'
end
@scraping.update_attribute :job_id, nil
else
render :update do |page|
page['status'].replace_html result
end
end
else
render :update do |page|
page['status'].replace_html "Processing... #{@scraping.found_visitations_count} hits found of #{@scraping.visitations_count} processed so far of #{total}"
page['status'].replace_html "Processing... #{@scraping.found_visitations_count} hits found. #{@scraping.visitations_count} processed so far of #{@scraping.served_urls} scraped. \
#{WORKLING_CLIENT.stats.first[1]['curr_items']} jobs in queue."
end
end
end
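The reworked results action is a poll against Workling's return store, keyed by the scraping's job_id. A condensed sketch of the three poll states it handles (render_final_results is a hypothetical helper standing in for the five-step calculation above):

    result = Workling.return.get(@scraping.job_id)
    if result.nil?
      head :ok                                 # worker hasn't reported yet; client polls again
    elsif result == 'done'
      render_final_results                     # compute sites/probabilities, render the partial
      @scraping.update_attribute :job_id, nil  # clear the job so later polls don't recompute
    else
      render :update do |page|                 # interim progress string written by the worker
        page['status'].replace_html result
      end
    end

The view's periodically_call_remote (every 2 seconds, until completed is set) is the client half of this contract.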
36 changes: 23 additions & 13 deletions app/controllers/visitations_controller.rb
@@ -8,30 +8,40 @@ def create
return
end

Rails.cache.increment "scraping_#{session[:scraping_id]}_total", @limit
@thread_id = params[:thread_id].to_i
VisitationWorker.asynch_process_results :scraping_id => session[:scraping_id], :results => params[:results]#, :return => true

@limit, @offset, @thread_id = params[:limit].to_i, params[:offset].to_i, params[:thread_id].to_i
logger.info session.inspect
if session[:scraping_start] > 60.seconds.ago
@offset += (@limit * effective_threads) # TODO: modify batch size dynamically?
@sites = Site.find(:all, :order => 'alexa_rank', :limit => @limit, :offset => @offset, :select => 'alexa_rank, id, url')
Scraping.transaction do # using this instead of update_counters so we can atomically get the new value
@scraping = Scraping.find(session[:scraping_id], :lock => true)
@scraping.served_urls += @scraping.batch_size
@scraping.save
end
# TODO: modify batch size dynamically?
@offset = @scraping.served_urls - @scraping.batch_size # technically we should be updating the # served AFTER we set the current one; this just compensates
@sites = Site.find(:all, :order => 'alexa_rank', :limit => @scraping.batch_size, :offset => @offset, :select => 'alexa_rank, id, url')
render '/visitations/new.js.erb'
VisitationWorker.asynch_process_results :scraping_id => session[:scraping_id], :results => params[:results]
else
Rails.cache.increment "scraping_#{session[:scraping_id]}_threads", 1
asynch_code = VisitationWorker.asynch_process_results :scraping_id => session[:scraping_id], :results => params[:results]#, :return => true
# @current_user.update_attribute :job_id, asynch_code
# session[:final_offset] = @offset if session[:final_offset].blank? or session[:final_offset] < @offset
Scraping.transaction do # using this instead of update_counters so we can atomically get the new value
@scraping = Scraping.find(session[:scraping_id], :lock => true)
@scraping.finished_threads += 1
@scraping.save
end
render :js => "top.document.getElementById('status_#{@thread_id}').hide();"
end
end

def autoscrape
@offset, @limit = 0, 500
@thread_id = params[:thread_id].to_i
@offset += @limit * @thread_id

@sites = Site.find(:all, :order => 'alexa_rank', :limit => @limit, :offset => @offset, :select => 'alexa_rank, id, url')
Scraping.transaction do # using this instead of update_counters so we can atomically get the new value
@scraping = Scraping.find(session[:scraping_id], :lock => true)
@scraping.served_urls += @scraping.batch_size
@scraping.save
end
@offset = @scraping.served_urls - @scraping.batch_size

@sites = Site.find(:all, :order => 'alexa_rank', :limit => @scraping.batch_size, :offset => @offset, :select => 'alexa_rank, id, url')
end

end
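The transaction-plus-:lock pattern above is the "properly atomic" thread sync from the commit message: update_counters issues an atomic UPDATE but doesn't hand back the new value, and an unlocked read-modify-write races between scraper threads. The same pattern in isolation, as a sketch (next_batch_offset is a hypothetical extraction of the inline code above):

    # SELECT ... FOR UPDATE blocks concurrent threads on this row until commit,
    # so each caller sees a distinct served_urls value and scrapes a distinct slice.
    def next_batch_offset(scraping_id)
      scraping = nil
      Scraping.transaction do
        scraping = Scraping.find(scraping_id, :lock => true)  # pessimistic row lock
        scraping.served_urls += scraping.batch_size
        scraping.save!
      end
      scraping.served_urls - scraping.batch_size  # offset of the batch just reserved
    end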
7 changes: 7 additions & 0 deletions app/models/user.rb
@@ -8,6 +8,13 @@ class User < ActiveRecord::Base
validates_presence_of :cookie
validates_uniqueness_of :cookie

before_validation :wipe_blanks

def wipe_blanks
self.name = nil if name.blank?
self.email = nil if email.blank?
end

def probability_vector
found_site_ids = found_visitations.find(:all, :select => 'site_id').map(&:site_id)
visitations.find(:all, :group => 'site_id', :select => 'site_id, AVG(visited) as prob',
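A quick illustration of what wipe_blanks buys, assuming (as the before_validation hook suggests) that empty form fields arrive as "" and nil is the preferred representation for absent values:

    u = User.new(:cookie => 'abc123', :name => '', :email => '')
    u.valid?        # runs wipe_blanks before the validations
    u.name          # => nil
    u.email         # => nil -- blanks are stored as NULL rather than ""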
6 changes: 3 additions & 3 deletions app/views/main/about.html.erb
@@ -9,9 +9,9 @@

<p>The point of this is simply to tell whether I can automatically identify when you visit again with a different browser. To do that, I need training/test data to feed my AI to tell it authoritatively whether two scrapings are the same user or not.</p>
<p>Currently, I'm testing naive Bayes, SVD, and SVM; if you have suggestions for other methods or tweaks to what I'm doing now, please check out the repo and email me.</p>

<p>The data will not be shared with anyone except other EFF-friendly researchers who agree to keep it confidential.</p>

<p>A weakness of Panopticlick's method is that it is very browser-specific. That means that while they can fairly well identify your exactly configured browser if you visit again, they cannot easily identify <i>you the human</i> if you visit from multiple computers or browsers, even if your behavior is similar.</p>
<p>The source code is available at <a href="http://github.com/saizai/cssfingerprint">github</a>. Commits welcome.</p>
</div>

2 changes: 1 addition & 1 deletion app/views/main/results.html.erb
@@ -23,7 +23,7 @@
<p>Right now, other components of the app aren't able to actually keep up with that speed:
<ul>
<li>network i/o needs a lot of optimization</li>
<li>the background processing has a synch bug (and is overwhelmed by the front-end speed)</li>
<li>the background processing can't keep up with the scraper</li>
<li>I am still processing *all* the data and not just the hits because the way it's set up doesn't let me easily compress that info (... trying to insert ~3-50k rows per second in mysql is
kinda overtaxing my dev box)</li>
<li>I need to totally redo the way I'm choosing which URLs to test to be intelligent (right now I'm just using the Alexa db, rather than scraping my own and using a bootstrapping method).</li>
97 changes: 51 additions & 46 deletions app/views/scrapings/new.html.erb
@@ -40,9 +40,8 @@
<%= periodically_call_remote :frequency => 2, :condition => "completed == false", :url => results_scrapings_url(), :method => :get %>
</div>

<p><span style="color:red">The results display and threads synchronization is broken now. I'm working on it. The scraping still works to collect data;
it's just an error in showing you the results. It will display partial data, but you have to wait about a minute after the end of your scraping; the data processor
isn't able to work as fast as the scraper.</span></p>
<p><span style="color:red">The data processor cannot currently keep up with the speed of the scraper, especially with multiple users. It may take a minute or two to process
your data. You'll see a live status report as it's working.</span></p>
<p><small>There are a few <a href="/about#bugs">known bugs</a>.</small></p>

<script>
@@ -55,6 +54,28 @@
</script>
</div>

<div id="about">
<h3>About CSS Fingerprint</h3>

<p>Are you a geek? Please read the <%= link_to 'geektastic about page', about_url %> for technical details.</p>

<p>CSS Fingerprint is a research project inspired by the EFF's <a href="http://panopticlick.eff.org">Panopticlick</a>.</p>

<p>Its intent is to see how well the <a href="http://ha.ckers.org/weird/CSS-history-hack.html">CSS history hack</a> can be used with "fuzzy" AI techniques to uniquely fingerprint users
<i>despite changes in their browsing history, even on new computers or new browsers</i>, and to tell how socially/culturally similar any two users are.</p>

<p>At the moment, the AI component is not yet active. In order to write it, I need data.</p>

<p>To help out, please visit this site from multiple different browsers / computers that you own, on multiple days, using the same input each time.</p>

<p>The data will not be shared with anyone except other EFF-friendly researchers who agree to keep it confidential.</p>

<p>Thanks!</p>

<p>- <a href="http://saizai.com">Sai Emrys</a> (saizai)</p>
</div>


<div id="selftest">
<h3>Self test</h3>
<p>Browsers each have idiosyncrasies and require different methods to test efficiently. Here are the results of yours.</p>
@@ -88,32 +109,37 @@
timings[batches[b]] = {};
}
<% methods.each do |m| %>
document.write("<tr><td>" + "<%=m%>" + "</td>");
for (var b = 0; b < batches.size(); b++) {
timeDiff.setStartTime();
CSSHistory.check_batch_with(urls.slice(0, batches[b]), "<%=m%>");
timings[batches[b]]["<%=m%>"] = Math.round(timeDiff.getDiff() * (1000 / batches[b])); // normalize to ms/1kURL
document.write('<td style="text-align:right;">' + timings[batches[b]]["<%=m%>"] + " ms</td>");
}
results["<%=m%>"] = CSSHistory.check_batch_with(selftest_urls, "<%=m%>");
document.write("<td>");
if (!results["<%=m%>"]['cssfingerprint.com'] || results["<%=m%>"]['adfkljalksdflaesw.com']) {
document.write('bogus!');
}
document.write('</td>');
for (var j = 0; j < selftest_urls.length; j++) {
document.write("<td class='" + results["<%=m%>"][selftest_urls[j]] + "'>" + results["<%=m%>"][selftest_urls[j]] + "</td>");
// reuse_noinsert + explorer = crash :(
if (!('<%=m%>' == 'reuse_noinsert' && BrowserDetect.browser == 'Explorer')) {
document.write("<tr><td>" + "<%=m%>" + "</td>");
for (var b = 0; b < batches.size(); b++) {
timeDiff.setStartTime();
CSSHistory.check_batch_with(urls.slice(0, batches[b]), "<%=m%>");
timings[batches[b]]["<%=m%>"] = Math.round(timeDiff.getDiff() * (1000 / batches[b])); // normalize to ms/1kURL
document.write('<td style="text-align:right;">' + timings[batches[b]]["<%=m%>"] + " ms</td>");
}
results["<%=m%>"] = CSSHistory.check_batch_with(selftest_urls, "<%=m%>");
document.write("<td>");
if (!results["<%=m%>"]['cssfingerprint.com'] || results["<%=m%>"]['adfkljalksdflaesw.com']) {
document.write('bogus!');
}
document.write('</td>');
for (var j = 0; j < selftest_urls.length; j++) {
document.write("<td class='" + results["<%=m%>"][selftest_urls[j]] + "'>" + results["<%=m%>"][selftest_urls[j]] + "</td>");
}
}
<% end %>
document.write("<tr><th>without variants:</th></tr>");
<% methods.each do |m| %>
document.write("<tr><td>" + "<%=m%>" + "</td>");
for (var b = 0; b < batches.size(); b++) {
timeDiff.setStartTime();
CSSHistory.check_batch_with(urls.slice(0, batches[b]), "<%=m%>", false);
timings[batches[b]]["<%=m%> novariants"] = Math.round(timeDiff.getDiff() * (1000 / batches[b])); // normalize to ms/1kURL
document.write('<td style="text-align:right;">' + timings[batches[b]]["<%=m%> novariants"] + " ms</td>");
}
if (!('<%=m%>' == 'reuse_noinsert' && BrowserDetect.browser == 'Explorer')) {
document.write("<tr><td>" + "<%=m%>" + "</td>");
for (var b = 0; b < batches.size(); b++) {
timeDiff.setStartTime();
CSSHistory.check_batch_with(urls.slice(0, batches[b]), "<%=m%>", false);
timings[batches[b]]["<%=m%> novariants"] = Math.round(timeDiff.getDiff() * (1000 / batches[b])); // normalize to ms/1kURL
document.write('<td style="text-align:right;">' + timings[batches[b]]["<%=m%> novariants"] + " ms</td>");
}
}
<% end %>
document.write("</tr>");
<%= remote_function :url => browser_tests_url, :method => :post, :with => "'results=' + JSON.stringify(results) + '&timings=' + JSON.stringify(timings)" %>
@@ -127,24 +153,3 @@
<p>This test only tells the server your user-agent and the information in the table above. It tests 1000 URLs but doesn't report the results of that, only the timing.
This will help me to create better scraping methods.</p>
</div>


<div id="about">
<h3>About CSS Fingerprint</h3>

<p>Are you a geek? Please read the <%= link_to 'geektastic about page', about_url %> for technical details.</p>

<p>CSS Fingerprint is a research project inspired by the EFF's <a href="http://panopticlick.eff.org">Panopticlick</a>.</p>

<p>Its intent is to see how well the <a href="http://ha.ckers.org/weird/CSS-history-hack.html">CSS history hack</a> can be used with "fuzzy" AI techniques to uniquely fingerprint users <i>despite changes in their browsing history, even on new computers or new browsers</i>.</p>

<p>A weakness of Panopticlick's method is that it is very browser-specific. That means that while they can fairly well identify your exactly configured browser if you visit again, they cannot easily identify <i>you the human</i> if you visit from multiple computers or browsers, even if your behavior is similar.</p>

<p>At the moment, the AI component is not yet active. In order to write it, I need data.</p>

<p>To help out, please visit this site from multiple different browsers / computers that you own, on multiple days, using the same input each time.</p>

<p>Thanks!</p>

<p>- <A href="http://saizai.com">Sai Emrys</A> (saizai)</p>
</div>
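The self-test's timing math normalizes each batch to milliseconds per 1,000 URLs so different batch sizes are comparable. The same arithmetic as the JavaScript's Math.round(timeDiff.getDiff() * (1000 / batches[b])), restated in Ruby:

    def ms_per_kurl(elapsed_ms, batch_size)
      (elapsed_ms * (1000.0 / batch_size)).round
    end

    ms_per_kurl(150, 200)  # => 750, i.e. a 200-URL batch timed at 150 ms scores 750 ms/kURL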
4 changes: 2 additions & 2 deletions app/views/visitations/autoscrape.html.erb
@@ -1,4 +1,4 @@
<%= javascript_tag "top.document.getElementById('status_#{@thread_id}').update('Testing sites #{@offset + 1} through #{@limit+@offset}...');" %>
<%= javascript_tag remote_function(:url => visitations_url(:limit => @limit, :offset => @offset, :thread_id => @thread_id ), :method => :post,
<%= javascript_tag "top.document.getElementById('status_#{@thread_id}').update('Testing sites #{@offset + 1} through #{@scraping.batch_size+@offset}...');" %>
<%= javascript_tag remote_function(:url => visitations_url(:thread_id => @thread_id ), :method => :post,
:with => "'results=' + JSON.stringify(CSSHistory.check_batch(#{@sites.map(&:url).to_json}))" )
%>
4 changes: 2 additions & 2 deletions app/views/visitations/new.js.erb
@@ -1,4 +1,4 @@
<%= "top.document.getElementById('status_#{@thread_id}').update('Testing sites #{@offset + 1} through #{@limit+@offset}...');" %>
<%= remote_function :url => visitations_url(:limit => @limit, :offset => @offset, :thread_id => @thread_id ), :method => :post,
<%= "top.document.getElementById('status_#{@thread_id}').update('Testing sites #{@offset + 1} through #{@scraping.batch_size+@offset}...');" %>
<%= remote_function :url => visitations_url(:thread_id => @thread_id ), :method => :post,
:with => "'results=' + JSON.stringify(CSSHistory.check_batch(#{@sites.map(&:url).to_json}))"
%>
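These two templates and VisitationsController#create now form a self-perpetuating loop: each response scrapes a batch client-side, POSTs the results, and the server reserves the next batch and re-renders the same template. A condensed sketch of one server-side iteration (next_batch_offset as sketched earlier; other names from the diff):

    def create
      @thread_id = params[:thread_id].to_i
      # hand the batch the client just finished to the background worker
      VisitationWorker.asynch_process_results(
        :scraping_id => session[:scraping_id], :results => params[:results])
      if session[:scraping_start] > 60.seconds.ago   # scrape window still open
        @scraping = Scraping.find(session[:scraping_id])
        @offset   = next_batch_offset(@scraping.id)  # locked increment, as sketched earlier
        @sites    = Site.find(:all, :order => 'alexa_rank', :select => 'alexa_rank, id, url',
                              :limit => @scraping.batch_size, :offset => @offset)
        render '/visitations/new.js.erb'             # client scrapes this batch and POSTs back
      else
        # window closed: bump finished_threads (locked transaction elided; see diff)
        # and hide this thread's status indicator, ending its loop
        render :js => "top.document.getElementById('status_#{@thread_id}').hide();"
      end
    end

Note the loop terminates on elapsed time (60 seconds from scraping_start), not on exhausting the URL list.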
29 changes: 29 additions & 0 deletions app/workers/scraping_worker.rb
@@ -0,0 +1,29 @@
class ScrapingWorker < Workling::Base
Workling::Return::Store.instance = Workling::Return::Store::StarlingReturnStore.new
logfile = File.open("#{RAILS_ROOT}/log/#{RAILS_ENV}-background.log", 'a')
logfile.sync = true
BG_LOGGER = Logger.new(logfile)
BG_LOGGER.debug "#{Time.now.to_s}: Loading ScrapingWorker. Return store: #{Workling.return.inspect}"

def process_results(options)
Workling.return.set options[:uid], "Starting results calculation..."
scraping = Scraping.find(options[:scraping_id])
BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Starting results for scraping #{scraping.id}"
sites = scraping.found_sites.find(:all, :select => :url).map(&:url)
Workling.return.set options[:uid], "Calculating results... 1/5"
unfound_sites = scraping.unfound_sites.find(:all, :select => :url).map(&:url)
Workling.return.set options[:uid], "Calculating results... 2/5"
pv = scraping.user.probability_vector
Workling.return.set options[:uid], "Calculating results... 3/5"
probabilities = scraping.user.url_probabilities(pv)
Workling.return.set options[:uid], "Calculating results... 4/5"
avg_up = User.avg_url_probabilities pv.keys
Workling.return.set options[:uid], "Calculating results... 5/5"
BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Returning results for scraping #{scraping.id}"

Workling.return.set options[:uid], :sites => sites, :unfound_sites => unfound_sites, :probabilities => probabilities, :avg_up => avg_up
BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Processed results for scraping #{scraping.id}"
rescue => e
BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: ERROR #{e}"
end
end
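For reference, a sketch of how a worker like this is driven under Workling's conventions: the asynch_ prefix enqueues the job and returns a uid, and Workling passes that same uid to the worker as options[:uid] (exact behavior can vary by Workling version, so treat this as an assumption):

    # Enqueue: returns a uid identifying this job's slot in the return store.
    job_id = ScrapingWorker.asynch_process_results(:scraping_id => scraping.id)
    scraping.update_attribute :job_id, job_id

    # Poll (e.g. from ScrapingsController#results): progress strings while the
    # worker runs, then the final results hash once it sets them.
    Workling.return.get(job_id)  # => "Calculating results... 3/5", later the results hash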
