add auto selftest w/ fallthrough to scraper js; move scraping info into scrapings out of session/params; overhaul threads sync method to be properly atomic; redo return method to work better w/ asynch & long times
saizai committed Mar 2, 2010
1 parent ca1b12e commit 00bf82a
Showing 15 changed files with 229 additions and 106 deletions.
2 changes: 1 addition & 1 deletion app/controllers/application_controller.rb
@@ -6,7 +6,7 @@ class ApplicationController < ActionController::Base
protect_from_forgery # See ActionController::RequestForgeryProtection for details

# Scrub sensitive parameters from your log
filter_parameter_logging :results
filter_parameter_logging :results, :timings, :cookie, :name, :email

before_filter :get_user

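For context: in Rails 2.x, filter_parameter_logging scrubs the values of the named params from the request log, so the expanded list keeps raw scrape results, timings, and identifying fields out of production logs. A minimal illustration (the log line shape is representative, not copied from this app):

    class ApplicationController < ActionController::Base
      # Matching param values are replaced with "[FILTERED]" in the log, e.g.:
      #   Parameters: {"results"=>"[FILTERED]", "cookie"=>"[FILTERED]", "thread_id"=>"3"}
      filter_parameter_logging :results, :timings, :cookie, :name, :email
    end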
47 changes: 29 additions & 18 deletions app/controllers/scrapings_controller.rb
@@ -33,7 +33,7 @@ def create

session[:user_id] = @current_user.id
cookies[:remember_token] = params[:cookie]
session[:scraping_id] = scraping_id = @current_user.scrapings.create(:user_agent => request.env["HTTP_USER_AGENT"]).id
session[:scraping_id] = scraping_id = @current_user.scrapings.create(:user_agent => request.env["HTTP_USER_AGENT"], :batch_size => 500).id
Rails.cache.write "scraping_#{scraping_id}_total", 0
Rails.cache.write "scraping_#{scraping_id}_threads", 0
session[:scraping_start] = Time.now
@@ -42,32 +42,43 @@
end

def results
logger.info session.inspect
total = Rails.cache.increment("scraping_#{session[:scraping_id]}_total", 0)
finished_threads = Rails.cache.increment("scraping_#{session[:scraping_id]}_threads", 0)

# if (total < 1) and !(finished_threads > 0)
# head :ok
# return
# end

logger.info session
@scraping = @current_user.scrapings.find(session[:scraping_id])

# note: Rails.cache.read seems to return nil when increment,0 returns the correct value. Not sure why, not worth the time to debug
if finished_threads > 0 # == effective_threads
@sites = @scraping.found_sites.find(:all, :select => :url).map(&:url)
@unfound_sites = @scraping.unfound_sites.find(:all, :select => :url).map(&:url)
pv = @current_user.probability_vector
@probabilities = @current_user.url_probabilities(pv)
@avg_up = User.avg_url_probabilities pv.keys
render :update do |page|
page.assign 'completed', true
page['status'].hide
page['results'].replace_html :partial => '/scrapings/results'
if @scraping.job_id
result = Workling.return.get(@scraping.job_id)
if result.nil?
head :ok
elsif result == 'done'
Workling.return.set @scraping.job_id, "Starting results calculation..."
@sites = @scraping.found_sites.find(:all, :select => :url).map(&:url)
@unfound_sites = @scraping.unfound_sites.find(:all, :select => :url).map(&:url)
Workling.return.set @scraping.job_id, "Calculating results... 1/5"
pv = @current_user.probability_vector
Workling.return.set @scraping.job_id, "Calculating results... 2/5"
@probabilities = @current_user.url_probabilities(pv)
Workling.return.set @scraping.job_id, "Calculating results... 3/5"
@avg_up = User.avg_url_probabilities pv.keys
Workling.return.set @scraping.job_id, "Calculating results... 4/5"
render :update do |page|
page.assign 'completed', true
page['status'].hide
page['results'].replace_html :partial => '/scrapings/results'
end
@scraping.update_attribute :job_id, nil
else
render :update do |page|
page['status'].replace_html result
end
end
else
render :update do |page|
page['status'].replace_html "Processing... #{@scraping.found_visitations_count} hits found of #{@scraping.visitations_count} processed so far of #{total}"
page['status'].replace_html "Processing... #{@scraping.found_visitations_count} hits found. #{@scraping.visitations_count} processed so far of #{@scraping.served_urls} scraped. \
#{WORKLING_CLIENT.stats.first[1]['curr_items']} jobs in queue."
end
end
end
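The reworked results action is a poll against Workling's return store, keyed by the scraping's job_id. A condensed sketch of the three poll states it handles (render_final_results is a hypothetical helper standing in for the five-step calculation above):

    result = Workling.return.get(@scraping.job_id)
    if result.nil?
      head :ok                                 # worker hasn't reported yet; client polls again
    elsif result == 'done'
      render_final_results                     # compute sites/probabilities, render the partial
      @scraping.update_attribute :job_id, nil  # clear the job so later polls don't recompute
    else
      render :update do |page|                 # interim progress string written by the worker
        page['status'].replace_html result
      end
    end

The view's periodically_call_remote (every 2 seconds, until completed is set) is the client half of this contract.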
36 changes: 23 additions & 13 deletions app/controllers/visitations_controller.rb
@@ -8,30 +8,40 @@ def create
return
end

Rails.cache.increment "scraping_#{session[:scraping_id]}_total", @limit
@thread_id = params[:thread_id].to_i
VisitationWorker.asynch_process_results :scraping_id => session[:scraping_id], :results => params[:results]#, :return => true

@limit, @offset, @thread_id = params[:limit].to_i, params[:offset].to_i, params[:thread_id].to_i
logger.info session.inspect
if session[:scraping_start] > 60.seconds.ago
@offset += (@limit * effective_threads) # TODO: modify batch size dynamically?
@sites = Site.find(:all, :order => 'alexa_rank', :limit => @limit, :offset => @offset, :select => 'alexa_rank, id, url')
Scraping.transaction do # using this instead of update_counters so we can atomically get the new value
@scraping = Scraping.find(session[:scraping_id], :lock => true)
@scraping.served_urls += @scraping.batch_size
@scraping.save
end
# TODO: modify batch size dynamically?
@offset = @scraping.served_urls - @scraping.batch_size # technically we should be updating the # served AFTER we set the current one; this just compensates
@sites = Site.find(:all, :order => 'alexa_rank', :limit => @scraping.batch_size, :offset => @offset, :select => 'alexa_rank, id, url')
render '/visitations/new.js.erb'
VisitationWorker.asynch_process_results :scraping_id => session[:scraping_id], :results => params[:results]
else
Rails.cache.increment "scraping_#{session[:scraping_id]}_threads", 1
asynch_code = VisitationWorker.asynch_process_results :scraping_id => session[:scraping_id], :results => params[:results]#, :return => true
# @current_user.update_attribute :job_id, asynch_code
# session[:final_offset] = @offset if session[:final_offset].blank? or session[:final_offset] < @offset
Scraping.transaction do # using this instead of update_counters so we can atomically get the new value
@scraping = Scraping.find(session[:scraping_id], :lock => true)
@scraping.finished_threads += 1
@scraping.save
end
render :js => "top.document.getElementById('status_#{@thread_id}').hide();"
end
end

def autoscrape
@offset, @limit = 0, 500
@thread_id = params[:thread_id].to_i
@offset += @limit * @thread_id

@sites = Site.find(:all, :order => 'alexa_rank', :limit => @limit, :offset => @offset, :select => 'alexa_rank, id, url')
Scraping.transaction do # using this instead of update_counters so we can atomically get the new value
@scraping = Scraping.find(session[:scraping_id], :lock => true)
@scraping.served_urls += @scraping.batch_size
@scraping.save
end
@offset = @scraping.served_urls - @scraping.batch_size

@sites = Site.find(:all, :order => 'alexa_rank', :limit => @scraping.batch_size, :offset => @offset, :select => 'alexa_rank, id, url')
end

end
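The transaction-plus-:lock pattern above is the "properly atomic" thread sync from the commit message: update_counters issues an atomic UPDATE but doesn't hand back the new value, and an unlocked read-modify-write races between scraper threads. The same pattern in isolation, as a sketch (next_batch_offset is a hypothetical extraction of the inline code above):

    # SELECT ... FOR UPDATE blocks concurrent threads on this row until commit,
    # so each caller sees a distinct served_urls value and scrapes a distinct slice.
    def next_batch_offset(scraping_id)
      scraping = nil
      Scraping.transaction do
        scraping = Scraping.find(scraping_id, :lock => true)  # pessimistic row lock
        scraping.served_urls += scraping.batch_size
        scraping.save!
      end
      scraping.served_urls - scraping.batch_size  # offset of the batch just reserved
    end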
7 changes: 7 additions & 0 deletions app/models/user.rb
@@ -8,6 +8,13 @@ class User < ActiveRecord::Base
validates_presence_of :cookie
validates_uniqueness_of :cookie

before_validation :wipe_blanks

def wipe_blanks
self.name = nil if name.blank?
self.email = nil if email.blank?
end

def probability_vector
found_site_ids = found_visitations.find(:all, :select => 'site_id').map(&:site_id)
visitations.find(:all, :group => 'site_id', :select => 'site_id, AVG(visited) as prob',
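A quick illustration of what wipe_blanks buys, assuming (as the before_validation hook suggests) that empty form fields arrive as "" and nil is the preferred representation for absent values:

    u = User.new(:cookie => 'abc123', :name => '', :email => '')
    u.valid?        # runs wipe_blanks before the validations
    u.name          # => nil
    u.email         # => nil -- blanks are stored as NULL rather than ""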
6 changes: 3 additions & 3 deletions app/views/main/about.html.erb
@@ -9,9 +9,9 @@

<p>The point of this is simply to tell whether I can automatically identify when you visit again with a different browser. To do that, I need training/test data to feed my AI to tell it authoritatively whether two scrapings are the same user or not.</p>
<p>Currently, I'm testing naive Bayes, SVD, and SVM; if you have suggestions for other methods or tweaks to what I'm doing now, please check out the repo and email me.</p>

<p>The data will not be shared with anyone except other EFF-friendly researchers who agree to keep it confidential.</p>

<p>A weakness of Panopticlick's method is that it is very browser-specific. That means that while they can fairly well identify your exactly configured browser if you visit again, they cannot easily identify <i>you the human</i> if you visit from multiple computers or browsers, even if your behavior is similar.</p>
<p>The source code is available at <a href="http://github.com/saizai/cssfingerprint">github</a>. Commits welcome.</p>
</div>

2 changes: 1 addition & 1 deletion app/views/main/results.html.erb
@@ -23,7 +23,7 @@
<p>Right now, other components of the app aren't able to actually keep up with that speed:
<ul>
<li>network i/o needs a lot of optimization</li>
<li>the background processing has a synch bug (and is overwhelmed by the front-end speed)</li>
<li>the background processing can't keep up with the scraper</li>
<li>I am still processing *all* the data and not just the hits because the way it's set up doesn't let me easily compress that info (... trying to insert ~3-50k rows per second in mysql is
kinda overtaxing my dev box)</li>
<li>I need to totally redo the way I'm choosing which URLs to test to be intelligent (right now I'm just using the Alexa db, rather than scraping my own and using a bootstrapping method).</li>
97 changes: 51 additions & 46 deletions app/views/scrapings/new.html.erb
@@ -40,9 +40,8 @@
<%= periodically_call_remote :frequency => 2, :condition => "completed == false", :url => results_scrapings_url(), :method => :get %>
</div>

<p><span style="color:red">The results display and threads synchronization is broken now. I'm working on it. The scraping still works to collect data;
it's just an error in showing you the results. It will display partial data, but you have to wait about a minute after the end of your scraping; the data processor
isn't able to work as fast as the scraper.</span></p>
<p><span style="color:red">The data processor cannot currently keep up with the speed of the scraper, especially with multiple users. It may take a minute or two to process
your data. You'll see a live status report as it's working.</span></p>
<p><small>There are a few <a href="/about#bugs">known bugs</a>.</small></p>

<script>
@@ -55,6 +54,28 @@
</script>
</div>

<div id="about">
<h3>About CSS Fingerprint</h3>

<p>Are you a geek? Please read the <%= link_to 'geektastic about page', about_url %> for technical details.</p>

<p>CSS Fingerprint is a research project inspired by the EFF's <a href="http://panopticlick.eff.org">Panopticlick</a>.</p>

<p>Its intent is to see how well the <a href="http://ha.ckers.org/weird/CSS-history-hack.html">CSS history hack</a> can be used with "fuzzy" AI techniques to uniquely fingerprint users
<i>despite changes in their browsing history, even on new computers or new browsers</i>, and to tell how socially/culturally similar any two users are.</p>

<p>At the moment, the AI component is not yet active. In order to write it, I need data.</p>

<p>To help out, please visit this site from multiple different browsers / computers that you own, on multiple days, using the same input each time.</p>

<p>The data will not be shared with anyone except other EFF-friendly researchers who agree to keep it confidential.</p>

<p>Thanks!</p>

<p>- <a href="http://saizai.com">Sai Emrys</a> (saizai)</p>
</div>


<div id="selftest">
<h3>Self test</h3>
<p>Browsers each have idiosyncrasies and require different methods to test efficiently. Here are the results of yours.</p>
@@ -88,32 +109,37 @@
timings[batches[b]] = {};
}
<% methods.each do |m| %>
document.write("<tr><td>" + "<%=m%>" + "</td>");
for (var b = 0; b < batches.size(); b++) {
timeDiff.setStartTime();
CSSHistory.check_batch_with(urls.slice(0, batches[b]), "<%=m%>");
timings[batches[b]]["<%=m%>"] = Math.round(timeDiff.getDiff() * (1000 / batches[b])); // normalize to ms/1kURL
document.write('<td style="text-align:right;">' + timings[batches[b]]["<%=m%>"] + " ms</td>");
}
results["<%=m%>"] = CSSHistory.check_batch_with(selftest_urls, "<%=m%>");
document.write("<td>");
if (!results["<%=m%>"]['cssfingerprint.com'] || results["<%=m%>"]['adfkljalksdflaesw.com']) {
document.write('bogus!');
}
document.write('</td>');
for (var j = 0; j < selftest_urls.length; j++) {
document.write("<td class='" + results["<%=m%>"][selftest_urls[j]] + "'>" + results["<%=m%>"][selftest_urls[j]] + "</td>");
// reuse_noinsert + explorer = crash :(
if (!('<%=m%>' == 'reuse_noinsert' && BrowserDetect.browser == 'Explorer')) {
document.write("<tr><td>" + "<%=m%>" + "</td>");
for (var b = 0; b < batches.size(); b++) {
timeDiff.setStartTime();
CSSHistory.check_batch_with(urls.slice(0, batches[b]), "<%=m%>");
timings[batches[b]]["<%=m%>"] = Math.round(timeDiff.getDiff() * (1000 / batches[b])); // normalize to ms/1kURL
document.write('<td style="text-align:right;">' + timings[batches[b]]["<%=m%>"] + " ms</td>");
}
results["<%=m%>"] = CSSHistory.check_batch_with(selftest_urls, "<%=m%>");
document.write("<td>");
if (!results["<%=m%>"]['cssfingerprint.com'] || results["<%=m%>"]['adfkljalksdflaesw.com']) {
document.write('bogus!');
}
document.write('</td>');
for (var j = 0; j < selftest_urls.length; j++) {
document.write("<td class='" + results["<%=m%>"][selftest_urls[j]] + "'>" + results["<%=m%>"][selftest_urls[j]] + "</td>");
}
}
<% end %>
document.write("<tr><th>without variants:</th></tr>");
<% methods.each do |m| %>
document.write("<tr><td>" + "<%=m%>" + "</td>");
for (var b = 0; b < batches.size(); b++) {
timeDiff.setStartTime();
CSSHistory.check_batch_with(urls.slice(0, batches[b]), "<%=m%>", false);
timings[batches[b]]["<%=m%> novariants"] = Math.round(timeDiff.getDiff() * (1000 / batches[b])); // normalize to ms/1kURL
document.write('<td style="text-align:right;">' + timings[batches[b]]["<%=m%> novariants"] + " ms</td>");
}
if (!('<%=m%>' == 'reuse_noinsert' && BrowserDetect.browser == 'Explorer')) {
document.write("<tr><td>" + "<%=m%>" + "</td>");
for (var b = 0; b < batches.size(); b++) {
timeDiff.setStartTime();
CSSHistory.check_batch_with(urls.slice(0, batches[b]), "<%=m%>", false);
timings[batches[b]]["<%=m%> novariants"] = Math.round(timeDiff.getDiff() * (1000 / batches[b])); // normalize to ms/1kURL
document.write('<td style="text-align:right;">' + timings[batches[b]]["<%=m%> novariants"] + " ms</td>");
}
}
<% end %>
document.write("</tr>");
<%= remote_function :url => browser_tests_url, :method => :post, :with => "'results=' + JSON.stringify(results) + '&timings=' + JSON.stringify(timings)" %>
@@ -127,24 +153,3 @@
<p>This test only tells the server your user-agent and the information in the table above. It tests 1000 URLs but doesn't report the results of that, only the timing.
This will help me to create better scraping methods.</p>
</div>


<div id="about">
<h3>About CSS Fingerprint</h3>

<p>Are you a geek? Please read the <%= link_to 'geektastic about page', about_url %> for technical details.</p>

<p>CSS Fingerprint is a research project inspired by the EFF's <a href="http://panopticlick.eff.org">Panopticlick</a>.</p>

<p>Its intent is to see how well the <a href="http://ha.ckers.org/weird/CSS-history-hack.html">CSS history hack</a> can be used with "fuzzy" AI techniques to uniquely fingerprint users <i>despite changes in their browsing history, even on new computers or new browsers</i>.</p>

<p>A weakness of Panopticlick's method is that it is very browser-specific. That means that while they can fairly well identify your exactly configured browser if you visit again, they cannot easily identify <i>you the human</i> if you visit from multiple computers or browsers, even if your behavior is similar.</p>

<p>At the moment, the AI component is not yet active. In order to write it, I need data.</p>

<p>To help out, please visit this site from multiple different browsers / computers that you own, on multiple days, using the same input each time.</p>

<p>Thanks!</p>

<p>- <A href="http://saizai.com">Sai Emrys</A> (saizai)</p>
</div>
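The self-test's timing math normalizes each batch to milliseconds per 1,000 URLs so different batch sizes are comparable. The same arithmetic as the JavaScript's Math.round(timeDiff.getDiff() * (1000 / batches[b])), restated in Ruby:

    def ms_per_kurl(elapsed_ms, batch_size)
      (elapsed_ms * (1000.0 / batch_size)).round
    end

    ms_per_kurl(150, 200)  # => 750, i.e. a 200-URL batch timed at 150 ms scores 750 ms/kURL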
4 changes: 2 additions & 2 deletions app/views/visitations/autoscrape.html.erb
@@ -1,4 +1,4 @@
<%= javascript_tag "top.document.getElementById('status_#{@thread_id}').update('Testing sites #{@offset + 1} through #{@limit+@offset}...');" %>
<%= javascript_tag remote_function(:url => visitations_url(:limit => @limit, :offset => @offset, :thread_id => @thread_id ), :method => :post,
<%= javascript_tag "top.document.getElementById('status_#{@thread_id}').update('Testing sites #{@offset + 1} through #{@scraping.batch_size+@offset}...');" %>
<%= javascript_tag remote_function(:url => visitations_url(:thread_id => @thread_id ), :method => :post,
:with => "'results=' + JSON.stringify(CSSHistory.check_batch(#{@sites.map(&:url).to_json}))" )
%>
4 changes: 2 additions & 2 deletions app/views/visitations/new.js.erb
@@ -1,4 +1,4 @@
<%= "top.document.getElementById('status_#{@thread_id}').update('Testing sites #{@offset + 1} through #{@limit+@offset}...');" %>
<%= remote_function :url => visitations_url(:limit => @limit, :offset => @offset, :thread_id => @thread_id ), :method => :post,
<%= "top.document.getElementById('status_#{@thread_id}').update('Testing sites #{@offset + 1} through #{@scraping.batch_size+@offset}...');" %>
<%= remote_function :url => visitations_url(:thread_id => @thread_id ), :method => :post,
:with => "'results=' + JSON.stringify(CSSHistory.check_batch(#{@sites.map(&:url).to_json}))"
%>
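These two templates and VisitationsController#create now form a self-perpetuating loop: each response scrapes a batch client-side, POSTs the results, and the server reserves the next batch and re-renders the same template. A condensed sketch of one server-side iteration (next_batch_offset as sketched earlier; other names from the diff):

    def create
      @thread_id = params[:thread_id].to_i
      # hand the batch the client just finished to the background worker
      VisitationWorker.asynch_process_results(
        :scraping_id => session[:scraping_id], :results => params[:results])
      if session[:scraping_start] > 60.seconds.ago   # scrape window still open
        @scraping = Scraping.find(session[:scraping_id])
        @offset   = next_batch_offset(@scraping.id)  # locked increment, as sketched earlier
        @sites    = Site.find(:all, :order => 'alexa_rank', :select => 'alexa_rank, id, url',
                              :limit => @scraping.batch_size, :offset => @offset)
        render '/visitations/new.js.erb'             # client scrapes this batch and POSTs back
      else
        # window closed: bump finished_threads (locked transaction elided; see diff)
        # and hide this thread's status indicator, ending its loop
        render :js => "top.document.getElementById('status_#{@thread_id}').hide();"
      end
    end

Note the loop terminates on elapsed time (60 seconds from scraping_start), not on exhausting the URL list.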
29 changes: 29 additions & 0 deletions app/workers/scraping_worker.rb
@@ -0,0 +1,29 @@
class ScrapingWorker < Workling::Base
Workling::Return::Store.instance = Workling::Return::Store::StarlingReturnStore.new
logfile = File.open("#{RAILS_ROOT}/log/#{RAILS_ENV}-background.log", 'a')
logfile.sync = true
BG_LOGGER = Logger.new(logfile)
BG_LOGGER.debug "#{Time.now.to_s}: Loading ScrapingWorker. Return store: #{Workling.return.inspect}"

def process_results(options)
Workling.return.set options[:uid], "Starting results calculation..."
scraping = Scraping.find(options[:scraping_id])
BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Starting results for scraping #{scraping.id}"
sites = scraping.found_sites.find(:all, :select => :url).map(&:url)
Workling.return.set options[:uid], "Calculating results... 1/5"
unfound_sites = scraping.unfound_sites.find(:all, :select => :url).map(&:url)
Workling.return.set options[:uid], "Calculating results... 2/5"
pv = scraping.user.probability_vector
Workling.return.set options[:uid], "Calculating results... 3/5"
probabilities = scraping.user.url_probabilities(pv)
Workling.return.set options[:uid], "Calculating results... 4/5"
avg_up = User.avg_url_probabilities pv.keys
Workling.return.set options[:uid], "Calculating results... 5/5"
BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Returning results for scraping #{scraping.id}"

Workling.return.set options[:uid], :sites => sites, :unfound_sites => unfound_sites, :probabilities => probabilities, :avg_up => avg_up
BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: Processed results for scraping #{scraping.id}"
rescue => e
BG_LOGGER.debug "#{Time.now.to_s}: #{options[:uid]}: ERROR #{e}"
end
end
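For reference, a sketch of how a worker like this is driven under Workling's conventions: the asynch_ prefix enqueues the job and returns a uid, and Workling passes that same uid to the worker as options[:uid] (exact behavior can vary by Workling version, so treat this as an assumption):

    # Enqueue: returns a uid identifying this job's slot in the return store.
    job_id = ScrapingWorker.asynch_process_results(:scraping_id => scraping.id)
    scraping.update_attribute :job_id, job_id

    # Poll (e.g. from ScrapingsController#results): progress strings while the
    # worker runs, then the final results hash once it sets them.
    Workling.return.get(job_id)  # => "Calculating results... 3/5", later the results hash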
