Skip to content

Commit

Permalink
it works! searching 40k source files over 30 git projects in 0.005s
Browse files Browse the repository at this point in the history
(210M total / processing 65 docs/sec)

freaking sweet.
  • Loading branch information
schacon committed Oct 11, 2008
1 parent 02ad5d6 commit 50f792a
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 12 deletions.
7 changes: 7 additions & 0 deletions git_file.rb
@@ -0,0 +1,7 @@
# DataMapper model shared by the indexer and the search server.
# The search server loads a record by id (GitFile.get) and displays its
# repo and path for each search match.
# NOTE(review): presumably one row per indexed blob — confirm against the indexer.
class GitFile
include DataMapper::Resource
# auto-incrementing key; the search server looks records up by this id
property :id, Integer, :serial => true
# repository name shown in search results
property :repo, String
# file path shown in search results
property :path, String
# NOTE(review): presumably the git blob SHA of the file contents — confirm
property :sha_value, String
end
16 changes: 5 additions & 11 deletions git_sphinx.rb
Expand Up @@ -4,14 +4,7 @@
require 'dm-core'
require 'albino'
require 'pp'

# DataMapper model holding the repo, path and sha_value of a file.
# NOTE(review): this definition is identical to the one in git_file.rb, which is
# required immediately below — presumably this inline copy is the one being
# removed by the commit; confirm before keeping both.
class GitFile
include DataMapper::Resource
# auto-incrementing key
property :id, Integer, :serial => true
property :repo, String
property :path, String
property :sha_value, String
end
require 'git_file'

module GitSphinx
class Indexer
Expand All @@ -38,6 +31,7 @@ def print_document(repo, path, modified_date = nil)
g = Grit::Repo.new(path)

# !!! do this for each head, ignoring common blobs, starting with master
Grit::Git.git_timeout = 40
g.git.method_missing('ls_tree', {'full-name' => true, 'r' => true}, 'master').split("\n").each do |line|
(info, path) = line.split("\t")
(mode, type, sha) = info.split(' ')
Expand All @@ -46,14 +40,14 @@ def print_document(repo, path, modified_date = nil)
if type == 'blob' && (lexer != 'plain')
next if !(id = get_object_id(repo, path, sha))
blob = g.blob(sha)

# i'd like to check for binary data here, but I suppose it doesn't really
# matter - i assume sphinx will just ignore it if it can't parse any
# valid chars out of it

content = blob.data.fast_xs
#.fast_xs

puts "<sphinx:document id=\"#{id}\">
<content><![CDATA[#{content}]]></content>
<repository>#{repo}</repository>
Expand Down
11 changes: 10 additions & 1 deletion index.rb
Expand Up @@ -16,7 +16,16 @@
# get repository list from db (based on main or new)
# Each entry of repos is a [project_name, path_to_git_dir] pair.
repos = []
if index_main
  repos << ['fuzed', '/Users/schacon/projects/fuzed/.git']

  # Discover repositories through the locate database: look for the
  # 'description' file inside .git directories below a 'projects' path.
  projects = `locate '.git/description' | grep projects`

  # each_line instead of String#each, which only exists on Ruby 1.8.
  projects.each_line do |project_line|
    segments = project_line.chomp.split('/')
    segments.pop # drop the trailing 'description' component

    # Only accept shallow paths; deeper hits are skipped.
    next unless segments.size < 7

    project = segments.join('/')

    # The project name is the directory that directly contains .git.
    # A-Z (not A-z, which also matches [ \ ] ^ `) and an escaped dot so that
    # only a literal ".git" component matches.
    found = project.match(/\/([a-zA-Z0-9\-_]*?)\/\.git/)
    pname = found && found[1]
    repos << [pname, project] if pname
  end
else
  repos << ['grit', '/Users/schacon/projects/grit/.git']
end
Expand Down
67 changes: 67 additions & 0 deletions search_server.rb
@@ -0,0 +1,67 @@
#! /usr/bin/env ruby
require 'rubygems'
require 'sinatra'
require 'riddle'
require 'dm-core'
require 'dm-aggregates'
require 'git_file'
require 'pp'

# sinatra server for searching your codes
# Point DataMapper's :default repository at the local MySQL database that
# GitFile records are read from.
DataMapper.setup(:default, 'mysql://localhost/git_file_index')
# Global Riddle client used by the POST /search handler.
# NOTE(review): constructed with no arguments, so it presumably talks to a
# searchd on Riddle's default host/port — confirm.
$client = Riddle::Client.new

# Builds the display label for one search hit.
#
# id - the document id of the match (anything responding to to_i); it is
#      used as the GitFile key.
#
# Returns "repo, path" for the matching GitFile record. When no record has
# that id (GitFile.get returned nil, e.g. the search index is out of step
# with the database) a placeholder label is returned instead of raising
# NoMethodError and failing the whole results page.
def show_doc(id)
  doc_id = id.to_i
  f = GitFile.get(doc_id)
  return "(unknown document #{doc_id})" unless f

  [f.repo, f.path].join(', ')
end

# Page shell wrapped around every view: a search form that POSTs to /search,
# followed by the rendered view (inserted at the yield).
template(:layout) do
  '
<html>
<form method="POST" action="/search">
<input name="search" type="text">
<input type="submit">
</form>
<%= yield %>
</html>
'
end

# Landing view shown before any search has been submitted.
template(:index) { '<em>type a search term to begin</em>' }

# Results view. Reads @results (set by the POST /search handler) and shows:
# the total count, one line per match (labelled via show_doc), per-word
# docs/hits statistics, and the query time.
#
# The search words come straight from the user's query and the match labels
# come from repository names and file paths, so both are HTML-escaped before
# being written into the page.
template :results do
  '
<h1><%= @results[:total] %> Results</h1>
<% @results[:matches].each do |match| %>
<li><%= ERB::Util.html_escape(show_doc(match[:doc])) %>
<% end %>
<hr/>
<h3>Matches</h3>
<% @results[:words].each do |word, stats| %>
<li><%= ERB::Util.html_escape(word) %> (docs:<%= stats[:docs] %>, hits:<%= stats[:hits] %>)
<% end %>
<hr/>
Results took : <%= @results[:time] %>s
'
end

# Landing page: renders the :index view.
get('/') { erb(:index) }

# Search handler: passes the submitted "search" form field to the shared
# Riddle client and renders the :results view from what it returns.
post('/search') do
  @results = $client.query(params[:search])
  erb(:results)
end

#collected 40569 docs, 216.5 MB
#sorted 15.8 Mhits, 100.0% done
#total 40569 docs, 216528702 bytes
#total 617.686 sec, 350548.44 bytes/sec, 65.68 docs/sec

0 comments on commit 50f792a

Please sign in to comment.