Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Revert ftsearch removal commit f171c77 for now. Commit again later.
- Loading branch information
Showing
34 changed files
with
1,706 additions
and
62 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,51 +1,46 @@ | ||
require("yajl") | ||
require("picky") | ||
require 'ftsearch/fragment_writer' | ||
require 'ftsearch/analysis/simple_identifier_analyzer' | ||
#require 'ftsearchrt' | ||
|
||
Picky.logger = Picky::Loggers::Silent.new | ||
class Shoes::Search | ||
include FTSearch | ||
attr_reader :index | ||
# Creates the FTSearch fragment writer used as this object's index. | |
# fields - names of the fields to register (defaults to :uri and :body). | |
def initialize fields = [:uri, :body] | |
field_infos = FTSearch::FieldInfos.new | |
fields.each do |name| | |
# Every field is registered with its own SimpleIdentifierAnalyzer instance. | |
field_infos.add_field :name => name, | |
:analyzer => FTSearch::Analysis::SimpleIdentifierAnalyzer.new | |
end | |
# NOTE(review): :path => nil presumably keeps the index in memory instead of on disk - confirm against FragmentWriter. | |
@index = FTSearch::FragmentWriter.new :path => nil, :field_infos => field_infos | |
end | |
# Adds one document to the index; +hsh+ is passed to @index.add_document unchanged. | |
# NOTE(review): presumably hsh is keyed by the field names given to initialize - verify against callers. | |
def add_document hsh | |
@index.add_document hsh | |
end | |
def finish! | ||
@index.finish! | ||
|
||
# Search over documents added with #add_document, backed by a Picky index
# with two categories, :uri and :body.
class Search
  # One indexed document; +id+ is its 1-based insertion position.
  Document = Struct.new :id, :uri, :body

  # Builds the Picky index and search objects, then loads the index from
  # ./index when a previously written index file is newer than the manual
  # source text. When that happens, @update is cleared so that later
  # add_document / finish! calls do not re-index or dump.
  def initialize
    @documents = []
    @index = Picky::Index.new :terms do
      indexing removes_characters: %r{[^a-z0-9\s\/\-\_\:\"\&\.]}i,
               splits_text_on: %r{[\s/\-\_\:\"\&/\.]}
      category :uri, :from => lambda { |doc| doc.uri.dup }
      category :body, :from => lambda { |doc| doc.body.dup }
    end
    @search = Picky::Search.new @index do
      searching removes_characters: %r{[^a-z0-9\s\/\-\_\:\"\&\.]}i,
                splits_text_on: %r{[\s/\-\_\:\"\&/\.]}
    end

    @update = true
    if File.directory?("index")
      file = Dir[File.join("index", "development", "terms", "*")].first
      # The index directory can exist without any term files in it; in that
      # case there is nothing to load and File.mtime(nil) would raise, so
      # fall through and rebuild the index instead.
      if file && File.mtime("#{DIR}/static/manual-en.txt") < File.mtime(file)
        @index.load
        @update = false
      end
    end
  end

  # Records a document and, unless the index was loaded from disk, adds it
  # to the Picky index.
  #
  # terms - Hash with :uri and :body entries.
  def add_document(terms = {})
    @documents << Document.new(@documents.size + 1, terms[:uri], terms[:body])
    @index.add @documents[-1] if @update
  end

  # Runs a Picky search for +terms+.
  #
  # Returns an Array of one-element Arrays, each holding the uri of a matching
  # document that is known to this object.
  def find_all(terms)
    retval = []
    results = @search.search(terms)
    # NOTE(review): the return value is discarded; presumably Picky's
    # Results#sort_by reorders ids in place - confirm before removing.
    results.sort_by { |id| id }
    results.ids.each do |id|
      document = @documents.detect { |n| n.id == id }
      retval << [document.uri] unless document.nil?
    end
    retval
  end

  # Dumps the index to disk, but only when it was (re)built in this session.
  def finish!
    @index.dump if @update
  end
end
@ft = FulltextReader.new :io => StringIO.new(@index.fulltext_writer.data) | ||
@sa = SuffixArrayReader.new @ft, nil, :io => StringIO.new(@index.suffix_array_writer.data) | ||
@dm = DocumentMapReader.new :io => StringIO.new(@index.doc_map_writer.data) | ||
end | ||
# Looks +terms+ up in the suffix array and returns the leading entries of the
# ranked document list.
#
# terms     - query handed to the suffix-array reader's find_all.
# show      - maximum number of results to return (default 20).
# prob_sort - when true and there are more than 10000 hits, rank with
#             rank_offsets_probabilistic instead of rank_offsets.
#
# Returns an Array of [uri, count] pairs with at most +show+ entries.
def find_all terms, show = 20, prob_sort = false
  # Per-field weights keyed by field id, flattened to an array ordered by id.
  weights = Hash.new(1.0)
  weights[0] = 10000000 # :uri
  weights[1] = 10000000 # :body
  weight_arr = weights.sort_by { |id, _| id }.map { |_, w| w }

  hits = @sa.find_all terms
  size = hits.size
  offsets = @sa.lazyhits_to_offsets(hits)
  sorted =
    if prob_sort && size > 10000
      iterations = 50 * Math.sqrt(size)
      @dm.rank_offsets_probabilistic(offsets, weight_arr, iterations)
    else
      @dm.rank_offsets(offsets, weight_arr)
    end

  # Exclusive range: 0..show would hand back show + 1 rows.
  sorted[0...show].map do |doc_id, count|
    [@dm.document_id_to_uri(doc_id), count]
  end
end
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
|
||
This is far from finished, but there's enough done to compare the performance | ||
for some basic searches (word-prefix, word and phrasal). | ||
|
||
If you want to try it, here's what you have to do: | ||
|
||
1) build the extension | ||
$ cd ext/ftsearch && ruby extconf.rb && make | ||
(no need to make install for now, ext/ftsearch is added to $: in the | ||
scripts you'll run) | ||
|
||
I've only tested this under i686-linux; some things are known not to work | |
on 64-bit platforms (a few of these are detected at compile time, and the | |
corresponding optimizations are disabled). | |
|
||
|
||
2) index the corpora with Ferret and FTSearch. | ||
a) Unpack Linux's tree under corpus/linux | ||
b) Run | ||
$ ruby ferret-indexing-benchmark-linux-source.rb | ||
You will find a line like this in | ||
ferret-indexing-benchmark-linux-source.rb: | ||
field_infos.add_field(:body, :store => :yes, :term_vector => :with_positions_offsets) | ||
==== | ||
This controls whether the body is stored. Set it to :no to index faster | ||
(on my box, 2:45 instead of 3:30), but keep in mind that FTSearch's | ||
indexing is equivalent to :store => :yes. | ||
c) Run | ||
$ ruby sample-indexer.rb linux | ||
|
||
Repeat (b), (c) if you want to compare them fairly when corpus/linux/* is | ||
cached. | ||
|
||
3) Searching with Ferret & FTSearch | ||
|
||
$ ruby ferret-lookup.rb | ||
|
||
It will ask you for a query term and show the times/top results. | ||
Enter !queryterm to see how long it takes to get the first match. | ||
Enter an empty term (just press enter) when done. | ||
|
||
$ ruby sample-lookup.rb | ||
|
||
Same interface as ferret-lookup.rb. | ||
|
||
Note: FTSearch uses a suffix-array, so if you look for e.g. "fa", it'll match | ||
faq, fat, fat_entry, ..., making it equivalent to looking for "fa*" with | ||
Ferret. | ||
|
||
FTSearch does phrasal search naturally: if you're looking for "big array", just | |
enter it (without the quotes); with ferret-lookup.rb, you *have* to | |
surround the phrase with quotes. | |
|
||
|
||
LICENSE | ||
======= | ||
Distribution and modification subject to the same terms as Ruby. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# Build configuration script: generates the Makefile for the 'ftsearchrt' extension. | |
# When SYSROOT is set, the Ruby header and library locations are taken from the | |
# TGT_RUBY_V / TGT_RUBY_PATH / TGT_ARCH environment variables instead of the running Ruby. | |
# These RbConfig overrides are made before mkmf is required below. | |
if ENV['SYSROOT'] | |
rblv = ENV['TGT_RUBY_V'] | |
rbroot = ENV['TGT_RUBY_PATH'] | |
rlib = rbroot+"/lib" | |
incl = "#{rbroot}/include/ruby-#{rblv}" | |
incla = "#{incl}/#{ENV['TGT_ARCH']}" | |
RbConfig::CONFIG["rubyhdrdir"] = incl | |
RbConfig::CONFIG["rubyarchhdrdir"] = incla | |
RbConfig::MAKEFILE_CONFIG['libdir'] = rlib # needed for Linking ext.so | |
RbConfig::CONFIG['libdir'] = rlib # needed for conftest | |
if RUBY_PLATFORM =~ /darwin/ | |
# NOTE(review): ARCH_FLAG is set to the SYSROOT value itself; it is not read again in this script - confirm who consumes it. | |
ARCH_FLAG = ENV['SYSROOT'] | |
end | |
end | |
require 'mkmf' | |
# update the CONFIG with the correct values. RbConfig won't work | |
# for cross compiling. This is a bit heavy handed. | |
CONFIG['CC']=ENV['CC'] if ENV['CC'] | |
$CFLAGS += ' -Wno-declaration-after-statement -std=gnu99 -ffast-math' | |
# For a mingw target Ruby: link against the target's bin directory, drop the | |
# default libraries and use the shared-library name from TGT_RUBY_SO. | |
if ENV['SYSROOT'] | |
if ENV['TGT_RUBY_PATH'] =~ /mingw/ | |
$LDFLAGS = "-L #{rbroot}/bin" | |
#puts "$LIBS = #{$LIBS}" | |
$LIBS = "" | |
CONFIG['RUBY_SO_NAME'] = ENV['TGT_RUBY_SO'] | |
end | |
end | |
create_makefile('ftsearchrt') |
Oops, something went wrong.
7733cae
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you. I try to keep the master branch 'always build-able'. When I add new features to master, Shoes is always build-able. Perhaps not all platforms can use the new feature but they all run.