Skip to content

Commit

Permalink
Revert ftsearch removal for #312
Browse files Browse the repository at this point in the history
Revert ftsearch removal commit f171c77 for now. Commit again later.
  • Loading branch information
IanTrudel committed Feb 8, 2017
1 parent d2e4fcc commit 7733cae
Show file tree
Hide file tree
Showing 34 changed files with 1,706 additions and 62 deletions.
2 changes: 1 addition & 1 deletion Rakefile
Expand Up @@ -49,7 +49,7 @@ SHOES_RUBY_ARCH = RbConfig::CONFIG['arch']
# default exts, gems & locations to build and include - replace with custom.yaml
APP['GEMLOC'] = File.expand_path('req')
APP['EXTLOC'] = File.expand_path('req')
APP['EXTLIST'] = ['chipmunk']
APP['EXTLIST'] = ['ftsearch', 'chipmunk']
APP['GEMLIST'] = ['sqlite3']

if File.exists? "crosscompile"
Expand Down
93 changes: 44 additions & 49 deletions lib/shoes/search.rb
@@ -1,51 +1,46 @@
require("yajl")
require("picky")
require 'ftsearch/fragment_writer'
require 'ftsearch/analysis/simple_identifier_analyzer'
#require 'ftsearchrt'

Picky.logger = Picky::Loggers::Silent.new
class Shoes::Search
include FTSearch
attr_reader :index
def initialize fields = [:uri, :body]
field_infos = FTSearch::FieldInfos.new
fields.each do |name|
field_infos.add_field :name => name,
:analyzer => FTSearch::Analysis::SimpleIdentifierAnalyzer.new
end
@index = FTSearch::FragmentWriter.new :path => nil, :field_infos => field_infos
end
def add_document hsh
@index.add_document hsh
end
def finish!
@index.finish!

class Search
Document = Struct.new :id, :uri, :body

def initialize
@documents = []
@index = Picky::Index.new :terms do
indexing removes_characters: %r{[^a-z0-9\s\/\-\_\:\"\&\.]}i,
splits_text_on: %r{[\s/\-\_\:\"\&/\.]}
category :uri, :from => lambda { |doc| doc.uri.dup }
category :body, :from => lambda { |doc| doc.body.dup }
end
@search = Picky::Search.new @index do
searching removes_characters: %r{[^a-z0-9\s\/\-\_\:\"\&\.]}i,
splits_text_on: %r{[\s/\-\_\:\"\&/\.]}
end

@update = true
if File.directory?("index")
file = Dir[File.join("index", "development", "terms", "*")].first
if (File.mtime("#{DIR}/static/manual-en.txt") < File.mtime(file))
@index.load
@update = false
end
end
end

def add_document(terms = {})
@documents << Document.new(@documents.size + 1, terms[:uri], terms[:body])
@index.add @documents[-1] if @update
end

def find_all(terms)
retval = []
results = @search.search(terms)
results.sort_by { |id| id }
results.ids.each do |id|
document = @documents.detect { |n| n.id == id }
retval << [document.uri] unless document.nil?
end
retval
end

def finish!
@index.dump if @update
end
end
@ft = FulltextReader.new :io => StringIO.new(@index.fulltext_writer.data)
@sa = SuffixArrayReader.new @ft, nil, :io => StringIO.new(@index.suffix_array_writer.data)
@dm = DocumentMapReader.new :io => StringIO.new(@index.doc_map_writer.data)
end
def find_all terms, show = 20, prob_sort = false
h = Hash.new{|h,k| h[k] = 0}
weights = Hash.new(1.0)
weights[0] = 10000000 # :uri
weights[1] = 10000000 # :body
hits = @sa.find_all terms
size = hits.size
if prob_sort && size > 10000
iterations = 50 * Math.sqrt(size)
offsets = @sa.lazyhits_to_offsets(hits)
weight_arr = weights.sort_by{|id,w| id}.map{|_,v| v}
sorted = @dm.rank_offsets_probabilistic(offsets, weight_arr, iterations)
else
offsets = @sa.lazyhits_to_offsets(hits)
sorted = @dm.rank_offsets(offsets, weights.sort_by{|id,w| id}.map{|_,v| v})
end
sorted[0..show].map do |doc_id, count|
[@dm.document_id_to_uri(doc_id), count]
end
end
end
6 changes: 3 additions & 3 deletions make/linux/tasks.rb
Expand Up @@ -67,11 +67,11 @@ def common_build
puts "common_build dir=#{pwd} #{SHOES_RUBY_ARCH}"
mkdir_p "#{TGT_DIR}/lib/ruby/#{RUBY_V}/#{SHOES_RUBY_ARCH}"
#cp_r "#{EXT_RUBY}/lib/ruby/#{RUBY_V}", "#{TGT_DIR}/ruby/lib"
%w[req/rake/lib/*].each do |rdir|
%w[req/ftsearch/lib/* req/rake/lib/*].each do |rdir|
FileList[rdir].each { |rlib| cp_r rlib, "#{TGT_DIR}/lib/ruby/#{RUBY_V}" }
end
#%w[req/binject/ext/binject_c req/bloopsaphone/ext/bloops req/chipmunk/ext/chipmunk].
%w[req/chipmunk/ext/chipmunk].
#%w[req/binject/ext/binject_c req/ftsearch/ext/ftsearchrt req/bloopsaphone/ext/bloops req/chipmunk/ext/chipmunk].
%w[req/ftsearch/ext/ftsearchrt req/chipmunk/ext/chipmunk].
each { |xdir| copy_ext xdir, "#{TGT_DIR}/lib/ruby/#{RUBY_V}/#{SHOES_RUBY_ARCH}" }

gdir = "#{TGT_DIR}/lib/ruby/gems/#{RUBY_V}"
Expand Down
5 changes: 3 additions & 2 deletions make/make.rb
Expand Up @@ -64,10 +64,11 @@ def common_build
rm_rf "dist/ruby/lib/#{libn}"
end
end
%w[req/rake/lib/*].each do |rdir|
%w[req/ftsearch/lib/* req/rake/lib/*].each do |rdir|
FileList[rdir].each { |rlib| cp_r rlib, "dist/ruby/lib" }
end
%w[req/binject/ext/binject_c req/bloopsaphone/ext/bloops req/chipmunk/ext/chipmunk].
#%w[req/ftsearch/ext/ftsearchrt].
%w[req/binject/ext/binject_c req/ftsearch/ext/ftsearchrt req/bloopsaphone/ext/bloops req/chipmunk/ext/chipmunk].
each { |xdir| copy_ext xdir, "dist/ruby/lib/#{SHOES_RUBY_ARCH}" }

gdir = "dist/ruby/gems/#{RUBY_V}"
Expand Down
2 changes: 1 addition & 1 deletion make/mavericks/tasks.rb
Expand Up @@ -72,7 +72,7 @@ def pre_build
puts "Entering osx pre_build #{TGT_DIR}"
rm_rf "#{TGT_DIR}"
# copy Ruby, dylib, includes - have them in place before
# we build exts (binject, bloopsaphone, chipmunk, sqlite3 and winject).
# we build exts (ftsearch).
puts "Ruby at #{EXT_RUBY}"
rbvt = RUBY_V
rbvm = RUBY_V[/^\d+\.\d+/]
Expand Down
2 changes: 1 addition & 1 deletion make/snow/tasks.rb
Expand Up @@ -72,7 +72,7 @@ def pre_build
puts "Entering osx pre_build #{TGT_DIR}"
rm_rf "#{TGT_DIR}"
# copy Ruby, dylib, includes - have them in place before
# we build exts (binject, bloopsaphone, chipmunk, sqlite3 and winject).
# we build exts (ftsearch).
puts "Ruby at #{EXT_RUBY}"
rbvt = RUBY_V
rbvm = RUBY_V[/^\d+\.\d+/]
Expand Down
2 changes: 1 addition & 1 deletion make/xmavericks/env.rb
Expand Up @@ -72,7 +72,7 @@
LINUX_CFLAGS << ' -Wno-incompatible-pointer-types-discards-qualifiers'

OSX_ARCH = '-arch x86_64'
# These env vars are used in chipmunk extconf.rb
# These env vars are used in ftsearch, chipmunk extconf.rb
#SHOES_TGT_ARCH = SHOES_GEM_ARCH ='x86_64-darwin13.0'
SHOES_TGT_ARCH = SHOES_GEM_ARCH ='x86_64-darwin13'
ENV['CC'] = CC
Expand Down
2 changes: 1 addition & 1 deletion make/xmavericks/tasks.rb
Expand Up @@ -72,7 +72,7 @@ def pre_build
puts "Entering osx pre_build #{TGT_DIR}"
rm_rf "#{TGT_DIR}"
# copy Ruby, dylib, includes - have them in place before
# we build exts (binject, bloopsaphone, chipmunk, sqlite3 and winject).
# we build exts (ftsearch).
puts "Ruby at #{EXT_RUBY}"
rbvt = RUBY_V
rbvm = RUBY_V[/^\d+\.\d+/]
Expand Down
2 changes: 1 addition & 1 deletion make/xsnow/env.rb
Expand Up @@ -68,7 +68,7 @@
LINUX_CFLAGS << ' -Wno-incompatible-pointer-types-discards-qualifiers'

OSX_ARCH = '-arch x86_64'
# These env vars are used in chipmunk extconf.rb
# These env vars are used in ftsearch, chipmunk extconf.rb
SHOES_TGT_ARCH = SHOES_GEM_ARCH ='x86_64-darwin10.0'
ENV['CC'] = CC
ENV['TGT_RUBY_PATH'] = EXT_RUBY
Expand Down
2 changes: 1 addition & 1 deletion make/xsnow/tasks.rb
Expand Up @@ -72,7 +72,7 @@ def pre_build
puts "Entering osx pre_build #{TGT_DIR}"
rm_rf "#{TGT_DIR}"
# copy Ruby, dylib, includes - have them in place before
# we build exts (binject, bloopsaphone, chipmunk, sqlite3 and winject).
# we build exts (ftsearch).
puts "Ruby at #{EXT_RUBY}"
rbvt = RUBY_V
rbvm = RUBY_V[/^\d+\.\d+/]
Expand Down
2 changes: 1 addition & 1 deletion make/yosemite/tasks.rb
Expand Up @@ -72,7 +72,7 @@ def pre_build
puts "Entering osx pre_build #{TGT_DIR}"
rm_rf "#{TGT_DIR}"
# copy Ruby, dylib, includes - have them in place before
# we build exts (binject, bloopsaphone, chipmunk, sqlite3 and winject).
# we build exts (ftsearch).
puts "Ruby at #{EXT_RUBY}"
rbvt = RUBY_V
rbvm = RUBY_V[/^\d+\.\d+/]
Expand Down
3 changes: 3 additions & 0 deletions platform/nix/Makefile
Expand Up @@ -74,6 +74,9 @@ dist/shoes.launch: dist/shoes-bin
@cd req/sqlite3/ext/sqlite3 && ruby extconf.rb && make
@cp req/sqlite3/ext/sqlite3/*.so ${GEM_DIR}/gems/${SQLITE_NAME}/lib
@cp req/sqlite3/gemspec ${GEM_DIR}/specifications/${SQLITE_NAME}.gemspec
@cp -r req/ftsearch/lib/* dist/lib
@cd req/ftsearch/ext/ftsearchrt && ruby extconf.rb && make
@cp req/ftsearch/ext/ftsearchrt/*.so dist/lib
@cd req/binject/ext/binject_c && ruby extconf.rb && make
@cp req/binject/ext/binject_c/*.so dist/lib
@cp -r samples dist/samples
Expand Down
57 changes: 57 additions & 0 deletions req/ftsearch/README
@@ -0,0 +1,57 @@

This is far from finished, but there's enough done to compare the performance
for some basic searches (word-prefix, word and phrasal).

If you want to try it, here's what you have to do:

1) build the extension
$ cd ext/ftsearchrt && ruby extconf.rb && make
(no need to make install for now; the extension directory is added to $: in
the scripts you'll run)

I've only tested this under i686-linux; some things are known not to work
on 64-bit platforms (but a few are detected at compile time, and the
corresponding optimizations are disabled).


2) index the corpora with Ferret and FTSearch.
a) Unpack Linux's tree under corpus/linux
b) Run
$ ruby ferret-indexing-benchmark-linux-source.rb
You will find a line like this in
ferret-indexing-benchmark-linux-source.rb:
field_infos.add_field(:body, :store => :yes, :term_vector => :with_positions_offsets)
====
This controls whether the body is stored. Set it to :no to index faster
(on my box, 2:45 instead of 3:30), but keep in mind that FTSearch's
indexing is equivalent to :store => :yes.
c) Run
$ ruby sample-indexer.rb linux

Repeat (b), (c) if you want to compare them fairly when corpus/linux/* is
cached.

3) Searching with Ferret & FTSearch

$ ruby ferret-lookup.rb

It will ask you for a query term and show the times/top results.
Enter !queryterm to see how long it takes to get the first match.
Enter an empty term (just press enter) when done.

$ ruby sample-lookup.rb

Same interface as ferret-lookup.rb.

Note: FTSearch uses a suffix-array, so if you look for e.g. "fa", it'll match
faq, fat, fat_entry, ..., making it equivalent to looking for "fa*" with
Ferret.

FTSearch does phrasal search naturally: if you're looking for "big array", just
enter it (without the quotes); with ferret-lookup.rb, you *have* to
surround the phrase with quotes.


LICENSE
=======
Distribution and modification subject to the same terms as Ruby.
28 changes: 28 additions & 0 deletions req/ftsearch/ext/ftsearchrt/extconf.rb
@@ -0,0 +1,28 @@
# mkmf build script for the 'ftsearchrt' C extension.
#
# When the SYSROOT environment variable is set, the Ruby header and library
# locations are taken from TGT_* environment variables rather than from the
# RbConfig of the interpreter running this script.
if ENV['SYSROOT']
# Target Ruby version and install root, both supplied through the environment.
rblv = ENV['TGT_RUBY_V']
rbroot = ENV['TGT_RUBY_PATH']
rlib = rbroot+"/lib"
# Header directories: <root>/include/ruby-<version> and its TGT_ARCH subdirectory.
incl = "#{rbroot}/include/ruby-#{rblv}"
incla = "#{incl}/#{ENV['TGT_ARCH']}"
# These overrides are applied before mkmf is required below.
RbConfig::CONFIG["rubyhdrdir"] = incl
RbConfig::CONFIG["rubyarchhdrdir"] = incla
RbConfig::MAKEFILE_CONFIG['libdir'] = rlib # needed for Linking ext.so
RbConfig::CONFIG['libdir'] = rlib # needed for conftest
if RUBY_PLATFORM =~ /darwin/
# NOTE(review): ARCH_FLAG is set to the SYSROOT value verbatim; presumably
# mkmf picks this constant up -- confirm against the callers' env.rb files.
ARCH_FLAG = ENV['SYSROOT']
end
end
require 'mkmf'
# update the CONFIG with the correct values. RbConfig won't work
# for cross compiling. This is a bit heavy handed.
CONFIG['CC']=ENV['CC'] if ENV['CC']
# Extra compiler flags appended for this extension.
$CFLAGS += ' -Wno-declaration-after-statement -std=gnu99 -ffast-math'
if ENV['SYSROOT']
# This branch runs only when the target Ruby path contains "mingw".
if ENV['TGT_RUBY_PATH'] =~ /mingw/
# Replaces (does not append to) $LDFLAGS with a search path of <root>/bin.
$LDFLAGS = "-L #{rbroot}/bin"
#puts "$LIBS = #{$LIBS}"
# Reset $LIBS to an empty string.
$LIBS = ""
# Shared-library name of the target Ruby, taken from TGT_RUBY_SO.
CONFIG['RUBY_SO_NAME'] = ENV['TGT_RUBY_SO']
end
end
# Generate the Makefile for the 'ftsearchrt' extension.
create_makefile('ftsearchrt')

1 comment on commit 7733cae

@ccoupe
Copy link

@ccoupe ccoupe commented on 7733cae Feb 8, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you. I try to keep the master branch 'always build-able'. When I add new features to master, Shoes is always build-able. Perhaps not all platforms can use the new feature but they all run.

Please sign in to comment.