Skip to content
This repository has been archived by the owner on Apr 12, 2022. It is now read-only.

Add the ability to match on phrase instead of word. #23

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions lib/soulmate.rb
Expand Up @@ -7,6 +7,8 @@
require 'soulmate/base'
require 'soulmate/matcher'
require 'soulmate/loader'
require 'soulmate/phrase_loader'
require 'soulmate/phrase_matcher'

module Soulmate

Expand Down
9 changes: 7 additions & 2 deletions lib/soulmate/helpers.rb
@@ -1,18 +1,23 @@
module Soulmate
module Helpers

def prefixes_for_phrase(phrase)
# Builds the index prefixes for each significant word of +phrase+.
#
# The phrase is run through +normalize+ and split on spaces; any word listed
# in Soulmate.stop_words is discarded. Every remaining word contributes its
# leading substrings, from MIN_COMPLETE characters long up to the whole word.
# Words shorter than MIN_COMPLETE contribute nothing.
#
# Returns a flat Array of unique prefix Strings, in first-seen order.
def word_prefixes_for_phrase(phrase)
  significant_words = normalize(phrase).split(' ').reject { |word| Soulmate.stop_words.include?(word) }

  per_word = significant_words.map do |word|
    # last_index is the index of the final character of each prefix
    ((MIN_COMPLETE - 1)..(word.length - 1)).map { |last_index| word[0..last_index] }
  end

  per_word.flatten.uniq
end

# Lists the leading substrings of +phrase+ treated as one unit, from
# MIN_COMPLETE characters long up to the whole phrase.
#
# The phrase is used exactly as given: it is not normalized and no stop
# words are removed, so case, spaces and punctuation all end up in the
# prefixes. A phrase shorter than MIN_COMPLETE yields an empty Array.
#
# Returns an Array of unique prefixes, shortest first.
def prefixes_for_phrase(phrase)
  shortest_end = MIN_COMPLETE - 1
  longest_end = phrase.length - 1

  candidates = (shortest_end..longest_end).map { |last_index| phrase[0..last_index] }
  candidates.flatten.uniq
end

# Canonicalizes +str+ for indexing and lookup: lowercases it, deletes every
# character matched by /[^a-z0-9 ]/i (punctuation, symbols, ...), and trims
# leading and trailing whitespace. Interior spaces are left as they are.
#
# Returns a new String.
def normalize(str)
  lowered = str.downcase
  cleaned = lowered.gsub(/[^a-z0-9 ]/i, '')
  cleaned.strip
end

end
end
end
6 changes: 3 additions & 3 deletions lib/soulmate/loader.rb
Expand Up @@ -37,7 +37,7 @@ def add(item, opts = {})
# store the raw data in a separate key to reduce memory usage
Soulmate.redis.hset(database, item["id"], MultiJson.encode(item))
phrase = ([item["term"]] + (item["aliases"] || [])).join(' ')
prefixes_for_phrase(phrase).each do |p|
word_prefixes_for_phrase(phrase).each do |p|
Soulmate.redis.sadd(base, p) # remember this prefix in a master set
Soulmate.redis.zadd("#{base}:#{p}", item["score"], item["id"]) # store the id of this term in the index
end
Expand All @@ -53,12 +53,12 @@ def remove(item)
Soulmate.redis.pipelined do
Soulmate.redis.hdel(database, prev_item["id"])
phrase = ([prev_item["term"]] + (prev_item["aliases"] || [])).join(' ')
prefixes_for_phrase(phrase).each do |p|
word_prefixes_for_phrase(phrase).each do |p|
Soulmate.redis.srem(base, p)
Soulmate.redis.zrem("#{base}:#{p}", prev_item["id"])
end
end
end
end
end
end
end
45 changes: 45 additions & 0 deletions lib/soulmate/phrase_loader.rb
@@ -0,0 +1,45 @@
module Soulmate

  # Loader variant that indexes whole phrases instead of individual words.
  #
  # An item's term and each of its aliases are indexed separately. Every
  # prefix returned by +prefixes_for_phrase+ is recorded in the master prefix
  # set stored at +base+, and the item's id is added to the sorted set stored
  # at "#{base}:#{prefix}". +base+, +database+ and +prefixes_for_phrase+ are
  # not defined in this class; presumably they are inherited via Loader.
  class PhraseLoader < Loader

    # Adds +item+ to the phrase index.
    #
    # item - Hash with String keys: "id", "term", "score", "aliases", "data".
    #        "id" and "term" are required.
    # opts - Hash; :skip_duplicate_check => true skips the removal of any
    #        previously stored item with the same id (default: false).
    #
    # Raises ArgumentError when "id" or "term" is missing.
    def add(item, opts = {})
      settings = { :skip_duplicate_check => false }.merge(opts)
      raise ArgumentError unless item["id"] && item["term"]

      # kill any old items with this id
      remove("id" => item["id"]) unless settings[:skip_duplicate_check]

      Soulmate.redis.pipelined do
        # store the raw data in a separate key to reduce memory usage
        Soulmate.redis.hset(database, item["id"], MultiJson.encode(item))

        each_phrase_prefix(item) do |prefix|
          # remember this prefix in a master set
          Soulmate.redis.sadd(base, prefix)
          # store the id of this term in the index
          Soulmate.redis.zadd("#{base}:#{prefix}", item["score"], item["id"])
        end
      end
    end

    # Removes the stored item whose id matches item["id"].
    #
    # Only the id is read from +item+; a full object is accepted for
    # consistency with +add+. The prefixes to clean up are recomputed from the
    # copy saved by +add+. Does nothing when no item is stored under that id.
    #
    # NOTE(review): each prefix is dropped from the master set even if other
    # items still share it -- confirm this is intended.
    def remove(item)
      stored = Soulmate.redis.hget(database, item["id"])
      return unless stored

      previous = MultiJson.decode(stored)

      # undo the operations done in add
      Soulmate.redis.pipelined do
        Soulmate.redis.hdel(database, previous["id"])

        each_phrase_prefix(previous) do |prefix|
          Soulmate.redis.srem(base, prefix)
          Soulmate.redis.zrem("#{base}:#{prefix}", previous["id"])
        end
      end
    end

    private

    # Yields, in order, every prefix of the item's term and then of each alias.
    def each_phrase_prefix(item)
      phrases = [item["term"]] + (item["aliases"] || [])
      phrases.each do |phrase|
        prefixes_for_phrase(phrase).each { |prefix| yield prefix }
      end
    end
  end
end
28 changes: 28 additions & 0 deletions lib/soulmate/phrase_matcher.rb
@@ -0,0 +1,28 @@
module Soulmate

  # Matcher variant that looks a term up as one whole-phrase prefix instead
  # of intersecting per-word prefixes.
  #
  # +base+, +cachebase+ and +database+ are not defined in this class;
  # presumably they are provided by Base.
  class PhraseMatcher < Base

    # Finds the stored items indexed under the prefix +term+.
    #
    # term    - String typed so far. It is used verbatim as the index key
    #           (no normalization). nil or "" yields no matches.
    # options - Hash:
    #           :limit - maximum number of items to return (default: 5)
    #           :cache - reuse a previously cached result set when one
    #                    exists (default: true)
    #
    # Returns an Array of decoded items, highest score first, or [] when
    # nothing matches.
    def matches_for_term(term, options = {})
      options = { :limit => 5, :cache => true }.merge(options)

      # A missing term (e.g. an absent request parameter) simply has no
      # matches; without the nil check this raised NoMethodError.
      return [] if term.nil? || term.empty?

      cachekey = "#{cachebase}:" + term

      # NOTE(review): this relies on `exists` returning true/false; newer
      # redis-rb releases may return an Integer here -- verify against the
      # client version in use.
      if !options[:cache] || !Soulmate.redis.exists(cachekey)
        # With a single source key, zinterstore copies that sorted set into
        # the cache key.
        Soulmate.redis.zinterstore(cachekey, ["#{base}:#{term}"])
        Soulmate.redis.expire(cachekey, 10 * 60) # expire after 10 minutes
      end

      ids = Soulmate.redis.zrevrange(cachekey, 0, options[:limit] - 1)
      return [] if ids.empty?

      # hmget returns nil for ids deleted since the result set was cached
      stored = Soulmate.redis.hmget(database, *ids).compact
      stored.map { |raw| MultiJson.decode(raw) }
    end
  end
end
3 changes: 2 additions & 1 deletion lib/soulmate/server.rb
Expand Up @@ -23,10 +23,11 @@ class Server < Sinatra::Base
limit = (params[:limit] || 5).to_i
types = params[:types].map { |t| normalize(t) }
term = params[:term]
matcher_class = params[:phrase] ? PhraseMatcher : Matcher

results = {}
types.each do |type|
matcher = Matcher.new(type)
matcher = matcher_class.new(type)
results[type] = matcher.matches_for_term(term, :limit => limit)
end

Expand Down
19 changes: 13 additions & 6 deletions test/test_soulmate.rb
Expand Up @@ -91,15 +91,22 @@ def test_can_update_items

end

def test_prefixes_for_phrase
def test_prefixes_for_phrase_words
loader = Soulmate::Loader.new('venues')

Soulmate.stop_words = ['the']

assert_equal ["kn", "kni", "knic", "knick", "knicks"], loader.prefixes_for_phrase("the knicks")
assert_equal ["te", "tes", "test", "testi", "testin", "th", "thi", "this"], loader.prefixes_for_phrase("testin' this")
assert_equal ["te", "tes", "test", "testi", "testin", "th", "thi", "this"], loader.prefixes_for_phrase("testin' this")
assert_equal ["te", "tes", "test"], loader.prefixes_for_phrase("test test")
assert_equal ["so", "sou", "soul", "soulm", "soulma", "soulmat", "soulmate"], loader.prefixes_for_phrase("SoUlmATE")
assert_equal ["kn", "kni", "knic", "knick", "knicks"], loader.word_prefixes_for_phrase("the knicks")
assert_equal ["te", "tes", "test", "testi", "testin", "th", "thi", "this"], loader.word_prefixes_for_phrase("testin' this")
assert_equal ["te", "tes", "test", "testi", "testin", "th", "thi", "this"], loader.word_prefixes_for_phrase("testin' this")
assert_equal ["te", "tes", "test"], loader.word_prefixes_for_phrase("test test")
assert_equal ["so", "sou", "soul", "soulm", "soulma", "soulmat", "soulmate"], loader.word_prefixes_for_phrase("SoUlmATE")
end

# PhraseLoader#prefixes_for_phrase works on the phrase as a whole: stop
# words, spaces and punctuation all remain part of the prefixes.
def test_prefixes_for_phrase
  phrase_loader = Soulmate::PhraseLoader.new('venues')

  expectations = [
    ["the knicks", ["th", "the", "the ", "the k", "the kn", "the kni", "the knic", "the knick", "the knicks"]],
    # We don't normalise because it gets messy with whole phrases
    ["it's", ["it", "it'", "it's"]]
  ]

  expectations.each do |phrase, expected_prefixes|
    assert_equal expected_prefixes, phrase_loader.prefixes_for_phrase(phrase)
  end
end
end