Skip to content

Commit

Permalink
Added find_in_batches which yields a chunk of solr records
Browse files Browse the repository at this point in the history
  • Loading branch information
jcoyne committed May 12, 2012
1 parent d71234f commit 088dba1
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 43 deletions.
10 changes: 5 additions & 5 deletions Gemfile.lock
@@ -1,7 +1,7 @@
PATH
remote: .
specs:
active-fedora (4.0.0)
active-fedora (4.1.0)
activeresource (>= 3.0.0)
activesupport (>= 3.0.0)
equivalent-xml
Expand Down Expand Up @@ -30,7 +30,7 @@ GEM
activesupport (3.2.2)
i18n (~> 0.6)
multi_json (~> 1.0)
addressable (2.2.7)
addressable (2.2.8)
akami (1.0.0)
gyoku (>= 0.4.0)
builder (3.0.0)
Expand All @@ -44,7 +44,7 @@ GEM
ffi (1.0.11)
gyoku (0.4.4)
builder (>= 2.1.2)
httpi (0.9.6)
httpi (0.9.7)
rack
i18n (0.6.0)
jettywrapper (1.2.1)
Expand Down Expand Up @@ -76,7 +76,7 @@ GEM
rdf (>= 0.3.4)
rest-client (1.6.7)
mime-types (>= 1.16)
rsolr (1.0.7)
rsolr (1.0.8)
builder (>= 2.1.2)
rspec (2.9.0)
rspec-core (~> 2.9.0)
Expand Down Expand Up @@ -108,7 +108,7 @@ GEM
simplecov-html (0.5.3)
simplecov-rcov (0.2.3)
simplecov (>= 0.4.1)
solrizer (1.2.0)
solrizer (1.2.1)
daemons
mediashelf-loggable (~> 0.4.7)
nokogiri
Expand Down
1 change: 1 addition & 0 deletions History.txt
@@ -1,5 +1,6 @@
4.2.0
Added Base.find_each which allows yielding of each result
Added Base.find_in_batches which allows yielding of a chunk of solr results

4.1.0
ActiveFedora::Base.find() now supports filtering by solr fields (boolean AND).
Expand Down
100 changes: 72 additions & 28 deletions lib/active_fedora/model.rb
Expand Up @@ -102,32 +102,58 @@ def to_class_uri(attrs = {})
# @option opts [Integer] :rows when :all is passed, the maximum number of rows to load from solr
# @option opts [Boolean] :cast when true, examine the model and cast it to the first known cModel
def find(args, opts={})
opts = {:rows=>25}.merge(opts)
return find_one(args, opts[:cast]) if args.class == String
args = {} if args == :all
hits = find_with_conditions(args, opts.merge({:fl=>SOLR_DOCUMENT_ID}))
hits.map do |hit|
pid = hit[SOLR_DOCUMENT_ID]
find_one(pid, opts[:cast])
results = []
find_each(args, opts) do |obj|
results << obj
end
results
end

# Yields the found object to the passed block

# Yields each batch of solr records that was found by the find +options+ as
# an array. The size of each batch is set by the <tt>:batch_size</tt>
# option; the default is 1000.
#
# @param[ Hash] options
# @option opts [Integer] :conditions a list of conditions for the search to match
# @option opts [Integer] :rows when :all is passed, the maximum number of rows to load from solr
# Yields the solr results matching the supplied conditions, one batch at a time
# @param [Hash] conditions solr conditions to match
# @param [Hash] opts
# @option opts [Array] :sort a list of fields to sort by
# @option opts [Integer] :rows number of rows to return
# @option opts [Integer] :batch_size number of solr records in each batch (default 1000)
#
# @example
# Person.find_in_batches({'age_t'=>'21'}, {:batch_size=>50}) do |group|
# group.each { |person| puts person['name_t'] }
# end

def find_in_batches(conditions, opts={})
  # Work on a copy: the query, the default sort and the removal of
  # :batch_size below must not leak into the hash the caller passed in.
  params = opts.dup
  params[:q] = create_query(conditions)

  # Default to created date ascending when the caller did not ask for a sort.
  unless params.include?(:sort)
    params[:sort] = [ActiveFedora::SolrService.solr_name(:system_create, :date) + ' asc']
  end

  # :batch_size is consumed here as the page size; it is not forwarded in :params.
  batch_size = params.delete(:batch_size) || 1000

  # Always fetch and yield the first page, then keep paging for as long as
  # the returned doc set reports that another page is available.
  page = 0
  more_pages = true
  while more_pages
    page += 1
    response = ActiveFedora::SolrService.instance.conn.paginate(page, batch_size, "select", :params => params)
    docs = response["response"]["docs"]
    yield docs
    # NOTE(review): has_next? is presumably provided by rsolr's paginated doc set - confirm.
    more_pages = docs.has_next?
  end
end

# Yields the found ActiveFedora::Base object to the passed block
#
# @param [Hash] conditions the conditions for the solr search to match
# @param [Hash] opts
# @option opts [Boolean] :cast when true, examine the model and cast it to the first known cModel
def find_each(opts={})
opts = {:rows=>25}.merge(opts)
conditions = opts.delete(:conditions) || {}
hits = find_with_conditions(conditions, opts.merge({:fl=>SOLR_DOCUMENT_ID}))

hits.each do |hit|
pid = hit[SOLR_DOCUMENT_ID]
if pid.present?
obj=find_one(pid, opts[:cast])
yield(obj)
def find_each( conditions={}, opts={})
find_in_batches(conditions, opts.merge({:fl=>SOLR_DOCUMENT_ID})) do |group|
group.each do |hit|
yield(find_one(hit[SOLR_DOCUMENT_ID], opts[:cast]))
end
end
end
Expand All @@ -141,6 +167,7 @@ def exists?(pid)
!inner.new?
end

#@deprecated
def find_model(pid)
ActiveSupport::Deprecation.warn("find_model is deprecated. Use find instead")
find(pid)
Expand All @@ -156,8 +183,10 @@ def count(args = {})
SolrService.query(q, :raw=>true, :rows=>0)['response']['numFound']
end

#@deprecated
#Sends a query directly to SolrService
def solr_search(query, args={})
  # Warn first so the notice is emitted even if the query itself fails.
  deprecation_notice = "solr_search is deprecated and will be removed in the next release. Use SolrService.query instead"
  ActiveSupport::Deprecation.warn(deprecation_notice)

  # Hand the query and its arguments to the solr connection unchanged.
  solr_connection = SolrService.instance.conn
  solr_connection.query(query, args)
end

Expand All @@ -182,6 +211,7 @@ def find_by_solr(query, args={})
end
end

# @deprecated
# Find all ActiveFedora objects for this model that match arguments
# passed in by querying Solr. Like find_by_solr this returns a solr result.
#
Expand Down Expand Up @@ -268,8 +298,24 @@ def find_by_fields_by_solr(query_fields,opts={})
SolrService.query(query, query_opts)
end

# @param[Hash] conditions
# Returns a solr result matching the supplied conditions
# @param [Hash] conditions solr conditions to match
# @param [Hash] opts
# @option opts [Array] :sort a list of fields to sort by
# @option opts [Integer] :rows number of rows to return
def find_with_conditions(conditions, opts={})
  query = create_query(conditions)

  # Work on a copy so the default sort below is not written into the
  # hash the caller passed in.
  query_opts = opts.dup

  # Default to created date ascending when the caller did not ask for a sort.
  unless query_opts.include?(:sort)
    query_opts[:sort] = [ActiveFedora::SolrService.solr_name(:system_create, :date) + ' asc']
  end

  SolrService.query(query, query_opts)
end


# Returns a solr query for the supplied conditions
# @param [Hash] conditions solr conditions to match
def create_query(conditions)
escaped_class_uri = SolrService.escape_uri_for_query(self.to_class_uri)
clauses = ["#{ActiveFedora::SolrService.solr_name(:has_model, :symbol)}:#{escaped_class_uri}"]
conditions.each_pair do |key,value|
Expand All @@ -286,13 +332,7 @@ def find_with_conditions(conditions, opts={})
end
end

query = clauses.join(" AND ")

#set default sort to created date ascending
unless opts.include?(:sort)
opts[:sort]=[ActiveFedora::SolrService.solr_name(:system_create,:date)+' asc']
end
SolrService.query(query, opts)
clauses.join(" AND ")
end

def quote_for_solr(value)
Expand All @@ -309,18 +349,20 @@ def class_fields
return fields
end

#TODO remove
#wrapper around instance_variable_set, sets @name to value
def attribute_set(name, value)
  # Build the "@name" identifier first, then assign the value to that
  # instance variable on the receiver.
  ivar_name = "@#{name}"
  instance_variable_set(ivar_name, value)
end

#TODO remove
#wrapper around instance_variable_get, returns current value of @name
def attribute_get(name)
  # Build the "@name" identifier first, then read that instance variable
  # from the receiver.
  ivar_name = "@#{name}"
  instance_variable_get(ivar_name)
end

private
# Retrieve the Fedora object with te given pid, explore the returned object, determine its model
# Retrieve the Fedora object with the given pid, explore the returned object, determine its model
# using #{ActiveFedora::ContentModel.known_models_for} and cast to that class.
# Raises a ObjectNotFoundError if the object is not found.
# @param [String] pid of the object to load
Expand All @@ -336,6 +378,7 @@ def find_one(pid, cast=false)
end

end
#TODO remove
def create_property_getter(property) # :nodoc:

class_eval <<-END, __FILE__, __LINE__
Expand All @@ -345,6 +388,7 @@ def #{property.name}
END
end

#TODO remove
def create_property_setter(property)# :nodoc:
class_eval <<-END, __FILE__, __LINE__
def #{property.name}=(value)
Expand Down
45 changes: 35 additions & 10 deletions spec/unit/model_spec.rb
Expand Up @@ -73,10 +73,13 @@ def initialize (args = {})
describe '#find' do
describe "without :cast" do
it "(:all) should query solr for all objects with :active_fedora_model_s of self.class" do
ActiveFedora::SolrService.expects(:query).with('has_model_s:info\\:fedora/afmodel\\:SpecModel_Basic', :fl => 'id', :sort => ['system_create_dt asc'], :rows=>1001).returns([{"id" => "changeme:30"}, {"id" => "changeme:22"}])
SpecModel::Basic.expects(:find_one).with("changeme:30", nil).returns("Fake Object1")
SpecModel::Basic.expects(:find_one).with("changeme:22", nil).returns("Fake Object2")
SpecModel::Basic.find(:all, :rows=>1001).should == ["Fake Object1", "Fake Object2"]
mock_docs = mock('docs')
mock_docs.expects(:each).multiple_yields([{"id" => "changeme:30"}],[{"id" => "changeme:22"}])
mock_docs.expects(:has_next?).returns(false)
ActiveFedora::SolrService.instance.conn.expects(:paginate).with(1, 1000, 'select', :params=>{:q=>'has_model_s:info\\:fedora/afmodel\\:SpecModel_Basic', :sort => ['system_create_dt asc'], :fl=> 'id', }).returns('response'=>{'docs'=>mock_docs})
SpecModel::Basic.find(:all).should == ["Fake Object1", "Fake Object2"]
end
it "should use SpecModel::Basic.allocate.init_with to instantiate an object" do
SpecModel::Basic.any_instance.expects(:init_with).returns(SpecModel::Basic.new)
Expand All @@ -96,36 +99,58 @@ def initialize (args = {})
end
end

describe "with conditions hash" do
describe "with conditions" do
it "should filter by the provided fields" do
SpecModel::Basic.expects(:find_one).with("changeme:30", nil).returns("Fake Object1")
SpecModel::Basic.expects(:find_one).with("changeme:22", nil).returns("Fake Object2")

ActiveFedora::SolrService.expects(:query).with('has_model_s:info\\:fedora/afmodel\\:SpecModel_Basic AND foo:"bar" AND baz:"quix" AND baz:"quack"', {:sort => ['system_create_dt asc'], :fl=> 'id', :rows=>1002}).returns([{"id" => "changeme:30"}, {"id" => "changeme:22"}])
SpecModel::Basic.find({:foo=>'bar', :baz=>['quix','quack']}, {:rows=>1002}).should == ["Fake Object1", "Fake Object2"]
mock_docs = mock('docs')
mock_docs.expects(:each).multiple_yields([{"id" => "changeme:30"}],[{"id" => "changeme:22"}])
mock_docs.expects(:has_next?).returns(false)
ActiveFedora::SolrService.instance.conn.expects(:paginate).with(1, 1000, 'select', :params=>{:q=>'has_model_s:info\\:fedora/afmodel\\:SpecModel_Basic AND foo:"bar" AND baz:"quix" AND baz:"quack"', :sort => ['system_create_dt asc'], :fl=> 'id', }).returns('response'=>{'docs'=>mock_docs})
SpecModel::Basic.find({:foo=>'bar', :baz=>['quix','quack']}).should == ["Fake Object1", "Fake Object2"]
end
end
end

describe '#find_each' do
it "should query solr for all objects with :active_fedora_model_s of self.class" do
ActiveFedora::SolrService.expects(:query).with('has_model_s:info\\:fedora/afmodel\\:SpecModel_Basic', :rows=>1001, :fl=>'id',:sort => ['system_create_dt asc']).returns([{"id" => "changeme:30"}, {"id" => "changeme:22"}])
mock_docs = mock('docs')
mock_docs.expects(:each).multiple_yields([{"id" => "changeme:30"}],[{"id" => "changeme:22"}])
mock_docs.expects(:has_next?).returns(false)
ActiveFedora::SolrService.instance.conn.expects(:paginate).with(1, 1000, 'select', :params=>{:q=>'has_model_s:info\\:fedora/afmodel\\:SpecModel_Basic', :sort => ['system_create_dt asc'], :fl=> 'id', }).returns('response'=>{'docs'=>mock_docs})

SpecModel::Basic.expects(:find_one).with("changeme:30", nil).returns(SpecModel::Basic.new(:pid=>'changeme:30'))
SpecModel::Basic.expects(:find_one).with("changeme:22", nil).returns(SpecModel::Basic.new(:pid=>'changeme:22'))
yielded = mock("yielded method")
yielded.expects(:run).with { |obj| obj.class == SpecModel::Basic}.twice
SpecModel::Basic.find_each(:rows=>1001){|obj| yielded.run(obj) }.should == [{"id"=>"changeme:30"}, {"id"=>"changeme:22"}]
SpecModel::Basic.find_each(){|obj| yielded.run(obj) }
end
describe "with conditions hash" do
describe "with conditions" do
it "should filter by the provided fields" do
SpecModel::Basic.expects(:find_one).with("changeme:30", nil).returns(SpecModel::Basic.new(:pid=>'changeme:30'))
SpecModel::Basic.expects(:find_one).with("changeme:22", nil).returns(SpecModel::Basic.new(:pid=>'changeme:22'))

ActiveFedora::SolrService.expects(:query).with('has_model_s:info\\:fedora/afmodel\\:SpecModel_Basic AND foo:"bar" AND baz:"quix" AND baz:"quack"', {:sort => ['system_create_dt asc'], :fl=> 'id', :rows=>1002}).returns([{"id" => "changeme:30"}, {"id" => "changeme:22"}])
mock_docs = mock('docs')
mock_docs.expects(:each).multiple_yields([{"id" => "changeme:30"}],[{"id" => "changeme:22"}])
mock_docs.expects(:has_next?).returns(false)
ActiveFedora::SolrService.instance.conn.expects(:paginate).with(1, 1000, 'select', :params=>{:q=>'has_model_s:info\\:fedora/afmodel\\:SpecModel_Basic AND foo:"bar" AND baz:"quix" AND baz:"quack"', :sort => ['system_create_dt asc'], :fl=> 'id', }).returns('response'=>{'docs'=>mock_docs})
yielded = mock("yielded method")
yielded.expects(:run).with { |obj| obj.class == SpecModel::Basic}.twice
SpecModel::Basic.find_each(:conditions=>{:foo=>'bar', :baz=>['quix','quack']}, :rows=>1002){|obj| yielded.run(obj) }.should == [{"id"=>"changeme:30"}, {"id"=>"changeme:22"}]
SpecModel::Basic.find_each({:foo=>'bar', :baz=>['quix','quack']}){|obj| yielded.run(obj) }
end
end
end

describe '#find_in_batches' do
  describe "with conditions hash" do
    it "should filter by the provided fields" do
      # A single page of results: has_next? returning false ends the paging loop.
      mock_docs = mock('docs')
      mock_docs.expects(:has_next?).returns(false)
      # :batch_size must arrive as the page size (1002) and must not appear in :params.
      ActiveFedora::SolrService.instance.conn.expects(:paginate).with(1, 1002, 'select', :params=>{:q=>'has_model_s:info\\:fedora/afmodel\\:SpecModel_Basic AND foo:"bar" AND baz:"quix" AND baz:"quack"', :sort => ['system_create_dt asc'], :fl=> 'id', }).returns('response'=>{'docs'=>mock_docs})
      # The whole batch (not the individual docs) is what gets yielded.
      yielded = mock("yielded method")
      yielded.expects(:run).with(mock_docs)
      # The mock expectations above are the assertions; a bare `.should` with
      # no matcher verifies nothing, so the call is made without it.
      SpecModel::Basic.find_in_batches({:foo=>'bar', :baz=>['quix','quack']}, {:batch_size=>1002, :fl=>'id'}){|group| yielded.run group }
    end
  end
end
Expand Down

0 comments on commit 088dba1

Please sign in to comment.