Skip to content

Commit 451437c

Browse files
committed
adds support for limits in batch processing
1 parent 3b55ac2 commit 451437c

File tree

4 files changed

+220
-54
lines changed

4 files changed

+220
-54
lines changed

activerecord/CHANGELOG.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,18 @@
1+
* The flag `error_on_ignored_order_or_limit` has been deprecated in favor of
2+
the current `error_on_ignored_order`.
3+
4+
*Xavier Noria*
5+
6+
* Batch processing methods support `limit`:
7+
8+
Post.limit(10_000).find_each do |post|
9+
# ...
10+
end
11+
12+
It also works in `find_in_batches` and `in_batches`.
13+
14+
*Xavier Noria*
15+
116
* Using `group` with an attribute that has a custom type will properly cast
217
the hash keys after calling a calculation method like `count`. Fixes #25595.
318

activerecord/lib/active_record/core.rb

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,11 +72,20 @@ def self.configurations
7272

7373
##
7474
# :singleton-method:
75-
# Specifies if an error should be raised on query limit or order being
75+
# Specifies if an error should be raised if the query has an order being
7676
# ignored when doing batch queries. Useful in applications where the
77-
# limit or scope being ignored is error-worthy, rather than a warning.
77+
# scope being ignored is error-worthy, rather than a warning.
78+
mattr_accessor :error_on_ignored_order, instance_writer: false
79+
self.error_on_ignored_order = false
80+
7881
mattr_accessor :error_on_ignored_order_or_limit, instance_writer: false
79-
self.error_on_ignored_order_or_limit = false
82+
def self.error_on_ignored_order_or_limit=(value)
83+
ActiveSupport::Deprecation.warn(<<-MSG.squish)
84+
The flag error_on_ignored_order_or_limit is deprecated. Limits are
85+
now supported. Please use error_on_ignored_order= instead.
86+
MSG
87+
self.error_on_ignored_order = value
88+
end
8089

8190
##
8291
# :singleton-method:

activerecord/lib/active_record/relation/batches.rb

Lines changed: 71 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
require "active_record/relation/batches/batch_enumerator"
1+
require 'active_record/relation/batches/batch_enumerator'
22

33
module ActiveRecord
44
module Batches
5-
ORDER_OR_LIMIT_IGNORED_MESSAGE = "Scoped order and limit are ignored, it's forced to be batch order and batch size."
5+
ORDER_IGNORE_MESSAGE = "Scoped order is ignored, it's forced to be batch order."
66

77
# Looping through a collection of records from the database
88
# (using the Scoping::Named::ClassMethods.all method, for example)
@@ -34,15 +34,19 @@ module Batches
3434
# * <tt>:start</tt> - Specifies the primary key value to start from, inclusive of the value.
3535
# * <tt>:finish</tt> - Specifies the primary key value to end at, inclusive of the value.
3636
# * <tt>:error_on_ignore</tt> - Overrides the application config to specify if an error should be raised when
37-
# the order and limit have to be ignored due to batching.
37+
# the order has to be ignored due to batching.
3838
#
39-
# This is especially useful if you want multiple workers dealing with
40-
# the same processing queue. You can make worker 1 handle all the records
41-
# between id 0 and 10,000 and worker 2 handle from 10,000 and beyond
42-
# (by setting the +:start+ and +:finish+ option on each worker).
39+
# Limits are honored, and if present there is no requirement for the batch
40+
# size; it can be less than, equal to, or greater than the limit.
4341
#
44-
# # Let's process for a batch of 2000 records, skipping the first 2000 rows
45-
# Person.find_each(start: 2000, batch_size: 2000) do |person|
42+
# The options +start+ and +finish+ are especially useful if you want
43+
# multiple workers dealing with the same processing queue. You can make
44+
# worker 1 handle all the records between id 1 and 9999 and worker 2
45+
# handle from 10000 and beyond by setting the +:start+ and +:finish+
46+
# option on each worker.
47+
#
48+
# # Let's process from record 10_000 on.
49+
# Person.find_each(start: 10_000) do |person|
4650
# person.party_all_night!
4751
# end
4852
#
@@ -51,8 +55,8 @@ module Batches
5155
# work. This also means that this method only works when the primary key is
5256
# orderable (e.g. an integer or string).
5357
#
54-
# NOTE: You can't set the limit either, that's used to control
55-
# the batch sizes.
58+
# NOTE: By its nature, batch processing is subject to race conditions if
59+
# other processes are modifying the database.
5660
def find_each(start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil)
5761
if block_given?
5862
find_in_batches(start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore) do |records|
@@ -89,15 +93,19 @@ def find_each(start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil)
8993
# * <tt>:start</tt> - Specifies the primary key value to start from, inclusive of the value.
9094
# * <tt>:finish</tt> - Specifies the primary key value to end at, inclusive of the value.
9195
# * <tt>:error_on_ignore</tt> - Overrides the application config to specify if an error should be raised when
92-
# the order and limit have to be ignored due to batching.
96+
# the order has to be ignored due to batching.
97+
#
98+
# Limits are honored, and if present there is no requirement for the batch
99+
# size; it can be less than, equal to, or greater than the limit.
93100
#
94-
# This is especially useful if you want multiple workers dealing with
95-
# the same processing queue. You can make worker 1 handle all the records
96-
# between id 0 and 10,000 and worker 2 handle from 10,000 and beyond
97-
# (by setting the +:start+ and +:finish+ option on each worker).
101+
# The options +start+ and +finish+ are especially useful if you want
102+
# multiple workers dealing with the same processing queue. You can make
103+
# worker 1 handle all the records between id 1 and 9999 and worker 2
104+
# handle from 10000 and beyond by setting the +:start+ and +:finish+
105+
# option on each worker.
98106
#
99-
# # Let's process the next 2000 records
100-
# Person.find_in_batches(start: 2000, batch_size: 2000) do |group|
107+
# # Let's process from record 10_000 on.
108+
# Person.find_in_batches(start: 10_000) do |group|
101109
# group.each { |person| person.party_all_night! }
102110
# end
103111
#
@@ -106,8 +114,8 @@ def find_each(start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil)
106114
# work. This also means that this method only works when the primary key is
107115
# orderable (e.g. an integer or string).
108116
#
109-
# NOTE: You can't set the limit either, that's used to control
110-
# the batch sizes.
117+
# NOTE: By its nature, batch processing is subject to race conditions if
118+
# other processes are modifying the database.
111119
def find_in_batches(start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil)
112120
relation = self
113121
unless block_given?
@@ -149,17 +157,19 @@ def find_in_batches(start: nil, finish: nil, batch_size: 1000, error_on_ignore:
149157
# * <tt>:start</tt> - Specifies the primary key value to start from, inclusive of the value.
150158
# * <tt>:finish</tt> - Specifies the primary key value to end at, inclusive of the value.
151159
# * <tt>:error_on_ignore</tt> - Overrides the application config to specify if an error should be raised when
152-
# the order and limit have to be ignored due to batching.
160+
# the order has to be ignored due to batching.
161+
#
162+
# Limits are honored, and if present there is no requirement for the batch
163+
# size; it can be less than, equal to, or greater than the limit.
153164
#
154-
# This is especially useful if you want to work with the
155-
# ActiveRecord::Relation object instead of the array of records, or if
156-
# you want multiple workers dealing with the same processing queue. You can
157-
# make worker 1 handle all the records between id 0 and 10,000 and worker 2
158-
# handle from 10,000 and beyond (by setting the +:start+ and +:finish+
159-
# option on each worker).
165+
# The options +start+ and +finish+ are especially useful if you want
166+
# multiple workers dealing with the same processing queue. You can make
167+
# worker 1 handle all the records between id 1 and 9999 and worker 2
168+
# handle from 10000 and beyond by setting the +:start+ and +:finish+
169+
# option on each worker.
160170
#
161-
# # Let's process the next 2000 records
162-
# Person.in_batches(of: 2000, start: 2000).update_all(awesome: true)
171+
# # Let's process from record 10_000 on.
172+
# Person.in_batches(start: 10_000).update_all(awesome: true)
163173
#
164174
# An example of calling where query method on the relation:
165175
#
@@ -179,31 +189,38 @@ def find_in_batches(start: nil, finish: nil, batch_size: 1000, error_on_ignore:
179189
# consistent. Therefore the primary key must be orderable, e.g. an integer
180190
# or a string.
181191
#
182-
# NOTE: You can't set the limit either, that's used to control the batch
183-
# sizes.
192+
# NOTE: By its nature, batch processing is subject to race conditions if
193+
# other processes are modifying the database.
184194
def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore: nil)
185195
relation = self
186196
unless block_given?
187197
return BatchEnumerator.new(of: of, start: start, finish: finish, relation: self)
188198
end
189199

190-
if arel.orders.present? || arel.taken.present?
191-
act_on_order_or_limit_ignored(error_on_ignore)
200+
if arel.orders.present?
201+
act_on_ignored_order(error_on_ignore)
202+
end
203+
204+
batch_limit = of
205+
if limit_value
206+
remaining = limit_value
207+
batch_limit = remaining if remaining < batch_limit
208+
relation = relation.limit(nil) # new relation without the limit
192209
end
193210

194-
relation = relation.reorder(batch_order).limit(of)
211+
relation = relation.reorder(batch_order).limit(batch_limit)
195212
relation = apply_limits(relation, start, finish)
196213
batch_relation = relation
197214

198215
loop do
199216
if load
200217
records = batch_relation.records
201218
ids = records.map(&:id)
202-
yielded_relation = self.where(primary_key => ids)
219+
yielded_relation = where(primary_key => ids)
203220
yielded_relation.load_records(records)
204221
else
205222
ids = batch_relation.pluck(primary_key)
206-
yielded_relation = self.where(primary_key => ids)
223+
yielded_relation = where(primary_key => ids)
207224
end
208225

209226
break if ids.empty?
@@ -213,7 +230,20 @@ def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore:
213230

214231
yield yielded_relation
215232

216-
break if ids.length < of
233+
break if ids.length < batch_limit
234+
235+
if limit_value
236+
remaining -= ids.length
237+
238+
if remaining == 0
239+
# Saves a useless iteration when the limit is a multiple of the
240+
# batch size.
241+
break
242+
elsif remaining < batch_limit
243+
relation = relation.limit(remaining)
244+
end
245+
end
246+
217247
batch_relation = relation.where(arel_attribute(primary_key).gt(primary_key_offset))
218248
end
219249
end
@@ -230,13 +260,13 @@ def batch_order
230260
"#{quoted_table_name}.#{quoted_primary_key} ASC"
231261
end
232262

233-
def act_on_order_or_limit_ignored(error_on_ignore)
234-
raise_error = (error_on_ignore.nil? ? self.klass.error_on_ignored_order_or_limit : error_on_ignore)
263+
def act_on_ignored_order(error_on_ignore)
264+
raise_error = (error_on_ignore.nil? ? self.klass.error_on_ignored_order : error_on_ignore)
235265

236266
if raise_error
237-
raise ArgumentError.new(ORDER_OR_LIMIT_IGNORED_MESSAGE)
267+
raise ArgumentError.new(ORDER_IGNORE_MESSAGE)
238268
elsif logger
239-
logger.warn(ORDER_OR_LIMIT_IGNORED_MESSAGE)
269+
logger.warn(ORDER_IGNORE_MESSAGE)
240270
end
241271
end
242272
end

0 commit comments

Comments
 (0)