-
Notifications
You must be signed in to change notification settings - Fork 2.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Enables running of multiple tasks in batches #1538
Changes from 1 commit
dd1b095
3802b29
cc73a92
0c90de0
188c6e8
9f73c33
0e87f50
fbba3d2
ca275a6
8007bf9
694fbbd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ | |
|
||
import abc | ||
import datetime | ||
import enum | ||
import warnings | ||
import json | ||
from json import JSONEncoder | ||
|
@@ -44,6 +45,25 @@ | |
_no_value = object() | ||
|
||
|
||
def join_with_commas(values): | ||
""" Join all values with a comma. | ||
|
||
This is necessary because ','.join is not picklable but top-level functions | ||
are. | ||
|
||
""" | ||
return ','.join(values) | ||
|
||
|
||
class BatchAggregation(enum.Enum): | ||
COMMA_LIST = join_with_commas # join all values with commas | ||
MIN_VALUE = min # choose just the minimum value by string representation | ||
MAX_VALUE = max # choose just the maximum value by string representation | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure how to make these visible on readthedocs. Maybe I should move this documentation to the batch_method parameter description? It seems a little unfortunate to have it there :( |
||
|
||
def __call__(self, values): | ||
return self.value(values) | ||
|
||
|
||
class ParameterException(Exception): | ||
""" | ||
Base exception. | ||
|
@@ -139,7 +159,10 @@ def __init__(self, default=_no_value, is_global=False, significant=True, descrip | |
``positional=False`` for abstract base classes and similar cases. | ||
:param bool always_in_help: For the --help option in the command line | ||
parsing. Set true to always show in --help. | ||
:param str batch_method: How multiple | ||
:param BatchAggregation batch_method: How multiple values of this argument are combined when | ||
batching in the scheduler. See BatchAggregation for | ||
details of what each one means. Default is None, | ||
meaning that this parameter cannot be aggregated. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is good docs, but for this big new feature I expect there to be something similar to what we have for docs for Range*. Why not add it in the same file
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The COMMA_LIST refers to how we combine serializations. If we combined them as an actual list, it would break deserialization. It's up to the user to define the parameters to deserialize into a list if that's what they want. If we build in the list serialization, it would be very similar to re-implementing list parameters, which you were big on removing. What we could do instead is provide a wrapper to translate between lists and commas. So instead of |
||
""" | ||
self._default = default | ||
if is_global: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -83,13 +83,6 @@ class Scheduler(object): | |
BATCH_TASKS = 'batch_tasks' | ||
RUNNING_BATCHES = 'running_batches' | ||
|
||
AGGREGATE_FUNCTIONS = { | ||
'csv': ','.join, | ||
'max': max, | ||
'min': min, | ||
'range': lambda args: '%s-%s' % (min(args), max(args)), | ||
} | ||
|
||
|
||
class scheduler(Config): | ||
# TODO(erikbern): the config_path is needed for backwards compatilibity. We | ||
|
@@ -242,7 +235,7 @@ def task_id(self, tasks): | |
raw_vals = [task.params[arg] for task in tasks] | ||
agg_function = self.aggregates.get(arg) | ||
if agg_function is not None: | ||
arg_val = AGGREGATE_FUNCTIONS[agg_function](raw_vals) | ||
arg_val = agg_function(raw_vals) | ||
elif any(v != raw_vals[0] for v in raw_vals): | ||
return None, None | ||
else: | ||
|
@@ -341,14 +334,19 @@ def get_state(self): | |
} | ||
|
||
def set_state(self, state): | ||
# This check is for backward compatability as of 2016-06-02. In the future we should be able | ||
# to assume a dictionary. | ||
if isinstance(state, dict): | ||
self._tasks = state.get(TASKS, {}) | ||
self._active_workers = state.get(ACTIVE_WORKERS, {}) | ||
self._batch_tasks = state.get(BATCH_TASKS, {}) | ||
self._running_batches = state.get(RUNNING_BATCHES, {}) | ||
|
||
# TODO: remove this code path | ||
else: | ||
self._tasks, self._active_workers = state | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a note that this is only for backward compatibility and this codepath will be removed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
self._batch_tasks = {} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
self._running_batches = {} | ||
|
||
def dump(self): | ||
try: | ||
|
@@ -392,7 +390,7 @@ def get_pending_tasks(self): | |
return itertools.chain.from_iterable(six.itervalues(self._status_tasks[status]) | ||
for status in [PENDING, RUNNING]) | ||
|
||
def get_batch(self, worker_id, tasks): | ||
def create_batch_task(self, worker_id, tasks): | ||
if len(tasks) == 1: | ||
return tasks[0] | ||
families = set(task.family for task in tasks) | ||
|
@@ -437,8 +435,8 @@ def get_task(self, task_id, default=None, setdefault=None): | |
else: | ||
return self._tasks.get(task_id, default) | ||
|
||
def get_batcher(self, worker, family): | ||
return self._batch_tasks.get((worker, family)) | ||
def get_batcher(self, worker, task_family): | ||
return self._batch_tasks.get((worker, task_family)) | ||
|
||
def set_batcher(self, worker, family, batcher_aggregate_args, max_batch_size): | ||
batcher = TaskBatcher(batcher_aggregate_args, max_batch_size) | ||
|
@@ -898,8 +896,9 @@ def get_work(self, host=None, assistant=False, current_tasks=None, **kwargs): | |
if len(task.workers) == 1 and not assistant: | ||
n_unique_pending += 1 | ||
|
||
# batch as many tasks as possible together if multiple batchable tasks are available | ||
if (in_workers and best_tasks and task.batchable and | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe add a comment above this line There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
self._state.get_batch(worker_id, best_tasks + [task]) is not None and | ||
self._state.create_batch_task(worker_id, best_tasks + [task]) is not None and | ||
self._schedulable(task) and | ||
self._has_resources(task.resources, greedy_resources)): | ||
best_tasks.append(task) | ||
|
@@ -934,7 +933,7 @@ def get_work(self, host=None, assistant=False, current_tasks=None, **kwargs): | |
'n_unique_pending': n_unique_pending} | ||
|
||
if best_tasks: | ||
best_batch = self._state.get_batch(worker_id, best_tasks) | ||
best_batch = self._state.create_batch_task(worker_id, best_tasks) | ||
best_task = self._state.get_task(best_batch.id, setdefault=best_batch) | ||
for task in best_tasks: | ||
if task == best_task: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this isn't public API prepend an
_
underscore._join_with_commas