New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Task id hashing #1444
Task id hashing #1444
Changes from all commits
b547b5e
1ee3614
eaf8ea3
82812f8
8b3988c
66ff3c5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,9 @@ | |
import logging | ||
import traceback | ||
import warnings | ||
import json | ||
import hashlib | ||
import re | ||
|
||
from luigi import six | ||
|
||
|
@@ -253,14 +256,23 @@ def __init__(self, *args, **kwargs): | |
self.param_args = tuple(value for key, value in param_values) | ||
self.param_kwargs = dict(param_values) | ||
|
||
# Build up task id | ||
task_id_parts = [] | ||
param_objs = dict(params) | ||
for param_name, param_value in param_values: | ||
if param_objs[param_name].significant: | ||
task_id_parts.append('%s=%s' % (param_name, param_objs[param_name].serialize(param_value))) | ||
# task_id is a concatenation of the task family, the truncated values of the first 3 parameters | ||
# sorted by parameter name, and an md5 hash of the family/parameters as canonicalised JSON. | ||
TASK_ID_INCLUDE_PARAMS = 3 | ||
TASK_ID_TRUNCATE_PARAMS = 16 | ||
TASK_ID_TRUNCATE_HASH = 10 | ||
TASK_ID_INVALID_CHAR_REGEX = r'[^A-Za-z0-9_]' | ||
|
||
params = self.to_str_params(only_significant=True) | ||
param_str = json.dumps(params, separators=(',', ':'), sort_keys=True) | ||
param_hash = hashlib.md5(param_str.encode('utf-8')).hexdigest() | ||
|
||
param_summary = '_'.join(p[:TASK_ID_TRUNCATE_PARAMS] | ||
for p in (params[p] for p in sorted(params)[:TASK_ID_INCLUDE_PARAMS])) | ||
param_summary = re.sub(TASK_ID_INVALID_CHAR_REGEX, '_', param_summary) | ||
|
||
self.task_id = '{}_{}_{}'.format(self.task_family, param_summary, param_hash[:TASK_ID_TRUNCATE_HASH]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so this doesn't include a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've removed refs in |
||
|
||
self.task_id = '%s(%s)' % (self.task_family, ', '.join(task_id_parts)) | ||
self.__hash = hash(self.task_id) | ||
|
||
def initialized(self): | ||
|
@@ -283,14 +295,15 @@ def from_str_params(cls, params_str): | |
|
||
return cls(**kwargs) | ||
|
||
def to_str_params(self): | ||
def to_str_params(self, only_significant=False): | ||
""" | ||
Convert all parameters to a str->str hash. | ||
""" | ||
params_str = {} | ||
params = dict(self.get_params()) | ||
for param_name, param_value in six.iteritems(self.param_kwargs): | ||
params_str[param_name] = params[param_name].serialize(param_value) | ||
if (not only_significant) or params[param_name].significant: | ||
params_str[param_name] = params[param_name].serialize(param_value) | ||
|
||
return params_str | ||
|
||
|
@@ -324,7 +337,22 @@ def __hash__(self): | |
return self.__hash | ||
|
||
def __repr__(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps a docstring:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've tried this but the results are inconsistent in the tests. I'm seeing a mixture of |
||
return self.task_id | ||
""" | ||
Build a task representation like `MyTask(param1=1.5, param2='5')` | ||
""" | ||
params = self.get_params() | ||
param_values = self.get_param_values(params, [], self.param_kwargs) | ||
|
||
# Build up the repr parts | ||
repr_parts = [] | ||
param_objs = dict(params) | ||
for param_name, param_value in param_values: | ||
if param_objs[param_name].significant: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now, since we're not sending this to the scheduler any more. Maybe we should include the insignificant parameters too? (What I'm suggesting is to remove this one line) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm undecided. Maybe people will want to keep repr() short by marking configuration parameters as insignificant? I don't use this feature so I don't really have an opinion. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
repr_parts.append('%s=%s' % (param_name, param_objs[param_name].serialize(param_value))) | ||
|
||
task_str = '{}({})'.format(self.task_family, ', '.join(repr_parts)) | ||
|
||
return task_str | ||
|
||
def __eq__(self, other): | ||
return self.__class__ == other.__class__ and self.param_args == other.param_args | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you should probably remove non-alphanumeric characters from
param_summary
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, all characters other than alphanumerics and "_" are converted to "_".