From 43c0cd97b1aaf98446666534f2d2657a95aeb619 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Mon, 12 Nov 2018 17:41:59 -0600 Subject: [PATCH 1/5] Optimize join. Each case is treated separately and error handling is removed from inside the loop. Also use iteritems() instead of dict.items(). --- toolz/itertoolz.py | 88 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 20 deletions(-) diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py index 941b9a9f..dfc3c733 100644 --- a/toolz/itertoolz.py +++ b/toolz/itertoolz.py @@ -809,6 +809,63 @@ def getter(index): return operator.itemgetter(index) +def _inner_join(leftkey, leftseq, rightkey, rightseq): + d = groupby(leftkey, leftseq) + for item in rightseq: + key = rightkey(item) + if key in d: + for left_match in d[key]: + yield (left_match, item) + + +def _right_join(leftkey, leftseq, rightkey, rightseq, + left_default=no_default): + d = groupby(leftkey, leftseq) + for item in rightseq: + key = rightkey(item) + if key in d: + for left_match in d[key]: + yield (left_match, item) + else: + yield (left_default, item) + + +def _left_join(leftkey, leftseq, rightkey, rightseq, + right_default=no_default): + d = groupby(leftkey, leftseq) + seen_keys = set() + for item in rightseq: + key = rightkey(item) + seen_keys.add(key) + if key in d: + for left_match in d[key]: + yield(left_match, item) + + for key, matches in iteritems(d): + if key not in seen_keys: + for match in matches: + yield (match, right_default) + + +def _full_join(leftkey, leftseq, rightkey, rightseq, + left_default=no_default, right_default=no_default): + d = groupby(leftkey, leftseq) + seen_keys = set() + for item in rightseq: + key = rightkey(item) + seen_keys.add(key) + if key in d: + for left_match in d[key]: + yield (left_match, item) + else: + yield (left_default, item) + + for key, matches in iteritems(d): + if key not in seen_keys: + for match in matches: + yield (match, right_default) + + def join(leftkey, leftseq, rightkey, 
rightseq, left_default=no_default, right_default=no_default): """ Join two sequences on common attributes @@ -868,26 +925,17 @@ def join(leftkey, leftseq, rightkey, rightseq, if not callable(rightkey): rightkey = getter(rightkey) - d = groupby(leftkey, leftseq) - seen_keys = set() - - left_default_is_no_default = (left_default == no_default) - for item in rightseq: - key = rightkey(item) - seen_keys.add(key) - try: - left_matches = d[key] - for match in left_matches: - yield (match, item) - except KeyError: - if not left_default_is_no_default: - yield (left_default, item) - - if right_default != no_default: - for key, matches in d.items(): - if key not in seen_keys: - for match in matches: - yield (match, right_default) + if (left_default == no_default) and (right_default == no_default): + return _inner_join(leftkey, leftseq, rightkey, rightseq) + elif (left_default != no_default) and (right_default == no_default): + return _right_join(leftkey, leftseq, rightkey, rightseq, + left_default=left_default) + elif (left_default == no_default) and (right_default != no_default): + return _left_join(leftkey, leftseq, rightkey, rightseq, + right_default=right_default) + else: + return _full_join(leftkey, leftseq, rightkey, rightseq, + left_default=left_default, right_default=right_default) def diff(*seqs, **kwargs): From ef2b135ee6cbb16c0eac89166cdd5fe6015c08a0 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Mon, 12 Nov 2018 18:01:12 -0600 Subject: [PATCH 2/5] Update docstring for join. Explain memory usage of join and hashability requirements. --- toolz/itertoolz.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py index dfc3c733..a9487037 100644 --- a/toolz/itertoolz.py +++ b/toolz/itertoolz.py @@ -871,8 +871,12 @@ def join(leftkey, leftseq, rightkey, rightseq, """ Join two sequences on common attributes This is a semi-streaming operation. The LEFT sequence is fully evaluated - and placed into memory. 
The RIGHT sequence is evaluated lazily and so can - be arbitrarily large. + and placed into memory. The RIGHT sequence is evaluated lazily and, unless + right_default is defined, it can be arbitrarily large. If right_default is + defined, the unique keys of rightseq will be placed into memory. + The join is implemented as a hash join, so the keys of both leftseq and + rightseq must be hashable: every rightseq key is looked up in the hash + table built from leftseq, whether or not right_default is defined. >>> friends = [('Alice', 'Edith'), ... ('Alice', 'Zhao'), From 8140c02ea06aca00a9052cad58e419f50ccf9630 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Mon, 12 Nov 2018 18:07:14 -0600 Subject: [PATCH 3/5] Fix indentation. --- toolz/itertoolz.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py index a9487037..f1c07ef4 100644 --- a/toolz/itertoolz.py +++ b/toolz/itertoolz.py @@ -933,13 +933,13 @@ def join(leftkey, leftseq, rightkey, rightseq, return _inner_join(leftkey, leftseq, rightkey, rightseq) elif (left_default != no_default) and (right_default == no_default): return _right_join(leftkey, leftseq, rightkey, rightseq, - left_default=left_default) + left_default=left_default) elif (left_default == no_default) and (right_default != no_default): return _left_join(leftkey, leftseq, rightkey, rightseq, - right_default=right_default) + right_default=right_default) else: return _full_join(leftkey, leftseq, rightkey, rightseq, - left_default=left_default, right_default=right_default) + left_default=left_default, right_default=right_default) def diff(*seqs, **kwargs): From 015a6d40c440cb147abef903b2ac12b6a4bb5d00 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Mon, 12 Nov 2018 18:11:18 -0600 Subject: [PATCH 4/5] Fix indentation for PEP8. 
--- toolz/itertoolz.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py index f1c07ef4..90195c92 100644 --- a/toolz/itertoolz.py +++ b/toolz/itertoolz.py @@ -933,13 +933,14 @@ def join(leftkey, leftseq, rightkey, rightseq, return _inner_join(leftkey, leftseq, rightkey, rightseq) elif (left_default != no_default) and (right_default == no_default): return _right_join(leftkey, leftseq, rightkey, rightseq, - left_default=left_default) + left_default=left_default) elif (left_default == no_default) and (right_default != no_default): return _left_join(leftkey, leftseq, rightkey, rightseq, - right_default=right_default) + right_default=right_default) else: return _full_join(leftkey, leftseq, rightkey, rightseq, - left_default=left_default, right_default=right_default) + left_default=left_default, + right_default=right_default) def diff(*seqs, **kwargs): From 82db29545ba175e152c51d7ce8422a8d1c1c7456 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Mon, 12 Nov 2018 18:17:20 -0600 Subject: [PATCH 5/5] Make pycodestyle happy. --- toolz/itertoolz.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py index 90195c92..6fb03967 100644 --- a/toolz/itertoolz.py +++ b/toolz/itertoolz.py @@ -831,7 +831,7 @@ def _right_join(leftkey, leftseq, rightkey, rightseq, def _left_join(leftkey, leftseq, rightkey, rightseq, - right_default=no_default): + right_default=no_default): d = groupby(leftkey, leftseq) seen_keys = set() for item in rightseq: @@ -848,7 +848,7 @@ def _left_join(leftkey, leftseq, rightkey, rightseq, def _full_join(leftkey, leftseq, rightkey, rightseq, - left_default=no_default, right_default=no_default): + left_default=no_default, right_default=no_default): d = groupby(leftkey, leftseq) seen_keys = set() for item in rightseq: