From 18947feed97e4a3287e5a56656a1dbb6186090c8 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Fri, 2 Nov 2018 23:26:10 -0500 Subject: [PATCH 1/7] Avoid error catching inside loop for significant performance gain. --- toolz/itertoolz.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py index 941b9a9f..77099d4c 100644 --- a/toolz/itertoolz.py +++ b/toolz/itertoolz.py @@ -875,13 +875,11 @@ def join(leftkey, leftseq, rightkey, rightseq, for item in rightseq: key = rightkey(item) seen_keys.add(key) - try: - left_matches = d[key] - for match in left_matches: - yield (match, item) - except KeyError: - if not left_default_is_no_default: - yield (left_default, item) + if key in d: + for left_match in d[key]: + yield (left_match, item) + elif not left_default_is_no_default: + yield (left_default, item) if right_default != no_default: for key, matches in d.items(): From afd126fb6861725b4f00380a81e61e9a154bb681 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Fri, 2 Nov 2018 23:28:24 -0500 Subject: [PATCH 2/7] On Python 2, don't create a copy of the dictionary items. --- toolz/itertoolz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py index 77099d4c..45031c33 100644 --- a/toolz/itertoolz.py +++ b/toolz/itertoolz.py @@ -882,7 +882,7 @@ def join(leftkey, leftseq, rightkey, rightseq, yield (left_default, item) if right_default != no_default: - for key, matches in d.items(): + for key, matches in iteritems(d): if key not in seen_keys: for match in matches: yield (match, right_default) From 33c6687c13d91f7b1ec09ec12e9d5dd314c77f2c Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Wed, 7 Nov 2018 16:48:46 -0600 Subject: [PATCH 3/7] Detect join case and perform each case separately. Added notes about memory usage and hashability. 
--- toolz/itertoolz.py | 58 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py index 45031c33..7fed9d26 100644 --- a/toolz/itertoolz.py +++ b/toolz/itertoolz.py @@ -816,6 +816,8 @@ def join(leftkey, leftseq, rightkey, rightseq, This is a semi-streaming operation. The LEFT sequence is fully evaluated and placed into memory. The RIGHT sequence is evaluated lazily and so can be arbitrarily large. + (Note: If right_default is defined, then unique keys of rightseq + will also be stored in memory.) >>> friends = [('Alice', 'Edith'), ... ('Alice', 'Zhao'), @@ -858,7 +860,9 @@ def join(leftkey, leftseq, rightkey, rightseq, Usually the key arguments are callables to be applied to the sequences. If the keys are not obviously callable then it is assumed that indexing was - intended, e.g. the following is a legal change + intended, e.g. the following is a legal change. + The join is implemented as a hash join and the keys of leftseq must be hashable. + Additionally, if right_default is defined, then keys of rightseq must also be hashable. 
>>> # result = join(second, friends, first, cities) >>> result = join(1, friends, 0, cities) # doctest: +SKIP @@ -869,19 +873,45 @@ def join(leftkey, leftseq, rightkey, rightseq, rightkey = getter(rightkey) d = groupby(leftkey, leftseq) - seen_keys = set() - - left_default_is_no_default = (left_default == no_default) - for item in rightseq: - key = rightkey(item) - seen_keys.add(key) - if key in d: - for left_match in d[key]: - yield (left_match, item) - elif not left_default_is_no_default: - yield (left_default, item) - - if right_default != no_default: + + if (left_default is no_default) and (right_default is no_default): + # Inner Join + for item in rightseq: + key = rightkey(item) + if key in d: + for left_match in d[key]: + yield (left_match, item) + elif (left_default is not no_default) and (right_default is no_default): + # Right Join + for item in rightseq: + key = rightkey(item) + if key in d: + for left_match in d[key]: + yield (left_match, item) + else: + yield (left_default, item) + elif (right_default is not no_default): + seen_keys = set() + + if left_default is no_default: + # Left Join + for item in rightseq: + key = rightkey(item) + seen_keys.add(key) + if key in d: + for left_match in d[key]: + yield(left_match, item) + else: + # Full Join + for item in rightseq: + key = rightkey(item) + seen_keys.add(key) + if key in d: + for left_match in d[key]: + yield (left_match, item) + else: + yield (left_default, item) + for key, matches in iteritems(d): if key not in seen_keys: for match in matches: From a99299e62ba5bf0d3522293bce34bbd6fd9223ee Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Wed, 7 Nov 2018 23:12:08 -0600 Subject: [PATCH 4/7] Restore equality comparisons. 
--- toolz/itertoolz.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py index 7fed9d26..280180e4 100644 --- a/toolz/itertoolz.py +++ b/toolz/itertoolz.py @@ -874,14 +874,14 @@ def join(leftkey, leftseq, rightkey, rightseq, d = groupby(leftkey, leftseq) - if (left_default is no_default) and (right_default is no_default): + if (left_default == no_default) and (right_default == no_default): # Inner Join for item in rightseq: key = rightkey(item) if key in d: for left_match in d[key]: yield (left_match, item) - elif (left_default is not no_default) and (right_default is no_default): + elif (left_default != no_default) and (right_default == no_default): # Right Join for item in rightseq: key = rightkey(item) @@ -890,17 +890,17 @@ def join(leftkey, leftseq, rightkey, rightseq, yield (left_match, item) else: yield (left_default, item) - elif (right_default is not no_default): + elif (right_default != no_default): seen_keys = set() - if left_default is no_default: + if left_default == no_default: # Left Join for item in rightseq: key = rightkey(item) seen_keys.add(key) if key in d: for left_match in d[key]: - yield(left_match, item) + yield (left_match, item) else: # Full Join for item in rightseq: From 9ffd119674a09c2a5cd5ee9b042574872aec6ed6 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Wed, 7 Nov 2018 23:15:22 -0600 Subject: [PATCH 5/7] Adjust line breaks to 79 characters. --- toolz/itertoolz.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py index 280180e4..bf72f94c 100644 --- a/toolz/itertoolz.py +++ b/toolz/itertoolz.py @@ -861,8 +861,9 @@ def join(leftkey, leftseq, rightkey, rightseq, Usually the key arguments are callables to be applied to the sequences. If the keys are not obviously callable then it is assumed that indexing was intended, e.g. the following is a legal change. 
- The join is implemented as a hash join and the keys of leftseq must be hashable. - Additionally, if right_default is defined, then keys of rightseq must also be hashable. + The join is implemented as a hash join and the keys of leftseq must be + hashable. Additionally, if right_default is defined, then keys of rightseq + must also be hashable. >>> # result = join(second, friends, first, cities) >>> result = join(1, friends, 0, cities) # doctest: +SKIP From 0217f38a2fde4e9bd2cbf51de0cbe38a29f9ff7b Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Wed, 5 Dec 2018 21:55:52 -0600 Subject: [PATCH 6/7] Remove unnecessary parens. --- toolz/itertoolz.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py index bf72f94c..80d1fe5f 100644 --- a/toolz/itertoolz.py +++ b/toolz/itertoolz.py @@ -875,14 +875,14 @@ def join(leftkey, leftseq, rightkey, rightseq, d = groupby(leftkey, leftseq) - if (left_default == no_default) and (right_default == no_default): + if left_default == no_default and right_default == no_default: # Inner Join for item in rightseq: key = rightkey(item) if key in d: for left_match in d[key]: yield (left_match, item) - elif (left_default != no_default) and (right_default == no_default): + elif left_default != no_default and right_default == no_default: # Right Join for item in rightseq: key = rightkey(item) @@ -891,7 +891,7 @@ def join(leftkey, leftseq, rightkey, rightseq, yield (left_match, item) else: yield (left_default, item) - elif (right_default != no_default): + elif right_default != no_default: seen_keys = set() if left_default == no_default: From 54aa44292e433a6f7829c383f0fb9cbf5c4f09b3 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Wed, 5 Dec 2018 22:02:49 -0600 Subject: [PATCH 7/7] Avoid lookups for set.add method. 
--- toolz/itertoolz.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py index 80d1fe5f..e07bb765 100644 --- a/toolz/itertoolz.py +++ b/toolz/itertoolz.py @@ -893,12 +893,13 @@ def join(leftkey, leftseq, rightkey, rightseq, yield (left_default, item) elif right_default != no_default: seen_keys = set() + seen = seen_keys.add if left_default == no_default: # Left Join for item in rightseq: key = rightkey(item) - seen_keys.add(key) + seen(key) if key in d: for left_match in d[key]: yield (left_match, item) @@ -906,7 +907,7 @@ def join(leftkey, leftseq, rightkey, rightseq, # Full Join for item in rightseq: key = rightkey(item) - seen_keys.add(key) + seen(key) if key in d: for left_match in d[key]: yield (left_match, item)