diff --git a/fuzzywuzzy/process.py b/fuzzywuzzy/process.py index decc0193..f31dbe82 100644 --- a/fuzzywuzzy/process.py +++ b/fuzzywuzzy/process.py @@ -197,7 +197,7 @@ def dedupe (contains_dupes, threshold=70, scorer=fuzz.token_set_ratio): # iterate over items in *contains_dupes* for item in contains_dupes: # return all duplicate matches found - matches = process.extract(item, contains_dupes, limit=None, scorer=scorer) + matches = extract(item, contains_dupes, limit=None, scorer=scorer) # filter matches based on the threshold filtered = [x for x in matches if x[1] > threshold] # if there is only 1 item in *filtered*, no duplicates were found so append to *extracted* @@ -217,5 +217,10 @@ def dedupe (contains_dupes, threshold=70, scorer=fuzz.token_set_ratio): for e in extractor: keys[e] = 1 extractor = keys.keys() - - return extractor + + # check that extractor differs from contains_dupes (i.e. duplicates were found) + # if not, then return the original list + if len(extractor) == len(contains_dupes): + return contains_dupes + else: + return extractor diff --git a/test_fuzzywuzzy.py b/test_fuzzywuzzy.py index 1c243f1b..1369d79e 100644 --- a/test_fuzzywuzzy.py +++ b/test_fuzzywuzzy.py @@ -447,6 +447,28 @@ def test_dict_like_extract(self): for value, confidence, key in result: self.assertTrue(value in choices.values()) + def test_dedupe(self): + """We should be able to use a list-like object for contains_dupes + """ + ## Test 1 + contains_dupes = ['Frodo Baggins', 'Tom Sawyer', 'Bilbo Baggin', 'Samuel L. Jackson', 'F. Baggins', 'Frody Baggins', 'Bilbo Baggins'] + + # Note that the ordering will differ due to the alpha then length sort + deduped_list = ['Frodo Baggins', 'Samuel L. Jackson', 'Bilbo Baggins', 'Tom Sawyer'] + + result = process.dedupe(contains_dupes) + self.assertEqual(result, deduped_list) + + + ## Test 2 + contains_dupes = ['Tom', 'Dick', 'Harry'] + + # we should end up with the same list since no duplicates are contained in the list (i.e.
original list is returned) + deduped_list = ['Tom','Dick','Harry'] + + result = process.dedupe(contains_dupes) + self.assertEqual(result, deduped_list) + if __name__ == '__main__': unittest.main() # run all tests