Skip to content

Commit

Permalink
Updated test_fuzzywuzzy.py to include tests for process.dedupe. Updated process.py dedupe function to fix errors found via unit testing.
Browse files Browse the repository at this point in the history
  • Loading branch information
brandomr committed Jun 1, 2015
1 parent 7a3b28f commit f99851f
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 3 deletions.
11 changes: 8 additions & 3 deletions fuzzywuzzy/process.py
Expand Up @@ -197,7 +197,7 @@ def dedupe (contains_dupes, threshold=70, scorer=fuzz.token_set_ratio):
# iterate over items in *contains_dupes*
for item in contains_dupes:
# return all duplicate matches found
matches = process.extract(item, contains_dupes, limit=None, scorer=scorer)
matches = extract(item, contains_dupes, limit=None, scorer=scorer)
# filter matches based on the threshold
filtered = [x for x in matches if x[1] > threshold]
# if there is only 1 item in *filtered*, no duplicates were found so append to *extracted*
Expand All @@ -217,5 +217,10 @@ def dedupe (contains_dupes, threshold=70, scorer=fuzz.token_set_ratio):
for e in extractor:
keys[e] = 1
extractor = keys.keys()

return extractor

# check that extractor differs from contains_dupes (i.e. duplicates were found)
# if not, then return the original list
if len(extractor) == len(contains_dupes):
return contains_dupes
else:
return extractor
22 changes: 22 additions & 0 deletions test_fuzzywuzzy.py
Expand Up @@ -447,6 +447,28 @@ def test_dict_like_extract(self):
for value, confidence, key in result:
self.assertTrue(value in choices.values())

def test_dedupe(self):
    """Check process.dedupe on a list-like *contains_dupes*.

    Test 1 feeds a list with near-duplicate names and checks which entries
    survive. Test 2 feeds a list without duplicates and checks that the
    input comes back unchanged.
    """
    # Test 1: several fuzzy variants of the same names
    contains_dupes = [
        'Frodo Baggins', 'Tom Sawyer', 'Bilbo Baggin', 'Samuel L. Jackson',
        'F. Baggins', 'Frody Baggins', 'Bilbo Baggins',
    ]

    # Entries expected to survive deduplication (order is irrelevant here)
    deduped_list = ['Frodo Baggins', 'Samuel L. Jackson', 'Bilbo Baggins', 'Tom Sawyer']

    result = process.dedupe(contains_dupes)
    # dedupe gathers its survivors as dictionary keys, so the order of the
    # returned collection is not guaranteed; compare contents only.
    self.assertEqual(sorted(result), sorted(deduped_list))

    # Test 2: no duplicates present
    contains_dupes = ['Tom', 'Dick', 'Harry']

    # we should end up with the same list since no duplicates are contained
    # in the list (i.e. the original list is returned, order included)
    deduped_list = ['Tom', 'Dick', 'Harry']

    result = process.dedupe(contains_dupes)
    self.assertEqual(result, deduped_list)


if __name__ == '__main__':
unittest.main() # run all tests

0 comments on commit f99851f

Please sign in to comment.