Skip to content
This repository has been archived by the owner on Aug 26, 2024. It is now read-only.

Commit

Permalink
Minor whitespace changes for PEP8
Browse files Browse the repository at this point in the history
  • Loading branch information
josegonzalez committed Aug 7, 2015
1 parent 9a4bc22 commit d0c389d
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 25 deletions.
2 changes: 1 addition & 1 deletion benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
print_result_from_timeit('utils.full_process(u\'%s\')' % s,
common_setup + basic_setup, number=iterations)

### benchmarking the core matching methods...
# benchmarking the core matching methods...

for s in cirque_strings:
print 'Test fuzz.ratio for string: "%s"' % s
Expand Down
3 changes: 2 additions & 1 deletion fuzzywuzzy/StringMatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from Levenshtein import *
from warnings import warn


class StringMatcher:
"""A SequenceMatcher-like class built on the top of Levenshtein"""

Expand Down Expand Up @@ -75,4 +76,4 @@ def real_quick_ratio(self):
def distance(self):
if not self._distance:
self._distance = distance(self._str1, self._str2)
return self._distance
return self._distance
2 changes: 2 additions & 0 deletions fuzzywuzzy/fuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def _process_and_sort(s, force_ascii):
sorted_string = u" ".join(sorted(tokens))
return sorted_string.strip()


# Sorted Token
# find all alphanumeric tokens in the string
# sort those tokens and take ratio of resulting joined strings
Expand All @@ -132,6 +133,7 @@ def _token_sort(s1, s2, partial=True, force_ascii=True):
else:
return ratio(sorted1, sorted2)


def token_sort_ratio(s1, s2, force_ascii=True):
"""Return a measure of the sequences' similarity between 0 and 100
but sorting the token before comparing.
Expand Down
29 changes: 15 additions & 14 deletions fuzzywuzzy/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,20 +163,21 @@ def extractOne(query, choices, processor=None, scorer=None, score_cutoff=0):
return best_list[0]
return None

def dedupe (contains_dupes, threshold=70, scorer=fuzz.token_set_ratio):
"""This convenience function takes a list of strings containing duplicates and uses fuzzy matching to identify
and remove duplicates. Specifically, it uses the process.extract to identify duplicates that

def dedupe(contains_dupes, threshold=70, scorer=fuzz.token_set_ratio):
"""This convenience function takes a list of strings containing duplicates and uses fuzzy matching to identify
and remove duplicates. Specifically, it uses the process.extract to identify duplicates that
score greater than a user defined threshold. Then, it looks for the longest item in the duplicate list
since we assume this item contains the most entity information and returns that. It breaks string
since we assume this item contains the most entity information and returns that. It breaks string
length ties on an alphabetical sort.
Note: as the threshold DECREASES the number of duplicates that are found INCREASES. This means that the
returned deduplicated list will likely be shorter. Raise the threshold for fuzzy_dedupe to be less
Note: as the threshold DECREASES the number of duplicates that are found INCREASES. This means that the
returned deduplicated list will likely be shorter. Raise the threshold for fuzzy_dedupe to be less
sensitive.
Args:
contains_dupes: A list of strings that we would like to dedupe.
threshold: the numerical value (0,100) point at which we expect to find duplicates.
threshold: the numerical value (0,100) point at which we expect to find duplicates.
Defaults to 70 out of 100
scorer: Optional function for scoring matches between the query and
an individual processed choice. This should be a function
Expand All @@ -193,22 +194,22 @@ def dedupe (contains_dupes, threshold=70, scorer=fuzz.token_set_ratio):
"""

extractor = []

# iterate over items in *contains_dupes*
for item in contains_dupes:
# return all duplicate matches found
matches = extract(item, contains_dupes, limit=None, scorer=scorer)
# filter matches based on the threshold
# filter matches based on the threshold
filtered = [x for x in matches if x[1] > threshold]
# if there is only 1 item in *filtered*, no duplicates were found so append to *extractor*
if len(filtered) == 1:
extractor.append(filtered[0][0])

else:
# alpha sort
filtered = sorted(filtered, key = lambda x: x[0])
filtered = sorted(filtered, key=lambda x: x[0])
# length sort
filter_sort = sorted(filtered, key = lambda x: len(x[0]), reverse=True)
filter_sort = sorted(filtered, key=lambda x: len(x[0]), reverse=True)
# take first item as our 'canonical example'
extractor.append(filter_sort[0][0])

Expand All @@ -217,7 +218,7 @@ def dedupe (contains_dupes, threshold=70, scorer=fuzz.token_set_ratio):
for e in extractor:
keys[e] = 1
extractor = keys.keys()

# check that extractor differs from contains_dupes (i.e. duplicates were found)
# if not, then return the original list
if len(extractor) == len(contains_dupes):
Expand Down
1 change: 0 additions & 1 deletion fuzzywuzzy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
PY3 = sys.version_info[0] == 3



def validate_string(s):
try:
return len(s) > 0
Expand Down
16 changes: 8 additions & 8 deletions test_fuzzywuzzy.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def test_fullProcessForceAscii(self):
for s in self.mixed_strings:
utils.full_process(s, force_ascii=True)


class RatioTest(unittest.TestCase):

def setUp(self):
Expand Down Expand Up @@ -371,12 +372,12 @@ def testWithCutoff(self):

best = process.extractOne(query, choices, score_cutoff=50)
self.assertTrue(best is None)
#self.assertIsNone(best) # unittest.TestCase did not have assertIsNone until Python 2.7
# self.assertIsNone(best) # unittest.TestCase did not have assertIsNone until Python 2.7

# however if we had no cutoff, something would get returned

#best = process.extractOne(query, choices)
#self.assertIsNotNone(best)
# best = process.extractOne(query, choices)
# self.assertIsNotNone(best)

def testWithCutoff2(self):
choices = [
Expand Down Expand Up @@ -450,18 +451,17 @@ def test_dict_like_extract(self):
def test_dedupe(self):
"""We should be able to use a list-like object for contains_dupes
"""
## Test 1
# Test 1
contains_dupes = ['Frodo Baggins', 'Tom Sawyer', 'Bilbo Baggin', 'Samuel L. Jackson', 'F. Baggins', 'Frody Baggins', 'Bilbo Baggins']

result = process.dedupe(contains_dupes)
self.assertTrue(len(result) < len(contains_dupes))


## Test 2
# Test 2
contains_dupes = ['Tom', 'Dick', 'Harry']

# we should end up with the same list since no duplicates are contained in the list (i.e. the original list is returned)
deduped_list = ['Tom','Dick','Harry']
deduped_list = ['Tom', 'Dick', 'Harry']

result = process.dedupe(contains_dupes)
self.assertEqual(result, deduped_list)
Expand Down

0 comments on commit d0c389d

Please sign in to comment.