From f006dfafd723456062a7f359ef0aa74a40c46d3d Mon Sep 17 00:00:00 2001 From: Michael Lever <8038441+MWLever@users.noreply.github.com> Date: Sun, 21 Feb 2021 11:32:44 -0500 Subject: [PATCH 1/2] Modified Set Scoring Previous issue: partial_token_set_ratio matched strings across token boundaries. Fix: Preserve tokenization of the comparison sets and use Levenshtein's setratio/seqratio instead of ratio. Detail: Previously token_set_ratio used Python's strip to remove whitespace. Since strip removes all whitespace, the set comparisons are not tokenized. So when using partial_token_set_ratio, you would be able to match strings across word boundaries. This is generally unexpected behavior. This change should allow a more bag-of-words style comparison. --- fuzzywuzzy/StringMatcher.py | 12 +++++++++++- fuzzywuzzy/fuzz.py | 37 ++++++++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/fuzzywuzzy/StringMatcher.py b/fuzzywuzzy/StringMatcher.py index d35e075f..15798919 100644 --- a/fuzzywuzzy/StringMatcher.py +++ b/fuzzywuzzy/StringMatcher.py @@ -8,7 +8,7 @@ License available here: https://github.com/miohtama/python-Levenshtein/blob/master/COPYING """ -from Levenshtein import * +from Levenshtein._levenshtein import * from warnings import warn @@ -64,6 +64,16 @@ def ratio(self): self._ratio = ratio(self._str1, self._str2) return self._ratio + def setratio(self): + if(not hasattr(self, '_setratio')): + self._setratio = setratio(self._str1, self._str2) + return self._setratio + + def seqratio(self): + if(not hasattr(self, '_seqratio')): + self._seqratio = seqratio(self._str1, self._str2) + return self._seqratio + def quick_ratio(self): # This is usually quick enough :o) if not self._ratio: diff --git a/fuzzywuzzy/fuzz.py b/fuzzywuzzy/fuzz.py index 27f80c9f..d8f5b15d 100644 --- a/fuzzywuzzy/fuzz.py +++ b/fuzzywuzzy/fuzz.py @@ -27,6 +27,24 @@ def ratio(s1, s2): m = SequenceMatcher(None, s1, s2) return utils.intr(100 * m.ratio()) +@utils.check_for_none 
+@utils.check_for_equivalence +@utils.check_empty_string +def setratio(s1, s2): + s1, s2 = utils.make_type_consistent(s1, s2) + + m = SequenceMatcher(None, s1, s2) + return utils.intr(100 * m.setratio()) + +@utils.check_for_none +@utils.check_for_equivalence +@utils.check_empty_string +def seqratio(s1, s2): + s1, s2 = utils.make_type_consistent(s1, s2) + + m = SequenceMatcher(None, s1, s2) + return utils.intr(100 * m.seqratio()) + @utils.check_for_none @utils.check_for_equivalence @@ -140,23 +158,28 @@ def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True): diff1to2 = tokens1.difference(tokens2) diff2to1 = tokens2.difference(tokens1) - sorted_sect = " ".join(sorted(intersection)) - sorted_1to2 = " ".join(sorted(diff1to2)) - sorted_2to1 = " ".join(sorted(diff2to1)) + delimiter = "+++" + sorted_sect = delimiter.join(sorted(intersection)) + sorted_1to2 = delimiter.join(sorted(diff1to2)) + sorted_2to1 = delimiter.join(sorted(diff2to1)) - combined_1to2 = sorted_sect + " " + sorted_1to2 - combined_2to1 = sorted_sect + " " + sorted_2to1 + combined_1to2 = sorted_sect + delimiter + sorted_1to2 + combined_2to1 = sorted_sect + delimiter + sorted_2to1 # strip sorted_sect = sorted_sect.strip() combined_1to2 = combined_1to2.strip() combined_2to1 = combined_2to1.strip() + # replace + sorted_sect = sorted_sect.replace(delimiter, " ") + combined_1to2 = combined_1to2.replace(delimiter, " ") + combined_2to1 = combined_2to1.replace(delimiter, " ") + if partial: ratio_func = partial_ratio else: - ratio_func = ratio - + ratio_func = setratio pairwise = [ ratio_func(sorted_sect, combined_1to2), ratio_func(sorted_sect, combined_2to1), From ee940a7ca23cbb0890dc3a964cef2fd384d1201a Mon Sep 17 00:00:00 2001 From: Michael Lever <8038441+MWLever@users.noreply.github.com> Date: Sun, 21 Feb 2021 11:47:24 -0500 Subject: [PATCH 2/2] make pep8 compliant --- fuzzywuzzy/StringMatcher.py | 2 +- fuzzywuzzy/fuzz.py | 11 ++++++++--- test_fuzzywuzzy.py | 1 + 3 files changed, 10 
insertions(+), 4 deletions(-) diff --git a/fuzzywuzzy/StringMatcher.py b/fuzzywuzzy/StringMatcher.py index 15798919..c24c73f5 100644 --- a/fuzzywuzzy/StringMatcher.py +++ b/fuzzywuzzy/StringMatcher.py @@ -68,7 +68,7 @@ def setratio(self): if(not hasattr(self, '_setratio')): self._setratio = setratio(self._str1, self._str2) return self._setratio - + def seqratio(self): if(not hasattr(self, '_seqratio')): self._seqratio = seqratio(self._str1, self._str2) diff --git a/fuzzywuzzy/fuzz.py b/fuzzywuzzy/fuzz.py index d8f5b15d..5b69e146 100644 --- a/fuzzywuzzy/fuzz.py +++ b/fuzzywuzzy/fuzz.py @@ -8,7 +8,8 @@ from .StringMatcher import StringMatcher as SequenceMatcher except ImportError: if platform.python_implementation() != "PyPy": - warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') + warnings.warn( + 'Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') from difflib import SequenceMatcher from . import utils @@ -27,6 +28,7 @@ def ratio(s1, s2): m = SequenceMatcher(None, s1, s2) return utils.intr(100 * m.ratio()) + @utils.check_for_none @utils.check_for_equivalence @utils.check_empty_string @@ -36,6 +38,7 @@ def setratio(s1, s2): m = SequenceMatcher(None, s1, s2) return utils.intr(100 * m.setratio()) + @utils.check_for_none @utils.check_for_equivalence @utils.check_empty_string @@ -142,8 +145,10 @@ def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True): if not full_process and s1 == s2: return 100 - p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1 - p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2 + p1 = utils.full_process( + s1, force_ascii=force_ascii) if full_process else s1 + p2 = utils.full_process( + s2, force_ascii=force_ascii) if full_process else s2 if not utils.validate_string(p1): return 0 diff --git a/test_fuzzywuzzy.py b/test_fuzzywuzzy.py index 58617b68..7bf3e422 100644 --- 
a/test_fuzzywuzzy.py +++ b/test_fuzzywuzzy.py @@ -140,6 +140,7 @@ def testTokenSetRatio(self): self.assertEqual(fuzz.token_set_ratio(self.s9, self.s9a, full_process=True), 100) self.assertEqual(fuzz.token_set_ratio(self.s9, self.s9a, full_process=False), 100) self.assertEqual(fuzz.token_set_ratio(self.s10, self.s10a, full_process=False), 50) + def testPartialTokenSetRatio(self): self.assertEqual(fuzz.partial_token_set_ratio(self.s4, self.s7), 100)