From f006dfafd723456062a7f359ef0aa74a40c46d3d Mon Sep 17 00:00:00 2001
From: Michael Lever <8038441+MWLever@users.noreply.github.com>
Date: Sun, 21 Feb 2021 11:32:44 -0500
Subject: [PATCH 1/2] Modified Set Scoring

Previous issue: partial_token_set_ratio matched strings across token boundaries.

Fix: Preserve tokenization of the comparison sets and use Levenshtein's setratio/seqratio instead of ratio.

Detail:
Previously, token_set_ratio used Python's strip to remove whitespace. Since strip removes all whitespace, the set comparisons were not tokenized, so when using partial_token_set_ratio you could match strings across word boundaries. This is generally unexpected behavior. This change should allow a more bag-of-words-style comparison.
---
 fuzzywuzzy/StringMatcher.py | 12 +++++++++++-
 fuzzywuzzy/fuzz.py          | 37 ++++++++++++++++++++++++++++++-------
 2 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/fuzzywuzzy/StringMatcher.py b/fuzzywuzzy/StringMatcher.py
index d35e075f..15798919 100644
--- a/fuzzywuzzy/StringMatcher.py
+++ b/fuzzywuzzy/StringMatcher.py
@@ -8,7 +8,7 @@
 License available here: https://github.com/miohtama/python-Levenshtein/blob/master/COPYING
 """
 
-from Levenshtein import *
+from Levenshtein._levenshtein import *
 from warnings import warn
 
 
@@ -64,6 +64,16 @@ def ratio(self):
             self._ratio = ratio(self._str1, self._str2)
         return self._ratio
 
+    def setratio(self):
+        if(not hasattr(self, '_setratio')):
+            self._setratio = setratio(self._str1, self._str2)
+        return self._setratio
+    
+    def seqratio(self):
+        if(not hasattr(self, '_seqratio')):
+            self._seqratio = seqratio(self._str1, self._str2)
+        return self._seqratio
+
     def quick_ratio(self):
         # This is usually quick enough :o)
         if not self._ratio:
diff --git a/fuzzywuzzy/fuzz.py b/fuzzywuzzy/fuzz.py
index 27f80c9f..d8f5b15d 100644
--- a/fuzzywuzzy/fuzz.py
+++ b/fuzzywuzzy/fuzz.py
@@ -27,6 +27,24 @@ def ratio(s1, s2):
     m = SequenceMatcher(None, s1, s2)
     return utils.intr(100 * m.ratio())
 
+@utils.check_for_none
+@utils.check_for_equivalence
+@utils.check_empty_string
+def setratio(s1, s2):
+    s1, s2 = utils.make_type_consistent(s1, s2)
+
+    m = SequenceMatcher(None, s1, s2)
+    return utils.intr(100 * m.setratio())
+
+@utils.check_for_none
+@utils.check_for_equivalence
+@utils.check_empty_string
+def seqratio(s1, s2):
+    s1, s2 = utils.make_type_consistent(s1, s2)
+
+    m = SequenceMatcher(None, s1, s2)
+    return utils.intr(100 * m.seqratio())
+
 
 @utils.check_for_none
 @utils.check_for_equivalence
@@ -140,23 +158,28 @@ def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
     diff1to2 = tokens1.difference(tokens2)
     diff2to1 = tokens2.difference(tokens1)
 
-    sorted_sect = " ".join(sorted(intersection))
-    sorted_1to2 = " ".join(sorted(diff1to2))
-    sorted_2to1 = " ".join(sorted(diff2to1))
+    delimiter = "+++"
+    sorted_sect = delimiter.join(sorted(intersection))
+    sorted_1to2 = delimiter.join(sorted(diff1to2))
+    sorted_2to1 = delimiter.join(sorted(diff2to1))
 
-    combined_1to2 = sorted_sect + " " + sorted_1to2
-    combined_2to1 = sorted_sect + " " + sorted_2to1
+    combined_1to2 = sorted_sect + delimiter + sorted_1to2
+    combined_2to1 = sorted_sect + delimiter + sorted_2to1
 
     # strip
     sorted_sect = sorted_sect.strip()
     combined_1to2 = combined_1to2.strip()
     combined_2to1 = combined_2to1.strip()
 
+    # replace
+    sorted_sect = sorted_sect.replace(delimiter, " ")
+    combined_1to2 = combined_1to2.replace(delimiter, " ")
+    combined_2to1 = combined_2to1.replace(delimiter, " ")
+
     if partial:
         ratio_func = partial_ratio
     else:
-        ratio_func = ratio
-
+        ratio_func = setratio
     pairwise = [
         ratio_func(sorted_sect, combined_1to2),
         ratio_func(sorted_sect, combined_2to1),

From ee940a7ca23cbb0890dc3a964cef2fd384d1201a Mon Sep 17 00:00:00 2001
From: Michael Lever <8038441+MWLever@users.noreply.github.com>
Date: Sun, 21 Feb 2021 11:47:24 -0500
Subject: [PATCH 2/2] make pep8 compliant

---
 fuzzywuzzy/StringMatcher.py |  2 +-
 fuzzywuzzy/fuzz.py          | 11 ++++++++---
 test_fuzzywuzzy.py          |  1 +
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/fuzzywuzzy/StringMatcher.py b/fuzzywuzzy/StringMatcher.py
index 15798919..c24c73f5 100644
--- a/fuzzywuzzy/StringMatcher.py
+++ b/fuzzywuzzy/StringMatcher.py
@@ -68,7 +68,7 @@ def setratio(self):
         if(not hasattr(self, '_setratio')):
             self._setratio = setratio(self._str1, self._str2)
         return self._setratio
-    
+
     def seqratio(self):
         if(not hasattr(self, '_seqratio')):
             self._seqratio = seqratio(self._str1, self._str2)
diff --git a/fuzzywuzzy/fuzz.py b/fuzzywuzzy/fuzz.py
index d8f5b15d..5b69e146 100644
--- a/fuzzywuzzy/fuzz.py
+++ b/fuzzywuzzy/fuzz.py
@@ -8,7 +8,8 @@
     from .StringMatcher import StringMatcher as SequenceMatcher
 except ImportError:
     if platform.python_implementation() != "PyPy":
-        warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
+        warnings.warn(
+            'Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
     from difflib import SequenceMatcher
 
 from . import utils
@@ -27,6 +28,7 @@ def ratio(s1, s2):
     m = SequenceMatcher(None, s1, s2)
     return utils.intr(100 * m.ratio())
 
+
 @utils.check_for_none
 @utils.check_for_equivalence
 @utils.check_empty_string
@@ -36,6 +38,7 @@ def setratio(s1, s2):
     m = SequenceMatcher(None, s1, s2)
     return utils.intr(100 * m.setratio())
 
+
 @utils.check_for_none
 @utils.check_for_equivalence
 @utils.check_empty_string
@@ -142,8 +145,10 @@ def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
     if not full_process and s1 == s2:
         return 100
 
-    p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1
-    p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2
+    p1 = utils.full_process(
+        s1, force_ascii=force_ascii) if full_process else s1
+    p2 = utils.full_process(
+        s2, force_ascii=force_ascii) if full_process else s2
 
     if not utils.validate_string(p1):
         return 0
diff --git a/test_fuzzywuzzy.py b/test_fuzzywuzzy.py
index 58617b68..7bf3e422 100644
--- a/test_fuzzywuzzy.py
+++ b/test_fuzzywuzzy.py
@@ -140,6 +140,7 @@ def testTokenSetRatio(self):
         self.assertEqual(fuzz.token_set_ratio(self.s9, self.s9a, full_process=True), 100)
         self.assertEqual(fuzz.token_set_ratio(self.s9, self.s9a, full_process=False), 100)
         self.assertEqual(fuzz.token_set_ratio(self.s10, self.s10a, full_process=False), 50)
+        
 
     def testPartialTokenSetRatio(self):
         self.assertEqual(fuzz.partial_token_set_ratio(self.s4, self.s7), 100)