Skip to content

Commit

Permalink
Merge pull request #10 from maxbachmann/master
Browse files (browse the repository at this point in the history)
replace python-Levenshtein with rapidfuzz
  • Loading branch information
bigtoast committed Aug 18, 2023
2 parents 04deff5 + c2cd4f4 commit 681abb2
Show file tree
Hide file tree
Showing 15 changed files with 259 additions and 599 deletions.
11 changes: 6 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ jobs:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
test-cmd: [pytest]
include:
#- python-version: pypy-3.8
# test-cmd: pytest test_thefuzz.py test_thefuzz_pytest.py
- python-version: "3.7"
test-cmd: python setup.py check --restructuredtext --strict --metadata
- python-version: "3.10"
Expand All @@ -26,7 +24,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip setuptools wheel
pip install pytest pycodestyle docutils Pygments hypothesis python-Levenshtein
pip install pytest pycodestyle docutils Pygments hypothesis
- name: Install project
run: pip install .

- name: Test with pytest
run: |
${{ matrix.test-cmd }}
run: ${{ matrix.test-cmd }}
12 changes: 2 additions & 10 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@ Requirements
============

- Python 3.7 or higher
- difflib
- `python-Levenshtein <https://github.com/ztane/python-Levenshtein/>`_ (optional, provides a 4-10x speedup in String
Matching, though may result in `differing results for certain cases <https://github.com/seatgeek/fuzzywuzzy/issues/128>`_)
- `rapidfuzz <https://github.com/maxbachmann/RapidFuzz/>`_

For testing
~~~~~~~~~~~
Expand All @@ -29,12 +27,6 @@ Using PIP via PyPI
pip install thefuzz
or the following to install `python-Levenshtein` too

.. code:: bash
pip install thefuzz[speedup]
Using PIP via Github

Expand Down Expand Up @@ -110,7 +102,7 @@ Partial Token Sort Ratio
84
>>> fuzz.partial_token_sort_ratio("fuzzy was a bear", "wuzzy fuzzy was a bear")
100
Process
~~~~~~~

Expand Down
23 changes: 7 additions & 16 deletions benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
]

common_setup = "from thefuzz import fuzz, utils; "
basic_setup = "from thefuzz.string_processing import StringProcessor;"


def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
Expand All @@ -55,48 +54,42 @@ def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
duration, avg_duration * (1000 ** -thousands), units[-thousands]))


for s in choices:
print('Test validate_string for: "%s"' % s)
print_result_from_timeit('utils.validate_string(\'%s\')' % s, common_setup, number=iterations)

print('')

for s in mixed_strings + cirque_strings + choices:
print('Test full_process for: "%s"' % s)
print_result_from_timeit('utils.full_process(u\'%s\')' % s,
common_setup + basic_setup, number=iterations)
common_setup, number=iterations)

# benchmarking the core matching methods...

for s in cirque_strings:
print('Test fuzz.ratio for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('fuzz.ratio(u\'cirque du soleil\', u\'%s\')' % s,
common_setup + basic_setup, number=iterations / 100)
common_setup, number=iterations / 100)

for s in cirque_strings:
print('Test fuzz.partial_ratio for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('fuzz.partial_ratio(u\'cirque du soleil\', u\'%s\')'
% s, common_setup + basic_setup, number=iterations / 100)
% s, common_setup, number=iterations / 100)

for s in cirque_strings:
print('Test fuzz.WRatio for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('fuzz.WRatio(u\'cirque du soleil\', u\'%s\')' % s,
common_setup + basic_setup, number=iterations / 100)
common_setup, number=iterations / 100)

print('Test process.extract(scorer = fuzz.QRatio) for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('process.extract(u\'cirque du soleil\', choices, scorer = fuzz.QRatio)',
common_setup + basic_setup + " from thefuzz import process; import string,random; random.seed(18);"
common_setup + " from thefuzz import process; import string,random; random.seed(18);"
" choices = [\'\'.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(30)) for s in range(5000)]",
number=10)

print('Test process.extract(scorer = fuzz.WRatio) for string: "%s"' % s)
print('-------------------------------')
print_result_from_timeit('process.extract(u\'cirque du soleil\', choices, scorer = fuzz.WRatio)',
common_setup + basic_setup + " from thefuzz import process; import string,random; random.seed(18);"
common_setup + " from thefuzz import process; import string,random; random.seed(18);"
" choices = [\'\'.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(30)) for s in range(5000)]",
number=10)

Expand All @@ -114,6 +107,4 @@ def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
print('-------------------------------')
test += 'prepared_ratio = functools.partial(fuzz.ratio, "%s")\n' % s
test += 'titles.sort(key=prepared_ratio)\n'
print_result_from_timeit(test,
common_setup + basic_setup,
number=100)
print_result_from_timeit(test, common_setup, number=100)
20 changes: 8 additions & 12 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,20 @@
# This file is part of thefuzz.

from thefuzz import __version__
import os

try:
from setuptools import setup
except ImportError:
from distutils.core import setup


def open_file(fname):
return open(os.path.join(os.path.dirname(__file__), fname))
from setuptools import setup

with open('README.rst') as f:
long_description = f.read()

setup(
name='thefuzz',
version=__version__,
author='Adam Cohen',
author_email='adam@seatgeek.com',
packages=['thefuzz'],
extras_require={'speedup': ['python-levenshtein>=0.12']},
# keep for backwards compatibility with projects depending on `thefuzz[speedup]`
extras_require={'speedup': []},
install_requires= ['rapidfuzz>=3.0.0, < 4.0.0'],
url='https://github.com/seatgeek/thefuzz',
license="GPLv2",
classifiers=[
Expand All @@ -39,6 +34,7 @@ def open_file(fname):
'Programming Language :: Python :: 3 :: Only',
],
description='Fuzzy string matching in python',
long_description=open_file('README.rst').read(),
long_description=long_description,
zip_safe=True,
python_requires='>=3.7'
)
82 changes: 42 additions & 40 deletions test_thefuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,40 @@
from thefuzz import fuzz
from thefuzz import process
from thefuzz import utils
from thefuzz.string_processing import StringProcessor

scorers = [
fuzz.ratio,
fuzz.partial_ratio,
fuzz.token_sort_ratio,
fuzz.token_set_ratio,
fuzz.partial_token_sort_ratio,
fuzz.partial_token_set_ratio,
fuzz.QRatio,
fuzz.UQRatio,
fuzz.WRatio,
fuzz.UWRatio,
]

class StringProcessingTest(unittest.TestCase):
def test_replace_non_letters_non_numbers_with_whitespace(self):
strings = ["new york mets - atlanta braves", "Cães danados",
"New York //// Mets $$$", "Ça va?"]
for string in strings:
proc_string = StringProcessor.replace_non_letters_non_numbers_with_whitespace(string)
proc_string = utils.full_process(string)
regex = re.compile(r"(?ui)[\W]")
for expr in regex.finditer(proc_string):
self.assertEqual(expr.group(), " ")

def test_dont_condense_whitespace(self):
s1 = "new york mets - atlanta braves"
s2 = "new york mets atlanta braves"
p1 = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s1)
p2 = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s2)
self.assertNotEqual(p1, p2)
s3 = "new york mets atlanta braves"
p1 = utils.full_process(s1)
p2 = utils.full_process(s2)
p3 = utils.full_process(s3)
self.assertEqual(p1, s3)
self.assertEqual(p2, s2)
self.assertEqual(p3, s3)


class UtilsTest(unittest.TestCase):
Expand Down Expand Up @@ -120,7 +135,8 @@ def testPartialTokenSortRatio(self):
self.assertEqual(fuzz.partial_token_sort_ratio(self.s8, self.s8a, full_process=False), 100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s9, self.s9a, full_process=True), 100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s9, self.s9a, full_process=False), 100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s10, self.s10a, full_process=False), 50)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s10, self.s10a, full_process=False), 67)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s10a, self.s10, full_process=False), 67)

def testTokenSetRatio(self):
self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5), 100)
Expand Down Expand Up @@ -243,58 +259,44 @@ def testQRatioForceAscii(self):
score = fuzz.WRatio(s1, s2, force_ascii=False)
self.assertLess(score, 100)

def testTokenSetForceAscii(self):
def testPartialTokenSetRatioForceAscii(self):
s1 = "ABCD\u00C1 HELP\u00C1"
s2 = "ABCD HELP"

score = fuzz._token_set(s1, s2, force_ascii=True)
score = fuzz.partial_token_set_ratio(s1, s2, force_ascii=True)
self.assertEqual(score, 100)

score = fuzz._token_set(s1, s2, force_ascii=False)
score = fuzz.partial_token_set_ratio(s1, s2, force_ascii=False)
self.assertLess(score, 100)

def testTokenSortForceAscii(self):
def testPartialTokenSortRatioForceAscii(self):
s1 = "ABCD\u00C1 HELP\u00C1"
s2 = "ABCD HELP"

score = fuzz._token_sort(s1, s2, force_ascii=True)
score = fuzz.partial_token_sort_ratio(s1, s2, force_ascii=True)
self.assertEqual(score, 100)

score = fuzz._token_sort(s1, s2, force_ascii=False)
score = fuzz.partial_token_sort_ratio(s1, s2, force_ascii=False)
self.assertLess(score, 100)


class ValidatorTest(unittest.TestCase):
def setUp(self):
self.testFunc = lambda *args, **kwargs: (args, kwargs)

def testCheckForNone(self):
invalid_input = [
(None, None),
('Some', None),
(None, 'Some')
]
decorated_func = utils.check_for_none(self.testFunc)
for i in invalid_input:
self.assertEqual(decorated_func(*i), 0)
for scorer in scorers:
self.assertEqual(scorer(None, None), 0)
self.assertEqual(scorer('Some', None), 0)
self.assertEqual(scorer(None, 'Some'), 0)

valid_input = ('Some', 'Some')
actual = decorated_func(*valid_input)
self.assertNotEqual(actual, 0)
self.assertNotEqual(scorer('Some', 'Some'), 0)

def testCheckEmptyString(self):
invalid_input = [
('', ''),
('Some', ''),
('', 'Some')
]
decorated_func = utils.check_empty_string(self.testFunc)
for i in invalid_input:
self.assertEqual(decorated_func(*i), 0)

valid_input = ('Some', 'Some')
actual = decorated_func(*valid_input)
self.assertNotEqual(actual, 0)
for scorer in scorers:
if scorer in {fuzz.token_set_ratio, fuzz.partial_token_set_ratio, fuzz.WRatio, fuzz.UWRatio, fuzz.QRatio, fuzz.UQRatio}:
self.assertEqual(scorer('', ''), 0)
else:
self.assertEqual(scorer('', ''), 100)

self.assertEqual(scorer('Some', ''), 0)
self.assertEqual(scorer('', 'Some'), 0)
self.assertNotEqual(scorer('Some', 'Some'), 0)


class ProcessTest(unittest.TestCase):
Expand Down
4 changes: 2 additions & 2 deletions test_thefuzz_hypothesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from functools import partial
from string import ascii_letters, digits, punctuation

from hypothesis import given, assume, settings
from hypothesis import given, assume, settings, HealthCheck
import hypothesis.strategies as st
import pytest

Expand Down Expand Up @@ -62,7 +62,7 @@ def full_scorers_processors():
@pytest.mark.parametrize('scorer,processor',
scorers_processors())
@given(data=st.data())
@settings(max_examples=20, deadline=5000)
@settings(max_examples=20, deadline=5000, suppress_health_check=[HealthCheck.data_too_large])
def test_identical_strings_extracted(scorer, processor, data):
"""
Test that identical strings will always return a perfect match.
Expand Down

0 comments on commit 681abb2

Please sign in to comment.