Skip to content

Commit

Permalink
Merge pull request #11 from medecau/master
Browse files Browse the repository at this point in the history
Issue 10 and refactoring
  • Loading branch information
Adam Cohen committed Jul 9, 2012
2 parents c73e3d3 + dc2e341 commit 9c70844
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 51 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1 +1,3 @@
*.pyc
env
dist
File renamed without changes.
3 changes: 2 additions & 1 deletion MANIFEST
@@ -1,5 +1,6 @@
# file GENERATED by distutils, do NOT edit
README.txt
LICENSE
README.textile
setup.py
fuzzywuzzy/__init__.py
fuzzywuzzy/benchmarks.py
Expand Down
2 changes: 2 additions & 0 deletions README.textile
@@ -1,3 +1,5 @@
!https://pullstat.us/seatgeek/fuzzywuzzy/pull/5(Pull Request #5)!:https://github.com/seatgeek/fuzzywuzzy/pull/5 - Speed improvements

h1. FuzzyWuzzy

Fuzzy string matching like a boss.
Expand Down
22 changes: 11 additions & 11 deletions fuzzywuzzy/benchmarks.py → benchmarks.py
@@ -1,9 +1,9 @@
# -*- coding: utf8 -*-

from timeit import timeit
import utils
from fuzzywuzzy import utils

iterations=100000*10
iterations=100000

cirque_strings = [
"cirque du soleil - zarkana - las vegas",
Expand Down Expand Up @@ -36,36 +36,36 @@

for s in choices:
print 'Test for string: "%s"' % s
# print 'Old: %f' % round(timeit('utils.validate_stringold(\'%s\')' % s, "import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.validate_string(\'%s\')' % s, "import utils",number=iterations),4)
# print 'Old: %f' % round(timeit('utils.validate_stringold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.validate_string(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)

print

for s in mixed_strings:
print 'Test for string: "%s"' % s
#print 'Old: %f' % round(timeit('utils.asciidammitold(\'%s\')' % s, "import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.asciidammit(\'%s\')' % s, "import utils",number=iterations),4)
#print 'Old: %f' % round(timeit('utils.asciidammitold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.asciidammit(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)

print

for s in mixed_strings+cirque_strings+choices:
print 'Test for string: "%s"' % s
#print 'Old: %f' % round(timeit('utils.full_processold(\'%s\')' % s, "import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.full_process(\'%s\')' % s, "import utils",number=iterations),4)
#print 'Old: %f' % round(timeit('utils.full_processold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.full_process(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)

### benchmarking the core matching methods...

for s in cirque_strings:
print 'Test fuzz.ratio for string: "%s"' % s
print '-------------------------------'
print 'New: %f' % round(timeit('fuzz.ratio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4)
print 'New: %f' % round(timeit('fuzz.ratio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)

for s in cirque_strings:
print 'Test fuzz.partial_ratio for string: "%s"' % s
print '-------------------------------'
print 'New: %f' % round(timeit('fuzz.partial_ratio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4)
print 'New: %f' % round(timeit('fuzz.partial_ratio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)

for s in cirque_strings:
print 'Test fuzz.WRatio for string: "%s"' % s
print '-------------------------------'
print 'New: %f' % round(timeit('fuzz.WRatio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4)
print 'New: %f' % round(timeit('fuzz.WRatio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)
21 changes: 10 additions & 11 deletions fuzzywuzzy/fuzz.py
Expand Up @@ -28,10 +28,9 @@
import sys
import os
import re
import utils
from utils import *

# Prefer the C-accelerated matcher built on python-Levenshtein when it is
# installed; otherwise fall back to the pure-Python difflib implementation.
try:
    import Levenshtein
    from StringMatcher import StringMatcher as SequenceMatcher
except ImportError:
    # Only a missing optional dependency should trigger the fallback; any
    # other error raised while importing is a real problem and must surface.
    from difflib import SequenceMatcher
Expand All @@ -48,7 +47,7 @@ def ratio(s1, s2):
if s2 is None: raise TypeError("s2 is None")

m = SequenceMatcher(None, s1, s2)
return int(100 * m.ratio())
return intr(100 * m.ratio())

# todo: skip duplicate indexes for a little more speed
def partial_ratio(s1, s2):
Expand Down Expand Up @@ -178,20 +177,20 @@ def partial_token_set_ratio(s1, s2):

# q is for quick
def QRatio(s1, s2):
if not utils.validate_string(s1): return 0
if not utils.validate_string(s2): return 0
if not validate_string(s1): return 0
if not validate_string(s2): return 0

p1 = utils.full_process(s1)
p2 = utils.full_process(s2)
p1 = full_process(s1)
p2 = full_process(s2)

return ratio(p1, p2)

# w is for weighted
def WRatio(s1, s2):
p1 = utils.full_process(s1)
p2 = utils.full_process(s2)
if not utils.validate_string(p1): return 0
if not utils.validate_string(p2): return 0
p1 = full_process(s1)
p2 = full_process(s2)
if not validate_string(p1): return 0
if not validate_string(p2): return 0

# should we look at partials?
try_partial = True
Expand Down
4 changes: 3 additions & 1 deletion fuzzywuzzy/utils.py
Expand Up @@ -33,6 +33,8 @@ def full_process(s):
s = asciidammit(s)
return s.translate(trans_table, bad_chars).strip()


def intr(n):
    '''Return n rounded to the nearest integer, as an int.

    Exact halves round away from zero (0.5 -> 1, 2.5 -> 3, -2.5 -> -3).
    That is what the builtin round() does on Python 2; Python 3's round()
    rounds halves to the nearest even number instead, so the rounding is
    done by hand here to give the same result on either interpreter.
    '''
    magnitude = abs(n)
    # int() truncates toward zero, which is floor() for a non-negative value.
    whole = int(magnitude)
    # Compare the fractional part directly instead of adding 0.5 first: the
    # addition can itself round up for values just below one half.
    if magnitude - whole >= 0.5:
        whole += 1
    return whole if n >= 0 else -whole


54 changes: 27 additions & 27 deletions fuzzywuzzy/tests.py → tests.py
@@ -1,8 +1,8 @@
# -*- coding: utf8 -*-

from fuzz import *
import process
import utils
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from fuzzywuzzy import utils

import itertools
import unittest
Expand Down Expand Up @@ -75,77 +75,77 @@ def tearDown(self):
pass

def testEqual(self):
self.assertEqual(ratio(self.s1, self.s1a),100)
self.assertEqual(fuzz.ratio(self.s1, self.s1a),100)

def testCaseInsensitive(self):
self.assertNotEqual(ratio(self.s1, self.s2),100)
self.assertEqual(ratio(utils.full_process(self.s1), utils.full_process(self.s2)),100)
self.assertNotEqual(fuzz.ratio(self.s1, self.s2),100)
self.assertEqual(fuzz.ratio(utils.full_process(self.s1), utils.full_process(self.s2)),100)

def testPartialRatio(self):
self.assertEqual(partial_ratio(self.s1, self.s3),100)
self.assertEqual(fuzz.partial_ratio(self.s1, self.s3),100)

def testTokenSortRatio(self):
self.assertEqual(token_sort_ratio(self.s1, self.s1a),100)
self.assertEqual(fuzz.token_sort_ratio(self.s1, self.s1a),100)

def testPartialTokenSortRatio(self):
self.assertEqual(partial_token_sort_ratio(self.s1, self.s1a),100)
self.assertEqual(partial_token_sort_ratio(self.s4, self.s5),100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s1, self.s1a),100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s4, self.s5),100)

def testTokenSetRatio(self):
self.assertEqual(token_set_ratio(self.s4, self.s5),100)
self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5),100)

def testPartialTokenSetRatio(self):
    # Must call the *partial* variant: token_set_ratio with these same
    # arguments is already asserted by testTokenSetRatio.
    self.assertEqual(fuzz.partial_token_set_ratio(self.s4, self.s5), 100)

def testQuickRatioEqual(self):
self.assertEqual(QRatio(self.s1, self.s1a), 100)
self.assertEqual(fuzz.QRatio(self.s1, self.s1a), 100)

def testQuickRatioCaseInsensitive(self):
self.assertEqual(QRatio(self.s1, self.s2), 100)
self.assertEqual(fuzz.QRatio(self.s1, self.s2), 100)

def testQuickRatioNotEqual(self):
self.assertNotEqual(QRatio(self.s1, self.s3), 100)
self.assertNotEqual(fuzz.QRatio(self.s1, self.s3), 100)

def testWRatioEqual(self):
self.assertEqual(WRatio(self.s1, self.s1a), 100)
self.assertEqual(fuzz.WRatio(self.s1, self.s1a), 100)

def testWRatioCaseInsensitive(self):
self.assertEqual(WRatio(self.s1, self.s2), 100)
self.assertEqual(fuzz.WRatio(self.s1, self.s2), 100)

def testWRatioPartialMatch(self):
# a partial match is scaled by .9
self.assertEqual(WRatio(self.s1, self.s3), 90)
self.assertEqual(fuzz.WRatio(self.s1, self.s3), 90)

def testWRatioMisorderedMatch(self):
# misordered full matches are scaled by .95
self.assertEqual(WRatio(self.s4, self.s5), 95)
self.assertEqual(fuzz.WRatio(self.s4, self.s5), 95)

def testWRatioUnicode(self):
self.assertEqual(WRatio(unicode(self.s1), unicode(self.s1a)), 100)
self.assertEqual(fuzz.WRatio(unicode(self.s1), unicode(self.s1a)), 100)

def testQRatioUnicode(self):
    # Exercise QRatio with unicode input; WRatio with the same arguments
    # is already asserted by testWRatioUnicode.
    self.assertEqual(fuzz.QRatio(unicode(self.s1), unicode(self.s1a)), 100)

def testIssueSeven(self):
s1 = "HSINCHUANG"
s2 = "SINJHUAN"
s3 = "LSINJHUANG DISTRIC"
s4 = "SINJHUANG DISTRICT"

self.assertGreater(partial_ratio(s1, s2), 75)
self.assertGreater(partial_ratio(s1, s3), 75)
self.assertGreater(partial_ratio(s1, s4), 75)
self.assertTrue(fuzz.partial_ratio(s1, s2) > 75)
self.assertTrue(fuzz.partial_ratio(s1, s3) > 75)
self.assertTrue(fuzz.partial_ratio(s1, s4) > 75)

def testWRatioUnicodeString(self):
s1 = u"\u00C1"
s2 = "ABCD"
score = WRatio(s1, s2)
score = fuzz.WRatio(s1, s2)
self.assertEqual(0, score)

def testQRatioUnicodeString(self):
s1 = u"\u00C1"
s2 = "ABCD"
score = QRatio(s1, s2)
score = fuzz.QRatio(s1, s2)
self.assertEqual(0, score)

# test processing methods
Expand Down Expand Up @@ -218,7 +218,7 @@ def testWithScorer(self):

# in this hypothetical example we care about ordering, so we use quick ratio
query = "new york mets at chicago cubs"
scorer = QRatio
scorer = fuzz.QRatio

# first, as an example, the normal way would select the "more 'complete' match of choices[1]"

Expand Down

0 comments on commit 9c70844

Please sign in to comment.