Skip to content
Browse files

Merge pull request #11 from medecau/master

Issue 10 and refactoring
  • Loading branch information...
2 parents c73e3d3 + dc2e341 commit 9c70844fa6669455f99aeb2668c48aac5d595f22 @acslater00 acslater00 committed Jul 8, 2012
Showing with 57 additions and 51 deletions.
  1. +2 −0 .gitignore
  2. 0 LICENSE.txt → LICENSE
  3. +2 −1 MANIFEST
  4. +2 −0 README.textile
  5. +11 −11 {fuzzywuzzy → }/benchmarks.py
  6. +10 −11 fuzzywuzzy/fuzz.py
  7. +3 −1 fuzzywuzzy/utils.py
  8. +27 −27 {fuzzywuzzy → }/tests.py
View
2 .gitignore
@@ -1 +1,3 @@
*.pyc
+env
+dist
View
0 LICENSE.txt → LICENSE
File renamed without changes.
View
3 MANIFEST
@@ -1,5 +1,6 @@
# file GENERATED by distutils, do NOT edit
-README.txt
+LICENSE
+README.textile
setup.py
fuzzywuzzy/__init__.py
fuzzywuzzy/benchmarks.py
View
2 README.textile
@@ -1,3 +1,5 @@
+!https://pullstat.us/seatgeek/fuzzywuzzy/pull/5(Pull Request #5)!:https://github.com/seatgeek/fuzzywuzzy/pull/5 - Speed improvements
+
h1. FuzzyWuzzy
Fuzzy string matching like a boss.
View
22 fuzzywuzzy/benchmarks.py → benchmarks.py
@@ -1,9 +1,9 @@
# -*- coding: utf8 -*-
from timeit import timeit
-import utils
+from fuzzywuzzy import utils
-iterations=100000*10
+iterations=100000
cirque_strings = [
"cirque du soleil - zarkana - las vegas",
@@ -36,36 +36,36 @@
for s in choices:
print 'Test for string: "%s"' % s
- # print 'Old: %f' % round(timeit('utils.validate_stringold(\'%s\')' % s, "import utils",number=iterations),4)
- print 'New: %f' % round(timeit('utils.validate_string(\'%s\')' % s, "import utils",number=iterations),4)
+ # print 'Old: %f' % round(timeit('utils.validate_stringold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
+ print 'New: %f' % round(timeit('utils.validate_string(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print
for s in mixed_strings:
print 'Test for string: "%s"' % s
- #print 'Old: %f' % round(timeit('utils.asciidammitold(\'%s\')' % s, "import utils",number=iterations),4)
- print 'New: %f' % round(timeit('utils.asciidammit(\'%s\')' % s, "import utils",number=iterations),4)
+ #print 'Old: %f' % round(timeit('utils.asciidammitold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
+ print 'New: %f' % round(timeit('utils.asciidammit(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print
for s in mixed_strings+cirque_strings+choices:
print 'Test for string: "%s"' % s
- #print 'Old: %f' % round(timeit('utils.full_processold(\'%s\')' % s, "import utils",number=iterations),4)
- print 'New: %f' % round(timeit('utils.full_process(\'%s\')' % s, "import utils",number=iterations),4)
+ #print 'Old: %f' % round(timeit('utils.full_processold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
+ print 'New: %f' % round(timeit('utils.full_process(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
### benchmarking the core matching methods...
for s in cirque_strings:
print 'Test fuzz.ratio for string: "%s"' % s
print '-------------------------------'
- print 'New: %f' % round(timeit('fuzz.ratio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4)
+ print 'New: %f' % round(timeit('fuzz.ratio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)
for s in cirque_strings:
print 'Test fuzz.partial_ratio for string: "%s"' % s
print '-------------------------------'
- print 'New: %f' % round(timeit('fuzz.partial_ratio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4)
+ print 'New: %f' % round(timeit('fuzz.partial_ratio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)
for s in cirque_strings:
print 'Test fuzz.WRatio for string: "%s"' % s
print '-------------------------------'
- print 'New: %f' % round(timeit('fuzz.WRatio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4)
+ print 'New: %f' % round(timeit('fuzz.WRatio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)
View
21 fuzzywuzzy/fuzz.py
@@ -28,10 +28,9 @@
import sys
import os
import re
-import utils
+from utils import *
try:
- import Levenshtein
from StringMatcher import StringMatcher as SequenceMatcher
except:
from difflib import SequenceMatcher
@@ -48,7 +47,7 @@ def ratio(s1, s2):
if s2 is None: raise TypeError("s2 is None")
m = SequenceMatcher(None, s1, s2)
- return int(100 * m.ratio())
+ return intr(100 * m.ratio())
# todo: skip duplicate indexes for a little more speed
def partial_ratio(s1, s2):
@@ -178,20 +177,20 @@ def partial_token_set_ratio(s1, s2):
# q is for quick
def QRatio(s1, s2):
- if not utils.validate_string(s1): return 0
- if not utils.validate_string(s2): return 0
+ if not validate_string(s1): return 0
+ if not validate_string(s2): return 0
- p1 = utils.full_process(s1)
- p2 = utils.full_process(s2)
+ p1 = full_process(s1)
+ p2 = full_process(s2)
return ratio(p1, p2)
# w is for weighted
def WRatio(s1, s2):
- p1 = utils.full_process(s1)
- p2 = utils.full_process(s2)
- if not utils.validate_string(p1): return 0
- if not utils.validate_string(p2): return 0
+ p1 = full_process(s1)
+ p2 = full_process(s2)
+ if not validate_string(p1): return 0
+ if not validate_string(p2): return 0
# should we look at partials?
try_partial = True
View
4 fuzzywuzzy/utils.py
@@ -33,6 +33,8 @@ def full_process(s):
s = asciidammit(s)
return s.translate(trans_table, bad_chars).strip()
-
+def intr(n):
+ '''Returns a correctly rounded integer'''
+ return int(round(n))
View
54 fuzzywuzzy/tests.py → tests.py
@@ -1,8 +1,8 @@
# -*- coding: utf8 -*-
-from fuzz import *
-import process
-import utils
+from fuzzywuzzy import fuzz
+from fuzzywuzzy import process
+from fuzzywuzzy import utils
import itertools
import unittest
@@ -75,77 +75,77 @@ def tearDown(self):
pass
def testEqual(self):
- self.assertEqual(ratio(self.s1, self.s1a),100)
+ self.assertEqual(fuzz.ratio(self.s1, self.s1a),100)
def testCaseInsensitive(self):
- self.assertNotEqual(ratio(self.s1, self.s2),100)
- self.assertEqual(ratio(utils.full_process(self.s1), utils.full_process(self.s2)),100)
+ self.assertNotEqual(fuzz.ratio(self.s1, self.s2),100)
+ self.assertEqual(fuzz.ratio(utils.full_process(self.s1), utils.full_process(self.s2)),100)
def testPartialRatio(self):
- self.assertEqual(partial_ratio(self.s1, self.s3),100)
+ self.assertEqual(fuzz.partial_ratio(self.s1, self.s3),100)
def testTokenSortRatio(self):
- self.assertEqual(token_sort_ratio(self.s1, self.s1a),100)
+ self.assertEqual(fuzz.token_sort_ratio(self.s1, self.s1a),100)
def testPartialTokenSortRatio(self):
- self.assertEqual(partial_token_sort_ratio(self.s1, self.s1a),100)
- self.assertEqual(partial_token_sort_ratio(self.s4, self.s5),100)
+ self.assertEqual(fuzz.partial_token_sort_ratio(self.s1, self.s1a),100)
+ self.assertEqual(fuzz.partial_token_sort_ratio(self.s4, self.s5),100)
def testTokenSetRatio(self):
- self.assertEqual(token_set_ratio(self.s4, self.s5),100)
+ self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5),100)
def testPartialTokenSetRatio(self):
- self.assertEqual(partial_token_set_ratio(self.s4, self.s5),100)
+ self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5),100)
def testQuickRatioEqual(self):
- self.assertEqual(QRatio(self.s1, self.s1a), 100)
+ self.assertEqual(fuzz.QRatio(self.s1, self.s1a), 100)
def testQuickRatioCaseInsensitive(self):
- self.assertEqual(QRatio(self.s1, self.s2), 100)
+ self.assertEqual(fuzz.QRatio(self.s1, self.s2), 100)
def testQuickRatioNotEqual(self):
- self.assertNotEqual(QRatio(self.s1, self.s3), 100)
+ self.assertNotEqual(fuzz.QRatio(self.s1, self.s3), 100)
def testWRatioEqual(self):
- self.assertEqual(WRatio(self.s1, self.s1a), 100)
+ self.assertEqual(fuzz.WRatio(self.s1, self.s1a), 100)
def testWRatioCaseInsensitive(self):
- self.assertEqual(WRatio(self.s1, self.s2), 100)
+ self.assertEqual(fuzz.WRatio(self.s1, self.s2), 100)
def testWRatioPartialMatch(self):
# a partial match is scaled by .9
- self.assertEqual(WRatio(self.s1, self.s3), 90)
+ self.assertEqual(fuzz.WRatio(self.s1, self.s3), 90)
def testWRatioMisorderedMatch(self):
# misordered full matches are scaled by .95
- self.assertEqual(WRatio(self.s4, self.s5), 95)
+ self.assertEqual(fuzz.WRatio(self.s4, self.s5), 95)
def testWRatioUnicode(self):
- self.assertEqual(WRatio(unicode(self.s1), unicode(self.s1a)), 100)
+ self.assertEqual(fuzz.WRatio(unicode(self.s1), unicode(self.s1a)), 100)
def testQRatioUnicode(self):
- self.assertEqual(WRatio(unicode(self.s1), unicode(self.s1a)), 100)
+ self.assertEqual(fuzz.WRatio(unicode(self.s1), unicode(self.s1a)), 100)
def testIssueSeven(self):
s1 = "HSINCHUANG"
s2 = "SINJHUAN"
s3 = "LSINJHUANG DISTRIC"
s4 = "SINJHUANG DISTRICT"
- self.assertGreater(partial_ratio(s1, s2), 75)
- self.assertGreater(partial_ratio(s1, s3), 75)
- self.assertGreater(partial_ratio(s1, s4), 75)
+ self.assertTrue(fuzz.partial_ratio(s1, s2) > 75)
+ self.assertTrue(fuzz.partial_ratio(s1, s3) > 75)
+ self.assertTrue(fuzz.partial_ratio(s1, s4) > 75)
def testWRatioUnicodeString(self):
s1 = u"\u00C1"
s2 = "ABCD"
- score = WRatio(s1, s2)
+ score = fuzz.WRatio(s1, s2)
self.assertEqual(0, score)
def testQRatioUnicodeString(self):
s1 = u"\u00C1"
s2 = "ABCD"
- score = QRatio(s1, s2)
+ score = fuzz.QRatio(s1, s2)
self.assertEqual(0, score)
# test processing methods
@@ -218,7 +218,7 @@ def testWithScorer(self):
# in this hypothetical example we care about ordering, so we use quick ratio
query = "new york mets at chicago cubs"
- scorer = QRatio
+ scorer = fuzz.QRatio
# first, as an example, the normal way would select the "more 'complete' match of choices[1]"

0 comments on commit 9c70844

Please sign in to comment.
Something went wrong with that request. Please try again.