Skip to content

Commit

Permalink
Merge pull request #5 from medecau/master
Browse files Browse the repository at this point in the history
Speed improvements
  • Loading branch information
Adam Cohen committed Nov 18, 2011
2 parents cbfa45c + def73d6 commit 16d396b
Show file tree
Hide file tree
Showing 5 changed files with 151 additions and 37 deletions.
54 changes: 54 additions & 0 deletions fuzzywuzzy/benchmarks.py
@@ -0,0 +1,54 @@
# -*- coding: utf8 -*-

from timeit import timeit
import utils

# Number of times each benchmarked statement is executed by timeit.
iterations = 10 * 100000

# Variations on one event title, from close matches to reordered wording.
cirque_strings = [
    "cirque du soleil - zarkana - las vegas",
    "cirque du soleil ",
    "cirque du soleil las vegas",
    "zarkana las vegas",
    "las vegas cirque du soleil at the bellagio",
    "zarakana - cirque du soleil - bellagio",
]

# Candidate list that deliberately includes empty strings and None entries.
choices = [
    "",
    "new york yankees vs boston red sox",
    "",
    "zarakana - cirque du soleil - bellagio",
    None,
    "cirque du soleil las vegas",
    None,
]

# Plain ASCII text, literals containing non-ASCII characters, and explicit
# unicode literals with escapes outside the ASCII range.
mixed_strings = [
    "Lorem Ipsum is simply dummy text of the printing and typesetting industry.",
    "C\\'est la vie",
    "Ça va?",
    "Cães danados",
    u"\xacCamarões assados",
    u"a\xac\u1234\u20ac\U00008000",
]


def _run_benchmark(call_template, samples):
    """Time ``call_template % sample`` for every sample and print the result.

    ``call_template`` is a statement containing one ``%s`` placeholder inside
    quotes.  Because each sample is interpolated into a quoted literal, a
    ``None`` sample is timed as the string ``'None'``, not as ``None``.
    Each statement runs ``iterations`` times with ``import utils`` as setup,
    and the elapsed time is rounded to four decimals before printing.
    """
    for sample in samples:
        print('Test for string: "%s"' % sample)
        elapsed = timeit(call_template % sample, "import utils", number=iterations)
        print('New: %f' % round(elapsed, 4))


# The original script also carried commented-out "Old:" timings that called
# utils.validate_stringold / asciidammitold / full_processold the same way.
_run_benchmark("utils.validate_string('%s')", choices)

print('')

_run_benchmark("utils.asciidammit('%s')", mixed_strings)

print('')

_run_benchmark("utils.full_process('%s')", mixed_strings + cirque_strings + choices)
22 changes: 20 additions & 2 deletions fuzzywuzzy/fuzz.py
Expand Up @@ -3,8 +3,26 @@
"""
fuzz.py
Created by Adam Cohen on 2011-07-01.
Copyright (c) 2011 Adam Cohen. All rights reserved.
Copyright (c) 2011 Adam Cohen
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""

import sys
Expand Down
22 changes: 20 additions & 2 deletions fuzzywuzzy/process.py
Expand Up @@ -3,8 +3,26 @@
"""
process.py
Created by Adam Cohen on 2011-07-01.
Copyright (c) 2011 Adam Cohen. All rights reserved.
Copyright (c) 2011 Adam Cohen
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
from fuzz import *

Expand Down
35 changes: 32 additions & 3 deletions fuzzywuzzy/tests.py
@@ -1,10 +1,38 @@
# -*- coding: utf8 -*-

from fuzz import *
import process
import utils

import itertools
import unittest

class UtilsTest(unittest.TestCase):
    """Tests for the string helpers in the ``utils`` module."""

    def setUp(self):
        """Create the sample strings used by the tests."""
        self.s1 = "new york mets"
        self.s1a = "new york mets"
        self.s2 = "new YORK mets"
        self.s3 = "the wonderful new york mets"
        self.s4 = "new york mets vs atlanta braves"
        self.s5 = "atlanta braves vs new york mets"
        self.s6 = "new york mets - atlanta braves"
        # Plain ASCII, literals with non-ASCII characters, and unicode
        # literals with escapes outside the ASCII range.
        self.mixed_strings = [
            "Lorem Ipsum is simply dummy text of the printing and typesetting industry.",
            "C'est la vie",
            "Ça va?",
            "Cães danados",
            u"\xacCamarões assados",
            u"a\xac\u1234\u20ac\U00008000"
        ]

    def test_asciidammit(self):
        """asciidammit accepts every sample and returns ASCII-only text."""
        for s in self.mixed_strings:
            cleaned = utils.asciidammit(s)
            # Check the result instead of discarding it; otherwise this test
            # can only fail when asciidammit raises.
            # assertTrue is used so the test also runs on Python 2.6.
            self.assertTrue(
                all(ord(ch) < 128 for ch in cleaned),
                "non-ASCII character left in %r" % (cleaned,))

class RatioTest(unittest.TestCase):

def setUp(self):
Expand Down Expand Up @@ -177,12 +205,13 @@ def testWithCutoff(self):
# we don't want to randomly match to something, so we use a reasonable cutoff

best = process.extractOne(query, choices, score_cutoff=50)
self.assertIsNone(best)
self.assertTrue(best is None)
#self.assertIsNone(best) # unittest.TestCase did not have assertIsNone until Python 2.7

# however if we had no cutoff, something would get returned

best = process.extractOne(query, choices)
self.assertIsNotNone(best)
#best = process.extractOne(query, choices)
#self.assertIsNotNone(best)

def testEmptyStrings(self):
choices = [
Expand Down
55 changes: 25 additions & 30 deletions fuzzywuzzy/utils.py
@@ -1,41 +1,36 @@
# encode as string, decode as unicode bytes
def asciidammit(x):
if type(x) is str:
try:
return x.decode('ascii')
except:
return x.decode('ascii', 'ignore')
elif type(x) is unicode:
try:
s = x.encode('ascii')
return s.decode('ascii')
except:
s = x.encode('ascii', 'ignore')
return s.decode('ascii')
else:
x = unicode(x)
return asciidammit(x)
import string

# Every 8-bit character outside the ASCII range (ordinals 128 through 255).
# Passed as the ``deletechars`` argument of ``translate`` elsewhere in this
# module to strip non-ASCII bytes.  Built with a single join rather than
# repeated string concatenation in a loop.
bad_chars = ''.join(chr(i) for i in range(128, 256))
# Translation table used with ``translate``: every punctuation character maps
# to a space and every ASCII uppercase letter maps to its lowercase form.
table_from=string.punctuation+string.ascii_uppercase
# One space per punctuation character keeps table_to the same length as
# table_from, which string.maketrans requires.
table_to=' '*len(string.punctuation)+string.ascii_lowercase
trans_table=string.maketrans(table_from, table_to)


def remove_punctuation(s):
if s is None: return s
s = s.replace(","," ").replace("."," ").replace("-"," ").replace(":"," ")
return s
def asciionly(s):
    """Return byte string *s* with every character in ``bad_chars`` deleted."""
    # A table of None means no characters are remapped; translate only
    # deletes the characters listed in bad_chars.
    stripped = s.translate(None, bad_chars)
    return stripped

# remove non-ASCII characters from strings
def asciidammit(s):
    """Return *s* as a byte string with non-ASCII characters removed.

    Exact ``str`` instances are filtered directly.  Exact ``unicode``
    instances are first encoded to ASCII, ignoring characters that cannot be
    encoded.  Any other object is converted with ``unicode()`` and processed
    again.
    """
    kind = type(s)
    if kind is str:
        return asciionly(s)
    if kind is not unicode:
        # Neither str nor unicode: coerce to unicode and retry.
        return asciidammit(unicode(s))
    return asciionly(s.encode('ascii', 'ignore'))

def validate_string(s):
if s is None: return False
try:
if len(s) == 0: return False
if len(s)>0:
return True
else:
return False
except:
return False
return True

def full_process(s):
s = s.lower()
s = s.strip()
s = remove_punctuation(s)
x = asciidammit(s)
return x

return s.translate(trans_table, bad_chars).strip()



Expand Down

0 comments on commit 16d396b

Please sign in to comment.