Permalink
Browse files

looks for python-Levenshtein package, and if present, uses that instead of difflib.

10x speedup if present. 
add benchmarks
  • Loading branch information...
1 parent 4bf2816 commit 9cf369eec555fa8098e8fa48be7d608d690d7a97 @acslater00 acslater00 committed Apr 5, 2012
Showing with 102 additions and 1 deletion.
  1. +78 −0 fuzzywuzzy/StringMatcher.py
  2. +17 −0 fuzzywuzzy/benchmarks.py
  3. +7 −1 fuzzywuzzy/fuzz.py
View
78 fuzzywuzzy/StringMatcher.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+StringMatcher.py
+
+ported from python-Levenshtein
+[https://github.com/miohtama/python-Levenshtein]
+"""
+
+from Levenshtein import *
+from warnings import warn
+
class StringMatcher:
    """A SequenceMatcher-like class built on top of Levenshtein.

    Mirrors the subset of the ``difflib.SequenceMatcher`` interface used for
    fuzzy matching.  The heavy lifting is delegated to the module-level
    functions ``ratio``, ``distance``, ``opcodes``, ``editops`` and
    ``matching_blocks`` (presumably supplied by the module's
    ``from Levenshtein import *``).

    Every computed result is cached on the instance until one of the
    sequences is replaced.  ``None`` is the "not computed yet" marker, so
    falsy results (a ratio of ``0.0``, a distance of ``0``, an empty list of
    operations) are cached as well instead of being recomputed on each call.
    """

    def _reset_cache(self):
        """Forget every cached result; called whenever a sequence changes."""
        self._ratio = self._distance = None
        self._opcodes = self._editops = self._matching_blocks = None

    def __init__(self, isjunk=None, seq1='', seq2=''):
        """Create a matcher for ``seq1`` and ``seq2``.

        ``isjunk`` exists only for signature compatibility with
        ``difflib.SequenceMatcher``; a truthy value triggers a warning and is
        otherwise ignored.
        """
        if isjunk:
            warn("isjunk is NOT implemented, it will be ignored")
        self._str1, self._str2 = seq1, seq2
        self._reset_cache()

    def set_seqs(self, seq1, seq2):
        """Replace both sequences and drop the cached results."""
        self._str1, self._str2 = seq1, seq2
        self._reset_cache()

    def set_seq1(self, seq1):
        """Replace the first sequence and drop the cached results."""
        self._str1 = seq1
        self._reset_cache()

    def set_seq2(self, seq2):
        """Replace the second sequence and drop the cached results."""
        self._str2 = seq2
        self._reset_cache()

    def get_opcodes(self):
        """Return the (cached) result of ``opcodes`` for the two sequences."""
        if self._opcodes is None:
            # Reuse already-computed, non-empty edit operations when we have
            # them; otherwise compute straight from the two sequences.
            if self._editops:
                self._opcodes = opcodes(self._editops, self._str1, self._str2)
            else:
                self._opcodes = opcodes(self._str1, self._str2)
        return self._opcodes

    def get_editops(self):
        """Return the (cached) result of ``editops`` for the two sequences."""
        if self._editops is None:
            # Symmetric to get_opcodes(): convert existing non-empty opcodes
            # if available, else compute from the sequences.
            if self._opcodes:
                self._editops = editops(self._opcodes, self._str1, self._str2)
            else:
                self._editops = editops(self._str1, self._str2)
        return self._editops

    def get_matching_blocks(self):
        """Return the (cached) result of ``matching_blocks`` built from the opcodes."""
        if self._matching_blocks is None:
            self._matching_blocks = matching_blocks(self.get_opcodes(),
                                                    self._str1, self._str2)
        return self._matching_blocks

    def ratio(self):
        """Return the (cached) value of ``ratio(seq1, seq2)``."""
        if self._ratio is None:
            self._ratio = ratio(self._str1, self._str2)
        return self._ratio

    def quick_ratio(self):
        """Return the same value as :meth:`ratio`.

        The full ratio is usually quick enough, so no separate upper-bound
        estimate is computed.
        """
        return self.ratio()

    def real_quick_ratio(self):
        """Return an upper bound on the ratio based on the lengths alone.

        Returns ``1.0`` when both sequences are empty (the value
        ``difflib.SequenceMatcher.real_quick_ratio`` gives in that case)
        instead of dividing by zero.
        """
        len1, len2 = len(self._str1), len(self._str2)
        total = len1 + len2
        if not total:
            return 1.0
        return 2.0 * min(len1, len2) / total

    def distance(self):
        """Return the (cached) value of ``distance(seq1, seq2)``."""
        if self._distance is None:
            self._distance = distance(self._str1, self._str2)
        return self._distance
View
17 fuzzywuzzy/benchmarks.py
@@ -52,3 +52,20 @@
print 'Test for string: "%s"' % s
#print 'Old: %f' % round(timeit('utils.full_processold(\'%s\')' % s, "import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.full_process(\'%s\')' % s, "import utils",number=iterations),4)
+
### benchmarking the core matching methods...

# Time each fuzz scorer against every test string, comparing it with the
# fixed query 'cirque du soleil'.  `cirque_strings`, `iterations` and
# `timeit` are expected to be defined earlier in this script.
for scorer in ('ratio', 'partial_ratio', 'WRatio'):
    for s in cirque_strings:
        print('Test fuzz.%s for string: "%s"' % (scorer, s))
        print('-------------------------------')
        # %r embeds both strings as valid Python literals, so a quote or a
        # backslash inside `s` cannot break the statement handed to timeit.
        stmt = 'fuzz.%s(%r, %r)' % (scorer, 'cirque du soleil', s)
        # Floor division keeps `number` an integer on every Python version.
        elapsed = timeit(stmt, "import fuzz", number=iterations // 100)
        print('New: %f' % round(elapsed, 4))
View
8 fuzzywuzzy/fuzz.py
@@ -28,9 +28,15 @@
import sys
import os
import re
-from difflib import SequenceMatcher
import utils
# Use the python-Levenshtein backed StringMatcher as the SequenceMatcher
# implementation when it can be imported; otherwise fall back to difflib.
try:
    import Levenshtein
    from StringMatcher import StringMatcher as SequenceMatcher
    print("** using levenshtein")
except ImportError:
    # Only an unavailable package should trigger the fallback.  A bare
    # `except:` would also hide real bugs in StringMatcher and swallow
    # KeyboardInterrupt / SystemExit.
    from difflib import SequenceMatcher

# Raw string so the regex escapes reach `re` untouched (the pattern itself is
# unchanged): one or more word or digit characters.
REG_TOKEN = re.compile(r"[\w\d]+")
###########################

0 comments on commit 9cf369e

Please sign in to comment.