Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Merge pull request #11 from medecau/master

Issue 10 and refactoring
  • Loading branch information...
commit 9c70844fa6669455f99aeb2668c48aac5d595f22 2 parents c73e3d3 + dc2e341
Adam Cohen acslater00 authored
2  .gitignore
... ... @@ -1 +1,3 @@
1 1 *.pyc
  2 +env
  3 +dist
0  LICENSE.txt → LICENSE
File renamed without changes
3  MANIFEST
... ... @@ -1,5 +1,6 @@
1 1 # file GENERATED by distutils, do NOT edit
2   -README.txt
  2 +LICENSE
  3 +README.textile
3 4 setup.py
4 5 fuzzywuzzy/__init__.py
5 6 fuzzywuzzy/benchmarks.py
2  README.textile
Source Rendered
... ... @@ -1,3 +1,5 @@
  1 +!https://pullstat.us/seatgeek/fuzzywuzzy/pull/5(Pull Request #5)!:https://github.com/seatgeek/fuzzywuzzy/pull/5 - Speed improvements
  2 +
1 3 h1. FuzzyWuzzy
2 4
3 5 Fuzzy string matching like a boss.
22 fuzzywuzzy/benchmarks.py → benchmarks.py
... ... @@ -1,9 +1,9 @@
1 1 # -*- coding: utf8 -*-
2 2
3 3 from timeit import timeit
4   -import utils
  4 +from fuzzywuzzy import utils
5 5
6   -iterations=100000*10
  6 +iterations=100000
7 7
8 8 cirque_strings = [
9 9 "cirque du soleil - zarkana - las vegas",
@@ -36,36 +36,36 @@
36 36
37 37 for s in choices:
38 38 print 'Test for string: "%s"' % s
39   - # print 'Old: %f' % round(timeit('utils.validate_stringold(\'%s\')' % s, "import utils",number=iterations),4)
40   - print 'New: %f' % round(timeit('utils.validate_string(\'%s\')' % s, "import utils",number=iterations),4)
  39 + # print 'Old: %f' % round(timeit('utils.validate_stringold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
  40 + print 'New: %f' % round(timeit('utils.validate_string(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
41 41
42 42 print
43 43
44 44 for s in mixed_strings:
45 45 print 'Test for string: "%s"' % s
46   - #print 'Old: %f' % round(timeit('utils.asciidammitold(\'%s\')' % s, "import utils",number=iterations),4)
47   - print 'New: %f' % round(timeit('utils.asciidammit(\'%s\')' % s, "import utils",number=iterations),4)
  46 + #print 'Old: %f' % round(timeit('utils.asciidammitold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
  47 + print 'New: %f' % round(timeit('utils.asciidammit(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
48 48
49 49 print
50 50
51 51 for s in mixed_strings+cirque_strings+choices:
52 52 print 'Test for string: "%s"' % s
53   - #print 'Old: %f' % round(timeit('utils.full_processold(\'%s\')' % s, "import utils",number=iterations),4)
54   - print 'New: %f' % round(timeit('utils.full_process(\'%s\')' % s, "import utils",number=iterations),4)
  53 + #print 'Old: %f' % round(timeit('utils.full_processold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
  54 + print 'New: %f' % round(timeit('utils.full_process(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
55 55
56 56 ### benchmarking the core matching methods...
57 57
58 58 for s in cirque_strings:
59 59 print 'Test fuzz.ratio for string: "%s"' % s
60 60 print '-------------------------------'
61   - print 'New: %f' % round(timeit('fuzz.ratio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4)
  61 + print 'New: %f' % round(timeit('fuzz.ratio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)
62 62
63 63 for s in cirque_strings:
64 64 print 'Test fuzz.partial_ratio for string: "%s"' % s
65 65 print '-------------------------------'
66   - print 'New: %f' % round(timeit('fuzz.partial_ratio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4)
  66 + print 'New: %f' % round(timeit('fuzz.partial_ratio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)
67 67
68 68 for s in cirque_strings:
69 69 print 'Test fuzz.WRatio for string: "%s"' % s
70 70 print '-------------------------------'
71   - print 'New: %f' % round(timeit('fuzz.WRatio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4)
  71 + print 'New: %f' % round(timeit('fuzz.WRatio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)
21 fuzzywuzzy/fuzz.py
@@ -28,10 +28,9 @@
28 28 import sys
29 29 import os
30 30 import re
31   -import utils
  31 +from utils import *
32 32
33 33 try:
34   - import Levenshtein
35 34 from StringMatcher import StringMatcher as SequenceMatcher
36 35 except:
37 36 from difflib import SequenceMatcher
@@ -48,7 +47,7 @@ def ratio(s1, s2):
48 47 if s2 is None: raise TypeError("s2 is None")
49 48
50 49 m = SequenceMatcher(None, s1, s2)
51   - return int(100 * m.ratio())
  50 + return intr(100 * m.ratio())
52 51
53 52 # todo: skip duplicate indexes for a little more speed
54 53 def partial_ratio(s1, s2):
@@ -178,20 +177,20 @@ def partial_token_set_ratio(s1, s2):
178 177
179 178 # q is for quick
180 179 def QRatio(s1, s2):
181   - if not utils.validate_string(s1): return 0
182   - if not utils.validate_string(s2): return 0
  180 + if not validate_string(s1): return 0
  181 + if not validate_string(s2): return 0
183 182
184   - p1 = utils.full_process(s1)
185   - p2 = utils.full_process(s2)
  183 + p1 = full_process(s1)
  184 + p2 = full_process(s2)
186 185
187 186 return ratio(p1, p2)
188 187
189 188 # w is for weighted
190 189 def WRatio(s1, s2):
191   - p1 = utils.full_process(s1)
192   - p2 = utils.full_process(s2)
193   - if not utils.validate_string(p1): return 0
194   - if not utils.validate_string(p2): return 0
  190 + p1 = full_process(s1)
  191 + p2 = full_process(s2)
  192 + if not validate_string(p1): return 0
  193 + if not validate_string(p2): return 0
195 194
196 195 # should we look at partials?
197 196 try_partial = True
4 fuzzywuzzy/utils.py
@@ -33,6 +33,8 @@ def full_process(s):
33 33 s = asciidammit(s)
34 34 return s.translate(trans_table, bad_chars).strip()
35 35
36   -
  36 +def intr(n):
  37 + '''Returns a correctly rounded integer'''
  38 + return int(round(n))
37 39
38 40
54 fuzzywuzzy/tests.py → tests.py
... ... @@ -1,8 +1,8 @@
1 1 # -*- coding: utf8 -*-
2 2
3   -from fuzz import *
4   -import process
5   -import utils
  3 +from fuzzywuzzy import fuzz
  4 +from fuzzywuzzy import process
  5 +from fuzzywuzzy import utils
6 6
7 7 import itertools
8 8 import unittest
@@ -75,56 +75,56 @@ def tearDown(self):
75 75 pass
76 76
77 77 def testEqual(self):
78   - self.assertEqual(ratio(self.s1, self.s1a),100)
  78 + self.assertEqual(fuzz.ratio(self.s1, self.s1a),100)
79 79
80 80 def testCaseInsensitive(self):
81   - self.assertNotEqual(ratio(self.s1, self.s2),100)
82   - self.assertEqual(ratio(utils.full_process(self.s1), utils.full_process(self.s2)),100)
  81 + self.assertNotEqual(fuzz.ratio(self.s1, self.s2),100)
  82 + self.assertEqual(fuzz.ratio(utils.full_process(self.s1), utils.full_process(self.s2)),100)
83 83
84 84 def testPartialRatio(self):
85   - self.assertEqual(partial_ratio(self.s1, self.s3),100)
  85 + self.assertEqual(fuzz.partial_ratio(self.s1, self.s3),100)
86 86
87 87 def testTokenSortRatio(self):
88   - self.assertEqual(token_sort_ratio(self.s1, self.s1a),100)
  88 + self.assertEqual(fuzz.token_sort_ratio(self.s1, self.s1a),100)
89 89
90 90 def testPartialTokenSortRatio(self):
91   - self.assertEqual(partial_token_sort_ratio(self.s1, self.s1a),100)
92   - self.assertEqual(partial_token_sort_ratio(self.s4, self.s5),100)
  91 + self.assertEqual(fuzz.partial_token_sort_ratio(self.s1, self.s1a),100)
  92 + self.assertEqual(fuzz.partial_token_sort_ratio(self.s4, self.s5),100)
93 93
94 94 def testTokenSetRatio(self):
95   - self.assertEqual(token_set_ratio(self.s4, self.s5),100)
  95 + self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5),100)
96 96
97 97 def testPartialTokenSetRatio(self):
98   - self.assertEqual(partial_token_set_ratio(self.s4, self.s5),100)
  98 + self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5),100)
99 99
100 100 def testQuickRatioEqual(self):
101   - self.assertEqual(QRatio(self.s1, self.s1a), 100)
  101 + self.assertEqual(fuzz.QRatio(self.s1, self.s1a), 100)
102 102
103 103 def testQuickRatioCaseInsensitive(self):
104   - self.assertEqual(QRatio(self.s1, self.s2), 100)
  104 + self.assertEqual(fuzz.QRatio(self.s1, self.s2), 100)
105 105
106 106 def testQuickRatioNotEqual(self):
107   - self.assertNotEqual(QRatio(self.s1, self.s3), 100)
  107 + self.assertNotEqual(fuzz.QRatio(self.s1, self.s3), 100)
108 108
109 109 def testWRatioEqual(self):
110   - self.assertEqual(WRatio(self.s1, self.s1a), 100)
  110 + self.assertEqual(fuzz.WRatio(self.s1, self.s1a), 100)
111 111
112 112 def testWRatioCaseInsensitive(self):
113   - self.assertEqual(WRatio(self.s1, self.s2), 100)
  113 + self.assertEqual(fuzz.WRatio(self.s1, self.s2), 100)
114 114
115 115 def testWRatioPartialMatch(self):
116 116 # a partial match is scaled by .9
117   - self.assertEqual(WRatio(self.s1, self.s3), 90)
  117 + self.assertEqual(fuzz.WRatio(self.s1, self.s3), 90)
118 118
119 119 def testWRatioMisorderedMatch(self):
120 120 # misordered full matches are scaled by .95
121   - self.assertEqual(WRatio(self.s4, self.s5), 95)
  121 + self.assertEqual(fuzz.WRatio(self.s4, self.s5), 95)
122 122
123 123 def testWRatioUnicode(self):
124   - self.assertEqual(WRatio(unicode(self.s1), unicode(self.s1a)), 100)
  124 + self.assertEqual(fuzz.WRatio(unicode(self.s1), unicode(self.s1a)), 100)
125 125
126 126 def testQRatioUnicode(self):
127   - self.assertEqual(WRatio(unicode(self.s1), unicode(self.s1a)), 100)
  127 + self.assertEqual(fuzz.WRatio(unicode(self.s1), unicode(self.s1a)), 100)
128 128
129 129 def testIssueSeven(self):
130 130 s1 = "HSINCHUANG"
@@ -132,20 +132,20 @@ def testIssueSeven(self):
132 132 s3 = "LSINJHUANG DISTRIC"
133 133 s4 = "SINJHUANG DISTRICT"
134 134
135   - self.assertGreater(partial_ratio(s1, s2), 75)
136   - self.assertGreater(partial_ratio(s1, s3), 75)
137   - self.assertGreater(partial_ratio(s1, s4), 75)
  135 + self.assertTrue(fuzz.partial_ratio(s1, s2) > 75)
  136 + self.assertTrue(fuzz.partial_ratio(s1, s3) > 75)
  137 + self.assertTrue(fuzz.partial_ratio(s1, s4) > 75)
138 138
139 139 def testWRatioUnicodeString(self):
140 140 s1 = u"\u00C1"
141 141 s2 = "ABCD"
142   - score = WRatio(s1, s2)
  142 + score = fuzz.WRatio(s1, s2)
143 143 self.assertEqual(0, score)
144 144
145 145 def testQRatioUnicodeString(self):
146 146 s1 = u"\u00C1"
147 147 s2 = "ABCD"
148   - score = QRatio(s1, s2)
  148 + score = fuzz.QRatio(s1, s2)
149 149 self.assertEqual(0, score)
150 150
151 151 # test processing methods
@@ -218,7 +218,7 @@ def testWithScorer(self):
218 218
219 219 # in this hypothetical example we care about ordering, so we use quick ratio
220 220 query = "new york mets at chicago cubs"
221   - scorer = QRatio
  221 + scorer = fuzz.QRatio
222 222
223 223 # first, as an example, the normal way would select the "more 'complete' match of choices[1]"
224 224

0 comments on commit 9c70844

Please sign in to comment.
Something went wrong with that request. Please try again.