/
stringmatching.py
285 lines (237 loc) · 10.2 KB
/
stringmatching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# -*- coding: utf-8 -*-
#
# Copyright © Spyder Project Contributors
# Licensed under the terms of the MIT License
# (see spyder/__init__.py for details)
"""
String search and match utilities useful when filtering a list of texts.
"""
import re
from spyder.py3compat import to_text_string
NOT_FOUND_SCORE = -1
NO_SCORE = 0
def get_search_regex(query, ignore_case=True):
"""Returns a compiled regex pattern to search for query letters in order.
Parameters
----------
query : str
String to search in another string (in order of character occurrence).
ignore_case : True
Optional value perform a case insensitive search (True by default).
Returns
-------
pattern : SRE_Pattern
Notes
-----
This function adds '.*' between the query characters and compiles the
resulting regular expression.
"""
regex_text = [char for char in query if char != ' ']
regex_text = '.*'.join(regex_text)
regex = u'({0})'.format(regex_text)
if ignore_case:
pattern = re.compile(regex, re.IGNORECASE)
else:
pattern = re.compile(regex)
return pattern
def get_search_score(query, choice, ignore_case=True, apply_regex=True,
template='{}'):
"""Returns a tuple with the enriched text (if a template is provided) and
a score for the match.
Parameters
----------
query : str
String with letters to search in choice (in order of appearance).
choice : str
Sentence/words in which to search for the 'query' letters.
ignore_case : bool, optional
Optional value perform a case insensitive search (True by default).
apply_regex : bool, optional
Optional value (True by default) to perform a regex search. Useful
when this function is called directly.
template : str, optional
Optional template string to surround letters found in choices. This is
useful when using a rich text editor ('{}' by default).
Examples: '<b>{}</b>', '<code>{}</code>', '<i>{}</i>'
Returns
-------
results : tuple
Tuples where the first item is the text (enriched if a template was
used) and the second item is a search score.
Notes
-----
The score is given according the following precedence (high to low):
- Letters in one word and no spaces with exact match.
Example: 'up' in 'up stroke'
- Letters in one word and no spaces with partial match.
Example: 'up' in 'upstream stroke'
- Letters in one word but with skip letters.
Example: 'cls' in 'close up'
- Letters in two or more words
Example: 'cls' in 'car lost'
"""
original_choice = to_text_string(choice, encoding='utf-8')
result = (original_choice, NOT_FOUND_SCORE)
# Handle empty string case
if not query:
return result
query = to_text_string(query, encoding='utf-8')
choice = to_text_string(choice, encoding='utf-8')
if ignore_case:
query = query.lower()
choice = choice.lower()
if apply_regex:
pattern = get_search_regex(query, ignore_case=ignore_case)
r = re.search(pattern, choice)
if r is None:
return result
else:
sep = u'-' # Matches will be replaced by this character
let = u'x' # Nonmatches (except spaed) will be replaced by this
score = 0
exact_words = [query == to_text_string(word, encoding='utf-8')
for word in choice.split(u' ')]
partial_words = [query in word for word in choice.split(u' ')]
if any(exact_words) or any(partial_words):
pos_start = choice.find(query)
pos_end = pos_start + len(query)
score += pos_start
text = choice.replace(query, sep*len(query), 1)
enriched_text = original_choice[:pos_start] +\
template.format(original_choice[pos_start:pos_end]) +\
original_choice[pos_end:]
if any(exact_words):
# Check if the query words exists in a word with exact match
score += 1
elif any(partial_words):
# Check if the query words exists in a word with partial match
score += 100
else:
# Check letter by letter
text = [l for l in original_choice]
if ignore_case:
temp_text = [l.lower() for l in original_choice]
else:
temp_text = text[:]
# Give points to start of string
score += temp_text.index(query[0])
# Find the query letters and replace them by `sep`, also apply
# template as needed for enricching the letters in the text
enriched_text = text[:]
for char in query:
if char != u'' and char in temp_text:
index = temp_text.index(char)
enriched_text[index] = template.format(text[index])
text[index] = sep
temp_text = [u' ']*(index + 1) + temp_text[index+1:]
enriched_text = u''.join(enriched_text)
patterns_text = []
for i, char in enumerate(text):
if char != u' ' and char != sep:
new_char = let
else:
new_char = char
patterns_text.append(new_char)
patterns_text = u''.join(patterns_text)
for i in reversed(range(1, len(query) + 1)):
score += (len(query) - patterns_text.count(sep*i))*100000
temp = patterns_text.split(sep)
while u'' in temp:
temp.remove(u'')
if not patterns_text.startswith(sep):
temp = temp[1:]
if not patterns_text.endswith(sep):
temp = temp[:-1]
for pat in temp:
score += pat.count(u' ')*10000
score += pat.count(let)*100
return original_choice, enriched_text, score
def get_search_scores(query, choices, ignore_case=True, template='{}',
valid_only=False, sort=False):
"""Search for query inside choices and return a list of tuples.
Returns a list of tuples of text with the enriched text (if a template is
provided) and a score for the match. Lower scores imply a better match.
Parameters
----------
query : str
String with letters to search in each choice (in order of appearance).
choices : list of str
List of sentences/words in which to search for the 'query' letters.
ignore_case : bool, optional
Optional value perform a case insensitive search (True by default).
template : str, optional
Optional template string to surround letters found in choices. This is
useful when using a rich text editor ('{}' by default).
Examples: '<b>{}</b>', '<code>{}</code>', '<i>{}</i>'
Returns
-------
results : list of tuples
List of tuples where the first item is the text (enriched if a
template was used) and a search score. Lower scores means better match.
"""
# First remove spaces from query
query = query.replace(' ', '')
pattern = get_search_regex(query, ignore_case)
results = []
for choice in choices:
r = re.search(pattern, choice)
if query and r:
result = get_search_score(query, choice, ignore_case=ignore_case,
apply_regex=False, template=template)
else:
if query:
result = (choice, choice, NOT_FOUND_SCORE)
else:
result = (choice, choice, NO_SCORE)
if valid_only:
if result[-1] != NOT_FOUND_SCORE:
results.append(result)
else:
results.append(result)
if sort:
results = sorted(results, key=lambda row: row[-1])
return results
def test():
template = '<b>{0}</b>'
names = ['close pane', 'debug continue', 'debug exit', 'debug step into',
'debug step over', 'debug step return', 'fullscreen mode',
'layout preferences', 'lock unlock panes', 'maximize pane',
'preferences', 'quit', 'restart', 'save current layout',
'switch to breakpoints', 'switch to console', 'switch to editor',
'switch to explorer', 'switch to find_in_files',
'switch to historylog', 'switch to help',
'switch to ipython_console', 'switch to onlinehelp',
'switch to outline_explorer', 'switch to project_explorer',
'switch to variable_explorer',
'use next layout', 'use previous layout', 'clear line',
'clear shell', 'inspect current object', 'blockcomment',
'breakpoint', 'close all', 'code completion',
'conditional breakpoint', 'configure', 'copy', 'copy line', 'cut',
'debug', 'debug with winpdb', 'delete', 'delete line',
'duplicate line', 'end of document', 'end of line',
'file list management', 'find next', 'find previous', 'find text',
'go to definition', 'go to line', 'go to next file',
'go to previous file', 'inspect current object', 'kill next word',
'kill previous word', 'kill to line end', 'kill to line start',
'last edit location', 'move line down', 'move line up',
'new file', 'next char', 'next cursor position', 'next line',
'next word', 'open file', 'paste', 'previous char',
'previous cursor position', 'previous line', 'previous word',
'print', 're-run last script', 'redo', 'replace text',
'rotate kill ring', 'run', 'run selection', 'save all', 'save as',
'save file', 'select all', 'show/hide outline',
'show/hide project explorer', 'start of document',
'start of line', 'toggle comment', 'unblockcomment', 'undo',
'yank', 'run profiler', 'run analysis']
a = get_search_scores('lay', names, template=template, )
b = get_search_scores('lay', names, template=template, valid_only=True,
sort=True)
# Full results
for r in a:
print(r) # spyder: test-skip
# Ordered and filtered results
print('\n') # spyder: test-skip
for r in b:
print(r) # spyder: test-skip
if __name__ == '__main__':
test()