-
Notifications
You must be signed in to change notification settings - Fork 77
/
sbtmh.py
255 lines (185 loc) · 7.19 KB
/
sbtmh.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
from io import BytesIO
import sys
from .sbt import Leaf, SBT, GraphFactory
from . import signature
def load_sbt_index(filename, *, print_version_warning=True, cache_size=None):
"Load and return an SBT index."
return SBT.load(filename, leaf_loader=SigLeaf.load,
print_version_warning=print_version_warning,
cache_size=cache_size)
def create_sbt_index(bloom_filter_size=1e5, n_children=2):
"Create an empty SBT index."
factory = GraphFactory(1, bloom_filter_size, 4)
tree = SBT(factory, d=n_children)
return tree
def search_sbt_index(tree, query, threshold):
"""\
Search an SBT index `tree` with signature `query` for matches above
`threshold`.
Usage:
for match_sig, similarity in search_sbt_index(tree, query, threshold):
...
"""
for leaf in tree.find(search_minhashes, query, threshold, unload_data=True):
similarity = query.similarity(leaf.data)
yield leaf.data, similarity
class SigLeaf(Leaf):
def __str__(self):
return '**Leaf:{name} -> {metadata}'.format(
name=self.name, metadata=self.metadata)
def save(self, path):
# this is here only for triggering the property load
# before we reopen the file (and overwrite the previous
# content...)
self.data
buf = signature.save_signatures([self.data], compression=1)
return self.storage.save(path, buf)
def update(self, parent):
mh = self.data.minhash
parent.data.update(mh)
@property
def data(self):
if self._data is None:
buf = BytesIO(self.storage.load(self._path))
self._data = signature.load_one_signature(buf)
return self._data
@data.setter
def data(self, new_data):
self._data = new_data
### Search functionality.
def _max_jaccard_underneath_internal_node(node, query):
"""\
calculate the maximum possibility similarity score below
this node, based on the number of matches in 'hashes' at this node,
divided by the size of the query.
This should yield be an upper bound on the Jaccard similarity
for any signature below this point.
"""
query_bf = _get_bf(node, query)
mh = query.minhash
if len(mh) == 0:
return 0.0
# J(A, B) = |A intersection B| / |A union B|
# If we use only |A| as denominator, it is the containment
# Because |A| <= |A union B|, it is also an upper bound on the max jaccard
max_score = query_bf.containment(node.data)
#matches = node.data.matches(mh)
#max_score = float(matches) / len(mh)
return max_score
def search_minhashes(node, sig, threshold, results=None):
"""\
Default tree search function, searching for best Jaccard similarity.
"""
assert results is None
score = 0
if isinstance(node, SigLeaf):
score = node.data.minhash.similarity(sig.minhash)
else: # Node minhash comparison
#query_bf = _get_bf(node, sig)
sig_mh = sig.minhash
if len(sig_mh) == 0:
return 0.0
# count the maximum number of hash matches beneath this node
matches = node.data.matches(sig_mh)
# J(A, B) = |A intersection B| / |A union B|
# If we use only |A| as denominator, it is the containment
# Because |A| <= |A union B|, it is also an upper bound on the max jaccard
score = float(matches) / len(sig_mh)
if results is not None:
results[node.name] = score
if score >= threshold:
return 1
return 0
class SearchMinHashesFindBest(object):
def __init__(self):
self.best_match = 0.
def search(self, node, sig, threshold, results=None):
assert results is None
score = 0
if isinstance(node, SigLeaf):
score = node.data.minhash.similarity(sig.minhash)
else: # internal object, not leaf.
#query_bf = _get_bf(node, sig)
sig_mh = sig.minhash
if len(sig_mh) == 0:
return 0.0
# count the maximum number of hash matches beneath this node
matches = node.data.matches(sig_mh)
# J(A, B) = |A intersection B| / |A union B|
# If we use only |A| as denominator, it is the containment
# Because |A| <= |A union B|, it is also an upper bound on the max jaccard
score = float(matches) / len(sig_mh)
if results is not None:
results[node.name] = score
if score >= threshold:
# have we done better than this elsewhere? if yes, truncate.
if score > self.best_match:
# update best if it's a leaf node...
if isinstance(node, SigLeaf):
self.best_match = score
return 1
return 0
def search_minhashes_containment(node, sig, threshold, results=None, downsample=True):
assert results is None
mh = sig.minhash
if isinstance(node, SigLeaf):
matches = node.data.minhash.count_common(mh, downsample)
else: # Node or Leaf, Nodegraph by minhash comparison
#bf = _get_bf(node, sig)
#matches = bf.containment(node.data) * len(mh)
matches = node.data.matches(mh)
if len(mh) and float(matches) / len(mh) >= threshold:
return 1
return 0
def search_minhashes_max_containment(node, sig, threshold, results=None,
downsample=True):
assert results is None
mh = sig.minhash
if isinstance(node, SigLeaf):
node_mh = node.data.minhash
matches = node_mh.count_common(mh, downsample)
node_size = len(node_mh)
else: # Node or Leaf, Nodegraph by minhash comparison
matches = node.data.matches(mh)
# get the size of the smallest collection of hashes below this point
node_size = node.metadata.get('min_n_below', -1)
if node_size == -1:
raise Exception('cannot do max_containment search on this SBT; need to rebuild.')
denom = min((len(mh), node_size))
if len(mh) and matches / denom >= threshold:
return 1
return 0
class GatherMinHashes(object):
def __init__(self):
self.best_match = 0
def search(self, node, query, threshold, results=None):
assert results is None
mh = query.minhash
if not len(mh):
return 0
if isinstance(node, SigLeaf):
matches = mh.count_common(node.data.minhash, True)
else: # Nodegraph by minhash comparison
#bf = _get_bf(node, query)
#score = bf.containment(node.data)
matches = node.data.matches(mh)
if not matches:
return 0
score = float(matches) / len(mh)
if score < threshold:
return 0
# have we done better than this? if no, truncate searches below.
if score >= self.best_match:
# update best if it's a leaf node...
if isinstance(node, SigLeaf):
self.best_match = score
return 1
return 0
def _get_bf(node, query):
try:
query_bf = query.bf
except AttributeError:
query_bf = node._factory()
query_bf.update(query.minhash)
query.bf = query_bf
return query_bf