-
Notifications
You must be signed in to change notification settings - Fork 76
/
sbtmh.py
200 lines (145 loc) · 5.63 KB
/
sbtmh.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
from io import BytesIO
import sys
from .sbt import Leaf, SBT, GraphFactory
from . import signature
def load_sbt_index(filename, *, print_version_warning=True, cache_size=None):
"Load and return an SBT index."
return SBT.load(filename, leaf_loader=SigLeaf.load,
print_version_warning=print_version_warning,
cache_size=cache_size)
def create_sbt_index(bloom_filter_size=1e5, n_children=2):
"Create an empty SBT index."
factory = GraphFactory(1, bloom_filter_size, 4)
tree = SBT(factory, d=n_children)
return tree
def search_sbt_index(tree, query, threshold):
"""\
Search an SBT index `tree` with signature `query` for matches above
`threshold`.
Usage:
for match_sig, similarity in search_sbt_index(tree, query, threshold):
...
"""
for leaf in tree.find(search_minhashes, query, threshold, unload_data=True):
similarity = query.similarity(leaf.data)
yield leaf.data, similarity
class SigLeaf(Leaf):
def __str__(self):
return '**Leaf:{name} -> {metadata}'.format(
name=self.name, metadata=self.metadata)
def save(self, path):
# this is here only for triggering the property load
# before we reopen the file (and overwrite the previous
# content...)
self.data
buf = signature.save_signatures([self.data], compression=1)
return self.storage.save(path, buf)
def update(self, parent):
mh = self.data.minhash
parent.data.update(mh)
min_n_below = parent.metadata.get('min_n_below', sys.maxsize)
min_n_below = min(len(mh), min_n_below)
if min_n_below == 0:
min_n_below = 1
parent.metadata['min_n_below'] = min_n_below
@property
def data(self):
if self._data is None:
buf = BytesIO(self.storage.load(self._path))
self._data = signature.load_one_signature(buf)
return self._data
@data.setter
def data(self, new_data):
self._data = new_data
### Search functionality.
def _max_jaccard_underneath_internal_node(node, mh):
"""\
calculate the maximum possibility similarity score below
this node, based on the number of matches in 'hashes' at this node,
divided by the smallest minhash size below this node.
This should yield be an upper bound on the Jaccard similarity
for any signature below this point.
"""
if len(mh) == 0:
return 0.0
# count the maximum number of hash matches beneath this node
matches = node.data.matches(mh)
# get the size of the smallest collection of hashes below this point
min_n_below = node.metadata.get('min_n_below', -1)
if min_n_below == -1:
raise Exception('cannot do similarity search on this SBT; need to rebuild.')
# max of numerator divided by min of denominator => max Jaccard
max_score = float(matches) / min_n_below
return max_score
def search_minhashes(node, sig, threshold, results=None):
"""\
Default tree search function, searching for best Jaccard similarity.
"""
sig_mh = sig.minhash
score = 0
if isinstance(node, SigLeaf):
score = node.data.minhash.similarity(sig_mh)
else: # Node minhash comparison
score = _max_jaccard_underneath_internal_node(node, sig_mh)
if results is not None:
results[node.name] = score
if score >= threshold:
return 1
return 0
class SearchMinHashesFindBest(object):
def __init__(self):
self.best_match = 0.
def search(self, node, sig, threshold, results=None):
sig_mh = sig.minhash
score = 0
if isinstance(node, SigLeaf):
score = node.data.minhash.similarity(sig_mh)
else: # internal object, not leaf.
score = _max_jaccard_underneath_internal_node(node, sig_mh)
if results is not None:
results[node.name] = score
if score >= threshold:
# have we done better than this elsewhere? if yes, truncate.
if score > self.best_match:
# update best if it's a leaf node...
if isinstance(node, SigLeaf):
self.best_match = score
return 1
return 0
def search_minhashes_containment(node, sig, threshold, results=None, downsample=True):
mh = sig.minhash
if isinstance(node, SigLeaf):
matches = node.data.minhash.count_common(mh, downsample)
else: # Node or Leaf, Nodegraph by minhash comparison
matches = node.data.matches(mh)
if results is not None:
results[node.name] = float(matches) / len(mh)
if len(mh) and float(matches) / len(mh) >= threshold:
return 1
return 0
class GatherMinHashes(object):
def __init__(self):
self.best_match = 0
def search(self, node, query, threshold, results=None):
mh = query.minhash
if not len(mh):
return 0
if isinstance(node, SigLeaf):
matches = mh.count_common(node.data.minhash, True)
else: # Nodegraph by minhash comparison
matches = node.data.matches(mh)
if not matches:
return 0
score = float(matches) / len(mh)
if score < threshold:
return 0
# store results if we have passed in an appropriate dictionary
if results is not None:
results[node.name] = score
# have we done better than this? if no, truncate searches below.
if score >= self.best_match:
# update best if it's a leaf node...
if isinstance(node, SigLeaf):
self.best_match = score
return 1
return 0