From a1210a1414e1c6474b1d34ab4ff50e66263a42d5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 2 Jun 2018 08:39:18 -0700 Subject: [PATCH] change max_n_below -> min_n_below --- sourmash/sbt.py | 8 +++++--- sourmash/sbtmh.py | 19 +++++++++++++------ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 729b1e2fb1..d5eea18aba 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -533,15 +533,17 @@ def _fill_min_n_below(self): if isinstance(n, Leaf): parent = self.parent(i) if parent.pos not in self.missing_nodes: - min_n_below = parent.node.metadata.get('min_n_below', 0) + min_n_below = parent.node.metadata.get('min_n_below', 1) min_n_below = min(len(n.data.minhash.get_mins()), min_n_below) + if min_n_below == 0: + min_n_below = 1 parent.node.metadata['min_n_below'] = min_n_below current = parent parent = self.parent(parent.pos) while parent and parent.pos not in self.missing_nodes: - min_n_below = parent.node.metadata.get('min_n_below', 0) + min_n_below = parent.node.metadata.get('min_n_below', 1) min_n_below = min(current.node.metadata['min_n_below'], min_n_below) parent.node.metadata['min_n_below'] = min_n_below @@ -699,7 +701,7 @@ def load(info, storage=None): def update(self, parent): parent.data.update(self.data) - min_n_below = min(parent.metadata.get('min_n_below', 0), + min_n_below = min(parent.metadata.get('min_n_below', 1), self.metadata.get('min_n_below')) parent.metadata['min_n_below'] = min_n_below diff --git a/sourmash/sbtmh.py b/sourmash/sbtmh.py index 805834caee..205b20e1aa 100644 --- a/sourmash/sbtmh.py +++ b/sourmash/sbtmh.py @@ -57,6 +57,7 @@ def update(self, parent): min_n_below = parent.metadata.get('min_n_below', 1) min_n_below = min(len(self.data.minhash.get_mins()), min_n_below) + parent.metadata['min_n_below'] = min_n_below @property @@ -94,10 +95,10 @@ def search_minhashes(node, sig, threshold, results=None, downsample=True): else: # Node or Leaf, Nodegraph by minhash comparison if 
len(mins): matches = sum(1 for value in mins if node.data.get(value)) - max_mins = node.metadata.get('min_n_below', -1) - if max_mins == -1: + min_n_below = node.metadata.get('min_n_below', -1) + if min_n_below == -1: raise Exception('cannot do similarity search on this SBT; need to rebuild.') - score = float(matches) / max_mins + score = float(matches) / min_n_below if results is not None: results[node.name] = score @@ -130,11 +131,17 @@ def search(self, node, sig, threshold, results=None): raise else: # internal object, not leaf. if len(mins): + + # calculate the maximum possible similarity score below + # this node, based on the number of matches at this node, + # divided by the smallest minhash size below this node + # (which should be an upper bound on the Jaccard similarity + # of any signature below this point) matches = sum(1 for value in mins if node.data.get(value)) - max_mins = node.metadata.get('min_n_below', -1) - if max_mins == -1: + min_n_below = node.metadata.get('min_n_below', -1) + if min_n_below == -1: raise Exception('cannot do similarity search on this SBT; need to rebuild.') - score = float(matches) / max_mins + score = float(matches) / min_n_below if results is not None: results[node.name] = score