From dd1d3511dfbceb801a733a3005a2918d954680a5 Mon Sep 17 00:00:00 2001
From: Luiz Irber
Date: Mon, 1 Oct 2018 16:41:33 -0700
Subject: [PATCH 01/37] Start moving LCA to Index

---
 sourmash/index.py         | 16 ++++++++++++++++
 sourmash/lca/lca_utils.py |  6 +++++-
 sourmash/sbt.py           |  4 ++--
 3 files changed, 23 insertions(+), 3 deletions(-)
 create mode 100644 sourmash/index.py

diff --git a/sourmash/index.py b/sourmash/index.py
new file mode 100644
index 000000000..6449fedaf
--- /dev/null
+++ b/sourmash/index.py
@@ -0,0 +1,16 @@
+from abc import ABC, abstractmethod
+
+class Index(ABC):
+
+    @abstractmethod
+    def find(self, search_fn, *args, **kwargs):
+        ''' '''
+
+    @abstractmethod
+    def save(self, path, storage=None, sparseness=0.0, structure_only=False):
+        ''' '''
+
+    @classmethod
+    @abstractmethod
+    def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True):
+        ''' '''
diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py
index 06b54ffd1..431b8f895 100644
--- a/sourmash/lca/lca_utils.py
+++ b/sourmash/lca/lca_utils.py
@@ -19,6 +19,7 @@

 from .._minhash import get_max_hash_for_scaled
 from ..logging import notify, error, debug
+from ..index import Index

 # type to store an element in a taxonomic lineage
 LineagePair = namedtuple('LineagePair', ['rank', 'name'])
@@ -138,7 +139,7 @@ def find_lca(tree):
     return tuple(lineage), len(node)


-class LCA_Database(object):
+class LCA_Database(Index):
     """
     Wrapper class for taxonomic database.

@@ -261,6 +262,9 @@ def save(self, db_name):

             json.dump(save_d, fp)

+    def find(self, search_fn, *args, **kwargs):
+        pass
+
     def downsample_scaled(self, scaled):
         """
         Downsample to the provided scaled value, i.e. eliminate all hashes
diff --git a/sourmash/sbt.py b/sourmash/sbt.py
index 2a48874a3..b4b5714ac 100644
--- a/sourmash/sbt.py
+++ b/sourmash/sbt.py
@@ -66,7 +66,7 @@ def search_transcript(node, seq, threshold):
 from .sbt_storage import FSStorage, TarStorage, IPFSStorage, RedisStorage
 from .logging import error, notify, debug

-
+from .index import Index

 STORAGES = {
     'TarStorage': TarStorage,
@@ -102,7 +102,7 @@ def init_args(self):
         return (self.ksize, self.starting_size, self.n_tables)


-class SBT(object):
+class SBT(Index):
     """A Sequence Bloom Tree implementation allowing generic internal nodes and leaves.
The default node and leaf format is a Bloom Filter (like the original implementation), From 291beeea107cf9b5b61e77e099f5cf45704f3836 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 12 Oct 2018 19:01:17 -0700 Subject: [PATCH 02/37] rename add_node to insert --- setup.py | 3 ++- sourmash/commands.py | 2 +- sourmash/index.py | 11 ++++++++- sourmash/lca/lca_utils.py | 3 +++ sourmash/sbt.py | 29 +++++++++++++---------- tests/test_sbt.py | 48 +++++++++++++++++++-------------------- 6 files changed, 57 insertions(+), 39 deletions(-) diff --git a/setup.py b/setup.py index 6bde32429..5d32d9af6 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,8 @@ language="c++", extra_compile_args=EXTRA_COMPILE_ARGS, extra_link_args=EXTRA_LINK_ARGS)], - "install_requires": ["screed>=0.9", "ijson<2.5", "khmer>=2.1"], + "install_requires": ["screed>=0.9", "ijson<2.5", "khmer>=2.1", + "deprecation>=2.0.6"], "setup_requires": ['Cython>=0.25.2', "setuptools>=38.6.0", 'setuptools_scm', 'setuptools_scm_git_archive'], "use_scm_version": {"write_to": "sourmash/version.py"}, diff --git a/sourmash/commands.py b/sourmash/commands.py index 1bdeeae6f..cd1cdeb5f 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -451,7 +451,7 @@ def index(args): scaleds.add(ss.minhash.scaled) leaf = SigLeaf(ss.md5sum(), ss) - tree.add_node(leaf) + tree.insert(leaf) n += 1 if not ss: diff --git a/sourmash/index.py b/sourmash/index.py index 6449fedaf..706691612 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -1,4 +1,9 @@ -from abc import ABC, abstractmethod +from abc import ABCMeta, abstractmethod + + +# compatible with Python 2 *and* 3: +ABC = ABCMeta('ABC', (object,), {'__slots__': ()}) + class Index(ABC): @@ -6,6 +11,10 @@ class Index(ABC): def find(self, search_fn, *args, **kwargs): ''' ''' + @abstractmethod + def insert(self, node): + ''' ''' + @abstractmethod def save(self, path, storage=None, sparseness=0.0, structure_only=False): ''' ''' diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 431b8f895..f0d26e8fd 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -265,6 +265,9 @@ def save(self, db_name): def find(self, search_fn, *args, **kwargs): pass + def insert(self, node): + pass + def downsample_scaled(self, scaled): """ Downsample to the provided scaled value, i.e. eliminate all hashes diff --git a/sourmash/sbt.py b/sourmash/sbt.py index b4b5714ac..bef1fcb6d 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -10,7 +10,7 @@ graph1 = factory() # ... add stuff to graph1 ... leaf1 = Leaf("a", graph1) - root.add_node(leaf1) + root.insert(leaf1) For example, :: @@ -26,7 +26,7 @@ graph = factory() graph.consume_fasta(filename) leaf = Leaf(filename, graph) - root.add_node(leaf) + root.insert(leaf) then define a search function, :: @@ -57,6 +57,7 @@ def search_transcript(node, seq, threshold): import sys from tempfile import NamedTemporaryFile +from deprecation import deprecated import khmer try: @@ -160,13 +161,13 @@ def new_node_pos(self, node): return self.next_node - def add_node(self, leaf): - pos = self.new_node_pos(leaf) + def insert(self, node): + pos = self.new_node_pos(node) if pos == 0: # empty tree; initialize w/node. n = Node(self.factory, name="internal." 
+ str(pos)) self._nodes[0] = n - pos = self.new_node_pos(leaf) + pos = self.new_node_pos(node) # Cases: # 1) parent is a Leaf (already covered) @@ -186,28 +187,32 @@ def add_node(self, leaf): c1, c2 = self.children(p.pos)[:2] self._leaves[c1.pos] = p.node - self._leaves[c2.pos] = leaf + self._leaves[c2.pos] = node del self._leaves[p.pos] - for child in (p.node, leaf): + for child in (p.node, node): child.update(n) elif isinstance(p.node, Node): - self._leaves[pos] = leaf - leaf.update(p.node) + self._leaves[pos] = node + node.update(p.node) elif p.node is None: n = Node(self.factory, name="internal." + str(p.pos)) self._nodes[p.pos] = n c1 = self.children(p.pos)[0] - self._leaves[c1.pos] = leaf - leaf.update(n) + self._leaves[c1.pos] = node + node.update(n) # update all parents! p = self.parent(p.pos) while p: self._rebuild_node(p.pos) - leaf.update(self._nodes[p.pos]) + node.update(self._nodes[p.pos]) p = self.parent(p.pos) + @deprecated(details="Use the insert method instead") + def add_node(self, node): + self.insert(node) + def find(self, search_fn, *args, **kwargs): "Search the tree using `search_fn`." diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 69824caab..bef6e7c6e 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -43,11 +43,11 @@ def test_simple(n_children): leaf5.data.count('AAAAT') leaf5.data.count('GAAAA') - root.add_node(leaf1) - root.add_node(leaf2) - root.add_node(leaf3) - root.add_node(leaf4) - root.add_node(leaf5) + root.insert(leaf1) + root.insert(leaf2) + root.insert(leaf3) + root.insert(leaf4) + root.insert(leaf5) def search_kmer(obj, seq): return obj.data.get(seq) @@ -104,11 +104,11 @@ def test_longer_search(n_children): leaf5.data.count('AAAAT') leaf5.data.count('GAAAA') - root.add_node(leaf1) - root.add_node(leaf2) - root.add_node(leaf3) - root.add_node(leaf4) - root.add_node(leaf5) + root.insert(leaf1) + root.insert(leaf2) + root.insert(leaf3) + root.insert(leaf4) + root.insert(leaf5) def kmers(k, seq): for start in range(len(seq) - k + 1): @@ -213,7 +213,7 @@ def test_tree_save_load(n_children): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + tree.insert(leaf) to_search = leaf print('*' * 60) @@ -274,7 +274,7 @@ def test_search_minhashes(): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + tree.insert(leaf) to_search = next(iter(tree.leaves())) @@ -298,7 +298,7 @@ def test_binary_nary_tree(): sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) for tree in trees.values(): - tree.add_node(leaf) + tree.insert(leaf) to_search = leaf n_leaves += 1 @@ -325,11 +325,11 @@ def test_sbt_combine(n_children): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + tree.insert(leaf) if n_leaves < 4: - tree_1.add_node(leaf) + tree_1.insert(leaf) else: - tree_2.add_node(leaf) + tree_2.insert(leaf) n_leaves += 1 tree_1.combine(tree_2) @@ -360,7 +360,7 @@ def test_sbt_combine(n_children): if not next_empty: next_empty = n + 1 - tree_1.add_node(leaf) + tree_1.insert(leaf) assert tree_1.next_node == next_empty @@ -372,7 +372,7 @@ def test_sbt_fsstorage(): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + 
tree.insert(leaf) to_search = leaf print('*' * 60) @@ -405,7 +405,7 @@ def test_sbt_tarstorage(): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + tree.insert(leaf) to_search = leaf print('*' * 60) @@ -441,7 +441,7 @@ def test_sbt_ipfsstorage(): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + tree.insert(leaf) to_search = leaf print('*' * 60) @@ -479,7 +479,7 @@ def test_sbt_redisstorage(): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + tree.insert(leaf) to_search = leaf print('*' * 60) @@ -527,14 +527,14 @@ def test_tree_repair(): assert len(results_repair) == 2 -def test_tree_repair_add_node(): +def test_tree_repair_insert(): tree_repair = SBT.load(utils.get_test_data('leaves.sbt.json'), leaf_loader=SigLeaf.load) for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree_repair.add_node(leaf) + tree_repair.insert(leaf) for pos, node in tree_repair: # Every parent of a node must be an internal node (and not a leaf), @@ -554,7 +554,7 @@ def test_save_sparseness(n_children): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + tree.insert(leaf) to_search = leaf print('*' * 60) From a39a74a286ad980778483cd38b5437085cbc64f8 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 12 Oct 2018 19:49:07 -0700 Subject: [PATCH 03/37] simple test --- sourmash/index.py | 34 ++++++++++++++++++++----- tests/test_index.py | 62 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 6 deletions(-) create mode 100644 tests/test_index.py diff --git a/sourmash/index.py b/sourmash/index.py index 706691612..305dafdaf 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -2,24 +2,46 @@ # compatible with Python 2 *and* 3: -ABC = ABCMeta('ABC', (object,), {'__slots__': ()}) +ABC = ABCMeta("ABC", (object,), {"__slots__": ()}) class Index(ABC): - @abstractmethod def find(self, search_fn, *args, **kwargs): - ''' ''' + """ """ @abstractmethod def insert(self, node): - ''' ''' + """ """ @abstractmethod def save(self, path, storage=None, sparseness=0.0, structure_only=False): - ''' ''' + """ """ @classmethod @abstractmethod def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True): - ''' ''' + """ """ + + +class LinearIndex(Index): + def __init__(self): + self.signatures = set() + + def insert(self, node): + self.signatures.add(node) + + def find(self, search_fn, *args, **kwargs): + matches = [] + + for node in self.signatures: + if search_fn(node, *args): + matches.append(node) + return matches + + def save(self, path): + pass + + @classmethod + def load(cls, location): + pass diff --git a/tests/test_index.py b/tests/test_index.py new file mode 100644 index 000000000..8e6e7af39 --- /dev/null +++ b/tests/test_index.py @@ -0,0 +1,62 @@ +from __future__ import print_function, unicode_literals + +from sourmash.index import LinearIndex +from sourmash_lib.sbt import SBT, GraphFactory, Leaf + + +def test_simple_index(n_children): + factory = GraphFactory(5, 100, 3) + root = SBT(factory, d=n_children) + + leaf1 = Leaf("a", factory()) + leaf1.data.count("AAAAA") + leaf1.data.count("AAAAT") + 
leaf1.data.count("AAAAC") + + leaf2 = Leaf("b", factory()) + leaf2.data.count("AAAAA") + leaf2.data.count("AAAAT") + leaf2.data.count("AAAAG") + + leaf3 = Leaf("c", factory()) + leaf3.data.count("AAAAA") + leaf3.data.count("AAAAT") + leaf3.data.count("CAAAA") + + leaf4 = Leaf("d", factory()) + leaf4.data.count("AAAAA") + leaf4.data.count("CAAAA") + leaf4.data.count("GAAAA") + + leaf5 = Leaf("e", factory()) + leaf5.data.count("AAAAA") + leaf5.data.count("AAAAT") + leaf5.data.count("GAAAA") + + root.insert(leaf1) + root.insert(leaf2) + root.insert(leaf3) + root.insert(leaf4) + root.insert(leaf5) + + def search_kmer(obj, seq): + return obj.data.get(seq) + + kmers = ["AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA"] + + linear = LinearIndex() + linear.insert(leaf1) + linear.insert(leaf2) + linear.insert(leaf3) + linear.insert(leaf4) + linear.insert(leaf5) + + for kmer in kmers: + assert set(root.find(search_kmer, kmer)) == set(linear.find(search_kmer, kmer)) + + print("-----") + print([x.metadata for x in root.find(search_kmer, "AAAAA")]) + print([x.metadata for x in root.find(search_kmer, "AAAAT")]) + print([x.metadata for x in root.find(search_kmer, "AAAAG")]) + print([x.metadata for x in root.find(search_kmer, "CAAAA")]) + print([x.metadata for x in root.find(search_kmer, "GAAAA")]) From 2769ae9c2c4fa5d84a50fc12aaa212d3dc36d2c4 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 3 Jan 2019 11:04:05 -0800 Subject: [PATCH 04/37] first pass definition of search and gather - tests pass, at least! --- sourmash/index.py | 19 +++++++++++++++++++ sourmash/lca/lca_utils.py | 6 ++++++ sourmash/sbt.py | 6 ++++++ 3 files changed, 31 insertions(+) diff --git a/sourmash/index.py b/sourmash/index.py index 305dafdaf..3eb08b1fa 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -10,6 +10,14 @@ class Index(ABC): def find(self, search_fn, *args, **kwargs): """ """ + @abstractmethod + def search(self, signature, *args, **kwargs): + """ """ + + @abstractmethod + def gather(self, signature, *args, **kwargs): + """ """ + @abstractmethod def insert(self, node): """ """ @@ -39,6 +47,17 @@ def find(self, search_fn, *args, **kwargs): matches.append(node) return matches + def search(self, signature, *args, **kwargs): + matches = [] + + for node in self.signatures: + if signature.similarity(node): + matches.append(node) + return matches + + def gather(self, signature, *args, **kwargs): + pass + def save(self, path): pass diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index f0d26e8fd..d542ca2c0 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -265,6 +265,12 @@ def save(self, db_name): def find(self, search_fn, *args, **kwargs): pass + def search(self, sig): + pass + + def gather(self, sig): + pass + def insert(self, node): pass diff --git a/sourmash/sbt.py b/sourmash/sbt.py index bef1fcb6d..6d846ca4d 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -256,6 +256,12 @@ def find(self, search_fn, *args, **kwargs): queue.extend(c.pos for c in self.children(node_p)) return matches + def search(self, sig): + pass + + def gather(self, sig): + pass + def _rebuild_node(self, pos=0): """Recursively rebuilds an internal node (if it is not present). From 6e2e504f3dac22ef685b93ca90d228b48ba0244a Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 5 Jan 2019 07:07:35 -0800 Subject: [PATCH 05/37] start adding code for LinearIndex.search --- sourmash/index.py | 35 ++++++++++++++++++++++++++++++++--- tests/test_index.py | 17 +++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 3eb08b1fa..9fb0e107a 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -48,11 +48,40 @@ def find(self, search_fn, *args, **kwargs): return matches def search(self, signature, *args, **kwargs): + """@@ + + Note, the "best only" hint is ignored by LinearIndex. + """ + + # check arguments + if 'threshold' not in kwargs: + raise TypeError("'search' requires 'threshold'") + + do_containment = kwargs.get('do_containment', False) + ignore_abundance = kwargs.get('ignore_abundance', False) + + # configure search - containment? ignore abundance? + if do_containment: + query_match = lambda x: query.contained_by(x, downsample=True) + else: + query_match = lambda x: query.similarity( + x, downsample=True, ignore_abundance=ignore_abundance) + + # do the actual search: matches = [] - for node in self.signatures: - if signature.similarity(node): - matches.append(node) + for ss in self.signatures: + similarity = query_match(ss) + if similarity >= threshold: + # @CTB: check duplicates via md5sum - here or ?? + sr = SearchResult(similarity=similarity, + match_sig=ss, + md5=ss.md5sum(), + filename = None, + name=ss.name()) + matches.append(sr) + + # @CTB sort here or ?? return matches def gather(self, signature, *args, **kwargs): diff --git a/tests/test_index.py b/tests/test_index.py index 8e6e7af39..225b0c8a2 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -1,7 +1,9 @@ from __future__ import print_function, unicode_literals +import sourmash from sourmash.index import LinearIndex from sourmash_lib.sbt import SBT, GraphFactory, Leaf +from . import sourmash_tst_utils as utils def test_simple_index(n_children): @@ -60,3 +62,18 @@ def search_kmer(obj, seq): print([x.metadata for x in root.find(search_kmer, "AAAAG")]) print([x.metadata for x in root.find(search_kmer, "CAAAA")]) print([x.metadata for x in root.find(search_kmer, "GAAAA")]) + + +def test_linear_index_search(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + lidx = LinearIndex() + lidx.insert(ss2) + lidx.insert(ss47) + lidx.insert(ss63) From 412e1012951dd19f8b29921c52e0d229d758a013 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 5 Jan 2019 07:50:00 -0800 Subject: [PATCH 06/37] an initial test of LinearIndex.search --- sourmash/index.py | 10 +++++++++- tests/test_index.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/sourmash/index.py b/sourmash/index.py index 9fb0e107a..44493b1ff 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -1,4 +1,11 @@ +"An Abstract Base Class for collections of signatures." 
+ from abc import ABCMeta, abstractmethod +from collections import namedtuple + +# @CTB copied out of search.py to deal with import order issues, #willfix +SearchResult = namedtuple('SearchResult', + 'similarity, match_sig, md5, filename, name') # compatible with Python 2 *and* 3: @@ -47,7 +54,7 @@ def find(self, search_fn, *args, **kwargs): matches.append(node) return matches - def search(self, signature, *args, **kwargs): + def search(self, query, *args, **kwargs): """@@ Note, the "best only" hint is ignored by LinearIndex. @@ -56,6 +63,7 @@ def search(self, signature, *args, **kwargs): # check arguments if 'threshold' not in kwargs: raise TypeError("'search' requires 'threshold'") + threshold = kwargs['threshold'] do_containment = kwargs.get('do_containment', False) ignore_abundance = kwargs.get('ignore_abundance', False) diff --git a/tests/test_index.py b/tests/test_index.py index 225b0c8a2..7c8cd7e98 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -77,3 +77,32 @@ def test_linear_index_search(): lidx.insert(ss2) lidx.insert(ss47) lidx.insert(ss63) + + # now, search for sig2 + sr = lidx.search(ss2, threshold=1.0) + print([s.name for s in sr]) + assert len(sr) == 1 + assert sr[0].match_sig == ss2 + + # search for sig47 with lower threshold; search order not guaranteed. + sr = lidx.search(ss47, threshold=0.1) + print([s.name for s in sr]) + assert len(sr) == 2 + sr.sort(key=lambda x: -x.similarity) + assert sr[0].match_sig == ss47 + assert sr[1].match_sig == ss63 + + # search for sig63 with lower threshold; search order not guaranteed. + sr = lidx.search(ss63, threshold=0.1) + print([s.name for s in sr]) + assert len(sr) == 2 + sr.sort(key=lambda x: -x.similarity) + assert sr[0].match_sig == ss63 + assert sr[1].match_sig == ss47 + + # search for sig63 with high threshold => 1 match + sr = lidx.search(ss63, threshold=0.8) + print([s.name for s in sr]) + assert len(sr) == 1 + sr.sort(key=lambda x: -x.similarity) + assert sr[0].match_sig == ss63 From 5f197d313c6c95de37619eab6d5f872438c0b637 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 6 Sep 2019 13:44:01 -0400 Subject: [PATCH 07/37] implement save & load for LinearIndex --- sourmash/index.py | 11 +++++++-- tests/test_index.py | 56 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 44493b1ff..1422039db 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -96,8 +96,15 @@ def gather(self, signature, *args, **kwargs): pass def save(self, path): - pass + from .signature import save_signatures + with open(path, 'wt') as fp: + save_signatures(self.signatures, fp) @classmethod def load(cls, location): - pass + from .signature import load_signatures + si = load_signatures(location) + + lidx = LinearIndex() + lidx.signatures.update(si) + return lidx diff --git a/tests/test_index.py b/tests/test_index.py index 7c8cd7e98..08ee27435 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -1,5 +1,6 @@ from __future__ import print_function, unicode_literals +import os import sourmash from sourmash.index import LinearIndex from sourmash_lib.sbt import SBT, GraphFactory, Leaf @@ -106,3 +107,58 @@ def test_linear_index_search(): assert len(sr) == 1 sr.sort(key=lambda x: -x.similarity) assert sr[0].match_sig == ss63 + + +def test_linear_index_save(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + linear = LinearIndex() + linear.insert(ss2) + linear.insert(ss47) + linear.insert(ss63) + + with utils.TempDirectory() as location: + filename = os.path.join(location, 'foo') + linear.save(filename) + + from sourmash import load_signatures + si = set(load_signatures(filename)) + + x = { ss2, ss47, ss63} + + print(len(si)) + print(len(x)) + + assert si == x + + +def test_linear_index_save_load(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + linear = LinearIndex() + linear.insert(ss2) + linear.insert(ss47) + linear.insert(ss63) + + with utils.TempDirectory() as location: + filename = os.path.join(location, 'foo') + linear.save(filename) + linear2 = LinearIndex.load(filename) + + # now, search for sig2 + sr = linear2.search(ss2, threshold=1.0) + print([s.name for s in sr]) + assert len(sr) == 1 + assert sr[0].match_sig == ss2 From 723d3df758425e831bfecc6a6e9d99816cf913a3 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 6 Sep 2019 13:45:54 -0400 Subject: [PATCH 08/37] add test for LinearIndex.load --- tests/test_index.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_index.py b/tests/test_index.py index 08ee27435..88eba0ec5 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -138,6 +138,28 @@ def test_linear_index_save(): assert si == x +def test_linear_index_load(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + with utils.TempDirectory() as location: + from sourmash import save_signatures + + filename = os.path.join(location, 'foo') + with open(filename, 'wt') as fp: + sourmash.save_signatures([ss2, ss47, ss63], fp) + + linear = LinearIndex.load(filename) + + x = { ss2, ss47, ss63} + assert linear.signatures == x + + def test_linear_index_save_load(): sig2 = utils.get_test_data('2.fa.sig') sig47 = utils.get_test_data('47.fa.sig') From 577c9fa29ceecfded68caa8e5e661d9d4d169a10 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 6 Sep 2019 13:56:49 -0400 Subject: [PATCH 09/37] implemented & tested LinearIndex.gather --- sourmash/index.py | 14 ++++++++++++-- tests/test_index.py | 27 +++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 1422039db..002bcfedd 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -92,8 +92,18 @@ def search(self, query, *args, **kwargs): # @CTB sort here or ?? return matches - def gather(self, signature, *args, **kwargs): - pass + def gather(self, query, *args, **kwargs): + # check arguments + threshold = kwargs.get('threshold', 0) + + results = [] + for ss in self.signatures: + cont = query.minhash.containment_ignore_maxhash(ss.minhash) + if cont > threshold: + results.append((cont, ss)) + results.sort(reverse=True) + + return results def save(self, path): from .signature import save_signatures diff --git a/tests/test_index.py b/tests/test_index.py index 88eba0ec5..06e2903b1 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -109,6 +109,33 @@ def test_linear_index_search(): assert sr[0].match_sig == ss63 +def test_linear_index_gather(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + lidx = LinearIndex() + lidx.insert(ss2) + lidx.insert(ss47) + lidx.insert(ss63) + + matches = lidx.gather(ss2) + assert len(matches) == 1 + assert matches[0][0] == 1.0 + assert matches[0][1] == ss2 + + matches = lidx.gather(ss47) + assert len(matches) == 2 + assert matches[0][0] == 1.0 + assert matches[0][1] == ss47 + assert round(matches[1][0], 2) == 0.49 + assert matches[1][1] == ss63 + + def test_linear_index_save(): sig2 = utils.get_test_data('2.fa.sig') sig47 = utils.get_test_data('47.fa.sig') From eb41b93b13ca9b5fe07711440dad2fd9f62ea4b1 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 6 Sep 2019 14:26:04 -0400 Subject: [PATCH 10/37] implement LinearIndex in load_databases and search funtions --- sourmash/index.py | 17 ++++++++++------- sourmash/search.py | 18 +++++++----------- sourmash/sourmash_args.py | 19 ++++++++++--------- tests/test_index.py | 10 +++++++--- tests/test_sourmash.py | 3 ++- 5 files changed, 36 insertions(+), 31 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 002bcfedd..57bce5bdb 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -40,11 +40,15 @@ def load(cls, location, leaf_loader=None, storage=None, print_version_warning=Tr class LinearIndex(Index): - def __init__(self): - self.signatures = set() + def __init__(self, signatures=[], filename=None): + self.signatures = list(signatures) + self.filename = filename + + def __len__(self): + return len(self.signatures) def insert(self, node): - self.signatures.add(node) + self.signatures.append(node) def find(self, search_fn, *args, **kwargs): matches = [] @@ -85,11 +89,11 @@ def search(self, query, *args, **kwargs): sr = SearchResult(similarity=similarity, match_sig=ss, md5=ss.md5sum(), - filename = None, + filename = self.filename, name=ss.name()) matches.append(sr) - # @CTB sort here or ?? + matches.sort(key=lambda x: -x.similarity) return matches def gather(self, query, *args, **kwargs): @@ -115,6 +119,5 @@ def load(cls, location): from .signature import load_signatures si = load_signatures(location) - lidx = LinearIndex() - lidx.signatures.update(si) + lidx = LinearIndex(si, filename=location) return lidx diff --git a/sourmash/search.py b/sourmash/search.py index 210640581..f1fd6fc93 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -88,18 +88,14 @@ def search_databases(query, databases, threshold, do_containment, best_only, results.append(sr) else: # list of signatures - for ss in obj: - similarity = query_match(ss) - if similarity >= threshold and \ - ss.md5sum() not in found_md5: - sr = SearchResult(similarity=similarity, - match_sig=ss, - md5=ss.md5sum(), - filename=filename, - name=ss.name()) - found_md5.add(sr.md5) + linear = obj + search_iter = linear.search(query, threshold=threshold, + do_containment=do_containment, + ignore_abundance=ignore_abundance) + for sr in search_iter: + if sr.md5 not in found_md5: results.append(sr) - + found_md5.add(sr.md5) # sort results on similarity (reverse) results.sort(key=lambda x: -x.similarity) diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py index 33ca36565..09975463c 100644 --- a/sourmash/sourmash_args.py +++ b/sourmash/sourmash_args.py @@ -7,6 +7,7 @@ from . import signature from .logging import notify, error +from .index import LinearIndex from . import signature as sig from .sbt import SBT from .sbtmh import SigLeaf @@ -297,12 +298,12 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, traverse=False): ksize=query_ksize, select_moltype=query_moltype) siglist = filter_compatible_signatures(query, siglist, 1) - siglist = list(siglist) - databases.append((siglist, sbt_or_sigfile, False)) - notify('loaded {} signatures from {}', len(siglist), + linear = LinearIndex(siglist, filename=sigfile) + databases.append((linear, sbt_or_sigfile, False)) + notify('loaded {} signatures from {}', len(linear), sigfile, end='\r') - n_signatures += len(siglist) - except Exception: # ignore errors with traverse + n_signatures += len(linear) + except Exception: # ignore errors with traverse pass # done! 
jump to beginning of main 'for' loop @@ -355,12 +356,12 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, traverse=False): raise ValueError siglist = filter_compatible_signatures(query, siglist, False) - siglist = list(siglist) + linear = LinearIndex(siglist, filename=sbt_or_sigfile) + databases.append((linear, sbt_or_sigfile, 'signature')) - databases.append((siglist, sbt_or_sigfile, 'signature')) - notify('loaded {} signatures from {}', len(siglist), + notify('loaded {} signatures from {}', len(linear), sbt_or_sigfile, end='\r') - n_signatures += len(siglist) + n_signatures += len(linear) except (EnvironmentError, ValueError): error("\nCannot open file '{}'", sbt_or_sigfile) sys.exit(-1) diff --git a/tests/test_index.py b/tests/test_index.py index 06e2903b1..e232770ad 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -155,13 +155,16 @@ def test_linear_index_save(): linear.save(filename) from sourmash import load_signatures - si = set(load_signatures(filename)) + si = list(load_signatures(filename)) - x = { ss2, ss47, ss63} + x = [ ss2, ss47, ss63 ] print(len(si)) print(len(x)) + print(si) + print(x) + assert si == x @@ -183,8 +186,9 @@ def test_linear_index_load(): linear = LinearIndex.load(filename) - x = { ss2, ss47, ss63} + x = [ ss2, ss47, ss63 ] assert linear.signatures == x + assert linear.filename == filename def test_linear_index_save_load(): diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index d7abdf7a5..9363a51b9 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -613,9 +613,10 @@ def test_search_csv(): with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) + print('xxx', row) assert float(row['similarity']) == 0.93 assert row['name'].endswith('short2.fa') - assert row['filename'].endswith('short2.fa.sig') + assert row['filename'].endswith('short2.fa.sig'), row['filename'] assert row['md5'] == '914591cd1130aa915fe0c0c63db8f19d' From 2a1342831d564a6cf5b08a35cc6b3c48128870d5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 6 Sep 2019 14:29:04 -0400 Subject: [PATCH 11/37] implemented LinearIndex for gather, too --- sourmash/index.py | 2 +- sourmash/search.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 57bce5bdb..af20ae12e 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -104,7 +104,7 @@ def gather(self, query, *args, **kwargs): for ss in self.signatures: cont = query.minhash.containment_ignore_maxhash(ss.minhash) if cont > threshold: - results.append((cont, ss)) + results.append((cont, ss, self.filename)) results.sort(reverse=True) return results diff --git a/sourmash/search.py b/sourmash/search.py index f1fd6fc93..a98b20067 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -162,10 +162,10 @@ def find_best(dblist, query, remainder): # search a signature else: - for ss in obj: - similarity = query.minhash.containment_ignore_maxhash(ss.minhash) - if similarity > 0.0: - results.append((similarity, ss, filename)) + linear = obj + gather_iter = linear.gather(query) + for similarity, ss, filename in gather_iter: + results.append((similarity, ss, filename)) if not results: return None, None, None From 0803af3ca042fba341447d5a266f0547cecf63f9 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 6 Sep 2019 14:37:07 -0400 Subject: [PATCH 12/37] implemented search in LCA db --- sourmash/lca/lca_utils.py | 28 +++++++++++++++++++++++----- sourmash/search.py | 15 ++++++--------- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index d542ca2c0..dcba9f0c4 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -24,6 +24,9 @@ # type to store an element in a taxonomic lineage LineagePair = namedtuple('LineagePair', ['rank', 'name']) +# @CTB copied out of search.py to deal with import order issues, #willfix +SearchResult = namedtuple('SearchResult', + 'similarity, match_sig, md5, filename, name') def check_files_exist(*files): ret = True @@ -262,11 +265,26 @@ def save(self, db_name): json.dump(save_d, fp) - def find(self, search_fn, *args, **kwargs): - pass - - def search(self, sig): - pass + def search(self, query, *args, **kwargs): + # check arguments + if 'threshold' not in kwargs: + raise TypeError("'search' requires 'threshold'") + threshold = kwargs['threshold'] + do_containment = kwargs.get('do_containment', False) + # @CTB ignore_abundance? + + results = [] + for x in self.find(query.minhash, threshold, do_containment): + (score, match_sig, md5, filename, name) = x + sr = SearchResult(similarity=score, + match_sig=match_sig, + md5=md5, + filename=filename, + name=name) + results.append(sr) + + results.sort(key=lambda x: -x.similarity) + return results def gather(self, sig): pass diff --git a/sourmash/search.py b/sourmash/search.py index a98b20067..95f5c44d6 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -76,16 +76,13 @@ def search_databases(query, databases, threshold, do_containment, best_only, elif filetype == 'LCA': lca_db = obj - for x in lca_db.find(query.minhash, threshold, do_containment): - (score, match_sig, md5, filename, name) = x - if md5 not in found_md5: - sr = SearchResult(similarity=score, - match_sig=match_sig, - md5=md5, - filename=filename, - name=name) - found_md5.add(sr.md5) + search_iter = lca_db.search(query, threshold=threshold, + do_containment=do_containment, + ignore_abundance=ignore_abundance) + for sr in search_iter: + if sr.md5 not in found_md5: results.append(sr) + found_md5.add(sr.md5) else: # list of signatures linear = obj From 9406e1008d7ca0b06357a0525f8832aab9152139 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 6 Sep 2019 14:40:37 -0400 Subject: [PATCH 13/37] implemented gather on LCA DBs --- sourmash/lca/lca_utils.py | 11 +++++++++-- sourmash/search.py | 8 +++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index dcba9f0c4..1974518fb 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -286,8 +286,15 @@ def search(self, query, *args, **kwargs): results.sort(key=lambda x: -x.similarity) return results - def gather(self, sig): - pass + def gather(self, query, *args, **kwargs): + results = [] + for x in self.find(query.minhash, 0.0, + containment=True, ignore_scaled=True): + (score, match_sig, md5, filename, name) = x + if score > 0.0: + results.append((score, match_sig, filename)) + + return results def insert(self, node): pass diff --git a/sourmash/search.py b/sourmash/search.py index 95f5c44d6..e70b38628 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -151,11 +151,9 @@ def find_best(dblist, query, remainder): # or an LCA database elif filetype == 'LCA': lca_db = obj - for x in lca_db.find(query.minhash, 0.0, - containment=True, ignore_scaled=True): - (score, match_sig, md5, filename, name) = x - if score > 0.0: - results.append((score, match_sig, filename)) + gather_iter = lca_db.gather(query) + for similarity, ss, filename in gather_iter: + results.append((similarity, ss, filename)) # search a signature else: From 913721232c5d8fa455bc6001be7a058f70da5d37 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 6 Sep 2019 14:46:50 -0400 Subject: [PATCH 14/37] implemented gather on SBT --- sourmash/sbt.py | 16 ++++++++++++++-- sourmash/search.py | 9 +++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 6d846ca4d..676242f2f 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -259,8 +259,20 @@ def find(self, search_fn, *args, **kwargs): def search(self, sig): pass - def gather(self, sig): - pass + def gather(self, query, *args, **kwargs): + from .sbtmh import GatherMinHashesFindBestIgnoreMaxHash + threshold = kwargs['threshold'] + + search_fn = GatherMinHashesFindBestIgnoreMaxHash(threshold).search + + results = [] + for leaf in self.find(search_fn, query, threshold): + leaf_e = leaf.data.minhash + similarity = query.minhash.containment_ignore_maxhash(leaf_e) + if similarity > 0.0: + results.append((similarity, leaf.data)) + + return results def _rebuild_node(self, pos=0): """Recursively rebuilds an internal node (if it is not present). diff --git a/sourmash/search.py b/sourmash/search.py index e70b38628..2fa3b4a8f 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -141,13 +141,10 @@ def find_best(dblist, query, remainder): # search a tree if filetype == 'SBT': tree = obj - search_fn = GatherMinHashesFindBestIgnoreMaxHash(best_ctn_sofar).search + gather_iter = tree.gather(query, threshold=best_ctn_sofar) + for similarity, ss in gather_iter: + results.append((similarity, ss, filename)) - for leaf in tree.find(search_fn, query, best_ctn_sofar): - leaf_e = leaf.data.minhash - similarity = query.minhash.containment_ignore_maxhash(leaf_e) - if similarity > 0.0: - results.append((similarity, leaf.data, filename)) # or an LCA database elif filetype == 'LCA': lca_db = obj From 100cbd9f5d091fafae0d1977eb0a22a65b0274b7 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 6 Sep 2019 14:57:39 -0400 Subject: [PATCH 15/37] implemented search on SBTs --- sourmash/sbt.py | 53 ++++++++++++++++++++++++++++++++++++++++++++-- sourmash/search.py | 36 ++++++------------------------- 2 files changed, 58 insertions(+), 31 deletions(-) diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 676242f2f..98302920c 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -77,6 +77,9 @@ def search_transcript(node, seq, threshold): } NodePos = namedtuple("NodePos", ["pos", "node"]) +# @CTB copied out of search.py to deal with import order issues, #willfix +SearchResult = namedtuple('SearchResult', + 'similarity, match_sig, md5, filename, name') class GraphFactory(object): """Build new nodegraphs (Bloom filters) of a specific (fixed) size. @@ -256,8 +259,54 @@ def find(self, search_fn, *args, **kwargs): queue.extend(c.pos for c in self.children(node_p)) return matches - def search(self, sig): - pass + def search(self, query, *args, **kwargs): + from .sbtmh import search_minhashes, search_minhashes_containment + from .sbtmh import SearchMinHashesFindBest + from .signature import SourmashSignature + + threshold = kwargs['threshold'] + ignore_abundance = kwargs['ignore_abundance'] + do_containment = kwargs['do_containment'] + best_only = kwargs['best_only'] + + search_fn = search_minhashes + query_match = lambda x: query.similarity( + x, downsample=True, ignore_abundance=ignore_abundance) + if do_containment: + search_fn = search_minhashes_containment + query_match = lambda x: query.contained_by(x, downsample=True) + + if best_only: # this needs to be reset for each SBT + search_fn = SearchMinHashesFindBest().search + + # figure out scaled value of tree, downsample query if needed. + leaf = next(iter(self.leaves())) + tree_mh = leaf.data.minhash + + tree_query = query + if tree_mh.scaled and query.minhash.scaled and \ + tree_mh.scaled > query.minhash.scaled: + resampled_query_mh = tree_query.minhash + resampled_query_mh = resampled_query_mh.downsample_scaled(tree_mh.scaled) + tree_query = SourmashSignature(resampled_query_mh) + + # now, search! + results = [] + for leaf in self.find(search_fn, tree_query, threshold): + similarity = query_match(leaf.data) + + # tree search should always/only return matches above threshold + assert similarity >= threshold + + sr = SearchResult(similarity=similarity, + match_sig=leaf.data, + md5=leaf.data.md5sum(), + name=leaf.data.name(), + filename=None) + results.append(sr) + + return results + def gather(self, query, *args, **kwargs): from .sbtmh import GatherMinHashesFindBestIgnoreMaxHash diff --git a/sourmash/search.py b/sourmash/search.py index 2fa3b4a8f..182f18045 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -42,37 +42,15 @@ def search_databases(query, databases, threshold, do_containment, best_only, found_md5 = set() for (obj, filename, filetype) in databases: if filetype == 'SBT': - if best_only: # this needs to be reset for each SBT - search_fn = SearchMinHashesFindBest().search - tree = obj - - # figure out scaled value of tree, downsample query if needed. - leaf = next(iter(tree.leaves())) - tree_mh = leaf.data.minhash - - tree_query = query - if tree_mh.scaled and query.minhash.scaled and \ - tree_mh.scaled > query.minhash.scaled: - resampled_query_mh = tree_query.minhash - resampled_query_mh = resampled_query_mh.downsample_scaled(tree_mh.scaled) - tree_query = SourmashSignature(resampled_query_mh) - - # now, search! 
- for leaf in tree.find(search_fn, tree_query, threshold): - similarity = query_match(leaf.data) - - # tree search should always/only return matches above threshold - assert similarity >= threshold - - if leaf.data.md5sum() not in found_md5: - sr = SearchResult(similarity=similarity, - match_sig=leaf.data, - md5=leaf.data.md5sum(), - filename=filename, - name=leaf.data.name()) - found_md5.add(sr.md5) + search_iter = tree.search(query, threshold=threshold, + do_containment=do_containment, + ignore_abundance=ignore_abundance, + best_only=best_only) + for sr in search_iter: + if sr.md5 not in found_md5: results.append(sr) + found_md5.add(sr.md5) elif filetype == 'LCA': lca_db = obj From 980a470e51c056c9a9f8e4b72aaa9be14a9ac2aa Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 6 Sep 2019 16:27:26 -0400 Subject: [PATCH 16/37] removed conditionals in search & gather in favor of Index interface --- sourmash/sbt.py | 2 +- sourmash/search.py | 63 +++++++++++----------------------------------- 2 files changed, 15 insertions(+), 50 deletions(-) diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 98302920c..2578d6f2c 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -319,7 +319,7 @@ def gather(self, query, *args, **kwargs): leaf_e = leaf.data.minhash similarity = query.minhash.containment_ignore_maxhash(leaf_e) if similarity > 0.0: - results.append((similarity, leaf.data)) + results.append((similarity, leaf.data, None)) return results diff --git a/sourmash/search.py b/sourmash/search.py index 182f18045..40e01c977 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -41,36 +41,15 @@ def search_databases(query, databases, threshold, do_containment, best_only, results = [] found_md5 = set() for (obj, filename, filetype) in databases: - if filetype == 'SBT': - tree = obj - search_iter = tree.search(query, threshold=threshold, - do_containment=do_containment, - ignore_abundance=ignore_abundance, - best_only=best_only) - for sr in search_iter: - if sr.md5 not in found_md5: - results.append(sr) - found_md5.add(sr.md5) - - elif filetype == 'LCA': - lca_db = obj - search_iter = lca_db.search(query, threshold=threshold, - do_containment=do_containment, - ignore_abundance=ignore_abundance) - for sr in search_iter: - if sr.md5 not in found_md5: - results.append(sr) - found_md5.add(sr.md5) - - else: # list of signatures - linear = obj - search_iter = linear.search(query, threshold=threshold, - do_containment=do_containment, - ignore_abundance=ignore_abundance) - for sr in search_iter: - if sr.md5 not in found_md5: - results.append(sr) - found_md5.add(sr.md5) + search_iter = obj.search(query, threshold=threshold, + do_containment=do_containment, + ignore_abundance=ignore_abundance, + best_only=best_only) + for sr in search_iter: + if sr.md5 not in found_md5: + results.append(sr) + found_md5.add(sr.md5) + # sort results on similarity (reverse) results.sort(key=lambda x: -x.similarity) @@ -117,25 +96,11 @@ def find_best(dblist, query, remainder): results = [] for (obj, filename, filetype) in dblist: # search a tree - if filetype == 'SBT': - tree = obj - gather_iter = tree.gather(query, threshold=best_ctn_sofar) - for similarity, ss in gather_iter: - results.append((similarity, ss, filename)) - - # or an LCA database - elif filetype == 'LCA': - lca_db = obj - gather_iter = lca_db.gather(query) - for similarity, ss, filename in gather_iter: - results.append((similarity, ss, filename)) - - # search a signature - else: - linear = obj - gather_iter = linear.gather(query) - for similarity, ss, filename 
in gather_iter: - results.append((similarity, ss, filename)) + tree = obj + gather_iter = tree.gather(query, threshold=best_ctn_sofar) + for similarity, ss, filename in gather_iter: + results.append((similarity, ss, filename)) + if not results: return None, None, None From 622ddeea03e99dbf293e0be21a3fbb0d6b577041 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 6 Sep 2019 19:38:14 -0400 Subject: [PATCH 17/37] fix remaining tests for search & gather --- sourmash/search.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sourmash/search.py b/sourmash/search.py index 40e01c977..e6aeaf7ef 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -86,19 +86,23 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance): # define a function to do a 'best' search and get only top match. def find_best(dblist, query, remainder): + # @CTB this is a tree-specific optimization, I think - should fix. # precompute best containment from all of the remainders best_ctn_sofar = 0.0 - for x in remainder: - ctn = query.minhash.containment_ignore_maxhash(x.minhash) - if ctn > best_ctn_sofar: - best_ctn_sofar = ctn +# for x in remainder: +# ctn = query.minhash.containment_ignore_maxhash(x.minhash) +# if ctn > best_ctn_sofar: +# best_ctn_sofar = ctn results = [] for (obj, filename, filetype) in dblist: - # search a tree - tree = obj - gather_iter = tree.gather(query, threshold=best_ctn_sofar) - for similarity, ss, filename in gather_iter: + # search a tree! + gather_iter = obj.gather(query, threshold=best_ctn_sofar) + for similarity, ss, fname in gather_iter: + # @CTB hackity-hack hack, this is because trees don't have + # filenames at the moment. + if fname is None and filename: + fname = filename results.append((similarity, ss, filename)) From dd55de6c0e1fed81aa7f17e2b38460ca335d0907 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 6 Sep 2019 19:40:42 -0400 Subject: [PATCH 18/37] remove some debugging code --- tests/test_sourmash.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 9363a51b9..d7abdf7a5 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -613,10 +613,9 @@ def test_search_csv(): with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) - print('xxx', row) assert float(row['similarity']) == 0.93 assert row['name'].endswith('short2.fa') - assert row['filename'].endswith('short2.fa.sig'), row['filename'] + assert row['filename'].endswith('short2.fa.sig') assert row['md5'] == '914591cd1130aa915fe0c0c63db8f19d' From 6f2e4c23157f8a1e3232bcdb608574d5d64547b6 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2019 05:56:56 -0700 Subject: [PATCH 19/37] fix my errant default parameter ways --- sourmash/index.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index af20ae12e..ef73dbec6 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -40,8 +40,10 @@ def load(cls, location, leaf_loader=None, storage=None, print_version_warning=Tr class LinearIndex(Index): - def __init__(self, signatures=[], filename=None): - self.signatures = list(signatures) + def __init__(self, signatures=None, filename=None): + self.signatures = [] + if signatures: + self.signatures = list(signatures) self.filename = filename def __len__(self): From f5e622b15e24dc3be22016fe7c11fbec2a6ed774 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sun, 8 Sep 2019 06:16:28 -0700 Subject: [PATCH 20/37] cleanup and simplification of gather code --- sourmash/index.py | 5 +---- sourmash/lca/lca_utils.py | 2 +- sourmash/sbt.py | 7 +++---- sourmash/sbtmh.py | 17 ++++++++--------- sourmash/search.py | 23 ++++------------------- 5 files changed, 17 insertions(+), 37 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index ef73dbec6..dea966eec 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -99,13 +99,10 @@ def search(self, query, *args, **kwargs): return matches def gather(self, query, *args, **kwargs): - # check arguments - threshold = kwargs.get('threshold', 0) - results = [] for ss in self.signatures: cont = query.minhash.containment_ignore_maxhash(ss.minhash) - if cont > threshold: + if cont: results.append((cont, ss, self.filename)) results.sort(reverse=True) diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 1974518fb..73a78a8af 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -291,7 +291,7 @@ def gather(self, query, *args, **kwargs): for x in self.find(query.minhash, 0.0, containment=True, ignore_scaled=True): (score, match_sig, md5, filename, name) = x - if score > 0.0: + if score: results.append((score, match_sig, filename)) return results diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 2578d6f2c..10661b031 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -310,12 +310,11 @@ def search(self, query, *args, **kwargs): def gather(self, query, *args, **kwargs): from .sbtmh import GatherMinHashesFindBestIgnoreMaxHash - threshold = kwargs['threshold'] - - search_fn = GatherMinHashesFindBestIgnoreMaxHash(threshold).search + # use a tree search function that keeps track of its best match. + search_fn = GatherMinHashesFindBestIgnoreMaxHash().search results = [] - for leaf in self.find(search_fn, query, threshold): + for leaf in self.find(search_fn, query, 0.0): leaf_e = leaf.data.minhash similarity = query.minhash.containment_ignore_maxhash(leaf_e) if similarity > 0.0: diff --git a/sourmash/sbtmh.py b/sourmash/sbtmh.py index 066b2a952..5f8c20f0b 100644 --- a/sourmash/sbtmh.py +++ b/sourmash/sbtmh.py @@ -204,8 +204,8 @@ def search_minhashes_containment(node, sig, threshold, class GatherMinHashesFindBestIgnoreMaxHash(object): - def __init__(self, initial_best_match=0.0): - self.best_match = initial_best_match + def __init__(self): + self.best_match = 0 def search(self, node, query, threshold, results=None): score = 0 @@ -235,12 +235,11 @@ def search(self, node, query, threshold, results=None): if results is not None: results[node.name] = score - if score >= threshold: - # have we done better than this? if no, truncate searches below. - if score >= self.best_match: - # update best if it's a leaf node... - if isinstance(node, SigLeaf): - self.best_match = score - return 1 + # have we done better than this? if no, truncate searches below. + if score >= self.best_match: + # update best if it's a leaf node... + if isinstance(node, SigLeaf): + self.best_match = score + return 1 return 0 diff --git a/sourmash/search.py b/sourmash/search.py index e6aeaf7ef..08624f396 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -84,20 +84,11 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance): orig_scaled = orig_query.minhash.scaled # define a function to do a 'best' search and get only top match. - def find_best(dblist, query, remainder): - - # @CTB this is a tree-specific optimization, I think - should fix. 
- # precompute best containment from all of the remainders - best_ctn_sofar = 0.0 -# for x in remainder: -# ctn = query.minhash.containment_ignore_maxhash(x.minhash) -# if ctn > best_ctn_sofar: -# best_ctn_sofar = ctn - + def find_best(dblist, query): results = [] for (obj, filename, filetype) in dblist: # search a tree! - gather_iter = obj.gather(query, threshold=best_ctn_sofar) + gather_iter = obj.gather(query) for similarity, ss, fname in gather_iter: # @CTB hackity-hack hack, this is because trees don't have # filenames at the moment. @@ -105,7 +96,6 @@ def find_best(dblist, query, remainder): fname = filename results.append((similarity, ss, filename)) - if not results: return None, None, None @@ -113,19 +103,14 @@ def find_best(dblist, query, remainder): results.sort(key=lambda x: (-x[0], x[1].name())) # reverse sort on similarity, and then on name best_similarity, best_leaf, filename = results[0] - for x in results[1:]: - remainder.add(x[1]) - return best_similarity, best_leaf, filename - # construct a new query that doesn't have the max_hash attribute set. query = build_new_query([], orig_query) cmp_scaled = 0 - remainder = set() while 1: - best_similarity, best_leaf, filename = find_best(databases, query, remainder) + best_similarity, best_leaf, filename = find_best(databases, query) if not best_leaf: # no matches at all! break @@ -136,7 +121,7 @@ def find_best(dblist, query, remainder): # figure out what the resolution of the banding on the subject is if not best_leaf.minhash.max_hash: error('Best hash match in sbt_gather has no max_hash') - error('Please prepare database of sequences with --scaled') + error('Please prepare gather databases with --scaled') sys.exit(-1) match_scaled = best_leaf.minhash.scaled From 088395de379175b1af82a5e4142b4acf092d906c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2019 06:57:46 -0700 Subject: [PATCH 21/37] significant refactor of gather code --- sourmash/index.py | 4 +- sourmash/search.py | 116 +++++++++++++++++++++++++-------------------- 2 files changed, 67 insertions(+), 53 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index dea966eec..47586db8a 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -99,12 +99,14 @@ def search(self, query, *args, **kwargs): return matches def gather(self, query, *args, **kwargs): + "Return the best containment in the list." results = [] for ss in self.signatures: cont = query.minhash.containment_ignore_maxhash(ss.minhash) if cont: results.append((cont, ss, self.filename)) - results.sort(reverse=True) + + results.sort(reverse=True) # CTB: sort on ss.name() too? return results diff --git a/sourmash/search.py b/sourmash/search.py index 08624f396..580261dbb 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -57,8 +57,8 @@ def search_databases(query, databases, threshold, do_containment, best_only, return results -# define a function to build new query object -def build_new_query(to_remove, old_query, scaled=None): +# build a new query object, subtracting found mins and downsampling if needed. 
+def _build_new_query(to_remove, old_query, scaled=None): e = old_query.minhash e.remove_many(to_remove) if scaled: @@ -70,64 +70,74 @@ def build_new_query(to_remove, old_query, scaled=None): 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, leaf') -def gather_databases(query, databases, threshold_bp, ignore_abundance): - orig_query = query - orig_mins = orig_query.minhash.get_hashes() - orig_abunds = { k: 1 for k in orig_mins } +def _find_best(dblist, query): + """ + Search for the best containment, return precisely one match. + """ - # do we pay attention to abundances? - if orig_query.minhash.track_abundance and not ignore_abundance: - import numpy as np - orig_abunds = orig_query.minhash.get_mins(with_abundance=True) + best_cont = 0.0 + best_match = None + best_filename = None - # store the scaled value for the query - orig_scaled = orig_query.minhash.scaled + # search across all databases + for (obj, filename, filetype) in dblist: + for cont, match, fname in obj.gather(query): + if cont: + # note, break ties based on name, to ensure consistent order. + if (cont == best_cont and match.name() < best_match.name()) or\ + cont > best_cont: + # update best match. + best_cont = cont + best_match = match - # define a function to do a 'best' search and get only top match. - def find_best(dblist, query): - results = [] - for (obj, filename, filetype) in dblist: - # search a tree! - gather_iter = obj.gather(query) - for similarity, ss, fname in gather_iter: - # @CTB hackity-hack hack, this is because trees don't have - # filenames at the moment. - if fname is None and filename: - fname = filename - results.append((similarity, ss, filename)) + # some objects may not have associated filename (e.g. SBTs) + best_filename = fname or filename - if not results: - return None, None, None + if not best_match: + return None, None, None - # take the best result - results.sort(key=lambda x: (-x[0], x[1].name())) # reverse sort on similarity, and then on name - best_similarity, best_leaf, filename = results[0] + return best_cont, best_match, best_filename - return best_similarity, best_leaf, filename - # construct a new query that doesn't have the max_hash attribute set. - query = build_new_query([], orig_query) +def gather_databases(query, databases, threshold_bp, ignore_abundance): + """ + Iteratively find the best containment of `query` in all the `databases`, + until we find fewer than `threshold_bp` (estimated) bp in common. + """ + # track original query information for later usage. + track_abundance = query.minhash.track_abundance and not ignore_abundance + orig_mh = query.minhash + orig_mins = orig_mh.get_hashes() + orig_abunds = { k: 1 for k in orig_mins } - cmp_scaled = 0 + # do we pay attention to abundances? + if track_abundance: + import numpy as np + orig_abunds = orig_mh.get_mins(with_abundance=True) + + # construct a new query object for later modification. + # @CTB note this doesn't actually construct a new query object... + query = _build_new_query([], query) + + cmp_scaled = query.minhash.scaled # initialize with resolution of query while 1: - best_similarity, best_leaf, filename = find_best(databases, query) - if not best_leaf: # no matches at all! + best_cont, best_match, filename = _find_best(databases, query) + if not best_match: # no matches at all! 
break # subtract found hashes from search hashes, construct new search query_mins = set(query.minhash.get_hashes()) - found_mins = best_leaf.minhash.get_hashes() + found_mins = best_match.minhash.get_hashes() - # figure out what the resolution of the banding on the subject is - if not best_leaf.minhash.max_hash: - error('Best hash match in sbt_gather has no max_hash') + # Is the best match computed with scaled? Die if not. + match_scaled = best_match.minhash.scaled + if not match_scaled: + error('Best match in gather is not scaled.') error('Please prepare gather databases with --scaled') - sys.exit(-1) - - match_scaled = best_leaf.minhash.scaled + raise Exception # pick the highest scaled / lowest resolution - cmp_scaled = max(cmp_scaled, match_scaled, orig_scaled) + cmp_scaled = max(cmp_scaled, match_scaled) # eliminate mins under this new resolution. # (CTB note: this means that if a high scaled/low res signature is @@ -154,7 +164,7 @@ def find_best(dblist, query): f_orig_query = len(intersect_orig_mins) / float(len(orig_mins)) # calculate fractions wrt second denominator - metagenome size - orig_mh = orig_query.minhash.downsample_scaled(cmp_scaled) + orig_mh = orig_mh.downsample_scaled(cmp_scaled) query_n_mins = len(orig_mh) f_unique_to_query = len(intersect_mins) / float(query_n_mins) @@ -162,9 +172,10 @@ def find_best(dblist, query): f_unique_weighted = sum((orig_abunds[k] for k in intersect_mins)) \ / sum_abunds - intersect_abunds = list(sorted(orig_abunds[k] for k in intersect_mins)) + # calculate stats on abundances, if desired. average_abund, median_abund, std_abund = 0, 0, 0 - if orig_query.minhash.track_abundance and not ignore_abundance: + if track_abundance: + intersect_abunds = list((orig_abunds[k] for k in intersect_mins)) average_abund = np.mean(intersect_abunds) median_abund = np.median(intersect_abunds) std_abund = np.std(intersect_abunds) @@ -179,14 +190,15 @@ def find_best(dblist, query): median_abund=median_abund, std_abund=std_abund, filename=filename, - md5=best_leaf.md5sum(), - name=best_leaf.name(), - leaf=best_leaf) + md5=best_match.md5sum(), + name=best_match.name(), + leaf=best_match) - # construct a new query, minus the previous one. - query = build_new_query(found_mins, orig_query, cmp_scaled) - query_mins -= set(found_mins) + # construct a new query, subtracting hashes found in previous one. + query = _build_new_query(found_mins, query, cmp_scaled) + # compute weighted_missed: + query_mins -= set(found_mins) weighted_missed = sum((orig_abunds[k] for k in query_mins)) \ / sum_abunds From c281e79b009430acabf0461e211fc8bb7ccd8991 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2019 07:06:38 -0700 Subject: [PATCH 22/37] futher refactoring and simplification --- sourmash/sbt.py | 4 ++-- sourmash/sbtmh.py | 2 +- sourmash/search.py | 40 ++++++++++++++-------------------------- 3 files changed, 17 insertions(+), 29 deletions(-) diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 10661b031..127870424 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -309,9 +309,9 @@ def search(self, query, *args, **kwargs): def gather(self, query, *args, **kwargs): - from .sbtmh import GatherMinHashesFindBestIgnoreMaxHash + from .sbtmh import GatherMinHashes # use a tree search function that keeps track of its best match. 
- search_fn = GatherMinHashesFindBestIgnoreMaxHash().search + search_fn = GatherMinHashes().search results = [] for leaf in self.find(search_fn, query, 0.0): diff --git a/sourmash/sbtmh.py b/sourmash/sbtmh.py index 5f8c20f0b..44067d896 100644 --- a/sourmash/sbtmh.py +++ b/sourmash/sbtmh.py @@ -203,7 +203,7 @@ def search_minhashes_containment(node, sig, threshold, return 0 -class GatherMinHashesFindBestIgnoreMaxHash(object): +class GatherMinHashes(object): def __init__(self): self.best_match = 0 diff --git a/sourmash/search.py b/sourmash/search.py index 580261dbb..d9408e2b7 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -4,12 +4,10 @@ from .logging import notify, error from .signature import SourmashSignature -from .sbtmh import search_minhashes, search_minhashes_containment -from .sbtmh import SearchMinHashesFindBest, GatherMinHashesFindBestIgnoreMaxHash from ._minhash import get_max_hash_for_scaled -# generic SearchResult across individual signatures + SBTs. +# generic SearchResult. SearchResult = namedtuple('SearchResult', 'similarity, match_sig, md5, filename, name') @@ -30,14 +28,6 @@ def format_bp(bp): def search_databases(query, databases, threshold, do_containment, best_only, ignore_abundance): - # set up the search & score function(s) - similarity vs containment - search_fn = search_minhashes - query_match = lambda x: query.similarity( - x, downsample=True, ignore_abundance=ignore_abundance) - if do_containment: - search_fn = search_minhashes_containment - query_match = lambda x: query.contained_by(x, downsample=True) - results = [] found_md5 = set() for (obj, filename, filetype) in databases: @@ -50,26 +40,28 @@ def search_databases(query, databases, threshold, do_containment, best_only, results.append(sr) found_md5.add(sr.md5) - # sort results on similarity (reverse) results.sort(key=lambda x: -x.similarity) return results - -# build a new query object, subtracting found mins and downsampling if needed. -def _build_new_query(to_remove, old_query, scaled=None): - e = old_query.minhash - e.remove_many(to_remove) - if scaled: - e = e.downsample_scaled(scaled) - return SourmashSignature(e) - +### +### gather code +### GatherResult = namedtuple('GatherResult', 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, leaf') +# build a new query object, subtracting found mins and downsampling +def _subtract_and_downsample(to_remove, old_query, scaled=None): + mh = old_query.minhash + mh = mh.downsample_scaled(scaled) + mh.remove_many(to_remove) + + return SourmashSignature(mh) + + def _find_best(dblist, query): """ Search for the best containment, return precisely one match. @@ -115,10 +107,6 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance): import numpy as np orig_abunds = orig_mh.get_mins(with_abundance=True) - # construct a new query object for later modification. - # @CTB note this doesn't actually construct a new query object... - query = _build_new_query([], query) - cmp_scaled = query.minhash.scaled # initialize with resolution of query while 1: best_cont, best_match, filename = _find_best(databases, query) @@ -195,7 +183,7 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance): leaf=best_match) # construct a new query, subtracting hashes found in previous one. 
- query = _build_new_query(found_mins, query, cmp_scaled) + query = _subtract_and_downsample(found_mins, query, cmp_scaled) # compute weighted_missed: query_mins -= set(found_mins) From e82cffc76e2fb31324692250bcf9583f71f377c2 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2019 07:18:35 -0700 Subject: [PATCH 23/37] rely on 'Index.gather' returning actual matches --- sourmash/search.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sourmash/search.py b/sourmash/search.py index d9408e2b7..54fba18f0 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -74,16 +74,17 @@ def _find_best(dblist, query): # search across all databases for (obj, filename, filetype) in dblist: for cont, match, fname in obj.gather(query): - if cont: - # note, break ties based on name, to ensure consistent order. - if (cont == best_cont and match.name() < best_match.name()) or\ - cont > best_cont: - # update best match. - best_cont = cont - best_match = match - - # some objects may not have associated filename (e.g. SBTs) - best_filename = fname or filename + assert cont + + # note, break ties based on name, to ensure consistent order. + if (cont == best_cont and match.name() < best_match.name()) or \ + cont > best_cont: + # update best match. + best_cont = cont + best_match = match + + # some objects may not have associated filename (e.g. SBTs) + best_filename = fname or filename if not best_match: return None, None, None From ef9b900677c0219abd0530f5527da1bc0294f31a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2019 07:53:45 -0700 Subject: [PATCH 24/37] remove duplicate SearchResult, clean up & rationalize SearchResult and GatherResult --- sourmash/commands.py | 18 +++++++++--------- sourmash/index.py | 19 +++++-------------- sourmash/lca/lca_utils.py | 22 +++++++--------------- sourmash/sbt.py | 10 +--------- sourmash/search.py | 26 +++++++++++++++++--------- tests/test_index.py | 30 +++++++++++++++--------------- 6 files changed, 54 insertions(+), 71 deletions(-) diff --git a/sourmash/commands.py b/sourmash/commands.py index cd1cdeb5f..ef31532c6 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -570,7 +570,7 @@ def search(args): print_results("---------- -----") for sr in results[:n_matches]: pct = '{:.1f}%'.format(sr.similarity*100) - name = sr.match_sig._display_name(60) + name = sr.match._display_name(60) print_results('{:>6} {}', pct, name) if args.best_only: @@ -583,14 +583,14 @@ def search(args): w.writeheader() for sr in results: d = dict(sr._asdict()) - del d['match_sig'] + del d['match'] w.writerow(d) # save matching signatures upon request if args.save_matches: outname = args.save_matches.name notify('saving all matched signatures to "{}"', outname) - sig.save_signatures([ sr.match_sig for sr in results ], + sig.save_signatures([ sr.match for sr in results ], args.save_matches) @@ -758,7 +758,7 @@ def gather(args): pct_query = '{:.1f}%'.format(result.f_unique_weighted*100) pct_genome = '{:.1f}%'.format(result.f_match*100) average_abund ='{:.1f}'.format(result.average_abund) - name = result.leaf._display_name(40) + name = result.match._display_name(40) if query.minhash.track_abundance and not args.ignore_abundance: print_results('{:9} {:>7} {:>7} {:>9} {}', @@ -786,13 +786,13 @@ def gather(args): w.writeheader() for result in found: d = dict(result._asdict()) - del d['leaf'] # actual signature not in CSV. + del d['match'] # actual signature not in CSV. 
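            # Each `result` written here is a GatherResult produced by
            # gather_databases() in search.py (refactored in the patches above):
            # that loop repeatedly asks _find_best() for the single best
            # containment across all databases, records it, subtracts the matched
            # hashes from the query (downsampling to the coarsest scaled seen so
            # far), and repeats until too little of the query remains.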
w.writerow(d) if found and args.save_matches: outname = args.save_matches.name notify('saving all matches to "{}"', outname) - sig.save_signatures([ r.leaf for r in found ], args.save_matches) + sig.save_signatures([ r.match for r in found ], args.save_matches) if args.output_unassigned: if not len(query.minhash): @@ -906,7 +906,7 @@ def multigather(args): pct_query = '{:.1f}%'.format(result.f_unique_weighted*100) pct_genome = '{:.1f}%'.format(result.f_match*100) average_abund ='{:.1f}'.format(result.average_abund) - name = result.leaf._display_name(40) + name = result.match._display_name(40) if query.minhash.track_abundance and not args.ignore_abundance: print_results('{:9} {:>7} {:>7} {:>9} {}', @@ -941,14 +941,14 @@ def multigather(args): w.writeheader() for result in found: d = dict(result._asdict()) - del d['leaf'] # actual signature not in CSV. + del d['match'] # actual signature not in CSV. w.writerow(d) output_matches = output_base + '.matches.sig' with open(output_matches, 'wt') as fp: outname = output_matches notify('saving all matches to "{}"', outname) - sig.save_signatures([ r.leaf for r in found ], fp) + sig.save_signatures([ r.match for r in found ], fp) output_unassigned = output_base + '.unassigned.sig' with open(output_unassigned, 'wt') as fp: diff --git a/sourmash/index.py b/sourmash/index.py index 47586db8a..8e3147df7 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -3,11 +3,6 @@ from abc import ABCMeta, abstractmethod from collections import namedtuple -# @CTB copied out of search.py to deal with import order issues, #willfix -SearchResult = namedtuple('SearchResult', - 'similarity, match_sig, md5, filename, name') - - # compatible with Python 2 *and* 3: ABC = ABCMeta("ABC", (object,), {"__slots__": ()}) @@ -87,15 +82,11 @@ def search(self, query, *args, **kwargs): for ss in self.signatures: similarity = query_match(ss) if similarity >= threshold: - # @CTB: check duplicates via md5sum - here or ?? - sr = SearchResult(similarity=similarity, - match_sig=ss, - md5=ss.md5sum(), - filename = self.filename, - name=ss.name()) - matches.append(sr) - - matches.sort(key=lambda x: -x.similarity) + # @CTB: check duplicates via md5sum - here or later? + matches.append((similarity, ss, self.filename)) + + # sort! 
+ matches.sort(key=lambda x: -x[0]) return matches def gather(self, query, *args, **kwargs): diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 73a78a8af..60d6fc897 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -24,9 +24,6 @@ # type to store an element in a taxonomic lineage LineagePair = namedtuple('LineagePair', ['rank', 'name']) -# @CTB copied out of search.py to deal with import order issues, #willfix -SearchResult = namedtuple('SearchResult', - 'similarity, match_sig, md5, filename, name') def check_files_exist(*files): ret = True @@ -275,24 +272,19 @@ def search(self, query, *args, **kwargs): results = [] for x in self.find(query.minhash, threshold, do_containment): - (score, match_sig, md5, filename, name) = x - sr = SearchResult(similarity=score, - match_sig=match_sig, - md5=md5, - filename=filename, - name=name) - results.append(sr) - - results.sort(key=lambda x: -x.similarity) + (score, match, filename) = x + results.append((score, match, filename)) + + results.sort(key=lambda x: -x[0]) return results def gather(self, query, *args, **kwargs): results = [] for x in self.find(query.minhash, 0.0, containment=True, ignore_scaled=True): - (score, match_sig, md5, filename, name) = x + (score, match, filename) = x if score: - results.append((score, match_sig, filename)) + results.append((score, match, filename)) return results @@ -396,7 +388,7 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): from .. import SourmashSignature match_sig = SourmashSignature(match_mh, name=name) - yield score, match_sig, match_sig.md5sum(), self.filename, name + yield score, match_sig, self.filename def load_single_database(filename, verbose=False): diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 127870424..648b769a7 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -77,9 +77,6 @@ def search_transcript(node, seq, threshold): } NodePos = namedtuple("NodePos", ["pos", "node"]) -# @CTB copied out of search.py to deal with import order issues, #willfix -SearchResult = namedtuple('SearchResult', - 'similarity, match_sig, md5, filename, name') class GraphFactory(object): """Build new nodegraphs (Bloom filters) of a specific (fixed) size. @@ -298,12 +295,7 @@ def search(self, query, *args, **kwargs): # tree search should always/only return matches above threshold assert similarity >= threshold - sr = SearchResult(similarity=similarity, - match_sig=leaf.data, - md5=leaf.data.md5sum(), - name=leaf.data.name(), - filename=None) - results.append(sr) + results.append((similarity, leaf.data, None)) return results diff --git a/sourmash/search.py b/sourmash/search.py index 54fba18f0..7694bdfdf 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -9,7 +9,7 @@ # generic SearchResult. 
SearchResult = namedtuple('SearchResult', - 'similarity, match_sig, md5, filename, name') + 'similarity, match, md5, filename, name') def format_bp(bp): @@ -35,22 +35,30 @@ def search_databases(query, databases, threshold, do_containment, best_only, do_containment=do_containment, ignore_abundance=ignore_abundance, best_only=best_only) - for sr in search_iter: - if sr.md5 not in found_md5: - results.append(sr) - found_md5.add(sr.md5) + for (similarity, match, filename) in search_iter: + md5 = match.md5sum() + if md5 not in found_md5: + results.append((similarity, match, filename)) + found_md5.add(md5) # sort results on similarity (reverse) - results.sort(key=lambda x: -x.similarity) + results.sort(key=lambda x: -x[0]) - return results + x = [] + for (similarity, match, filename) in results: + x.append(SearchResult(similarity=similarity, + match=match, + md5=match.md5sum(), + filename=filename, + name=match.name())) + return x ### ### gather code ### GatherResult = namedtuple('GatherResult', - 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, leaf') + 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, match') # build a new query object, subtracting found mins and downsampling @@ -181,7 +189,7 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance): filename=filename, md5=best_match.md5sum(), name=best_match.name(), - leaf=best_match) + match=best_match) # construct a new query, subtracting hashes found in previous one. query = _subtract_and_downsample(found_mins, query, cmp_scaled) diff --git a/tests/test_index.py b/tests/test_index.py index e232770ad..2f5bad4e1 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -81,32 +81,32 @@ def test_linear_index_search(): # now, search for sig2 sr = lidx.search(ss2, threshold=1.0) - print([s.name for s in sr]) + print([s[1].name() for s in sr]) assert len(sr) == 1 - assert sr[0].match_sig == ss2 + assert sr[0][1] == ss2 # search for sig47 with lower threshold; search order not guaranteed. sr = lidx.search(ss47, threshold=0.1) - print([s.name for s in sr]) + print([s[1].name() for s in sr]) assert len(sr) == 2 - sr.sort(key=lambda x: -x.similarity) - assert sr[0].match_sig == ss47 - assert sr[1].match_sig == ss63 + sr.sort(key=lambda x: -x[0]) + assert sr[0][1] == ss47 + assert sr[1][1] == ss63 # search for sig63 with lower threshold; search order not guaranteed. sr = lidx.search(ss63, threshold=0.1) - print([s.name for s in sr]) + print([s[1].name() for s in sr]) assert len(sr) == 2 - sr.sort(key=lambda x: -x.similarity) - assert sr[0].match_sig == ss63 - assert sr[1].match_sig == ss47 + sr.sort(key=lambda x: -x[0]) + assert sr[0][1] == ss63 + assert sr[1][1] == ss47 # search for sig63 with high threshold => 1 match sr = lidx.search(ss63, threshold=0.8) - print([s.name for s in sr]) + print([s[1].name for s in sr]) assert len(sr) == 1 - sr.sort(key=lambda x: -x.similarity) - assert sr[0].match_sig == ss63 + sr.sort(key=lambda x: -x[0]) + assert sr[0][1] == ss63 def test_linear_index_gather(): @@ -212,6 +212,6 @@ def test_linear_index_save_load(): # now, search for sig2 sr = linear2.search(ss2, threshold=1.0) - print([s.name for s in sr]) + print([s[1].name() for s in sr]) assert len(sr) == 1 - assert sr[0].match_sig == ss2 + assert sr[0][1] == ss2 From df4b91145fd201c33baba971e60166b3b4f21fcf Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sun, 8 Sep 2019 08:34:56 -0700 Subject: [PATCH 25/37] display full order of sigs in failed tests --- tests/test_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_index.py b/tests/test_index.py index 2f5bad4e1..a4e371911 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -165,7 +165,7 @@ def test_linear_index_save(): print(si) print(x) - assert si == x + assert si == x, si def test_linear_index_load(): @@ -187,7 +187,7 @@ def test_linear_index_load(): linear = LinearIndex.load(filename) x = [ ss2, ss47, ss63 ] - assert linear.signatures == x + assert linear.signatures == x, linear.signatures assert linear.filename == filename From 95ddb7fcfa68e6f37231aa96909369c7e2c9e336 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Wed, 23 Oct 2019 23:06:21 +0000 Subject: [PATCH 26/37] fix heisenbug in tests --- tests/test_index.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_index.py b/tests/test_index.py index a4e371911..bc596b0f3 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -155,9 +155,9 @@ def test_linear_index_save(): linear.save(filename) from sourmash import load_signatures - si = list(load_signatures(filename)) + si = set(load_signatures(filename)) - x = [ ss2, ss47, ss63 ] + x = {ss2, ss47, ss63} print(len(si)) print(len(x)) @@ -186,8 +186,8 @@ def test_linear_index_load(): linear = LinearIndex.load(filename) - x = [ ss2, ss47, ss63 ] - assert linear.signatures == x, linear.signatures + x = {ss2, ss47, ss63} + assert set(linear.signatures) == x, linear.signatures assert linear.filename == filename From 5984d3de0c5bffdb2694e9b8a8ac7af07fae472a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 12 Dec 2019 15:55:38 -0800 Subject: [PATCH 27/37] add signatures() iterator to Index objects --- sourmash/index.py | 27 +++++++++++++++------------ sourmash/lca/lca_utils.py | 11 +++++++---- sourmash/sbt.py | 3 +++ tests/test_index.py | 2 +- 4 files changed, 26 insertions(+), 17 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 8e3147df7..965bd199c 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -9,8 +9,8 @@ class Index(ABC): @abstractmethod - def find(self, search_fn, *args, **kwargs): - """ """ + def signatures(self): + "Return an iterator over all signatures in the Index object." @abstractmethod def search(self, signature, *args, **kwargs): @@ -35,22 +35,25 @@ def load(cls, location, leaf_loader=None, storage=None, print_version_warning=Tr class LinearIndex(Index): - def __init__(self, signatures=None, filename=None): - self.signatures = [] - if signatures: - self.signatures = list(signatures) + def __init__(self, _signatures=None, filename=None): + self._signatures = [] + if _signatures: + self._signatures = list(_signatures) self.filename = filename + def signatures(self): + return iter(self._signatures) + def __len__(self): - return len(self.signatures) + return len(self._signatures) def insert(self, node): - self.signatures.append(node) + self._signatures.append(node) def find(self, search_fn, *args, **kwargs): matches = [] - for node in self.signatures: + for node in self.signatures(): if search_fn(node, *args): matches.append(node) return matches @@ -79,7 +82,7 @@ def search(self, query, *args, **kwargs): # do the actual search: matches = [] - for ss in self.signatures: + for ss in self.signatures(): similarity = query_match(ss) if similarity >= threshold: # @CTB: check duplicates via md5sum - here or later? 
@@ -92,7 +95,7 @@ def search(self, query, *args, **kwargs): def gather(self, query, *args, **kwargs): "Return the best containment in the list." results = [] - for ss in self.signatures: + for ss in self.signatures(): cont = query.minhash.containment_ignore_maxhash(ss.minhash) if cont: results.append((cont, ss, self.filename)) @@ -104,7 +107,7 @@ def gather(self, query, *args, **kwargs): def save(self, path): from .signature import save_signatures with open(path, 'wt') as fp: - save_signatures(self.signatures, fp) + save_signatures(self.signatures(), fp) @classmethod def load(cls, location): diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 60d6fc897..0ea5ace50 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -164,6 +164,9 @@ def __init__(self): def __repr__(self): return "LCA_Database('{}')".format(self.filename) + def signatures(self): + raise NotImplementedError + def load(self, db_name): "Load from a JSON file." xopen = open @@ -334,7 +337,7 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): elif self.scaled < minhash.scaled and not ignore_scaled: raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) - if not hasattr(self, 'signatures'): + if not hasattr(self, '_signatures'): debug('creating signatures for LCA DB...') sigd = defaultdict(minhash.copy_and_clear) @@ -342,9 +345,9 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): for vv in v: sigd[vv].add_hash(k) - self.signatures = sigd + self._signatures = sigd - debug('=> {} signatures!', len(self.signatures)) + debug('=> {} signatures!', len(self._signatures)) # build idx_to_ident from ident_to_idx if not hasattr(self, 'idx_to_ident'): @@ -370,7 +373,7 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): name = self.ident_to_name[ident] debug('looking at {} ({})', ident, name) - match_mh = self.signatures[idx] + match_mh = self._signatures[idx] match_size = len(match_mh) debug('count: {}; query_mins: {}; match size: {}', diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 648b769a7..743f6e25e 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -134,6 +134,9 @@ def __init__(self, factory, d=2, storage=None): self.next_node = 0 self.storage = storage + def signatures(self): + return leaves() + def new_node_pos(self, node): if not self._nodes: self.next_node = 1 diff --git a/tests/test_index.py b/tests/test_index.py index bc596b0f3..923239d74 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -187,7 +187,7 @@ def test_linear_index_load(): linear = LinearIndex.load(filename) x = {ss2, ss47, ss63} - assert set(linear.signatures) == x, linear.signatures + assert set(linear.signatures()) == x, linear.signatures assert linear.filename == filename From 931737ef3f9ba796f8245682cca42260928f863b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 12 Dec 2019 16:17:04 -0800 Subject: [PATCH 28/37] move search, gather functions into base Index class --- sourmash/index.py | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 965bd199c..667cc2ab7 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -12,14 +12,6 @@ class Index(ABC): def signatures(self): "Return an iterator over all signatures in the Index object." 
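# Hypothetical usage sketch (names mirror tests/test_index.py; ss2/ss47/ss63 are
# signatures loaded there): with search() and gather() implemented once on the
# base class, a subclass such as LinearIndex only has to supply signatures() and
# insert(), and callers get back plain (score, match, filename) tuples.
lidx = LinearIndex()
for sig in (ss2, ss47, ss63):
    lidx.insert(sig)

for similarity, match, filename in lidx.search(ss2, threshold=0.1):
    print(similarity, match.name())

best_containment, best_match, _ = lidx.gather(ss2)[0]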
- @abstractmethod - def search(self, signature, *args, **kwargs): - """ """ - - @abstractmethod - def gather(self, signature, *args, **kwargs): - """ """ - @abstractmethod def insert(self, node): """ """ @@ -33,23 +25,6 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True): """ """ - -class LinearIndex(Index): - def __init__(self, _signatures=None, filename=None): - self._signatures = [] - if _signatures: - self._signatures = list(_signatures) - self.filename = filename - - def signatures(self): - return iter(self._signatures) - - def __len__(self): - return len(self._signatures) - - def insert(self, node): - self._signatures.append(node) - def find(self, search_fn, *args, **kwargs): matches = [] @@ -100,10 +75,27 @@ def gather(self, query, *args, **kwargs): if cont: results.append((cont, ss, self.filename)) - results.sort(reverse=True) # CTB: sort on ss.name() too? + results.sort(reverse=True, key=lambda x: (x[0], x[1].name())) return results + +class LinearIndex(Index): + def __init__(self, _signatures=None, filename=None): + self._signatures = [] + if _signatures: + self._signatures = list(_signatures) + self.filename = filename + + def signatures(self): + return iter(self._signatures) + + def __len__(self): + return len(self._signatures) + + def insert(self, node): + self._signatures.append(node) + def save(self, path): from .signature import save_signatures with open(path, 'wt') as fp: From da7c97905cd7c1b1ea82dfb6a468a2678e293e5f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 12 Dec 2019 16:39:12 -0800 Subject: [PATCH 29/37] fix lca search ignore abundance --- sourmash/commands.py | 4 ++++ sourmash/lca/lca_utils.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/sourmash/commands.py b/sourmash/commands.py index c986c5600..e82fb6f3a 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -545,6 +545,10 @@ def search(args): not args.containment, args.traverse_directory) + # forcibly ignore abundances if query has no abundances + if not query.minhash.track_abundance: + args.ignore_abundance = True + if not len(databases): error('Nothing found to search!') sys.exit(-1) diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 0ea5ace50..e4ad78762 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -271,7 +271,9 @@ def search(self, query, *args, **kwargs): raise TypeError("'search' requires 'threshold'") threshold = kwargs['threshold'] do_containment = kwargs.get('do_containment', False) - # @CTB ignore_abundance? + ignore_abundance = kwargs.get('ignore_abundance') + if not ignore_abundance: + raise TypeError("'search' on LCA databases does not use abundance") results = [] for x in self.find(query.minhash, threshold, do_containment): From 3fa8de353bf30a8e37d8ac6dad6239870bcafc24 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 12 Dec 2019 16:48:27 -0800 Subject: [PATCH 30/37] add function doc --- sourmash/index.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 667cc2ab7..3f7666c16 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -34,7 +34,17 @@ def find(self, search_fn, *args, **kwargs): return matches def search(self, query, *args, **kwargs): - """@@ + """Return set of matches with similarity above 'threshold'. + + Results will be sorted by similarity, highest to lowest. 
+ + Optional arguments accepted by all Index subclasses: + * do_containment: default False. If True, use Jaccard containment. + * best_only: default False. If True, allow optimizations that + may. May discard matches better than threshold, but first match + is guaranteed to be best. + * ignore_abundance: default False. If True, and query signature + and database support k-mer abundances, ignore those abundances. Note, the "best only" hint is ignored by LinearIndex. """ @@ -60,7 +70,6 @@ def search(self, query, *args, **kwargs): for ss in self.signatures(): similarity = query_match(ss) if similarity >= threshold: - # @CTB: check duplicates via md5sum - here or later? matches.append((similarity, ss, self.filename)) # sort! @@ -68,7 +77,7 @@ def search(self, query, *args, **kwargs): return matches def gather(self, query, *args, **kwargs): - "Return the best containment in the list." + "Return the match with the best Jaccard containment in the Index." results = [] for ss in self.signatures(): cont = query.minhash.containment_ignore_maxhash(ss.minhash) From d3dc2fb28bbd1d8446ae9aaee5ee7abb5dfb4d4c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 13 Dec 2019 06:33:41 -0800 Subject: [PATCH 31/37] add signatures() method to both LCA and SBT indices --- sourmash/lca/lca_utils.py | 31 ++++++++++++++------- sourmash/sbt.py | 3 ++- tests/test_lca.py | 8 ++++++ tests/test_sbt.py | 57 +++++++++++++++++++++++++-------------- 4 files changed, 68 insertions(+), 31 deletions(-) diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index e4ad78762..7167f18c9 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -165,7 +165,10 @@ def __repr__(self): return "LCA_Database('{}')".format(self.filename) def signatures(self): - raise NotImplementedError + from .. import SourmashSignature + self._create_signatures() + for v in self._signatures.values(): + yield SourmashSignature(v) def load(self, db_name): "Load from a JSON file." @@ -329,17 +332,13 @@ def get_lineage_assignments(self, hashval): return x - def find(self, minhash, threshold, containment=False, ignore_scaled=False): - """ - Do a Jaccard similarity or containment search. - """ - # make sure we're looking at the same scaled value as database - if self.scaled > minhash.scaled: - minhash = minhash.downsample_scaled(self.scaled) - elif self.scaled < minhash.scaled and not ignore_scaled: - raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) + def _create_signatures(self): + "Create a _signatures member dictionary that contains {idx: minhash}." + from .. import MinHash if not hasattr(self, '_signatures'): + minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled) + debug('creating signatures for LCA DB...') sigd = defaultdict(minhash.copy_and_clear) @@ -351,6 +350,18 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): debug('=> {} signatures!', len(self._signatures)) + def find(self, minhash, threshold, containment=False, ignore_scaled=False): + """ + Do a Jaccard similarity or containment search. 
+ """ + # make sure we're looking at the same scaled value as database + if self.scaled > minhash.scaled: + minhash = minhash.downsample_scaled(self.scaled) + elif self.scaled < minhash.scaled and not ignore_scaled: + raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) + + self._create_signatures() + # build idx_to_ident from ident_to_idx if not hasattr(self, 'idx_to_ident'): idx_to_ident = {} diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 743f6e25e..ea713b720 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -135,7 +135,8 @@ def __init__(self, factory, d=2, storage=None): self.storage = storage def signatures(self): - return leaves() + for k in self.leaves(): + yield k.data def new_node_pos(self, node): if not self._nodes: diff --git a/tests/test_lca.py b/tests/test_lca.py index fb63d63ad..820269dc8 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -133,6 +133,14 @@ def test_db_repr(): assert repr(db) == "LCA_Database('{}')".format(filename) +def test_lca_index_signatures_method(): + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + siglist = list(db.signatures()) + assert len(siglist) == 2 + + ## command line tests diff --git a/tests/test_sbt.py b/tests/test_sbt.py index bef6e7c6e..2c694d7a9 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -4,12 +4,12 @@ import pytest -from sourmash import signature +from sourmash import load_signatures, load_one_signature from sourmash.sbt import SBT, GraphFactory, Leaf, Node from sourmash.sbtmh import (SigLeaf, search_minhashes, - search_minhashes_containment) + search_minhashes_containment) from sourmash.sbt_storage import (FSStorage, TarStorage, - RedisStorage, IPFSStorage) + RedisStorage, IPFSStorage) from . 
import sourmash_tst_utils as utils @@ -138,7 +138,7 @@ def test_tree_v1_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = next(load_signatures(testdata1)) results_v1 = {str(s) for s in tree_v1.find(search_minhashes_containment, to_search, 0.1)} @@ -157,7 +157,7 @@ def test_tree_v2_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = next(load_signatures(testdata1)) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -176,7 +176,7 @@ def test_tree_v3_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = next(load_signatures(testdata1)) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -195,7 +195,7 @@ def test_tree_v5_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = next(load_signatures(testdata1)) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -211,7 +211,7 @@ def test_tree_save_load(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) to_search = leaf @@ -241,7 +241,7 @@ def test_tree_save_load_v5(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -272,7 +272,7 @@ def test_search_minhashes(): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) @@ -295,7 +295,7 @@ def test_binary_nary_tree(): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) for tree in trees.values(): tree.insert(leaf) @@ -323,7 +323,7 @@ def test_sbt_combine(n_children): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) if n_leaves < 4: @@ -341,7 +341,7 @@ def test_sbt_combine(n_children): assert len(t_leaves) == len(t1_leaves) assert t1_leaves == t_leaves - to_search = next(signature.load_signatures( + to_search = next(load_signatures( utils.get_test_data(utils.SIG_FILES[0]))) t1_result = {str(s) for s in tree_1.find(search_minhashes, to_search, 0.1)} @@ -370,7 +370,7 @@ def test_sbt_fsstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) to_search = leaf @@ -403,7 +403,7 @@ def test_sbt_tarstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = 
next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) to_search = leaf @@ -439,7 +439,7 @@ def test_sbt_ipfsstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) to_search = leaf @@ -477,7 +477,7 @@ def test_sbt_redisstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) to_search = leaf @@ -516,7 +516,7 @@ def test_tree_repair(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = next(load_signatures(testdata1)) results_repair = {str(s) for s in tree_repair.find(search_minhashes, to_search, 0.1)} @@ -532,7 +532,7 @@ def test_tree_repair_insert(): leaf_loader=SigLeaf.load) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree_repair.insert(leaf) @@ -552,7 +552,7 @@ def test_save_sparseness(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) to_search = leaf @@ -586,3 +586,20 @@ def test_save_sparseness(n_children): # Leaf nodes can't have children if isinstance(node, Leaf): assert all(c.node is None for c in tree_loaded.children(pos)) + + +def test_sbt_signatures(): + factory = GraphFactory(31, 1e5, 4) + tree = SBT(factory, d=2) + + sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) + sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + + tree.insert(SigLeaf('47', sig47)) + tree.insert(SigLeaf('63', sig63)) + + xx = list(tree.signatures()) + assert len(xx) == 2 + + assert sig47 in xx + assert sig63 in xx From 83ad1b9ea8e100a6ba5b29d8dc75033eaab910dc Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 13 Dec 2019 16:16:27 -0800 Subject: [PATCH 32/37] Update tests/test_sbt.py Co-Authored-By: Luiz Irber --- tests/test_sbt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 2c694d7a9..8bd059c0e 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -138,7 +138,7 @@ def test_tree_v1_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v1 = {str(s) for s in tree_v1.find(search_minhashes_containment, to_search, 0.1)} From c80ef462b23fed5e8e9471bce63fb793d922439e Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 14 Dec 2019 07:10:37 -0800 Subject: [PATCH 33/37] SBT.insert now matches Index.insert, while SBT.add_node does what insert used to --- sourmash/commands.py | 3 +-- sourmash/sbt.py | 13 +++++++---- tests/test_index.py | 10 ++++---- tests/test_sbt.py | 55 +++++++++++++++++++++----------------------- 4 files changed, 40 insertions(+), 41 deletions(-) diff --git a/sourmash/commands.py b/sourmash/commands.py index e82fb6f3a..ac81f816b 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -450,8 +450,7 @@ def index(args): ss.minhash = ss.minhash.downsample_scaled(args.scaled) scaleds.add(ss.minhash.scaled) - leaf = SigLeaf(ss.md5sum(), ss) - tree.insert(leaf) + tree.insert(ss) n += 1 if not ss: diff --git a/sourmash/sbt.py b/sourmash/sbt.py index ea713b720..ddc2f617b 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -165,7 +165,14 @@ def new_node_pos(self, node): return self.next_node - def insert(self, node): + def insert(self, signature): + "Add a new SourmashSignature in to the SBT." + from .sbtmh import SigLeaf + + leaf = SigLeaf(signature.name(), signature) + self.add_node(leaf) + + def add_node(self, node): pos = self.new_node_pos(node) if pos == 0: # empty tree; initialize w/node. @@ -213,10 +220,6 @@ def insert(self, node): node.update(self._nodes[p.pos]) p = self.parent(p.pos) - @deprecated(details="Use the insert method instead") - def add_node(self, node): - self.insert(node) - def find(self, search_fn, *args, **kwargs): "Search the tree using `search_fn`." diff --git a/tests/test_index.py b/tests/test_index.py index 923239d74..cfcc5c976 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -36,11 +36,11 @@ def test_simple_index(n_children): leaf5.data.count("AAAAT") leaf5.data.count("GAAAA") - root.insert(leaf1) - root.insert(leaf2) - root.insert(leaf3) - root.insert(leaf4) - root.insert(leaf5) + root.add_node(leaf1) + root.add_node(leaf2) + root.add_node(leaf3) + root.add_node(leaf4) + root.add_node(leaf5) def search_kmer(obj, seq): return obj.data.get(seq) diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 2c694d7a9..7023f938d 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -43,11 +43,11 @@ def test_simple(n_children): leaf5.data.count('AAAAT') leaf5.data.count('GAAAA') - root.insert(leaf1) - root.insert(leaf2) - root.insert(leaf3) - root.insert(leaf4) - root.insert(leaf5) + root.add_node(leaf1) + root.add_node(leaf2) + root.add_node(leaf3) + root.add_node(leaf4) + root.add_node(leaf5) def search_kmer(obj, seq): return obj.data.get(seq) @@ -104,11 +104,11 @@ def test_longer_search(n_children): leaf5.data.count('AAAAT') leaf5.data.count('GAAAA') - root.insert(leaf1) - root.insert(leaf2) - root.insert(leaf3) - root.insert(leaf4) - root.insert(leaf5) + root.add_node(leaf1) + root.add_node(leaf2) + root.add_node(leaf3) + root.add_node(leaf4) + root.add_node(leaf5) def kmers(k, seq): for start in range(len(seq) - k + 1): @@ -212,8 +212,8 @@ def test_tree_save_load(n_children): for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) - leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + leaf = SigLeaf(f, sig) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -273,8 +273,7 @@ def test_search_minhashes(): n_leaves = 0 for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) - leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.insert(sig) to_search = next(iter(tree.leaves())) @@ -298,7 +297,7 @@ def test_binary_nary_tree(): sig = 
next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) for tree in trees.values(): - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf n_leaves += 1 @@ -324,12 +323,11 @@ def test_sbt_combine(n_children): n_leaves = 0 for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) - leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.insert(sig) if n_leaves < 4: - tree_1.insert(leaf) + tree_1.insert(sig) else: - tree_2.insert(leaf) + tree_2.insert(sig) n_leaves += 1 tree_1.combine(tree_2) @@ -360,7 +358,7 @@ def test_sbt_combine(n_children): if not next_empty: next_empty = n + 1 - tree_1.insert(leaf) + tree_1.add_node(SigLeaf(to_search.name(), to_search)) assert tree_1.next_node == next_empty @@ -372,7 +370,7 @@ def test_sbt_fsstorage(): for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -405,7 +403,7 @@ def test_sbt_tarstorage(): for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -441,7 +439,7 @@ def test_sbt_ipfsstorage(): for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -479,7 +477,7 @@ def test_sbt_redisstorage(): for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -533,8 +531,7 @@ def test_tree_repair_insert(): for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) - leaf = SigLeaf(os.path.basename(f), sig) - tree_repair.insert(leaf) + tree_repair.insert(sig) for pos, node in tree_repair: # Every parent of a node must be an internal node (and not a leaf), @@ -554,7 +551,7 @@ def test_save_sparseness(n_children): for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -595,8 +592,8 @@ def test_sbt_signatures(): sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) - tree.insert(SigLeaf('47', sig47)) - tree.insert(SigLeaf('63', sig63)) + tree.insert(sig47) + tree.insert(sig63) xx = list(tree.signatures()) assert len(xx) == 2 From b0af24d06e1ec23ce6a74bc32bc7ed2364e555e0 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 14 Dec 2019 15:24:06 -0800 Subject: [PATCH 34/37] clean up signature loading --- tests/test_sbt.py | 54 ++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/tests/test_sbt.py b/tests/test_sbt.py index c97ba4954..5854bee87 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -4,7 +4,7 @@ import pytest -from sourmash import load_signatures, load_one_signature +from sourmash import load_one_signature from sourmash.sbt import SBT, GraphFactory, Leaf, Node from sourmash.sbtmh import (SigLeaf, search_minhashes, search_minhashes_containment) @@ -157,7 +157,7 @@ def test_tree_v2_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -176,7 +176,7 @@ def test_tree_v3_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -195,7 +195,7 @@ def test_tree_v5_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -211,8 +211,8 @@ def test_tree_save_load(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) - leaf = SigLeaf(f, sig) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -241,7 +241,7 @@ def test_tree_save_load_v5(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -272,8 +272,9 @@ def test_search_minhashes(): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) - tree.insert(sig) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) + tree.add_node(leaf) to_search = next(iter(tree.leaves())) @@ -294,7 +295,7 @@ def test_binary_nary_tree(): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) for tree in trees.values(): tree.add_node(leaf) @@ -322,12 +323,13 @@ def test_sbt_combine(n_children): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) - tree.insert(sig) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) + tree.add_node(leaf) if n_leaves < 4: - tree_1.insert(sig) + tree_1.add_node(leaf) else: - tree_2.insert(sig) + tree_2.add_node(leaf) n_leaves += 1 tree_1.combine(tree_2) @@ -339,8 +341,7 @@ def test_sbt_combine(n_children): assert len(t_leaves) == len(t1_leaves) assert t1_leaves == t_leaves - to_search = next(load_signatures( - utils.get_test_data(utils.SIG_FILES[0]))) + to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0])) t1_result = {str(s) for s in tree_1.find(search_minhashes, to_search, 0.1)} 
tree_result = {str(s) for s in tree.find(search_minhashes, @@ -368,7 +369,8 @@ def test_sbt_fsstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -401,7 +403,8 @@ def test_sbt_tarstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -437,7 +440,8 @@ def test_sbt_ipfsstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -475,7 +479,8 @@ def test_sbt_redisstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -514,7 +519,7 @@ def test_tree_repair(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_repair = {str(s) for s in tree_repair.find(search_minhashes, to_search, 0.1)} @@ -530,8 +535,9 @@ def test_tree_repair_insert(): leaf_loader=SigLeaf.load) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) - tree_repair.insert(sig) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) + tree_repair.add_node(leaf) for pos, node in tree_repair: # Every parent of a node must be an internal node (and not a leaf), @@ -549,7 +555,7 @@ def test_save_sparseness(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf From 2e8c3ab48962af820688a7933358087f252b27b1 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 14 Dec 2019 19:38:57 -0800 Subject: [PATCH 35/37] round out Index method tests, sort of :) --- sourmash/index.py | 10 +++++++++- sourmash/lca/lca_utils.py | 15 +++++++++------ tests/test_lca.py | 20 ++++++++++++++++++++ tests/test_sbt.py | 3 ++- 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 3f7666c16..df1f58e89 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -13,7 +13,7 @@ def signatures(self): "Return an iterator over all signatures in the Index object." @abstractmethod - def insert(self, node): + def insert(self, signature): """ """ @abstractmethod @@ -26,6 +26,14 @@ def load(cls, location, leaf_loader=None, storage=None, print_version_warning=Tr """ """ def find(self, search_fn, *args, **kwargs): + """Use search_fn to find matching signatures in the index. + + search_fn(other_sig, *args) should return a boolean that indicates + whether other_sig is a match. + + Returns a list. 
+ """ + matches = [] for node in self.signatures(): diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 7167f18c9..7dceb82c6 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -279,7 +279,7 @@ def search(self, query, *args, **kwargs): raise TypeError("'search' on LCA databases does not use abundance") results = [] - for x in self.find(query.minhash, threshold, do_containment): + for x in self.find_signatures(query.minhash, threshold, do_containment): (score, match, filename) = x results.append((score, match, filename)) @@ -288,8 +288,8 @@ def search(self, query, *args, **kwargs): def gather(self, query, *args, **kwargs): results = [] - for x in self.find(query.minhash, 0.0, - containment=True, ignore_scaled=True): + for x in self.find_signatures(query.minhash, 0.0, + containment=True, ignore_scaled=True): (score, match, filename) = x if score: results.append((score, match, filename)) @@ -297,7 +297,10 @@ def gather(self, query, *args, **kwargs): return results def insert(self, node): - pass + raise NotImplementedError + + def find(self, search_fn, *args, **kwargs): + raise NotImplementedError def downsample_scaled(self, scaled): """ @@ -350,7 +353,8 @@ def _create_signatures(self): debug('=> {} signatures!', len(self._signatures)) - def find(self, minhash, threshold, containment=False, ignore_scaled=False): + def find_signatures(self, minhash, threshold, containment=False, + ignore_scaled=False): """ Do a Jaccard similarity or containment search. """ @@ -400,7 +404,6 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): debug('score: {} (containment? {})', score, containment) if score >= threshold: - # reconstruct signature... ugh. from .. import SourmashSignature match_sig = SourmashSignature(match_mh, name=name) diff --git a/tests/test_lca.py b/tests/test_lca.py index 820269dc8..f5ac6cb8d 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -134,12 +134,32 @@ def test_db_repr(): def test_lca_index_signatures_method(): + # test 'signatures' method from base class Index filename = utils.get_test_data('lca/47+63.lca.json') db, ksize, scaled = lca_utils.load_single_database(filename) siglist = list(db.signatures()) assert len(siglist) == 2 +def test_lca_index_insert_method(): + # test 'signatures' method from base class Index + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + sig = next(iter(db.signatures())) + + with pytest.raises(NotImplementedError) as e: + db.insert(sig) + +def test_lca_index_find_method(): + # test 'signatures' method from base class Index + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + sig = next(iter(db.signatures())) + + with pytest.raises(NotImplementedError) as e: + db.find(None) ## command line tests diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 5854bee87..3f2dfd51c 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -591,7 +591,8 @@ def test_save_sparseness(n_children): assert all(c.node is None for c in tree_loaded.children(pos)) -def test_sbt_signatures(): +def test_sbt_as_index_signatures(): + # test 'signatures' method from Index base class. factory = GraphFactory(31, 1e5, 4) tree = SBT(factory, d=2) From c6c02139c674be505a76358a190984786ad23653 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 14 Dec 2019 21:02:24 -0800 Subject: [PATCH 36/37] [WIP] add signatures() method to both LCA and SBT indices (#796) * add signatures() method to both LCA and SBT indices * Update tests/test_sbt.py Co-Authored-By: Luiz Irber * SBT.insert now matches Index.insert, while SBT.add_node does what insert used to * clean up signature loading * round out Index method tests, sort of :) --- sourmash/commands.py | 3 +- sourmash/index.py | 10 +++- sourmash/lca/lca_utils.py | 44 +++++++++------ sourmash/sbt.py | 16 +++--- tests/test_index.py | 10 ++-- tests/test_lca.py | 28 ++++++++++ tests/test_sbt.py | 109 +++++++++++++++++++++++--------------- 7 files changed, 147 insertions(+), 73 deletions(-) diff --git a/sourmash/commands.py b/sourmash/commands.py index e82fb6f3a..ac81f816b 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -450,8 +450,7 @@ def index(args): ss.minhash = ss.minhash.downsample_scaled(args.scaled) scaleds.add(ss.minhash.scaled) - leaf = SigLeaf(ss.md5sum(), ss) - tree.insert(leaf) + tree.insert(ss) n += 1 if not ss: diff --git a/sourmash/index.py b/sourmash/index.py index 3f7666c16..df1f58e89 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -13,7 +13,7 @@ def signatures(self): "Return an iterator over all signatures in the Index object." @abstractmethod - def insert(self, node): + def insert(self, signature): """ """ @abstractmethod @@ -26,6 +26,14 @@ def load(cls, location, leaf_loader=None, storage=None, print_version_warning=Tr """ """ def find(self, search_fn, *args, **kwargs): + """Use search_fn to find matching signatures in the index. + + search_fn(other_sig, *args) should return a boolean that indicates + whether other_sig is a match. + + Returns a list. + """ + matches = [] for node in self.signatures(): diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index e4ad78762..7dceb82c6 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -165,7 +165,10 @@ def __repr__(self): return "LCA_Database('{}')".format(self.filename) def signatures(self): - raise NotImplementedError + from .. import SourmashSignature + self._create_signatures() + for v in self._signatures.values(): + yield SourmashSignature(v) def load(self, db_name): "Load from a JSON file." @@ -276,7 +279,7 @@ def search(self, query, *args, **kwargs): raise TypeError("'search' on LCA databases does not use abundance") results = [] - for x in self.find(query.minhash, threshold, do_containment): + for x in self.find_signatures(query.minhash, threshold, do_containment): (score, match, filename) = x results.append((score, match, filename)) @@ -285,8 +288,8 @@ def search(self, query, *args, **kwargs): def gather(self, query, *args, **kwargs): results = [] - for x in self.find(query.minhash, 0.0, - containment=True, ignore_scaled=True): + for x in self.find_signatures(query.minhash, 0.0, + containment=True, ignore_scaled=True): (score, match, filename) = x if score: results.append((score, match, filename)) @@ -294,7 +297,10 @@ def gather(self, query, *args, **kwargs): return results def insert(self, node): - pass + raise NotImplementedError + + def find(self, search_fn, *args, **kwargs): + raise NotImplementedError def downsample_scaled(self, scaled): """ @@ -329,17 +335,13 @@ def get_lineage_assignments(self, hashval): return x - def find(self, minhash, threshold, containment=False, ignore_scaled=False): - """ - Do a Jaccard similarity or containment search. 
- """ - # make sure we're looking at the same scaled value as database - if self.scaled > minhash.scaled: - minhash = minhash.downsample_scaled(self.scaled) - elif self.scaled < minhash.scaled and not ignore_scaled: - raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) + def _create_signatures(self): + "Create a _signatures member dictionary that contains {idx: minhash}." + from .. import MinHash if not hasattr(self, '_signatures'): + minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled) + debug('creating signatures for LCA DB...') sigd = defaultdict(minhash.copy_and_clear) @@ -351,6 +353,19 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): debug('=> {} signatures!', len(self._signatures)) + def find_signatures(self, minhash, threshold, containment=False, + ignore_scaled=False): + """ + Do a Jaccard similarity or containment search. + """ + # make sure we're looking at the same scaled value as database + if self.scaled > minhash.scaled: + minhash = minhash.downsample_scaled(self.scaled) + elif self.scaled < minhash.scaled and not ignore_scaled: + raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) + + self._create_signatures() + # build idx_to_ident from ident_to_idx if not hasattr(self, 'idx_to_ident'): idx_to_ident = {} @@ -389,7 +404,6 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): debug('score: {} (containment? {})', score, containment) if score >= threshold: - # reconstruct signature... ugh. from .. import SourmashSignature match_sig = SourmashSignature(match_mh, name=name) diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 743f6e25e..ddc2f617b 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -135,7 +135,8 @@ def __init__(self, factory, d=2, storage=None): self.storage = storage def signatures(self): - return leaves() + for k in self.leaves(): + yield k.data def new_node_pos(self, node): if not self._nodes: @@ -164,7 +165,14 @@ def new_node_pos(self, node): return self.next_node - def insert(self, node): + def insert(self, signature): + "Add a new SourmashSignature in to the SBT." + from .sbtmh import SigLeaf + + leaf = SigLeaf(signature.name(), signature) + self.add_node(leaf) + + def add_node(self, node): pos = self.new_node_pos(node) if pos == 0: # empty tree; initialize w/node. @@ -212,10 +220,6 @@ def insert(self, node): node.update(self._nodes[p.pos]) p = self.parent(p.pos) - @deprecated(details="Use the insert method instead") - def add_node(self, node): - self.insert(node) - def find(self, search_fn, *args, **kwargs): "Search the tree using `search_fn`." 
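For context, a minimal sketch of how the reworked SBT API is meant to be used after this change. This is an illustration only, assuming the sourmash layout in this PR; 'query.sig' is a hypothetical signature file, and both entry points are shown side by side purely to contrast them.

    # Sketch only: mirrors the insert()/add_node() split and the signatures()
    # iterator introduced in the sbt.py hunk above. 'query.sig' is hypothetical.
    from sourmash import load_one_signature
    from sourmash.sbt import SBT, GraphFactory
    from sourmash.sbtmh import SigLeaf, search_minhashes

    factory = GraphFactory(31, 1e5, 4)   # ksize, starting_size, n_tables
    tree = SBT(factory, d=2)

    sig = load_one_signature('query.sig')

    # New Index-style call: insert() takes a SourmashSignature and wraps it
    # in a SigLeaf (named by sig.name()) before delegating to add_node().
    tree.insert(sig)

    # The old leaf-level entry point remains available as add_node()
    # (shown here only for contrast; normally you'd use one or the other).
    tree.add_node(SigLeaf(sig.md5sum(), sig))

    # signatures() now yields the leaf data, so Index-style iteration and
    # the existing tree search both work on plain signatures.
    matches = list(tree.find(search_minhashes, sig, 0.1))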
diff --git a/tests/test_index.py b/tests/test_index.py index 923239d74..cfcc5c976 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -36,11 +36,11 @@ def test_simple_index(n_children): leaf5.data.count("AAAAT") leaf5.data.count("GAAAA") - root.insert(leaf1) - root.insert(leaf2) - root.insert(leaf3) - root.insert(leaf4) - root.insert(leaf5) + root.add_node(leaf1) + root.add_node(leaf2) + root.add_node(leaf3) + root.add_node(leaf4) + root.add_node(leaf5) def search_kmer(obj, seq): return obj.data.get(seq) diff --git a/tests/test_lca.py b/tests/test_lca.py index fb63d63ad..f5ac6cb8d 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -133,6 +133,34 @@ def test_db_repr(): assert repr(db) == "LCA_Database('{}')".format(filename) +def test_lca_index_signatures_method(): + # test 'signatures' method from base class Index + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + siglist = list(db.signatures()) + assert len(siglist) == 2 + +def test_lca_index_insert_method(): + # test 'signatures' method from base class Index + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + sig = next(iter(db.signatures())) + + with pytest.raises(NotImplementedError) as e: + db.insert(sig) + +def test_lca_index_find_method(): + # test 'signatures' method from base class Index + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + sig = next(iter(db.signatures())) + + with pytest.raises(NotImplementedError) as e: + db.find(None) + ## command line tests diff --git a/tests/test_sbt.py b/tests/test_sbt.py index bef6e7c6e..3f2dfd51c 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -4,12 +4,12 @@ import pytest -from sourmash import signature +from sourmash import load_one_signature from sourmash.sbt import SBT, GraphFactory, Leaf, Node from sourmash.sbtmh import (SigLeaf, search_minhashes, - search_minhashes_containment) + search_minhashes_containment) from sourmash.sbt_storage import (FSStorage, TarStorage, - RedisStorage, IPFSStorage) + RedisStorage, IPFSStorage) from . 
import sourmash_tst_utils as utils @@ -43,11 +43,11 @@ def test_simple(n_children): leaf5.data.count('AAAAT') leaf5.data.count('GAAAA') - root.insert(leaf1) - root.insert(leaf2) - root.insert(leaf3) - root.insert(leaf4) - root.insert(leaf5) + root.add_node(leaf1) + root.add_node(leaf2) + root.add_node(leaf3) + root.add_node(leaf4) + root.add_node(leaf5) def search_kmer(obj, seq): return obj.data.get(seq) @@ -104,11 +104,11 @@ def test_longer_search(n_children): leaf5.data.count('AAAAT') leaf5.data.count('GAAAA') - root.insert(leaf1) - root.insert(leaf2) - root.insert(leaf3) - root.insert(leaf4) - root.insert(leaf5) + root.add_node(leaf1) + root.add_node(leaf2) + root.add_node(leaf3) + root.add_node(leaf4) + root.add_node(leaf5) def kmers(k, seq): for start in range(len(seq) - k + 1): @@ -138,7 +138,7 @@ def test_tree_v1_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v1 = {str(s) for s in tree_v1.find(search_minhashes_containment, to_search, 0.1)} @@ -157,7 +157,7 @@ def test_tree_v2_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -176,7 +176,7 @@ def test_tree_v3_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -195,7 +195,7 @@ def test_tree_v5_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -211,9 +211,9 @@ def test_tree_save_load(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -241,7 +241,7 @@ def test_tree_save_load_v5(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -272,9 +272,9 @@ def test_search_minhashes(): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = next(iter(tree.leaves())) @@ -295,10 +295,10 @@ def test_binary_nary_tree(): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) for tree in trees.values(): - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf n_leaves += 1 @@ -323,13 +323,13 @@ def test_sbt_combine(n_children): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = 
load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) if n_leaves < 4: - tree_1.insert(leaf) + tree_1.add_node(leaf) else: - tree_2.insert(leaf) + tree_2.add_node(leaf) n_leaves += 1 tree_1.combine(tree_2) @@ -341,8 +341,7 @@ def test_sbt_combine(n_children): assert len(t_leaves) == len(t1_leaves) assert t1_leaves == t_leaves - to_search = next(signature.load_signatures( - utils.get_test_data(utils.SIG_FILES[0]))) + to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0])) t1_result = {str(s) for s in tree_1.find(search_minhashes, to_search, 0.1)} tree_result = {str(s) for s in tree.find(search_minhashes, @@ -360,7 +359,7 @@ def test_sbt_combine(n_children): if not next_empty: next_empty = n + 1 - tree_1.insert(leaf) + tree_1.add_node(SigLeaf(to_search.name(), to_search)) assert tree_1.next_node == next_empty @@ -370,9 +369,10 @@ def test_sbt_fsstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -403,9 +403,10 @@ def test_sbt_tarstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -439,9 +440,10 @@ def test_sbt_ipfsstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -477,9 +479,10 @@ def test_sbt_redisstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -516,7 +519,7 @@ def test_tree_repair(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_repair = {str(s) for s in tree_repair.find(search_minhashes, to_search, 0.1)} @@ -532,9 +535,9 @@ def test_tree_repair_insert(): leaf_loader=SigLeaf.load) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) - tree_repair.insert(leaf) + tree_repair.add_node(leaf) for pos, node in tree_repair: # Every parent of a node must be an internal node (and not a leaf), @@ -552,9 +555,9 @@ def test_save_sparseness(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -586,3 +589,21 @@ def test_save_sparseness(n_children): # Leaf nodes can't have children if isinstance(node, Leaf): assert all(c.node is None for c in tree_loaded.children(pos)) + + +def test_sbt_as_index_signatures(): + # test 'signatures' method from Index base 
class. + factory = GraphFactory(31, 1e5, 4) + tree = SBT(factory, d=2) + + sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) + sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + + tree.insert(sig47) + tree.insert(sig63) + + xx = list(tree.signatures()) + assert len(xx) == 2 + + assert sig47 in xx + assert sig63 in xx From 689dcf55f8e39cfbef8bbd9fec9c5174536ed4e0 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 14 Dec 2019 21:33:26 -0800 Subject: [PATCH 37/37] add more scaled relationship tests in lca DB --- sourmash/lca/lca_utils.py | 8 ++++++- sourmash/sourmash_args.py | 1 - tests/test_lca.py | 50 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 7dceb82c6..3c5530eac 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -274,7 +274,7 @@ def search(self, query, *args, **kwargs): raise TypeError("'search' requires 'threshold'") threshold = kwargs['threshold'] do_containment = kwargs.get('do_containment', False) - ignore_abundance = kwargs.get('ignore_abundance') + ignore_abundance = kwargs.get('ignore_abundance', True) if not ignore_abundance: raise TypeError("'search' on LCA databases does not use abundance") @@ -306,6 +306,10 @@ def downsample_scaled(self, scaled): """ Downsample to the provided scaled value, i.e. eliminate all hashes that don't fall in the required range. + + NOTE: we probably need to invalidate some of the dynamically + calculated members of this object, like _signatures, when we do this. + But we aren't going to right now. """ if scaled == self.scaled: return @@ -362,7 +366,9 @@ def find_signatures(self, minhash, threshold, containment=False, if self.scaled > minhash.scaled: minhash = minhash.downsample_scaled(self.scaled) elif self.scaled < minhash.scaled and not ignore_scaled: + # note that containment can be calculated w/o matching scaled. 
raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) + pass self._create_signatures() diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py index ad90f9ff2..542559a9a 100644 --- a/sourmash/sourmash_args.py +++ b/sourmash/sourmash_args.py @@ -352,7 +352,6 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, traverse=False): assert query_ksize == lca_db.ksize query_scaled = query.minhash.scaled - assert query_scaled and query_scaled <= lca_db.scaled notify('loaded LCA {}', sbt_or_sigfile, end='\r') n_databases += 1 diff --git a/tests/test_lca.py b/tests/test_lca.py index f5ac6cb8d..0c8c4c376 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -141,6 +141,7 @@ def test_lca_index_signatures_method(): siglist = list(db.signatures()) assert len(siglist) == 2 + def test_lca_index_insert_method(): # test 'signatures' method from base class Index filename = utils.get_test_data('lca/47+63.lca.json') @@ -151,6 +152,7 @@ def test_lca_index_insert_method(): with pytest.raises(NotImplementedError) as e: db.insert(sig) + def test_lca_index_find_method(): # test 'signatures' method from base class Index filename = utils.get_test_data('lca/47+63.lca.json') @@ -161,6 +163,54 @@ def test_lca_index_find_method(): with pytest.raises(NotImplementedError) as e: db.find(None) + +def test_search_db_scaled_gt_sig_scaled(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + + results = db.search(sig, threshold=.01, ignore_abundance=True) + match_sig = results[0][1] + + sig.minhash = sig.minhash.downsample_scaled(10000) + assert sig.minhash == match_sig.minhash + + +def test_search_db_scaled_lt_sig_scaled(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + sig.minhash = sig.minhash.downsample_scaled(100000) + + with pytest.raises(ValueError) as e: + results = db.search(sig, threshold=.01, ignore_abundance=True) + + +def test_gather_db_scaled_gt_sig_scaled(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + + results = db.gather(sig, threshold=.01, ignore_abundance=True) + match_sig = results[0][1] + + sig.minhash = sig.minhash.downsample_scaled(10000) + assert sig.minhash == match_sig.minhash + + +def test_gather_db_scaled_lt_sig_scaled(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + sig.minhash = sig.minhash.downsample_scaled(100000) + + results = db.gather(sig, threshold=.01, ignore_abundance=True) + match_sig = results[0][1] + + match_sig.minhash = match_sig.minhash.downsample_scaled(100000) + assert sig.minhash == match_sig.minhash + + ## command line tests
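For context, a minimal sketch of the scaled relationships these new tests pin down, drawn directly from the tests above. It assumes the sourmash test data ('lca/47+63.lca.json', '47.fa.sig') and that the test helper module sourmash_tst_utils is importable; outside the test suite those are assumptions, not guarantees.

    # Sketch only, mirroring tests/test_lca.py above; paths come from the
    # sourmash test data and sourmash_tst_utils is assumed to be importable.
    import sourmash
    from sourmash.lca import lca_utils
    import sourmash_tst_utils as utils

    db, ksize, scaled = lca_utils.load_single_database(
        utils.get_test_data('lca/47+63.lca.json'))
    sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'))

    # Query scaled below the DB scaled: search() downsamples the query
    # internally (see find_signatures above) and returns matches.
    results = db.search(sig, threshold=0.01, ignore_abundance=True)

    # Query scaled above the DB scaled: search() raises ValueError, while
    # gather() passes ignore_scaled=True, so containment still succeeds.
    sig.minhash = sig.minhash.downsample_scaled(100000)
    results = db.gather(sig, threshold=0.01, ignore_abundance=True)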