From dd1d3511dfbceb801a733a3005a2918d954680a5 Mon Sep 17 00:00:00 2001
From: Luiz Irber
Date: Mon, 1 Oct 2018 16:41:33 -0700
Subject: [PATCH 01/37] Start moving LCA to Index

---
 sourmash/index.py         | 16 ++++++++++++++++
 sourmash/lca/lca_utils.py |  6 +++++-
 sourmash/sbt.py           |  4 ++--
 3 files changed, 23 insertions(+), 3 deletions(-)
 create mode 100644 sourmash/index.py

diff --git a/sourmash/index.py b/sourmash/index.py
new file mode 100644
index 000000000..6449fedaf
--- /dev/null
+++ b/sourmash/index.py
@@ -0,0 +1,16 @@
+from abc import ABC, abstractmethod
+
+class Index(ABC):
+
+    @abstractmethod
+    def find(self, search_fn, *args, **kwargs):
+        ''' '''
+
+    @abstractmethod
+    def save(self, path, storage=None, sparseness=0.0, structure_only=False):
+        ''' '''
+
+    @classmethod
+    @abstractmethod
+    def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True):
+        ''' '''
diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py
index 06b54ffd1..431b8f895 100644
--- a/sourmash/lca/lca_utils.py
+++ b/sourmash/lca/lca_utils.py
@@ -19,6 +19,7 @@

 from .._minhash import get_max_hash_for_scaled
 from ..logging import notify, error, debug
+from ..index import Index

 # type to store an element in a taxonomic lineage
 LineagePair = namedtuple('LineagePair', ['rank', 'name'])
@@ -138,7 +139,7 @@ def find_lca(tree):
     return tuple(lineage), len(node)


-class LCA_Database(object):
+class LCA_Database(Index):
     """
     Wrapper class for taxonomic database.

@@ -261,6 +262,9 @@ def save(self, db_name):

             json.dump(save_d, fp)

+    def find(self, search_fn, *args, **kwargs):
+        pass
+
     def downsample_scaled(self, scaled):
         """
         Downsample to the provided scaled value, i.e. eliminate all hashes
diff --git a/sourmash/sbt.py b/sourmash/sbt.py
index 2a48874a3..b4b5714ac 100644
--- a/sourmash/sbt.py
+++ b/sourmash/sbt.py
@@ -66,7 +66,7 @@ def search_transcript(node, seq, threshold):
 from .sbt_storage import FSStorage, TarStorage, IPFSStorage, RedisStorage
 from .logging import error, notify, debug

-
+from .index import Index

 STORAGES = {
     'TarStorage': TarStorage,
@@ -102,7 +102,7 @@ def init_args(self):
         return (self.ksize, self.starting_size, self.n_tables)


-class SBT(object):
+class SBT(Index):
     """A Sequence Bloom Tree implementation allowing generic internal nodes and leaves.
The default node and leaf format is a Bloom Filter (like the original implementation), From 291beeea107cf9b5b61e77e099f5cf45704f3836 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 12 Oct 2018 19:01:17 -0700 Subject: [PATCH 02/37] rename add_node to insert --- setup.py | 3 ++- sourmash/commands.py | 2 +- sourmash/index.py | 11 ++++++++- sourmash/lca/lca_utils.py | 3 +++ sourmash/sbt.py | 29 +++++++++++++---------- tests/test_sbt.py | 48 +++++++++++++++++++-------------------- 6 files changed, 57 insertions(+), 39 deletions(-) diff --git a/setup.py b/setup.py index 6bde32429..5d32d9af6 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,8 @@ language="c++", extra_compile_args=EXTRA_COMPILE_ARGS, extra_link_args=EXTRA_LINK_ARGS)], - "install_requires": ["screed>=0.9", "ijson<2.5", "khmer>=2.1"], + "install_requires": ["screed>=0.9", "ijson<2.5", "khmer>=2.1", + "deprecation>=2.0.6"], "setup_requires": ['Cython>=0.25.2', "setuptools>=38.6.0", 'setuptools_scm', 'setuptools_scm_git_archive'], "use_scm_version": {"write_to": "sourmash/version.py"}, diff --git a/sourmash/commands.py b/sourmash/commands.py index 1bdeeae6f..cd1cdeb5f 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -451,7 +451,7 @@ def index(args): scaleds.add(ss.minhash.scaled) leaf = SigLeaf(ss.md5sum(), ss) - tree.add_node(leaf) + tree.insert(leaf) n += 1 if not ss: diff --git a/sourmash/index.py b/sourmash/index.py index 6449fedaf..706691612 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -1,4 +1,9 @@ -from abc import ABC, abstractmethod +from abc import ABCMeta, abstractmethod + + +# compatible with Python 2 *and* 3: +ABC = ABCMeta('ABC', (object,), {'__slots__': ()}) + class Index(ABC): @@ -6,6 +11,10 @@ class Index(ABC): def find(self, search_fn, *args, **kwargs): ''' ''' + @abstractmethod + def insert(self, node): + ''' ''' + @abstractmethod def save(self, path, storage=None, sparseness=0.0, structure_only=False): ''' ''' diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 431b8f895..f0d26e8fd 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -265,6 +265,9 @@ def save(self, db_name): def find(self, search_fn, *args, **kwargs): pass + def insert(self, node): + pass + def downsample_scaled(self, scaled): """ Downsample to the provided scaled value, i.e. eliminate all hashes diff --git a/sourmash/sbt.py b/sourmash/sbt.py index b4b5714ac..bef1fcb6d 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -10,7 +10,7 @@ graph1 = factory() # ... add stuff to graph1 ... leaf1 = Leaf("a", graph1) - root.add_node(leaf1) + root.insert(leaf1) For example, :: @@ -26,7 +26,7 @@ graph = factory() graph.consume_fasta(filename) leaf = Leaf(filename, graph) - root.add_node(leaf) + root.insert(leaf) then define a search function, :: @@ -57,6 +57,7 @@ def search_transcript(node, seq, threshold): import sys from tempfile import NamedTemporaryFile +from deprecation import deprecated import khmer try: @@ -160,13 +161,13 @@ def new_node_pos(self, node): return self.next_node - def add_node(self, leaf): - pos = self.new_node_pos(leaf) + def insert(self, node): + pos = self.new_node_pos(node) if pos == 0: # empty tree; initialize w/node. n = Node(self.factory, name="internal." 
+ str(pos)) self._nodes[0] = n - pos = self.new_node_pos(leaf) + pos = self.new_node_pos(node) # Cases: # 1) parent is a Leaf (already covered) @@ -186,28 +187,32 @@ def add_node(self, leaf): c1, c2 = self.children(p.pos)[:2] self._leaves[c1.pos] = p.node - self._leaves[c2.pos] = leaf + self._leaves[c2.pos] = node del self._leaves[p.pos] - for child in (p.node, leaf): + for child in (p.node, node): child.update(n) elif isinstance(p.node, Node): - self._leaves[pos] = leaf - leaf.update(p.node) + self._leaves[pos] = node + node.update(p.node) elif p.node is None: n = Node(self.factory, name="internal." + str(p.pos)) self._nodes[p.pos] = n c1 = self.children(p.pos)[0] - self._leaves[c1.pos] = leaf - leaf.update(n) + self._leaves[c1.pos] = node + node.update(n) # update all parents! p = self.parent(p.pos) while p: self._rebuild_node(p.pos) - leaf.update(self._nodes[p.pos]) + node.update(self._nodes[p.pos]) p = self.parent(p.pos) + @deprecated(details="Use the insert method instead") + def add_node(self, node): + self.insert(node) + def find(self, search_fn, *args, **kwargs): "Search the tree using `search_fn`." diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 69824caab..bef6e7c6e 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -43,11 +43,11 @@ def test_simple(n_children): leaf5.data.count('AAAAT') leaf5.data.count('GAAAA') - root.add_node(leaf1) - root.add_node(leaf2) - root.add_node(leaf3) - root.add_node(leaf4) - root.add_node(leaf5) + root.insert(leaf1) + root.insert(leaf2) + root.insert(leaf3) + root.insert(leaf4) + root.insert(leaf5) def search_kmer(obj, seq): return obj.data.get(seq) @@ -104,11 +104,11 @@ def test_longer_search(n_children): leaf5.data.count('AAAAT') leaf5.data.count('GAAAA') - root.add_node(leaf1) - root.add_node(leaf2) - root.add_node(leaf3) - root.add_node(leaf4) - root.add_node(leaf5) + root.insert(leaf1) + root.insert(leaf2) + root.insert(leaf3) + root.insert(leaf4) + root.insert(leaf5) def kmers(k, seq): for start in range(len(seq) - k + 1): @@ -213,7 +213,7 @@ def test_tree_save_load(n_children): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + tree.insert(leaf) to_search = leaf print('*' * 60) @@ -274,7 +274,7 @@ def test_search_minhashes(): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + tree.insert(leaf) to_search = next(iter(tree.leaves())) @@ -298,7 +298,7 @@ def test_binary_nary_tree(): sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) for tree in trees.values(): - tree.add_node(leaf) + tree.insert(leaf) to_search = leaf n_leaves += 1 @@ -325,11 +325,11 @@ def test_sbt_combine(n_children): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + tree.insert(leaf) if n_leaves < 4: - tree_1.add_node(leaf) + tree_1.insert(leaf) else: - tree_2.add_node(leaf) + tree_2.insert(leaf) n_leaves += 1 tree_1.combine(tree_2) @@ -360,7 +360,7 @@ def test_sbt_combine(n_children): if not next_empty: next_empty = n + 1 - tree_1.add_node(leaf) + tree_1.insert(leaf) assert tree_1.next_node == next_empty @@ -372,7 +372,7 @@ def test_sbt_fsstorage(): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + 
tree.insert(leaf) to_search = leaf print('*' * 60) @@ -405,7 +405,7 @@ def test_sbt_tarstorage(): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + tree.insert(leaf) to_search = leaf print('*' * 60) @@ -441,7 +441,7 @@ def test_sbt_ipfsstorage(): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + tree.insert(leaf) to_search = leaf print('*' * 60) @@ -479,7 +479,7 @@ def test_sbt_redisstorage(): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + tree.insert(leaf) to_search = leaf print('*' * 60) @@ -527,14 +527,14 @@ def test_tree_repair(): assert len(results_repair) == 2 -def test_tree_repair_add_node(): +def test_tree_repair_insert(): tree_repair = SBT.load(utils.get_test_data('leaves.sbt.json'), leaf_loader=SigLeaf.load) for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree_repair.add_node(leaf) + tree_repair.insert(leaf) for pos, node in tree_repair: # Every parent of a node must be an internal node (and not a leaf), @@ -554,7 +554,7 @@ def test_save_sparseness(n_children): for f in utils.SIG_FILES: sig = next(signature.load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.add_node(leaf) + tree.insert(leaf) to_search = leaf print('*' * 60) From a39a74a286ad980778483cd38b5437085cbc64f8 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 12 Oct 2018 19:49:07 -0700 Subject: [PATCH 03/37] simple test --- sourmash/index.py | 34 ++++++++++++++++++++----- tests/test_index.py | 62 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 6 deletions(-) create mode 100644 tests/test_index.py diff --git a/sourmash/index.py b/sourmash/index.py index 706691612..305dafdaf 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -2,24 +2,46 @@ # compatible with Python 2 *and* 3: -ABC = ABCMeta('ABC', (object,), {'__slots__': ()}) +ABC = ABCMeta("ABC", (object,), {"__slots__": ()}) class Index(ABC): - @abstractmethod def find(self, search_fn, *args, **kwargs): - ''' ''' + """ """ @abstractmethod def insert(self, node): - ''' ''' + """ """ @abstractmethod def save(self, path, storage=None, sparseness=0.0, structure_only=False): - ''' ''' + """ """ @classmethod @abstractmethod def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True): - ''' ''' + """ """ + + +class LinearIndex(Index): + def __init__(self): + self.signatures = set() + + def insert(self, node): + self.signatures.add(node) + + def find(self, search_fn, *args, **kwargs): + matches = [] + + for node in self.signatures: + if search_fn(node, *args): + matches.append(node) + return matches + + def save(self, path): + pass + + @classmethod + def load(cls, location): + pass diff --git a/tests/test_index.py b/tests/test_index.py new file mode 100644 index 000000000..8e6e7af39 --- /dev/null +++ b/tests/test_index.py @@ -0,0 +1,62 @@ +from __future__ import print_function, unicode_literals + +from sourmash.index import LinearIndex +from sourmash_lib.sbt import SBT, GraphFactory, Leaf + + +def test_simple_index(n_children): + factory = GraphFactory(5, 100, 3) + root = SBT(factory, d=n_children) + + leaf1 = Leaf("a", factory()) + leaf1.data.count("AAAAA") + leaf1.data.count("AAAAT") + 
leaf1.data.count("AAAAC") + + leaf2 = Leaf("b", factory()) + leaf2.data.count("AAAAA") + leaf2.data.count("AAAAT") + leaf2.data.count("AAAAG") + + leaf3 = Leaf("c", factory()) + leaf3.data.count("AAAAA") + leaf3.data.count("AAAAT") + leaf3.data.count("CAAAA") + + leaf4 = Leaf("d", factory()) + leaf4.data.count("AAAAA") + leaf4.data.count("CAAAA") + leaf4.data.count("GAAAA") + + leaf5 = Leaf("e", factory()) + leaf5.data.count("AAAAA") + leaf5.data.count("AAAAT") + leaf5.data.count("GAAAA") + + root.insert(leaf1) + root.insert(leaf2) + root.insert(leaf3) + root.insert(leaf4) + root.insert(leaf5) + + def search_kmer(obj, seq): + return obj.data.get(seq) + + kmers = ["AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA"] + + linear = LinearIndex() + linear.insert(leaf1) + linear.insert(leaf2) + linear.insert(leaf3) + linear.insert(leaf4) + linear.insert(leaf5) + + for kmer in kmers: + assert set(root.find(search_kmer, kmer)) == set(linear.find(search_kmer, kmer)) + + print("-----") + print([x.metadata for x in root.find(search_kmer, "AAAAA")]) + print([x.metadata for x in root.find(search_kmer, "AAAAT")]) + print([x.metadata for x in root.find(search_kmer, "AAAAG")]) + print([x.metadata for x in root.find(search_kmer, "CAAAA")]) + print([x.metadata for x in root.find(search_kmer, "GAAAA")]) From 2769ae9c2c4fa5d84a50fc12aaa212d3dc36d2c4 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 3 Jan 2019 11:04:05 -0800 Subject: [PATCH 04/37] first pass definition of search and gather - tests pass, at least! --- sourmash/index.py | 19 +++++++++++++++++++ sourmash/lca/lca_utils.py | 6 ++++++ sourmash/sbt.py | 6 ++++++ 3 files changed, 31 insertions(+) diff --git a/sourmash/index.py b/sourmash/index.py index 305dafdaf..3eb08b1fa 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -10,6 +10,14 @@ class Index(ABC): def find(self, search_fn, *args, **kwargs): """ """ + @abstractmethod + def search(self, signature, *args, **kwargs): + """ """ + + @abstractmethod + def gather(self, signature, *args, **kwargs): + """ """ + @abstractmethod def insert(self, node): """ """ @@ -39,6 +47,17 @@ def find(self, search_fn, *args, **kwargs): matches.append(node) return matches + def search(self, signature, *args, **kwargs): + matches = [] + + for node in self.signatures: + if signature.similarity(node): + matches.append(node) + return matches + + def gather(self, signature, *args, **kwargs): + pass + def save(self, path): pass diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index f0d26e8fd..d542ca2c0 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -265,6 +265,12 @@ def save(self, db_name): def find(self, search_fn, *args, **kwargs): pass + def search(self, sig): + pass + + def gather(self, sig): + pass + def insert(self, node): pass diff --git a/sourmash/sbt.py b/sourmash/sbt.py index bef1fcb6d..6d846ca4d 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -256,6 +256,12 @@ def find(self, search_fn, *args, **kwargs): queue.extend(c.pos for c in self.children(node_p)) return matches + def search(self, sig): + pass + + def gather(self, sig): + pass + def _rebuild_node(self, pos=0): """Recursively rebuilds an internal node (if it is not present). From 6e2e504f3dac22ef685b93ca90d228b48ba0244a Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 5 Jan 2019 07:07:35 -0800 Subject: [PATCH 05/37] start adding code for LinearIndex.search --- sourmash/index.py | 35 ++++++++++++++++++++++++++++++++--- tests/test_index.py | 17 +++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 3eb08b1fa..9fb0e107a 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -48,11 +48,40 @@ def find(self, search_fn, *args, **kwargs): return matches def search(self, signature, *args, **kwargs): + """@@ + + Note, the "best only" hint is ignored by LinearIndex. + """ + + # check arguments + if 'threshold' not in kwargs: + raise TypeError("'search' requires 'threshold'") + + do_containment = kwargs.get('do_containment', False) + ignore_abundance = kwargs.get('ignore_abundance', False) + + # configure search - containment? ignore abundance? + if do_containment: + query_match = lambda x: query.contained_by(x, downsample=True) + else: + query_match = lambda x: query.similarity( + x, downsample=True, ignore_abundance=ignore_abundance) + + # do the actual search: matches = [] - for node in self.signatures: - if signature.similarity(node): - matches.append(node) + for ss in self.signatures: + similarity = query_match(ss) + if similarity >= threshold: + # @CTB: check duplicates via md5sum - here or ?? + sr = SearchResult(similarity=similarity, + match_sig=ss, + md5=ss.md5sum(), + filename = None, + name=ss.name()) + matches.append(sr) + + # @CTB sort here or ?? return matches def gather(self, signature, *args, **kwargs): diff --git a/tests/test_index.py b/tests/test_index.py index 8e6e7af39..225b0c8a2 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -1,7 +1,9 @@ from __future__ import print_function, unicode_literals +import sourmash from sourmash.index import LinearIndex from sourmash_lib.sbt import SBT, GraphFactory, Leaf +from . import sourmash_tst_utils as utils def test_simple_index(n_children): @@ -60,3 +62,18 @@ def search_kmer(obj, seq): print([x.metadata for x in root.find(search_kmer, "AAAAG")]) print([x.metadata for x in root.find(search_kmer, "CAAAA")]) print([x.metadata for x in root.find(search_kmer, "GAAAA")]) + + +def test_linear_index_search(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + lidx = LinearIndex() + lidx.insert(ss2) + lidx.insert(ss47) + lidx.insert(ss63) From 412e1012951dd19f8b29921c52e0d229d758a013 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 5 Jan 2019 07:50:00 -0800 Subject: [PATCH 06/37] an initial test of LinearIndex.search --- sourmash/index.py | 10 +++++++++- tests/test_index.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/sourmash/index.py b/sourmash/index.py index 9fb0e107a..44493b1ff 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -1,4 +1,11 @@ +"An Abstract Base Class for collections of signatures." 
+ from abc import ABCMeta, abstractmethod +from collections import namedtuple + +# @CTB copied out of search.py to deal with import order issues, #willfix +SearchResult = namedtuple('SearchResult', + 'similarity, match_sig, md5, filename, name') # compatible with Python 2 *and* 3: @@ -47,7 +54,7 @@ def find(self, search_fn, *args, **kwargs): matches.append(node) return matches - def search(self, signature, *args, **kwargs): + def search(self, query, *args, **kwargs): """@@ Note, the "best only" hint is ignored by LinearIndex. @@ -56,6 +63,7 @@ def search(self, signature, *args, **kwargs): # check arguments if 'threshold' not in kwargs: raise TypeError("'search' requires 'threshold'") + threshold = kwargs['threshold'] do_containment = kwargs.get('do_containment', False) ignore_abundance = kwargs.get('ignore_abundance', False) diff --git a/tests/test_index.py b/tests/test_index.py index 225b0c8a2..7c8cd7e98 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -77,3 +77,32 @@ def test_linear_index_search(): lidx.insert(ss2) lidx.insert(ss47) lidx.insert(ss63) + + # now, search for sig2 + sr = lidx.search(ss2, threshold=1.0) + print([s.name for s in sr]) + assert len(sr) == 1 + assert sr[0].match_sig == ss2 + + # search for sig47 with lower threshold; search order not guaranteed. + sr = lidx.search(ss47, threshold=0.1) + print([s.name for s in sr]) + assert len(sr) == 2 + sr.sort(key=lambda x: -x.similarity) + assert sr[0].match_sig == ss47 + assert sr[1].match_sig == ss63 + + # search for sig63 with lower threshold; search order not guaranteed. + sr = lidx.search(ss63, threshold=0.1) + print([s.name for s in sr]) + assert len(sr) == 2 + sr.sort(key=lambda x: -x.similarity) + assert sr[0].match_sig == ss63 + assert sr[1].match_sig == ss47 + + # search for sig63 with high threshold => 1 match + sr = lidx.search(ss63, threshold=0.8) + print([s.name for s in sr]) + assert len(sr) == 1 + sr.sort(key=lambda x: -x.similarity) + assert sr[0].match_sig == ss63 From 5f197d313c6c95de37619eab6d5f872438c0b637 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 6 Sep 2019 13:44:01 -0400 Subject: [PATCH 07/37] implement save & load for LinearIndex --- sourmash/index.py | 11 +++++++-- tests/test_index.py | 56 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 44493b1ff..1422039db 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -96,8 +96,15 @@ def gather(self, signature, *args, **kwargs): pass def save(self, path): - pass + from .signature import save_signatures + with open(path, 'wt') as fp: + save_signatures(self.signatures, fp) @classmethod def load(cls, location): - pass + from .signature import load_signatures + si = load_signatures(location) + + lidx = LinearIndex() + lidx.signatures.update(si) + return lidx diff --git a/tests/test_index.py b/tests/test_index.py index 7c8cd7e98..08ee27435 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -1,5 +1,6 @@ from __future__ import print_function, unicode_literals +import os import sourmash from sourmash.index import LinearIndex from sourmash_lib.sbt import SBT, GraphFactory, Leaf @@ -106,3 +107,58 @@ def test_linear_index_search(): assert len(sr) == 1 sr.sort(key=lambda x: -x.similarity) assert sr[0].match_sig == ss63 + + +def test_linear_index_save(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + linear = LinearIndex() + linear.insert(ss2) + linear.insert(ss47) + linear.insert(ss63) + + with utils.TempDirectory() as location: + filename = os.path.join(location, 'foo') + linear.save(filename) + + from sourmash import load_signatures + si = set(load_signatures(filename)) + + x = { ss2, ss47, ss63} + + print(len(si)) + print(len(x)) + + assert si == x + + +def test_linear_index_save_load(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + linear = LinearIndex() + linear.insert(ss2) + linear.insert(ss47) + linear.insert(ss63) + + with utils.TempDirectory() as location: + filename = os.path.join(location, 'foo') + linear.save(filename) + linear2 = LinearIndex.load(filename) + + # now, search for sig2 + sr = linear2.search(ss2, threshold=1.0) + print([s.name for s in sr]) + assert len(sr) == 1 + assert sr[0].match_sig == ss2 From 723d3df758425e831bfecc6a6e9d99816cf913a3 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 6 Sep 2019 13:45:54 -0400 Subject: [PATCH 08/37] add test for LinearIndex.load --- tests/test_index.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_index.py b/tests/test_index.py index 08ee27435..88eba0ec5 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -138,6 +138,28 @@ def test_linear_index_save(): assert si == x +def test_linear_index_load(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + with utils.TempDirectory() as location: + from sourmash import save_signatures + + filename = os.path.join(location, 'foo') + with open(filename, 'wt') as fp: + sourmash.save_signatures([ss2, ss47, ss63], fp) + + linear = LinearIndex.load(filename) + + x = { ss2, ss47, ss63} + assert linear.signatures == x + + def test_linear_index_save_load(): sig2 = utils.get_test_data('2.fa.sig') sig47 = utils.get_test_data('47.fa.sig') From 577c9fa29ceecfded68caa8e5e661d9d4d169a10 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 6 Sep 2019 13:56:49 -0400 Subject: [PATCH 09/37] implemented & tested LinearIndex.gather --- sourmash/index.py | 14 ++++++++++++-- tests/test_index.py | 27 +++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 1422039db..002bcfedd 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -92,8 +92,18 @@ def search(self, query, *args, **kwargs): # @CTB sort here or ?? return matches - def gather(self, signature, *args, **kwargs): - pass + def gather(self, query, *args, **kwargs): + # check arguments + threshold = kwargs.get('threshold', 0) + + results = [] + for ss in self.signatures: + cont = query.minhash.containment_ignore_maxhash(ss.minhash) + if cont > threshold: + results.append((cont, ss)) + results.sort(reverse=True) + + return results def save(self, path): from .signature import save_signatures diff --git a/tests/test_index.py b/tests/test_index.py index 88eba0ec5..06e2903b1 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -109,6 +109,33 @@ def test_linear_index_search(): assert sr[0].match_sig == ss63 +def test_linear_index_gather(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + lidx = LinearIndex() + lidx.insert(ss2) + lidx.insert(ss47) + lidx.insert(ss63) + + matches = lidx.gather(ss2) + assert len(matches) == 1 + assert matches[0][0] == 1.0 + assert matches[0][1] == ss2 + + matches = lidx.gather(ss47) + assert len(matches) == 2 + assert matches[0][0] == 1.0 + assert matches[0][1] == ss47 + assert round(matches[1][0], 2) == 0.49 + assert matches[1][1] == ss63 + + def test_linear_index_save(): sig2 = utils.get_test_data('2.fa.sig') sig47 = utils.get_test_data('47.fa.sig') From eb41b93b13ca9b5fe07711440dad2fd9f62ea4b1 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 6 Sep 2019 14:26:04 -0400 Subject: [PATCH 10/37] implement LinearIndex in load_databases and search funtions --- sourmash/index.py | 17 ++++++++++------- sourmash/search.py | 18 +++++++----------- sourmash/sourmash_args.py | 19 ++++++++++--------- tests/test_index.py | 10 +++++++--- tests/test_sourmash.py | 3 ++- 5 files changed, 36 insertions(+), 31 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 002bcfedd..57bce5bdb 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -40,11 +40,15 @@ def load(cls, location, leaf_loader=None, storage=None, print_version_warning=Tr class LinearIndex(Index): - def __init__(self): - self.signatures = set() + def __init__(self, signatures=[], filename=None): + self.signatures = list(signatures) + self.filename = filename + + def __len__(self): + return len(self.signatures) def insert(self, node): - self.signatures.add(node) + self.signatures.append(node) def find(self, search_fn, *args, **kwargs): matches = [] @@ -85,11 +89,11 @@ def search(self, query, *args, **kwargs): sr = SearchResult(similarity=similarity, match_sig=ss, md5=ss.md5sum(), - filename = None, + filename = self.filename, name=ss.name()) matches.append(sr) - # @CTB sort here or ?? + matches.sort(key=lambda x: -x.similarity) return matches def gather(self, query, *args, **kwargs): @@ -115,6 +119,5 @@ def load(cls, location): from .signature import load_signatures si = load_signatures(location) - lidx = LinearIndex() - lidx.signatures.update(si) + lidx = LinearIndex(si, filename=location) return lidx diff --git a/sourmash/search.py b/sourmash/search.py index 210640581..f1fd6fc93 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -88,18 +88,14 @@ def search_databases(query, databases, threshold, do_containment, best_only, results.append(sr) else: # list of signatures - for ss in obj: - similarity = query_match(ss) - if similarity >= threshold and \ - ss.md5sum() not in found_md5: - sr = SearchResult(similarity=similarity, - match_sig=ss, - md5=ss.md5sum(), - filename=filename, - name=ss.name()) - found_md5.add(sr.md5) + linear = obj + search_iter = linear.search(query, threshold=threshold, + do_containment=do_containment, + ignore_abundance=ignore_abundance) + for sr in search_iter: + if sr.md5 not in found_md5: results.append(sr) - + found_md5.add(sr.md5) # sort results on similarity (reverse) results.sort(key=lambda x: -x.similarity) diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py index 33ca36565..09975463c 100644 --- a/sourmash/sourmash_args.py +++ b/sourmash/sourmash_args.py @@ -7,6 +7,7 @@ from . import signature from .logging import notify, error +from .index import LinearIndex from . import signature as sig from .sbt import SBT from .sbtmh import SigLeaf @@ -297,12 +298,12 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, traverse=False): ksize=query_ksize, select_moltype=query_moltype) siglist = filter_compatible_signatures(query, siglist, 1) - siglist = list(siglist) - databases.append((siglist, sbt_or_sigfile, False)) - notify('loaded {} signatures from {}', len(siglist), + linear = LinearIndex(siglist, filename=sigfile) + databases.append((linear, sbt_or_sigfile, False)) + notify('loaded {} signatures from {}', len(linear), sigfile, end='\r') - n_signatures += len(siglist) - except Exception: # ignore errors with traverse + n_signatures += len(linear) + except Exception: # ignore errors with traverse pass # done! 
jump to beginning of main 'for' loop @@ -355,12 +356,12 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, traverse=False): raise ValueError siglist = filter_compatible_signatures(query, siglist, False) - siglist = list(siglist) + linear = LinearIndex(siglist, filename=sbt_or_sigfile) + databases.append((linear, sbt_or_sigfile, 'signature')) - databases.append((siglist, sbt_or_sigfile, 'signature')) - notify('loaded {} signatures from {}', len(siglist), + notify('loaded {} signatures from {}', len(linear), sbt_or_sigfile, end='\r') - n_signatures += len(siglist) + n_signatures += len(linear) except (EnvironmentError, ValueError): error("\nCannot open file '{}'", sbt_or_sigfile) sys.exit(-1) diff --git a/tests/test_index.py b/tests/test_index.py index 06e2903b1..e232770ad 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -155,13 +155,16 @@ def test_linear_index_save(): linear.save(filename) from sourmash import load_signatures - si = set(load_signatures(filename)) + si = list(load_signatures(filename)) - x = { ss2, ss47, ss63} + x = [ ss2, ss47, ss63 ] print(len(si)) print(len(x)) + print(si) + print(x) + assert si == x @@ -183,8 +186,9 @@ def test_linear_index_load(): linear = LinearIndex.load(filename) - x = { ss2, ss47, ss63} + x = [ ss2, ss47, ss63 ] assert linear.signatures == x + assert linear.filename == filename def test_linear_index_save_load(): diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index d7abdf7a5..9363a51b9 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -613,9 +613,10 @@ def test_search_csv(): with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) + print('xxx', row) assert float(row['similarity']) == 0.93 assert row['name'].endswith('short2.fa') - assert row['filename'].endswith('short2.fa.sig') + assert row['filename'].endswith('short2.fa.sig'), row['filename'] assert row['md5'] == '914591cd1130aa915fe0c0c63db8f19d' From 2a1342831d564a6cf5b08a35cc6b3c48128870d5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 6 Sep 2019 14:29:04 -0400 Subject: [PATCH 11/37] implemented LinearIndex for gather, too --- sourmash/index.py | 2 +- sourmash/search.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 57bce5bdb..af20ae12e 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -104,7 +104,7 @@ def gather(self, query, *args, **kwargs): for ss in self.signatures: cont = query.minhash.containment_ignore_maxhash(ss.minhash) if cont > threshold: - results.append((cont, ss)) + results.append((cont, ss, self.filename)) results.sort(reverse=True) return results diff --git a/sourmash/search.py b/sourmash/search.py index f1fd6fc93..a98b20067 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -162,10 +162,10 @@ def find_best(dblist, query, remainder): # search a signature else: - for ss in obj: - similarity = query.minhash.containment_ignore_maxhash(ss.minhash) - if similarity > 0.0: - results.append((similarity, ss, filename)) + linear = obj + gather_iter = linear.gather(query) + for similarity, ss, filename in gather_iter: + results.append((similarity, ss, filename)) if not results: return None, None, None From 0803af3ca042fba341447d5a266f0547cecf63f9 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 6 Sep 2019 14:37:07 -0400 Subject: [PATCH 12/37] implemented search in LCA db --- sourmash/lca/lca_utils.py | 28 +++++++++++++++++++++++----- sourmash/search.py | 15 ++++++--------- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index d542ca2c0..dcba9f0c4 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -24,6 +24,9 @@ # type to store an element in a taxonomic lineage LineagePair = namedtuple('LineagePair', ['rank', 'name']) +# @CTB copied out of search.py to deal with import order issues, #willfix +SearchResult = namedtuple('SearchResult', + 'similarity, match_sig, md5, filename, name') def check_files_exist(*files): ret = True @@ -262,11 +265,26 @@ def save(self, db_name): json.dump(save_d, fp) - def find(self, search_fn, *args, **kwargs): - pass - - def search(self, sig): - pass + def search(self, query, *args, **kwargs): + # check arguments + if 'threshold' not in kwargs: + raise TypeError("'search' requires 'threshold'") + threshold = kwargs['threshold'] + do_containment = kwargs.get('do_containment', False) + # @CTB ignore_abundance? + + results = [] + for x in self.find(query.minhash, threshold, do_containment): + (score, match_sig, md5, filename, name) = x + sr = SearchResult(similarity=score, + match_sig=match_sig, + md5=md5, + filename=filename, + name=name) + results.append(sr) + + results.sort(key=lambda x: -x.similarity) + return results def gather(self, sig): pass diff --git a/sourmash/search.py b/sourmash/search.py index a98b20067..95f5c44d6 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -76,16 +76,13 @@ def search_databases(query, databases, threshold, do_containment, best_only, elif filetype == 'LCA': lca_db = obj - for x in lca_db.find(query.minhash, threshold, do_containment): - (score, match_sig, md5, filename, name) = x - if md5 not in found_md5: - sr = SearchResult(similarity=score, - match_sig=match_sig, - md5=md5, - filename=filename, - name=name) - found_md5.add(sr.md5) + search_iter = lca_db.search(query, threshold=threshold, + do_containment=do_containment, + ignore_abundance=ignore_abundance) + for sr in search_iter: + if sr.md5 not in found_md5: results.append(sr) + found_md5.add(sr.md5) else: # list of signatures linear = obj From 9406e1008d7ca0b06357a0525f8832aab9152139 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 6 Sep 2019 14:40:37 -0400 Subject: [PATCH 13/37] implemented gather on LCA DBs --- sourmash/lca/lca_utils.py | 11 +++++++++-- sourmash/search.py | 8 +++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index dcba9f0c4..1974518fb 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -286,8 +286,15 @@ def search(self, query, *args, **kwargs): results.sort(key=lambda x: -x.similarity) return results - def gather(self, sig): - pass + def gather(self, query, *args, **kwargs): + results = [] + for x in self.find(query.minhash, 0.0, + containment=True, ignore_scaled=True): + (score, match_sig, md5, filename, name) = x + if score > 0.0: + results.append((score, match_sig, filename)) + + return results def insert(self, node): pass diff --git a/sourmash/search.py b/sourmash/search.py index 95f5c44d6..e70b38628 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -151,11 +151,9 @@ def find_best(dblist, query, remainder): # or an LCA database elif filetype == 'LCA': lca_db = obj - for x in lca_db.find(query.minhash, 0.0, - containment=True, ignore_scaled=True): - (score, match_sig, md5, filename, name) = x - if score > 0.0: - results.append((score, match_sig, filename)) + gather_iter = lca_db.gather(query) + for similarity, ss, filename in gather_iter: + results.append((similarity, ss, filename)) # search a signature else: From 913721232c5d8fa455bc6001be7a058f70da5d37 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 6 Sep 2019 14:46:50 -0400 Subject: [PATCH 14/37] implemented gather on SBT --- sourmash/sbt.py | 16 ++++++++++++++-- sourmash/search.py | 9 +++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 6d846ca4d..676242f2f 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -259,8 +259,20 @@ def find(self, search_fn, *args, **kwargs): def search(self, sig): pass - def gather(self, sig): - pass + def gather(self, query, *args, **kwargs): + from .sbtmh import GatherMinHashesFindBestIgnoreMaxHash + threshold = kwargs['threshold'] + + search_fn = GatherMinHashesFindBestIgnoreMaxHash(threshold).search + + results = [] + for leaf in self.find(search_fn, query, threshold): + leaf_e = leaf.data.minhash + similarity = query.minhash.containment_ignore_maxhash(leaf_e) + if similarity > 0.0: + results.append((similarity, leaf.data)) + + return results def _rebuild_node(self, pos=0): """Recursively rebuilds an internal node (if it is not present). diff --git a/sourmash/search.py b/sourmash/search.py index e70b38628..2fa3b4a8f 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -141,13 +141,10 @@ def find_best(dblist, query, remainder): # search a tree if filetype == 'SBT': tree = obj - search_fn = GatherMinHashesFindBestIgnoreMaxHash(best_ctn_sofar).search + gather_iter = tree.gather(query, threshold=best_ctn_sofar) + for similarity, ss in gather_iter: + results.append((similarity, ss, filename)) - for leaf in tree.find(search_fn, query, best_ctn_sofar): - leaf_e = leaf.data.minhash - similarity = query.minhash.containment_ignore_maxhash(leaf_e) - if similarity > 0.0: - results.append((similarity, leaf.data, filename)) # or an LCA database elif filetype == 'LCA': lca_db = obj From 100cbd9f5d091fafae0d1977eb0a22a65b0274b7 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 6 Sep 2019 14:57:39 -0400 Subject: [PATCH 15/37] implemented search on SBTs --- sourmash/sbt.py | 53 ++++++++++++++++++++++++++++++++++++++++++++-- sourmash/search.py | 36 ++++++------------------------- 2 files changed, 58 insertions(+), 31 deletions(-) diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 676242f2f..98302920c 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -77,6 +77,9 @@ def search_transcript(node, seq, threshold): } NodePos = namedtuple("NodePos", ["pos", "node"]) +# @CTB copied out of search.py to deal with import order issues, #willfix +SearchResult = namedtuple('SearchResult', + 'similarity, match_sig, md5, filename, name') class GraphFactory(object): """Build new nodegraphs (Bloom filters) of a specific (fixed) size. @@ -256,8 +259,54 @@ def find(self, search_fn, *args, **kwargs): queue.extend(c.pos for c in self.children(node_p)) return matches - def search(self, sig): - pass + def search(self, query, *args, **kwargs): + from .sbtmh import search_minhashes, search_minhashes_containment + from .sbtmh import SearchMinHashesFindBest + from .signature import SourmashSignature + + threshold = kwargs['threshold'] + ignore_abundance = kwargs['ignore_abundance'] + do_containment = kwargs['do_containment'] + best_only = kwargs['best_only'] + + search_fn = search_minhashes + query_match = lambda x: query.similarity( + x, downsample=True, ignore_abundance=ignore_abundance) + if do_containment: + search_fn = search_minhashes_containment + query_match = lambda x: query.contained_by(x, downsample=True) + + if best_only: # this needs to be reset for each SBT + search_fn = SearchMinHashesFindBest().search + + # figure out scaled value of tree, downsample query if needed. + leaf = next(iter(self.leaves())) + tree_mh = leaf.data.minhash + + tree_query = query + if tree_mh.scaled and query.minhash.scaled and \ + tree_mh.scaled > query.minhash.scaled: + resampled_query_mh = tree_query.minhash + resampled_query_mh = resampled_query_mh.downsample_scaled(tree_mh.scaled) + tree_query = SourmashSignature(resampled_query_mh) + + # now, search! + results = [] + for leaf in self.find(search_fn, tree_query, threshold): + similarity = query_match(leaf.data) + + # tree search should always/only return matches above threshold + assert similarity >= threshold + + sr = SearchResult(similarity=similarity, + match_sig=leaf.data, + md5=leaf.data.md5sum(), + name=leaf.data.name(), + filename=None) + results.append(sr) + + return results + def gather(self, query, *args, **kwargs): from .sbtmh import GatherMinHashesFindBestIgnoreMaxHash diff --git a/sourmash/search.py b/sourmash/search.py index 2fa3b4a8f..182f18045 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -42,37 +42,15 @@ def search_databases(query, databases, threshold, do_containment, best_only, found_md5 = set() for (obj, filename, filetype) in databases: if filetype == 'SBT': - if best_only: # this needs to be reset for each SBT - search_fn = SearchMinHashesFindBest().search - tree = obj - - # figure out scaled value of tree, downsample query if needed. - leaf = next(iter(tree.leaves())) - tree_mh = leaf.data.minhash - - tree_query = query - if tree_mh.scaled and query.minhash.scaled and \ - tree_mh.scaled > query.minhash.scaled: - resampled_query_mh = tree_query.minhash - resampled_query_mh = resampled_query_mh.downsample_scaled(tree_mh.scaled) - tree_query = SourmashSignature(resampled_query_mh) - - # now, search! 
- for leaf in tree.find(search_fn, tree_query, threshold): - similarity = query_match(leaf.data) - - # tree search should always/only return matches above threshold - assert similarity >= threshold - - if leaf.data.md5sum() not in found_md5: - sr = SearchResult(similarity=similarity, - match_sig=leaf.data, - md5=leaf.data.md5sum(), - filename=filename, - name=leaf.data.name()) - found_md5.add(sr.md5) + search_iter = tree.search(query, threshold=threshold, + do_containment=do_containment, + ignore_abundance=ignore_abundance, + best_only=best_only) + for sr in search_iter: + if sr.md5 not in found_md5: results.append(sr) + found_md5.add(sr.md5) elif filetype == 'LCA': lca_db = obj From 980a470e51c056c9a9f8e4b72aaa9be14a9ac2aa Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 6 Sep 2019 16:27:26 -0400 Subject: [PATCH 16/37] removed conditionals in search & gather in favor of Index interface --- sourmash/sbt.py | 2 +- sourmash/search.py | 63 +++++++++++----------------------------------- 2 files changed, 15 insertions(+), 50 deletions(-) diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 98302920c..2578d6f2c 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -319,7 +319,7 @@ def gather(self, query, *args, **kwargs): leaf_e = leaf.data.minhash similarity = query.minhash.containment_ignore_maxhash(leaf_e) if similarity > 0.0: - results.append((similarity, leaf.data)) + results.append((similarity, leaf.data, None)) return results diff --git a/sourmash/search.py b/sourmash/search.py index 182f18045..40e01c977 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -41,36 +41,15 @@ def search_databases(query, databases, threshold, do_containment, best_only, results = [] found_md5 = set() for (obj, filename, filetype) in databases: - if filetype == 'SBT': - tree = obj - search_iter = tree.search(query, threshold=threshold, - do_containment=do_containment, - ignore_abundance=ignore_abundance, - best_only=best_only) - for sr in search_iter: - if sr.md5 not in found_md5: - results.append(sr) - found_md5.add(sr.md5) - - elif filetype == 'LCA': - lca_db = obj - search_iter = lca_db.search(query, threshold=threshold, - do_containment=do_containment, - ignore_abundance=ignore_abundance) - for sr in search_iter: - if sr.md5 not in found_md5: - results.append(sr) - found_md5.add(sr.md5) - - else: # list of signatures - linear = obj - search_iter = linear.search(query, threshold=threshold, - do_containment=do_containment, - ignore_abundance=ignore_abundance) - for sr in search_iter: - if sr.md5 not in found_md5: - results.append(sr) - found_md5.add(sr.md5) + search_iter = obj.search(query, threshold=threshold, + do_containment=do_containment, + ignore_abundance=ignore_abundance, + best_only=best_only) + for sr in search_iter: + if sr.md5 not in found_md5: + results.append(sr) + found_md5.add(sr.md5) + # sort results on similarity (reverse) results.sort(key=lambda x: -x.similarity) @@ -117,25 +96,11 @@ def find_best(dblist, query, remainder): results = [] for (obj, filename, filetype) in dblist: # search a tree - if filetype == 'SBT': - tree = obj - gather_iter = tree.gather(query, threshold=best_ctn_sofar) - for similarity, ss in gather_iter: - results.append((similarity, ss, filename)) - - # or an LCA database - elif filetype == 'LCA': - lca_db = obj - gather_iter = lca_db.gather(query) - for similarity, ss, filename in gather_iter: - results.append((similarity, ss, filename)) - - # search a signature - else: - linear = obj - gather_iter = linear.gather(query) - for similarity, ss, filename 
in gather_iter: - results.append((similarity, ss, filename)) + tree = obj + gather_iter = tree.gather(query, threshold=best_ctn_sofar) + for similarity, ss, filename in gather_iter: + results.append((similarity, ss, filename)) + if not results: return None, None, None From 622ddeea03e99dbf293e0be21a3fbb0d6b577041 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 6 Sep 2019 19:38:14 -0400 Subject: [PATCH 17/37] fix remaining tests for search & gather --- sourmash/search.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sourmash/search.py b/sourmash/search.py index 40e01c977..e6aeaf7ef 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -86,19 +86,23 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance): # define a function to do a 'best' search and get only top match. def find_best(dblist, query, remainder): + # @CTB this is a tree-specific optimization, I think - should fix. # precompute best containment from all of the remainders best_ctn_sofar = 0.0 - for x in remainder: - ctn = query.minhash.containment_ignore_maxhash(x.minhash) - if ctn > best_ctn_sofar: - best_ctn_sofar = ctn +# for x in remainder: +# ctn = query.minhash.containment_ignore_maxhash(x.minhash) +# if ctn > best_ctn_sofar: +# best_ctn_sofar = ctn results = [] for (obj, filename, filetype) in dblist: - # search a tree - tree = obj - gather_iter = tree.gather(query, threshold=best_ctn_sofar) - for similarity, ss, filename in gather_iter: + # search a tree! + gather_iter = obj.gather(query, threshold=best_ctn_sofar) + for similarity, ss, fname in gather_iter: + # @CTB hackity-hack hack, this is because trees don't have + # filenames at the moment. + if fname is None and filename: + fname = filename results.append((similarity, ss, filename)) From dd55de6c0e1fed81aa7f17e2b38460ca335d0907 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 6 Sep 2019 19:40:42 -0400 Subject: [PATCH 18/37] remove some debugging code --- tests/test_sourmash.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 9363a51b9..d7abdf7a5 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -613,10 +613,9 @@ def test_search_csv(): with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) - print('xxx', row) assert float(row['similarity']) == 0.93 assert row['name'].endswith('short2.fa') - assert row['filename'].endswith('short2.fa.sig'), row['filename'] + assert row['filename'].endswith('short2.fa.sig') assert row['md5'] == '914591cd1130aa915fe0c0c63db8f19d' From 6f2e4c23157f8a1e3232bcdb608574d5d64547b6 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2019 05:56:56 -0700 Subject: [PATCH 19/37] fix my errant default parameter ways --- sourmash/index.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index af20ae12e..ef73dbec6 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -40,8 +40,10 @@ def load(cls, location, leaf_loader=None, storage=None, print_version_warning=Tr class LinearIndex(Index): - def __init__(self, signatures=[], filename=None): - self.signatures = list(signatures) + def __init__(self, signatures=None, filename=None): + self.signatures = [] + if signatures: + self.signatures = list(signatures) self.filename = filename def __len__(self): From f5e622b15e24dc3be22016fe7c11fbec2a6ed774 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sun, 8 Sep 2019 06:16:28 -0700 Subject: [PATCH 20/37] cleanup and simplification of gather code --- sourmash/index.py | 5 +---- sourmash/lca/lca_utils.py | 2 +- sourmash/sbt.py | 7 +++---- sourmash/sbtmh.py | 17 ++++++++--------- sourmash/search.py | 23 ++++------------------- 5 files changed, 17 insertions(+), 37 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index ef73dbec6..dea966eec 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -99,13 +99,10 @@ def search(self, query, *args, **kwargs): return matches def gather(self, query, *args, **kwargs): - # check arguments - threshold = kwargs.get('threshold', 0) - results = [] for ss in self.signatures: cont = query.minhash.containment_ignore_maxhash(ss.minhash) - if cont > threshold: + if cont: results.append((cont, ss, self.filename)) results.sort(reverse=True) diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 1974518fb..73a78a8af 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -291,7 +291,7 @@ def gather(self, query, *args, **kwargs): for x in self.find(query.minhash, 0.0, containment=True, ignore_scaled=True): (score, match_sig, md5, filename, name) = x - if score > 0.0: + if score: results.append((score, match_sig, filename)) return results diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 2578d6f2c..10661b031 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -310,12 +310,11 @@ def search(self, query, *args, **kwargs): def gather(self, query, *args, **kwargs): from .sbtmh import GatherMinHashesFindBestIgnoreMaxHash - threshold = kwargs['threshold'] - - search_fn = GatherMinHashesFindBestIgnoreMaxHash(threshold).search + # use a tree search function that keeps track of its best match. + search_fn = GatherMinHashesFindBestIgnoreMaxHash().search results = [] - for leaf in self.find(search_fn, query, threshold): + for leaf in self.find(search_fn, query, 0.0): leaf_e = leaf.data.minhash similarity = query.minhash.containment_ignore_maxhash(leaf_e) if similarity > 0.0: diff --git a/sourmash/sbtmh.py b/sourmash/sbtmh.py index 066b2a952..5f8c20f0b 100644 --- a/sourmash/sbtmh.py +++ b/sourmash/sbtmh.py @@ -204,8 +204,8 @@ def search_minhashes_containment(node, sig, threshold, class GatherMinHashesFindBestIgnoreMaxHash(object): - def __init__(self, initial_best_match=0.0): - self.best_match = initial_best_match + def __init__(self): + self.best_match = 0 def search(self, node, query, threshold, results=None): score = 0 @@ -235,12 +235,11 @@ def search(self, node, query, threshold, results=None): if results is not None: results[node.name] = score - if score >= threshold: - # have we done better than this? if no, truncate searches below. - if score >= self.best_match: - # update best if it's a leaf node... - if isinstance(node, SigLeaf): - self.best_match = score - return 1 + # have we done better than this? if no, truncate searches below. + if score >= self.best_match: + # update best if it's a leaf node... + if isinstance(node, SigLeaf): + self.best_match = score + return 1 return 0 diff --git a/sourmash/search.py b/sourmash/search.py index e6aeaf7ef..08624f396 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -84,20 +84,11 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance): orig_scaled = orig_query.minhash.scaled # define a function to do a 'best' search and get only top match. - def find_best(dblist, query, remainder): - - # @CTB this is a tree-specific optimization, I think - should fix. 
- # precompute best containment from all of the remainders - best_ctn_sofar = 0.0 -# for x in remainder: -# ctn = query.minhash.containment_ignore_maxhash(x.minhash) -# if ctn > best_ctn_sofar: -# best_ctn_sofar = ctn - + def find_best(dblist, query): results = [] for (obj, filename, filetype) in dblist: # search a tree! - gather_iter = obj.gather(query, threshold=best_ctn_sofar) + gather_iter = obj.gather(query) for similarity, ss, fname in gather_iter: # @CTB hackity-hack hack, this is because trees don't have # filenames at the moment. @@ -105,7 +96,6 @@ def find_best(dblist, query, remainder): fname = filename results.append((similarity, ss, filename)) - if not results: return None, None, None @@ -113,19 +103,14 @@ def find_best(dblist, query, remainder): results.sort(key=lambda x: (-x[0], x[1].name())) # reverse sort on similarity, and then on name best_similarity, best_leaf, filename = results[0] - for x in results[1:]: - remainder.add(x[1]) - return best_similarity, best_leaf, filename - # construct a new query that doesn't have the max_hash attribute set. query = build_new_query([], orig_query) cmp_scaled = 0 - remainder = set() while 1: - best_similarity, best_leaf, filename = find_best(databases, query, remainder) + best_similarity, best_leaf, filename = find_best(databases, query) if not best_leaf: # no matches at all! break @@ -136,7 +121,7 @@ def find_best(dblist, query, remainder): # figure out what the resolution of the banding on the subject is if not best_leaf.minhash.max_hash: error('Best hash match in sbt_gather has no max_hash') - error('Please prepare database of sequences with --scaled') + error('Please prepare gather databases with --scaled') sys.exit(-1) match_scaled = best_leaf.minhash.scaled From 088395de379175b1af82a5e4142b4acf092d906c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2019 06:57:46 -0700 Subject: [PATCH 21/37] significant refactor of gather code --- sourmash/index.py | 4 +- sourmash/search.py | 116 +++++++++++++++++++++++++-------------------- 2 files changed, 67 insertions(+), 53 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index dea966eec..47586db8a 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -99,12 +99,14 @@ def search(self, query, *args, **kwargs): return matches def gather(self, query, *args, **kwargs): + "Return the best containment in the list." results = [] for ss in self.signatures: cont = query.minhash.containment_ignore_maxhash(ss.minhash) if cont: results.append((cont, ss, self.filename)) - results.sort(reverse=True) + + results.sort(reverse=True) # CTB: sort on ss.name() too? return results diff --git a/sourmash/search.py b/sourmash/search.py index 08624f396..580261dbb 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -57,8 +57,8 @@ def search_databases(query, databases, threshold, do_containment, best_only, return results -# define a function to build new query object -def build_new_query(to_remove, old_query, scaled=None): +# build a new query object, subtracting found mins and downsampling if needed. 
+def _build_new_query(to_remove, old_query, scaled=None): e = old_query.minhash e.remove_many(to_remove) if scaled: @@ -70,64 +70,74 @@ def build_new_query(to_remove, old_query, scaled=None): 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, leaf') -def gather_databases(query, databases, threshold_bp, ignore_abundance): - orig_query = query - orig_mins = orig_query.minhash.get_hashes() - orig_abunds = { k: 1 for k in orig_mins } +def _find_best(dblist, query): + """ + Search for the best containment, return precisely one match. + """ - # do we pay attention to abundances? - if orig_query.minhash.track_abundance and not ignore_abundance: - import numpy as np - orig_abunds = orig_query.minhash.get_mins(with_abundance=True) + best_cont = 0.0 + best_match = None + best_filename = None - # store the scaled value for the query - orig_scaled = orig_query.minhash.scaled + # search across all databases + for (obj, filename, filetype) in dblist: + for cont, match, fname in obj.gather(query): + if cont: + # note, break ties based on name, to ensure consistent order. + if (cont == best_cont and match.name() < best_match.name()) or\ + cont > best_cont: + # update best match. + best_cont = cont + best_match = match - # define a function to do a 'best' search and get only top match. - def find_best(dblist, query): - results = [] - for (obj, filename, filetype) in dblist: - # search a tree! - gather_iter = obj.gather(query) - for similarity, ss, fname in gather_iter: - # @CTB hackity-hack hack, this is because trees don't have - # filenames at the moment. - if fname is None and filename: - fname = filename - results.append((similarity, ss, filename)) + # some objects may not have associated filename (e.g. SBTs) + best_filename = fname or filename - if not results: - return None, None, None + if not best_match: + return None, None, None - # take the best result - results.sort(key=lambda x: (-x[0], x[1].name())) # reverse sort on similarity, and then on name - best_similarity, best_leaf, filename = results[0] + return best_cont, best_match, best_filename - return best_similarity, best_leaf, filename - # construct a new query that doesn't have the max_hash attribute set. - query = build_new_query([], orig_query) +def gather_databases(query, databases, threshold_bp, ignore_abundance): + """ + Iteratively find the best containment of `query` in all the `databases`, + until we find fewer than `threshold_bp` (estimated) bp in common. + """ + # track original query information for later usage. + track_abundance = query.minhash.track_abundance and not ignore_abundance + orig_mh = query.minhash + orig_mins = orig_mh.get_hashes() + orig_abunds = { k: 1 for k in orig_mins } - cmp_scaled = 0 + # do we pay attention to abundances? + if track_abundance: + import numpy as np + orig_abunds = orig_mh.get_mins(with_abundance=True) + + # construct a new query object for later modification. + # @CTB note this doesn't actually construct a new query object... + query = _build_new_query([], query) + + cmp_scaled = query.minhash.scaled # initialize with resolution of query while 1: - best_similarity, best_leaf, filename = find_best(databases, query) - if not best_leaf: # no matches at all! + best_cont, best_match, filename = _find_best(databases, query) + if not best_match: # no matches at all! 
break # subtract found hashes from search hashes, construct new search query_mins = set(query.minhash.get_hashes()) - found_mins = best_leaf.minhash.get_hashes() + found_mins = best_match.minhash.get_hashes() - # figure out what the resolution of the banding on the subject is - if not best_leaf.minhash.max_hash: - error('Best hash match in sbt_gather has no max_hash') + # Is the best match computed with scaled? Die if not. + match_scaled = best_match.minhash.scaled + if not match_scaled: + error('Best match in gather is not scaled.') error('Please prepare gather databases with --scaled') - sys.exit(-1) - - match_scaled = best_leaf.minhash.scaled + raise Exception # pick the highest scaled / lowest resolution - cmp_scaled = max(cmp_scaled, match_scaled, orig_scaled) + cmp_scaled = max(cmp_scaled, match_scaled) # eliminate mins under this new resolution. # (CTB note: this means that if a high scaled/low res signature is @@ -154,7 +164,7 @@ def find_best(dblist, query): f_orig_query = len(intersect_orig_mins) / float(len(orig_mins)) # calculate fractions wrt second denominator - metagenome size - orig_mh = orig_query.minhash.downsample_scaled(cmp_scaled) + orig_mh = orig_mh.downsample_scaled(cmp_scaled) query_n_mins = len(orig_mh) f_unique_to_query = len(intersect_mins) / float(query_n_mins) @@ -162,9 +172,10 @@ def find_best(dblist, query): f_unique_weighted = sum((orig_abunds[k] for k in intersect_mins)) \ / sum_abunds - intersect_abunds = list(sorted(orig_abunds[k] for k in intersect_mins)) + # calculate stats on abundances, if desired. average_abund, median_abund, std_abund = 0, 0, 0 - if orig_query.minhash.track_abundance and not ignore_abundance: + if track_abundance: + intersect_abunds = list((orig_abunds[k] for k in intersect_mins)) average_abund = np.mean(intersect_abunds) median_abund = np.median(intersect_abunds) std_abund = np.std(intersect_abunds) @@ -179,14 +190,15 @@ def find_best(dblist, query): median_abund=median_abund, std_abund=std_abund, filename=filename, - md5=best_leaf.md5sum(), - name=best_leaf.name(), - leaf=best_leaf) + md5=best_match.md5sum(), + name=best_match.name(), + leaf=best_match) - # construct a new query, minus the previous one. - query = build_new_query(found_mins, orig_query, cmp_scaled) - query_mins -= set(found_mins) + # construct a new query, subtracting hashes found in previous one. + query = _build_new_query(found_mins, query, cmp_scaled) + # compute weighted_missed: + query_mins -= set(found_mins) weighted_missed = sum((orig_abunds[k] for k in query_mins)) \ / sum_abunds From c281e79b009430acabf0461e211fc8bb7ccd8991 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2019 07:06:38 -0700 Subject: [PATCH 22/37] futher refactoring and simplification --- sourmash/sbt.py | 4 ++-- sourmash/sbtmh.py | 2 +- sourmash/search.py | 40 ++++++++++++++-------------------------- 3 files changed, 17 insertions(+), 29 deletions(-) diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 10661b031..127870424 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -309,9 +309,9 @@ def search(self, query, *args, **kwargs): def gather(self, query, *args, **kwargs): - from .sbtmh import GatherMinHashesFindBestIgnoreMaxHash + from .sbtmh import GatherMinHashes # use a tree search function that keeps track of its best match. 
- search_fn = GatherMinHashesFindBestIgnoreMaxHash().search + search_fn = GatherMinHashes().search results = [] for leaf in self.find(search_fn, query, 0.0): diff --git a/sourmash/sbtmh.py b/sourmash/sbtmh.py index 5f8c20f0b..44067d896 100644 --- a/sourmash/sbtmh.py +++ b/sourmash/sbtmh.py @@ -203,7 +203,7 @@ def search_minhashes_containment(node, sig, threshold, return 0 -class GatherMinHashesFindBestIgnoreMaxHash(object): +class GatherMinHashes(object): def __init__(self): self.best_match = 0 diff --git a/sourmash/search.py b/sourmash/search.py index 580261dbb..d9408e2b7 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -4,12 +4,10 @@ from .logging import notify, error from .signature import SourmashSignature -from .sbtmh import search_minhashes, search_minhashes_containment -from .sbtmh import SearchMinHashesFindBest, GatherMinHashesFindBestIgnoreMaxHash from ._minhash import get_max_hash_for_scaled -# generic SearchResult across individual signatures + SBTs. +# generic SearchResult. SearchResult = namedtuple('SearchResult', 'similarity, match_sig, md5, filename, name') @@ -30,14 +28,6 @@ def format_bp(bp): def search_databases(query, databases, threshold, do_containment, best_only, ignore_abundance): - # set up the search & score function(s) - similarity vs containment - search_fn = search_minhashes - query_match = lambda x: query.similarity( - x, downsample=True, ignore_abundance=ignore_abundance) - if do_containment: - search_fn = search_minhashes_containment - query_match = lambda x: query.contained_by(x, downsample=True) - results = [] found_md5 = set() for (obj, filename, filetype) in databases: @@ -50,26 +40,28 @@ def search_databases(query, databases, threshold, do_containment, best_only, results.append(sr) found_md5.add(sr.md5) - # sort results on similarity (reverse) results.sort(key=lambda x: -x.similarity) return results - -# build a new query object, subtracting found mins and downsampling if needed. -def _build_new_query(to_remove, old_query, scaled=None): - e = old_query.minhash - e.remove_many(to_remove) - if scaled: - e = e.downsample_scaled(scaled) - return SourmashSignature(e) - +### +### gather code +### GatherResult = namedtuple('GatherResult', 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, leaf') +# build a new query object, subtracting found mins and downsampling +def _subtract_and_downsample(to_remove, old_query, scaled=None): + mh = old_query.minhash + mh = mh.downsample_scaled(scaled) + mh.remove_many(to_remove) + + return SourmashSignature(mh) + + def _find_best(dblist, query): """ Search for the best containment, return precisely one match. @@ -115,10 +107,6 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance): import numpy as np orig_abunds = orig_mh.get_mins(with_abundance=True) - # construct a new query object for later modification. - # @CTB note this doesn't actually construct a new query object... - query = _build_new_query([], query) - cmp_scaled = query.minhash.scaled # initialize with resolution of query while 1: best_cont, best_match, filename = _find_best(databases, query) @@ -195,7 +183,7 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance): leaf=best_match) # construct a new query, subtracting hashes found in previous one. 
- query = _build_new_query(found_mins, query, cmp_scaled) + query = _subtract_and_downsample(found_mins, query, cmp_scaled) # compute weighted_missed: query_mins -= set(found_mins) From e82cffc76e2fb31324692250bcf9583f71f377c2 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2019 07:18:35 -0700 Subject: [PATCH 23/37] rely on 'Index.gather' returning actual matches --- sourmash/search.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sourmash/search.py b/sourmash/search.py index d9408e2b7..54fba18f0 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -74,16 +74,17 @@ def _find_best(dblist, query): # search across all databases for (obj, filename, filetype) in dblist: for cont, match, fname in obj.gather(query): - if cont: - # note, break ties based on name, to ensure consistent order. - if (cont == best_cont and match.name() < best_match.name()) or\ - cont > best_cont: - # update best match. - best_cont = cont - best_match = match - - # some objects may not have associated filename (e.g. SBTs) - best_filename = fname or filename + assert cont + + # note, break ties based on name, to ensure consistent order. + if (cont == best_cont and match.name() < best_match.name()) or \ + cont > best_cont: + # update best match. + best_cont = cont + best_match = match + + # some objects may not have associated filename (e.g. SBTs) + best_filename = fname or filename if not best_match: return None, None, None From ef9b900677c0219abd0530f5527da1bc0294f31a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2019 07:53:45 -0700 Subject: [PATCH 24/37] remove duplicate SearchResult, clean up & rationalize SearchResult and GatherResult --- sourmash/commands.py | 18 +++++++++--------- sourmash/index.py | 19 +++++-------------- sourmash/lca/lca_utils.py | 22 +++++++--------------- sourmash/sbt.py | 10 +--------- sourmash/search.py | 26 +++++++++++++++++--------- tests/test_index.py | 30 +++++++++++++++--------------- 6 files changed, 54 insertions(+), 71 deletions(-) diff --git a/sourmash/commands.py b/sourmash/commands.py index cd1cdeb5f..ef31532c6 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -570,7 +570,7 @@ def search(args): print_results("---------- -----") for sr in results[:n_matches]: pct = '{:.1f}%'.format(sr.similarity*100) - name = sr.match_sig._display_name(60) + name = sr.match._display_name(60) print_results('{:>6} {}', pct, name) if args.best_only: @@ -583,14 +583,14 @@ def search(args): w.writeheader() for sr in results: d = dict(sr._asdict()) - del d['match_sig'] + del d['match'] w.writerow(d) # save matching signatures upon request if args.save_matches: outname = args.save_matches.name notify('saving all matched signatures to "{}"', outname) - sig.save_signatures([ sr.match_sig for sr in results ], + sig.save_signatures([ sr.match for sr in results ], args.save_matches) @@ -758,7 +758,7 @@ def gather(args): pct_query = '{:.1f}%'.format(result.f_unique_weighted*100) pct_genome = '{:.1f}%'.format(result.f_match*100) average_abund ='{:.1f}'.format(result.average_abund) - name = result.leaf._display_name(40) + name = result.match._display_name(40) if query.minhash.track_abundance and not args.ignore_abundance: print_results('{:9} {:>7} {:>7} {:>9} {}', @@ -786,13 +786,13 @@ def gather(args): w.writeheader() for result in found: d = dict(result._asdict()) - del d['leaf'] # actual signature not in CSV. + del d['match'] # actual signature not in CSV. 
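            # Each `result` written here is a GatherResult produced by
            # gather_databases() in search.py (refactored in the patches above):
            # that loop repeatedly asks _find_best() for the single best
            # containment across all databases, records it, subtracts the matched
            # hashes from the query (downsampling to the coarsest scaled seen so
            # far), and repeats until too little of the query remains.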
w.writerow(d) if found and args.save_matches: outname = args.save_matches.name notify('saving all matches to "{}"', outname) - sig.save_signatures([ r.leaf for r in found ], args.save_matches) + sig.save_signatures([ r.match for r in found ], args.save_matches) if args.output_unassigned: if not len(query.minhash): @@ -906,7 +906,7 @@ def multigather(args): pct_query = '{:.1f}%'.format(result.f_unique_weighted*100) pct_genome = '{:.1f}%'.format(result.f_match*100) average_abund ='{:.1f}'.format(result.average_abund) - name = result.leaf._display_name(40) + name = result.match._display_name(40) if query.minhash.track_abundance and not args.ignore_abundance: print_results('{:9} {:>7} {:>7} {:>9} {}', @@ -941,14 +941,14 @@ def multigather(args): w.writeheader() for result in found: d = dict(result._asdict()) - del d['leaf'] # actual signature not in CSV. + del d['match'] # actual signature not in CSV. w.writerow(d) output_matches = output_base + '.matches.sig' with open(output_matches, 'wt') as fp: outname = output_matches notify('saving all matches to "{}"', outname) - sig.save_signatures([ r.leaf for r in found ], fp) + sig.save_signatures([ r.match for r in found ], fp) output_unassigned = output_base + '.unassigned.sig' with open(output_unassigned, 'wt') as fp: diff --git a/sourmash/index.py b/sourmash/index.py index 47586db8a..8e3147df7 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -3,11 +3,6 @@ from abc import ABCMeta, abstractmethod from collections import namedtuple -# @CTB copied out of search.py to deal with import order issues, #willfix -SearchResult = namedtuple('SearchResult', - 'similarity, match_sig, md5, filename, name') - - # compatible with Python 2 *and* 3: ABC = ABCMeta("ABC", (object,), {"__slots__": ()}) @@ -87,15 +82,11 @@ def search(self, query, *args, **kwargs): for ss in self.signatures: similarity = query_match(ss) if similarity >= threshold: - # @CTB: check duplicates via md5sum - here or ?? - sr = SearchResult(similarity=similarity, - match_sig=ss, - md5=ss.md5sum(), - filename = self.filename, - name=ss.name()) - matches.append(sr) - - matches.sort(key=lambda x: -x.similarity) + # @CTB: check duplicates via md5sum - here or later? + matches.append((similarity, ss, self.filename)) + + # sort! 
+ matches.sort(key=lambda x: -x[0]) return matches def gather(self, query, *args, **kwargs): diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 73a78a8af..60d6fc897 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -24,9 +24,6 @@ # type to store an element in a taxonomic lineage LineagePair = namedtuple('LineagePair', ['rank', 'name']) -# @CTB copied out of search.py to deal with import order issues, #willfix -SearchResult = namedtuple('SearchResult', - 'similarity, match_sig, md5, filename, name') def check_files_exist(*files): ret = True @@ -275,24 +272,19 @@ def search(self, query, *args, **kwargs): results = [] for x in self.find(query.minhash, threshold, do_containment): - (score, match_sig, md5, filename, name) = x - sr = SearchResult(similarity=score, - match_sig=match_sig, - md5=md5, - filename=filename, - name=name) - results.append(sr) - - results.sort(key=lambda x: -x.similarity) + (score, match, filename) = x + results.append((score, match, filename)) + + results.sort(key=lambda x: -x[0]) return results def gather(self, query, *args, **kwargs): results = [] for x in self.find(query.minhash, 0.0, containment=True, ignore_scaled=True): - (score, match_sig, md5, filename, name) = x + (score, match, filename) = x if score: - results.append((score, match_sig, filename)) + results.append((score, match, filename)) return results @@ -396,7 +388,7 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): from .. import SourmashSignature match_sig = SourmashSignature(match_mh, name=name) - yield score, match_sig, match_sig.md5sum(), self.filename, name + yield score, match_sig, self.filename def load_single_database(filename, verbose=False): diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 127870424..648b769a7 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -77,9 +77,6 @@ def search_transcript(node, seq, threshold): } NodePos = namedtuple("NodePos", ["pos", "node"]) -# @CTB copied out of search.py to deal with import order issues, #willfix -SearchResult = namedtuple('SearchResult', - 'similarity, match_sig, md5, filename, name') class GraphFactory(object): """Build new nodegraphs (Bloom filters) of a specific (fixed) size. @@ -298,12 +295,7 @@ def search(self, query, *args, **kwargs): # tree search should always/only return matches above threshold assert similarity >= threshold - sr = SearchResult(similarity=similarity, - match_sig=leaf.data, - md5=leaf.data.md5sum(), - name=leaf.data.name(), - filename=None) - results.append(sr) + results.append((similarity, leaf.data, None)) return results diff --git a/sourmash/search.py b/sourmash/search.py index 54fba18f0..7694bdfdf 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -9,7 +9,7 @@ # generic SearchResult. 
SearchResult = namedtuple('SearchResult', - 'similarity, match_sig, md5, filename, name') + 'similarity, match, md5, filename, name') def format_bp(bp): @@ -35,22 +35,30 @@ def search_databases(query, databases, threshold, do_containment, best_only, do_containment=do_containment, ignore_abundance=ignore_abundance, best_only=best_only) - for sr in search_iter: - if sr.md5 not in found_md5: - results.append(sr) - found_md5.add(sr.md5) + for (similarity, match, filename) in search_iter: + md5 = match.md5sum() + if md5 not in found_md5: + results.append((similarity, match, filename)) + found_md5.add(md5) # sort results on similarity (reverse) - results.sort(key=lambda x: -x.similarity) + results.sort(key=lambda x: -x[0]) - return results + x = [] + for (similarity, match, filename) in results: + x.append(SearchResult(similarity=similarity, + match=match, + md5=match.md5sum(), + filename=filename, + name=match.name())) + return x ### ### gather code ### GatherResult = namedtuple('GatherResult', - 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, leaf') + 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, match') # build a new query object, subtracting found mins and downsampling @@ -181,7 +189,7 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance): filename=filename, md5=best_match.md5sum(), name=best_match.name(), - leaf=best_match) + match=best_match) # construct a new query, subtracting hashes found in previous one. query = _subtract_and_downsample(found_mins, query, cmp_scaled) diff --git a/tests/test_index.py b/tests/test_index.py index e232770ad..2f5bad4e1 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -81,32 +81,32 @@ def test_linear_index_search(): # now, search for sig2 sr = lidx.search(ss2, threshold=1.0) - print([s.name for s in sr]) + print([s[1].name() for s in sr]) assert len(sr) == 1 - assert sr[0].match_sig == ss2 + assert sr[0][1] == ss2 # search for sig47 with lower threshold; search order not guaranteed. sr = lidx.search(ss47, threshold=0.1) - print([s.name for s in sr]) + print([s[1].name() for s in sr]) assert len(sr) == 2 - sr.sort(key=lambda x: -x.similarity) - assert sr[0].match_sig == ss47 - assert sr[1].match_sig == ss63 + sr.sort(key=lambda x: -x[0]) + assert sr[0][1] == ss47 + assert sr[1][1] == ss63 # search for sig63 with lower threshold; search order not guaranteed. sr = lidx.search(ss63, threshold=0.1) - print([s.name for s in sr]) + print([s[1].name() for s in sr]) assert len(sr) == 2 - sr.sort(key=lambda x: -x.similarity) - assert sr[0].match_sig == ss63 - assert sr[1].match_sig == ss47 + sr.sort(key=lambda x: -x[0]) + assert sr[0][1] == ss63 + assert sr[1][1] == ss47 # search for sig63 with high threshold => 1 match sr = lidx.search(ss63, threshold=0.8) - print([s.name for s in sr]) + print([s[1].name for s in sr]) assert len(sr) == 1 - sr.sort(key=lambda x: -x.similarity) - assert sr[0].match_sig == ss63 + sr.sort(key=lambda x: -x[0]) + assert sr[0][1] == ss63 def test_linear_index_gather(): @@ -212,6 +212,6 @@ def test_linear_index_save_load(): # now, search for sig2 sr = linear2.search(ss2, threshold=1.0) - print([s.name for s in sr]) + print([s[1].name() for s in sr]) assert len(sr) == 1 - assert sr[0].match_sig == ss2 + assert sr[0][1] == ss2 From df4b91145fd201c33baba971e60166b3b4f21fcf Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sun, 8 Sep 2019 08:34:56 -0700 Subject: [PATCH 25/37] display full order of sigs in failed tests --- tests/test_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_index.py b/tests/test_index.py index 2f5bad4e1..a4e371911 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -165,7 +165,7 @@ def test_linear_index_save(): print(si) print(x) - assert si == x + assert si == x, si def test_linear_index_load(): @@ -187,7 +187,7 @@ def test_linear_index_load(): linear = LinearIndex.load(filename) x = [ ss2, ss47, ss63 ] - assert linear.signatures == x + assert linear.signatures == x, linear.signatures assert linear.filename == filename From 95ddb7fcfa68e6f37231aa96909369c7e2c9e336 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Wed, 23 Oct 2019 23:06:21 +0000 Subject: [PATCH 26/37] fix heisenbug in tests --- tests/test_index.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_index.py b/tests/test_index.py index a4e371911..bc596b0f3 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -155,9 +155,9 @@ def test_linear_index_save(): linear.save(filename) from sourmash import load_signatures - si = list(load_signatures(filename)) + si = set(load_signatures(filename)) - x = [ ss2, ss47, ss63 ] + x = {ss2, ss47, ss63} print(len(si)) print(len(x)) @@ -186,8 +186,8 @@ def test_linear_index_load(): linear = LinearIndex.load(filename) - x = [ ss2, ss47, ss63 ] - assert linear.signatures == x, linear.signatures + x = {ss2, ss47, ss63} + assert set(linear.signatures) == x, linear.signatures assert linear.filename == filename From 5984d3de0c5bffdb2694e9b8a8ac7af07fae472a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 12 Dec 2019 15:55:38 -0800 Subject: [PATCH 27/37] add signatures() iterator to Index objects --- sourmash/index.py | 27 +++++++++++++++------------ sourmash/lca/lca_utils.py | 11 +++++++---- sourmash/sbt.py | 3 +++ tests/test_index.py | 2 +- 4 files changed, 26 insertions(+), 17 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 8e3147df7..965bd199c 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -9,8 +9,8 @@ class Index(ABC): @abstractmethod - def find(self, search_fn, *args, **kwargs): - """ """ + def signatures(self): + "Return an iterator over all signatures in the Index object." @abstractmethod def search(self, signature, *args, **kwargs): @@ -35,22 +35,25 @@ def load(cls, location, leaf_loader=None, storage=None, print_version_warning=Tr class LinearIndex(Index): - def __init__(self, signatures=None, filename=None): - self.signatures = [] - if signatures: - self.signatures = list(signatures) + def __init__(self, _signatures=None, filename=None): + self._signatures = [] + if _signatures: + self._signatures = list(_signatures) self.filename = filename + def signatures(self): + return iter(self._signatures) + def __len__(self): - return len(self.signatures) + return len(self._signatures) def insert(self, node): - self.signatures.append(node) + self._signatures.append(node) def find(self, search_fn, *args, **kwargs): matches = [] - for node in self.signatures: + for node in self.signatures(): if search_fn(node, *args): matches.append(node) return matches @@ -79,7 +82,7 @@ def search(self, query, *args, **kwargs): # do the actual search: matches = [] - for ss in self.signatures: + for ss in self.signatures(): similarity = query_match(ss) if similarity >= threshold: # @CTB: check duplicates via md5sum - here or later? 
@@ -92,7 +95,7 @@ def search(self, query, *args, **kwargs): def gather(self, query, *args, **kwargs): "Return the best containment in the list." results = [] - for ss in self.signatures: + for ss in self.signatures(): cont = query.minhash.containment_ignore_maxhash(ss.minhash) if cont: results.append((cont, ss, self.filename)) @@ -104,7 +107,7 @@ def gather(self, query, *args, **kwargs): def save(self, path): from .signature import save_signatures with open(path, 'wt') as fp: - save_signatures(self.signatures, fp) + save_signatures(self.signatures(), fp) @classmethod def load(cls, location): diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 60d6fc897..0ea5ace50 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -164,6 +164,9 @@ def __init__(self): def __repr__(self): return "LCA_Database('{}')".format(self.filename) + def signatures(self): + raise NotImplementedError + def load(self, db_name): "Load from a JSON file." xopen = open @@ -334,7 +337,7 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): elif self.scaled < minhash.scaled and not ignore_scaled: raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) - if not hasattr(self, 'signatures'): + if not hasattr(self, '_signatures'): debug('creating signatures for LCA DB...') sigd = defaultdict(minhash.copy_and_clear) @@ -342,9 +345,9 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): for vv in v: sigd[vv].add_hash(k) - self.signatures = sigd + self._signatures = sigd - debug('=> {} signatures!', len(self.signatures)) + debug('=> {} signatures!', len(self._signatures)) # build idx_to_ident from ident_to_idx if not hasattr(self, 'idx_to_ident'): @@ -370,7 +373,7 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): name = self.ident_to_name[ident] debug('looking at {} ({})', ident, name) - match_mh = self.signatures[idx] + match_mh = self._signatures[idx] match_size = len(match_mh) debug('count: {}; query_mins: {}; match size: {}', diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 648b769a7..743f6e25e 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -134,6 +134,9 @@ def __init__(self, factory, d=2, storage=None): self.next_node = 0 self.storage = storage + def signatures(self): + return leaves() + def new_node_pos(self, node): if not self._nodes: self.next_node = 1 diff --git a/tests/test_index.py b/tests/test_index.py index bc596b0f3..923239d74 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -187,7 +187,7 @@ def test_linear_index_load(): linear = LinearIndex.load(filename) x = {ss2, ss47, ss63} - assert set(linear.signatures) == x, linear.signatures + assert set(linear.signatures()) == x, linear.signatures assert linear.filename == filename From 931737ef3f9ba796f8245682cca42260928f863b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 12 Dec 2019 16:17:04 -0800 Subject: [PATCH 28/37] move search, gather functions into base Index class --- sourmash/index.py | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 965bd199c..667cc2ab7 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -12,14 +12,6 @@ class Index(ABC): def signatures(self): "Return an iterator over all signatures in the Index object." 
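# Hypothetical usage sketch (names mirror tests/test_index.py; ss2/ss47/ss63 are
# signatures loaded there): with search() and gather() implemented once on the
# base class, a subclass such as LinearIndex only has to supply signatures() and
# insert(), and callers get back plain (score, match, filename) tuples.
lidx = LinearIndex()
for sig in (ss2, ss47, ss63):
    lidx.insert(sig)

for similarity, match, filename in lidx.search(ss2, threshold=0.1):
    print(similarity, match.name())

best_containment, best_match, _ = lidx.gather(ss2)[0]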
- @abstractmethod - def search(self, signature, *args, **kwargs): - """ """ - - @abstractmethod - def gather(self, signature, *args, **kwargs): - """ """ - @abstractmethod def insert(self, node): """ """ @@ -33,23 +25,6 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True): """ """ - -class LinearIndex(Index): - def __init__(self, _signatures=None, filename=None): - self._signatures = [] - if _signatures: - self._signatures = list(_signatures) - self.filename = filename - - def signatures(self): - return iter(self._signatures) - - def __len__(self): - return len(self._signatures) - - def insert(self, node): - self._signatures.append(node) - def find(self, search_fn, *args, **kwargs): matches = [] @@ -100,10 +75,27 @@ def gather(self, query, *args, **kwargs): if cont: results.append((cont, ss, self.filename)) - results.sort(reverse=True) # CTB: sort on ss.name() too? + results.sort(reverse=True, key=lambda x: (x[0], x[1].name())) return results + +class LinearIndex(Index): + def __init__(self, _signatures=None, filename=None): + self._signatures = [] + if _signatures: + self._signatures = list(_signatures) + self.filename = filename + + def signatures(self): + return iter(self._signatures) + + def __len__(self): + return len(self._signatures) + + def insert(self, node): + self._signatures.append(node) + def save(self, path): from .signature import save_signatures with open(path, 'wt') as fp: From da7c97905cd7c1b1ea82dfb6a468a2678e293e5f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 12 Dec 2019 16:39:12 -0800 Subject: [PATCH 29/37] fix lca search ignore abundance --- sourmash/commands.py | 4 ++++ sourmash/lca/lca_utils.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/sourmash/commands.py b/sourmash/commands.py index c986c5600..e82fb6f3a 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -545,6 +545,10 @@ def search(args): not args.containment, args.traverse_directory) + # forcibly ignore abundances if query has no abundances + if not query.minhash.track_abundance: + args.ignore_abundance = True + if not len(databases): error('Nothing found to search!') sys.exit(-1) diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 0ea5ace50..e4ad78762 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -271,7 +271,9 @@ def search(self, query, *args, **kwargs): raise TypeError("'search' requires 'threshold'") threshold = kwargs['threshold'] do_containment = kwargs.get('do_containment', False) - # @CTB ignore_abundance? + ignore_abundance = kwargs.get('ignore_abundance') + if not ignore_abundance: + raise TypeError("'search' on LCA databases does not use abundance") results = [] for x in self.find(query.minhash, threshold, do_containment): From 3fa8de353bf30a8e37d8ac6dad6239870bcafc24 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 12 Dec 2019 16:48:27 -0800 Subject: [PATCH 30/37] add function doc --- sourmash/index.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 667cc2ab7..3f7666c16 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -34,7 +34,17 @@ def find(self, search_fn, *args, **kwargs): return matches def search(self, query, *args, **kwargs): - """@@ + """Return set of matches with similarity above 'threshold'. + + Results will be sorted by similarity, highest to lowest. 
+ + Optional arguments accepted by all Index subclasses: + * do_containment: default False. If True, use Jaccard containment. + * best_only: default False. If True, allow optimizations that + may. May discard matches better than threshold, but first match + is guaranteed to be best. + * ignore_abundance: default False. If True, and query signature + and database support k-mer abundances, ignore those abundances. Note, the "best only" hint is ignored by LinearIndex. """ @@ -60,7 +70,6 @@ def search(self, query, *args, **kwargs): for ss in self.signatures(): similarity = query_match(ss) if similarity >= threshold: - # @CTB: check duplicates via md5sum - here or later? matches.append((similarity, ss, self.filename)) # sort! @@ -68,7 +77,7 @@ def search(self, query, *args, **kwargs): return matches def gather(self, query, *args, **kwargs): - "Return the best containment in the list." + "Return the match with the best Jaccard containment in the Index." results = [] for ss in self.signatures(): cont = query.minhash.containment_ignore_maxhash(ss.minhash) From d3dc2fb28bbd1d8446ae9aaee5ee7abb5dfb4d4c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 13 Dec 2019 06:33:41 -0800 Subject: [PATCH 31/37] add signatures() method to both LCA and SBT indices --- sourmash/lca/lca_utils.py | 31 ++++++++++++++------- sourmash/sbt.py | 3 ++- tests/test_lca.py | 8 ++++++ tests/test_sbt.py | 57 +++++++++++++++++++++++++-------------- 4 files changed, 68 insertions(+), 31 deletions(-) diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index e4ad78762..7167f18c9 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -165,7 +165,10 @@ def __repr__(self): return "LCA_Database('{}')".format(self.filename) def signatures(self): - raise NotImplementedError + from .. import SourmashSignature + self._create_signatures() + for v in self._signatures.values(): + yield SourmashSignature(v) def load(self, db_name): "Load from a JSON file." @@ -329,17 +332,13 @@ def get_lineage_assignments(self, hashval): return x - def find(self, minhash, threshold, containment=False, ignore_scaled=False): - """ - Do a Jaccard similarity or containment search. - """ - # make sure we're looking at the same scaled value as database - if self.scaled > minhash.scaled: - minhash = minhash.downsample_scaled(self.scaled) - elif self.scaled < minhash.scaled and not ignore_scaled: - raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) + def _create_signatures(self): + "Create a _signatures member dictionary that contains {idx: minhash}." + from .. import MinHash if not hasattr(self, '_signatures'): + minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled) + debug('creating signatures for LCA DB...') sigd = defaultdict(minhash.copy_and_clear) @@ -351,6 +350,18 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): debug('=> {} signatures!', len(self._signatures)) + def find(self, minhash, threshold, containment=False, ignore_scaled=False): + """ + Do a Jaccard similarity or containment search. 
+ """ + # make sure we're looking at the same scaled value as database + if self.scaled > minhash.scaled: + minhash = minhash.downsample_scaled(self.scaled) + elif self.scaled < minhash.scaled and not ignore_scaled: + raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) + + self._create_signatures() + # build idx_to_ident from ident_to_idx if not hasattr(self, 'idx_to_ident'): idx_to_ident = {} diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 743f6e25e..ea713b720 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -135,7 +135,8 @@ def __init__(self, factory, d=2, storage=None): self.storage = storage def signatures(self): - return leaves() + for k in self.leaves(): + yield k.data def new_node_pos(self, node): if not self._nodes: diff --git a/tests/test_lca.py b/tests/test_lca.py index fb63d63ad..820269dc8 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -133,6 +133,14 @@ def test_db_repr(): assert repr(db) == "LCA_Database('{}')".format(filename) +def test_lca_index_signatures_method(): + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + siglist = list(db.signatures()) + assert len(siglist) == 2 + + ## command line tests diff --git a/tests/test_sbt.py b/tests/test_sbt.py index bef6e7c6e..2c694d7a9 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -4,12 +4,12 @@ import pytest -from sourmash import signature +from sourmash import load_signatures, load_one_signature from sourmash.sbt import SBT, GraphFactory, Leaf, Node from sourmash.sbtmh import (SigLeaf, search_minhashes, - search_minhashes_containment) + search_minhashes_containment) from sourmash.sbt_storage import (FSStorage, TarStorage, - RedisStorage, IPFSStorage) + RedisStorage, IPFSStorage) from . 
import sourmash_tst_utils as utils @@ -138,7 +138,7 @@ def test_tree_v1_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = next(load_signatures(testdata1)) results_v1 = {str(s) for s in tree_v1.find(search_minhashes_containment, to_search, 0.1)} @@ -157,7 +157,7 @@ def test_tree_v2_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = next(load_signatures(testdata1)) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -176,7 +176,7 @@ def test_tree_v3_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = next(load_signatures(testdata1)) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -195,7 +195,7 @@ def test_tree_v5_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = next(load_signatures(testdata1)) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -211,7 +211,7 @@ def test_tree_save_load(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) to_search = leaf @@ -241,7 +241,7 @@ def test_tree_save_load_v5(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -272,7 +272,7 @@ def test_search_minhashes(): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) @@ -295,7 +295,7 @@ def test_binary_nary_tree(): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) for tree in trees.values(): tree.insert(leaf) @@ -323,7 +323,7 @@ def test_sbt_combine(n_children): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) if n_leaves < 4: @@ -341,7 +341,7 @@ def test_sbt_combine(n_children): assert len(t_leaves) == len(t1_leaves) assert t1_leaves == t_leaves - to_search = next(signature.load_signatures( + to_search = next(load_signatures( utils.get_test_data(utils.SIG_FILES[0]))) t1_result = {str(s) for s in tree_1.find(search_minhashes, to_search, 0.1)} @@ -370,7 +370,7 @@ def test_sbt_fsstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) to_search = leaf @@ -403,7 +403,7 @@ def test_sbt_tarstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = 
next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) to_search = leaf @@ -439,7 +439,7 @@ def test_sbt_ipfsstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) to_search = leaf @@ -477,7 +477,7 @@ def test_sbt_redisstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) to_search = leaf @@ -516,7 +516,7 @@ def test_tree_repair(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = next(load_signatures(testdata1)) results_repair = {str(s) for s in tree_repair.find(search_minhashes, to_search, 0.1)} @@ -532,7 +532,7 @@ def test_tree_repair_insert(): leaf_loader=SigLeaf.load) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree_repair.insert(leaf) @@ -552,7 +552,7 @@ def test_save_sparseness(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) tree.insert(leaf) to_search = leaf @@ -586,3 +586,20 @@ def test_save_sparseness(n_children): # Leaf nodes can't have children if isinstance(node, Leaf): assert all(c.node is None for c in tree_loaded.children(pos)) + + +def test_sbt_signatures(): + factory = GraphFactory(31, 1e5, 4) + tree = SBT(factory, d=2) + + sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) + sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + + tree.insert(SigLeaf('47', sig47)) + tree.insert(SigLeaf('63', sig63)) + + xx = list(tree.signatures()) + assert len(xx) == 2 + + assert sig47 in xx + assert sig63 in xx From 83ad1b9ea8e100a6ba5b29d8dc75033eaab910dc Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 13 Dec 2019 16:16:27 -0800 Subject: [PATCH 32/37] Update tests/test_sbt.py Co-Authored-By: Luiz Irber --- tests/test_sbt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 2c694d7a9..8bd059c0e 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -138,7 +138,7 @@ def test_tree_v1_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v1 = {str(s) for s in tree_v1.find(search_minhashes_containment, to_search, 0.1)} From c80ef462b23fed5e8e9471bce63fb793d922439e Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 14 Dec 2019 07:10:37 -0800 Subject: [PATCH 33/37] SBT.insert now matches Index.insert, while SBT.add_node does what insert used to --- sourmash/commands.py | 3 +-- sourmash/sbt.py | 13 +++++++---- tests/test_index.py | 10 ++++---- tests/test_sbt.py | 55 +++++++++++++++++++++----------------------- 4 files changed, 40 insertions(+), 41 deletions(-) diff --git a/sourmash/commands.py b/sourmash/commands.py index e82fb6f3a..ac81f816b 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -450,8 +450,7 @@ def index(args): ss.minhash = ss.minhash.downsample_scaled(args.scaled) scaleds.add(ss.minhash.scaled) - leaf = SigLeaf(ss.md5sum(), ss) - tree.insert(leaf) + tree.insert(ss) n += 1 if not ss: diff --git a/sourmash/sbt.py b/sourmash/sbt.py index ea713b720..ddc2f617b 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -165,7 +165,14 @@ def new_node_pos(self, node): return self.next_node - def insert(self, node): + def insert(self, signature): + "Add a new SourmashSignature in to the SBT." + from .sbtmh import SigLeaf + + leaf = SigLeaf(signature.name(), signature) + self.add_node(leaf) + + def add_node(self, node): pos = self.new_node_pos(node) if pos == 0: # empty tree; initialize w/node. @@ -213,10 +220,6 @@ def insert(self, node): node.update(self._nodes[p.pos]) p = self.parent(p.pos) - @deprecated(details="Use the insert method instead") - def add_node(self, node): - self.insert(node) - def find(self, search_fn, *args, **kwargs): "Search the tree using `search_fn`." diff --git a/tests/test_index.py b/tests/test_index.py index 923239d74..cfcc5c976 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -36,11 +36,11 @@ def test_simple_index(n_children): leaf5.data.count("AAAAT") leaf5.data.count("GAAAA") - root.insert(leaf1) - root.insert(leaf2) - root.insert(leaf3) - root.insert(leaf4) - root.insert(leaf5) + root.add_node(leaf1) + root.add_node(leaf2) + root.add_node(leaf3) + root.add_node(leaf4) + root.add_node(leaf5) def search_kmer(obj, seq): return obj.data.get(seq) diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 2c694d7a9..7023f938d 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -43,11 +43,11 @@ def test_simple(n_children): leaf5.data.count('AAAAT') leaf5.data.count('GAAAA') - root.insert(leaf1) - root.insert(leaf2) - root.insert(leaf3) - root.insert(leaf4) - root.insert(leaf5) + root.add_node(leaf1) + root.add_node(leaf2) + root.add_node(leaf3) + root.add_node(leaf4) + root.add_node(leaf5) def search_kmer(obj, seq): return obj.data.get(seq) @@ -104,11 +104,11 @@ def test_longer_search(n_children): leaf5.data.count('AAAAT') leaf5.data.count('GAAAA') - root.insert(leaf1) - root.insert(leaf2) - root.insert(leaf3) - root.insert(leaf4) - root.insert(leaf5) + root.add_node(leaf1) + root.add_node(leaf2) + root.add_node(leaf3) + root.add_node(leaf4) + root.add_node(leaf5) def kmers(k, seq): for start in range(len(seq) - k + 1): @@ -212,8 +212,8 @@ def test_tree_save_load(n_children): for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) - leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + leaf = SigLeaf(f, sig) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -273,8 +273,7 @@ def test_search_minhashes(): n_leaves = 0 for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) - leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.insert(sig) to_search = next(iter(tree.leaves())) @@ -298,7 +297,7 @@ def test_binary_nary_tree(): sig = 
next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) for tree in trees.values(): - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf n_leaves += 1 @@ -324,12 +323,11 @@ def test_sbt_combine(n_children): n_leaves = 0 for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) - leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.insert(sig) if n_leaves < 4: - tree_1.insert(leaf) + tree_1.insert(sig) else: - tree_2.insert(leaf) + tree_2.insert(sig) n_leaves += 1 tree_1.combine(tree_2) @@ -360,7 +358,7 @@ def test_sbt_combine(n_children): if not next_empty: next_empty = n + 1 - tree_1.insert(leaf) + tree_1.add_node(SigLeaf(to_search.name(), to_search)) assert tree_1.next_node == next_empty @@ -372,7 +370,7 @@ def test_sbt_fsstorage(): for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -405,7 +403,7 @@ def test_sbt_tarstorage(): for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -441,7 +439,7 @@ def test_sbt_ipfsstorage(): for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -479,7 +477,7 @@ def test_sbt_redisstorage(): for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -533,8 +531,7 @@ def test_tree_repair_insert(): for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) - leaf = SigLeaf(os.path.basename(f), sig) - tree_repair.insert(leaf) + tree_repair.insert(sig) for pos, node in tree_repair: # Every parent of a node must be an internal node (and not a leaf), @@ -554,7 +551,7 @@ def test_save_sparseness(n_children): for f in utils.SIG_FILES: sig = next(load_signatures(utils.get_test_data(f))) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -595,8 +592,8 @@ def test_sbt_signatures(): sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) - tree.insert(SigLeaf('47', sig47)) - tree.insert(SigLeaf('63', sig63)) + tree.insert(sig47) + tree.insert(sig63) xx = list(tree.signatures()) assert len(xx) == 2 From b0af24d06e1ec23ce6a74bc32bc7ed2364e555e0 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 14 Dec 2019 15:24:06 -0800 Subject: [PATCH 34/37] clean up signature loading --- tests/test_sbt.py | 54 ++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/tests/test_sbt.py b/tests/test_sbt.py index c97ba4954..5854bee87 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -4,7 +4,7 @@ import pytest -from sourmash import load_signatures, load_one_signature +from sourmash import load_one_signature from sourmash.sbt import SBT, GraphFactory, Leaf, Node from sourmash.sbtmh import (SigLeaf, search_minhashes, search_minhashes_containment) @@ -157,7 +157,7 @@ def test_tree_v2_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -176,7 +176,7 @@ def test_tree_v3_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -195,7 +195,7 @@ def test_tree_v5_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -211,8 +211,8 @@ def test_tree_save_load(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) - leaf = SigLeaf(f, sig) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -241,7 +241,7 @@ def test_tree_save_load_v5(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -272,8 +272,9 @@ def test_search_minhashes(): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) - tree.insert(sig) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) + tree.add_node(leaf) to_search = next(iter(tree.leaves())) @@ -294,7 +295,7 @@ def test_binary_nary_tree(): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) for tree in trees.values(): tree.add_node(leaf) @@ -322,12 +323,13 @@ def test_sbt_combine(n_children): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) - tree.insert(sig) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) + tree.add_node(leaf) if n_leaves < 4: - tree_1.insert(sig) + tree_1.add_node(leaf) else: - tree_2.insert(sig) + tree_2.add_node(leaf) n_leaves += 1 tree_1.combine(tree_2) @@ -339,8 +341,7 @@ def test_sbt_combine(n_children): assert len(t_leaves) == len(t1_leaves) assert t1_leaves == t_leaves - to_search = next(load_signatures( - utils.get_test_data(utils.SIG_FILES[0]))) + to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0])) t1_result = {str(s) for s in tree_1.find(search_minhashes, to_search, 0.1)} 
tree_result = {str(s) for s in tree.find(search_minhashes, @@ -368,7 +369,8 @@ def test_sbt_fsstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -401,7 +403,8 @@ def test_sbt_tarstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -437,7 +440,8 @@ def test_sbt_ipfsstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -475,7 +479,8 @@ def test_sbt_redisstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -514,7 +519,7 @@ def test_tree_repair(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_repair = {str(s) for s in tree_repair.find(search_minhashes, to_search, 0.1)} @@ -530,8 +535,9 @@ def test_tree_repair_insert(): leaf_loader=SigLeaf.load) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) - tree_repair.insert(sig) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) + tree_repair.add_node(leaf) for pos, node in tree_repair: # Every parent of a node must be an internal node (and not a leaf), @@ -549,7 +555,7 @@ def test_save_sparseness(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf From 2e8c3ab48962af820688a7933358087f252b27b1 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 14 Dec 2019 19:38:57 -0800 Subject: [PATCH 35/37] round out Index method tests, sort of :) --- sourmash/index.py | 10 +++++++++- sourmash/lca/lca_utils.py | 15 +++++++++------ tests/test_lca.py | 20 ++++++++++++++++++++ tests/test_sbt.py | 3 ++- 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 3f7666c16..df1f58e89 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -13,7 +13,7 @@ def signatures(self): "Return an iterator over all signatures in the Index object." @abstractmethod - def insert(self, node): + def insert(self, signature): """ """ @abstractmethod @@ -26,6 +26,14 @@ def load(cls, location, leaf_loader=None, storage=None, print_version_warning=Tr """ """ def find(self, search_fn, *args, **kwargs): + """Use search_fn to find matching signatures in the index. + + search_fn(other_sig, *args) should return a boolean that indicates + whether other_sig is a match. + + Returns a list. 
+ """ + matches = [] for node in self.signatures(): diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 7167f18c9..7dceb82c6 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -279,7 +279,7 @@ def search(self, query, *args, **kwargs): raise TypeError("'search' on LCA databases does not use abundance") results = [] - for x in self.find(query.minhash, threshold, do_containment): + for x in self.find_signatures(query.minhash, threshold, do_containment): (score, match, filename) = x results.append((score, match, filename)) @@ -288,8 +288,8 @@ def search(self, query, *args, **kwargs): def gather(self, query, *args, **kwargs): results = [] - for x in self.find(query.minhash, 0.0, - containment=True, ignore_scaled=True): + for x in self.find_signatures(query.minhash, 0.0, + containment=True, ignore_scaled=True): (score, match, filename) = x if score: results.append((score, match, filename)) @@ -297,7 +297,10 @@ def gather(self, query, *args, **kwargs): return results def insert(self, node): - pass + raise NotImplementedError + + def find(self, search_fn, *args, **kwargs): + raise NotImplementedError def downsample_scaled(self, scaled): """ @@ -350,7 +353,8 @@ def _create_signatures(self): debug('=> {} signatures!', len(self._signatures)) - def find(self, minhash, threshold, containment=False, ignore_scaled=False): + def find_signatures(self, minhash, threshold, containment=False, + ignore_scaled=False): """ Do a Jaccard similarity or containment search. """ @@ -400,7 +404,6 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): debug('score: {} (containment? {})', score, containment) if score >= threshold: - # reconstruct signature... ugh. from .. import SourmashSignature match_sig = SourmashSignature(match_mh, name=name) diff --git a/tests/test_lca.py b/tests/test_lca.py index 820269dc8..f5ac6cb8d 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -134,12 +134,32 @@ def test_db_repr(): def test_lca_index_signatures_method(): + # test 'signatures' method from base class Index filename = utils.get_test_data('lca/47+63.lca.json') db, ksize, scaled = lca_utils.load_single_database(filename) siglist = list(db.signatures()) assert len(siglist) == 2 +def test_lca_index_insert_method(): + # test 'signatures' method from base class Index + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + sig = next(iter(db.signatures())) + + with pytest.raises(NotImplementedError) as e: + db.insert(sig) + +def test_lca_index_find_method(): + # test 'signatures' method from base class Index + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + sig = next(iter(db.signatures())) + + with pytest.raises(NotImplementedError) as e: + db.find(None) ## command line tests diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 5854bee87..3f2dfd51c 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -591,7 +591,8 @@ def test_save_sparseness(n_children): assert all(c.node is None for c in tree_loaded.children(pos)) -def test_sbt_signatures(): +def test_sbt_as_index_signatures(): + # test 'signatures' method from Index base class. factory = GraphFactory(31, 1e5, 4) tree = SBT(factory, d=2) From c6c02139c674be505a76358a190984786ad23653 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 14 Dec 2019 21:02:24 -0800 Subject: [PATCH 36/37] [WIP] add signatures() method to both LCA and SBT indices (#796) * add signatures() method to both LCA and SBT indices * Update tests/test_sbt.py Co-Authored-By: Luiz Irber * SBT.insert now matches Index.insert, while SBT.add_node does what insert used to * clean up signature loading * round out Index method tests, sort of :) --- sourmash/commands.py | 3 +- sourmash/index.py | 10 +++- sourmash/lca/lca_utils.py | 44 +++++++++------ sourmash/sbt.py | 16 +++--- tests/test_index.py | 10 ++-- tests/test_lca.py | 28 ++++++++++ tests/test_sbt.py | 109 +++++++++++++++++++++++--------------- 7 files changed, 147 insertions(+), 73 deletions(-) diff --git a/sourmash/commands.py b/sourmash/commands.py index e82fb6f3a..ac81f816b 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -450,8 +450,7 @@ def index(args): ss.minhash = ss.minhash.downsample_scaled(args.scaled) scaleds.add(ss.minhash.scaled) - leaf = SigLeaf(ss.md5sum(), ss) - tree.insert(leaf) + tree.insert(ss) n += 1 if not ss: diff --git a/sourmash/index.py b/sourmash/index.py index 3f7666c16..df1f58e89 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -13,7 +13,7 @@ def signatures(self): "Return an iterator over all signatures in the Index object." @abstractmethod - def insert(self, node): + def insert(self, signature): """ """ @abstractmethod @@ -26,6 +26,14 @@ def load(cls, location, leaf_loader=None, storage=None, print_version_warning=Tr """ """ def find(self, search_fn, *args, **kwargs): + """Use search_fn to find matching signatures in the index. + + search_fn(other_sig, *args) should return a boolean that indicates + whether other_sig is a match. + + Returns a list. + """ + matches = [] for node in self.signatures(): diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index e4ad78762..7dceb82c6 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -165,7 +165,10 @@ def __repr__(self): return "LCA_Database('{}')".format(self.filename) def signatures(self): - raise NotImplementedError + from .. import SourmashSignature + self._create_signatures() + for v in self._signatures.values(): + yield SourmashSignature(v) def load(self, db_name): "Load from a JSON file." @@ -276,7 +279,7 @@ def search(self, query, *args, **kwargs): raise TypeError("'search' on LCA databases does not use abundance") results = [] - for x in self.find(query.minhash, threshold, do_containment): + for x in self.find_signatures(query.minhash, threshold, do_containment): (score, match, filename) = x results.append((score, match, filename)) @@ -285,8 +288,8 @@ def search(self, query, *args, **kwargs): def gather(self, query, *args, **kwargs): results = [] - for x in self.find(query.minhash, 0.0, - containment=True, ignore_scaled=True): + for x in self.find_signatures(query.minhash, 0.0, + containment=True, ignore_scaled=True): (score, match, filename) = x if score: results.append((score, match, filename)) @@ -294,7 +297,10 @@ def gather(self, query, *args, **kwargs): return results def insert(self, node): - pass + raise NotImplementedError + + def find(self, search_fn, *args, **kwargs): + raise NotImplementedError def downsample_scaled(self, scaled): """ @@ -329,17 +335,13 @@ def get_lineage_assignments(self, hashval): return x - def find(self, minhash, threshold, containment=False, ignore_scaled=False): - """ - Do a Jaccard similarity or containment search. 
- """ - # make sure we're looking at the same scaled value as database - if self.scaled > minhash.scaled: - minhash = minhash.downsample_scaled(self.scaled) - elif self.scaled < minhash.scaled and not ignore_scaled: - raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) + def _create_signatures(self): + "Create a _signatures member dictionary that contains {idx: minhash}." + from .. import MinHash if not hasattr(self, '_signatures'): + minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled) + debug('creating signatures for LCA DB...') sigd = defaultdict(minhash.copy_and_clear) @@ -351,6 +353,19 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): debug('=> {} signatures!', len(self._signatures)) + def find_signatures(self, minhash, threshold, containment=False, + ignore_scaled=False): + """ + Do a Jaccard similarity or containment search. + """ + # make sure we're looking at the same scaled value as database + if self.scaled > minhash.scaled: + minhash = minhash.downsample_scaled(self.scaled) + elif self.scaled < minhash.scaled and not ignore_scaled: + raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) + + self._create_signatures() + # build idx_to_ident from ident_to_idx if not hasattr(self, 'idx_to_ident'): idx_to_ident = {} @@ -389,7 +404,6 @@ def find(self, minhash, threshold, containment=False, ignore_scaled=False): debug('score: {} (containment? {})', score, containment) if score >= threshold: - # reconstruct signature... ugh. from .. import SourmashSignature match_sig = SourmashSignature(match_mh, name=name) diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 743f6e25e..ddc2f617b 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -135,7 +135,8 @@ def __init__(self, factory, d=2, storage=None): self.storage = storage def signatures(self): - return leaves() + for k in self.leaves(): + yield k.data def new_node_pos(self, node): if not self._nodes: @@ -164,7 +165,14 @@ def new_node_pos(self, node): return self.next_node - def insert(self, node): + def insert(self, signature): + "Add a new SourmashSignature in to the SBT." + from .sbtmh import SigLeaf + + leaf = SigLeaf(signature.name(), signature) + self.add_node(leaf) + + def add_node(self, node): pos = self.new_node_pos(node) if pos == 0: # empty tree; initialize w/node. @@ -212,10 +220,6 @@ def insert(self, node): node.update(self._nodes[p.pos]) p = self.parent(p.pos) - @deprecated(details="Use the insert method instead") - def add_node(self, node): - self.insert(node) - def find(self, search_fn, *args, **kwargs): "Search the tree using `search_fn`." 
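For context, a minimal sketch of how the reworked SBT API is meant to be used after this change. This is an illustration only, assuming the sourmash layout in this PR; 'query.sig' is a hypothetical signature file, and both entry points are shown side by side purely to contrast them.

    # Sketch only: mirrors the insert()/add_node() split and the signatures()
    # iterator introduced in the sbt.py hunk above. 'query.sig' is hypothetical.
    from sourmash import load_one_signature
    from sourmash.sbt import SBT, GraphFactory
    from sourmash.sbtmh import SigLeaf, search_minhashes

    factory = GraphFactory(31, 1e5, 4)   # ksize, starting_size, n_tables
    tree = SBT(factory, d=2)

    sig = load_one_signature('query.sig')

    # New Index-style call: insert() takes a SourmashSignature and wraps it
    # in a SigLeaf (named by sig.name()) before delegating to add_node().
    tree.insert(sig)

    # The old leaf-level entry point remains available as add_node()
    # (shown here only for contrast; normally you'd use one or the other).
    tree.add_node(SigLeaf(sig.md5sum(), sig))

    # signatures() now yields the leaf data, so Index-style iteration and
    # the existing tree search both work on plain signatures.
    matches = list(tree.find(search_minhashes, sig, 0.1))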
diff --git a/tests/test_index.py b/tests/test_index.py index 923239d74..cfcc5c976 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -36,11 +36,11 @@ def test_simple_index(n_children): leaf5.data.count("AAAAT") leaf5.data.count("GAAAA") - root.insert(leaf1) - root.insert(leaf2) - root.insert(leaf3) - root.insert(leaf4) - root.insert(leaf5) + root.add_node(leaf1) + root.add_node(leaf2) + root.add_node(leaf3) + root.add_node(leaf4) + root.add_node(leaf5) def search_kmer(obj, seq): return obj.data.get(seq) diff --git a/tests/test_lca.py b/tests/test_lca.py index fb63d63ad..f5ac6cb8d 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -133,6 +133,34 @@ def test_db_repr(): assert repr(db) == "LCA_Database('{}')".format(filename) +def test_lca_index_signatures_method(): + # test 'signatures' method from base class Index + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + siglist = list(db.signatures()) + assert len(siglist) == 2 + +def test_lca_index_insert_method(): + # test 'signatures' method from base class Index + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + sig = next(iter(db.signatures())) + + with pytest.raises(NotImplementedError) as e: + db.insert(sig) + +def test_lca_index_find_method(): + # test 'signatures' method from base class Index + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + sig = next(iter(db.signatures())) + + with pytest.raises(NotImplementedError) as e: + db.find(None) + ## command line tests diff --git a/tests/test_sbt.py b/tests/test_sbt.py index bef6e7c6e..3f2dfd51c 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -4,12 +4,12 @@ import pytest -from sourmash import signature +from sourmash import load_one_signature from sourmash.sbt import SBT, GraphFactory, Leaf, Node from sourmash.sbtmh import (SigLeaf, search_minhashes, - search_minhashes_containment) + search_minhashes_containment) from sourmash.sbt_storage import (FSStorage, TarStorage, - RedisStorage, IPFSStorage) + RedisStorage, IPFSStorage) from . 
import sourmash_tst_utils as utils @@ -43,11 +43,11 @@ def test_simple(n_children): leaf5.data.count('AAAAT') leaf5.data.count('GAAAA') - root.insert(leaf1) - root.insert(leaf2) - root.insert(leaf3) - root.insert(leaf4) - root.insert(leaf5) + root.add_node(leaf1) + root.add_node(leaf2) + root.add_node(leaf3) + root.add_node(leaf4) + root.add_node(leaf5) def search_kmer(obj, seq): return obj.data.get(seq) @@ -104,11 +104,11 @@ def test_longer_search(n_children): leaf5.data.count('AAAAT') leaf5.data.count('GAAAA') - root.insert(leaf1) - root.insert(leaf2) - root.insert(leaf3) - root.insert(leaf4) - root.insert(leaf5) + root.add_node(leaf1) + root.add_node(leaf2) + root.add_node(leaf3) + root.add_node(leaf4) + root.add_node(leaf5) def kmers(k, seq): for start in range(len(seq) - k + 1): @@ -138,7 +138,7 @@ def test_tree_v1_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v1 = {str(s) for s in tree_v1.find(search_minhashes_containment, to_search, 0.1)} @@ -157,7 +157,7 @@ def test_tree_v2_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -176,7 +176,7 @@ def test_tree_v3_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -195,7 +195,7 @@ def test_tree_v5_load(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment, to_search, 0.1)} @@ -211,9 +211,9 @@ def test_tree_save_load(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -241,7 +241,7 @@ def test_tree_save_load_v5(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) tree.add_node(leaf) to_search = leaf @@ -272,9 +272,9 @@ def test_search_minhashes(): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = next(iter(tree.leaves())) @@ -295,10 +295,10 @@ def test_binary_nary_tree(): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) for tree in trees.values(): - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf n_leaves += 1 @@ -323,13 +323,13 @@ def test_sbt_combine(n_children): n_leaves = 0 for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = 
load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) if n_leaves < 4: - tree_1.insert(leaf) + tree_1.add_node(leaf) else: - tree_2.insert(leaf) + tree_2.add_node(leaf) n_leaves += 1 tree_1.combine(tree_2) @@ -341,8 +341,7 @@ def test_sbt_combine(n_children): assert len(t_leaves) == len(t1_leaves) assert t1_leaves == t_leaves - to_search = next(signature.load_signatures( - utils.get_test_data(utils.SIG_FILES[0]))) + to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0])) t1_result = {str(s) for s in tree_1.find(search_minhashes, to_search, 0.1)} tree_result = {str(s) for s in tree.find(search_minhashes, @@ -360,7 +359,7 @@ def test_sbt_combine(n_children): if not next_empty: next_empty = n + 1 - tree_1.insert(leaf) + tree_1.add_node(SigLeaf(to_search.name(), to_search)) assert tree_1.next_node == next_empty @@ -370,9 +369,10 @@ def test_sbt_fsstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -403,9 +403,10 @@ def test_sbt_tarstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -439,9 +440,10 @@ def test_sbt_ipfsstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -477,9 +479,10 @@ def test_sbt_redisstorage(): tree = SBT(factory) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) + leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -516,7 +519,7 @@ def test_tree_repair(): leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) - to_search = next(signature.load_signatures(testdata1)) + to_search = load_one_signature(testdata1) results_repair = {str(s) for s in tree_repair.find(search_minhashes, to_search, 0.1)} @@ -532,9 +535,9 @@ def test_tree_repair_insert(): leaf_loader=SigLeaf.load) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) - tree_repair.insert(leaf) + tree_repair.add_node(leaf) for pos, node in tree_repair: # Every parent of a node must be an internal node (and not a leaf), @@ -552,9 +555,9 @@ def test_save_sparseness(n_children): tree = SBT(factory, d=n_children) for f in utils.SIG_FILES: - sig = next(signature.load_signatures(utils.get_test_data(f))) + sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) - tree.insert(leaf) + tree.add_node(leaf) to_search = leaf print('*' * 60) @@ -586,3 +589,21 @@ def test_save_sparseness(n_children): # Leaf nodes can't have children if isinstance(node, Leaf): assert all(c.node is None for c in tree_loaded.children(pos)) + + +def test_sbt_as_index_signatures(): + # test 'signatures' method from Index base 
class. + factory = GraphFactory(31, 1e5, 4) + tree = SBT(factory, d=2) + + sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) + sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + + tree.insert(sig47) + tree.insert(sig63) + + xx = list(tree.signatures()) + assert len(xx) == 2 + + assert sig47 in xx + assert sig63 in xx From 689dcf55f8e39cfbef8bbd9fec9c5174536ed4e0 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 14 Dec 2019 21:33:26 -0800 Subject: [PATCH 37/37] add more scaled relationship tests in lca DB --- sourmash/lca/lca_utils.py | 8 ++++++- sourmash/sourmash_args.py | 1 - tests/test_lca.py | 50 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 7dceb82c6..3c5530eac 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -274,7 +274,7 @@ def search(self, query, *args, **kwargs): raise TypeError("'search' requires 'threshold'") threshold = kwargs['threshold'] do_containment = kwargs.get('do_containment', False) - ignore_abundance = kwargs.get('ignore_abundance') + ignore_abundance = kwargs.get('ignore_abundance', True) if not ignore_abundance: raise TypeError("'search' on LCA databases does not use abundance") @@ -306,6 +306,10 @@ def downsample_scaled(self, scaled): """ Downsample to the provided scaled value, i.e. eliminate all hashes that don't fall in the required range. + + NOTE: we probably need to invalidate some of the dynamically + calculated members of this object, like _signatures, when we do this. + But we aren't going to right now. """ if scaled == self.scaled: return @@ -362,7 +366,9 @@ def find_signatures(self, minhash, threshold, containment=False, if self.scaled > minhash.scaled: minhash = minhash.downsample_scaled(self.scaled) elif self.scaled < minhash.scaled and not ignore_scaled: + # note that containment can be calculated w/o matching scaled. 
raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) + pass self._create_signatures() diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py index ad90f9ff2..542559a9a 100644 --- a/sourmash/sourmash_args.py +++ b/sourmash/sourmash_args.py @@ -352,7 +352,6 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, traverse=False): assert query_ksize == lca_db.ksize query_scaled = query.minhash.scaled - assert query_scaled and query_scaled <= lca_db.scaled notify('loaded LCA {}', sbt_or_sigfile, end='\r') n_databases += 1 diff --git a/tests/test_lca.py b/tests/test_lca.py index f5ac6cb8d..0c8c4c376 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -141,6 +141,7 @@ def test_lca_index_signatures_method(): siglist = list(db.signatures()) assert len(siglist) == 2 + def test_lca_index_insert_method(): # test 'signatures' method from base class Index filename = utils.get_test_data('lca/47+63.lca.json') @@ -151,6 +152,7 @@ def test_lca_index_insert_method(): with pytest.raises(NotImplementedError) as e: db.insert(sig) + def test_lca_index_find_method(): # test 'signatures' method from base class Index filename = utils.get_test_data('lca/47+63.lca.json') @@ -161,6 +163,54 @@ def test_lca_index_find_method(): with pytest.raises(NotImplementedError) as e: db.find(None) + +def test_search_db_scaled_gt_sig_scaled(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + + results = db.search(sig, threshold=.01, ignore_abundance=True) + match_sig = results[0][1] + + sig.minhash = sig.minhash.downsample_scaled(10000) + assert sig.minhash == match_sig.minhash + + +def test_search_db_scaled_lt_sig_scaled(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + sig.minhash = sig.minhash.downsample_scaled(100000) + + with pytest.raises(ValueError) as e: + results = db.search(sig, threshold=.01, ignore_abundance=True) + + +def test_gather_db_scaled_gt_sig_scaled(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + + results = db.gather(sig, threshold=.01, ignore_abundance=True) + match_sig = results[0][1] + + sig.minhash = sig.minhash.downsample_scaled(10000) + assert sig.minhash == match_sig.minhash + + +def test_gather_db_scaled_lt_sig_scaled(): + dbfile = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(dbfile) + sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + sig.minhash = sig.minhash.downsample_scaled(100000) + + results = db.gather(sig, threshold=.01, ignore_abundance=True) + match_sig = results[0][1] + + match_sig.minhash = match_sig.minhash.downsample_scaled(100000) + assert sig.minhash == match_sig.minhash + + ## command line tests
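For context, a minimal sketch of the scaled relationships these new tests pin down, drawn directly from the tests above. It assumes the sourmash test data ('lca/47+63.lca.json', '47.fa.sig') and that the test helper module sourmash_tst_utils is importable; outside the test suite those are assumptions, not guarantees.

    # Sketch only, mirroring tests/test_lca.py above; paths come from the
    # sourmash test data and sourmash_tst_utils is assumed to be importable.
    import sourmash
    from sourmash.lca import lca_utils
    import sourmash_tst_utils as utils

    db, ksize, scaled = lca_utils.load_single_database(
        utils.get_test_data('lca/47+63.lca.json'))
    sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'))

    # Query scaled below the DB scaled: search() downsamples the query
    # internally (see find_signatures above) and returns matches.
    results = db.search(sig, threshold=0.01, ignore_abundance=True)

    # Query scaled above the DB scaled: search() raises ValueError, while
    # gather() passes ignore_scaled=True, so containment still succeeds.
    sig.minhash = sig.minhash.downsample_scaled(100000)
    results = db.gather(sig, threshold=0.01, ignore_abundance=True)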