Skip to content

Commit

Permalink
Rework the find functionality for Index classes (#1392)
Browse files Browse the repository at this point in the history
* have the 'find' function for SBTs return signatures
* fix majority of tests
* split find and _find_nodes to take different kinds of functions
* redo 'find' on index
* refactor lca_db to use new find
* refactor SBT to use new find
* refactor out common code
* use 'passes' properly
* adjust tree downsampling for regular minhashes, too
* remove now-unused search functions in sbtmh
* refactor categorize to use new find
* fix jaccard calculation in sbt
* check for compatibility of search fn and query signature
* switch tests over to jaccard similarity, not containment
* remove test for unimplemented LCA_Database.find method
* document threshold change; update test
* refuse to run abund signatures
* flatten sigs internally for gather
* reinflate abundances for saving
* fix problem where sbt indices could be created with abund signatures
* split flat and abund search
* make ignore_abundance work again for categorize
* turn off best-only, since it triggers on self-hits.
* add test: 'sourmash index' flattens sigs
* location is now a property
* move search code into search.py
* remove redundant scaled checking code
* best-only now works properly for two tests
* 'fix' tests by removing v1 and v2 SBT compatibility
* simplify downsampling code
* require keyword args in MinHash.downsample(...)
* fix test to use proper downsampling, reverse order to match scaled
* flatten subject MinHash, too
* add IndexSearchResult namedtuple for search and gather results
* add more tests for Index classes
* add tests for subj & query num downsampling
* tests for Index.search_abund
* refactor make_jaccard_search_query; start tests
* test collect, best_only
* deal with status == None on SystemExit
* upgrade and simplify categorize
* fix abundance search in SBT for categorize
* add explicit test for incompatible num
* add simple tests for SBT load and search API
* allow arbitrary kwargs for LCA_Database.find
* add testing of passthru-kwargs
* docstring updates
* better tests for gather --save-unassigned
* SBT search doesn't work on v1 and v2 SBTs b/c no min_n_below
* add intersection_and_union_size method to MinHash
* make flatten a no-op if track_abundance=False
* intersection_union_size in the FFI

Co-authored-by: Luiz Irber <luiz.irber@gmail.com>
  • Loading branch information
ctb and luizirber committed Apr 22, 2021
1 parent eb2b210 commit f02e250
Show file tree
Hide file tree
Showing 18 changed files with 1,419 additions and 659 deletions.
7 changes: 6 additions & 1 deletion include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,12 @@ void kmerminhash_hash_function_set(SourmashKmerMinHash *ptr, HashFunctions hash_

bool kmerminhash_hp(const SourmashKmerMinHash *ptr);

uint64_t kmerminhash_intersection(const SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other);
SourmashKmerMinHash *kmerminhash_intersection(const SourmashKmerMinHash *ptr,
const SourmashKmerMinHash *other);

uint64_t kmerminhash_intersection_union_size(const SourmashKmerMinHash *ptr,
const SourmashKmerMinHash *other,
uint64_t *union_size);

bool kmerminhash_is_compatible(const SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other);

Expand Down
21 changes: 19 additions & 2 deletions src/core/src/ffi/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -377,14 +377,31 @@ unsafe fn kmerminhash_count_common(ptr: *const SourmashKmerMinHash, other: *cons

ffi_fn! {
unsafe fn kmerminhash_intersection(ptr: *const SourmashKmerMinHash, other: *const SourmashKmerMinHash)
-> Result<*mut SourmashKmerMinHash> {
let mh = SourmashKmerMinHash::as_rust(ptr);
let other_mh = SourmashKmerMinHash::as_rust(other);

let isect = mh.intersection(other_mh)?;
let mut new_mh = mh.clone();
new_mh.clear();
new_mh.add_many(&isect.0)?;

Ok(SourmashKmerMinHash::from_rust(new_mh))
}
}

ffi_fn! {
unsafe fn kmerminhash_intersection_union_size(ptr: *const SourmashKmerMinHash, other: *const SourmashKmerMinHash, union_size: *mut u64)
-> Result<u64> {
let mh = SourmashKmerMinHash::as_rust(ptr);
let other_mh = SourmashKmerMinHash::as_rust(other);

if let Ok((_, size)) = mh.intersection_size(other_mh) {
return Ok(size);
if let Ok((common, union_s)) = mh.intersection_size(other_mh) {
*union_size = union_s;
return Ok(common);
}

*union_size = 0;
Ok(0)
}
}
Expand Down
1 change: 1 addition & 0 deletions src/core/src/sketch/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,7 @@ impl KmerMinHash {
}

// FIXME: intersection_size and count_common should be the same?
// (for scaled minhashes)
pub fn intersection_size(&self, other: &KmerMinHash) -> Result<(u64, u64), Error> {
self.check_compatible(other)?;

Expand Down
4 changes: 2 additions & 2 deletions src/sourmash/cli/categorize.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@

def subparser(subparsers):
subparser = subparsers.add_parser('categorize')
subparser.add_argument('sbt_name', help='name of SBT to load')
subparser.add_argument('database', help='location of signature collection/database to load')
subparser.add_argument(
'queries', nargs='+',
help='list of signatures to categorize'
help='locations of signatures to categorize'
)
subparser.add_argument(
'-q', '--quiet', action='store_true',
Expand Down
127 changes: 82 additions & 45 deletions src/sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import os.path
import sys
import copy

import screed
from .compare import (compare_all_pairs, compare_serial_containment,
Expand All @@ -14,11 +15,8 @@
from . import signature as sig
from . import sourmash_args
from .logging import notify, error, print_results, set_quiet
from .sbtmh import SearchMinHashesFindBest, SigLeaf

from .sourmash_args import DEFAULT_LOAD_K, FileOutput, FileOutputCSV

DEFAULT_N = 500
WATERMARK_SIZE = 10000

from .command_compute import compute
Expand Down Expand Up @@ -385,6 +383,8 @@ def index(args):

if args.scaled:
ss.minhash = ss.minhash.downsample(scaled=args.scaled)
if ss.minhash.track_abundance:
ss.minhash = ss.minhash.flatten()
scaleds.add(ss.minhash.scaled)

tree.insert(ss)
Expand Down Expand Up @@ -422,7 +422,8 @@ def index(args):


def search(args):
from .search import search_databases
from .search import (search_databases_with_flat_query,
search_databases_with_abund_query)

set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)
Expand Down Expand Up @@ -457,22 +458,32 @@ def search(args):
databases = sourmash_args.load_dbs_and_sigs(args.databases, query,
not is_containment)

# forcibly ignore abundances if query has no abundances
if not query.minhash.track_abundance:
args.ignore_abundance = True

if not len(databases):
error('Nothing found to search!')
sys.exit(-1)

# forcibly ignore abundances if query has no abundances
if not query.minhash.track_abundance:
args.ignore_abundance = True
else:
if args.ignore_abundance:
query.minhash = query.minhash.flatten()

# do the actual search
results = search_databases(query, databases,
threshold=args.threshold,
do_containment=args.containment,
do_max_containment=args.max_containment,
best_only=args.best_only,
ignore_abundance=args.ignore_abundance,
unload_data=True)
if query.minhash.track_abundance:
results = search_databases_with_abund_query(query, databases,
threshold=args.threshold,
do_containment=args.containment,
do_max_containment=args.max_containment,
best_only=args.best_only,
unload_data=True)
else:
results = search_databases_with_flat_query(query, databases,
threshold=args.threshold,
do_containment=args.containment,
do_max_containment=args.max_containment,
best_only=args.best_only,
unload_data=True)

n_matches = len(results)
if args.best_only:
Expand Down Expand Up @@ -520,6 +531,7 @@ def search(args):
def categorize(args):
"Use a database to find the best match to many signatures."
from .index import MultiIndex
from .search import make_jaccard_search_query

set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)
Expand All @@ -533,7 +545,9 @@ def categorize(args):
already_names.add(row[0])

# load search database
tree = load_sbt_index(args.sbt_name)
db = sourmash_args.load_file_as_index(args.database)
if args.ksize or moltype:
db = db.select(ksize=args.ksize, moltype=moltype)

# utility function to load & select relevant signatures.
def _yield_all_sigs(queries, ksize, moltype):
Expand All @@ -549,40 +563,44 @@ def _yield_all_sigs(queries, ksize, moltype):
csv_fp = open(args.csv, 'w', newline='')
csv_w = csv.writer(csv_fp)

for query, loc in _yield_all_sigs(args.queries, args.ksize, moltype):
search_obj = make_jaccard_search_query(threshold=args.threshold)
for orig_query, loc in _yield_all_sigs(args.queries, args.ksize, moltype):
# skip if we've already done signatures from this file.
if loc in already_names:
continue

notify('loaded query: {}... (k={}, {})', str(query)[:30],
query.minhash.ksize, query.minhash.moltype)
notify('loaded query: {}... (k={}, {})', str(orig_query)[:30],
orig_query.minhash.ksize, orig_query.minhash.moltype)

results = []
search_fn = SearchMinHashesFindBest().search
if args.ignore_abundance:
query = copy.copy(orig_query)
query.minhash = query.minhash.flatten()
else:
if orig_query.minhash.track_abundance:
notify("ERROR: this search cannot be done on signatures calculated with abundance.")
notify("ERROR: please specify --ignore-abundance.")
sys.exit(-1)

# note, "ignore self" here may prevent using newer 'tree.search' fn.
for leaf in tree.find(search_fn, query, args.threshold):
if leaf.data.md5sum() != query.md5sum(): # ignore self.
similarity = query.similarity(
leaf.data, ignore_abundance=args.ignore_abundance)
results.append((similarity, leaf.data))
query = orig_query

results = []
for match, score in db.find(search_obj, query):
if match.md5sum() != query.md5sum(): # ignore self.
results.append((orig_query.similarity(match), match))

best_hit_sim = 0.0
best_hit_query_name = ""
if results:
results.sort(key=lambda x: -x[0]) # reverse sort on similarity
best_hit_sim, best_hit_query = results[0]
notify('for {}, found: {:.2f} {}', query,
best_hit_sim,
best_hit_query)
best_hit_query_name = best_hit_query.name
if csv_w:
csv_w.writerow([loc, query, best_hit_query_name,
best_hit_sim])
else:
notify('for {}, no match found', query)

if csv_w:
csv_w.writerow([loc, query, best_hit_query_name,
best_hit_sim])

if csv_fp:
csv_fp.close()

Expand Down Expand Up @@ -631,12 +649,14 @@ def gather(args):

found = []
weighted_missed = 1
is_abundance = query.minhash.track_abundance and not args.ignore_abundance
orig_query_mh = query.minhash
new_max_hash = query.minhash._max_hash
next_query = query

for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
if not len(found): # first result? print header.
if query.minhash.track_abundance and not args.ignore_abundance:
if is_abundance:
print_results("")
print_results("overlap p_query p_match avg_abund")
print_results("--------- ------- ------- ---------")
Expand All @@ -651,7 +671,7 @@ def gather(args):
pct_genome = '{:.1f}%'.format(result.f_match*100)
name = result.match._display_name(40)

if query.minhash.track_abundance and not args.ignore_abundance:
if is_abundance:
average_abund ='{:.1f}'.format(result.average_abund)
print_results('{:9} {:>7} {:>7} {:>9} {}',
format_bp(result.intersect_bp), pct_query, pct_genome,
Expand All @@ -666,16 +686,16 @@ def gather(args):
break


# basic reporting
print_results('\nfound {} matches total;', len(found))
# basic reporting:
print_results(f'\nfound {len(found)} matches total;')
if args.num_results and len(found) == args.num_results:
print_results('(truncated gather because --num-results={})',
args.num_results)
print_results(f'(truncated gather because --num-results={args.num_results})')

print_results('the recovered matches hit {:.1f}% of the query',
(1 - weighted_missed) * 100)
p_covered = (1 - weighted_missed) * 100
print_results(f'the recovered matches hit {p_covered:.1f}% of the query')
print_results('')

# save CSV?
if found and args.output:
fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
'f_unique_to_query', 'f_unique_weighted',
Expand All @@ -691,19 +711,34 @@ def gather(args):
del d['match'] # actual signature not in CSV.
w.writerow(d)

# save matching signatures?
if found and args.save_matches:
notify('saving all matches to "{}"', args.save_matches)
notify(f"saving all matches to '{args.save_matches}'")
with FileOutput(args.save_matches, 'wt') as fp:
sig.save_signatures([ r.match for r in found ], fp)

# save unassigned hashes?
if args.output_unassigned:
if not len(next_query.minhash):
notify('no unassigned hashes to save with --output-unassigned!')
else:
notify('saving unassigned hashes to "{}"', args.output_unassigned)
notify(f"saving unassigned hashes to '{args.output_unassigned}'")

if is_abundance:
# next_query is flattened; reinflate abundances
hashes = set(next_query.minhash.hashes)
orig_abunds = orig_query_mh.hashes
abunds = { h: orig_abunds[h] for h in hashes }

abund_query_mh = orig_query_mh.copy_and_clear()
# orig_query might have been downsampled...
abund_query_mh.downsample(scaled=next_query.minhash.scaled)
abund_query_mh.set_abundances(abunds)
next_query.minhash = abund_query_mh

with FileOutput(args.output_unassigned, 'wt') as fp:
sig.save_signatures([ next_query ], fp)
# DONE w/gather function.


def multigather(args):
Expand Down Expand Up @@ -765,9 +800,10 @@ def multigather(args):

found = []
weighted_missed = 1
is_abundance = query.minhash.track_abundance and not args.ignore_abundance
for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
if not len(found): # first result? print header.
if query.minhash.track_abundance and not args.ignore_abundance:
if is_abundance:
print_results("")
print_results("overlap p_query p_match avg_abund")
print_results("--------- ------- ------- ---------")
Expand All @@ -782,7 +818,7 @@ def multigather(args):
pct_genome = '{:.1f}%'.format(result.f_match*100)
name = result.match._display_name(40)

if query.minhash.track_abundance and not args.ignore_abundance:
if is_abundance:
average_abund ='{:.1f}'.format(result.average_abund)
print_results('{:9} {:>7} {:>7} {:>9} {}',
format_bp(result.intersect_bp), pct_query, pct_genome,
Expand Down Expand Up @@ -844,6 +880,7 @@ def multigather(args):

e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash)
e.add_many(next_query.minhash.hashes)
# CTB: note, multigather does not save abundances
sig.save_signatures([ sig.SourmashSignature(e) ], fp)
n += 1

Expand Down

0 comments on commit f02e250

Please sign in to comment.