[WIP] Refactor LCA database to contain all the information. (#537)
* a trial refactoring of the lca db

* save and load seem to basically work

* got search & gather on LCA working in new framework

* allow key error

* keep identities with no lineage assignment

* add multigather

* Hyperlink DOIs to preferred resolver (#562)

* fix divide by zero issue in MinHash.contained_by (#572)

* fix divide by zero issue in contained_by

* remove unused lineages and identifiers

* update report output

* majority of lca tests passing now!

* fix gather?

* ...all tests pass?
ctb committed Dec 12, 2018
1 parent f7a2a54 commit 23e33b3
Showing 18 changed files with 460 additions and 201 deletions.
README.md: 1 addition, 1 deletion
@@ -17,7 +17,7 @@ We have demo notebooks on binder that you can interact with:

[![Binder](http://mybinder.org/badge.svg)](http://mybinder.org/repo/dib-lab/sourmash)

-Sourmash is [published on JOSS](http://dx.doi.org/10.21105/joss.00027).
+Sourmash is [published on JOSS](https://doi.org/10.21105/joss.00027).

----

paper.bib: 1 addition, 1 deletion
@@ -6,5 +6,5 @@ @article{ondov2015fast
year={2015},
publisher={Cold Spring Harbor Labs Journals},
doi={10.1101/029827},
-  url={http://dx.doi.org/10.1101/029827}
+  url={https://doi.org/10.1101/029827}
}
sourmash/__main__.py: 3 additions, 2 deletions
@@ -9,7 +9,7 @@

from .commands import (categorize, compare, compute, dump, import_csv,
gather, index, sbt_combine, search,
-                       plot, watch, info, storage, migrate)
+                       plot, watch, info, storage, migrate, multigather)
from .lca import main as lca_main

usage='''
@@ -58,7 +58,8 @@ def main():
'sbt_combine': sbt_combine, 'info': info,
'storage': storage,
'lca': lca_main,
-                'migrate': migrate}
+                'migrate': migrate,
+                'multigather': multigather}
parser = argparse.ArgumentParser(
description='work with compressed sequence representations')
parser.add_argument('command', nargs='?')
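For readers following the wiring: main() looks the first positional token up in this commands dict and hands the remaining argv to the matching handler, which then runs its own argparse pass (as multigather does below). A minimal sketch of that dispatch pattern, not taken from the sourmash source:

import sys

def multigather(argv):
    # stand-in handler; the real one parses argv with argparse
    print('multigather got:', argv)

commands = {'multigather': multigather}

def main():
    # first CLI token selects the handler; the rest is passed through
    cmd = sys.argv[1] if len(sys.argv) > 1 else None
    if cmd not in commands:
        print('unknown command:', cmd)
        return 1
    commands[cmd](sys.argv[2:])
    return 0

if __name__ == '__main__':
    sys.exit(main())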
sourmash/_minhash.pyx: 2 additions, 0 deletions
@@ -364,6 +364,8 @@ cdef class MinHash(object):
"""\
Calculate how much of self is contained by other.
"""
+        if not len(self):
+            return 0.0
return self.count_common(other) / len(self.get_mins())

def similarity_ignore_maxhash(self, MinHash other):
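The two added lines matter because contained_by divides by the number of hashes in self, so an empty sketch previously raised ZeroDivisionError. A small illustration of the intended behavior, with hypothetical values; it assumes the sourmash 2.x MinHash(n, ksize) constructor and add_hash:

from sourmash import MinHash

a = MinHash(10, 21)            # empty sketch: no hashes added yet
b = MinHash(10, 21)
b.add_hash(42)

# previously ZeroDivisionError (len(a) == 0); with the fix, an empty
# sketch reports zero containment.
assert a.contained_by(b) == 0.0

a.add_hash(42)                 # a's only hash also appears in b
assert a.contained_by(b) == 1.0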
sourmash/commands.py: 159 additions, 0 deletions
@@ -1109,6 +1109,165 @@ def gather(args):
args.output_unassigned)


def multigather(args):
from .search import gather_databases, format_bp

parser = argparse.ArgumentParser()
parser.add_argument('--db', nargs='+', action='append')
parser.add_argument('--query', nargs='+', action='append')
parser.add_argument('--traverse-directory', action='store_true',
help='search all signatures underneath directories.')
parser.add_argument('--threshold-bp', type=float, default=5e4,
help='threshold (in bp) for reporting results')
parser.add_argument('--scaled', type=float, default=0,
help='downsample query to this scaled factor')
parser.add_argument('-q', '--quiet', action='store_true',
help='suppress non-error output')
parser.add_argument('--ignore-abundance', action='store_true',
help='do NOT use k-mer abundances if present')
parser.add_argument('-d', '--debug', action='store_true')

sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
sourmash_args.add_moltype_args(parser)

args = parser.parse_args(args)
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)

if not args.db:
error('Error! must specify at least one database with --db')
sys.exit(-1)

if not args.query:
error('Error! must specify at least one query signature with --query')
sys.exit(-1)

# flatten --db and --query
args.db = [item for sublist in args.db for item in sublist]
args.query = [item for sublist in args.query for item in sublist]

query = sourmash_args.load_query_signature(args.query[0],
ksize=args.ksize,
select_moltype=moltype)
# set up the search databases
databases = sourmash_args.load_dbs_and_sigs(args.db, query, False,
args.traverse_directory)

if not len(databases):
error('Nothing found to search!')
sys.exit(-1)

# run gather on all the queries.
for queryfile in args.query:
# load the query signature & figure out all the things
query = sourmash_args.load_query_signature(queryfile,
ksize=args.ksize,
select_moltype=moltype)
notify('loaded query: {}... (k={}, {})', query.name()[:30],
query.minhash.ksize,
sourmash_args.get_moltype(query))

# verify signature was computed right.
if query.minhash.max_hash == 0:
error('query signature needs to be created with --scaled; skipping')
continue

# downsample if requested
if args.scaled:
notify('downsampling query from scaled={} to {}',
query.minhash.scaled, int(args.scaled))
query.minhash = query.minhash.downsample_scaled(args.scaled)

# empty?
if not query.minhash.get_mins():
error('no query hashes!? skipping to next..')
continue

found = []
weighted_missed = 1
for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
if not len(found): # first result? print header.
if query.minhash.track_abundance and not args.ignore_abundance:
print_results("")
print_results("overlap p_query p_match avg_abund")
print_results("--------- ------- ------- ---------")
else:
print_results("")
print_results("overlap p_query p_match")
print_results("--------- ------- -------")


# print interim result & save in a list for later use
pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
pct_genome = '{:.1f}%'.format(result.f_match*100)
            average_abund = '{:.1f}'.format(result.average_abund)
name = result.leaf._display_name(40)

if query.minhash.track_abundance and not args.ignore_abundance:
print_results('{:9} {:>7} {:>7} {:>9} {}',
format_bp(result.intersect_bp), pct_query, pct_genome,
average_abund, name)
else:
print_results('{:9} {:>7} {:>7} {}',
format_bp(result.intersect_bp), pct_query, pct_genome,
name)
found.append(result)


# basic reporting
print_results('\nfound {} matches total;', len(found))

print_results('the recovered matches hit {:.1f}% of the query',
(1 - weighted_missed) * 100)
print_results('')

if not found:
notify('nothing found... skipping.')
continue

output_base = os.path.basename(queryfile)
output_csv = output_base + '.csv'

fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
'f_unique_to_query', 'f_unique_weighted',
'average_abund', 'median_abund', 'std_abund', 'name', 'filename', 'md5']
with open(output_csv, 'wt') as fp:
w = csv.DictWriter(fp, fieldnames=fieldnames)
w.writeheader()
for result in found:
d = dict(result._asdict())
del d['leaf'] # actual signature not in CSV.
w.writerow(d)

output_matches = output_base + '.matches.sig'
with open(output_matches, 'wt') as fp:
outname = output_matches
notify('saving all matches to "{}"', outname)
sig.save_signatures([ r.leaf for r in found ], fp)

output_unassigned = output_base + '.unassigned.sig'
with open(output_unassigned, 'wt') as fp:
            if not found:
                notify('nothing found - entire query signature unassigned.')
            elif not next_query.minhash.get_mins():
                # nothing left over after gather: every hash was assigned
                notify('no unassigned hashes! not saving.')
            else:
                notify('saving unassigned hashes to "{}"', output_unassigned)

e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash)
e.add_many(next_query.minhash.get_mins())
sig.save_signatures([ sig.SourmashSignature(e) ], fp)

# fini, next query!


def watch(args):
"Build a signature from raw FASTA/FASTQ coming in on stdin, search."

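One non-obvious detail above: because --db and --query are declared with nargs='+' and action='append', argparse produces a list of lists (one sublist per flag occurrence), which is why multigather flattens both before use. A self-contained demonstration of that argparse behavior (file names hypothetical):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--db', nargs='+', action='append')
parser.add_argument('--query', nargs='+', action='append')

# e.g.: sourmash multigather --query a.sig b.sig --query c.sig --db db1.lca.json
args = parser.parse_args(['--query', 'a.sig', 'b.sig',
                          '--query', 'c.sig',
                          '--db', 'db1.lca.json'])
assert args.query == [['a.sig', 'b.sig'], ['c.sig']]   # list of lists
assert args.db == [['db1.lca.json']]

# flatten, exactly as multigather does
args.query = [item for sublist in args.query for item in sublist]
assert args.query == ['a.sig', 'b.sig', 'c.sig']

Each query then yields three outputs named after its basename: <query>.csv, <query>.matches.sig, and <query>.unassigned.sig.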
sourmash/lca/command_gather.py: 41 additions, 32 deletions
@@ -82,70 +82,69 @@ def gather_signature(query_sig, dblist, ignore_abundance):
md5_to_name = {}

x = set()
-    for hashval in query_mins:
-        for lca_db in dblist:
-            lineage_ids = lca_db.hashval_to_lineage_id.get(hashval, [])
-            for lid in lineage_ids:
-                md5 = lca_db.lineage_id_to_signature[lid]
-                x.add((lca_db, lid, md5))
-
-    for lca_db, lid, md5 in x:
-        md5_to_lineage[md5] = lca_db.lineage_dict[lid]
-        if lca_db.signature_to_name:
-            md5_to_name[md5] = lca_db.signature_to_name[md5]
-        else:
-            md5_to_name[md5] = ''
+    for lca_db in dblist:
+        siglist = []
+        idx_set = set()
+
+        for hashval in query_mins:
+            idx_set.update(lca_db.hashval_to_idx.get(hashval, []))
+
+        for idx in idx_set:
+            pass               # WIP: loop body not yet filled in

# now! do the gather:
while 1:
# find all of the assignments for the current set of hashes
assignments = defaultdict(set)
for hashval in query_mins:
for lca_db in dblist:
-                lineage_ids = lca_db.hashval_to_lineage_id.get(hashval, [])
-                for lid in lineage_ids:
-                    md5 = lca_db.lineage_id_to_signature[lid]
-                    signature_size = lca_db.lineage_id_counts[lid]
-                    assignments[hashval].add((md5, signature_size))
+                idx_list = lca_db.hashval_to_idx.get(hashval, [])
+
+                for idx in idx_list:
+                    assignments[hashval].add((lca_db, idx))

# none? quit.
if not assignments:
break

# count the distinct signatures.
counts = Counter()
-        for hashval, md5_set in assignments.items():
-            for (md5, sigsize) in md5_set:
-                counts[(md5, sigsize)] += 1
+        for hashval, match_set in assignments.items():
+            for (lca_db, idx) in match_set:
+                counts[(lca_db, idx)] += 1

# collect the most abundant assignments
common_iter = iter(counts.most_common())
best_list = []
-        (md5, sigsize), top_count = next(common_iter)
+        (best_lca_db, best_idx), top_count = next(common_iter)

-        best_list.append((md5_to_name[md5], md5, sigsize))
-        for (md5, sigsize), count in common_iter:
+        best_list.append((best_lca_db, best_idx))
+        for (lca_db, idx), count in common_iter:
            if count != top_count:
                break
-            best_list.append((md5_to_name[md5], md5, sigsize))
+            best_list.append((lca_db, idx))

# sort on name and pick the top (for consistency).
-        best_list.sort()
-        _, top_md5, top_sigsize = best_list[0]
+        #best_list.sort()
+        #_, top_md5, top_sigsize = best_list[0]

equiv_counts = len(best_list) - 1

# now, remove from query mins.
intersect_mins = set()
-        for hashval, md5_set in assignments.items():
-            if (top_md5, top_sigsize) in md5_set:
+        for hashval, match_set in assignments.items():
+            if (best_lca_db, best_idx) in match_set:
query_mins.remove(hashval)
intersect_mins.add(hashval)

# should match!
assert top_count == len(intersect_mins)

# calculate size of match (# of hashvals belonging to that sig)
-        match_size = top_sigsize
+        match_size = 0
+        for hashval, idx_list in best_lca_db.hashval_to_idx.items():
+            if best_idx in idx_list:
+                match_size += 1

# construct 'result' object
intersect_bp = top_count * query_sig.minhash.scaled
@@ -155,13 +154,23 @@ def gather_signature(query_sig, dblist, ignore_abundance):
/ len(intersect_mins)
f_match = len(intersect_mins) / match_size

+        # XXX name and lineage
+        for ident, idx in best_lca_db.ident_to_idx.items():
+            if idx == best_idx:
+                name = best_lca_db.ident_to_name[ident]
+
+        lid = best_lca_db.idx_to_lid.get(best_idx)
+        lineage = ()
+        if lid is not None:
+            lineage = best_lca_db.lid_to_lineage[lid]
+
result = LCAGatherResult(intersect_bp = intersect_bp,
f_unique_to_query= top_count / n_mins,
f_unique_weighted=f_unique_weighted,
average_abund=average_abund,
f_match=f_match,
-                                 lineage=md5_to_lineage[top_md5],
-                                 name=md5_to_name[top_md5],
+                                 lineage=lineage,
+                                 name=name,
n_equal_matches=equiv_counts)

f_unassigned = len(query_mins) / n_mins
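To see how the refactored lookups fit together: the new LCA database keys everything on small integer indices (an idx per signature, a lid per lineage) instead of md5s and lineage ids. A toy sketch of that layout with hypothetical values; the attribute names match those used above, but this class is illustrative, not the sourmash implementation:

from collections import namedtuple

LineagePair = namedtuple('LineagePair', ['rank', 'name'])

class ToyLCADatabase:
    """Illustrative stand-in for the refactored LCA database."""
    def __init__(self):
        # hash value -> list of signature indices containing it
        self.hashval_to_idx = {1000: [0], 2000: [0, 1], 3000: [1]}
        # identifier <-> signature index, plus display names
        self.ident_to_idx = {'GCF_000001': 0, 'GCF_000002': 1}
        self.ident_to_name = {'GCF_000001': 'Escherichia coli K-12',
                              'GCF_000002': 'Salmonella enterica'}
        # signature index -> lineage id; identities with no lineage
        # assignment are simply absent (cf. the commit message)
        self.idx_to_lid = {0: 0}
        # lineage id -> lineage tuple
        self.lid_to_lineage = {
            0: (LineagePair('superkingdom', 'Bacteria'),
                LineagePair('genus', 'Escherichia')),
        }

db = ToyLCADatabase()

# resolve one hash value to names and lineages, mirroring gather_signature
for idx in db.hashval_to_idx[2000]:
    ident = next(i for i, j in db.ident_to_idx.items() if j == idx)
    lid = db.idx_to_lid.get(idx)
    lineage = db.lid_to_lineage[lid] if lid is not None else ()
    print(ident, db.ident_to_name[ident], lineage)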
