Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Temporary branch for a metagenome workshop at UCSC #188

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
23f9137
enrich -o output, provide column headers, eliminate --csv on sbt_gather
ctb Apr 17, 2017
609bf01
fix tests, add --save-matches test
ctb Apr 17, 2017
0b19027
simplify sbt_gather a bit by doing legacy hll compute in minhash max_…
ctb Apr 19, 2017
fc89bd3
hackity hack add in downsample
ctb Apr 19, 2017
b8eac7d
Merge branch 'master' of github.com:dib-lab/sourmash into update/gath…
ctb Apr 19, 2017
0956b4f
add in containment
ctb Apr 19, 2017
28fb64e
Merge branch 'fix/sbt_search' into 2017-ucsc-metagenome
ctb Apr 22, 2017
4fc8aff
fix legacy max_hash
ctb Apr 22, 2017
5821a1b
Merge branch 'fix/sbt_search' into 2017-ucsc-metagenome
ctb Apr 22, 2017
66f1c10
fixed some super dumb errors in sourmash sbt_gather
ctb Apr 22, 2017
264a598
Merge branch 'update/gather_out' into 2017-ucsc-metagenome
ctb Apr 22, 2017
a11fdf3
fix print -> notify
ctb Apr 22, 2017
b4b3c76
Merge branch 'update/gather_out' into 2017-ucsc-metagenome
ctb Apr 22, 2017
65e5928
fix notifications
ctb Apr 22, 2017
ddc0b6b
Merge branch 'update/gather_out' into 2017-ucsc-metagenome
ctb Apr 22, 2017
69b8ae6
matplotlib fix
ctb Apr 22, 2017
1b32aed
Merge branch 'update/gather_out' into 2017-ucsc-metagenome
ctb Apr 22, 2017
bc74e1f
avoid printing out really large matrices
ctb Apr 23, 2017
ad8d3bb
Merge branch 'update/gather_out' into 2017-ucsc-metagenome
ctb Apr 23, 2017
92fc2fb
update output format
ctb Apr 24, 2017
1ff3e69
fix sbt_search --best-only
ctb May 2, 2017
e0a4c0b
clean up sbt_search output a bit
ctb May 2, 2017
394e3cb
Merge branch 'fix/sbt_search' of github.com:dib-lab/sourmash into 201…
ctb May 9, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
28 changes: 26 additions & 2 deletions sourmash_lib/_minhash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,14 @@ cdef class MinHash(object):
def max_hash(self):
mm = deref(self._this).max_hash
if mm == 18446744073709551615:
return 0
mm = 0

# legacy - check cardinality => estimate
if mm == 0:
if self.hll:
genome_size = self.hll.estimate_cardinality()
mm = max(self.get_mins())

return mm

def add_hash(self, uint64_t h):
Expand Down Expand Up @@ -248,9 +255,20 @@ cdef class MinHash(object):

return a

def downsample_max_hash(self, *others):
    """\
    Downsample this MinHash to the smallest max_hash of self and *others.

    Computes the lowest max_hash across all sketches, converts it to the
    equivalent scaled value, and delegates to downsample_scaled.

    Raises ValueError if any sketch has max_hash == 0 (i.e. is not a
    scaled signature), since no scaled value can be derived from it.
    """
    max_hashes = [ x.max_hash for x in others ]
    new_max_hash = min(self.max_hash, *max_hashes)

    # a max_hash of 0 marks a non-scaled signature; without this guard
    # the division below raises a cryptic ZeroDivisionError.
    if new_max_hash == 0:
        raise ValueError('no max_hash available - cannot downsample')

    new_scaled = int(get_minhash_max_hash() / new_max_hash)

    return self.downsample_scaled(new_scaled)

def downsample_scaled(self, new_num):
old_scaled = int(get_minhash_max_hash() / self.max_hash)
max_hash = self.max_hash

if max_hash is None:
raise ValueError('no max_hash available - cannot downsample')

old_scaled = int(get_minhash_max_hash() / self.max_hash)
if old_scaled > new_num:
raise ValueError('new scaled is lower than current sample scaled')

Expand Down Expand Up @@ -345,6 +363,12 @@ cdef class MinHash(object):
distance = 2*math.acos(prod) / math.pi
return 1.0 - distance

def containment(self, other):
    """\
    Calculate containment of self by other.

    Returns the fraction of self's hashes that are also present in
    other (count_common / number of hashes in self).

    Raises ValueError if self contains no hashes, instead of the
    cryptic ZeroDivisionError the bare division would produce.
    """
    mins = self.get_mins()
    if not mins:
        raise ValueError('cannot compute containment of empty MinHash')
    return self.count_common(other) / len(mins)

def similarity_ignore_maxhash(self, MinHash other):
a = set(self.get_mins())

Expand Down
140 changes: 74 additions & 66 deletions sourmash_lib/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def search(args):
parser.add_argument('-k', '--ksize', default=DEFAULT_K, type=int)
parser.add_argument('-f', '--force', action='store_true')
parser.add_argument('--save-matches', type=argparse.FileType('wt'))
parser.add_argument('--containment', action='store_true')

sourmash_args.add_moltype_args(parser)

Expand Down Expand Up @@ -62,7 +63,10 @@ def search(args):
# compute query x db
distances = []
for (x, filename) in against:
distance = query.similarity(x)
if args.containment:
distance = query.containment(x)
else:
distance = query.similarity(x)
if distance >= args.threshold:
distances.append((distance, x, filename))

Expand Down Expand Up @@ -224,8 +228,7 @@ def save_siglist(siglist, output_fp, filename=None):
sig.save_signatures(siglist, fp)

if args.track_abundance:
print('Tracking abundance of input k-mers.',
file=sys.stderr)
notify('Tracking abundance of input k-mers.')

if not args.merge:
if args.output:
Expand All @@ -235,8 +238,7 @@ def save_siglist(siglist, output_fp, filename=None):
sigfile = os.path.basename(filename) + '.sig'
if not args.output and os.path.exists(sigfile) and not \
args.force:
print('skipping', filename, '- already done',
file=sys.stderr)
notify('skipping {} - already done', filename)
continue

if args.singleton:
Expand All @@ -249,20 +251,19 @@ def save_siglist(siglist, output_fp, filename=None):

siglist += build_siglist(args.email, Elist, filename,
name=record.name)
print('calculated {} signatures for {} sequences in {}'.\
notify('calculated {} signatures for {} sequences in {}'.\
format(len(siglist), n + 1, filename))
else:
# make minhashes for the whole file
Elist = make_minhashes()

# consume & calculate signatures
print('... reading sequences from', filename,
file=sys.stderr)
notify('... reading sequences from {}', filename)
name = None
for n, record in enumerate(screed.open(filename)):
if n % 10000 == 0:
if n:
print('...', filename, n, file=sys.stderr)
notify('...{} {}', filename, n)
elif args.name_from_first:
name = record.name

Expand All @@ -275,7 +276,7 @@ def save_siglist(siglist, output_fp, filename=None):
siglist += sigs
else:
siglist = sigs
print('calculated {} signatures for {} sequences in {}'.\
notify('calculated {} signatures for {} sequences in {}'.\
format(len(siglist), n + 1, filename))

if not args.output:
Expand All @@ -289,18 +290,17 @@ def save_siglist(siglist, output_fp, filename=None):

for filename in args.filenames:
# consume & calculate signatures
print('... reading sequences from', filename,
file=sys.stderr)
notify('... reading sequences from', filename)
for n, record in enumerate(screed.open(filename)):
if n % 10000 == 0 and n:
print('...', filename, n, file=sys.stderr)
notify('...', filename, n)

add_seq(Elist, record.sequence,
args.input_is_protein, args.check_sequence)

siglist = build_siglist(args.email, Elist, filename,
name=args.merge)
print('calculated {} signatures for {} sequences taken from {}'.\
notify('calculated {} signatures for {} sequences taken from {}'.\
format(len(siglist), n + 1, " ".join(args.filenames)))
# at end, save!
save_siglist(siglist, args.output)
Expand All @@ -321,7 +321,7 @@ def compare(args):
# load in the various signatures
siglist = []
for filename in args.signatures:
print('loading', filename, file=sys.stderr)
notify('loading {}', filename)
loaded = sig.load_signatures(filename, select_ksize=args.ksize)
loaded = list(loaded)
if not loaded:
Expand All @@ -343,7 +343,8 @@ def compare(args):
for j, E2 in enumerate(siglist):
D[i][j] = E.similarity(E2, args.ignore_abundance)

print('%d-%20s\t%s' % (i, E.name(), D[i, :, ],))
if len(siglist) < 30:
print('%d-%20s\t%s' % (i, E.name(), D[i, :, ],))
labeltext.append(E.name())

notify('min similarity in matrix: {}', numpy.min(D))
Expand All @@ -362,6 +363,8 @@ def compare(args):

def plot(args):
"Produce a clustering and plot."
import matplotlib as mpl
mpl.use('Agg')
import numpy
import scipy
import pylab
Expand Down Expand Up @@ -603,12 +606,18 @@ def sbt_search(args):

results = []
for leaf in tree.find(search_fn, query, args.threshold):
results.append((query.similarity(leaf.data), leaf.data))
#results.append((leaf.data.similarity(ss), leaf.data))
results.append((query.similarity(leaf.data, downsample=True),
leaf.data))

results.sort(key=lambda x: -x[0]) # reverse sort on similarity

if args.best_only:
notify("(truncated search because of --best-only; only trust top result")

notify("similarity match")
notify("---------- -----")
for (similarity, query) in results:
print('{:.2f} {}'.format(similarity, query.name()))
print('{:2.1f}% {}'.format(similarity*100, query.name()))

if args.save_matches:
outname = args.save_matches.name
Expand Down Expand Up @@ -691,6 +700,7 @@ def categorize(args):
if loader.skipped_nosig:
notify('skipped/nosig: {}', loader.skipped_nosig)


def sbt_gather(args):
from sourmash_lib.sbt import SBT, GraphFactory
from sourmash_lib.sbtmh import search_minhashes, SigLeaf
Expand All @@ -702,8 +712,8 @@ def sbt_gather(args):
parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
parser.add_argument('--threshold', default=0.05, type=float)
parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
parser.add_argument('--csv', type=argparse.FileType('wt'))
parser.add_argument('--save-matches', type=argparse.FileType('wt'))
parser.add_argument('--threshold-bp', type=float, default=5e4)

sourmash_args.add_moltype_args(parser)

Expand All @@ -728,7 +738,6 @@ def sbt_gather(args):
error('query signature needs to be created with --scaled')
sys.exit(-1)

notify('query signature has max_hash: {}', query.minhash.max_hash)
orig_query = query
orig_mins = orig_query.minhash.get_hashes()

Expand Down Expand Up @@ -761,6 +770,19 @@ def build_new_signature(mins):
e.add_many(mins)
return sig.SourmashSignature('', e)

# xxx
def format_bp(bp):
    """Return a human-readable string for a quantity of base pairs.

    Chooses bp / kbp / Mbp / Gbp units so the mantissa stays below 500,
    formatting with one decimal place for the larger units. Values of
    500 Gbp or more return '???'.
    """
    bp = float(bp)
    if bp < 500:
        # the 'd' presentation type rejects floats, so convert to int
        # explicitly; the original '{:d}'.format(bp) raised ValueError.
        return '{:d} bp '.format(int(bp))
    elif bp < 500e3:
        # use '<' like the other branches (was '<=', inconsistent at 500e3)
        return '{:.1f} kbp'.format(round(bp / 1e3, 1))
    elif bp < 500e6:
        return '{:.1f} Mbp'.format(round(bp / 1e6, 1))
    elif bp < 500e9:
        return '{:.1f} Gbp'.format(round(bp / 1e9, 1))
    return '???'

# construct a new query that doesn't have the max_hash attribute set.
new_mins = query.minhash.get_hashes()
query = build_new_signature(new_mins)
Expand All @@ -779,21 +801,17 @@ def build_new_signature(mins):
# figure out what the resolution of the banding on the genome is,
# based either on an explicit --scaled parameter, or on genome
# cardinality (deprecated)
if best_leaf.minhash.max_hash:
R_genome = sourmash_lib.MAX_HASH / \
float(best_leaf.minhash.max_hash)
elif best_leaf.minhash.hll:
genome_size = best_leaf.minhash.hll.estimate_cardinality()
genome_max_hash = max(found_mins)
R_genome = float(genome_size) / float(genome_max_hash)
else:
if not best_leaf.minhash.max_hash:
error('Best hash match in sbt_gather has no cardinality')
error('Please prepare database of sequences with --scaled')
sys.exit(-1)

R_genome = sourmash_lib.MAX_HASH / float(best_leaf.minhash.max_hash)

# pick the highest R / lowest resolution
R_comparison = max(R_metagenome, R_genome)

# CTB: these could probably be replaced by minhash.downsample_scaled.
new_max_hash = sourmash_lib.MAX_HASH / float(R_comparison)
query_mins = set([ i for i in query_mins if i < new_max_hash ])
found_mins = set([ i for i in found_mins if i < new_max_hash ])
Expand All @@ -802,72 +820,62 @@ def build_new_signature(mins):
# calculate intersection:
intersect_mins = query_mins.intersection(found_mins)
intersect_orig_mins = orig_mins.intersection(found_mins)
intersect_bp = R_comparison * len(intersect_orig_mins)
sum_found += len(intersect_mins)

if len(intersect_mins) < 5: # hard cutoff for now
notify('found only {} hashes in common.', len(intersect_mins))
notify('this is below a sane threshold => exiting.')
if intersect_bp < args.threshold_bp: # hard cutoff for now
notify('found less than {} in common. => exiting',
format_bp(intersect_bp))
break

# calculate fractions wrt first denominator - genome size
genome_n_mins = len(found_mins)
f_genome = len(intersect_mins) / float(genome_n_mins)
f_orig_query = len(intersect_orig_mins) / float(genome_n_mins)
f_orig_query = len(intersect_orig_mins) / float(len(orig_mins))

# calculate fractions wrt second denominator - metagenome size
query_n_mins = len(orig_query.minhash.get_hashes())
f_query = len(intersect_mins) / float(query_n_mins)

if not len(found): # first result? print header.
notify("")
notify("overlap p_query p_genome")
notify("------- ------- --------")

# print interim result & save in a list for later use
notify('found: {:.2f} {:.2f} {}', f_genome, f_query,
best_leaf.name())
found.append((f_genome, best_leaf, f_query))
notify('{:8s} {:-5.1f}% {:-5.1f}% {}',
format_bp(intersect_bp), f_orig_query*100,
f_genome*100, best_leaf.name()[:40])
found.append((intersect_bp, f_orig_query, best_leaf, f_genome))

# construct a new query, minus the previous one.
query_mins -= set(found_mins)
query = build_new_signature(query_mins)

# basic reporting
notify('found {} matches total', len(found))
notify('the recovered matches hit {:.1f}% of the query',
100. * sum_found / len(orig_query.minhash.get_hashes()))
notify('\nfound {} matches total;', len(found))

sum_found /= len(orig_query.minhash.get_hashes())
notify('the recovered matches hit {:.1f}% of the query', sum_found * 100)
notify('')

if not found:
sys.exit(0)

# sort by fraction of genome (first key) - change this?
found.sort(key=lambda x: x[0])
found.reverse()

notify('Composition:')
for (frac, leaf_sketch, genome_fraction) in found:
notify('{:.2f} {:.2f} {}', frac, genome_fraction, leaf_sketch.name())

if args.output:
print('Composition:', file=args.output)
for (frac, leaf_sketch, genome_fraction) in found:
print('{:.2f} {:.f} {}'.format(frac, genome_fraction,
leaf_sketch.name()),
file=args.output)

if args.csv:
fieldnames = ['fraction', 'name', 'sketch_kmers', 'genome_fraction']
w = csv.DictWriter(args.csv, fieldnames=fieldnames)

fieldnames = ['intersect_bp', 'f_orig_query', 'f_found_genome', 'name']
w = csv.DictWriter(args.output, fieldnames=fieldnames)
w.writeheader()
for (frac, leaf_sketch, genome_fraction) in found:
cardinality = 0
if leaf_sketch.minhash.hll:
cardinality = leaf_sketch.minhash.hll.estimate_cardinality()
w.writerow(dict(fraction=frac, name=leaf_sketch.name(),
sketch_kmers=cardinality,
genome_fraction=genome_fraction))
for (intersect_bp, f_genome, leaf, f_orig_query) in found:
w.writerow(dict(intersect_bp=intersect_bp,
f_orig_query=f_orig_query, name=leaf.name(),
f_found_genome=f_genome,))

if args.save_matches:
outname = args.save_matches.name
notify('saving all matches to "{}"', outname)
sig.save_signatures([ ss for (f, ss) in found ],
args.save_matches)
sig.save_signatures([ ss for (_, _, ss, _) in found ],
args.save_matches)


def watch(args):
Expand Down