Commit

Merge branch 'latest' into docs_4.0
ctb committed Feb 16, 2021
2 parents c802035 + 3bfd0fa commit a63d8e9
Showing 3 changed files with 47 additions and 33 deletions.
6 changes: 3 additions & 3 deletions src/sourmash/commands.py
@@ -624,13 +624,13 @@ def gather(args):
             print_results("--------- ------- -------")


-        # print interim result & save in a list for later use
+        # print interim result & save in `found` list for later use
         pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
         pct_genome = '{:.1f}%'.format(result.f_match*100)
-        average_abund ='{:.1f}'.format(result.average_abund)
         name = result.match._display_name(40)

         if query.minhash.track_abundance and not args.ignore_abundance:
+            average_abund ='{:.1f}'.format(result.average_abund)
             print_results('{:9} {:>7} {:>7} {:>9} {}',
                           format_bp(result.intersect_bp), pct_query, pct_genome,
                           average_abund, name)
@@ -758,10 +758,10 @@ def multigather(args):
         # print interim result & save in a list for later use
         pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
         pct_genome = '{:.1f}%'.format(result.f_match*100)
-        average_abund ='{:.1f}'.format(result.average_abund)
         name = result.match._display_name(40)

         if query.minhash.track_abundance and not args.ignore_abundance:
+            average_abund ='{:.1f}'.format(result.average_abund)
             print_results('{:9} {:>7} {:>7} {:>9} {}',
                           format_bp(result.intersect_bp), pct_query, pct_genome,
                           average_abund, name)
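Aside (not part of the commit): the average_abund formatting above now runs only when abundances are tracked. A minimal standalone Python sketch of the motivation, assuming result.average_abund is None when abundance is ignored (as in the src/sourmash/search.py change below) -- applying a float format spec to None raises:

average_abund = None
try:
    '{:.1f}'.format(average_abund)   # what the old, unguarded line would do
except TypeError as err:
    print(err)                       # float formatting of None is an error

print('{:.1f}'.format(3.14159))      # '3.1' -- fine once abundances are tracked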
56 changes: 28 additions & 28 deletions src/sourmash/search.py
@@ -105,9 +105,11 @@ def _find_best(dblist, query, threshold_bp):


 def _filter_max_hash(values, max_hash):
+    results = set()
     for v in values:
         if v < max_hash:
-            yield v
+            results.add(v)
+    return results


 def gather_databases(query, databases, threshold_bp, ignore_abundance):
@@ -118,10 +120,10 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
     # track original query information for later usage.
     track_abundance = query.minhash.track_abundance and not ignore_abundance
     orig_query_mh = query.minhash
-    orig_query_mins = orig_query_mh.hashes.keys()
+    orig_query_hashes = set(orig_query_mh.hashes)

     # do we pay attention to abundances?
-    orig_query_abunds = { k: 1 for k in orig_query_mins }
+    orig_query_abunds = { k: 1 for k in orig_query_hashes }
     if track_abundance:
         import numpy as np
         orig_query_abunds = orig_query_mh.hashes
@@ -138,8 +140,8 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
             break

         # subtract found hashes from search hashes, construct new search
-        query_mins = set(query.minhash.hashes.keys())
-        found_mins = best_match.minhash.hashes.keys()
+        query_hashes = set(query.minhash.hashes)
+        found_hashes = set(best_match.minhash.hashes)

         # Is the best match computed with scaled? Die if not.
         match_scaled = best_match.minhash.scaled
@@ -151,57 +153,55 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
         # pick the highest scaled / lowest resolution
         cmp_scaled = max(cmp_scaled, match_scaled)

-        # eliminate mins under this new resolution.
+        # eliminate hashes under this new resolution.
         # (CTB note: this means that if a high scaled/low res signature is
         # found early on, resolution will be low from then on.)
         new_max_hash = _get_max_hash_for_scaled(cmp_scaled)
-        query_mins = set(_filter_max_hash(query_mins, new_max_hash))
-        found_mins = set(_filter_max_hash(found_mins, new_max_hash))
-        orig_query_mins = set(_filter_max_hash(orig_query_mins, new_max_hash))
-        sum_abunds = sum(( v for (k,v) in orig_query_abunds.items() if k < new_max_hash ))
+        query_hashes = _filter_max_hash(query_hashes, new_max_hash)
+        found_hashes = _filter_max_hash(found_hashes, new_max_hash)
+        orig_query_hashes = _filter_max_hash(orig_query_hashes, new_max_hash)
+        sum_abunds = sum(( orig_query_abunds[k] for k in orig_query_hashes))

-        # calculate intersection with query mins:
-        intersect_mins = query_mins.intersection(found_mins)
-        unique_intersect_bp = cmp_scaled * len(intersect_mins)
-        intersect_orig_query_mins = orig_query_mins.intersection(found_mins)
-        intersect_bp = cmp_scaled * len(intersect_orig_query_mins)
+        # calculate intersection with query hashes:
+        intersect_hashes = query_hashes.intersection(found_hashes)
+        unique_intersect_bp = cmp_scaled * len(intersect_hashes)
+        intersect_orig_hashes = orig_query_hashes.intersection(found_hashes)
+        intersect_bp = cmp_scaled * len(intersect_orig_hashes)

         # calculate fractions wrt first denominator - genome size
-        genome_n_mins = len(found_mins)
-        f_match = len(intersect_mins) / float(genome_n_mins)
-        f_orig_query = len(intersect_orig_query_mins) / \
-            float(len(orig_query_mins))
+        assert intersect_hashes.issubset(found_hashes)
+        f_match = len(intersect_hashes) / len(found_hashes)
+        f_orig_query = len(intersect_orig_hashes) / len(orig_query_hashes)

         # calculate fractions wrt second denominator - metagenome size
         orig_query_mh = orig_query_mh.downsample(scaled=cmp_scaled)
-        query_n_mins = len(orig_query_mh)
-        f_unique_to_query = len(intersect_mins) / float(query_n_mins)
+        assert intersect_hashes.issubset(orig_query_hashes)
+        f_unique_to_query = len(intersect_hashes) / len(orig_query_hashes)

         # calculate fraction of subject match with orig query
         f_match_orig = best_match.minhash.contained_by(orig_query_mh,
                                                        downsample=True)

         # calculate scores weighted by abundances
-        f_unique_weighted = sum((orig_query_abunds[k] for k in intersect_mins))
+        f_unique_weighted = sum((orig_query_abunds[k] for k in intersect_hashes))
         f_unique_weighted /= sum_abunds

         # calculate stats on abundances, if desired.
-        average_abund, median_abund, std_abund = 0, 0, 0
+        average_abund, median_abund, std_abund = None, None, None
         if track_abundance:
-            intersect_abunds = (orig_query_abunds[k] for k in intersect_mins)
+            intersect_abunds = (orig_query_abunds[k] for k in intersect_hashes)
             intersect_abunds = list(intersect_abunds)

             average_abund = np.mean(intersect_abunds)
             median_abund = np.median(intersect_abunds)
             std_abund = np.std(intersect_abunds)

         # construct a new query, subtracting hashes found in previous one.
-        query = _subtract_and_downsample(found_mins, query, cmp_scaled)
+        query = _subtract_and_downsample(found_hashes, query, cmp_scaled)
         remaining_bp = cmp_scaled * len(query.minhash.hashes)

         # compute weighted_missed:
-        query_mins -= set(found_mins)
-        weighted_missed = sum((orig_query_abunds[k] for k in query_mins)) \
+        query_hashes -= set(found_hashes)
+        weighted_missed = sum((orig_query_abunds[k] for k in query_hashes)) \
             / sum_abunds

         # build a result namedtuple
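Aside (not part of the commit): a standalone sketch of the reworked _filter_max_hash, which now builds and returns a set instead of yielding values, so callers such as gather_databases can drop the set(...) wrapping and use set operations on the result directly. The example values below are made up:

def _filter_max_hash(values, max_hash):
    # keep only hash values strictly below max_hash, as in the hunk above
    results = set()
    for v in values:
        if v < max_hash:
            results.add(v)
    return results

query_hashes = _filter_max_hash({5, 100, 2**40}, max_hash=2**32)
found_hashes = _filter_max_hash({5, 200, 2**33}, max_hash=2**32)
print(query_hashes.intersection(found_hashes))   # {5} -- no set() wrapping needed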
18 changes: 16 additions & 2 deletions tests/test_sourmash.py
@@ -3669,7 +3669,7 @@ def test_gather_abund_10_1(c):
     assert total_bp_analyzed == total_query_bp


-@utils.in_thisdir
+@utils.in_tempdir
 def test_gather_abund_10_1_ignore_abundance(c):
     # see comments in test_gather_abund_1_1, above.
     # nullgraph/make-reads.py -S 1 -r 200 -C 2 tests/test-data/genome-s10.fa.gz > r1.fa
@@ -3686,7 +3686,9 @@ def test_gather_abund_10_1_ignore_abundance(c):

     status, out, err = c.run_sourmash('gather', query,
                                       '--ignore-abundance',
-                                      *against_list)
+                                      *against_list,
+                                      '-o', c.output('results.csv'))
+

     print(out)
     print(err)
@@ -3702,6 +3704,18 @@ def test_gather_abund_10_1_ignore_abundance(c):
     assert all(('42.8% 80.0%', 'tests/test-data/genome-s11.fa.gz' in out))
     assert 'genome-s12.fa.gz' not in out

+    with open(c.output('results.csv'), 'rt') as fp:
+        r = csv.DictReader(fp)
+        some_results = False
+        for row in r:
+            some_results = True
+            assert row['average_abund'] is ''
+            assert row['median_abund'] is ''
+            assert row['std_abund'] is ''
+
+        assert some_results
+
+

 @utils.in_tempdir
 def test_gather_output_unassigned_with_abundance(c):
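Aside (not part of the commit): the new assertions expect empty strings because the None abundance stats set in search.py come back as '' after a CSV round trip. A standalone sketch using only the csv module (not the project's own writer code), with the field names taken from the test above:

import csv
import io

buf = io.StringIO()
w = csv.DictWriter(buf, fieldnames=['average_abund', 'median_abund', 'std_abund'])
w.writeheader()
w.writerow({'average_abund': None, 'median_abund': None, 'std_abund': None})  # None -> empty field

buf.seek(0)
row = next(csv.DictReader(buf))
print(repr(row['average_abund']), repr(row['median_abund']), repr(row['std_abund']))  # '' '' ''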
