Commit

Merge branch 'latest' into docs_4.0
ctb committed Feb 16, 2021
2 parents c802035 + 3bfd0fa commit a63d8e9
Showing 3 changed files with 47 additions and 33 deletions.
6 changes: 3 additions & 3 deletions src/sourmash/commands.py
@@ -624,13 +624,13 @@ def gather(args):
             print_results("--------- ------- -------")


-        # print interim result & save in a list for later use
+        # print interim result & save in `found` list for later use
         pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
         pct_genome = '{:.1f}%'.format(result.f_match*100)
-        average_abund ='{:.1f}'.format(result.average_abund)
         name = result.match._display_name(40)

         if query.minhash.track_abundance and not args.ignore_abundance:
+            average_abund ='{:.1f}'.format(result.average_abund)
             print_results('{:9} {:>7} {:>7} {:>9} {}',
                           format_bp(result.intersect_bp), pct_query, pct_genome,
                           average_abund, name)
@@ -758,10 +758,10 @@ def multigather(args):
         # print interim result & save in a list for later use
         pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
         pct_genome = '{:.1f}%'.format(result.f_match*100)
-        average_abund ='{:.1f}'.format(result.average_abund)
         name = result.match._display_name(40)

         if query.minhash.track_abundance and not args.ignore_abundance:
+            average_abund ='{:.1f}'.format(result.average_abund)
             print_results('{:9} {:>7} {:>7} {:>9} {}',
                           format_bp(result.intersect_bp), pct_query, pct_genome,
                           average_abund, name)
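Aside (not part of the commit): the average_abund formatting above now runs only when abundances are tracked. A minimal standalone Python sketch of the motivation, assuming result.average_abund is None when abundance is ignored (as in the src/sourmash/search.py change below) -- applying a float format spec to None raises:

average_abund = None
try:
    '{:.1f}'.format(average_abund)   # what the old, unguarded line would do
except TypeError as err:
    print(err)                       # float formatting of None is an error

print('{:.1f}'.format(3.14159))      # '3.1' -- fine once abundances are tracked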
56 changes: 28 additions & 28 deletions src/sourmash/search.py
@@ -105,9 +105,11 @@ def _find_best(dblist, query, threshold_bp):


 def _filter_max_hash(values, max_hash):
+    results = set()
     for v in values:
         if v < max_hash:
-            yield v
+            results.add(v)
+    return results


 def gather_databases(query, databases, threshold_bp, ignore_abundance):
@@ -118,10 +120,10 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
     # track original query information for later usage.
     track_abundance = query.minhash.track_abundance and not ignore_abundance
     orig_query_mh = query.minhash
-    orig_query_mins = orig_query_mh.hashes.keys()
+    orig_query_hashes = set(orig_query_mh.hashes)

     # do we pay attention to abundances?
-    orig_query_abunds = { k: 1 for k in orig_query_mins }
+    orig_query_abunds = { k: 1 for k in orig_query_hashes }
     if track_abundance:
         import numpy as np
         orig_query_abunds = orig_query_mh.hashes
@@ -138,8 +140,8 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
             break

         # subtract found hashes from search hashes, construct new search
-        query_mins = set(query.minhash.hashes.keys())
-        found_mins = best_match.minhash.hashes.keys()
+        query_hashes = set(query.minhash.hashes)
+        found_hashes = set(best_match.minhash.hashes)

         # Is the best match computed with scaled? Die if not.
         match_scaled = best_match.minhash.scaled
@@ -151,57 +153,55 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
         # pick the highest scaled / lowest resolution
         cmp_scaled = max(cmp_scaled, match_scaled)

-        # eliminate mins under this new resolution.
+        # eliminate hashes under this new resolution.
         # (CTB note: this means that if a high scaled/low res signature is
         # found early on, resolution will be low from then on.)
         new_max_hash = _get_max_hash_for_scaled(cmp_scaled)
-        query_mins = set(_filter_max_hash(query_mins, new_max_hash))
-        found_mins = set(_filter_max_hash(found_mins, new_max_hash))
-        orig_query_mins = set(_filter_max_hash(orig_query_mins, new_max_hash))
-        sum_abunds = sum(( v for (k,v) in orig_query_abunds.items() if k < new_max_hash ))
+        query_hashes = _filter_max_hash(query_hashes, new_max_hash)
+        found_hashes = _filter_max_hash(found_hashes, new_max_hash)
+        orig_query_hashes = _filter_max_hash(orig_query_hashes, new_max_hash)
+        sum_abunds = sum(( orig_query_abunds[k] for k in orig_query_hashes))

-        # calculate intersection with query mins:
-        intersect_mins = query_mins.intersection(found_mins)
-        unique_intersect_bp = cmp_scaled * len(intersect_mins)
-        intersect_orig_query_mins = orig_query_mins.intersection(found_mins)
-        intersect_bp = cmp_scaled * len(intersect_orig_query_mins)
+        # calculate intersection with query hashes:
+        intersect_hashes = query_hashes.intersection(found_hashes)
+        unique_intersect_bp = cmp_scaled * len(intersect_hashes)
+        intersect_orig_hashes = orig_query_hashes.intersection(found_hashes)
+        intersect_bp = cmp_scaled * len(intersect_orig_hashes)

         # calculate fractions wrt first denominator - genome size
-        genome_n_mins = len(found_mins)
-        f_match = len(intersect_mins) / float(genome_n_mins)
-        f_orig_query = len(intersect_orig_query_mins) / \
-            float(len(orig_query_mins))
+        assert intersect_hashes.issubset(found_hashes)
+        f_match = len(intersect_hashes) / len(found_hashes)
+        f_orig_query = len(intersect_orig_hashes) / len(orig_query_hashes)

         # calculate fractions wrt second denominator - metagenome size
         orig_query_mh = orig_query_mh.downsample(scaled=cmp_scaled)
-        query_n_mins = len(orig_query_mh)
-        f_unique_to_query = len(intersect_mins) / float(query_n_mins)
+        assert intersect_hashes.issubset(orig_query_hashes)
+        f_unique_to_query = len(intersect_hashes) / len(orig_query_hashes)

         # calculate fraction of subject match with orig query
         f_match_orig = best_match.minhash.contained_by(orig_query_mh,
                                                        downsample=True)

         # calculate scores weighted by abundances
-        f_unique_weighted = sum((orig_query_abunds[k] for k in intersect_mins))
+        f_unique_weighted = sum((orig_query_abunds[k] for k in intersect_hashes))
         f_unique_weighted /= sum_abunds

         # calculate stats on abundances, if desired.
-        average_abund, median_abund, std_abund = 0, 0, 0
+        average_abund, median_abund, std_abund = None, None, None
         if track_abundance:
-            intersect_abunds = (orig_query_abunds[k] for k in intersect_mins)
+            intersect_abunds = (orig_query_abunds[k] for k in intersect_hashes)
             intersect_abunds = list(intersect_abunds)

             average_abund = np.mean(intersect_abunds)
             median_abund = np.median(intersect_abunds)
             std_abund = np.std(intersect_abunds)

         # construct a new query, subtracting hashes found in previous one.
-        query = _subtract_and_downsample(found_mins, query, cmp_scaled)
+        query = _subtract_and_downsample(found_hashes, query, cmp_scaled)
         remaining_bp = cmp_scaled * len(query.minhash.hashes)

         # compute weighted_missed:
-        query_mins -= set(found_mins)
-        weighted_missed = sum((orig_query_abunds[k] for k in query_mins)) \
+        query_hashes -= set(found_hashes)
+        weighted_missed = sum((orig_query_abunds[k] for k in query_hashes)) \
             / sum_abunds

         # build a result namedtuple
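Aside (not part of the commit): a standalone sketch of the reworked _filter_max_hash, which now builds and returns a set instead of yielding values, so callers such as gather_databases can drop the set(...) wrapping and use set operations on the result directly. The example values below are made up:

def _filter_max_hash(values, max_hash):
    # keep only hash values strictly below max_hash, as in the hunk above
    results = set()
    for v in values:
        if v < max_hash:
            results.add(v)
    return results

query_hashes = _filter_max_hash({5, 100, 2**40}, max_hash=2**32)
found_hashes = _filter_max_hash({5, 200, 2**33}, max_hash=2**32)
print(query_hashes.intersection(found_hashes))   # {5} -- no set() wrapping needed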
18 changes: 16 additions & 2 deletions tests/test_sourmash.py
@@ -3669,7 +3669,7 @@ def test_gather_abund_10_1(c):
     assert total_bp_analyzed == total_query_bp


-@utils.in_thisdir
+@utils.in_tempdir
 def test_gather_abund_10_1_ignore_abundance(c):
     # see comments in test_gather_abund_1_1, above.
     # nullgraph/make-reads.py -S 1 -r 200 -C 2 tests/test-data/genome-s10.fa.gz > r1.fa
@@ -3686,7 +3686,9 @@ def test_gather_abund_10_1_ignore_abundance(c):

     status, out, err = c.run_sourmash('gather', query,
                                       '--ignore-abundance',
-                                      *against_list)
+                                      *against_list,
+                                      '-o', c.output('results.csv'))
+

     print(out)
     print(err)
@@ -3702,6 +3704,18 @@ def test_gather_abund_10_1_ignore_abundance(c):
     assert all(('42.8% 80.0%', 'tests/test-data/genome-s11.fa.gz' in out))
     assert 'genome-s12.fa.gz' not in out

+    with open(c.output('results.csv'), 'rt') as fp:
+        r = csv.DictReader(fp)
+        some_results = False
+        for row in r:
+            some_results = True
+            assert row['average_abund'] is ''
+            assert row['median_abund'] is ''
+            assert row['std_abund'] is ''
+
+        assert some_results
+
+

 @utils.in_tempdir
 def test_gather_output_unassigned_with_abundance(c):
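Aside (not part of the commit): the new assertions expect empty strings because the None abundance stats set in search.py come back as '' after a CSV round trip. A standalone sketch using only the csv module (not the project's own writer code), with the field names taken from the test above:

import csv
import io

buf = io.StringIO()
w = csv.DictWriter(buf, fieldnames=['average_abund', 'median_abund', 'std_abund'])
w.writeheader()
w.writerow({'average_abund': None, 'median_abund': None, 'std_abund': None})  # None -> empty field

buf.seek(0)
row = next(csv.DictReader(buf))
print(repr(row['average_abund']), repr(row['median_abund']), repr(row['std_abund']))  # '' '' ''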
