
Commit

[MRG] Refactor the gather code so that it uses 'hashes' instead of 'mins' (#1329)

* update gather with no abundances so that abund output is empty

* add tests for new CSV abund output behavior

* refactoring and simplification of gather code to use 'hashes' instead of 'mins'

* further simplifications
ctb committed Feb 15, 2021
1 parent bf5eeba commit 3bfd0fa
Showing 1 changed file with 27 additions and 27 deletions.
54 changes: 27 additions & 27 deletions src/sourmash/search.py
@@ -105,9 +105,11 @@ def _find_best(dblist, query, threshold_bp):
 
 
 def _filter_max_hash(values, max_hash):
+    results = set()
     for v in values:
         if v < max_hash:
-            yield v
+            results.add(v)
+    return results
 
 
 def gather_databases(query, databases, threshold_bp, ignore_abundance):
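
Note: `_filter_max_hash` now returns a materialized set instead of yielding values lazily, which is why the `set(...)` wrappers around its call sites later in this diff can be dropped. A minimal standalone sketch of the new behavior, using made-up hash values:

def _filter_max_hash(values, max_hash):
    # keep only hash values strictly below the max_hash cutoff
    results = set()
    for v in values:
        if v < max_hash:
            results.add(v)
    return results

# illustrative values only
filtered = _filter_max_hash({5, 10, 20}, 15)
assert filtered == {5, 10}      # a real set, so intersection / -= work directly
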
@@ -118,10 +120,10 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
     # track original query information for later usage.
     track_abundance = query.minhash.track_abundance and not ignore_abundance
     orig_query_mh = query.minhash
-    orig_query_mins = orig_query_mh.hashes.keys()
+    orig_query_hashes = set(orig_query_mh.hashes)
 
     # do we pay attention to abundances?
-    orig_query_abunds = { k: 1 for k in orig_query_mins }
+    orig_query_abunds = { k: 1 for k in orig_query_hashes }
     if track_abundance:
         import numpy as np
         orig_query_abunds = orig_query_mh.hashes
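
Note: when abundance tracking is off, every query hash gets a weight of 1, so the abundance-weighted sums computed further down reduce to plain hash counts. A small sketch of that default, with made-up hash values:

orig_query_hashes = {2, 7, 11}                    # illustrative hash values
orig_query_abunds = { k: 1 for k in orig_query_hashes }

# with unit weights, a weighted sum over any subset is just its size
subset = {2, 11}
assert sum(orig_query_abunds[k] for k in subset) == len(subset)
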
@@ -138,8 +140,8 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
             break
 
         # subtract found hashes from search hashes, construct new search
-        query_mins = set(query.minhash.hashes.keys())
-        found_mins = best_match.minhash.hashes.keys()
+        query_hashes = set(query.minhash.hashes)
+        found_hashes = set(best_match.minhash.hashes)
 
         # Is the best match computed with scaled? Die if not.
         match_scaled = best_match.minhash.scaled
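
Note: as used elsewhere in this diff (`.hashes.keys()`, `orig_query_abunds[k]`), `minhash.hashes` is dict-like, mapping hash value to abundance; iterating it yields the hash values, so `set(...)` keeps just the hashes, like the old `.hashes.keys()` but as a plain mutable set. A library-free sketch of the idiom:

hashes = {42: 3, 99: 1, 7: 5}    # stand-in for minhash.hashes: hash -> abundance

# iterating a mapping yields its keys, so set() drops the abundances
query_hashes = set(hashes)
assert query_hashes == set(hashes.keys()) == {7, 42, 99}
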
@@ -151,57 +153,55 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
         # pick the highest scaled / lowest resolution
         cmp_scaled = max(cmp_scaled, match_scaled)
 
-        # eliminate mins under this new resolution.
+        # eliminate hashes under this new resolution.
         # (CTB note: this means that if a high scaled/low res signature is
         # found early on, resolution will be low from then on.)
         new_max_hash = _get_max_hash_for_scaled(cmp_scaled)
-        query_mins = set(_filter_max_hash(query_mins, new_max_hash))
-        found_mins = set(_filter_max_hash(found_mins, new_max_hash))
-        orig_query_mins = set(_filter_max_hash(orig_query_mins, new_max_hash))
-        sum_abunds = sum(( v for (k,v) in orig_query_abunds.items() if k < new_max_hash ))
+        query_hashes = _filter_max_hash(query_hashes, new_max_hash)
+        found_hashes = _filter_max_hash(found_hashes, new_max_hash)
+        orig_query_hashes = _filter_max_hash(orig_query_hashes, new_max_hash)
+        sum_abunds = sum(( orig_query_abunds[k] for k in orig_query_hashes))
 
-        # calculate intersection with query mins:
-        intersect_mins = query_mins.intersection(found_mins)
-        unique_intersect_bp = cmp_scaled * len(intersect_mins)
-        intersect_orig_query_mins = orig_query_mins.intersection(found_mins)
-        intersect_bp = cmp_scaled * len(intersect_orig_query_mins)
+        # calculate intersection with query hashes:
+        intersect_hashes = query_hashes.intersection(found_hashes)
+        unique_intersect_bp = cmp_scaled * len(intersect_hashes)
+        intersect_orig_hashes = orig_query_hashes.intersection(found_hashes)
+        intersect_bp = cmp_scaled * len(intersect_orig_hashes)
 
         # calculate fractions wrt first denominator - genome size
-        genome_n_mins = len(found_mins)
-        f_match = len(intersect_mins) / float(genome_n_mins)
-        f_orig_query = len(intersect_orig_query_mins) / \
-            float(len(orig_query_mins))
+        assert intersect_hashes.issubset(found_hashes)
+        f_match = len(intersect_hashes) / len(found_hashes)
+        f_orig_query = len(intersect_orig_hashes) / len(orig_query_hashes)
 
         # calculate fractions wrt second denominator - metagenome size
         orig_query_mh = orig_query_mh.downsample(scaled=cmp_scaled)
-        query_n_mins = len(orig_query_mh)
-        f_unique_to_query = len(intersect_mins) / float(query_n_mins)
+        assert intersect_hashes.issubset(orig_query_hashes)
+        f_unique_to_query = len(intersect_hashes) / len(orig_query_hashes)
 
         # calculate fraction of subject match with orig query
         f_match_orig = best_match.minhash.contained_by(orig_query_mh,
                                                        downsample=True)
 
         # calculate scores weighted by abundances
-        f_unique_weighted = sum((orig_query_abunds[k] for k in intersect_mins))
+        f_unique_weighted = sum((orig_query_abunds[k] for k in intersect_hashes))
         f_unique_weighted /= sum_abunds
 
         # calculate stats on abundances, if desired.
         average_abund, median_abund, std_abund = None, None, None
         if track_abundance:
-            intersect_abunds = (orig_query_abunds[k] for k in intersect_mins)
+            intersect_abunds = (orig_query_abunds[k] for k in intersect_hashes)
             intersect_abunds = list(intersect_abunds)
 
             average_abund = np.mean(intersect_abunds)
             median_abund = np.median(intersect_abunds)
             std_abund = np.std(intersect_abunds)
 
         # construct a new query, subtracting hashes found in previous one.
-        query = _subtract_and_downsample(found_mins, query, cmp_scaled)
+        query = _subtract_and_downsample(found_hashes, query, cmp_scaled)
         remaining_bp = cmp_scaled * len(query.minhash.hashes)
 
         # compute weighted_missed:
-        query_mins -= set(found_mins)
-        weighted_missed = sum((orig_query_abunds[k] for k in query_mins)) \
+        query_hashes -= set(found_hashes)
+        weighted_missed = sum((orig_query_abunds[k] for k in query_hashes)) \
             / sum_abunds
 
         # build a result namedtuple
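
Note: for reference, a small worked sketch of the main per-match statistics computed above, using toy hash sets and unit abundances (values are illustrative only, not from a real gather run):

cmp_scaled = 1000
orig_query_hashes = {1, 2, 3, 4, 5}         # all hashes in the original query
query_hashes = {2, 3, 4, 5}                 # hashes not yet claimed by earlier matches
found_hashes = {2, 3, 9}                    # hashes in the best match
orig_query_abunds = { k: 1 for k in orig_query_hashes }
sum_abunds = sum(orig_query_abunds[k] for k in orig_query_hashes)     # 5

intersect_hashes = query_hashes.intersection(found_hashes)            # {2, 3}
unique_intersect_bp = cmp_scaled * len(intersect_hashes)              # 2000
f_match = len(intersect_hashes) / len(found_hashes)                   # 2/3
f_unique_to_query = len(intersect_hashes) / len(orig_query_hashes)    # 2/5
f_unique_weighted = sum(orig_query_abunds[k] for k in intersect_hashes) / sum_abunds   # 2/5
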
