[WIP] Refactor LCA database to contain all the information. (#537)
* a trial refactoring of the lca db

* save and load seem to basically work

* got search & gather on LCA working in new framework

* allow key error

* keep identities with no lineage assignment

* add multigather

* Hyperlink DOIs to preferred resolver (#562)

* fix divide by zero issue in MinHash.contained_by (#572)

* fix divide by zero issue in contained_by

* remove unused lineages and identifiers

* update report output

* majority of lca tests passing now!

* fix gather?

* ...all tests pass?
ctb committed Dec 12, 2018
1 parent f7a2a54 commit 23e33b3
Showing 18 changed files with 460 additions and 201 deletions.
README.md: 1 addition, 1 deletion
@@ -17,7 +17,7 @@ We have demo notebooks on binder that you can interact with:

[![Binder](http://mybinder.org/badge.svg)](http://mybinder.org/repo/dib-lab/sourmash)

-Sourmash is [published on JOSS](http://dx.doi.org/10.21105/joss.00027).
+Sourmash is [published on JOSS](https://doi.org/10.21105/joss.00027).

----

paper.bib: 1 addition, 1 deletion
@@ -6,5 +6,5 @@ @article{ondov2015fast
year={2015},
publisher={Cold Spring Harbor Labs Journals},
doi={10.1101/029827},
-  url={http://dx.doi.org/10.1101/029827}
+  url={https://doi.org/10.1101/029827}
}
sourmash/__main__.py: 3 additions, 2 deletions
@@ -9,7 +9,7 @@

from .commands import (categorize, compare, compute, dump, import_csv,
gather, index, sbt_combine, search,
-                       plot, watch, info, storage, migrate)
+                       plot, watch, info, storage, migrate, multigather)
from .lca import main as lca_main

usage='''
@@ -58,7 +58,8 @@ def main():
'sbt_combine': sbt_combine, 'info': info,
'storage': storage,
'lca': lca_main,
-                'migrate': migrate}
+                'migrate': migrate,
+                'multigather': multigather}
parser = argparse.ArgumentParser(
description='work with compressed sequence representations')
parser.add_argument('command', nargs='?')
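For readers following the wiring: main() looks the first positional token up in this commands dict and hands the remaining argv to the matching handler, which then runs its own argparse pass (as multigather does below). A minimal sketch of that dispatch pattern, not taken from the sourmash source:

import sys

def multigather(argv):
    # stand-in handler; the real one parses argv with argparse
    print('multigather got:', argv)

commands = {'multigather': multigather}

def main():
    # first CLI token selects the handler; the rest is passed through
    cmd = sys.argv[1] if len(sys.argv) > 1 else None
    if cmd not in commands:
        print('unknown command:', cmd)
        return 1
    commands[cmd](sys.argv[2:])
    return 0

if __name__ == '__main__':
    sys.exit(main())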
sourmash/_minhash.pyx: 2 additions, 0 deletions
@@ -364,6 +364,8 @@ cdef class MinHash(object):
"""\
Calculate how much of self is contained by other.
"""
+        if not len(self):
+            return 0.0
return self.count_common(other) / len(self.get_mins())

def similarity_ignore_maxhash(self, MinHash other):
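The two added lines matter because contained_by divides by the number of hashes in self, so an empty sketch previously raised ZeroDivisionError. A small illustration of the intended behavior, with hypothetical values; it assumes the sourmash 2.x MinHash(n, ksize) constructor and add_hash:

from sourmash import MinHash

a = MinHash(10, 21)            # empty sketch: no hashes added yet
b = MinHash(10, 21)
b.add_hash(42)

# previously ZeroDivisionError (len(a) == 0); with the fix, an empty
# sketch reports zero containment.
assert a.contained_by(b) == 0.0

a.add_hash(42)                 # a's only hash also appears in b
assert a.contained_by(b) == 1.0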
sourmash/commands.py: 159 additions, 0 deletions
@@ -1109,6 +1109,165 @@ def gather(args):
args.output_unassigned)


def multigather(args):
from .search import gather_databases, format_bp

parser = argparse.ArgumentParser()
parser.add_argument('--db', nargs='+', action='append')
parser.add_argument('--query', nargs='+', action='append')
parser.add_argument('--traverse-directory', action='store_true',
help='search all signatures underneath directories.')
parser.add_argument('--threshold-bp', type=float, default=5e4,
help='threshold (in bp) for reporting results')
parser.add_argument('--scaled', type=float, default=0,
help='downsample query to this scaled factor')
parser.add_argument('-q', '--quiet', action='store_true',
help='suppress non-error output')
parser.add_argument('--ignore-abundance', action='store_true',
help='do NOT use k-mer abundances if present')
parser.add_argument('-d', '--debug', action='store_true')

sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
sourmash_args.add_moltype_args(parser)

args = parser.parse_args(args)
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)

if not args.db:
error('Error! must specify at least one database with --db')
sys.exit(-1)

if not args.query:
error('Error! must specify at least one query signature with --query')
sys.exit(-1)

# flatten --db and --query
args.db = [item for sublist in args.db for item in sublist]
args.query = [item for sublist in args.query for item in sublist]

query = sourmash_args.load_query_signature(args.query[0],
ksize=args.ksize,
select_moltype=moltype)
# set up the search databases
databases = sourmash_args.load_dbs_and_sigs(args.db, query, False,
args.traverse_directory)

if not len(databases):
error('Nothing found to search!')
sys.exit(-1)

# run gather on all the queries.
for queryfile in args.query:
# load the query signature & figure out all the things
query = sourmash_args.load_query_signature(queryfile,
ksize=args.ksize,
select_moltype=moltype)
notify('loaded query: {}... (k={}, {})', query.name()[:30],
query.minhash.ksize,
sourmash_args.get_moltype(query))

# verify signature was computed right.
if query.minhash.max_hash == 0:
error('query signature needs to be created with --scaled; skipping')
continue

# downsample if requested
if args.scaled:
notify('downsampling query from scaled={} to {}',
query.minhash.scaled, int(args.scaled))
query.minhash = query.minhash.downsample_scaled(args.scaled)

# empty?
if not query.minhash.get_mins():
error('no query hashes!? skipping to next..')
continue

found = []
weighted_missed = 1
for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
if not len(found): # first result? print header.
if query.minhash.track_abundance and not args.ignore_abundance:
print_results("")
print_results("overlap p_query p_match avg_abund")
print_results("--------- ------- ------- ---------")
else:
print_results("")
print_results("overlap p_query p_match")
print_results("--------- ------- -------")


# print interim result & save in a list for later use
pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
pct_genome = '{:.1f}%'.format(result.f_match*100)
            average_abund = '{:.1f}'.format(result.average_abund)
name = result.leaf._display_name(40)

if query.minhash.track_abundance and not args.ignore_abundance:
print_results('{:9} {:>7} {:>7} {:>9} {}',
format_bp(result.intersect_bp), pct_query, pct_genome,
average_abund, name)
else:
print_results('{:9} {:>7} {:>7} {}',
format_bp(result.intersect_bp), pct_query, pct_genome,
name)
found.append(result)


# basic reporting
print_results('\nfound {} matches total;', len(found))

print_results('the recovered matches hit {:.1f}% of the query',
(1 - weighted_missed) * 100)
print_results('')

if not found:
notify('nothing found... skipping.')
continue

output_base = os.path.basename(queryfile)
output_csv = output_base + '.csv'

fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
'f_unique_to_query', 'f_unique_weighted',
'average_abund', 'median_abund', 'std_abund', 'name', 'filename', 'md5']
with open(output_csv, 'wt') as fp:
w = csv.DictWriter(fp, fieldnames=fieldnames)
w.writeheader()
for result in found:
d = dict(result._asdict())
del d['leaf'] # actual signature not in CSV.
w.writerow(d)

output_matches = output_base + '.matches.sig'
with open(output_matches, 'wt') as fp:
outname = output_matches
notify('saving all matches to "{}"', outname)
sig.save_signatures([ r.leaf for r in found ], fp)

output_unassigned = output_base + '.unassigned.sig'
with open(output_unassigned, 'wt') as fp:
            if not found:
                notify('nothing found - entire query signature unassigned.')
            elif not next_query.minhash.get_mins():
                # nothing left over after gather: every hash was assigned
                notify('no unassigned hashes! not saving.')
            else:
                notify('saving unassigned hashes to "{}"', output_unassigned)

e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash)
e.add_many(next_query.minhash.get_mins())
sig.save_signatures([ sig.SourmashSignature(e) ], fp)

# fini, next query!


def watch(args):
"Build a signature from raw FASTA/FASTQ coming in on stdin, search."

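One non-obvious detail above: because --db and --query are declared with nargs='+' and action='append', argparse produces a list of lists (one sublist per flag occurrence), which is why multigather flattens both before use. A self-contained demonstration of that argparse behavior (file names hypothetical):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--db', nargs='+', action='append')
parser.add_argument('--query', nargs='+', action='append')

# e.g.: sourmash multigather --query a.sig b.sig --query c.sig --db db1.lca.json
args = parser.parse_args(['--query', 'a.sig', 'b.sig',
                          '--query', 'c.sig',
                          '--db', 'db1.lca.json'])
assert args.query == [['a.sig', 'b.sig'], ['c.sig']]   # list of lists
assert args.db == [['db1.lca.json']]

# flatten, exactly as multigather does
args.query = [item for sublist in args.query for item in sublist]
assert args.query == ['a.sig', 'b.sig', 'c.sig']

Each query then yields three outputs named after its basename: <query>.csv, <query>.matches.sig, and <query>.unassigned.sig.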
sourmash/lca/command_gather.py: 41 additions, 32 deletions
@@ -82,70 +82,69 @@ def gather_signature(query_sig, dblist, ignore_abundance):
md5_to_name = {}

x = set()
-    for hashval in query_mins:
-        for lca_db in dblist:
-            lineage_ids = lca_db.hashval_to_lineage_id.get(hashval, [])
-            for lid in lineage_ids:
-                md5 = lca_db.lineage_id_to_signature[lid]
-                x.add((lca_db, lid, md5))
-
-    for lca_db, lid, md5 in x:
-        md5_to_lineage[md5] = lca_db.lineage_dict[lid]
-        if lca_db.signature_to_name:
-            md5_to_name[md5] = lca_db.signature_to_name[md5]
-        else:
-            md5_to_name[md5] = ''
+    for lca_db in dblist:
+        siglist = []
+        idx_set = set()
+
+        for hashval in query_mins:
+            idx_set.update(lca_db.hashval_to_idx.get(hashval, []))
+
+        for idx in idx_set:
+            pass               # WIP: loop body not yet filled in

# now! do the gather:
while 1:
# find all of the assignments for the current set of hashes
assignments = defaultdict(set)
for hashval in query_mins:
for lca_db in dblist:
-                lineage_ids = lca_db.hashval_to_lineage_id.get(hashval, [])
-                for lid in lineage_ids:
-                    md5 = lca_db.lineage_id_to_signature[lid]
-                    signature_size = lca_db.lineage_id_counts[lid]
-                    assignments[hashval].add((md5, signature_size))
+                idx_list = lca_db.hashval_to_idx.get(hashval, [])
+
+                for idx in idx_list:
+                    assignments[hashval].add((lca_db, idx))

# none? quit.
if not assignments:
break

# count the distinct signatures.
counts = Counter()
-        for hashval, md5_set in assignments.items():
-            for (md5, sigsize) in md5_set:
-                counts[(md5, sigsize)] += 1
+        for hashval, match_set in assignments.items():
+            for (lca_db, idx) in match_set:
+                counts[(lca_db, idx)] += 1

# collect the most abundant assignments
common_iter = iter(counts.most_common())
best_list = []
-        (md5, sigsize), top_count = next(common_iter)
+        (best_lca_db, best_idx), top_count = next(common_iter)

-        best_list.append((md5_to_name[md5], md5, sigsize))
-        for (md5, sigsize), count in common_iter:
+        best_list.append((best_lca_db, best_idx))
+        for (lca_db, idx), count in common_iter:
            if count != top_count:
                break
-            best_list.append((md5_to_name[md5], md5, sigsize))
+            best_list.append((lca_db, idx))

# sort on name and pick the top (for consistency).
-        best_list.sort()
-        _, top_md5, top_sigsize = best_list[0]
+        #best_list.sort()
+        #_, top_md5, top_sigsize = best_list[0]

equiv_counts = len(best_list) - 1

# now, remove from query mins.
intersect_mins = set()
-        for hashval, md5_set in assignments.items():
-            if (top_md5, top_sigsize) in md5_set:
+        for hashval, match_set in assignments.items():
+            if (best_lca_db, best_idx) in match_set:
query_mins.remove(hashval)
intersect_mins.add(hashval)

# should match!
assert top_count == len(intersect_mins)

# calculate size of match (# of hashvals belonging to that sig)
-        match_size = top_sigsize
+        match_size = 0
+        for hashval, idx_list in best_lca_db.hashval_to_idx.items():
+            if best_idx in idx_list:
+                match_size += 1

# construct 'result' object
intersect_bp = top_count * query_sig.minhash.scaled
@@ -155,13 +154,23 @@ def gather_signature(query_sig, dblist, ignore_abundance):
/ len(intersect_mins)
f_match = len(intersect_mins) / match_size

+        # XXX name and lineage
+        for ident, idx in best_lca_db.ident_to_idx.items():
+            if idx == best_idx:
+                name = best_lca_db.ident_to_name[ident]
+
+        lid = best_lca_db.idx_to_lid.get(best_idx)
+        lineage = ()
+        if lid is not None:
+            lineage = best_lca_db.lid_to_lineage[lid]
+
result = LCAGatherResult(intersect_bp = intersect_bp,
f_unique_to_query= top_count / n_mins,
f_unique_weighted=f_unique_weighted,
average_abund=average_abund,
f_match=f_match,
-                                 lineage=md5_to_lineage[top_md5],
-                                 name=md5_to_name[top_md5],
+                                 lineage=lineage,
+                                 name=name,
n_equal_matches=equiv_counts)

f_unassigned = len(query_mins) / n_mins
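To see how the refactored lookups fit together: the new LCA database keys everything on small integer indices (an idx per signature, a lid per lineage) instead of md5s and lineage ids. A toy sketch of that layout with hypothetical values; the attribute names match those used above, but this class is illustrative, not the sourmash implementation:

from collections import namedtuple

LineagePair = namedtuple('LineagePair', ['rank', 'name'])

class ToyLCADatabase:
    """Illustrative stand-in for the refactored LCA database."""
    def __init__(self):
        # hash value -> list of signature indices containing it
        self.hashval_to_idx = {1000: [0], 2000: [0, 1], 3000: [1]}
        # identifier <-> signature index, plus display names
        self.ident_to_idx = {'GCF_000001': 0, 'GCF_000002': 1}
        self.ident_to_name = {'GCF_000001': 'Escherichia coli K-12',
                              'GCF_000002': 'Salmonella enterica'}
        # signature index -> lineage id; identities with no lineage
        # assignment are simply absent (cf. the commit message)
        self.idx_to_lid = {0: 0}
        # lineage id -> lineage tuple
        self.lid_to_lineage = {
            0: (LineagePair('superkingdom', 'Bacteria'),
                LineagePair('genus', 'Escherichia')),
        }

db = ToyLCADatabase()

# resolve one hash value to names and lineages, mirroring gather_signature
for idx in db.hashval_to_idx[2000]:
    ident = next(i for i, j in db.ident_to_idx.items() if j == idx)
    lid = db.idx_to_lid.get(idx)
    lineage = db.lid_to_lineage[lid] if lid is not None else ()
    print(ident, db.ident_to_name[ident], lineage)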
