Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Temporary branch for a metagenome workshop at UCSC #188

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
23f9137
enrich -o output, provide column headers, eliminate --csv on sbt_gather
ctb Apr 17, 2017
609bf01
fix tests, add --save-matches test
ctb Apr 17, 2017
0b19027
simplify sbt_gather a bit by doing legacy hll compute in minhash max_…
ctb Apr 19, 2017
fc89bd3
hackity hack add in downsample
ctb Apr 19, 2017
b8eac7d
Merge branch 'master' of github.com:dib-lab/sourmash into update/gath…
ctb Apr 19, 2017
0956b4f
add in containment
ctb Apr 19, 2017
28fb64e
Merge branch 'fix/sbt_search' into 2017-ucsc-metagenome
ctb Apr 22, 2017
4fc8aff
fix legacy max_hash
ctb Apr 22, 2017
5821a1b
Merge branch 'fix/sbt_search' into 2017-ucsc-metagenome
ctb Apr 22, 2017
66f1c10
fixed some super dumb errors in sourmash sbt_gather
ctb Apr 22, 2017
264a598
Merge branch 'update/gather_out' into 2017-ucsc-metagenome
ctb Apr 22, 2017
a11fdf3
fix print -> notify
ctb Apr 22, 2017
b4b3c76
Merge branch 'update/gather_out' into 2017-ucsc-metagenome
ctb Apr 22, 2017
65e5928
fix notifications
ctb Apr 22, 2017
ddc0b6b
Merge branch 'update/gather_out' into 2017-ucsc-metagenome
ctb Apr 22, 2017
69b8ae6
matplotlib fix
ctb Apr 22, 2017
1b32aed
Merge branch 'update/gather_out' into 2017-ucsc-metagenome
ctb Apr 22, 2017
bc74e1f
avoid printing out really large matrices
ctb Apr 23, 2017
ad8d3bb
Merge branch 'update/gather_out' into 2017-ucsc-metagenome
ctb Apr 23, 2017
92fc2fb
update output format
ctb Apr 24, 2017
1ff3e69
fix sbt_search --best-only
ctb May 2, 2017
e0a4c0b
clean up sbt_search output a bit
ctb May 2, 2017
394e3cb
Merge branch 'fix/sbt_search' of github.com:dib-lab/sourmash into 201…
ctb May 9, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
28 changes: 26 additions & 2 deletions sourmash_lib/_minhash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,14 @@ cdef class MinHash(object):
def max_hash(self):
mm = deref(self._this).max_hash
if mm == 18446744073709551615:
return 0
mm = 0

# legacy - check cardinality => estimate
if mm == 0:
if self.hll:
genome_size = self.hll.estimate_cardinality()
mm = max(self.get_mins())

return mm

def add_hash(self, uint64_t h):
Expand Down Expand Up @@ -248,9 +255,20 @@ cdef class MinHash(object):

return a

def downsample_max_hash(self, *others):
    """\
    Downsample this MinHash to the smallest max_hash of self and *others.

    Computes the lowest max_hash across all sketches, converts it to the
    equivalent scaled value, and delegates to downsample_scaled.

    Raises ValueError if any sketch has max_hash == 0 (i.e. is not a
    scaled signature), since no scaled value can be derived from it.
    """
    max_hashes = [ x.max_hash for x in others ]
    new_max_hash = min(self.max_hash, *max_hashes)

    # a max_hash of 0 marks a non-scaled signature; without this guard
    # the division below raises a cryptic ZeroDivisionError.
    if new_max_hash == 0:
        raise ValueError('no max_hash available - cannot downsample')

    new_scaled = int(get_minhash_max_hash() / new_max_hash)

    return self.downsample_scaled(new_scaled)

def downsample_scaled(self, new_num):
old_scaled = int(get_minhash_max_hash() / self.max_hash)
max_hash = self.max_hash

if max_hash is None:
raise ValueError('no max_hash available - cannot downsample')

old_scaled = int(get_minhash_max_hash() / self.max_hash)
if old_scaled > new_num:
raise ValueError('new scaled is lower than current sample scaled')

Expand Down Expand Up @@ -345,6 +363,12 @@ cdef class MinHash(object):
distance = 2*math.acos(prod) / math.pi
return 1.0 - distance

def containment(self, other):
    """\
    Calculate containment of self by other.

    Returns the fraction of self's hashes that are also present in
    other (count_common / number of hashes in self).

    Raises ValueError if self contains no hashes, instead of the
    cryptic ZeroDivisionError the bare division would produce.
    """
    mins = self.get_mins()
    if not mins:
        raise ValueError('cannot compute containment of empty MinHash')
    return self.count_common(other) / len(mins)

def similarity_ignore_maxhash(self, MinHash other):
a = set(self.get_mins())

Expand Down
140 changes: 74 additions & 66 deletions sourmash_lib/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def search(args):
parser.add_argument('-k', '--ksize', default=DEFAULT_K, type=int)
parser.add_argument('-f', '--force', action='store_true')
parser.add_argument('--save-matches', type=argparse.FileType('wt'))
parser.add_argument('--containment', action='store_true')

sourmash_args.add_moltype_args(parser)

Expand Down Expand Up @@ -62,7 +63,10 @@ def search(args):
# compute query x db
distances = []
for (x, filename) in against:
distance = query.similarity(x)
if args.containment:
distance = query.containment(x)
else:
distance = query.similarity(x)
if distance >= args.threshold:
distances.append((distance, x, filename))

Expand Down Expand Up @@ -224,8 +228,7 @@ def save_siglist(siglist, output_fp, filename=None):
sig.save_signatures(siglist, fp)

if args.track_abundance:
print('Tracking abundance of input k-mers.',
file=sys.stderr)
notify('Tracking abundance of input k-mers.')

if not args.merge:
if args.output:
Expand All @@ -235,8 +238,7 @@ def save_siglist(siglist, output_fp, filename=None):
sigfile = os.path.basename(filename) + '.sig'
if not args.output and os.path.exists(sigfile) and not \
args.force:
print('skipping', filename, '- already done',
file=sys.stderr)
notify('skipping {} - already done', filename)
continue

if args.singleton:
Expand All @@ -249,20 +251,19 @@ def save_siglist(siglist, output_fp, filename=None):

siglist += build_siglist(args.email, Elist, filename,
name=record.name)
print('calculated {} signatures for {} sequences in {}'.\
notify('calculated {} signatures for {} sequences in {}'.\
format(len(siglist), n + 1, filename))
else:
# make minhashes for the whole file
Elist = make_minhashes()

# consume & calculate signatures
print('... reading sequences from', filename,
file=sys.stderr)
notify('... reading sequences from {}', filename)
name = None
for n, record in enumerate(screed.open(filename)):
if n % 10000 == 0:
if n:
print('...', filename, n, file=sys.stderr)
notify('...{} {}', filename, n)
elif args.name_from_first:
name = record.name

Expand All @@ -275,7 +276,7 @@ def save_siglist(siglist, output_fp, filename=None):
siglist += sigs
else:
siglist = sigs
print('calculated {} signatures for {} sequences in {}'.\
notify('calculated {} signatures for {} sequences in {}'.\
format(len(siglist), n + 1, filename))

if not args.output:
Expand All @@ -289,18 +290,17 @@ def save_siglist(siglist, output_fp, filename=None):

for filename in args.filenames:
# consume & calculate signatures
print('... reading sequences from', filename,
file=sys.stderr)
notify('... reading sequences from', filename)
for n, record in enumerate(screed.open(filename)):
if n % 10000 == 0 and n:
print('...', filename, n, file=sys.stderr)
notify('...', filename, n)

add_seq(Elist, record.sequence,
args.input_is_protein, args.check_sequence)

siglist = build_siglist(args.email, Elist, filename,
name=args.merge)
print('calculated {} signatures for {} sequences taken from {}'.\
notify('calculated {} signatures for {} sequences taken from {}'.\
format(len(siglist), n + 1, " ".join(args.filenames)))
# at end, save!
save_siglist(siglist, args.output)
Expand All @@ -321,7 +321,7 @@ def compare(args):
# load in the various signatures
siglist = []
for filename in args.signatures:
print('loading', filename, file=sys.stderr)
notify('loading {}', filename)
loaded = sig.load_signatures(filename, select_ksize=args.ksize)
loaded = list(loaded)
if not loaded:
Expand All @@ -343,7 +343,8 @@ def compare(args):
for j, E2 in enumerate(siglist):
D[i][j] = E.similarity(E2, args.ignore_abundance)

print('%d-%20s\t%s' % (i, E.name(), D[i, :, ],))
if len(siglist) < 30:
print('%d-%20s\t%s' % (i, E.name(), D[i, :, ],))
labeltext.append(E.name())

notify('min similarity in matrix: {}', numpy.min(D))
Expand All @@ -362,6 +363,8 @@ def compare(args):

def plot(args):
"Produce a clustering and plot."
import matplotlib as mpl
mpl.use('Agg')
import numpy
import scipy
import pylab
Expand Down Expand Up @@ -603,12 +606,18 @@ def sbt_search(args):

results = []
for leaf in tree.find(search_fn, query, args.threshold):
results.append((query.similarity(leaf.data), leaf.data))
#results.append((leaf.data.similarity(ss), leaf.data))
results.append((query.similarity(leaf.data, downsample=True),
leaf.data))

results.sort(key=lambda x: -x[0]) # reverse sort on similarity

if args.best_only:
notify("(truncated search because of --best-only; only trust top result")

notify("similarity match")
notify("---------- -----")
for (similarity, query) in results:
print('{:.2f} {}'.format(similarity, query.name()))
print('{:2.1f}% {}'.format(similarity*100, query.name()))

if args.save_matches:
outname = args.save_matches.name
Expand Down Expand Up @@ -691,6 +700,7 @@ def categorize(args):
if loader.skipped_nosig:
notify('skipped/nosig: {}', loader.skipped_nosig)


def sbt_gather(args):
from sourmash_lib.sbt import SBT, GraphFactory
from sourmash_lib.sbtmh import search_minhashes, SigLeaf
Expand All @@ -702,8 +712,8 @@ def sbt_gather(args):
parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
parser.add_argument('--threshold', default=0.05, type=float)
parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
parser.add_argument('--csv', type=argparse.FileType('wt'))
parser.add_argument('--save-matches', type=argparse.FileType('wt'))
parser.add_argument('--threshold-bp', type=float, default=5e4)

sourmash_args.add_moltype_args(parser)

Expand All @@ -728,7 +738,6 @@ def sbt_gather(args):
error('query signature needs to be created with --scaled')
sys.exit(-1)

notify('query signature has max_hash: {}', query.minhash.max_hash)
orig_query = query
orig_mins = orig_query.minhash.get_hashes()

Expand Down Expand Up @@ -761,6 +770,19 @@ def build_new_signature(mins):
e.add_many(mins)
return sig.SourmashSignature('', e)

# xxx
def format_bp(bp):
    """Return a human-readable string for a quantity of base pairs.

    Chooses bp / kbp / Mbp / Gbp units so the mantissa stays below 500,
    formatting with one decimal place for the larger units. Values of
    500 Gbp or more return '???'.
    """
    bp = float(bp)
    if bp < 500:
        # the 'd' presentation type rejects floats, so convert to int
        # explicitly; the original '{:d}'.format(bp) raised ValueError.
        return '{:d} bp '.format(int(bp))
    elif bp < 500e3:
        # use '<' like the other branches (was '<=', inconsistent at 500e3)
        return '{:.1f} kbp'.format(round(bp / 1e3, 1))
    elif bp < 500e6:
        return '{:.1f} Mbp'.format(round(bp / 1e6, 1))
    elif bp < 500e9:
        return '{:.1f} Gbp'.format(round(bp / 1e9, 1))
    return '???'

# construct a new query that doesn't have the max_hash attribute set.
new_mins = query.minhash.get_hashes()
query = build_new_signature(new_mins)
Expand All @@ -779,21 +801,17 @@ def build_new_signature(mins):
# figure out what the resolution of the banding on the genome is,
# based either on an explicit --scaled parameter, or on genome
# cardinality (deprecated)
if best_leaf.minhash.max_hash:
R_genome = sourmash_lib.MAX_HASH / \
float(best_leaf.minhash.max_hash)
elif best_leaf.minhash.hll:
genome_size = best_leaf.minhash.hll.estimate_cardinality()
genome_max_hash = max(found_mins)
R_genome = float(genome_size) / float(genome_max_hash)
else:
if not best_leaf.minhash.max_hash:
error('Best hash match in sbt_gather has no cardinality')
error('Please prepare database of sequences with --scaled')
sys.exit(-1)

R_genome = sourmash_lib.MAX_HASH / float(best_leaf.minhash.max_hash)

# pick the highest R / lowest resolution
R_comparison = max(R_metagenome, R_genome)

# CTB: these could probably be replaced by minhash.downsample_scaled.
new_max_hash = sourmash_lib.MAX_HASH / float(R_comparison)
query_mins = set([ i for i in query_mins if i < new_max_hash ])
found_mins = set([ i for i in found_mins if i < new_max_hash ])
Expand All @@ -802,72 +820,62 @@ def build_new_signature(mins):
# calculate intersection:
intersect_mins = query_mins.intersection(found_mins)
intersect_orig_mins = orig_mins.intersection(found_mins)
intersect_bp = R_comparison * len(intersect_orig_mins)
sum_found += len(intersect_mins)

if len(intersect_mins) < 5: # hard cutoff for now
notify('found only {} hashes in common.', len(intersect_mins))
notify('this is below a sane threshold => exiting.')
if intersect_bp < args.threshold_bp: # hard cutoff for now
notify('found less than {} in common. => exiting',
format_bp(intersect_bp))
break

# calculate fractions wrt first denominator - genome size
genome_n_mins = len(found_mins)
f_genome = len(intersect_mins) / float(genome_n_mins)
f_orig_query = len(intersect_orig_mins) / float(genome_n_mins)
f_orig_query = len(intersect_orig_mins) / float(len(orig_mins))

# calculate fractions wrt second denominator - metagenome size
query_n_mins = len(orig_query.minhash.get_hashes())
f_query = len(intersect_mins) / float(query_n_mins)

if not len(found): # first result? print header.
notify("")
notify("overlap p_query p_genome")
notify("------- ------- --------")

# print interim result & save in a list for later use
notify('found: {:.2f} {:.2f} {}', f_genome, f_query,
best_leaf.name())
found.append((f_genome, best_leaf, f_query))
notify('{:8s} {:-5.1f}% {:-5.1f}% {}',
format_bp(intersect_bp), f_orig_query*100,
f_genome*100, best_leaf.name()[:40])
found.append((intersect_bp, f_orig_query, best_leaf, f_genome))

# construct a new query, minus the previous one.
query_mins -= set(found_mins)
query = build_new_signature(query_mins)

# basic reporting
notify('found {} matches total', len(found))
notify('the recovered matches hit {:.1f}% of the query',
100. * sum_found / len(orig_query.minhash.get_hashes()))
notify('\nfound {} matches total;', len(found))

sum_found /= len(orig_query.minhash.get_hashes())
notify('the recovered matches hit {:.1f}% of the query', sum_found * 100)
notify('')

if not found:
sys.exit(0)

# sort by fraction of genome (first key) - change this?
found.sort(key=lambda x: x[0])
found.reverse()

notify('Composition:')
for (frac, leaf_sketch, genome_fraction) in found:
notify('{:.2f} {:.2f} {}', frac, genome_fraction, leaf_sketch.name())

if args.output:
print('Composition:', file=args.output)
for (frac, leaf_sketch, genome_fraction) in found:
print('{:.2f} {:.f} {}'.format(frac, genome_fraction,
leaf_sketch.name()),
file=args.output)

if args.csv:
fieldnames = ['fraction', 'name', 'sketch_kmers', 'genome_fraction']
w = csv.DictWriter(args.csv, fieldnames=fieldnames)

fieldnames = ['intersect_bp', 'f_orig_query', 'f_found_genome', 'name']
w = csv.DictWriter(args.output, fieldnames=fieldnames)
w.writeheader()
for (frac, leaf_sketch, genome_fraction) in found:
cardinality = 0
if leaf_sketch.minhash.hll:
cardinality = leaf_sketch.minhash.hll.estimate_cardinality()
w.writerow(dict(fraction=frac, name=leaf_sketch.name(),
sketch_kmers=cardinality,
genome_fraction=genome_fraction))
for (intersect_bp, f_genome, leaf, f_orig_query) in found:
w.writerow(dict(intersect_bp=intersect_bp,
f_orig_query=f_orig_query, name=leaf.name(),
f_found_genome=f_genome,))

if args.save_matches:
outname = args.save_matches.name
notify('saving all matches to "{}"', outname)
sig.save_signatures([ ss for (f, ss) in found ],
args.save_matches)
sig.save_signatures([ ss for (_, _, ss, _) in found ],
args.save_matches)


def watch(args):
Expand Down