From 67aba7973273537cc6ecd83ba6289d6ed06c1f4b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 16 Jul 2021 14:46:58 -0700 Subject: [PATCH] [MRG] refactor and simplify `sourmash sig` subcommand signature loading (#1672) * add -f and --from-file to downsample * upgrade downsample error handling and output * more better status output for downsample * use new sig loading in cat, describe, and split * support --from-file and force for rest of sig subcommands * remove unused import * trap expected errors and print out nicely * turn LoadManySignatures into a generator * test force * add num test for sig split * add some flatten tests for code cov * add test for both --num and --scaled * add picklist checks to everything * minor cleanup * minor cleanup * factor out common argparse * clean up and test error output * add many more tests etc. * add picklist test for sig cat * remove picklists from subtract for now Co-authored-by: Tessa Pierce Ward --- src/sourmash/cli/categorize.py | 2 - src/sourmash/cli/compute.py | 2 - src/sourmash/cli/import_csv.py | 1 - src/sourmash/cli/sig/cat.py | 7 +- src/sourmash/cli/sig/describe.py | 17 +- src/sourmash/cli/sig/downsample.py | 16 +- src/sourmash/cli/sig/export.py | 2 - src/sourmash/cli/sig/extract.py | 12 +- src/sourmash/cli/sig/filter.py | 2 - src/sourmash/cli/sig/flatten.py | 16 +- src/sourmash/cli/sig/ingest.py | 2 - src/sourmash/cli/sig/intersect.py | 16 +- src/sourmash/cli/sig/manifest.py | 3 - src/sourmash/cli/sig/merge.py | 16 +- src/sourmash/cli/sig/rename.py | 14 +- src/sourmash/cli/sig/split.py | 16 +- src/sourmash/cli/sig/subtract.py | 4 +- src/sourmash/sig/__main__.py | 527 +++++++++++++++-------------- src/sourmash/sourmash_args.py | 43 +++ tests/test_cmd_signature.py | 336 +++++++++++++++++- 20 files changed, 746 insertions(+), 308 deletions(-) diff --git a/src/sourmash/cli/categorize.py b/src/sourmash/cli/categorize.py index 8c5692409c..eb8c995f8d 100644 --- a/src/sourmash/cli/categorize.py +++ 
b/src/sourmash/cli/categorize.py @@ -1,7 +1,5 @@ "'sourmash categorize' - query an SBT for bes match, with many signatures." -import argparse - from sourmash.cli.utils import add_ksize_arg, add_moltype_args diff --git a/src/sourmash/cli/compute.py b/src/sourmash/cli/compute.py index 7291bab0f5..6b9db002e2 100644 --- a/src/sourmash/cli/compute.py +++ b/src/sourmash/cli/compute.py @@ -29,8 +29,6 @@ --- """ -from argparse import FileType - from sourmash.minhash import get_minhash_default_seed from sourmash.cli.utils import add_construct_moltype_args diff --git a/src/sourmash/cli/import_csv.py b/src/sourmash/cli/import_csv.py index 13e11991de..77fcbd14f8 100644 --- a/src/sourmash/cli/import_csv.py +++ b/src/sourmash/cli/import_csv.py @@ -1,6 +1,5 @@ """'sourmash import_csv' description goes here""" -import sys from sourmash.logging import notify diff --git a/src/sourmash/cli/sig/cat.py b/src/sourmash/cli/sig/cat.py index 6da03f886e..6e80b7b5f4 100644 --- a/src/sourmash/cli/sig/cat.py +++ b/src/sourmash/cli/sig/cat.py @@ -1,7 +1,7 @@ """concatenate signature files""" -import sourmash -from sourmash.logging import notify, print_results, error +from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, + add_picklist_args) def subparser(subparsers): @@ -27,6 +27,9 @@ def subparser(subparsers): '-f', '--force', action='store_true', help='try to load all files as signatures' ) + add_ksize_arg(subparser, 31) + add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/sig/describe.py b/src/sourmash/cli/sig/describe.py index e94e25c41f..ca382732d8 100644 --- a/src/sourmash/cli/sig/describe.py +++ b/src/sourmash/cli/sig/describe.py @@ -1,12 +1,12 @@ """show details of signature""" -import sourmash -from sourmash.logging import notify, print_results, error +from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, + add_picklist_args) def subparser(subparsers): subparser = subparsers.add_parser('describe') - 
subparser.add_argument('signatures', nargs='+') + subparser.add_argument('signatures', nargs='*') subparser.add_argument( '-q', '--quiet', action='store_true', help='suppress non-error output' @@ -15,6 +15,17 @@ def subparser(subparsers): '--csv', metavar='FILE', help='output information to a CSV file' ) + subparser.add_argument( + '-f', '--force', action='store_true', + help='try to load all files as signatures' + ) + subparser.add_argument( + '--from-file', + help='a text file containing a list of files to load signatures from' + ) + add_ksize_arg(subparser, 31) + add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/sig/downsample.py b/src/sourmash/cli/sig/downsample.py index f9e94fd3f6..aafee7bfd5 100644 --- a/src/sourmash/cli/sig/downsample.py +++ b/src/sourmash/cli/sig/downsample.py @@ -1,17 +1,20 @@ """downsample one or more signatures""" -import sys - -from sourmash.cli.utils import add_moltype_args, add_ksize_arg +from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, + add_picklist_args) def subparser(subparsers): subparser = subparsers.add_parser('downsample') - subparser.add_argument('signatures', nargs="+") + subparser.add_argument('signatures', nargs="*") subparser.add_argument( '--scaled', type=int, default=0, help='scaled value to downsample to' ) + subparser.add_argument( + '--from-file', + help='a text file containing a list of files to load signatures from' + ) subparser.add_argument( '--num', metavar='N', type=int, default=0, help='num value to downsample to' @@ -25,8 +28,13 @@ def subparser(subparsers): help='output signature to this file (default stdout)', default='-', ) + subparser.add_argument( + '-f', '--force', action='store_true', + help='try to load all files as signatures' + ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/sig/export.py b/src/sourmash/cli/sig/export.py index 
3f181e7b10..5dfe2793d1 100644 --- a/src/sourmash/cli/sig/export.py +++ b/src/sourmash/cli/sig/export.py @@ -1,7 +1,5 @@ """export a signature, e.g. to mash""" -import sys - from sourmash.cli.utils import add_ksize_arg, add_moltype_args diff --git a/src/sourmash/cli/sig/extract.py b/src/sourmash/cli/sig/extract.py index 9ea71eb229..7bc601a2a3 100644 --- a/src/sourmash/cli/sig/extract.py +++ b/src/sourmash/cli/sig/extract.py @@ -1,14 +1,12 @@ """extract one or more signatures""" -import sys - from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, add_picklist_args) def subparser(subparsers): subparser = subparsers.add_parser('extract') - subparser.add_argument('signatures', nargs='+') + subparser.add_argument('signatures', nargs='*') subparser.add_argument( '-q', '--quiet', action='store_true', help='suppress non-error output' @@ -26,6 +24,14 @@ def subparser(subparsers): '--name', default=None, help='select signatures whose name contains this substring' ) + subparser.add_argument( + '-f', '--force', action='store_true', + help='try to load all files as signatures' + ) + subparser.add_argument( + '--from-file', + help='a text file containing a list of files to load signatures from' + ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) add_picklist_args(subparser) diff --git a/src/sourmash/cli/sig/filter.py b/src/sourmash/cli/sig/filter.py index 41c3ec0bce..7cd7f34885 100644 --- a/src/sourmash/cli/sig/filter.py +++ b/src/sourmash/cli/sig/filter.py @@ -1,7 +1,5 @@ """filter k-mers on abundance""" -import sys - from sourmash.cli.utils import add_moltype_args, add_ksize_arg diff --git a/src/sourmash/cli/sig/flatten.py b/src/sourmash/cli/sig/flatten.py index 6bc5538bcf..98d0990ac5 100644 --- a/src/sourmash/cli/sig/flatten.py +++ b/src/sourmash/cli/sig/flatten.py @@ -1,13 +1,12 @@ """remove abundances""" -import sys - -from sourmash.cli.utils import add_moltype_args, add_ksize_arg +from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, + 
add_picklist_args) def subparser(subparsers): subparser = subparsers.add_parser('flatten') - subparser.add_argument('signatures', nargs='+') + subparser.add_argument('signatures', nargs='*') subparser.add_argument( '-q', '--quiet', action='store_true', help='suppress non-error output' @@ -25,8 +24,17 @@ def subparser(subparsers): '--name', default=None, help='select signatures whose name contains this substring' ) + subparser.add_argument( + '-f', '--force', action='store_true', + help='try to load all files as signatures' + ) + subparser.add_argument( + '--from-file', + help='a text file containing a list of files to load signatures from' + ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/sig/ingest.py b/src/sourmash/cli/sig/ingest.py index fd37983abf..b8a642236f 100644 --- a/src/sourmash/cli/sig/ingest.py +++ b/src/sourmash/cli/sig/ingest.py @@ -1,7 +1,5 @@ """ingest/import a mash or other signature""" -import sys - def subparser(subparsers): # Dirty hack to simultaneously support new and previous interface diff --git a/src/sourmash/cli/sig/intersect.py b/src/sourmash/cli/sig/intersect.py index 0f8fa09672..542eba8a3b 100644 --- a/src/sourmash/cli/sig/intersect.py +++ b/src/sourmash/cli/sig/intersect.py @@ -1,13 +1,12 @@ """intersect one or more signatures""" -import sys - -from sourmash.cli.utils import add_moltype_args, add_ksize_arg +from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, + add_picklist_args) def subparser(subparsers): subparser = subparsers.add_parser('intersect') - subparser.add_argument('signatures', nargs='+') + subparser.add_argument('signatures', nargs='*') subparser.add_argument( '-q', '--quiet', action='store_true', help='suppress non-error output' @@ -20,8 +19,17 @@ def subparser(subparsers): '-A', '--abundances-from', metavar='FILE', help='intersect with & take abundances from this signature' ) + subparser.add_argument( + '-f', '--force', 
action='store_true', + help='try to load all files as signatures' + ) + subparser.add_argument( + '--from-file', + help='a text file containing a list of files to load signatures from' + ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/sig/manifest.py b/src/sourmash/cli/sig/manifest.py index 805c9d1a56..f6797be731 100644 --- a/src/sourmash/cli/sig/manifest.py +++ b/src/sourmash/cli/sig/manifest.py @@ -1,8 +1,5 @@ """create a manifest for a collection of signatures""" -import sourmash -from sourmash.logging import notify, print_results, error - def subparser(subparsers): subparser = subparsers.add_parser('manifest') diff --git a/src/sourmash/cli/sig/merge.py b/src/sourmash/cli/sig/merge.py index a6dde37935..34cd7ac658 100644 --- a/src/sourmash/cli/sig/merge.py +++ b/src/sourmash/cli/sig/merge.py @@ -1,13 +1,12 @@ """merge one or more signatures""" -import sys - -from sourmash.cli.utils import add_moltype_args, add_ksize_arg +from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, + add_picklist_args) def subparser(subparsers): subparser = subparsers.add_parser('merge') - subparser.add_argument('signatures', nargs='+') + subparser.add_argument('signatures', nargs='*') subparser.add_argument( '-q', '--quiet', action='store_true', help='suppress non-error output' @@ -24,8 +23,17 @@ def subparser(subparsers): '--name', help='rename merged signature' ) + subparser.add_argument( + '-f', '--force', action='store_true', + help='try to load all files as signatures' + ) + subparser.add_argument( + '--from-file', + help='a text file containing a list of files to load signatures from' + ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/sig/rename.py b/src/sourmash/cli/sig/rename.py index ea60dceabd..2a6c7c4de9 100644 --- a/src/sourmash/cli/sig/rename.py +++ b/src/sourmash/cli/sig/rename.py @@ -1,11 
+1,12 @@ """rename signature""" -from sourmash.cli.utils import add_ksize_arg, add_moltype_args +from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, + add_picklist_args) def subparser(subparsers): subparser = subparsers.add_parser('rename') - subparser.add_argument('sigfiles', nargs='+') + subparser.add_argument('signatures', nargs='*') subparser.add_argument('name') subparser.add_argument( '-q', '--quiet', action='store_true', @@ -20,8 +21,17 @@ def subparser(subparsers): help='output renamed signature to this file (default stdout)', default='-' ) + subparser.add_argument( + '-f', '--force', action='store_true', + help='try to load all files as signatures' + ) + subparser.add_argument( + '--from-file', + help='a text file containing a list of files to load signatures from' + ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/sig/split.py b/src/sourmash/cli/sig/split.py index 7b062336e8..d2b8835981 100644 --- a/src/sourmash/cli/sig/split.py +++ b/src/sourmash/cli/sig/split.py @@ -1,11 +1,12 @@ """concatenate signature files""" -import sourmash +from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, + add_picklist_args) def subparser(subparsers): subparser = subparsers.add_parser('split') - subparser.add_argument('signatures', nargs='+') + subparser.add_argument('signatures', nargs='*') subparser.add_argument( '-q', '--quiet', action='store_true', help='suppress non-error output' @@ -13,6 +14,17 @@ def subparser(subparsers): subparser.add_argument( '--outdir', help='output signatures to this directory' ) + subparser.add_argument( + '-f', '--force', action='store_true', + help='try to load all files as signatures' + ) + subparser.add_argument( + '--from-file', + help='a text file containing a list of files to load signatures from' + ) + add_ksize_arg(subparser, 31) + add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git 
a/src/sourmash/cli/sig/subtract.py b/src/sourmash/cli/sig/subtract.py index 6a2dc77771..483a6cc027 100644 --- a/src/sourmash/cli/sig/subtract.py +++ b/src/sourmash/cli/sig/subtract.py @@ -1,8 +1,6 @@ """subtract one or more signatures""" -import sys - -from sourmash.cli.utils import add_moltype_args, add_ksize_arg +from sourmash.cli.utils import (add_moltype_args, add_ksize_arg) def subparser(subparsers): diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 0f7c412cca..698454c42e 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -13,7 +13,7 @@ from sourmash.logging import set_quiet, error, notify, print_results, debug from sourmash import sourmash_args from sourmash.minhash import _get_max_hash_for_scaled -from sourmash.picklist import SignaturePicklist + usage=''' sourmash signature [] - manipulate/work with signature files. @@ -47,6 +47,13 @@ def _check_abundance_compatibility(sig1, sig2): raise ValueError("incompatible signatures: track_abundance is {} in first sig, {} in second".format(sig1.minhash.track_abundance, sig2.minhash.track_abundance)) +def _extend_signatures_with_from_file(args): + # extend input signatures with --from-file + if args.from_file: + more_files = sourmash_args.load_pathlist_from_file(args.from_file) + args.signatures = list(args.signatures) + args.signatures.extend(more_files) + def _set_num_scaled(mh, num, scaled): "set num and scaled values on a MinHash object" mh_params = list(mh.__getstate__()) @@ -67,40 +74,37 @@ def cat(args): concatenate all signatures into one file. 
""" set_quiet(args.quiet) + moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) encountered_md5sums = defaultdict(int) # used by --unique - progress = sourmash_args.SignatureLoadingProgress() + # open output for saving sigs save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) save_sigs.open() - if args.from_file: - more_files = sourmash_args.load_pathlist_from_file(args.from_file) - args.signatures = list(args.signatures) - args.signatures.extend(more_files) - - for sigfile in args.signatures: - try: - loader = sourmash_args.load_file_as_signatures(sigfile, - progress=progress, - yield_all_files=args.force) - n_loaded = 0 - for sig in loader: - n_loaded += 1 - - md5 = sig.md5sum() - encountered_md5sums[md5] += 1 - if args.unique and encountered_md5sums[md5] > 1: - continue - - save_sigs.add(sig) - except Exception as exc: - error(str(exc)) - error('(continuing)') + _extend_signatures_with_from_file(args) - notify('loaded {} signatures from {}...', n_loaded, sigfile, end='\r') + # start loading! 
+ progress = sourmash_args.SignatureLoadingProgress() + loader = sourmash_args.load_many_signatures(args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force) + for ss, sigloc in loader: + md5 = ss.md5sum() + encountered_md5sums[md5] += 1 + if args.unique and encountered_md5sums[md5] > 1: + continue + + save_sigs.add(ss) notify('loaded {} signatures total.', len(save_sigs)) + if picklist: + sourmash_args.report_picklist(args, picklist) save_sigs.close() @@ -118,6 +122,9 @@ def split(args): split all signatures into individual files """ set_quiet(args.quiet) + moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) + _extend_signatures_with_from_file(args) output_names = set() output_scaled_template = '{md5sum}.k={ksize}.scaled={scaled}.{moltype}.dup={dup}.{basename}.sig' @@ -129,61 +136,60 @@ def split(args): os.mkdir(args.outdir) progress = sourmash_args.SignatureLoadingProgress() + loader = sourmash_args.load_many_signatures(args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force) - for sigfile in args.signatures: - # load signatures from input file: - this_siglist = sourmash_args.load_file_as_signatures(sigfile, - progress=progress) - + for sig, sigloc in loader: # save each file individually -- - n_signatures = 0 - for sig in this_siglist: - n_signatures += 1 - md5sum = sig.md5sum()[:8] - minhash = sig.minhash - basename = os.path.basename(sig.filename) - if not basename or basename == '-': - basename = 'none' - - params = dict(basename=basename, - md5sum=md5sum, - scaled=minhash.scaled, - ksize=minhash.ksize, - num=minhash.num, - moltype=minhash.moltype) - - if minhash.scaled: - output_template = output_scaled_template - else: # num - assert minhash.num - output_template = output_num_template - - # figure out if this is duplicate, build unique 
filename - n = 0 + md5sum = sig.md5sum()[:8] + minhash = sig.minhash + basename = os.path.basename(sig.filename) + if not basename or basename == '-': + basename = 'none' + + params = dict(basename=basename, + md5sum=md5sum, + scaled=minhash.scaled, + ksize=minhash.ksize, + num=minhash.num, + moltype=minhash.moltype) + + if minhash.scaled: + output_template = output_scaled_template + else: # num + assert minhash.num + output_template = output_num_template + + # figure out if this is duplicate, build unique filename + n = 0 + params['dup'] = n + output_name = output_template.format(**params) + while output_name in output_names: params['dup'] = n output_name = output_template.format(**params) - while output_name in output_names: - params['dup'] = n - output_name = output_template.format(**params) - n += 1 - - output_names.add(output_name) + n += 1 - if args.outdir: - output_name = os.path.join(args.outdir, output_name) + output_names.add(output_name) - if os.path.exists(output_name): - notify("** overwriting existing file {}".format(output_name)) + if args.outdir: + output_name = os.path.join(args.outdir, output_name) - # save! - with open(output_name, 'wt') as outfp: - sourmash.save_signatures([sig], outfp) - notify('writing sig to {}', output_name) + if os.path.exists(output_name): + notify("** overwriting existing file {}".format(output_name)) - notify('loaded {} signatures from {}...', n_signatures, sigfile, - end='\r') + # save! + with open(output_name, 'wt') as outfp: + sourmash.save_signatures([sig], outfp) + notify('writing sig to {}', output_name) notify(f'loaded and split {len(progress)} signatures total.') + if picklist: + sourmash_args.report_picklist(args, picklist) def describe(args): @@ -191,6 +197,9 @@ def describe(args): provide basic info on signatures """ set_quiet(args.quiet) + moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) + _extend_signatures_with_from_file(args) # write CSV? 
w = None @@ -205,37 +214,39 @@ def describe(args): extrasaction='ignore') w.writeheader() - # load signatures and display info. + # start loading! progress = sourmash_args.SignatureLoadingProgress() + loader = sourmash_args.load_many_signatures(args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force) - for signature_file in args.signatures: - try: - idx = sourmash_args.load_file_as_index(signature_file) - loader = idx.signatures_with_location() - - for sig, location in progress.start_file(signature_file, loader): - # extract info, write as appropriate. - mh = sig.minhash - ksize = mh.ksize - moltype = mh.moltype - scaled = mh.scaled - num = mh.num - seed = mh.seed - n_hashes = len(mh) - with_abundance = 0 - if mh.track_abundance: - with_abundance = 1 - md5 = sig.md5sum() - name = sig.name - p_name = name or "** no name **" - filename = sig.filename - p_filename = filename or "** no name **" - license = sig.license - - if w: - w.writerow(locals()) - - print_results('''\ + for sig, location in loader: + # extract info, write as appropriate. 
+ mh = sig.minhash + ksize = mh.ksize + moltype = mh.moltype + scaled = mh.scaled + num = mh.num + seed = mh.seed + n_hashes = len(mh) + with_abundance = 0 + if mh.track_abundance: + with_abundance = 1 + md5 = sig.md5sum() + name = sig.name + p_name = name or "** no name **" + filename = sig.filename + p_filename = filename or "** no name **" + license = sig.license + + if w: + w.writerow(locals()) + + print_results('''\ --- signature filename: {location} signature: {p_name} @@ -246,17 +257,12 @@ def describe(args): signature license: {license} ''', **locals()) - except Exception as exc: - error('\nError while reading signatures from {}:'.format(signature_file)) - error(str(exc)) - error('(continuing)') - raise - - notify(f'loaded {len(progress)} signatures total.') - if csv_fp: csv_fp.close() + if picklist: + sourmash_args.report_picklist(args, picklist) + def manifest(args): """ @@ -384,48 +390,45 @@ def merge(args): """ set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) + _extend_signatures_with_from_file(args) first_sig = None mh = None - total_loaded = 0 - # iterate over all the sigs from all the files. + # start loading! progress = sourmash_args.SignatureLoadingProgress() + loader = sourmash_args.load_many_signatures(args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force) + + for sigobj, sigloc in loader: + # first signature? initialize a bunch of stuff + if first_sig is None: + first_sig = sigobj + mh = first_sig.minhash.copy_and_clear() + + # forcibly remove abundance? 
+ if args.flatten: + mh.track_abundance = False - for sigfile in args.signatures: - notify('loading signatures from {}...', sigfile, end='\r') - this_n = 0 - for sigobj in sourmash_args.load_file_as_signatures(sigfile, - ksize=args.ksize, - select_moltype=moltype, - progress=progress): + try: + sigobj_mh = sigobj.minhash + if not args.flatten: + _check_abundance_compatibility(first_sig, sigobj) + else: + sigobj_mh.track_abundance = False - # first signature? initialize a bunch of stuff - if first_sig is None: - first_sig = sigobj - mh = first_sig.minhash.copy_and_clear() - - # forcibly remove abundance? - if args.flatten: - mh.track_abundance = False - - try: - sigobj_mh = sigobj.minhash - if not args.flatten: - _check_abundance_compatibility(first_sig, sigobj) - else: - sigobj_mh.track_abundance = False - - mh.merge(sigobj_mh) - except: - error("ERROR when merging signature '{}' ({}) from file {}", - sigobj, sigobj.md5sum()[:8], sigfile) - raise - - this_n += 1 - total_loaded += 1 - if this_n: - notify('loaded and merged {} signatures from {}...', this_n, sigfile, end='\r') + mh.merge(sigobj_mh) + except (TypeError, ValueError) as exc: + error("ERROR when merging signature '{}' ({}) from file {}", + sigobj, sigobj.md5sum()[:8], sigloc) + error(str(exc)) + sys.exit(-1) if not len(progress): error("no signatures to merge!?") @@ -438,6 +441,9 @@ def merge(args): notify(f'loaded and merged {len(progress)} signatures') + if picklist: + sourmash_args.report_picklist(args, picklist) + def intersect(args): """ @@ -447,28 +453,33 @@ def intersect(args): """ set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) + _extend_signatures_with_from_file(args) first_sig = None mins = None + # start loading! 
progress = sourmash_args.SignatureLoadingProgress() + loader = sourmash_args.load_many_signatures(args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force) + + for sigobj, sigloc in loader: + if first_sig is None: + first_sig = sigobj + mins = set(sigobj.minhash.hashes) + else: + # check signature compatibility -- + if not sigobj.minhash.is_compatible(first_sig.minhash): + error("incompatible minhashes; specify -k and/or molecule type.") + sys.exit(-1) - for sigfile in args.signatures: - for sigobj in sourmash_args.load_file_as_signatures(sigfile, - ksize=args.ksize, - select_moltype=moltype, - progress=progress): - if first_sig is None: - first_sig = sigobj - mins = set(sigobj.minhash.hashes) - else: - # check signature compatibility -- - if not sigobj.minhash.is_compatible(first_sig.minhash): - error("incompatible minhashes; specify -k and/or molecule type.") - sys.exit(-1) - - mins.intersection_update(sigobj.minhash.hashes) - notify('loaded and intersected signatures from {}...', sigfile, end='\r') + mins.intersection_update(sigobj.minhash.hashes) if len(progress) == 0: error("no signatures to merge!?") @@ -503,6 +514,8 @@ def intersect(args): sourmash.save_signatures([intersect_sigobj], fp=fp) notify(f'loaded and intersected {len(progress)} signatures') + if picklist: + sourmash_args.report_picklist(args, picklist) def subtract(args): @@ -547,7 +560,6 @@ def subtract(args): error("no signatures to subtract!?") sys.exit(-1) - subtract_mh = from_sigobj.minhash.copy_and_clear() subtract_mh.add_many(subtract_mins) @@ -565,26 +577,32 @@ def rename(args): """ set_quiet(args.quiet, args.quiet) moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) + _extend_signatures_with_from_file(args) - progress = sourmash_args.SignatureLoadingProgress() save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) save_sigs.open() - for filename in 
args.sigfiles: - debug('loading {}', filename) - siglist = sourmash_args.load_file_as_signatures(filename, - ksize=args.ksize, - select_moltype=moltype, - progress=progress) + # start loading! + progress = sourmash_args.SignatureLoadingProgress() + loader = sourmash_args.load_many_signatures(args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force) - for sigobj in siglist: - sigobj._name = args.name - save_sigs.add(sigobj) + for sigobj, sigloc in loader: + sigobj._name = args.name + save_sigs.add(sigobj) save_sigs.close() notify(f"set name to '{args.name}' on {len(save_sigs)} signatures") + if picklist: + sourmash_args.report_picklist(args, picklist) def extract(args): @@ -594,39 +612,38 @@ def extract(args): set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) picklist = sourmash_args.load_picklist(args) + _extend_signatures_with_from_file(args) # further filtering on md5 or name? if args.md5 is not None or args.name is not None: - def filter_fn(it): - for ss in it: - # match? - keep = False - if args.name and args.name in str(ss): - keep = True - if args.md5 and args.md5 in ss.md5sum(): - keep = True - - if keep: - yield ss + def filter_fn(ss): + # match? + keep = False + if args.name and args.name in str(ss): + keep = True + if args.md5 and args.md5 in ss.md5sum(): + keep = True + + return keep else: # whatever comes out of the database is fine - def filter_fn(it): - for ss in it: - yield ss + filter_fn = lambda x: True # ok! filtering defined, let's go forward - progress = sourmash_args.SignatureLoadingProgress() - save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) save_sigs.open() - for filename in args.signatures: - siglist = sourmash_args.load_file_as_signatures(filename, - ksize=args.ksize, - select_moltype=moltype, - picklist=picklist, - progress=progress) - for ss in filter_fn(siglist): + # start loading! 
+ progress = sourmash_args.SignatureLoadingProgress() + loader = sourmash_args.load_many_signatures(args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force) + for ss, sigloc in loader: + if filter_fn(ss): save_sigs.add(ss) notify(f"loaded {len(progress)} total that matched ksize & molecule type") @@ -703,96 +720,114 @@ def flatten(args): """ set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) - - progress = sourmash_args.SignatureLoadingProgress() + picklist = sourmash_args.load_picklist(args) + _extend_signatures_with_from_file(args) save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) save_sigs.open() - for filename in args.signatures: - siglist = sourmash_args.load_file_as_signatures(filename, - ksize=args.ksize, - select_moltype=moltype, - progress=progress) - siglist = list(siglist) - + # start loading! + progress = sourmash_args.SignatureLoadingProgress() + loader = sourmash_args.load_many_signatures(args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force) + for ss, sigloc in loader: # select! if args.md5 is not None: - siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ] + if args.md5 not in ss.md5sum(): + continue # skip + if args.name is not None: - siglist = [ ss for ss in siglist if args.name in ss.name ] + if args.name not in ss.name: + continue # skip - for ss in siglist: - ss.minhash = ss.minhash.flatten() - save_sigs.add(ss) + ss.minhash = ss.minhash.flatten() + save_sigs.add(ss) save_sigs.close() notify(f"loaded {len(progress)} total that matched ksize & molecule type") notify("extracted {} signatures from {} file(s)", len(save_sigs), len(args.signatures)) + if picklist: + sourmash_args.report_picklist(args, picklist) def downsample(args): """ - downsample a scaled signature. 
+ downsample num and scaled signatures, and also interconvert. """ set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) + _extend_signatures_with_from_file(args) if not args.num and not args.scaled: - error('must specify either --num or --scaled value') + error('ERROR: must specify either --num or --scaled value') sys.exit(-1) if args.num and args.scaled: - error('cannot specify both --num and --scaled') + error('ERROR: cannot specify both --num and --scaled') sys.exit(-1) + # open output for saving sigs save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) save_sigs.open() + # start loading! progress = sourmash_args.SignatureLoadingProgress() + loader = sourmash_args.load_many_signatures(args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force) + for ss, sigloc in loader: + mh = ss.minhash + + if args.scaled: + # downsample scaled to scaled? straightforward. + if mh.scaled: + mh_new = mh.downsample(scaled=args.scaled) + # try to turn a num into a scaled - trickier. + else: + # first check: can we? + max_hash = _get_max_hash_for_scaled(args.scaled) + mins = mh.hashes + if max(mins) < max_hash: + raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.") + + mh_new = mh.copy() + _set_num_scaled(mh_new, 0, args.scaled) + elif args.num: + # downsample num to num? straightforward. + if mh.num: + mh_new = mh.downsample(num=args.num) + # try to turn a scaled into a num - trickier. + else: + # first check: can we? 
+ if len(mh) < args.num: + raise ValueError(f"this scaled MinHash has only {len(mh)} hashes") - for sigfile in args.signatures: - siglist = sourmash_args.load_file_as_signatures(sigfile, - ksize=args.ksize, - select_moltype=moltype, - progress=progress) + mh_new = mh.copy() + _set_num_scaled(mh_new, args.num, 0) - for sigobj in siglist: - mh = sigobj.minhash - - notify('loading and downsampling signature from {}...', sigfile, end='\r') - if args.scaled: - if mh.scaled: - mh_new = mh.downsample(scaled=args.scaled) - else: # try to turn a num into a scaled - # first check: can we? - max_hash = _get_max_hash_for_scaled(args.scaled) - mins = mh.hashes - if max(mins) < max_hash: - raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.") - - mh_new = mh.copy() - _set_num_scaled(mh_new, 0, args.scaled) - elif args.num: - if mh.num: - mh_new = mh.downsample(num=args.num) - else: # try to turn a scaled into a num - # first check: can we? - if len(mh) < args.num: - raise ValueError("this scaled MinHash has only {} hashes") - - mh_new = mh.copy() - _set_num_scaled(mh_new, args.num, 0) - - sigobj.minhash = mh_new - - save_sigs.add(sigobj) + ss.minhash = mh_new + + # save! + save_sigs.add(ss) save_sigs.close() - notify(f"loaded and downsampled {len(progress)} signatures") + notify(f"loaded {len(progress)} signatures") + notify(f"output {len(save_sigs)} downsampled signatures", len(save_sigs)) + if picklist: + sourmash_args.report_picklist(args, picklist) def sig_import(args): diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 3261a7bceb..62c3807398 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -585,6 +585,49 @@ def start_file(self, location, loader): end='\r') +def load_many_signatures(locations, progress, *, yield_all_files=False, + ksize=None, moltype=None, picklist=None, force=False): + """ + Load many signatures from multiple files, with progress indicators. 
+ + Takes ksize, moltype, and picklist selectors. + + If 'yield_all_files=True' then tries to load all files in specified + directories. + + If 'force=True' then continues past survivable errors. + + Yields (sig, location) tuples. + """ + for loc in locations: + try: + idx = load_file_as_index(loc, yield_all_files=yield_all_files) + idx = idx.select(ksize=ksize, moltype=moltype, picklist=picklist) + + loader = idx.signatures_with_location() + + n = 0 + for sig, sigloc in progress.start_file(loc, loader): + yield sig, sigloc + n += 1 + notify(f"loaded {n} signatures from '{loc}'", end='\r') + except ValueError as exc: + # trap expected errors, and either power through or display + exit. + if force: + notify("ERROR: {}", str(exc)) + notify("(continuing)") + continue + else: + notify("ERROR: {}", str(exc)) + sys.exit(-1) + except KeyboardInterrupt: + notify("Received CTRL-C - exiting.") + sys.exit(-1) + + n_files = len(locations) + notify(f"loaded {len(progress)} signatures total, from {n_files} files") + + # # enum and classes for saving signatures progressively # diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index cc6f9bfb99..1b9cb65725 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -16,6 +16,13 @@ ## command line tests +def _write_file(runtmp, basename, lines): + loc = runtmp.output(basename) + with open(loc, 'wt') as fp: + fp.write("\n".join(lines)) + return loc + + def test_run_sourmash_signature_cmd(): status, out, err = utils.runscript('sourmash', ['signature'], fail_ok=True) assert not 'sourmash: error: argument cmd: invalid choice:' in err @@ -30,8 +37,9 @@ def test_run_sourmash_sig_cmd(): assert status != 0 # no args provided, ok ;) -@utils.in_tempdir -def test_sig_merge_1_use_full_signature_in_cmd(c): +def test_sig_merge_1_use_full_signature_in_cmd(runtmp): + c = runtmp + # merge of 47 & 63 should be union of mins sig47 = utils.get_test_data('47.fa.sig') sig63 = utils.get_test_data('63.fa.sig') @@
-51,6 +59,34 @@ def test_sig_merge_1_use_full_signature_in_cmd(c): assert actual_merge_sig.minhash == test_merge_sig.minhash +def test_sig_merge_1_fromfile_picklist(runtmp): + c = runtmp + + # merge of 47 & 63 should be union of mins + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + sig47and63 = utils.get_test_data('47+63.fa.sig') + + from_file = _write_file(runtmp, 'list.txt', [sig47, sig63]) + picklist = _write_file(runtmp, 'pl.csv', + ['md5short', '09a08691', '38729c63']) + + c.run_sourmash('signature', 'merge', '--from-file', from_file, + '--picklist', f'{picklist}:md5short:md5short') + + # stdout should be new signature + out = c.last_result.out + + test_merge_sig = sourmash.load_one_signature(sig47and63) + actual_merge_sig = sourmash.load_one_signature(out) + + print(test_merge_sig.minhash) + print(actual_merge_sig.minhash) + print(out) + + assert actual_merge_sig.minhash == test_merge_sig.minhash + + @utils.in_tempdir def test_sig_merge_1(c): # merge of 47 & 63 should be union of mins @@ -139,9 +175,11 @@ def test_sig_merge_1_ksize_moltype_fail(c): sig63 = utils.get_test_data('63.fa.sig') sig2and63 = utils.get_test_data('2+63.fa.sig') - with pytest.raises(ValueError): + with pytest.raises(ValueError) as exc: c.run_sourmash('sig', 'merge', sig2, sig63) + assert "ERROR when merging signature" in str(exc.value) + @utils.in_tempdir def test_sig_merge_2(c): @@ -320,8 +358,9 @@ def test_sig_merge_flatten_2(c): assert actual_merge_sig.minhash == test_merge_sig.minhash -@utils.in_tempdir -def test_sig_intersect_1(c): +def test_sig_intersect_1(runtmp): + c = runtmp + # intersect of 47 and 63 should be intersection of mins sig47 = utils.get_test_data('47.fa.sig') sig63 = utils.get_test_data('63.fa.sig') @@ -341,6 +380,34 @@ def test_sig_intersect_1(c): assert actual_intersect_sig.minhash == test_intersect_sig.minhash +def test_sig_intersect_1_fromfile_picklist(runtmp): + c = runtmp + + # intersect of 47 and 63 should be 
intersection of mins + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + sig47and63 = utils.get_test_data('47+63-intersect.fa.sig') + + from_file = _write_file(runtmp, 'list.txt', [sig47, sig63]) + picklist = _write_file(runtmp, 'pl.csv', + ['md5short', '09a08691', '38729c63']) + + c.run_sourmash('signature', 'intersect', '--from-file', from_file, + '--picklist', f'{picklist}:md5short:md5short') + + # stdout should be new signature + out = c.last_result.out + + test_intersect_sig = sourmash.load_one_signature(sig47and63) + actual_intersect_sig = sourmash.load_one_signature(out) + + print(test_intersect_sig.minhash) + print(actual_intersect_sig.minhash) + print(out) + + assert actual_intersect_sig.minhash == test_intersect_sig.minhash + + @utils.in_tempdir def test_sig_intersect_2(c): # intersect of 47 with abund and 63 with abund should be same @@ -569,8 +636,9 @@ def test_sig_subtract_4_ksize_succeed(c): assert 'loaded and subtracted 1 signatures' in c.last_result.err -@utils.in_tempdir -def test_sig_rename_1(c): +def test_sig_rename_1(runtmp): + c = runtmp + # set new name for 47 sig47 = utils.get_test_data('47.fa.sig') c.run_sourmash('sig', 'rename', sig47, 'fiz bar') @@ -589,6 +657,32 @@ def test_sig_rename_1(c): assert actual_rename_sig.name == 'fiz bar' +def test_sig_rename_1_fromfile_picklist(runtmp): + c = runtmp + + # set new name for 47 + sig47 = utils.get_test_data('47.fa.sig') + + from_file = _write_file(runtmp, 'list.txt', [sig47]) + picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691']) + + c.run_sourmash('sig', 'rename', '--from-file', from_file, 'fiz bar', + '--picklist', f'{picklist}:md5short:md5short') + + # stdout should be new signature + out = c.last_result.out + + test_rename_sig = sourmash.load_one_signature(sig47) + actual_rename_sig = sourmash.load_one_signature(out) + + print(test_rename_sig.minhash) + print(actual_rename_sig.minhash) + + assert actual_rename_sig.minhash == 
test_rename_sig.minhash + assert test_rename_sig.name != actual_rename_sig.name + assert actual_rename_sig.name == 'fiz bar' + + @utils.in_tempdir def test_sig_rename_1_multisig(c): # set new name for multiple signatures/files @@ -649,6 +743,15 @@ def test_sig_rename_3_file_dne(c): assert "Error while reading signatures from 'no-such-sig'" in c.last_result.err +@utils.in_tempdir +def test_sig_rename_3_file_dne_force(c): + # rename on a file that does not exist: with -f the error is reported, but the command keeps going + c.run_sourmash('sig', 'rename', 'no-such-sig', 'fiz bar', '-f') + print(c.last_result.err) + + assert "Error while reading signatures from 'no-such-sig'" in c.last_result.err + + @utils.in_thisdir def test_sig_cat_1(c): # cat 47 to 47... @@ -873,8 +976,46 @@ def test_sig_cat_5_from_file(c): assert repr(siglist) == """[SourmashSignature('', 0107d767), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('', 4e94e602), SourmashSignature('', 60f7e23c), SourmashSignature('', 6d6e87e1), SourmashSignature('', b59473c9), SourmashSignature('', f0c834bc), SourmashSignature('', f71e7817)]""" -@utils.in_tempdir -def test_sig_split_1(c): +def test_sig_cat_5_from_file_picklist(runtmp): + c = runtmp + + # cat using a file list as input + sig47 = utils.get_test_data('47.fa.sig') + sbt = utils.get_test_data('v6.sbt.zip') + + filelist = c.output("filelist") + with open(filelist, 'w') as f: + f.write("\n".join((sig47, sbt))) + + picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691']) + + c.run_sourmash('sig', 'cat', '--from-file', filelist, + '--picklist', f'{picklist}:md5short:md5short', + '-o', 'out.sig') + + # stdout should be same signatures + out = c.output('out.sig') + + siglist = list(load_signatures(out)) + print(len(siglist)) + # print("siglist: ",siglist) + # print("\n") + + # verify the number of signatures matches what we expect to see based + # on the
input files + all_sigs = [] + all_sigs += list(load_signatures(sig47, ksize=31)) + + assert len(all_sigs) == len(siglist) + + # sort the signatures by something deterministic and unique + siglist.sort(key = lambda x: x.md5sum()) + + assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691)]""" + + +def test_sig_split_1(runtmp): + c = runtmp # split 47 into 1 sig :) sig47 = utils.get_test_data('47.fa.sig') c.run_sourmash('sig', 'split', sig47) @@ -889,6 +1030,27 @@ def test_sig_split_1(c): assert actual_split_sig == test_split_sig +def test_sig_split_1_fromfile_picklist(runtmp): + c = runtmp + # split 47 into 1 sig :) + sig47 = utils.get_test_data('47.fa.sig') + + from_file = _write_file(runtmp, 'list.txt', [sig47]) + picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691']) + + c.run_sourmash('sig', 'split', '--from-file', from_file, + '--picklist', f'{picklist}:md5short:md5short') + + outname = '09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig' + + assert os.path.exists(c.output(outname)) + + test_split_sig = sourmash.load_one_signature(sig47) + actual_split_sig = sourmash.load_one_signature(c.output(outname)) + + assert actual_split_sig == test_split_sig + + @utils.in_tempdir def test_sig_split_1_overwrite(c): # check message about overwriting @@ -1011,8 +1173,27 @@ def test_sig_split_5_no_exist(c): c.run_sourmash('sig', 'split', 'foo') -@utils.in_tempdir -def test_sig_extract_1(c): +def test_sig_split_6_numsigs(runtmp): + c = runtmp + + sigs11 = utils.get_test_data('genome-s11.fa.gz.sig') + c.run_sourmash('sig', 'split', sigs11) + + print(c.last_result.out) + print(c.last_result.err) + + outlist = ['1437d8ea.k=21.num=500.DNA.dup=0.genome-s11.fa.gz.sig', + '37aea787.k=7.num=500.protein.dup=0.genome-s11.fa.gz.sig', + '68c565be.k=30.num=500.DNA.dup=0.genome-s11.fa.gz.sig', + '73b6df1c.k=10.num=500.protein.dup=0.genome-s11.fa.gz.sig'] + + for filename in outlist: + assert os.path.exists(c.output(filename)) 
+ + +def test_sig_extract_1(runtmp): + c = runtmp + # extract 47 from 47... :) sig47 = utils.get_test_data('47.fa.sig') c.run_sourmash('sig', 'extract', sig47) @@ -1026,6 +1207,23 @@ def test_sig_extract_1(c): assert actual_extract_sig == test_extract_sig +def test_sig_extract_1_fromfile(runtmp): + c = runtmp + + # extract 47 from 47... :) + sig47 = utils.get_test_data('47.fa.sig') + from_file = _write_file(runtmp, 'list.txt', [sig47]) + c.run_sourmash('sig', 'extract', '--from-file', from_file) + + # stdout should be new signature + out = c.last_result.out + + test_extract_sig = sourmash.load_one_signature(sig47) + actual_extract_sig = sourmash.load_one_signature(out) + + assert actual_extract_sig == test_extract_sig + + @utils.in_tempdir def test_sig_extract_2(c): # extract matches to 47's md5sum from among several @@ -2033,8 +2231,9 @@ def test_sig_extract_12_picklist_bad_colname_exclude(runtmp): assert "column 'BADCOLNAME' not in pickfile" in err -@utils.in_tempdir -def test_sig_flatten_1(c): +def test_sig_flatten_1(runtmp): + c = runtmp + # extract matches to several names from among several signatures & flatten sig47abund = utils.get_test_data('track_abund/47.fa.sig') sig47 = utils.get_test_data('47.fa.sig') @@ -2052,8 +2251,74 @@ def test_sig_flatten_1(c): assert test_flattened.minhash == siglist[0].minhash +def test_sig_flatten_1_fromfile_picklist(runtmp): + c = runtmp + + # extract matches to several names from among several signatures & flatten + sig47abund = utils.get_test_data('track_abund/47.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + + from_file = _write_file(runtmp, 'list.txt', [sig47abund]) + picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691']) + + c.run_sourmash('sig', 'flatten', '--from-file', from_file, + '--picklist', f'{picklist}:md5short:md5short') + + # stdout should be new signature + out = c.last_result.out + + siglist = load_signatures(out) + siglist = list(siglist) + + assert len(siglist) == 1 + + test_flattened =
sourmash.load_one_signature(sig47) + assert test_flattened.minhash == siglist[0].minhash + + @utils.in_tempdir -def test_sig_flatten_2_ksize(c): +def test_sig_flatten_1_select_name(c): + # extract matches to several names from among several signatures & flatten + sig47abund = utils.get_test_data('track_abund/47.fa.sig') + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + c.run_sourmash('sig', 'flatten', sig2, sig47abund, '--name', 'Shewanella') + + # stdout should be new signature + out = c.last_result.out + + siglist = load_signatures(out) + siglist = list(siglist) + + assert len(siglist) == 1 + + test_flattened = sourmash.load_one_signature(sig47) + assert test_flattened.minhash == siglist[0].minhash + + +def test_sig_flatten_1_select_md5(runtmp): + c = runtmp + + # extract matches to several names from among several signatures & flatten + sig47abund = utils.get_test_data('track_abund/47.fa.sig') + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + c.run_sourmash('sig', 'flatten', sig2, sig47abund, '--md5', '09a08691c') + + # stdout should be new signature + out = c.last_result.out + + siglist = load_signatures(out) + siglist = list(siglist) + + assert len(siglist) == 1 + + test_flattened = sourmash.load_one_signature(sig47) + assert test_flattened.minhash == siglist[0].minhash + + +def test_sig_flatten_2_ksize(runtmp): + c = runtmp # flatten only one signature selected using ksize psw_mag = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') c.run_sourmash('sig', 'flatten', psw_mag, '-k', '31') @@ -2190,6 +2455,16 @@ def test_sig_downsample_2_num_to_scaled_fail(c): '-k', '21', '--dna', sigs11) +@utils.in_tempdir +def test_sig_downsample_2_num_and_scaled_both_fail(c): + # cannot specify both --num and --scaled + sigs11 = utils.get_test_data('genome-s11.fa.gz.sig') + + with pytest.raises(ValueError): + c.run_sourmash('sig', 'downsample', '--scaled', '100', '--num', '50', + '-k', '21', '--dna', 
sigs11) + + @utils.in_tempdir def test_sig_downsample_2_num_empty(c): # downsample a num signature @@ -2199,8 +2474,9 @@ def test_sig_downsample_2_num_empty(c): c.run_sourmash('sig', 'downsample', '-k', '21', '--dna', sigs11) -@utils.in_tempdir -def test_sig_describe_1(c): +def test_sig_describe_1(runtmp): + c = runtmp + # get basic info on a signature sig47 = utils.get_test_data('47.fa.sig') c.run_sourmash('sig', 'describe', sig47) @@ -2220,6 +2496,32 @@ def test_sig_describe_1(c): assert line.strip() in out +def test_sig_describe_1_fromfile_picklist(runtmp): + c = runtmp + + # get basic info on a signature + sig47 = utils.get_test_data('47.fa.sig') + from_file = _write_file(runtmp, 'list.txt', [sig47]) + picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691']) + + c.run_sourmash('sig', 'describe', '--from-file', from_file, + '--picklist', f'{picklist}:md5short:md5short') + + out = c.last_result.out + print(c.last_result) + + expected_output = """\ +signature: NC_009665.1 Shewanella baltica OS185, complete genome +source file: 47.fa +md5: 09a08691ce52952152f0e866a59f6261 +k=31 molecule=DNA num=0 scaled=1000 seed=42 track_abundance=0 +size: 5177 +signature license: CC0 +""".splitlines() + for line in expected_output: + assert line.strip() in out + + @utils.in_thisdir def test_sig_describe_protein(c): # test describe on a singleton protein signature @@ -2340,7 +2642,7 @@ def test_sig_describe_1_hp(c): for line in out.splitlines(): cleaned_line = line.strip().replace( testdata_dirname, '').replace(location, '') - assert cleaned_line in expected_output + assert cleaned_line in expected_output, cleaned_line @utils.in_tempdir