From 96920c74bba7a124881e80a526518a7f3575f1fb Mon Sep 17 00:00:00 2001 From: Keya Barve <53328492+keyabarve@users.noreply.github.com> Date: Fri, 16 Jul 2021 15:28:39 -0700 Subject: [PATCH] [MRG] Doing bounds checking on --scaled via command line (#1650) * Index function * Search function * Gather function * Multigather function * Prefetch function * For loop Index function * Modified all functions * Search function tests * Gather function tests * Multigather function tests * Index function tests * Prefetch function tests * Made final changes * Added function in command line files * Made changes in function and tests * Imported argparse * Removed code block from gather function * Removed code blocks and their corresponding tests * Prefetch function cli * Search function cli * Multigather function cli * Index function cli * Made final changes * Made requested changes * Fixed tests and cli function code * Fixed indentations Co-authored-by: Tessa Pierce Ward --- src/sourmash/cli/gather.py | 7 +- src/sourmash/cli/index.py | 7 +- src/sourmash/cli/multigather.py | 7 +- src/sourmash/cli/prefetch.py | 7 +- src/sourmash/cli/search.py | 7 +- src/sourmash/cli/utils.py | 24 ++++ src/sourmash/commands.py | 14 +-- tests/test_prefetch.py | 42 +++++++ tests/test_sourmash.py | 196 ++++++++++++++++++++++++++++++++ 9 files changed, 277 insertions(+), 34 deletions(-) diff --git a/src/sourmash/cli/gather.py b/src/sourmash/cli/gather.py index 6e0addd427..c5f3f9a315 100644 --- a/src/sourmash/cli/gather.py +++ b/src/sourmash/cli/gather.py @@ -1,7 +1,7 @@ """search a metagenome signature against dbs""" from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args) + add_picklist_args, add_scaled_arg) def subparser(subparsers): @@ -45,10 +45,6 @@ def subparser(subparsers): help='output unassigned portions of the query as a signature to the ' 'specified file' ) - subparser.add_argument( - '--scaled', metavar='FLOAT', type=float, default=0, - help='downsample query to the specified scaled factor' - ) subparser.add_argument( '--ignore-abundance', action='store_true', help='do NOT use k-mer abundances if present' @@ -82,6 +78,7 @@ def subparser(subparsers): add_ksize_arg(subparser, 31) add_moltype_args(subparser) add_picklist_args(subparser) + add_scaled_arg(subparser, 0) def main(args): diff --git a/src/sourmash/cli/index.py b/src/sourmash/cli/index.py index 334b394bfe..4737a8a8b9 100644 --- a/src/sourmash/cli/index.py +++ b/src/sourmash/cli/index.py @@ -26,7 +26,7 @@ """ from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args) + add_picklist_args, add_scaled_arg) def subparser(subparsers): @@ -66,13 +66,10 @@ def subparser(subparsers): help='What percentage of internal nodes will not be saved; ranges ' 'from 0.0 (save all nodes) to 1.0 (no nodes saved)' ) - subparser.add_argument( - '--scaled', metavar='FLOAT', type=float, default=0, - help='downsample signatures to the specified scaled factor' - ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) add_picklist_args(subparser) + add_scaled_arg(subparser, 0) def main(args): diff --git a/src/sourmash/cli/multigather.py b/src/sourmash/cli/multigather.py index 7d1e5204cd..0830d26229 100644 --- a/src/sourmash/cli/multigather.py +++ b/src/sourmash/cli/multigather.py @@ -1,6 +1,6 @@ "'sourmash multigather' - gather many signatures against multiple databases." -from sourmash.cli.utils import add_ksize_arg, add_moltype_args +from sourmash.cli.utils import add_ksize_arg, add_moltype_args, add_scaled_arg def subparser(subparsers): @@ -28,16 +28,13 @@ def subparser(subparsers): '--threshold-bp', metavar='REAL', type=float, default=5e4, help='threshold (in bp) for reporting results (default=50,000)' ) - subparser.add_argument( - '--scaled', metavar='FLOAT', type=float, default=0, - help='downsample query to the specified scaled factor' - ) subparser.add_argument( '--ignore-abundance', action='store_true', help='do NOT use k-mer abundances if present' ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) + add_scaled_arg(subparser, 0) def main(args): diff --git a/src/sourmash/cli/prefetch.py b/src/sourmash/cli/prefetch.py index e04c537193..c4f8110ba3 100644 --- a/src/sourmash/cli/prefetch.py +++ b/src/sourmash/cli/prefetch.py @@ -1,7 +1,7 @@ """search a signature against dbs, find all overlaps""" from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args) + add_picklist_args, add_scaled_arg) def subparser(subparsers): @@ -54,10 +54,6 @@ def subparser(subparsers): help='output matching query hashes as a signature to the ' 'specified file' ) - subparser.add_argument( - '--scaled', metavar='FLOAT', type=float, default=None, - help='downsample signatures to the specified scaled factor' - ) subparser.add_argument( '--md5', default=None, help='select the signature with this md5 as query' @@ -65,6 +61,7 @@ def subparser(subparsers): add_ksize_arg(subparser, 31) add_moltype_args(subparser) add_picklist_args(subparser) + add_scaled_arg(subparser, 0) def main(args): diff --git a/src/sourmash/cli/search.py b/src/sourmash/cli/search.py index c4e1d41323..c200b7b37b 100644 --- a/src/sourmash/cli/search.py +++ b/src/sourmash/cli/search.py @@ -1,7 +1,7 @@ """search a signature against other signatures""" from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args) + add_picklist_args, add_scaled_arg) def subparser(subparsers): @@ -46,10 +46,6 @@ def subparser(subparsers): help='do NOT use k-mer abundances if present; note: has no effect if ' '--containment or --max-containment is specified' ) - subparser.add_argument( - '--scaled', metavar='FLOAT', type=float, default=0, - help='downsample query to this scaled factor (yields greater speed)' - ) subparser.add_argument( '-o', '--output', metavar='FILE', help='output CSV containing matches to this file' @@ -61,6 +57,7 @@ def subparser(subparsers): add_ksize_arg(subparser, 31) add_moltype_args(subparser) add_picklist_args(subparser) + add_scaled_arg(subparser, 0) def main(args): diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index a074357bd1..2de23e9045 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -1,6 +1,7 @@ from glob import glob import os import argparse +from sourmash.logging import notify def add_moltype_args(parser): @@ -93,3 +94,26 @@ def command_list(dirpath): basenames = [os.path.splitext(path)[0] for path in filenames if not path.startswith('__')] basenames = filter(opfilter, basenames) return sorted(basenames) + + +def check_scaled_bounds(arg): + actual_min_val = 0 + min_val = 100 + max_val = 1e6 + + f = float(arg) + + if f < actual_min_val: + raise argparse.ArgumentTypeError(f"ERROR: --scaled value must be positive") + if f < min_val: + notify('WARNING: --scaled value should be >= 100. Continuing anyway.') + if f > max_val: + notify('WARNING: --scaled value should be <= 1e6. Continuing anyway.') + return f + + +def add_scaled_arg(parser, default=None): + parser.add_argument( + '--scaled', metavar='FLOAT', type=check_scaled_bounds, + help='scaled value should be between 100 and 1e6' + ) \ No newline at end of file diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 2843345f19..24fde667dd 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -390,6 +390,7 @@ def index(args): if args.scaled: ss.minhash = ss.minhash.downsample(scaled=args.scaled) + if ss.minhash.track_abundance: ss.minhash = ss.minhash.flatten() scaleds.add(ss.minhash.scaled) @@ -450,15 +451,13 @@ def search(args): query.minhash.ksize, sourmash_args.get_moltype(query)) - # downsample if requested if args.scaled: if not query.minhash.scaled: error('cannot downsample a signature not created with --scaled') sys.exit(-1) - if args.scaled != query.minhash.scaled: notify('downsampling query from scaled={} to {}', - query.minhash.scaled, int(args.scaled)) + query.minhash.scaled, int(args.scaled)) query.minhash = query.minhash.downsample(scaled=args.scaled) # set up the search databases @@ -646,10 +645,9 @@ def gather(args): error('query signature needs to be created with --scaled') sys.exit(-1) - # downsample if requested if args.scaled: notify('downsampling query from scaled={} to {}', - query.minhash.scaled, int(args.scaled)) + query.minhash.scaled, int(args.scaled)) query.minhash = query.minhash.downsample(scaled=args.scaled) # empty? @@ -868,12 +866,11 @@ def multigather(args): error('query signature needs to be created with --scaled; skipping') continue - # downsample if requested if args.scaled: notify('downsampling query from scaled={} to {}', - query.minhash.scaled, int(args.scaled)) + query.minhash.scaled, int(args.scaled)) query.minhash = query.minhash.downsample(scaled=args.scaled) - + # empty? if not len(query.minhash): error('no query hashes!? skipping to next..') @@ -1137,7 +1134,6 @@ def prefetch(args): if query_mh.track_abundance: query_mh = query_mh.flatten() - # downsample if/as requested if args.scaled: notify(f'downsampling query from scaled={query_mh.scaled} to {int(args.scaled)}') query_mh = query_mh.downsample(scaled=args.scaled) diff --git a/tests/test_prefetch.py b/tests/test_prefetch.py index 82057b35ab..235e56ab76 100644 --- a/tests/test_prefetch.py +++ b/tests/test_prefetch.py @@ -390,6 +390,48 @@ def test_prefetch_no_db(runtmp, linear_gather): assert "ERROR: no databases or signatures to search!?" in c.last_result.err +def test_prefetch_check_scaled_bounds_negative(runtmp, linear_gather): + c = runtmp + + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + with pytest.raises(ValueError) as exc: + c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, + '--scaled', '-5', linear_gather) + + assert "ERROR: --scaled value must be positive" in str(exc.value) + + +def test_prefetch_check_scaled_bounds_less_than_minimum(runtmp, linear_gather): + c = runtmp + + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + with pytest.raises(ValueError) as exc: + c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, + '--scaled', '50', linear_gather) + + assert "WARNING: --scaled value should be >= 100. Continuing anyway." in str(exc.value) + + +def test_prefetch_check_scaled_bounds_more_than_maximum(runtmp, linear_gather): + c = runtmp + + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + with pytest.raises(ValueError) as exc: + c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, + '--scaled', '1e9', linear_gather) + + assert "WARNING: --scaled value should be <= 1e6. Continuing anyway." in str(exc.value) + + def test_prefetch_downsample_scaled(runtmp, linear_gather): c = runtmp diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 93da279c3b..431756fedc 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1802,6 +1802,48 @@ def test_search_4(): assert 'short3.fa' in out +@utils.in_tempdir +def test_index_check_scaled_bounds_negative(c): + with utils.TempDirectory() as location: + status, out, err = utils.runscript('sourmash', + ['index', 'zzz', + 'short.fa.sig', + 'short2.fa.sig', + '-k', '31', '--scaled', '-5', + '--dna'], + in_directory=location, fail_ok=True) + + assert "ERROR: --scaled value must be positive" in err + + +@utils.in_tempdir +def test_index_check_scaled_bounds_less_than_minimum(c): + with utils.TempDirectory() as location: + status, out, err = utils.runscript('sourmash', + ['index', 'zzz', + 'short.fa.sig', + 'short2.fa.sig', + '-k', '31', '--scaled', '50', + '--dna'], + in_directory=location, fail_ok=True) + + assert "WARNING: --scaled value should be >= 100. Continuing anyway." in err + + +@utils.in_tempdir +def test_index_check_scaled_bounds_more_than_maximum(c): + with utils.TempDirectory() as location: + status, out, err = utils.runscript('sourmash', + ['index', 'zzz', + 'short.fa.sig', + 'short2.fa.sig', + '-k', '31', '--scaled', '1e9', + '--dna'], + in_directory=location, fail_ok=True) + + assert "WARNING: --scaled value should be <= 1e6. Continuing anyway." in err + + @utils.in_tempdir def test_index_metagenome_fromfile(c): # test index --from-file @@ -1966,6 +2008,51 @@ def test_search_traverse_incompatible(c): assert '100.0% NC_009665.1 Shewanella baltica OS185, complete genome' in c.last_result.out +def test_search_check_scaled_bounds_negative(): + + with utils.TempDirectory() as location: + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_sig = utils.get_test_data('gather/combined.sig') + + cmd = 'search {} gcf_all -k 21 --scaled -5'.format(query_sig) + status, out, err = utils.runscript('sourmash', cmd.split(' '), + in_directory=location, fail_ok=True) + + assert "ERROR: --scaled value must be positive" in err + + +def test_search_check_scaled_bounds_less_than_minimum(): + + with utils.TempDirectory() as location: + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_sig = utils.get_test_data('gather/combined.sig') + + cmd = 'search {} gcf_all -k 21 --scaled 50'.format(query_sig) + status, out, err = utils.runscript('sourmash', cmd.split(' '), + in_directory=location, fail_ok=True) + + assert "WARNING: --scaled value should be >= 100. Continuing anyway." in err + + +def test_search_check_scaled_bounds_more_than_maximum(): + + with utils.TempDirectory() as location: + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_sig = utils.get_test_data('gather/combined.sig') + + cmd = 'search {} gcf_all -k 21 --scaled 1e9'.format(query_sig) + status, out, err = utils.runscript('sourmash', cmd.split(' '), + in_directory=location, fail_ok=True) + + assert "WARNING: --scaled value should be <= 1e6. Continuing anyway." in err + + # explanation: you cannot downsample a scaled SBT to match a scaled # signature, so make sure that when you try such a search, it fails! # (you *can* downsample a signature to match an SBT.) @@ -3481,6 +3568,67 @@ def test_multigather_metagenome(): 'NC_011294.1 Salmonella enterica subsp...' in out)) +@utils.in_tempdir +def test_multigather_check_scaled_bounds_negative(c): + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_sig = utils.get_test_data('gather/combined.sig') + + cmd = ['index', 'gcf_all'] + cmd.extend(testdata_sigs) + cmd.extend(['-k', '21']) + c.run_sourmash(*cmd) + + cmd = 'multigather --query {} --db gcf_all -k 21 --scaled -5 --threshold-bp=0'.format(query_sig) + cmd = cmd.split(' ') + with pytest.raises(ValueError) as exc: + c.run_sourmash(*cmd) + + assert "ERROR: --scaled value must be positive" in str(exc.value) + + +@utils.in_tempdir +def test_multigather_check_scaled_bounds_less_than_minimum(c): + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_sig = utils.get_test_data('gather/combined.sig') + + cmd = ['index', 'gcf_all'] + cmd.extend(testdata_sigs) + cmd.extend(['-k', '21']) + c.run_sourmash(*cmd) + + cmd = 'multigather --query {} --db gcf_all -k 21 --scaled 50 --threshold-bp=0'.format(query_sig) + cmd = cmd.split(' ') + # Note: this is the value error that is emited, but we want the Warning from below to be generated instead. (ValueError: new scaled 50.0 is lower than current sample scaled 10000) + with pytest.raises(ValueError) as exc: + c.run_sourmash(*cmd) + + assert "WARNING: --scaled value should be >= 100. Continuing anyway." in str(exc.value) + + +@utils.in_tempdir +def test_multigather_check_scaled_bounds_more_than_maximum(c): + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_sig = utils.get_test_data('gather/combined.sig') + + cmd = ['index', 'gcf_all'] + cmd.extend(testdata_sigs) + cmd.extend(['-k', '21']) + c.run_sourmash(*cmd) + + cmd = 'multigather --query {} --db gcf_all -k 21 --scaled 1e9 --threshold-bp=0'.format(query_sig) + cmd = cmd.split(' ') + + c.run_sourmash(*cmd) + + assert "WARNING: --scaled value should be <= 1e6. Continuing anyway." in c.last_result.err + + @utils.in_tempdir def test_multigather_metagenome_query_from_file(c): # test multigather --query-from-file @@ -4002,6 +4150,54 @@ def test_gather_metagenome_output_unassigned_nomatches_protein(runtmp, linear_ga assert y.minhash.moltype == "protein" +def test_gather_check_scaled_bounds_negative(prefetch_gather, linear_gather): + with utils.TempDirectory() as location: + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_sig = utils.get_test_data('gather/combined.sig') + + cmd = 'gather {} {} {} gcf_all -k 21 --scaled -5 --threshold-bp 50000'.format(query_sig, prefetch_gather, linear_gather) + + status, out, err = utils.runscript('sourmash', + cmd.split(' '), + in_directory=location, fail_ok=True) + + assert "ERROR: --scaled value must be positive" in err + + +def test_gather_check_scaled_bounds_less_than_minimum(prefetch_gather, linear_gather): + with utils.TempDirectory() as location: + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_sig = utils.get_test_data('gather/combined.sig') + + cmd = 'gather {} {} {} gcf_all -k 21 --scaled 50 --threshold-bp 50000'.format(query_sig, prefetch_gather, linear_gather) + + status, out, err = utils.runscript('sourmash', + cmd.split(' '), + in_directory=location, fail_ok=True) + + assert "WARNING: --scaled value should be >= 100. Continuing anyway." in err + + +def test_gather_check_scaled_bounds_more_than_maximum(prefetch_gather, linear_gather): + with utils.TempDirectory() as location: + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_sig = utils.get_test_data('gather/combined.sig') + + cmd = 'gather {} {} {} gcf_all -k 21 --scaled 1e9 --threshold-bp 50000'.format(query_sig, prefetch_gather, linear_gather) + + status, out, err = utils.runscript('sourmash', + cmd.split(' '), + in_directory=location, fail_ok=True) + + assert "WARNING: --scaled value should be <= 1e6. Continuing anyway." in err + + def test_gather_metagenome_downsample(prefetch_gather, linear_gather): # downsample w/scaled of 100,000 with utils.TempDirectory() as location: