/
search.py
124 lines (110 loc) · 4.17 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""search a signature against other signatures"""
# Long-form help text for the `search` subcommand; it is handed to argparse
# as the `usage=` value when the subparser is created in `subparser()`.
# Percent signs in the example output are written as `%%` because argparse
# applies %-formatting to usage strings, so a bare `%` would be misread as a
# format specifier.
usage="""
The `search` subcommand searches a collection of signatures or SBTs
for matches to the query signature. It can search for matches with
either high Jaccard similarity [1] or containment; the default is to
use Jaccard similarity, unless `--containment` is specified.
`-o/--output` will create a CSV file containing the matches.
`search` will load all of provided signatures into memory, which can
be slow and somewhat memory intensive for large collections. You can
use `sourmash index` to create a Sequence Bloom Tree (SBT) that can be
quickly searched on disk; this is the same format in which we provide
GenBank and other databases.
Command line usage:
```
sourmash search query.sig [ list of signatures or SBTs ]
```
Example output:
```
49 matches; showing first 20:
similarity match
---------- -----
75.4%% NZ_JMGW01000001.1 Escherichia coli 1-176-05_S4_C2 e117605...
72.2%% NZ_GG774190.1 Escherichia coli MS 196-1 Scfld2538, whole ...
71.4%% NZ_JMGU01000001.1 Escherichia coli 2-011-08_S3_C2 e201108...
70.1%% NZ_JHRU01000001.1 Escherichia coli strain 100854 100854_1...
69.0%% NZ_JH659569.1 Escherichia coli M919 supercont2.1, whole g...
...
```
[1] https://en.wikipedia.org/wiki/Jaccard_index
---
"""
from sourmash.cli.utils import (add_ksize_arg, add_moltype_args,
add_picklist_args, add_scaled_arg,
add_pattern_args)
def subparser(subparsers):
    """Register the `search` subcommand and all of its options.

    A new parser named ``search`` is added to *subparsers*, using the module
    docstring as its description and the module-level ``usage`` text as its
    usage message. Command-specific options are declared here; the shared
    k-size, molecule-type, picklist, pattern and scaled options are attached
    through the helpers imported from ``sourmash.cli.utils``.

    Nothing is returned; the parser is configured in place.
    """
    parser = subparsers.add_parser(
        'search',
        description=__doc__,
        usage=usage,
    )

    # Positional inputs: a single query followed by one or more collections.
    parser.add_argument('query', help='query signature')
    parser.add_argument(
        'databases',
        nargs='+',
        help='signatures/SBTs to search',
    )

    # Verbosity controls.
    parser.add_argument(
        '-q', '--quiet',
        action='store_true',
        help='suppress non-error output',
    )
    parser.add_argument(
        '-d', '--debug',
        action='store_true',
        help='output debug information',
    )

    # Match filtering and reporting.
    parser.add_argument(
        '-t', '--threshold',
        type=float,
        default=0.08,
        metavar='T',
        help='minimum threshold for reporting matches; default=0.08',
    )
    parser.add_argument(
        '--save-matches',
        metavar='FILE',
        help='output matching signatures to the specified file',
    )
    parser.add_argument(
        '--best-only',
        action='store_true',
        help='report only the best match (with greater speed)',
    )
    parser.add_argument(
        '-n', '--num-results',
        type=int,
        default=3,
        metavar='N',
        help='number of results to display to user; 0 to report all',
    )

    # Scoring mode selection.
    parser.add_argument(
        '--containment',
        action='store_true',
        help='score based on containment rather than similarity',
    )
    parser.add_argument(
        '--max-containment',
        action='store_true',
        help='score based on max containment rather than similarity',
    )
    parser.add_argument(
        '--estimate-ani-ci',
        action='store_true',
        help='for containment searches, also output confidence intervals for ANI estimates',
    )
    parser.add_argument(
        '--ignore-abundance',
        action='store_true',
        help='do NOT use k-mer abundances if present; note: has no effect if '
             '--containment or --max-containment is specified',
    )

    # Output destination and query selection.
    parser.add_argument(
        '-o', '--output',
        metavar='FILE',
        help='output CSV containing matches to this file',
    )
    parser.add_argument(
        '--md5',
        default=None,
        help='select the signature with this md5 as query',
    )

    # Paired on/off switches writing to the same destination; the explicit
    # set_defaults() call below makes "fail" the behavior when neither flag
    # is given.
    parser.add_argument(
        '--fail-on-empty-database',
        action='store_true',
        help='stop at databases that contain no compatible signatures',
    )
    parser.add_argument(
        '--no-fail-on-empty-database',
        dest='fail_on_empty_database',
        action='store_false',
        help='continue past databases that contain no compatible signatures',
    )
    parser.set_defaults(fail_on_empty_database=True)

    # Shared option groups; kept in the original order so the generated help
    # lists them identically.
    for attach_options in (
        add_ksize_arg,
        add_moltype_args,
        add_picklist_args,
        add_pattern_args,
    ):
        attach_options(parser)
    add_scaled_arg(parser, 0)
def main(args):
    """Run the `search` command for the parsed arguments *args*.

    Delegates to ``sourmash.commands.search`` and returns whatever that
    function returns.
    """
    # Imported at call time rather than at module level; presumably this keeps
    # building the CLI parser from loading the whole package -- TODO confirm.
    import sourmash

    run_search = sourmash.commands.search
    return run_search(args)