/
search.py
146 lines (130 loc) · 4.38 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
"""search a signature against other signatures"""
# Long-form help text for the `search` subcommand. subparser() below hands
# this string to argparse via `usage=`. argparse expands %-style specifiers
# (e.g. %(prog)s) in usage text, which is why the literal percent signs in
# the example output are written as `%%`.
usage = """
The `search` subcommand searches a collection of signatures or SBTs
for matches to the query signature. It can search for matches with
either high Jaccard similarity [1] or containment; the default is to
use Jaccard similarity, unless `--containment` is specified.
`-o/--output` will create a CSV file containing the matches.
`search` will load all of provided signatures into memory, which can
be slow and somewhat memory intensive for large collections. You can
use `sourmash index` to create a Sequence Bloom Tree (SBT) that can be
quickly searched on disk; this is the same format in which we provide
GenBank and other databases.
Command line usage:
```
sourmash search query.sig [ list of signatures or SBTs ]
```
Example output:
```
49 matches; showing first 20:
similarity match
---------- -----
75.4%% NZ_JMGW01000001.1 Escherichia coli 1-176-05_S4_C2 e117605...
72.2%% NZ_GG774190.1 Escherichia coli MS 196-1 Scfld2538, whole ...
71.4%% NZ_JMGU01000001.1 Escherichia coli 2-011-08_S3_C2 e201108...
70.1%% NZ_JHRU01000001.1 Escherichia coli strain 100854 100854_1...
69.0%% NZ_JH659569.1 Escherichia coli M919 supercont2.1, whole g...
...
```
[1] https://en.wikipedia.org/wiki/Jaccard_index
When `--containment` is provided, the containment of the query in each
of the search signatures or databases is reported.
---
"""
from sourmash.cli.utils import (
add_ksize_arg,
add_moltype_args,
add_picklist_args,
add_scaled_arg,
add_pattern_args,
)
def subparser(subparsers):
    """Register the `search` subcommand on *subparsers*.

    Creates the "search" parser (module docstring as description, the
    module-level ``usage`` text as usage), adds the positional ``query`` and
    ``databases`` arguments followed by the search options, and finally
    attaches the shared ksize / moltype / picklist / pattern / scaled
    argument groups from ``sourmash.cli.utils``.

    Arguments are declared in a table and registered in order, so the
    ordering shown in ``--help`` output follows the table.
    """
    parser = subparsers.add_parser("search", description=__doc__, usage=usage)

    # (flags, keyword arguments) pairs, in registration order.
    argument_specs = (
        (("query",), {"help": "query signature"}),
        (("databases",), {"nargs": "+", "help": "signatures/SBTs to search"}),
        (
            ("-q", "--quiet"),
            {"action": "store_true", "help": "suppress non-error output"},
        ),
        (
            ("-d", "--debug"),
            {"action": "store_true", "help": "output debug information"},
        ),
        (
            ("-t", "--threshold"),
            {
                "metavar": "T",
                "default": 0.08,
                "type": float,
                "help": "minimum threshold for reporting matches; default=0.08",
            },
        ),
        (
            ("--save-matches",),
            {
                "metavar": "FILE",
                "help": "output matching signatures to the specified file",
            },
        ),
        (
            ("--best-only",),
            {
                "action": "store_true",
                "help": "report only the best match (with greater speed)",
            },
        ),
        (
            ("-n", "--num-results"),
            {
                "default": 3,
                "type": int,
                "metavar": "N",
                "help": "number of results to display to user; 0 to report all",
            },
        ),
        (
            ("--containment",),
            {
                "action": "store_true",
                "help": "score based on containment rather than similarity",
            },
        ),
        (
            ("--max-containment",),
            {
                "action": "store_true",
                "help": "score based on max containment rather than similarity",
            },
        ),
        (
            ("--estimate-ani-ci",),
            {
                "action": "store_true",
                "help": "for containment searches, also output confidence intervals for ANI estimates",
            },
        ),
        (
            ("--ignore-abundance",),
            {
                "action": "store_true",
                "help": (
                    "do NOT use k-mer abundances if present; note: has no effect if "
                    "--containment or --max-containment is specified"
                ),
            },
        ),
        (
            ("-o", "--output"),
            {
                "metavar": "FILE",
                "help": "output CSV containing matches to this file",
            },
        ),
        (
            ("--md5",),
            {
                "default": None,
                "help": "select the signature with this md5 as query",
            },
        ),
        (
            ("--fail-on-empty-database",),
            {
                "action": "store_true",
                "help": "stop at databases that contain no compatible signatures",
            },
        ),
        (
            # Negative twin of the flag above; writes to the same destination.
            ("--no-fail-on-empty-database",),
            {
                "action": "store_false",
                "dest": "fail_on_empty_database",
                "help": "continue past databases that contain no compatible signatures",
            },
        ),
    )
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)

    # With neither flag given, failing on an empty database is the default.
    parser.set_defaults(fail_on_empty_database=True)

    # Shared argument groups; these helpers take the parser directly.
    for attach in (
        add_ksize_arg,
        add_moltype_args,
        add_picklist_args,
        add_pattern_args,
    ):
        attach(parser)
    # NOTE(review): the second argument (0) is forwarded unchanged; its
    # meaning is defined by add_scaled_arg — confirm against that helper.
    add_scaled_arg(parser, 0)
def main(args):
    """Run the `search` subcommand.

    Imports ``sourmash`` at call time (not at module import) and delegates
    to ``sourmash.commands.search``, returning whatever that call returns.

    :param args: parsed command-line arguments, passed through unchanged.
    """
    import sourmash

    run_search = sourmash.commands.search
    return run_search(args)