-
Notifications
You must be signed in to change notification settings - Fork 78
/
metagenome.py
150 lines (135 loc) · 4.54 KB
/
metagenome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""summarize metagenome gather results"""
# Usage/help text for the 'tax metagenome' subcommand. subparser() below
# passes this string to add_parser() as the parser's ``usage`` argument.
usage = """
sourmash tax metagenome --gather-csv <gather_csv> [ ... ] --taxonomy-csv <taxonomy-csv> [ ... ]
The 'tax metagenome' command reads in metagenome gather result CSVs and
summarizes by taxonomic lineage.
The default output format consists of four columns,
'query_name,rank,fraction,lineage', where 'fraction' is the fraction
of the query matched to that reported rank and lineage. The summarization
is reported for each taxonomic rank.
Alternatively, you can output results at a specific rank (e.g. species)
in 'krona', 'lineage_summary', and 'human' formats.
Use '-F human' to display human-readable output.
Please see the 'tax metagenome' documentation for more details:
https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-metagenome-summarize-metagenome-content-from-gather-results
"""
import sourmash
from sourmash.logging import notify, print_results, error
from sourmash.cli.utils import add_rank_arg, check_rank, check_tax_outputs
def subparser(subparsers):
    """Register the 'metagenome' subcommand and declare its options.

    A parser named "metagenome" is added to ``subparsers`` with the
    module-level ``usage`` text, every command-line option is declared on it
    in a fixed order, and the parser is finally handed to ``add_rank_arg``.

    Args:
        subparsers: object exposing ``add_parser`` (the value returned by
            ``ArgumentParser.add_subparsers()``).
    """
    parser = subparsers.add_parser("metagenome", usage=usage)

    def add_flag(*names, description, **extra):
        # Boolean switch: absent -> False, present -> True.
        parser.add_argument(*names, action="store_true", help=description, **extra)

    # Formats accepted by -F/--output-format.
    output_formats = [
        "human",
        "csv_summary",
        "krona",
        "lineage_summary",
        "kreport",
        "lingroup",
        "bioboxes",
    ]

    # --- gather result inputs ---
    parser.add_argument(
        "-g",
        "--gather-csv",
        nargs="*",
        action="extend",
        default=[],
        help="CSVs from sourmash gather",
    )
    parser.add_argument(
        "--from-file",
        default=None,
        metavar="FILE",
        help="input many gather results as a text file, with one gather CSV per line",
    )

    add_flag("-q", "--quiet", description="suppress non-error output")

    # --- output location ---
    parser.add_argument(
        "-o",
        "--output-base",
        default="-",
        help="base filepath for output file(s) (default stdout)",
    )
    parser.add_argument("--output-dir", default="", help="directory for output files")

    # --- taxonomy database and identifier handling ---
    parser.add_argument(
        "-t",
        "--taxonomy-csv",
        "--taxonomy",
        required=True,
        nargs="+",
        action="extend",
        metavar="FILE",
        help="database lineages CSV",
    )
    add_flag(
        "--keep-full-identifiers",
        description="do not split identifiers on whitespace",
    )
    add_flag(
        "--keep-identifier-versions",
        description="after splitting identifiers, do not remove accession versions",
    )
    add_flag(
        "--fail-on-missing-taxonomy",
        description="fail quickly if taxonomy is not available for an identifier",
    )

    # --- output format selection ---
    parser.add_argument(
        "-F",
        "--output-format",
        choices=output_formats,
        nargs="*",
        action="extend",
        default=[],
        help="choose output format(s)",
    )

    add_flag(
        "-f",
        "--force",
        description="continue past errors in taxonomy database loading",
    )

    # --- alternative taxonomies ---
    add_flag(
        "--lins",
        "--lin-taxonomy",
        default=False,
        description="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain 'lin' lineage information.",
    )
    parser.add_argument(
        "--lingroup",
        "--lingroups",
        default=None,
        metavar="FILE",
        help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will produce a 'lingroup' report containing taxonomic summarization for each group.",
    )
    add_flag(
        "--ictv",
        "--ictv-taxonomy",
        default=False,
        description="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.",
    )

    # Shared helper imported from sourmash.cli.utils; called last, as before.
    add_rank_arg(parser)
def main(args):
    """Validate parsed 'tax metagenome' arguments and run the command.

    Validation steps, in order:
      1. at least one of ``args.gather_csv`` / ``args.from_file`` must be set;
      2. if ``args.rank`` is truthy it is replaced by ``check_rank(args)``;
      3. ``args.output_format`` is replaced by the result of
         ``check_tax_outputs``.

    A ``ValueError`` raised by any step is reported through ``error`` and the
    process exits with status -1. Otherwise the call is forwarded to
    ``sourmash.tax.__main__.metagenome`` and its return value is returned.

    Args:
        args: parsed command-line namespace; ``rank`` and ``output_format``
            may be overwritten in place.
    """
    import sys

    try:
        # Same test as "no -g values and no --from-file", via De Morgan.
        if not (args.gather_csv or args.from_file):
            raise ValueError(
                "No gather CSVs found! Please input via '-g' or '--from-file'."
            )

        if args.rank:
            args.rank = check_rank(args)

        checked_formats = check_tax_outputs(
            args,
            rank_required=["krona", "lineage_summary"],
            incompatible_with_lins=["bioboxes", "kreport"],
            use_lingroup_format=True,
        )
        args.output_format = checked_formats
    except ValueError as exc:
        message = f"ERROR: {str(exc)}"
        error(message)
        sys.exit(-1)

    return sourmash.tax.__main__.metagenome(args)