/
compute.py
170 lines (149 loc) · 6.01 KB
/
compute.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
"""compute sequence signatures for inputs"""
# Long-form usage text for `sourmash compute`; subparser() hands it to
# argparse as the `usage=` of the command's parser.
usage = """
sourmash compute -k 21,31,51 *.fa *.fq
Create MinHash sketches at k-mer sizes of 21, 31 and 51, for
all FASTA and FASTQ files in the current directory, and save them in
signature files ending in '.sig'. You can rapidly compare these files
with `compare` and query them with `search`, among other operations;
see the full documentation at http://sourmash.rtfd.io/.
The key options for compute are:
* `-k/--ksizes <int>[, <int>]`: k-mer size(s) to use, e.g. -k 21,31,51
* `-n/--num-hashes <int>` or `--scaled <int>`: set size or resolution of sketches
* `--track-abundance`: track abundances of hashes (default False)
* `--dna or --protein`: nucleotide and/or protein signatures (default `--dna`)
* `--merge <name>`: compute a merged signature across all inputs.
* `--singleton`: compute individual signatures for each sequence.
* `--name-from-first`: set name of signature from first sequence in file.
* `-o/--output`: save all computed signatures to this file.
Please see -h for all of the options as well as more detailed help.
---
"""
from argparse import FileType
from sourmash._minhash import get_minhash_default_seed
from sourmash.cli.utils import add_construct_moltype_args
def ksize_parser(ksizes):
    """Turn a k-mer size option value into a list of integers.

    ``ksizes`` is either a single size (``"31"``) or several sizes joined
    by commas (``"21,31,51"``).  Each piece is converted with ``int``, so
    a piece that is not a valid integer raises ``ValueError``.
    """
    # A value without a comma is treated as a one-element list.
    pieces = ksizes.split(',') if ',' in ksizes else [ksizes]
    return [int(piece) for piece in pieces]
def subparser(subparsers):
    """Register the ``compute`` command and all of its options.

    A parser named ``compute`` is created through
    ``subparsers.add_parser`` (presumably ``subparsers`` comes from
    ``ArgumentParser.add_subparsers()`` -- confirm against the caller),
    using the module docstring as its description and the module-level
    ``usage`` text.  Options are organised into "Sketching", "10x" and
    "File handling" groups plus a handful of ungrouped ones.  Nothing is
    returned; the parser is configured in place.
    """
    parser = subparsers.add_parser(
        'compute',
        description=__doc__,
        usage=usage,
    )

    # ---- Sketching options -------------------------------------------
    sketching = parser.add_argument_group('Sketching options')
    sketching.add_argument(
        '-k', '--ksizes',
        type=ksize_parser,
        default='21,31,51',
        help='comma-separated list of k-mer sizes; default=%(default)s',
    )
    sketching.add_argument(
        '-n', '--num-hashes',
        type=int,
        default=500,
        help='number of hashes to use in each sketch; default=%(default)i',
    )
    sketching.add_argument(
        '--track-abundance',
        action='store_true',
        help='track k-mer abundances in the generated signature',
    )
    sketching.add_argument(
        '--scaled',
        type=float,
        default=0,
        help='choose number of hashes as 1 in FRACTION of input k-mers',
    )
    # Molecule-type options are supplied by the shared CLI helper; it is
    # called here so they are registered between --scaled and
    # --input-is-protein, as before.
    add_construct_moltype_args(sketching)
    sketching.add_argument(
        '--input-is-protein',
        action='store_true',
        help='Consume protein sequences - no translation needed.',
    )
    sketching.add_argument(
        '--seed',
        type=int,
        default=get_minhash_default_seed(),
        help='seed used by MurmurHash; default=%(default)i',
    )

    # ---- 10x options -------------------------------------------------
    tenx = parser.add_argument_group('10x options')
    tenx.add_argument(
        '--input-is-10x',
        action='store_true',
        help='input is 10x single cell output folder',
    )
    tenx.add_argument(
        '--count-valid-reads',
        type=int,
        default=0,
        help=(
            'a barcode is only considered a valid barcode read and its '
            'signature is written if number of umis are greater than '
            'count-valid-reads. It is used to weed out cell barcodes with few '
            'umis that might have been due to false rna enzyme reactions'
        ),
    )
    tenx.add_argument(
        '--write-barcode-meta-csv',
        type=str,
        help=(
            'for each of the unique barcodes, Write to a given path, number '
            'of reads and number of umis per barcode.'
        ),
    )
    tenx.add_argument(
        '-p', '--processes',
        type=int,
        default=2,
        help='number of processes to use for reading 10x bam file',
    )
    tenx.add_argument(
        '--save-fastas',
        type=str,
        default="",
        help=(
            'save merged fastas for all the unique barcodes to '
            '{CELL_BARCODE}.fasta in the absolute path given by this flag; by '
            'default, fastas are not saved'
        ),
    )
    tenx.add_argument(
        '--line-count',
        type=int,
        default=1500,
        help='line count for each bam shard',
    )
    tenx.add_argument(
        '--rename-10x-barcodes',
        metavar='FILE',
        help=(
            'Tab-separated file mapping 10x barcode name to new name, e.g. '
            'with channel or cell annotation label'
        ),
    )
    tenx.add_argument(
        '--barcodes-file',
        metavar='FILE',
        help='Barcodes file if the input is unfiltered 10x bam file',
    )

    # ---- File handling options ---------------------------------------
    files = parser.add_argument_group('File handling options')
    files.add_argument(
        '-f', '--force',
        action='store_true',
        help='recompute signatures even if the file exists',
    )
    files.add_argument(
        '-o', '--output',
        help='output computed signatures to this file',
    )
    files.add_argument(
        '--outdir',
        help='output computed signatures to this directory',
    )
    files.add_argument(
        '--singleton',
        action='store_true',
        help='compute a signature for each sequence record individually',
    )
    files.add_argument(
        '--merge', '--name',
        type=str,
        default='',
        metavar="FILE",
        help=(
            'merge all input files into one signature file with the '
            'specified name'
        ),
    )
    files.add_argument(
        '--name-from-first',
        action='store_true',
        help=(
            'name the signature generated from each file after the first '
            'record in the file'
        ),
    )
    files.add_argument(
        '--randomize',
        action='store_true',
        help='shuffle the list of input filenames randomly',
    )

    # ---- Ungrouped options and the positional inputs -----------------
    parser.add_argument(
        '-q', '--quiet',
        action='store_true',
        help='suppress non-error output',
    )
    parser.add_argument(
        '--check-sequence',
        action='store_true',
        help='complain if input sequence is invalid',
    )
    parser.add_argument(
        '--license',
        type=str,
        default='CC0',
        help='signature license. Currently only CC0 is supported.',
    )
    parser.add_argument(
        'filenames',
        nargs='+',
        help='file(s) of sequences',
    )

    # Retitle the parser's two built-in groups; this goes through the
    # underscore-prefixed attributes of the parser object.
    parser._positionals.title = 'Required arguments'
    parser._optionals.title = 'Miscellaneous options'
def main(args):
    """Run the ``compute`` command for the parsed arguments ``args``.

    The implementation is imported when this function is called rather
    than when this module is loaded, and whatever it returns is handed
    back to the caller unchanged.
    """
    from sourmash.command_compute import compute as run_compute

    result = run_compute(args)
    return result