/
sourmash_args.py
348 lines (279 loc) · 11.3 KB
/
sourmash_args.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
"""
Utility functions for dealing with input args to the sourmash command line.
"""
import sys
import os
import argparse
from . import signature
from .logging import notify, error
from .index import LinearIndex
from . import signature as sig
from .sbt import SBT
from .sbtmh import SigLeaf
from .lca import lca_utils
import sourmash
DEFAULT_LOAD_K = 31
DEFAULT_N = 500
def get_moltype(sig, require=False):
if sig.minhash.is_molecule_type('DNA'):
moltype = 'DNA'
elif sig.minhash.is_molecule_type('dayhoff'):
moltype = 'dayhoff'
elif sig.minhash.is_molecule_type('hp'):
moltype = 'hp'
elif sig.minhash.is_molecule_type('protein'):
moltype = 'protein'
else:
raise ValueError('unknown molecule type for sig {}'.format(sig.name()))
return moltype
def calculate_moltype(args, default=None):
if args.protein:
if args.dna is True:
error('cannot specify both --dna/--rna and --protein!')
sys.exit(-1)
args.dna = False
moltype = default
if args.dna:
moltype = 'DNA'
elif args.dayhoff:
moltype = 'dayhoff'
elif args.hp:
moltype = 'hp'
elif args.protein:
moltype = 'protein'
return moltype
def load_query_signature(filename, ksize, select_moltype):
try:
sl = signature.load_signatures(filename,
ksize=ksize,
select_moltype=select_moltype,
do_raise=True)
sl = list(sl)
except (IOError, ValueError):
error("Cannot open file '{}'", filename)
sys.exit(-1)
if len(sl) and ksize is None:
ksizes = set([ ss.minhash.ksize for ss in sl ])
if len(ksizes) == 1:
ksize = ksizes.pop()
sl = [ ss for ss in sl if ss.minhash.ksize == ksize ]
notify('select query k={} automatically.', ksize)
elif DEFAULT_LOAD_K in ksizes:
sl = [ ss for ss in sl if ss.minhash.ksize == DEFAULT_LOAD_K ]
notify('selecting default query k={}.', DEFAULT_LOAD_K)
elif ksize:
notify('selecting specified query k={}', ksize)
if len(sl) != 1:
error('When loading query from "{}"', filename)
error('{} signatures matching ksize and molecule type;', len(sl))
error('need exactly one. Specify --ksize or --dna, --rna, or --protein.')
sys.exit(-1)
return sl[0]
class LoadSingleSignatures(object):
def __init__(self, filelist, ksize=None, select_moltype=None,
ignore_files=set()):
self.filelist = filelist
self.ksize = ksize
self.select_moltype = select_moltype
self.ignore_files = ignore_files
self.skipped_ignore = 0
self.skipped_nosig = 0
self.ksizes = set()
self.moltypes = set()
def __iter__(self):
for filename in self.filelist:
if filename in self.ignore_files:
self.skipped_ignore += 1
continue
sl = signature.load_signatures(filename,
ksize=self.ksize,
select_moltype=self.select_moltype)
sl = list(sl)
if len(sl) == 0:
self.skipped_nosig += 1
continue
for query in sl:
query_moltype = get_moltype(query)
query_ksize = query.minhash.ksize
self.ksizes.add(query_ksize)
self.moltypes.add(query_moltype)
if len(self.ksizes) > 1 or len(self.moltypes) > 1:
raise ValueError('multiple k-mer sizes/molecule types present')
for query in sl:
yield filename, query, query_moltype, query_ksize
def traverse_find_sigs(dirnames, yield_all_files=False):
for dirname in dirnames:
if (dirname.endswith('.sig') or yield_all_files) and os.path.isfile(dirname):
yield dirname
continue
for root, dirs, files in os.walk(dirname):
for name in files:
if name.endswith('.sig') or yield_all_files:
fullname = os.path.join(root, name)
yield fullname
def filter_compatible_signatures(query, siglist, force=False):
for ss in siglist:
if check_signatures_are_compatible(query, ss):
yield ss
else:
if not force:
raise ValueError("incompatible signature")
def check_signatures_are_compatible(query, subject):
# is one scaled, and the other not? cannot do search
if query.minhash.scaled and not subject.minhash.scaled or \
not query.minhash.scaled and subject.minhash.scaled:
error("signature {} and {} are incompatible - cannot compare.",
query.name(), subject.name())
if query.minhash.scaled:
error("{} was calculated with --scaled, {} was not.",
query.name(), subject.name())
if subject.minhash.scaled:
error("{} was calculated with --scaled, {} was not.",
subject.name(), query.name())
return 0
return 1
def check_tree_is_compatible(treename, tree, query, is_similarity_query):
# get a minhash from the tree
leaf = next(iter(tree.leaves()))
tree_mh = leaf.data.minhash
query_mh = query.minhash
if tree_mh.ksize != query_mh.ksize:
error("ksize on tree '{}' is {};", treename, tree_mh.ksize)
error('this is different from query ksize of {}.', query_mh.ksize)
return 0
# is one scaled, and the other not? cannot do search.
if (tree_mh.scaled and not query_mh.scaled) or \
(query_mh.scaled and not tree_mh.scaled):
error("for tree '{}', tree and query are incompatible for search.",
treename)
if tree_mh.scaled:
error("tree was calculated with scaled, query was not.")
else:
error("query was calculated with scaled, tree was not.")
return 0
# are the scaled values incompatible? cannot downsample tree for similarity
if tree_mh.scaled and tree_mh.scaled < query_mh.scaled and \
is_similarity_query:
error("for tree '{}', scaled value is smaller than query.", treename)
error("tree scaled: {}; query scaled: {}. Cannot do similarity search.",
tree_mh.scaled, query_mh.scaled)
return 0
return 1
def load_dbs_and_sigs(filenames, query, is_similarity_query, traverse=False):
"""
Load one or more SBTs, LCAs, and/or signatures.
Check for compatibility with query.
"""
query_ksize = query.minhash.ksize
query_moltype = get_moltype(query)
n_signatures = 0
n_databases = 0
databases = []
for sbt_or_sigfile in filenames:
notify('loading from {}...', sbt_or_sigfile, end='\r')
# are we collecting signatures from a directory/path?
if traverse and os.path.isdir(sbt_or_sigfile):
for sigfile in traverse_find_sigs([sbt_or_sigfile]):
try:
siglist = sig.load_signatures(sigfile,
ksize=query_ksize,
select_moltype=query_moltype)
siglist = filter_compatible_signatures(query, siglist, 1)
linear = LinearIndex(siglist, filename=sigfile)
databases.append((linear, sbt_or_sigfile, False))
notify('loaded {} signatures from {}', len(linear),
sigfile, end='\r')
n_signatures += len(linear)
except Exception: # ignore errors with traverse
pass
# done! jump to beginning of main 'for' loop
continue
# no traverse? try loading as an SBT.
try:
tree = SBT.load(sbt_or_sigfile, leaf_loader=SigLeaf.load)
if not check_tree_is_compatible(sbt_or_sigfile, tree, query,
is_similarity_query):
sys.exit(-1)
databases.append((tree, sbt_or_sigfile, 'SBT'))
notify('loaded SBT {}', sbt_or_sigfile, end='\r')
n_databases += 1
# done! jump to beginning of main 'for' loop
continue
except (ValueError, EnvironmentError):
# not an SBT - try as an LCA
pass
# ok. try loading as an LCA.
try:
lca_db = lca_utils.LCA_Database()
lca_db.load(sbt_or_sigfile)
assert query_ksize == lca_db.ksize
query_scaled = query.minhash.scaled
notify('loaded LCA {}', sbt_or_sigfile, end='\r')
n_databases += 1
databases.append((lca_db, sbt_or_sigfile, 'LCA'))
continue
except (ValueError, TypeError, EnvironmentError):
# not an LCA database - try as a .sig
pass
# not a tree? try loading as a signature.
try:
siglist = sig.load_signatures(sbt_or_sigfile,
ksize=query_ksize,
select_moltype=query_moltype)
siglist = list(siglist)
if len(siglist) == 0: # file not found, or parse error?
raise ValueError
siglist = filter_compatible_signatures(query, siglist, False)
linear = LinearIndex(siglist, filename=sbt_or_sigfile)
databases.append((linear, sbt_or_sigfile, 'signature'))
notify('loaded {} signatures from {}', len(linear),
sbt_or_sigfile, end='\r')
n_signatures += len(linear)
except (EnvironmentError, ValueError):
error("\nCannot open file '{}'", sbt_or_sigfile)
sys.exit(-1)
notify(' '*79, end='\r')
if n_signatures and n_databases:
notify('loaded {} signatures and {} databases total.', n_signatures,
n_databases)
elif n_signatures:
notify('loaded {} signatures.', n_signatures)
elif n_databases:
notify('loaded {} databases.', n_databases)
else:
sys.exit(-1)
if databases:
print('')
return databases
class FileOutput(object):
"""A context manager for file outputs that handles sys.stdout gracefully.
Usage:
with FileOutput(filename, mode) as fp:
...
does what you'd expect, but it handles the situation where 'filename'
is '-' or None. This makes it nicely compatible with argparse usage,
e.g.
p = argparse.ArgumentParser()
p.add_argument('--output')
args = p.parse_args()
...
with FileOutput(args.output, 'wt') as fp:
...
will properly handle no argument or '-' as sys.stdout.
"""
def __init__(self, filename, mode='wt'):
self.filename = filename
self.mode = mode
self.fp = None
def open(self):
if self.filename == '-' or self.filename is None:
return sys.stdout
self.fp = open(self.filename, self.mode)
return self.fp
def __enter__(self):
return self.open()
def __exit__(self, type, value, traceback):
# do we need to handle exceptions here?
if self.fp:
self.fp.close()
return False