/
config.py
377 lines (319 loc) · 13.3 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
"""
This module provides options(), which returns an argparse.Namespace object
containing all configurations required by sepp. config.options() reads and
saves the configurations the first time it is called. Subsequent calls simply
return the already-saved configurations. A typical usage is:
some_config_attribute = sepp.config.options().some_config_attribute
All command-line options are found directly inside the return value of
options() as attributes. Commandline options can be specified either directly
using the command-line, or under [commandline] section in a config file,
passed to SEPP using -c commandline option. commandline values overwrite
config file values.
Information stored in other sections of the config file are available as
nested argparse.Namespace attributes inside the results of options(),
with the config file header used as the attribute name. For example,
imagine the config file has:
[pplacer]
path = /some/path
In this case, config.options().pplacer will be an argparse.Namespace and
config.options().pplacer.path will be "/some/path".
A "main" configuration file under {home}/.sepp/main.config is used
to store some basic configurations such as the location of extra programs, etc.
This main config file is read first, so that user provided config file can
overwrite its values.
In addition, each client of this module (e.g. a new algorithm) can add new
commandline options by getting the parser object using get_parser() and then
adding extra options. This has to happen *before* the first call to options()
module. For an example see exhaustive_upp.
"""
from argparse import ArgumentParser, Namespace
from sepp.filemgr import get_default_temp_dir, check_or_make_dir_path
import sys
try:
import configparser
except ImportError:
import ConfigParser as configparser
from sepp import version, get_logger
import argparse
import os
import os.path
from multiprocessing import cpu_count
from sepp import scheduler
import random
_LOG = get_logger(__name__)

# "home.path" sits one directory above the directory containing this module;
# its first line names the directory that holds main.config.
_home_path_file = os.path.join(
    os.path.dirname(os.path.dirname(__file__)), "home.path")
# Read inside a context manager so the file handle is closed right away
# instead of being left for the garbage collector.
with open(_home_path_file) as _home_path_handle:
    root_p = _home_path_handle.readlines()[0].strip()
main_config_path = os.path.join(root_p, "main.config")
def set_main_config_path(filename):
    """
    Override the location of the main configuration file.

    Rebinds the module-level ``main_config_path`` to ``filename``.
    """
    globals()["main_config_path"] = filename
def _read_config_file(filename, opts, expand=None):
    """
    Parse a config file and merge its contents into ``opts``.

    ``filename`` is handed to the parser's file-reading method, so despite
    its name it must be an open, readable file-like object.

    The [commandline] section is not stored on ``opts``; instead it is
    translated into a flat list of ``--key``, ``value`` strings that the
    caller can prepend to the real command-line arguments. Every other
    section becomes (or extends) a Namespace attribute of ``opts`` named
    after the section, holding that section's key/value pairs.

    If ``expand`` is given, the value of any key called "path" (outside
    [commandline]) is joined onto ``expand`` with os.path.join.

    Returns the list of command-line style defaults.
    """
    _LOG.debug("Reading config %s" % filename)
    cparser = configparser.ConfigParser()
    # Preserve the case of option names (the default lower-cases them).
    cparser.optionxform = str
    # The Python 2 ConfigParser module (see the import fallback at the top
    # of this file) only offers readfp(); read_file() exists on Python 3.
    read_file = getattr(cparser, "read_file", None)
    if read_file is None:
        read_file = cparser.readfp
    read_file(filename)

    config_defaults = []
    if cparser.has_section('commandline'):
        for (k, v) in cparser.items('commandline'):
            config_defaults.extend(("--%s" % k, v))

    for section in cparser.sections():
        if section == "commandline":
            continue
        # Reuse a namespace created by an earlier config file so that later
        # files extend/override it rather than replace it wholesale.
        section_name_space = getattr(opts, section, None) or Namespace()
        for (k, v) in cparser.items(section):
            if expand and k == "path":
                v = os.path.join(expand, v)
            setattr(section_name_space, k, v)
        setattr(opts, section, section_name_space)
    return config_defaults
def valid_dir_path(path):
    """
    argparse ``type`` callable for directory arguments.

    Passes ``path`` through check_or_make_dir_path and returns its result.
    Raises argparse.ArgumentTypeError when that helper returns None.
    """
    checked = check_or_make_dir_path(path)
    if checked is not None:
        return checked
    raise argparse.ArgumentTypeError(
        "%s is not a valid directory path." % path)
def valid_molecule(molecule):
    """
    argparse ``type`` callable for the molecule option.

    Returns ``molecule`` unchanged when it is one of 'dna', 'rna' or
    'amino'; otherwise raises argparse.ArgumentTypeError.
    """
    if molecule in ('dna', 'rna', 'amino'):
        return molecule
    raise argparse.ArgumentTypeError(
        ("%s is not a valid molecule type. Must be 'dna', 'rna', or "
         "'amino'.") % molecule)
def valid_decomp_strategy(strategy):
    """
    argparse ``type`` callable for the decomposition strategy option.

    Returns ``strategy`` unchanged when it is one of 'hierarchical',
    'normal', 'midpoint' or 'centroid'; otherwise raises
    argparse.ArgumentTypeError.
    """
    if strategy in ('hierarchical', 'normal', 'midpoint', 'centroid'):
        return strategy
    raise argparse.ArgumentTypeError(
        ("%s is not a valid strategy. Must be 'normal', 'hierarchical',"
         " 'centroid', or 'midpoint'.") % strategy)
def valid_file_prefix(prefix):
    """
    argparse ``type`` callable for the output prefix option.

    Returns ``prefix`` unchanged when it has no directory component;
    otherwise raises argparse.ArgumentTypeError.
    """
    directory_part = os.path.dirname(prefix)
    if directory_part == "":
        return prefix
    raise argparse.ArgumentTypeError(
        "%s is not a valid output prefix (includes a directory)." % prefix)
def set_cpu(cpus):
    """
    argparse ``type`` callable for the cpu option.

    Converts ``cpus`` to int, stores it in scheduler.default_cpus as a side
    effect, and returns the integer.
    """
    scheduler.default_cpus = count = int(cpus)
    return count
def set_checkpoint(checkpoint):
    """
    argparse ``type`` callable for the checkpoint option.

    Returns a CheckPointManager built from ``checkpoint``.
    """
    # Imported at call time rather than at module load
    # (presumably to avoid an import cycle -- TODO confirm).
    from sepp.checkpointing import CheckPointManager
    return CheckPointManager(checkpoint)
_parser = None


def _init_parser():
    """
    Build the module-wide ArgumentParser and store it in ``_parser``.

    The parser is created with conflict_handler='resolve' and its argument
    groups are exposed through the ad-hoc ``groups`` dict attribute under
    the keys 'decompGroup', 'outputGroup', 'inputGroup' and 'otherGroup'.

    Side effects at build time: the default for --cpu is computed with
    set_cpu(cpu_count()), which also sets scheduler.default_cpus, and the
    default for --checkpoint is computed with set_checkpoint(None).

    Returns the newly created parser.
    """
    global _parser

    _parser = ArgumentParser(
        description=(
            "This script runs the SEPP algorithm on an input "
            "tree, alignment, fragment file, and RAxML info file."),
        conflict_handler='resolve')

    _parser.add_argument("-v", "--version", action='version',
                         version="%(prog)s " + version)

    decompGroup = _parser.add_argument_group(
        "Decomposition Options".upper(),
        ' '.join([
            "These options determine the alignment decomposition size and",
            "taxon insertion size. If None is given, then the default",
            "is to align/place at 10% of total taxa. "
            "The alignment decomposition size must be",
            "less than the taxon insertion size."]))
    _parser.groups = dict()
    _parser.groups['decompGroup'] = decompGroup

    decompGroup.add_argument(
        "-A", "--alignmentSize", type=int,
        dest="alignment_size", metavar="N", default=None,
        help=("max alignment subset size of N "
              "[default: 10%% of the total number of taxa or the placement"
              " subset size if given]"))

    decompGroup.add_argument(
        "-P", "--placementSize", type=int,
        dest="placement_size", metavar="N", default=None,
        help=("max placement subset size of N "
              "[default: 10%% of the total number of taxa or the alignment "
              "length (whichever bigger)]"))

    decompGroup.add_argument(
        "-F", "--fragmentChunkSize", type=int,
        dest="max_chunk_size", metavar="N", default=20000,
        help=("maximum fragment chunk size of N. Helps controlling memory. "
              "[default: 20000]"))

    decompGroup.add_argument(
        "-D", "--distance", type=float,
        dest="distance", metavar="DISTANCE",
        default=1,
        help=("minimum p-distance before stopping the decomposition "
              "[default: 1]"))

    # uym2 added #
    decompGroup.add_argument(
        "-M", "--diameter", type=float,
        dest="maxDiam", metavar="DIAMETER",
        default=None,
        help=("maximum tree diameter before stopping the decomposition "
              "[default: None]"))

    decompGroup.add_argument(
        "-S", "--decomp_strategy", type=valid_decomp_strategy,
        dest="decomp_strategy", metavar="DECOMP",
        default="normal",
        # default = "midpoint",
        help="decomposition strategy "
             "[default: using tree branch length]")
    # "[default: only include smallest subsets]")

    outputGroup = _parser.add_argument_group(
        "Output Options".upper(), "These options control output.")
    _parser.groups['outputGroup'] = outputGroup

    outputGroup.add_argument(
        "-p", "--tempdir",
        dest="tempdir", metavar="DIR",
        type=valid_dir_path,
        default=get_default_temp_dir(),
        help=("Tempfile files will be written to DIR. Full-path required. "
              "[default: %(default)s]"))

    outputGroup.add_argument(
        "-rt", "--remtemp",
        dest="remtemp",
        action="store_true",
        help=("Remove tempfile directory. "
              "[default: disabled]"))
    outputGroup.set_defaults(remtemp=False)

    outputGroup.add_argument(
        "-o", "--output",
        dest="output", metavar="OUTPUT",
        default="output",
        type=valid_file_prefix,
        help="output files with prefix OUTPUT. [default: %(default)s]")

    outputGroup.add_argument(
        "-d", "--outdir",
        dest="outdir", metavar="OUTPUT_DIR",
        default=os.path.curdir,
        type=valid_dir_path,
        help=("output to OUTPUT_DIR directory. full-path required. "
              "[default: %(default)s]"))

    inputGroup = _parser.add_argument_group(
        "Input Options".upper(),
        ' '.join([
            "These options control input. To run SEPP the following is "
            "required. A backbone tree (in newick format), a RAxML_info file "
            "(this is the file generated by RAxML during estimation of the "
            "backbone tree. Pplacer uses this info file to set model "
            "parameters), a backbone alignment file (in fasta format), and a"
            " fasta file including fragments. The input sequences are assumed"
            " to be DNA unless specified otherwise."]))
    _parser.groups['inputGroup'] = inputGroup

    inputGroup.add_argument(
        "-c", "--config",
        dest="config_file", metavar="CONFIG",
        type=argparse.FileType('r'),
        help=("A config file, including options used to run SEPP. Options"
              " provided as command line arguments overwrite config file "
              "values for those options. "
              "[default: %(default)s]"))

    inputGroup.add_argument(
        "-t", "--tree",
        dest="tree_file", metavar="TREE",
        type=argparse.FileType('r'),
        help="Input tree file (newick format) [default: %(default)s]")

    inputGroup.add_argument(
        "-r", "--raxml",
        dest="info_file", metavar="RAXML",
        type=argparse.FileType('r'),
        help=("RAxML_info file including model parameters, generated by "
              "RAxML. [default: %(default)s]"))

    inputGroup.add_argument(
        "-a", "--alignment",
        dest="alignment_file", metavar="ALIGN",
        type=argparse.FileType('r'),
        help="Aligned fasta file [default: %(default)s]")

    inputGroup.add_argument(
        "-f", "--fragment",
        dest="fragment_file", metavar="FRAG",
        type=argparse.FileType('r'),
        help="fragment file [default: %(default)s]")

    inputGroup.add_argument(
        "-m", "--molecule",
        dest="molecule", metavar="MOLECULE",
        type=valid_molecule,
        default="dna",
        help=("Molecule type of sequences. Can be amino, dna, or rna "
              "[default: %(default)s]"))

    otherGroup = _parser.add_argument_group(
        "Other options".upper(), "These options control how SEPP is run")
    _parser.groups['otherGroup'] = otherGroup

    otherGroup.add_argument(
        "-x", "--cpu", type=set_cpu,
        dest="cpu", metavar="N",
        # Evaluated now: also initialises scheduler.default_cpus.
        default=set_cpu(cpu_count()),
        help=("Use N cpus "
              "[default: number of cpus available on the machine]"))

    otherGroup.add_argument(
        "-cp", "--checkpoint", type=set_checkpoint,
        dest="checkpoint", metavar="CHCK_FILE",
        default=set_checkpoint(None),
        help="checkpoint file [default: no checkpointing]")

    otherGroup.add_argument(
        "-cpi", "--interval", type=int,
        dest="checkpoint_interval", metavar="N",
        default=3600,
        help=("Interval (in seconds) between checkpoint writes. Has effect "
              "only with -cp provided. [default: 3600]"))

    otherGroup.add_argument(
        "-seed", "--randomseed", type=int,
        dest="seed", metavar="N",
        default=297834,
        help="random seed number. [default: 297834]")

    # inputGroup.add_argument("-p", "--package",
    #                         dest="package", metavar="PKG",
    #                         help="package directory"
    #                         "[default: %(default)s]")
    #

    return _parser
def get_parser():
    """
    Return the shared ArgumentParser, building it on first use.

    Clients that need extra command-line options should add them to the
    returned parser.
    """
    global _parser
    if _parser is not None:
        return _parser
    _parser = _init_parser()
    return _parser
def _parse_options():
    """
    Assemble the options from the main config, user config and command line.

    Steps, in order:
      1. Read the main config file (if it exists) into a fresh Namespace.
      2. Parse the main config's [commandline] entries followed by
         sys.argv[1:].
      3. If a user config file was given, read it and insert its
         [commandline] entries between the main config's and sys.argv[1:].
      4. Parse again so that later sources override earlier ones.
      5. Seed the ``random`` module with the resulting seed.

    Returns the populated Namespace.
    """
    parser = get_parser()
    opts = Namespace()

    # First read the main configuration file
    _LOG.debug("Main configuration file at %s" % main_config_path)
    if os.path.exists(main_config_path):
        with open(main_config_path, 'r') as cfile:
            main_cmd_defaults = _read_config_file(cfile, opts)
    else:
        _LOG.warning(
            "Main configuration file was not found at: %s\n" %
            main_config_path + "Proceeding without the main configuration...")
        main_cmd_defaults = []

    input_args = main_cmd_defaults + (sys.argv[1:])

    # Then read the commandline options
    opts = parser.parse_args(input_args, namespace=opts)

    # If there is a user-specified config file, read that
    if opts.config_file is not None:
        config_cmd_defaults = _read_config_file(opts.config_file, opts)
        input_args = main_cmd_defaults + config_cmd_defaults + (sys.argv[1:])

    def error_callback(message):
        # Offending arguments may have come from a config file, so say so
        # and drop the "--" that was prepended to config keys.
        reworded = message.replace(
            "arguments:",
            "arguments (potentially from the config file):")
        ArgumentParser.error(parser, reworded.replace("--", ""))

    # Only the second parse uses the reworded error message.
    parser.error = error_callback

    _LOG.debug(str(input_args))

    # Read commandline options again to overwrite config file values
    opts = parser.parse_args(input_args, namespace=opts)

    random.seed(opts.seed)
    _LOG.info("Seed number: %d" % opts.seed)
    return opts
_options_singelton = None


def options():
    """
    Return the configuration Namespace, parsing it on the first call.

    The result combines the main configuration file, the command line and
    the user-supplied configuration file; later calls return the same
    cached object.
    """
    global _options_singelton
    if _options_singelton is not None:
        return _options_singelton
    _options_singelton = _parse_options()
    return _options_singelton