From 4056b394693660f66e8c9bd7ce1683e1538915e4 Mon Sep 17 00:00:00 2001 From: Siavash Mirarab Date: Wed, 13 Jan 2021 08:59:54 -0800 Subject: [PATCH] 4.3.19 --- .travis.yml | 2 +- CHANGELOG.md | 3 +++ sepp/alignment.py | 11 +++++++++++ sepp/config.py | 7 ++++++- sepp/jobs.py | 11 +++++++---- tutorial/upp-tutorial.md | 23 +++++++++++++++++++++++ 6 files changed, 51 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 2f4de97..fa4fd7b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,7 @@ os: - linux - osx env: - - PYVERSION="3.5" + - PYVERSION="3.7" - PYVERSION="3.8" before_install: - echo "$TRAVIS_OS_NAME" diff --git a/CHANGELOG.md b/CHANGELOG.md index 7454ab8..6b0a290 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +* Version 4.3.19: + * Enable `--symfrac` from config file + * Enable config files that overwrite the default values only when provided * Version 4.3.18: * UPP speed improvement for cases with super gappy backbone alignment (more can be done) * Slightly better logging diff --git a/sepp/alignment.py b/sepp/alignment.py index 9a32c3c..ada5de8 100644 --- a/sepp/alignment.py +++ b/sepp/alignment.py @@ -332,12 +332,23 @@ def remove_columns(self, indexes): self[name] = ''.join(( char for idx, char in enumerate(seq) if idx not in indexes)) + def keep_columns(self, indexes): + for name, seq in self.items(): + self[name] = ''.join(( + char for idx, char in enumerate(seq) if idx in indexes)) + def get_all_gap_cols(self): all_gaps = list(range(0, self.get_length())) for seq in self.values(): all_gaps[:] = [i for i in all_gaps if seq[i] == '-'] return all_gaps + def get_all_nongap_cols(self): + all_gaps = list(range(0, self.get_length())) + for seq in self.values(): + all_gaps[:] = [i for i in all_gaps if seq[i] == '-'] + return [x for x in range(0, self.get_length()) if x not in all_gaps] + def delete_all_gap(self): """ Delete all sites that consists of nothing but gaps diff --git a/sepp/config.py b/sepp/config.py index 
298fe4a..151b08e 100644 --- a/sepp/config.py +++ b/sepp/config.py @@ -59,6 +59,7 @@ def set_main_config_path(filename): def _read_config_file(filename, opts, expand=None): + _LOG.debug("Reading config %s" %filename) config_defaults = [] cparser = configparser.ConfigParser() cparser.optionxform = str @@ -72,7 +73,10 @@ def _read_config_file(filename, opts, expand=None): for section in cparser.sections(): if section == "commandline": continue - section_name_space = Namespace() + if getattr(opts, section, None): + section_name_space = getattr(opts, section) + else: + section_name_space = Namespace() for (k, v) in cparser.items(section): if expand and k == "path": v = os.path.join(expand, v) @@ -351,6 +355,7 @@ def error_callback(message): parser.error = error_callback + _LOG.debug(str(input_args)) ''' Read commandline options again to overwrite config file values''' opts = parser.parse_args(input_args, namespace=opts) random.seed(opts.seed) diff --git a/sepp/jobs.py b/sepp/jobs.py index 858340b..953a7e5 100644 --- a/sepp/jobs.py +++ b/sepp/jobs.py @@ -265,12 +265,15 @@ def setup_for_subproblem(self, subproblem, symfrac=True, self.options = kwargs['options'] def get_invocation(self): - invoc = [self.path, '--ere', '0.59', "--cpu", "1", + useroptions = self.options.split() + invoc = [self.path, "--cpu", "1", "--%s" % self.molecule] - if self.symfrac is True: + if "--ere" not in useroptions: + invoc.extend(['--ere', '0.59']) + if self.symfrac is True and "--symfrac" not in useroptions: invoc.extend(["--symfrac", "0.0"]) - if self.options != "": - invoc.extend(self.options.split()) + if useroptions: + invoc.extend(useroptions) if self.informat == "fasta": invoc.extend(['--informat', 'afa']) invoc.extend([self.outfile, self.infile]) diff --git a/tutorial/upp-tutorial.md b/tutorial/upp-tutorial.md index f607299..0115bb5 100644 --- a/tutorial/upp-tutorial.md +++ b/tutorial/upp-tutorial.md @@ -224,6 +224,29 @@ run_upp.py -c sample.config -o config_example ``` --------- + + + 
+Hints for scalability +--- +If UPP is too slow or takes too much memory, consider the following options: + +* Use `-A` to increase alignment subset size. The default, 10, is too small for very large alignments. + For example, for 10,000 species, consider using `-A 100` instead of the default `-A 10`. +* If your backbone alignment (either what you provide or the PASTA alignment that UPP internally estimates) becomes + too long and gappy (e.g., more than 100,000 sites), you can try the following option. + * Make a config file (e.g., `upp.conf`) and put the following in it: + ~~~ + [hmmbuild] + options = --symfrac 0.05 + ~~~ + * The `0.05` tells HMMBuild to treat sites that are more than 95% gaps as insertion sites rather than match sites. + This can make the HMMs smaller. Choose a value that makes sense for your data, based on how gappy it is. + * Run UPP with the `-c upp.conf` option provided. +* If you have set `-B` to a very large value (say more than 10,000), consider lowering it. + Very large numbers of species in the backbone can make the PASTA alignment very gappy and memory intensive. + + Contact ===