Skip to content

Commit

Permalink
Merge pull request #82 from balabanmetin/dev_backtranslate
Browse files Browse the repository at this point in the history
backtranslate functionality added to UPP.
  • Loading branch information
smirarab committed Jul 25, 2020
2 parents 824182a + 9181321 commit 5e961be
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 4 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
* Version 4.3.13:
* Added backtranslation functionality to UPP for amino acid sequences.
* Version 4.3.12:
* Added `-rt` to remove temp files
* Version 4.3.11:
Expand Down
14 changes: 11 additions & 3 deletions README.UPP.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ PARALLEL IMPLEMENTATION: UPP is embarrassingly parallel. See Advanced Usage info



Developers: Nam Nguyen, Siavash Mirarab, and Tandy Warnow.
Developers: Nam Nguyen, Siavash Mirarab, and Tandy Warnow with valuable contributions from Metin Balaban.

###Publication:
Nam Nguyen, Siavash Mirarab, Keerthana Kumar, and Tandy Warnow. `Ultra-large alignments using Phylogeny Aware Profiles`. Accepted to RECOMB 2015 (Research in Computational Molecular Biology 2015) and Genome Biology.
Expand Down Expand Up @@ -76,9 +76,9 @@ The general command for running UPP is:

This will run UPP(Default) as described in the main paper. This will automatically select up to 1,000 sequences to be in the backbone set, generate a PASTA alignment and tree, and then align the remaining sequences to the backbone alignment. UPP can also be run using a configuration file.

The main outputs of UPP are two alignment files, <prefix>_alignment.fasta and <prefix>_alignment_masked.fasta. The <prefix>_alignment.fasta file is the alignment of the unaligned sequences. The <prefix>_alignment_masked.fasta is the masked alignment file; non-homologous sites in the query set are removed.
The main outputs of UPP are two alignment files, `<prefix>_alignment.fasta` and `<prefix>_alignment_masked.fasta`. The `<prefix>_alignment.fasta` file is the alignment of the unaligned sequences. The `<prefix>_alignment_masked.fasta` is the masked alignment file; non-homologous sites in the query set are removed.

The secondary outputs are the backbone alignment and tree (always named as pasta.fasta and pasta.fasttree) and the list of insertion columns (named _insertion_columns.txt).
The secondary outputs are the backbone alignment and tree (always named as pasta.fasta and pasta.fasttree) and the list of insertion columns (named `<prefix>_insertion_columns.txt`).

Sample configuration files and input files can be found under `test/unittest/data/upp/`. Change to that directory to run UPP on the sample files. To run UPP(Fast) on a small test example with 1,000 sequences, run the following command from the `test/unittest/data/upp/` directory:

Expand Down Expand Up @@ -124,6 +124,14 @@ To run the parallelized version of UPP, run

`python <bin>/run_upp.py -s input.fas -x <cpus>`

If nucleotide sequences are known for input amino acid sequences (backbone and query), backtranslation of extended
alignment is performed via the following command:

`python <bin>/run_upp.py -s input.fas -a <alignment_file> -t <tree_file> -b nucleotide_sequences.fas`

where nucleotide_sequences.fas contains unaligned DNA sequences of both backbone and query sequences.
The command will create two DNA alignment files: `<prefix>_backtranslated_alignment.fasta` and
`<prefix>_backtranslated_alignment_masked.fasta` .

---------------------------------------------
Bugs and Errors
Expand Down
86 changes: 86 additions & 0 deletions sepp/backtranslate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env python


import itertools
from sepp.alignment import ExtendedAlignment


gencode = {
'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W' # ,
# 'CCA':'Z', 'CCC':'Z', 'CCG':'Z', 'CCT':'Z',
# 'GCA':'Z', 'GCC':'Z', 'GCG':'Z', 'GCT':'Z'
}

dnacode = {'A': ['A'], 'C': ['C'], 'G': ['G'], 'T': ['T'],
'S': ['G', 'C'], 'R': ['G', 'A'], 'Y': ['T', 'C'],
'W': ['A', 'T'], 'M': ['A', 'C'], 'K': ['G', 'T'],
'B': ['G', 'C', 'T'], 'H': ['A', 'C', 'T'],
'D': ['G', 'A', 'T'], 'V': ['G', 'C', 'A'],
'N': ['A', 'C', 'G', 'T']}


def is_compatible(cd, aa):
if aa == 'Z':
return is_compatible(cd, 'E') or is_compatible(cd, 'Q')
elif aa == 'B':
return is_compatible(cd, 'N') or is_compatible(cd, 'D')
else:
return aa == 'X' or aa in set(gencode[''.join(a)]
for a in itertools.product(
[''.join(a) for a in itertools.product(
dnacode[cd[0]], dnacode[cd[1]])], dnacode[cd[2]]))


def is_ambiguous(cd):
return len(set(gencode[''.join(a)]
for a in itertools.product(
[''.join(a) for a in itertools.product(
dnacode[cd[0]], dnacode[cd[1]])],
dnacode[cd[2]]))) > 1


def backtranslate(faa, fna):
newfna = ExtendedAlignment(faa.fragments)
for k, s in fna.items():
if k in faa.keys():
aa = faa[k].upper()
cd = []
i = 0
for r in aa:
cds = s[i:i + 3]
if r == '-':
cd.append('---')
else:
if is_compatible(cds, r):
cd.append(cds)
i += 3
else:
if i == 0 and (cds == 'GTG' or cds == 'TTG'):
cd.append(cds)
i += 3
else:
raise ValueError(
'%s at position %d of %s '
'does not translate to %s' % (cds, i, k, r))
newfna[k] = ''.join(cd)
else:
continue
col_lab = faa.col_labels
for i in col_lab:
newfna._col_labels = newfna._col_labels + [i, i, i]
return newfna
45 changes: 45 additions & 0 deletions sepp/exhaustive_upp.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import sepp.config
from sepp.math_utils import lcm
from sepp.problem import SeppProblem
from sepp.backtranslate import backtranslate

_LOG = get_logger(__name__)

Expand Down Expand Up @@ -178,6 +179,14 @@ def check_options(self):
% (options().backbone_size, backbone_size))
if options().placement_size is None:
options().placement_size = options().backbone_size

if options().backtranslation_sequence_file and \
options().molecule != "amino":
_LOG.error(
("Backtranslation can be performed only when "
"input sequences are amino acid. "))
exit(-1)

return ExhaustiveAlgorithm.check_options(self)

def merge_results(self):
Expand Down Expand Up @@ -278,6 +287,33 @@ def output_results(self):
outfilename = self.get_output_filename("insertion_columns.txt")
extended_alignment.write_insertion_column_indexes(outfilename)
_LOG.info("The index of insertion columns written to %s" % outfilename)
if self.options.backtranslation_sequence_file:
outfilename = self.get_output_filename(
"backtranslated_alignment.fasta")
backtranslation_seqs = MutableAlignment()
backtranslation_seqs.read_file_object(
self.options.backtranslation_sequence_file)
try:
extended_backtranslated_alignment = backtranslate(
self.results, backtranslation_seqs)
except Exception as e:
_LOG.warning("Backtranslation failed due "
"to following error: " + str(e) + ".\n"
"No translated DNA sequence will be "
"written to a file.")
pass
else:
extended_backtranslated_alignment.write_to_path(outfilename)
_LOG.info(
"Backtranslated alignment written to %s" % outfilename)
extended_backtranslated_alignment.remove_insertion_columns()
outfilename = self.get_output_filename(
"backtranslated_alignment_masked.fasta")
extended_backtranslated_alignment.write_to_path(outfilename)
_LOG.info(
"Backtranslated masked alignment written "
"to %s" % outfilename)

extended_alignment.remove_insertion_columns()
outfilename = self.get_output_filename("alignment_masked.fasta")
extended_alignment.write_to_path(outfilename)
Expand Down Expand Up @@ -410,6 +446,15 @@ def augment_parser():
help="Aligned fasta file "
"[default: %(default)s]")

inputGroup.add_argument(
"-b", "--backtranslation",
dest="backtranslation_sequence_file", metavar="SEQ",
type=argparse.FileType('r'),
default=None,
help="Fasta file containing unaligned DNA sequences "
"corresponding every reference and query sequence "
"[default: None]")

uppGroup = parser.add_argument_group(
"UPP Options".upper(),
"These options set settings specific to UPP")
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from distutils.spawn import find_executable

use_setuptools(version="0.6.24")
version = "4.3.12"
version = "4.3.13"


def get_tools_dir(where):
Expand Down

0 comments on commit 5e961be

Please sign in to comment.