Merge pull request #82 from balabanmetin/dev_backtranslate

backtranslate functionality added to UPP.
smirarab · Jul 25, 2020 · 5e961be · 5e961be
2 parents 824182a + 9181321
commit 5e961be
Show file tree

Hide file tree

Showing 5 changed files with 145 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,5 @@
+* Version 4.3.13:
+	* Added backtranslation functionality to UPP for amino acid sequences.
 * Version 4.3.12:
 	* Added `-rt` to remove temp files	
 * Version 4.3.11:

diff --git a/README.UPP.md b/README.UPP.md
@@ -22,7 +22,7 @@ PARALLEL IMPLEMENTATION: UPP is embarrassingly parallel. See Advanced Usage info
 
 
 
-Developers: Nam Nguyen, Siavash Mirarab, and Tandy Warnow.
+Developers: Nam Nguyen, Siavash Mirarab, and Tandy Warnow with valuable contributions from Metin Balaban.
 
 ###Publication:
 Nam Nguyen, Siavash Mirarab, Keerthana Kumar, and Tandy Warnow. `Ultra-large alignments using Phylogeny Aware Profiles`. Accepted to RECOMB 2015 (Research in Computational Molecular Biology 2015) and Genome Biology.
@@ -76,9 +76,9 @@ The general command for running UPP is:
 
 This will run UPP(Default) as described in the main paper.  This will automatically select up to 1,000 sequences to be in the backbone set, generate a PASTA alignment and tree, and then align the remaining sequences to the backbone alignment.  UPP can also be run using a configuration file. 
 
-The main outputs of UPP are two alignment files, <prefix>_alignment.fasta and <prefix>_alignment_masked.fasta.  The  <prefix>_alignment.fasta file is the alignment of the unaligned sequences.  The <prefix>_alignment_masked.fasta is the masked alignment file; non-homologous sites in the query set are removed.  
+The main outputs of UPP are two alignment files, `<prefix>_alignment.fasta` and `<prefix>_alignment_masked.fasta`.  The  `<prefix>_alignment.fasta` file is the alignment of the unaligned sequences.  The `<prefix>_alignment_masked.fasta` is the masked alignment file; non-homologous sites in the query set are removed.  
 
-The secondary outputs are the backbone alignment and tree (always named as pasta.fasta and pasta.fasttree) and the list of insertion columns (named _insertion_columns.txt).
+The secondary outputs are the backbone alignment and tree (always named as pasta.fasta and pasta.fasttree) and the list of insertion columns (named `<prefix>_insertion_columns.txt`).
 
 Sample configuration files and input files can be found under `test/unittest/data/upp/`. Change to that directory to run UPP on the sample files.  To run UPP(Fast) on a small test example with 1,000 sequences, run the following command from the `test/unittest/data/upp/` directory:
 
@@ -124,6 +124,14 @@ To run the parallelized version of UPP, run
 
 `python <bin>/run_upp.py -s input.fas -x <cpus>`
 
+If nucleotide sequences are known for input amino acid sequences (backbone and query), backtranslation of extended 
+alignment is performed via the following command:
+
+`python <bin>/run_upp.py -s input.fas -a <alignment_file> -t <tree_file> -b nucleotide_sequences.fas`
+
+where nucleotide_sequences.fas contains unaligned DNA sequences of both backbone and query sequences. 
+The command will create two DNA alignment files: `<prefix>_backtranslated_alignment.fasta` and 
+`<prefix>_backtranslated_alignment_masked.fasta` .
 
 ---------------------------------------------
 Bugs and Errors

diff --git a/sepp/backtranslate.py b/sepp/backtranslate.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+
+
+import itertools
+from sepp.alignment import ExtendedAlignment
+
+
+gencode = {
+    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
+    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
+    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
+    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
+    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
+    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
+    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
+    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
+    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
+    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
+    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
+    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
+    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
+    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
+    'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
+    'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W'  # ,
+    #    'CCA':'Z', 'CCC':'Z', 'CCG':'Z', 'CCT':'Z',
+    #    'GCA':'Z', 'GCC':'Z', 'GCG':'Z', 'GCT':'Z'
+}
+
+dnacode = {'A': ['A'], 'C': ['C'], 'G': ['G'], 'T': ['T'],
+           'S': ['G', 'C'], 'R': ['G', 'A'], 'Y': ['T', 'C'],
+           'W': ['A', 'T'], 'M': ['A', 'C'], 'K': ['G', 'T'],
+           'B': ['G', 'C', 'T'], 'H': ['A', 'C', 'T'],
+           'D': ['G', 'A', 'T'], 'V': ['G', 'C', 'A'],
+           'N': ['A', 'C', 'G', 'T']}
+
+
+def is_compatible(cd, aa):
+    if aa == 'Z':
+        return is_compatible(cd, 'E') or is_compatible(cd, 'Q')
+    elif aa == 'B':
+        return is_compatible(cd, 'N') or is_compatible(cd, 'D')
+    else:
+        return aa == 'X' or aa in set(gencode[''.join(a)]
+                                      for a in itertools.product(
+            [''.join(a) for a in itertools.product(
+                dnacode[cd[0]], dnacode[cd[1]])], dnacode[cd[2]]))
+
+
+def is_ambiguous(cd):
+    return len(set(gencode[''.join(a)]
+                   for a in itertools.product(
+        [''.join(a) for a in itertools.product(
+            dnacode[cd[0]], dnacode[cd[1]])],
+        dnacode[cd[2]]))) > 1
+
+
+def backtranslate(faa, fna):
+    newfna = ExtendedAlignment(faa.fragments)
+    for k, s in fna.items():
+        if k in faa.keys():
+            aa = faa[k].upper()
+            cd = []
+            i = 0
+            for r in aa:
+                cds = s[i:i + 3]
+                if r == '-':
+                    cd.append('---')
+                else:
+                    if is_compatible(cds, r):
+                        cd.append(cds)
+                        i += 3
+                    else:
+                        if i == 0 and (cds == 'GTG' or cds == 'TTG'):
+                            cd.append(cds)
+                            i += 3
+                        else:
+                            raise ValueError(
+                                '%s at position %d of %s '
+                                'does not translate to %s' % (cds, i, k, r))
+            newfna[k] = ''.join(cd)
+        else:
+            continue
+    col_lab = faa.col_labels
+    for i in col_lab:
+        newfna._col_labels = newfna._col_labels + [i, i, i]
+    return newfna
diff --git a/sepp/exhaustive_upp.py b/sepp/exhaustive_upp.py
@@ -17,6 +17,7 @@
 import sepp.config
 from sepp.math_utils import lcm
 from sepp.problem import SeppProblem
+from sepp.backtranslate import backtranslate
 
 _LOG = get_logger(__name__)
 
@@ -178,6 +179,14 @@ def check_options(self):
             % (options().backbone_size, backbone_size))
         if options().placement_size is None:
             options().placement_size = options().backbone_size
+
+        if options().backtranslation_sequence_file and \
+                options().molecule != "amino":
+            _LOG.error(
+                ("Backtranslation can be performed only when "
+                 "input sequences are amino acid. "))
+            exit(-1)
+
         return ExhaustiveAlgorithm.check_options(self)
 
     def merge_results(self):
@@ -278,6 +287,33 @@ def output_results(self):
         outfilename = self.get_output_filename("insertion_columns.txt")
         extended_alignment.write_insertion_column_indexes(outfilename)
         _LOG.info("The index of insertion columns written to %s" % outfilename)
+        if self.options.backtranslation_sequence_file:
+            outfilename = self.get_output_filename(
+                "backtranslated_alignment.fasta")
+            backtranslation_seqs = MutableAlignment()
+            backtranslation_seqs.read_file_object(
+                self.options.backtranslation_sequence_file)
+            try:
+                extended_backtranslated_alignment = backtranslate(
+                    self.results, backtranslation_seqs)
+            except Exception as e:
+                _LOG.warning("Backtranslation failed due "
+                             "to following error: " + str(e) + ".\n"
+                             "No translated DNA sequence will be "
+                             "written to a file.")
+                pass
+            else:
+                extended_backtranslated_alignment.write_to_path(outfilename)
+                _LOG.info(
+                    "Backtranslated alignment written to %s" % outfilename)
+                extended_backtranslated_alignment.remove_insertion_columns()
+                outfilename = self.get_output_filename(
+                    "backtranslated_alignment_masked.fasta")
+                extended_backtranslated_alignment.write_to_path(outfilename)
+                _LOG.info(
+                    "Backtranslated masked alignment written "
+                    "to %s" % outfilename)
+
         extended_alignment.remove_insertion_columns()
         outfilename = self.get_output_filename("alignment_masked.fasta")
         extended_alignment.write_to_path(outfilename)
@@ -410,6 +446,15 @@ def augment_parser():
         help="Aligned fasta file "
              "[default: %(default)s]")
 
+    inputGroup.add_argument(
+        "-b", "--backtranslation",
+        dest="backtranslation_sequence_file", metavar="SEQ",
+        type=argparse.FileType('r'),
+        default=None,
+        help="Fasta file containing unaligned DNA sequences "
+             "corresponding every reference and query sequence "
+             "[default: None]")
+
     uppGroup = parser.add_argument_group(
         "UPP Options".upper(),
         "These options set settings specific to UPP")

diff --git a/setup.py b/setup.py
@@ -30,7 +30,7 @@
 from distutils.spawn import find_executable
 
 use_setuptools(version="0.6.24")
-version = "4.3.12"
+version = "4.3.13"
 
 
 def get_tools_dir(where):