In [1]:
def translate(seq):
    """
    Translate a nucleotide sequence into the corresponding amino-acid sequence.

    The sequence is read in consecutive, non-overlapping triplets (codons)
    starting at index 0. Each codon is looked up in the codon table defined
    below, and every amino acid is encoded as a string of length 1. The
    codons 'TAA', 'TAG' and 'TGA' (stop codons) are encoded as '_'.

    Parameters
    ----------
    seq : str
        Nucleotide sequence made of the letters A, C, G and T. Lower-case
        letters are accepted and treated like their upper-case equivalents.

    Returns
    -------
    str
        One character per codon; '' for an empty sequence.
        If len(seq) is not a multiple of 3, the literal message
        'Should be divisible by 3' is returned instead of a protein. This
        in-band error value is kept for backward compatibility, so callers
        that need to tell the two apart should check len(seq) % 3 themselves.

    Raises
    ------
    KeyError
        If a triplet is not one of the 64 codons in the table (for example
        when the sequence still contains whitespace or ambiguity codes).
    """
    table = {
        'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
        'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
        'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
        'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
        'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
        'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
        'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
        'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
        'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
        'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
        'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
        'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
        'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',
    }

    # A trailing partial codon cannot be translated; keep the historical
    # behaviour of returning a message rather than raising.
    if len(seq) % 3 != 0:
        return 'Should be divisible by 3'

    # Upper-case each codon (not the whole string) so the triplet boundaries
    # are always those of the caller's original sequence.
    # ''.join builds the result in one pass instead of repeated `+=`.
    return ''.join(
        table[seq[start:start + 3].upper()]
        for start in range(0, len(seq), 3)
    )

In [2]:
print(translate('ATA'))  # expected output: I

I


In [3]:
print(translate('AAA'))  # expected output: K

K


In [4]:
print(translate('GCC'))  # expected output: A

A


In [5]:
# Short hard-coded fragment, written as space-separated groups of ten letters.
seq = 'cccgcccccg cctggagtcc gacgtggaag ttgctggctg actgggcttg cgaggaaacc gcctcggagc tgcagccgaa ggccaaggaa tcactgaaga tcggcgaggg aggacagggg gttcatcatg ggtggctttt'
# Disabled experiments: each would append one extra character to seq.
#seq += '_'
#seq += 'c'
# Normalise: upper-case everything, then drop the group separators.
seq = ''.join(seq.upper().split(' '))
# Show the ten characters at indices 40..49.
print(seq[40:50])

ACTGGGCTTG


In [6]:
# now we will try with some real DNA
inputfi = 'dna.txt'

# Read the whole file inside a `with` block so the handle is closed
# as soon as the read finishes (the handle was previously left open).
with open(inputfi, 'r') as fi:
    seq = fi.read()

# Remove line terminators so the sequence is one continuous string.
seq = seq.replace('\n', '')
seq = seq.replace('\r', '')

print(seq)

GGTCAGAAAAAGCCCTCTCCATGTCTACTCACGATACATCCCTGAAAACCACTGAGGAAGTGGCTTTTCAGATCATCTTGCTTTGCCAGTTTGGGGTTGGGACTTTTGCCAATGTATTTCTCTTTGTCTATAATTTCTCTCCAATCTCGACTGGTTCTAAACAGAGGCCCAGACAAGTGATTTTAAGACACATGGCTGTGGCCAATGCCTTAACTCTCTTCCTCACTATATTTCCAAACAACATGATGACTTTTGCTCCAATTATTCCTCAAACTGACCTCAAATGTAAATTAGAATTCTTCACTCGCCTCGTGGCAAGAAGCACAAACTTGTGTTCAACTTGTGTTCTGAGTATCCATCAGTTTGTCACACTTGTTCCTGTTAATTCAGGTAAAGGAATACTCAGAGCAAGTGTCACAAACATGGCAAGTTATTCTTGTTACAGTTGTTGGTTCTTCAGTGTCTTAAATAACATCTACATTCCAATTAAGGTCACTGGTCCACAGTTAACAGACAATAACAATAACTCTAAAAGCAAGTTGTTCTGTTCCACTTCTGATTTCAGTGTAGGCATTGTCTTCTTGAGGTTTGCCCATGATGCCACATTCATGAGCATCATGGTCTGGACCAGTGTCTCCATGGTACTTCTCCTCCATAGACATTGTCAGAGAATGCAGTACATATTCACTCTCAATCAGGACCCCAGGGGCCAAGCAGAGACCACAGCAACCCATACTATCCTGATGCTGGTAGTCACATTTGTTGGCTTTTATCTTCTAAGTCTTATTTGTATCATCTTTTACACCTATTTTATATATTCTCATCATTCCCTGAGGCATTGCAATGACATTTTGGTTTCGGGTTTCCCTACAATTTCTCCTTTACTGTTGACCTTCAGAGACCCTAAGGGTCCTTGTTCTGTGTTCTTCAACTGTTGAAAGCCAGAGTCACTAAAAATGCCAAACACAGAAGACAGCTTTGCTAATACCATTAAATACTT

In [7]:
# Spot-check: print the ten characters at indices 40..49 of seq.
print(seq[40:50])

CCTGAAAACC


In [8]:
# Translate the sequence currently held in seq and print the result.
print(translate(seq))

GQKKPSPCLLTIHP_KPLRKWLFRSSCFASLGLGLLPMYFSLSIISLQSRLVLNRGPDK_F_DTWLWPMP_LSSSLYFQTT__LLLQLFLKLTSNVN_NSSLASWQEAQTCVQLVF_VSISLSHLFLLIQVKEYSEQVSQTWQVILVTVVGSSVS_ITSTFQLRSLVHS_QTITITLKASCSVPLLISV_ALSS_GLPMMPHS_ASWSGPVSPWYFSSIDIVRECSTYSLSIRTPGAKQRPQQPILS_CW_SHLLAFIF_VLFVSSFTPILYILIIP_GIAMTFWFRVSLQFLLYC_PSETLRVLVLCSSTVESQSH_KCQTQKTALLIPLNTLFHKYVFKSLYEQGMVLTAILIKE_GYNHLLI_KDFWLESD_NSELFTTLHSL
