## 'Open Reading Frame'

**Connections**: `PROT`, `SUBS`

---


**Given**: A DNA string *s* of length at most 1 kbp in FASTA format.

**Return**: Every distinct candidate protein string that can be translated from ORFs of *s*. Strings can be returned in any order.


*Notes*:


In [1]:
# Libraries to load:
import os


In [15]:
# Previous functions generated

def dna_complement(string):
    '''
    Return the reverse complement of a string of DNA nucleotides.
    The input is read 5` to 3`. Pairing every base gives the opposite strand 3` to 5`,
    so the paired bases are reversed before returning to present that strand 5` to 3` too.
    Notes:
       Pairing used: A-T, T-A, C-G, and G-C
       Any other character raises a KeyError.
    '''
    pairing = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    # Pair in reading order first, then flip the whole strand.
    paired = [pairing[base] for base in string]
    paired.reverse()
    return ''.join(paired)


def fasta_dictionary(path_to_filename):
    '''
    A more robust function than in the `nb_gc` notebook. Here, open a FASTA file and keep only the identifier and the sequence (ignoring any additional information).
    Convert into a dictionary.
    Notes:
       - A header is a line starting with `>`; the identifier is the text between `>` and the
         first whitespace character (space, tab, ...), so the description never ends up in the key.
       - A sequence may span several lines; all whitespace inside it is removed.
       - Lines before the first header are ignored.
       - If an identifier occurs more than once, the last record with that identifier is kept.
    Output: a dictionary where the key-value pairs are sequence IDs and sequences, respectively
    '''
    chunks_by_id = {}
    current_chunks = None  # stays None until the first header has been seen
    with open(path_to_filename, 'r') as f:
        for line in f:
            if line.startswith('>'):
                header = line[1:]
                fields = header.split()
                # Whitespace directly after '>' (or a bare '>') means the identifier is empty.
                seq_id = fields[0] if fields and not header[0].isspace() else ''
                # Start a fresh record; re-using an identifier discards the earlier sequence.
                current_chunks = []
                chunks_by_id[seq_id] = current_chunks
            elif current_chunks is not None:
                current_chunks.append(''.join(line.split()))
    return {seq_id: ''.join(chunks) for seq_id, chunks in chunks_by_id.items()}


# Build the codon lookup from the table file: its whitespace-separated tokens are
# read as alternating key/value pairs (a codon, then what that codon translates to).
with open(os.getcwd() + "/datasets/rosalind_DNA_codon_table.txt", "r") as f:
    tokens = f.read().split()
dnacodon_dict = {tokens[pos]: tokens[pos + 1] for pos in range(0, len(tokens), 2)}
del tokens

# Codon that opens a reading frame.
dnacodon_start = 'ATG'


In [16]:
coding     = 'AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG'
complement = dna_complement(coding)

# Collect the protein of every open reading frame on both strands. An ORF begins at
# any start codon and is read three bases at a time up to the first stop codon in
# that frame; a frame that never reaches a stop codon yields nothing.
stop_codons  = ('TAA', 'TAG', 'TGA')
translations = set()  # a set, so identical proteins from different ORFs are kept once
for strand in (coding, complement):
    for begin in range(len(strand) - 2):
        if strand[begin:begin + 3] != dnacodon_start:
            continue
        residues = []
        for pos in range(begin, len(strand), 3):
            codon = strand[pos:pos + 3]
            if codon in stop_codons:
                translations.add(''.join(residues))
                break
            amino_acid = dnacodon_dict.get(codon)
            if amino_acid is None:
                # Ran off the end of the strand (partial codon) or hit a codon that is
                # not in the table: the frame is never closed, so it is not an ORF.
                break
            residues.append(amino_acid)
translations = sorted(translations)

for item in translations:
    print(item)

M
MGMTPRLGLESLLE
MLLGSFRLIPKETLIQVAGSSPCNLS
MTPRLGLESLLE


In [22]:
seqs = fasta_dictionary(os.getcwd() + '/datasets/rosalind_sample_dataset.txt')

# For every FASTA record, print the distinct proteins of all open reading frames on
# both strands. An ORF begins at any start codon and is read three bases at a time
# up to the first stop codon in that frame; a frame that never reaches a stop codon
# yields nothing. Records shorter than one codon simply produce no proteins.
stop_codons = ('TAA', 'TAG', 'TGA')
for coding in seqs.values():
    complement = dna_complement(coding)

    translations = set()  # a set, so identical proteins from different ORFs are kept once
    for strand in (coding, complement):
        for begin in range(len(strand) - 2):
            if strand[begin:begin + 3] != dnacodon_start:
                continue
            residues = []
            for pos in range(begin, len(strand), 3):
                codon = strand[pos:pos + 3]
                if codon in stop_codons:
                    translations.add(''.join(residues))
                    break
                amino_acid = dnacodon_dict.get(codon)
                if amino_acid is None:
                    # Ran off the end of the strand (partial codon) or hit a codon that is
                    # not in the table: the frame is never closed, so it is not an ORF.
                    break
                residues.append(amino_acid)
    translations = sorted(translations)

    for item in translations:
        print(item)
    print('-'*10)

M
MGMTPRLGLESLLE
MLLGSFRLIPKETLIQVAGSSPCNLS
MTPRLGLESLLE
----------


---

### Problem Attempt:

In [23]:
seqs = fasta_dictionary(os.getcwd() + '/datasets/rosalind_orf.txt')

# For every FASTA record, write the distinct proteins of all open reading frames on
# both strands to the submission file, one protein per line in sorted order.
# An ORF begins at any start codon and is read three bases at a time up to the first
# stop codon in that frame; a frame that never reaches a stop codon yields nothing.
stop_codons = ('TAA', 'TAG', 'TGA')

# The file is opened once, before the record loop, so a dataset with several records
# does not overwrite the results of the earlier ones; `with` closes it even on error.
with open(os.getcwd() + '/answer_submissions/rosalind_orf_submission.txt', "w") as outfile:
    for coding in seqs.values():
        complement   = dna_complement(coding)
        translations = set()  # a set, so identical proteins from different ORFs are kept once
        for strand in (coding, complement):
            for begin in range(len(strand) - 2):
                if strand[begin:begin + 3] != dnacodon_start:
                    continue
                residues = []
                for pos in range(begin, len(strand), 3):
                    codon = strand[pos:pos + 3]
                    if codon in stop_codons:
                        translations.add(''.join(residues))
                        break
                    amino_acid = dnacodon_dict.get(codon)
                    if amino_acid is None:
                        # Ran off the end of the strand (partial codon) or hit a codon that is
                        # not in the table: the frame is never closed, so it is not an ORF.
                        break
                    residues.append(amino_acid)
        for item in sorted(translations):
            outfile.write(item + '\n')

del seqs