## 'RNA Splicing'

**Connections**: `PROT`, `SUBS`

---


**Given**: A DNA string *s* (of length at most 1 kbp) and a collection of substrings of *s* acting as introns.

**Return**: A protein string resulting from transcribing and translating the exons of *s*.


*Notes*:
- *Input*: All strings are given in FASTA format.
- Only one solution will exist for the dataset provided.

In [1]:
# Libraries to load:
import os, math, random


In [19]:
# Functions written for previous problems, reused here

def fasta_dictionary(path_to_filename):
    '''
    Open a FASTA file and keep only the identifier and the sequence of each record
    (ignoring any additional information on the header line).

    A header line starts with '>'. The identifier is the first whitespace-delimited
    token after '>' (spaces and tabs both end the identifier). The sequence lines
    that follow a header are concatenated, with all whitespace removed, until the
    next header or the end of the file. Lines before the first header are ignored.

    Input:  path_to_filename - path to a FASTA-formatted text file
    Output: a dictionary where the key-value pairs are sequence IDs and sequences,
            respectively, in the order the records appear in the file
            (if an ID is repeated, the sequence of the last such record is kept)
    '''
    seq_dict = {}
    current_id = None   # None until the first header line has been seen
    pieces = []         # sequence fragments of the record being read

    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open(path_to_filename, 'r') as f:
        for line in f:
            if line.startswith('>'):
                # A new header ends the previous record (if there was one).
                if current_id is not None:
                    seq_dict[current_id] = ''.join(pieces)
                # split() without arguments splits on any whitespace, so a
                # tab-separated description does not leak into the identifier.
                header_fields = line[1:].split()
                current_id = header_fields[0] if header_fields else ''
                pieces = []
            elif current_id is not None:
                # Remove the newline and any stray whitespace inside the line.
                pieces.append(''.join(line.split()))

    # Store the final record, which is not followed by another header.
    if current_id is not None:
        seq_dict[current_id] = ''.join(pieces)
    return seq_dict


def dna_rna_transcribe(dna_string: str) -> str:
    '''
    Transcribe a string of DNA nucleotides into RNA nucleotides by replacing
       every thymine 'T' with uracil 'U'; all other characters are copied unchanged.
    Result: the transcribed RNA string
    '''
    return ''.join('U' if base == 'T' else base for base in dna_string)


def rna_prot_translate(rna_seq: str) -> str:
    '''
    Translate an RNA nucleotide string into an amino acid (AA) residue string.
    Step 1: load the RNA codon table
        Read from "datasets/rosalind_RNA_codon_table.txt" under the current working
        directory (data found at https://rosalind.info/glossary/rna-codon-table/).
        The file is read as whitespace-separated tokens alternating codon, residue,
        giving a dictionary where the 3-nt RNA codon is the key and the residue
        (or 'Stop') is the value.
    Step 2: translate
        Read the sequence codon by codon from position 0. Translation ends at the
        first codon that maps to 'Stop'; 'Stop' itself is not added to the result.
        A trailing fragment shorter than 3 nt is ignored.
        A codon missing from the table raises KeyError.
    Result: string of AA residues
    '''
    table_path = os.getcwd() + "/datasets/rosalind_RNA_codon_table.txt"
    with open(table_path, "r") as table_file:
        tokens = table_file.read().split()
    # Even positions hold codons, the following odd positions hold their residues.
    codon_to_residue = {tokens[k]: tokens[k + 1] for k in range(0, len(tokens), 2)}

    residues = []
    for start in range(0, len(rna_seq), 3):
        triplet = rna_seq[start:start + 3]
        if len(triplet) < 3:
            # Incomplete codon at the end of the sequence: nothing to translate.
            continue
        residue = codon_to_residue[triplet]
        if residue == 'Stop':
            break
        residues.append(residue)
    return ''.join(residues)



In [20]:
# Load the sample FASTA dataset (under the current working directory) into a
# dictionary of sequence IDs and sequences.
seqs_dict = fasta_dictionary(os.getcwd() + '/datasets/rosalind_sample_dataset.txt')

# Bare expression: as the last line of a notebook cell it displays the dictionary.
seqs_dict

{'Rosalind_10': 'ATGGTCTACATAGCTGACAAACAGCACGTAGCAATCGGTCGAATCTCGAGAGGCATATGGTCACATGATCGGTCGAGCGTGTTTCAAAGTTTGCGCCTAG',
 'Rosalind_12': 'ATCGGTCGAA',
 'Rosalind_15': 'ATCGGTCGAGCGTGT'}

In [36]:
# Sanity check: transcribe and translate the first record of the sample dataset
# without removing any introns, printing each intermediate string.
seq_names = list(seqs_dict.keys())
dna_test1 = seqs_dict[seq_names[0]]
rna_test1 = dna_rna_transcribe(dna_test1)
pro_test1 = rna_prot_translate(rna_test1)

# One string per line (DNA, RNA, protein), followed by an empty line.
print(dna_test1, rna_test1, pro_test1, sep='\n')
print()

# Remove the temporary test variables from the notebook namespace.
del dna_test1, rna_test1, pro_test1

ATGGTCTACATAGCTGACAAACAGCACGTAGCAATCGGTCGAATCTCGAGAGGCATATGGTCACATGATCGGTCGAGCGTGTTTCAAAGTTTGCGCCTAG
AUGGUCUACAUAGCUGACAAACAGCACGUAGCAAUCGGUCGAAUCUCGAGAGGCAUAUGGUCACAUGAUCGGUCGAGCGUGUUUCAAAGUUUGCGCCUAG
MVYIADKQHVAIGRISRGIWSHDRSSVFQSLRL



In [51]:
def dna_to_protein_with_splicing(sequence_dictionary) -> str:
    '''
    Remove the introns from a DNA string, then transcribe and translate the
    remaining exons into a protein string.

    Input:  sequence_dictionary - dictionary of sequence IDs and DNA strings
            (e.g. the output of fasta_dictionary). The first entry is the full
            DNA string; every following entry is an intron to remove from it.
    Step 1: delete every occurrence of each intron, in dictionary order.
            An intron that does not occur in the (already spliced) string, or an
            empty intron record, leaves the string unchanged.
    Step 2: transcribe the spliced DNA with dna_rna_transcribe and translate the
            RNA with rna_prot_translate.
    Output: the protein string returned by rna_prot_translate
    Raises: IndexError if sequence_dictionary is empty
    '''
    sequences = list(sequence_dictionary.values())
    dna_exon = sequences[0]
    intron_lst = sequences[1:]

    for intron in intron_lst:
        # str.replace removes all non-overlapping occurrences and, unlike
        # str.split, does not raise ValueError when the intron is an empty string.
        dna_exon = dna_exon.replace(intron, '')

    return rna_prot_translate(dna_rna_transcribe(dna_exon))



In [54]:
# Solve the sample dataset and write the protein string, followed by a newline,
# to the sample submission file. The `with` block closes the file on exit, so no
# explicit outfile.close() is needed afterwards.
with open(os.getcwd() + '/answer_submissions/rosalind_sample_dataset_submission.txt', 'w') as outfile:
    outfile.write(
        dna_to_protein_with_splicing(
            fasta_dictionary(os.getcwd() + '/datasets/rosalind_sample_dataset.txt')
        ) + '\n'
    )


---

### Problem Attempt:

In [55]:
# Solve the downloaded problem dataset and write the protein string, followed by
# a newline, to the submission file. The `with` block closes the file on exit, so
# no explicit outfile.close() is needed afterwards.
with open(os.getcwd() + '/answer_submissions/rosalind_splc_submission.txt', 'w') as outfile:
    outfile.write(
        dna_to_protein_with_splicing(
            fasta_dictionary(os.getcwd() + '/datasets/rosalind_splc.txt')
        ) + '\n'
    )
