## 'Computing GC Content'

**Given**: At most 10 DNA strings in FASTA format (of length at most 1 kbp each).

**Return**: The ID of the string having the highest GC-content, followed by the GC-content of that string (as a percentage). Rosalind allows for a default error of 0.001 in all decimal answers unless otherwise stated; see the note on absolute error in the original Rosalind problem statement.


In [83]:
def nt_gc_counter(text_string: str) -> float:
    '''
    For the loaded string of nucleotides, determine 'G' and 'C' content.

    Only upper-case 'G' and 'C' characters are counted; every character of
    _text_string_ contributes to the total length.
    Output: percentage (0-100) of 'G' and 'C' nucleotides in the string;
        0.0 for an empty string, where the ratio is undefined.
    '''
    if not text_string:
        # An empty sequence would otherwise raise ZeroDivisionError below.
        return 0.0
    gc_total = text_string.count('G') + text_string.count('C')
    return 100*(gc_total/len(text_string))


def fasta_dict_generator(filepath) -> dict:
    '''
    For _filepath_ provided,
       1. open file and read
       2. split at each '>' (the FASTA record marker)
       3. take the first line of each record as its label
       4. join the remaining lines of the record into one sequence string
    Anything that precedes the first '>' is not a record and is ignored, as
    are records that contain only whitespace.
    Output: a dictionary with a key:value pair such that each key is the
        record label (e.g. 'Rosalind_XXXX') and each value its sequence.
        If a label occurs more than once, the last record wins.
    '''
    with open(filepath, "r") as infile:
        records = infile.read().split('>')

    seq_dict = {}
    # records[0] is whatever came before the first '>', never a record.
    for record in records[1:]:
        if not record.strip():
            continue
        # Split on line boundaries *before* discarding them so the label may
        # have any length, rather than assuming a fixed 13-character ID.
        lines = record.splitlines()
        label = lines[0].strip()
        seq_dict[label] = ''.join(line.strip() for line in lines[1:])
    return seq_dict


def gc_content_comp(dictionary) -> tuple:
    '''
    For loaded _dictionary_ of sequence labels (keys) and nucleotide sequences (value), determine
        which key:value pair has the greatest GC content.
    When several sequences share the greatest GC content, the one that comes
    first in the dictionary's iteration order is reported.
    Output: a tuple (label, gc_content) where gc_content is the value returned
        by nt_gc_counter for that sequence; ('', 0) if _dictionary_ is empty.
    '''
    best_seq = None
    gc_cont = 0
    for key, sequence in dictionary.items():
        # Compute once per sequence instead of once for the comparison and
        # again for the assignment.
        current = nt_gc_counter(sequence)
        # The `is None` check lets the first entry win even when every
        # sequence has a GC content of 0.
        if best_seq is None or current > gc_cont:
            best_seq = key
            gc_cont = current
    if best_seq is None:
        return ('', 0)
    return (best_seq, gc_cont)



In [84]:
# Quick sanity check: 'AGCTATAG' contains 3 G/C characters out of 8.
sample_seq = 'AGCTATAG'
gc_percent = nt_gc_counter(sample_seq)
print(gc_percent)
# Drop the temporaries so they do not linger in the notebook namespace.
del sample_seq, gc_percent

37.5


In [85]:
# Parse the sample dataset and show the parsed records.
sample_records = fasta_dict_generator("datasets/rosalind_sample_dataset.txt")
print(sample_records)
print()

# Report the label with the greatest GC content and its value (6 decimals).
top_label, top_gc = gc_content_comp(sample_records)
print(top_label)
print(round(top_gc, 6))

# Drop the temporaries so they do not linger in the notebook namespace.
del sample_records, top_label, top_gc


{'Rosalind_6404': 'CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG', 'Rosalind_5959': 'CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC', 'Rosalind_0808': 'CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGACTGGGAACCTGCGGGCAGTAGGTGGAAT'}

Rosalind_0808
60.91954


---

In [86]:
# Parse the downloaded dataset for the actual submission.
gc_records = fasta_dict_generator("datasets/rosalind_gc.txt")

# Report the label with the greatest GC content and its value (6 decimals).
top_label, top_gc = gc_content_comp(gc_records)
print(top_label)
print(round(top_gc, 6))

# Drop the temporaries so they do not linger in the notebook namespace.
del gc_records, top_label, top_gc

Rosalind_6581
51.515152
