## 'Locating Restriction Sites'

**Connections**: `SUBS`

---


**Given**: A DNA string of length at most 1 kbp in FASTA format.

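**Return**: The position and length of *every* reverse palindrome in the string having length between 4 and 12. A DNA string is a reverse palindrome if it is equal to its own reverse complement (for example, `GCATGC`); positions are 1-based. You may return these pairs in any order.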



In [1]:
# Libraries to load:
import os, time


In [2]:
# Previous functions generated
def fasta_dictionary(path_to_filename):
    '''
    Parse a FASTA file into a dictionary of sequences.

    Each record starts with a header line beginning with '>'. The identifier is
    the first whitespace-delimited token after the '>' (any description that
    follows it, whether separated by a space or a tab, is ignored). All the
    lines up to the next header are concatenated, with whitespace removed, to
    form the sequence. Lines that appear before the first header are skipped.

    Input:  path_to_filename - path of the FASTA file to read
    Output: a dictionary where the key-value pairs are sequence IDs and
            sequences, respectively, in file order. If an ID is repeated,
            the last record with that ID wins. A header with no identifier
            is stored under the empty string.
    '''
    chunks = {}
    current_id = None
    # The `with` block closes the file; the file is streamed line by line
    # instead of being loaded and re-split as one large string.
    with open(path_to_filename, 'r') as handle:
        for line in handle:
            if line.startswith('>'):
                fields = line[1:].split()
                current_id = fields[0] if fields else ''
                # Re-assigning resets a repeated ID so the last record wins.
                chunks[current_id] = []
            elif current_id is not None:
                # split()/join drops the newline and any stray whitespace.
                chunks[current_id].append(''.join(line.split()))
    return {seq_id: ''.join(parts) for seq_id, parts in chunks.items()}


def dna_complement(string: str) -> str:
    '''
    Load in string text of DNA nucleotides and return a string of complementary bases.

    The complement is returned position by position and is NOT reversed:
    character i of the output pairs with character i of the input. If the input
    is read 5` to 3`, the output therefore reads 3` to 5`; reverse it
    (e.g. with [::-1]) to obtain the reverse complement written 5` to 3`.

    Notes:
       A-T, T-A, C-G, and G-C
       Only upper-case A, C, G and T are accepted; any other character
       raises KeyError.
    '''
    nt_dict = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
    # join() builds the result in one pass instead of repeated string +=
    return ''.join(nt_dict[base] for base in string)




In [3]:
# Parse the sample dataset into an {identifier: sequence} dictionary.
# The path is built relative to the current working directory.
sequence_dict = fasta_dictionary( os.getcwd() + '/datasets/rosalind_sample_dataset.txt')

# Bare expression: the notebook echoes the dictionary as the cell output.
sequence_dict

{'Rosalind_24': 'TCAATGCATGCGGGTCTATATGCAT'}

In [4]:
# Show, one per line: the sequence length, the sequence itself, and its
# position-by-position complement (not reversed).
print(
    len(sequence_dict['Rosalind_24']),
    sequence_dict['Rosalind_24'],
    dna_complement(sequence_dict['Rosalind_24']),
    sep='\n',
)


25
TCAATGCATGCGGGTCTATATGCAT
AGTTACGTACGCCCAGATATACGTA


In [5]:
# First 12 bases of the sequence and, underneath, the 12 bases paired with
# them on the complementary strand.
print(
    sequence_dict['Rosalind_24'][:12],
    dna_complement(sequence_dict['Rosalind_24'])[:12],
    sep='\n',
)

TCAATGCATGCG
AGTTACGTACGC


In [15]:
# Slide a fixed 6-base window along the sample sequence and report every
# window that reads the same as the reversed complementary strand.
coding = sequence_dict['Rosalind_24']
complement = dna_complement(coding)
site_length = 6

for n in range(len(coding)):
    # Offsets too close to the end cannot hold a full window.
    if n + site_length > len(coding):
        continue
    if coding[n:n+site_length] == complement[n:n+site_length][::-1]:
        # Positions are reported 1-based; the second strand is printed
        # un-reversed so it lines up under the coding window.
        print('Coding position:', n + 1)
        print(coding[n:n+site_length])
        print(complement[n:n+site_length])
        print()

del coding, complement, site_length

Coding position: 4
ATGCAT
TACGTA

Coding position: 6
GCATGC
CGTACG

Coding position: 20
ATGCAT
TACGTA



In [17]:
# Timed search of the sample sequence for reverse palindromes of every
# length from 4 to 12. Results go into palindrome_dict as
# {1-based start position: length}.
starttime = time.time()

coding = sequence_dict['Rosalind_24']
complement = dna_complement(coding)

# NOTE(review): the dictionary is keyed by start position only, so when two
# palindromes of different lengths start at the same position the longer
# one replaces the shorter one.
palindrome_dict = {}
for site_length in range(4, 13):
    for n in range(len(coding)):
        # Skip offsets that cannot hold a full window of this length.
        if n + site_length > len(coding):
            continue
        if coding[n:n+site_length] == complement[n:n+site_length][::-1]:
            palindrome_dict[n + 1] = site_length
print('Program time length:', round(time.time() - starttime, 6))

del starttime, coding, complement

Program time length: 0.000291


In [18]:
# Tab-separated table: 1-based position, length, and the matching substring.
print("pos\tlength")
for k, v in palindrome_dict.items():
    print(k, v, sequence_dict['Rosalind_24'][k-1:k-1+v], sep='\t')

pos	length
5	4	TGCA
7	4	CATG
17	4	TATA
18	4	ATAT
21	4	TGCA
4	6	ATGCAT
6	6	GCATGC
20	6	ATGCAT


---

### Problem Attempt:

#### Attempt 01

In [19]:
# Attempt 01: load the dataset, take its first record, and search it for
# reverse palindromes of every length from 4 to 12.
starttime = time.time()
sequence_dict = fasta_dictionary( os.getcwd() + '/datasets/rosalind_revp_attempt01.txt')
KEY = list(sequence_dict)[0]
coding = sequence_dict[KEY]
complement = dna_complement(coding)

# NOTE(review): results are stored as {1-based start position: length}, so
# when two palindromes of different lengths start at the same position the
# longer one replaces the shorter one.
palindrome_dict = {}
for site_length in range(4, 13):
    for n in range(len(coding)):
        # Skip offsets that cannot hold a full window of this length.
        if n + site_length > len(coding):
            continue
        if coding[n:n+site_length] == complement[n:n+site_length][::-1]:
            palindrome_dict[n + 1] = site_length

print('Program time length:', round(time.time() - starttime, 6))


Program time length: 0.012838


In [23]:
# For each hit print "position length", the palindrome, and the bases paired
# with it on the complementary strand, followed by a blank line.
# `coding` and `complement` are still bound from the search cell, so they
# are sliced directly instead of re-complementing the whole sequence for
# every hit.
for k, v in palindrome_dict.items():
    print(k, v)
    print(coding[k-1:k-1+v])
    print(complement[k-1:k-1+v])
    print()
    

1 4
CGCG
GCGC

2 4
GCGC
CGCG

7 4
GTAC
CATG

40 4
GTAC
CATG

43 4
CATG
GTAC

74 4
TCGA
AGCT

76 4
GATC
CTAG

109 4
TGCA
ACGT

122 4
CCGG
GGCC

149 4
CGCG
GCGC

155 4
GTAC
CATG

159 4
ATAT
TATA

160 4
TATA
ATAT

189 4
GATC
CTAG

200 4
TTAA
AATT

207 4
AGCT
TCGA

215 4
GATC
CTAG

219 4
ATAT
TATA

241 8
TCGATCGA
AGCTAGCT

243 4
GATC
CTAG

245 4
TCGA
AGCT

270 4
GATC
CTAG

298 4
CGCG
GCGC

311 4
CATG
GTAC

321 4
CTAG
GATC

326 4
GCGC
CGCG

337 4
TGCA
ACGT

346 4
CATG
GTAC

365 4
GATC
CTAG

393 4
TGCA
ACGT

397 4
ACGT
TGCA

411 4
GGCC
CCGG

442 4
AGCT
TCGA

444 4
CTAG
GATC

460 4
GGCC
CCGG

479 4
CTAG
GATC

486 4
TGCA
ACGT

496 4
TTAA
AATT

500 4
GTAC
CATG

562 6
TATATA
ATATAT

563 4
ATAT
TATA

564 4
TATA
ATAT

592 4
GCGC
CGCG

593 4
CGCG
GCGC

604 4
TATA
ATAT

623 4
AATT
TTAA

636 4
TCGA
AGCT

645 4
TTAA
AATT

654 4
TGCA
ACGT

660 4
TGCA
ACGT

663 4
AATT
TTAA

668 4
GTAC
CATG

678 4
GTAC
CATG

680 4
ACGT
TGCA

687 4
CCGG
GGCC

706 4
GGCC
CCGG

711 4
GTAC
CATG

715 4
TTAA
AATT

751 4
TTAA
A

In [24]:
# Remove this attempt's working variables before the next attempt runs.
del starttime, coding, complement, KEY, palindrome_dict,sequence_dict

#### Attempt 02

In [27]:
# Attempt 02: load the dataset, take its first record, and search it for
# reverse palindromes of every length from 4 to 12.
starttime = time.time()
sequence_dict = fasta_dictionary( os.getcwd() + '/datasets/rosalind_revp_attempt02.txt')
KEY = list(sequence_dict)[0]
coding = sequence_dict[KEY]
complement = dna_complement(coding)

# NOTE(review): results are stored as {1-based start position: length}, so
# when two palindromes of different lengths start at the same position the
# longer one replaces the shorter one.
palindrome_dict = {}
for site_length in range(4, 13):
    for n in range(len(coding)):
        # Skip offsets that cannot hold a full window of this length.
        if n + site_length > len(coding):
            continue
        if coding[n:n+site_length] == complement[n:n+site_length][::-1]:
            palindrome_dict[n + 1] = site_length

print('Program time length:', round(time.time() - starttime, 6))

Program time length: 0.010918


In [28]:
# Show only the hits longer than 4 bases: "position length", the palindrome,
# and the bases paired with it on the complementary strand.
# `coding` and `complement` are still bound from the search cell, so they
# are sliced directly instead of re-complementing the whole sequence for
# every hit.
for k, v in palindrome_dict.items():
    if v > 4:
        print(k, v)
        print(coding[k-1:k-1+v])
        print(complement[k-1:k-1+v])
        print()

126 6
ATATAT
TATATA

127 6
TATATA
ATATAT

510 8
CATGCATG
GTACGTAC

53 6
TGCGCA
ACGCGT

101 6
ATCGAT
TAGCTA

160 6
AGCGCT
TCGCGA

194 6
CCCGGG
GGGCCC

332 6
TTATAA
AATATT

337 6
ACATGT
TGTACA

341 6
GTGCAC
CACGTG

450 6
CGGCCG
GCCGGC

509 6
GCATGC
CGTACG

511 6
ATGCAT
TACGTA

537 6
GCCGGC
CGGCCG

550 6
AAGCTT
TTCGAA

583 6
AAATTT
TTTAAA

589 6
AACGTT
TTGCAA

695 6
TTTAAA
AAATTT

745 6
TGCGCA
ACGCGT

762 6
CATATG
GTATAC

193 8
ACCCGGGT
TGGGCCCA

449 8
ACGGCCGT
TGCCGGCA

582 8
TAAATTTA
ATTTAAAT

448 10
CACGGCCGTG
GTGCCGGCAC

447 12
CCACGGCCGTGG
GGTGCCGGCACC



In [None]:
# Remove this attempt's working variables before the next attempt runs.
del starttime, coding, complement, KEY, palindrome_dict,sequence_dict

#### Attempt 03

In [39]:
# Attempt 03: load the dataset, find every reverse palindrome of length
# 4 to 12 in its first record, and write "position length" lines to the
# submission file.
starttime = time.time()
sequence_dict = fasta_dictionary( os.getcwd() + '/datasets/rosalind_revp_attempt03.txt')
KEY        = list(sequence_dict.keys())[0]
coding     = sequence_dict[KEY]
complement = dna_complement(sequence_dict[KEY])

# Collect (1-based position, length) pairs in a list. A dictionary keyed by
# position can hold only one length per start, so a palindrome such as
# ATAT would be lost whenever ATATAT starts at the same position.
pal_pairs = []
for site_length in range(4, 13):
    # Only offsets that leave room for a full window are visited.
    for n in range(len(coding) - site_length + 1):
        if coding[n:n+site_length] == complement[n:n+site_length][::-1]:
            pal_pairs.append((n + 1, site_length))
pal_pairs.sort()

# Kept for the later display and cleanup cells: position -> length, ordered by
# position. With the pairs sorted, the longest length at a position wins.
pal_dict_2 = dict(pal_pairs)

print('Program time length:', str(round(time.time()-starttime, 6)) )

# The `with` block closes the file; every pair is written, one per line.
with open( os.getcwd() + '/answer_submissions/rosalind_revp_submission_attempt3.txt', 'w') as outfile:
    outfile.writelines('{} {}\n'.format(pos, length) for pos, length in pal_pairs)
del pal_pairs


Program time length: 0.013232


In [40]:
# Inspect the same 20-base slice (0-based 460 to 479) on both strands,
# coding strand first and its un-reversed complement underneath.
print(coding[460:480], complement[460:480], sep='\n')

TTAGGTACCGGTACCAAGGA
AATCCATGGCCATGGTTCCT


In [42]:
# List each stored hit as "position length", in dictionary order.
for k, v in pal_dict_2.items():
    print(k, v)


1 6
2 4
3 4
20 4
65 4
67 4
153 4
162 6
163 4
192 4
207 6
208 4
218 6
219 4
243 4
317 4
348 6
349 4
355 6
356 4
370 4
380 6
381 4
383 6
384 4
399 4
416 4
419 4
447 4
454 6
455 4
464 12
465 10
466 8
467 6
468 4
470 6
471 4
491 4
514 4
521 10
522 8
523 6
524 4
526 4
549 4
555 4
565 4
577 4
592 4
598 4
627 4
639 4
643 4
644 6
645 4
667 10
668 8
669 6
670 4
707 4
752 4
760 4
771 4
838 4
842 4
849 4
852 4
855 4
869 4
872 4
873 6
874 4
877 6
878 4
880 4
883 4
885 4
893 4
903 4
920 4
924 4
935 4


In [43]:
# Remove this attempt's working variables before the next attempt runs.
del starttime, coding, complement, KEY, pal_dict_2, sequence_dict

#### Attempt 04

In [44]:
# Attempt 04: load the dataset, find every reverse palindrome of length
# 4 to 12 in its first record, write "position length" lines to the
# submission file, then report the elapsed time and clean up.
starttime     = time.time()
sequence_dict = fasta_dictionary( os.getcwd() + '/datasets/rosalind_revp.txt')

KEY        = list(sequence_dict.keys())[0]
coding     = sequence_dict[KEY]
complement = dna_complement(sequence_dict[KEY])

# Collect (1-based position, length) pairs in a list. A dictionary keyed by
# position can hold only one length per start, so a palindrome such as
# ATAT would be lost whenever ATATAT starts at the same position.
pal_pairs = []
for site_length in range(4, 13):
    # Only offsets that leave room for a full window are visited.
    for n in range(len(coding) - site_length + 1):
        if coding[n:n+site_length] == complement[n:n+site_length][::-1]:
            pal_pairs.append((n + 1, site_length))
# Ordered by position, then by length, for a stable and readable file.
pal_pairs.sort()

# The `with` block closes the file; every pair is written, one per line.
with open( os.getcwd() + '/answer_submissions/rosalind_revp_submission.txt', 'w') as outfile:
    outfile.writelines('{} {}\n'.format(pos, length) for pos, length in pal_pairs)

print('Program time length:', str(round(time.time()-starttime, 6)) )
del pal_pairs, complement, coding, KEY, sequence_dict, starttime


Program time length: 0.021188
