## 'Overlap Graphs'

**Connections**: `GC`, `SUBS`

*Graph Theory, one of my favorite subjects!*

---


**Given**: A collection of DNA strings in FASTA format having total length at most 10 kbp.

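**Return**: The adjacency list corresponding to $O_3$, the overlap graph in which a directed edge $s \to t$ exists whenever the length-3 suffix of $s$ equals the length-3 prefix of $t$ (with $s \neq t$). You may return edges in any order.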


In [1]:
# Libraries to load



In [2]:
# Previously defined functions

def fasta_dictionary(path_to_filename):
    '''
    A more robust function than the one in the `nb_gc` notebook. Open a FASTA file and keep only
    the identifier and the sequence of each record (ignoring any additional header information).

    Input:  path_to_filename -- path to a FASTA-formatted text file.
    Output: a dictionary where the key-value pairs are sequence IDs and sequences, respectively.

    Notes:
      * The ID is the first whitespace-delimited token after '>', so a description separated
        from the ID by a tab (not only by a space) is discarded as well.
      * Sequence lines of one record are concatenated with all whitespace removed.
      * Lines that appear before the first '>' header are ignored.
      * If an ID occurs more than once, the sequence of the last such record is kept.
    '''
    chunks_by_id = {}
    current = None  # chunk list of the record being read; None until the first header is seen
    with open(path_to_filename, 'r') as f:
        # Stream the file line by line instead of joining and re-splitting the whole content.
        for line in f:
            if line.startswith('>'):
                header_fields = line[1:].split()
                seq_id = header_fields[0] if header_fields else ''
                # Re-assigning on a repeated ID drops the earlier record's sequence.
                current = chunks_by_id[seq_id] = []
            elif current is not None:
                # split() strips the newline and any stray blanks inside the sequence line.
                current.extend(line.split())
    return {seq_id: ''.join(chunks) for seq_id, chunks in chunks_by_id.items()}



In [3]:
# Parse the sample dataset into a {sequence ID: sequence} dictionary.
sd = fasta_dictionary('datasets/rosalind_sample_dataset.txt')

# Display the parsed records.
sd

{'Rosalind_0498': 'AAATAAA',
 'Rosalind_2391': 'AAATTTT',
 'Rosalind_2323': 'TTTTCCC',
 'Rosalind_0442': 'AAATCCC',
 'Rosalind_5013': 'GGGTGGG'}

In [4]:
# Length-3 prefix of one sample sequence.
sd['Rosalind_2323'][:3]

'TTT'

In [5]:
# Length-3 suffix of the same sample sequence.
sd['Rosalind_2323'][-3:]

'CCC'

In [6]:
# Show the sequence IDs held in the dictionary.
print(list(sd))

['Rosalind_0498', 'Rosalind_2391', 'Rosalind_2323', 'Rosalind_0442', 'Rosalind_5013']


In [7]:
# Prototype of the O_3 check: compare the last three bases of each sequence
# with the first three bases of every *other* sequence and print matching pairs.
sd_keys = list(sd)

for k1 in sd_keys:
    tail = sd[k1][-3:]
    # Skip k1 itself so a sequence is never paired with itself.
    for k2 in (k for k in sd_keys if k != k1):
        if sd[k2][:3] == tail:
            print(f'{k1} {k2}')
    del tail
del sd_keys


Rosalind_0498 Rosalind_2391
Rosalind_0498 Rosalind_0442
Rosalind_2391 Rosalind_2323


In [8]:
def overlap_graph(sequence_dict, overlap_size:int):
    '''
    Print the overlap graph of a sequence dictionary (e.g. one loaded from a FASTA-formatted file).

    For every sequence, the suffix of size __overlap_size__ is compared to the prefixes of the
    same size of all the remaining sequences; a match is a directed edge suffix-ID -> prefix-ID.

    Input:
      sequence_dict -- dictionary mapping sequence IDs (strings) to sequences.
      overlap_size  -- length of the suffix/prefix that must match; must be at least 1.
    Output: Print statement of the two keys in suffix-prefix order, one edge per line.
            Nothing is returned.
      Optional: this can be converted such that the output can be an adjacency list as a list-of-tuples
    Raises: ValueError if overlap_size is smaller than 1.

    Sequences shorter than __overlap_size__ have no suffix/prefix of that length and are skipped.
    '''
    if overlap_size < 1:
        # seq[-0:] is the whole sequence, so a zero overlap would silently compare the wrong slices.
        raise ValueError('overlap_size must be a positive integer')

    # Index the IDs by prefix once, so each suffix is resolved with a single lookup
    # instead of being compared against every other sequence.
    ids_by_prefix = {}
    for seq_id, seq in sequence_dict.items():
        if len(seq) >= overlap_size:
            ids_by_prefix.setdefault(seq[:overlap_size], []).append(seq_id)

    adj_list = []
    for k1, seq in sequence_dict.items():
        if len(seq) < overlap_size:
            continue
        # Candidates are stored in dictionary order, which keeps the original edge order.
        for k2 in ids_by_prefix.get(seq[-overlap_size:], ()):
            if k2 != k1:  # no self-loops
                adj_list.append(k1+' '+k2)

    for edge in adj_list:
        print(edge)
    return



In [9]:
# Run the function on the sample dictionary with an overlap of 3.
overlap_graph(sd, 3)

Rosalind_0498 Rosalind_2391
Rosalind_0498 Rosalind_0442
Rosalind_2391 Rosalind_2323


In [10]:
# Remove the sample dictionary from the namespace.
del sd

In [11]:
# Second check: parse the `rosalind_gc.txt` dataset.
sd = fasta_dictionary('datasets/rosalind_gc.txt')

# Print the parsed IDs, then a separator line before the overlap edges.
print(sd.keys()) 
print('-'*25)

# Print every edge found for an overlap of 3.
overlap_graph(sd, 3)

# Remove the dictionary from the namespace once the check is done.
del sd

dict_keys(['Rosalind_2351', 'Rosalind_6581', 'Rosalind_3806', 'Rosalind_3430', 'Rosalind_6890', 'Rosalind_3696'])
-------------------------


---

### Problem Attempt:

In [12]:
# Parse the problem dataset and print its overlap edges for an overlap of 3.
overlap_graph(fasta_dictionary('datasets/rosalind_grph.txt'), 3)


Rosalind_8529 Rosalind_8412
Rosalind_8719 Rosalind_0533
Rosalind_7377 Rosalind_1483
Rosalind_0952 Rosalind_4252
Rosalind_0952 Rosalind_8629
Rosalind_6010 Rosalind_0533
Rosalind_2981 Rosalind_8961
Rosalind_2981 Rosalind_2935
Rosalind_2981 Rosalind_6705
Rosalind_4216 Rosalind_2736
Rosalind_5376 Rosalind_8863
Rosalind_5376 Rosalind_7474
Rosalind_5376 Rosalind_9405
Rosalind_8215 Rosalind_0044
Rosalind_8215 Rosalind_1127
Rosalind_8215 Rosalind_7659
Rosalind_8215 Rosalind_1880
Rosalind_8992 Rosalind_2660
Rosalind_1679 Rosalind_6462
Rosalind_1679 Rosalind_0151
Rosalind_1953 Rosalind_0533
Rosalind_6808 Rosalind_0437
Rosalind_6808 Rosalind_2981
Rosalind_6808 Rosalind_7337
Rosalind_6808 Rosalind_8893
Rosalind_7337 Rosalind_8529
Rosalind_4566 Rosalind_8194
Rosalind_4566 Rosalind_6765
Rosalind_4566 Rosalind_9053
Rosalind_3055 Rosalind_8719
Rosalind_3055 Rosalind_4648
Rosalind_2644 Rosalind_1553
Rosalind_2644 Rosalind_4133
Rosalind_3114 Rosalind_0193
Rosalind_8961 Rosalind_2162
Rosalind_2588 Rosali