In [1]:
import numpy as np
#import pandas as pd

In [2]:
# convert a list of sequences into the corresponding graph matrix
def to_graph(seqs):
    """Build the adjacency matrix of the graph spanned by a list of sequences.

    Every element of every sequence becomes one node; nodes are numbered
    consecutively in the order the sequences (and their elements) are given.

    Edges (``m[a][b] == 1`` means there is an edge from ``a`` to ``b``):
      * inside one sequence: a single directed edge from each element to its
        successor;
      * between two different sequences: every pair of nodes is connected in
        both directions (i.e. an undirected edge).

    Parameters
    ----------
    seqs : iterable of sized objects (e.g. list of str)
        Only ``len()`` of each sequence is used.

    Returns
    -------
    numpy.ndarray
        Integer matrix of shape ``(num_nodes, num_nodes)``. For empty input
        this is a well-formed ``(0, 0)`` matrix.
    """
    lengths = [len(s) for s in seqs]
    num_nodes = sum(lengths)

    # Start fully connected, then carve out the per-sequence diagonal blocks.
    # Explicit shape/dtype keeps the result a 2-D int matrix even for 0 nodes.
    m = np.ones((num_nodes, num_nodes), dtype=int)

    start = 0  # index of the first node of the current sequence
    for length in lengths:
        stop = start + length

        # no edges between nodes of the same sequence (and no self-loops) ...
        m[start:stop, start:stop] = 0

        # ... except the directed edge from each node to its successor
        for j in range(start, stop - 1):
            m[j, j + 1] = 1

        start = stop
    return m

# return number of paths between two nodes (given as indices) in given graph matrix
# start from end, iterate over all nodes that have an edge to the current node and 
# have not been visited yet. If one is the start node, we have found a path.
def num_paths(start, end, m, print_path=False, N=None):
    """Count the simple directed paths from ``start`` to ``end``.

    The search runs backwards: it begins at ``end`` and repeatedly steps to a
    not-yet-visited node ``n`` that has an edge ``n -> current``
    (``m[n][current]`` is truthy). Reaching ``start`` completes one path.

    Parameters
    ----------
    start, end : int
        Node indices into ``m``.
    m : square matrix (numpy array or nested lists)
        ``m[a][b]`` truthy means there is an edge from ``a`` to ``b``.
    print_path : bool, optional
        If true, print every path found as a tuple of node indices, in
        start-to-end order. The order in which paths are printed is not
        specified.
    N : set of int, optional
        Nodes the path may use. ``None`` or an empty set means all nodes
        ``0 .. len(m) - 1``.

    Returns
    -------
    int
        Number of paths found. No node appears twice on a path, so self-loop
        entries ``m[k][k]`` are never used, and the result is 0 when
        ``start == end``.
    """
    if N is None or len(N) == 0:
        N = set(range(len(m)))  # set of all nodes

    # Each pending entry: (current node, nodes not visited yet, path walked so
    # far, listed from end backwards).
    # This must be a list, not a set: two different partial paths can share
    # the same current node and the same unvisited set, and each of them has
    # to be expanded (and counted) separately.
    pending = [(end, frozenset(N) - {end}, (end,))]
    count = 0
    while pending:
        curr, unvisited, path = pending.pop()
        for n in unvisited:
            if not m[n][curr]:  # no edge from n to the current node
                continue
            if n == start:
                count += 1
                if print_path:
                    print((path + (n,))[::-1])
            else:
                # n is now on the path, so it must not be entered again
                pending.append((n, unvisited - {n}, path + (n,)))
    return count

In [3]:
seqs = ['CATG', 'CAGT', 'AGTT']
m = to_graph(seqs)

# undirected edges = edges between nucleotides in different sequences.
# Such an edge is stored as a symmetric pair of 1-entries, so every edge is
# seen twice while scanning the full matrix -> halve the number of hits.
undir = sum(
    1
    for row in range(len(m))
    for col in range(len(m))
    if m[row][col] == 1 and m[col][row] == 1
) // 2
print(undir) # is correct

48


In [8]:
m  = np.array([[0,1,0,0,0], [0,0,1,1,1], [0,0,0,1,0], [1,1,1,0,1], [0,1,0,1,0]])
print('Number of paths from A to C: {}'.format(num_paths(0,2,m,True))) # Is correct!

# count mixed cycles:
# start with a directed edge, then calc # of paths from end of the directed edge to its start
# for two sequences with 2 letters, for which I counted 6 mixed cycles per hand
m = np.array([[0,1,1,1], [0,0,1,1], [1,1,0,1], [1,1,0,0]]) 
dir_edges = set()
for i in range(len(m)):
    for j in range(len(m)):
        if (m[i][j] == 1 and m[j][i] != 1):
            dir_edges |= {(i,j)}
            
count = 0
for e in dir_edges:
    count += num_paths(e[1],e[0],m)
print('Number of mixed cycles for two sequences AB and CD: {}'.format(count))

(0, 1, 2)
(0, 1, 4, 3, 2)
(0, 1, 3, 2)
Number of paths from A to C: 3
Number of mixed cycles for two sequences AB and CD: 6


In [5]:
# Now simple mixed cycles:
# start with any combination of directed edges that are connected, then calc # of paths from end to start,
# while excluding the directed edges from that sequence
# but we still have to prevent the algorithm from visiting the other sequences multiple times...