In [2]:
def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    start = 0  # start all the way at the left
    while True:
        start = a.find(b[:min_length], start)  # look for b's prefix in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a)-start
        start += 1  # move just past previous match

In [2]:
import itertools

def scs(ss):
    """Takes a list of short strings (ss). Returns shortest common superstring of given strings,
        assuming no string is a strict substring of another."""
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]
        for i in range(len(ssperm) - 1):
            olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            sup += ssperm[i+1][olen:]
        # check is this superstring is the shortest we've seen so far
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup
    return shortest_sup

ss = ["ACGGATGAGC", "GAGCGGA", "GAGCGAG"]
scs(ss)

'ACGGATGAGCGAGCGGA'

In [3]:
import itertools

def scs(ss):
    """Takes a list of short strings (ss). Returns shortest common superstring of given strings,
        assuming no string is a strict substring of another."""
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]
        for i in range(len(ssperm) + 1):  # +1 results in tuple being out of range
            olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            sup += ssperm[i+1][olen:]
        # check is this superstring is the shortest we've seen so far
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup
    return shortest_sup

ss = ["ACGGATGAGC", "GAGCGGA", "GAGCGAG"]
scs(ss)

IndexError: tuple index out of range

In [4]:
def pick_maximal_overlap(reads, k):
    """Returns two reads that have maximal ovelap."""
    reada, readb = None, None
    best_olen = 0
    for a, b in itertools.permutations(reads, 2):
        olen = overlap(a, b, min_length=k)
        if olen > best_olen:
            reada, readb = a, b
            best_olen = olen
    return reada, readb, best_olen

def greedy_scs(reads, k):
    read_a, read_b, olen = pick_maximal_overlap(reads, k)
    while olen > 0:
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a + read_b[olen:])  # just the suffix of read_b since prefix overlaps with read_a
        read_a, read_b, olen = pick_maximal_overlap(reads, k)
    return "".join(reads)

reads = ["ABC", "BCA", "CAB"]
k = 2
greedy_scs(reads, k)

'CABCA'

In [9]:
print(f'fast method:        {greedy_scs(["ABCD", "CDBC", "BCDA"], 1)}')
print(f'brute force method: {scs(["ABCD", "CDBC", "BCDA"])}')

fast method:        CDBCABCDA
brute force method: ABCDBCDA


In [5]:
def de_bruijn_ize(st, k):
    edges = []
    nodes = set()
    for i in range(len(st) - k + 1):
        edges.append((st[i:i+k-1], st[i+1:i+k]))
        nodes.add(st[i:i+k-1])
        nodes.add(st[i+1:i+k])
    return nodes, edges

nodes, edges = de_bruijn_ize("ACGCGTCG", 3)

In [8]:
nodes

{'AC', 'CG', 'GC', 'GT', 'TC'}

In [9]:
edges

[('AC', 'CG'),
 ('CG', 'GC'),
 ('GC', 'CG'),
 ('CG', 'GT'),
 ('GT', 'TC'),
 ('TC', 'CG')]

In [11]:
%install_ext http://raw.github.com/cjdrake/ipython-magic/master/gvmagic.py
%load_ext gvmagic

ERROR:root:Line magic function `%install_ext` not found.


ModuleNotFoundError: No module named 'gvmagic'

In [None]:
def visualize_de_bruijn(st, k):
    """ Visualize a directed multigraph using graphviz """
    nodes, edges = de_bruijn_ize(st, k)
    dot_str = 'digraph "DeBruijn graph" {\n'
    for node in nodes:
        dot_str += '  %s [label="%s"] ;\n' % (node, node)
    for src, dst in edges:
        dot_str += '  %s -> %s ;\n' % (src, dst)
    return dot_str + '}\n'

%dotstr visualize_de_bruijn("ACGCGTCG", 3)