### Implement Greedy Shortest Common Superstring

### Overlap Function

In [1]:
def overlap(a, b, min_length=3):
    """Return the length of the longest suffix of 'a' matching a prefix of 'b'.

    Only overlaps that are at least 'min_length' characters long count.
    If no such overlap exists, return 0.
    """
    # 'b' cannot supply a prefix of 'min_length' characters, so no
    # qualifying overlap can exist.  Without this guard the search below
    # would use all of 'b' as its seed and could report an overlap shorter
    # than 'min_length' (e.g. overlap('xab', 'ab', 3) would return 2).
    if len(b) < min_length:
        return 0

    seed = b[:min_length]  # every qualifying overlap has to begin with this
    start = 0              # index in 'a' from which to resume the search
    while True:
        start = a.find(seed, start)
        if start == -1:
            # No (further) occurrence of the seed in 'a'.
            return 0
        # The seed occurs at 'start'; check whether the whole suffix of 'a'
        # from there on is a prefix of 'b'.  Occurrences are visited left to
        # right, so the first hit is also the longest overlap.
        if b.startswith(a[start:]):
            return len(a) - start
        start += 1  # move past the previous match

### Shortest Common Superstring (SCS) Function

In [2]:
import itertools

def scs(ss):
    """Return a shortest superstring of the strings in 'ss' by brute force.

    Every ordering of the strings is tried.  For each ordering the strings
    are glued together left to right, merging the suffix/prefix overlap
    (of at least one character) between each pair of neighbours.  The
    shortest result is returned; on ties the ordering tried first wins.

    Only overlaps between neighbouring strings are merged, so a string that
    occurs in the middle of another one is still appended in full.

    An empty input yields the empty string.
    """
    ss = tuple(ss)  # allow any iterable, not just sized sequences
    if not ss:
        # permutations() of nothing yields one empty ordering, which has
        # no first string to start the superstring from.
        return ''

    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        pieces = [ssperm[0]]  # the superstring starts with the first string
        for left, right in zip(ssperm, ssperm[1:]):
            # Overlap length between the current string and the next one.
            olen = overlap(left, right, min_length=1)
            # Keep only the part of the next string that does not overlap.
            pieces.append(right[olen:])
        sup = ''.join(pieces)

        # Keep this superstring if it is the shortest seen so far.
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup

    return shortest_sup

### Maximal Overlap Function

In [3]:
def pick_maximal_overlap(reads, k):
    """Find the ordered pair of reads with the longest overlap.

    Each ordered pair of distinct positions in 'reads' is scored with
    overlap(a, b, min_length=k).  Returns a tuple (a, b, length) for the
    first pair that reaches the largest length, or (None, None, 0) when
    no pair scores above zero.
    """
    winner = (None, None, 0)  # best (a, b, overlap length) seen so far
    for left, right in itertools.permutations(reads, 2):
        length = overlap(left, right, min_length=k)
        if length <= winner[2]:
            # Not strictly better: the earlier pair keeps its place.
            continue
        winner = (left, right, length)
    return winner

### Greedy Shortest Common Superstring Function

In [6]:
def greedy_scs(reads, k):
    """Build a common superstring of 'reads' with the greedy merge strategy.

    Repeatedly picks the pair of reads with the longest overlap (as
    reported by pick_maximal_overlap with minimum length 'k') and replaces
    the pair with its merge.  Once no pair overlaps any more, the remaining
    reads are concatenated and returned.

    The result contains every read but is not guaranteed to be the
    shortest possible superstring.  The caller's 'reads' is left untouched.
    """
    # Work on a private copy: the loop below removes and appends elements,
    # which would otherwise destroy the list passed in by the caller.
    reads = list(reads)

    read_a, read_b, olen = pick_maximal_overlap(reads, k)
    while olen > 0:
        reads.remove(read_a)
        reads.remove(read_b)
        # Merge the pair, keeping the overlapping characters only once.
        reads.append(read_a + read_b[olen:])
        read_a, read_b, olen = pick_maximal_overlap(reads, k)

    # Whatever is left has no overlaps; join it into one superstring.
    return ''.join(reads)

### Test Greedy Shortest Common Superstring Function (correct result)

In [7]:
# Every read overlaps another one by 2 characters.
greedy_scs(['ABC', 'BCA', 'CAB'], k=2)

'CABCA'

### Test Greedy Shortest Common Superstring Function (suboptimal result: not the shortest superstring)

In [8]:
# The greedy result is a common superstring, but longer than it needs to be.
greedy_scs(['ABCD', 'CDBC', 'BCDA'], k=1)





'CDBCABCDA'

### Test Shortest Common Superstring Function (Brute Force Method)

In [10]:
# The brute-force scs function finds a shorter superstring than the
# greedy function does for the same reads.
scs(['ABCD', 'CDBC', 'BCDA'])

'ABCDBCDA'