### Finding and representing all overlaps

### Modified Overlap Function

In [9]:
def overlap(a, b, min_length=3):
    """Return the length of the longest suffix of 'a' that matches a
    prefix of 'b' and is at least 'min_length' characters long.

    Parameters:
      a          -- string whose suffix is compared
      b          -- string whose prefix is compared
      min_length -- smallest overlap length that counts (default 3)

    Returns the overlap length, or 0 if no overlap of at least
    'min_length' characters exists.
    """
    # An overlap cannot be longer than either string. Without this guard,
    # b[:min_length] silently shrinks to all of 'b' when 'b' is too short,
    # and an overlap shorter than 'min_length' could be reported.
    if min_length > min(len(a), len(b)):
        return 0

    start = 0  # index in 'a' from which the next search begins
    while True:
        # Look for the first 'min_length' characters of 'b' inside 'a'.
        start = a.find(b[:min_length], start)
        if start == -1:
            # No (further) occurrence of the prefix in 'a'.
            return 0
        # Candidate found: the overlap is real only if the whole suffix
        # a[start:] is a prefix of 'b'. Scanning left to right means the
        # first hit is also the longest overlap.
        if b.startswith(a[start:]):
            return len(a) - start
        start += 1  # move past the previous match and keep searching

### Permutations Function

In [3]:
from itertools import permutations

# Example of what the permutations function can do

# Build the list of all length-1 permutations (ordered selections) of [1, 2, 3];
# as the last expression in the cell, the list is displayed as the cell's output.
list(permutations([1,2,3], 1))


[(1,), (2,), (3,)]

In [4]:
# Build the list of all length-2 permutations of [1, 2, 3]: every ordered pair
# of distinct positions, so both (1, 2) and (2, 1) appear.
list(permutations([1,2,3], 2))


[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)]

In [6]:
# Build the list of all length-3 permutations of [1, 2, 3], i.e. every
# possible ordering of the full list.
list(permutations([1,2,3], 3))


[(1, 2, 3), (1, 3, 2), (2, 1, 3), (2, 3, 1), (3, 1, 2), (3, 2, 1)]

### Define Naive Overlap Function

In [7]:
def naive_overlap_map(reads, k):
    """Map each ordered pair of reads to the length of their overlap.

    Every ordered pair (a, b) drawn from 'reads' is scored with
    overlap(a, b, min_length=k), where 'k' is the minimum overlap
    length. Only pairs with a positive overlap are kept, so the result
    is a dict of the form {(a, b): overlap_length}.
    """
    # Lazily score every ordered pair of reads.
    scored_pairs = (
        (pair, overlap(pair[0], pair[1], min_length=k))
        for pair in permutations(reads, 2)
    )
    # Keep only the pairs that actually overlap.
    return {pair: length for pair, length in scored_pairs if length > 0}


### Test Naive Overlap Function

In [11]:
# With k=3, only one ordered pair overlaps: the suffix 'GATC' of the 1st read
# is a prefix of the 2nd read (length 4).
reads = ['ACGGTGATC', 'GATCAAGT', 'TTCACGGA']
print(naive_overlap_map(reads, 3))

{('ACGGTGATC', 'GATCAAGT'): 4}


In [12]:
# Two overlaps here: the suffix 'GATC' of the 1st read is a prefix of the 2nd
# read (length 4), and the suffix 'ACGGA' of the 3rd read is a prefix of the
# 1st read (length 5).
reads = ['ACGGATGATC', 'GATCAAGT', 'TTCACGGA']
print(naive_overlap_map(reads, 3))

{('ACGGATGATC', 'GATCAAGT'): 4, ('TTCACGGA', 'ACGGATGATC'): 5}
