## Algorithms for DNA Sequencing Labs
## Week 3
    3.1) edit distance DP
    3.2) global alignment
    3.3) finding overlaps
    3.4) finding all overlaps

## 3.1 edit distance DP

In [121]:
# implementing dynamic programming for edit distance

In [160]:
def editDistance(x, y):
    """Return the edit distance between x and y using dynamic programming.

    A (len(x)+1) by (len(y)+1) table is filled so that entry [r][c] holds the
    edit distance between the prefixes x[:r] and y[:c]. Insertions, deletions
    and substitutions each cost 1; the answer is the bottom-right entry.
    """
    n_rows, n_cols = len(x) + 1, len(y) + 1

    # Row 0 is 0..len(y) (empty x-prefix vs. each y-prefix); every later row
    # starts with its own index (each x-prefix vs. the empty y-prefix).
    table = [list(range(n_cols))]
    for r in range(1, n_rows):
        table.append([r] + [0] * (n_cols - 1))

    for r, x_char in enumerate(x, start=1):
        for c, y_char in enumerate(y, start=1):
            # The diagonal move is free when the two characters agree.
            diagonal = table[r - 1][c - 1] + (0 if x_char == y_char else 1)
            table[r][c] = min(table[r][c - 1] + 1,
                              table[r - 1][c] + 1,
                              diagonal)
    return table[-1][-1]

In [None]:
%%time
# Time the DP implementation on a 16-character DNA string against an
# unrelated 10-character English word.
x = 'GCTGATCGATCGTACG'
y = 'Shakespear'
print(editDistance(x, y))

In [124]:
def editDistRecursive(x, y):
    """Return the edit distance between x and y by plain recursion.

    Every call on two non-empty strings makes three recursive calls on
    shorter prefixes and nothing is cached, so this implementation is very
    slow on anything but short inputs.
    """
    # Base cases: against an empty string the distance is the other's length.
    if len(x) == 0:
        return len(y)
    if len(y) == 0:
        return len(x)

    x_prefix, y_prefix = x[:-1], y[:-1]
    # Aligning the two last characters costs nothing when they are equal.
    last_char_cost = 0 if x[-1] == y[-1] else 1
    return min(
        editDistRecursive(x_prefix, y) + 1,
        editDistRecursive(x, y_prefix) + 1,
        editDistRecursive(x_prefix, y_prefix) + last_char_cost,
    )

In [125]:
%%time
# Time the naive recursive implementation on two 10-character strings.
x = 'shake spea'
y = 'Shakespear'
print(editDistRecursive(x, y))

3
CPU times: user 4.5 s, sys: 6.38 ms, total: 4.51 s
Wall time: 4.51 s


In [123]:
%%time
# Time the DP implementation on the same pair of strings used for the
# recursive version, for a direct comparison.
x = 'shake spea'
y = 'Shakespear'
print(editDistance(x, y))

3
CPU times: user 172 µs, sys: 69 µs, total: 241 µs
Wall time: 187 µs


## 3.2 global alignment

In [126]:
# implementing global alignment

In [127]:
# DNA symbols; a character's position in this list is its row/column index
# into `score`.
alphabet = ['A', 'C', 'G', 'T']
# Penalty matrix. Rows and columns are ordered A, C, G, T, followed by a final
# row/column used for gaps: 0 for a match, 2 for A<->G or C<->T, 4 for any
# other substitution, and 8 whenever a gap is involved.
score = [[0, 4, 2, 4, 8],
         [4, 0, 4, 2, 8],
         [2, 4, 0, 4, 8],
         [4, 2, 4, 0, 8],
         [8, 8, 8, 8, 8]]

In [128]:
# list.index converts a character to its offset in `alphabet`, which is the
# row/column to use in `score`
alphabet.index('A')

0

In [130]:
# offset of 'G' in `alphabet`
alphabet.index('G')

2

In [131]:
# penalty associated with A (from X) mismatching with T (from Y):
# row = offset of the X character, column = offset of the Y character
score[alphabet.index('A')][alphabet.index('T')]

4

In [132]:
# penalty associated with C (from X) being deleted in Y:
# the last column (index -1) of `score` is the gap column
score[alphabet.index('C')][-1]

8

In [133]:
def globalAlignment(x, y):
    """Return the minimum total penalty of a global alignment of x and y.

    Penalties are read from the module-level `score` matrix, indexed through
    `alphabet`. The last row/column of `score` supplies the gap penalties.
    Characters absent from `alphabet` make `alphabet.index` raise ValueError.
    """
    n_rows, n_cols = len(x) + 1, len(y) + 1
    table = [[0] * n_cols for _ in range(n_rows)]

    # First column: each character of the x-prefix is aligned against a gap.
    for r in range(1, n_rows):
        table[r][0] = table[r - 1][0] + score[alphabet.index(x[r - 1])][-1]

    # First row: each character of the y-prefix is aligned against a gap.
    for c in range(1, n_cols):
        table[0][c] = table[0][c - 1] + score[-1][alphabet.index(y[c - 1])]

    # Remaining cells take the cheapest of the three possible moves.
    for r in range(1, n_rows):
        x_idx = alphabet.index(x[r - 1])
        for c in range(1, n_cols):
            y_idx = alphabet.index(y[c - 1])
            table[r][c] = min(
                table[r][c - 1] + score[-1][y_idx],         # gap in x
                table[r - 1][c] + score[x_idx][-1],         # gap in y
                table[r - 1][c - 1] + score[x_idx][y_idx],  # match / mismatch
            )

    # The bottom-right cell covers all of x against all of y.
    return table[-1][-1]

In [134]:
# Two DNA strings of different lengths (10 and 9 characters), so any global
# alignment of them must contain at least one gap.
x = 'TATGTCATGC'
y = 'TATGGCAGC'
print(globalAlignment(x,y))

12


## 3.3 finding overlaps

In [135]:
# overlaps between pairs of reads

In [136]:
def overlap(a, b, min_length=3):
    """Return the length of the longest suffix of `a` that is a prefix of `b`.

    Only overlaps of at least `min_length` characters count; 0 is returned
    when no such overlap exists.
    """
    # b[:min_length] silently truncates when b is shorter than min_length,
    # which would otherwise let an overlap shorter than min_length be reported.
    if len(b) < min_length:
        return 0

    seed = b[:min_length]
    start = 0
    while True:
        # Any qualifying overlap must begin with b's first min_length characters.
        start = a.find(seed, start)
        if start == -1:
            return 0
        # Hits are visited left to right, so the first suffix of `a` that is
        # a prefix of `b` is also the longest one.
        if b.startswith(a[start:]):
            return len(a) - start
        start += 1

In [137]:
# the first string ends with 'CGT' and the second starts with 'CGT'
overlap('TTACGT', 'CGTACCGT')

3

In [138]:
# 'CGT' does not occur in the first string, so there is no overlap
overlap('TTACTT', 'CGTACCGT')

0

## 3.4 finding all overlaps

In [139]:
# finding and representing all overlaps

In [140]:
def overlap(a, b, min_length=3):
    """Return the length of the longest suffix of `a` that is a prefix of `b`.

    Only overlaps of at least `min_length` characters count; 0 is returned
    when no such overlap exists.

    NOTE: this re-defines the `overlap` function from section 3.3. Being the
    later definition, it is the one the cells below actually call.
    """
    # b[:min_length] silently truncates when b is shorter than min_length,
    # which would otherwise let an overlap shorter than min_length be reported.
    if len(b) < min_length:
        return 0

    seed = b[:min_length]
    start = 0
    while True:
        # Any qualifying overlap must begin with b's first min_length characters.
        start = a.find(seed, start)
        if start == -1:
            return 0
        # Hits are visited left to right, so the first suffix of `a` that is
        # a prefix of `b` is also the longest one.
        if b.startswith(a[start:]):
            return len(a) - start
        start += 1

In [141]:
from itertools import permutations

In [148]:
# materialize every ordered pair of distinct positions drawn from [1, 2, 3]
list(permutations([1, 2, 3], 2))

TypeError: 'itertools.permutations' object is not callable

In [147]:
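# Weird: the TypeError above is presumably stale kernel state -- an earlier,
# since-edited cell appears to have rebound `list` or `permutations` to a
# permutations object. Restart the kernel and run all cells to clear it.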

<itertools.permutations object at 0x12195bf10>


In [149]:
def naive_overlap_map(reads, k):
    """Map each ordered pair of reads to its suffix/prefix overlap length.

    Every ordered pair (a, b) of reads taken from distinct positions in
    `reads` is passed to `overlap` with min_length=k. Only pairs with a
    positive overlap length are kept in the returned dict.
    """
    scored_pairs = (
        (pair, overlap(pair[0], pair[1], min_length=k))
        for pair in permutations(reads, 2)
    )
    return {pair: length for pair, length in scored_pairs if length > 0}

In [152]:
# Three short reads; report every ordered pair that overlaps by at least 3.
reads = ['ACGGATGATC', 'GATCAAGT', 'TTCACGGA']
print(naive_overlap_map(reads, 3))

{('ACGGATGATC', 'GATCAAGT'): 4, ('TTCACGGA', 'ACGGATGATC'): 5}
