https://rob-p.github.io/CSE549F17/lectures/Lec08.pdf
https://academic.oup.com/bioinformatics/article/19/suppl_1/i54/227687

In [1]:
import pandas as pd
import numpy as np
def createScorMat(match = 1,mismatch = 0,nts = ['A','C','T','G']):
    """
    Create scoring matrix for nucleotides with a given match and mismatch values
    """
    scoringMatrix = np.zeros((len(nts),len(nts)),)
    np.fill_diagonal(scoringMatrix,match)
    scoringMatrix
    scoringMatrix[scoringMatrix == 0] = mismatch
    scoringMatrix = pd.DataFrame(scoringMatrix,index=nts,columns=nts)
    return scoringMatrix

#### Dynamic programming implementation
def scoringMatrix_local(x,y,subMat,indel):
    m = len(x) + 1
    n = len(y) + 1
    globMax = (0,0)
    scoringMat=np.zeros((n,m))
    for i in range(1,m):
        #print(x[i-1])
        for j in range(1,n):
            if x[i-1] == y[j-1]:
                #scoringMat[j-1][i-1] + subMat.loc[x[i-1]][y[j-1]]
                scoringMat[j][i] = scoringMat[j-1][i-1] + subMat.loc[x[i-1]][y[j-1]]
            else:
                mismatch = scoringMat[j-1][i-1] + subMat.loc[x[i-1]][y[j-1]]
                left = scoringMat[j-1][i] + indel
                up = scoringMat[j][i-1] + indel
                scoringMat[j][i] = max(mismatch,left,up,0)
            if scoringMat[j][i] >= scoringMat[globMax[1]][globMax[0]]:
                globMax = (i,j)
    return scoringMat,globMax


def scoringMatrix_global(x,y,subMat,indel):
    m = len(x) + 1
    n = len(y) + 1
    scoringMat=np.zeros((n,m))
    scoringMat[:][0] = [0] + list(np.cumsum([indel for _ in x]))
    scoringMat[:,0] = [0] + list(np.cumsum([indel for _ in y]))
    for i in range(1,m):
        #print(x[i-1])
        for j in range(1,n):
            if x[i-1] == y[j-1]:
                #scoringMat[j-1][i-1] + subMat.loc[x[i-1]][y[j-1]]
                scoringMat[j][i] = scoringMat[j-1][i-1] + subMat.loc[x[i-1]][y[j-1]]
            else:
                mismatch = scoringMat[j-1][i-1] + subMat.loc[x[i-1]][y[j-1]]
                left = scoringMat[j-1][i] + indel
                up = scoringMat[j][i-1] + indel
                scoringMat[j][i] = max(mismatch,left,up)
    return scoringMat


def backtrack_local(x,y,sm,maxLocs):
    i = maxLocs[0]
    j = maxLocs[1]
    w=''
    z=''
    z,w
    while i*j > 0:
        if x[i-1] == y[j-1]:
            w += x[i-1]
            z += y[j-1]
            i -= 1
            j -= 1
        else:
            left = sm[j-1][i]
            up = sm[j][i-1]
            diag = sm[j-1][i-1]
            whichmax = np.argmax([left,up,diag])
            if whichmax == 0:
                w += '-'
                z += y[j-1]
                j -= 1
            elif whichmax == 1:
                z += '-'
                w += x[i-1]
                i -= 1
            else:
                w += x[i-1]
                z += y[j-1]
                i -= 1
                j -= 1
        if sm[j][i] == 0:
            break
    return w[::-1],z[::-1]

def backtrack_global(x,y,sm,subMat,indel):
    i = len(x)
    j = len(y)
    w=''
    z=''
    z,w
    while i*j > 0:
        if x[i-1] == y[j-1]:
            w += x[i-1]
            z += y[j-1]
            i -= 1
            j -= 1
        else:
            left = sm[j-1][i] + indel
            up = sm[j][i-1] + indel
            diag = sm[j-1][i-1] + subMat.loc[x[i-1]][y[j-1]]
            whichmax = np.argmax([left,up,diag])
            if whichmax == 0:
                w += '-'
                z += y[j-1]
                j -= 1
            elif whichmax == 1:
                z += '-'
                w += x[i-1]
                i -= 1
            else:
                w += x[i-1]
                z += y[j-1]
                i -= 1
                j -= 1
    if j==0 and i>0:
        w = x[:i]+w[::-1]
        z = '-' * i + z[::-1]
    elif i==0 and j>0:
        z = y[:i]+z[::-1]
        w = '-' * j + w[::-1]
    else:
        w = w[::-1]
        z = z[::-1]
    return w,z

def locAl(x,y,subMat,indel):
    '''
    A wrapper for DP implementation of Smith-Waterman
    '''
    sm, maxLocs = scoringMatrix_local(x,y,subMat,indel)
    z,w = backtrack_local(x,y,sm,maxLocs)
    return z,w,sm[maxLocs[1],maxLocs[0]]

def globAl(x,y,subMat,indel):
    '''
    A wrapper for DP implementation of Smith-Waterman
    '''
    sm = scoringMatrix_global(x,y,subMat,indel)
    z,w = backtrack_global(x,y,sm,subMat,indel)
    return z,w,sm[len(y),len(x)]

def vizScoringMat(x,y,scoringMat):
    return pd.DataFrame(scoringMat,index=['']+[_ for _ in y],
             columns=['']+[_ for _ in x])

In [2]:
def kmerize(s,k):
    """
    Function to generate all kmers of size k
    """
    kmers = []
    for _ in range(0,len(s)-k+1):
        kmers.append(s[_:_+k])
    return kmers

def gloCal(x,y,subMat,indel):
    '''
    brute force
    '''
    
    ### Define target and query
    m = len(x)
    n = len(y)
    whichMax = np.argmax([m,n])
    
    if whichMax == 0: target,query = x,y
    else: target,query = y,x
    ### Get all possible kmers
    kmers = []
    for k in range(len(query),len(target)+1):
        kmers.append(kmerize(target,k))
    kmers = sum(kmers , []) #Unlist the list of lists
    res = {}
    for i,kmer in enumerate(kmers):
        res[i] = globAl(kmer,y,subMat,indel)
    
    return res[np.argmax([_[2] for _ in res.values()])]

In [3]:
def semiGlobal(x,y,subMat,indel):
    """
    ##https://rob-p.github.io/CSE549F17/lectures/Lec08.pdf
    """
    m = len(x) + 1
    n = len(y) + 1
    scoringMat=np.zeros((n,m))
    scoringMat[:][0] = [0] + list(np.cumsum([0 for _ in x]))
    scoringMat[:,0] = [0] + list(np.cumsum([indel for _ in y]))
    for i in range(1,m):
        #print(x[i-1])
        for j in range(1,n):
            if x[i-1] == y[j-1]:
                #scoringMat[j-1][i-1] + subMat.loc[x[i-1]][y[j-1]]
                scoringMat[j][i] = scoringMat[j-1][i-1] + subMat.loc[x[i-1]][y[j-1]]
            else:
                mismatch = scoringMat[j-1][i-1] + subMat.loc[x[i-1]][y[j-1]]
                left = scoringMat[j-1][i] + indel
                up = scoringMat[j][i-1] + indel
                scoringMat[j][i] = max(mismatch,left,up)

    lastRow = scoringMat[:][n-1]
    cutHere = np.where(lastRow == np.max(lastRow))[0][-2]
    
    #return scoringMat, cutHere# [:,:cutHere+1]
    return scoringMat[:,:cutHere+1]

def semiBacktrack(x,y,scoringMat,submat,indel):
    x = x[ : scoringMat.shape[1] - 1]
    i = scoringMat.shape[1] - 1
    j = scoringMat.shape[0] - 1
    w=''
    z=''
    while i*j > 0:
        #print(w[::-1],z[::-1])
        if x[i-1] == y[j-1]:
            w += x[i-1]
            z += y[j-1]
            i -= 1
            j -= 1
        else:
            left = scoringMat[j-1][i] + indel
            up = scoringMat[j][i-1] + indel
            diag = scoringMat[j-1][i-1] + subMat.loc[x[i-1]][y[j-1]]
            whichmax = np.argmax([left,up,diag])
            if whichmax == 0:
                w += '-'
                z += y[j-1]
                j -= 1
            elif whichmax == 1:
                z += '-'
                w += x[i-1]
                i -= 1
            else:
                w += x[i-1]
                z += y[j-1]
                i -= 1
                j -= 1

    return w[::-1],z[::-1]


def semiAl(x,y,subMat,indel):
    '''
    A wrapper for DP implementation of Smith-Waterman
    '''
    sm = semiGlobal(x,y,subMat,indel)
    z,w = semiBacktrack(x,y,sm,subMat,indel)
    return z,w, sm[sm.shape[0]-1,sm.shape[1]-1]

http://rosalind.info/glossary/fitting-alignment/

![image.png](attachment:image.png)

In [4]:
x='GTAGGCTTAAGGTTA'
y='TAGATA'
indel = -1
subMat = createScorMat(1,mismatch = -1)

In [5]:
globAl(x,y,subMat,indel)
# TAGGCTTA
# TAGA--TA

('GTAGGCTTAAGGTTA', '-TA-G----A---TA', -3.0)

In [6]:
locAl(x,y,subMat,indel)
# TAG
# TAG

('TAG', 'TAG', 3.0)

In [7]:
gloCal(x,y,subMat,indel)
#TAGGCTTA
#TAGA--TA

('TAGGCTTA', 'TAGA--TA', 2.0)

In [42]:
x='GTAGGCTTAAGGTTA'
y='TAGATA'
semiAl(x,y,subMat,indel)
# 2
# TAGGCTTA
# TAGA--TA

('TAGGCTTA', 'TAGA--TA', 2.0)

In [43]:
x='CAATCACCCCAATCCCTCAATCCTGGCCCCACGCATAGGCTAATGCCAATCGCGGCCAGGGTATAACCGCCATAACTGTGGGTCAGAAGGGATAAGTTCCACAATCCTATTTTCCTCGAGGCGCTTCGATGCGTTAACGCGTACACTCTGTCGGCCAACCGTGTGGGAGCCGAATTGGCTGGGCTGTTGAACATTCTATCAGTAGATAAACGAAGGTACATCCGAGGTTGTCGATCGACCGCGGGGTCGTAGCGCGTGCATGTTCCTTTCAGGCCCACATACTCCGGAACGGTTCATATCACGACTATTCTTGCACAATCGGACAACGGTGTACCATGGTGGACACCGTAGGAGACCAATACTGCGTAAATCATAAGCATTGGAGAGTGGACTGCTAGCGAGGCTCACCATGGAGTCTCGGTCGGCATCTCCTGACTGCTGTTCCATCGCGTTTTTCTTTTACTCACGCAATAAATCAATACCCCCTAACACAGGCCTGCTCCAGCCTTATTAAGGCCATAGTAGCTCTACATGTAGACCGAACGGAAGCACAGTTTGGTAGAAATTCTTAATCGACTATGGTCCGTGCAGGCCAAAAAAGGAATAATCTTCGAATTCTCACGCCTTCATTAGGGCGCACATGGTGGGGTAAATCACTGCACTCTGTTCGCAGTTAAGCGTTGCAATCAATATCGGCAGAACTCGGAGTCCGTATAAAGCCGCCTCAGCGTGCACACGCCCGTGCGGCACGTCATTAGACGAGGATTCCGGGGGACTGGCCTGTTCGTAATCCACTAAAACAATGGTCCTACCATCTAAAACGCACCGTGTTCCCCTCTACGGGAACCCCCTAGAT'
y='AGAGCGCAGAGAAGTCATTAGAACATGTAGCACATCGCTTATTAAGGGTCAATACCTAAAGGGCCTAACTATACGCCACACGGAACAGCTC'
semiAl(x,y,subMat,indel)
#22
#AGGGCGCACATG--GTGGGGTA-AATCAC-T-GCAC-TCTG-TTCGCAGTTAAGCGTTGCAATCAATATCGGC-AGAACTCGGAGTCCGTA--TAAAGCCGCCTCAGCGTGCACACGC-C
#AGAGCGCAGA-GAAGTCAT-TAGAA-CATGTAGCACATC-GCTT---A-TTAAG-G--G---TCAATA-C--CTA-AA---GG-G-CC-TAACTATA-C-GCCACA-CG-GAACA-GCTC

('AGGGCGCACATG--GTGGGGTA-AATCAC-T-GCAC-TCTG-TTCGCAGTTAAGCGTTGCAATCAATATCGGC-AGAACTCGGAGTCCGT-A-TAAAGCCGCCTCAGCGTGCACACGC-C',
 'AGAGCGCAGA-GAAGTCAT-TAGAA-CATGTAGCACATC-GCTT---A-TTAAG-G--G---TCAATA-C--CTA-AA---GG-G-CC-TAACTATA--CGCCACA-CG-GAACA-GCTC',
 22.0)

In [44]:
#### Challenge dataset
#### TOO SLOW - Need to implement the faster algorithm here.
# x='TGTAATAGATATCCCAGTATATAATGTAACTAGAAGGATATTGGATGCGTGGAGGCGAAAGGGATGTAGTGCTCGATCCTTTAACGGAGTCCTCTTAGGTTAAGAATTCCGCCACCGGCGGTGAATACTCCGTAAGGTTGTGGCGTGCTCCGCTAACAAAAACTTAAGCCACGTCTTTACTTGGCCGTCTTTTTGATCGGACAGAAAAGTGATCATCTTGATGAGTAGAATACGCGGGAGTTAATCTGCCGGCAGCATACGCGCGGAGTCAATCCATCCCACGTGCACTTCTATAAGTCTTAAAGGGGAACTGCTGATAGCGATGCAGTGTCAAACAGGTTATCGGGGCTCTGACATCCCACACTTGTCAAACGGTTAGCAAAGCGCTACGACATCGTTTTCGTCCACGGTACTTTCATAGGATAACATGTTTCATTAAAAAAGTCCGTCGCCCTTTGTGACTGATTATTTTGGTTACCAGCTGCAGGGAGCATCGACCGTGAAGCGTGAAACGCCGAAAAAAGCTTCTGCAACAGTTTGGATCGCCCTGTTGCCTCCCGGCAAGTGAGAGCGGACGCTCTTCGGTCCGCGCCTCTTAAAAGGAACGGCCTCTTCTCTACTTTATGGGCACTTAGAGTCTCCAAACTTTAGTCAAGTTCCCCCTACCGATTAAGATCGGGGCACAGACCAGCGGAGACGATGTCCAGTAGCCATGCTCTTATTCCGTAGAAGGGGCGGATTGCAGGCAGAGAGATATACCAAGTTCCTAGATTTTCCTCTACTCGGGCGAGCCTTGTCCATTAACGTGTAAGTTCTAGCGTTACTCGCTTACCGTATGTACTCCCCGAACCGTTGGGAGTGCGTAGGGCGACTTGTTAGCACCCTGTCCAGAATCTGCTTGCTATGATCCCCAGTGAATCTTGTGCCCCCAGTAGTTTGGCCTGATCGATAGCAGGCTGTGTCGCGTCAGTCAATCGAATTCAGTCCGGAACAAGCCTCCTCAAAGATTTTTCTTCCGTTCTACGCACCTTAGCCCGTGGTCGTAGTTGACGCCTTTAAGTCAAGTCATTAAAGCCCGCGCCGCATATAGATGTAGTCCATCGACTAGACGCGGAGCGATTAGGAGCCTCATTACTTTGATTCTAGAGGCTTTCTGCAAAAGGGTATTGCCTGATGTATAACGATCAACTGCGCAACTGAAAGACCGGACTGGACGAAGACGCAACACAGGACCACCGTGGTGTAGTCAACAATTAAGTTCGCCAACCAACTTTATGTGGCGACGGATAAGTAGATTGAAAGGTAACCAGGGTGCGAGGTATCATAAGTATACCGGGAGCCCCCTGTCTGGCTTGTCCAAATGTGATGCCAGGCGAAAACAACCTTTCTGCGCTTCCCAACCATCAGTGAGACACTAATTCCTAAAACACCACCGCAAGGGGTGGTGCGATCACGGTCACTGGTATCAGGGCTGCACTGTTTAGCGGGGGGGGCTTCGCATCGGCGCCTCTCGGTAAATTCCCAATAATCGGTTGTGTCCGTTGACACACTGTCGCTCTTGTACGAAAGGTATGAGACCGTCGAGCACGCGGATATGTCGACCTCCAACCGAATGGAAGTAGAAGTGCGTTCTTTTTCATATGATGGATGTTATTAGTAAGAATTGCTGCGACCCTAACTGATAATCTCTACTTTTATTTCATCACTATGCCTAACCCTCGAAATCTTTCTTAGGCACGTTAGCCGACCCCATCTGGCGGCCAAATAGAAGTATTTTACCCCAATCAACCTTCCCCTTGGCTTTTATGACCTTCAAAAAAAACCCATGGATTCGGAAGGGAAACAATGAGGTTCCGTCTTAATTTTTGATGTTAGGGGCGCAGTGCTAAGGATGACTCAGGAAACTGAGGTTTTATTTTGCCCTAACCAATTCGAGATACTTAGCGATACACCGAGGTCGAGCTCTCAGCCTCGTTCTGCCAACATTCTCTTATGCTCTGGTGTCTGGACTAGGTTATTGTCCAAGACACCATGCCCTAAGCCTCGCGCAGCGGGTTCTTAACTTCAAGATTGAACACGCTAAGACCAGGACTCAGGGGCCACTATAGCCGGAACTCACAGCGGGATCGTCATTCCATATGTCCGTCTATAACGCGAACTAGGTTTGAGGCCTCGGCTCTAGTCGTATGCACGTCGACGCTAAGTTAACTATTATATGGCTCTCTGAGATCAAGGCTCAAGCTGGACGAGGTGAGATAATTGCTGATGCAAACCCCAAGTTGAACACGCCTCCAAAGTGGGTTGAGGTCATCCAGCATAGCAGATGGAACTCCCGCAGAGCCTCCTTCGCACCTGCGACCGACTAACTCAGCGAGTGCCTCTCTAGTAGCCTTTCTTCCAATACTAGTTATTTTCTGAGATTTTATGTGCACCTTCGAAGGGCTAACTGTAAATCGTCCCCGCAAGCTTAGCTTCGCTAGGACATCTCGTTTAGTCGTGCACAGGCTTTGAGCGCTGGATATTCACTAGGATTCCGGCAGGGGGATTCACGTTCTACCTCTCAGGTGAATCAGGTTTCCTCAACACTAACCCACGCTTCTGGTGGCAGTGGATTCAGGGTGCGATGTGGTTATGATCTTTAACGGCAATTTTTAACAGTGGCTTATCCAAGAATCCGCTATTGTAAATTACGCGGTCACTCTACTCGTAAGGAGTGAAGATCGAACCCACACAAACTCTATTAGGTACCCGAAGGTCACGTCGCATGGCAGATGGCAAGGAGCCATGCTCCCGGGTCAAATGGGACCTGTTTTGTACTAGCTAAGTGACTGTACAAGTCGGGGCTCTTAGACCACGAAAATCTGCCCCGAGACCATCTTGACGTCGGTTACGTTTTGGCTGGACCACTGCACGCAACCCAGACTAAGCCAGGTTGAAACGTTAACTCGCTATATCGGCATACCATACAGGGCGAGATCGCCTTTTGATATAGCGTCCCGGGATTATGGATCGCAATATGCTTCTGTGGCGAGCTAGCTTACAAGTTAAAACCGTTCGGTGACCCGTCGGCTAAAATTGGCTTTTCAATAAGACTACCGTCGTCTGGCCCCGATAAGGTGATCTGTCTGCGTAGCGGTCCGGGTATTCGAACTTGTTGAAAAGCTTTTCCTTAAAGTCGCAATACTGGACGATCTATCAACTAGAATCACGCGACGAACGAAACAGAAGAATTACTATCAAAAGACGAGCAATATCCTTCCTGCTTATCGCGGCCACACACGGAGCATACAAGAATAACCGAACTCTCTATTTGGAATCAGAAGTCCTATCCGACGAACAATTATCCGCAAACAGGATATAGCCCCTATGTGTAAACGACGGTGCCGTAACTATCAAGGCCGCCAGCCGATGACTCACGCATAAACATAAGGAGGCCTATAACGCAACCGTAAGTGTCCCAGCTTAGATAATTGGACCTATTTTCAATATTGCAGAAAAGGAATAGACTGCAAATGCTTAGAGAACCATCCTGCCAATTGTGTCCCTCTGAGTCTTGGCGTTGTGTATACCTAGATTTGGAGTATCTCTCGATGATAGCTACATGTGTGGTCAATTTAGGACCCCTAATATTCAGTCAAAATGGATCGGGCCCACGGCCGTGTGCGGATCAGAGTAAACACAGTAAGAACCTGCTCACCCAATCGGATTCTTGGCCCCAAAATTCGTTGTTATGCTTACAACTCCTTACCGACTGTTATGACTCGCCTTGGGACTGCGCCGATTTGACCGGCCAAATCTTTACAATTTGGTCACCGCGGAGAGCAGAGTAGATAATCCCTCGCGAACGTCAAAAACCCCCTAGGAAACGCATCGTCTAGTAAGACTCATTCTACCCGATCCGACACTCCCGTGGCTCTTCAATGATTAAACCATATACCGCGTATAGCACTTGTCCCGTAGTCTCTAAGGTGACACACTCCTTCGTGCCTACTACATTTCACGGGACTGCGGCTTGTCTTCGCAGTAGGTCTGAACCCCATTCCGGAGGAGATATCGTCAATGTCTGCTTGCTCCGTGAAGGCGATGGTAAGATTCGCCCCTGGTTGTGTCCTACAACGCTAGACTGGAATGGCATTAAACGTGAGTGCGATTAATGAGAGCTTCCTCTGCGACTTTCACGGATGTTTCCTACATCTCTTTTACTCGTAGCAATCTTCCGTTCTCGTGACGTCTTCCTACCGAGGCCGCTAATAGGCGTATTTAACGTATACCCCCTGCAAGCAACCACGTATAAGTACTTCCCGGGGCCATGCAAATAGCTAGAAGGTCTGAATGTGAAACTTTTCAAACTGCCGCTTCGGGCCTTTATGCAAATGGCATATGACGCTCAACTAAAAGCCCCTTGCCGGTTCGGCTACCTTGGCAGCGTCAGACGAATAGTAACAGAAGGATTGGGGTTTCTAATCCTCCACTAACGACTCGAGAGAGATGGCCGTGCAGCAATCGCAGGCGAGCAGCTAACCATACATACGTAAGCTCCGGGTGCGCAGGGCGATTTCTGTAACAAGCCAAGGCTTTGTTAGCGCCCTGCGTCTTCGTCGAAGCGACCTATGGCTATAGTTCCCTAAAACATACCGTTGAGTATTCACGCTGCTTGTTCGCTAACCGACGTCTGTTGCTCAGACTATGCATGATACTTAGTGCCCATTGCTTTGGCACAATTTCAGTGACGTTAAACTGCTTTACTCCTCCAGGCAAGTCTCTAAAAACCGGTAGAGGGATGTTGCTAACCTACACAAATTACGGCTGAGTGCTGCTCCATATCTCTCTGAGCCGTTTCACTGGTCAGTCACTTCACGTGCGGACCCGTCCGCGAGGGATTGGCCCCGAGGAATACACCAGAACGGACGTAGTTGGCCCACATCTGGCCGGTTCAGAACCTAATGCCCGAGTCGGACTACAAGGATCCTTAAGGGGCGTGTGTCGCAAGCGCGCCGGGACCACTAACCTGTAGATTCAATCCGTCTTATGCAATACAGGTAAACGTTGCAGGCCCAGCGGTGGTCGGGATGTATTAATTCCGCAGTGTTGTGGAATACAGGAATCCGAGGCAAAACGGCACATCCCTTTGCATATTTTGCGCATCCGCGGGGACGTCGTTGAGGTATGGAGCAGAAAAAAAGAAGGCAGGCTGTTCCAATGGAGTTAGACCAACTACCCTCGGTCCACGTAAACTCGTGAGCAAAATCTCGGAACCCCCAAGCCCGGAAGTTCCGGGACTCAGCATACTACCGCGGAGATGCATGAGGCGGGATACACATTTCGGCGGACTACAGGCAGGCCCAACCCTGTATACTAGTAACTCTCCAACGAGGCCATAGACGTGCACACCGCTTGTCTCCCCGGTTGTATCCAACGTTGTTCCGCACAAACGCCTTATCTTTGATTTTTACTAATCCGAAGTACTGAGTGCGATGTGTATAACCTCGGAGAAAACTGGTCTTTACGGACATTTCGATAGCGGGAGTTTGGGTGATGCGCTCCACCTTTGAGCGCTGTCTGGCAGGTCGTAAGAGGTCACTGTTGGGGGGTGCCGCATCTAGAAGCCAGTGGCGGTCTGTGCGATTTTAATCGTCGAGTCCGCATGTGCTAGTCAACTATCCCTATTAGGTTTCCGTAGCTATGACAACTCTTGAGCGGGAAAGGGGTGCCCGCGGGTGAATCTTTTCGACCTCTGAGTGATCCGTGCGGAAGATTGAATATCTTCTCTACTAATATTCTATACTCCGGGGAGAACTACTTATTATTGCGGATAATCCTCCCATCTTTATAATCGGTCCTGAACCATAAGAAAATTTGTCATACCAGCAATTCCGCCACATGATACGAGCGCCCAACGGTGTGCTCTAAGGTGACGCCGGTATGGTAGAAGTACGTGTACAACGCTTCGAATACGCTATATCTAGTAGGTCCGAGGTACGAAACCTATTCCGTTCTCGGGATTGGACCGCATGGGTTCATCTAGGGGGCCCAGTAGGCGCCCACCCAAACTCCCGCATCACGTTTCACGTATGGGACGTCAGAACTAAAATTGCTCATAAATTAGTATGGCGCACACGTCAACTCACTGAGTCTTGTACACCATCCGGACTCATCCTGAGTGTGGGCTCTGAGGTACGAACAGCGTCACTGTTTTACGAGGTCGTGCAGCTGATGACCAAATCAAGAATCCACGAAGCTCGGTAGTACGGTACCTACTGCTCCGCCAACAGATCACCGTCGGGATCCTTTACTCTTCATAACGAAGACGCAACTTCCAGACACATGGATTGGTTGCTACTATGATGGACAATCTACGTGAGCATGACGCCTGCGGTGTGGCTTGTTAGTCTACTCACGAGAATGTACAAATCGCGTCGAGCTAGTTGTACCGAAGTAATTGGTCGGAACGTCCTTAGCCTGCCGTCTCGATATTCAGGTCCAACAGCGGTCTCTTACAAGTGTCTTAGTAGCGCGCTAGCAGATGCAGAGGTACAAGGTCTCTACATCGAGCGTCCGCACGCTGGAAGGTGCTATCTCTGGACCTCCTTTGCCGTCCTGCTAACGGAAAAGGAACTTAAGTCCGATTACGTCATTGTAAGGCCTTTCTCAAGTGGCGTATGGAGATGGCTTAGCGGTCGGCTAGATTGGCCGGTAGCGATTTCTGCAAACAAAATGCGAAGCGCGCCTGTATGTCCCCCCCCCTGAGGTGGGTACAGAGACAAGCCCTTCCGTTCTTGTAGCCTTAATTCCCAACGCGTACACACTTCTCCACGATTCAATACACAGGACTTTCGTGGAGGATTGGTTCATGCGCGTGGAAGTATAAGCATCCCTGTGGATGTACCAATAAGGGGACTGGAAACTGCCGCTGACTGGTTTATTTTACCGCGCGTCCTCTTTTGGGATAACGGAGGACGTCCTATAACCTGTTACAAATGATTAGGCAATTAATTCGCCAAACCCGTTGAGCCAGATACTGCGCGAGAGGGCTCGCGGTTAGCTCGCTATGTTGCGGCGGGAGAAGACAACTGATGTAGTTCGGTTTAGTGAGAGTGTCTTCTGACTAAGAGGGTTTTGTGCACTAACTGCGATAACGAACCACTTCCCCACTGCAACCAACCCTATCGGGGGCGTGTTCTTAGTTTGTAGCCGTCCAAATCATTCCCTACTACGACCATACTCGTCGAGGCGACCTCTTCGAGACCCGGTTTCGTTGCTTGTAAGGTTTAGAAATGAGACCGATTATGATAAGTGAACCACCTCGCCGCGGTCCGCGTGGCGAAGCCGTCCTACGGAGGAACCCAGTCAGTGTCCTCCTCCATCTAGTATGTGCTGCTCTAGCAGCTTGAGCTCCCGTGCGGCGCGTTTCCCAGTGGCTTCCCGTACGCTCGTTGGCGCCTCCAACGCCCTTTTGGGCCCGTGTATTCACGGTATAACGGTTTTGGCCACCTAATTCATCTATGAAATTGCTGCTGAATGCCCCTGTAGATAAGAGATGATGGCCTCTTTCCCTATTAAATCCCACTGGGAGCCATCAGCACCGCGTCCAGTGCGGCCGCCGCTTGTGACTCTTTTAATCAGTTAGTTTCACTACATGTAAGATCCGTGCTCCCTAGTACCCGAGTTAAGACACTATCACGTAGTTTGAGTGCGCGTCTCACGTCTACTTTCTATTGAGCCCTGCACATGGCGACGTTGCTTTTGTTTCCGCTACCCCTTAGGATATAAGATCAACAGAAGAATCGCTGAAGACCTGCCCTTCAAACTGGTATCACGCGTAATCAGCGCAATGGCAGAAGCTGCCGGTATCAATCTTCACTACTCTGCTTGCGAGAAGAAAGCGAGAGCACCTTACGCACCTACTTGGTTTCAGGCTACTTTTGTCGAAAATGCGTGTTGACATAGCACGTTGCATTAATTGGGTTAGGGATCGTTCCCGGGCAGAGATCGGGCACTAGGTTGGCCCGTGCGGACCGTCGATTGAGCAACTTATACGAGTGTAATAGGGACGATAGGCAACACTGGTGGGATACGTCCACTCTGGAATATGTTCGTAAATACATACGTATGGCGCGCGTCGGTGATACCAGCGGAGTATTGATTCCCAAGTTATTGCCAGTATCGGGGTAATAGGCCGGTTACTAGCTCGATGCAGTAACGCCGCCGGTCCGGACCTAGAATCCTCGGAGTGCTGCTCGCTTGCATGTAACTCGATTCCTCGCCACCAACTTTTCGACCCGTTATGTTATAGGGCGCGCGTTGCGCGGCCGACGCTTTGGTAGCGCTATGGGTTAGCTACTTCCATTGCAGGCATAACTCTGGGTTGCGTGATCCCCAGATCACGTTACATTCGGTGCTGACAATAGGCACAGCTTCTATGGAGCATATCGAGCTCTATGCTAGGTGGGCGCCTGGCAAATATGCATGGGTTATCACGCGGTCGAGTCCAACCGTACCCATGGGATTCTCTGGCTAGCCTCATGATTATAGTCTCGTCATCAACAGTCTGACCTCTGATCCGCACTCCATGCAGAAAACCGAAGTACCGAAGTAGAAGTTACGGATCCATGAGTTGTGGCGGCACGATTTGGACGCCGGAGATGGCTACAGCCCAATTCGTTGACAGATCAAAAGGTCCACGTCATGGACGTCCCTCAGCCGAGACGGTCGTTAGGGCGCCATCAAGTTGTTGTGATTGAAAGTGTTCAACTCTCGAGAGCCAAGGGCCCGAACACATAGGCACCAGTAGTCAAGCTGCGGGAGGCGACTTTTCCGTGGGTATTCAATCCGGGTGCTCATGCCTACAGAGAGTCAACATGCTAAGTCGTAACGTGTCACGCCCGTACTCAAAATGCGTCCCTCATCCGAACGAGCCACGGATGACTCTCTTAAGAGACTGATACTAACGCCATCGATCTGCGGTTTACCAAAGTCGGCCGTGTTGCACGTTTACTAATACCTTAGATGCGTGGGTCTACTCGGGTGATATCGCAGAGCTCTAGCTACATAGTCTACACCAGCATCCAGGGTCAAAGGCCGATGGGGGCCTGATGACAGGATATGCCCTGACTGATTCATAAATGGACTGGCTGGTCAATAGTGCGGGATATAATCGCCCTTTTTTTGTCCTAAAATATGGCGAGGGAGCGGTTACGGTTGGCGCCTACACCGACGTATGTGGAATAGCCACTGGAAGAGTACTGCGCGGTGGGTCTCGTAGGTAACCTGCCCCCTGTCATGGGGTCCCATATATCGACGGATCGAAGATTACACCATCTGGTCGTGTCATGCAGTTCTAAAGACCATTGGAGCTCCATATGGTCGTGTCCCCGTCCACTCAAAGGCAATGGATTACCCTGTCCGGGTTTCACTGGTCACAGTATGGAGACGCAAAGGGCACCGGACCCACCGACCTGGAAGCCATGGTACTCCGTACAGGTAATTAGACGTTCTCGGGGAGGCAACCACCGACGCTGACACTAGACTTACACATCATGGAGTTCTCGTAGGGTACAAGTCGGTCAAGGGGGGCCAAGGGGAGTCAGAGTCGCGGTTGAGACTGTTGCGGGGTAGCCCCGCCGAAAACGTTTGCCGTGCGGCGCCAACCTAGGATACGTAAGTGTACAAGAAAGGAAGTTTCTTGACTATCATTATTTGTCCCCCGTTTAGAAATAAGCCGAAAGCGTATCTAAAAGAGCAAGCACTTTAGGCTAGCATTTTGACAGCACTAAGCGACGCGTTGGACAGTTGCCCAAACGACGAAGAACTACGGTGACCTAACGCATTTAGCGAATGTATACGCCTCCAACCTCGCCCCAAGACACGATCACATCTCAATGGCTCAAAAAAAACACCTAGAGCTCGCGTAGTGCTAGGTCGGGGCCGCTGATGGGGTAGATGACTTACGAGACCCAAATCTCTGATGAATCTTTCAGGAGATGACC'
# y='GGGGCAATGTATGGGACGGAGACCGGAACAATGGACCGTAAAGCCAGGTAACACAGCGAGAGTACGGTGTCTTGGTCGTAGGTTCCACTGCTTGGCGGACGATTGAGATTCTGCCCTAACCCGAAGAACCGTTAACCTATTAGGACGTGACGTCTATCCCTCGTTAACTACCACGGATGAATAGTCAGATACACTCTTTCGAAGAGCAGAACGTGGCCTTCGGACATATGGGCGGACCAGAACACCCCATCGTCTTACGGCTGGGAGGCCATAGCTCTCGCGCCCCCGACTTTTACACCAAATCGTTGCACTCGCGGAGAGCCGGATAGTAGTCTTCAACTCCTAGTCGGGGTGGCTTTCAGTCCTGCCTGGTGGGAAAAGAGAGCCTGTGCCACGGACGCCACCTAAACTCACAGATGTGCGCCGTGGCGAGGAACAGCCCAACAGTAATTGCAGTAGTATAACCCGTCTAGATTCCTACCTATAAGTAACCACATAAAAGATTAATTGATATCGGGCTACATGAAGCCTGCTATAGAGATACAAGGTTCTCTGACATAATAATGTGAGGCTACGCGCTCACACGTCATGCGTGCCGCCCTGAGAATATGTCCCACTCGTAGTGCTCAGCTAATGAATCCAGAGGGGATCTACAAAGTTATAAGTCGTCAGCATGTTTCCTGTTTTGAGGCAACTATATGCCGTGGCTATTACCGTACATGACTTTCCAACTAGGTACTTCAACGGTCATAGGTGAGCTAGTCAAGGAAGGCTATCGGTCGGCACGCCCTTCAGCCGGAGCGCAACATCCAGGCTTGGCTGGTCGTGGTCGACGGCAGAATCCCTCCTGACGATCGGGGTCAAACCTAAATTGTAGCACCATGGTCGGCATATTGCGCCTAGTGGGACCTCGCACAGGTGTATCAAAGCGCCCAGA'
# semiAl(x,y,subMat,indel)

KeyboardInterrupt: 

In [None]:
#### Overlapping alignment

In [18]:
x='PAWHEAE'
y='HEAGAWGHEE'
indel = -2
subMat = createScorMat(match = 1,mismatch = -2,nts = list(set(y) | set(x)))
subMat

Unnamed: 0,E,H,W,G,P,A
E,1.0,-2.0,-2.0,-2.0,-2.0,-2.0
H,-2.0,1.0,-2.0,-2.0,-2.0,-2.0
W,-2.0,-2.0,1.0,-2.0,-2.0,-2.0
G,-2.0,-2.0,-2.0,1.0,-2.0,-2.0
P,-2.0,-2.0,-2.0,-2.0,1.0,-2.0
A,-2.0,-2.0,-2.0,-2.0,-2.0,1.0


In [29]:
smLocal,_ = scoringMatrix_local(y,x,subMat,indel)
smGlobal = scoringMatrix_global(y,x,subMat,indel)

In [31]:
vizScoringMat(y,x,smGlobal)

Unnamed: 0,Unnamed: 1,H,E,A,G,A.1,W,G.1,H.1,E.1,E.2
,0.0,-2.0,-4.0,-6.0,-8.0,-10.0,-12.0,-14.0,-16.0,-18.0,-20.0
P,-2.0,-2.0,-4.0,-6.0,-8.0,-10.0,-12.0,-14.0,-16.0,-18.0,-20.0
A,-4.0,-4.0,-4.0,-3.0,-5.0,-7.0,-9.0,-11.0,-13.0,-15.0,-17.0
W,-6.0,-6.0,-6.0,-5.0,-5.0,-7.0,-6.0,-8.0,-10.0,-12.0,-14.0
H,-8.0,-5.0,-7.0,-7.0,-7.0,-7.0,-8.0,-8.0,-7.0,-9.0,-11.0
E,-10.0,-7.0,-4.0,-6.0,-8.0,-9.0,-9.0,-10.0,-9.0,-6.0,-8.0
A,-12.0,-9.0,-6.0,-3.0,-5.0,-7.0,-9.0,-11.0,-11.0,-8.0,-8.0
E,-14.0,-11.0,-8.0,-5.0,-5.0,-7.0,-9.0,-11.0,-13.0,-10.0,-7.0


In [32]:
vizScoringMat(y,x,smLocal)

Unnamed: 0,Unnamed: 1,H,E,A,G,A.1,W,G.1,H.1,E.1,E.2
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
W,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
H,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
E,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
A,0.0,0.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
E,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


#### Try this implementation later

https://academic.oup.com/bioinformatics/article/19/suppl_1/i54/227687

### My implementation

In [4]:
m = len(x)
n = len(y)
whichMax = np.argmax([m,n])
if whichMax == 0:
    target,query = x,y

else:
    target,query = y,x
target

'GTAGGCTTAAGGTTA'

In [143]:
def makeIndex(target,k=3):
    k = 3
    index = {}
    for _ in range(0,len(target)-2):
        kmer = target[_:_+k]
        if index.get(kmer) == None:
            index[kmer] = [_]
        else:
            index[kmer].append(_)
    return index
index = makeIndex(target,k=3)
index

{'GTA': [0],
 'TAG': [1],
 'AGG': [2, 9],
 'GGC': [3],
 'GCT': [4],
 'CTT': [5],
 'TTA': [6, 12],
 'TAA': [7],
 'AAG': [8],
 'GGT': [10],
 'GTT': [11]}

In [29]:
kmers = []
for _ in range(0,len(query)-2):
    kmers.append(query[_:_+k])
kmers

NameError: name 'k' is not defined

In [153]:
index.get(kmers[0])

[1]

In [162]:
globAl(target[1:],query,subMat,indel)

('TAGGCTTAAGGTTA', 'TA-G----A---TA', -2.0)