In [1]:
import pandas as pd
import numpy as np

## From https://github.com/ZhangQiuxue/Rosalind/blob/master/Textbook_05E.py
def global_alignment(v, w, scoring_matrix, sigma):
    '''Returns the global alignment of v and w subject to the given scoring matrix and indel penalty sigma.'''
    # Initialize the matrices.
    S = [[0]*(len(w)+1) for _ in range(len(v)+1)]
    backtrack = [[0]*(len(w)+1) for _ in range(len(v)+1)]

    # Initialize the edges with the given penalties.
    for i in range(1, len(v)+1):
        S[i][0] = -i*sigma
    for j in range(1, len(w)+1):
        S[0][j] = -j*sigma

    # Fill in the Score and Backtrack matrices.
    for i in range(1, len(v)+1):
        for j in range(1, len(w)+1):
            scores = [S[i-1][j] - sigma, S[i][j-1] - sigma, S[i-1][j-1] + blosum.loc[v[i-1]][w[j-1]]]
            S[i][j] = max(scores)
            backtrack[i][j] = scores.index(S[i][j])

    # Quick lambda function to insert indels.
    insert_indel = lambda word, i: word[:i] + '-' + word[i:]

    # Initialize the aligned strings as the input strings.
    v_aligned, w_aligned = v, w

    # Get the position of the highest scoring cell in the matrix and the high score.
    i, j = len(v), len(w)
    max_score = str(S[i][j])

    # Backtrack to the edge of the matrix starting at the highest scoring cell.
    while i*j != 0:
        if backtrack[i][j] == 0:
            i -= 1
            w_aligned = insert_indel(w_aligned, j)
        elif backtrack[i][j] == 1:
            j -= 1
            v_aligned = insert_indel(v_aligned, i)
        else:
            i -= 1
            j -= 1

    # Prepend the necessary preceeding indels to get to (0,0).
    for _ in range(i):
        w_aligned = insert_indel(w_aligned, 0)
    for _ in range(j):
        v_aligned = insert_indel(v_aligned, 0)

    return max_score, v_aligned, w_aligned,scores,S,backtrack


def readBlosum62():
    '''
    Read scoring matrix
    '''
    with open('blosum62.txt') as matrix_file:
        matrix = matrix_file.read()
        lines = matrix.strip().split('\n')

    blosum={}
    cols = lines[0].split()

    for row in lines[1:]:       
        idx = row[0]
        vals = row[1:].split()
        vals = [int(_) for _ in vals]
        #print(idx,vals)
        blosum[idx]=vals

    blosum = pd.DataFrame.from_dict(blosum,orient='index',columns=cols)
    return blosum

In [2]:
def scorematch(residueA,residueB,scoringMatrix):
    '''
    Get the value of a score between two matches
    '''
    return scoringMatrix.loc[residueA][residueB]

def linearSpaceAlign(x,y,scoringMatrix,indel,verbose=False):
    ####
    m = len(y)
    n = len(x)
    score = list(np.cumsum([indel for _ in y]))
    nullScore = list(np.cumsum([0] + [indel for _ in x]))
    prev = score.copy()
    score,nullScore,prev
    ####
    for j,residue in enumerate(x):
        if verbose: print(j,residue)
        score = [nullScore[j]] + score
        prev = score.copy()

        for i in range(1,m+1):
            if residue == y[i-1]:
                if verbose: print(i, residue,'match',y[i-1], prev,score)
                score[i] = prev[i-1] + scorematch(residue,y[i-1],scoringMatrix) 
            else:
                if verbose: print(i, residue, 'no match', y[i-1], prev,score,'choose',
                                      score[i-1],
                                      score[i],prev[i-1])
                score[i] = max(score[i-1] + indel, #Up
                                      score[i] + indel, #Left
                                      prev[i-1] + scorematch(residue,y[i-1],scoringMatrix)) #Mismatch
            if verbose: print(score,"\n----")

        score = score[1:]
    
    score = [nullScore[j+1]] + score
    return score


In [3]:
blosum = readBlosum62()

In [4]:
x='PLEASANTLY'
y='MEASNLY'

In [17]:
def splitstring(s,point):
    fwd = s[0:point:]
    rev = s[point:][::-1]
    return fwd, rev

def halfstring(s):
    fwd = s[0:int(len(s)/2):]
    rev = s[int(len(s)/2):][::-1]
    return fwd, rev

def linspace(residue,y,updatedScore,nullScore,scoringMatrix,indel=0,j=0,verbose=False):    
    """
    Alignment in linear space
    Add scoring matrix capability
    """
    n = len(y)

    updatedScore = [nullScore[j]] + updatedScore
    prevScore = updatedScore.copy()

    #(prevScore,updatedScore)

    for i in range(1,n+1):
        if residue == y[i-1]:
            if verbose: print(i, 'match',y[i-1], prevScore,updatedScore)
            updatedScore[i] = prevScore[i-1] + scorematch(residue,y[i-1],scoringMatrix) 
        else:
            if verbose: print(i,'no match', y[i-1], prevScore,updatedScore,'choose',
                                  updatedScore[i-1],
                                  updatedScore[i],prevScore[i-1])
            updatedScore[i] = max(updatedScore[i-1] + indel, #Up
                                  updatedScore[i] + indel, #Left
                                  prevScore[i-1] + scorematch(residue,y[i-1],scoringMatrix)) #Mismatch
        if verbose: print(updatedScore,"\n----")
    return updatedScore[1:]

def createScorMat(match = 1,mismatch = 0):
    """
    Create scoring matrix for nucleotides with a given match and mismatch values
    """
    nts = ['A','C','T','G']

    scoringMatrix = np.zeros((len(nts),len(nts)),)
    np.fill_diagonal(scoringMatrix,match)
    scoringMatrix
    scoringMatrix[scoringMatrix == 0] = mismatch
    scoringMatrix = pd.DataFrame(scoringMatrix,index=nts,columns=nts)
    return scoringMatrix


def align_linear(x,y,scoringMatrix,indel=0,verbose=False):
    # Initialize scoring 
    score = list(np.cumsum([indel for _ in y]))
    nullScore = list(np.cumsum([0] + [indel for _ in x]))
    for i in range(1,len(x)+1):
        prev = score.copy()
        score = linspace(x[i-1],y,prev,nullScore,scoringMatrix=scoringMatrix,indel=indel,verbose=False,j=i-1)
    return score

def hirsh(x,y,scoringmatrix,indel):
    fwd, rev = halfstring(x)
    fwdMat = wrapAligner(y,fwdX,scoringmatrix,indel)

In [15]:
x='PLEASANTLY'
y='MEASNLY'
indel = -5
####
yA,yB = halfstring(y)
scoreF = linearSpaceAlign(yA,x,blosum,indel)
scoreR = linearSpaceAlign(yB[::-1],x[::-1],blosum,indel)
print (scoreF,scoreR)
print (np.array(scoreF) + np.array(scoreR[::-1]))

[-15, -11, -7, -3, 6, 1, -4, -9, -14, -19, -24] [-20, -8, -8, -5, -8, -9, -6, -11, -16, -20, -22]
[-37 -31 -23 -14   0  -8 -12 -14 -22 -27 -44]


In [78]:
x='TATGC'
y='AGTACGCA'
####
indel = -2
mismatch=-1
match = 2
### Create scoring matrix
scoringMatrix = createScorMat(match,mismatch)

#### Return last row of alignment
align_linear(x,y,scoringMatrix,indel) #Max is last element

[-6.0, -2.0, -1.0, -3.0, 1.0, -1.0, 3.0, 1.0]

In [80]:
xA,xB = halfstring(x)
scoreF = linearSpaceAlign(xA,y,scoringMatrix,indel)
scoreR = linearSpaceAlign(xB[::-1],y[::-1],scoringMatrix,indel)

arrSum = np.array(scoreF) + np.array(scoreR[::-1])
maxScore = max(arrSum)
whichMax = np.argmax(arrSum)
maxScore,whichMax

(1.0, 4)

In [81]:
x='TATGC'
y='AGTACGCA'
####
indel = -2
mismatch=-1
match = 2
### Create scoring matrix
scoringMatrix = createScorMat(match,mismatch)
global_alignment(x,y,scoringMatrix,2)

('17',
 '--TATGC-',
 'AGTACGCA',
 [4, 17, 8],
 [[0, -2, -4, -6, -8, -10, -12, -14, -16],
  [-2, 0, -2, 1, -1, -3, -5, -7, -9],
  [-4, 2, 0, -1, 5, 3, 1, -1, -3],
  [-6, 0, 0, 5, 3, 4, 2, 0, -1],
  [-8, -2, 6, 4, 5, 3, 10, 8, 6],
  [-10, -4, 4, 5, 4, 14, 12, 19, 17]],
 [[0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 2, 1, 2, 1, 1, 1, 1, 1],
  [0, 2, 1, 0, 2, 1, 1, 1, 1],
  [0, 0, 2, 2, 0, 2, 1, 1, 2],
  [0, 0, 2, 1, 2, 1, 2, 1, 1],
  [0, 0, 0, 2, 2, 2, 1, 2, 1]])

In [82]:
global_alignment(xA,y,scoringMatrix,2)

('-3',
 '--TA----',
 'AGTACGCA',
 [-11, -3, -3],
 [[0, -2, -4, -6, -8, -10, -12, -14, -16],
  [-2, 0, -2, 1, -1, -3, -5, -7, -9],
  [-4, 2, 0, -1, 5, 3, 1, -1, -3]],
 [[0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 2, 1, 2, 1, 1, 1, 1, 1],
  [0, 2, 1, 0, 2, 1, 1, 1, 1]])

In [83]:
global_alignment(xB[::-1],y[::-1],scoringMatrix,2)

('5',
 'T-GC----',
 'ACGCATGA',
 [-3, 5, 1],
 [[0, -2, -4, -6, -8, -10, -12, -14, -16],
  [-2, 0, -2, -4, -6, -8, -5, -7, -9],
  [-4, -2, -3, 4, 2, 0, -2, 1, -1],
  [-6, -4, 7, 5, 13, 11, 9, 7, 5]],
 [[0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 2, 1, 1, 1, 1, 2, 1, 1],
  [0, 0, 2, 2, 1, 1, 1, 2, 1],
  [0, 0, 2, 1, 2, 1, 1, 1, 1]])

In [84]:
linearSpaceAlign(xA,y,scoringMatrix,indel)

[-4, 0.0, -2.0, -4.0, 0.0, -2.0, -4.0, -6.0, -8.0]

In [85]:
linearSpaceAlign(xB[::-1],y[::-1],scoringMatrix,indel)

[-6, -5.0, -1.0, -3.0, 1.0, -1.0, -3.0, -5.0, -7.0]