First we define a couple of functions that we will need later, for formatting and printing alignments.

In [3]:
import argparse
import numpy as np

def print_alignment(seqA,seqB):
  """Print the two aligned sequences, seqA first, one per line."""
  for aligned in (seqA, seqB):
    print(aligned)

def print_dynamic(seqA,seqB,dpm):
  """Print the dynamic programming matrix as a table labelled by the sequences.

  The letters of seqB label the columns and the letters of seqA label the
  rows. Both sequences are prefixed with "-" so that row 0 and column 0 of
  the matrix get a label as well.

  Args:
    seqA: Sequence whose letters label the rows.
    seqB: Sequence whose letters label the columns.
    dpm: Matrix indexed as dpm[i,j] with at least len(seqA)+1 rows and
      len(seqB)+1 columns of numbers.
  """
  seqA,seqB = "-" + seqA, "-" + seqB
  # Every cell is 5 characters wide and cells are separated by one space.
  # print(...) with a joined string works on both Python 2 and Python 3,
  # unlike the Python 2-only "print x," statement.
  header = ['{:^5}'.format(" ")]
  header.extend('{:^5}'.format(letter) for letter in seqB)
  print(" ".join(header))
  for i, letter in enumerate(seqA):
    cells = ['{:^5}'.format(letter)]
    cells.extend('{:5.1f}'.format(dpm[i,j]) for j in range(len(seqB)))
    print(" ".join(cells))
  # Blank line after the table.
  print("")

def format_alignment(seqA,seqB,trace):
  """Build the two aligned strings by walking the traceback matrix.

  The walk starts at cell (len(seqA), len(seqB)) and repeatedly applies the
  (row, column) offset stored in trace until it reaches cell (0, 0). A zero
  offset along one axis means that sequence gets a gap ("-") in the current
  alignment column; otherwise the letter at the new index is emitted.

  Returns:
    A tuple (alignedA, alignedB) of strings.
  """
  row, col = len(seqA), len(seqB)
  # Columns are discovered from the end of the alignment towards the start,
  # so they are collected in reverse and flipped once at the end.
  reversedA, reversedB = [], []
  while row > 0 or col > 0:
    step_row, step_col = trace[row,col]
    row += int(step_row)
    col += int(step_col)
    reversedA.append("-" if step_row == 0 else seqA[row])
    reversedB.append("-" if step_col == 0 else seqB[col])
  return "".join(reversed(reversedA)), "".join(reversed(reversedB))

Then we set up the scoring system we need for the alignment.

In [4]:
# Score added for every gap position; align() uses it both for the first
# row/column of the score matrix and for each insert/delete step.
gap_penalty = -1.0

def match_score(letterA,letterB):
  """Return 1.0 when the two letters are equal and -1.0 otherwise."""
  return 1.0 if letterA == letterB else -1.0


Now we turn our attention to the core of the Needleman-Wunsch algorithm, the dynamic programming.

In [5]:
def align(seqA,seqB,print_dynamic_matrix = False):
  """Globally align seqA and seqB with Needleman-Wunsch dynamic programming.

  Fills a score matrix using match_score and gap_penalty and records, for
  every cell, the (row, column) offset to the predecessor that produced its
  score. The best score (bottom-right cell) is always printed.

  Args:
    seqA: First sequence (rows of the matrix).
    seqB: Second sequence (columns of the matrix).
    print_dynamic_matrix: When true, the score matrix is printed with
      print_dynamic before the best score.

  Returns:
    The pair of aligned strings produced by format_alignment.
  """
  rows, cols = len(seqA) + 1, len(seqB) + 1
  S = np.zeros((rows, cols))
  trace = np.zeros((rows, cols, 2))

  # Borders: aligning a prefix against the empty sequence is all gaps.
  # Cell (0, 0) keeps its zero score and zero offset from np.zeros.
  S[1:, 0] = gap_penalty * np.arange(1, rows)
  trace[1:, 0, :] = (-1., 0.)
  S[0, 1:] = gap_penalty * np.arange(1, cols)
  trace[0, 1:, :] = (0., -1.)

  # Interior: choose the best of match/mismatch, delete and insert.
  for r in range(1, rows):
    for c in range(1, cols):
      # Listed in tie-breaking order: max() returns the first of equal
      # scores, so diagonal beats delete, and delete beats insert.
      candidates = (
        (S[r-1, c-1] + match_score(seqA[r-1], seqB[c-1]), (-1, -1)),
        (S[r-1, c] + gap_penalty, (-1, 0)),
        (S[r, c-1] + gap_penalty, (0, -1)),
      )
      S[r, c], trace[r, c, :] = max(candidates, key=lambda cand: cand[0])

  if print_dynamic_matrix:
    print_dynamic(seqA, seqB, S)
  print("Best score: " + str(S[rows-1, cols-1]))
  return format_alignment(seqA, seqB, trace)

Now everything is set. We can try the code for any of our favorite sequences. One can toggle the printout of the dynamic programming matrix by a boolean flag as a third argument.

In [6]:
# Align two example words; the third argument (True) also prints the
# dynamic programming matrix before the best score.
seqA,seqB = align("BANANA","ANANAS",True)
# Show the resulting alignment, one sequence per line.
print_alignment(seqA,seqB)

        -     A     N     A     N     A     S  
  -     0.0  -1.0  -2.0  -3.0  -4.0  -5.0  -6.0
  B    -1.0  -1.0  -2.0  -3.0  -4.0  -5.0  -6.0
  A    -2.0   0.0  -1.0  -1.0  -2.0  -3.0  -4.0
  N    -3.0  -1.0   1.0   0.0   0.0  -1.0  -2.0
  A    -4.0  -2.0   0.0   2.0   1.0   1.0   0.0
  N    -5.0  -3.0  -1.0   1.0   3.0   2.0   1.0
  A    -6.0  -4.0  -2.0   0.0   2.0   4.0   3.0

Best score: 3.0
BANANA-
-ANANAS
