# Align Two Strings Using Affine Gap Penalties

[ba5j](https://rosalind.info/problems/ba5j/)

A gap is a contiguous sequence of spaces in a row of an alignment. One way to score gaps more appropriately is to define an affine penalty for a gap of length k as σ + ε · (k − 1), where σ is the gap opening penalty, assessed to the first symbol in the gap, and ε is the gap extension penalty, assessed to each additional symbol in the gap. We typically select ε to be smaller than σ so that the affine penalty for a gap of length k is smaller than the penalty for k independent single-nucleotide indels (σ · k).

## Alignment with Affine Gap Penalties Problem

Construct a highest-scoring global alignment of two strings (with affine gap penalties).

**Given:** Two amino acid strings v and w (each of length at most 100).

**Return:** The maximum alignment score between v and w, followed by an alignment of v and w achieving this maximum score. Use the BLOSUM62 scoring matrix, a gap opening penalty of 11, and a gap extension penalty of 1.

In [2]:
# Residue order shared by the rows and columns of ``blosum``.
aminoacids = list('ACDEFGHIKLMNPQRSTVWY')

# Substitution scores: blosum[r][c] is the score for pairing aminoacids[r]
# with aminoacids[c].  The table is symmetric.
blosum = [
    #  A   C   D   E   F   G   H   I   K   L   M   N   P   Q   R   S   T   V   W   Y
    [ 4,  0, -2, -1, -2,  0, -2, -1, -1, -1, -1, -2, -1, -1, -1,  1,  0,  0, -3, -2],  # A
    [ 0,  9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -2],  # C
    [-2, -3,  6,  2, -3, -1, -1, -3, -1, -4, -3,  1, -1,  0, -2,  0, -1, -3, -4, -3],  # D
    [-1, -4,  2,  5, -3, -2,  0, -3,  1, -3, -2,  0, -1,  2,  0,  0, -1, -2, -3, -2],  # E
    [-2, -2, -3, -3,  6, -3, -1,  0, -3,  0,  0, -3, -4, -3, -3, -2, -2, -1,  1,  3],  # F
    [ 0, -3, -1, -2, -3,  6, -2, -4, -2, -4, -3,  0, -2, -2, -2,  0, -2, -3, -2, -3],  # G
    [-2, -3, -1,  0, -1, -2,  8, -3, -1, -3, -2,  1, -2,  0,  0, -1, -2, -3, -2,  2],  # H
    [-1, -1, -3, -3,  0, -4, -3,  4, -3,  2,  1, -3, -3, -3, -3, -2, -1,  3, -3, -1],  # I
    [-1, -3, -1,  1, -3, -2, -1, -3,  5, -2, -1,  0, -1,  1,  2,  0, -1, -2, -3, -2],  # K
    [-1, -1, -4, -3,  0, -4, -3,  2, -2,  4,  2, -3, -3, -2, -2, -2, -1,  1, -2, -1],  # L
    [-1, -1, -3, -2,  0, -3, -2,  1, -1,  2,  5, -2, -2,  0, -1, -1, -1,  1, -1, -1],  # M
    [-2, -3,  1,  0, -3,  0,  1, -3,  0, -3, -2,  6, -2,  0,  0,  1,  0, -3, -4, -2],  # N
    [-1, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2,  7, -1, -2, -1, -1, -2, -4, -3],  # P
    [-1, -3,  0,  2, -3, -2,  0, -3,  1, -2,  0,  0, -1,  5,  1,  0, -1, -2, -2, -1],  # Q
    [-1, -3, -2,  0, -3, -2,  0, -3,  2, -2, -1,  0, -2,  1,  5, -1, -1, -3, -3, -2],  # R
    [ 1, -1,  0,  0, -2,  0, -1, -2,  0, -2, -1,  1, -1,  0, -1,  4,  1, -2, -3, -2],  # S
    [ 0, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1,  0, -1, -1, -1,  1,  5,  0, -2, -2],  # T
    [ 0, -1, -3, -2, -1, -3, -3,  3, -2,  1,  1, -3, -2, -2, -3, -2,  0,  4, -3, -1],  # V
    [-3, -2, -4, -3,  1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11,  2],  # W
    [-2, -2, -3, -2,  3, -3,  2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1,  2,  7],  # Y
]


In [69]:
def do_alignment(v, w, sigma, epsilon, *, scoring_matrix=None, alphabet=None):
    """Globally align ``v`` and ``w`` with affine gap penalties.

    A gap of length k costs ``sigma + epsilon * (k - 1)``.  Three score
    tables are filled: ``lower`` (alignment ends with a symbol of ``v``
    against a gap), ``upper`` (alignment ends with a gap against a symbol of
    ``w``) and ``main`` (best of either, or a match/mismatch).

    Args:
        v: First sequence.
        w: Second sequence.
        sigma: Gap opening penalty (charged for the first gap symbol).
        epsilon: Gap extension penalty (charged for each further symbol).
        scoring_matrix: Optional square substitution table indexed by
            positions in ``alphabet``.  Defaults to the module-level
            ``blosum`` table.
        alphabet: Optional sequence of symbols giving the row/column order
            of ``scoring_matrix``.  Defaults to the module-level
            ``aminoacids`` list.

    Returns:
        A 3-tuple ``(score, aligned_v, aligned_w)`` where ``score`` is the
        optimal alignment score rendered with ``str`` and the two aligned
        strings have equal length and use ``'-'`` for gap symbols.

    Raises:
        ValueError: If ``v`` or ``w`` contains a symbol missing from the
            alphabet.
    """
    if scoring_matrix is None:
        scoring_matrix = blosum
    if alphabet is None:
        alphabet = aminoacids

    # Resolve every symbol to its table index once, rather than running a
    # linear ``list.index`` search twice per DP cell.  ``setdefault`` keeps
    # the first position of a repeated symbol, as ``list.index`` would.
    symbol_index = {}
    for position, symbol in enumerate(alphabet):
        symbol_index.setdefault(symbol, position)
    try:
        v_idx = [symbol_index[symbol] for symbol in v]
        w_idx = [symbol_index[symbol] for symbol in w]
    except KeyError as err:
        raise ValueError(
            "symbol {!r} is not in the alphabet".format(err.args[0])
        ) from None

    m = len(v)
    n = len(w)

    # Unreachable states must lose every comparison regardless of sequence
    # length or penalty size, so use a true minus infinity rather than a
    # multiple of sigma.
    neg_inf = float('-inf')

    lower = [[neg_inf] * (n + 1) for _ in range(m + 1)]
    main = [[0] * (n + 1) for _ in range(m + 1)]
    upper = [[neg_inf] * (n + 1) for _ in range(m + 1)]

    # Back-pointers.
    #   lower_dir / upper_dir: 0 = extend the gap (stay in the same table),
    #                          1 = gap was opened from ``main``.
    #   main_dir:              0 = from ``lower``, 1 = diagonal, 2 = from ``upper``.
    lower_dir = [[0] * (n + 1) for _ in range(m + 1)]
    main_dir = [[0] * (n + 1) for _ in range(m + 1)]
    upper_dir = [[0] * (n + 1) for _ in range(m + 1)]

    # First column: the only alignment is one gap in w covering v[:i].
    for i in range(1, m + 1):
        lower[i][0] = main[i][0] = -sigma - (i - 1) * epsilon
        lower_dir[i][0] = 1 if i == 1 else 0
        main_dir[i][0] = 0
    # First row: the only alignment is one gap in v covering w[:j].  The
    # back-pointer must lead into ``upper``; leaving it at 0 would send the
    # traceback into ``lower`` at i == 0 and off the edge of the table.
    for j in range(1, n + 1):
        upper[0][j] = main[0][j] = -sigma - (j - 1) * epsilon
        upper_dir[0][j] = 1 if j == 1 else 0
        main_dir[0][j] = 2

    for i in range(1, m + 1):
        score_row = scoring_matrix[v_idx[i - 1]]
        for j in range(1, n + 1):
            coef = score_row[w_idx[j - 1]]

            lower_scores = [lower[i - 1][j] - epsilon, main[i - 1][j] - sigma]
            lower[i][j] = max(lower_scores)
            lower_dir[i][j] = lower_scores.index(lower[i][j])

            upper_scores = [upper[i][j - 1] - epsilon, main[i][j - 1] - sigma]
            upper[i][j] = max(upper_scores)
            upper_dir[i][j] = upper_scores.index(upper[i][j])

            middle_scores = [lower[i][j], main[i - 1][j - 1] + coef, upper[i][j]]
            main[i][j] = max(middle_scores)
            main_dir[i][j] = middle_scores.index(main[i][j])

    i, j = m, n
    rv, rw = [], []

    # Start the traceback in whichever table holds the best final score
    # (0 = lower, 1 = main, 2 = upper; ties resolve to the first).
    matrix_scores = [lower[i][j], main[i][j], upper[i][j]]
    max_score = max(matrix_scores)
    backtrack_matrix = matrix_scores.index(max_score)

    while i > 0 or j > 0:
        if backtrack_matrix == 0:
            # Consume one symbol of v against a gap.
            if lower_dir[i][j] == 1:
                backtrack_matrix = 1
            rv.append(v[i - 1])
            rw.append('-')
            i -= 1

        elif backtrack_matrix == 1:
            if main_dir[i][j] == 0:
                backtrack_matrix = 0
            elif main_dir[i][j] == 2:
                backtrack_matrix = 2
            else:
                # Match / mismatch: consume one symbol from each sequence.
                rv.append(v[i - 1])
                rw.append(w[j - 1])
                i -= 1
                j -= 1

        else:
            # Consume one symbol of w against a gap.
            if upper_dir[i][j] == 1:
                backtrack_matrix = 1
            rv.append('-')
            rw.append(w[j - 1])
            j -= 1

    return str(max_score), ''.join(reversed(rv)), ''.join(reversed(rw))

In [70]:
# Read the two sequences: the first whitespace-delimited token on each of the
# first two lines of the dataset file.
file = "rosalind_ba5j.txt"
with open(file, 'r') as f:
    lines = f.readlines()
line1 = lines[0].split()[0]
line2 = lines[1].split()[0]

# Gap opening penalty 11, gap extension penalty 1; print the score and the
# two aligned strings on separate lines.
result = do_alignment(line1, line2, 11, 1)
print('\n'.join(result))

212
NESDDRACNLLHDKCAKKLMENWHDRWMCGRNSSMRW-CGWAACPFQMSINAVNPYWQNTPDELRKQKPIDYVFIKMLRP--WDW
NE--ERFQFKLHDKCAKKLFENWHDRWMCGRNSSMRFQMAWGM--FGEVINAVNPLRPANWCAGCYIKPIDYVFCKMLSPAHWDW
