# Handout 04
#### Sara Díaz del Ser

In [1]:
import numpy as np
from collections import Counter
from itertools import combinations
import pandas as pd
from termcolor import colored
import tabulate

### Ex. 1 _(0 pts)_  Manually determining scoring matrices
To test your understanding, consider this small example of a blocks alignment database consisting of
5 sequences of length 12 each. From these to numbers one can determinate the denominators in eqs.
(2) and (3). What is $∑_{i} f_{i}$ and $∑_{{i,j}} f_{ij}$ ?
```
TSVKTYAKFVTH
TSVKTYAKFSTH
TSVKTYAKFVTH
LSVKKYPKYVVQ
SSVKKYPKYSVL
```
Count the frequencies $f_{a}$ for all amino acids in the alignment and $f_{ab}$ for all amino acid pairs occurring
in the same column of the alignment. (For the pairs, do not consider the order of the amino acids: do
not distinguish between VS and SV, for example). From these values, calculate the relative frequencies
$p_{a}$ for each occurring amino acid and $p_{ab}$ for each occurring amino acid pair. Finally, calculate the
expected probability and the score for each amino acid pair. Fill your results into the given tables.

In [2]:
# sequences
seq1 = 'TSVKTYAKFVTH'
seq2 = 'TSVKTYAKFSTH'
seq3 = 'TSVKTYAKFVTH'
seq4 = 'LSVKKYPKYVVQ'
seq5 = 'SSVKKYPKYSVL'

In [3]:
# 20 aa
aa = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

In [4]:
# fa
def fa(seq:str) -> dict:
	"""Return frequency of all amino acids"""
	# Start count at 1
	counter = Counter(seq)
	add_1 = {key :1 for key in list(counter.keys()) }
	return counter + Counter(add_1)

In [5]:
# fab
def pair_freq(seqA,seqB):
	"""Add pair frequencies from two sequences into dictionary"""
	all_pairs = []
	# seq1 should be the smaller one
	if len(seqA) == len(seqB):
		seq1, seq2 = seqA, seqB
	else:
		seq1 = min([seqA,seqB],key = len)
		seq2 = max([seqA,seqB], key = len)

	for i in range(len(seq1)):
		# Append pair (ordered) to list:
		key = sorted([seq1[i],seq2[i]])
		all_pairs.append("".join(key))
	return all_pairs

def all_pair_freq(seq_list:list) -> dict:
	"""Get pair frequencies from list of sequences"""
	all_pairs = []
	for seq1,seq2 in combinations(seq_list,2):
		all_pairs.extend(pair_freq(seq1,seq2))

	counter = Counter(all_pairs)
	add_1 = {key :1 for key in list(counter.keys()) }
	return counter + Counter(add_1)


In [6]:
# pa (relative frequency for aa)
def rel_freq(frequency:dict) -> dict:
	"""Determine relative frequency for each aminoacid"""
	return { key : (val/sum(frequency.values())) for key, val in frequency.items() }

In [7]:
def expected_prob(freq_rel:dict) -> dict:
	"""Determine the expected probability of each aminoacid"""
	expected = {}
	# Get all possible pairs
	possible_pairs = ["".join(sorted([aa1,aa2])) for aa1,aa2 in combinations(freq.keys(),2) ]
	# add pairs of the same aa
	possible_pairs.extend([aa+aa for aa in freq_rel.keys() ])
	for pair in possible_pairs:
		# eaa = pa * pa
		if pair[0]==pair[1]:
			prob = freq_rel.get(pair[0],1/400)*freq_rel.get(pair[0],1/400)
		# eab = pa * pb + pb * pa = 2 * pa * pb
		else:
			prob = 2*freq_rel.get(pair[0],1/400)*freq_rel.get(pair[1],1/400)
		expected.update({pair: prob})
	return expected

In [8]:
def pair_score(p_freq_rel, p_expect) -> dict:
	"""Calculate the score for each aminoacid pair"""
	scores = {}
	for pair in p_freq_rel.keys():
		# sab = 2 * log2(pab/eab)
		score = 2*np.log2(p_freq_rel.get(pair,1)/p_expect.get(pair,1))
		scores.update({pair : round(score,0)})
	return scores

In [9]:
freq = fa(seq1+seq2+seq3+seq4+seq5)

In [10]:
p_freq = all_pair_freq([seq1,seq2,seq3,seq4,seq5])

In [11]:
freq_rel = rel_freq(freq)

In [12]:
p_freq_rel = rel_freq(p_freq)

In [13]:
p_exp = expected_prob(freq_rel)

In [14]:
p_score = pair_score(p_freq_rel, p_exp)

In [15]:
# Fill the tables
single_df = pd.DataFrame([freq, freq_rel], ['$F_{a}$','$P_{a}$']).round(decimals =2)
single_df

Unnamed: 0,T,S,V,K,Y,A,F,H,L,P,Q
$F_{a}$,10.0,9.0,11.0,13.0,8.0,4.0,4.0,4.0,3.0,3.0,2.0
$P_{a}$,0.14,0.13,0.15,0.18,0.11,0.06,0.06,0.06,0.04,0.04,0.03


In [16]:
pair_df = pd.DataFrame([p_freq, p_freq_rel,p_exp,p_score], ['$F_{ab}$','$P_{ab}$', '$E_{ab}$','$S_{ab}$']).round(decimals =2)
pair_df = pair_df.dropna(axis=1)
pair_df

Unnamed: 0,TT,SS,VV,KK,YY,AA,FF,SV,HH,LT,KT,AP,FY,TV,HQ,ST,HL,LS,PP,LQ
$F_{ab}$,10.0,12.0,15.0,22.0,12.0,4.0,4.0,7.0,4.0,4.0,7.0,7.0,7.0,7.0,4.0,4.0,4.0,2.0,2.0,2.0
$P_{ab}$,0.07,0.09,0.11,0.16,0.09,0.03,0.03,0.05,0.03,0.03,0.05,0.05,0.05,0.05,0.03,0.03,0.03,0.01,0.01,0.01
$E_{ab}$,0.02,0.02,0.02,0.03,0.01,0.0,0.0,0.04,0.0,0.01,0.05,0.0,0.01,0.04,0.0,0.04,0.0,0.01,0.0,0.0
$S_{ab}$,4.0,5.0,4.0,4.0,6.0,6.0,6.0,1.0,6.0,3.0,-0.0,7.0,4.0,0.0,6.0,-1.0,5.0,1.0,6.0,5.0


### Ex.2 _(7 pts)_ Calculating scoring matrices

Write a program that computes a scoring matrix score from a given block alignment database. You
should be able to call your problem like ```python blosum.py alignment.dat blosum_matrix.out``` from
the command line. The input file alignment.dat (found in the course folder) contains a small alignment
"database". It contains a number of aligned sequences of equal length with no gaps with one sequence
per line.Your program should compute the substitution matrix from this alignment. The output file
```blosum_matrix.out``` should contain an output of the matrix like in the example of the BLOSUM matrix
given above. (The original BLOSUM62 matrix is provided as blosum62.txt in the group folder. Are
the scores of the matrix you have calculated for ```alignment.dat``` similar?)

Your program will need to.
(a) read in the alignment data in an appropriate data structure
(b) determine the log-odds scores for each possible alignment of amino acids,
(c) produce a (nicely formatted) output of the resulting scoring matrix.


In [17]:
# (a) read in the alignment data in an appropriate data structure
def read_file(file:str) -> list:
	"""Read file into list of lines"""
	with open (file, 'r') as f:
		return f.readlines()

file = './alignment.dat'
seqs = read_file(file)

In [18]:
def pre_processing(seqs:list) -> list:
	"""Pre-process alignment data:
	- Add padding at the end of shorter strings to make sure they're the same size
	- Remove newlines
	"""
	max_length = max([len(each) for each in seqs ])
	seqs = [ seq.ljust(max_length).replace('\n','') for seq in seqs ]
	return seqs

seqs = pre_processing(seqs)

In [19]:
# (b) determine the log-odds scores for each possible alignment of amino acids
def calc_score(seqs:list) -> dict:
	"""Determine score from list of pre-processed aligned sequences"""

	freq = fa("".join(seqs))
	p_freq = all_pair_freq(seqs)

	freq_rel = rel_freq(freq)
	p_freq_rel = rel_freq(p_freq)

	p_exp = expected_prob(freq_rel)
	p_score = pair_score(p_freq_rel, p_exp)

	return p_score

score = calc_score(seqs)

In [20]:
# (c) produce a (nicely formatted) output of the resulting scoring matrix.
def calc_score_matrix(score:dict,aa) -> np.ndarray:
	"""Generate a score matrix from a score dictionary"""
	# Create matrix of 0 of size (20,20)
	matrix = np.zeros((20,20),dtype=int)

	# Assign index to each aa
	aa_index = { key: i for i,key in enumerate(aa)}
	# Iterate through score dictionary and input values into matrix
	for key, val in score.items():
		matrix[aa_index[key[0]],aa_index[key[1]]] = val
		matrix[aa_index[key[1]],aa_index[key[0]]] = val

	return matrix

score_matrix = calc_score_matrix(score,aa)


In [21]:
def score_table(score_matrix:np.ndarray,aa, file,) -> pd.DataFrame:
	"""Turn a numpy score matrix into a dataframe"""
	df = pd.DataFrame(score_matrix, index=aa, columns=aa)

	with open(file, 'w') as f:
		print(df.to_string(), file=f)
		print(colored(f'Score matrix saved to "{file}"'), 'green')
	return df

In [22]:
score_table(score_matrix, aa, 'score_matrix.txt')

Score matrix saved to "score_matrix.txt"[0m green


Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,L,K,M,F,P,S,T,W,Y,V
A,5,-17,-18,-16,-19,-1,-14,-17,-1,-17,-2,-1,-20,-4,0,1,0,-23,-6,0
R,-17,6,-18,-18,-17,-17,-16,-16,-17,-18,-16,-13,-20,-18,-20,-17,-16,-21,-18,-18
N,-18,-18,7,-15,-22,-19,-15,-19,-20,-18,-22,-18,-23,-20,-19,-15,-16,-23,-21,-17
D,-16,-18,-15,6,-20,-21,-11,-17,-19,-17,-17,-17,-23,-18,-19,-17,-19,-23,-20,-16
C,-19,-17,-22,-20,8,-18,-17,-18,0,-19,-17,-20,-20,-17,-18,-18,-17,-22,-19,-18
Q,-1,-17,-19,-21,-18,7,-15,-20,3,-17,-2,1,-23,-3,-4,-3,-4,-23,-5,-4
E,-14,-16,-15,-11,-17,-15,5,-16,-21,-17,-16,-14,-17,-20,-14,-15,-16,-18,-18,-16
G,-17,-16,-19,-17,-18,-20,-16,7,-21,-19,-16,-18,-21,-21,-19,-14,-17,-16,-21,-17
H,-1,-17,-20,-19,0,3,-21,-21,8,-17,-3,-4,-21,-4,-2,-1,-3,-23,1,-3
I,-17,-18,-18,-17,-19,-17,-17,-19,-17,6,-14,-19,-19,-17,-18,-17,-17,-22,-19,-13


Some scores look good, some look off. Not sure why.

#### Run functions as script

In [23]:
%run diazdelser-blosum.py 'alignment.dat' 'blosum_matrix.out'

[32mScore matrix saved to "blosum_matrix.out"[0m


### Ex.3 _(13 pts)_ Global/local alignment

#### (a) _(5 pts)_ Implement the Needleman-Wunsch algorithm:
Write a program called needle.py that takes 4 parameters
and should be called like ```python needle.py -8 blosum.txt sequence1.fasta sequence2.fasta```, where -8 is the gap
penalty w, ```blosum.txt``` is a text file containing the scoring matrix and ```sequence1.fasta``` and
```sequence2.fasta``` are two amino acid sequences in Fasta format.

The program should output the score of the alignment and the aligned sequences, i.e. the sequences
plus the gaps (represented by the symbol -).

• Note that you will need to write a function that is able to read in a scoring matrix from a
text file and store it in a suitable data structure.

• Test your program using the scoring matrix blosum62.dat and a gap penalty of -5 on the
sequences:
THRQATWQPPLERMANGRQVE and RAYMQNDLVKVRYYACHT

In [24]:
seqA = 'THRQATWQPPLERMANGRQVE'
seqB = 'RAYMQNDLVKVRYYACHT'

In [25]:
def score_matrix_to_df(lines:list) -> (str, list):
	"""Read list of lines into dataframe"""
	lines = [line.strip() for line in lines]
	# Turn lines into list of scores
	lines =[ [ letter for letter in line.split() if letter !=' '] for line in lines ]
	# Separate header from rest of the strings
	header = lines[0]
	# Remove first item from lists (it's the aa)
	lines = [line[1:] for line in lines[1:]]
	# Generate pandas dataframe, remove emtpy rows
	df = pd.DataFrame(lines, columns=header).dropna(how='all')
	# Add aa symbols as row headers
	df = df.rename(dict(enumerate(header)))
	return df

Note: I read the ```hints for Handout 04``` too late and I was already using a dataframe.
In the future, will implement it with a dictionary

In [26]:
# Open score matrix file
text = read_file('./blosum/blosum62.dat')
# Read into df
score = score_matrix_to_df(text)

In [27]:
def create_alignment_matrix(seqA:str, seqB:str,w, score) -> (np.ndarray,np.ndarray):
	"""Create a Neeedleman-Wunsch alignment matrix"""
	#Initialize: M(0; 0) = 0
	matrix = np.zeros((len(seqA),len(seqB)),dtype=int)
	traceback = np.zeros((len(seqA),len(seqB)),dtype=tuple)
	# first row M(0; j) = jw for j = 1 ... m,
	for i in range(len(seqA)-1):
		matrix[i,0] = i*w
	# first column M(i; 0) = iw for i = 1 ... n
	for j in range(len(seqB)-1):
		matrix[0,j] = j*w
		pass
	# Fill Rest of the matrix
	for i in range(len(seqA)-1):
		for j in range(len(seqB)-1):
			options = {
				matrix[i-1,j-1] + int(score.loc[seqA[i], seqB[j]]) : (i-1,j-1),
				matrix[i-1,j] + w : (i-1,j),
				matrix[i,j-1] : (i,j-1),
			}
			best_option = max(options.keys())
			matrix[i,j] = best_option
			traceback[i,j] = options.get(best_option)
	return matrix, traceback


In [28]:
def traceback(i,j,traceback_matrix, seqA,seqB,alignement):
	"""Check traceback"""
	if traceback_matrix[i,j] == (i-1,j-1):
		alignement.append((seqA[j],seqB[i]))
		traceback(i-1,j-1,traceback_matrix, seqA,seqB,alignement)
	elif traceback_matrix[i,j] == (i-1,j):
		alignement.append((seqA[j],'-'))
		traceback(i-1,j,traceback_matrix, seqA,seqB,alignement)
	elif traceback_matrix[i,j] ==(i,j-1):
		alignement.append(('-',seqB[i]))
		traceback(i,j-1,traceback_matrix, seqA,seqB,alignement)
	return alignement


In [29]:
def print_alignment(alignement:list):
	"""Print sequence alignment"""
	x,y=zip(*alignement)
	print("".join(x))
	print("".join(['|' if x == y else ' ' for x,y in zip(x,y) ]))
	print("".join(y))

In [30]:
def total_score(alignement:list, score_matrix:pd.DataFrame) -> int or str:
	"""Calculate the total alignment score of a given alignment"""
	if alignement:
		x,y=zip(*alignement)
		scores = [ int(score_matrix.loc[x,y]) for x,y in zip(x,y) if (x != '-') and (y!= '-') ]
		return sum(scores)
	else:
		return 'N/A'

In [31]:
def Needleman_Wunsch(seqA:str, seqB:str, score_matrix:pd.DataFrame, w:int) -> int:
	"""Runs Neeedleman-Wunsch Algorithm"""
	# Construction of alignment matrix
	matrix, traceback_matrix = create_alignment_matrix(seqB,seqA, w,score_matrix)

	# Construct alignment via traceback matrix
	n = len(traceback_matrix)
	m = len(traceback_matrix[0])
	alignement = []
	alignement = traceback(n-2,m-2,traceback_matrix, seqA,seqB,alignement)

	print_alignment(alignement)
	return total_score(alignement, score_matrix)

In [32]:
# Try Needleman_Wunsch algorithm
s= Needleman_Wunsch(seqA=seqA, seqB=seqB, score_matrix=score, w=-3)
print('Score: ',s)

----NAAMMREELLPPQWW-A-R
      |  |   |  |   | |
HHHHH-A-YR-K-LDNQ-YAARR
Score:  27


#### (b) _(2 pts)_ Formatted output.
Modifiy the output of the alignment so that sequence 1 appears in
the first line and sequence 2 in the third line while the second line contains symbols reflecting
the similarity between aligned symbols:
 - for identical residues,
 - : for conservative substitutions (positive score in scoring matrix)
 - and spaces otherwise.

In [33]:
def grouper(n, iterable):
	if iterable:
		if n < len(iterable):
			args = [iter(iterable)] * n
			return zip(*args)
		else:
			return [iterable]

In [34]:
def print_alignment(alignement:list,score_matrix:pd.DataFrame,n:int=80):
	"""Print sequence alignment"""
	# Print out a max of 80 characters per line
	zipper = grouper(n,iterable=alignement)
	if zipper:
		for alignement_bit in zipper:
			X,Y=zip(*alignement_bit)
			score_symbol = []
			for x,y in zip(X,Y):
				if x == y:
					score_symbol.append('|')
				elif (x=='-') or (y=='-'):
					score_symbol.append(" ")
				elif int(score_matrix.loc[x,y]) > 0:
					score_symbol.append(':')
				else:
					score_symbol.append(" ")

			print("".join(X))
			print("".join(score_symbol))
			print("".join(Y))
	else:
		print("No alignment")

In [35]:
def Needleman_Wunsch(seqA:str, seqB:str, score_matrix:pd.DataFrame, w:int) -> int:
	"""Runs Neeedleman-Wunsch Algorithm"""
	# Construction of alignment matrix
	matrix, traceback_matrix = create_alignment_matrix(seqB,seqA, w,score_matrix)

	# Construct alignment via traceback matrix
	n = len(traceback_matrix)
	m = len(traceback_matrix[0])
	alignement = []
	alignement = traceback(n-2,m-2,traceback_matrix, seqA,seqB,alignement)
	print_alignment(alignement,score_matrix)
	return total_score(alignement, score_matrix)


In [36]:
# Try Needleman_Wunsch algorithm
s = Needleman_Wunsch(seqA=seqA, seqB=seqB, score_matrix=score, w=-3)
print(colored(f'\nScore: {s}','green'))

----NAAMMREELLPPQWW-A-R
    : |  | : |  | : | |
HHHHH-A-YR-K-LDNQ-YAARR
[32m
Score: 27[0m


#### (c) _(2 pts)_ Testing.
Perform all pairwise sequence alignments between sequences
```RNAS1_horse.fasta```, ```RNAS1_minke-whale.fasta```, and ```RNAS1_red-kangaroo.fasta``` using ```blosum50.dat```
and a gap penalty of -8. What can you conclude about the pairwise relationships?

In [37]:
%run diazdelser-needle.py './fastas/RNAS1_horse.fasta' './fastas/RNAS1_red-kangaroo.fasta' './fastas/RNAS1_red-kangaroo.fasta' './blosum/blosum50.txt' 8


[32mAlignment for: ./fastas/RNAS1_horse.fasta & ./fastas/RNAS1_red-kangaroo.fasta
[0m
---V-ADFHVPVY-NGECAVIIHREKQSTQYACNPYKSGG-TLRCDTIHMSSSSQYCNSQGNKCTINKQLCIAQVDALPE
     ||||||||  |||||||::   ||:| |||||| | | ||:|| : |:|:|||::|||||:|:| |:| | : ||
YYYYAADFHVPVYQQGECAVIIQKNLNSTEYQCNPYKS-GTTQRCNTISLRSNSKYCNTRGNKCTVNEQHCVADVVSKPE


[32m
Total score: 596[0m


#### (d) _(2 pt)_ Smith-Waterman.
Implement the Smith-Waterman algorithm by modifying the code from
```needle.py```. In addition to the alignment score you should output the sequence similarity and the
sequence identity of the two aligned sequences in percent. Sequence identity is the percentage
of matching residues relative to the length of the aligned sequences including gaps, sequence
similarity is the percentage of matched similar residues (i.e., those with a positive value in the
scoring matrix) relative to the length of the aligned sequences including gaps.

In [38]:
def create_alignment_matrix(seqA:str, seqB:str,w, score) -> (np.ndarray,np.ndarray):
	"""Create a Smith-Waterman alignment matrix"""
	#Initialize: M(0; 0) = 0
	matrix = np.zeros((len(seqA),len(seqB)),dtype=int)
	traceback = np.zeros((len(seqA),len(seqB)),dtype=tuple)
	# first row M(0; j) = jw for j = 1 ... m,
	for i in range(len(seqA)-1):
		matrix[i,0] = i*w
	# first column M(i; 0) = iw for i = 1 ... n
	for j in range(len(seqB)-1):
		matrix[0,j] = j*w
		pass
	# Fill Rest of the matrix
	for i in range(len(seqA)-1):
		for j in range(len(seqB)-1):
			options = {
				matrix[i-1,j-1] + int(score.loc[seqA[i], seqB[j]]) : (i-1,j-1),
				matrix[i-1,j] + w : (i-1,j),
				matrix[i,j-1] : (i,j-1),
				0: 0
			}
			best_option = max(options.keys())
			matrix[i,j] = best_option
			traceback[i,j] = options.get(best_option)
	return matrix, traceback


In [39]:
def traceback(i,j,traceback_matrix, seqA,seqB,alignement):
	"""Check traceback"""
	if traceback_matrix[i,j] == (i-1,j-1):
		alignement.append((seqA[j],seqB[i]))
		traceback(i-1,j-1,traceback_matrix, seqA,seqB,alignement)
	elif traceback_matrix[i,j] == (i-1,j):
		alignement.append((seqA[j],'-'))
		traceback(i-1,j,traceback_matrix, seqA,seqB,alignement)
	elif traceback_matrix[i,j] ==(i,j-1):
		alignement.append(('-',seqB[i]))
		traceback(i,j-1,traceback_matrix, seqA,seqB,alignement)
	elif traceback_matrix[i,j] ==0:
		return alignement

In [40]:
def Smith_Waterman(seqA:str, seqB:str, score_matrix:pd.DataFrame, w:int) -> int:
	"""Runs Neeedleman-Wunsch Algorithm"""
	# Construction of alignment matrix
	matrix, traceback_matrix = create_alignment_matrix(seqB,seqA, w,score_matrix)

	# Construct alignment via traceback matrix
	ind = np.unravel_index(np.argmax(score_matrix, axis=None), score_matrix.shape)  # returns a tuple
	n,m = ind
	alignement = []
	alignement = traceback(n,m,traceback_matrix, seqA,seqB,alignement)
	print_alignment(alignement,score_matrix)
	return total_score(alignement, score_matrix)

In [41]:
# Try Smith_Waterman algorithm
s = Smith_Waterman(seqA=seqA, seqB=seqB, score_matrix=score, w=-3)

No alignment


#### (e) _(2 pt)_ Testing.
Run a global and a local sequence alignment on the sequences ```halodurans.fasta```
and ```lentus.fasta``` using ```blosum62.txt``` and a gap penalty of -8. Which is more appropriate?

In [42]:
%run diazdelser-waterman.py './fastas/halodurans.fasta' './fastas/lentus.fasta' './blosum/blosum62.txt' 8

[32mAlignment for: ./fastas/halodurans.fasta & ./fastas/lentus.fasta
[0m
No alignment
[32m
Total score: N/A[0m


In [43]:
%run diazdelser-needle.py './fastas/halodurans.fasta' './fastas/lentus.fasta' './blosum/blosum62.txt' 8




[32mAlignment for: ./fastas/halodurans.fasta & ./fastas/lentus.fasta
[0m
TARGAHVLGNGYLSPSGLYTATQNIRQRIQNNTYSPYRSKVLAAVGAVHPTAMSTGSLSVYRNGTYTSNVNVGPASIEIE
||  |:|||:|||: ||| |||  :  ||| |::|| : ||||| |||||||||||:|| | :| ||| |||||| |:: 
TAAEANVLGSGYLNTSGLSTATNKLHNRIQVNSWSPNKQKVLAAAGAVHPTAMSTGNLSAYTSGPYTSQVNVGPAVIDLG


PGYTSFSARQGNQDVAAVAMVGSYRAPYNVGG-RGTNGAAGVLLIGANNARNVALELTSSGSTSGLSMNIIHMNNNIAWE
 || |||||  ||| | |||  :|||||:: |  |:||:| |:|:| : | ||| |||:| | ||||:| :|| || |||
AGYQSFSARNNNQDTAGVAMANAYRAPYSI-GAAGSNGSAAVVLVGRSTASNVAQELTASPSPSGLSLNAVHMGNNGAWE


IGQAVSALSGSGNRDLVKVAYLDASPAVGLVGISNNLAAITGAVHTGHGNNDHYSPESSIFSAGGAIRLDPHSAIGTDLV
:|||:|::||||:  |||||||:||||||||||||||||||||||||||| |  |||  :||||| | ||||::||||||
LGQAISSVSGSGSAGLVKVAYLEASPAVGLVGISNNLAAITGAVHTGHGNGDQTSPEGPVFSAGGRINLDPHTSIGTDLV


AVRAGNGFIGRNH--------------------A----A---P-------------A------------------EV-Q-
||: |:| :||||                    |    |   |             |                  :| : 
AVKVGSGTLGRNHAAAAAAAAAAAAAAA

Clearly, something is wrong with my Smith_Waterman algorithm. I think it might have something to do
with the scoring of the mismatches, but I'm not sure how to fix it.