In [None]:
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive into the Colab runtime.
from google.colab import drive
drive.mount("/content/drive/")
# Switch to the project folder; the read_csv calls below use paths relative to it.
%cd '/content/drive/My Drive/DS320'
# Print the working directory to confirm the change took effect.
!pwd

Mounted at /content/drive/
/content/drive/My Drive/DS320
/content/drive/My Drive/DS320


In [None]:
# Read the five yearly report CSVs and the suicide statistics CSV from the working directory
(
    GDP_Happiness_2015,
    GDP_Happiness_2016,
    GDP_Happiness_2017,
    GDP_Happiness_2018,
    GDP_Happiness_2019,
) = [
    pd.read_csv(csv_name)
    for csv_name in ("2015.csv", "2016.csv", "2017.csv", "2018.csv", "2019.csv")
]
Suicide_before_modified = pd.read_csv("who_suicide_statistics.csv")

In [None]:
# Pre-processing for GDP_Happiness Data : drop the columns this analysis does not use.
# The column layout differs between report years, so each layout has its own drop list.
unused_columns_2015 = ['Region','Happiness Rank','Standard Error','Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual']
unused_columns_2016 = ['Region','Happiness Rank','Lower Confidence Interval','Upper Confidence Interval', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual']
unused_columns_2017 = ['Happiness.Rank','Whisker.high','Whisker.low', 'Family', 'Health..Life.Expectancy.', 'Freedom', 'Trust..Government.Corruption.', 'Generosity', 'Dystopia.Residual']
unused_columns_2018_19 = ['Overall rank','Social support','Healthy life expectancy','Freedom to make life choices', 'Generosity', 'Perceptions of corruption']

# Rename maps that bring the differently named columns onto one shared set of names
shared_names_2015_16 = {'Country':'country'}
shared_names_2017 = {'Country':'country', 'Happiness.Score':'Happiness Score', 'Economy..GDP.per.Capita.':'Economy (GDP per Capita)'}
shared_names_2018_19 = {'Country or region':'country', 'Score':'Happiness Score', 'GDP per capita':'Economy (GDP per Capita)'}

GH2015 = GDP_Happiness_2015.drop(columns=unused_columns_2015).rename(columns=shared_names_2015_16)
GH2016 = GDP_Happiness_2016.drop(columns=unused_columns_2016).rename(columns=shared_names_2015_16)
GH2017 = GDP_Happiness_2017.drop(columns=unused_columns_2017).rename(columns=shared_names_2017)
GH2018 = GDP_Happiness_2018.drop(columns=unused_columns_2018_19).rename(columns=shared_names_2018_19)
GH2019 = GDP_Happiness_2019.drop(columns=unused_columns_2018_19).rename(columns=shared_names_2018_19)

# Tag every frame with its report year (stored as a string) in the first column
for year_label, yearly_frame in (
    ('2015', GH2015),
    ('2016', GH2016),
    ('2017', GH2017),
    ('2018', GH2018),
    ('2019', GH2019),
):
    yearly_frame.insert(0, 'Year', year_label)

In [None]:
# Build the combined happiness table; only the 2015 frame is included in the list.
yearly_frames = [GH2015]
GH = pd.concat(yearly_frames, ignore_index=True)
GH

Unnamed: 0,Year,country,Happiness Score,Economy (GDP per Capita)
0,2015,Switzerland,7.587,1.39651
1,2015,Iceland,7.561,1.30232
2,2015,Denmark,7.527,1.32548
3,2015,Norway,7.522,1.45900
4,2015,Canada,7.427,1.32629
...,...,...,...,...
153,2015,Rwanda,3.465,0.22208
154,2015,Benin,3.340,0.28665
155,2015,Syria,3.006,0.66320
156,2015,Burundi,2.905,0.01530


In [None]:
# Pre-processing for Suicide Data : collapse the sex/age breakdown into one row per country and year
Suicide = (
    Suicide_before_modified
    .drop(columns=['sex', 'age'])
    # NaN entries are turned into zero(0) before the totals are summed
    .replace(np.nan, 0)
    .groupby(by=['country','year'], as_index = False)
    .sum()
)
# Suicide rate as a percentage of the summed population, placed right after the key columns
suicide_rate = Suicide['suicides_no'] / Suicide['population'] * 100
Suicide.insert(2, 'suicide_rate(%)', suicide_rate)

In [None]:
# Keep only the year under investigation (2015)
is_2015 = Suicide['year'] == 2015
Suicide_2015_16 = Suicide[is_2015]
# Treat rows whose population total is not above zero(0) as missing data and drop them
has_population = Suicide_2015_16['population'] > 0
Suicide_Final = Suicide_2015_16[has_population]
Suicide_Final

Unnamed: 0,country,year,suicide_rate(%),suicides_no,population
30,Albania,2015,0.000000,0.0,2719684.0
87,Antigua and Barbuda,2015,0.001088,1.0,91889.0
124,Argentina,2015,0.007741,3073.0,39699624.0
158,Armenia,2015,0.002647,74.0,2795335.0
179,Aruba,2015,0.009117,9.0,98712.0
...,...,...,...,...,...
3448,Ukraine,2015,0.018773,7574.0,40345446.0
3491,United Kingdom,2015,0.008038,4910.0,61082942.0
3529,United States of America,2015,0.014726,44189.0,300078511.0
3564,Uruguay,2015,0.019744,630.0,3190795.0


In [None]:
# Left-join the happiness table onto the suicide table by country name,
# so every suicide-data row is kept even when no happiness row matches.
Joined_data = Suicide_Final.merge(GH, how='left', on='country')
Joined_data

Unnamed: 0,country,year,suicide_rate(%),suicides_no,population,Year,Happiness Score,Economy (GDP per Capita)
0,Albania,2015,0.000000,0.0,2719684.0,2015,4.959,0.87867
1,Antigua and Barbuda,2015,0.001088,1.0,91889.0,,,
2,Argentina,2015,0.007741,3073.0,39699624.0,2015,6.574,1.05351
3,Armenia,2015,0.002647,74.0,2795335.0,2015,4.350,0.76821
4,Aruba,2015,0.009117,9.0,98712.0,,,
...,...,...,...,...,...,...,...,...
68,Ukraine,2015,0.018773,7574.0,40345446.0,2015,4.681,0.79907
69,United Kingdom,2015,0.008038,4910.0,61082942.0,2015,6.867,1.26637
70,United States of America,2015,0.014726,44189.0,300078511.0,,,
71,Uruguay,2015,0.019744,630.0,3190795.0,2015,6.485,1.06166


In [None]:
# Collect the rows where the left join produced no happiness score (heterogeneity candidates)
lacks_happiness_score = Joined_data['Happiness Score'].isna()
Heterogeneity = Joined_data.loc[lacks_happiness_score]
Heterogeneity

Unnamed: 0,country,year,suicide_rate(%),suicides_no,population,Year,Happiness Score,Economy (GDP per Capita)
1,Antigua and Barbuda,2015,0.001088,1.0,91889.0,,,
4,Aruba,2015,0.009117,9.0,98712.0,,,
9,Belize,2015,0.008129,26.0,319835.0,,,
11,Brunei Darussalam,2015,0.001823,7.0,384080.0,,,
15,Cuba,2015,0.013949,1511.0,10832068.0,,,
26,Grenada,2015,0.0,0.0,96892.0,,,
28,Hong Kong SAR,2015,0.014124,990.0,7009500.0,,,
31,Iran (Islamic Rep of),2015,0.003273,2372.0,72460999.0,,,
48,Puerto Rico,2015,0.006514,226.0,3469521.0,,,
50,Republic of Korea,2015,0.027757,13510.0,48671752.0,,,


In [None]:
from numpy import full



# Create blank scoring and traceback grids for the two names being compared
seq1 = 'Malta'
seq2 = 'Egypt'

# One extra row and column for the leading "-" position
n_rows = len(seq1) + 1
n_columns = len(seq2) + 1
grid_shape = [n_rows, n_columns]

scoring_array = full(grid_shape, 0)
traceback_array = full(grid_shape, "-")

print("Scoring array:\n",scoring_array)
print("Traceback array:\n",traceback_array)

Scoring array:
 [[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
Traceback array:
 [['-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-']]


In [None]:
from IPython.core.display import HTML,display
import pandas as pd

def pretty_table_from_array(data_array, row_labels,col_labels):
    """Wrap a 2d numpy array in a labelled DataFrame and return it as an HTML table object."""
    labelled_frame = pd.DataFrame(data=data_array, index=row_labels, columns=col_labels)
    return HTML(labelled_frame.to_html())

In [None]:
# Label each grid row/column with the matching character, with "-" for the leading gap position
row_labels = list("-"+seq1)
column_labels = list("-"+seq2)

for caption, grid in (("Scoring array:", scoring_array), ("Traceback array:", traceback_array)):
    print(caption)
    display(pretty_table_from_array(grid,row_labels,column_labels))

Scoring array:


Unnamed: 0,-,E,g,y,p,t
-,0,0,0,0,0,0
M,0,0,0,0,0,0
a,0,0,0,0,0,0
l,0,0,0,0,0,0
t,0,0,0,0,0,0
a,0,0,0,0,0,0


Traceback array:


Unnamed: 0,-,E,g,y,p,t
-,-,-,-,-,-,-
M,-,-,-,-,-,-
a,-,-,-,-,-,-
l,-,-,-,-,-,-
t,-,-,-,-,-,-
a,-,-,-,-,-,-


In [None]:
# Demo: number the cells in row-major order to show how the grid is indexed
count = 0
for row_index, col_index in np.ndindex(n_rows, n_columns):
    scoring_array[row_index, col_index] = count
    count += 1

display(pretty_table_from_array(scoring_array,row_labels,column_labels))

Unnamed: 0,-,E,g,y,p,t
-,0,1,2,3,4,5
M,6,7,8,9,10,11
a,12,13,14,15,16,17
l,18,19,20,21,22,23
t,24,25,26,27,28,29
a,30,31,32,33,34,35


In [None]:
# Unicode arrow characters available for marking directions in the traceback grid
up_arrow, right_arrow, down_arrow, left_arrow = "\u2191", "\u2192", "\u2193", "\u2190"
down_right_arrow, up_left_arrow = "\u2198", "\u2196"

for arrow_caption, arrow_glyph in (
    ("Up arrow", up_arrow),
    ("Left arrow", left_arrow),
    ("Up Left arrow", up_left_arrow),
):
    print(arrow_caption, arrow_glyph)


Up arrow ↑
Left arrow ←
Up Left arrow ↖


In [None]:
# Fill the scoring and traceback grids for seq1 (rows) against seq2 (columns).
n_rows = len(seq1) + 1
n_columns = len(seq2) + 1
row_labels = list("-"+seq1)
column_labels = list("-"+seq2)

grid_shape = [n_rows, n_columns]
scoring_array = full(grid_shape, 0)
traceback_array = full(grid_shape, "-")

# Arrow characters written into the traceback grid
up_arrow, right_arrow, down_arrow, left_arrow = "\u2191", "\u2192", "\u2193", "\u2190"
down_right_arrow, up_left_arrow = "\u2198", "\u2196"

# Scoring scheme
arrow = "-"
gap_penalty = -1
match_bonus = 4
mismatch_penalty = -1

for row in range(n_rows):
    for col in range(n_columns):
        if row > 0 and col > 0:
            # Interior cell: best of a gap from the left, a gap from above, or a diagonal step
            from_left_score = scoring_array[row, col-1] + gap_penalty
            from_above_score = scoring_array[row-1, col] + gap_penalty
            characters_match = seq1[row-1] == seq2[col-1]
            diagonal_left_cell_score = scoring_array[row-1, col-1] + (
                match_bonus if characters_match else mismatch_penalty
            )
            score = max(from_left_score, from_above_score, diagonal_left_cell_score)
            # On ties the first listed direction wins: left, then up, then diagonal
            candidates = (
                (from_left_score, left_arrow),
                (from_above_score, up_arrow),
                (diagonal_left_cell_score, up_left_arrow),
            )
            arrow = next(
                candidate_arrow
                for candidate_score, candidate_arrow in candidates
                if candidate_score == score
            )
        elif col > 0:
            # Top row: only reachable from the left
            score = scoring_array[row, col-1] + gap_penalty
            arrow = left_arrow
        elif row > 0:
            # First column: only reachable from above
            score = scoring_array[row-1, col] + gap_penalty
            arrow = up_arrow
        else:
            # Origin cell
            score = 0
            arrow = "-"

        traceback_array[row,col] = arrow
        scoring_array[row,col] = score

display(pretty_table_from_array(scoring_array,row_labels,column_labels))
display(pretty_table_from_array(traceback_array,row_labels,column_labels))

Unnamed: 0,-,E,g,y,p,t
-,0,-1,-2,-3,-4,-5
M,-1,-1,-2,-3,-4,-5
a,-2,-2,-2,-3,-4,-5
l,-3,-3,-3,-3,-4,-5
t,-4,-4,-4,-4,-4,0
a,-5,-5,-5,-5,-5,-1


Unnamed: 0,-,E,g,y,p,t
-,-,←,←,←,←,←
M,↑,↖,←,←,←,←
a,↑,↑,↖,←,←,←
l,↑,↑,↑,↖,←,←
t,↑,↑,↑,↑,↖,↖
a,↑,↑,↑,↑,↑,↑


In [None]:
def traceback_alignment(traceback_array,seq1,seq2,up_arrow = "\u2191" ,\
                        left_arrow="\u2190",up_left_arrow="\u2196",stop="-",
                        verbose=True):
    """Rebuild the alignment of seq1 and seq2 by following a traceback array.

    The walk starts at cell [len(seq1), len(seq2)] and follows the arrow stored
    in each visited cell until a cell holding `stop` is reached.

    Parameters:
        traceback_array: 2d array indexed as traceback_array[row, col]; rows
            correspond to seq1 and columns to seq2, each with one leading
            extra position.
        seq1, seq2: the two strings that were aligned.
        up_arrow, left_arrow, up_left_arrow: cell values meaning "move up",
            "move left" and "move diagonally up-left".
        stop: cell value that ends the walk.
        verbose: when True (default) print the position, arrow and partial
            alignment at every step; pass False to run silently.

    Returns:
        (aligned_seq1, aligned_seq2), with "-" inserted where one sequence
        has a gap relative to the other.

    Raises:
        ValueError: if a visited cell holds a value that is neither an arrow
            nor `stop`.
    """
    log = print if verbose else (lambda *args: None)

    row = len(seq1)
    col = len(seq2)
    aligned_seq1 = ""
    aligned_seq2 = ""
    alignment_indicator = ""
    # The loop ends only through the `stop` branch below. The previous
    # `arrow is not "-"` condition compared by identity, which is never a
    # reliable string test and ignored the `stop` parameter.
    while True:
        log("Currently on row:",row)
        log("Currently on col:",col)
        arrow = traceback_array[row,col]
        log("Arrow:",arrow)

        if arrow == up_arrow:
            log("insert indel into top sequence")
            # Grow the alignment leftwards: seq1 consumes a character, seq2 gets a gap
            aligned_seq2 = "-"+aligned_seq2
            aligned_seq1 = seq1[row-1] + aligned_seq1
            alignment_indicator = " "+alignment_indicator
            row -=1

        elif arrow == up_left_arrow:
            log("match or mismatch")
            # row-1 / col-1 because the array has an extra leading "-"
            # position in front of each sequence
            seq1_character = seq1[row-1]
            seq2_character = seq2[col-1]
            aligned_seq1 = seq1_character + aligned_seq1
            aligned_seq2 = seq2_character + aligned_seq2
            if seq1_character == seq2_character:
                alignment_indicator = "|"+alignment_indicator
            else:
                alignment_indicator = " "+alignment_indicator
            row -=1
            col -=1

        elif arrow == left_arrow:
            log("Insert indel into left sequence")
            # seq2 consumes a character, seq1 gets a gap
            aligned_seq1 = "-"+aligned_seq1
            aligned_seq2 = seq2[col-1] + aligned_seq2
            alignment_indicator = " "+alignment_indicator
            col -=1

        elif arrow == stop:
            break
        else:
            raise ValueError(f"Traceback array entry at {row},{col}: {arrow} is not recognized.")
        log(aligned_seq1)
        log(alignment_indicator)
        log(aligned_seq2)

    return aligned_seq1,aligned_seq2
# Walk the filled traceback array to get the aligned forms of seq1 and seq2.
traceback_alignment(traceback_array,seq1,seq2)

Currently on row: 5
Currently on col: 5
Arrow: ↑
insert indel into top sequence
a
 
-
Currently on row: 4
Currently on col: 5
Arrow: ↖
match or mismatch
ta
| 
t-
Currently on row: 3
Currently on col: 4
Arrow: ←
Insert indel into left sequence
-ta
 | 
pt-
Currently on row: 3
Currently on col: 3
Arrow: ↖
match or mismatch
l-ta
  | 
ypt-
Currently on row: 2
Currently on col: 2
Arrow: ↖
match or mismatch
al-ta
   | 
gypt-
Currently on row: 1
Currently on col: 1
Arrow: ↖
match or mismatch
Mal-ta
    | 
Egypt-
Currently on row: 0
Currently on col: 0
Arrow: -


('Mal-ta', 'Egypt-')

In [None]:
from numpy import full



# Create blank scoring and traceback grids for the two names being compared
seq1 = 'Hong Kong'
seq2 = 'Hong Kong SAR'

# One extra row and column for the leading "-" position
n_rows = len(seq1) + 1
n_columns = len(seq2) + 1
grid_shape = [n_rows, n_columns]

scoring_array = full(grid_shape, 0)
traceback_array = full(grid_shape, "-")

print("Scoring array:\n",scoring_array)
print("Traceback array:\n",traceback_array)

Scoring array:
 [[0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Traceback array:
 [['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']]


In [None]:
from IPython.core.display import HTML,display
import pandas as pd

# NOTE(review): an identical definition of this function appears earlier in the notebook.
def pretty_table_from_array(data_array, row_labels,col_labels):
    """Wrap a 2d numpy array in a labelled DataFrame and return it as an HTML table object."""
    labelled_frame = pd.DataFrame(data=data_array, index=row_labels, columns=col_labels)
    return HTML(labelled_frame.to_html())

In [None]:
# Label each grid row/column with the matching character, with "-" for the leading gap position
row_labels = list("-"+seq1)
column_labels = list("-"+seq2)

for caption, grid in (("Scoring array:", scoring_array), ("Traceback array:", traceback_array)):
    print(caption)
    display(pretty_table_from_array(grid,row_labels,column_labels))

Scoring array:


Unnamed: 0,-,H,o,n,g,Unnamed: 6,K,o.1,n.1,g.1,Unnamed: 11,S,A,R
-,0,0,0,0,0,0,0,0,0,0,0,0,0,0
H,0,0,0,0,0,0,0,0,0,0,0,0,0,0
o,0,0,0,0,0,0,0,0,0,0,0,0,0,0
n,0,0,0,0,0,0,0,0,0,0,0,0,0,0
g,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0
K,0,0,0,0,0,0,0,0,0,0,0,0,0,0
o,0,0,0,0,0,0,0,0,0,0,0,0,0,0
n,0,0,0,0,0,0,0,0,0,0,0,0,0,0
g,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Traceback array:


Unnamed: 0,-,H,o,n,g,Unnamed: 6,K,o.1,n.1,g.1,Unnamed: 11,S,A,R
-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
H,-,-,-,-,-,-,-,-,-,-,-,-,-,-
o,-,-,-,-,-,-,-,-,-,-,-,-,-,-
n,-,-,-,-,-,-,-,-,-,-,-,-,-,-
g,-,-,-,-,-,-,-,-,-,-,-,-,-,-
,-,-,-,-,-,-,-,-,-,-,-,-,-,-
K,-,-,-,-,-,-,-,-,-,-,-,-,-,-
o,-,-,-,-,-,-,-,-,-,-,-,-,-,-
n,-,-,-,-,-,-,-,-,-,-,-,-,-,-
g,-,-,-,-,-,-,-,-,-,-,-,-,-,-


In [None]:
# Demo: number the cells in row-major order to show how the grid is indexed
count = 0
for row_index, col_index in np.ndindex(n_rows, n_columns):
    scoring_array[row_index, col_index] = count
    count += 1

display(pretty_table_from_array(scoring_array,row_labels,column_labels))

Unnamed: 0,-,H,o,n,g,Unnamed: 6,K,o.1,n.1,g.1,Unnamed: 11,S,A,R
-,0,1,2,3,4,5,6,7,8,9,10,11,12,13
H,14,15,16,17,18,19,20,21,22,23,24,25,26,27
o,28,29,30,31,32,33,34,35,36,37,38,39,40,41
n,42,43,44,45,46,47,48,49,50,51,52,53,54,55
g,56,57,58,59,60,61,62,63,64,65,66,67,68,69
,70,71,72,73,74,75,76,77,78,79,80,81,82,83
K,84,85,86,87,88,89,90,91,92,93,94,95,96,97
o,98,99,100,101,102,103,104,105,106,107,108,109,110,111
n,112,113,114,115,116,117,118,119,120,121,122,123,124,125
g,126,127,128,129,130,131,132,133,134,135,136,137,138,139


In [None]:
# Unicode arrow characters available for marking directions in the traceback grid
up_arrow, right_arrow, down_arrow, left_arrow = "\u2191", "\u2192", "\u2193", "\u2190"
down_right_arrow, up_left_arrow = "\u2198", "\u2196"

for arrow_caption, arrow_glyph in (
    ("Up arrow", up_arrow),
    ("Left arrow", left_arrow),
    ("Up Left arrow", up_left_arrow),
):
    print(arrow_caption, arrow_glyph)


Up arrow ↑
Left arrow ←
Up Left arrow ↖


In [None]:
# Fill the scoring and traceback grids for seq1 (rows) against seq2 (columns).
n_rows = len(seq1) + 1
n_columns = len(seq2) + 1
row_labels = list("-"+seq1)
column_labels = list("-"+seq2)

grid_shape = [n_rows, n_columns]
scoring_array = full(grid_shape, 0)
traceback_array = full(grid_shape, "-")

# Arrow characters written into the traceback grid
up_arrow, right_arrow, down_arrow, left_arrow = "\u2191", "\u2192", "\u2193", "\u2190"
down_right_arrow, up_left_arrow = "\u2198", "\u2196"

# Scoring scheme
arrow = "-"
gap_penalty = -1
match_bonus = 4
mismatch_penalty = -1

for row in range(n_rows):
    for col in range(n_columns):
        if row > 0 and col > 0:
            # Interior cell: best of a gap from the left, a gap from above, or a diagonal step
            from_left_score = scoring_array[row, col-1] + gap_penalty
            from_above_score = scoring_array[row-1, col] + gap_penalty
            characters_match = seq1[row-1] == seq2[col-1]
            diagonal_left_cell_score = scoring_array[row-1, col-1] + (
                match_bonus if characters_match else mismatch_penalty
            )
            score = max(from_left_score, from_above_score, diagonal_left_cell_score)
            # On ties the first listed direction wins: left, then up, then diagonal
            candidates = (
                (from_left_score, left_arrow),
                (from_above_score, up_arrow),
                (diagonal_left_cell_score, up_left_arrow),
            )
            arrow = next(
                candidate_arrow
                for candidate_score, candidate_arrow in candidates
                if candidate_score == score
            )
        elif col > 0:
            # Top row: only reachable from the left
            score = scoring_array[row, col-1] + gap_penalty
            arrow = left_arrow
        elif row > 0:
            # First column: only reachable from above
            score = scoring_array[row-1, col] + gap_penalty
            arrow = up_arrow
        else:
            # Origin cell
            score = 0
            arrow = "-"

        traceback_array[row,col] = arrow
        scoring_array[row,col] = score

display(pretty_table_from_array(scoring_array,row_labels,column_labels))
display(pretty_table_from_array(traceback_array,row_labels,column_labels))

Unnamed: 0,-,H,o,n,g,Unnamed: 6,K,o.1,n.1,g.1,Unnamed: 11,S,A,R
-,0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13
H,-1,4,3,2,1,0,-1,-2,-3,-4,-5,-6,-7,-8
o,-2,3,8,7,6,5,4,3,2,1,0,-1,-2,-3
n,-3,2,7,12,11,10,9,8,7,6,5,4,3,2
g,-4,1,6,11,16,15,14,13,12,11,10,9,8,7
,-5,0,5,10,15,20,19,18,17,16,15,14,13,12
K,-6,-1,4,9,14,19,24,23,22,21,20,19,18,17
o,-7,-2,3,8,13,18,23,28,27,26,25,24,23,22
n,-8,-3,2,7,12,17,22,27,32,31,30,29,28,27
g,-9,-4,1,6,11,16,21,26,31,36,35,34,33,32


Unnamed: 0,-,H,o,n,g,Unnamed: 6,K,o.1,n.1,g.1,Unnamed: 11,S,A,R
-,-,←,←,←,←,←,←,←,←,←,←,←,←,←
H,↑,↖,←,←,←,←,←,←,←,←,←,←,←,←
o,↑,↑,↖,←,←,←,←,←,←,←,←,←,←,←
n,↑,↑,↑,↖,←,←,←,←,←,←,←,←,←,←
g,↑,↑,↑,↑,↖,←,←,←,←,←,←,←,←,←
,↑,↑,↑,↑,↑,↖,←,←,←,←,←,←,←,←
K,↑,↑,↑,↑,↑,↑,↖,←,←,←,←,←,←,←
o,↑,↑,↑,↑,↑,↑,↑,↖,←,←,←,←,←,←
n,↑,↑,↑,↑,↑,↑,↑,↑,↖,←,←,←,←,←
g,↑,↑,↑,↑,↑,↑,↑,↑,↑,↖,←,←,←,←


In [None]:
def traceback_alignment(traceback_array,seq1,seq2,up_arrow = "\u2191" ,\
                        left_arrow="\u2190",up_left_arrow="\u2196",stop="-",
                        verbose=True):
    """Rebuild the alignment of seq1 and seq2 by following a traceback array.

    The walk starts at cell [len(seq1), len(seq2)] and follows the arrow stored
    in each visited cell until a cell holding `stop` is reached.

    Parameters:
        traceback_array: 2d array indexed as traceback_array[row, col]; rows
            correspond to seq1 and columns to seq2, each with one leading
            extra position.
        seq1, seq2: the two strings that were aligned.
        up_arrow, left_arrow, up_left_arrow: cell values meaning "move up",
            "move left" and "move diagonally up-left".
        stop: cell value that ends the walk.
        verbose: when True (default) print the position, arrow and partial
            alignment at every step; pass False to run silently.

    Returns:
        (aligned_seq1, aligned_seq2), with "-" inserted where one sequence
        has a gap relative to the other.

    Raises:
        ValueError: if a visited cell holds a value that is neither an arrow
            nor `stop`.
    """
    log = print if verbose else (lambda *args: None)

    row = len(seq1)
    col = len(seq2)
    aligned_seq1 = ""
    aligned_seq2 = ""
    alignment_indicator = ""
    # The loop ends only through the `stop` branch below. The previous
    # `arrow is not "-"` condition compared by identity, which is never a
    # reliable string test and ignored the `stop` parameter.
    while True:
        log("Currently on row:",row)
        log("Currently on col:",col)
        arrow = traceback_array[row,col]
        log("Arrow:",arrow)

        if arrow == up_arrow:
            log("insert indel into top sequence")
            # Grow the alignment leftwards: seq1 consumes a character, seq2 gets a gap
            aligned_seq2 = "-"+aligned_seq2
            aligned_seq1 = seq1[row-1] + aligned_seq1
            alignment_indicator = " "+alignment_indicator
            row -=1

        elif arrow == up_left_arrow:
            log("match or mismatch")
            # row-1 / col-1 because the array has an extra leading "-"
            # position in front of each sequence
            seq1_character = seq1[row-1]
            seq2_character = seq2[col-1]
            aligned_seq1 = seq1_character + aligned_seq1
            aligned_seq2 = seq2_character + aligned_seq2
            if seq1_character == seq2_character:
                alignment_indicator = "|"+alignment_indicator
            else:
                alignment_indicator = " "+alignment_indicator
            row -=1
            col -=1

        elif arrow == left_arrow:
            log("Insert indel into left sequence")
            # seq2 consumes a character, seq1 gets a gap
            aligned_seq1 = "-"+aligned_seq1
            aligned_seq2 = seq2[col-1] + aligned_seq2
            alignment_indicator = " "+alignment_indicator
            col -=1

        elif arrow == stop:
            break
        else:
            raise ValueError(f"Traceback array entry at {row},{col}: {arrow} is not recognized.")
        log(aligned_seq1)
        log(alignment_indicator)
        log(aligned_seq2)

    return aligned_seq1,aligned_seq2
# Walk the filled traceback array to get the aligned forms of seq1 and seq2.
traceback_alignment(traceback_array,seq1,seq2)

Currently on row: 9
Currently on col: 13
Arrow: ←
Insert indel into left sequence
-
 
R
Currently on row: 9
Currently on col: 12
Arrow: ←
Insert indel into left sequence
--
  
AR
Currently on row: 9
Currently on col: 11
Arrow: ←
Insert indel into left sequence
---
   
SAR
Currently on row: 9
Currently on col: 10
Arrow: ←
Insert indel into left sequence
----
    
 SAR
Currently on row: 9
Currently on col: 9
Arrow: ↖
match or mismatch
g----
|    
g SAR
Currently on row: 8
Currently on col: 8
Arrow: ↖
match or mismatch
ng----
||    
ng SAR
Currently on row: 7
Currently on col: 7
Arrow: ↖
match or mismatch
ong----
|||    
ong SAR
Currently on row: 6
Currently on col: 6
Arrow: ↖
match or mismatch
Kong----
||||    
Kong SAR
Currently on row: 5
Currently on col: 5
Arrow: ↖
match or mismatch
 Kong----
|||||    
 Kong SAR
Currently on row: 4
Currently on col: 4
Arrow: ↖
match or mismatch
g Kong----
||||||    
g Kong SAR
Currently on row: 3
Currently on col: 3
Arrow: ↖
match or mismatch
ng Kong-

('Hong Kong----', 'Hong Kong SAR')

In [None]:
from numpy import full



# Create blank scoring and traceback grids for the two names being compared
seq1 = 'Republic of Korea'
seq2 = 'South Korea'

# One extra row and column for the leading "-" position
n_rows = len(seq1) + 1
n_columns = len(seq2) + 1
grid_shape = [n_rows, n_columns]

scoring_array = full(grid_shape, 0)
traceback_array = full(grid_shape, "-")

print("Scoring array:\n",scoring_array)
print("Traceback array:\n",traceback_array)

Scoring array:
 [[0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]]
Traceback array:
 [['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
 ['-' '-' '-' '-' '-

In [None]:
from IPython.core.display import HTML,display
import pandas as pd

# NOTE(review): an identical definition of this function appears earlier in the notebook.
def pretty_table_from_array(data_array, row_labels,col_labels):
    """Wrap a 2d numpy array in a labelled DataFrame and return it as an HTML table object."""
    labelled_frame = pd.DataFrame(data=data_array, index=row_labels, columns=col_labels)
    return HTML(labelled_frame.to_html())

In [None]:
# Label each grid row/column with the matching character, with "-" for the leading gap position
row_labels = list("-"+seq1)
column_labels = list("-"+seq2)

for caption, grid in (("Scoring array:", scoring_array), ("Traceback array:", traceback_array)):
    print(caption)
    display(pretty_table_from_array(grid,row_labels,column_labels))

Scoring array:


Unnamed: 0,-,S,o,u,t,h,Unnamed: 7,K,o.1,r,e,a
-,0,0,0,0,0,0,0,0,0,0,0,0
R,0,0,0,0,0,0,0,0,0,0,0,0
e,0,0,0,0,0,0,0,0,0,0,0,0
p,0,0,0,0,0,0,0,0,0,0,0,0
u,0,0,0,0,0,0,0,0,0,0,0,0
b,0,0,0,0,0,0,0,0,0,0,0,0
l,0,0,0,0,0,0,0,0,0,0,0,0
i,0,0,0,0,0,0,0,0,0,0,0,0
c,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0


Traceback array:


Unnamed: 0,-,S,o,u,t,h,Unnamed: 7,K,o.1,r,e,a
-,-,-,-,-,-,-,-,-,-,-,-,-
R,-,-,-,-,-,-,-,-,-,-,-,-
e,-,-,-,-,-,-,-,-,-,-,-,-
p,-,-,-,-,-,-,-,-,-,-,-,-
u,-,-,-,-,-,-,-,-,-,-,-,-
b,-,-,-,-,-,-,-,-,-,-,-,-
l,-,-,-,-,-,-,-,-,-,-,-,-
i,-,-,-,-,-,-,-,-,-,-,-,-
c,-,-,-,-,-,-,-,-,-,-,-,-
,-,-,-,-,-,-,-,-,-,-,-,-


In [None]:
# Demo: number the cells in row-major order to show how the grid is indexed
count = 0
for row_index, col_index in np.ndindex(n_rows, n_columns):
    scoring_array[row_index, col_index] = count
    count += 1

display(pretty_table_from_array(scoring_array,row_labels,column_labels))

Unnamed: 0,-,S,o,u,t,h,Unnamed: 7,K,o.1,r,e,a
-,0,1,2,3,4,5,6,7,8,9,10,11
R,12,13,14,15,16,17,18,19,20,21,22,23
e,24,25,26,27,28,29,30,31,32,33,34,35
p,36,37,38,39,40,41,42,43,44,45,46,47
u,48,49,50,51,52,53,54,55,56,57,58,59
b,60,61,62,63,64,65,66,67,68,69,70,71
l,72,73,74,75,76,77,78,79,80,81,82,83
i,84,85,86,87,88,89,90,91,92,93,94,95
c,96,97,98,99,100,101,102,103,104,105,106,107
,108,109,110,111,112,113,114,115,116,117,118,119


In [None]:
# Unicode arrow characters available for marking directions in the traceback grid
up_arrow, right_arrow, down_arrow, left_arrow = "\u2191", "\u2192", "\u2193", "\u2190"
down_right_arrow, up_left_arrow = "\u2198", "\u2196"

for arrow_caption, arrow_glyph in (
    ("Up arrow", up_arrow),
    ("Left arrow", left_arrow),
    ("Up Left arrow", up_left_arrow),
):
    print(arrow_caption, arrow_glyph)


Up arrow ↑
Left arrow ←
Up Left arrow ↖


In [None]:
# Fill the scoring and traceback grids for seq1 (rows) against seq2 (columns).
n_rows = len(seq1) + 1
n_columns = len(seq2) + 1
row_labels = list("-"+seq1)
column_labels = list("-"+seq2)

grid_shape = [n_rows, n_columns]
scoring_array = full(grid_shape, 0)
traceback_array = full(grid_shape, "-")

# Arrow characters written into the traceback grid
up_arrow, right_arrow, down_arrow, left_arrow = "\u2191", "\u2192", "\u2193", "\u2190"
down_right_arrow, up_left_arrow = "\u2198", "\u2196"

# Scoring scheme
arrow = "-"
gap_penalty = -1
match_bonus = 4
mismatch_penalty = -1

for row in range(n_rows):
    for col in range(n_columns):
        if row > 0 and col > 0:
            # Interior cell: best of a gap from the left, a gap from above, or a diagonal step
            from_left_score = scoring_array[row, col-1] + gap_penalty
            from_above_score = scoring_array[row-1, col] + gap_penalty
            characters_match = seq1[row-1] == seq2[col-1]
            diagonal_left_cell_score = scoring_array[row-1, col-1] + (
                match_bonus if characters_match else mismatch_penalty
            )
            score = max(from_left_score, from_above_score, diagonal_left_cell_score)
            # On ties the first listed direction wins: left, then up, then diagonal
            candidates = (
                (from_left_score, left_arrow),
                (from_above_score, up_arrow),
                (diagonal_left_cell_score, up_left_arrow),
            )
            arrow = next(
                candidate_arrow
                for candidate_score, candidate_arrow in candidates
                if candidate_score == score
            )
        elif col > 0:
            # Top row: only reachable from the left
            score = scoring_array[row, col-1] + gap_penalty
            arrow = left_arrow
        elif row > 0:
            # First column: only reachable from above
            score = scoring_array[row-1, col] + gap_penalty
            arrow = up_arrow
        else:
            # Origin cell
            score = 0
            arrow = "-"

        traceback_array[row,col] = arrow
        scoring_array[row,col] = score

display(pretty_table_from_array(scoring_array,row_labels,column_labels))
display(pretty_table_from_array(traceback_array,row_labels,column_labels))

Unnamed: 0,-,S,o,u,t,h,Unnamed: 7,K,o.1,r,e,a
-,0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11
R,-1,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11
e,-2,-2,-2,-3,-4,-5,-6,-7,-8,-9,-5,-6
p,-3,-3,-3,-3,-4,-5,-6,-7,-8,-9,-6,-6
u,-4,-4,-4,1,0,-1,-2,-3,-4,-5,-6,-7
b,-5,-5,-5,0,0,-1,-2,-3,-4,-5,-6,-7
l,-6,-6,-6,-1,-1,-1,-2,-3,-4,-5,-6,-7
i,-7,-7,-7,-2,-2,-2,-2,-3,-4,-5,-6,-7
c,-8,-8,-8,-3,-3,-3,-3,-3,-4,-5,-6,-7
,-9,-9,-9,-4,-4,-4,1,0,-1,-2,-3,-4


Unnamed: 0,-,S,o,u,t,h,Unnamed: 7,K,o.1,r,e,a
-,-,←,←,←,←,←,←,←,←,←,←,←
R,↑,↖,←,←,←,←,←,←,←,←,←,←
e,↑,↑,↖,←,←,←,←,←,←,←,↖,←
p,↑,↑,↑,↖,←,←,←,←,←,←,↑,↖
u,↑,↑,↑,↖,←,←,←,←,←,←,←,←
b,↑,↑,↑,↑,↖,←,←,←,←,←,←,←
l,↑,↑,↑,↑,↑,↖,←,←,←,←,←,←
i,↑,↑,↑,↑,↑,↑,↖,←,←,←,←,←
c,↑,↑,↑,↑,↑,↑,↑,↖,←,←,←,←
,↑,↑,↑,↑,↑,↑,↖,←,←,←,←,←


In [None]:
def traceback_alignment(traceback_array,seq1,seq2,up_arrow = "\u2191" ,\
                        left_arrow="\u2190",up_left_arrow="\u2196",stop="-",
                        verbose=True):
    """Rebuild the alignment of seq1 and seq2 by following a traceback array.

    The walk starts at cell [len(seq1), len(seq2)] and follows the arrow stored
    in each visited cell until a cell holding `stop` is reached.

    Parameters:
        traceback_array: 2d array indexed as traceback_array[row, col]; rows
            correspond to seq1 and columns to seq2, each with one leading
            extra position.
        seq1, seq2: the two strings that were aligned.
        up_arrow, left_arrow, up_left_arrow: cell values meaning "move up",
            "move left" and "move diagonally up-left".
        stop: cell value that ends the walk.
        verbose: when True (default) print the position, arrow and partial
            alignment at every step; pass False to run silently.

    Returns:
        (aligned_seq1, aligned_seq2), with "-" inserted where one sequence
        has a gap relative to the other.

    Raises:
        ValueError: if a visited cell holds a value that is neither an arrow
            nor `stop`.
    """
    log = print if verbose else (lambda *args: None)

    row = len(seq1)
    col = len(seq2)
    aligned_seq1 = ""
    aligned_seq2 = ""
    alignment_indicator = ""
    # The loop ends only through the `stop` branch below. The previous
    # `arrow is not "-"` condition compared by identity, which is never a
    # reliable string test and ignored the `stop` parameter.
    while True:
        log("Currently on row:",row)
        log("Currently on col:",col)
        arrow = traceback_array[row,col]
        log("Arrow:",arrow)

        if arrow == up_arrow:
            log("insert indel into top sequence")
            # Grow the alignment leftwards: seq1 consumes a character, seq2 gets a gap
            aligned_seq2 = "-"+aligned_seq2
            aligned_seq1 = seq1[row-1] + aligned_seq1
            alignment_indicator = " "+alignment_indicator
            row -=1

        elif arrow == up_left_arrow:
            log("match or mismatch")
            # row-1 / col-1 because the array has an extra leading "-"
            # position in front of each sequence
            seq1_character = seq1[row-1]
            seq2_character = seq2[col-1]
            aligned_seq1 = seq1_character + aligned_seq1
            aligned_seq2 = seq2_character + aligned_seq2
            if seq1_character == seq2_character:
                alignment_indicator = "|"+alignment_indicator
            else:
                alignment_indicator = " "+alignment_indicator
            row -=1
            col -=1

        elif arrow == left_arrow:
            log("Insert indel into left sequence")
            # seq2 consumes a character, seq1 gets a gap
            aligned_seq1 = "-"+aligned_seq1
            aligned_seq2 = seq2[col-1] + aligned_seq2
            alignment_indicator = " "+alignment_indicator
            col -=1

        elif arrow == stop:
            break
        else:
            raise ValueError(f"Traceback array entry at {row},{col}: {arrow} is not recognized.")
        log(aligned_seq1)
        log(alignment_indicator)
        log(aligned_seq2)

    return aligned_seq1,aligned_seq2
# Walk the filled traceback array to get the aligned forms of seq1 and seq2.
traceback_alignment(traceback_array,seq1,seq2)

Currently on row: 17
Currently on col: 11
Arrow: ↖
match or mismatch
a
|
a
Currently on row: 16
Currently on col: 10
Arrow: ↖
match or mismatch
ea
||
ea
Currently on row: 15
Currently on col: 9
Arrow: ↖
match or mismatch
rea
|||
rea
Currently on row: 14
Currently on col: 8
Arrow: ↖
match or mismatch
orea
||||
orea
Currently on row: 13
Currently on col: 7
Arrow: ↖
match or mismatch
Korea
|||||
Korea
Currently on row: 12
Currently on col: 6
Arrow: ↑
insert indel into top sequence
 Korea
 |||||
-Korea
Currently on row: 11
Currently on col: 6
Arrow: ↑
insert indel into top sequence
f Korea
  |||||
--Korea
Currently on row: 10
Currently on col: 6
Arrow: ↑
insert indel into top sequence
of Korea
   |||||
---Korea
Currently on row: 9
Currently on col: 6
Arrow: ↖
match or mismatch
 of Korea
|   |||||
 ---Korea
Currently on row: 8
Currently on col: 5
Arrow: ↑
insert indel into top sequence
c of Korea
 |   |||||
- ---Korea
Currently on row: 7
Currently on col: 5
Arrow: ↑
insert indel into top se

('Republic of Korea', 'So-uth-- ---Korea')