In [1]:
from pysam import FastaFile,FastxFile
from ete3 import Tree
import csv

In [2]:
# Input / output locations for the MBL_243 comparison.
# NOTE(review): these are absolute paths on the author's machine — adjust before re-running elsewhere.
folder = '/Users/sanjanatule/Documents/uq/Projects/Indels/indelmip/data/real/MBL_243/'
# `folder` already ends with '/', so the file name is appended without a leading
# slash (the previous form produced '...MBL_243//psp_ancestors.nwk').
nwk_file_path = folder + 'psp_ancestors.nwk'
# First solution: stored in the same folder as the tree.
sol1 = folder + 'mip_ancestor_indel.fasta'
# Second solution: same file name under the `real_mip_altopt` directory.
sol2 = '/Users/sanjanatule/Documents/uq/Projects/Indels/indelmip/data/real_mip_altopt/MBL_243/mip_ancestor_indel.fasta'
# CSV written by the comparison cell: one row per ancestor whose sequence differs.
annotation_file = '/Users/sanjanatule/Documents/uq/Projects/Indels/indelmip/plots_for_paper/MBL_243_annot.csv'

In [3]:
# Compare the ancestor sequences of the two solutions and record which ancestors differ.

# Read the tree text and append ';' before parsing
# (presumably the stored Newick string lacks its terminator — TODO confirm).
with open(nwk_file_path,"r") as tree_file:  # closes the handle once the text is read
    my_tree = tree_file.read() + ";"
tree = Tree(my_tree, format=1)

# Indexed FASTA readers for the two solutions; left open as notebook-level
# variables in case later cells use them.
sol1_out = FastaFile(sol1)
sol2_out = FastaFile(sol2)

total_diff = 0
total_ancestors = 0   # counted from the tree instead of hard-coding 242
ancestors_diff  = []
ancestors_annot = []

for n in tree.traverse(): # level order
    if not n.is_leaf():
        total_ancestors += 1
        m_parent_sequence_1 = sol1_out.fetch(n.name)
        m_parent_sequence_2 = sol2_out.fetch(n.name)

        if m_parent_sequence_1 != m_parent_sequence_2:
            total_diff += 1
            ancestors_diff.append(n.name)
            # '*' is the marker written next to each differing ancestor in the CSV
            ancestors_annot.append([n.name,'*'])

print(f"Total ancestors different {total_diff}")
print(f"Total ancestors same {total_ancestors - total_diff}")
print(f"Ancestor names are {ancestors_diff}")

# save in csv file
with open(annotation_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(ancestors_annot)

Total ancestors different 70
Total ancestors same 172
Ancestor names are ['N0', 'N1', 'N233', 'N2', 'N240', 'N3', 'N60', 'N52', 'N62', 'N29', 'N58', 'N146', 'N238', 'N30', 'N45', 'N59', 'N147', 'N212', 'N239', 'N7', 'N46', 'N191', 'N213', 'N8', 'N66', 'N149', 'N211', 'N9', 'N15', 'N96', 'N150', 'N152', 'N193', 'N77', 'N151', 'N153', 'N194', 'N196', 'N11', 'N90', 'N154', 'N195', 'N209', 'N14', 'N119', 'N210', 'N94', 'N123', 'N95', 'N202', 'N185', 'N203', 'N186', 'N187', 'N173', 'N177', 'N188', 'N164', 'N169', 'N174', 'N178', 'N165', 'N170', 'N179', 'N166', 'N167', 'N180', 'N181', 'N182', 'N183']


## MINI TREE 1 (under ancestor N236)

In [19]:
def get_indel_events(str1,str2,val_for_pos=(612,714,749,991,1048,1049,1050,1071,1212)):
    """Count indel events between two sequences and sample values at watched positions.

    The sequences are compared position by position, converting each character
    with int(). A position starts a new event when the difference
    int(str1[i]) - int(str2[i]) is non-zero and differs from the difference at
    the previous position, so a run of consecutive positions with the same
    non-zero difference is counted once.

    Note: a function with the same name is defined again later in this notebook
    and shadows this one once that cell has run.

    Args:
        str1: first sequence; its length drives the comparison.
        str2: second sequence; must be at least as long as str1.
        val_for_pos: 1-based positions whose values are reported for both
            sequences. Defaults to the positions previously hard-coded here.

    Returns:
        A tuple (dis, pos, val_dict_str1, val_dict_str2):
        dis is the number of events, pos the 1-based start position of each
        event, and the two dicts map every watched position that lies within
        str1 to the integer value of str1 / str2 at that position.

    Raises:
        IndexError: if str2 is shorter than str1.
        ValueError: if a character cannot be converted with int().
    """
    watched = set(val_for_pos)  # set membership instead of scanning a list per position
    dis = 0
    prev_dis = 0
    pos = []
    val_dict_str1 = {}
    val_dict_str2 = {}
    for i in range(len(str1)):
        # convert each character once and reuse it for both the sample and the difference
        val1 = int(str1[i])
        val2 = int(str2[i])
        if i+1 in watched:
            val_dict_str1[i+1] = val1
            val_dict_str2[i+1] = val2

        curr_dis = val1 - val2
        # only the first position of a run with the same non-zero difference counts
        if curr_dis != 0 and curr_dis != prev_dis:
            dis += 1
            pos.append(i+1)

        prev_dis = curr_dis
    return dis,pos,val_dict_str1,val_dict_str2

def get_tree_indel_events(method_fasta_file,nwk_file_path,nodes_of_interest=('N239','N238','N236','N237')):
    """Print indel events on the tree edges that touch selected nodes.

    Every non-root node is compared with its parent via get_indel_events
    (parent sequence first, child sequence second). Each node is annotated with
    a `level` feature (root = 0, child = parent level + 1) and an `indel_events`
    feature holding the event count on the edge from its parent (0 for the root).
    For edges whose child or parent name is in `nodes_of_interest`, the event
    count, the event positions and the two value dicts returned by
    get_indel_events are printed.

    Note: a function with the same name is defined again later in this notebook
    and shadows this one once that cell has run.

    Args:
        method_fasta_file: path of the FASTA file opened with FastaFile; sequences
            are fetched by node name.
        nwk_file_path: path of the tree file; ';' is appended to its content
            before it is parsed with Tree(..., format=1).
        nodes_of_interest: node names whose incident edges are printed. Defaults
            to the names previously hard-coded here.

    Returns:
        None. Results are printed; the annotated tree is local to the call.
    """
    level = 0

    # sequence info
    sequences_fasta_info = FastaFile(method_fasta_file)
    try:
        # tree file (the context manager closes the handle once the text is read)
        with open(nwk_file_path,"r") as tree_file:
            my_tree = tree_file.read() + ";"
        tree = Tree(my_tree, format=1)

        # a parent's `level` must be set before its children read it; the
        # default traversal visits the root first
        for n in tree.traverse():
            if n.up is not None: # non-root node: compare with its parent
                n.add_features(level = n.up.level + 1)
                indel_sequence_curr_level = sequences_fasta_info.fetch(n.name)
                indel_sequence_up_level   = sequences_fasta_info.fetch(n.up.name)
                total_indel_events,idel_pos,val_dict1,val_dict2 = get_indel_events(indel_sequence_up_level,indel_sequence_curr_level)
                n.add_features(indel_events = total_indel_events)
                if n.name in nodes_of_interest or n.up.name in nodes_of_interest:
                    print(f"Indel event between {n.up.name} and {n.name} is {total_indel_events}")
                    print(idel_pos)
                    print(f"{n.up.name} - {val_dict1}")
                    print(f"{n.name} - {val_dict2}")

            else: # root node
                n.add_features(level = level)
                n.add_features(indel_events = 0)
    finally:
        # release the FASTA handle even if a fetch or the tree parse fails
        sequences_fasta_info.close()

In [20]:
# Report indel events for the first solution (`sol1`); the function prints its findings and returns nothing.
get_tree_indel_events(sol1,nwk_file_path)

Indel event between N235 and N236 is 0
[]
N235 - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
N236 - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
Indel event between N236 and N237 is 2
[2, 24]
N236 - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
N237 - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
Indel event between N236 and N238 is 1
[991]
N236 - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
N238 - {612: 0, 714: 0, 749: 0, 991: 1, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
Indel event between N237 and sp|P16692|PHNP_ECOLI is 0
[]
N237 - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
sp|P16692|PHNP_ECOLI - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
Indel event between N237 and tr|A0A564JD77|A0A564JD77_9ENTR is 0
[]
N237 - {612: 0, 714: 0, 749: 0, 991: 0, 1048

In [21]:
# Same report for the second solution (`sol2`), for comparison with the output above.
get_tree_indel_events(sol2,nwk_file_path)

Indel event between N235 and N236 is 0
[]
N235 - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
N236 - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
Indel event between N236 and N237 is 2
[2, 24]
N236 - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
N237 - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
Indel event between N236 and N238 is 0
[]
N236 - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
N238 - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
Indel event between N237 and sp|P16692|PHNP_ECOLI is 0
[]
N237 - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
sp|P16692|PHNP_ECOLI - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1, 1049: 1, 1050: 1, 1071: 0, 1212: 0}
Indel event between N237 and tr|A0A564JD77|A0A564JD77_9ENTR is 0
[]
N237 - {612: 0, 714: 0, 749: 0, 991: 0, 1048: 1

## MINI TREE 2 (under ancestor N117)

In [5]:
# Take the first node named "N176" from `tree` (built in the comparison cell above)
# and print the text returned by its write() method.
# Raises IndexError if no node with that name exists.
node = tree.search_nodes(name="N176")[0]
print(node.write())

((((((((tr|A0A0C9YFG4|A0A0C9YFG4_9AGAR:0.386223,tr|A0A0D0E5N9|A0A0D0E5N9_9AGAM:0.385295)1:0.0686107,tr|A0A0D7BCQ6|A0A0D7BCQ6_9AGAR:0.813676)1:0.0838332,tr|A0A166LIQ8|A0A166LIQ8_9AGAM:0.443659)1:0.0946587,tr|A0A286UJK2|A0A286UJK2_9AGAM:0.531246)1:0.0710168,tr|A0A165L3U8|A0A165L3U8_EXIGL:0.381311)1:0.184432,tr|A0A0E9N9E2|A0A0E9N9E2_SAICN:0.700329)1:0.0553765,tr|A0A1Y2FE70|A0A1Y2FE70_PROLT:1.1045)1:0.192644,(tr|S8BW22|S8BW22_DACHA:0.986847,tr|D5G6Y2|D5G6Y2_TUBMM:0.893913)1:0.266066)1:0.157104;


In [47]:
def get_indel_events(str1,str2):
    """Count indel events between two sequences and list where each one starts.

    Characters are converted with int() and compared position by position over
    the length of str1. A new event starts wherever the difference
    int(str1[i]) - int(str2[i]) is non-zero and not equal to the difference at
    the previous position; a run of equal non-zero differences counts once.

    Returns:
        (number_of_events, list_of_1_based_start_positions)
    """
    event_starts = []
    last_delta = 0
    for idx, first_char in enumerate(str1):
        delta = int(first_char) - int(str2[idx])

        # a changed, non-zero difference marks the start of a new event
        if delta and delta != last_delta:
            event_starts.append(idx + 1)
        last_delta = delta

    return len(event_starts), event_starts

def get_tree_indel_events(method_fasta_file,nwk_file_path,nodes_of_interest=('N178',)):
    """Print indel events on the tree edges that touch selected nodes.

    Every non-root node is compared with its parent via get_indel_events
    (parent sequence first, child sequence second). Each node is annotated with
    a `level` feature (root = 0, child = parent level + 1) and an `indel_events`
    feature holding the running total of events from the root down to that node
    (the edge count plus the parent's total; 0 for the root). For edges whose
    child or parent name is in `nodes_of_interest`, the per-edge event count and
    the event positions are printed.

    Note: this redefines an earlier function of the same name in this notebook.

    Args:
        method_fasta_file: path of the FASTA file opened with FastaFile; sequences
            are fetched by node name.
        nwk_file_path: path of the tree file; ';' is appended to its content
            before it is parsed with Tree(..., format=1).
        nodes_of_interest: node names whose incident edges are printed. Defaults
            to the name previously hard-coded here.

    Returns:
        None. Results are printed; the annotated tree is local to the call.
    """
    level = 0

    # sequence info
    sequences_fasta_info = FastaFile(method_fasta_file)
    try:
        # tree file (the context manager closes the handle once the text is read)
        with open(nwk_file_path,"r") as tree_file:
            my_tree = tree_file.read() + ";"
        tree = Tree(my_tree, format=1)

        # a parent's `level` and `indel_events` must be set before its children
        # read them; the default traversal visits the root first
        for n in tree.traverse():
            if n.up is not None: # non-root node: compare with its parent
                n.add_features(level = n.up.level + 1)
                indel_sequence_curr_level = sequences_fasta_info.fetch(n.name)
                indel_sequence_up_level   = sequences_fasta_info.fetch(n.up.name)
                total_indel_events,pos = get_indel_events(indel_sequence_up_level,indel_sequence_curr_level)
                # cumulative count along the path from the root
                n.add_features(indel_events = total_indel_events + n.up.indel_events)
                if n.name in nodes_of_interest or n.up.name in nodes_of_interest:
                    print(f"Indel event between {n.up.name} and {n.name} is {total_indel_events}")
                    print(f"pos {pos}")

            else: # root node
                n.add_features(level = level)
                n.add_features(indel_events = 0)
    finally:
        # release the FASTA handle even if a fetch or the tree parse fails
        sequences_fasta_info.close()

In [48]:
# Report indel events for the first solution (`sol1`); output is printed, nothing is returned.
get_tree_indel_events(sol1,nwk_file_path)

Indel event between N177 and N178 is 2
pos [317, 626]
Indel event between N178 and N179 is 0
pos []
Indel event between N178 and tr|A0A0E9N9E2|A0A0E9N9E2_SAICN is 11
pos [27, 34, 90, 244, 263, 318, 324, 339, 481, 493, 1086]


In [49]:
# Same report for the second solution (`sol2`), for comparison with the output above.
get_tree_indel_events(sol2,nwk_file_path)

Indel event between N177 and N178 is 2
pos [317, 626]
Indel event between N178 and N179 is 0
pos []
Indel event between N178 and tr|A0A0E9N9E2|A0A0E9N9E2_SAICN is 11
pos [27, 34, 90, 244, 263, 318, 324, 339, 481, 493, 1086]
