In [17]:
from ete3 import Tree
from pysam import FastaFile,FastxFile
from operator import add

1 - TREE SCORE

In [10]:
''' Function to calculate parsimony score for the whole tree for a given indel solution'''
def sequence_distance_score(str1,str2):
    dis = 0
    prev_dis = 0
    
    for i in range(0,len(str1)):
        if str1[i] != str2[i]:  # not matching
            if prev_dis == 0:   # previous matches
                dis += 3
                prev_dis = 1
            else:
                dis += 1
        else:
            prev_dis = 0
    return dis
    
def score_tree_indels(treefile,indelfastafile):
    parsiscore = 0
    
    # load the fasta file
    indel_pattern = FastaFile(indelfastafile)
    # load the tree
    tree_file = open(treefile,"r")
    my_tree = tree_file.read() + ";"
    tree = Tree(my_tree, format=1)
    
    # load pattern on the tree
    for n in tree.traverse():
        if n.is_leaf() == False:  
            current_node = n.name
            current_node_sequence = indel_pattern.fetch(current_node)
            child_seq_1 = indel_pattern.fetch(n.children[0].name)
            child_seq_2 = indel_pattern.fetch(n.children[1].name)
    
            # calculate score
            parsiscore += sequence_distance_score(current_node_sequence,child_seq_1)
            parsiscore += sequence_distance_score(current_node_sequence,child_seq_2)
    
    return parsiscore

In [11]:
# test this
input_folder = '/Users/sanjanatule/Documents/uq/Projects/MIPIndel/data/'
protein_family = 'CYP2U_165'
input_indel_fasta_file = input_folder + protein_family + '/mip_ancestor_indel.fasta'
input_tree_file = input_folder + protein_family + '/CYP2U_165_ancestors.nwk'

score_tree_indels(input_tree_file,input_indel_fasta_file)

3146

2 - INDEL EVENTS

In [38]:
''' function to calculate the total indel events at each node of the tree'''
def count_mutations(p,n,c1,c2):
    tn,i1a,i1b,i2a,i2b,i3 = 0,0,0,0,0,0
    
    for i in range(0,len(n)):
        #print(f"p[i]-{p[i]}:n[i]-{n[i]}:c1[i]-{c1[i]}:c2[i]-{c2[i]}")
        if n[i] == p[i] == c1[i] == c2[i]:
            continue
        elif n[i] == p[i]: #(node and parent position are equal)
            if c1[i] == c2[i] and c1[i] != n[i]:
                i2b += 1
            if c1[i] != c2[i]: # kids are not equal
                i1b += 1
        elif n[i] != p[i]: #(node not equal to parent)
            if c1[i] != c2[i]:
                i2a += 1
            if c1[i] == c2[i] and c1[i] == n[i]:
                i1a += 1
            if c1[i] == c2[i] and c1[i] != n[i]:
                i3 += 1
        #print(i1a,i1b,i2a,i2b,i3)
    if i1a + i1b + i2a + i2b + i3 == 0: # no mutations
        tn = 0
    else:
        tn = 1 
    return [tn,i1a,i1b,i2a,i2b,i3]

def count_indel_events(treefile,indelfastafile):
    total_mut = [0,0,0,0,0,0] #event flag,i1a,i1b,i2a,i2b,i3
    
    # load the tree
    tree_file = open(treefile,"r")
    my_tree = tree_file.read() + ";"
    tree = Tree(my_tree, format=1)
    
    # traverse tree
    indel_pattern = FastaFile(indelfastafile)
    
    for n in tree.traverse():
        if n.is_leaf() == False:  
            current_node = n.name
            current_node_sequence = indel_pattern.fetch(current_node)
            child_seq_1 = indel_pattern.fetch(n.children[0].name)
            child_seq_2 = indel_pattern.fetch(n.children[1].name)
            
            if n.is_root() == False: # if root then parent sequence same as the root
                parent_node  = n.up.name
                parent_node_sequence = indel_pattern.fetch(parent_node)
            else:
                parent_node_sequence = current_node_sequence
        #print("parent_node_sequence",parent_node_sequence)
        #print("current_node_sequence",current_node_sequence)
        #print("child_seq_1",child_seq_1)
        #print("child_seq_2",child_seq_2)
        mut_ret   = count_mutations(parent_node_sequence,current_node_sequence,child_seq_1,child_seq_2)
        #print("mut_ret",mut_ret)
        total_mut =  list(map(add, total_mut, mut_ret))
    return total_mut

In [39]:
count_indel_events(input_tree_file,input_indel_fasta_file)

[200, 902, 3921, 13, 11, 0]