In [1]:
from ete3 import Tree
from pysam import FastaFile
import networkx as nx
from networkx.drawing.nx_pydot import read_dot
import glob
import numpy as np
import sys
import csv
import os
import re
import json
import numpy.ma as ma
import math
import re

In [14]:
# convert pogs to numpy arrays
def _pog_to_matrix(pog_data):
    """Build the upper-triangular 0/1 adjacency matrix for one POG record.

    The matrix has ``pog_data['Size'] + 2`` rows/columns: index 0 is an added
    start state, the last index is an added end state, and POG index ``k`` is
    stored at matrix index ``k + 1``.
    """
    nodes = pog_data['Size'] + 2
    mat = np.zeros(shape=(nodes, nodes))

    # Edges from the added start state to every listed start position.
    for start in pog_data['Starts']:
        mat[0, start + 1] = 1

    # Edges from every listed end position to the added end state.
    for end in pog_data['Ends']:
        mat[end + 1, nodes - 1] = 1

    # 'Adjacent'[k] lists the positions adjacent to 'Indices'[k].
    for pos, source in enumerate(pog_data['Indices']):
        for target in pog_data['Adjacent'][pos]:
            mat[source + 1, target + 1] = 1

    # Fold edges recorded in either direction onto the upper triangle so that
    # every edge is stored once, as (lower index -> higher index).
    mat = mat + mat.T
    mat = np.clip(mat, 0, 1)
    return np.triu(mat)


def convert_pogs_to_numpy(folder, debug_node='N533'):
    """Convert every POG in ``<folder>/pogs.json`` into a saved adjacency matrix.

    Entries under the 'Ancestors' key are named ``'N' + Name``; entries under
    'Extants' keep their ``Name``. One ``<folder>/<node_name>.npy`` file is
    written per entry.

    Parameters
    ----------
    folder : str
        Directory that contains ``pogs.json`` and receives the ``.npy`` files.
    debug_node : str or None, optional
        Node name whose raw record and resulting edges are printed. Defaults to
        'N533' (the value previously hard-coded); pass None to silence it.

    Returns
    -------
    None
    """
    path = folder + '/pogs.json'

    # Parse the whole file first so the handle is closed before any matrix is
    # built or written.
    with open(path, 'r') as pog_file:
        pog_all_data = json.load(pog_file)

    for node_type in ['Ancestors', 'Extants']:
        for pog_data in pog_all_data[node_type]:
            if node_type == 'Ancestors':
                node_name = 'N' + pog_data['Name']
            else:
                node_name = pog_data['Name']

            show_debug = debug_node is not None and node_name == debug_node
            if show_debug:
                print("edges",pog_data['Edges'])
                print("start",pog_data['Starts'])
                print('indices',pog_data['Indices'])
                print("pog_data['Adjacent']",pog_data['Adjacent'])

            mat = _pog_to_matrix(pog_data)

            if show_debug:
                # np.nonzero walks the matrix in row-major order.
                for row, col in zip(*np.nonzero(mat)):
                    print("Edge from node {} to node {} is. {}".format(row,col,mat[row][col]))

            mat_file_name = folder + '/' + node_name + '.npy'
            np.save(mat_file_name, mat)
                                
# Smoke run: writes one <node_name>.npy adjacency matrix per ancestor/extant into
# the DHAD_585 folder, next to the pogs.json it reads.
# NOTE(review): convert_mc loads its .npy files from .../PreferredPath/data/, not
# from .../data/DHAD_585/ - confirm the matrices end up where convert_mc reads them.
convert_pogs_to_numpy('/Users/sanjanatule/Documents/uq/Projects/PreferredPath/data/DHAD_585/')

edges [{'Recip': True, 'From': 4, 'To': 5, 'Weight': 0.0642464224061238}, {'Recip': False, 'From': 5, 'To': 6, 'Weight': 0.46388740222829944}, {'Recip': True, 'From': 17, 'To': 18, 'Weight': 0.3695279434589825}, {'Recip': True, 'From': 18, 'To': 19, 'Weight': 0.3695279434589825}, {'Recip': True, 'From': 19, 'To': 20, 'Weight': 0.3695279434589825}, {'Recip': True, 'From': 20, 'To': 21, 'Weight': 0.3695279434589825}, {'Recip': True, 'From': 21, 'To': 22, 'Weight': 0.3695279434589825}, {'Recip': True, 'From': 22, 'To': 23, 'Weight': 0}, {'Recip': True, 'From': 23, 'To': 24, 'Weight': 0}, {'Recip': True, 'From': 24, 'To': 25, 'Weight': 0}, {'Recip': True, 'From': 25, 'To': 26, 'Weight': 0}, {'Recip': True, 'From': 26, 'To': 27, 'Weight': 0}, {'Recip': True, 'From': 27, 'To': 28, 'Weight': 0}, {'Recip': True, 'From': 28, 'To': 29, 'Weight': 0}, {'Recip': True, 'From': 29, 'To': 30, 'Weight': 0}, {'Recip': True, 'From': 30, 'To': 31, 'Weight': 0}, {'Recip': True, 'From': 31, 'To': 32, 'Weigh

In [81]:
# convert json into amino acids labels for each node
def get_amino_acids_json(folder):
    """Return the per-position amino-acid labels of every POG in ``<folder>/pogs.json``.

    The result maps a node name to a list with ``Size`` entries. Positions
    listed under the record's 'Nodes' key carry that node's 'Label'; every
    other position holds '-'. Ancestors are keyed as ``'N' + Name``, extants
    by their ``Name``.
    """
    with open(folder + '/pogs.json', 'r') as pog_file:
        pog_all_data = json.loads(pog_file.read())

        labels_by_node = {}
        for node_type in ('Ancestors', 'Extants'):
            for pog_data in pog_all_data[node_type]:
                is_ancestor = node_type == 'Ancestors'
                node_name = 'N' + pog_data['Name'] if is_ancestor else pog_data['Name']

                # Start from an all-gap row, then fill in the occupied positions.
                labels = ['-'] * pog_data['Size']
                for pog_node in pog_data['Nodes']:
                    labels[pog_node['Index']] = pog_node['Label']

                labels_by_node[node_name] = labels
        return labels_by_node
                                
# Build the node-name -> amino-acid-label lookup that the traversal cell passes
# to get_sequence.
# NOTE(review): absolute, machine-specific path; pogs.json is read from data/ here
# but from data/DHAD_585/ in the conversion cell - confirm both refer to the same data set.
amino_acid_lkp = get_amino_acids_json('/Users/sanjanatule/Documents/uq/Projects/PreferredPath/data')

In [82]:
# convert list of arrays into markov chain model for a node
def convert_mc(neighbor_list, neighbor_distance, current_node,
               folder='/Users/sanjanatule/Documents/uq/Projects/PreferredPath/data/',
               pseudocount=0.0001):
    """Build a row-normalised transition matrix for ``current_node``.

    The saved adjacency matrices of the neighbours are combined as a weighted
    sum, restricted to the edges present in the current node's own matrix, and
    the current node's matrix is then added on top. Zero entries are replaced
    by ``pseudocount`` and every row is divided by its sum.

    Parameters
    ----------
    neighbor_list : list of str
        Names of the neighbouring nodes; ``<folder>/<name>.npy`` is loaded for each.
    neighbor_distance : list of float
        Distance to each neighbour, in the same order as ``neighbor_list``.
    current_node : str
        Name of the node the model is built for.
    folder : str, optional
        Directory holding the ``.npy`` matrices. Defaults to the path that was
        previously hard-coded.
    pseudocount : float, optional
        Value written into entries that are exactly 0 before normalisation.

    Returns
    -------
    numpy.ndarray
        Square matrix whose rows are rounded to 5 decimals after normalisation.

    Raises
    ------
    ZeroDivisionError
        If the ceilings of all distances sum to 0.
    """
    def _load(name):
        return np.load(os.path.join(folder, name + '.npy'))

    # Weight of neighbour k is (ceil(d_k) - d_k) / sum_j ceil(d_j).
    # NOTE(review): these weights do not sum to 1, and a distance that is an
    # exact integer gets weight 0 - confirm this is the intended scaling.
    ceilings = [math.ceil(d) for d in neighbor_distance]
    total_sum = sum(ceilings)
    weights = [(c - d) / total_sum for c, d in zip(ceilings, neighbor_distance)]

    # Load every neighbour once; the first one fixes the number of states.
    neighbor_mats = [_load(name) for name in neighbor_list]
    total_states = neighbor_mats[0].shape[0]

    mc_model = np.zeros((total_states, total_states))
    for idx, neighbor_mat in enumerate(neighbor_mats):
        mc_model = mc_model + neighbor_mat * weights[idx]

    current_node_np = _load(current_node)

    # Edges absent from the current node are wiped out of the neighbour sum ...
    mc_model = mc_model * current_node_np
    # ... and the current node's own edges are then added on top.
    mc_model = mc_model + current_node_np

    # Pseudocounts keep every row sum strictly positive for the division below.
    mc_model[mc_model == 0] = pseudocount

    return np.round(mc_model / mc_model.sum(axis=1)[:, None], 5)

# test the function
#convert_mc(['A10', 'N12', 'N10'],[0.007488331676753748, 0.053298079515180216, 0.06444513511324562],'N11')
#convert_mc(['A20', 'A8', 'N2'],[0.015606083380247753, 0.015638650920124612, 0.13339705503870197],'N5')

In [83]:
# inference using markov chain
# return the list of positions in the final sequence

def mc_inference(mc_model):
    """Greedily walk the chain from state 0 and mark every state visited.

    From the current state the walk moves to the column holding the largest
    value in that state's row (the first such column on ties) and stops once
    it reaches the last state.

    Parameters
    ----------
    mc_model : numpy.ndarray
        Square transition matrix; row ``i`` holds the transitions out of state ``i``.

    Returns
    -------
    list of int
        One entry per state: 1 if the walk visited it, 0 otherwise. Entry 0 is always 1.

    Raises
    ------
    ValueError
        If the most probable transition leads to a state that was already
        visited. The walk is deterministic, so it would otherwise loop forever.
    """
    total_nodes = mc_model.shape[0]  # total states to traverse
    pf = [0] * total_nodes           # preferred path
    pf[0] = 1                        # the starting state is always traversed
    last_state = total_nodes - 1

    current = 0
    while current < last_state:
        # np.argmax returns the first index of the maximum.
        nxt = int(np.argmax(mc_model[current]))
        if pf[nxt] == 1:
            raise ValueError(
                "preferred path cycles: state {} transitions back to visited state {}".format(
                    current, nxt))
        pf[nxt] = 1
        current = nxt
    return pf

# Sanity check on a 3-state chain: the largest entry of row 0 is in column 2, the
# final state, so the walk stops there and state 1 is never visited.
mc_inference(np.array([[0.0,0.1,0.9],[0.4,0.5,0.1],[0.6,0.6,0.6]]))

[1, 0, 1]

In [85]:
# get amino acids from the .json files based on the preferred path
def get_sequence(path,node_name,amino_acid_lkp):
    """Return the sequence of ``node_name`` restricted to the preferred path.

    Parameters
    ----------
    path : sequence of int
        One flag per sequence position; a 0 means the position is not on the
        preferred path. Must be at least as long as the node's label list.
    node_name : str
        Key into ``amino_acid_lkp``.
    amino_acid_lkp : dict
        Maps node names to lists of per-position labels. It is not modified.

    Returns
    -------
    str
        The labels joined together, with '-' at every position whose flag is 0.
    """
    labels = amino_acid_lkp[node_name]
    # Build a new string rather than overwriting the lookup's list in place, so
    # the shared lookup stays intact and repeated calls give the same answer.
    return ''.join(
        '-' if path[pos] == 0 else label
        for pos, label in enumerate(labels)
    )
#get_sequence([1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1],'N5',amino_acid_lkp)

In [86]:
# load the tree
# Read the Newick text inside a context manager so the file handle is closed as
# soon as the text is in memory.
with open('/Users/sanjanatule/Documents/uq/Projects/PreferredPath/data/grasp_ancestors.nwk',"r") as tree_string_file:
    # A ';' is appended before parsing - presumably the file lacks the Newick
    # terminator; verify against the file.
    my_tree = tree_string_file.read() + ";"
t = Tree(my_tree, format=1)

In [87]:
# get immediate neighbours and convert the neighbour POGs into markov chains for a node 
# Infer preferred path from the markov chain
# Get the protein sequence based on preferred path
# Save the protein sequence into .fasta file ( to be added later)

for n in t.traverse():
    # Leaves are only used as neighbours of other nodes. The root is always
    # processed, even if it has no children.
    if n.is_leaf() and not n.is_root():
        continue
    print('>',n.name)

    # Neighbours are the children, followed by the parent when there is one
    # (the root has no parent).
    neighbours = list(n.children)
    if not n.is_root():
        neighbours.append(n.up)
    np_names_list = [nb.name for nb in neighbours]
    np_distance_list = [t.get_distance(n, nb.name) for nb in neighbours]

    mc_model = convert_mc(np_names_list, np_distance_list, n.name)  # neighbour matrices -> markov chain
    preferred_path = mc_inference(mc_model)                         # greedy walk through the chain

    # Drop the first and last entries: they belong to the added start/end
    # states, not to sequence positions.
    inner_path = preferred_path[1:-1]
    pro_sequence = get_sequence(inner_path, n.name, amino_acid_lkp)

    # Attach the model, the preferred path and the sequence to the tree node.
    n.add_features(mc_model=mc_model, preferred_path=inner_path, sequence=pro_sequence)
    print(pro_sequence)
        

> N0
M----------AAGP-------------WPEA----GQGLELGQTAGAGA-YL--TPLLL---GLALLLLSLYLL---RRR-GRE--ACRLPPGPPPWPLTGNFGFLLLPSLVWRPW--------R--------ARAASRR--G-APL--PPHLYLTELGKTYGEIYRLYLGRRLMVVLNGFEVVRDALVHHAEVFSDRPTVPLITIITKRK-GIVFAPYGPVWRQQRKFSLSTLRYFGLGKLDLEPKIIEELKFVKSEFLK-----AGG-GAFCPAPVIHNAVSNVICSICFGRRFEYEDEEFRTMLNLIVRGLELAVNSPAILINVCPWLYYLPFGPFKELRKTERDVTAFLKRIIDQHRETLDPENPRDFVDFYLLEI---EYQRKGK-QETSFSEDYLFYIIGDLFVAGTDTTTNTLLWALLYMCLHPEVQERVHKEIDAVIGPDRPPSLKDKLHMPFTEATIMEVLRMTTVVPLAIPHMASETTEFRGYTIPKGSVVVPNLWSVHRDPNIWEKPDEFNPSRFLDPDGNILKKEAFIPFGIGRRVCMGEQLAKMELFLIFSTLLQSFCFRLPEGTDPPNMEGRFGLTLAPYPFKIIPTKR---------------------
> N1
M----------AAARL------------WPEM----ASGLVLAQTATAGA-LL----LLL---GLALLLLSLYL----RRRFGRR--ASNLPPGPKPWPLVGNFGFLLLPSFILRRW--GLG---R--------ARAASRR--G-APL--SPHVVLTELAKVYGNIYSLFIGSRLMVVLNGFEVVRDALVNHAEVFSDRPSVPLITIITKRK-GIVFAPYGPVWRQQRKFSHSTLRHFGLGKLSLEPKIIEELKFVKSEMLK-----AGG-GAFSPAPIINNAVSNVICSISFGRRFDYEDEEFRTMLNLMSRGLEISVNSPAILINVCPWLYYLPFGPFKELRQIERDITAFLKRIIAQHRET

M----------EAAV-------------WQEL----LRTSVFSSVNI-VA--L----VVF---IVVFYLLQQY-----RRR--RR--FANIPPGPKPWPIVGNFGGFLVPSFILRRF--ARN---R--------EEYAKQQ--S-NPL--SPQVGLMELSKVYGNIYSIFVGSQLMVVLTGYEVVRDAMSNHAEVFSDRPDIPLITIMTKRK-GIVFAPYGPVWRKQRKFCHSTLRNFGLGKLSLEPCIHEGLAMVKSELLRLSEE-AGG-SGVDLTPLISNAVSNVISSISLGQRFHHQDQEFRTMLDLMAHGLEISVNSPALLINVFPWLYYLPCGVFKELRQVERDITAFLKKIIARHRATLDPENPRDFIDMYLVEM--LAQQKAGESEESSFSEDYLFYIIGDLFIAGTDTTTNSVLWMILYMCLYPDVQEKVQQEIDAVVGRDRVPSLTDKGSLPYTEATIMEVQRMTVVVPLAIPHMASETTEFRGYTIPKGTVIIPNLWSVHRDPTVWENPDDFNPSRFLDEQGKLLRKEYFIPFGIGRRVCMGEQLAKMELFLMFTSLMQAFTFRLPEGKPPPPMHGRFGLTLAPCPFTVCVTPR---------------------
> N15
M----------VMEL-------------WHEL----LSTSALSHVCI-LA--L----TVF---VAVYYIMHLF-----RKR--QD--FSNIPPGPKPWPIVGNFGGFLVPNFILRRF--GGR---RGE------DDAKSKQ--R-API--SPQVILTEQAKVYGNIYSIWVGSQLVVVLNGYEVVRDALSNRADVFSDRPEIPTVTIMTKRK-GIVFAPYGPVWRRQRKFCHTTLRNFGLGKLSLEPCILEGLAVVKSELLRLSEEDTEG-SGVDLTPLITNAVSNVISSIALGQRFHHADREFGALLDLMARGLEIIVNSAAVLINVFPLLYYLPFGVFKEVRQVERDITAFLKQIITRHRETLDPE

M----------AAAA--------SAEEP----------PAPLAPDTVRAA-------MLG---ATGLLLALWWL----LGR--QS--LQRLPPGPKPWPLVGNFAFALLPKRLHAVG--VFA---PRKKEA-----GAGRR--A-EPL--PMHVLLTGLAKMYGSIFRIALGSRHIIVLNDFEAVRDALVTQAEVFSDRPSVPLVTILTKKK-GLVFAPYGPVWKKQRKFSHSTLRHFGLGKHSLEPKIIEEFKYVKEEILK-----HGE-EQFNPFPIIGNAVSNVICSMAFGRRFDYDDAEFKNMLRLMSHALEFSVNSQVLLVNICPWLYYLPFGSFRELRQSVLDITAFLKNIIKQHRESLDAQNPRDFIDMYLLHS---DEEKKIN-GESSFNEDYLFFIIGDLFIAGTDTTSNTLLWSLLYMSLHPQEQRKVQEEIDLVIGCNRPPTLADKVHMPLTEATIMEVQRMTVVVPLSIPRMASETTELQGYTIPKGSVIIPNLWSVHRDPNKWENPDDFHPARFLDENGQLLKKETFIPFGIGKRVCMGEQLAKMELFLMFVSLLQSFTFLYPADLKKPSMEGRFGLTLAPFPFKLIALKR---------------------
> N60
M----------AA------------GGEW------------LLRAPTATE-------LLL---AAVCWLGCYWL----L--RPRA--PPGLPPGPAPWPLVGNFAFALLPPPLLRRW--VLE------------VWGRGRG--S-PVF--SPHVFLTGLTKMYGSIFRLFVGSRPFIVLNTFEAVREALVQKAEVFSDRPSVPIVLMITHKK-GVIFAPYGPVWKQQRKFSLSTLRHFGVGRHSLEPKIIEELKFIKEEMLK-----HGK-DSFSPFPIIRNAVSNVICSMAFGRRFNYEDVEFKTMLKNMARALELSVNSYMILVNICPWLYYLPFGPFRELRKTELDITAFLKKIIAQHRDTLDAA

M----------FPLS-------------CLEH----LSSSVLSHVNI-AA--L----LVL---LLLYYLVHFY-----QKQ--RH--LANIPPGPKPWPVVGNFGGFLVPSFLQRRF--GRR---G-----------------S-ANK--NAMVVLTEQANVYGNVFSLFVGSQLIVVLNGYEVVKDALSNHPEEFSDRPDIPAVSIMTKRK-GIVFAPYGPIWKKQRKFCHTILRNFGLGKLSFEPCILQGLATIKTELLRLNEE-SGG-AGVDLAPLISNAVSNVICSMTLGQRFHHEDREFRTLLDLMDRGLEICVNSPAVLINVFPLLYYLPFGVFKELRQVERDITVFLKRIIAKHRETLDPENPRDLVDMYLMEM--LAQQAAGQ-EDSSFTEDYLFYIVGDLFIAGTDTTTNSVLWILLYMVLYPDIQEKVQAEIDEVVGRHRVPSLTDKGSLPFTEATIMEVQRLVVVVPLAIPHMASKTTEFRGYTIPKGTVILPNLWSVHRDPTVWDDPDSFNPARFLDDDGKLLRKECFIPFGIGRRVCMGEQLAKMELFLTVTSLLQAFKFRLPEGKPPPPLHGRFGLTLAPCPFTVCVSTRS--------------------
> N25
M----------FPLS-------------CLEH----VSSSVLSHVNI-VA--L----IVF---LLVYYLVHFY-----QKQ--RH--LANIPPGPKPWPVVGNFGGFLIPSFIQRRF--GQR-----------------SD--S-ADTMKNAMVVLTEQANVYGNVFSLFVGSQLIVVLNGYEVVKDALSNHPEVFSDRPDIPAVTIMTKRK-GIVFAPYGPIWKKQRKFCHTILRNFGLGKLSFEPCILQGLATIKTELLRLNEE-SGG-AGVDLAPLISNAVSNVICSMILGQRFHHEDREFRTLLDLMDRGLEICVNSPAVLINVFPLLYYLPFGVFKELRQVEGDITVFLKRIIAKHRETLDPE

M----------GSLS-------------WLAD----LSSSALSPPNM-LP--L----LLF---LVVFYLVRFY-----QKQ--RG-IYRNIPPGPRPWPVVGNFGGLLLPPFIRRRF--GQT---S-----------------N-RNI--GVMEALTSQASVYGNIYSLFVGSQLIVVLNGYEVVKDALSNHPDVFSDRPDVPTISILTKRK-GIVFAPYGPVWRKQRKFCHATLRNFGLGRLSLEPCIQQGVAAVKTELLRLNGE-RGA-CGVDPSRLISNAVSNVICSLILGQRFHHDDPEFRGILDLMSRGLEICINSPAVLINIFPLLYYLPFGAFRELRRVERDITVFLKKIIESHSNTLDPDNPRDLTDMYLMEM--LAQQAAGE-QDSSFTEDYLFYIIGDLFIAGTDTTTNSVLWVLLYLALYPDIQDQVQAEIDRVVGRRRPPSLTDRGSLPFTEATIMEVQRLTAVVPLSIPHMASETTVFRGFTIPKGTVIMPNLYSVHRDPSVWDDPDAFNPARFLDGEGKLLRRESFIPFGIGRRVCMGEQLAKMELFLTVSGLLQACTFRLPDGAPAPSLHGRFGLTLAPCPFALCVSARSEDS--FSPNA-------LS-
> N37
M----------DSLS-------------WLEH----VSSFALSPSNM-VP--L----LVF---LLVFYLVRFY-----QKQ--RG-IYRNIPPGPKPWPVVGNFGGFLVPPAIRRRF--GQQ---------------------S-KNV--SVMEALTSQANVYGNIYSLFVGSQLIVVLNGYEVVKDALSNHPEVFSDRPDVPAISILTKRK-GIVFAPYGPVWRKQRKFCHTTLRNFGLGKLSLEPCIQRGLVTIKAELLRLNEE-SGG-AGVDPAPLISNAVSNVICSLILGQRFHHEDPEFRAILGLMARGLEICINSPAVLINVFPLLYYLPFGVFKELRQVERDITVFLKRIIANHRETLDPE

M----------ASPG--LPQP-PAEGSPWPLRLLH-APPGLLRLDPTGGA--L----LLL---GLAALLGWSWL-----WR--HR--ARGIPPGPTPWPVVGNFGFVLLPPFLRRKS--WLH---RR-------ARAAGME--P-SAL--GPQLLLADLARVYGNVFSFFIGHYLVVVLSDFHSVREALVQQAEIFSDRPRVPLVSLVTKEK-GIVFAHYGPVWRQQRKFSHSTLRHFGLGKLSLEPKIIEEFKYVKEEMQK-----HGE-DPFNPFPIVNNAVSNIICSLCFGQRFDYTNSEFKKMLNLMSRALEICLNTQLLLVNICSWLYYLPFGPFKELRQIEKDITTFLKKIIKDHRESLDVENPQDFIDMYLLHV---EEERKNN-SNSSFNEDYLFYIIGDLFIAGTDTTTNSLLWCLLYMSLNPDIQEKVQEEIERVIGADRVPSLTDKAQMPYTEATIMEVQRLTVVVPLAIPHMTSEKTVLQGYTIPKGTVILPNLWSVHRDPAIWEKPDDFYPNRFLDDQGQLIKKETFIPFGIGKRVCMGEQLAKMELFLMFVSLMQSFTFALPKDSKKPILTGRYGLTLAPHPFNIIISKR---------------------
> N109
M----------AAQG--LPQP-SAEGSPWPLSLLH-APPGLLLLDRTGGA--L----LLL---GLAALLGWSWL-----WR--RR--ARGIPPGPTPWPVVGNFGFVLLPPFLRRKS--WPY---RR-------AKAGGLN--L-SGL--GAQLLLAEMAHKYGNICSFFIGHYLVVVLNDFHSVREALVQQAEVFSDRPRVPLVSIMTKGK-GIVFAHYGPVWKQQRKFSHSTLRHFGLGKLSLEPKIIEEFKYVKEEMQK-----HGE-DPFNPFPIVNNAVSNIICSLCFGQRFDYTNSEFKKMLNFTSRALEICLNNQLLLVNICSWLYYLPFGPFKELRQIEKDLTTFLKKIIKDHRESLDA

M----------AAAA--PPP--PPEGSPWPLSLLP-APLGLLRLDPTGGA--L----LLL---GLAALLGWSWV-----RR--CR--PQGIPPGPTPWPVVGNFGFVLLPPFLRGRS--WTK---FG-------KNIPDLD--P-STP--ASQVLLTNLARVYGNIYSFFVGPYLVVVLNDFHSVREALVQQAEAFSDRPRMPLVSQVTKEK-GVVFARYGPVWRQQRKFSHSTLRHFGLGKLSLEPKIIEEFKYVKEEMQK-----HGG-DPFDPFPIVHNAISNIICLMCFGRRFDYTNSEFRKMLNFISRGLEVCLNSQLQLVNICSWLYYLPFGPFKEFRVIENDILNFLKRIIKEHRESLDVENPRDYIDMYLLHV---DEEKKNN-SNSSFNEDYLLYIISDLFVAGTDTTSNSLLWCLLYMSLNPDIQEKVHEEIERVIGADRVPSFTDKAQMPYTEATIMEVQRLTMVVPLSIPHMTSEKTVLQGYTIPKGTVIIPNLWAVHRDPAIWEKPDDFNPNRFLDDQGQLIKRETFIPFGIGKRVCMGEQLAKMELFLTFVSLMQSFTFALPKDSQKPLLTGKYGLTLAPHPFNIIISKR---------------------
> N103
M----------ASPG--LPQP-PAEGSPWPLRLLH-APPGLLRLDPTGGA--L----LLL---SLAALLGWSWL-----WR--HR--ARGIPPGPTPWPVVGNFGFVLLPPFLRRKS--WLH---RR-------ARAAGME--P-SAL--GPQLLLADLARVYGNVFSFFIGHYLVVVLSDFHSVREALVQQAEIFSDRPRVPLVSLVTKEK-GIVFAHYGPVWRQQRKFSHSTLRHFGLGKLSLEPKIIEEFKYVKEEMQK-----HGE-DPFDPFPIVNNAVSNIICSLCFGQRFDYTNSEFKKMLNLMSRALEICLNTQLLLVNICSWLYYLPFGPFKELRQIEKDITTFLKKIIKDHRESLDV

M----------SSPG--PPQP-PAEDPPWPARLLR-APLGLLRLDPSGGA--L----LLC---GLVALLGWSWL-----RR--RR--ARGIPPGPTPWPLVGNFGHVLLPPFLRRRS--WLS---SR-------TRAAGID--P-SVV--GPQVLLAHLARVYGSIFSFFIGHYLVVVLSDFHSVREALVQQAEVFSDRPRVPLISIVTKEK-GVVFAHYGPVWRQQRKFSHSTLRHFGLGKLSLEPKIIEEFKYVKAEMQK-----HGE-DPFCPFSIISNAVSNIICSLCFGQRFDYTNSEFKKMLGFMSRGLEICLNSQVLLVNICPWLYYLPFGPFKELRQIEKDITSFLKKIIKDHQESLDRENPQDFIDMYLLHM---EEERKNN-SNSSFDEEYLFYIIGDLFIAGTDTTTNSLLWCLLYMSLNPDVQEKVHEEIERVIGANRAPSLTDKAQMPYTEATIMEVQRLTVVVPLAIPHMTSENTVLQGYTIPKGTLILPNLWSVHRDPAIWEKPEDFYPNRFLDDQGQLIKKETFIPFGIGKRVCMGEQLAKMELFLMFVSLMQSFAFALPKDSKKPLLTGRFGLTLAPHPFNITISRR---------------------
> N116
M----------ASAG--LPQA-PAEDSPWPLRLLH-APPGLLRLDPTGGA--L----LLL---VLAALLGWSWL-----WR--RP--KRGIPPGPTPWPVVGNFGFVLLPPFLRRKS--WPY---RR-------ARTGELN--T-SGL--GVQLLLADLARVYGNIYSFFIGHYLVVVLNDFHSVREALVQQAEVFSDRPRMPLNYILTKGK-GIVFAHYGPVWRQQRKFSHSTLRHFGLGKLSLEPKIIEEFKYVKEEMQK-----NGE-VLFNPFPIVNNAVSNIICSLCFGQRFDYTNSEFKKMLNFMSRALEICLNTQLLLVNICSWLYYLPFGPFKELRQIEKDLTIFLKKIIEDHRESLDV

M----------SSPG--PPQP-PAEDPPWPARLLR-APLGLLRMDPSGDA--L----LLC---GLVAVLGWSWL-----RR--RR--ARGIPPGPTPWPLVGNFGHVLLPPFLRRRS--WLS---SR-------TRAAGID--P-SVV--GPQVLLAHLARVYGSIFSFFIGHYLVVVLSDFHSVREALVQQAEVFSDRPRVPLISIVTKEK-GVVFAHYGPIWRQQRKFSHSTLRHFGLGKLSLEPKIIEEFKYVKAEMQK-----HGE-DPFCPFSIISNAVSNIICSLCFGQRFDYTNSEFKKMLGFMSRGLEICLNSQVLMVNICPWLYYLPFGPFKELRQIEKDITSFLKKIIKDHQESLDRENPQDFIDMYLLHM---EEERKNN-SNSSFDEEYLFYIIGDLFIAGTDTTTNSLLWCLLYMSLNPDVQEKVHEEIERVIGANRAPSLTDKAQMPYTEATIMEVQRLTVVVPLAIPHMTSGNTVLQGYTIPKGTLILPNLWSVHRDPAIWEKPEDFYPNRFLDDQGQLIKKETFIPFGIGKRVCMGEQLAKMELFLMFVSLMQSFAFALPKDSKKPLLTGRFGLTLAPHPFNITISRR---------------------
> N140
M----------SSLG--GQRP-AAGEQPG----------ARLHVRATGGA--L----LLC---VLAVLLGWVWL-----RR--QR--ACGIPPGPKPRPLVGNFGHLLVPRFLRPQF--WL---------------GSGSQ--T-DTV--GQHVYLARMARVYGNIFSFFIGHRLVVVLSDFHSVREALVQQAEVFSDRPRMPLISIMTKEK-GIVFAHYGPIWKQQRRFSHSTLRHFGLGKLSLEPRIIEEFAYVKEAMQK-----HGE-APFSPFPIISNAVSNIICSLCFGQRFDYTNKEFKKVLDFMSRGLEICLHSQLFLINICPWFYYLPFGPFKELRQIERDISCFLKNIIREHQESLDA

In [105]:
# difference between old and new predictions for preferred path
ancestor_fasta = "/Users/sanjanatule/Documents/uq/Projects/PreferredPath/data/grasp_ancestors.fa"
new_ancestor_fasta = "/Users/sanjanatule/Documents/uq/Projects/PreferredPath/data/new_grasp_ancestors.fasta"
regex = r'^N[0-9]+'


def _read_fasta(fasta_path):
    """Read a FASTA file into a dict mapping record id to sequence.

    The id is everything after '>' on a header line; the sequence lines that
    follow a header are concatenated. Blank lines are skipped. The file is
    closed before returning.
    """
    records = {}
    current_id = ""
    with open(fasta_path, "r") as fasta_file:
        for raw_line in fasta_file:
            line = raw_line.strip()
            if not line:
                # A blank line (e.g. at the end of the file) carries no data.
                continue
            if line[0] == ">":
                current_id = line[1:]
                records[current_id] = ""
            else:
                records[current_id] = records[current_id] + line
    return records


## Read the files
dict1 = _read_fasta(ancestor_fasta)
dict2 = _read_fasta(new_ancestor_fasta)

##Assuming that both sequences have same length
# NOTE(review): dict2[key] raises KeyError if an ancestor from the baseline file
# is missing from the new file - confirm both files always share the same ids.
total_ancestors_compared = 0
total_ancestors_changed = 0
for key in dict1.keys():
    if re.match(regex, key):
        total_ancestors_compared +=1
        if dict1[key] != dict2[key]:
            print("Node             ",key)
            total_ancestors_changed +=1
print("Total Ancestors Compared::",total_ancestors_compared)
print("Total Ancestors Changed::",total_ancestors_changed)

Node              N0
Node              N1
Node              N2
Node              N3
Node              N5
Node              N14
Node              N20
Node              N23
Node              N28
Node              N33
Node              N39
Node              N46
Node              N47
Node              N48
Node              N50
Node              N51
Node              N55
Node              N56
Node              N57
Node              N77
Node              N82
Node              N96
Node              N129
Node              N134
Node              N135
Node              N136
Total Ancestors Compared:: 164
Total Ancestors Changed:: 26


### OLD CODE

In [72]:
# # convert pogs to numpy arrays
# def convert_pogs_to_numpy(folder):
#     path = folder + '/N11.json'
#     #path = folder + '/N2.json' ## for testing
#     pog_files = glob.glob(path)
#     instance_name = path.split('/')[-2]
    
#     for pf in pog_files:
#         print("pf",pf)
#         with open(pf, 'r') as j:
#             pog_data = json.loads(j.read())
#             nodes = pog_data['Size'] + 2
            
#             # create numpy zero matrix
#             mat = np.zeros(shape=(nodes,nodes))
            
#             # Edges from special Start node to the start nodes
#             for s in pog_data['Starts']:
#                 mat[0,s + 1] = 1
                
#             # Edges from last node to the special End node
#             for e in pog_data['Ends']:
#                 mat[e + 1,nodes-1] = 1

#             # create the adjency matrix for all nodes except from special node start
#             for e in pog_data['Edges']:
#                 #print("e[From]={} and e[To]={}".format(e['From'],e['To']))
#                 mat[e['From'] + 1,e['To'] + 1] = 1
            
#             print("mat",mat)
#             # convert bidirectional mat into unidirectional
#             mat = mat + mat.T    #add up the transpose
#             mat = np.clip(mat,0,1)
#             mat = np.triu(mat) #only the upper triangle
    
#             # save numpy array in file
#             mat_file_name = pf.replace('.json','.npy')
#             np.save(mat_file_name,mat)

# # test the function
# convert_pogs_to_numpy('/Users/sanjanatule/Documents/uq/Projects/PreferredPath/data')

In [73]:
# run inference on the markov chain to choose the most probabilistic POG
# for n in t.traverse():
#     print(n.name)
#     if n.is_leaf() == False: # leaf node
#         preferred_path = mc_inference(n.mc_model)
#         n.add_features(preferred_path = preferred_path) # add preferred path for a node
#         print("preferred path",preferred_path)
#         # get the amino acids from the .json files
#         pro_sequence = get_sequence(preferred_path,n.name)
#         print("pro_sequence",pro_sequence)
#         n.add_features(sequence = pro_sequence)