In [1]:
#!/usr/bin/python3
import os
import numpy as np
from sys import argv
from datetime import datetime
# import matplotlib
# matplotlib.use('Agg')
import matplotlib.pyplot as plt
import ete3
#import dendropy
from dendropy import Tree
# Root directory holding the reference tables used throughout this notebook.
datasets_address="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/archive/"
# oma_database_address = datasets_address + "OmaServer.h5"
# OMA species table; parsed by read_omaID_file().
omaID_address = datasets_address+"oma-species.txt"
#bird6ID_address = datasets_address+"info.tsv"
# UniProt proteome/taxonomy table; parsed by read_taxonID_uniprot().
uniprot_table_address = datasets_address+"proteomes_taxonomy_Mammalia_40674_refyes.tab"


def read_omaID_file(omaID_address):
    """Parse the OMA species table into four lookup dictionaries.

    Every data line is split on tabs and must provide at least three columns:
    OMA ID, taxon ID and scientific name (in that order). Lines starting with
    '#' and blank lines are skipped. All keys and values are strings. When a
    key occurs on several lines, the last line wins.

    Limitation noted by the original author: strains and isolates are ignored.

    Args:
        omaID_address: path of the tab-separated OMA species file.

    Returns:
        Tuple ``(taxonID_omaID, omaID_taxonID, taxonID_scientific,
        scientific_taxonID)`` where
        ``taxonID_omaID``      maps taxon ID -> OMA ID,
        ``omaID_taxonID``      maps OMA ID -> taxon ID,
        ``taxonID_scientific`` maps taxon ID -> scientific name,
        ``scientific_taxonID`` maps scientific name -> taxon ID.

    Raises:
        IndexError: if a data line has fewer than three tab-separated columns.
        OSError: if the file cannot be opened.
    """
    taxonID_omaID = {}
    omaID_taxonID = {}
    scientific_taxonID = {}
    taxonID_scientific = {}

    # The context manager closes the file even when a malformed line raises.
    with open(omaID_address, 'r') as omaID_file:
        for line in omaID_file:
            line_strip = line.strip()
            # Skip header/comment lines and blank lines (a blank line would
            # otherwise fail on the column lookups below).
            if not line_strip or line_strip.startswith('#'):
                continue

            line_parts = line_strip.split('\t')
            omaID = line_parts[0]
            taxonID = line_parts[1]
            scientific = line_parts[2]

            taxonID_omaID[taxonID] = omaID
            omaID_taxonID[omaID] = taxonID
            taxonID_scientific[taxonID] = scientific
            scientific_taxonID[scientific] = taxonID

    print("-- The map for OMA taxonID of",len(taxonID_omaID),"records have read.")

    return (taxonID_omaID,omaID_taxonID,taxonID_scientific,scientific_taxonID)


def read_taxonID_uniprot(uniprot_table_address):
    """Parse the UniProt proteome table into six lookup dictionaries.

    Every data line is split on tabs; column 0 is stored under the local name
    ``uniprot`` (presumably the proteome ID -- confirm against the file
    header), column 1 is the organism name written as
    ``Scientific name (Common name)`` and column 2 is the taxon ID. Lines
    starting with 'Pro' (the header) and blank lines are skipped.

    NOTE: several returned dictionaries are named the opposite way round from
    what they contain. The names are kept for backward compatibility; the
    actual contents are listed below.

    Args:
        uniprot_table_address: path of the tab-separated UniProt table.

    Returns:
        Tuple, in this order:
        ``scientific_taxonID``      taxon ID -> scientific name,
        ``taxonID_scientific``      scientific name -> taxon ID,
        ``common_taxonID``          taxon ID -> common name (the scientific
                                    name when no "(...)" part is present),
        ``taxonID_uniprot``         column-0 ID -> taxon ID,
        ``taxonID_scientificUnder`` "Genus_species" -> taxon ID,
        ``scientificUnder_taxonID`` taxon ID -> "Genus_species".

    Raises:
        IndexError: if a data line has fewer than three tab-separated columns.
        OSError: if the file cannot be opened.
    """
    taxonID_scientific = {}
    scientific_taxonID = {}
    common_taxonID = {}
    taxonID_uniprot = {}
    taxonID_scientificUnder = {}
    scientificUnder_taxonID = {}

    # The context manager guarantees the handle is closed (the file used to be
    # left open).
    with open(uniprot_table_address, 'r') as uniprot_file:
        for line in uniprot_file:
            line_strip = line.strip()
            # Skip the header and blank lines (a blank line would otherwise
            # fail on the column lookups below).
            if not line_strip or line_strip.startswith('Pro'):
                continue

            line_parts = line_strip.split('\t')
            uniprot = line_parts[0]
            taxonID = line_parts[2]
            taxonID_uniprot[uniprot] = taxonID

            # "Homo sapiens (Human)" -> ["Homo sapiens ", "Human)"].
            # The scientific part keeps whatever preceded "(" unchanged,
            # including a trailing space.
            name_parts = line_parts[1].split("(")
            scientific = name_parts[0]
            if len(name_parts) > 1:
                # Drop the closing ")" of the first parenthesised group.
                common = name_parts[1].strip()[:-1]
            else:
                common = scientific

            scientific_taxonID[taxonID] = scientific
            taxonID_scientific[scientific] = taxonID
            common_taxonID[taxonID] = common

            # Only the first two words are kept, so subspecies collapse onto
            # their species (see the warning printed below).
            scientificUnder = '_'.join(scientific.split(" ")[:2])
            taxonID_scientificUnder[scientificUnder] = taxonID
            scientificUnder_taxonID[taxonID] = scientificUnder

    print("-- WARNING, for taxonID_scientificUnder, only the first two parts of species name are used!!! ")
    print("-- The map for uniprot taxonID of",len(scientific_taxonID),"records have read.")
    return (scientific_taxonID, taxonID_scientific, common_taxonID, taxonID_uniprot, taxonID_scientificUnder, scientificUnder_taxonID)


def collapse_tree_sup_branch(tree_raw_address, support_treshold, length_threshold ):
    """Collapse weakly supported and very short internal branches of a tree.

    The Newick tree at ``tree_raw_address`` is read with DendroPy and edited
    in two passes:

    1. every internal node whose label (read as the support value) is
       ``<= support_treshold`` has its incoming edge collapsed;
    2. every internal edge whose length is missing or ``<= length_threshold``
       is collapsed.

    The result is written next to the input as
    ``<tree_raw_address>_collapsed__<support_treshold>_<length_threshold>``.

    Args:
        tree_raw_address: path of the input Newick file.
        support_treshold: support values at or below this are collapsed.
        length_threshold: internal branch lengths at or below this are collapsed.

    Returns:
        The edited DendroPy tree.

    Raises:
        ValueError: if an internal node label is not numeric.
    """
    # Only the `Tree` class is imported in this notebook
    # (`from dendropy import Tree`); the bare `dendropy` name is not in scope.
    tree = Tree.get(path=tree_raw_address, schema="newick")

    print("length of input tree is: ", len(tree))

    # Take a snapshot of the internal nodes first so that restructuring the
    # tree cannot disturb the traversal.
    internal_nodes = [node for node in tree.postorder_node_iter() if not node.is_leaf()]
    for node in internal_nodes:
        if not node.label:
            continue
        # float() also accepts decimal support values such as "97.5".
        support_value = float(node.label)
        if support_value <= support_treshold:
            node.edge.collapse()
    print("length of tree  after collapsing based on support values is: ", len(tree))

    for e in list(tree.postorder_edge_iter()):
        # The edge above the root has no tail node and nothing to merge into.
        if e.tail_node is None:
            continue
        # Only internal edges may be collapsed. The parentheses matter:
        # without them a leaf edge with no length would also be selected.
        if e.is_internal() and (e.length is None or e.length <= length_threshold):
            e.collapse()
    print("length of tree after collapsing based on branch length is: ", len(tree))

    tree_address_out = tree_raw_address+"_collapsed__"+str(support_treshold)+"_"+str(length_threshold)
    tree.write_to_path(tree_address_out, "newick")
    print("Output tree is written in ",tree_address_out)
    return (tree)

# def read_taxonID_uniprot_temp(uniprot_table_address):

#     taxonID_scientific = {}
#     scientific_taxonID = {}
#     common_taxonID = {}
#     taxonID_uniprot = {}
    
#     taxonID_scientificUnder={}
        
#     uniprot_file = open(uniprot_table_address,'r')
#     for line in uniprot_file:
#         line_strip = line.strip()
#         if line_strip.startswith('Pro'):
#             pass
#         else:
#             line_parts = line_strip.split('\t')
#             uniprot = line_parts[0]
#             taxonID = line_parts[2]

#             taxonID_uniprot[uniprot]=taxonID
            
#             line_parts_1= line_parts[1].split("(")
#             if len(line_parts_1) > 1:

#                 scientific = line_parts_1[0]
#                 common = line_parts_1[1].strip()[:-1]
#             else:
#                 scientific = line_parts_1[0]
#                 common = line_parts_1[0]
            
#             scientific_taxonID[taxonID] = scientific
#             taxonID_scientific[scientific] = taxonID 

#             common_taxonID[taxonID] = common
#             scientificUnder='_'.join(scientific.split(" ")[:2])
#             taxonID_scientificUnder[scientificUnder]=taxonID

#     print("-- The map for uniprot taxonID of",len(scientific_taxonID),"records have read.") 
#     return (taxonID_scientificUnder)


In [2]:
#taxonID_scientificUnder = read_taxonID_uniprot_temp(uniprot_table_address)

# Load the UniProt-derived lookup tables.
(scientific_taxonID, taxonID_scientific, common_taxonID, taxonID_uniprot, taxonID_scientificUnder, scientificUnder_taxonID) =read_taxonID_uniprot(uniprot_table_address)

#(scientific_taxonID,taxonID_scientific,common_taxonID,taxonID_uniprot)=read_taxonID_uniprot(uniprot_table_address)

# Load the OMA lookup tables.
# NOTE(review): this rebinds `taxonID_scientific` and `scientific_taxonID`, so
# the UniProt versions assigned just above are no longer reachable afterwards.
(taxonID_omaID,omaID_taxonID,taxonID_scientific,scientific_taxonID)= read_omaID_file(omaID_address)


-- The map for uniprot taxonID of 101 records have read.
-- The map for OMA taxonID of 2424 records have read.


### Collapsing fast tree  -  support value _ branch length

In [None]:
# import dendropy

# fast_tree_address="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/primate_oma/v27_10/fastoma_core_nosplice/"

# tree_raw_address= fast_tree_address + "_100_msa_concatanated.txt.contree"

# support_treshold = 94
# length_threshold=1e-3 

# out_tree = collapse_tree_sup_branch(tree_raw_address, support_treshold,length_threshold )


# Reading FastOMA tree - collapsed ,  OMAid name

In [57]:

# Path of the collapsed FastOMA species tree (support <= 94 and branch length
# <= 0.001 collapsed, per the file-name suffix).
fast_tree_address = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/primate_oma/v27_10/fastoma_core_splice/"
fast_tree_address += "_100_msa_concatanated.txt.contree"+"_collapsed__94_0.001"

print(fast_tree_address)
# File size in kB (bytes / 1000, rounded).
print(round(os.path.getsize(fast_tree_address)/1000),"kb")

# Parse the Newick file with ete3 and report the tree's len().
fast_tree= ete3.Tree(fast_tree_address) # ,format=0
print("length of tree is ",len(fast_tree))


/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/primate_oma/v27_10/fastoma_core_splice/_100_msa_concatanated.txt.contree_collapsed__94_0.001
1 kb
length of tree is  35


In [58]:
# fast_tree.write()

In [59]:
# ##oma_standalon backbone species names
# import pyoma.browser.db as db
# project_folder_root="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/mammal/old/mml_v4/"
# oma_database_address=project_folder_root+"omamer_v4_database/oma_path_/OmaServer.h5"
# oma_db = db.Database(oma_database_address);
# print(oma_db.tax.as_dict())

### use scientific name for  backbone (S001)

# #dic_oma={'S0005':'rabbit', 'S0003':'human', 'S0006':'redfox', 'S0004':'opossum', 'S0001':'chicken','S0002':'frog'}
# #{'rabbit':'S0005', 'human':'S0003', 'redfox': 'S0006', 'opossum':'S0004', 'chicken':'S0001','frog':'S0002'}

# old/mml_v4/"
# dic_oma_db={'Oryctolagus_cuniculus':'S0007', 'Homo_sapiens':'S0003', 'Vulpes_vulpes':'S0008','Monodelphis_domestica':'S0006', 'Gallus_gallus':'S0002', 'Xenopus_tropicalis':'S0009','Latimeria_chalumnae': 'S0004', 'Mola_mola':'S0005', 'Danio_rerio': 'S0001'}
# dic_oma_db_=dict((v, k) for k, v in dic_oma_db.items())

# for node in fast_tree.traverse(strategy="postorder"):
#     if node.is_leaf() :    
#         node_name = node.name
#         if node_name in dic_oma_db_:            
#             node_name_ = dic_oma_db_[node_name]
#             node.name = node_name_+"_back"            

# fast_tree.write()        

### convert to taxonID 

In [60]:

# Relabel the leaves of the FastOMA tree from OMA IDs to taxon IDs, keeping
# track of which leaves could and could not be translated.
# NOTE: this edits `fast_tree` in place, so re-running the cell on an already
# relabelled tree sends every leaf to `no_list`.
list_species_tax_fast = []    # taxon IDs of the translated leaves
list_species_omaID_fast = []  # original name of every leaf
no_list = []                  # leaf names missing from omaID_taxonID
for node in fast_tree.traverse(strategy="postorder"):
    if not node.is_leaf():
        continue
    node_name = node.name
    list_species_omaID_fast.append(node_name)
    if node_name not in omaID_taxonID:
        no_list.append(node_name)
        continue
    node_name_tax = omaID_taxonID[node_name]
    node.name = node_name_tax
    list_species_tax_fast.append(node_name_tax)

print("number of leaves, species, is:",len(list_species_omaID_fast),len(list_species_tax_fast),len(no_list))

number of leaves, species, is: 35 25 10


In [61]:
# Leaf names that had no entry in omaID_taxonID.
no_list

['BOVIN_',
 'MOUSE_',
 'OTOGA_',
 'CARSF_',
 'SAIBB_',
 'GORGO_',
 'PANTR_',
 'MACNE_',
 'MANLE_',
 'COLAP_']

In [62]:
# Display the FastOMA tree as a Newick string after the leaf relabelling.
fast_tree.write()

'(336983:0.00172937,(((((((9913:0.00223615,BOVIN_:0.0137497)100:0.0863897,(10090:0.000391497,MOUSE_:0.00311982)100:0.10428)100:0.0146723,((30611:2.62966e-05,OTOGA_:0.000726701)100:0.0615912,(30608:0.0371289,379532:0.0265011)100:0.013476)100:0.0155775,(1868482:2.7073e-06,CARSF_:2.8823e-06)100:0.0674355)100:0.0248855,(((39432:0.00218759,SAIBB_:0.0070922)100:0.0190844,9483:0.0366017)97:0.00151199,37293:0.0532771)100:0.0142606)100:0.00783661,((((9595:0.0012583,GORGO_:0.0075994)100:0.0274976,((9598:0.00122899,PANTR_:0.00588478)100:0.00781253,9597:0.0230622)100:0.00168806,9606:0.0110641)100:0.00274361,9601:0.015492)100:0.00131714,61853:0.0382361)100:0.00531884)100:0.0116669,((9545:0.0022106,MACNE_:0.00679428)100:0.00742282,9541:0.0163475)96:0.00197187,(9568:0.000396765,MANLE_:0.00501159)100:0.00974183,9531:0.0245506,(60711:0.0107976,9555:0.0131583)96:0.00203856,9544:0.046456)100:0.00319826,(61621:0.013633,61622:0.00265876)100:0.0153811)100:0.0108101,COLAP_:0.0115106);'

In [63]:
# Keep only the leaves that were translated to a taxon ID; the untranslated
# (backbone) leaves are removed. Leaf counts are printed before and after.

print(len(fast_tree))
fast_tree.prune(list_species_tax_fast, preserve_branch_length=True)
print(len(fast_tree))

# Display the pruned tree as a Newick string.
fast_tree.write()

35
25


'(336983:0.00172937,((((((9913:0.0886258,10090:0.104671)100:0.0146723,((30608:0.0371289,379532:0.0265011)100:0.013476,30611:0.0616175)100:0.0155775,1868482:0.0674382)100:0.0248855,((9483:0.0366017,39432:0.021272)97:0.00151199,37293:0.0532771)100:0.0142606)100:0.00783661,((((9597:0.0230622,9598:0.00904152)100:0.00168806,9606:0.0110641,9595:0.0287559)100:0.00274361,9601:0.015492)100:0.00131714,61853:0.0382361)100:0.00531884)100:0.0116669,(9541:0.0163475,9545:0.00963342)96:0.00197187,9531:0.0245506,(60711:0.0107976,9555:0.0131583)96:0.00203856,9544:0.046456,9568:0.0101386)100:0.00319826,(61621:0.013633,61622:0.00265876)100:0.0153811)100:0.0108101);'

In [64]:
# list_sepcies_mammal = [10020, 10029, 10036, 10090, 10116, 10141, 10160, 10181, 103600, 116960, 118797, 127582, 132908, 13616, 1706337, 185453, 1868482, 191816, 230844, 2715852, 27622, 29073, 29088, 29139, 30522, 30538, 30611, 310752, 32536, 336983, 34884, 37032, 37293, 379532, 38626, 391180, 39432, 40151, 419612, 43179, 43346, 46360, 51298, 56216, 59463, 59472, 59479, 60711, 61621, 61622, 61853, 72004, 77932, 885580, 89673, 9258, 9305, 9365, 9402, 9407, 9483, 9515, 9531, 9541, 9544, 9545, 9555, 9568, 9595, 9597, 9598, 9601, 9606, 9615, 9627, 9643, 9646, 9669, 9678, 9685, 9696, 9708, 9713, 9739, 9749, 9755, 9764, 9770, 9785, 9796, 9823, 9838, 9880, 9886, 9913, 9915, 9925, 9940, 9986, 9995]
# #outgroups=[8364, 9031] # frog chicken
# list_sepcies_str=[str(i) for i in list_sepcies_mammal]

# print(len(fast_tree))
# fast_tree.prune(list_sepcies_str, preserve_branch_length=True)
# print(len(fast_tree))

# fast_tree.set_outgroup("9258") 
# # frog 8364
# # 9258  Ornithorhynchus anatinus (Duckbill platypus)


# Reading  NCBI tree

In [65]:
 
# Species set for the NCBI reference tree: the taxon IDs found in the FastOMA tree.
ncbi_taxon_list = list_species_tax_fast


# Build the NCBI taxonomy topology restricted to those taxa.
ncbi = ete3.NCBITaxa()  # first time download in ~/.etetoolkit/
ncbi_sub_tree = ncbi.get_topology(ncbi_taxon_list)

print(len(ncbi_sub_tree))

25


In [66]:
# Display the NCBI reference tree as a Newick string.
ncbi_sub_tree.write()

'((((((((((9597:1,9598:1)1:1,9606:1,9595:1)1:1,9601:1)1:1,61853:1)1:1,(((9541:1,9544:1,9545:1)1:1,60711:1,9531:1,9555:1,9568:1)1:1,((61621:1,61622:1)1:1,336983:1)1:1)1:1)1:1,((39432:1,9483:1)1:1,37293:1)1:1)1:1,1868482:1)1:1,((379532:1,30608:1)1:1,30611:1)1:1)1:1,10090:1)1:1,9913:1);'

# RF distance

In [11]:
def parition_editor(partitions_input,intersect_leaves):
    """Reduce leaf partitions to one canonical, comparable form per split.

    Each partition (an iterable of leaf names) describes one side of a split
    of ``intersect_leaves``. To make splits from differently rooted trees
    comparable, every non-trivial split is represented by:

    * its smaller side, or
    * when both sides have exactly the same size, the side that contains the
      pivot (the smallest leaf name),

    always as a *sorted* tuple. Sorting is what makes two equal splits compare
    equal: a tuple built directly from a set has no guaranteed order.

    Trivial partitions are dropped: sizes 0, 1, n-1 and n (n = number of
    leaves).

    Args:
        partitions_input: iterable of partitions, each an iterable of leaf names.
        intersect_leaves: collection of all leaf names shared by the trees.

    Returns:
        List of sorted tuples, one per retained partition, in input order.
        Duplicates are not removed; callers typically wrap the result in
        ``set``.
    """
    all_leaves = set(intersect_leaves)
    n_leaves = len(all_leaves)
    if n_leaves == 0:
        return []

    # Tie-breaker used when a split cuts the leaves exactly in half.
    pivot_species = min(all_leaves)

    partitions_ed = []
    for parition in partitions_input:
        members = set(parition)
        size = len(members)

        # Skip trivial splits (empty set, single leaf, or their complements).
        if size <= 1 or size >= n_leaves - 1:
            continue

        if 2 * size < n_leaves:
            canonical = members
        elif 2 * size > n_leaves:
            canonical = all_leaves - members
        elif pivot_species in members:
            # Exact half that contains the pivot: keep it as it is.
            canonical = members
        else:
            # Exact half without the pivot: use the other half.
            canonical = all_leaves - members

        partitions_ed.append(tuple(sorted(canonical)))

    return partitions_ed
        
        

In [None]:
# #fast_tree.set_outgroup("9913")  # bovin
# node=fast_tree.get_common_ancestor("9913","10090")

# fast_tree.set_outgroup("9913")  # bovin

In [67]:
# Root the FastOMA tree on taxon 9913 before comparing it with the NCBI tree.
fast_tree.set_outgroup("9913")  # bovin 9913  # 9258 ornithorhynchus anatinus  Platypus



# Leaf universe handed to parition_editor() when canonicalising partitions.
intersection_sets= set(ncbi_taxon_list)

# robinson_foulds() returns a 7-tuple, unpacked on the next line:
# RF distance, maximum RF, common leaf attributes, partitions of each tree
# and the discarded partitions of each tree.
out_fast_ncbi = fast_tree.robinson_foulds(ncbi_sub_tree)
(rf_fast_ncbi, maxparts_fast_ncbi, common_attrs, partitions_fast_raw, partitions_ncbi_raw, discard_t1, discard_t2)=out_fast_ncbi 
# Canonicalise both partition sets so they can be compared with set operations.
partitions_fast = set(parition_editor(partitions_fast_raw, intersection_sets))
partitions_ncbi = set(parition_editor(partitions_ncbi_raw, intersection_sets))

print("RF distance is %s over a total of %s" %(rf_fast_ncbi, maxparts_fast_ncbi))
print("in fast:",len(partitions_fast),", in ncbi:",len(partitions_ncbi)) # , "Common Partitions",len(common_attrs_paper),)
print("in both fast and ncbi:", len(partitions_ncbi & partitions_fast))
print("only in ncbi, not in fast:", len(partitions_ncbi-partitions_fast)) # order not sure  http://etetoolkit.org/docs/latest/tutorial/tutorial_trees.html#robinson-foulds-distance
print("only in fast, not in ncbi:", len(partitions_fast-partitions_ncbi))




RF distance is 5 over a total of 35
in fast: 16 , in ncbi: 17
in both fast and ncbi: 14
only in ncbi, not in fast: 3
only in fast, not in ncbi: 2


### The following may not be needed

## Trees with common name

In [68]:
# Relabel every leaf of the FastOMA tree as "<common name>_<taxon ID>".
# Leaves without a common name are collected in `no_list` and printed.
no_list = []
for node in fast_tree.traverse(strategy="postorder"):
    if not node.is_leaf():
        continue
    taxon_id = node.name
    if taxon_id in common_taxonID:
        node.name = common_taxonID[taxon_id] + "_" + taxon_id
    elif taxon_id == '30608':
        # Labelled by hand when it is absent from common_taxonID.
        node.name = "Lesser mouse lemur_30608"
    else:
        no_list.append(taxon_id)

print(no_list)

[]


In [69]:
# Relabel every leaf of the NCBI tree as "<common name>_<taxon ID>".
# Leaves without a common name are collected in `no_list` and printed.
no_list = []
for node in ncbi_sub_tree.traverse(strategy="postorder"):
    if not node.is_leaf():
        continue
    taxon_id = node.name
    if taxon_id in common_taxonID:
        node.name = common_taxonID[taxon_id] + "_" + taxon_id
    elif taxon_id == '30608':
        # Microcebus murinus 30608: labelled by hand when absent from common_taxonID.
        node.name = "Lesser mouse lemur_30608"
    else:
        no_list.append(taxon_id)

print(no_list)

[]


In [70]:
# Display the NCBI tree (leaves now carry common names) as a Newick string.
ncbi_sub_tree.write()

"((((((((((Pygmy chimpanzee_9597:1,Chimpanzee_9598:1)1:1,Human_9606:1,Western lowland gorilla_9595:1)1:1,Sumatran orangutan_9601:1)1:1,Northern white-cheeked gibbon_61853:1)1:1,(((Crab-eating macaque_9541:1,Rhesus macaque_9544:1,Pig-tailed macaque_9545:1)1:1,Green monkey_60711:1,Sooty mangabey_9531:1,Olive baboon_9555:1,Drill_9568:1)1:1,((Black snub-nosed monkey_61621:1,Golden snub-nosed monkey_61622:1)1:1,Peters' Angolan colobus_336983:1)1:1)1:1)1:1,((Bolivian squirrel monkey_39432:1,White-tufted-ear marmoset_9483:1)1:1,Ma's night monkey_37293:1)1:1)1:1,Philippine tarsier_1868482:1)1:1,((Coquerel's sifaka_379532:1,Lesser mouse lemur_30608:1)1:1,Small-eared galago_30611:1)1:1)1:1,Mouse_10090:1)1:1,Bovine_9913:1);"

In [71]:
# Display the FastOMA tree (leaves now carry common names) as a Newick string.
fast_tree.write()

"(Bovine_9913:0.0443129,(Mouse_10090:0.104671,(((Lesser mouse lemur_30608:0.0371289,Coquerel's sifaka_379532:0.0265011)100:0.013476,Small-eared galago_30611:0.0616175)100:0.0155775,Philippine tarsier_1868482:0.0674382,(((White-tufted-ear marmoset_9483:0.0366017,Bolivian squirrel monkey_39432:0.021272)97:0.00151199,Ma's night monkey_37293:0.0532771)100:0.0142606,(((((Pygmy chimpanzee_9597:0.0230622,Chimpanzee_9598:0.00904152)100:0.00168806,Human_9606:0.0110641,Western lowland gorilla_9595:0.0287559)100:0.00274361,Sumatran orangutan_9601:0.015492)100:0.00131714,Northern white-cheeked gibbon_61853:0.0382361)100:0.00531884,((Crab-eating macaque_9541:0.0163475,Pig-tailed macaque_9545:0.00963342)96:0.00197187,Sooty mangabey_9531:0.0245506,(Green monkey_60711:0.0107976,Olive baboon_9555:0.0131583)96:0.00203856,Rhesus macaque_9544:0.046456,Drill_9568:0.0101386,((Black snub-nosed monkey_61621:0.013633,Golden snub-nosed monkey_61622:0.00265876)100:0.0153811,Peters' Angolan colobus_336983:0.01253

## NCBI tree with underscore names

In [None]:
 
# Rebuild the NCBI reference tree from scratch so that its leaves carry plain
# taxon IDs again.
ncbi_taxon_list = list_species_tax_fast


ncbi = ete3.NCBITaxa()  # first time download in ~/.etetoolkit/
ncbi_sub_tree = ncbi.get_topology(ncbi_taxon_list)

print(len(ncbi_sub_tree))

In [None]:
# Relabel every leaf of the NCBI tree as "<Genus_species>_<taxon ID>".
# Leaves without an underscore name are collected in `no_list` and printed.
no_list = []
for node in ncbi_sub_tree.traverse(strategy="postorder"):
    if not node.is_leaf():
        continue
    taxon_id = node.name
    if taxon_id in scientificUnder_taxonID:
        node.name = scientificUnder_taxonID[taxon_id] + "_" + taxon_id
    elif taxon_id == '30608':
        # Labelled by hand when absent from scientificUnder_taxonID
        # (common name: "Lesser mouse lemur_30608").
        node.name = "Microcebus_murinus_30608"
    else:
        no_list.append(taxon_id)

print(no_list)

In [None]:
# Display the NCBI tree with underscore names as a Newick string.
ncbi_sub_tree.write()

## Standard OMA  -slow, with underscore

In [None]:
# tree_raw_address= "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/primate_oma/standard/"
# tree_raw_address += "concatanted.fasta.contree"

# support_treshold = 94
# length_threshold=1e-3 

# out_tree = collapse_tree_sup_branch(tree_raw_address, support_treshold,length_threshold )


In [24]:

# Path of the collapsed standard-OMA ("slow") species tree.
slow_tree_address="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/primate_oma/standard/"
slow_tree_address=slow_tree_address+"concatanted.fasta.contree_collapsed__94_0.001"

# Parse the Newick file with ete3 and print the tree's len().
slow_tree= ete3.Tree(slow_tree_address) # ,format=0
print(len(slow_tree))


25


In [25]:
# Display the standard-OMA tree as a Newick string.
slow_tree.write()

'(Cercocebus_atys:0.00669107,((Chlorocebus_sabaeus:0.00449395,((Colobus_angolensis_palliatus:0.0092681,(Rhinopithecus_bieti:0.00506623,Rhinopithecus_roxellana:0.00289074)100:0.00308848)100:0.00208691,(((Gorilla_gorilla_gorilla:0.0124626,Homo_sapiens:0.00232641,(Pan_paniscus:0.00289673,Pan_troglodytes:0.00269454)100:0.00343484)100:0.00275311,Pongo_abelii:0.0103617,Nomascus_leucogenys:0.0157206)100:0.00374624,(((Aotus_nancymaae:0.0137647,Saimiri_boliviensis_boliviensis:0.0105541)99:0.00117242,Callithrix_jacchus:0.0187279)100:0.0127103,(Carlito_syrichta:0.0536053,(((Microcebus_murinus:0.0231193,Propithecus_coquereli:0.0201631)100:0.0101179,Otolemur_garnettii:0.0478609)100:0.00893513,(Bos_taurus:0.0563131,Mus_musculus:0.0892101)100:0.00807406)100:0.0033399)100:0.0169115)100:0.00580039)100:0.0053156)100:0.00136178)100:0.00102458,Papio_anubis:0.0117063,Macaca_fascicularis:0.00241916,Macaca_nemestrina:0.00454226,Macaca_mulatta:0.0173844)100:0.00123758,Mandrillus_leucophaeus:0.00729859);'

In [26]:
# Three-word leaf names of the slow tree mapped onto their two-word form,
# because taxonID_scientificUnder is keyed by the first two words only.
dic_threeparts= {"Colobus_angolensis_palliatus":"Colobus_angolensis", "Gorilla_gorilla_gorilla":"Gorilla_gorilla",
                "Saimiri_boliviensis_boliviensis":"Saimiri_boliviensis"}

## Slow tree: underscore names to taxon IDs

In [28]:

# Relabel the leaves of the standard-OMA tree from underscore scientific names
# to taxon IDs. Resolution order: direct lookup, then the three-word ->
# two-word alias table, then one hand-coded species.
list_species_tax_slow = []    # taxon IDs of the translated leaves
list_species_omaID_slow = []  # original name of every leaf
no_list_slow = []             # leaf names that could not be translated
for node in slow_tree.traverse(strategy="postorder"):
    if not node.is_leaf():
        continue
    node_name = node.name
    list_species_omaID_slow.append(node_name)

    if node_name in taxonID_scientificUnder:
        node_name_tax = taxonID_scientificUnder[node_name]
    elif node_name in dic_threeparts:
        node_name2 = dic_threeparts[node_name]
        node_name_tax = taxonID_scientificUnder[node_name2]
    elif node_name == "Microcebus_murinus":
        # Microcebus murinus 30608
        node_name_tax = "30608"
    else:
        no_list_slow.append(node_name)
        continue

    node.name = node_name_tax
    list_species_tax_slow.append(node_name_tax)

print("number of leaves, species, is:",len(list_species_omaID_slow),len(list_species_tax_slow),len(no_list_slow))


number of leaves, species, is: 25 25 0


In [29]:
# Leaf names of the slow tree that could not be mapped to a taxon ID.
no_list_slow

[]

In [30]:
# Display the standard-OMA tree with taxon-ID leaves as a Newick string.
slow_tree.write()

'(9531:0.00669107,((60711:0.00449395,((336983:0.0092681,(61621:0.00506623,61622:0.00289074)100:0.00308848)100:0.00208691,(((9595:0.0124626,9606:0.00232641,(9597:0.00289673,9598:0.00269454)100:0.00343484)100:0.00275311,9601:0.0103617,61853:0.0157206)100:0.00374624,(((37293:0.0137647,39432:0.0105541)99:0.00117242,9483:0.0187279)100:0.0127103,(1868482:0.0536053,(((30608:0.0231193,379532:0.0201631)100:0.0101179,30611:0.0478609)100:0.00893513,(9913:0.0563131,10090:0.0892101)100:0.00807406)100:0.0033399)100:0.0169115)100:0.00580039)100:0.0053156)100:0.00136178)100:0.00102458,9555:0.0117063,9541:0.00241916,9545:0.00454226,9544:0.0173844)100:0.00123758,9568:0.00729859);'

### Make sure the fast and NCBI trees are labelled with taxon IDs; otherwise re-run from the beginning

In [44]:
# Sanity check: the leaves shown here must be taxon IDs, not common names.
fast_tree.write()

'(9595:2.4974e-06,((((((((10090:0.216591,9913:0.167672)100:0.0267584,((30608:0.0820969,379532:0.0449706)100:0.0289214,30611:0.117716)100:0.0290836)100:0.0142417,1868482:0.149012)100:0.0541056,((37293:0.037888,9483:0.067527)100:0.00644747,39432:0.0305257)100:0.0361803)100:0.0171856,((((9544:0.0326042,9555:0.0190522)100:0.00813149,60711:0.0249677)100:0.00678587,((9545:0.00449408,9541:0.00491265)100:0.00546981,(9531:0.0145259,9568:0.0176014)100:0.00225342)98:0.00190155)100:0.00748055,((61621:0.0132501,61622:0.00969258)100:0.00947199,336983:0.0178891)100:0.00583219)100:0.020963)100:0.011959,61853:0.0316329)100:0.00294442,9601:0.0317007)100:0.0068013,((9598:2.1611e-06,9597:0.0146491)100:0.00685148,9606:0.00712411)100:0.00429428)100:0.0226656);'

In [48]:
# Sanity check: the leaves shown here must be taxon IDs, not common names.
ncbi_sub_tree.write()

'((((((((((9597:1,9598:1)1:1,9606:1,9595:1)1:1,9601:1)1:1,61853:1)1:1,(((9541:1,9544:1,9545:1)1:1,60711:1,9531:1,9555:1,9568:1)1:1,((61621:1,61622:1)1:1,336983:1)1:1)1:1)1:1,((39432:1,9483:1)1:1,37293:1)1:1)1:1,1868482:1)1:1,((379532:1,30608:1)1:1,30611:1)1:1)1:1,10090:1)1:1,9913:1);'

In [45]:


# Taxon IDs present in both the standard-OMA and the FastOMA tree.
intersection_slow_fast=set(list_species_tax_slow) & set(list_species_tax_fast)

# Prints len(fast_tree), len(slow_tree) and the number of FastOMA taxon IDs.
print("length of tree is ",len(fast_tree),len(slow_tree),len(list_species_tax_fast))


length of tree is  25 25 25


In [49]:
# Restrict all three trees to the shared species so they are directly
# comparable. prune() edits each tree in place.
fast_tree.prune(intersection_slow_fast, preserve_branch_length=True)

slow_tree.prune(intersection_slow_fast, preserve_branch_length=True)
ncbi_sub_tree.prune(intersection_slow_fast, preserve_branch_length=True)

# The three sizes printed here should be identical after pruning.
print(len(fast_tree),len(slow_tree),len(ncbi_sub_tree))


25 25 25


In [None]:
# Display the pruned FastOMA tree as a Newick string.
fast_tree.write()

In [None]:
# Display the pruned standard-OMA tree as a Newick string.
slow_tree.write()

In [None]:
# Display the pruned NCBI tree as a Newick string.
ncbi_sub_tree.write()

In [50]:
# Root the standard-OMA ("slow") tree on taxon 9913 before comparing it with
# the NCBI tree.
slow_tree.set_outgroup("9913")

# robinson_foulds() returns a 7-tuple: RF distance, maximum RF, common leaf
# attributes, partitions of each tree and discarded partitions of each tree.
out_slow_ncbi = slow_tree.robinson_foulds(ncbi_sub_tree)
(rf_slow_ncbi, maxparts_slow_ncbi, common_attrs, partitions_slow_raw, partitions_ncbi_raw, discard_t1, discard_t2) = out_slow_ncbi

# The trees were pruned to `intersection_slow_fast`, so that set is the leaf
# universe for canonicalising partitions. `intersection_sets` comes from an
# earlier cell and is only correct while both sets happen to be equal.
partitions_slow = set(parition_editor(partitions_slow_raw, intersection_slow_fast))
partitions_ncbi = set(parition_editor(partitions_ncbi_raw, intersection_slow_fast))

print("RF distance is %s over a total of %s" %(rf_slow_ncbi, maxparts_slow_ncbi))
print("in slow:",len(partitions_slow),", in ncbi:",len(partitions_ncbi)) # , "Common Partitions",len(common_attrs_paper),)
print("in both slow and ncbi:", len(partitions_ncbi & partitions_slow))
print("only in ncbi, not in slow:", len(partitions_ncbi-partitions_slow)) # order not sure  http://etetoolkit.org/docs/latest/tutorial/tutorial_trees.html#robinson-foulds-distance
print("only in slow, not in ncbi:", len(partitions_slow-partitions_ncbi))


RF distance is 6 over a total of 36
in slow: 17 , in ncbi: 17
in both slow and ncbi: 14
only in ncbi, not in slow: 3
only in slow, not in ncbi: 3


In [53]:
# Root the FastOMA tree on taxon 9913 before comparing it with the NCBI tree.
fast_tree.set_outgroup("9913")

# robinson_foulds() returns a 7-tuple: RF distance, maximum RF, common leaf
# attributes, partitions of each tree and discarded partitions of each tree.
out_fast_ncbi = fast_tree.robinson_foulds(ncbi_sub_tree)
(rf_fast_ncbi, maxparts_fast_ncbi, common_attrs, partitions_fast_raw, partitions_ncbi_raw, discard_t1, discard_t2) = out_fast_ncbi

# The trees were pruned to `intersection_slow_fast`, so that set is the leaf
# universe for canonicalising partitions. `intersection_sets` comes from an
# earlier cell and is only correct while both sets happen to be equal.
partitions_fast = set(parition_editor(partitions_fast_raw, intersection_slow_fast))
partitions_ncbi = set(parition_editor(partitions_ncbi_raw, intersection_slow_fast))

print("RF distance is %s over a total of %s" %(rf_fast_ncbi, maxparts_fast_ncbi))
print("in fast:",len(partitions_fast),", in ncbi:",len(partitions_ncbi)) # , "Common Partitions",len(common_attrs_paper),)
print("in both fast and ncbi:", len(partitions_ncbi & partitions_fast))
print("only in ncbi, not in fast:", len(partitions_ncbi-partitions_fast)) # order not sure  http://etetoolkit.org/docs/latest/tutorial/tutorial_trees.html#robinson-foulds-distance
print("only in fast, not in ncbi:", len(partitions_fast-partitions_ncbi))




RF distance is 9 over a total of 41
in fast: 22 , in ncbi: 17
in both fast and ncbi: 15
only in ncbi, not in fast: 2
only in fast, not in ncbi: 7


In [54]:
# Sizes of the seven regions of the three-set Venn diagram built from the
# FastOMA, standard-OMA ("slow") and NCBI partition sets.
for label, region in (
    ("only in fastOMA:", partitions_fast - (partitions_ncbi | partitions_slow)),
    ("only in fastOMA & slow:", (partitions_fast & partitions_slow) - partitions_ncbi),
    ("only in fastOMA & NCBI:", (partitions_fast & partitions_ncbi) - partitions_slow),
    ("only in slow & NCBI:", (partitions_slow & partitions_ncbi) - partitions_fast),
    ("only in fastOMA & NCBI & slow:", partitions_fast & partitions_ncbi & partitions_slow),
    ("only in slow:", partitions_slow - (partitions_fast | partitions_ncbi)),
    ("only in NCBI:", partitions_ncbi - (partitions_fast | partitions_slow)),
):
    print(label, len(region))


only in fastOMA: 6
only in fastOMA & slow: 1
only in fastOMA & NCBI: 1
only in slow & NCBI: 0
only in fastOMA & NCBI & slow: 14
only in slow: 2
only in NCBI: 2


## slow with common name

In [55]:
# Relabel every leaf of the standard-OMA tree as "<common name>_<taxon ID>".
# Leaves without a common name are collected in `no_list` and printed.
no_list = []
for node in slow_tree.traverse(strategy="postorder"):
    if not node.is_leaf():
        continue
    taxon_id = node.name
    if taxon_id in common_taxonID:
        node.name = common_taxonID[taxon_id] + "_" + taxon_id
    elif taxon_id == '30608':
        # Microcebus murinus 30608: labelled by hand when absent from common_taxonID.
        node.name = "Lesser mouse lemur_30608"
    else:
        no_list.append(taxon_id)

print(no_list)

[]


In [56]:
# Display the standard-OMA tree (leaves now carry common names) as a Newick string.
slow_tree.write()

"(Bovine_9913:0.0281565,(Mouse_10090:0.0892101,(((Lesser mouse lemur_30608:0.0231193,Coquerel's sifaka_379532:0.0201631)100:0.0101179,Small-eared galago_30611:0.0478609)100:0.00893513,(Philippine tarsier_1868482:0.0536053,(((Ma's night monkey_37293:0.0137647,Bolivian squirrel monkey_39432:0.0105541)99:0.00117242,White-tufted-ear marmoset_9483:0.0187279)100:0.0127103,(((Western lowland gorilla_9595:0.0124626,Human_9606:0.00232641,(Pygmy chimpanzee_9597:0.00289673,Chimpanzee_9598:0.00269454)100:0.00343484)100:0.00275311,Sumatran orangutan_9601:0.0103617,Northern white-cheeked gibbon_61853:0.0157206)100:0.00374624,((Peters' Angolan colobus_336983:0.0092681,(Black snub-nosed monkey_61621:0.00506623,Golden snub-nosed monkey_61622:0.00289074)100:0.00308848)100:0.00208691,(Green monkey_60711:0.00449395,(Olive baboon_9555:0.0117063,Crab-eating macaque_9541:0.00241916,Pig-tailed macaque_9545:0.00454226,Rhesus macaque_9544:0.0173844,(Sooty mangabey_9531:0.00669107,Drill_9568:0.00729859)100:0.001

### three way comparison

In [None]:
# # fast_tree= ete3.Tree(fast_treee_address_out, format=1)
# # fast_tree.prune(intersect_leaves, preserve_branch_length=True)

# # fast_tree.set_outgroup(outgroup) # CALPUG MOUS  BUCCAP CHLMAC

# out_paper_ncbi = bird_paper_tree.robinson_foulds(ncbi_sub_tree) # , expand_polytomies = True  #,polytomy_size_limit=20  ,unrooted_trees=True # , expand_polytomies = True
# (rf_paper_ncbi, maxparts_paper_ncbi, common_attrs, partitions_paper_raw, partitions_ncbi_raw, discard_t1, discard_t2)=out_paper_ncbi 
# partitions_paper = set(parition_editor(partitions_paper_raw, intersect_leaves))
# partitions_ncbi = set(parition_editor(partitions_ncbi_raw, intersect_leaves))

# out_fast_ncbi = fast_tree.robinson_foulds(ncbi_sub_tree)  # ,unrooted_trees=True # , expand_polytomies = True
# (rf_fast_ncbi, maxparts_fast_ncbi, common_attrs, partitions_fast_raw, partitions_ncbi_raw2, discard_t1, discard_t2)=out_fast_ncbi 
# partitions_fast = set(parition_editor(partitions_fast_raw, intersect_leaves))
# partitions_ncbi2 = set(parition_editor(partitions_ncbi_raw2, intersect_leaves))
# assert len(partitions_ncbi2-partitions_ncbi)==0
# out_fast_paper = fast_tree.robinson_foulds(bird_paper_tree)  # ,unrooted_trees=True # , expand_polytomies = True
# (rf_fast_paper, maxparts_fast_paper, common_attrs, partitions_fast_raw2, partitions_paper_raw2, discard_t1, discard_t2)=out_fast_paper
# partitions_paper2 = set(parition_editor(partitions_paper_raw2, intersect_leaves))
# partitions_fast2 = set(parition_editor(partitions_fast_raw2, intersect_leaves))
# assert len(partitions_fast2-partitions_fast)==0
# assert len(partitions_paper2-partitions_paper)==0


# values_comparison=[len((partitions_fast - partitions_ncbi) - partitions_paper),
#                    len((partitions_fast & partitions_paper) - partitions_ncbi2),
#                    len((partitions_fast & partitions_ncbi) - partitions_paper),
#                    len((partitions_paper & partitions_ncbi) -  partitions_fast),
#                    len((partitions_fast & partitions_ncbi) &  partitions_paper),
#                    len((partitions_paper - partitions_fast) - partitions_ncbi),
#                    len((partitions_ncbi - partitions_fast) - partitions_paper) ]
# print(values_comparison)

# ****

# others

### plot different collapsing  support value


In [None]:

# ---------------------------------------------------------------------------
# DISABLED CELL: every line below is commented out and nothing is executed.
#
# What the code would do if re-enabled:
#   * Build a list of file-name suffixes: "" (no collapsing) followed by
#     "_collapsed_<t>" for t in tresh_list = [60, 80, 90, 95, 100].
#   * For each suffix, load the ete3 tree at
#     project_folder_root + "iqtree5/_100_msa_concatanated.txt_copy1.contree" + suffix,
#     prune it to intersect_leaves and root it with set_outgroup(outgroup).
#   * Call robinson_foulds for three tree pairs (paper vs NCBI, fast vs NCBI,
#     fast vs paper) and pass the returned partition lists through
#     parition_editor(..., intersect_leaves) before converting them to sets.
#   * Count seven set regions per suffix, in this order (same order as the
#     column labels in lenged1): fast only, fast & paper without NCBI,
#     fast & NCBI without paper, paper & NCBI without fast, all three,
#     paper only, NCBI only.
#   * Stack the counts into a NumPy array and show them as a pandas DataFrame
#     with one row per suffix (index: "no" + tresh_list).
#
# Names it needs from other cells: intersect_leaves, outgroup,
# bird_paper_tree, ncbi_sub_tree, parition_editor.
#
# NOTE(review): the paper-vs-NCBI comparison uses neither the loop variable nor
#   fast_tree, so it is recomputed unchanged on every iteration.
# NOTE(review): the assert statements only check that the second set of
#   partitions is a subset of the first (empty set difference), not equality;
#   the second count uses partitions_ncbi2 while the others use partitions_ncbi.
# NOTE(review): "import pandas as pd" sits inside this cell rather than in the
#   import cell at the top of the notebook.
# ---------------------------------------------------------------------------
# tresh=60 
# project_folder_root="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/v3b/"
# tresh_list=[60,80,90,95,100]
# tresh_list_name=["_collapsed_"+str(i) for i in tresh_list] 
# tresh_list_name=[""]+tresh_list_name

# values_comparison_all=[]
# for tresh in tresh_list_name:
#     fast_tree_address=project_folder_root+"iqtree5/_100_msa_concatanated.txt_copy1.contree"+tresh

#     fast_tree= ete3.Tree(fast_tree_address)
#     #print("len fast tree",len(fast_tree))


#     fast_tree.prune(intersect_leaves)
#     #print("len fast tree after prune",len(fast_tree))

#     fast_tree.set_outgroup(outgroup) # CALPUG MOUS  BUCCAP  CHLMAC

#     out_paper_ncbi = bird_paper_tree.robinson_foulds(ncbi_sub_tree) # , expand_polytomies = True  #,polytomy_size_limit=20  ,unrooted_trees=True # , expand_polytomies = True
#     (rf_paper_ncbi, maxparts_paper_ncbi, common_attrs, partitions_paper_raw, partitions_ncbi_raw, discard_t1, discard_t2)=out_paper_ncbi 
#     partitions_paper = set(parition_editor(partitions_paper_raw, intersect_leaves))
#     partitions_ncbi = set(parition_editor(partitions_ncbi_raw, intersect_leaves))

#     out_fast_ncbi = fast_tree.robinson_foulds(ncbi_sub_tree)  # ,unrooted_trees=True # , expand_polytomies = True
#     (rf_fast_ncbi, maxparts_fast_ncbi, common_attrs, partitions_fast_raw, partitions_ncbi_raw2, discard_t1, discard_t2)=out_fast_ncbi 
#     partitions_fast = set(parition_editor(partitions_fast_raw, intersect_leaves))
#     partitions_ncbi2 = set(parition_editor(partitions_ncbi_raw2, intersect_leaves))
#     assert len(partitions_ncbi2-partitions_ncbi)==0
#     out_fast_paper = fast_tree.robinson_foulds(bird_paper_tree)  # ,unrooted_trees=True # , expand_polytomies = True
#     (rf_fast_paper, maxparts_fast_paper, common_attrs, partitions_fast_raw2, partitions_paper_raw2, discard_t1, discard_t2)=out_fast_paper
#     partitions_paper2 = set(parition_editor(partitions_paper_raw2, intersect_leaves))
#     partitions_fast2 = set(parition_editor(partitions_fast_raw2, intersect_leaves))
#     assert len(partitions_fast2-partitions_fast)==0
#     assert len(partitions_paper2-partitions_paper)==0


#     values_comparison=[len((partitions_fast - partitions_ncbi) - partitions_paper),
#                        len((partitions_fast & partitions_paper) - partitions_ncbi2),
#                        len((partitions_fast & partitions_ncbi) - partitions_paper),
#                        len((partitions_paper & partitions_ncbi) -  partitions_fast),
#                        len((partitions_fast & partitions_ncbi) &  partitions_paper),
#                        len((partitions_paper - partitions_fast) - partitions_ncbi),
#                        len((partitions_ncbi - partitions_fast) - partitions_paper) ]
#     #sum1=values_comparison[0]+values_comparison[4]+values_comparison[5]+values_comparison[6]
#     print(values_comparison) # ,sum1
#     values_comparison_all.append(values_comparison)
# values_comparison_all=np.array(values_comparison_all)
# #print(values_comparison_all)

# #t_all=np.transpose(values_comparison_all)
# lenged1=['fastOMA','fastOMA & paper',  'fastOMA & NCBI',
#   'paper & NCBI', 'fastOMA & NCBI & paper',  'paper', 'NCBI']

# import pandas as pd
# df = pd.DataFrame(data=values_comparison_all)
# df.columns=lenged1   #ind = [0,60,80,90,95,100] # ,"BirdPaper"
# df.index=["no"]+tresh_list
# df

### Plot for different branch-length collapsing thresholds

In [None]:

# ---------------------------------------------------------------------------
# DISABLED CELL: every line below is commented out and nothing is executed.
#
# What the code would do if re-enabled:
#   * Build a list of file-name suffixes: "" (no collapsing) followed by
#     "_collapsed_brnch_<t>" for t in
#     tresh_list = [1e-05, 0.0001, 0.001, 0.005, 0.01, 0.1].
#   * For each suffix, load the ete3 tree at
#     project_folder_root + "iqtree5/_100_msa_concatanated.txt_copy1.contree" + suffix,
#     prune it to intersect_leaves and root it with set_outgroup(outgroup).
#   * Call robinson_foulds for three tree pairs (paper vs NCBI, fast vs NCBI,
#     fast vs paper) and pass the returned partition lists through
#     parition_editor(..., intersect_leaves) before converting them to sets.
#   * Count seven set regions per suffix, in this order (same order as the
#     column labels in lenged1): fast only, fast & paper without NCBI,
#     fast & NCBI without paper, paper & NCBI without fast, all three,
#     paper only, NCBI only.
#   * Stack the counts into a NumPy array and show them as a pandas DataFrame
#     with one row per suffix (index: "no" + tresh_list).
#
# Names it needs from other cells: intersect_leaves, outgroup,
# bird_paper_tree, ncbi_sub_tree, parition_editor.
#
# NOTE(review): the suffix is built with str(i), so the file names depend on
#   Python's float formatting of each threshold — confirm they match the
#   files on disk.
# NOTE(review): the paper-vs-NCBI comparison uses neither the loop variable nor
#   fast_tree, so it is recomputed unchanged on every iteration.
# NOTE(review): the assert statements only check that the second set of
#   partitions is a subset of the first (empty set difference), not equality;
#   the second count uses partitions_ncbi2 while the others use partitions_ncbi.
# NOTE(review): the trailing "#ind = [0,60,80,90,95,100]" remark on the
#   df.columns line lists support-value thresholds, not the branch-length
#   thresholds used in this cell.
# ---------------------------------------------------------------------------
# #tresh=60 
# project_folder_root="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/v3b/"
# tresh_list=[1e-05,0.0001,0.001,0.005,0.01,0.1]
# tresh_list_name=["_collapsed_brnch_"+str(i) for i in tresh_list] 
# tresh_list_name=[""]+tresh_list_name

# values_comparison_all=[]
# for tresh in tresh_list_name:
#     fast_tree_address=project_folder_root+"iqtree5/_100_msa_concatanated.txt_copy1.contree"+tresh

#     fast_tree= ete3.Tree(fast_tree_address)
#     #print("len fast tree",len(fast_tree))


#     fast_tree.prune(intersect_leaves)
#     #print("len fast tree after prune",len(fast_tree))

#     fast_tree.set_outgroup(outgroup) # CALPUG MOUS  BUCCAP

#     out_paper_ncbi = bird_paper_tree.robinson_foulds(ncbi_sub_tree) # , expand_polytomies = True  #,polytomy_size_limit=20  ,unrooted_trees=True # , expand_polytomies = True
#     (rf_paper_ncbi, maxparts_paper_ncbi, common_attrs, partitions_paper_raw, partitions_ncbi_raw, discard_t1, discard_t2)=out_paper_ncbi 
#     partitions_paper = set(parition_editor(partitions_paper_raw, intersect_leaves))
#     partitions_ncbi = set(parition_editor(partitions_ncbi_raw, intersect_leaves))

#     out_fast_ncbi = fast_tree.robinson_foulds(ncbi_sub_tree)  # ,unrooted_trees=True # , expand_polytomies = True
#     (rf_fast_ncbi, maxparts_fast_ncbi, common_attrs, partitions_fast_raw, partitions_ncbi_raw2, discard_t1, discard_t2)=out_fast_ncbi 
#     partitions_fast = set(parition_editor(partitions_fast_raw, intersect_leaves))
#     partitions_ncbi2 = set(parition_editor(partitions_ncbi_raw2, intersect_leaves))
#     assert len(partitions_ncbi2-partitions_ncbi)==0
#     out_fast_paper = fast_tree.robinson_foulds(bird_paper_tree)  # ,unrooted_trees=True # , expand_polytomies = True
#     (rf_fast_paper, maxparts_fast_paper, common_attrs, partitions_fast_raw2, partitions_paper_raw2, discard_t1, discard_t2)=out_fast_paper
#     partitions_paper2 = set(parition_editor(partitions_paper_raw2, intersect_leaves))
#     partitions_fast2 = set(parition_editor(partitions_fast_raw2, intersect_leaves))
#     assert len(partitions_fast2-partitions_fast)==0
#     assert len(partitions_paper2-partitions_paper)==0


#     values_comparison=[len((partitions_fast - partitions_ncbi) - partitions_paper),
#                        len((partitions_fast & partitions_paper) - partitions_ncbi2),
#                        len((partitions_fast & partitions_ncbi) - partitions_paper),
#                        len((partitions_paper & partitions_ncbi) -  partitions_fast),
#                        len((partitions_fast & partitions_ncbi) &  partitions_paper),
#                        len((partitions_paper - partitions_fast) - partitions_ncbi),
#                        len((partitions_ncbi - partitions_fast) - partitions_paper) ]
#     print(values_comparison)
#     values_comparison_all.append(values_comparison)
# values_comparison_all=np.array(values_comparison_all)
# #print(values_comparison_all)

# #t_all=np.transpose(values_comparison_all)
# lenged1=['fastOMA','fastOMA & paper',  'fastOMA & NCBI',
#   'paper & NCBI', 'fastOMA & NCBI & paper',  'paper', 'NCBI']

# import pandas as pd
# df = pd.DataFrame(data=values_comparison_all)
# df.columns=lenged1   #ind = [0,60,80,90,95,100] # ,"BirdPaper"
# df.index=["no"]+tresh_list
# df