In [1]:
#!/usr/bin/python3
import os
import numpy as np
from sys import argv
from datetime import datetime
# import matplotlib
# matplotlib.use('Agg')
import matplotlib.pyplot as plt
import ete3
#import dendropy
from dendropy import Tree
# Root folder holding the reference tables used by this notebook.
datasets_address="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/archive/"
# oma_database_address = datasets_address + "OmaServer.h5"
# Tab-separated OMA species table; parsed by read_omaID_file().
omaID_address = datasets_address+"oma-species.txt"
#bird6ID_address = datasets_address+"info.tsv"
# Tab-separated UniProt proteome table; parsed by read_taxonID_uniprot().
uniprot_table_address = datasets_address+"proteomes_taxonomy_Mammalia_40674_refyes.tab"


def read_omaID_file(omaID_address):
    """Parse the OMA species table into lookup dictionaries.

    Every record is a tab-separated line whose first three columns are read
    as: OMA species code, taxon ID, scientific name.  Lines starting with
    ``#`` and blank lines are skipped.

    Parameters
    ----------
    omaID_address : str
        Path of the OMA species file.

    Returns
    -------
    tuple of dict
        ``(taxonID_omaID, omaID_taxonID, taxonID_scientific,
        scientific_taxonID)``; all keys and values are the strings found in
        the file.

    Raises
    ------
    IndexError
        If a record has fewer than three tab-separated columns.

    Notes
    -----
    Limitation: strains and isolates are not distinguished.  When a key is
    repeated, the last record read wins.
    """
    taxonID_omaID = {}
    omaID_taxonID = {}
    scientific_taxonID = {}
    taxonID_scientific = {}

    # The context manager closes the file even if a malformed record raises.
    with open(omaID_address, 'r') as omaID_file:
        for line in omaID_file:
            line_strip = line.strip()
            # Skip header/comment lines and empty lines (e.g. a trailing
            # newline at the end of the file).
            if not line_strip or line_strip.startswith('#'):
                continue

            line_parts = line_strip.split('\t')
            omaID = line_parts[0]
            taxonID = line_parts[1]
            scientific = line_parts[2]

            taxonID_omaID[taxonID] = omaID
            omaID_taxonID[omaID] = taxonID
            taxonID_scientific[taxonID] = scientific
            scientific_taxonID[scientific] = taxonID

    print("-- The map for OMA taxonID of",len(taxonID_omaID),"records have read.") 

    return (taxonID_omaID,omaID_taxonID,taxonID_scientific,scientific_taxonID)


def read_taxonID_uniprot(uniprot_table_address):
    """Parse a UniProt proteome table into lookup dictionaries.

    Every record is a tab-separated line whose first three columns are read
    as: proteome ID, organism name, taxon ID.  Lines starting with ``Pro``
    (the header) and blank lines are skipped.

    The organism column is split on ``(``: the text before the first ``(`` is
    used as the scientific name (kept verbatim, including any trailing
    space) and the text of the first parenthesised group as the common name.
    Without parentheses the whole column is used for both.

    Parameters
    ----------
    uniprot_table_address : str
        Path of the UniProt proteome table.

    Returns
    -------
    tuple of dict
        In this order (note that several names read "backwards" relative to
        their content; they are kept for compatibility with existing callers):

        * ``scientific_taxonID``      : taxon ID -> scientific name
        * ``taxonID_scientific``      : scientific name -> taxon ID
        * ``common_taxonID``          : taxon ID -> common name
        * ``taxonID_uniprot``         : proteome ID -> taxon ID
        * ``taxonID_scientificUnder`` : ``Genus_species`` -> taxon ID
        * ``scientificUnder_taxonID`` : taxon ID -> ``Genus_species``

    Raises
    ------
    IndexError
        If a record has fewer than three tab-separated columns.
    """
    taxonID_scientific = {}
    scientific_taxonID = {}
    common_taxonID = {}
    taxonID_uniprot = {}
    taxonID_scientificUnder = {}
    scientificUnder_taxonID = {}

    # The context manager guarantees the file is closed (the handle used to
    # be left open).
    with open(uniprot_table_address, 'r') as uniprot_file:
        for line in uniprot_file:
            line_strip = line.strip()
            # Skip the header row and empty lines.
            if not line_strip or line_strip.startswith('Pro'):
                continue

            line_parts = line_strip.split('\t')
            uniprot = line_parts[0]
            taxonID = line_parts[2]
            taxonID_uniprot[uniprot] = taxonID

            name_parts = line_parts[1].split("(")
            scientific = name_parts[0]
            if len(name_parts) > 1:
                # Drop the closing ")" of the first parenthesised group.
                common = name_parts[1].strip()[:-1]
            else:
                common = scientific

            scientific_taxonID[taxonID] = scientific
            taxonID_scientific[scientific] = taxonID
            common_taxonID[taxonID] = common

            # Only the first two space-separated words are kept, so
            # subspecies/strain suffixes collapse onto the same key.
            scientificUnder = '_'.join(scientific.split(" ")[:2])
            taxonID_scientificUnder[scientificUnder] = taxonID
            scientificUnder_taxonID[taxonID] = scientificUnder

    print("-- WARNING, for taxonID_scientificUnder, only the first two parts of species name are used!!! ")
    print("-- The map for uniprot taxonID of",len(scientific_taxonID),"records have read.") 
    return (scientific_taxonID, taxonID_scientific, common_taxonID, taxonID_uniprot, taxonID_scientificUnder, scientificUnder_taxonID)


def collapse_tree_sup_branch(tree_raw_address, support_treshold, length_threshold ):
    """Collapse weakly supported and very short internal branches of a tree.

    The Newick tree at ``tree_raw_address`` is read with DendroPy, then:

    1. every labelled internal node whose label, read as a number, is
       ``<= support_treshold`` has its subtending edge collapsed;
    2. every internal edge whose length is missing or
       ``<= length_threshold`` is collapsed.

    The result is written next to the input as
    ``<tree_raw_address>_collapsed__<support_treshold>_<length_threshold>``.

    Parameters
    ----------
    tree_raw_address : str
        Path of the input Newick tree.
    support_treshold : int or float
        Support values at or below this value are collapsed.
    length_threshold : float
        Internal branch lengths at or below this value are collapsed.

    Returns
    -------
    dendropy.Tree
        The collapsed tree (the same object that was written to disk).

    Raises
    ------
    ValueError
        If an internal node label cannot be parsed as a number.
    """
    # Imported here because the file only does `from dendropy import Tree`,
    # which leaves the name `dendropy` undefined at module level.
    import dendropy

    tree = dendropy.Tree.get_from_path(tree_raw_address,"newick") # newick nexus

    print("length of input tree is: ", len(tree))

    # Iterate over a snapshot: collapsing edges changes the tree that is
    # being walked.
    for node in list(tree):
        if node.is_leaf() or not node.label:
            continue
        # float() accepts both "95" and fractional supports such as "0.95";
        # for integer labels the comparison is unchanged.
        support_value = float(node.label)
        if support_value <= support_treshold:
            node.edge.collapse()
    print("length of tree  after collapsing based on support values is: ", len(tree))

    for e in list(tree.postorder_edge_iter()):
        # Parenthesised on purpose: `and` binds tighter than `or`, so without
        # the parentheses a leaf edge with no length would also be sent to
        # collapse().
        if (e.length is None or e.length <= length_threshold) and e.is_internal():
            e.collapse()
    print("length of tree after collapsing based on branch length is: ", len(tree))

    tree_address_out = tree_raw_address+"_collapsed__"+str(support_treshold)+"_"+str(length_threshold)
    tree.write_to_path(tree_address_out, "newick")
    print("Output tree is written in ",tree_address_out)
    return (tree)

# def read_taxonID_uniprot_temp(uniprot_table_address):

#     taxonID_scientific = {}
#     scientific_taxonID = {}
#     common_taxonID = {}
#     taxonID_uniprot = {}
    
#     taxonID_scientificUnder={}
        
#     uniprot_file = open(uniprot_table_address,'r')
#     for line in uniprot_file:
#         line_strip = line.strip()
#         if line_strip.startswith('Pro'):
#             pass
#         else:
#             line_parts = line_strip.split('\t')
#             uniprot = line_parts[0]
#             taxonID = line_parts[2]

#             taxonID_uniprot[uniprot]=taxonID
            
#             line_parts_1= line_parts[1].split("(")
#             if len(line_parts_1) > 1:

#                 scientific = line_parts_1[0]
#                 common = line_parts_1[1].strip()[:-1]
#             else:
#                 scientific = line_parts_1[0]
#                 common = line_parts_1[0]
            
#             scientific_taxonID[taxonID] = scientific
#             taxonID_scientific[scientific] = taxonID 

#             common_taxonID[taxonID] = common
#             scientificUnder='_'.join(scientific.split(" ")[:2])
#             taxonID_scientificUnder[scientificUnder]=taxonID

#     print("-- The map for uniprot taxonID of",len(scientific_taxonID),"records have read.") 
#     return (taxonID_scientificUnder)
#taxonID_scientificUnder = read_taxonID_uniprot_temp(uniprot_table_address)

# Load the UniProt-derived lookup tables.
(scientific_taxonID, taxonID_scientific, common_taxonID, taxonID_uniprot, taxonID_scientificUnder, scientificUnder_taxonID) =read_taxonID_uniprot(uniprot_table_address)

#(scientific_taxonID,taxonID_scientific,common_taxonID,taxonID_uniprot)=read_taxonID_uniprot(uniprot_table_address)

# Load the OMA-derived lookup tables.
# NOTE(review): this rebinds `taxonID_scientific` and `scientific_taxonID`,
# replacing the UniProt versions assigned just above. The other four
# UniProt names are not reassigned by this line.
(taxonID_omaID,omaID_taxonID,taxonID_scientific,scientific_taxonID)= read_omaID_file(omaID_address)


-- The map for uniprot taxonID of 101 records have read.
-- The map for OMA taxonID of 2424 records have read.


# Collapsing fast tree  -  support value _ branch length

In [30]:
# # #fast_tree_address="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/mammal/old/mml_v4/"
# # #"omamer_v4_search/fastoma_out/_80_msa_concatanated.txt.contree_collapsed__94_0.001"

# import dendropy



# tree_raw_address= "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/mammal_uniprot/mml_v4/fastoma_core_column/"
# tree_raw_address += "___msa_concatanated_filtered_row___col_0.9436.txt.contree"

# support_treshold = 95
# length_threshold=1e-3

# out_tree = collapse_tree_sup_branch(tree_raw_address, support_treshold,length_threshold )


length of input tree is:  107
length of tree  after collapsing based on support values is:  107
length of tree after collapsing based on branch length is:  107
Output tree is written in  /work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/mammal_uniprot/mml_v4/fastoma_core_column/___msa_concatanated_filtered_row___col_0.9436.txt.contree_collapsed__95_0.001


# Reading FastOMA tree - collapsed, underscore names

In [136]:
#project_folder_root="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/mammal/mml26/"
#fast_tree_address=project_folder_root+"fastoma_core/all_splices/_100_msa_concatanated.txt.contree_collapsed_94_0.001" # 

# Path of the collapsed FastOMA species tree, built in two steps
# (base folder, then run folder + file name + collapse suffix).
fast_tree_address = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/mammal_uniprot/"
fast_tree_address += "/mml_v4/fastoma_core_nosplice_v5/"+"_100_msa_concatanated.txt.contree"+ "_collapsed__95_0.001"
 # fastoma_core_nosplice_v5  fastoma_core_column  ___msa_concatanated_filtered_row___col_0.96.txt.contree

# Sanity check: show the resolved path and the file size (bytes / 1000).
print(fast_tree_address)
print(round(os.path.getsize(fast_tree_address)/1000),"kb")

# Parse the Newick file with ete3; len() of an ete3 tree counts its leaves.
fast_tree= ete3.Tree(fast_tree_address) # ,format=0
print("length of tree is ",len(fast_tree))


/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/mammal_uniprot//mml_v4/fastoma_core_nosplice_v5/_100_msa_concatanated.txt.contree_collapsed__95_0.001
5 kb
length of tree is  106


In [137]:
#fast_tree.write()

In [138]:
# ##oma_standalon backbone species names
# import pyoma.browser.db as db
# project_folder_root="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/mammal/old/mml_v4/"
# oma_database_address=project_folder_root+"omamer_v4_database/oma_path_/OmaServer.h5"
# oma_db = db.Database(oma_database_address);
# print(oma_db.tax.as_dict())



### use scientific name for  backbone (S001)

In [139]:
# # #dic_oma={'S0005':'rabbit', 'S0003':'human', 'S0006':'redfox', 'S0004':'opossum', 'S0001':'chicken','S0002':'frog'}
# # #{'rabbit':'S0005', 'human':'S0003', 'redfox': 'S0006', 'opossum':'S0004', 'chicken':'S0001','frog':'S0002'}

# # old/mml_v4/"
# dic_oma_db={'Oryctolagus_cuniculus':'S0007', 'Homo_sapiens':'S0003', 'Vulpes_vulpes':'S0008','Monodelphis_domestica':'S0006', 'Gallus_gallus':'S0002', 'Xenopus_tropicalis':'S0009','Latimeria_chalumnae': 'S0004', 'Mola_mola':'S0005', 'Danio_rerio': 'S0001'}
# dic_oma_db_=dict((v, k) for k, v in dic_oma_db.items())
# print(len(dic_oma_db_))

In [140]:
# exist_from_oma=[]
# #exist_from_fa=[]
# for node in fast_tree.traverse(strategy="postorder"):
#     if node.is_leaf() :    
#         node_name = node.name
#         if node_name in dic_oma_db_:            
#             node_name_ = dic_oma_db_[node_name]
#             node.name = node_name_+"_back"  
#             exist_from_oma.append(node_name_)
# #         if node_name in fa_files:
# #             exist_from_fa.append(node_name)

# #fa_files=["Acinonyx_jubatus", "Ailuropoda_melanoleuca", "Aotus_nancymaae", "Balaenoptera_acutorostrata", "Balaenoptera_physalus", "Bison_bison", "Bos_indicus", "Bos_mutus", "Bos_taurus", "Callithrix_jacchus", "Callorhinus_ursinus", "Camelus_dromedarius", "Camelus_ferus", "Canis_lupus", "Capra_hircus", "Carlito_syrichta", "Cavia_porcellus", "Cebus_imitator", "Cercocebus_atys", "Cervus_elaphus", "Chlorocebus_sabaeus", "Chrysochloris_asiatica", "Colobus_angolensis", "Cricetulus_griseus", "Crocuta_crocuta", "Delphinapterus_leucas", "Dipodomys_ordii", "Enhydra_lutris", "Equus_caballus", "Erinaceus_europaeus", "Eschrichtius_robustus", "Felis_catus", "Fukomys_damarensis", "Gorilla_gorilla", "Gulo_gulo", "Heterocephalus_glaber", "Homo_sapiens", "Ictidomys_tridecemlineatus", "Leptonychotes_weddellii", "Lipotes_vexillifer", "Loxodonta_africana", "Lynx_pardinus", "Macaca_fascicularis", "Macaca_mulatta", "Macaca_nemestrina", "Mandrillus_leucophaeus", "Marmota_monax", "Mesocricetus_auratus", "Monodelphis_domestica", "Monodon_monoceros", "Muntiacus_reevesi", "Mus_musculus", "Mustela_putorius", "Myotis_brandtii", "Myotis_davidii", "Myotis_lucifugus", "Neomonachus_schauinslandi", "Neophocaena_asiaeorientalis", "Neotoma_lepida", "Nomascus_leucogenys", "Octodon_degus", "Odobenus_rosmarus", "Odocoileus_virginianus", "Ornithorhynchus_anatinus", "Oryctolagus_cuniculus", "Otolemur_garnettii", "Ovis_aries", "Pan_paniscus", "Pan_troglodytes", "Papio_anubis", "Peromyscus_maniculatus", "Phascolarctos_cinereus", "Phyllostomus_discolor", "Physeter_macrocephalus", "Pongo_abelii", "Propithecus_coquereli", "Pteropus_alecto", "Pteropus_vampyrus", "Puma_concolor", "Rattus_norvegicus", "Rhinolophus_ferrumequinum", "Rhinopithecus_bieti", "Rhinopithecus_roxellana", "Saimiri_boliviensis", "Sapajus_apella", "Sarcophilus_harrisii", "Sousa_chinensis", "Suricata_suricatta", "Sus_scrofa", "Trichechus_manatus", "Tupaia_chinensis", "Tursiops_truncatus", "Ursus_americanus", "Ursus_arctos", 
"Ursus_maritimus", "Vicugna_pacos", "Vombatus_ursinus", "Vulpes_vulpes"]
# #print(set(fa_files) - set(exist_from_fa) )
# #len(exist_from_oma),exist_from_oma

In [141]:
# fast_tree.write()

In [142]:
# list_species=[]
# for node in fast_tree.traverse(strategy="postorder"):
#     if node.is_leaf() :    
#         list_species.append(node.name)
        
# print("number of leaves, species, is:",len(list_species))

### convert to taxonID 

In [143]:

# Rename the leaves of the FastOMA tree from "Genus_species" to taxon ID.
# list_species_omaID_fast : every original leaf name, in traversal order
# list_species_tax_fast   : taxon IDs of the leaves that could be translated
# no_list                 : leaf names without an entry in the lookup table
list_species_tax_fast = []
list_species_omaID_fast = []
no_list = []
for node in fast_tree.traverse(strategy="postorder"):
    if not node.is_leaf():
        continue
    node_name = node.name
    list_species_omaID_fast.append(node_name)
    if node_name not in taxonID_scientificUnder:
        # Unknown name: leave the leaf untouched and remember it.
        no_list.append(node_name)
        continue
    node_name_tax = taxonID_scientificUnder[node_name]
    node.name = node_name_tax
    list_species_tax_fast.append(node_name_tax)

print(
    "number of leaves, species, is:",
    len(list_species_omaID_fast),
    len(list_species_tax_fast),
    len(no_list),
    len(taxonID_scientificUnder),
)

#set(exist_from_fa)-set(taxonID_scientificUnder.keys())
#set(taxonID_scientificUnder.keys())-set(exist_from_fa)

# no_list

number of leaves, species, is: 106 94 12 100


In [144]:
#fast_tree.write()

In [145]:
# Keep only the leaves that were renamed to a taxon ID; leaves that could not
# be translated (collected in no_list, e.g. backbone species) are removed.
# The leaf count is printed before and after pruning.

print(len(fast_tree))
fast_tree.prune(list_species_tax_fast, preserve_branch_length=True)
print(len(fast_tree))

# fast_tree.write() 

106
94


In [146]:
# list_sepcies_mammal = [10020, 10029, 10036, 10090, 10116, 10141, 10160, 10181, 103600, 116960, 118797, 127582, 132908, 13616, 1706337, 185453, 1868482, 191816, 230844, 2715852, 27622, 29073, 29088, 29139, 30522, 30538, 30611, 310752, 32536, 336983, 34884, 37032, 37293, 379532, 38626, 391180, 39432, 40151, 419612, 43179, 43346, 46360, 51298, 56216, 59463, 59472, 59479, 60711, 61621, 61622, 61853, 72004, 77932, 885580, 89673, 9258, 9305, 9365, 9402, 9407, 9483, 9515, 9531, 9541, 9544, 9545, 9555, 9568, 9595, 9597, 9598, 9601, 9606, 9615, 9627, 9643, 9646, 9669, 9678, 9685, 9696, 9708, 9713, 9739, 9749, 9755, 9764, 9770, 9785, 9796, 9823, 9838, 9880, 9886, 9913, 9915, 9925, 9940, 9986, 9995]
# #outgroups=[8364, 9031] # frog chicken
# list_sepcies_str=[str(i) for i in list_sepcies_mammal]

# print(len(fast_tree))
# fast_tree.prune(list_sepcies_str, preserve_branch_length=True)
# print(len(fast_tree))

# fast_tree.set_outgroup("9258") 
# # frog 8364
# # 9258  Ornithorhynchus anatinus (Duckbill platypus)


# Reading  NCBI tree

In [147]:
 
# Taxon IDs to look up in NCBI: the leaves kept in the FastOMA tree.
# NOTE: this is an alias of list_species_tax_fast, not a copy.
ncbi_taxon_list = list_species_tax_fast


# Build the NCBI taxonomy topology restricted to those taxon IDs.
ncbi = ete3.NCBITaxa()  # first time download in ~/.etetoolkit/
ncbi_sub_tree = ncbi.get_topology(ncbi_taxon_list)

# Number of leaves in the NCBI sub-tree.
print(len(ncbi_sub_tree))

#ncbi_sub_tree.write()

94


# RF distance

In [148]:
def parition_editor(partitions_input,intersect_leaves):
    """Reduce tree partitions to one canonical representative per split.

    Each input partition is one side of a bipartition of ``intersect_leaves``.
    So that the same split found in two different trees compares equal, it
    is replaced by:

    * the smaller side, when the two sides differ in size;
    * the side containing the pivot leaf (the smallest leaf name), when both
      sides have exactly half of the leaves.

    Trivial splits (a side with at most one leaf) are discarded.

    Parameters
    ----------
    partitions_input : iterable of iterable
        Partitions, each a collection of leaf names.
    intersect_leaves : iterable
        All leaf names shared by the trees being compared.

    Returns
    -------
    list of tuple
        One sorted tuple of leaf names per retained partition, in input
        order.  Sorting makes the tuple independent of which side of the
        split was supplied.
    """
    all_leaves = set(intersect_leaves)
    n_leaves = len(all_leaves)
    if n_leaves == 0:
        return []

    pivot_species = min(all_leaves)
    half = n_leaves / 2

    partitions_ed = []
    for parition in partitions_input:
        side = set(parition)
        size = len(side)
        # Drop trivial splits: one side has 0 or 1 leaves.
        if size <= 1 or size >= n_leaves - 1:
            continue
        # Switch to the complement when this side is the larger one, or when
        # the split is exactly balanced and this side lacks the pivot.
        # A balanced side that contains the pivot is kept as is.
        if size > half or (size == half and pivot_species not in side):
            side = all_leaves - side
        partitions_ed.append(tuple(sorted(side)))

    return partitions_ed
        
# #fast_tree.set_outgroup("9913")  # bovin
# node=fast_tree.get_common_ancestor("9913","10090")

# fast_tree.set_outgroup("9913")  # bovin

# Root the FastOMA tree on taxon 9258 (Ornithorhynchus anatinus, platypus).
fast_tree.set_outgroup("9258")  # an earlier version used 9913 (bovine)



# Leaf universe used to canonicalise the partitions of both trees.
intersection_sets= set(ncbi_taxon_list)

# Robinson-Foulds comparison of the FastOMA tree against the NCBI topology.
# The result is unpacked as a 7-tuple: distance, maximum distance, shared
# leaf attributes, the partitions of each tree and the discarded partitions.
# NOTE(review): partitions_fast_raw / partitions_ncbi_raw assume the
# partitions are returned in (this tree, other tree) order -- confirm against
# the ete3 documentation linked below.
out_fast_ncbi = fast_tree.robinson_foulds(ncbi_sub_tree)
(rf_fast_ncbi, maxparts_fast_ncbi, common_attrs, partitions_fast_raw, partitions_ncbi_raw, discard_t1, discard_t2)=out_fast_ncbi 
# Canonicalise and de-duplicate the partitions of each tree.
partitions_fast = set(parition_editor(partitions_fast_raw, intersection_sets))
partitions_ncbi = set(parition_editor(partitions_ncbi_raw, intersection_sets))

# Summary: raw RF distance from ete3, then counts derived from the
# canonicalised partition sets.
print("RF distance is %s over a total of %s" %(rf_fast_ncbi, maxparts_fast_ncbi))
print("in fast:",len(partitions_fast),", in ncbi:",len(partitions_ncbi)) # , "Common Partitions",len(common_attrs_paper),)
print("in both fast and ncbi:", len(partitions_ncbi & partitions_fast))
print("only in ncbi, not in fast:", len(partitions_ncbi-partitions_fast)) # order of the two partition sets not verified; see http://etetoolkit.org/docs/latest/tutorial/tutorial_trees.html#robinson-foulds-distance
print("only in fast, not in ncbi:", len(partitions_fast-partitions_ncbi))


      

RF distance is 38 over a total of 140
in fast: 76 , in ncbi: 62
in both fast and ncbi: 50
only in ncbi, not in fast: 12
only in fast, not in ncbi: 26


## The following is for visualization only and may be ignored

## Trees with common name

In [30]:
# Relabel the leaves of the FastOMA tree as "<common name>_<taxon ID>".
# Leaves whose taxon ID has no common name are collected in no_list.
no_list = []
for node in fast_tree.traverse(strategy="postorder"):
    if not node.is_leaf():
        continue
    if node.name in common_taxonID:
        node_name = common_taxonID[node.name]
        node.name = f"{node_name}_{node.name}"
        continue
    if node.name == '30608':
        # Hard-coded label for taxon 30608 when it is absent from
        # common_taxonID.
        node.name = "Lesser mouse lemur_30608"
        continue
    no_list.append(node.name)

print(no_list)

[]


In [31]:
fast_tree.write()

"(Duckbill platypus_9258:0.365317,(((Tasmanian devil_9305:0.191273,Koala_38626:0.154523)100:0.0414404,Gray short-tailed opossum_13616:0.236146)100:0.339808,((((((Ord's kangaroo rat_10020:0.336606,((Rat_10116:0.124241,Mouse_10090:0.0634782)100:0.0686967,Chinese hamster_10029:0.264948)100:0.157961)100:0.0349106,((Naked mole rat_10181:0.196736,Damaraland mole rat_885580:0.214702)100:0.0797236,(Guinea pig_10141:0.201194,Degu_10160:0.177856)100:0.0300928)100:0.148493,Thirteen-lined ground squirrel_43179:0.214548)100:0.0400459,Rabbit_9986:0.51432)100:0.0242575,((((Northern white-cheeked gibbon_61853:0.132469,Pygmy chimpanzee_9597:0.0458389,Human_9606:0.0110071,Sumatran orangutan_9601:0.092031,(Green monkey_60711:0.0297213,(Rhesus macaque_9544:0.0535278,Crab-eating macaque_9541:0.0637958)100:0.0350517,Drill_9568:0.0541511,(Olive baboon_9555:0.0398768,Sooty mangabey_9531:0.0173492)100:0.0104805,Pig-tailed macaque_9545:0.0269368,(Black snub-nosed monkey_61621:0.0432399,Golden snub-nosed monkey_

In [32]:
# Relabel the leaves of the NCBI sub-tree as "<common name>_<taxon ID>".
# Leaves whose taxon ID has no common name are collected in no_list.
no_list = []
for node in ncbi_sub_tree.traverse(strategy="postorder"):
    if not node.is_leaf():
        continue
    if node.name in common_taxonID:
        node_name = common_taxonID[node.name]
        node.name = f"{node_name}_{node.name}"
        continue
    if node.name == '30608':
        # Hard-coded label for taxon 30608 when it is absent from
        # common_taxonID.
        node.name = "Lesser mouse lemur_30608"
        continue
    no_list.append(node.name)

print(no_list)

[]


In [33]:
ncbi_sub_tree.write()

"(((((((((Ma's night monkey_37293:1,White-tufted-ear marmoset_9483:2)1:1,(((((Pygmy chimpanzee_9597:1,Chimpanzee_9598:1)1:1,Human_9606:1)1:1,Sumatran orangutan_9601:1)1:1,Northern white-cheeked gibbon_61853:1)1:1,(((Crab-eating macaque_9541:1,Rhesus macaque_9544:1,Pig-tailed macaque_9545:1)1:1,Green monkey_60711:1,Sooty mangabey_9531:1,Olive baboon_9555:1,Drill_9568:1)1:1,(Black snub-nosed monkey_61621:1,Golden snub-nosed monkey_61622:1)1:2)1:1)1:1)1:1,Philippine tarsier_1868482:1)1:1,(Coquerel's sifaka_379532:1,Small-eared galago_30611:1)1:1)1:1,((((Damaraland mole rat_885580:1,Naked mole rat_10181:1)1:1,Guinea pig_10141:1,Degu_10160:1)1:1,((Rat_10116:1,Mouse_10090:1)1:1,Chinese hamster_10029:3)1:1,Ord's kangaroo rat_10020:1,Thirteen-lined ground squirrel_43179:2)1:1,Rabbit_9986:1)1:1)1:1,((Pig_9823:1,Atlantic bottle-nosed dolphin_9739:4,Alpaca_30538:2,((Goat_9925:1,Sheep_9940:1)1:1,Bovine_9913:3)1:2)1:1,((((Polar bear_29073:1,American black bear_9643:1)1:1,Giant panda_9646:1)1:1,Red 

In [34]:
fast_tree.write()

"(Duckbill platypus_9258:0.365317,(((Tasmanian devil_9305:0.191273,Koala_38626:0.154523)100:0.0414404,Gray short-tailed opossum_13616:0.236146)100:0.339808,((((((Ord's kangaroo rat_10020:0.336606,((Rat_10116:0.124241,Mouse_10090:0.0634782)100:0.0686967,Chinese hamster_10029:0.264948)100:0.157961)100:0.0349106,((Naked mole rat_10181:0.196736,Damaraland mole rat_885580:0.214702)100:0.0797236,(Guinea pig_10141:0.201194,Degu_10160:0.177856)100:0.0300928)100:0.148493,Thirteen-lined ground squirrel_43179:0.214548)100:0.0400459,Rabbit_9986:0.51432)100:0.0242575,((((Northern white-cheeked gibbon_61853:0.132469,Pygmy chimpanzee_9597:0.0458389,Human_9606:0.0110071,Sumatran orangutan_9601:0.092031,(Green monkey_60711:0.0297213,(Rhesus macaque_9544:0.0535278,Crab-eating macaque_9541:0.0637958)100:0.0350517,Drill_9568:0.0541511,(Olive baboon_9555:0.0398768,Sooty mangabey_9531:0.0173492)100:0.0104805,Pig-tailed macaque_9545:0.0269368,(Black snub-nosed monkey_61621:0.0432399,Golden snub-nosed monkey_

## NCBI tree with underscore names

In [None]:
 
# Rebuild the NCBI sub-tree from the taxon IDs of the FastOMA leaves, so its
# leaf names are whatever get_topology() assigns rather than any labels set
# on the previous ncbi_sub_tree object.
ncbi_taxon_list = list_species_tax_fast


ncbi = ete3.NCBITaxa()  # first time download in ~/.etetoolkit/
ncbi_sub_tree = ncbi.get_topology(ncbi_taxon_list)

# Number of leaves in the NCBI sub-tree.
print(len(ncbi_sub_tree))

In [None]:
# Relabel the leaves of the NCBI sub-tree as "<Genus_species>_<taxon ID>".
# Leaves whose taxon ID is missing from the lookup are collected in no_list.
no_list = []
for node in ncbi_sub_tree.traverse(strategy="postorder"):
    if not node.is_leaf():
        continue
    if node.name not in scientificUnder_taxonID:
        no_list.append(node.name)
        continue
    node_name = scientificUnder_taxonID[node.name]
    node.name = f"{node_name}_{node.name}"

print(no_list)

In [None]:
scientificUnder_taxonID["2715852"]

In [None]:
ncbi_sub_tree.write()

## Standard OMA (slow), with underscore names

In [149]:

# tree_raw_address = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/mammal_uniprot/standard/"
# tree_raw_address += "/concatanted.fasta.contree"  # _collapsed_brnch_0.01# 1e-05" # 

# import dendropy
# support_treshold = 95
# length_threshold=1e-3 

# out_tree = collapse_tree_sup_branch(tree_raw_address, support_treshold,length_threshold )


In [150]:

# Path of the collapsed standard-OMA ("slow") species tree, built in three
# steps: base folder, raw tree file name, collapse suffix.
slow_tree_address = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/mammal_uniprot/standard/"
slow_tree_address += "/concatanted.fasta.contree"  # _collapsed_brnch_0.01# 1e-05" # 
slow_tree_address += "_collapsed__95_0.001"

# Parse the Newick file with ete3 and print its number of leaves.
slow_tree= ete3.Tree(slow_tree_address) # ,format=0
print(len(slow_tree))


73


In [151]:
slow_tree.write()

'(Monodelphis_domestica:0.0549807,(((((((Microcebus_murinus:0.0254167,Propithecus_coquereli:0.0253452)100:0.0139706,Otolemur_garnettii:0.0583853)100:0.0133941,(((((((Cercocebus_atys:0.00631594,Mandrillus_leucophaeus:0.00596072)100:0.00121839,(Macaca_fascicularis:0.00150168,Macaca_nemestrina:0.00411493)99:0.0015487,Macaca_mulatta:0.0134382,Papio_anubis:0.00853622)100:0.00111409,Chlorocebus_sabaeus:0.00740434)100:0.00216345,(Colobus_angolensis_palliatus:0.00793718,(Rhinopithecus_bieti:0.00673514,Rhinopithecus_roxellana:0.00261409)100:0.00507519)100:0.00416822)100:0.00742584,((Gorilla_gorilla_gorilla:0.0120148,Homo_sapiens:0.00326515,(Pan_paniscus:0.00288553,Pan_troglodytes:0.00289888)100:0.00459114)100:0.00306984,Pongo_abelii:0.00944045,Nomascus_leucogenys:0.0154418)100:0.00419246)100:0.00765934,((Aotus_nancymaae:0.0147995,Saimiri_boliviensis_boliviensis:0.0160102)100:0.00302408,Callithrix_jacchus:0.0174124)100:0.0150893)100:0.0221442,Carlito_syrichta:0.0574195)100:0.0044947)100:0.006161

In [152]:

# Rename the leaves of the standard-OMA tree from "Genus_species" to taxon ID.
# list_species_omaID_slow : every original leaf name, in traversal order
# list_species_tax_slow   : taxon IDs of the leaves that could be translated
# no_list_slow            : leaf names without an entry in the lookup table
list_species_tax_slow = []
list_species_omaID_slow = []
no_list_slow = []
for node in slow_tree.traverse(strategy="postorder"):
    if not node.is_leaf():
        continue
    node_name = node.name
    list_species_omaID_slow.append(node_name)
    if node_name not in taxonID_scientificUnder:
        # Disabled alternatives that used to be tried before giving up:
        #   * three-part names, via dic_threeparts[node_name] and then
        #     taxonID_scientificUnder[...]
        #   * "Microcebus_murinus" hard-coded to taxon ID "30608"
        no_list_slow.append(node_name)
        continue
    node_name_tax = taxonID_scientificUnder[node_name]
    node.name = node_name_tax
    list_species_tax_slow.append(node_name_tax)

print(
    "number of leaves, species, is:",
    len(list_species_omaID_slow),
    len(list_species_tax_slow),
    len(no_list_slow),
)


number of leaves, species, is: 73 49 24


In [153]:
no_list_slow

['Microcebus_murinus',
 'Colobus_angolensis_palliatus',
 'Gorilla_gorilla_gorilla',
 'Saimiri_boliviensis_boliviensis',
 'Tupaia_belangeri',
 'Ochotona_princeps',
 'Jaculus_jaculus',
 'Nannospalax_galili',
 'Cavia_aperea',
 'Chinchilla_lanigera',
 'Canis_lupus_familiaris',
 'Mustela_putorius_furo',
 'Manis_javanica',
 'Equus_asinus',
 'Sorex_araneus',
 'Procavia_capensis',
 'Echinops_telfairi',
 'Dasypus_novemcinctus',
 'Choloepus_hoffmanni',
 'Gallus_gallus',
 'Junco_hyemalis',
 'Anolis_carolinensis',
 'Xenopus_tropicalis',
 'Macropus_eugenii']

In [154]:
slow_tree.write()

'(13616:0.0549807,(((((((Microcebus_murinus:0.0254167,379532:0.0253452)100:0.0139706,30611:0.0583853)100:0.0133941,(((((((9531:0.00631594,9568:0.00596072)100:0.00121839,(9541:0.00150168,9545:0.00411493)99:0.0015487,9544:0.0134382,9555:0.00853622)100:0.00111409,60711:0.00740434)100:0.00216345,(Colobus_angolensis_palliatus:0.00793718,(61621:0.00673514,61622:0.00261409)100:0.00507519)100:0.00416822)100:0.00742584,((Gorilla_gorilla_gorilla:0.0120148,9606:0.00326515,(9597:0.00288553,9598:0.00289888)100:0.00459114)100:0.00306984,9601:0.00944045,61853:0.0154418)100:0.00419246)100:0.00765934,((37293:0.0147995,Saimiri_boliviensis_boliviensis:0.0160102)100:0.00302408,9483:0.0174124)100:0.0150893)100:0.0221442,1868482:0.0574195)100:0.0044947)100:0.00616186,Tupaia_belangeri:0.0740458,((9986:0.0459954,Ochotona_princeps:0.0749465)100:0.034132,((10020:0.0823708,(Jaculus_jaculus:0.0825591,((10029:0.0405367,(10090:0.0287524,10116:0.0341675)100:0.0241807)100:0.0317554,Nannospalax_galili:0.0532233)100:0.

In [155]:
# no_list=[]
# list_species_tax_slow=[]
# for node in slow_tree.traverse(strategy="postorder"):
#     if node.is_leaf() :           
#         if node.name in omaID_taxonID:
#             node_name = omaID_taxonID[node.name]
#             node.name=  node_name 
#             list_species_tax_slow.append(node_name)
#         else:
#             no_list.append(node.name)
        
# print(no_list)

In [156]:

# Restrict all three trees (FastOMA, standard OMA, NCBI) to the taxon IDs
# present in both the FastOMA and the standard-OMA leaf lists.
#fast_tree= ete3.Tree(fast_tree_address) # ,format=0
print("length of tree is ",len(fast_tree),len(list_species_tax_fast))


# Taxon IDs found in both trees.
intersection_slow_fast=set(list_species_tax_slow) & set(list_species_tax_fast)

# Leaf counts before pruning.
print(len(fast_tree),len(slow_tree))
# NOTE(review): prune() matches on the current leaf names, so all three
# trees must still be labelled with plain taxon IDs here (i.e. the
# relabelling cells for visualization must not have been run on these
# objects) -- confirm the cell execution order.
fast_tree.prune(intersection_slow_fast, preserve_branch_length=True)
slow_tree.prune(intersection_slow_fast, preserve_branch_length=True)
ncbi_sub_tree.prune(intersection_slow_fast, preserve_branch_length=True)

# Leaf counts after pruning.
print(len(fast_tree),len(slow_tree),len(ncbi_sub_tree))


length of tree is  94 94
94 73
49 49 49


In [157]:
# Display fast_tree as the string returned by write().
fast_tree.write()

'(9258:0.365317,(((9305:0.191273,38626:0.154523)100:0.0414404,13616:0.236146)100:0.339808,((((((10020:0.336606,((10116:0.124241,10090:0.0634782)100:0.0686967,10029:0.264948)100:0.157961)100:0.0349106,((10181:0.196736,885580:0.214702)100:0.0797236,(10141:0.201194,10160:0.177856)100:0.0300928)100:0.148493,43179:0.214548)100:0.0400459,9986:0.51432)100:0.0242575,((((61853:0.132469,9597:0.0458389,9606:0.0110071,9601:0.092031,(60711:0.0297213,(9544:0.0535278,9541:0.0637958)100:0.0350517,9568:0.0541511,(9555:0.0398768,9531:0.0173492)100:0.0104805,9545:0.0269368,(61621:0.0432399,61622:0.0329304)100:0.0597879)100:0.0370803,9598:0.0170219)100:0.039766,(9483:0.107978,37293:0.0918721)100:0.0569786)100:0.115987,1868482:0.263974)100:0.022731,(379532:0.157923,30611:0.19776)100:0.0444909)100:0.0322722)100:0.0455135,(((9627:0.189379,((9643:0.0775759,29073:0.177738)99:0.021176,9646:0.128381)100:0.113529)100:0.0345644,9685:0.247897)100:0.106647,9365:0.423021,((((9739:0.226609,((9925:0.0315861,9940:0.1535

In [158]:
# Display slow_tree as the string returned by write().
slow_tree.write()

'(13616:0.0549807,((((((30611:0.0583853,379532:0.0393157)100:0.0133941,(((((((9531:0.00631594,9568:0.00596072)100:0.00121839,(9541:0.00150168,9545:0.00411493)99:0.0015487,9544:0.0134382,9555:0.00853622)100:0.00111409,60711:0.00740434)100:0.00216345,(61621:0.00673514,61622:0.00261409)100:0.00924342)100:0.00742584,((9606:0.00326515,(9597:0.00288553,9598:0.00289888)100:0.00459114)100:0.00306984,9601:0.00944045,61853:0.0154418)100:0.00419246)100:0.00765934,(9483:0.0174124,37293:0.0178236)100:0.0150893)100:0.0221442,1868482:0.0574195)100:0.0044947)100:0.00616186,(((10020:0.0823708,(10029:0.0405367,(10090:0.0287524,10116:0.0341675)100:0.0241807)100:0.0605809)100:0.00809267,((885580:0.0378169,10181:0.0336759)100:0.0138397,(10141:0.0594112,10160:0.0613543)100:0.00995267)100:0.0372224,43179:0.061471)100:0.00859228,9986:0.0801274)100:0.00571266)100:0.00761878,((((((9913:0.0148103,(9925:0.00894654,9940:0.0104143)100:0.00938196)100:0.0377489,9739:0.0327881)100:0.00841663,9823:0.0495794)100:0.00432

In [159]:
# Display ncbi_sub_tree as the string returned by write().
ncbi_sub_tree.write()

'(((((((((37293:1,9483:2)1:1,(((((9597:1,9598:1)1:1,9606:1)1:1,9601:1)1:1,61853:1)1:1,(((9541:1,9544:1,9545:1)1:1,60711:1,9531:1,9555:1,9568:1)1:1,(61621:1,61622:1)1:2)1:1)1:1)1:1,1868482:1)1:1,(379532:1,30611:1)1:1)1:1,((((885580:1,10181:1)1:1,10141:1,10160:1)1:1,((10116:1,10090:1)1:1,10029:3)1:1,10020:1,43179:2)1:1,9986:1)1:1)1:1,((9823:1,9739:4,30538:2,((9925:1,9940:1)1:1,9913:3)1:2)1:1,((((29073:1,9643:1)1:1,9646:1)1:1,9627:2)1:1,9685:4)1:1,(59463:2,132908:2)1:1,9796:1,9365:1)1:1)1:1,9785:2)1:1,(9305:1,13616:1,38626:2)1:1)1:1,9258:1);'

In [160]:
# Robinson-Foulds comparison of the slow tree against the NCBI tree.
# The slow tree is first re-rooted on the leaf named "9258".
slow_tree.set_outgroup("9258")

# ete3's robinson_foulds() returns a 7-tuple. Items 4 and 5 are the partitions
# of the calling tree (slow) and of the argument tree (NCBI), in that order:
# http://etetoolkit.org/docs/latest/tutorial/tutorial_trees.html#robinson-foulds-distance
out_slow_ncbi = slow_tree.robinson_foulds(ncbi_sub_tree)
(
    rf_slow_ncbi,
    maxparts_slow_ncbi,
    common_attrs,
    partitions_slow_raw,
    partitions_ncbi_raw,
    discard_t1,
    discard_t2,
) = out_slow_ncbi

# NOTE(review): intersection_sets is defined outside this cell, whereas the
# pruning cell produces intersection_slow_fast -- confirm they agree.
partitions_slow = set(parition_editor(partitions_slow_raw, intersection_sets))
partitions_ncbi = set(parition_editor(partitions_ncbi_raw, intersection_sets))

print("RF distance is {} over a total of {}".format(rf_slow_ncbi, maxparts_slow_ncbi))
print("in slow:", len(partitions_slow), ", in ncbi:", len(partitions_ncbi))
print("in both slow and ncbi:", len(partitions_ncbi.intersection(partitions_slow)))
print("only in ncbi, not in slow:", len(partitions_ncbi.difference(partitions_slow)))
print("only in slow, not in ncbi:", len(partitions_slow.difference(partitions_ncbi)))


RF distance is 11 over a total of 75
in slow: 42 , in ncbi: 35
in both slow and ncbi: 33
only in ncbi, not in slow: 2
only in slow, not in ncbi: 9


In [161]:


# Robinson-Foulds comparison of the fast tree against the NCBI tree.
# ete3's robinson_foulds() returns a 7-tuple. Items 4 and 5 are the partitions
# of the calling tree (fast) and of the argument tree (NCBI), in that order:
# http://etetoolkit.org/docs/latest/tutorial/tutorial_trees.html#robinson-foulds-distance
out_fast_ncbi = fast_tree.robinson_foulds(ncbi_sub_tree)
(
    rf_fast_ncbi,
    maxparts_fast_ncbi,
    common_attrs,
    partitions_fast_raw,
    partitions_ncbi_raw,
    discard_t1,
    discard_t2,
) = out_fast_ncbi

# NOTE(review): intersection_sets is defined outside this cell -- confirm it
# matches the leaf set the trees were pruned to.
partitions_fast = set(parition_editor(partitions_fast_raw, intersection_sets))
partitions_ncbi = set(parition_editor(partitions_ncbi_raw, intersection_sets))

print("RF distance is {} over a total of {}".format(rf_fast_ncbi, maxparts_fast_ncbi))
print("in fast:", len(partitions_fast), ", in ncbi:", len(partitions_ncbi))
print("in both fast and ncbi:", len(partitions_ncbi.intersection(partitions_fast)))
print("only in ncbi, not in fast:", len(partitions_ncbi.difference(partitions_fast)))
print("only in fast, not in ncbi:", len(partitions_fast.difference(partitions_ncbi)))




RF distance is 14 over a total of 70
in fast: 37 , in ncbi: 35
in both fast and ncbi: 29
only in ncbi, not in fast: 6
only in fast, not in ncbi: 8


In [29]:
# Sizes of the seven regions of the Venn diagram over the fastOMA, slow and
# NCBI partition sets (each count is exclusive to the named combination).
# NOTE(review): the recorded fastOMA counts below add up to 40, while the
# fast-vs-NCBI cell printed 37 -- this cell appears to have been run on
# different state (its execution count is 29); re-run it after that cell.
print("only in fastOMA:", len(partitions_fast.difference(partitions_ncbi, partitions_slow)))
print("only in fastOMA & slow:", len(partitions_fast.intersection(partitions_slow).difference(partitions_ncbi)))
print("only in fastOMA & NCBI:", len(partitions_fast.intersection(partitions_ncbi).difference(partitions_slow)))
print("only in slow & NCBI:", len(partitions_slow.intersection(partitions_ncbi).difference(partitions_fast)))
print("only in fastOMA & NCBI & slow:", len(partitions_fast.intersection(partitions_ncbi, partitions_slow)))
print("only in slow:", len(partitions_slow.difference(partitions_fast, partitions_ncbi)))
print("only in NCBI:", len(partitions_ncbi.difference(partitions_fast, partitions_slow)))


only in fastOMA: 4
only in fastOMA & slow: 7
only in fastOMA & NCBI: 0
only in slow & NCBI: 4
only in fastOMA & NCBI & slow: 29
only in slow: 2
only in NCBI: 2


## Relabel tree leaves with common names

In [None]:
# Relabel the leaves of fast_tree as "<mapped value>_<old label>" using
# common_taxonID; labels missing from the map are collected in no_list.
# NOTE(review): by the file's X_Y naming convention common_taxonID would be
# keyed by common name, while the leaf labels shown earlier are taxon IDs --
# confirm this is the intended map.
no_list = []
for node in fast_tree.traverse(strategy="postorder"):
    if not node.is_leaf():
        continue
    if node.name not in common_taxonID:
        no_list.append(node.name)
        continue
    node_name = common_taxonID[node.name]
    node.name = node_name + "_" + node.name

print(no_list)

In [None]:
# Relabel the leaves of slow_tree as "<mapped value>_<old label>" using
# common_taxonID; labels missing from the map are collected in no_list.
# NOTE(review): confirm common_taxonID is keyed by the current leaf labels.
no_list = []
for node in slow_tree.traverse(strategy="postorder"):
    if not node.is_leaf():
        continue
    if node.name not in common_taxonID:
        no_list.append(node.name)
        continue
    node_name = common_taxonID[node.name]
    node.name = node_name + "_" + node.name

print(no_list)

In [None]:
# Relabel the leaves of ncbi_sub_tree as "<mapped value>_<old label>" using
# common_taxonID; labels missing from the map are collected in no_list.
# NOTE(review): confirm common_taxonID is keyed by the current leaf labels.
no_list = []
for node in ncbi_sub_tree.traverse(strategy="postorder"):
    if not node.is_leaf():
        continue
    if node.name not in common_taxonID:
        no_list.append(node.name)
        continue
    node_name = common_taxonID[node.name]
    node.name = node_name + "_" + node.name

print(no_list)

In [None]:
# Display ncbi_sub_tree as the string returned by write().
ncbi_sub_tree.write()

In [None]:
# Display fast_tree as the string returned by write().
fast_tree.write()

In [None]:
# Display slow_tree as the string returned by write().
slow_tree.write()

In [135]:
# Keep a second name for the current fast_tree object. This is a plain name
# binding, not a copy: in-place edits made through either name are visible
# through both until fast_tree is rebound to a different tree object.
fast_tree_column=fast_tree

In [165]:


# Robinson-Foulds comparison of fast_tree against fast_tree_column.
# No NCBI tree is involved here, so the results are stored under their own
# *_fast_column names instead of overwriting the fast-vs-NCBI variables
# (out_fast_ncbi, rf_fast_ncbi, maxparts_fast_ncbi, partitions_*_raw).
# NOTE(review): if fast_tree_column still refers to the same object as
# fast_tree, this compares a tree with itself.
out_fast_column = fast_tree.robinson_foulds(fast_tree_column)
(
    rf_fast_column,
    maxparts_fast_column,
    common_attrs,
    partitions_fast_raw2,     # partitions of the calling tree (fast_tree)
    partitions_column_raw,    # partitions of the argument tree (fast_tree_column)
    discard_t1,
    discard_t2,
) = out_fast_column
partitions_fast2 = set(parition_editor(partitions_fast_raw2, intersection_sets))
partitions_column2 = set(parition_editor(partitions_column_raw, intersection_sets))
# Backward-compatible alias: this set used to be stored under the misleading
# name partitions_ncbi2.
partitions_ncbi2 = partitions_column2

print("RF distance is %s over a total of %s" % (rf_fast_column, maxparts_fast_column))
print("in fast:", len(partitions_fast2), ", in fast_column:", len(partitions_column2))
print("in both fast and fast_column:", len(partitions_column2 & partitions_fast2))
print("only in fast_column, not in fast:", len(partitions_column2 - partitions_fast2))
print("only in fast, not in fast_column:", len(partitions_fast2 - partitions_column2))




RF distance is 11 over a total of 75
in fast: 37 , in ncbi: 40
in both fast and ncbi: 33
only in ncbi, not in fast: 7
only in fast, not in ncbi: 4


In [163]:
# Display fast_tree as the string returned by write().
fast_tree.write()

'(9258:0.365317,(((9305:0.191273,38626:0.154523)100:0.0414404,13616:0.236146)100:0.339808,((((((10020:0.336606,((10116:0.124241,10090:0.0634782)100:0.0686967,10029:0.264948)100:0.157961)100:0.0349106,((10181:0.196736,885580:0.214702)100:0.0797236,(10141:0.201194,10160:0.177856)100:0.0300928)100:0.148493,43179:0.214548)100:0.0400459,9986:0.51432)100:0.0242575,((((61853:0.132469,9597:0.0458389,9606:0.0110071,9601:0.092031,(60711:0.0297213,(9544:0.0535278,9541:0.0637958)100:0.0350517,9568:0.0541511,(9555:0.0398768,9531:0.0173492)100:0.0104805,9545:0.0269368,(61621:0.0432399,61622:0.0329304)100:0.0597879)100:0.0370803,9598:0.0170219)100:0.039766,(9483:0.107978,37293:0.0918721)100:0.0569786)100:0.115987,1868482:0.263974)100:0.022731,(379532:0.157923,30611:0.19776)100:0.0444909)100:0.0322722)100:0.0455135,(((9627:0.189379,((9643:0.0775759,29073:0.177738)99:0.021176,9646:0.128381)100:0.113529)100:0.0345644,9685:0.247897)100:0.106647,9365:0.423021,((((9739:0.226609,((9925:0.0315861,9940:0.1535

In [164]:
# Display fast_tree_column as the string returned by write().
fast_tree_column.write()

'(9258:0.119426,(((9305:0.0514398,38626:0.041506)100:0.0133573,13616:0.0591581)100:0.0860501,((((((9601:0.016081,(((60711:0.00908008,9555:0.0102064,(9544:0.0129243,(9531:0.00901525,9568:0.00520202)100:0.00135302)100:0.00432962,9541:0.00667588)100:0.00324997,9545:0.00889862)100:0.00457618,(61621:0.013254,61622:0.00590356)100:0.00788714)100:0.0100684,(9606:0.00189283,9597:0.00818895,9598:0.00427517,61853:0.0298152)100:0.0013515)100:0.00911884,(37293:0.0183635,9483:0.0225373)100:0.0140363)100:0.0218746,1868482:0.055102,(379532:0.034169,30611:0.0566109)100:0.0119312)100:0.00641987,((((((10116:0.0320554,10090:0.0269705)100:0.0222184,10029:0.0567332)100:0.0524704,10020:0.0794034)100:0.00819968,((10181:0.0349635,885580:0.0351472)100:0.0158348,(10141:0.0477276,10160:0.0477597)100:0.0115739)100:0.0354548)99:0.00406299,43179:0.055955)100:0.00765098,9986:0.0910795)100:0.00438938)100:0.00860874,(9365:0.122184,((((((9925:0.00660053,9940:0.0192299)100:0.0104258,9913:0.028215)100:0.037151,9739:0.0493

### Three-way comparison (fastOMA vs. paper vs. NCBI)

In [None]:
# # fast_tree= ete3.Tree(fast_treee_address_out, format=1)
# # fast_tree.prune(intersect_leaves, preserve_branch_length=True)

# # fast_tree.set_outgroup(outgroup) # CALPUG MOUS  BUCCAP CHLMAC

# out_paper_ncbi = bird_paper_tree.robinson_foulds(ncbi_sub_tree) # , expand_polytomies = True  #,polytomy_size_limit=20  ,unrooted_trees=True # , expand_polytomies = True
# (rf_paper_ncbi, maxparts_paper_ncbi, common_attrs, partitions_paper_raw, partitions_ncbi_raw, discard_t1, discard_t2)=out_paper_ncbi 
# partitions_paper = set(parition_editor(partitions_paper_raw, intersect_leaves))
# partitions_ncbi = set(parition_editor(partitions_ncbi_raw, intersect_leaves))

# out_fast_ncbi = fast_tree.robinson_foulds(ncbi_sub_tree)  # ,unrooted_trees=True # , expand_polytomies = True
# (rf_fast_ncbi, maxparts_fast_ncbi, common_attrs, partitions_fast_raw, partitions_ncbi_raw2, discard_t1, discard_t2)=out_fast_ncbi 
# partitions_fast = set(parition_editor(partitions_fast_raw, intersect_leaves))
# partitions_ncbi2 = set(parition_editor(partitions_ncbi_raw2, intersect_leaves))
# assert len(partitions_ncbi2-partitions_ncbi)==0
# out_fast_paper = fast_tree.robinson_foulds(bird_paper_tree)  # ,unrooted_trees=True # , expand_polytomies = True
# (rf_fast_paper, maxparts_fast_paper, common_attrs, partitions_fast_raw2, partitions_paper_raw2, discard_t1, discard_t2)=out_fast_paper
# partitions_paper2 = set(parition_editor(partitions_paper_raw2, intersect_leaves))
# partitions_fast2 = set(parition_editor(partitions_fast_raw2, intersect_leaves))
# assert len(partitions_fast2-partitions_fast)==0
# assert len(partitions_paper2-partitions_paper)==0


# values_comparison=[len((partitions_fast - partitions_ncbi) - partitions_paper),
#                    len((partitions_fast & partitions_paper) - partitions_ncbi2),
#                    len((partitions_fast & partitions_ncbi) - partitions_paper),
#                    len((partitions_paper & partitions_ncbi) -  partitions_fast),
#                    len((partitions_fast & partitions_ncbi) &  partitions_paper),
#                    len((partitions_paper - partitions_fast) - partitions_ncbi),
#                    len((partitions_ncbi - partitions_fast) - partitions_paper) ]
# print(values_comparison)

# ****

# others

### Plot: collapsing branches at different support-value thresholds


In [None]:

# tresh=60 
# project_folder_root="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/v3b/"
# tresh_list=[60,80,90,95,100]
# tresh_list_name=["_collapsed_"+str(i) for i in tresh_list] 
# tresh_list_name=[""]+tresh_list_name

# values_comparison_all=[]
# for tresh in tresh_list_name:
#     fast_tree_address=project_folder_root+"iqtree5/_100_msa_concatanated.txt_copy1.contree"+tresh

#     fast_tree= ete3.Tree(fast_tree_address)
#     #print("len fast tree",len(fast_tree))


#     fast_tree.prune(intersect_leaves)
#     #print("len fast tree after prune",len(fast_tree))

#     fast_tree.set_outgroup(outgroup) # CALPUG MOUS  BUCCAP  CHLMAC

#     out_paper_ncbi = bird_paper_tree.robinson_foulds(ncbi_sub_tree) # , expand_polytomies = True  #,polytomy_size_limit=20  ,unrooted_trees=True # , expand_polytomies = True
#     (rf_paper_ncbi, maxparts_paper_ncbi, common_attrs, partitions_paper_raw, partitions_ncbi_raw, discard_t1, discard_t2)=out_paper_ncbi 
#     partitions_paper = set(parition_editor(partitions_paper_raw, intersect_leaves))
#     partitions_ncbi = set(parition_editor(partitions_ncbi_raw, intersect_leaves))

#     out_fast_ncbi = fast_tree.robinson_foulds(ncbi_sub_tree)  # ,unrooted_trees=True # , expand_polytomies = True
#     (rf_fast_ncbi, maxparts_fast_ncbi, common_attrs, partitions_fast_raw, partitions_ncbi_raw2, discard_t1, discard_t2)=out_fast_ncbi 
#     partitions_fast = set(parition_editor(partitions_fast_raw, intersect_leaves))
#     partitions_ncbi2 = set(parition_editor(partitions_ncbi_raw2, intersect_leaves))
#     assert len(partitions_ncbi2-partitions_ncbi)==0
#     out_fast_paper = fast_tree.robinson_foulds(bird_paper_tree)  # ,unrooted_trees=True # , expand_polytomies = True
#     (rf_fast_paper, maxparts_fast_paper, common_attrs, partitions_fast_raw2, partitions_paper_raw2, discard_t1, discard_t2)=out_fast_paper
#     partitions_paper2 = set(parition_editor(partitions_paper_raw2, intersect_leaves))
#     partitions_fast2 = set(parition_editor(partitions_fast_raw2, intersect_leaves))
#     assert len(partitions_fast2-partitions_fast)==0
#     assert len(partitions_paper2-partitions_paper)==0


#     values_comparison=[len((partitions_fast - partitions_ncbi) - partitions_paper),
#                        len((partitions_fast & partitions_paper) - partitions_ncbi2),
#                        len((partitions_fast & partitions_ncbi) - partitions_paper),
#                        len((partitions_paper & partitions_ncbi) -  partitions_fast),
#                        len((partitions_fast & partitions_ncbi) &  partitions_paper),
#                        len((partitions_paper - partitions_fast) - partitions_ncbi),
#                        len((partitions_ncbi - partitions_fast) - partitions_paper) ]
#     #sum1=values_comparison[0]+values_comparison[4]+values_comparison[5]+values_comparison[6]
#     print(values_comparison) # ,sum1
#     values_comparison_all.append(values_comparison)
# values_comparison_all=np.array(values_comparison_all)
# #print(values_comparison_all)

# #t_all=np.transpose(values_comparison_all)
# lenged1=['fastOMA','fastOMA & paper',  'fastOMA & NCBI',
#   'paper & NCBI', 'fastOMA & NCBI & paper',  'paper', 'NCBI']

# import pandas as pd
# df = pd.DataFrame(data=values_comparison_all)
# df.columns=lenged1   #ind = [0,60,80,90,95,100] # ,"BirdPaper"
# df.index=["no"]+tresh_list
# df

### Plot: collapsing branches at different branch-length thresholds

In [None]:

# #tresh=60 
# project_folder_root="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/v3b/"
# tresh_list=[1e-05,0.0001,0.001,0.005,0.01,0.1]
# tresh_list_name=["_collapsed_brnch_"+str(i) for i in tresh_list] 
# tresh_list_name=[""]+tresh_list_name

# values_comparison_all=[]
# for tresh in tresh_list_name:
#     fast_tree_address=project_folder_root+"iqtree5/_100_msa_concatanated.txt_copy1.contree"+tresh

#     fast_tree= ete3.Tree(fast_tree_address)
#     #print("len fast tree",len(fast_tree))


#     fast_tree.prune(intersect_leaves)
#     #print("len fast tree after prune",len(fast_tree))

#     fast_tree.set_outgroup(outgroup) # CALPUG MOUS  BUCCAP

#     out_paper_ncbi = bird_paper_tree.robinson_foulds(ncbi_sub_tree) # , expand_polytomies = True  #,polytomy_size_limit=20  ,unrooted_trees=True # , expand_polytomies = True
#     (rf_paper_ncbi, maxparts_paper_ncbi, common_attrs, partitions_paper_raw, partitions_ncbi_raw, discard_t1, discard_t2)=out_paper_ncbi 
#     partitions_paper = set(parition_editor(partitions_paper_raw, intersect_leaves))
#     partitions_ncbi = set(parition_editor(partitions_ncbi_raw, intersect_leaves))

#     out_fast_ncbi = fast_tree.robinson_foulds(ncbi_sub_tree)  # ,unrooted_trees=True # , expand_polytomies = True
#     (rf_fast_ncbi, maxparts_fast_ncbi, common_attrs, partitions_fast_raw, partitions_ncbi_raw2, discard_t1, discard_t2)=out_fast_ncbi 
#     partitions_fast = set(parition_editor(partitions_fast_raw, intersect_leaves))
#     partitions_ncbi2 = set(parition_editor(partitions_ncbi_raw2, intersect_leaves))
#     assert len(partitions_ncbi2-partitions_ncbi)==0
#     out_fast_paper = fast_tree.robinson_foulds(bird_paper_tree)  # ,unrooted_trees=True # , expand_polytomies = True
#     (rf_fast_paper, maxparts_fast_paper, common_attrs, partitions_fast_raw2, partitions_paper_raw2, discard_t1, discard_t2)=out_fast_paper
#     partitions_paper2 = set(parition_editor(partitions_paper_raw2, intersect_leaves))
#     partitions_fast2 = set(parition_editor(partitions_fast_raw2, intersect_leaves))
#     assert len(partitions_fast2-partitions_fast)==0
#     assert len(partitions_paper2-partitions_paper)==0


#     values_comparison=[len((partitions_fast - partitions_ncbi) - partitions_paper),
#                        len((partitions_fast & partitions_paper) - partitions_ncbi2),
#                        len((partitions_fast & partitions_ncbi) - partitions_paper),
#                        len((partitions_paper & partitions_ncbi) -  partitions_fast),
#                        len((partitions_fast & partitions_ncbi) &  partitions_paper),
#                        len((partitions_paper - partitions_fast) - partitions_ncbi),
#                        len((partitions_ncbi - partitions_fast) - partitions_paper) ]
#     print(values_comparison)
#     values_comparison_all.append(values_comparison)
# values_comparison_all=np.array(values_comparison_all)
# #print(values_comparison_all)

# #t_all=np.transpose(values_comparison_all)
# lenged1=['fastOMA','fastOMA & paper',  'fastOMA & NCBI',
#   'paper & NCBI', 'fastOMA & NCBI & paper',  'paper', 'NCBI']

# import pandas as pd
# df = pd.DataFrame(data=values_comparison_all)
# df.columns=lenged1   #ind = [0,60,80,90,95,100] # ,"BirdPaper"
# df.index=["no"]+tresh_list
# df