In [2]:
#!/usr/bin/python3

import numpy as np
from sys import argv
import pyoma.browser.db as db
import pyoma.browser.models as mod
import zoo.wrappers.aligners.mafft as mafft
import zoo.wrappers.treebuilders.fasttree as fasttree
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Alphabet import IUPAC, SingleLetterAlphabet
from Bio.Seq import Seq, UnknownSeq
from Bio.SeqRecord import SeqRecord
from collections import defaultdict

from os import listdir
from os.path import isfile, join
from datetime import datetime

import concurrent.futures


import ast
#  for development 
import matplotlib
matplotlib.use('Agg')

import matplotlib.pyplot as plt


datasets_address= "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/archive/"



oma_database_address = datasets_address + "OmaServer.h5"
hog_og_map_address = datasets_address + "hog_og_map.dic"


omaID_address = datasets_address+"oma-species.txt"
bird6ID_address = datasets_address+"info.tsv"


# should end in /

# very small
project_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/v3a/ST/f4_100S/" 
#project_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/v3a/A/f7_2kA/" 

#project_folder = argv[1]


# PANPA.fa  PANPA.hogmap
# The species name of query is the name of the file; 
#  argv[2] 




In [None]:

def parse_oma(oma_database_address, hog_og_map_address):
    
    ############### Parsing OMA db ####################
    ###################################################

    oma_db = db.Database(oma_database_address)

    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time, "- OMA data is parsed and its release name is:", oma_db.get_release_name())
    list_speices= [z.uniprot_species_code for z in oma_db.tax.genomes.values()] 
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time,"- There are",len(list_speices),"species in the OMA database.")

    file = open(hog_og_map_address, "r")
    contents = file.read()
    hog_OG_map = ast.literal_eval(contents)
    file.close()
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time,"- The hog-og map is read from file with the length of ", len(hog_OG_map))
    
    
    return (oma_db, hog_OG_map, list_speices)


def parse_proteome(project_folder, list_speices):
    
    ############### Parsing query proteome of species #######
    #########################################################

    project_files = listdir(project_folder)

    query_species_names = []
    for file in project_files:
        if file.split(".")[-1]=="fa":
            file_name_split = file.split(".")[:-1]
            query_species_names.append('.'.join(file_name_split))

    # we may assert existence of query_species_name+".fa/hogmap"
    query_prot_records_species = [ ]
    for query_species_name in query_species_names:
        query_prot_address = project_folder + query_species_name + ".fa" 
        query_prot_records = list(SeqIO.parse(query_prot_address, "fasta")) 
        query_prot_records_species.append(query_prot_records)

    # for development
    query_species_num = len(query_species_names)
    for species_i in range(query_species_num):
        len_prot_record_i = len( query_prot_records_species[species_i] )
        species_name_i = query_species_names[species_i]
        #print(species_name_i,len_prot_record_i)
        if species_name_i in list_speices: 
            current_time = now.strftime("%H:%M:%S")
            print(current_time,"- the species",species_name_i," already exists in the oma database, remove it first")
            exit()

    return (query_species_names, query_prot_records_species)



def parse_hogmap_omamer(project_folder , query_species_names):

    ################### Parsing omamer's output  ########
    #####################################################

    query_prot_names_species = []
    query_hogids_species = []

    for query_species_name in query_species_names:
        omamer_output_address = project_folder + query_species_name + ".hogmap"     
        omamer_output_file = open(omamer_output_address,'r');

        query_prot_names= []
        query_hogids= []

        for line in omamer_output_file:
            line_strip=line.strip()
            if not line_strip.startswith('qs'):
                line_split= line_strip.split("\t")        
                query_prot_names.append(line_split[0])
                query_hogids.append(line_split[1])
        #print("number of proteins in omamer output for ",query_species_name,"is",len(query_hogids)) # ,query_hogids
        query_prot_names_species.append(query_prot_names)
        query_hogids_species.append(query_hogids)    
    
    return (query_prot_names_species, query_hogids_species)
    
    
    

def extract_unique_hog(query_species_names,query_hogids_species, query_prot_names_species,query_prot_records_species):
    ###### Extracting unique HOG list and corresponding query proteins ########
    ###########################################################################

    query_hogids_filtr_species = []
    query_prot_names_filtr_species = []
    query_prot_records_filtr_species = []

    repeated_hogid_num = 0
    
    query_species_num = len(query_species_names) 
    
    for species_i in range(query_species_num):
        #print(query_species_names[species_i])

        query_hogids =  query_hogids_species[species_i]
        query_prot_names = query_prot_names_species[species_i]
        query_prot_records  = query_prot_records_species[species_i]



        query_hogids_filtr = []
        query_prot_names_filtr = []
        query_prot_records_filtr = []

        for prot_i in range(len(query_hogids)):

            if not query_hogids[prot_i] in query_hogids_filtr: 

                query_hogids_filtr.append(query_hogids[prot_i])
                query_prot_names_filtr.append(query_prot_names[prot_i])
                query_prot_records_filtr.append(query_prot_records[prot_i])
            else:
                repeated_hogid_num += 1 
                # for development
                #print("repeated hogid",query_hogids[prot_i], " for protein ",query_prot_names[prot_i])
                # now we keep the first protein query when these are repeated


        query_hogids_filtr_species.append(query_hogids_filtr)
        query_prot_names_filtr_species.append(query_prot_names_filtr)
        query_prot_records_filtr_species.append(query_prot_records_filtr)


        num_query_filtr = len(query_hogids_filtr)
        #print("Number of prot queries after filtering is",num_query_filtr,"\n")

    

    return (query_hogids_filtr_species, query_prot_names_filtr_species, query_prot_records_filtr_species )
    
    

In [None]:

def gather_OG(query_species_names, query_hogids_filtr_species, query_prot_names_filtr_species, query_prot_records_filtr_species):

    ############ Extracting the most frequent OG  ########
    #####################################################

    #dict (oma_group_nr -> dict(species, [proteins]))
    #Og[555] = {homo_erectus: [blabla, blublu], yellow_bird: [P52134], brown_bear: [P2121,B53223]}

    OGs_queries = {}

    # hog_OG_map = {}

    mostFrequent_OG_list_species = []

    frq_most_frequent_og_list_all = [] # for development
    
    query_species_num = len(query_species_names)  
    for species_i in  range(query_species_num):

        query_species_name = query_species_names[species_i]
        #print("\n",query_species_name)

        query_hogids_filtr = query_hogids_filtr_species[species_i]
        query_prot_names_filtr = query_prot_names_filtr_species[species_i]
        query_prot_records_filtr = query_prot_records_filtr_species[species_i]

        mostFrequent_OG_list=[]

        num_query_filtr = len(query_hogids_filtr)
        for  item_idx in range(num_query_filtr): #

            #query_protein = query_prot_names_filtr[item_idx]
            seqRecords_query =  query_prot_records_filtr[item_idx]
            seqRecords_query_edited = SeqRecord(Seq(str(seqRecords_query.seq)), query_species_name, '', '')
            #print(seqRecords_query_edited)

            hog_id= query_hogids_filtr[item_idx]

            if not hog_id in hog_OG_map:   # Caculitng  most frq OG for the new hog
                mostFrequent_OG= -1
                hog_OG_map[hog_id]=mostFrequent_OG

            else:  # hog_id is in hog_OG_map dic
                #print("using the hog-og-map")
                mostFrequent_OG = hog_OG_map[hog_id]

            if mostFrequent_OG in OGs_queries:
                OGs_queries_k = OGs_queries[mostFrequent_OG]

                if not query_species_name in OGs_queries_k:
                    OGs_queries_k[query_species_name] = seqRecords_query_edited
                    OGs_queries[mostFrequent_OG] = OGs_queries_k
            else:
                OGs_queries[mostFrequent_OG] = {query_species_name: seqRecords_query_edited} # query_protein = query_prot_names_filtr[item_idx]
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time, "- Needed HOH-OG map is extracted from the map file.") 
    
    return OGs_queries
    

In [None]:

def combine_OG_query(OGs_queries, oma_db,threshold_least_query_sepecies_in_OG):
    
    ########## Combine proteins of OG with queries ##################
    #################################################################
    
    seqRecords_OG_queries = []
    seqRecords_all = []
    for OG_q in OGs_queries.keys():  # OG found in the query

        dic_species_prot = OGs_queries[OG_q]
        if len(dic_species_prot) >threshold_least_query_sepecies_in_OG:

            seqRecords_query_edited_all = []
            for query_species_name,seqRecords_query_edited  in dic_species_prot.items():
                #print(seqRecords_query_edited)
                seqRecords_query_edited_all.append(seqRecords_query_edited) 


            mostFrequent_OG = OG_q
            if mostFrequent_OG != -1:
                OG_members = oma_db.oma_group_members(mostFrequent_OG)
                proteins_object_OG = [db.ProteinEntry(oma_db, pr) for pr in OG_members]  # the protein IDs of og members
                 # covnert to biopython objects
                seqRecords_OG=[SeqRecord(Seq(pr.sequence),str(pr.genome.uniprot_species_code),'','') for pr in proteins_object_OG]

                seqRecords_OG_queries =seqRecords_OG + seqRecords_query_edited_all
                current_time = datetime.now().strftime("%H:%M:%S")
                #print("length of OG",mostFrequent_OG,"was",len(seqRecords_OG),",now is",len(seqRecords_OG_queries))
                print(current_time, " - Combining an OG with length of ",len(seqRecords_OG),"\t with a query is just finished.")

            seqRecords_all.append(seqRecords_OG_queries)


    current_time = datetime.now().strftime("%H:%M:%S")
    print("\n", current_time, "- Combining queries with OG is finished! number of OGs",len(seqRecords_all)) # 
    return(seqRecords_all)


In [None]:

def run_msa_OG(seqRecords_OG_queries):
    ############## MSA  ##############
    ##################################
    #current_time = datetime.now().strftime("%H:%M:%S")
    #print(current_time, "- working on new OG with length of ",len(seqRecords_OG_queries))

    wrapper_mafft = mafft.Mafft(seqRecords_OG_queries,datatype="PROTEIN") 
    # MAfft error: Alphabet 'U' is unknown. -> add --anysymbol argument needed to define in the sourse code
    # workaround sed "s/U/X/g"
    
    wrapper_mafft.options.options['--retree'].set_value(1)


    run_mafft = wrapper_mafft() # it's wrapper  storing the result  and time 
    time_taken_mafft = wrapper_mafft.elapsed_time

    result_mafft = wrapper_mafft.result 
    time_taken_mafft2 = wrapper_mafft.elapsed_time
    
    current_time = datetime.now().strftime("%H:%M:%S")
    #print(current_time,"- time elapsed for MSA: ",time_taken_mafft2)
    print(current_time,"- MSA for an OG is just finished: ",time_taken_mafft2)

    return(result_mafft)
   


In [None]:

def concatante_alignments(result_mafft_all_species, project_folder):
    ############## Concatante alignments  ##############
    ####################################################

    #alignments= result_maf2_all

    alignments= result_mafft_all_species
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time, "- alignments len",len(alignments))
    #print([len(aln) for aln in alignments ])
    #print([len(seq) for aln in alignments for seq in aln])

    all_labels_raw = [seq.id for aln in alignments for seq in aln]
    all_labels = set(all_labels_raw)
    print("ids: ",len(all_labels),len(all_labels_raw))

    # Make a dictionary to store info as we go along
    # (defaultdict is convenient -- asking for a missing key gives back an empty list)
    concat_buf = defaultdict(list)

    # Assume all alignments have same alphabet
    alphabet = alignments[0]._alphabet

    for aln in alignments:
        length = aln.get_alignment_length()
        #print("length",length)
        # check if any labels are missing in the current alignment
        these_labels = set(rec.id for rec in aln)
        missing = all_labels - these_labels
        #print(missing)
        # if any are missing, create unknown data of the right length,
        # stuff the string representation into the concat_buf dict
        for label in missing:
            new_seq = UnknownSeq(length, alphabet=alphabet)
            concat_buf[label].append(str(new_seq))

        # else stuff the string representation into the concat_buf dict
        for rec in aln:
            concat_buf[rec.id].append(str(rec.seq))

    # Stitch all the substrings together using join (most efficient way),
    # and build the Biopython data structures Seq, SeqRecord and MultipleSeqAlignment
    msa = MultipleSeqAlignment(SeqRecord(Seq(''.join(seq_arr), alphabet=alphabet), id=label)
                                for (label, seq_arr) in concat_buf.items())



    out_name_msa=project_folder+"_msa_concatanated.txt"
    handle_msa_fasta = open(out_name_msa,"w")
    SeqIO.write(msa, handle_msa_fasta,"fasta")
    handle_msa_fasta.close()
    
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time, "- ", len(msa),msa.get_alignment_length()) # super matrix size
    
    return msa
    
    

In [None]:


def msa_filter_row(msa,project_folder,tresh_ratio_gap_row,query_species_names):

    
    msa_filtered_row = [] # msa_fltr
    ratio_records=[]
    for record in msa:
        species_name = record.id
        seq = record.seq
        seqLen = len(record)
        
        gap_count=seq.count("-") + seq.count("?") + seq.count(".") +seq.count("~")
                
        ratio_record_nongap= 1-gap_count/seqLen
        ratio_records.append(round(ratio_record_nongap,3))

        if ratio_record_nongap > tresh_ratio_gap_row:
            msa_filtered_row.append(record)
        elif species_name in query_species_names : 
            msa_filtered_row.append(record)
            current_time = datetime.now().strftime("%H:%M:%S")
            print(current_time, "- Many row-wise gap for query",species_name,"with a ratio of",ratio_record_nongap) 

    current_time = datetime.now().strftime("%H:%M:%S")

    print(current_time, "- Row-wise filtering of MSA is finished.") 
    print(current_time, "- Out of ",len(msa),"species,",len(msa_filtered_row),"species (row of msa) remained.")


    out_name_msa=project_folder+"_msa_concatanated_filtered_row_"+str(tresh_ratio_gap_row)+".txt"
    handle_msa_fasta = open(out_name_msa,"w")
    SeqIO.write(msa_filtered_row, handle_msa_fasta,"fasta")
    handle_msa_fasta.close()
    
    print(current_time, "- MSA Row-wise filtered stored in file.") # super matrix size
    
    
    return msa_filtered_row
    

In [None]:

def msa_filter_col(msa_filtered_row, tresh_ratio_gap_col):

    ratio_col_all = []

    length_record= len(msa_filtered_row[1])
    num_records = len(msa_filtered_row)


    keep_cols = []
    for col_i in range(length_record):  # inspired by https://github.com/andreas-wilm/compbio-utils/blob/master/prune_aln_cols.py 

        col_values = [record.seq[col_i] for record in msa_filtered_row]

        gap_count=col_values.count("-") + col_values.count("?") + col_values.count(".") +col_values.count("~")

        ratio_col_nongap = 1- gap_count/num_records
        ratio_col_all.append(ratio_col_nongap)
        if ratio_col_nongap  > tresh_ratio_gap_col:
            keep_cols.append(col_i)


    plt.hist(ratio_col_all,bins=100) # , bins=10
    #plt.show()
    plt.savefig("./__ratio_col.pdf")

    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time, "- Columns indecis extracted. Out of ", length_record,"columns,",len(keep_cols),"is remained.") 

    msa_filtered_row_col = []

    for record in msa_filtered_row :
        record_seq = str(record.seq)

        record_seq_edited  = ''.join([record_seq[i] for i in keep_cols  ])
        record_edited= SeqRecord(Seq(record_seq_edited), record.id, '', '')
        msa_filtered_row_col.append(record_edited)                         


    
    out_name_msa=project_folder+"_msa_concatanated_filtered_row_col_"+str(tresh_ratio_gap_col)+".txt"
    handle_msa_fasta = open(out_name_msa,"w")
    SeqIO.write(msa_filtered_row_col, handle_msa_fasta,"fasta")
    handle_msa_fasta.close()
    
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time, "- Column-wise filtering of MSA is finished",len(msa_filtered_row_col),len(msa_filtered_row_col[0])) 
       
    #msa_filtered_row_col = MultipleSeqAlignment(msa_filtered_row_col)
    return msa_filtered_row_col



In [None]:

def draw_tree(msa, project_folder):
    ############## Tree inference  ###################
    ##################################################

    wrapper_tree=fasttree.Fasttree(msa,datatype="PROTEIN")
    wrapper_tree.options.options['-fastest']    
    result_tree1 = wrapper_tree()

    time_taken_tree = wrapper_tree.elapsed_time 
    time_taken_tree

    result_tree2 = wrapper_tree.result
    tree_nwk=str(result_tree2["tree"])
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time,"- ",len(tree_nwk))

    out_name_tree=project_folder+"_tree.txt"
    file1 = open(out_name_tree,"w")
    file1.write(tree_nwk)
    file1.close() 
    return tree_nwk

In [None]:



if __name__ == "__main__":
    (oma_db, hog_OG_map, list_speices) = parse_oma(oma_database_address, hog_og_map_address)


    (query_species_names, query_prot_records_species) = parse_proteome(project_folder, list_speices)
    (query_prot_names_species, query_hogids_species) = parse_hogmap_omamer(project_folder,query_species_names )


    (query_hogids_filtr_species, query_prot_names_filtr_species, query_prot_records_filtr_species) = extract_unique_hog(query_species_names,query_hogids_species, query_prot_names_species,query_prot_records_species)

    OGs_queries = gather_OG(query_species_names, query_hogids_filtr_species, query_prot_names_filtr_species, query_prot_records_filtr_species)

    #seqRecords_all = combine_OG_query(OGs_queries, oma_db)
    
    threshold_least_query_sepecies_in_OG = 15
    seqRecords_all = combine_OG_query(OGs_queries, oma_db,threshold_least_query_sepecies_in_OG)
    #combine_OG_query(OGs_queries, oma_db)
    
    
    num_OGs= len(seqRecords_all)
    
    

In [None]:
    
    iterotr_OGs = 0 
    
    result_mafft_all_species=[]
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time, "- Parallel msa is started for ",len(seqRecords_all)," OGs.")
    
    number_max_workers = 1
    
    with concurrent.futures.ProcessPoolExecutor(max_workers=number_max_workers) as executor: 
        for seqRecords_OG_queries, output_values in zip(seqRecords_all, executor.map(run_msa_OG, seqRecords_all)):
            result_mafft_all_species.append(output_values)

    msa= concatante_alignments(result_mafft_all_species, project_folder)
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time, "- all msa are concatanated")
    
    


In [None]:
#     current_time = datetime.now().strftime("%H:%M:%S")
#     print(current_time, "- Tree inference is started")

#     concurrent.futures.as_completed(executor)
#     tree_nwk = draw_tree(msa, project_folder)
    
    
#     print(current_time, "- Tree inference is finsiehd. Thanks for your patience!")
#     tree_nwk

In [None]:
    #msa_origin =list(msa)[:]

In [None]:
#     #add="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/v3a/A/f7_2kA/_msa_concatanated.txt"
#     add="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/v3a/A/f7_2kA/_msa_concatanated.txt"
#     msa = AlignIO.read(add,"fasta")
#     print(len(msa),len(msa[0]))


In [None]:
##    Instead of running MSA and using concatatntion. you could used pre_calculated concat msa

#     project_folder =  "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/v3a/A/f7_2kA/" # _msa_concatanated_hogmapX.txt"
#     project_files = listdir(project_folder)
#     query_species_names = []
#     for file in project_files:
#         if file.split(".")[-1]=="fa":
#             file_name_split = file.split(".")[:-1]
#             query_species_names.append('.'.join(file_name_split))


#     msa_input = project_folder+"_msa_concatanated.txt"
#     msa = AlignIO.read(msa_input,"fasta")
#     print("finish reading file",len(msa),len(msa[0]))


In [None]:
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time, "- Row-wise filtering of MSA is started.") 
       
    tresh_ratio_gap_row = 0.02
    msa_filtered_row = msa_filter_row(msa,project_folder,tresh_ratio_gap_row,query_species_names)

    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time, "- Column-wise filtering of MSA is started.") 
    
    tresh_ratio_gap_col = 0.1
    msa_filtered_row_col=  msa_filter_col(msa_filtered_row, tresh_ratio_gap_col)
    
    


In [None]:
import ete3

In [None]:
    msa_input = project_folder+"_msa_concatanated.txt"
    msa = AlignIO.read(msa_input,"fasta")
    print("finish reading file",len(msa),len(msa[0]))


In [None]:
msa2=msa[:3]

In [None]:
a=draw_tree(msa2,project_folder)

In [25]:


def read_taxonID_map(omaID_address,bird6ID_address):
    
    omaID_file = open(omaID_address,'r')
    taxonID_omaID={}
    omaID_taxonID={}
    for line in omaID_file:
        line_strip = line.strip()
        if line_strip.startswith('#'):
            pass
            #header_lines_list.append(line_strip)
        else:
            line_parts=line_strip.split('\t')

            omaID = line_parts[0]
            taxonID = int(line_parts[1])
            taxonID_omaID[taxonID]=omaID
            omaID_taxonID[omaID]=taxonID
            
    omaID_file.close()
        
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time, "- The map for taxonID omaID of",len(taxonID_omaID),"records have read.") 
    
    
    bird6ID_file = open(bird6ID_address,'r')
    taxonID_bird6ID={}
    bird6ID_taxonID= {}
    for line in bird6ID_file:
        line_strip = line.strip()
        if line_strip.startswith('Or'):
            pass
            line_strip1=line_strip
            #header_lines_list.append(line_strip)
        else:
            line_parts=line_strip.split('\t')

            bird6ID = line_parts[6]
            taxonID = int(line_parts[8])
            taxonID_bird6ID[taxonID]=bird6ID
            bird6ID_taxonID[bird6ID]=taxonID
            
    bird6ID_file.close()
        
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time, "- The map for taxonID bird6ID of",len(taxonID_bird6ID),"records have read.") 
  
    return (taxonID_omaID,taxonID_bird6ID,omaID_taxonID,bird6ID_taxonID)
    
            

In [26]:
len(taxonID_list)

2778

In [27]:
(taxonID_omaID,taxonID_bird6ID,omaID_taxonID,bird6ID_taxonID) = read_taxonID_map(omaID_address,bird6ID_address)
taxonID_map = {**taxonID_omaID,**taxonID_bird6ID }







#print(len(taxonID_omaID),len(taxonID_bird6ID),len(taxonID_map))
taxonID_list= list(taxonID_map.keys())

import ete3
ncbi = ete3.NCBITaxa()  # first time download in ~/.etetoolkit/
ncbi_sub_tree = ncbi.get_topology(taxonID_list)

for node in ncbi_sub_tree.traverse(strategy="postorder"):
    node.name2 = node.name
    if node.is_leaf() and int(node.name) in taxonID_map: # why ?
        node.name = taxonID_map[int(node.name)]

current_time = datetime.now().strftime("%H:%M:%S")
print(current_time, "- The NCBI taxanomy is read and the leaves names changed to OMA/bird6 ID containing",len(ncbi_sub_tree),".") 
#print(ncbi_sub_tree.get_ascii(attributes=["name"]))     
#ncbi_sub_tree.write() 

11:09:50 - The map for taxonID omaID of 2424 records have read.
11:09:50 - The map for taxonID bird6ID of 363 records have read.
11:10:05 - The NCBI taxanomy is read and the leaves names changed to OMA/bird6 ID containing 2756 .


In [8]:
ncbi_sub_tree.write()

'((((((((DESCC:1,DESRM:1,DESRL:1)1:1,((DESHD:1,DESHY:1)1:1,DESDL:1)1:1,(DESMD:1,DESOD:1,DESAJ:1)1:1,PELTS:1,DESAS:1,DESAP:1,THEPJ:1,DESK7:1,SYNGF:1)1:1,((RUMC1:1,RUMA7:1)1:1,((HUNT2:1,HUNT1:1)1:1,HUNCD:1)1:1,RUMCH:1,MAGIU:1,OSCVS:1,ETHHY:1)1:1,((((CLOBB:1,CLOB6:1,CLOBK:1)1:1,(CLOBJ:1,CLOBM:1,CLOB1:1,CLOBH:1)1:1,(CLOB2:1,CLOBL:1)1:1,CLOB0:1,CLOBA:1)1:1,(CLOS4:1,CLOSS:1)1:1,(CLOAB:1,CLOAE:1)1:1,(CLOPS:1,CLOPE:1,CLOP1:1)1:1,CLONN:1,CLOLD:1,CLOB8:1,CLOK5:1,CLOC7:1,CLOTE:1)1:1,(ALKOO:1,ALKMQ:1)1:1,ARTSS:1)1:1,((CLOD6:1,CLODR:1)1:1,FILAD:1,ACESD:1)1:1,(ANAHA:1,CELLD:1,ROSHA:1,CLOSW:1,LACP7:1,AGARV:1,EUBE2:1,BUTPB:1)1:1,(EUBLK:1,ACEWD:1)1:1,(SYNLT:1,SYNWW:1)1:1,((SULAD:1,SULAT:1)1:1,THEM7:1)1:1,SYMTH:1,HELMI:1)1:1,((((THEPX:1,THESX:1)1:1,THEP3:1,THEIA:1,THEM3:1)1:1,THEPS:1,CARHZ:1,MOOTA:1,CALS4:1)1:1,((CALH1:1,CALKI:1,CALK2:1,CALOW:1,CALOO:1,CALBD:1,CALS8:1)1:1,(THESW:1,THEXL:1,THETC:1)1:1)1:1,MAHA5:1)1:1,((ACEAZ:1,HALHC:1)1:1,((HALHG:1,HALPG:1)1:1,HALOH:1)1:1)1:1,(TEPAE:1,THEOJ:1)1:1,NATTJ:1

In [42]:
bird_hog_tree_address="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/v3a/hogmapX/out4/_tree_filt_7.txt"
#"/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/v3a/A/f7_2kA/_tree__msa_concatanated_filtered_row_col_0.00101.txt"
bird_hog_tree= ete3.Tree(bird_hog_tree_address)





In [11]:
a_count=0
for node in ncbi_sub_tree.traverse(strategy="postorder"):
    node.name2 = node.name
    if node.is_leaf() : # why ?
        a_count+=1


In [12]:
a_count

2756

In [13]:
b_count=0
for node in bird_hog_tree.traverse(strategy="postorder"):
    if node.is_leaf() : # why ?
        b_count+=1
b_count

411

In [9]:
bird_hog_tree.write()

'(((RHIDAH:0.00916118,CHAPAP:0.00759941)1:0.00218398,(DICMEG:0.00764643,(MYIHEB:0.00930175,((LANLUD:0.0100277,(APHCOE:0.00418719,(CORMON:0.00242676,(CORCOR:0.00159595,CORBRA:0.00301169)1:0.000693276)1:0.00236036)1:0.00371554)1:0.000713619,(STRCIN:0.00728446,(PARRAG:0.00655149,IFRKOW:0.00728625)1:5.6528e-05)1:6.8882e-05)1:6.8104e-05)0:8.1981e-05)1:7.9969e-05)1:0.00134414,(EDOCOE:0.00927021,DAPCHR:0.012291)1:0.000149208,((MOHOCH:0.00887504,(PTILEU:0.0117964,(((CALWIL:0.0104743,NOTCIN:0.0163835)1:0.000624526,(CNELOR:0.0106946,(MELVER:0.0112781,((((ANTMIN:0.0143893,(POEATR:0.00417223,(PSEHUM:0.00339711,(PARMJ:0.00208132,PARMAJ:0.00069855)1:0.00289579)1:0.000804694)1:0.00734026)1:0.00172581,((ALACHE:0.0166442,PANBIA:0.0165384)1:0.00104961,(NICCHL:0.0122264,(SYLVIR:0.0139867,((HIRRUS:0.0127127,(((BRAATR:0.00700473,PYCJOC:0.00676696)1:0.00539269,((SINWEB:0.00855782,(SYLATR:0.00541378,SYLBOR:0.00456145)1:0.00584524)1:0.000586819,((STEDEN:0.00419501,(ZOSHYP:0.0023303,ZOSLAT:0.00328568)1:0.00242

In [7]:
out = bird_hog_tree.robinson_foulds(ncbi_sub_tree,unrooted_trees=True)  # , expand_polytomies = True
(rf, rf_max, common_attrs, names, edges_t1, edges_t2, discarded_edges_t1) =out
print("RF distance is %s over a total of %s" %(rf, rf_max))


#print "Partitions in tree2 that were not found in tree1:", parts_t1 - parts_t2
#print "Partitions in tree1 that were not found in tree2:", parts_t2 - parts_t1

RF distance is 345 over a total of 567


In [132]:
for node in bird_hog_tree.traverse(strategy="postorder"):
    node.name2 = node.name
    if node.is_leaf(): 
        if "2000_X" in node.name:
            node.name = node.name[:6]
            #print(node.name)

    
(rf, rf_max, common_attrs, names, edges_t1, edges_t2, discarded_edges_t1) =out
print("RF distance is %s over a total of %s" %(rf, rf_max))


#print "Partitions in tree2 that were not found in tree1:", parts_t1 - parts_t2
#print "Partitions in tree1 that were not found in tree2:", parts_t2 - parts_t1

RF distance is 1383 over a total of 2585


In [10]:
list_leaf= []
for node in bird_hog_tree.traverse(strategy="postorder"):
    if node.is_leaf(): 
        list_leaf.append(node.name)


In [138]:
#ncbi_sub_tree.write() 

In [140]:

node_name_list= []
for node in bird_hog_tree.traverse(strategy="postorder"):
    node.name2 = node.name
    if node.is_leaf(): 
        if "2000_X" in node.name:
            node.name = node.name[:6]
        node_name_list.append(node.name)
            
            
            

In [164]:
not_list=['PHYPR', 'GUITH', 'CYAME', 'PUCGR', 'COPCI', 'SCHCO', 'PHACH', 'EMEND', 'PHAND', 'NECHA', 'VERDA', 'PROLT', 'BATDE', 'CRYPV', 'TOXGO', 'MYCCU', 'BURSC', 'METAA', 'AGRSH', 'MELGA', 'CHICK', 'ANAPL', 'MELUD', 'FICAL', 'PARMJ', 'TAEGU', 'JUNHY', 'STAA3', 'LACRD', 'SERCA', 'ANAPLA']

#bird6ID_list=list(taxonID_bird6ID.keys()) taxon
node_name_list_sub = [i for i in node_name_list if i not in not_list and i in bird6ID_list]

In [165]:
node_name_list_sub

['APTFOR',
 'ALELAT',
 'APTROW',
 'APTOWE',
 'ALCTOR',
 'ASASCU',
 'AMAGUI',
 'APHCOE',
 'AGEPHO',
 'AEGBEN',
 'APAVIT',
 'ACACHL',
 'ANSCYG',
 'ANAZON',
 'APTHAA',
 'AGAROS',
 'AQUCHR',
 'ALOBEC',
 'ATLROG',
 'APTAUS',
 'ANTMIN',
 'AEGCAU',
 'ANTCAR',
 'ATRCLA',
 'ALERUF',
 'ALACHE',
 'ACRARU',
 'ANSSEM',
 'ARDKOR',
 'ARAGUA',
 'AREINT',
 'ANHRUF',
 'ANHANH']

In [166]:

ncbi_sub_tree_original= ncbi_sub_tree
ncbi_sub_tree.prune(node_name_list_sub)


In [9]:
ncbi_sub_tree.write()

'((((((((DESCC:1,DESRM:1,DESRL:1)1:1,((DESHD:1,DESHY:1)1:1,DESDL:1)1:1,(DESMD:1,DESOD:1,DESAJ:1)1:1,PELTS:1,DESAS:1,DESAP:1,THEPJ:1,DESK7:1,SYNGF:1)1:1,((RUMC1:1,RUMA7:1)1:1,((HUNT2:1,HUNT1:1)1:1,HUNCD:1)1:1,RUMCH:1,MAGIU:1,OSCVS:1,ETHHY:1)1:1,((((CLOBB:1,CLOB6:1,CLOBK:1)1:1,(CLOBJ:1,CLOBM:1,CLOB1:1,CLOBH:1)1:1,(CLOB2:1,CLOBL:1)1:1,CLOB0:1,CLOBA:1)1:1,(CLOS4:1,CLOSS:1)1:1,(CLOAB:1,CLOAE:1)1:1,(CLOPS:1,CLOPE:1,CLOP1:1)1:1,CLONN:1,CLOLD:1,CLOB8:1,CLOK5:1,CLOC7:1,CLOTE:1)1:1,(ALKOO:1,ALKMQ:1)1:1,ARTSS:1)1:1,((CLOD6:1,CLODR:1)1:1,FILAD:1,ACESD:1)1:1,(ANAHA:1,CELLD:1,ROSHA:1,CLOSW:1,LACP7:1,AGARV:1,EUBE2:1,BUTPB:1)1:1,(EUBLK:1,ACEWD:1)1:1,(SYNLT:1,SYNWW:1)1:1,((SULAD:1,SULAT:1)1:1,THEM7:1)1:1,SYMTH:1,HELMI:1)1:1,((((THEPX:1,THESX:1)1:1,THEP3:1,THEIA:1,THEM3:1)1:1,THEPS:1,CARHZ:1,MOOTA:1,CALS4:1)1:1,((CALH1:1,CALKI:1,CALK2:1,CALOW:1,CALOO:1,CALBD:1,CALS8:1)1:1,(THESW:1,THEXL:1,THETC:1)1:1)1:1,MAHA5:1)1:1,((ACEAZ:1,HALHC:1)1:1,((HALHG:1,HALPG:1)1:1,HALOH:1)1:1)1:1,(TEPAE:1,THEOJ:1)1:1,NATTJ:1

In [163]:
bird6ID_list=[]
bird6ID_file = open(bird6ID_address,'r')
taxonID_bird6ID={}
for line in bird6ID_file:
    line_strip = line.strip()
    if line_strip.startswith('Or'):
        pass
        line_strip1=line_strip
        #header_lines_list.append(line_strip)
    else:
        line_parts=line_strip.split('\t')

        bird6ID = line_parts[6]
        bird6ID_list.append(bird6ID)


In [162]:
line_parts[6]

['Passeriformes',
 'Zosteropidae B',
 'Zosterops',
 'Zosterops lateralis',
 'Silver-eye',
 'NCBI-003',
 'ZOSLAT',
 '15251',
 '43581']

In [None]:
t = Tree('((((H,K),(F,I)G),E),((L,(N,Q)O),(P,S)));', format=1)
print("Original tree looks like this:")
print(t)

t.prune(["H","F","E","Q", "P"])
print("Pruned tree"
print t

In [168]:

bird_hog_tree_original= bird_hog_tree
bird_hog_tree.prune(node_name_list_sub)


In [169]:
bird_hog_tree.write()

'(APTFOR:1.45395,(((ALELAT:0.0938539,((APTROW:0.0212476,APTOWE:0.0431023)1:0.0691344,ALCTOR:0.073543)0.972:0.0417979)1:0.126189,ASASCU:0.0902211)0.998:0.025349,(((APHCOE:0.452866,AGEPHO:0.172617)1:0.0605827,AMAGUI:0.0481192)0.231:0.00638122,((AEGBEN:0.228914,((APAVIT:0.259342,(ACACHL:0.172978,(ANSCYG:0.054373,ANAZON:0.0206031)0.232:0.0451732)0.261:0.110195)0.796:0.041677,(APTHAA:0.692812,AGAROS:0.41341)0.999:0.670898)0.975:0.0818402)0.993:0.0852679,(((AQUCHR:0.258063,ALOBEC:0.613767)0.974:0.155622,(ATLROG:0.498934,APTAUS:0.366763)0.828:0.0314773)0.993:0.11265,((ANTMIN:0.764073,((AEGCAU:0.0573052,ANTCAR:0.64669)0.984:0.0735707,(ATRCLA:0.122335,((ALERUF:0.118689,ALACHE:0.111168)0.374:0.0793286,(ACRARU:0.026524,ANSSEM:0.109276)0.978:0.0883819)1:0.14146)0.564:0.0845237)1:0.14491)0.908:0.0463586,(ARDKOR:0.642256,((ARAGUA:0.110572,AREINT:0.123587)0.991:0.123277,(ANHRUF:0.224563,ANHANH:0.473539)0.98:0.151921)0.838:0.0373177)0.936:0.0474706)0.999:0.0768067)0.988:0.0692005)1:0.239339)0.996:0.02

In [170]:
out = bird_hog_tree.robinson_foulds(ncbi_sub_tree,unrooted_trees=True)  # , expand_polytomies = True
(rf, rf_max, common_attrs, names, edges_t1, edges_t2, discarded_edges_t1) =out
print("RF distance is %s over a total of %s" %(rf, rf_max))


#print "Partitions in tree2 that were not found in tree1:", parts_t1 - parts_t2
#print "Partitions in tree1 that were not found in tree2:", parts_t2 - parts_t1

RF distance is 41 over a total of 43


In [18]:

taxonID_from_oma_list_dic={}

list_leaf= []
for node in bird_hog_tree.traverse(strategy="postorder"):
    if node.is_leaf(): 
        list_leaf.append(node.name)

taxonID_from_oma_list =[]
omaID_file = open(omaID_address,'r')
# taxonID_omaID_small ={}
for line in omaID_file:
    line_strip = line.strip()
    if line_strip.startswith('#'):
        pass
        #header_lines_list.append(line_strip)
    else:
        line_parts=line_strip.split('\t')         
        omaID = line_parts[0]
        taxonID = int(line_parts[1])
        
        if omaID in list_leaf:
#             taxonID_omaID_small[taxonID]=omaID
            taxonID_from_oma_list.append(taxonID)

omaID_file.close()


taxonID_from_bird_list =[]
bird6ID_file = open(bird6ID_address,'r')
# taxonID_bird6ID={}
for line in bird6ID_file:
    line_strip = line.strip()
    if line_strip.startswith('Or'):
        pass
        line_strip1=line_strip
        #header_lines_list.append(line_strip)
    else:
        line_parts=line_strip.split('\t')

        bird6ID = line_parts[6]        
        taxonID = int(line_parts[8])
        if bird6ID in list_leaf:
            taxonID_from_bird_list.append(taxonID)
#         taxonID_bird6ID[taxonID]=bird6ID
        
        
        
bird6ID_file.close()

# current_time = datetime.now().strftime("%H:%M:%S")
# print(current_time, "- The map for taxonID bird6ID of",len(taxonID_bird6ID),"records have read.") 

taxonID_list_all =  taxonID_from_bird_list + taxonID_from_oma_list

print(len(taxonID_list_all))

ncbi_sub_tree_small = ncbi.get_topology(taxonID_list_all)

for node in ncbi_sub_tree_small.traverse(strategy="postorder"):
    node.name2 = node.name
    if node.is_leaf() and int(node.name) in taxonID_map: # why ?
        node.name = taxonID_map[int(node.name)]

        
        
ncbi_sub_tree_small.write()

411


'((((((BUCCAP:1,GALDEA:1)1:1,((((TAEGUT:1,LONSTR:1)1:1,(VIDCHA:1,VIDMAC:1)1:1)1:1,(((LOXLEU:1,LOXCUR:1)1:1,SERCAN:1)1:1,((CALORN:1,EMBFUC:1)1:1,(CARCAR:1,PASAMO:1,PHEMEL:1)1:1)1:1,UROPYL:1,CHLVIR:1,PEUTAE:1,HEMWIL:1)1:1,((PRUFUL:1,PRUHIM:1)1:1,PASDOM:1)1:1,(PROCAF:1,LEPASP:1)1:1,(AGEPHO:1,QUIMEX:1,MOLATE:1)1:1,(SETCOR:1,SETKIR:1)1:1,OREARF:1,MELVER:1,MOTALB:1,DICEXI:1,PLONIG:1)1:1,(((ZOSLAT:1,ZOSHYP:1)1:1,HYPCIN:1,STEDEN:1)1:1,(PORRUF:1,ILLCLE:1,PTEMEL:1,ERPZAN:1,POSRUF:1,MYSCRO:1,OXYMAD:1)1:1,(BRAATR:1,PYCJOC:1,NICCHL:1)1:1,((ACRARU:1,CETCET:1,HIPICT:1,SYLVIR:1,HYLPRA:1)1:1,(PANBIA:1,REGSAT:1)1:1,(SYLATR:1,SYLBOR:1)1:1,SINWEB:1)1:1,(PHYTRO:1,RHASIB:1)1:1,HIRRUS:1,ALACHE:1,LOCOCH:1,HORVUL:1,CISJUN:1,LEILUT:1,AEGCAU:1)1:1,(((CHLCYA:1,CHLHAR:1)1:1,IRECYA:1)1:1,((CORMON:1,CORCOR:1,CORBRA:1)1:1,CNELOR:1,APHCOE:1,ORIORI:1)1:1,(ALERUF:1,EULNIG:1,RHALEU:1,DAPCHR:1,FALFRO:1,PACPHI:1)1:1,(IFRKOW:1,PTILEU:1)1:1,(ERYMCC:1,DICMEG:1,MACNIG:1)1:1,LANLUD:1,MYIHEB:1,CALWIL:1,DRYGAM:1,EDOCOE:1,PARRAG:1

In [19]:
len(ncbi_sub_tree_small),len(bird_hog_tree)

(405, 411)

In [24]:
out = ncbi_sub_tree_small.robinson_foulds(bird_hog_tree,unrooted_trees=True)  # , expand_polytomies = True
(rf, rf_max, common_attrs, names, edges_t1, edges_t2, discarded_edges_t1) =out
print("RF distance is %s over a total of %s" %(rf, rf_max))



RF distance is 345 over a total of 569


In [20]:
len(taxonID_from_bird_list),len(taxonID_list_all),len()

(363, 411, 405)

In [31]:
taxonID_list_all =  taxonID_from_bird_list + taxonID_from_oma_list
#taxonID_list_all


In [21]:
taxonID_list_al_unq= set(taxonID_list_all)
for i in taxonID_list_al_unq:
    if taxonID_list_all.count(i)>1:
        print(i)
    

59729
59894
8839
9031
9135
9157


In [None]:
ncbi_sub_tree_original= ncbi_sub_tree
ncbi_sub_tree.prune(node_name_list_sub)

In [29]:
for i in [59729,59894,8839,9031,9135,9157]:
    if i in taxonID_omaID:
        print(taxonID_omaID[i])
    if i in taxonID_bird6ID:
        print(taxonID_bird6ID[i])


TAEGU
TAEGUT
FICAL
FICALB
ANAPL
ANAPLA
CHICK
GALGAL
SERCA
SERCAN
PARMJ
PARMAJ


In [None]:
ncbi_sub_tree_original= ncbi_sub_tree
ncbi_sub_tree.prune(node_name_list_sub)

In [None]:
            taxonID_bird6ID[taxonID]=bird6ID
            bird6ID_taxonID[bird6ID]=taxonID
        
        

In [54]:
five_repetead=set(["TAEGU","FICAL","ANAPL","CHICK","SERCA","PARMJ"])

node_uniq=list(set(list(bird6ID_taxonID.keys())+list(omaID_taxonID.keys()))-five_repetead)
len(node_uniq)

2781

In [None]:
taxonID_map = {**taxonID_omaID,**taxonID_bird6ID }

In [48]:


bird_hog_tree.nodes()


AttributeError: 'TreeNode' object has no attribute 'nodes'

In [55]:
    #bird_hog_tree=

ValueError: Node names not found: ['PSEUT', 'LISIN', 'GEOBB', 'PETMO', 'LACRJ', 'TRYB2', 'HELP4', 'LACLN', 'POLNS', 'PSEUX', 'NAUPA', 'NEIMW', 'ORYRU', 'LACPS', 'ANAMM', 'HELPG', 'OENOB', 'AZOC5', 'PENRW', 'ENTBF', 'CORCM', 'PELHB', 'BLASD', 'RHOS5', 'EHRCJ', 'DRORH', 'BORRA', 'MYCT3', 'CLONN', 'MYCAS', 'SALBC', 'RHOSU', 'ABSGL', 'THEP3', 'BUCA0', 'TETBL', 'CALK2', 'AZOOP', 'BURP1', 'MYCA1', 'METRJ', 'CHRVO', 'STRE4', 'SALSV', 'NOCDD', 'LACCD', 'METAX', 'STRT2', 'LACRL', 'STRSY', 'METM5', 'CIOSA', 'PECCP', 'CRYPV', 'AYWBP', 'SYNR3', 'ZOSMR', 'SPHS2', 'BRELN', 'LEUCJ', 'MICSL', 'PARPJ', 'METH6', 'LACSS', 'BORPC', 'ECO7I', 'AEQSU', 'AURPU', 'RHOP5', 'SALPS', 'ALTAL', 'THASP', 'CALNY', 'HELPQ', 'DRONM', 'PROMH', 'PAESJ', 'HALS3', 'TROW8', 'LOTJA', 'BIFAA', 'SHIFL', 'ACTM4', 'PHYSP', 'BORDL', 'FLAFP', 'RIPO2', 'ENTNP', 'FRACF', 'KLEFL', 'GEOSK', 'NATPD', 'GEODF', 'CUPMC', 'ECO81', 'ENTFU', 'BURVG', 'CUTAK', 'AVIPA', 'PENDC', 'STAAK', 'CORUB', 'ALISL', 'PARXL', 'SALHS', 'PEDHC', 'STRG2', 'YERPN', 'CORDH', 'BRARP', 'BACST', 'DROSI', 'CLOBM', 'KLEP3', 'YEASO', 'CAUST', 'HALP7', 'CRYMR', 'TRIAD', 'PSEP2', 'SULIY', 'BARC7', 'ORYPU', 'STRMM', 'DROOB', 'DESVH', 'GALSU', 'PECPW', 'SHESH', 'LACF3', 'METS4', 'SPHTD', 'THECC', 'CORK4', 'DROBS', 'ACIBS', 'MYCBH', 'PSEPQ', 'ADVKW', 'PSEAE', 'XANOM', 'BORBM', 'ROSS1', 'VEIPT', 'RHOPR', 'PYRNV', 'ERWBE', 'STREK', 'THESQ', 'SALTS', 'LACBN', 'NITHN', 'MONDO', 'ARCG5', 'CAPI1', 'PYRAB', 'CIOIN', 'PROCA', 'KYTSD', 'NEIG2', 'FRATH', 'DESCC', 'HYDMR', 'LEPCP', 'BRAOL', 'HAPBU', 'ANOGA', 'LEPIN', 'MORNO', 'STRP4', 'FUSC1', 'ACIAD', 'LACS5', 'BARHE', 'SINAD', 'BACA3', 'RAMTT', 'DESB2', 'PSYIN', 'MYCBP', 'CARS1', 'STRRD', 'CORDW', 'SORAR', 'STAAG', 'HALMD', 'SACS9', 'GLUAR', 'NITMS', 'ROSLO', 'ACIFD', 'PYRCJ', 'SCOMX', 'POPTR', 'PHEZH', 'RHOCB', 'ZYMMO', 'FERPD', 'SYNSC', 'PHOAA', 'ALIAD', 'STAAB', 'ENTTH', 'LACSC', 'ALKMQ', 'BIFAB', 'HELPJ', 'STRAW', 'PYRAE', 'SULIA', 'METS3', 'ACEWD', 'SULIK', 'LACPL', 'PECAS', 'BACAA', 'MYCHL', 'MUSAM', 'HYDVU', 'PERMH', 'CAMJS', 'SALPA', 'PYRFU', 'CHLT3', 'BACAI', 'CAMJJ', 'GEOS4', 'PLABA', 'CLOBA', 'DROWI', 'ACIB5', 'DOKS4', 'STRES', 'STROU', 'EMENI', 'BORBP', 'BART1', 'OCTBM', 'LEUS2', 'VOLCA', 'ANGAN', 'WOLPI', 'SALHC', 'FRAAD', 'CORGK', 'BIFLS', 'HARSA', 'HYPBU', 'MICCC', 'HAHCH', 'STRMD', 'SYNWW', 'CORD3', 'BLAHO', 'BRAFD', 'CAPTE', 'GONPJ', 'YEAST', 'ACIF5', 'THEFY', 'TRAPU', 'PREDD', 'CENSY', 'BRAHW', 'IXOSC', 'STRED', 'SELS3', 'ANOCA', 'ECO45', 'AMYMU', 'ASPAC', 'CAMJE', 'ECO5T', 'FRADG', 'LACRG', 'SHEB8', 'HUNT2', 'OCEGH', 'ACIC1', 'RHOBA', 'ACEP3', 'BACC0', 'YANXG', 'MYCHM', 'ATETH', 'MICLC', 'SPIPN', 'FLAJ1', 'BACHK', 'TREAZ', 'POLNA', 'RUBBR', 'TRIHA', 'MELRP', 'KANKD', 'NAKMY', 'PSEPW', 'DENPD', 'THIK1', 'PYRIL', 'WOLCO', 'DESOH', 'ALTMD', 'HELPC', 'RAHSY', 'GLOC7', 'PROCO', 'METMS', 'BORCA', 'PICGL', 'GIBZA', 'DIACI', 'DEHM1', 'FRACN', 'GORB4', 'LEPFC', 'CYTH3', 'RICPT', 'OOCBI', 'TERRK', 'SYNLT', 'BORPE', 'LISSS', 'STAND', 'DICC1', 'SCLS1', 'STRZO', 'CAPAN', 'BACC2', 'SACES', 'SALTI', 'COCAP', 'YERPY', 'THEC1', 'STRS7', 'STRZJ', 'THENR', 'ALLVD', 'SULIL', 'HELPR', 'NEIMA', 'PHAVU', 'SYNS3', 'SYNE7', 'CRYPA', 'ECOL6', 'STRIC', 'ECO24', 'OCHA4', 'CHLPP', 'CROS5', 'ECOHS', 'PARUW', 'CROAH', 'MYCCT', 'THEON', 'ENCCU', 'ENT38', 'BIFLB', 'GLOTR', 'NOSS7', 'WOLTR', 'CORD2', 'PONAB', 'THISH', 'PSYCK', 'PLAVS', 'PHYBL', 'RICS1', 'STRG3', 'TOXGO', 'DESAR', 'ASPOR', 'MYCPE', 'STRT1', 'DIPOR', 'ECTSI', 'THEU7', 'RICPP', 'CLOPE', 'SARSC', 'STAHY', 'HALLT', 'LUCCU', 'RETFI', 'MICPN', 'POLGS', 'THET1', 'METBM', 'PANVC', 'DESPD', 'HELPP', 'MYCH2', 'STRA5', 'ECHTE', 'NITWC', 'SEBTE', 'THENN', 'VIBFN', 'CHLTS', 'BURGB', 'CLOAI', 'STRC3', 'CHLTF', 'SPHPG', 'YERPD', 'MYCSS', 'THEGJ', 'SYNAS', 'KITAU', 'ANAD2', 'XANCP', 'GARV4', 'STRS2', 'ALKPO', 'DESBD', 'MESAW', 'THEBD', 'PHYIT', 'CHRTP', 'HELP5', 'TRURR', 'BUTPB', 'ROTDC', 'STRDJ', 'ACTUT', 'METEZ', 'YERPH', 'FRASN', 'ORYNI', 'LACFC', 'SERP5', 'UREP2', 'ENTLS', 'NOCSI', 'ANADF', 'BRALA', 'MYCPU', 'ECO10', 'BURP6', 'FERFK', 'ORYSJ', 'AERHH', 'VIBC3', 'THEA1', 'TRIVA', 'VIBCB', 'ECOLC', 'METFJ', 'KOMMN', 'TRES6', 'RHIME', 'PEDPA', 'GEOTN', 'THETO', 'SERSA', 'DROAR', 'THES4', 'DROER', 'SINFN', 'MYCUA', 'PRIPA', 'VIBSN', 'FONAL', 'MYCBT', 'FLACA', 'POREA', 'POERE', 'ECHVK', 'COXBR', 'HELPI', 'THECD', 'RHIEC', 'ACEAZ', 'STAAC', 'BODSA', 'PROMT', 'SHELP', 'MYCHI', 'SHESA', 'SULAT', 'CAMJI', 'BRUA2', 'LACAL', 'HELPV', 'STRSE', 'BURMS', 'ICTTR', 'ECOAB', 'STRRB', 'THESW', 'LACAR', 'LACJO', 'PROM3', 'NITHX', 'CYAGP', 'PSEF5', 'CHLP6', 'ACIMA', 'METNJ', 'NITEC', 'ECOCB', 'VIBCM', 'HALPG', 'TOXGV', 'METII', 'DEIML', 'NEPCE', 'TETPH', 'METEA', 'PRUPE', 'SAICN', 'PSE14', 'SHISS', 'BLOVB', 'GEOKA', 'SYNGF', 'PROMP', 'DYAFD', 'FOLCA', 'MARBU', 'WEIKK', 'DROTK', 'ALTNA', 'SULMS', 'FERNB', 'THEMA', 'GRENI', 'SERDU', 'CYCSP', 'METI4', 'METMP', 'LEGLN', 'DESOD', 'GEOSM', 'PSEAB', 'STRE2', 'URSAM', 'LEUGJ', 'BACT0', 'CANAW', 'STRTN', 'MYCBG', 'RHIO9', 'TRIIM', 'SCHCR', 'FLESM', 'PUSST', 'MYCHH', 'HAMHA', 'SYMTH', 'HELPS', 'RICAG', 'PLAKH', 'BATDJ', 'COPPD', 'CAVAP', 'CALBD', 'HALHC', 'CHLPE', 'MEISD', 'TAKRU', 'PARRH', 'PARTC', 'BURPS', 'CORDJ', 'CATAD', 'PASMU', 'NEIMF', 'AGARV', 'LINHU', 'MELCN', 'WIGBR', 'CYBJN', 'DESRM', 'STRSH', 'KLEOK', 'NITEU', 'METS5', 'PHYPA', 'SIMNZ', 'STAAT', 'AGGAN', 'FUSPO', 'STRMK', 'BRASO', 'FOMME', 'SALPC', 'HALOH', 'ARCNC', 'AQUAE', 'ARCGX', 'LACLA', 'DESHD', 'STRBB', 'SALT4', 'ILYPC', 'GORPV', 'STRPD', 'MAIZE', 'PHAND', 'THEEB', 'KORCO', 'NOSP7', 'DESGG', 'CHESB', 'ECOK1', 'MYCCR', 'LATCH', 'LACLM', 'MYCTD', 'STRSW', 'PLAFO', 'BRUSI', 'CUPNN', 'OSTTA', 'STRE5', 'THEKO', 'CHLPB', 'TURTR', 'MYCSK', 'PHYNI', 'HORVV', 'PSEPB', 'VIBA3', 'ECOBR', 'SHEP2', 'ALTME', 'ASPGL', 'HELME', 'STRP6', 'XANAC', 'YERPG', 'VERDA', 'BURTA', 'BORBU', 'ECO57', 'CAMJR', 'CHLAD', 'FLABF', 'ISOV2', 'STRPN', 'POLPP', 'COLGR', 'LEUMM', 'ASHGO', 'LEIMA', 'DROSZ', 'DROBM', 'TEPAE', 'RICCK', 'AGABU', 'STRMN', 'METAQ', 'NEUT9', 'MESFL', 'BACLD', 'FRAIE', 'CHLTB', 'BLOFL', 'CHLTJ', 'ECOC1', 'HALWD', 'CHLT0', 'NONDD', 'ONYPE', 'SULAD', 'STAAH', 'CORA7', 'SANKS', 'DROMI', 'NEIMG', 'BRUSU', 'METSB', 'UREPA', 'NECHA', 'SALPB', 'STAMF', 'DESTD', 'USTHO', 'SCHCM', 'NIAKG', 'THEID', 'PARAV', 'ACTP2', 'STYLE', 'CYPVA', 'METPE', 'CYAP4', 'SERFO', 'ECO55', 'PHYMT', 'CHAP6', 'BUCA5', 'ENTCC', 'SULIR', 'DASNO', 'STREC', 'RENSM', 'ANACC', 'METRM', 'METEP', 'PICTO', 'GRAMM', 'CHLT9', 'BORT9', 'MYCVP', 'RHOPT', 'TROWT', 'GLOVI', 'MYCPA', 'VULDI', 'WEEVC', 'LODEL', 'ASTRU', 'SULDN', 'TREPC', 'LACLV', 'OLICM', 'HIPMA', 'MYCHA', 'PINTA', 'CELAD', 'GARV3', 'SALAR', 'ZYMMA', 'METGS', 'MARMM', 'SHIF2', 'METMJ', 'NEOFI', 'PSET1', 'PSESL', 'ALCDB', 'BRADU', 'STRA1', 'ACIB3', 'STRGC', 'ECOUM', 'HAEIN', 'PERM5', 'ORYLO', 'HUNT1', 'PYROM', 'EGGLE', 'HALO1', 'MYCGO', 'MANES', 'NITMU', 'CORP1', 'AERPE', 'CORPF', 'EMIHU', 'MAGIU', 'ORCCI', 'CARRP', 'BACTN', 'SACD2', 'MYCMS', 'ORITI', 'EXOME', 'BACC7', 'MYCTK', 'STRS4', 'STRZP', 'ICHMG', 'BIFLD', 'ALIDK', 'MYCPN', 'BORP1', 'CAMJD', 'MYCFP', 'PEDCP', 'CLOBK', 'NICAT', 'MICTS', 'BELBD', 'RUEST', 'PSEPG', 'LARHH', 'ASTCA', 'DESSD', 'EXIS2', 'LEIBR', 'CHIPD', 'LACSM', 'STAA9', 'STAC7', 'UREU1', 'WEIVI', 'LEPSM', 'RALSL', 'CELFN', 'MICAA', 'MORCR', 'BUCCC', 'ICTPU', 'HALNC', 'NATSJ', 'LEPFM', 'LACRR', 'HALMS', 'LACTC', 'METLA', 'CALOW', 'ECOLX', 'LEPBJ', 'BACCR', 'EUBLK', 'ARCHD', 'HELP2', 'TREBD', 'CYCMS', 'SULAO', 'STRLN', 'ATTCE', 'NASVI', 'RIEPU', 'STRPX', 'ACTCC', 'NOSS1', 'BACMQ', 'CHTCT', 'CAMC5', 'LACH4', 'SOLTU', 'TREPP', 'EHRRW', 'ERIEU', 'TREME', 'AMPOC', 'ASPFU', 'METTP', 'STRP3', 'ECOBB', 'CALS4', 'XYLFG', 'LEPSE', 'HODCD', 'GRABC', 'ACIS3', 'MYCRN', 'CORU7', 'DICD3', 'RHOM4', 'STRA3', 'BARGA', 'PARBA', 'STRP8', 'TATMI', 'TETNG', 'BURM7', 'PROMS', 'TALSN', 'SPILD', 'CITK8', 'CHLPD', 'BARQU', 'CHLFF', 'BIFL2', 'PSEPM', 'LEIMU', 'PROLT', 'ORITB', 'BACC6', 'CAMJM', 'PIRSE', 'MYCHN', 'STACT', 'SINMM', 'MICAI', 'MYCA9', 'RHIOR', 'THETK', 'CHLTD', 'THISK', 'HALHT', 'MARPO', 'THET7', 'TREPM', 'ECOS5', 'RHOGW', 'THEAM', 'PSEAA', 'RUEPO', 'STRPF', 'THELN', 'MONBE', 'LACP7', 'STAS1', 'NAUCC', 'SARHA', 'SYNJB', 'MODMB', 'LACD2', 'KINRD', 'VIBA7', 'METBF', 'HELMI', 'AZOVD', 'SIDLE', 'METFS', 'PHYRM', 'STRPB', 'THIDA', 'BLAVI', 'MIXOS', 'COXBN', 'WOLSU', 'CORD7', 'LISML', 'ORYSI', 'LACGA', 'RAHAC', 'TREPA', 'CAPOD', 'STAAW', 'HAEIE', 'VITVI', 'STRR1', 'MYCIA', 'CHLPM', 'ENTFD', 'RICAC', 'SHIBC', 'PUCT1', 'BOMIM', 'PICGU', 'COMT2', 'SYNP2', 'RUBXD', 'PSEFS', 'KORVE', 'ACIS0', 'METS6', 'BRUA1', 'GEOS0', 'HALHL', 'HALMA', 'METCR', 'DACHA', 'PORCN', 'CULSO', 'DEIGD', 'MARTH', 'ACIA4', 'ACTPJ', 'CLOB0', 'BRAIP', 'METIK', 'BRUMC', 'PREMB', 'BACCI', 'EUTLA', 'METMA', 'THEPD', 'MYCCM', 'HAMD5', 'NOSA0', 'SERID', 'RHOH1', 'ECOH1', 'LEPBP', 'HYPSL', 'VIBCJ', 'META3', 'GOSHI', 'STRFR', 'BRUMB', 'PROMA', 'STRPU', 'CANGA', 'CLOB1', 'COXB1', 'PAETH', 'STRA7', 'LEGPL', 'CHLT5', 'FRATO', 'STAA2', 'STRA9', 'GIAIA', 'STRP0', 'CORVD', 'ACIIR', 'CHLCH', 'PANAA', 'RICPU', 'TANFA', 'AGRSH', 'DESAD', 'MYCBO', 'EDWI9', 'LEPBL', 'TRYCI', 'ALCBS', 'DELAS', 'LINUN', 'NEUCR', 'NANS0', 'SULBS', 'STRE8', 'HELAH', 'BRUM5', 'DELSC', 'PECPM', 'METVM', 'STRGA', 'ACYPI', 'XANCB', 'CLAM3', 'CLOB2', 'GRAFK', 'XANP2', 'CONPW', 'FLASC', 'ACIBD', 'STAAF', 'CHLVA', 'SYNY3', 'STREN', 'STRSV', 'COLFT', 'HYPNA', 'LEGP2', 'CHEQI', 'MICAN', 'ACIAP', 'SINMB', 'ECOL5', 'GLUDA', 'PYRYC', 'METHD', 'ERWP6', 'EUBE2', 'FRATN', 'ENTFA', 'TRYCC', 'CORP9', 'DROPE', 'CREAS', 'COPCI', 'EHRCR', 'GLUOX', 'HALWC', 'MYCPK', 'TOBAC', 'YERPP', 'AMPCI', 'THEM3', 'STRPS', 'CAERE', 'HALED', 'ACIBT', 'NEIMM', 'THEAN', 'CHLT1', 'SPHPU', 'PSEUL', 'RHOFT', 'PYRTR', 'METB6', 'ONCVO', 'KRYMA', 'SPHWW', 'CITRI', 'STAAD', 'VIBVY', 'STRG1', 'NEIMN', 'MAGSA', 'HAEI8', 'FRACC', 'STRZN', 'DESAP', 'CHLTZ', 'ACICJ', 'COFCA', 'LISM2', 'MYCA5', 'FERPA', 'NITSI', 'PSEPJ', 'THEAB', 'CAPCC', 'YARLI', 'HIPCM', 'MOREP', 'STAPH', 'NATGS', 'EMTOG', 'YERE8', 'STRPJ', 'DEFTU', 'DRONA', 'DESAJ', 'LACCB', 'PALPW', 'DESK7', 'CHLSY', 'XYLFM', 'MYCTC', 'DROMO', 'ARCB4', 'SHEVD', 'OLICO', 'PELPD', 'ACAM1', 'CLOTE', 'DEBHA', 'CUPNH', 'CYNCS', 'CHRSD', 'SALNS', 'NEIMB', 'HYPSM', 'WOLPM', 'SHEB2', 'RHOP2', 'CLOB8', 'MYCCU', 'BURCM', 'MYCS5', 'PSEA6', 'SCHJY', 'ECO1A', 'VESOH', 'SOVGS', 'ANAMF', 'VERDV', 'STIAD', 'PSEA9', 'ERWAE', 'RICPR', 'ANODA', 'KOCRD', 'TUPBE', 'BURMA', 'PREDF', 'HYPSF', 'META2', 'PENCH', 'SHEWM', 'SULNB', 'ASPFN', 'USTV1', 'HYDTT', 'RICP3', 'STAHD', 'BORPD', 'METZD', 'HYAAE', 'THETC', 'MYCLG', 'CORLD', 'BIFA0', 'ECOSE', 'TUBMM', 'OSCVS', 'SYNJA', 'BACAH', 'BACT1', 'CLOAB', 'HERSS', 'ICHMU', 'ENTHI', 'MYCHJ', 'PSEU9', 'RHOPA', 'RUBGI', 'YERPS', 'ACCPU', 'SOLCM', 'CHLTR', 'APIME', 'STRM6', 'LOKAC', 'BRUME', 'SCHMA', 'MYCMO', 'LISMM', 'STRTO', 'SHESW', 'DEHMB', 'CLOD6', 'AGRRK', 'PENIT', 'XANOR', 'CHLAB', 'GEOSW', 'DICSQ', 'FLUTR', 'PAEPS', 'DROME', 'PSEP1', 'THET2', 'CRYCD', 'PSEPK', 'ACIBY', 'MYCLE', 'PHYMF', 'RICRS', 'VIBAE', 'LACDN', 'PELLD', 'KLEAK', 'BACC4', 'AERUA', 'NEIM0', 'XYLFT', 'MYCFJ', 'NOCFA', 'CAVFA', 'THEAH', 'METFV', 'PROM1', 'THECK', 'EXODN', 'ALIAT', 'LEIIN', 'LISIP', 'BUCBP', 'PHAAN', 'AMPQE', 'YERRU', 'PORAD', 'YERE3', 'CROTZ', 'NATA1', 'STANL', 'THEM7', 'BACPZ', 'CALH1', 'CLOSS', 'NATMM', 'PSEE4', 'BACIE', 'GRATM', 'DEKBR', 'KAZNA', 'ORNRL', 'TURPD', 'RHOOB', 'PENPA', 'SCHPO', 'HELHP', 'PELCD', 'HAEI6', 'HISS2', 'THEP1', 'YERE1', 'RHIIW', 'PETMA', 'STRM5', 'TETTS', 'XANOP', 'SPHGB', 'SYNPW', 'MYCH7', 'CLOK5', 'SACVD', 'RHILO', 'PSELT', 'HAEIG', 'TAYEM', 'CREFR', 'BREBN', 'LACGL', 'STRET', 'ACIB1', 'METAC', 'CELGA', 'LISW6', 'PSYA2', 'IDILO', 'STRT9', 'METBU', 'RALP1', 'FRAT1', 'METJA', 'CHLTE', 'SOLUE', 'THESM', 'MYCSL', 'RHIE6', 'AMBTC', 'HALHG', 'BORBN', 'IGNAA', 'AJECH', 'NATTJ', 'VEREI', 'VITBC', 'CLOC7', 'ALTMB', 'RICRO', 'GOSAR', 'CAMLR', 'MYOLU', 'BURSC', 'ARAAL', 'THEPJ', 'MARM1', 'EPTBU', 'POLAQ', 'ETHHY', 'ACTS5', 'SCHMD', 'CYASC', 'PHANO', 'NANEQ', 'CLOSW', 'XENBS', 'BRAFL', 'CALKI', 'BACP2', 'CORAD', 'MUCCI', 'METML', 'BACC3', 'AZOPC', 'STAC4', 'BURGS', 'MANSM', 'RUMA7', 'HALUD', 'FRATW', 'CORDD', 'LIBAP', 'SHEON', 'SHEB9', 'RALSO', 'PARL1', 'MARAH', 'BACHD', 'THEAC', 'STRZI', 'COCIM', 'ASPTN', 'BACPT', 'PELSI', 'PEDHD', 'CLOP1', 'IGNAJ', 'STRV1', 'LACPJ', 'METIM', 'PUCGT', 'CANAL', 'HELCP', 'KLEP7', 'BAUCH', 'BRAP9', 'STAAS', 'COXBU', 'THELD', 'BEUC1', 'CLOPS', 'CUPPJ', 'HYDCU', 'ARCVS', 'CHLRE', 'COPC7', 'PANAM', 'PUNST', 'RICR3', 'MYXFH', 'THEYD', 'VULM7', 'PLAYO', 'SULD5', 'ACTP7', 'HELRO', 'CAMHC', 'PICST', 'RHOPS', 'SOLLC', 'AROAE', 'CUTAS', 'ECO26', 'ARTGP', 'SPHMS', 'PSYTT', 'CORGB', 'FLAIG', 'MICPC', 'PRORD', 'BACSE', 'CLOB6', 'AKKM8', 'CAMFO', 'TETUR', 'THEIA', 'STRPM', 'DROKI', 'CLOBH', 'SHEPA', 'OGAPD', 'PORG3', 'KETVY', 'NEOBR', 'ZUNPS', 'PESFW', 'DROFC', 'CAUVC', 'BACC1', 'DEHLB', 'METP4', 'PASFU', 'HELP1', 'ALIAG', 'NEIM8', 'SHESM', 'ANAHA', 'ACIF2', 'FUSNN', 'HELAN', 'SINMK', 'MAGMM', 'NAUDC', 'PELUB', 'SHIB3', 'RHOPX', 'BURP0', 'PSYWF', 'HELPK', 'LYSSC', 'KLEVT', 'STRA2', 'METFA', 'CHLP1', 'CHLT2', 'BRASB', 'BOTFB', 'AJEDR', 'THEPS', 'MYCGA', 'LOTGI', 'BRANA', 'MYCSJ', 'HELP9', 'PEDPL', 'CONWI', 'AGRVS', 'HALMT', 'ROSCS', 'ZOBGA', 'MAHA5', 'DROYA', 'ALIFM', 'PHYAS', 'EIMMA', 'CORDI', 'SLAHD', 'CHOHO', 'TETHN', 'XYLFA', 'ENTFO', 'ACHXA', 'VICPA', 'PLAF4', 'VERMA', 'ACICS', 'BABBO', 'STRVP', 'ASPCL', 'CORMM', 'MYCFM', 'EIMAC', 'TREPD', 'MYCPB', 'GUITH', 'MEGSC', 'LACRS', 'RHOS4', 'PSEMN', 'MEIRD', 'SULSY', 'CYAME', 'STRP1', 'SHEFN', 'COXB2', 'FRANT', 'GEMAT', 'ECOKO', 'COLSU', 'ACTSZ', 'BLASB', 'SACS2', 'PSEKB', 'DESVR', 'CHLT7', 'AMPPE', 'SPHMC', 'DESHY', 'MOBCV', 'JUNHY', 'HAEPS', 'SALCH', 'GARVH', 'BATDE', 'ACESD', 'THET8', 'PYRAR', 'PROA2', 'RICAE', 'BURM1', 'PYROC', 'BURM9', 'ERWAC', 'XANAP', 'CAMFF', 'SPIAZ', 'EIMTE', 'GIAIB', 'PARBD', 'THEVU', 'PAEAT', 'NEIG1', 'XYLF2', 'ECO8N', 'LACAC', 'MYCWM', 'THEPA', 'YERPB', 'BRUO2', 'NEIMP', 'METNO', 'FRATE', 'BUCAI', 'IGNH4', 'TERTT', 'STAAN', 'YERPA', 'METM7', 'PARBH', 'GUITC', 'METPB', 'NANGC', 'BACMD', 'LEUGG', 'LACA3', 'CHEAB', 'EEGSY', 'GALCS', 'HELP7', 'ROSHA', 'METSZ', 'GOSRA', 'PRER2', 'BOTPV', 'RICCN', 'MARHT', 'BACT6', 'TREPU', 'SALDC', 'ZYMTR', 'SALG2', 'LEGPA', 'SYNPV', 'RHOCS', 'PUCGR', 'AZOSB', 'OSTLU', 'BIFBP', 'NITSE', 'HELPL', 'DESMD', 'METST', 'CALAS', 'ECOBW', 'ANOFW', 'MELPT', 'STRPQ', 'DESAH', 'ECO5E', 'MARMS', 'BLUGR', 'SYNFM', 'MAGSQ', 'STRPC', 'SHEPC', 'ARATH', 'CYAAP', 'MICMU', 'SPITD', 'DESTT', 'MYXSD', 'SYNS7', 'VIBCH', 'AEDAE', 'METOI', 'AGRFC', 'RICTY', 'MIDMI', 'BRESC', 'SALPK', 'LEIDB', 'CALOO', 'CHLMU', 'PROM2', 'PROM9', 'RICBR', 'KLEPH', 'MYCKA', 'GIBM7', 'BIFLO', 'HELFC', 'TRIUA', 'CYBFA', 'ARMGA', 'ARTS2', 'HERAR', 'MELGA', 'BACSK', 'HALXS', 'SORBI', 'ACIAC', 'NITSB', 'STRSX', 'LISMF', 'LACFM', 'ECOBD', 'THERP', 'CELFA', 'MYCGE', 'RIPO1', 'ACICP', 'BACVZ', 'DICTD', 'ZOONE', 'LISMC', 'WOLWR', 'CAUSK', 'FRAP2', 'PHYPN', 'MELUD', 'SOLIN', 'HYPDA', 'ARCPA', 'SULKY', 'STRZ6', 'BURCH', 'LACKZ', 'PROFC', 'ECOC2', 'SULMD', 'ZYMMN', 'STAA5', 'COCLU', 'SPHAL', 'GALAU', 'SIMAS', 'LACLK', 'BACV8', 'ORYGL', 'GLOV7', 'CALMQ', 'PYRF1', 'BARVW', 'SALRD', 'CHOCR', 'RHILS', 'COCH5', 'LENAE', 'DECAR', 'MYCTA', 'PSEUU', 'DAUCS', 'DEIPD', 'TSUPD', 'ERATE', 'MYCSH', 'STRR6', 'ROBBH', 'CHLTG', 'DESTA', 'BIFBA', 'CORDL', 'LISMO', 'POLSJ', 'METTV', 'ARTOC', 'BIFAP', 'HELPB', 'SALA4', 'SPAPN', 'AERVB', 'MYCGI', 'STRHJ', 'MYCCP', 'UNCTG', 'TRIEI', 'JANMA', 'HELPH', 'MNELE', 'EMEND', 'CAMJC', 'DROVI', 'INTC7', 'XIPMA', 'PHOPR', 'SINMW', 'CHLT4', 'PHACI', 'PSEF1', 'GASAC', 'PHAIB', 'DROEL', 'ECOSM', 'FLABB', 'PTEVA', 'HERA2', 'ENTI1', 'BACAC', 'ARTBC', 'SHEPW', 'CRYNJ', 'DESIS', 'DROSE', 'MALGO', 'DROGU', 'CRYPI', 'TOLAT', 'CHLL2', 'HAEI2', 'DESA1', 'LEAB4', 'SALR5', 'METPS', 'PICSO', 'STRPG', 'HOYSD', 'STRR3', 'DESMR', 'DESRD', 'CORDK', 'CLOAE', 'NANSJ', 'PAEMK', 'PORGI', 'STRDG', 'SHIDS', 'THICA', 'SPHCD', 'AMYMS', 'ARCG6', 'BACCZ', 'ORNAN', 'LYSEN', 'MELHA', 'GRIFR', 'BURL3', 'FIBSS', 'METAA', 'THEBM', 'GEOUR', 'TISMK', 'BACAS', 'LACCC', 'LACDB', 'PYRHO', 'TRISP', 'YERP1', 'HALSG', 'MYCCN', 'DROPS', 'HEIAB', 'PELSM', 'ZINIC', 'VANPO', 'MYCSR', 'PUNMI', 'BURCJ', 'JANSC', 'POSPM', 'VIBVM', 'TRECH', 'SALTO', 'SHIBS', 'KITSK', 'METLZ', 'SHIF8', 'MYCL9', 'DICP7', 'STRP2', 'CANAR', 'BEABA', 'MARPK', 'SOLSS', 'ECODH', 'RHOVT', 'MYCAP', 'PLAF7', 'SALTD', 'BUCAJ', 'MYCC1', 'ERYRF', 'ALTMS', 'PSESM', 'SHEDO', 'BORPA', 'OCTDE', 'PROM5', 'WADCW', 'ORYBR', 'BORBZ', 'TRAVS', 'BACAN', 'DESVV', 'STREJ', 'ACTMD', 'HELM1', 'FLAB3', 'THAOC', 'BACSU', 'RIEAR', 'NEMVE', 'METFK', 'ZYGRO', 'HELPM', 'TRICA', 'LISMH', 'STAA8', 'PSEU2', 'NEIMH', 'HELPY', 'LUPAN', 'CARHZ', 'PARTE', 'MALS4', 'CORGP', 'SULAA', 'PENNA', 'WALSE', 'URSMA', 'ANAPD', 'SALTY', 'PYROT', 'AERS4', 'SYNS9', 'BARBK', 'ORENI', 'BACA1', 'SELRL', 'XANC8', 'BERLS', 'SALAI', 'BRADI', 'MAGP6', 'STALH', 'LEPOC', 'LEIXX', 'YEASA', 'ACISJ', 'MYCSD', 'SYNP6', 'CLOBJ', 'DESAA', 'CUCSA', 'OCEP5', 'SEDSS', 'CHLTL', 'SALSA', 'ERYLH', 'SYNP3', 'BACCJ', 'DOTSN', 'DEHMV', 'DESPS', 'MYCH1', 'CALLD', 'HELBC', 'PROSM', 'RHIL3', 'ACAPL', 'EDWTF', 'SULTO', 'STRMU', 'STRZT', 'STRPW', 'STAHJ', 'METVS', 'ELUMP', 'METC4', 'SHESR', 'HYPAI', 'NITOC', 'RICMS', 'STAAE', 'THEVO', 'NEOSM', 'HYDS0', 'AJECN', 'RHOPB', 'STAES', 'ECOLU', 'METM6', 'HALRX', 'CHLPN', 'MEDTR', 'ACIB4', 'STRCO', 'BUCSC', 'THESX', 'YERPZ', 'TRAHO', 'THEXX', 'THOAR', 'EXIAB', 'KOMPG', 'DESM0', 'MYCTU', 'NOCCG', 'PSEBN', 'THEPX', 'ACIHW', 'DEIDV', 'MOOTA', 'OPITP', 'PLAMA', 'STAAM', 'ORYLA', 'MAGO7', 'CAMC1', 'TREPS', 'ECO27', 'DENA2', 'XANC5', 'CLOBL', 'BORBR', 'OCEIH', 'ARTOA', 'RHILW', 'METPW', 'ODOSD', 'MYCS3', 'RICFE', 'ANATU', 'PLAFP', 'STRU0', 'TERSS', 'MAGGR', 'VARPS', 'ROTMD', 'METCA', 'STRPZ', 'PSEA7', 'PHACH', 'DEIRA', 'PLAL2', 'BDEBA', 'USTMA', 'SULMC', 'RUMCH', 'ARTSS', 'HAEDU', 'LAWIP', 'MYCTF', 'LEPBA', 'AMPXN', 'KLULA', 'ANATE', 'CLAMS', 'HALBP', 'PHYPR', 'BACFN', 'CALS8', 'DICZ5', 'SULID', 'YEASV', 'SULIM', 'BIFAV', 'DESVM', 'BORA1', 'DEHMG', 'LEUCK', 'DROEU', 'CHLP8', 'ACIC5', 'NEIM7', 'DAPPU', 'ARCG3', 'METED', 'PYGNA', 'ACIBC', 'MESCW', 'BRAM5', 'BACCN', 'BRUC2', 'DROHY', 'LYNSP', 'PAEP6', 'THEXL', 'BLOPB', 'MELPD', 'MACCJ', 'SALRM', 'BIFLJ', 'BIFDB', 'PSEUV', 'ANADE', 'THEM4', 'HALVD', 'LEGPH', 'ERWPE', 'MARSH', 'ACIFV', 'PARDP', 'PLAFR', 'SOYBN', 'LOALO', 'PLAHL', 'NITMO', 'METTM', 'ENTAL', 'RICB8', 'NITG3', 'FRAAA', 'CULQU', 'VAVCU', 'ENTF6', 'MYCGF', 'GADMO', 'CAEBR', 'FILAD', 'MYCHP', 'PLABS', 'MYCA6', 'RICAH', 'CAPO3', 'MICS3', 'MYCHD', 'CELLD', 'PELPB', 'BOMMO', 'DROGR', 'SHEAM', 'GEOLS', 'DANRE', 'STRGG', 'OCHPR', 'BURCC', 'ALKOO', 'PASMH', 'PSECP', 'ENTDS', 'KETVW', 'YERP3', 'SULIN', 'BIFBS', 'BIFAS', 'SPITZ', 'STRIJ', 'NITGG', 'STREH', 'FOMPI', 'BIGNC', 'STAA1', 'THEKT', 'BRUMA', 'OCESG', 'KOSOT', 'EDWTE', 'THEAS', 'CAEBE', 'DEFDS', 'OLSUV', 'SYNRA', 'SCHCO', 'DEIPM', 'BRUAB', 'BIFL1', 'PARP8', 'GEOSY', 'EHRRG', 'ALKEH', 'STAPE', 'WHEAT', 'LACLS', 'LEEBM', 'AZOL4', 'RHIFH', 'CAEJA', 'OWEHD', 'ASPC5', 'GEOMG', 'SULAC', 'CRYGR', 'PROM4', 'RUMC1', 'RHOJR', 'SHEHH', 'ROSDO', 'DEHMC', 'ACISD', 'DESAS', 'ECO1E', 'SCHOY', 'MYCLB', 'DROBP', 'METAR', 'FUSO4', 'BORHD', 'MORCB', 'LEPIC', 'BUCAF', 'MYCCH', 'BEII9', 'CROS8', 'LACS1', 'MYCA0', 'CORRG', 'ENDPU', 'ALIF1', 'STRP7', 'DICNV', 'CRYGA', 'LACRD', 'NEIML', 'RUTMC', 'SPHJU', 'HALTV', 'METCZ', 'METSF', 'METSG', 'PICSI', 'CELJU', 'THIV6', 'TRYVY', 'VIBVU', 'SULMW', 'SORC5', 'KRIFD', 'SAPPC', 'VIBPA', 'ALIFI', 'CAPGI', 'LACCZ', 'BACF6', 'PSEA8', 'MOLML', 'DESDL', 'SEGRD', 'ESCF3', 'JACJA', 'ECOLW', 'HALH1', 'NITWN', 'CUPTR', 'CYAM1', 'DESRL', 'ECOLI', 'TRIV2', 'WALI9', 'MYXXD', 'RHOE4', 'PHATC', 'HALAF', 'GIAIC', 'PSEMY', 'POEFO', 'LACJF', 'LEPMJ', 'HAEP3', 'RABIT', 'ASTMX', 'LACBI', 'ECOD1', 'ORYME', 'SORMK', 'STAEQ', 'LACDA', 'KITGR', 'HIRBI', 'TREDE', 'BURCA', 'JONDD', 'NOCAA', 'COLGC', 'DEIGI', 'ASPNG', 'BURSG', 'STRM9', 'CLOBB', 'TILAU', 'STRGZ', 'CARH6', 'LACBA', 'STRFA', 'MYCMM', 'BABBI', 'SALEP', 'THEOJ', 'BACFR', 'PIRSD', 'THETG', 'HYPDU', 'HYPJE', 'BURA4', 'MONPE', 'DANPL', 'BACCQ', 'SETIT', 'LISM4', 'RALPJ', 'CYNSE', 'SULIH', 'LACCS', 'PELTS', 'PHOSB', 'PSEUP', 'STAA0', 'XENNA', 'HYPVG', 'STRTD', 'CAUVN', 'STREM', 'DINSH', 'STRCL', 'LACGT', 'CELLC', 'HAEI1', 'METMI', 'AJEC8', 'ERWSE', 'PREI7', 'CLOLD', 'HAES1', 'THECL', 'GEOSL', 'XYLCX', 'LEGPC', 'FRATM', 'ANASK', 'FRATT', 'AURST', 'BUCAP', 'CHLCV', 'STAAR', 'VARPE', 'CONA1', 'XENTR', 'GEOS8', 'ERWT9', 'CAEEL', 'DICT6', 'ARALY', 'YERPE', 'WOLPP', 'STAA3', 'SHEB5', 'DICPU', 'RICJY', 'BUCAT', 'CORP2', 'HELP3', 'NOSCE', 'BORAP', 'PHLGI', 'SELML', 'ECOKI', 'CORGL', 'DROAN', 'VIBNE', 'THEGP', 'BABMR', 'AEGTA', 'MACEU', 'RHOSK', 'PARD8', 'METTH', 'STEHR', 'SALT1', 'FERBD', 'PROMM', 'HELPF', 'DICDI', 'BIFAR', 'HALSA', 'COLP3', 'NANST', 'NATM8', 'NOCSJ', 'CAMJ8', 'COCSN', 'DESAT', 'STAAJ', 'ACIET', 'RHOS1', 'FRATF', 'CALEA', 'ECO44', 'DESDA', 'BLASP', 'BIFLM', 'CHLAA', 'ANAPZ', 'FLAPJ', 'LEPBD', 'PROM0', 'HUNCD', 'GEOOG', 'NOVAD', 'PYRSN', 'SODGM', 'AMOA5', 'BACMK', 'SAPGL', 'ARCFU', 'ATOPD', 'NOTFU', 'NEIMO', 'CLOS4', 'ECOUT', 'ACEMN', 'AURAN', 'MYCS2', 'RHORT', 'STAA4', 'RALS8', 'COREF', 'ACHLI', 'ESOLU', 'MURRD', 'LEPII', 'THAPS', 'HALJB', 'SYNPX', 'CHLTC', 'TREPZ', 'CANTE', 'ZYMMT', 'ANAPP', 'CRAGI', 'MARHV', 'PSEPF', 'SACEN', 'STRPI', 'CORJK', 'NEORI', 'MYCMG', 'ARTPN', 'METPP', 'CLODR', 'METV3', 'GOTA9', 'PHOLL', 'ROZAC', 'ASTEC', 'CHLTA', 'LACP3', 'CORDV', 'EXISA', 'RICM5', 'DESAG', 'METMM', 'AGABI', 'SHEB6', 'TALIS', 'AMICL', 'HETAN', 'AGGAD', 'PANSA', 'NAEGR', 'NATP1', 'METKA', 'ECO8A', 'CARSF', 'RUNSL', 'ISOPI', 'KYRT2', 'METHJ', 'PSEU5', 'FINM2', 'MYCGH']

In [46]:
len(bird_hog_tree)


6

In [59]:
bird_hog_tree_address="/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/v3a/hogmapX/out4/_tree_filt_7.txt"
#"/work/FAC/FBM/DBC/cdessim2/default/smajidi1/fastoma/v3a/A/f7_2kA/_tree__msa_concatanated_filtered_row_col_0.00101.txt"
bird_hog_tree= ete3.Tree(bird_hog_tree_address)


list_node=[]
for node in bird_hog_tree.traverse(strategy="postorder"):

    if node.is_leaf() : # why ?
        list_node.append(node.name)

len(list_node)

411

In [60]:
five_repetead=set(["TAEGU","FICAL","ANAPL","CHICK","SERCA","PARMJ"])
list_node_unq=set(list_node)-five_repetead
list_node_unq


{'ACACHL',
 'ACRARU',
 'AEGBEN',
 'AEGCAU',
 'AGAROS',
 'AGEPHO',
 'AILME',
 'ALACHE',
 'ALCTOR',
 'ALELAT',
 'ALERUF',
 'ALOBEC',
 'AMAGUI',
 'ANAPLA',
 'ANAZON',
 'ANHANH',
 'ANHRUF',
 'ANSCYG',
 'ANSSEM',
 'ANTCAR',
 'ANTMIN',
 'AOTNA',
 'APAVIT',
 'APHCOE',
 'APTAUS',
 'APTFOR',
 'APTHAA',
 'APTOWE',
 'APTROW',
 'AQUCHR',
 'ARAGUA',
 'ARDKOR',
 'AREINT',
 'ASASCU',
 'ATLROG',
 'ATRCLA',
 'BALREG',
 'BALREX',
 'BARMAR',
 'BOMGAR',
 'BOVIN',
 'BRAATR',
 'BRALEP',
 'BUCABY',
 'BUCCAP',
 'BUCRHI',
 'BUPERY',
 'BURBIS',
 'CAIMOS',
 'CALANN',
 'CALBOR',
 'CALJA',
 'CALNIC',
 'CALORN',
 'CALPUG',
 'CALSQU',
 'CALVIR',
 'CALWIL',
 'CAMPRO',
 'CANLF',
 'CAPHI',
 'CARCAR',
 'CARCRI',
 'CASCAS',
 'CATAUR',
 'CATFUS',
 'CAVPO',
 'CENBEN',
 'CENUNI',
 'CEPGRY',
 'CEPORN',
 'CERAT',
 'CERBRA',
 'CERCOR',
 'CERFAM',
 'CETCET',
 'CEUAER',
 'CEYCYA',
 'CHAALE',
 'CHAFRE',
 'CHAPAP',
 'CHAPEL',
 'CHATOR',
 'CHAVOC',
 'CHILA',
 'CHIMIN',
 'CHLAEN',
 'CHLCYA',
 'CHLHAR',
 'CHLMAC',
 'CHLSB',
 'CHLVIR'

In [68]:
# bird_hog_tree.prune(list_node_unq)
bird_hog_tree.write()

'(((RHIDAH:0.00916118,CHAPAP:0.00759941)1:0.00218398,(DICMEG:0.00764643,(MYIHEB:0.00930175,((LANLUD:0.0100277,(APHCOE:0.00418719,(CORMON:0.00242676,(CORCOR:0.00159595,CORBRA:0.00301169)1:0.000693276)1:0.00236036)1:0.00371554)1:0.000713619,(STRCIN:0.00728446,(PARRAG:0.00655149,IFRKOW:0.00728625)1:5.6528e-05)1:6.8882e-05)1:6.8104e-05)0:8.1981e-05)1:7.9969e-05)1:0.00134414,(EDOCOE:0.00927021,DAPCHR:0.012291)1:0.000149208,((MOHOCH:0.00887504,(PTILEU:0.0117964,(((CALWIL:0.0104743,NOTCIN:0.0163835)1:0.000624526,(CNELOR:0.0106946,(MELVER:0.0112781,((((ANTMIN:0.0143893,(POEATR:0.00417223,(PSEHUM:0.00339711,PARMAJ:0.00069855)1:0.000804694)1:0.00734026)1:0.00172581,((ALACHE:0.0166442,PANBIA:0.0165384)1:0.00104961,(NICCHL:0.0122264,(SYLVIR:0.0139867,((HIRRUS:0.0127127,(((BRAATR:0.00700473,PYCJOC:0.00676696)1:0.00539269,((SINWEB:0.00855782,(SYLATR:0.00541378,SYLBOR:0.00456145)1:0.00584524)1:0.000586819,((STEDEN:0.00419501,(ZOSHYP:0.0023303,ZOSLAT:0.00328568)1:0.00242008)1:0.00711097,(PORRUF:0.0113

In [69]:
ncbi_sub_tree_small.write()

'((((((BUCCAP:1,GALDEA:1)1:1,((((TAEGUT:1,LONSTR:1)1:1,(VIDCHA:1,VIDMAC:1)1:1)1:1,(((LOXLEU:1,LOXCUR:1)1:1,SERCAN:1)1:1,((CALORN:1,EMBFUC:1)1:1,(CARCAR:1,PASAMO:1,PHEMEL:1)1:1)1:1,UROPYL:1,CHLVIR:1,PEUTAE:1,HEMWIL:1)1:1,((PRUFUL:1,PRUHIM:1)1:1,PASDOM:1)1:1,(PROCAF:1,LEPASP:1)1:1,(AGEPHO:1,QUIMEX:1,MOLATE:1)1:1,(SETCOR:1,SETKIR:1)1:1,OREARF:1,MELVER:1,MOTALB:1,DICEXI:1,PLONIG:1)1:1,(((ZOSLAT:1,ZOSHYP:1)1:1,HYPCIN:1,STEDEN:1)1:1,(PORRUF:1,ILLCLE:1,PTEMEL:1,ERPZAN:1,POSRUF:1,MYSCRO:1,OXYMAD:1)1:1,(BRAATR:1,PYCJOC:1,NICCHL:1)1:1,((ACRARU:1,CETCET:1,HIPICT:1,SYLVIR:1,HYLPRA:1)1:1,(PANBIA:1,REGSAT:1)1:1,(SYLATR:1,SYLBOR:1)1:1,SINWEB:1)1:1,(PHYTRO:1,RHASIB:1)1:1,HIRRUS:1,ALACHE:1,LOCOCH:1,HORVUL:1,CISJUN:1,LEILUT:1,AEGCAU:1)1:1,(((CHLCYA:1,CHLHAR:1)1:1,IRECYA:1)1:1,((CORMON:1,CORCOR:1,CORBRA:1)1:1,CNELOR:1,APHCOE:1,ORIORI:1)1:1,(ALERUF:1,EULNIG:1,RHALEU:1,DAPCHR:1,FALFRO:1,PACPHI:1)1:1,(IFRKOW:1,PTILEU:1)1:1,(ERYMCC:1,DICMEG:1,MACNIG:1)1:1,LANLUD:1,MYIHEB:1,CALWIL:1,DRYGAM:1,EDOCOE:1,PARRAG:1

In [64]:
len(bird_hog_tree)

405

In [66]:
len(ncbi_sub_tree_small)

405

In [None]:
reroot_at_edge



In [71]:
bird_hog_tree.rerootatedge("XENLA")

# reroot_at_edge

AttributeError: 'TreeNode' object has no attribute 'rerootatedge'

In [67]:
out = bird_hog_tree.robinson_foulds(ncbi_sub_tree_small,unrooted_trees=True)  # , expand_polytomies = True
(rf, rf_max, common_attrs, names, edges_t1, edges_t2, discarded_edges_t1) =out

# rf, max_rf, common_leaves, parts_t1, parts_t2 = t1.robinson_foulds(t2)


print("RF distance is %s over a total of %s" %(rf, rf_max))


RF distance is 345 over a total of 569


In [91]:
#bird_hog_tree, ncbi_sub_tree_small
len(bird_hog_tree)

405

In [94]:
len(out[2]),len(out[3]),len(out[4]),len(out[5])

(405, 808, 573, 0)

In [88]:
out[0],out[1],list(out[2])[:3],list(out[3])[:3],list(out[4])[:3]

(345,
 569,
 ['CINMEX', 'SITEUR', 'PANPA'],
 [(('ACACHL',
    'ACRARU',
    'AEGBEN',
    'AEGCAU',
    'AGAROS',
    'AGEPHO',
    'AILME',
    'ALACHE',
    'ALCTOR',
    'ALELAT',
    'ALERUF',
    'ALOBEC',
    'AMAGUI',
    'ANAPLA',
    'ANAZON',
    'ANHANH',
    'ANHRUF',
    'ANSCYG',
    'ANSSEM',
    'ANTCAR',
    'ANTMIN',
    'AOTNA',
    'APAVIT',
    'APHCOE',
    'APTAUS',
    'APTFOR',
    'APTHAA',
    'APTOWE',
    'APTROW',
    'AQUCHR',
    'ARAGUA',
    'ARDKOR',
    'AREINT',
    'ASASCU',
    'ATLROG',
    'ATRCLA',
    'BALREG',
    'BALREX',
    'BARMAR',
    'BOMGAR',
    'BOVIN',
    'BRAATR',
    'BRALEP',
    'BUCABY',
    'BUCCAP',
    'BUCRHI',
    'BUPERY',
    'BURBIS',
    'CAIMOS',
    'CALANN',
    'CALBOR',
    'CALJA',
    'CALNIC',
    'CALORN',
    'CALPUG',
    'CALSQU',
    'CALVIR',
    'CALWIL',
    'CAMPRO',
    'CANLF',
    'CAPHI',
    'CARCAR',
    'CARCRI',
    'CASCAS',
    'CATAUR',
    'CATFUS',
    'CAVPO',
    'CENBEN',
    'CENUNI

In [96]:
ete3.v

<module 'ete3.version' from '/work/FAC/FBM/DBC/cdessim2/default/smajidi1/software/miniconda3/lib/python3.8/site-packages/ete3/version.py'>