In [1]:
# The following functions are used when we start from proteomes + hogmaps (OMAmer output).


"""
Structure of folders:
Put proteomes of species as fasta files in /omamer_search/proteome/
Run omamer and put the output of omamer in /omamer_search/hogmap/
oma_database_address= the address to the oma databases

hog and HOG are used interchangeably here. 
rHOG=rootHOG.  A subHOG itself is a HOG.

"""
 
def parse_oma_db(oma_database_address):
    """
    open an oma database (hdf5 format) with pyoma.browser.db and collect the
    UniProt species codes of its genomes.

    input: oma_database_address, passed as is to db.Database
    output: (oma_db, list of species codes), the codes are read from
            oma_db.tax.genomes
    """
    oma_db = db.Database(oma_database_address)
    time_stamp = datetime.now().strftime("%H:%M:%S")
    print(time_stamp, "- OMA data is parsed and its release name is:", oma_db.get_release_name())

    species_codes = []
    for genome in oma_db.tax.genomes.values():
        species_codes.append(genome.uniprot_species_code)

    time_stamp = datetime.now().strftime("%H:%M:%S")
    print(time_stamp,"- There are",len(species_codes),"species in the OMA database.")
    return (oma_db, species_codes)

def parse_proteome(list_oma_speices):
    """
    a function for parsing fasta files of proteins located in
    address_working_folder + "omamer_search/proteome/" using Bio.SeqIO.parse.
    Each fasta file is for one species. The file name without its ".fa" or
    ".fasta" extension is used as the species name.

    input: list_oma_speices, species names that already exist in the oma database
    output: query_species_names: list of species name,
            prots_record_allspecies: list (per species) of lists of Biopython records,
            in the same order as query_species_names
    raises: SystemExit when a query species name is already in list_oma_speices
    """
    # address_working_folder is expected to end with "/" (same convention as the hogmap parser)
    proteome_folder = address_working_folder + "omamer_search/proteome/"

    query_species_names = []
    prots_record_allspecies = []
    for file_name in listdir(proteome_folder):
        species_name, dot, extension = file_name.rpartition(".")
        if not dot or extension not in ("fa", "fasta"):
            continue
        # read the file under its real name, so ".fasta" files are not looked up as ".fa"
        prots_record = list(SeqIO.parse(proteome_folder + file_name, "fasta"))
        query_species_names.append(species_name)
        prots_record_allspecies.append(prots_record)

    query_species_num = len(query_species_names)
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time,"- The are",str(query_species_num),"species in the proteome folder.")

    # a query species must not be part of the oma database already
    oma_species = set(list_oma_speices)
    for species_name_i in query_species_names:
        if species_name_i in oma_species:
            current_time = datetime.now().strftime("%H:%M:%S")
            print(current_time,"- the species",species_name_i," already exists in the oma database, remove them first")
            # raise directly instead of relying on the interactive exit() helper
            raise SystemExit(1)

    # The proteins are parsed using Bio.SeqIO.parse, the record id is the first part
    # of the header line before the first space, e.g. for
    # >tr|A0A2I3FYY2|A0A2I3FYY2_NOMLE Uncharacterized protein OS=Nomascus leucogenys OX=61853 GN=CLPTM1L PE=3 SV=1
    # it will be "tr|A0A2I3FYY2|A0A2I3FYY2_NOMLE"
    return (query_species_names, prots_record_allspecies)


def parse_hogmap_omamer(query_species_names):
    """
    a function for parsing output of omamer (hogmap files) located in
    address_working_folder + "omamer_search/hogmap/".
    Each hogmap file corresponds to one fasta file of species, with the same name.
    Note that some records of fasta may be removed in hogmap, because of being so short.

    hogmap file example (tab separated):
    qseqid hogid overlap family-score subfamily-score qseqlen subfamily-medianseqlen
    A0A140TAT7_CIOIN HOG:B0833785.1c.8b 1 0.99 0.9 490 503

    Lines starting with "qs" are treated as the header and skipped, blank lines are skipped.
    All values are kept as strings, exactly as they are in the file.

    output as a tuple of list of list for all species:
    prots_hogmap_name_allspecies, prots_hogmap_hogid_allspecies,
    prots_hogmap_subfscore_allspecies, prots_hogmap_seqlen_allspecies,
    prots_hogmap_subfmedseqlen_allspecies

    The order of species is the same as query_species_names.
    """
    prots_hogmap_name_allspecies = []
    prots_hogmap_hogid_allspecies = []
    prots_hogmap_subfscore_allspecies = []
    prots_hogmap_seqlen_allspecies = []
    prots_hogmap_subfmedseqlen_allspecies = []
    for query_species_name in query_species_names:
        omamer_output_address = address_working_folder + "omamer_search/hogmap/"+ query_species_name + ".hogmap"
        prots_hogmap_name = []
        prots_hogmap_hogid = []
        prots_hogmap_subfscore = []
        prots_hogmap_seqlen = []
        prots_hogmap_subfmedseqlen = []
        # the with-block closes the file even when a line can not be parsed
        with open(omamer_output_address, 'r') as omamer_output_file:
            for line in omamer_output_file:
                line_strip = line.strip()
                if not line_strip or line_strip.startswith('qs'):
                    continue
                line_split = line_strip.split("\t")
                prots_hogmap_name.append(line_split[0])
                prots_hogmap_hogid.append(line_split[1])
                prots_hogmap_subfscore.append(line_split[4]) # subfamily
                prots_hogmap_seqlen.append(line_split[5])
                prots_hogmap_subfmedseqlen.append(line_split[6])
        prots_hogmap_name_allspecies.append(prots_hogmap_name)
        prots_hogmap_hogid_allspecies.append(prots_hogmap_hogid)
        prots_hogmap_subfscore_allspecies.append(prots_hogmap_subfscore)
        prots_hogmap_seqlen_allspecies.append(prots_hogmap_seqlen)
        prots_hogmap_subfmedseqlen_allspecies.append(prots_hogmap_subfmedseqlen)

    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time,"- There are ",len(prots_hogmap_name_allspecies)," species in the hogmap folder.")
    # the summary of the first species is only printed when there is something to show
    if prots_hogmap_name_allspecies:
        print(current_time,"- The first species",query_species_names[0]," contains ",len(prots_hogmap_hogid_allspecies[0])," proteins.")
        if prots_hogmap_name_allspecies[0]:
            print(current_time,"- The first protein of first species is ", prots_hogmap_name_allspecies[0][0])
    hogmap_allspecies = (prots_hogmap_name_allspecies, prots_hogmap_hogid_allspecies, prots_hogmap_subfscore_allspecies, prots_hogmap_seqlen_allspecies, prots_hogmap_subfmedseqlen_allspecies)
    return  hogmap_allspecies
    
    
    
def filter_prot_mapped(query_species_names, query_prot_records_species, query_prot_names_species_mapped):
    """
    a function for filtering biopython records in query_prot_records_species based on hogmaps
    The reason is that some very short records of fasta are removed in hogmap.
    So, we may lose track of order comparing hogmap and fasta file.
    The goal here is to remove those from seq record (of the fasta file).

    For a species whose number of records differs from the number of hogmap names,
    the records are re-collected in the order of the hogmap names (matched on record.id,
    the first record wins when an id is repeated). A hogmap name without a record is
    reported with logger_hog.error and skipped. For a species with equal numbers the
    original list of records is reused as it is.

    input: the three lists are indexed by species, in the order of query_species_names
    output: query_prot_records_species_filtered
    """
    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time,"- Filtering proteins started.")
    query_prot_records_species_filtered = []
    for species_idx, query_species_name in enumerate(query_species_names):
        # from fasta file
        query_prot_records_species_i = query_prot_records_species[species_idx]
        # from hogmap file
        query_prot_names_species_i = query_prot_names_species_mapped[species_idx]

        if len(query_prot_names_species_i) == len(query_prot_records_species_i):
            query_prot_records_species_filtered.append(query_prot_records_species_i)
            continue

        # one dict lookup per protein instead of scanning the list of ids each time
        record_by_id = {}
        for record in query_prot_records_species_i:
            record_by_id.setdefault(record.id, record)

        query_prot_records_filterd = []
        for query_prot_name in query_prot_names_species_i:
            if query_prot_name in record_by_id:
                query_prot_records_filterd.append(record_by_id[query_prot_name])
            else:
                current_time = datetime.now().strftime("%H:%M:%S")
                logger_hog.error(str(current_time)+"- Error 149 "+query_species_name+" "+ query_prot_name)

        current_time = datetime.now().strftime("%H:%M:%S")
        print(current_time,"- For the species", query_species_name, ", few proteins were ignored by omamer.")
        print(current_time,"- before filtering: in hogmap", len(query_prot_names_species_i), "in proteome", len(query_prot_records_species_i))
        print(current_time,"- After filtering:  in hogmap", len(query_prot_names_species_i), "in proteome", len(query_prot_records_filterd))
        query_prot_records_species_filtered.append(query_prot_records_filterd)

    current_time = datetime.now().strftime("%H:%M:%S")
    print(current_time,"- For the rest of species, all proteins were mapped using OMAmer.")
    return query_prot_records_species_filtered



 
def add_species_name(query_prot_records_species, query_species_names):
    """
    append "|species|<species name>" to the description of every protein record.

    The records are changed in place; species are matched by position, i.e. the
    records at index i belong to query_species_names[i].

    output: the same query_prot_records_species object that was given as input
    """
    for species_pos, species_label in enumerate(query_species_names):
        for prot_record in query_prot_records_species[species_pos]:
            prot_record.description += "|species|"+species_label
    return query_prot_records_species




def group_prots_rootHOGs(prots_hogmap_hogid_allspecies, address_rhogs_folder):
    """
    group the query proteins by the rootHOG they are mapped to and write every
    rootHOG with more than two proteins as a fasta file
    address_rhogs_folder + "HOG_<number>.fa".

    Besides its arguments, the function reads two module-level variables:
    query_species_names (number of species to go through) and
    query_prot_records_species_filtered (records, indexed [species][protein]
    in the same order as the hogmap).

    Proteins mapped to "na" (or to an id of length one or less) are ignored.

    output: rhogid_num_list (integer id of every kept rootHOG, also those that were
            not written to disk), rhogids_prot_records_query (records per kept rootHOG)
    """
    # rootHOG id is the part before the first dot: "B0833755.5c.10g.24e.16c.18b" -> "B0833755"
    root_ids_per_species = [
        [hog_id.split(".")[0] for hog_id in species_hog_ids]
        for species_hog_ids in prots_hogmap_hogid_allspecies
    ]

    # rootHOG id -> list of (species index, protein index), in order of first appearance
    positions_by_root = {}
    for sp_pos, _ in enumerate(query_species_names):
        for prot_pos, root_id in enumerate(root_ids_per_species[sp_pos]):
            positions_by_root.setdefault(root_id, []).append((sp_pos, prot_pos))

    # collect the records of every rootHOG, un-mapped proteins are left out
    kept_root_ids = []
    rhogids_prot_records_query = []
    for root_id, positions in positions_by_root.items():
        if root_id == "na" or len(root_id) <= 1:
            continue
        kept_root_ids.append(root_id)
        root_records = []
        for (sp_pos, prot_pos) in positions:
            root_records.append(query_prot_records_species_filtered[sp_pos][prot_pos])
        rhogids_prot_records_query.append(root_records)

    rhogid_num_list = []
    for root_id, root_records in zip(kept_root_ids, rhogids_prot_records_query):
        # "HOG:B0613860" -> "B0613860" -> 613860 (the first character after ":" is dropped)
        rhogid_num = int(root_id.split(":")[1][1:])
        rhogid_num_list.append(rhogid_num)
        if len(root_records) > 2:
            SeqIO.write(root_records, address_rhogs_folder+"HOG_"+str(rhogid_num)+".fa", "fasta")

    time_stamp = datetime.now().strftime("%H:%M:%S")
    print(time_stamp,"- Sequences of rootHOGs are writtend as fasta file in "+address_rhogs_folder)

    return (rhogid_num_list, rhogids_prot_records_query)








In [10]:
## the following are needed when we start from a rootHOG fasta file.


def read_species_tree(species_tree_address):
    """
    reading a species tree in Phyloxml format using ete3 package .

    When the file holds several phylogenies the last one is used.
    The name of every leaf is set to the code of its first taxonomy entry
    (phyloxml_clade.get_taxonomy()[0].get_code()).

    output (species_tree)
    raises: ValueError when the file contains no phylogeny
    """
    logger_hog.info(species_tree_address)
    project = Phyloxml()
    project.build_from_file(species_tree_address)

    # Each tree contains the same methods as a PhyloTree object
    phylogenies = list(project.get_phylogeny())
    if not phylogenies:
        raise ValueError("No phylogeny found in the species tree file: "+str(species_tree_address))
    species_tree = phylogenies[-1]

    for node_species_tree in species_tree.traverse(strategy = "postorder"):
        if node_species_tree.is_leaf():
            first_taxonomy = node_species_tree.phyloxml_clade.get_taxonomy()[0]
            node_species_tree.name = first_taxonomy.get_code()
    return (species_tree)
        

def prepare_species_tree(rhog_i, species_tree):
    """
    a function for extracting a subtree from the input species tree  a.k.a pruning,
    based on the names of species in the rootHOG.

    The species of a protein is the text after the last "_" of the last "|" field
    of its record name, e.g. 'tr|E3JPS4|E3JPS4_PUCGT' -> 'PUCGT'. 'RAT' is renamed to 'RATNO'.
    The input species_tree is pruned in place. Afterwards every node without a name
    gets one: an internal node is named by joining the names of its children with "_",
    a leaf is named "leaf_<counter>".

    output: species_tree (pruned), species_names_rhog, prot_names_rhog
            (the two lists are in the order of rhog_i)
    """
    species_names_rhog = []
    prot_names_rhog = []
    for rec in rhog_i:
        prot_name = rec.name  # 'tr|E3JPS4|E3JPS4_PUCGT
        species_name = prot_name.split("|")[-1].split("_")[-1]
        if species_name == 'RAT':
            species_name = "RATNO"
        species_names_rhog.append(species_name)
        prot_names_rhog.append(prot_name)

    species_names_uniqe = set(species_names_rhog)
    logger_hog.info("The number of unique species in the rHOG is "+str(len(species_names_uniqe))+".")
    species_tree.prune(species_names_uniqe, preserve_branch_length=True )

    # postorder: the children are named before their parent, so the joined name is complete
    unnamed_leaf_count = 0
    for node in species_tree.traverse(strategy = "postorder"):
        if len(node.name) >= 1:
            continue
        if node.is_leaf():
            unnamed_leaf_count += 1
            node.name = "leaf_"+str(unnamed_leaf_count)
        else:
            list_children_names = [node_child.name for node_child in node.children]
            node.name = '_'.join(list_children_names)

    return (species_tree, species_names_rhog, prot_names_rhog)


def merge_msa(list_msas):
    """
    merge several MSAs (multiple sequence alignments) into one by running the
    mafft wrapper with its '--merge' option switched on.
    Each element of list_msas should be a MultipleSeqAlignment object.

    output: merged (msa), the result returned by the mafft wrapper
    """
    logging.debug(list_msas)
    first_record_id = list_msas[0][0].id
    logging.debug(str(first_record_id)+"\n")

    mafft_runner = mafft.Mafft(list_msas, datatype="PROTEIN")
    mafft_runner.options['--merge'].active = True
    merged = mafft_runner()

    num_msas, num_rows, num_cols = len(list_msas), len(merged), len(merged[0])
    logger_hog.info(f"{num_msas} msas are merged into one with the length of {num_rows} {num_cols}")
    return merged
 
def infer_gene_tree(msa, gene_tree_file_addr):
    """
    infer a gene tree for the input msa using the fasttree wrapper
    and write it as a file.

    The returned string is str(result["tree"]) of the wrapper; the file gets the
    same string followed by ";" and a new line.
    A file address longer than 255 characters is cut to its first 250 characters
    plus ".nwk" (danger of overwriting, different long addresses may collide).

    output: gene tree in nwk format (string)
    """
    wrapper_tree = fasttree.Fasttree(msa, datatype="PROTEIN")
    # NOTE(review): this line only looks the option up, it does not change it.
    # Presumably the intent was to activate '-fastest' (compare the '--merge'
    # option of the mafft wrapper) - confirm against the zoo wrapper before changing.
    wrapper_tree.options.options['-fastest']
    wrapper_tree()

    tree_nwk = str(wrapper_tree.result["tree"])

    # for development we write the gene tree, the name of file should be limit in size in linux.
    if len(gene_tree_file_addr) > 255:
        gene_tree_file_addr = gene_tree_file_addr[:250]+".nwk"
    # the with-block closes the file also when writing fails
    with open(gene_tree_file_addr, "w") as file_gene_tree:
        file_gene_tree.write(tree_nwk)
        file_gene_tree.write(";\n")

    return tree_nwk
  
    
    
def lable_SD_internal_nodes(tree_out):
    """
    for the input gene tree, run the species overlap method
    and label internal nodes of the gene tree.

    The species of a leaf is the text after the last "_" of the last "|" field of
    its name. An internal node is named "D<counter>" (duplication) when the species
    sets of its children have a species in common, otherwise "S<counter>" (speciation).
    The two counters start at 1 and follow the postorder traversal.
    The tree is changed in place, names of leaves are not touched.

    output: labeled gene tree (the same object as the input)
    """
    species_name_dic = {}   # node -> set of species names found below this node
    counter_S = 0
    counter_D = 0

    for node in tree_out.traverse(strategy = "postorder"):
        if node.is_leaf():
            species_name_dic[node] = { str(node.name).split("|")[-1].split("_")[-1] }
            continue

        # postorder: the sets of the children are already available, so the species
        # below this node are their union (no need to collect all leaves again)
        node_children_species_list = [species_name_dic[node_child] for node_child in node.children]  # list of sets
        species_name_dic[node] = set().union(*node_children_species_list)

        if set.intersection(*node_children_species_list):
            counter_D += 1
            node.name = "D"+str(counter_D)
        else:
            counter_S += 1
            node.name = "S"+str(counter_S)
    return tree_out



    
def infer_HOG_rhog(rhogid_num, address_rhogs_folder, species_tree_address):
    """
    The prot sequences of a rootHOG are located in the fasta file address_rhogs_folder+"HOG_rhogid_num.fa,
    we want to infer all subHOGs of this rootHOG for different taxanomic levels.

    The species tree is read from species_tree_address, pruned to the species of the
    rootHOG and its internal nodes are visited bottom up (postorder); the HOGs of
    every internal node are inferred with infer_HOG_thisLevel.

    output: a list of the HOGs of the internal node visited last (in postorder this
            is the root of the pruned species tree). The list is empty when the
            pruned species tree has no internal node.
    """
    logger_hog.info("\n"+"="*50+"\n"+"Working on root hog: "+str(rhogid_num)+". \n")
    prot_address = address_rhogs_folder+"HOG_"+str(rhogid_num)+".fa"
    rhog_i = list(SeqIO.parse(prot_address, "fasta"))
    logger_hog.info("number of proteins in the rHOG is "+str(len(rhog_i))+".")

    species_tree = read_species_tree(species_tree_address)
    (species_tree, species_names_rhog, _prot_names_rhog) = prepare_species_tree(rhog_i, species_tree)

    dic_sub_hogs = {}   # name of internal node of species tree -> list of HOGs at that level
    HOG_thisLevel = []  # stays empty when there is no internal node to work on
    # finding hogs at each level of species tree (from leaves to root, bottom up)
    for node_species_tree in species_tree.traverse(strategy = "postorder"):
        if node_species_tree.is_leaf():
            # each leaf itself is a subhog
            continue
        dic_sub_hogs = infer_HOG_thisLevel(node_species_tree, rhog_i, species_names_rhog, dic_sub_hogs, rhogid_num)
        HOG_thisLevel = dic_sub_hogs[node_species_tree.name]
        logger_hog.info("subHOGs in thisLevel are "+' '.join(["["+str(i)+"]" for i in HOG_thisLevel])+" .")
    return (HOG_thisLevel)


       
def infer_HOG_thisLevel(node_species_tree, rhog_i, species_names_rhog, dic_sub_hogs, rhogid_num):
    """
    infer the HOGs at one internal node (taxonomic level) of the species tree.

    The subHOGs of the children of node_species_tree are collected first: for a leaf
    child every protein of that species becomes a single-protein HOG, for an internal
    child the HOGs stored in dic_sub_hogs under its name are used. With more than one
    subHOG their MSAs are merged, a gene tree is inferred (and written into
    address_working_folder + "/gene_trees_test/"), midpoint rooted and labeled with
    lable_SD_internal_nodes. Going down from the root of the gene tree, the subHOGs
    below a speciation ("S") node are merged into one new HOG; subHOGs that were not
    merged are kept as they are.

    input: node_species_tree: internal node of the species tree,
           rhog_i: list of protein records of the rootHOG,
           species_names_rhog: species name of each record of rhog_i (same order),
           dic_sub_hogs: dict of the levels inferred so far, rhogid_num: id of the rootHOG
    output: dic_sub_hogs, with the list of HOGs of this level stored under node_species_tree.name
            (an empty list when no protein is available for this level)
    raises: AssertionError when an internal child has no entry in dic_sub_hogs
    """
    gene_trees_folder = address_working_folder + "/gene_trees_test/"
    if not os.path.exists(gene_trees_folder) :
        os.mkdir(gene_trees_folder)

    if len(rhog_i) ==0:
        logger_hog.warning('There is no protein in the rHOG: '+str(rhogid_num))
        dic_sub_hogs[node_species_tree.name] = []
        return (dic_sub_hogs)

    elif len(rhog_i) ==1:
        logger_hog.warning('There is only one protein in the rHOG: '+str(rhogid_num))
        node_species_name = node_species_tree.children[0].name # there is only one species (for the one protein)
        prot= rhog_i[0]
        sub_hog_leaf = HOG(prot, node_species_name, rhogid_num)
        subHOGs_children = [sub_hog_leaf]
        HOG_this_level = subHOGs_children
        dic_sub_hogs[node_species_tree.name] = HOG_this_level
        return (dic_sub_hogs)

    subHOGs_children = []   # subHOGs of the lower level

    for node_child in node_species_tree.children:
        if  node_child.is_leaf():
            node_species_name = node_child.name
            #extracting those proteins of the rHOG that belongs to this species (child node of species tree)
            interest_list = [idx  for idx in range(len(species_names_rhog)) if species_names_rhog[idx] == node_species_name ]
            rhog_part = [rhog_i[i] for i in interest_list]

            for prot in rhog_part :
                sub_hog_leaf = HOG(prot, node_species_name, rhogid_num  )
                subHOGs_children.append(sub_hog_leaf)
        else:   # the child node is an internal node, subHOGs are inferred till now during traversing.
            if node_child.name in dic_sub_hogs:
                sub_hogs_child  = dic_sub_hogs[node_child.name]
                subHOGs_children += sub_hogs_child
            else:
                # one ready string: extra positional arguments would be taken as
                # %-format arguments by logging and the message would be lost
                error_message = "Error 131, no sub msa for the internal node "+str(node_child.name)+" "+str(node_child)+"\n"+str(dic_sub_hogs)
                logger_hog.error(error_message)
                # raised explicitly, so that it is not skipped when python runs with -O
                raise AssertionError(error_message)

    if len(subHOGs_children) ==0:
        logger_hog.error('Error 139, There is no protein in this subhog, for rhog'+str(rhogid_num))
        HOG_this_level = []  # nothing to store for this level

    elif len(subHOGs_children) ==1:
        HOG_this_level = subHOGs_children

    else:

        sub_msa_list_lowerLevel_ready = [hog._msa for hog in subHOGs_children]
        merged_msa = merge_msa(sub_msa_list_lowerLevel_ready)
        logger_hog.info("All subHOGs are merged, merged msa is with length of"+str(len(merged_msa))+" "+str(len(merged_msa[0]))+".")

        gene_tree_file_addr =  gene_trees_folder+ "/tree_"+str(rhogid_num)+"_"+str(node_species_tree.name)+".nwk"
        gene_tree_raw = infer_gene_tree(merged_msa, gene_tree_file_addr)
        gene_tree = Tree(gene_tree_raw+";", format=0)
        logger_hog.info("Gene tree is infered with length of "+str(len(gene_tree))+".")
        R = gene_tree.get_midpoint_outgroup()
        gene_tree.set_outgroup(R)  # midpoint rooting
        gene_tree = lable_SD_internal_nodes(gene_tree)
        print(str(gene_tree.write(format=1))[:-1]+str(gene_tree.name)+":0;")

        subHOGs_id_children_assigned = [] # ids of the subHOGs that are merged into a new HOG
        HOG_this_level = []
        subHOG_to_be_merged_set_other_Snodes = []   # per S node, the ids of the subHOGs merged there
        subHOG_to_be_merged_set_other_Snodes_flattned_temp  = []
        # a node marked as processed is treated as a leaf, so the traversal does not go deeper in that clade
        for node in gene_tree.traverse(strategy = "preorder", is_leaf_fn= lambda n :   hasattr(n, "processed") and   n.processed == True  ): # start from root
            if not node.is_leaf() :
                node_leaves_name = [ i.name for i in node.get_leaves() ]

                if node.name[0] =="S":  # this is a sub-hog.
                    subHOG_to_be_merged = [ ]
                    for node_leave_name in node_leaves_name:
                        for subHOG in subHOGs_children :
                            subHOG_members= subHOG._members
                            if node_leave_name in subHOG_members:  # could be improved
                                if subHOG._hogid  not in subHOG_to_be_merged_set_other_Snodes_flattned_temp:
                                    subHOG_to_be_merged.append(subHOG)
                                    subHOGs_id_children_assigned.append(subHOG._hogid)
                                else:
                                    print("issue 184",node.name,subHOG._hogid, node_leave_name)
                                    # check the attribute itself ("x in node" does not test for an attribute)
                                    if  hasattr(node, "processed"): print(node.name)
                                    else:  print("processed not in ", node.name)
                    if subHOG_to_be_merged :
                        subHOG_to_be_merged_set = set(subHOG_to_be_merged)
                        taxnomic_range= node_species_tree.name
                        HOG_this_node = HOG(subHOG_to_be_merged_set,taxnomic_range,rhogid_num , msa= merged_msa)
                        HOG_this_level.append(HOG_this_node)
                        subHOG_to_be_merged_set_other_Snodes.append([i._hogid for i in subHOG_to_be_merged_set])
                        subHOG_to_be_merged_set_other_Snodes_flattned_temp= [item for items in subHOG_to_be_merged_set_other_Snodes for item in items]
                    #  I don't need to traverse deeper in this clade
                    node.processed = True

            # stop early when the merged ids are exactly the ids of the children (same order)
            subHOG_to_be_merged_set_other_Snodes_flattned= [item for items in subHOG_to_be_merged_set_other_Snodes for item in items]
            if  [i._hogid for i in subHOGs_children] == subHOG_to_be_merged_set_other_Snodes_flattned:
                break
        for subHOG in subHOGs_children :        # for the single branch  ( D include a  subhog and a S node. )
            if  subHOG._hogid  not in subHOGs_id_children_assigned :
                HOG_this_level.append(subHOG)
        prot_list_sbuhog= [i._members for i in HOG_this_level]
        prot_list_sbuhog_short = []
        for prot_sub_list_sbuhog in prot_list_sbuhog:
            prot_list_sbuhog_short.append([ prot.split('|')[2] for prot in prot_sub_list_sbuhog])
        logger_hog.info("- "+str(len(prot_list_sbuhog_short))+"HOGs are inferred at the level "+node_species_tree.name+": "+ " ".join([str(i) for i in prot_list_sbuhog_short]))

    dic_sub_hogs[node_species_tree.name] = HOG_this_level
    return (dic_sub_hogs)



class HOG:
    """
    a hierarchical orthologous group (HOG).

    A HOG is either a single protein (leaf) or the merge of several lower level HOGs.
    Attributes starting with _ are meant to be local to the class:
    _hogid (string id), _rhogid_num (id of the rootHOG), _taxnomic_range,
    _members (set of protein ids), _msa (MultipleSeqAlignment), _subhogs (list of HOG).
    """
    # shared counter, every new instance increases it by one and uses it in its id
    _hogid_iter = 1000

    def __init__(self, input_instantiate, taxnomic_range, rhogid_num, msa = None):
        """
        input_instantiate could be either
            1) a protein as the biopython seq record (SeqRecord), or
            2) an iterable of instances of class HOG, together with a (non-empty) msa
               that holds the records of their members.
        Any other input is logged as "Error 169" and stops with an assertion.
        """
        cls = self.__class__
        cls._hogid_iter += 1
        self._rhogid_num = rhogid_num
        self._hogid = f"hog{self._rhogid_num!s}_{cls._hogid_iter!s}"
        self._taxnomic_range = taxnomic_range

        if isinstance(input_instantiate, SeqRecord):
            # leaf: only one sequence, no subHOG
            self._members = {input_instantiate.id}
            self._msa = MultipleSeqAlignment([input_instantiate])
            self._subhogs = []
            return

        if msa and all(isinstance(candidate, HOG) for candidate in input_instantiate):
            # merging few subHOGs into a new HOG
            collected_members = set()
            for lower_hog in input_instantiate:
                collected_members.update(lower_hog.get_members())
            self._members = collected_members
            self._subhogs = list(input_instantiate)

            # keep the records of the members; at most seq_limit of them (sampling without replacement)
            seq_limit = 30
            member_records = []
            for record in msa:
                if record.id in self._members:
                    member_records.append(record)
            if len(member_records) > seq_limit:
                chosen_records = sample(member_records, seq_limit)
                logger_hog.info("we are doing subsamping now from "+str(len(member_records))+" to "+str(seq_limit)+"seqs.")
            else:
                chosen_records = member_records
            self._msa = MultipleSeqAlignment(chosen_records)
            return

        logger_hog.error("Error 169,  check the input format to instantiate a HOG class")
        assert False

    def __repr__(self):
        pieces = [
            "an object of class HOG of hogID=", self._hogid,
            ", length=", str(len(self._members)),
            ", taxonomy= ", str(self._taxnomic_range),
        ]
        return "".join(pieces)

    def get_members(self):
        """return a new set with the ids of the member proteins."""
        return {*self._members}

    def to_orthoxml(self, indent=0):
        """
        build the orthoxml element of this HOG.

        A HOG without subHOGs gives a 'geneRef' element, its id is looked up in the
        module-level gene_id_name with the first member. Otherwise an 'orthologGroup'
        element with a 'TaxRange' property is given; subHOGs are sorted (in place) by
        their taxonomic range, several subHOGs of the same range are put together in
        a 'paralogGroup', a single one is appended directly.
        """
        if len(self._subhogs) == 0:
            first_member = list(self._members)[0]
            return ET.Element('geneRef', attrib={'id': str(gene_id_name[first_member])})

        group_element = ET.Element('orthologGroup', attrib={"id":str(self._hogid)})
        ET.SubElement(group_element, "property", attrib={"name":"TaxRange", "value":str(self._taxnomic_range)})

        by_tax_range = lambda one_hog: one_hog._taxnomic_range
        self._subhogs.sort(key=by_tax_range)
        for _clade, clade_members in itertools.groupby(self._subhogs, key=by_tax_range):
            same_clade_hogs = list(clade_members)
            if len(same_clade_hogs) <= 1:
                group_element.append(same_clade_hogs[0].to_orthoxml(indent+2))
                continue
            paralog_element = ET.Element('paralogGroup')
            for one_hog in same_clade_hogs:
                paralog_element.append(one_hog.to_orthoxml(indent+2))
            group_element.append(paralog_element)
        return group_element

 

In [20]:
# rHog_is_ready= False

program has started. The oma database address is in  ./test_fastgethog/omamer_database/oma_path/OmaServer.h5
14:25:24 - OMA data is parsed and its release name is: OmaStandalone; 1.0.x
14:25:24 - There are 5 species in the OMA database.
14:25:24 - The are 3 species in the proteome folder.
14:25:24 - There are  3  species in the hogmap folder.
14:25:24 - The first species UP000000798_224324  contains  1553  proteins.
14:25:24 - The first protein of first species is  sp|D0VWU4|SECE_AQUAE
14:25:24 - Filtering proteins started.
14:25:24 - For the rest of species, all proteins were mapped using OMAmer.
3 1553
14:25:24 - Sequences of rootHOGs are writtend as fasta file in ./test_fastgethog/rhogs_out/


In [11]:
import logging
from datetime import datetime
import os
from os import listdir

from ete3 import Phyloxml
from ete3 import Tree

import zoo.wrappers.aligners.mafft as mafft
import zoo.wrappers.treebuilders.fasttree as fasttree

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment




if __name__ == "__main__":

    # Driver cell.  The names assigned here live at module level and are read
    # as globals by functions defined elsewhere in this file (for example,
    # parse_proteome reads address_working_folder and parse_oma_db reads db),
    # so they must keep exactly these names.
    logging.basicConfig()
    logger_hog = logging.getLogger("hog")
    logger_hog.setLevel(logging.INFO)  # use logging.WARN for a quieter run

    # Folder addresses are joined to file names by plain string concatenation
    # below, so every address has to end with "/".
    address_working_folder = "./"
    address_rhogs_folder = "./rhogs/"
    species_tree_address = address_working_folder + "lineage_tree_qfo.phyloxml"

    # True : infer HOGs for a rootHOG taken from address_rhogs_folder.
    # False: run the proteome + hogmap steps that end in group_prots_rootHOGs.
    rHog_is_ready = True

    if not rHog_is_ready:

        # pyoma is imported only on this path, the only one that opens the
        # OMA database.
        import pyoma.browser.db as db

        oma_database_address = address_working_folder + "omamer_database/oma_path/OmaServer.h5"
        print("program has started. The oma database address is in ", oma_database_address)
        oma_db, list_oma_speices = parse_oma_db(oma_database_address)

        query_species_names, query_prot_records_species = parse_proteome(list_oma_speices)
        query_prot_records_species = add_species_name(query_prot_records_species, query_species_names)

        # The hogmap parser returns one 5-tuple; it is kept under its own name
        # and then split into its individual components.
        hogmap_allspecies_elements = parse_hogmap_omamer(query_species_names)
        (
            query_prot_names_species_mapped,
            prots_hogmap_hogid_allspecies,
            prots_hogmap_subfscore_allspecies,
            prots_hogmap_seqlen_allspecies,
            prots_hogmap_subfmedseqlen_allspecies,
        ) = hogmap_allspecies_elements

        query_prot_records_species_filtered = filter_prot_mapped(
            query_species_names,
            query_prot_records_species,
            query_prot_names_species_mapped,
        )
        # Quick sanity print: length of the filtered list and of its first entry.
        print(len(query_prot_records_species_filtered), len(query_prot_records_species_filtered[0]))

        rhogid_num_list, rhogids_prot_records_query = group_prots_rootHOGs(
            prots_hogmap_hogid_allspecies, address_rhogs_folder
        )

    else:

        # Disabled alternative: build the list of all rootHOG ids from the
        # ".fa" files stored in the rHOG folder.
        # rhog_files = listdir(address_rhogs_folder)
        # rhogid_num_list = []
        # for rhog_file in rhog_files:
        #     if rhog_file.split(".")[-1] == "fa":
        #         rhogid_num = int(rhog_file.split(".")[0].split("_")[1])
        #         rhogid_num_list.append(rhogid_num)

        # A single hard-coded rootHOG id is processed for now.
        rhogid_num = 556836
        HOG_thisLevel = infer_HOG_rhog(rhogid_num, address_rhogs_folder, species_tree_address)




INFO:hog:
Working on root hog: 556836. 

INFO:hog:number of proteins in the rHOG is 9.
INFO:hog:./test_fastgethog/lineage_tree_qfo.phyloxml
INFO:hog:The number of unique species in the rHOG is 8.
INFO:hog:2 msas are merged into one with the length of 2 204
INFO:hog:All subHOGs are merged, merged msa is with length of2 204.
INFO:hog:Gene tree is infered with length of 2.
INFO:hog:- 1HOGs are inferred at the level MOUSE_RATNO: ['NTAL_MOUSE', 'NTAL_RAT']
INFO:hog:subHOGs in thisLevel are [object of class HOG of hogID=hog556836_1003, length=2, taxonomy= MOUSE_RATNO] .
INFO:hog:3 msas are merged into one with the length of 3 243
INFO:hog:All subHOGs are merged, merged msa is with length of3 243.
INFO:hog:Gene tree is infered with length of 3.
INFO:hog:- 1HOGs are inferred at the level GORGO_HUMAN_PANTR: ['G3SKS4_GORGO', 'H2QUR1_PANTR', 'NTAL_HUMAN']
INFO:hog:subHOGs in thisLevel are [object of class HOG of hogID=hog556836_1007, length=3, taxonomy= GORGO_HUMAN_PANTR] .


(sp|Q9JHL0|NTAL_MOUSE:0.0908936,sp|Q8CGL2|NTAL_RAT:0.0908936)S1:0;
(tr|H2QUR1|H2QUR1_PANTR:0.00843292,(tr|G3SKS4|G3SKS4_GORGO:0.00833703,sp|Q9GZY6|NTAL_HUMAN:5e-09)S1:0.00843292)S2:0;


INFO:hog:2 msas are merged into one with the length of 5 250
INFO:hog:All subHOGs are merged, merged msa is with length of5 250.
INFO:hog:Gene tree is infered with length of 5.
INFO:hog:- 1HOGs are inferred at the level MOUSE_RATNO_GORGO_HUMAN_PANTR: ['NTAL_RAT', 'NTAL_MOUSE', 'G3SKS4_GORGO', 'H2QUR1_PANTR', 'NTAL_HUMAN']
INFO:hog:subHOGs in thisLevel are [object of class HOG of hogID=hog556836_1008, length=5, taxonomy= MOUSE_RATNO_GORGO_HUMAN_PANTR] .
INFO:hog:2 msas are merged into one with the length of 2 383
INFO:hog:All subHOGs are merged, merged msa is with length of2 383.
INFO:hog:Gene tree is infered with length of 2.
INFO:hog:- 1HOGs are inferred at the level CANLF_BOVIN: ['G3MYV0_BOVIN', 'A0A5F4CAL9_CANLF']
INFO:hog:subHOGs in thisLevel are [object of class HOG of hogID=hog556836_1011, length=2, taxonomy= CANLF_BOVIN] .


((sp|Q9JHL0|NTAL_MOUSE:0.0945749,sp|Q8CGL2|NTAL_RAT:0.0850889)S1:0.204182,(tr|H2QUR1|H2QUR1_PANTR:0.0167884,(tr|G3SKS4|G3SKS4_GORGO:0.00826907,sp|Q9GZY6|NTAL_HUMAN:5e-09)S2:5e-09)S3:0.204182)S4:0;
(tr|A0A5F4CAL9|A0A5F4CAL9_CANLF:0.219168,tr|G3MYV0|G3MYV0_BOVIN:0.219168)S1:0;


INFO:hog:2 msas are merged into one with the length of 7 422
INFO:hog:All subHOGs are merged, merged msa is with length of7 422.
INFO:hog:Gene tree is infered with length of 7.
INFO:hog:- 1HOGs are inferred at the level MOUSE_RATNO_GORGO_HUMAN_PANTR_CANLF_BOVIN: ['G3MYV0_BOVIN', 'NTAL_RAT', 'NTAL_MOUSE', 'G3SKS4_GORGO', 'H2QUR1_PANTR', 'A0A5F4CAL9_CANLF', 'NTAL_HUMAN']
INFO:hog:subHOGs in thisLevel are [object of class HOG of hogID=hog556836_1012, length=7, taxonomy= MOUSE_RATNO_GORGO_HUMAN_PANTR_CANLF_BOVIN] .
INFO:hog:3 msas are merged into one with the length of 9 429
INFO:hog:All subHOGs are merged, merged msa is with length of9 429.


((sp|Q9JHL0|NTAL_MOUSE:0.0648269,sp|Q8CGL2|NTAL_RAT:0.109031)S1:0.130208,((tr|A0A5F4CAL9|A0A5F4CAL9_CANLF:0.316505,tr|G3MYV0|G3MYV0_BOVIN:0.110056)S2:0.0501216,(tr|H2QUR1|H2QUR1_PANTR:0.0160516,(tr|G3SKS4|G3SKS4_GORGO:0.00791884,sp|Q9GZY6|NTAL_HUMAN:5e-09)S3:5e-09)S4:0.172186)S5:0.130208)S6:0;


INFO:hog:Gene tree is infered with length of 9.
INFO:hog:- 1HOGs are inferred at the level CHICK_MOUSE_RATNO_GORGO_HUMAN_PANTR_CANLF_BOVIN: ['A0A1D5P223_CHICK', 'NTAL_RAT', 'A0A5F4CAL9_CANLF', 'NTAL_HUMAN', 'G3MYV0_BOVIN', 'NTAL_MOUSE', 'G3SKS4_GORGO', 'NTAL_CHICK', 'H2QUR1_PANTR']
INFO:hog:subHOGs in thisLevel are [object of class HOG of hogID=hog556836_1015, length=9, taxonomy= CHICK_MOUSE_RATNO_GORGO_HUMAN_PANTR_CANLF_BOVIN] .


((tr|A0A1D5P223|A0A1D5P223_CHICK:0.00462657,sp|Q5S7W5|NTAL_CHICK:5e-09)D1:0.492744,(((tr|A0A5F4CAL9|A0A5F4CAL9_CANLF:0.308172,tr|G3MYV0|G3MYV0_BOVIN:0.101099)S1:0.0475405,(sp|Q9JHL0|NTAL_MOUSE:0.0629393,sp|Q8CGL2|NTAL_RAT:0.105049)S2:0.248291)S3:0.00311226,(tr|H2QUR1|H2QUR1_PANTR:0.0157155,(tr|G3SKS4|G3SKS4_GORGO:0.00775932,sp|Q9GZY6|NTAL_HUMAN:5e-09)S4:5e-09)S5:0.167083)S6:0.492744)S7:0;


In [None]:


#     rhogid_num_list_temp = rhogid_num_list[:10] # ["476045"] #["836500"]# 
    
#     all_prot_temp_list= []
#     for rhogid_num in rhogid_num_list_temp:
#         prot_address = address_out_hog+"HOG_"+str(rhogid_num)+".fa"
#         rhog_i = list(SeqIO.parse(prot_address, "fasta")) 
#         for prot_i in rhog_i:
#             all_prot_temp_list.append(prot_i.id)
#     print("there are proteins ",len(all_prot_temp_list))   
#     ortho_file  = ET.Element("orthoXML", attrib={"xmlns":"http://orthoXML.org/2011/", "origin":"OMA", "originVersion":"Nov 2021", "version":"0.3"} ) # 
#     gene_counter =1000
#     gene_id_name = {}
    
#     for species_i in range(len(query_species_names)):
#         no_gene_species = True  # for code development
#         species_name = query_species_names[species_i]
#         species = ET.SubElement(ortho_file, "species", attrib={"name":species_name, "NCBITaxId":"1"})
#         database = ET.SubElement(species, "database", attrib={"name":"QFO database ", "version":"2020"})
#         genes = ET.SubElement(database, "genes")
#         query_prot_records =  query_prot_records_species[species_i]
#         for gene_i in range(len(query_prot_records)):                # [12:15]
#             query_prot_record= query_prot_records[gene_i]
#             gene_id_name[query_prot_record.id]= gene_counter

#             protid_short = query_prot_record.id.split("|")[1].strip() # tr|E3JPS4|E3JPS4_PUCGT
            
#             # make the file big
#             if query_prot_record.id in all_prot_temp_list:
                
#                 gene = ET.SubElement(genes, "gene", attrib={"id":str(gene_counter), "protId":protid_short})               
#                 no_gene_species = False  # for code development
#             gene_counter += 1
#         if no_gene_species :   # for code development
#             gene = ET.SubElement(genes, "gene", attrib={"id":str(gene_counter), "protId":protid_short})               
#     groups_xml = ET.SubElement(ortho_file, "groups")
#     dic_rhog_resolved={}
#     list_rhog_done = []
#     logger_hog.info("parrallel is started")
    
#     number_max_workers = 4
#     with concurrent.futures.ProcessPoolExecutor(max_workers = number_max_workers) as executor: 
#         for rhogid_num, output_values in zip(rhogid_num_list_temp, executor.map(infer_HOG_rhog, rhogid_num_list_temp)):
#             (HOG_thisLevel) = output_values
#             dic_rhog_resolved[rhogid_num]=1
#             list_rhog_done.append(rhogid_num)
#             for hog_i in HOG_thisLevel:
#                 if len(hog_i._members)>1:
#                     # could be improved 
#                     HOG_thisLevel_xml = hog_i.to_orthoxml()
#                     groups_xml.append(HOG_thisLevel_xml)
#                     #print(hog_i._members)
#     xml_str = minidom.parseString(ET.tostring(ortho_file)).toprettyxml(indent="   ")
#     output_xml_name= address_working_folder + "/xml_100_short.xml"
#     with open(output_xml_name, "w") as file_out:
#         file_out.write(xml_str)
#     logger_hog.info("output orthoxml is written in the file"+str(output_xml_name))
#     logger_hog.info("all done !!")

In [None]:
#     rhogid_num_list_temp = rhogid_num_list[:10*1000] 
#     all_prot_temp_list= []
#     for rhogid_num in rhogid_num_list_temp:
#         prot_address = address_out_hog+"HOG_"+str(rhogid_num)+".fa"
#         rhog_i = list(SeqIO.parse(prot_address, "fasta")) 
#         for prot_i in rhog_i:
#             all_prot_temp_list.append(prot_i.id)
#     print("there are proteins ",len(all_prot_temp_list))   
#     ortho_file  = ET.Element("orthoXML", attrib={"xmlns":"http://orthoXML.org/2011/", "origin":"OMA", "originVersion":"Nov 2021", "version":"0.3"} ) # 
#     gene_counter =1000
#     gene_id_name = {}  
#     for species_i in range(len(query_species_names)):
#         no_gene_species = True  # for code development
#         species_name = query_species_names[species_i]
#         species = ET.SubElement(ortho_file, "species", attrib={"name":species_name, "NCBITaxId":"1"})
#         database = ET.SubElement(species, "database", attrib={"name":"QFO database ", "version":"2020"})
#         genes = ET.SubElement(database, "genes")
#         query_prot_records =  query_prot_records_species[species_i]
#         for gene_i in range(len(query_prot_records)):                # [12:15]
#             query_prot_record= query_prot_records[gene_i]
#             gene_id_name[query_prot_record.id]= gene_counter

#             protid_short = query_prot_record.id.split("|")[1].strip() # tr|E3JPS4|E3JPS4_PUCGT
#             # make the file big
#             if query_prot_record.id in all_prot_temp_list:
#                 gene = ET.SubElement(genes, "gene", attrib={"id":str(gene_counter), "protId":protid_short})               
#                 no_gene_species = False  # for code development
#             gene_counter += 1
#         if no_gene_species :   # for code development
#             gene = ET.SubElement(genes, "gene", attrib={"id":str(gene_counter), "protId":protid_short})                   
#     groups_xml = ET.SubElement(ortho_file, "groups")
#     dic_rhog_resolved={}
#     list_rhog_done = []
#     logger_hog.info("parrallel is started")
    
#     xml_str = minidom.parseString(ET.tostring(ortho_file)).toprettyxml(indent="   ")
#     output_xml_name= address_working_folder + "/xml_10k_gene_name.xml"
#     with open(output_xml_name, "w") as file_out:
#         file_out.write(xml_str)
#     logger_hog.info("output orthoxml is written in the file"+str(output_xml_name))
#     logger_hog.info("all done !!")

In [None]:

# import time
# from sys import argv
# #from scipy.cluster.hierarchy import dendrogram, linkage, ward, leaves_list, fcluster
# import ete3
# import concurrent.futures
# #import ast #import pickle #import zoo #zoo.__file__ 
# from Bio.Seq import Seq 
# from collections import defaultdict
# import matplotlib      #matplotlib.use('Agg')
# import matplotlib.pyplot as plt
# from random import sample
# import itertools
# from pyoma.browser.models import ProteinEntry
# from pyoma.browser.hoghelper import build_hog_to_og_map
# import xml.etree.ElementTree as ET
# from xml.etree import ElementTree
# from xml.dom import minidom

# def run_one_msa(seqRecords_queries):
#     #current_time = datetime.now().strftime("%H:%M:%S")
#     #print(current_time, "- working on new OG with length of ",len(seqRecords_OG_queries))    
#     wrapper_mafft = mafft.Mafft(seqRecords_queries,datatype="PROTEIN") 
#     # MAFFT error: Alphabet 'U' is unknown. -> the --anysymbol argument is needed; it has to be defined in the source code
#     # workaround sed "s/U/X/g"
#     wrapper_mafft.options.options['--retree'].set_value(1)
#     run_mafft = wrapper_mafft() # it's wrapper  storing the result  and time 
#     time_taken_mafft = wrapper_mafft.elapsed_time

#     result_mafft = wrapper_mafft.result 
#     time_taken_mafft2 = wrapper_mafft.elapsed_time
#     current_time = datetime.now().strftime("%H:%M:%S")
#     #print(current_time,"- time elapsed for MSA: ",time_taken_mafft2)
#     #print(current_time,"- MSA for an OG is just finished: ",time_taken_mafft2)
#     return(result_mafft)
