In [2]:
import pandas as pd
from ete3 import NCBITaxa
from ete3 import Tree



In [8]:
##Load df with all species in dataset
df_all=pd.read_csv("all_species_annotated.tsv",sep='\t', index_col=0)


In [17]:
## helper function to extract taxonomic information with ete3
## written by RK with ChatGPT
from ete3 import NCBITaxa

# You only need to instantiate NCBITaxa once per session.
# If you have never run it before (or want an update), call:
#   NCBITaxa().update_taxonomy_database()
ncbi = NCBITaxa()

def get_ranks(binomial: str, ranks=("class", "order", "family","phylum","subphylum","subfamily","superfamily","suborder",'infraorder','subclass','subphylum','subfamily')) -> dict:
    """
    Return the specified taxonomic ranks for a binomial name.

    Parameters
    ----------
    binomial : str
        Two-part scientific name, e.g. 'Homo sapiens'.
    ranks : iterable of str, optional
        Taxonomic ranks to retrieve (default: class, order, family).

    Returns
    -------
    dict {rank: name or None}
        Dictionary containing the requested ranks. If the name is not
        found, or a rank is absent in the lineage, its value is None.
    """
    # Translate the name → taxid
    name2taxid = ncbi.get_name_translator([binomial])
    if not name2taxid:
        return {rank: None for rank in ranks}

    taxid = name2taxid[binomial][0]

    # Full lineage & rank map
    lineage = ncbi.get_lineage(taxid)
    rank_map = ncbi.get_rank(lineage)
    name_map = ncbi.get_taxid_translator(lineage)

    # Extract desired ranks
    result = {}
    for rank in ranks:
        # Find the first taxid in the lineage whose rank matches
        taxids = [tid for tid, r in rank_map.items() if r == rank]
        result[rank] = name_map[taxids[0]] if taxids else None
    return result



In [18]:
for index, row in df_all.iterrows():
    rank=get_ranks(index)
    if str(rank['order'])!='None':
        for x in rank:
            df_all.loc[index,x]=rank[x]
    df_all.loc[index,'subfamily']=rank['subfamily']


In [23]:
for index, row in df_fil.iterrows():
    rank=get_ranks(index)
    if str(rank['order'])!='None':
        for x in rank:
            df_fil.loc[index,x]=rank[x]


In [78]:

def annotate_tree(t):
"""
takes an ete3 tree object
annotates each leaf with taxonomic features
"""
    for leaf in t:
        taxdict=get_ranks(leaf.name.split("_")[0])
        leaf.add_features(phylum=taxdict['phylum'])
        if taxdict['order']=='Psocoptera' or taxdict['order']=='Phthiraptera':
            leaf.add_features(order='Psocodea')
        else:
            leaf.add_features(order=taxdict['order'])
        leaf.add_features(classt=taxdict['class'])
        leaf.add_features(family=taxdict['family'])
        leaf.add_features(subfamily=taxdict['subfamily'])
        leaf.add_features(subphylum=taxdict['subphylum'])
        leaf.add_features(superfamily=taxdict['superfamily'])
        leaf.add_features(suborder=taxdict['suborder'])
        leaf.add_features(infraorder=taxdict['infraorder'])
    return t


In [12]:
## written by RK with chatgpt
def collapse_monophyletic_rank(tree, attr="order"):
    """
    Collapse every monophyletic clade whose leaves all share the same
    value for `attr` (e.g. 'order') and label the remaining leaf with
    that value.

    Parameters
    ----------
    tree : ete3.Tree  – the tree to modify in place
    attr : str        – the node feature to check (default: 'order')

    Returns
    -------
    ete3.Tree
        The same tree object, after collapsing.
    """
    for node in tree.traverse("postorder"):
        # Skip leaves – nothing to collapse.
        if node.is_leaf():
            continue

        # Collect the attribute values found in this subtree.
        orders_here = {getattr(leaf, attr) for leaf in node.iter_leaves()}

        # If exactly one unique value is present, clade is monophyletic for that value.
        if len(orders_here) == 1:
            order_name = orders_here.pop()

            # ↓ Remove all current descendants to make `node` a leaf.
            for child in list(node.children):
                child.detach()          # drops child subtrees

            # Label the new leaf.
            node.name = order_name
            node.add_features(**{attr: order_name})
    for node in tree:
        name=getattr(node, attr)
        node.name=name

    return tree


## Insecta order-level tree
Misof 2014

In [106]:
t=Tree('constraint_trees/Misof_2014.nwk')
t=annotate_tree(t)
keep=[x for x in t if x.classt=='Insecta']
t.prune(keep)

In [108]:
t=collapse_monophyletic_orders(t)

In [115]:
a=t&'Zorotypus_caudelli'
a.name='Zoraptera'
a=t&'Tanzaniophasma_sp._AD-2013'
a.name='Mantophasmatodea'


In [120]:
keep=list(set(df_all[df_all['class']=='Insecta'].order))
t.prune(keep)

In [11]:
print(t)


                              /-Lepidoptera
                           /-|
                          |   \-Trichoptera
                        /-|
                       |  |   /-Diptera
                       |   \-|
                       |      \-Antliophora
                     /-|
                    |  |      /-Coleoptera
                    |  |   /-|
                    |  |  |   \-Strepsiptera
                  /-|   \-|
                 |  |     |   /-Raphidioptera
                 |  |      \-|
               /-|  |         \-Megaloptera
              |  |  |
              |  |   \-Hymenoptera
            /-|  |
           |  |   \-Psocodea
           |  |
           |  |   /-Hemiptera
         /-|   \-|
        |  |      \-Thysanoptera
        |  |
        |  |   /-Dermaptera
        |  |  |
        |   \-|   /-Orthoptera
      /-|     |  |
     |  |      \-|      /-Blattodea
     |  |        |   /-|
     |  |         \-|   \-Mantodea
   /-|  |           |
  |  |  |       

In [121]:
t.write(format=9,outfile="constraint_trees/Insecta_Misof_order_constraints.nwk")

## Coleoptera 
Zhang 2018

In [13]:
t=Tree('constraint_trees/Coleoptera_Zhang_2018.nwk')

In [14]:
t=annotate_tree(t)

In [15]:
keep=[x for x in t if x.order=='Coleoptera' and x.family in (set(df_all[df_all.order=='Coleoptera'].family))]

In [16]:
t.prune(keep)

In [71]:
t=collapse_monophyletic_rank(t,'family')
print(t)


         /-Carabidae
      /-|
   /-|   \-Dytiscidae
  |  |
  |   \-Gyrinidae
--|
  |   /-Lampyridae
  |  |
   \-|      /-Hydrophilidae
     |   /-|
     |  |   \-Passalidae
      \-|
        |   /-Dermestidae
        |  |
         \-|      /-Chrysomelidae
           |   /-|
           |  |   \-Curculionidae
            \-|
              |   /-Meloidae
               \-|
                  \-Tenebrionidae


In [18]:
t.write(outfile='constraint_trees/Coleoptera_Zhang_2018_family_constraints.nwk', format=9)

In [19]:
t=Tree('constraint_trees/Coleoptera_Zhang_2018_family_constraints.nwk')

## Orthoptera
Chang 2020

In [101]:
t=Tree('constraint_trees/Orthoptera_Chang_2020.nwk')

In [102]:
t=annotate_tree(t)

In [103]:
keep=[x for x in t if x.order=='Orthoptera' and x.family in set(df_all[df_all.order=='Orthoptera'].family)]

In [104]:
t.prune(keep)

In [105]:
t=collapse_monophyletic_rank(t,'family')
print(t)


      /-Gryllidae
   /-|
--|   \-Tettigoniidae
  |
   \-Acrididae


In [106]:
t.write(outfile='constraint_trees/Orthoptera_Chang_2020_family_constraints.nwk', format=9)

## Diptera
Wiegmann 2020

In [257]:
t=Tree('constraint_trees/Diptera_Wiegmann_2011.nwk')

In [147]:
t=annotate_tree(t)
keep=[x for x in t if x.family in set(df_all[df_all.order=='Diptera'].family)]
t.prune(keep)

In [155]:
t=collapse_monophyletic_rank(t,'family')
print(t)


            /-Chironomidae
         /-|
      /-|   \-Simuliidae
     |  |
   /-|   \-Culicidae
  |  |
  |   \-Psychodidae
  |
--|      /-Sciaridae
  |   /-|
  |  |   \-Cecidomyiidae
  |  |
   \-|   /-Phoridae
     |  |
     |  |      /-Drosophilidae
      \-|   /-|
        |  |  |   /-Hippoboscidae
        |  |   \-|
         \-|     |   /-Muscidae
           |      \-|
           |         \-Calliphoridae
           |
            \-Tephritidae


In [293]:
t.write(outfile='constraint_trees/Diptera_Wiegmann_2011_family_constraints.nwk', format=9)

## Lepidoptera
Kawahara 2019

In [288]:
t=Tree('constraint_trees/Lepidoptera_Kawahara_2019.nwk')



In [289]:
t=annotate_tree(t)

In [290]:
keep=[x for x in t if x.family in set(df_all[df_all.order=='Lepidoptera'].family)]
t.prune(keep)

In [291]:
set(df_all[df_all.order=='Lepidoptera'].family)-set([x.family for x in t])

set()

In [292]:
t=collapse_monophyletic_rank(t,'family')
print(t)


   /-Micropterigidae
  |
  |      /-Tortricidae
  |     |
  |     |      /-Gelechiidae
  |     |     |
--|   /-|   /-|      /-Erebidae
  |  |  |  |  |   /-|
  |  |  |  |  |  |   \-Noctuidae
  |  |  |  |   \-|
  |  |   \-|     |   /-Bombycidae
  |  |     |      \-|
  |  |     |         \-Saturniidae
   \-|     |
     |     |   /-Papilionidae
     |      \-|
     |        |   /-Pieridae
     |         \-|
     |            \-Nymphalidae
     |
      \-Hepialidae


In [294]:
t.write(outfile='constraint_trees/Lepidoptera_Kawahara_2019_family_constraints.nwk', format=9)

## Hymenoptera
Blaimer_2023

In [246]:
import re, pathlib, sys
# If your tree is in a file:
tree = pathlib.Path('constraint_trees/Hymenoptera_Blaimer_2023.nwk').read_text()

# Remove everything between single quotes, including the quotes
clean = re.sub(r"'[^']*'", "", tree)

# Save or print the result
pathlib.Path("tree_clean.nwk").write_text(clean)
f=open('constraint_trees/Hymenoptera_Blaimer_2023.nwk','w')
f.write(clean)
f.close()

In [50]:
t=Tree('constraint_trees/Hymenoptera_Blaimer_2023.nwk')

In [54]:
t=annotate_tree(t)

In [262]:
keep=[x for x in t if x.family in set(df_all[df_all.order=='Hymenoptera'].family)]
t.prune(keep)

In [263]:
t=collapse_monophyletic_rank(t,'family')
print(t)


                     /-Pteromalidae
                  /-|
                 |  |   /-Pteromalidae
               /-|   \-|
              |  |      \-Encyrtidae
              |  |
            /-|   \-Pteromalidae
           |  |
           |  |   /-Pteromalidae
         /-|   \-|
        |  |      \-Trichogrammatidae
        |  |
      /-|   \-Pteromalidae
     |  |
     |  |   /-Formicidae
   /-|   \-|
  |  |      \-Apidae
  |  |
  |  |   /-Ichneumonidae
--|   \-|
  |      \-Braconidae
  |
  |      /-Tenthredinidae
  |   /-|
   \-|   \-Athaliidae
     |
      \-Tenthredinidae


In [None]:
##collapsed mixed clades
t=Tree("((Athaliidae_Tenthredinidae),((Ichneumonidae,Braconidae),((Formicidae,Apidae),(Pteromalidae_Encyrtidae_Trichogrammatidae))));")
print(t)


   /- /-Athaliidae_Tenthredinidae
  |
--|      /-Ichneumonidae
  |   /-|
  |  |   \-Braconidae
   \-|
     |      /-Formicidae
     |   /-|
      \-|   \-Apidae
        |
         \- /-Pteromalidae_Encyrtidae_Trichogrammatidae


In [60]:
t.write(outfile='constraint_trees/Hymenoptera_Blaimer_2023_family_constraints.nwk',format=9)

## Drosophilidae 
Suvorov_2022

In [13]:
t=Tree("constraint_trees/Drosophilidae_Suvorov_2022.nwk",format=1)
for x in t:
    x.name=x.name.replace('D_','Drosophila ')
    if x.name=='Drosophila pseudotalamancana':
        x.name='Drosophila gibberosa'
keep=[x for x in t if x.name in set(df_all[df_all.family=='Drosophilidae'].index)]

t.prune(keep)

In [145]:
print(t)


               /-Drosophila gibberosa
            /-|
           |  |   /-Drosophila hydei
           |   \-|
         /-|      \-Drosophila repleta
        |  |
        |  |   /-Drosophila virilis
      /-|   \-|
     |  |      \-Drosophila americana
     |  |
   /-|  |   /-Drosophila immigrans
  |  |   \-|
  |  |      \-Drosophila funebris
--|  |
  |   \-Drosophila busckii
  |
  |   /-Drosophila melanogaster
   \-|
      \-Drosophila willistoni


In [146]:
t.write(outfile='constraint_trees/Drosophilidae_Suvorov_2022_species_constraints.nwk',format=9)

Remaining constriant trees in constraint_trees were manually processed and all concatenated manually as described in SI methods to produce constraint_trees/all_combined.nwk

## All species manuscript phylogram (Figure 9 and supplementary figure 1)
make a species-level phylogram for all species by substituting species in for ranks in the constraint tree

In [102]:
##load a manually concatenated newick of all constraints
t=Tree("constraint_trees/all_combined.nwk",format=1)


In [103]:
for index, row in df_all.iterrows():
    r=";".join([str(row[x]) for x in ['genus','family','order','class','phylum','subphylum','subfamily','superfamily','suborder','infraorder','subclass','infraorder']])
    r=r+";"
    df_all.loc[index,'c']=r

In [126]:
##create a dictionary mapping between leaves in constraint tree and species within that leaf taxo
name_map={}
for x in t:
    ns=x.name
    l=[]
    if '_' in ns:
        for n in ns.split("_"):
        
            dfi=df_all[df_all.c.str.contains(n+";")]
            l.extend([x.replace(" ","_") for x in list(dfi.index)])
    
    else:
        n=ns
        dfi=df_all[df_all.c.str.contains(n+";")]
        l=[x.replace(" ","_") for x in list(dfi.index)]
    name_map[ns]=l

In [127]:
s=open("constraint_trees/all_combined.nwk","r").readlines()[0]
for x in name_map:
    p=str(name_map[x]).replace("'","").replace("[","(").replace("]",")").replace(" ","")+x
    s=s.replace(x,p)

In [128]:
for x in name_map:
    p=str(name_map[x]).replace("'","").replace("[","(").replace("]",")").replace(" ","")+x
    s=s.replace(x,p)

In [1]:
f=open('constraint_trees/all_combined_species.nwk','w')
f.write(s)
f.close()