In [4]:
import pickle
import pandas as pd
import multiprocessing as mp
import numpy as np
import pickle
import matplotlib.pyplot as plt
from Bio import SeqIO
import os
import subprocess
import ast
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

In [8]:
## Map each taxonomic category to the colour of its leaf on the tree
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Colour names used as labels throughout the notebook
colors = [
    "blue", "brown", "green", "pink", "orange",
    "black", "red", "yellow", "magenta", 'grey',
]

# colour name -> hex code (the value written to the iTOL annotation files)
cmap = dict(zip(colors, map(mcolors.to_hex, colors)))

# colour name -> label shown in the tree legend
color_to_taxonomy_legend = dict(
    blue="Bacteria",
    brown="Fungi",
    green="Viridiplantae",
    yellow="Viruses",
    orange="Non-Arthropod Metazoa",
    pink="Arthropod (non-chimera)",
    magenta="Arthropod chimera (primary)",
    red="Arthropod chimera (secondary)",
    black="Other",
    grey='Rotifera',
)

In [35]:
"""
takes a row of a dataframe with a 'lineage' and 'secondary' column (latter for secondary chimera, boolean)
returns a color label
"""
def color(row):
    """
    Return the colour label for one row of the taxonomy dataframe.

    Parameters
    ----------
    row : pandas.Series
        Must contain 'lineage'. May contain 'secondary', which marks a
        secondary chimera either as a boolean or as the string 'True'.

    Returns
    -------
    str
        One of the colour names: 'brown', 'yellow', 'blue', 'green',
        'magenta', 'red', 'pink', 'grey', 'orange' or 'black'.

    The checks are ordered: the first matching rule wins.
    """
    x = row['lineage']
    # A missing lineage (None / NaN) cannot be searched; treat it as empty so
    # the row falls through to the remaining rules instead of raising.
    if x is None or isinstance(x, float):
        x = ''

    if 'Fungi' in x:
        return 'brown'
    elif 'Viruses' in x:
        return 'yellow'
    elif 'Bacteria' in x:
        return 'blue'
    elif 'Viridiplantae' in x:
        return 'green'
    # Searches the printed form of the whole row (labels and values).
    elif 'primary_chimera' in str(row):
        return 'magenta'
    # str() makes the flag match whether it is the boolean True or the text
    # 'True' (e.g. after a round trip through a TSV file).
    elif 'secondary' in row.index and str(row['secondary']) == 'True':
        return 'red'
    elif 'Arthropoda' in x:
        return 'pink'
    elif 'Rotifera' in x:
        return 'grey'
    elif 'Metazoa' in x:
        return 'orange'
    else:
        return 'black'

In [11]:
"""
adds a legend to the itol_color_strip.txt annotation file
"""
def add_legend(interv,dftax):
    """
    Insert a colour legend into the interval's itol_color_strip.txt file.

    The legend block (title, shapes, colours, labels and strip width) is
    inserted immediately before the line reading ``DATA``. If the file already
    holds the legend title, nothing is written and "present" is printed.

    Parameters
    ----------
    interv : str
        Interval identifier; its first two ';'-separated fields name the
        parent directory under outputs/phylogenetic_dataset.
    dftax : pandas.DataFrame
        Must have a 'color' column of colour names; only colours that occur
        in it appear in the legend.

    Raises
    ------
    ValueError
        If the annotation file has no ``DATA`` line to insert before.
    """
    ch=";".join(interv.split(";")[0:2])
    file_path=f"outputs/phylogenetic_dataset/{ch}/{interv}/itol_color_strip.txt"
    # Read once, and close the handle deterministically.
    with open(file_path, "r") as file:
        lines = file.readlines()

    # The annotation lines are space separated, so the title is written with an
    # underscore. The spaced spelling is still recognised so that files written
    # earlier do not receive a second legend.
    title_line = "LEGEND_TITLE Taxonomic_labels\n"
    if title_line in lines or "LEGEND_TITLE Taxonomic labels\n" in lines:
        print("present")
        return

    insert_index = next(
        (i for i, line in enumerate(lines) if line.strip() == "DATA"), None
    )
    if insert_index is None:
        raise ValueError(f"no DATA line found in {file_path}")

    # hex colour -> legend label, ordered alphabetically by label
    cmapi = {cmap[x]: color_to_taxonomy_legend[x] for x in set(dftax['color'])}
    cmapi = dict(sorted(cmapi.items(), key=lambda item: item[1]))

    # One shape, one colour and one label per legend entry.
    shapes = " ".join(["LEGEND_SHAPES"] + ["1"] * len(cmapi))
    legend_colors = " ".join(["LEGEND_COLORS"] + list(cmapi))
    legend_labels = " ".join(
        ["LEGEND_LABELS"] + [label.replace(" ", "_") for label in cmapi.values()]
    )
    legend_text = (
        title_line + "\n"
        + shapes + "\n"
        + legend_colors + "\n"
        + legend_labels + "\n\n"
        + "STRIP_WIDTH 40\n\n"
    )

    lines.insert(insert_index, legend_text)
    with open(file_path, "w") as file:
        file.writelines(lines)

In [12]:
"""
applies minimum ancestor deviation rooting using a script from https://www.nature.com/articles/s41559-017-0193
"""
def root_tree(interv):
    """
    Root the interval's edited ML tree and write it to final_rooted.tree.

    scripts/mad.py is run on edited_ml_tree.treefile. The leaves under the
    first node below the root of the resulting ``.rooted`` tree are then used
    as the outgroup of the original tree, which is written out in Newick
    format 0.

    Parameters
    ----------
    interv : str
        Interval identifier; its first two ';'-separated fields name the
        parent directory under outputs/phylogenetic_dataset.

    Raises
    ------
    subprocess.CalledProcessError
        If scripts/mad.py exits with a non-zero status.
    ValueError
        If the rooted tree has no node below its root.
    """
    # NOTE(review): `Tree` is not imported in any visible cell - presumably
    # `from ete3 import Tree`; confirm and add it to the imports cell.
    ch=";".join(interv.split(";")[0:2])
    tree_path = f'outputs/phylogenetic_dataset/{ch}/{interv}/edited_ml_tree.treefile'

    # check=True: stop here if the rooting script fails, rather than loading a
    # missing or stale .rooted file below.
    subprocess.run(["python3", "scripts/mad.py", tree_path, "-n"], check=True)
    tbs=Tree(tree_path)
    trs=Tree(tree_path + '.rooted')

    ## use MAD-rooted tree to outgroup root tree with bootstraps
    ## (presumably mad.py drops the support values - TODO confirm)
    first_clade = next(trs.iter_descendants("preorder"), None)
    if first_clade is None:
        raise ValueError(f"rooted tree for {interv} has no nodes below the root")
    # Read leaf names directly instead of parsing the printed form of each leaf.
    l2 = first_clade.get_leaf_names()

    if len(l2)>1:
        tbs.set_outgroup(tbs.get_common_ancestor(l2))
    else:
        tbs.set_outgroup(l2[0])

    newick_str = tbs.write(format=0)
    with open(f'outputs/phylogenetic_dataset/{ch}/{interv}/final_rooted.tree', "w") as fh:
        fh.write(newick_str)

In [2]:
import sys
import itolapi
from itolapi import Itol
"""
Upload tree and associated annotation files to itol
"""
def upload_to_itol(interv):
    """
    Upload one interval's rooted tree and its two annotation files to iTOL.

    Files are read from outputs/phylogenetic_dataset/<ch>/<interv>/, where
    <ch> is the first two ';'-separated fields of `interv`. The tree goes to
    project "HGT_trees" when 'HGT' occurs in `interv`, otherwise to
    "Metazoan_trees".

    The API key is taken from the ITOL_API_KEY environment variable; when that
    is unset the placeholder 'INSERT_API_KEY' is sent, as before.

    Returns
    -------
    The result of ``Itol.upload()`` (false when the upload fails).
    """
    ch=";".join(interv.split(";")[0:2])
    current_dir = Path(os.path.realpath(f"outputs/phylogenetic_dataset/{ch}/{interv}/final_rooted.tree")).parent
    tree = current_dir/"final_rooted.tree"

    # The former sys.path.append of the data directory was dropped: nothing is
    # imported from there, and it grew sys.path on every call.
    uploader = Itol()

    # Annotation files first, then the tree file
    uploader.add_file(current_dir / 'itol_color_strip.txt')
    uploader.add_file(current_dir / 'itol_taxonomic_info.txt')
    uploader.add_file(tree)

    # Upload parameters
    uploader.params['APIkey'] = os.environ.get('ITOL_API_KEY', 'INSERT_API_KEY')
    uploader.params['treeName'] = interv
    if 'HGT' in interv:
        uploader.params['projectName'] ="HGT_trees"
    else:
        uploader.params['projectName'] ="Metazoan_trees"
    uploader.params['workspaceName'] = "Arthropod_chimeric_HGT_ML_trees"

    # Submit the tree, return false if fail
    return uploader.upload()

In [54]:
"""
Master function to annotate, root, and upload trees
"""
def _write_itol_annotation(template_path, out_path, data_lines):
    """Copy an iTOL template to out_path, then append one line per data entry."""
    with open(template_path, "r") as template, open(out_path, "w") as out:
        out.write(template.read())
        for line in data_lines:
            out.write(line + "\n")


def upload_main(interv):
    """
    Annotate, root and upload the ML tree of one interval.

    Steps, all inside outputs/phylogenetic_dataset/<ch>/<interv>/:
      1. load combined_sequences_data.tsv, indexed by 'target_name'
         (or 'sseqid' when that column is absent);
      2. match every sequence id to a leaf of ml_tree.treefile and rename the
         leaves to the part of the id after its first ';';
      3. write edited_ml_tree.treefile, itol_color_strip.txt and
         itol_taxonomic_info.txt;
      4. call root_tree, add_legend and upload_to_itol.

    Rows whose id matches no leaf are left out of the annotation files.

    Parameters
    ----------
    interv : str
        Interval identifier; its first two ';'-separated fields name the
        parent directory.

    Returns
    -------
    The result of upload_to_itol (false when the upload fails).

    Raises
    ------
    KeyError
        If the tree has a leaf that no row of the table matches.
    """
    # NOTE(review): `Tree` is not imported in any visible cell - presumably
    # `from ete3 import Tree`; confirm and add it to the imports cell.
    ch=";".join(interv.split(";")[0:2])
    out_dir = f'outputs/phylogenetic_dataset/{ch}/{interv}'

    ##Load taxonomic info dataframe
    combined=pd.read_csv(f'{out_dir}/combined_sequences_data.tsv',sep='\t')
    id_column = 'target_name' if 'target_name' in combined.columns else 'sseqid'
    combined=combined.set_index(id_column)

    tree = Tree(f'{out_dir}/ml_tree.treefile', format=0)
    leaf_names = tree.get_leaf_names()

    # For each sequence id, the first leaf whose name contains the id with ';'
    # replaced by '_'; None when no leaf matches.
    itol_names = []
    for seq_id in combined.index:
        hits = []
        if isinstance(seq_id, str):
            needle = seq_id.replace(";","_")
            hits = [x for x in leaf_names if needle in x]
        itol_names.append(hits[0] if hits else None)
    combined['itol_name'] = itol_names

    #correct tree leaf names (itol automatically removes characters after ","
    new_name_map = {
        leaf: (seq_id.split(';')[1] if ';' in seq_id else seq_id)
        for leaf, seq_id in zip(combined.itol_name, combined.index)
        if leaf is not None
    }

    # Keep rows that have a lineage and a leaf in the tree. A row without a
    # leaf has no valid label to annotate.
    keep = (combined.lineage.astype(str)!='None') & combined.itol_name.notna()
    combined = combined[keep].copy()

    ##add color based on taxonomic info
    combined['color'] = [color(row) for _, row in combined.iterrows()]

    for leaf in tree.iter_leaves():
        leaf.name = new_name_map[leaf.name]

    with open(f'{out_dir}/edited_ml_tree.treefile', "w") as fh:
        fh.write(tree.write(format=0))

    ## write itol annotation file for colors
    _write_itol_annotation(
        "scripts/itol_color_strip_template.txt",
        f"{out_dir}/itol_color_strip.txt",
        [
            new_name_map[row['itol_name']]+" "+cmap[row['color']]
            for _, row in combined.iterrows()
        ],
    )

    ## write itol annotation file for taxonomic annotation
    taxonomy_lines = []
    for _, row in combined.iterrows():
        # str(): a missing species is written as text instead of raising
        s=str(row['species']).replace(" ","_")
        a=f"{s};{row['order']};{row['class']};{row['phylum']}"
        a=a.replace(',','_')  # ',' separates the fields of this file
        taxonomy_lines.append(new_name_map[row['itol_name']]+","+a+",-1,#000000,normal,1,0")
    _write_itol_annotation(
        "scripts/itol_taxonomic_info_template.txt",
        f"{out_dir}/itol_taxonomic_info.txt",
        taxonomy_lines,
    )

    ## call MAD rooting tree
    root_tree(interv)

    ## add legend for colors
    add_legend(interv,combined)

    ## upload; report a failed upload instead of dropping the result
    uploaded = upload_to_itol(interv)
    if not uploaded:
        print(f"upload failed: {interv}")
    return uploaded



In [8]:
from pathlib import Path

def find_parents_with_contree(root):
    """Return the sorted names of the directories below `root` that directly
    contain an 'ml_tree.contree' file.

    Only the final path component of each directory is returned, not the full
    path. The search is recursive; `root` itself is not reported.

    Parameters
    ----------
    root : str or Path
        Directory to search.

    Returns
    -------
    list[str]
        Unique directory names in sorted order.
    """
    root = Path(root)            # <-- coerce str → Path
    names: set[str] = set()

    for child_dir in root.rglob("*"):
        if child_dir.is_dir() and (child_dir / "ml_tree.contree").is_file():
            # .name is the last path component on every platform
            names.add(child_dir.resolve().name)

    return sorted(names)
completed_intervals = find_parents_with_contree("outputs/phylogenetic_dataset")

In [21]:
## Upload HGT intervals: every completed interval whose name contains 'HGT'
td = list(filter(lambda name: 'HGT' in name, completed_intervals))
with mp.Pool(processes=40) as pool:
    pool.map(upload_main, td)

In [23]:
## Upload Metazoan intervals: every completed interval whose name contains 'Meta'
completed_intervals = find_parents_with_contree("outputs/phylogenetic_dataset")
td = list(filter(lambda name: 'Meta' in name, completed_intervals))
with mp.Pool(processes=40) as pool:
    pool.map(upload_main, td)

## Collect relevant data for Dryad upload

In [25]:

# Row labels (first column) of Tree_manual_inspection_HGT.tsv
hgt = list(
    pd.read_csv(
        "Tree_manual_inspection_HGT.tsv",
        sep='\t',
        index_col=0,
    ).index
)

In [27]:
!mkdir 'outputs/phylogenetic_data_filtered'
for x in hgt:
    d= f'outputs/phylogenetic_data_filtered/{x}'
    !mkdir -p "$d"
    ch=";".join(x.split(";")[0:2])
    od=f"outputs/phylogenetic_dataset/{ch}/{x}"
    !cp "$od"/"final_rooted.tree" "$d"/"final_rooted.tree"
    !cp "$od"/"ml_tree.iqtree" "$d"/"ml_tree.iqtree"
    !cp "$od"/"itol_color_strip.txt" "$d"/"itol_color_strip.txt"
    !cp "$od"/"itol_taxonomic_info.txt" "$d"/"itol_taxonomic_info.txt"
    !cp "$od"/"trimmed_MSA.fasta" "$d"/"trimmed_MSA.fasta"
    !cp "$od"/"MSA.fasta" "$d"/"MSA.fasta"
    !cp "$od"/"all_sequences.fa" "$d"/"all_sequences.fa"
    !cp "$od"/"combined_sequences_data.tsv" "$d"/"combined_sequences_data.tsv"
   

In [28]:
meta=list(pd.read_csv("Tree_manual_inspection_Metazoan.tsv",sep='\t',index_col=0).index)
for x in meta:
    d= f'outputs/phylogenetic_data_filtered/{x}'
    !mkdir -p "$d"
    ch=";".join(x.split(";")[0:2])
    od=f"outputs/phylogenetic_dataset/{ch}/{x}"
    !cp "$od"/"final_rooted.tree" "$d"/"final_rooted.tree"
    !cp "$od"/"ml_tree.iqtree" "$d"/"ml_tree.iqtree"
    !cp "$od"/"itol_color_strip.txt" "$d"/"itol_color_strip.txt"
    !cp "$od"/"itol_taxonomic_info.txt" "$d"/"itol_taxonomic_info.txt"
    !cp "$od"/"trimmed_MSA.fasta" "$d"/"trimmed_MSA.fasta"
    !cp "$od"/"MSA.fasta" "$d"/"MSA.fasta"
    !cp "$od"/"all_sequences.fa" "$d"/"all_sequences.fa"
    !cp "$od"/"combined_sequences_data.tsv" "$d"/"combined_sequences_data.tsv"

In [None]:
%%bash
# Pack the filtered dataset into the gzip-compressed archive
# phylogenetic_data_filtered.tar.gz, listing each file as it is added (-v).
# -C outputs: tar changes into outputs/ before adding the directory, so member
# paths start at phylogenetic_data_filtered/.
tar -czvf phylogenetic_data_filtered.tar.gz \
    -C outputs \
    phylogenetic_data_filtered

## Tree upload for manuscript 8/27/2025
Correct the mislabeling of some secondary chimeras on the trees, correct the legend title, and upload all trees to a single iTOL project.

In [1]:
si2 = pd.read_csv("SI tables/SI table 2.tsv", sep="\t", index_col=0)
## Build the interval ids of the rows flagged `representative`:
## "<row label>;HGT_<interval>" and "<row label>;Meta_<interval>", spaces removed
import ast
intervals = []
for gene_id, record in si2[si2.representative].iterrows():
    intervals.extend(
        gene_id + ";HGT_" + str(h).replace(" ", "")
        for h in ast.literal_eval(record.HGT_intervals)
    )
    intervals.extend(
        gene_id + ";Meta_" + str(h).replace(" ", "")
        for h in ast.literal_eval(record.Metazoan_intervals)
    )
        
    
import os
# Interval directories of the filtered dataset (notebook checkpoint folders skipped)
l=[x for x in os.listdir("outputs/phylogenetic_data_filtered") if '.ipynb' not in x]

## Pass 1: flag secondary chimeras
# NOTE(review): `sec` is not defined in any visible cell, so this cell fails on
# a fresh kernel. Presumably a table whose index holds the ids of secondary
# chimeras - confirm and define it before this loop.
for xi in l:
    tsv_path = f"outputs/phylogenetic_data_filtered/{xi}/combined_sequences_data.tsv"
    df=pd.read_csv(tsv_path,sep="\t")

    # The id column is 'target_name' when present, otherwise 'sseqid'.
    # Chosen explicitly so unrelated errors are not swallowed.
    id_col = 'target_name' if 'target_name' in df.columns else 'sseqid'
    is_secondary = df[id_col].isin(sec.index)
    # Only touch the frame when something matches, so no empty 'secondary'
    # column is created.
    if is_secondary.any():
        df.loc[is_secondary,'secondary']=True
    df.to_csv(tsv_path,sep="\t")

## Pass 2: drop the 'Unnamed' columns (written row indexes picked up on re-reading)
for xi in l:
    tsv_path = f"outputs/phylogenetic_data_filtered/{xi}/combined_sequences_data.tsv"
    df=pd.read_csv(tsv_path,sep="\t")
    df=df.drop([x for x in df.columns if 'Unnamed' in x],axis=1)
    df.to_csv(tsv_path,sep="\t")

## Pass 3: read the written row index back as the index, then rename the first
## remaining column to 'index' and make it the index of the saved table.
# NOTE(review): this renames whatever the first data column is - confirm that
# this is intended.
for xi in l:
    tsv_path = f"outputs/phylogenetic_data_filtered/{xi}/combined_sequences_data.tsv"
    df=pd.read_csv(tsv_path,sep="\t",index_col=0)
    c=list(df.columns)
    c[0]='index'
    df.columns=c
    df=df.set_index('index')
    df.to_csv(tsv_path,sep="\t")



In [3]:
## correct itol labels
# For every interval: write the legend title with an underscore and, when the
# table flags secondary chimeras, colour their strip entries red and make sure
# the legend has a red "Arthropod_chimera_(secondary)" entry.

for interv in l:
    df=pd.read_csv(f"outputs/phylogenetic_data_filtered/{interv}/combined_sequences_data.tsv",sep="\t")
    file_path=f"outputs/phylogenetic_data_filtered/{interv}/itol_color_strip.txt"
    with open(file_path,"r") as fh:
        # Fix the title on the lines themselves, so it survives the
        # secondary-chimera rewrite below as well.
        f=[x.replace('Taxonomic labels','Taxonomic_labels') for x in fh.readlines()]

    if 'secondary' in df.columns and 'True' in set(df.secondary.astype(str)):
        # The id column is 'target_name' when present, otherwise 'sseqid'.
        id_col = 'target_name' if 'target_name' in df.columns else 'sseqid'
        # Strip labels are the part of the id after the first ';'
        # (the whole id when there is none).
        secs = {
            str(name).split(";")[1] if ";" in str(name) else str(name)
            for name in df.loc[df.secondary.astype(str)=='True', id_col]
        }
        # A legend entry needs one shape, one colour and one label, so a shape
        # is added whenever red is about to be added to the colours.
        needs_red = any('LEGEND_COLORS' in x and "#ff0000" not in x for x in f)

        updated = []
        for x in f:
            if needs_red and x.startswith('LEGEND_SHAPES'):
                x=x.strip()+" 1\n"
            if 'LEGEND_COLORS' in x and "#ff0000" not in x:
                x=x.strip()+" #ff0000\n"
            if 'LEGEND_LABELS' in x and 'Arthropod_chimera_(secondary)' not in x:
                x=x.strip()+" Arthropod_chimera_(secondary)\n"
            elif x.split(" ")[0] in secs:
                # data line of a secondary chimera: recolour it red
                x=x.split(" ")[0]+" #ff0000\n"
            updated.append(x)
        f = updated

    with open(file_path,"w") as f2:
        f2.writelines(f)
import sys
import itolapi
from itolapi import Itol
from pathlib import Path
"""
Upload tree and associated annotation files to itol
"""
def upload_to_itol(interv):
    """
    Upload one interval's rooted tree and its two annotation files to iTOL.

    This definition replaces the earlier upload_to_itol. It reads from
    outputs/phylogenetic_data_filtered/<interv>/ and puts every tree into the
    single project "Arthropod HGT-chimera interval trees 8/27/2025". Trees
    whose id is listed in the global `intervals` get a leading '*' in their
    tree name.

    The API key is taken from the ITOL_API_KEY environment variable; when that
    is unset the placeholder 'INSERT KEY' is sent, as before.

    Returns
    -------
    The result of ``Itol.upload()`` (false when the upload fails).
    """
    current_dir = Path(os.path.realpath(f"outputs/phylogenetic_data_filtered/{interv}/final_rooted.tree")).parent
    tree = current_dir/"final_rooted.tree"

    # The former sys.path.append of the data directory was dropped: nothing is
    # imported from there, and it grew sys.path on every call.
    uploader = Itol()

    # Annotation files first, then the tree file
    uploader.add_file(current_dir / 'itol_color_strip.txt')
    uploader.add_file(current_dir / 'itol_taxonomic_info.txt')
    uploader.add_file(tree)

    # Upload parameters
    uploader.params['APIkey'] = os.environ.get('ITOL_API_KEY', 'INSERT KEY')
    tree_name = "*"+interv if interv in intervals else interv
    uploader.params['treeName'] = tree_name
    uploader.params['projectName'] ="Arthropod HGT-chimera interval trees 8/27/2025"
    uploader.params['workspaceName'] = "Arthropod_chimeric_HGT_ML_trees"

    # Submit the tree, return false if fail
    return uploader.upload()
import multiprocessing as mp
# Upload every filtered interval in parallel and keep the per-tree results, so
# failed uploads are reported instead of passing silently.
with mp.Pool(40) as pool:
    upload_ok = pool.map(upload_to_itol,l)
failed_uploads = [name for name, ok in zip(l, upload_ok) if not ok]
if failed_uploads:
    print(f"{len(failed_uploads)} upload(s) failed:", *failed_uploads, sep="\n")
        
        

In [None]:
%%bash
# Re-create phylogenetic_data_filtered.tar.gz from the filtered dataset,
# listing each file as it is added (-v).
# -C outputs: tar changes into outputs/ before adding the directory, so member
# paths start at phylogenetic_data_filtered/.
tar -czvf phylogenetic_data_filtered.tar.gz \
    -C outputs \
    phylogenetic_data_filtered