In [2]:
import pandas as pd
# Read SI table 2 (tab-separated, first column used as the index) and keep
# only the rows whose `cluster` value is 3.
si_table = pd.read_csv("SI tables/SI table 2.tsv", sep="\t", index_col=0)
df = si_table.loc[si_table["cluster"] == 3]

## Select the set of sequences for tree building
Cluster 3 representatives, one per gene, prioritizing sequences that are already in the main interval trees.

In [42]:
# Load the combined sequence table for this query and collect the
# `target_name` of every row whose `secondary` value is 'True' or
# 'primary_chimera'.
dfi = pd.read_csv(
    "outputs/phylogenetic_data_filtered/GCF_002217175.1;XP_035715507.1;Meta_(2,65)/combined_sequences_data.tsv",
    sep="\t",
)
in_tree = dfi["secondary"].isin(["True", "primary_chimera"])
tree_prots = set(dfi.loc[in_tree, "target_name"])
# td accumulates every identifier selected for tree building.
td = set(tree_prots)

In [43]:
# Genes that already have a row among the tree proteins are removed from df,
# so the per-gene selection below only considers the remaining genes.
already_in_tree = df.index.isin(tree_prots)
ex_gene = set(df.loc[already_in_tree, "gene"])
df = df.loc[~df["gene"].isin(ex_gene)]

In [44]:
import ast
def _first_interval_length(intervals_repr):
    """Return end - start of the first interval in a stringified interval list."""
    first = ast.literal_eval(intervals_repr)[0]
    return first[1] - first[0]


# Score each row by the combined length of its first HGT interval and its
# first metazoan interval. `assign` builds a new frame, so the column is not
# written into a filtered slice of an earlier frame (SettingWithCopyWarning),
# and each cell is parsed once instead of twice.
df = df.assign(
    HGT_len=[
        _first_interval_length(hgt) + _first_interval_length(meta)
        for hgt, meta in zip(df.HGT_intervals, df.Metazoan_intervals)
    ]
)
# One representative per gene: the row with the largest HGT_len.
td = td | set(df.loc[df.groupby('gene')['HGT_len'].idxmax()].index)

In [46]:
!mkdir outputs/cluster_3_hgt

In [49]:
# Reduce each selected identifier to its second ';'-separated field (used
# below as the NCBI protein accession). Entries without ';' are kept as they
# are, so re-running the cell after td was already converted no longer raises
# IndexError. Duplicates are dropped and the result is sorted so the download
# loop sees a stable order.
td = sorted({x.split(";")[1] if ";" in x else x for x in td})

In [56]:
import subprocess
## Retrieve protein sequences from NCBI (E-utilities efetch, FASTA format)
failed_downloads = []
for acc in td:
    # E-utilities are served over HTTPS.
    url_prot = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id={acc}&rettype=fasta"
    # -p keeps re-runs from failing on directories that already exist.
    subprocess.run(["mkdir", "-p", f"outputs/cluster_3_hgt/{acc}"])
    result = subprocess.run(["wget", url_prot, "-O", f"outputs/cluster_3_hgt/{acc}/prot.fasta"])
    if result.returncode != 0:
        failed_downloads.append(acc)

# A failed wget still leaves a (possibly empty) prot.fasta behind, so report
# the accessions that need to be fetched again instead of failing silently.
if failed_downloads:
    print(f"[warn] download failed for {len(failed_downloads)} accession(s): {failed_downloads}")

In [62]:
import os
from Bio import SeqIO
# Folder that holds one sub-directory per downloaded accession.
main_directory = "outputs/cluster_3_hgt"

In [64]:
# Collect the protein FASTA records from every accession sub-directory.
# Written by RK w/ ChatGPT
all_prots = []
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# record order of the concatenated file reproducible between runs.
for subdir in sorted(os.listdir(main_directory)):
    subdirectory_path = os.path.join(main_directory, subdir)

    # Plain files inside main_directory are not accession folders; skip them.
    if not os.path.isdir(subdirectory_path):
        continue

    prot_fasta_file = os.path.join(subdirectory_path, "prot.fasta")
    if os.path.exists(prot_fasta_file):
        # Read the sequences from prot.fasta and add them to the list
        sequences = list(SeqIO.parse(prot_fasta_file, "fasta"))
        all_prots.extend(sequences)

# Write the concatenated sequences to the output file
with open("outputs/cluster_3_hgt/concatenated_prot.fasta", "w") as output_handle:
    SeqIO.write(all_prots, output_handle, "fasta")

In [2]:
!sbatch "dnds_scripts/run_iqtree_dnds_pipe.sh" "outputs/cluster_3_hgt"

Submitted batch job 29687517


## Write iTOL annotation files for tree visualization

In [10]:
import ete3 
from ete3 import Tree
# Load the tree written to the cluster-3 output directory
# (presumably produced by the batch job submitted above — confirm).
treefile_path = "outputs/cluster_3_hgt/rev_aa.treefile"
t = Tree(treefile_path)




In [8]:
# Count the distinct species represented among the tree leaves.
sp = set()
for x in t:
    # .iloc[0] takes the first matching row by position. The previous
    # `series[0]` relied on the deprecated positional fallback of
    # Series.__getitem__ on a label index (the source of the FutureWarning).
    s = df.loc[df.protein == x.name, "species"].iloc[0].replace(" ", "_")
    sp.add(s)
print(len(sp))

11


  s=df[df.protein==x.name].species[0].replace(" ","_")


In [5]:
# Colour assigned to each taxonomic order in the iTOL annotation.
order_to_color = {
    "Diptera": "#FF0000",
    "Entomobryomorpha": "#FFFF00",
    "Symphypleona": "#FFD700",
    "Sarcoptiformes": "#7851A9",
}
# NOTE(review): the file is opened in append mode, so re-running this cell
# adds the same rows again — presumably an iTOL header was written to the
# file beforehand; confirm before switching to "w".
# The `with` block closes the handle even if a leaf lookup raises.
with open("outputs/cluster_3_hgt/itol_text.txt", "a") as f:
    for x in t:
        # Filter the table once per leaf and reuse the match for both columns.
        leaf_rows = df[df.protein == x.name]
        # .iloc[0]: explicit positional access; `series[0]` on a label index
        # is deprecated and emitted the FutureWarning seen in earlier runs.
        s = leaf_rows["species"].iloc[0].replace(" ", "_")
        o = leaf_rows["order"].iloc[0].replace(" ", "_")
        c = order_to_color[o]
        # One row per leaf: id, "<id>|<species>" label, then the literal
        # fields -1, colour, normal, 1, 0.
        n = f"{x.name},{x.name}|{s},-1,{c},normal,1,0\n"
        f.write(n)

  s=df[df.protein==x.name].species[0].replace(" ","_")
  o=df[df.protein==x.name].order[0].replace(" ","_")


In [27]:

# Attach the `order` value of the matching table row to every leaf.
for x in t:
    matching_rows = df[df.protein == x.name]
    leaf_order = matching_rows['order'].values[0]
    x.add_features(order=leaf_order)

In [35]:
import os

def write_tree_colors_dataset(t, df, out_path,
                              order_to_color=None,
                              type_for_nodes="clade",
                              width=4,
                              style="normal"):
    """
    Write an iTOL TREE_COLORS file with one row per monophyletic group of a taxonomic order.

    Leaves of ``t`` are first tagged with an ``order`` feature looked up in
    ``df`` (the tree is modified in place). For every order in the colour
    mapping, each node returned by ``t.get_monophyletic`` for that order is
    written as ``NODE_ID,TYPE,COLOR,STYLE,WIDTH``.

    Parameters
    ----------
    t : ete3.Tree
        Tree whose leaf names match the values of ``df['protein']``.
    df : pandas.DataFrame
        Table providing the ``protein`` and ``order`` columns.
    out_path : str
        Destination file; its parent directory is created when missing.
    order_to_color : dict[str, str] | None
        Order name -> hex colour. ``None`` selects the built-in four-order palette.
    type_for_nodes : str
        Value written in the TYPE field (``"clade"`` or ``"branch"``).
    width : int
        Value written in the WIDTH field.
    style : str
        Value written in the STYLE field (``"normal"`` or ``"dashed"``).

    Returns
    -------
    None
        The function writes the file and prints a confirmation line.
    """
    # Fall back to the built-in palette when the caller supplies none.
    palette = order_to_color
    if palette is None:
        palette = {
            "Diptera": "#FF0000",
            "Entomobryomorpha": "#FFFF00",
            "Symphypleona": "#FFD700",
            "Sarcoptiformes": "#7851A9",
        }

    # Tag each leaf found in the table with its order; leaves without a
    # match are left untouched.
    order_by_protein = dict(zip(df['protein'], df['order']))
    for tip in t.iter_leaves():
        tip_order = order_by_protein.get(tip.name)
        if tip_order is None:
            continue
        tip.add_features(order=tip_order)

    def itol_node_id(node):
        """Leaf name for a leaf; otherwise 'a|b' built from one leaf under each of the first two children."""
        if node.is_leaf():
            return node.name
        first_tip = node.children[0].get_leaves()[0].name
        second_tip = node.children[1].get_leaves()[0].name
        return f"{first_tip}|{second_tip}"

    # Create the destination folder ('.' when out_path has no directory part).
    target_dir = os.path.dirname(out_path) or "."
    os.makedirs(target_dir, exist_ok=True)

    header_lines = (
        "TREE_COLORS",
        "SEPARATOR COMMA",
        "DATASET_LABEL,taxonomic_label",
        "COLOR,#000000",
        "DATA",
    )

    with open(out_path, "w", encoding="utf-8") as f:
        f.write("".join(line + "\n" for line in header_lines))

        for order_name, color in palette.items():
            for clade in t.get_monophyletic(values=[order_name], target_attr='order'):
                try:
                    node_id = itol_node_id(clade)
                    if not node_id:
                        continue
                    # Row layout: NODE_ID,TYPE,COLOR,STYLE,WIDTH
                    f.write(f"{node_id},{type_for_nodes},{color},{style},{width}\n")
                except Exception as e:
                    # A node that cannot be labelled is reported and skipped.
                    print(f"[warn] Could not write node (order={order_name}): {e}")

    print(f"Wrote TREE_COLORS dataset to: {out_path}")


# Write the clade-colour dataset for the cluster-3 tree.
tree_colors_path = "outputs/cluster_3_hgt/tree_colors.txt"
clade_palette = {
    "Diptera": "#FF0000",
    "Entomobryomorpha": "#FFFF00",
    "Symphypleona": "#FFD700",
    "Sarcoptiformes": "#7851A9",
}
write_tree_colors_dataset(
    t,
    df,
    out_path=tree_colors_path,
    order_to_color=clade_palette,
    type_for_nodes="clade",  # "branch" colours only the single branch
    width=4,
    style="normal",
)


Wrote TREE_COLORS dataset to: outputs/cluster_3_hgt/tree_colors.txt
