In [1]:
import pickle
import pandas as pd
import multiprocessing as mp
import numpy as np
import pickle
import matplotlib.pyplot as plt
from Bio import SeqIO
import os
import subprocess
import ast
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

In [2]:
## Load the filtered round-2 BLAST chimeras.
## `chimeras` is used below as {chimera accession: {interval: 'HGT' | 'Meta'}}.
## NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
import pickle
file_path = 'outputs/transposon_ankyrin_filtered_round2_chimera_intervals.pickle'
with open(file_path, 'rb') as file:
    chimeras=pickle.load(file)

In [3]:
## Flatten the chimera -> interval mapping into names of the form
## "<chimera>;<annotation>_<interval with spaces removed>"
intervals = [
    c + ";" + annotation + "_" + str(coords).replace(" ", "")
    for c, by_coords in chimeras.items()
    for coords, annotation in by_coords.items()
]

In [17]:
len(chimeras)

365

In [13]:
from Bio import SeqIO

# Index every arthropod query protein by its FASTA record id
all_seqs = SeqIO.to_dict(
    SeqIO.parse('outputs/all_arthropod_concatenated_proteins.fa', 'fasta')
)
# Add a secondary chimera that was PCR'd in the first pipeline iteration but is
# now suppressed in the latest A. albopictus annotation
a2 = SeqIO.to_dict(
    SeqIO.parse('outputs/suppressed_aedes_albopictus.fa', 'fasta')
)
# Entries from a2 win if an id is present in both dictionaries
all_seqs = {**all_seqs, **a2}

In [5]:
## Load a dataframe of genome taxids keyed by genome (assembly) accession,
## built from the GenBank and RefSeq genome listings.
## NOTE(review): unlike the later reloads of this table in this notebook, this copy
## does not add the 'GCF_006496715.1' (Aedes albopictus) row.
df1=pd.read_csv('Data/genbank_genomes_4_22_2025.tsv',sep='\t')
df2=pd.read_csv('Data/refseq_genomes_scaffold_plus_4_19_2025.tsv',sep='\t')
dftax=pd.concat([df1,df2]).set_index('Assembly Accession')


## Process hmmsearch output tsvs

In [6]:
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path


# Column names applied to the hmmsearch tables once raw columns 1 and 4 are dropped.
cols = (
    # target / query identity and lengths
    ["target_name", "tlen", "query name", "qlen"]
    # whole-sequence statistics
    + ["E-value", "overall_score", "overall_bias"]
    # per-domain statistics
    + ["#", "of", "c-Evalue", "i-Evalue", "domain_score", "bias"]
    # coordinates on the profile, the alignment and the envelope
    + ["hmmfrom", "hmmto", "alifrom", "alito", "envfrom", "envto"]
    # trailing fields
    + ["acc", "description of target", "species"]
)

def process_interval(x):
    """
    Normalise, in place, the two hmmsearch tables of one interval.

    For ``outputs/hmmsearch_v_arthropod/{x}.tsv`` and ``outputs/hmmsearch_v_nr/{x}.tsv``:
    add a header, drop raw columns 1 and 4, and then
      * arthropod table: annotate each hit with species and taxid looked up in the
        module-level ``dftax`` via the assembly accession (text before the first ';'
        of the target name), and drop the 'description of target' column;
      * nr table: remove rows whose 'species' field contains "synthetic".

    A table whose first line already contains the word 'species' is treated as
    processed and left untouched, so the function can be re-run safely.

    Parameters
    ----------
    x : str
        Interval name (file stem of the two TSVs).

    Returns
    -------
    None
        The tables are rewritten on disk.
    """

    def _already_processed(path):
        # Processed tables start with the header row, which names a 'species' column.
        # The context manager closes the handle (the original left it open).
        with open(path, 'r') as handle:
            return 'species' in handle.readline()

    arth_path = Path(f"outputs/hmmsearch_v_arthropod/{x}.tsv")
    nr_path   = Path(f"outputs/hmmsearch_v_nr/{x}.tsv")

    if not _already_processed(arth_path):
        # original raw files: no header
        arth = pd.read_csv(arth_path, sep="\t", header=None)
        # drop unused cols
        arth = arth.drop([1,4], axis=1)
        # assign column names
        arth.columns = cols
        # annotate arthropod hits with species & taxid (KeyError if an assembly is missing from dftax)
        assemblies = [name.split(";")[0] for name in arth['target_name']]
        arth['species'] = [dftax.loc[acc, 'Organism Name'] for acc in assemblies]
        arth['taxid']   = [dftax.loc[acc, 'Organism Taxonomic ID'] for acc in assemblies]
        arth = arth.drop('description of target', axis=1)
        arth.to_csv(arth_path, sep="\t", index=False)
        del arth

    if not _already_processed(nr_path):
        nr = pd.read_csv(nr_path, sep="\t", header=None)
        nr = nr.drop([1,4], axis=1)
        nr.columns = cols
        # remove synthetic constructs
        nr = nr[~nr['species'].astype(str).str.contains("synthetic")]
        nr.to_csv(nr_path, sep="\t", index=False)
        del nr
    return




In [102]:
# Rewrite every interval's hmmsearch tables in parallel (40 worker processes).
# process_interval returns nothing, so `results` is only a list of None placeholders.
with mp.Pool(40) as pool:
    results=pool.map(process_interval,intervals)

## Extract arthropod secondary chimeras

In [91]:
## Arthropod protein accessions from NR, kept as a set (first column of a headerless TSV)
ar = set(
    pd.read_csv('outputs/arthropoda.accessions', sep='\t', header=None)[0]
)

In [19]:
## Find intervals whose HMM input FASTA (sub_seq.fasta) holds at most one sequence,
## i.e. sequences with no non-self blast hits (bit-score>min(bit-score non-arthropod)).
## These are assumed to have 0 secondary chimeras, and blast hits instead of hmmsearch
## hits are used for phylogenetic dataset construction.
singleton_blast_hits=[]
for chimera in chimeras:
    # Interval names are "<chimera>;<annotation>_<coords>", so match on the exact prefix;
    # a plain substring test would also match accessions that merely share a prefix (".1" vs ".10").
    prefix = chimera + ";"
    for x in intervals:
        if not x.startswith(prefix):
            continue
        # Local name so the global `a2` loaded earlier is not clobbered
        sub_seqs = SeqIO.to_dict(SeqIO.parse(f'outputs/hmmbuild/{chimera}/{x}/sub_seq.fasta', 'fasta'))
        if len(sub_seqs) <= 1:
            singleton_blast_hits.append(x)
##chimeras that have at least one interval analyzed with diamond hits alone
singleton_blast_proteins={";".join(x.split(';')[0:2]) for x in singleton_blast_hits}
##chimeras with no diamond-hit-only intervals
non_singleton_chimeras=set(chimeras.keys())-singleton_blast_proteins



In [20]:
len(singleton_blast_hits), len(intervals)

(79, 807)

In [12]:
## Load a dataframe of genome taxids keyed by genome (assembly) accession,
## built from the GenBank and RefSeq genome listings.
df1=pd.read_csv('Data/genbank_genomes_4_22_2025.tsv',sep='\t')
df2=pd.read_csv('Data/refseq_genomes_scaffold_plus_4_19_2025.tsv',sep='\t')
dftax=pd.concat([df1,df2]).set_index('Assembly Accession')
## Manually set name and taxid for assembly GCF_006496715.1 (Aedes albopictus, taxid 7160)
dftax.loc['GCF_006496715.1',['Organism Name','Organism Taxonomic ID']]=['Aedes albopictus',7160]

In [16]:
"""
takes an interval name, returns arhropod hmmsearch hits e-value<1e-2, along with coordinates
"""
def get_arthropod_hit_set(x,e_thresh=1e-4):
    ##extract blast hits if hmm profile is not built from >1 sequence
    if x in singleton_blast_hits:
        arth_path = Path(f"outputs/round2_diamond_v_arthropod_output_split/{x}.tsv")
        arth = pd.read_csv(arth_path, sep="\t")
        arth['species']=[dftax.loc[x.split(";")[0],'Organism Name'] for x in arth['sseqid']]
        arth = arth[(arth['evalue']<e_thresh)]
        arth = arth.loc[arth.groupby('sseqid', as_index=True)['bitscore'].idxmax(),:]
        # build the set of hit descriptors
        hit_set = {row.sseqid:[(row.sstart,row.send),row.species]
            for row in arth.itertuples()}
    ## else extract hmmsearch hits
    else:
        arth_path = Path(f"outputs/hmmsearch_v_arthropod/{x}.tsv")
        arth = pd.read_csv(arth_path, sep="\t")
        ##exception for the aedes albopictus hit GCF_006496715.1;XP_029735553.1 added for chimera GCF_002204515.2;XP_021699539.1
        if 'GCF_002204515.2;XP_021699539.1' in x:
            arth = arth[(arth['i-Evalue']<e_thresh)|(arth.target_name=='GCF_006496715.1;XP_029735553.1')]
        else:
            arth = arth[(arth['i-Evalue']<e_thresh)]
        arth = arth.loc[arth.groupby('target_name', as_index=True)['domain_score'].idxmax(),:]
        # build the set of hit descriptors
        hit_set = {row.target_name:[(row.envfrom,row.envto),row.species]
            for row in arth.itertuples()}
    del arth
    return hit_set


In [17]:
# Collect the arthropod hits of every interval in parallel (40 worker processes)
with mp.Pool(40) as pool:
    results = pool.map(get_arthropod_hit_set, intervals)
## interval name -> its hit dictionary (pool.map preserves input order)
chimera_hits_dict = dict(zip(intervals, results))


In [19]:
"""
get the set of simultaneous hits to all intervals of the primary chimera
x: chimera accession
"""
def get_hit_intersection(x):

    # build a list of all hit sets for all intervals in chimera x
    sets_to_intersect = [
        {g for g in chimera_hits_dict[inter] }
        for inter in intervals
        if x in inter
    ]
    return (set.intersection(*sets_to_intersect) if sets_to_intersect else set())

In [20]:
## chimera -> arthropod proteins that are simultaneous hits to all of its intervals
chimera_intersection = {x: get_hit_intersection(x) for x in chimeras}

In [21]:
def confirm_secondary_chimera_interval_order(ch):
    """
    Filter the candidate secondary chimeras of primary chimera `ch` by interval order.

    Candidates are the members of ``chimera_intersection[ch]``. For every 'HGT'
    interval of `ch` (visited in the order stored in ``chimeras[ch]``):

    * the candidate's coordinates for that HGT interval are looked up with
      ``get_arthropod_hit_set``;
    * if the preceding interval is 'Meta', the candidate is kept only when its
      Meta hit starts before the HGT hit and overlaps it by fewer than 15 positions;
    * if the following interval is 'Meta', the candidate is kept only when its
      Meta hit starts after the HGT hit start and overlaps it by fewer than 15 positions.

    Parameters
    ----------
    ch : str
        Primary chimera accession.

    Returns
    -------
    list
        Identifiers of the candidates that passed every check.
    """
    labels = chimeras[ch]
    ordered_coords = list(labels.keys())
    n_intervals = len(labels)

    def _hits_for(label, coords):
        # Interval names are "<chimera>;<label>_<coords without spaces>"
        return get_arthropod_hit_set(f"{ch};{label}_" + str(coords).replace(" ", ""))

    def _filter_flank(frame, coords, side):
        # Annotate the flanking Meta hit ('up' or 'down') and apply the order/overlap filter.
        start_col = f'meta_{side}_start'
        end_col = f'meta_{side}_end'
        overlap_col = f'{side}_len_overlap'

        flank_hits = _hits_for('Meta', coords)
        for acc in set(flank_hits) & set(frame.index):
            frame.loc[acc, start_col] = flank_hits[acc][0][0]
            frame.loc[acc, end_col] = flank_hits[acc][0][1]

        # Overlap length between the HGT hit and the Meta hit; 0 when they are disjoint
        right_bound = np.minimum(frame['hgt_end'], frame[end_col])
        left_bound = np.maximum(frame['hgt_start'], frame[start_col])
        frame[overlap_col] = (right_bound - left_bound).clip(lower=0)

        if side == 'up':
            in_order = frame[start_col] < frame['hgt_start']
        else:
            in_order = frame[start_col] > frame['hgt_start']
        return frame[in_order & (frame[overlap_col] < 15)]

    # One row per candidate; columns are added as intervals are processed
    candidates = pd.DataFrame(index=list(chimera_intersection[ch]))

    for pos, coords in enumerate(ordered_coords):
        if labels[coords] != 'HGT':
            continue

        # (1) HGT coordinates and species of each candidate
        hgt_hits = _hits_for('HGT', coords)
        for acc in set(candidates.index) & set(hgt_hits):
            candidates.loc[acc, 'hgt_start'] = hgt_hits[acc][0][0]
            candidates.loc[acc, 'hgt_end'] = hgt_hits[acc][0][1]
            candidates.loc[acc, 'species'] = hgt_hits[acc][1]

        # (2) N-terminal neighbour
        if pos > 0 and candidates.shape[0] > 0 and labels[ordered_coords[pos - 1]] == 'Meta':
            candidates = _filter_flank(candidates, ordered_coords[pos - 1], 'up')

        # (3) C-terminal neighbour (evaluated after the up-stream filter has been applied)
        if pos < n_intervals - 1 and candidates.shape[0] > 0 and labels[ordered_coords[pos + 1]] == 'Meta':
            candidates = _filter_flank(candidates, ordered_coords[pos + 1], 'down')

    return list(candidates.index)


In [23]:
## primary chimera -> list of its confirmed secondary chimeras
secondary_chimera_adjacency_list = {
    x: confirm_secondary_chimera_interval_order(x) for x in chimeras
}

In [24]:
## Save the adjacency list (primary chimera -> secondary chimeras) as a pickled dictionary
file_path = 'outputs/secondary_chimera_adjacency_list.pickle'
with open(file_path, 'wb') as file:
    pickle.dump(secondary_chimera_adjacency_list,file)

In [22]:
## Reload the pickled adjacency list (lets later cells run without recomputing it)
file_path = 'outputs/secondary_chimera_adjacency_list.pickle'
with open(file_path, 'rb') as file:
    secondary_chimera_adjacency_list=pickle.load(file)

In [7]:
##save .txt representation of adj list output (one "key:value" line per primary chimera)
# The context manager closes the file even if a write fails.
with open('outputs/secondary_chimera_adjacency_list.txt','w') as f:
    for k,v in secondary_chimera_adjacency_list.items():
        f.write(f"{k}:{v}\n")

## Orthologous clustering of HGT-chimeras

In [25]:
## Adjacency list restricted to neighbours that are themselves primary chimeras
primary_ids = set(chimeras)
secondary_chimera_adjacency_list_filtered = {
    x: list(primary_ids & set(neighbours))
    for x, neighbours in secondary_chimera_adjacency_list.items()
}

In [25]:
from Bio import SeqIO
## Load the FASTA of chimeras from the previous iteration; its record ids are used below
## to prioritise those chimeras when choosing one representative sequence per cluster.
og = SeqIO.to_dict(SeqIO.parse('outputs/previous_iteration_chimeras.fa', 'fasta'))


In [27]:
import networkx as nx
## Build a directed graph from the filtered adjacency list (edge: primary -> neighbour).
## The graph is built from edges only, so a chimera with an empty neighbour list that is
## nobody's neighbour does not appear as a node.
G = nx.DiGraph((u, v) for u, nbrs in secondary_chimera_adjacency_list_filtered.items() for v in nbrs)
## Clusters = weakly connected components (edge direction ignored)
components = list(nx.weakly_connected_components(G))

In [11]:
## Choose one representative per cluster: representative accession -> cluster members.
## Ties are broken by accession (min / sort key) rather than by set iteration order,
## so the chosen representatives are the same on every run.
representative_map={}
i=0      # number of clusters represented by a previous-iteration chimera
m=[]     # those representatives, in cluster order
og_ids=set(og.keys())
for c in components:

    ##prioritize previously sequenced chimeras as the cluster representatives
    ## and, among them, those with hmmer hits
    previously_sequenced=c&og_ids
    if previously_sequenced:
        with_hmmer_hits=previously_sequenced&non_singleton_chimeras
        k=min(with_hmmer_hits) if with_hmmer_hits else min(previously_sequenced)
        representative_map[k]=c
        m.append(k)
        i+=1
    ##else select as the cluster representative the sequence with the maximum number of secondaries
    ## (restricted to chimeras with hmmer hits when the cluster has any)
    else:
        candidates=(c&non_singleton_chimeras) or set(c)
        max_sec=min(
            candidates,
            key=lambda acc: (-len(secondary_chimera_adjacency_list[acc]), acc),
        )
        representative_map[max_sec]=c

In [35]:
##print the number of clusters
len(set(representative_map))

299

In [34]:
## Save the representative -> cluster-members mapping as a pickled dictionary
file_path = 'outputs/clustering_representative_seqs.pickle'
with open(file_path, 'wb') as file:
    pickle.dump(representative_map,file)

In [18]:
##save .txt representation of the representative -> cluster mapping (one "key:value" line each)
# The context manager closes the file even if a write fails.
with open('outputs/clustering_representative_seqs.txt','w') as f:
    for k,v in representative_map.items():
        f.write(f"{k}:{v}\n")

In [9]:
## Reload the pickled representative -> cluster-members mapping
file_path = 'outputs/clustering_representative_seqs.pickle'
with open(file_path, 'rb') as file:
    representative_map=pickle.load(file)

## Secondary chimera blast confirmation
Uses DIAMOND blast to validate HGT or Metazoan annotations for each separated secondary chimera interval

In [79]:
!mkdir -p outputs/phylogenetic_dataset

In [1]:
## Load a dictionary of primary:secondary chimera mappings for PCR-validated secondary
## chimeras from a previous pipeline iteration. The intent is to prioritise selection of
## PCR'd secondary chimeras (iff they appear as secondary chimeras in this screen).
import pickle
file_path = 'outputs/previous_iteration_secondary_chimeras.pickle'
with open(file_path, 'rb') as file:
    previous_iteration_secondary=pickle.load(file)

In [9]:
!mkdir -p outputs/secondary_chimera_fastas

In [33]:
"""
takes a chimera accession and writes a 'secondary_chimera' fasta for each interval
with the hmmsearch-demarcated intervals of the secondary chimera 
"""
def write_secondary_chimeras(c):
    os.makedirs(f"outputs/secondary_chimera_fastas/{c}", exist_ok=True)
    ints=[x for x in intervals if c in x]
    for x in ints:
        
        os.makedirs(f"outputs/secondary_chimera_fastas/{c}/{x}", exist_ok=True)
        
        f=open(f"outputs/secondary_chimera_fastas/{c}/{x}/secondary_chimera.fa",'w')
        
        ##selection if using blast hits
        if x in singleton_blast_hits:
            arth_path = Path(f"outputs/round2_diamond_v_arthropod_output_split/{x}.tsv")
            arth = pd.read_csv(arth_path, sep="\t")
            arth=arth[arth.sseqid.isin(secondary_chimera_adjacency_list[c])]
            arth=arth[arth.sseqid!=c]
            arth=arth.loc[arth.groupby('sseqid')['bitscore'].idxmax()]
            if arth.shape[0]>0:
                for index, row in arth.iterrows():
                    name=row.sseqid
                    start=row.sstart
                    stop=row.send
                    seq=str(all_seqs[name].seq)[start-1:stop]
                    name=name+";"+str((start,stop)).replace(" ","")
                    f.write(f'>{name}\n')
                    f.write(f'{seq}\n')
        ##selection if using hmmer hits
        else:
            arth_path = Path(f"outputs/hmmsearch_v_arthropod/{x}.tsv")
            arth = pd.read_csv(arth_path, sep="\t")
            arth=arth[arth.target_name.isin(secondary_chimera_adjacency_list[c])]
            arth=arth[arth.target_name!=c]
            arth=arth.loc[arth.groupby('target_name')['domain_score'].idxmax()]
            if arth.shape[0]>0:
                for index, row in arth.iterrows():
                    name=row.target_name
                    start=row.envfrom
                    stop=row.envto
                    seq=str(all_seqs[name].seq)[start-1:stop]
                    name=name+";"+str((start,stop)).replace(" ","")
                    f.write(f'>{name}\n')
                    f.write(f'{seq}\n')
        f.close()


In [34]:
def _safe_write(c):
    """Run write_secondary_chimeras(c); report a failure instead of raising so that
    one bad item does not abort the whole pool."""
    try:
        write_secondary_chimeras(c)
    except Exception as err:
        # Still prints the item that failed, now followed by the reason for the failure
        print(c, repr(err))

with mp.Pool(30) as pool:         # 30 worker processes
    pool.map(_safe_write, list(representative_map.keys()))
    # pool.map blocks until all tasks have finished

In [35]:
from pathlib import Path

root      = Path("outputs/secondary_chimera_fastas")          # adjust if needed
outfile   = Path("outputs/all_secondary_chimeras.fa")      # where to write


def _retagged_lines(fasta_path):
    """Yield the lines of one FASTA with '<interval dir name>;;' prefixed to every header."""
    tag = fasta_path.parent.name                   # immediate subdirectory
    with fasta_path.open() as fh:
        for line in fh:
            if not line.startswith(">"):
                yield line
                continue
            # strip the leading '>' and trailing whitespace, then rewrite the header
            yield f">{tag};;{line[1:].rstrip()}\n"


# pattern: root / * / * / secondary_chimera.fa
with outfile.open("w") as out:
    for fasta in root.glob("*/*/secondary_chimera.fa"):
        out.writelines(_retagged_lines(fasta))

In [37]:
!sbatch "scripts/diamond_secondary.sh"


Submitted batch job 17319553


In [36]:
!sh scripts/split_blast_table.sh 'all_secondary_chimeras_out' 'outputs/secondary_chimera_interval_blast_results'

In [4]:
# Index the concatenated secondary-chimera FASTA by record id; headers in that file have
# the form '<interval>;;<secondary>;(start,stop)'.
record_dict=SeqIO.to_dict(SeqIO.parse('outputs/all_secondary_chimeras.fa', 'fasta'))

In [5]:
def check_annot(n, blast_dir="/n/netscratch/extavour_lab/Everyone/Rishabh/secondary_chimera_interval_blast_results"):
    """
    Classify one secondary-chimera interval as "Meta" or "HGT" from its BLAST table.

    Parameters
    ----------
    n : str
        Record name; the table is read from ``{blast_dir}/{n}.tsv`` and the query
        length from the module-level ``record_dict[n]``.
    blast_dir : str, optional
        Directory holding the per-query BLAST tables (headerless, tab separated).

    Returns
    -------
    "Meta", "HGT" or None
        None when no usable hit remains after filtering, when neither rule is
        satisfied, or when the table cannot be processed (the error is printed).

    Filtering: hits must cover >30% of the query, must not be Arthropoda or Rotifera,
    must have a taxid, and must not be synthetic constructs (taxid 32630).
    The "Meta" rule is evaluated before the "HGT" rule.
    """
    blast_cols = ("qseqid sseqid stitle staxids sscinames sphylums skingdoms pident length "
                  "mismatch gapopen qstart qend sstart send evalue bitscore").split(" ")
    try:
        df = pd.read_csv(f"{blast_dir}/{n}.tsv", sep="\t", names=blast_cols)

        leng = len(record_dict[n].seq)
        df["cov"] = (np.array(df.qend) - np.array(df.qstart) + 1) / leng
        #filter by >30% coverage of the query
        dfo = df[df["cov"] > .30]
        dfo = dfo[~dfo.sphylums.astype(str).str.contains("Arthropoda")]
        dfo = dfo[~dfo.sphylums.astype(str).str.contains("Rotifera")]
        dfo = dfo[dfo.staxids.astype(str) != "nan"]
        ##exclude synthetic sequences
        dfm = dfo[dfo.staxids != 32630]

        # Nothing left to classify: bail out before computing any statistics
        if dfm.shape[0] == 0:
            return None

        is_metazoan = dfm.skingdoms.astype(str).str.contains("Metazoa")
        # .copy() so the index columns are added to independent frames, not to slices of dfm
        dfmeta = dfm[is_metazoan].copy()
        dfhgt = dfm[~is_metazoan].copy()
        # log10 e-value gap between the best hit of the other group and each hit of this group
        dfhgt["AI"] = np.log10(dfmeta.evalue.min() + 1e-200) - np.log10(dfhgt.evalue + 1e-200)
        dfmeta["MI"] = np.log10(dfhgt.evalue.min() + 1e-200) - np.log10(dfmeta.evalue + 1e-200)

        ## first 300 rows of the filtered table
        ## NOTE(review): no sort is applied here; assumes the table is already ordered best-first
        dfmi = dfm.iloc[0:300, :]
        top_is_metazoan = dfmi.skingdoms.astype(str).str.contains("Metazoa")
        dfmetai = dfmi[top_is_metazoan]
        dfhgti = dfmi[~top_is_metazoan]
        n_top_taxa = len(set(dfmi.staxids))

        hgt_condition = (
            (dfhgt.evalue.min() < 1e-4 or dfhgt.bitscore.max() > 50)
            and len(set(dfhgt.staxids)) > 10
            and (len(set(dfhgt[dfhgt.AI > 5].staxids)) > 10
                 or len(set(dfhgti.staxids)) / n_top_taxa >= .95)
        )
        meta_condition = (
            dfmeta.evalue.min() < .1
            and (len(set(dfmeta[dfmeta.MI > 1].staxids)) > 5
                 or len(set(dfmetai.staxids)) / n_top_taxa >= .50)
        )

        if meta_condition:
            return "Meta"
        if hgt_condition:
            return "HGT"
        return None
    except Exception as err:
        # Report which record failed and why; the caller treats None as "no annotation"
        print(n, 'error', repr(err))
        return None

In [6]:
td = list(record_dict.keys())
# Annotate every secondary-chimera interval in parallel (55 worker processes)
with mp.Pool(55) as pool:
    result = pool.map(check_annot, td)
# record name -> "Meta" / "HGT" / None (pool.map preserves input order)
secondary_interval_annots = dict(zip(td, result))

GCF_025399875.1;XP_065172373.1;Meta_(688,743);;GCF_025399875.1;XP_065172344.1;(700,765) error
GCF_025399875.1;XP_065172373.1;Meta_(688,743);;GCF_025399875.1;XP_065172345.1;(700,765) error
GCF_025399875.1;XP_065172373.1;Meta_(688,743);;GCF_025399875.1;XP_065172507.1;(626,690) error
GCF_025399875.1;XP_065172373.1;Meta_(688,743);;GCF_025399875.1;XP_065172508.1;(626,690) error


In [7]:
interv_td=set([x.split(";;")[0] for x in td])

## Group the secondary annotations by primary interval in one pass.
## Record names are "<interval>;;<secondary>;(start,stop)".
annots_by_interval={}
for name, call in secondary_interval_annots.items():
    annots_by_interval.setdefault(name.split(";;")[0], []).append(call)

##dictionary storing the fraction of secondary chimeras with mismatching annotations for each chimera interval
## (computed per secondary chimera; previously it was a ratio of distinct annotation labels)
p_mismatch={}
##dictionary storing the set of mismatching annotation labels seen for each chimera interval
opposite_annot={}
for interv in interv_td:

    # expected annotation is encoded in the interval name: "...;<annot>_<coords>"
    annot=interv.split(";")[-1].split("_")[0]
    calls=annots_by_interval[interv]
    # unannotated secondaries (None) are not counted as mismatches
    mismatched=[a for a in calls if a is not None and a!=annot]
    p_mismatch[interv]=len(mismatched)/len(calls)
    opposite_annot[interv]=set(mismatched)

In [10]:
## Representative chimeras excluded because at least one secondary chimera's annotation
## switched from HGT to Meta or vice versa
to_exclude = set()
for x, mismatch_fraction in p_mismatch.items():
    chimera_id = ";".join(x.split(";")[0:2])
    if mismatch_fraction > 0 and chimera_id in representative_map:
        to_exclude.add(chimera_id)
len(to_exclude)

41

In [11]:
##final set of clustered-hgt chimeras
hmmer_chimeras_to_include=set(representative_map)-set(to_exclude)

In [18]:
len(hmmer_chimeras_to_include)

258

## Data table output
Makes a table of all final, secondary-chimera-filtered outputs

In [12]:
## Load a dataframe of genome taxids keyed by genome (assembly) accession,
## built from the GenBank and RefSeq genome listings.
df1=pd.read_csv('Data/genbank_genomes_4_22_2025.tsv',sep='\t')
df2=pd.read_csv('Data/refseq_genomes_scaffold_plus_4_19_2025.tsv',sep='\t')
dftax=pd.concat([df1,df2]).set_index('Assembly Accession')
## Manually set name and taxid for assembly GCF_006496715.1 (Aedes albopictus, taxid 7160)
dftax.loc['GCF_006496715.1',['Organism Name','Organism Taxonomic ID']]=['Aedes albopictus',7160]

In [13]:
from ete3 import NCBITaxa


ncbi = NCBITaxa()          

def lowest_common_rank(taxids, ncbi_obj=None):
    """
    Return the (taxid, scientific name, rank) of the deepest taxon shared by all
    input NCBI taxids (their lowest common ancestor).

    Parameters
    ----------
    taxids : Iterable[int]
        A sequence or set of NCBI taxonomy IDs (e.g. {9606, 10090, 9598}).
    ncbi_obj : ete3.NCBITaxa, optional
        Taxonomy object to query. When omitted, the module-level `ncbi` is used;
        it is looked up at call time, so re-creating `ncbi` takes effect immediately.

    Returns
    -------
    tuple (int, str, str)
        taxid, scientific name, and rank of the lowest common ancestor (LCA).

    Raises
    ------
    ValueError
        If `taxids` is empty, if none of the ids can be resolved to a lineage,
        or if no common ancestor is found.

    Notes
    -----
    Ids whose lineage lookup fails are printed and skipped; the LCA is then
    computed from the remaining ids.
    """
    if ncbi_obj is None:
        ncbi_obj = ncbi

    taxids = list({int(t) for t in taxids})   # unique & cast to int
    if not taxids:
        raise ValueError("`taxids` must contain at least one ID")

    # Full lineage (root → leaf) for each taxid
    lineages = []
    for x in taxids:
        try:
            lineage = ncbi_obj.get_lineage(x)
        except Exception:
            # unknown id: report it and carry on with the others
            print(x)
            continue
        if lineage:
            lineages.append(lineage)
        else:
            print(x)

    if not lineages:
        # Previously this surfaced as an IndexError on lineages[0]
        raise ValueError("None of the taxids could be resolved to a lineage.")

    # All ancestors common to every lineage
    common = set(lineages[0]).intersection(*lineages[1:])
    if not common:
        raise ValueError("No common ancestor found – check the taxids.")

    # Depth of each ancestor in the first lineage: 0=root, larger=deeper
    depth = {tax: idx for idx, tax in enumerate(lineages[0])}

    # Pick the common taxon that’s deepest in the tree
    lca_taxid = max(common, key=lambda t: depth[t])

    # Translate to name and rank
    name = ncbi_obj.get_taxid_translator([lca_taxid])[lca_taxid]
    rank = ncbi_obj.get_rank([lca_taxid])[lca_taxid]

    return lca_taxid, name, rank


In [14]:
df=pd.DataFrame()

In [18]:
for x in hmmer_chimeras_to_include:
    secondaries=secondary_chimera_adjacency_list[x]
    ## assembly accessions (text before the first ';') of the secondaries that are present in dftax
    assemblies=[xi.split(";")[0] for xi in secondaries if xi.split(";")[0] in dftax.index]
    ## fill in taxonomic information
    species=[dftax.loc[acc,'Organism Name'] for acc in assemblies]
    ## fill in taxids of secondary chimeras
    taxids=list(set(int(dftax.loc[acc,'Organism Taxonomic ID']) for acc in assemblies))
    rank=''
    try:
        ## fill in the lowest common taxonomic rank of the secondary chimeras
        rank=str(lowest_common_rank(taxids))
    except Exception as err:
        ## leave the span empty, but report which chimera failed and why
        print(x, repr(err))
    df.loc[x,['n_species','span','secondary_chimera_species','secondary_chimera_sequences']]=len(set(species)),rank,str(set(species)),str(set(secondaries))
    
        

3402493
3402493


In [19]:
df=df.sort_values('n_species',ascending=False)


In [20]:
# Record, per chimera, the string form of its list of HGT intervals
for index in df.index:
    hgt_coords = [coords for coords, label in chimeras[index].items() if label == 'HGT']
    df.loc[index, 'HGT_intervals'] = str(hgt_coords)

In [21]:
# Record, per chimera, the string form of its list of Meta intervals
for index in df.index:
    meta_coords = [coords for coords, label in chimeras[index].items() if label == 'Meta']
    df.loc[index, 'Meta_intervals'] = str(meta_coords)

In [22]:
# Load the domain table and keep only the text after '>' in the Query column
cdhit = pd.read_csv("outputs/round2_chimeras_cdhit.txt", sep="\t")
cdhit['Query'] = cdhit['Query'].map(lambda query: query.split(">")[1])

In [23]:
# Per chimera: list of (Short name, From, To) rows of the cdhit table, ordered by 'From'.
# The table is matched on the protein accession (second ';'-separated field of the index).
for index in df.index:
    protein = index.split(';')[1]
    ci = cdhit[cdhit.Query == protein].sort_values('From')
    s = str(list(zip(ci['Short name'], ci['From'], ci['To'])))
    df.loc[index, 'cdhit'] = s


In [26]:
# Flag chimeras that were already present in the previous iteration's FASTA
for index in df.index:
    seen_before = index in og
    df.loc[index, 'og'] = seen_before

In [27]:
df.to_csv('outputs/clustered_ankyrin_transposon_secondary_filtered_chimeras.tsv',sep='\t')

In [19]:
import pickle
# Reload the full chimera -> interval mapping and keep only the chimeras in the final table
file_path = 'outputs/transposon_ankyrin_filtered_round2_chimera_intervals.pickle'
with open(file_path, 'rb') as file:
    chimeras = pickle.load(file)
chimeras_filtered = {
    accession: by_interval
    for accession, by_interval in chimeras.items()
    if accession in df.index
}

In [23]:
## Save the filtered chimeras (chimera -> interval annotations) as a pickled dictionary
file_path = 'outputs/clustered_ankyrin_transposon_secondary_filtered_chimeras.pickle'
with open(file_path, 'wb') as file:
    pickle.dump(chimeras_filtered,file)

In [22]:
##save .txt representation of filtered chimeras output (one "key:value" line per chimera)
# The context manager closes the file even if a write fails.
with open('outputs/clustered_ankyrin_transposon_secondary_filtered_chimeras.txt','w') as f:
    for k,v in chimeras_filtered.items():
        f.write(f"{k}:{v}\n")