In [3]:
import pandas as pd
from Bio import SeqIO
import pickle

In [4]:
##Load chimera mapping to intervals
# `chimeras` is keyed by chimera id; each value maps an interval to a label
# string (later cells test these labels against 'HGT' and 'Meta').
# NOTE(review): pickle can execute arbitrary code on load -- only open files
# produced by this pipeline.
file_path = 'outputs/round2_chimera_intervals.pickle'
with open(file_path, 'rb') as file:
    chimeras=pickle.load(file)

In [9]:
## Build a lookup table of genome metadata indexed by assembly accession
genome_tables = [
    'Data/genbank_genomes_4_22_2025.tsv',
    'Data/refseq_genomes_scaffold_plus_4_19_2025.tsv',
]
df1, df2 = (pd.read_csv(table_path, sep='\t') for table_path in genome_tables)
# GenBank rows first, then RefSeq rows; the accession column becomes the index
dftax = pd.concat([df1, df2]).set_index('Assembly Accession')


## Filter out ankyrin repeat proteins using NCBI CD-search hits

In [1]:
## write round 2 chimeras to a fasta for submission to NCBI CD-search webportal https://www.ncbi.nlm.nih.gov/Structure/bwrpsb/bwrpsb.cgi
# NOTE(review): `record_dict` is not defined in any visible cell; presumably a
# mapping of chimera id -> SeqRecord built with Bio.SeqIO -- TODO confirm and
# add the cell that creates it so the notebook survives Restart & Run All.
# The context manager guarantees the file is flushed and closed even if a
# chimera id is missing from `record_dict`.
with open('outputs/round2_chimeras_full_length.fa', 'w') as f:
    for x in chimeras:
        # The FASTA header is the second ';'-separated field of the chimera id
        n = x.split(";")[1]
        f.write(f">{n}\n")
        f.write(str(record_dict[x].seq) + '\n')
    
    

In [7]:
## Load CD-search results (downloaded from the web server) and split them
## into ankyrin-related hits and everything else
ankyrin_short_names = [
    'ANKYR', 'Ank_2', 'PHA03095 superfamily', 'PHA03100 superfamily',
    'ANKYR superfamily', 'Ank_4', 'Ank_5', 'PRANC superfamily',
]
cdhit = pd.read_csv("outputs/round2_chimeras_cdd_search.txt", sep="\t")
# Keep only the text following the first ">" in each query label
cdhit['Query'] = cdhit['Query'].map(lambda label: label.split(">")[1])
is_ankyrin_hit = cdhit['Short name'].isin(ankyrin_short_names)
ank = cdhit[is_ankyrin_hit]
no_ank = cdhit[~is_ankyrin_hit]

In [8]:

# Number of distinct queries with at least one ankyrin-related CD-search hit
print(len(ank['Query'].unique()))

71


In [9]:
##filter out ankyrin repeat proteins
# Build the lookup set once instead of once per chimera.
ankyrin_queries = set(ank.Query)
# CD-search queries are matched on the second ';'-separated field of the chimera id
chimeras_filtered = {
    x: chimeras[x]
    for x in chimeras
    if x.split(";")[1] not in ankyrin_queries
}
len(chimeras_filtered)

454

In [9]:
## find the number of intervals overlapping ankyrin repeat domains (not reported in manuscript; just for curiosity)
def intervals_overlap(interval1, interval2):
    """Return whether two closed (start, end) intervals share at least one position.

    Each argument is a two-item sequence unpacked as (start, end). Intervals
    that merely touch at an endpoint count as overlapping.
    """
    first_start, first_end = interval1
    second_start, second_end = interval2

    latest_start = max(first_start, second_start)
    earliest_end = min(first_end, second_end)
    # Overlap exists exactly when the later start does not pass the earlier end
    return latest_start <= earliest_end
# Collect one entry per chimera interval that overlaps any ankyrin-related
# CD-search hit on the same protein. Entry format:
#   <chimera id>;<interval label>_<interval with spaces removed>
overlaps = []
for chimera in chimeras:
    # CD-search hits are matched on the second ';'-separated field of the id
    anki = ank[ank.Query == chimera.split(";")[1]]
    ank_spans = list(zip(anki['From'], anki['To']))

    for interval, label in chimeras[chimera].items():
        # any() stops at the first overlapping hit, so each interval is
        # recorded at most once
        if any(intervals_overlap(interval, span) for span in ank_spans):
            overlaps.append(chimera + ";" + label + "_" + str(interval).replace(" ", ""))
 
                
        

In [11]:
len(overlaps)

100

In [12]:
len([x for x in overlaps if 'HGT' in x]),len([x for x in overlaps if 'Meta' in x])

(40, 60)

# Transposable element filtering


Split the filtered chimeras into FASTA files of at most 100 sequences each for submission to CENSOR on the Repbase website: https://www.girinst.org/censor/index.php

In [12]:
def chunks(seq, n=100):
    """Lazily produce consecutive slices of ``seq``, each holding up to ``n`` items.

    The final slice is shorter when ``len(seq)`` is not a multiple of ``n``.
    """
    yield from (seq[start:start + n] for start in range(0, len(seq), n))

import os

# Split the filtered chimera ids into batches of 100, one FASTA file per batch
parts = list(chunks(list(chimeras_filtered), 100))

# exist_ok=True makes the cell re-runnable once the directory exists
os.makedirs('split_for_repbase', exist_ok=True)
for i, part in enumerate(parts):
    # The context manager closes each file even if a sequence lookup fails.
    # NOTE(review): `record_dict` is not defined in any visible cell --
    # presumably chimera id -> SeqRecord; TODO confirm.
    with open(f'split_for_repbase/{i}.fasta', 'w') as f:
        for p in part:
            f.write(f">{p}\n")
            f.write(str(record_dict[p].seq) + '\n')
        
        

## Parse censor html outputs as a pandas dataframe and fasta file

In [1]:
from io import StringIO
import sys, re, pathlib
import re
import pandas as pd


def extract_local_alignments(html_text: str) -> pd.DataFrame:
    """
    Parse a CENSOR results page and return a tidy DataFrame containing

      • Local Alignments* data (query/hit coords, Dir, Sim, PosMmTs, Score)
      • TE_description  – concatenated DE lines for each Hit_Name
      • TE_species      – concatenated OS lines for each Hit_Name
      • TE_lineage      – concatenated OC lines for each Hit_Name

    Parameters
    ----------
    html_text : str
        The full HTML of a CENSOR results page.

    Returns
    -------
    pandas.DataFrame
        One row per alignment with the extra TE_* columns.
    """

    # ── 1. Grab the Local Alignments* section ──────────────────────────────
    h2 = re.search(r"<h2[^>]*>\s*Local\s+Alignments\*\s*</h2>", html_text,
                   flags=re.I)
    if not h2:
        raise ValueError("Local Alignments* section not found")
    section = html_text[h2.start():]

    # Read every <table> in that section (wrap string in StringIO)
    tables = pd.read_html(StringIO(section))
    align_dfs = [t for t in tables if {'Dir', 'Sim', 'Score'}.issubset(t.columns)]
    if not align_dfs:
        raise ValueError("No Local Alignments tables detected")
    df = pd.concat(align_dfs, ignore_index=True)

    # ── 2. Normalise duplicate “Name / From / To” columns ───────────────────
    rename, n_name = {}, 0
    n_from = n_to = 0
    for col in df.columns:
        low = col.lower()
        if low.startswith("name"):
            rename[col] = "Query_Name" if n_name == 0 else "Hit_Name"; n_name += 1
        elif low.startswith("from"):
            rename[col] = "Query_From" if n_from == 0 else "Hit_From"; n_from += 1
        elif low.startswith("to"):
            rename[col] = "Query_To" if n_to == 0 else "Hit_To"; n_to += 1
        elif "pos" in low:
            rename[col] = "PosMmTs"
        else:
            rename[col] = col
    df = df.rename(columns=rename)

    # ── 3. Parse Annotation of Repbase Sequences (<pre> block) ──────────────
    pre = re.search(r'Annotation of Repbase Sequences.*?<pre>(.*?)</pre>',
                    html_text, flags=re.S)
    desc_map, species_map, lineage_map = {}, {}, {}
    if pre:
        for rec in re.split(r'\n//\s*\n', pre.group(1)):          # split at terminator
            m_id = re.search(r'^ID\s+([^\s]+)', rec, flags=re.M)
            if not m_id:
                continue
            rid = m_id.group(1)

            desc_map[rid]    = " ".join(re.findall(r'^DE\s+(.+)$', rec, flags=re.M))
            species_map[rid] = " ".join(re.findall(r'^OS\s+(.+)$', rec, flags=re.M))
            lineage_map[rid] = " ".join(re.findall(r'^OC\s+(.+)$', rec, flags=re.M))

    # ── 4. Attach TE_* columns ──────────────────────────────────────────────
    df["TE_description"] = df["Hit_Name"].map(desc_map).fillna("")
    df["TE_species"]     = df["Hit_Name"].map(species_map).fillna("")
    df["TE_lineage"]     = df["Hit_Name"].map(lineage_map).fillna("")

    # Optional: put columns in a nice order
    preferred = ["Query_Name", "Query_From", "Query_To",
                 "Hit_Name",  "Hit_From",  "Hit_To",
                 "Dir", "Sim", "PosMmTs", "Score",
                 "TE_description", "TE_species", "TE_lineage"]
    df = df[[c for c in preferred if c in df.columns]]

    return df
import pathlib
# Parse every CENSOR result page, then concatenate once. Growing a DataFrame
# with pd.concat inside the loop copies all earlier rows on every pass, and
# seeding it with an empty frame triggers a FutureWarning in recent pandas.
censor_frames = []
for x in range(5):  # one result page per submitted chunk
    html_text = pathlib.Path(f'split_for_repbase/{x}_result.html').read_text(encoding="utf-8", errors="ignore")
    censor_frames.append(extract_local_alignments(html_text))
# Row labels are kept as-is (each page's index restarts), as before
repdf = pd.concat(censor_frames)

In [10]:
## Number of filtered chimeras that have at least one Repbase (CENSOR) hit
# Query names are reduced to their first two ';'-separated fields before matching
annotated_ids = {";".join(name.split(";")[:2]) for name in repdf.Query_Name}
len(annotated_ids.intersection(chimeras_filtered))

215

In [20]:
## Fraction of filtered chimeras that have at least one Repbase (CENSOR) hit
candidate_ids = set(chimeras_filtered)
annotated_ids = {";".join(name.split(";")[:2]) for name in repdf.Query_Name}
len(annotated_ids & candidate_ids) / len(candidate_ids)

0.473568281938326

In [13]:
# Save the parsed CENSOR alignments (row index included) as a tab-separated file
repdf.to_csv(path_or_buf='outputs/censor_repbase_hits.tsv', sep='\t')

In [33]:

"""


Extract every CDS translation from the “Annotation of Repbase Sequences”
section in a batch of CENSOR HTML reports and write them all to one
FASTA file.

"""

from __future__ import annotations
import re, textwrap, pathlib, sys

# ── regexes ────────────────────────────────────────────────────────────────
# Contents of the <pre> block following the annotation heading
ANN_RE = re.compile(
    r'Annotation of Repbase Sequences.*?<pre>(.*?)</pre>',
    re.S,
)
# A line holding only "//" ends a record
REC_SPLT = re.compile(r'\n//\s*\n')
# First token after "ID" at the start of a line
ID_RE = re.compile(r'^ID\s+(\S+)', re.M)
# Quoted value of a /translation qualifier (may span lines)
TR_RE = re.compile(r'/translation="([^"]+)"', re.S)


def extract_cds_from_html(html_path: pathlib.Path) -> list[tuple[str, str]]:
    """
    Collect every /translation="..." value from one CENSOR HTML report.

    Returns (header, sequence) pairs in file order. Headers look like
    <record ID>_CDS<n>, with n counting from 1 within each record. All
    whitespace is removed from each sequence. Records without an ID line are
    skipped, and a page without the annotation block yields an empty list.
    """
    page = html_path.read_text(encoding="utf-8", errors="ignore")

    annotation = ANN_RE.search(page)
    if annotation is None:
        return []

    collected: list[tuple[str, str]] = []
    for record in REC_SPLT.split(annotation.group(1)):
        id_match = ID_RE.search(record)
        if id_match is None:
            continue
        record_id = id_match.group(1)

        # NOTE(review): only whitespace is stripped. If continuation lines of a
        # translation carry a line prefix (e.g. an EMBL-style "FT"), those
        # letters stay in the sequence -- confirm against a real report.
        collected.extend(
            (f"{record_id}_CDS{number}", re.sub(r"\s+", "", raw))
            for number, raw in enumerate(TR_RE.findall(record), start=1)
        )

    return collected

# Gather the CDS translations from every CENSOR report into one FASTA file
out_path = pathlib.Path("outputs/repbase_cds_translations.fasta")
out_path.parent.mkdir(parents=True, exist_ok=True)

fasta_lines: list[str] = []
n_seqs = 0  # number of FASTA records collected across all reports

for x in range(5):                                     # adjust range if needed
    html_file = pathlib.Path(f"split_for_repbase/{x}_result.html")
    if not html_file.exists():
        print(f"[warn] {html_file} not found – skipping", file=sys.stderr)
        continue

    for header, seq in extract_cds_from_html(html_file):
        fasta_lines.append(f">{header}")
        # Sequences are wrapped at 60 residues per line
        fasta_lines.extend(textwrap.wrap(seq, 60))
        n_seqs += 1

if not fasta_lines:
    # Raise a regular exception rather than sys.exit(): inside a notebook the
    # goal is to stop this cell with a clear message, not to exit the process.
    raise RuntimeError("No CDS translations found in any input file")

# End the file with a newline so the last record is a complete line and the
# file can be safely concatenated with others.
out_path.write_text("\n".join(fasta_lines) + "\n")






621232

In [14]:
## Map each HGT interval that overlaps a Metazoan transposon hit to the list
## of overlapping Repbase hit names.
## Keys look like "<chimera id>;HGT_<interval with spaces removed>".
from collections import defaultdict
overlapping_metazoan_TEs = defaultdict(list)
# The lineage test does not depend on the chimera, so evaluate it once
is_metazoan_te = repdf.TE_lineage.str.contains('Metazoa')
for c, intervals in chimeras_filtered.items():
    # NOTE(review): this matches Query_Name exactly, while the annotation
    # counts elsewhere in the notebook first cut Query_Name down to its first
    # two ';'-separated fields -- confirm exact matching is intended.
    rep = repdf[(repdf.Query_Name == c) & is_metazoan_te]
    if rep.empty:
        continue
    te_spans = list(zip(rep.Query_From, rep.Query_To, rep.Hit_Name))
    for inter, label in intervals.items():
        if label != 'HGT':
            continue
        key = c + ';' + 'HGT_' + str(inter).replace(' ', '')
        for query_from, query_to, hit_name in te_spans:
            if intervals_overlap((query_from, query_to), inter):
                overlapping_metazoan_TEs[key].append(hit_name)
                       
                

In [11]:
%%bash
##run diamond blastp of intervals vs concatenated censor protein outputs
# Stop at the first failing command so blastp never runs against a missing or
# stale database when makedb fails.
set -euo pipefail
# Container image used for both diamond steps
DIAMOND_IMAGE=/cvmfs/singularity.galaxyproject.org/d/i/diamond:2.0.15--hb97b32f_1
# Build the protein database from the Repbase CDS translations
singularity exec "$DIAMOND_IMAGE" diamond makedb --in outputs/repbase_cds_translations.fasta --db repbase_cds_translations
# Search the interval sequences against it; tabular output with explicit columns
singularity exec "$DIAMOND_IMAGE" diamond blastp --db repbase_cds_translations.dmnd --query outputs/split_intervals.fasta --out outputs/diamond_v_repbase.tsv --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore --very-sensitive

In [15]:
# Column names follow the --outfmt 6 field list given to diamond blastp
blast6_columns = "qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore".split(" ")
rephits = pd.read_csv("outputs/diamond_v_repbase.tsv", sep="\t", names=blast6_columns)

In [16]:
## Drop hits to the second CDS of Medea: it has been shown to be of bacterial
## origin (https://doi.org/10.1073/pnas.0800444105), so it is not treated as a
## metazoan TE here
not_medea_cds2 = rephits['sseqid'] != 'Medea_CDS2'
rephits = rephits[not_medea_cds2]

In [17]:
# Reduce each subject id to the Repbase element name by dropping the "_CDS<n>"
# suffix. assign() returns a new frame, which avoids setting a column on a
# filtered slice via attribute assignment (SettingWithCopyWarning hazard).
rephits = rephits.assign(sseqid=[x.split("_CDS")[0] for x in rephits.sseqid])

In [18]:
## Keep only the BLAST hits whose query is an HGT interval overlapping a
## Metazoan TE.
# One vectorised membership test replaces a concat per interval, which
# re-copied the growing frame on every pass. Rows keep their order from
# `rephits`. The .copy() lets later cells add columns without a
# SettingWithCopyWarning.
rephitsfiltered = rephits[rephits.qseqid.isin(list(overlapping_metazoan_TEs))].copy()


In [19]:
## Load the top non-metazoan BLAST hit of each query for comparison with the
## bitscore of its TE hits.
# Each query's table is read once, even when the query has several TE hits.
top_fields = ['stitle', 'skingdoms', 'pident', 'bitscore']
best_non_meta = {}
for x in rephitsfiltered['qseqid'].unique():
    dfblast = pd.read_csv(f'outputs/round2_diamond_output_split/{x}.tsv', sep='\t')
    is_metazoan = dfblast.skingdoms.astype(str).str.contains('Metazoa')
    # Match 32630 as a whole taxid so that e.g. 132630 is not excluded too.
    # NOTE(review): 32630 is presumably the NCBI taxid for synthetic
    # constructs -- confirm.
    is_excluded_taxid = dfblast.staxids.astype(str).str.contains(r'\b32630\b', regex=True)
    dfblast = dfblast[~is_metazoan & ~is_excluded_taxid].sort_values('bitscore', ascending=False)
    if dfblast.empty:
        # No qualifying hit: leave the fields missing instead of raising
        # IndexError; the resulting bit ratio is NaN.
        best_non_meta[x] = [float('nan')] * len(top_fields)
    else:
        best_non_meta[x] = dfblast.iloc[0][top_fields].tolist()

# Attach the four fields as new columns after the loop, so the frame is not
# modified while it is being iterated.
for position, field in enumerate(top_fields):
    rephitsfiltered['non_meta_top_' + field] = [
        best_non_meta[query][position] for query in rephitsfiltered['qseqid']
    ]


In [22]:
## Keep, for every query interval, the single TE hit with the highest bitscore
# idxmax returns the row label of the maximum within each qseqid group
idx = rephitsfiltered.groupby('qseqid')['bitscore'].idxmax()
top_hits = rephitsfiltered.loc[idx].reset_index(drop=True)

In [23]:
## Ratio of the best non-metazoan bitscore to the best TE bitscore, per query
top_hits['bit_ratio'] = top_hits['non_meta_top_bitscore'].div(top_hits['bitscore'])

In [24]:
## make a tsv of BLAST data for hgt intervals overlapping TEs
# Hit name -> TE description, taken from the parsed CENSOR results
nmap = dict(zip(repdf.Hit_Name, repdf.TE_description))
# assign() builds a new frame, so `top_hits` itself is left unmodified
exclude_TE = top_hits.assign(
    TE_description=[nmap.get(te_id, 'missing') for te_id in top_hits.sseqid],
    # The first ';'-separated field of the query id is used as the
    # assembly-accession key into dftax
    species=[dftax.loc[x.split(';')[0], 'Organism Name'] for x in top_hits.qseqid],
)
# Select the ten reported columns explicitly and rename them by name.
# Assigning a 10-item list to `.columns` fails with a length mismatch because
# the frame also carries the remaining BLAST fields.
# NOTE(review): the source -> output pairing below is inferred from the output
# names -- confirm.
output_columns = {
    'qseqid': 'query interval',
    'species': 'query species',
    'sseqid': 'TE_ID',
    'TE_description': 'TE_description',
    'pident': 'TE_pident',
    'bitscore': 'TE_bitscore',
    'non_meta_top_stitle': 'non_meta_top_stitle',
    'non_meta_top_pident': 'non_meta_top_pident',
    'non_meta_top_bitscore': 'non_meta_top_bitscore',
    'bit_ratio': 'bit_ratio',
}
exclude_TE = exclude_TE[list(output_columns)].rename(columns=output_columns)
exclude_TE.to_csv('outputs/TE_bitscore_comparison.tsv', sep='\t')

In [41]:
# Write the TE bitscore comparison table (row index included) as a tab-separated file
exclude_TE.to_csv(path_or_buf='outputs/TE_bitscore_comparison.tsv', sep='\t')

In [42]:
from collections import Counter

In [15]:
# Keep only intervals whose best non-metazoan bitscore is below 1.7x the best
# TE bitscore; these are the intervals removed from the chimera set later on
exclude_TE = exclude_TE.loc[exclude_TE['bit_ratio'].lt(1.7)]

In [16]:
# Number of filtered chimeras with at least one interval flagged for TE exclusion
te_flagged_ids = {";".join(name.split(";")[:2]) for name in exclude_TE['query interval']}
len(te_flagged_ids.intersection(chimeras_filtered))

90

In [44]:
##print out names of TE hits that are filtered out
Counter(list(exclude_TE.TE_description))

Counter({'Non-LTR retrotransposon from Tribolium castaneum.': 2,
         'Crypton DNA transposon from the Photinus pyralis genome, consensus.': 7,
         'LTR retrotransposon from Bemisia tabaci, internal portion, consensus.': 3,
         'L2-type retrotransposon sequence - a consensus.': 1,
         'Non-LTR retrotransposon.': 2,
         'missing': 8,
         'a R1 element from Heliconius melpomene.': 1,
         'LTR retrotransposon from the Sitobion miscanthi genome - internal portion consensus.': 1,
         'autonomous Polinton DNA transposon - consensus.': 2,
         'Non-LTR retrotransposon from the bear giant-skipper genome - consensus.': 6,
         'LTR retrotransposon from the Scaptodrosophila lebanonensis genome - internal portion consensus.': 1,
         'Non-LTR retrotransposon from the Cydia splendana genome: consensus.': 4,
         'LTR retrotransposon from the yellow fever mosquito genome: internal portion.': 8,
         'LTR retrotransposon from the boll weevil

In [19]:
from collections import defaultdict
# Drop every interval flagged for TE exclusion, then keep only chimeras that
# still contain at least one 'HGT' and at least one 'Meta' interval.
# A set gives constant-time lookups; the list was rebuilt and scanned for
# every interval.
excluded_intervals = set(exclude_TE['query interval'])
chimeras_transposon_filtered = defaultdict(dict)
for c, intervals in chimeras_filtered.items():
    for inter, label in intervals.items():
        # Same key format as the 'query interval' column
        if c + ';' + 'HGT_' + str(inter).replace(" ", '') not in excluded_intervals:
            chimeras_transposon_filtered[c][inter] = label
chimeras_transposon_filtered = {
    x: kept
    for x, kept in chimeras_transposon_filtered.items()
    if 'HGT' in kept.values() and 'Meta' in kept.values()
}
len(chimeras_transposon_filtered)

365

In [35]:
# Persist the ankyrin- and transposon-filtered chimeras as a pickle file
file_path = 'outputs/transposon_ankyrin_filtered_round2_chimera_intervals.pickle'
with open(file_path, mode='wb') as file:
    file.write(pickle.dumps(chimeras_transposon_filtered))

In [21]:
# Reload the filtered chimeras from the pickle file written by this notebook
file_path = 'outputs/transposon_ankyrin_filtered_round2_chimera_intervals.pickle'
with open(file_path, mode='rb') as file:
    chimeras_transposon_filtered = pickle.loads(file.read())

In [22]:
##store filtered chimeras as a list output
# One "<chimera id>:<interval dict>" line per chimera. The context manager
# flushes and closes the file even if a write fails.
with open('outputs/transposon_ankyrin_filtered_round2_chimera_intervals.txt', 'w') as f:
    for k, v in chimeras_transposon_filtered.items():
        f.write(f"{k}:{v}\n")