In [6]:
##read back the pickled mapping chimera name -> {interval: annotation}
import pickle
file_path = 'outputs/round2_chimera_intervals.pickle'
with open(file_path, 'rb') as file:
    chimeras = pickle.load(file)
##one "<chimera>;<annotation>_<interval>" string per interval, with spaces removed from the interval text
intervals = [
    name + ";" + annot + "_" + str(span).replace(" ", "")
    for name, spans in chimeras.items()
    for span, annot in spans.items()
]

In [5]:
##add headers to round2 diamond outputs
##imported here so the cell also runs on a fresh kernel, before the main import cell
import os
import pandas as pd

diamond_columns = "qseqid sseqid stitle staxids sscinames sphylums skingdoms pident length mismatch gapopen qstart qend sstart send evalue bitscore".split(" ")
for n in os.listdir('outputs/round2_diamond_output_split'):
    ##os.listdir returns the complete file name; drop a trailing ".tsv" before it is
    ##re-added below, otherwise the path becomes "<name>.tsv.tsv"
    stem = n[:-len(".tsv")] if n.endswith(".tsv") else n
    path = f"outputs/round2_diamond_output_split/{stem}.tsv"
    ##a file already rewritten by this cell starts with the header line; skipping it
    ##keeps a re-run from turning that header into a data row
    with open(path) as handle:
        first_line = handle.readline()
    if "qseqid" in first_line.rstrip("\n").split("\t")[:2]:
        continue
    df = pd.read_csv(path, sep="\t", names=diamond_columns)
    ##the default index is still written as an unnamed first column, as before
    df.to_csv(path, sep="\t")

In [8]:
import pickle
import pandas as pd
import multiprocessing as mp
import numpy as np
import pickle
import matplotlib.pyplot as plt
from Bio import SeqIO
import os
import subprocess
import ast
import warnings
warnings.filterwarnings('ignore')

def check_annot(n):
    """
    Re-annotate one split interval from its round2 diamond hit table.

    takes the name of an interval blast dataframe (string) stored in
    round2_diamond_output_split (read with a header row); the interval length
    is taken from the module-level record_dict
    returns "Meta", "HGT" or None. "Meta" is tested first, so an interval that
    meets both sets of criteria is reported as "Meta". None is returned when
    neither set holds or when no hit survives the filters.

    Hits are kept when they cover >30% of the query, do not mention Arthropoda
    or Rotifera in sphylums, have a taxid, and are not synthetic (taxid 32630).
    """

    df = pd.read_csv(f"outputs/round2_diamond_output_split/{n}.tsv", sep="\t")

    leng = len(record_dict[n.split('.tsv')[0]].seq)
    df["cov"] = (np.array(df.qend) - np.array(df.qstart) + 1) / leng
    #filter by >30% coverage of the query
    dfo = df[df["cov"] > .30]
    dfo = dfo[~dfo.sphylums.astype(str).str.contains("Arthropoda")]
    dfo = dfo[~dfo.sphylums.astype(str).str.contains("Rotifera")]
    dfo = dfo[dfo.staxids.astype(str) != "nan"]
    ##exclude synthetic sequences
    #staxids is read as text whenever any value is not a plain number (e.g. several
    #';'-separated taxids); comparing text with the integer 32630 never matches, so
    #the comparison is done on a numeric view (non-numeric values become NaN and are kept)
    dfm = dfo[pd.to_numeric(dfo.staxids, errors="coerce") != 32630]

    #nothing left to judge; returning here also guarantees a non-zero denominator below
    if dfm.shape[0] == 0:
        return None

    is_meta = dfm.skingdoms.astype(str).str.contains("Metazoa")
    #copies, so that adding a column does not write into a slice of dfm
    dfmeta = dfm[is_meta].copy()
    dfhgt = dfm[~is_meta].copy()
    #AI: log10(best metazoan evalue) - log10(hit evalue) for every non-metazoan hit;
    #MI is the mirror image. 1e-200 keeps log10 finite for an evalue of 0.
    #With no hits on the other side the minimum is NaN, so the index is NaN and
    #no hit passes the AI/MI thresholds below.
    dfhgt["AI"] = np.log10(dfmeta.evalue.min() + 1e-200) - np.log10(dfhgt.evalue + 1e-200)
    dfmeta["MI"] = np.log10(dfhgt.evalue.min() + 1e-200) - np.log10(dfmeta.evalue + 1e-200)

    ##get the top 300 hits by lowest evalue
    #NOTE(review): rows are taken in file order and are not re-sorted here - confirm the
    #diamond output is already ordered best-first
    dfmi = dfm.iloc[0:300, :]
    dfmetai = dfmi[dfmi.skingdoms.astype(str).str.contains("Metazoa")]
    dfhgti = dfmi[~dfmi.skingdoms.astype(str).str.contains("Metazoa")]
    n_top_taxids = len(set(dfmi.staxids))

    hgt_condition = (
        (dfhgt.evalue.min() < 1e-4 or dfhgt.bitscore.max() > 50)
        and len(set(dfhgt.staxids)) > 10
        and (len(set(dfhgt[dfhgt.AI > 5].staxids)) > 10
             or len(set(dfhgti.staxids)) / n_top_taxids >= .95)
    )
    meta_condition = (
        dfmeta.evalue.min() < .1
        and (len(set(dfmeta[dfmeta.MI > 1].staxids)) > 5
             or len(set(dfmetai.staxids)) / n_top_taxids >= .50)
    )
    if meta_condition:
        return "Meta"
    if hgt_condition:
        return "HGT"
    return None

##sequences from the split-interval fasta, keyed by record id
record_dict = SeqIO.to_dict(SeqIO.parse('outputs/split_intervals.fasta', 'fasta'))
##interval names (file name up to ".tsv"), selected by the label contained in the file name
split_files = os.listdir("outputs/round2_diamond_output_split")
meta = [fname.split(".tsv")[0] for fname in split_files if "Meta" in fname]
hgt = [fname.split(".tsv")[0] for fname in split_files if "HGT" in fname]

##re-check the putative HGT intervals in parallel
with mp.Pool(28) as p:
    hgts2 = p.map(check_annot, hgt)
##interval name -> updated annotation
hgt_map = dict(zip(hgt, hgts2))
##chimera ids (first two ';'-separated fields) that keep at least one interval annotated "HGT"
hgt_set = set()
for name, annot in hgt_map.items():
    if annot == "HGT":
        fields = name.split(";")
        hgt_set.add(fields[0] + ";" + fields[1])

##re-check the putative metazoan intervals in parallel
with mp.Pool(28) as p:
    meta2 = p.map(check_annot, meta)
##interval name -> updated annotation
meta_map = dict(zip(meta, meta2))
##chimera ids that keep at least one interval annotated "Meta"
meta_set = set()
for name, annot in meta_map.items():
    if annot == "Meta":
        fields = name.split(";")
        meta_set.add(fields[0] + ";" + fields[1])

##chimeras must carry both a confirmed HGT and a confirmed Meta interval
chimeras_filtered = meta_set & hgt_set



In [9]:
##number of chimeras that have at least one confirmed HGT interval and at least one confirmed Meta interval
len(chimeras_filtered)

717

In [48]:
def get_hgt_data(n):
    """
    Summarise the non-metazoan hits of one HGT interval.

    takes the name of an HGT interval blast dataframe (string) stored in
    round2_diamond_output_split (read with a header row); the interval length
    is taken from the module-level record_dict
    returns summary statistics used for hgt inference, as a list in this order:
      bitscore of the highest-scoring non-metazoan hit,
      evalue, cov, stitle, sscinames, sphylums, skingdoms of the lowest-evalue non-metazoan hit,
      number of metazoan hits,
      number of distinct non-metazoan staxids,
      share of distinct staxids among the first 300 hits that are non-metazoan,
      maximum AI,
      number of distinct staxids with AI > 5

    Fails when no non-metazoan hit survives the filters, because idxmax/idxmin
    then have nothing to select.
    """
    df = pd.read_csv(f"outputs/round2_diamond_output_split/{n}.tsv", sep="\t")
    leng = len(record_dict[n.split('.tsv')[0]].seq)
    df["cov"] = (np.array(df.qend) - np.array(df.qstart) + 1) / leng
    #filter by >30% coverage of the query
    dfo = df[df["cov"] > .30]
    dfo = dfo[~dfo.sphylums.astype(str).str.contains("Arthropoda")]
    dfo = dfo[~dfo.sphylums.astype(str).str.contains("Rotifera")]
    dfo = dfo[dfo.staxids.astype(str) != "nan"]
    ##exclude synthetic sequences
    #staxids is read as text whenever any value is not a plain number (e.g. several
    #';'-separated taxids); comparing text with the integer 32630 never matches, so
    #the comparison is done on a numeric view (non-numeric values become NaN and are kept)
    dfm = dfo[pd.to_numeric(dfo.staxids, errors="coerce") != 32630]

    is_meta = dfm.skingdoms.astype(str).str.contains("Metazoa")
    dfmeta = dfm[is_meta]
    #copy, so that adding the AI column does not write into a slice of dfm
    dfhgt = dfm[~is_meta].copy()
    #AI: log10(best metazoan evalue) - log10(hit evalue); 1e-200 keeps log10 finite for evalue 0
    dfhgt["AI"] = np.log10(dfmeta.evalue.min() + 1e-200) - np.log10(dfhgt.evalue + 1e-200)

    ##get the top 300 hits by lowest evalue
    #NOTE(review): rows are taken in file order and are not re-sorted here
    dfmi = dfm.iloc[0:300, :]
    dfhgti = dfmi[~dfmi.skingdoms.astype(str).str.contains("Metazoa")]

    best_bitscore = dfhgt.loc[dfhgt.bitscore.idxmax(), ['bitscore']]
    best_evalue_hit = dfhgt.loc[dfhgt.evalue.idxmin(), ['evalue', 'cov', 'stitle', 'sscinames', 'sphylums', 'skingdoms']]
    return list(best_bitscore) + list(best_evalue_hit) + [
        dfmeta.shape[0],
        len(set(dfhgt.staxids)),
        len(set(dfhgti.staxids)) / len(set(dfmi.staxids)),
        dfhgt.AI.max(),
        len(set(dfhgt[dfhgt.AI > 5].staxids)),
    ]
    


In [61]:
def get_meta_data(n):
    """
    Summarise the metazoan hits of one metazoan interval.

    takes the name of a metazoan interval blast dataframe (string) stored in
    round2_diamond_output_split (read with a header row); the interval length
    is taken from the module-level record_dict
    returns summary statistics used for hgt inference, as a list in this order:
      bitscore of the highest-scoring metazoan hit,
      evalue, cov, stitle, sscinames, sphylums, skingdoms of the lowest-evalue metazoan hit,
      number of metazoan hits,
      number of distinct metazoan staxids,
      share of distinct staxids among the first 300 hits that are metazoan,
      maximum MI,
      number of distinct staxids with MI > 1

    Fails when no metazoan hit survives the filters, because idxmax/idxmin
    then have nothing to select.
    """
    df = pd.read_csv(f"outputs/round2_diamond_output_split/{n}.tsv", sep="\t")
    leng = len(record_dict[n.split('.tsv')[0]].seq)
    df["cov"] = (np.array(df.qend) - np.array(df.qstart) + 1) / leng
    #filter by >30% coverage of the query
    dfo = df[df["cov"] > .30]
    dfo = dfo[~dfo.sphylums.astype(str).str.contains("Arthropoda")]
    dfo = dfo[~dfo.sphylums.astype(str).str.contains("Rotifera")]
    dfo = dfo[dfo.staxids.astype(str) != "nan"]
    ##exclude synthetic sequences
    #staxids is read as text whenever any value is not a plain number (e.g. several
    #';'-separated taxids); comparing text with the integer 32630 never matches, so
    #the comparison is done on a numeric view (non-numeric values become NaN and are kept)
    dfm = dfo[pd.to_numeric(dfo.staxids, errors="coerce") != 32630]

    is_meta = dfm.skingdoms.astype(str).str.contains("Metazoa")
    #copy, so that adding the MI column does not write into a slice of dfm
    dfmeta = dfm[is_meta].copy()
    dfhgt = dfm[~is_meta]
    #MI: log10(best non-metazoan evalue) - log10(hit evalue); 1e-200 keeps log10 finite for evalue 0
    dfmeta["MI"] = np.log10(dfhgt.evalue.min() + 1e-200) - np.log10(dfmeta.evalue + 1e-200)

    ##get the top 300 hits by lowest evalue
    #NOTE(review): rows are taken in file order and are not re-sorted here
    dfmi = dfm.iloc[0:300, :]
    dfmetai = dfmi[dfmi.skingdoms.astype(str).str.contains("Metazoa")]

    best_bitscore = dfmeta.loc[dfmeta.bitscore.idxmax(), ['bitscore']]
    best_evalue_hit = dfmeta.loc[dfmeta.evalue.idxmin(), ['evalue', 'cov', 'stitle', 'sscinames', 'sphylums', 'skingdoms']]
    return list(best_bitscore) + list(best_evalue_hit) + [
        dfmeta.shape[0],
        len(set(dfmeta.staxids)),
        len(set(dfmetai.staxids)) / len(set(dfmi.staxids)),
        dfmeta.MI.max(),
        len(set(dfmeta[dfmeta.MI > 1].staxids)),
    ]
    


In [51]:
##Write a table of summary statistics for the confirmed HGT intervals of the retained chimeras
hgt_confirmed = [
    name
    for name, annot in hgt_map.items()
    if annot == 'HGT' and ";".join(name.split(';')[0:2]) in chimeras_filtered
]
with mp.Pool(28) as p:
    hgt_data = p.map(get_hgt_data, hgt_confirmed)
##one column per element of the list returned by get_hgt_data, in the same order
hgt_stat_columns = [
    'bitscore_max', 'min_evalue', 'evalue_min_cov', 'evalue_min_title',
    'evalue_min_sciname', 'evalue_min_phylum', 'evalue_min_kingdom',
    'n_meta_hits', 'n_hgt_taxids', 'p_HGT300', 'AI', 'N_AI>5',
]
hgt_df = pd.DataFrame(index=hgt_confirmed)
hgt_df.loc[:, hgt_stat_columns] = hgt_data
hgt_df.to_csv("outputs/round2_blast_statistics_hgt_intervals.tsv", sep="\t")


In [62]:
##Write a table of summary statistics for the confirmed Meta intervals of the retained chimeras
meta_confirmed = [
    name
    for name, annot in meta_map.items()
    if annot == 'Meta' and ";".join(name.split(';')[0:2]) in chimeras_filtered
]
with mp.Pool(28) as p:
    meta_data = p.map(get_meta_data, meta_confirmed)
##one column per element of the list returned by get_meta_data, in the same order
##NOTE(review): the value stored under 'n_hgt_taxids' is get_meta_data's count of distinct
##metazoan staxids; the label is kept unchanged because it is written to the output header
meta_stat_columns = [
    'bitscore_max', 'evalue_min', 'evalue_min_cov', 'evalue_min_title',
    'evalue_min_sciname', 'evalue_min_phylum', 'evalue_min_kingdom',
    'n_meta_hits', 'n_hgt_taxids', 'p_Meta300', 'MI', 'N_MI>1',
]
meta_df = pd.DataFrame(index=meta_confirmed)
meta_df.loc[:, meta_stat_columns] = meta_data
meta_df.to_csv("outputs/round2_blast_statistics_meta_intervals.tsv", sep="\t")


In [10]:
from collections import defaultdict
##make a dict of dicts chimera_name:interval tuple:HGT/Meta annot
chimeras_filtered_dict = defaultdict(dict)
##every retained chimera gets an entry, even if it ends up with no interval
for x in chimeras_filtered:
    chimeras_filtered_dict[x] = {}
##Meta intervals first, then HGT ones, so an interval listed in both keeps the HGT label.
##Intervals are assigned by the exact chimera id (first two ';'-separated fields) rather than
##by a substring test, which would also match e.g. "a;b10;..." when looking for chimera "a;b1".
for xm in list(meta_confirmed) + list(hgt_confirmed):
    x = ";".join(xm.split(";")[0:2])
    if x not in chimeras_filtered_dict:
        continue
    ##"<chimera>;<annot>_<interval>": the interval tuple follows the last "_",
    ##the annotation is the text before the first "_" of the last ';' field
    interval = ast.literal_eval(xm.split("_")[-1])
    chimeras_filtered_dict[x][interval] = xm.split(";")[-1].split("_")[0]
##sort each chimera's intervals by start coordinate
for x in chimeras_filtered:
    combo = chimeras_filtered_dict[x]
    chimeras_filtered_dict[x] = {k: v for k, v in sorted(combo.items(), key=lambda kv: kv[0][0])}

In [None]:
"""
parameter: name of an hgt chimera
assumes chimeras_filtered_dict is sorted 
returns whether any adjacent series of HGT, Meta intervals is found in a non-arthropod sequence (boolean)

"""
def check_non_arthropod_chimera(n,ethresh=1e-2):
    d=chimeras_filtered_dict[n]
    intervals=list(d.keys())
    for i in range(len(d)):
        if d[intervals[i]]=='HGT':
            ints=str(intervals[i]).replace(" ","")
            df=pd.read_csv(f"outputs/round2_diamond_output_split/{n};HGT_{ints}.tsv",sep="\t", names="qseqid sseqid stitle staxids sscinames sphylums skingdoms pident length mismatch gapopen qstart qend sstart send evalue bitscore".split(" "))
            df=df[df.evalue<ethresh]
            hgts=set(df[~df.sphylums.astype(str).str.contains('Arthropoda')]['sseqid'])
            metas=set()
            ##check 
            if i>0 and d[intervals[i-1]]=='Meta' :
                ints=str(intervals[i-1]).replace(" ","")
                df=pd.read_csv(f"outputs/round2_diamond_output_split/{n};Meta_{ints}.tsv",sep="\t", names="qseqid sseqid stitle staxids sscinames sphylums skingdoms pident length mismatch gapopen qstart qend sstart send evalue bitscore".split(" "))
                df=df[df.evalue<ethresh]
                metas=metas|set(df[~df.sphylums.astype(str).str.contains('Arthropoda')]['sseqid'])
            if i<len(d)-1 and d[intervals[i+1]]=='Meta':
                ints=str(intervals[i+1]).replace(" ","")
                df=pd.read_csv(f"outputs/round2_diamond_output_split/{n};Meta_{ints}.tsv",sep="\t", names="qseqid sseqid stitle staxids sscinames sphylums skingdoms pident length mismatch gapopen qstart qend sstart send evalue bitscore".split(" "))
                df=df[df.evalue<ethresh]
                metas=metas|set(df[~df.sphylums.astype(str).str.contains('Arthropoda')]['sseqid'])
    return len(metas&hgts)>0




In [12]:

##run check_non_arthropod_chimera on every filtered chimera using 28 worker processes;
##map returns the results in the iteration order of chimeras_filtered
with mp.Pool(28) as p:
    overlap = p.map(check_non_arthropod_chimera, chimeras_filtered)

In [13]:
##chimera name -> result of check_non_arthropod_chimera
overlap_dict = dict(zip(chimeras_filtered, overlap))

In [14]:
##chimeras for which the overlap check returned a false value, with their interval annotations
no_overlaps = {name for name, found in overlap_dict.items() if not found}
chimeras_filtered_overlaps = {name: chimeras_filtered_dict[name] for name in no_overlaps}

In [24]:
##starting from the chimeras that passed the non-arthropod overlap filter, also drop those whose
##record description contains 'partial', then show how many chimeras remain
chimeras_filtered_overlaps = {
    name: spans
    for name, spans in chimeras_filtered_overlaps.items()
    if 'partial' not in record_dict[name].description
}
len(chimeras_filtered_overlaps)

525

In [28]:
##output a dictionary of chimera intervals
file_path = 'outputs/round2_chimera_intervals.pickle'
with open(file_path, 'wb') as file:
    pickle.dump(chimeras_filtered_overlaps,file)

##output .txt representation of dictionary of chimera intervals
f=open('outputs/round2_chimera_intervals.txt','w')
for k,v in chimeras_filtered_overlaps.items():
    f.write(f'{k}:{v}\n')
f.close()

##output a dictionary of ufll protein lengths, useful for plotting
lmap={x:len(record_dict[x].seq) for x in record_dict}
with open('outputs/length_map.pickle', 'wb') as file:
    pickle.dump(lmap,file)

