In [61]:
import pickle
import pandas as pd
import multiprocessing as mp
import numpy as np
import pickle
from Bio import SeqIO
import os
import subprocess
import ast
import os
from pathlib import Path
import glob

## Correct fasta headers for all arthropod protein fasta

In [4]:

##load every arthropod protein from the original search database, keyed by record id
all_seqs = SeqIO.to_dict(
    SeqIO.parse('outputs/all_arthropod_concatenated_proteins.fa', 'fasta')
)

## correct the double appending of refseq genome accessions to the arthropod concatenated fasta
def _rename_kv(kv):
    orig_id, rec = kv
    new_id = ";".join(orig_id.split(";")[-2:])
    rec.id = rec.name = rec.description = new_id
    return new_id, rec

# Rename every record in parallel; each worker returns (corrected_id, record).
with mp.Pool(20) as pool:
    renamed_pairs = pool.map(_rename_kv, all_seqs.items())

new_seqs = dict(renamed_pairs)
# Overwrite the original FASTA with the corrected headers.
with open("outputs/all_arthropod_concatenated_proteins.fa", "w") as handle:
    SeqIO.write(new_seqs.values(), handle, "fasta")

# Pool workers rename pickled copies, so `all_seqs` is still keyed by the old
# double-prefixed ids. Later cells look sequences up by the corrected ids
# (get_fasta indexes all_seqs with the corrected sseqid), so point the name at
# the corrected mapping instead of requiring a kernel restart and reload.
all_seqs = new_seqs
    
##correct blast outputs and add headers
for n in os.listdir("outputs/round2_diamond_v_arthropod_output_split"):
    # The raw files have no header row, so the column names are supplied here.
    arth = pd.read_csv(
        f"outputs/round2_diamond_v_arthropod_output_split/{n}",
        sep="\t",
        names=[
            "qseqid", "sseqid", "stitle", "pident", "length", "mismatch", "gapopen",
            "qstart", "qend", "sstart", "send", "evalue", "bitscore",
        ],
    )
    # Keep only the last two ";"-separated fields of each subject id.
    arth.sseqid = [";".join(x.split(";")[-2:]) for x in arth.sseqid]
    # The trim above is idempotent, so re-trimming the corrected ids (as the
    # original expression did) is the same as copying them into stitle.
    # NOTE(review): stitle is derived from sseqid, not from the original
    # stitle column - confirm that dropping the original titles is intended.
    arth.stitle = list(arth.sseqid)
    # Written back in place, now with a header row and the index column.
    arth.to_csv(f"outputs/round2_diamond_v_arthropod_output_split/{n}", sep="\t")
    


## Build profile HMMs for each interval

In [24]:
##load filtered round2 blast chimeras
import pickle
file_path = 'outputs/round2_chimera_intervals.pickle'
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this pipeline.
with open(file_path, 'rb') as file:
    chimeras = pickle.load(file)

##build one name per interval: "<chimera key>;<stored value>_<interval key with spaces removed>"
intervals = []
for c in chimeras:
    for i in chimeras[c]:
        interval_name = "".join((c, ";", chimeras[c][i], "_", str(i).replace(" ", "")))
        intervals.append(interval_name)

##load fasta of separated chimera intervals, keyed by record id
interval_queries = SeqIO.to_dict(
    SeqIO.parse('outputs/split_intervals.fasta', 'fasta')
)

 

In [43]:

def get_fasta(n):
    """Append one sequence from ``all_seqs`` to a FASTA file.

    Parameters
    ----------
    n : tuple
        ``(accession, output_path)``. The accession is looked up in the
        module-level ``all_seqs`` mapping; the record is appended to
        ``output_path`` (the file is opened in append mode).

    Returns
    -------
    tuple
        The same ``n`` that was passed in.

    Raises
    ------
    KeyError
        If the accession is not in ``all_seqs``. Nothing is written in that case.
    """
    accession, out_path = n[0], n[1]
    # Resolve the sequence before touching the file, so a missing accession
    # cannot leave a header with no sequence behind it.
    sequence = str(all_seqs[accession].seq)
    # The context manager closes the handle even if the write fails.
    with open(out_path, "a") as handle:
        handle.write(f">{accession}\n{sequence}\n")
    return n

def copy_fasta_with_substr(fasta_file, df, output_file):
    """Write the hit region of every subject sequence in ``df`` to a new FASTA.

    For each record in ``fasta_file`` whose id appears in ``df["sseqid"]``, one
    output record is written per matching row. Each output record holds
    residues ``sstart..send`` (1-based, inclusive) of the full subject
    sequence and is named ``<id>_<sstart>_<send>`` with an empty description.

    Parameters
    ----------
    fasta_file : str
        Path of the input FASTA.
    df : pandas.DataFrame
        Hit table with at least ``sseqid``, ``sstart`` and ``send`` columns.
    output_file : str
        Path of the FASTA to write (overwritten if it exists).
    """
    # Index the hits by subject once instead of scanning the whole column for
    # every FASTA record. Row order inside each group is the order in df.
    hits_by_subject = {
        subject: rows for subject, rows in df.groupby("sseqid", sort=False)
    }
    with open(output_file, "w") as out_handle:
        for seq_record in SeqIO.parse(fasta_file, "fasta"):
            seq_name = seq_record.id
            hits = hits_by_subject.get(seq_name)
            if hits is None:
                continue
            # Keep the untouched full-length sequence. The record is reused
            # for every hit, and slicing its already-sliced sequence would give
            # wrong regions for the second and later hits on this subject.
            full_seq = seq_record.seq
            for sstart, send in zip(hits["sstart"], hits["send"]):
                sstart, send = int(sstart), int(send)
                seq_record.id = f"{seq_name}_{sstart}_{send}"
                seq_record.description = ""
                seq_record.seq = full_seq[sstart - 1:send]
                SeqIO.write(seq_record, out_handle, "fasta")
                    
def get_muscle(n):
    """Run ``scripts/hmmbuild_muscle.sh`` on the hmmbuild directory of interval ``n``.

    The directory passed to the script is ``outputs/hmmbuild/<protein>/<n>``,
    where ``<protein>`` is the first two ``;``-separated fields of ``n``.
    The script's exit status is not checked. Returns ``None``.
    """
    leading_fields = n.split(";", 2)[:2]
    protein = ";".join(leading_fields)
    subprocess.run(["sh", "scripts/hmmbuild_muscle.sh", f"outputs/hmmbuild/{protein}/{n}"])
    
def get_hmm_profile(n):
    """Run ``scripts/hmmbuild.sh`` on the hmmbuild directory of interval ``n``.

    The directory passed to the script is ``outputs/hmmbuild/<protein>/<n>``,
    where ``<protein>`` is the first two ``;``-separated fields of ``n``.
    The script's exit status is not checked. Returns ``None``.
    """
    leading_fields = n.split(";", 2)[:2]
    protein = ";".join(leading_fields)
    subprocess.run(["sh", "scripts/hmmbuild.sh", f"outputs/hmmbuild/{protein}/{n}"])
    
def write_fastas_run_hmm(n):
    """Build a profile HMM for interval ``n`` from its arthropod DIAMOND hits.

    Steps, all under ``outputs/hmmbuild/<protein>/<n>/``:

    1. Load the hits in ``outputs/round2_diamond_output_split/<n>.tsv``, keep
       those covering more than 30% of the query interval, drop taxid 32630
       (synthetic sequences), and take the rows whose phylum does not contain
       "Arthropoda".
    2. Load the arthropod-database hits in
       ``outputs/round2_diamond_v_arthropod_output_split/<n>.tsv``, keep those
       with more than 30% coverage and a bitscore >= the best non-arthropod
       bitscore.
    3. Write the selected subject sequences to ``seq.fasta``, de-duplicate
       them into ``unique_seq.fasta`` with ``scripts/remove_redundant_seqs.awk``,
       and cut out the aligned regions into ``sub_seq.fasta``.
    4. Run ``get_muscle`` and ``get_hmm_profile`` on the interval.

    Any ordinary exception is caught and the interval name is appended to
    ``failed_hmmbuild.txt``, so one bad interval does not abort a pool run.

    Parameters
    ----------
    n : str
        Interval name; its first two ``;``-separated fields are the
        genome;protein prefix. Must be a key of ``interval_queries``.

    Returns
    -------
    None
    """
    try:
        ##protein accession prefix (genome;protein)
        protein = ";".join(n.split(";")[0:2])
        out_dir = f"outputs/hmmbuild/{protein}/{n}"
        query_len = len(interval_queries[n].seq)

        ##load non-arthropod hits
        df = pd.read_csv(f"outputs/round2_diamond_output_split/{n}.tsv",sep="\t", names="qseqid sseqid stitle staxids sscinames sphylums skingdoms pident length mismatch gapopen qstart qend sstart send evalue bitscore".split(" "))
        df["cover"] = (np.array(df.qend) - np.array(df.qstart) + 1) / query_len
        df = df[df.cover > .30]
        ##exclude synthetic seqs
        df = df[df.staxids.astype(str) != "32630"]
        non_arth = df[~df.astype(str).sphylums.str.contains("Arthropoda")]

        ##Load arthropod hits from diamond blast to arthropod database
        arth = pd.read_csv(f"outputs/round2_diamond_v_arthropod_output_split/{n}.tsv",sep="\t")
        arth["cover"] = (np.array(arth.qend) - np.array(arth.qstart) + 1) / query_len
        arth = arth[arth.cover > .30]

        ##arthropod seqs with a bitscore at least as high as the top non-arthropod sequence
        # NOTE(review): with no non-arthropod hits, max() is NaN and this
        # comparison keeps no rows - confirm that is the intended outcome.
        arth = arth[arth.bitscore >= non_arth.bitscore.max()]

        # Creates the protein and interval directories (and any missing
        # parents) in one call; directories that already exist are fine.
        os.makedirs(out_dir, exist_ok=True)

        # get_fasta appends, so seq.fasta accumulates one record per subject.
        for accession in set(arth.sseqid):
            get_fasta((accession, f"{out_dir}/seq.fasta"))
        with open(f"{out_dir}/unique_seq.fasta", "w") as outfile:
            # Run awk command to remove redundant sequences
            subprocess.run(["awk", "-f", "scripts/remove_redundant_seqs.awk", f"{out_dir}/seq.fasta"], stdout=outfile)
        # extract portion of outputs aligning with query interval
        copy_fasta_with_substr(f"{out_dir}/unique_seq.fasta", arth, f"{out_dir}/sub_seq.fasta")
        get_muscle(n)
        get_hmm_profile(n)
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still stop the worker. Failed intervals are recorded.
        with open('failed_hmmbuild.txt', 'a') as failed:
            failed.write(n + "\n")

    return


In [6]:
# exist_ok makes this cell safe to re-run; os.mkdir raised FileExistsError on the second run.
os.makedirs('outputs/hmmbuild', exist_ok=True)

In [2]:
# td ("to do") is the work list for this pass: every interval.
td = intervals
##build the HMM for every interval in parallel (20 worker processes)
with mp.Pool(processes=20) as pool:
    r = pool.map(write_fastas_run_hmm, td)

In [19]:
##collect all intervals whose directory already contains sub_seq.hmm

base_dir = Path("outputs/hmmbuild")
# "*/*" matches entries two levels below base_dir, i.e. <protein>/<interval>.
level3_dirs = [p for p in base_dir.glob("*/*") if p.is_dir()]
done = set()
lens = []
for d in level3_dirs:
    if "sub_seq.hmm" not in os.listdir(str(d)):
        continue
    # The last path component is the interval name.
    done.add(str(d).rsplit("/", 1)[-1])
   

In [21]:
##collect intervals where the MSAs fail to run within 20 hours, run with muscle super5
# Still to do = submitted intervals minus finished ones. The previous
# expression (done - td) had the operands reversed and selected finished
# intervals instead of unfinished ones.
td = list(set(td) - set(done))
def get_muscle(n):
    """Run ``scripts/hmmbuild_muscle_super5.sh`` on the hmmbuild directory of interval ``n``.

    This replaces the earlier ``get_muscle`` definition for the rerun below.
    The directory passed to the script is ``outputs/hmmbuild/<protein>/<n>``,
    where ``<protein>`` is the first two ``;``-separated fields of ``n``.
    The script's exit status is not checked. Returns ``None``.
    """
    leading_fields = n.split(";", 2)[:2]
    protein = ";".join(leading_fields)
    subprocess.run(["sh", "scripts/hmmbuild_muscle_super5.sh", f"outputs/hmmbuild/{protein}/{n}"])

# Rerun the HMM build for the selected intervals, now with the super5 get_muscle.
with mp.Pool(processes=20) as pool:
    r = pool.map(write_fastas_run_hmm, td)

## Concatenate profile hmms and submit to hmmsearch

In [1]:

##collect all hmms that have successfully run
base_dir=Path("outputs/hmmbuild")
level3_dirs = [
    p for p in base_dir.glob("*/*")    # 3 segments under base_dir
    if p.is_dir()
]
done=set()
lens=[]
for d in level3_dirs:
    if "sub_seq.hmm" in os.listdir(str(d))  :
        done.add(str(d)+"/sub_seq.hmm")
    
a="~".join(done)
##concatenate all the hmms into a single file for hmmsearch agaisnt the arthropod-only database
!sh scripts/concat_hmms.sh "all_concatenated".hmm "$a"
        
##make concatenated hmm files of at most 10 hmms to run in parallel against nr        
def chunk(lst, size=10):
    """Generate consecutive slices of `lst`, each at most `size` items long."""
    yield from (lst[lo:lo + size] for lo in range(0, len(lst), size))

chunks = list(chunk(list(done), 10))

chunk_map={i:chunks[i] for i in range(len(chunks))}

!mkdir concatenated_hmms
for i in chunk_map:
    j="~".join(chunk_map[i])
    ix=i+1
    !scripts/concat_hmms.sh "concatenated_hmms/$ix".hmm "$j"
    

In [6]:
##run hmmsearch vs nr on the split concatenated profile hmms in parallel
# Creates the hmmsearchout_concat directory, then submits
# scripts/hmmsearch_array.sbatch with sbatch.
# NOTE(review): presumably the batch script runs one task per
# concatenated_hmms/<N>.hmm and writes into hmmsearchout_concat - confirm
# against the script.
!mkdir hmmsearchout_concat
!sbatch scripts/hmmsearch_array.sbatch

Submitted batch job 13970905


In [12]:
##run hmmsearch vs custom arthropod proteome
# Submits scripts/hmmsearch_vs_arthropoda.sh with sbatch.
# NOTE(review): presumably this searches the all-HMM concatenation against the
# arthropod protein FASTA - confirm against the script.
!sbatch scripts/hmmsearch_vs_arthropoda.sh

Submitted batch job 14150084


In [23]:
##split hmmsearch outputs from the arthropoda search once that job has finished
# Passes hmmsearch_v_arthropod_db.domtblout and the outputs/hmmsearch_v_arthropod
# destination to scripts/split_hmmer_csv.sh.
!sh scripts/split_hmmer_csv.sh hmmsearch_v_arthropod_db.domtblout outputs/hmmsearch_v_arthropod 

In [None]:
##split hmmsearch outputs from NR search when done
def split_hmmer_result(x):
    """Split one NR hmmsearch result file with ``scripts/split_hmmer_csv.sh``.

    Parameters
    ----------
    x : str
        File name inside ``hmmsearchout_concat``.

    The script receives ``hmmsearchout_concat/<x>`` and the destination
    ``outputs/hmmsearch_v_nr``. Its exit status is not checked. Returns ``None``.
    """
    # subprocess with an argument list replaces the IPython `!` magic. It runs
    # in plain Python worker processes and passes the file name as a single
    # argument with no shell interpolation.
    subprocess.run(["sh", "scripts/split_hmmer_csv.sh", f"hmmsearchout_concat/{x}", "outputs/hmmsearch_v_nr"])
import multiprocessing as mp
# Work list: every entry of hmmsearchout_concat whose name contains ".domtblout".
td = [fname for fname in os.listdir('hmmsearchout_concat') if '.domtblout' in fname]
with mp.Pool(processes=20) as pool:
    result = pool.map(split_hmmer_result, td)
