In [1]:
import pandas as pd
import multiprocessing as mp
import os

In [10]:
# Load the tab-separated phylogenetic table; read_table uses "\t" as its default separator.
df = pd.read_table("phylogenetic_data_with_substitutions.tsv")

In [14]:
# Create the hmmsearch output directory (no-op if it already exists).
# Done in Python rather than via the shell so a failure raises instead of only printing.
os.makedirs("hmmsearch_results", exist_ok=True)

In [15]:
import gzip
import io
from pathlib import Path
from typing import Iterable, List, Set, TextIO, Union


def extract_target_names(
    hmmer_output: Union[str, Path],
    *,
    unique: bool = False
) -> Set[str]:
    """
    Collect the distinct 'target name' tokens from an HMMER tabular output file.

    Parameters
    ----------
    hmmer_output : str or pathlib.Path
        Path to a whitespace-delimited HMMER table (e.g. a ``--tblout`` or
        ``--domtblout`` file).  Lines starting with ``#`` and blank lines are
        skipped; the first whitespace-separated token of every other line is
        taken as the target name.
    unique : bool, optional
        Accepted for backward compatibility only.  The result is a set, so
        names are always de-duplicated regardless of this flag.

    Returns
    -------
    Set[str]
        Unordered set of target names (empty when the file has no data lines).
        A set is returned so that callers can intersect results with ``&``.
    """
    targets: Set[str] = set()
    with open(hmmer_output) as fh:
        for ln in fh:
            if ln.startswith("#") or not ln.strip():
                continue                      # ignore headers / blank lines
            targets.add(ln.split()[0])        # first column = target name
    return targets


In [16]:
## run hmmsearch with OSK and LOTUS domain HMMs obtained from https://github.com/extavourlab/Oskar_Evolution/tree/main
## returns a set of OSK+LOTUS+ hits as putative oskars
def get_oskars(a):
    try:
        if 'GCA_' in a or 'GCF_' in a:
            f=f"../ncbi_dataset/data/{a}/protein_longest.faa"
        else:
            f=f"/net/bos-nfsisilon/ifs/rc_labs/extavour_lab/rkapoor/IF_project_iteration2/TSA_transdec/{a}/{a}.fasta.transdecoder.pep"
        if f'{a}_LOTUS' not in os.listdir(f'hmmsearch_results'):
            !singularity exec /cvmfs/singularity.galaxyproject.org/h/m/hmmer:3.4--hdbdd923_2 hmmsearch --domtblout "hmmsearch_results/""$a""_LOTUS" -E .05 'LOTUS_CONSENSUS.hmm' "$f" 
        if f'{a}_OSK' not in os.listdir(f'hmmsearch_results'):
            !singularity exec /cvmfs/singularity.galaxyproject.org/h/m/hmmer:3.4--hdbdd923_2 hmmsearch --domtblout "hmmsearch_results/""$a""_OSK" -E .05 'OSK_CONSENSUS.hmm' "$f" 
        LOTUS=extract_target_names(f"hmmsearch_results/{a}_LOTUS")
        OSK=extract_target_names(f"hmmsearch_results/{a}_OSK")
        return OSK&LOTUS
    except:
        f=open('fails.txt','a')
        f.write(a+"\n")
        f.close()
        

In [18]:
## make an insect-only dataframe and populate it with oskar accessions
# .copy() gives an independent frame, so the assignments below never write
# through (or warn about writing through) a slice of df.
dfi = df[df['class'] == 'Insecta'].copy()
##substitute clogmia for more recent genome
clogmia_rows = dfi.index[dfi.species.str.contains('Clogmia')]
dfi.loc[clogmia_rows[0], 'NCBI accession'] = "GCA_022818195.1"
dfi = dfi.set_index("NCBI accession")
td = list(dfi.index)

# One hmmsearch job per accession, spread over 30 worker processes.
with mp.Pool(30) as pool:
    result = pool.map(get_oskars, td)
result = dict(zip(td, result))
# Store each hit set as text; a failed search is stored as the text "None".
dfi['oskar_accessions'] = [str(result[accession]) for accession in dfi.index]

In [33]:
## submit unannotated genomes to augustus webserver, then deposit in ncbi_dataset/data folder 
unannotated=['GCA_022818195.1',
 'GCA_964304675.1',
 'GCA_045786645.1',
 'GCA_000149185.1',
 'GCA_964258955.1',
 'GCA_039877355.1',
 'GCA_029207805.1']
##Update busco scores for oskar table
for x in unannotated:
    !sh run_busco_annotated_genome.sh "$x"
for x in unannotated:
    f=open(f"/n/netscratch/extavour_lab/Everyone/Rishabh/BUSCO_outputs/{x}/short_summary.specific.arthropoda_odb12.{x}.txt","r").readlines()
    b=float([x for x in f if "C:" in x][0].split("S:")[1].split("%")[0])
    dfi.loc[x,'BUSCO_complete_single_copy']=b

In [38]:
# An accession with no shared hits was stored as the text "set()"; blank those cells.
dfi['oskar_accessions'] = dfi['oskar_accessions'].replace({"set()": ""})

In [46]:
## write oskars to output fasta
import ast
from Bio import SeqIO

with open('oskar_proteins.fa', 'w') as out_fasta:
    for index, row in dfi.iterrows():
        a = row['oskar_accessions']
        # Rows with a blank or missing (non-string) entry have nothing to export.
        if not isinstance(a, str) or a == '':
            continue
        hits = ast.literal_eval(a)
        # A failed search is stored as the text "None", which parses to None.
        if not hits:
            continue
        if 'GCA_' in index or 'GCF_' in index:
            fa = f"../ncbi_dataset/data/{index}/protein_longest.faa"
        else:
            fa = f"/net/bos-nfsisilon/ifs/rc_labs/extavour_lab/rkapoor/IF_project_iteration2/TSA_transdec/{index}/{index}.fasta.transdecoder.pep"

        # Parse the proteome once per dataset rather than once per hit.
        seq_dict = SeqIO.to_dict(SeqIO.parse(fa, "fasta"))
        # Sorted so the output order is reproducible (set order is arbitrary).
        for ai in sorted(hits):
            out_fasta.write(f">{ai} {row.species}\n")
            out_fasta.write(str(seq_dict[ai].seq) + "\n")


In [58]:
# Read the Blondel Table S5 CSV keyed by species, then keep only the species
# that also occur in dfi.
df_blondel = pd.read_csv("blondel_tableS5.csv", index_col='species')
shared_species = set(dfi.species) & set(df_blondel.index)
dfb = df_blondel.loc[list(shared_species), :]
# Total 'filtered_hits' over all rows that share a species label.
hits_by_idx = dfb['filtered_hits'].groupby(level=0).sum()

In [73]:
# Blondel hit count per species (NaN where the species is not in hits_by_idx).
# Computed column-wise: the previous row loop read counts from a stale row copy,
# which fails on a fresh kernel where the column does not exist yet.
dfi['blondel_oskar_count'] = dfi['species'].map(hits_by_idx)

accessions = dfi['oskar_accessions'].astype(str)
# Coerce to numbers so the comparisons below also work on blank/text entries.
blondel_counts = pd.to_numeric(dfi['blondel_oskar_count'], errors='coerce')

# Present: an HMM hit set was recorded ("{...}") or Blondel reports at least one hit.
has_hmm_hit = accessions.str.contains("{", regex=False)
present = has_hmm_hit | (blondel_counts > 0)
# Absent: not present, and either (no HMM hits and no Blondel record) or a Blondel count of 0.
absent = ~present & (((accessions == "") & blondel_counts.isna()) | (blondel_counts == 0))

# Rows matching neither rule (e.g. a failed search with no Blondel record) stay NaN.
oskar_present = pd.Series(float('nan'), index=dfi.index, dtype=object)
oskar_present[present] = True
oskar_present[absent] = False
dfi['oskar_present'] = oskar_present
 
        
        

In [75]:
# Replace missing Blondel counts with empty strings.
# Note: the column then holds a mix of numbers and '' rather than pure numbers.
blondel_column = dfi['blondel_oskar_count']
dfi['blondel_oskar_count'] = blondel_column.fillna('')

In [77]:
# Columns written to the summary table, in output order.
export_columns = [
    'species',
    'subs',
    'BUSCO_complete_single_copy',
    'family',
    'order',
    'oskar_accessions',
    'blondel_oskar_count',
    'oskar_present',
    'iq_tree_label',
]
dfi.loc[:, export_columns].to_csv('oskar_data.tsv', sep='\t')

## Perform Mann-Whitney comparison of BUSCO completeness by oskar status 

In [37]:
from scipy.stats import mannwhitneyu
import numpy as np

# Split the BUSCO single-copy completeness values by oskar status.
with_oskar = dfi.oskar_present == True
without_oskar = dfi.oskar_present == False
osk_BUSCO = list(dfi.loc[with_oskar, 'BUSCO_complete_single_copy'])
no_osk_BUSCO = list(dfi.loc[without_oskar, 'BUSCO_complete_single_copy'])

# Group means (oskar first), then the two-sided Mann-Whitney U test as the cell output.
print(np.mean(osk_BUSCO), np.mean(no_osk_BUSCO))
mannwhitneyu(no_osk_BUSCO, osk_BUSCO, alternative="two-sided")

83.2945945945946 78.09318181818182


MannwhitneyuResult(statistic=630.5, pvalue=0.08271598388677139)

## TBLASTN of Phlebotomus papatasi oskar protein vs Clogmia albipunctata genome

In [5]:
%%bash
# Build a nucleotide BLAST database named clogmia_blast from the GCA_965637365.1 genomic FASTA.
singularity exec /cvmfs/singularity.galaxyproject.org/b/l/blast:2.9.0--pl526he19e7b1_7 \
    makeblastdb \
    -in ncbi_dataset/data/GCA_965637365.1/GCA_965637365.1_idCloAlbi1.1_genomic.fna \
    -dbtype nucl \
    -parse_seqids \
    -out clogmia_blast



Building a new DB, current time: 07/11/2025 18:19:20
New DB name:   /n/netscratch/extavour_lab/Everyone/Rishabh/clogmia_blast
New DB title:  ncbi_dataset/data/GCA_965637365.1/GCA_965637365.1_idCloAlbi1.1_genomic.fna
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 83 sequences in 3.70707 seconds.




In [8]:
%%bash
# Search the clogmia_blast database with the protein query in Phlebotomus.fasta (default tblastn settings).
singularity exec /cvmfs/singularity.galaxyproject.org/b/l/blast:2.9.0--pl526he19e7b1_7 \
    tblastn \
    -query "Phlebotomus.fasta" \
    -db clogmia_blast

TBLASTN 2.9.0+


Reference: Stephen F. Altschul, Thomas L. Madden, Alejandro A.
Schaffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J.
Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of
protein database search programs", Nucleic Acids Res. 25:3389-3402.



Database: ncbi_dataset/data/GCA_965637365.1/GCA_965637365.1_idCloAlbi1.1_genom
ic.fna
           83 sequences; 309,212,404 total letters



Query= XP_055701593.1 maternal effect protein oskar [Phlebotomus papatasi]

Length=342
                                                                      Score        E
Sequences producing significant alignments:                          (Bits)     Value

OZ281279.1 Clogmia albipunctata genome assembly, chromosome: 2        32.7       3.8  


>OZ281279.1 Clogmia albipunctata genome assembly, chromosome: 2
Length=51435656

 Score = 32.7 bits (73),  Expect = 3.8, Method: Compositional matrix adjust.
 Identities = 17/57 (30%), Positives = 31/57 (54%), Gaps = 0/57 (0%)
 Frame



In [31]:
# BUSCO single-copy completeness values split by oskar status
# (the same two lists the Mann-Whitney cell builds).
with_oskar = dfi.oskar_present == True
without_oskar = dfi.oskar_present == False
osk_BUSCO = list(dfi.loc[with_oskar, 'BUSCO_complete_single_copy'])
no_osk_BUSCO = list(dfi.loc[without_oskar, 'BUSCO_complete_single_copy'])