In [1]:
import pandas as pd
import multiprocessing as mp
import os

In [2]:
df=pd.read_csv("phylogenetic_data_with_substitutions.tsv",sep="\t",index_col=0)

In [12]:
!mkdir hmmsearch_results

In [3]:
from pathlib import Path
from typing import Iterable, List, TextIO, Union
import gzip
import io


def extract_target_names(
    hmmer_output: Union[str, Path],
    *,
    unique: bool = False
) -> List[str]:
    """
    Extract every ‘target name’ from an HMMER output file.

    Parameters
    ----------
    hmmer_output : str or pathlib.Path
        Path to a hmmsearch/hmmscan screen, --tblout or --domtblout file.
    unique : bool, optional
        If True, de-duplicates while preserving the order of first appearance.

    Returns
    -------
    List[str]
        A list of target names (strings).  Order matches the file.
    """
    names: List[str] = []
    with open(hmmer_output) as fh:
        for ln in fh:
            if ln.startswith("#") or not ln.strip():
                continue                      # ignore headers / blank lines
            names.append(ln.split()[0])       # first column = target name

    if unique:
        seen = set()
        names = [n for n in names if not (n in seen or seen.add(n))]

    return set(names)


In [15]:
def get_oskars(a):
    try:
        if 'GCA_' in a or 'GCF_' in a:
            f=f"../ncbi_dataset/data/{a}/protein_longest.faa"
        else:
            f=f"/net/bos-nfsisilon/ifs/rc_labs/extavour_lab/rkapoor/IF_project_iteration2/TSA_transdec/{a}/{a}.fasta.transdecoder.pep"
        if f'{a}_LOTUS' not in os.listdir(f'hmmsearch_results'):
            !singularity exec /cvmfs/singularity.galaxyproject.org/h/m/hmmer:3.4--hdbdd923_2 hmmsearch --domtblout "hmmsearch_results/""$a""_LOTUS" -E .05 'LOTUS_CONSENSUS.hmm' "$f" 
        if f'{a}_OSK' not in os.listdir(f'hmmsearch_results'):
            !singularity exec /cvmfs/singularity.galaxyproject.org/h/m/hmmer:3.4--hdbdd923_2 hmmsearch --domtblout "hmmsearch_results/""$a""_OSK" -E .05 'OSK_CONSENSUS.hmm' "$f" 
        LOTUS=extract_target_names(f"hmmsearch_results/{a}_LOTUS")
        OSK=extract_target_names(f"hmmsearch_results/{a}_OSK")
        return OSK&LOTUS
    except:
        f=open('fails.txt','a')
        f.write(a+"\n")
        f.close()
        

In [60]:
## make an insect-only dataframe and populate it with oskar accessions
dfi=df[df['class']=='Insecta']
dfi.loc[dfi[dfi.species.str.contains('Clogmia')].index.values[0],'NCBI accession']="GCA_022818195.1"
dfi=dfi.set_index("NCBI accession")
td=list(dfi.index)

with mp.Pool(30) as pool:
    result=pool.map(get_oskars,td)
result={x:y for x,y in zip(td, result)}
for index, row in dfi.iterrows():
    dfi.loc[index,'oskar_accessions']=str(result[index])

In [71]:
re_do_busco=['GCA_022818195.1',
 'GCA_964304675.1',
 'GCA_045786645.1',
 'GCA_000149185.1',
 'GCA_964258955.1',
 'GCA_039877355.1',
 'GCA_029207805.1']

In [72]:
!mkdir BUSCO_outputs

In [74]:
re_do_busco=['GCA_022818195.1',
 'GCA_964304675.1',
 'GCA_045786645.1',
 'GCA_000149185.1',
 'GCA_964258955.1',
 'GCA_039877355.1',
 'GCA_029207805.1']
for x in re_do_busco:
    !sh run_busco_annotated_genome.sh "$x"

  "cannot import name '(?P<module_name>[\w]+)", err.msg
2025-07-14 23:00:40 INFO:	***** Start a BUSCO v5.8.3 analysis, current time: 07/14/2025 23:00:40 *****
2025-07-14 23:00:40 INFO:	Configuring BUSCO with local environment
2025-07-14 23:00:40 INFO:	Running proteins mode
2025-07-14 23:00:40 INFO:	'Force' option selected; overwriting previous results directory
2025-07-14 23:00:40 INFO:	Input file is /n/netscratch/extavour_lab/Everyone/Rishabh/ncbi_dataset/data/GCA_022818195.1/protein_longest.faa
2025-07-14 23:00:40 ERROR:	Unable to run BUSCO in offline mode. Dataset /n/netscratch/extavour_lab/Everyone/Rishabh/panarthropoda_gc_specification_evolution/busco_downloads/lineages/arthropoda_odb12 does not exist.
2025-07-14 23:00:40 ERROR:	BUSCO analysis failed!
2025-07-14 23:00:40 ERROR:	Check the logs, read the user guide (https://busco.ezlab.org/busco_userguide.html), and check the BUSCO issue board on https://gitlab.com/ezlab/busco/issues

  "cannot import name '(?P<module_name>[\w]+)", 

In [33]:
LOTUS=extract_target_names(f"hmmsearch_results/{a}_LOTUS")

In [35]:
OSK=extract_target_names(f"hmmsearch_results/{a}_OSK")

In [46]:
df=pd.read_csv("phylogenetic_data_with_substitutions.tsv",sep="\t")
dfi=df[df['class']=='Insecta']
dfi.loc[dfi[dfi.species.str.contains('Clogmia')].index.values[0],'NCBI accession']="GCA_022818195.1"

In [47]:
dfi=df[df['class']=='Insecta']

In [48]:
dfi.loc[dfi[dfi.species.str.contains('Clogmia')].index.values[0],'NCBI accession']="GCA_022818195.1"
dfi=dfi.set_index("NCBI accession")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [26]:
dfi=dfi.set_index("NCBI accession")

In [49]:
dfi

Unnamed: 0_level_0,species,subs,iq_tree_label,data_type,BUSCO_complete_single_copy,family,order,class,phylum,GC_mech,Germ_band
NCBI accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
GCA_963669975.1,Acanthoscelides obtectus,,Acanthoscelides_obtectus,annotated_genome,81.0,Chrysomelidae,Coleoptera,Insecta,Arthropoda,GP,long
GHUU01.1,Acheta domesticus,,Acheta_domesticus,TSA,72.9,Gryllidae,Orthoptera,Insecta,Arthropoda,IND,intermediate
GCF_005508785.2,Acyrthosiphon pisum,,Acyrthosiphon_pisum,annotated_genome,88.8,Aphididae,Hemiptera,Insecta,Arthropoda,GP,short
GCF_002204515.2,Aedes aegypti,,Aedes_aegypti,annotated_genome,92.6,Culicidae,Diptera,Insecta,Arthropoda,GP,long
GCF_943734735.2,Anopheles gambiae,,Anopheles_gambiae,annotated_genome,97.2,Culicidae,Diptera,Insecta,Arthropoda,GP,long
...,...,...,...,...,...,...,...,...,...,...,...
GDRS01.1,Stenopsychodes sp. AD-2015,{'Stenopsyche griseipennis'},Stenopsychidae,TSA,54.2,Stenopsychidae,Trichoptera,Insecta,Arthropoda,IND,
GAWW02.1,Tenthredo koehleri,{'Nematus ribesii'},Tenthredinidae,TSA,79.9,Tenthredinidae,Hymenoptera,Insecta,Arthropoda,IND,
GCF_040414725.1,Anabrus simplex,{'Conocephalus brevipennis'},Tettigoniidae,annotated_genome,96.3,Tettigoniidae,Orthoptera,Insecta,Arthropoda,IND,
GDPF01.1,Megasternum obscurum,{'Hydrophilus piceus'},Hydrophilidae,TSA,68.4,Hydrophilidae,Coleoptera,Insecta,Arthropoda,IND,short


In [32]:
!cp  ../augustus/augustus.aa ../ncbi_dataset/data/GCA_022818195.1/protein_longest.faa

In [53]:
td=list(dfi.index)
with mp.Pool(30) as pool:
    result=pool.map(get_oskars,td)
result={x:y for x,y in zip(td, result)}
for index, row in dfi.iterrows():
    dfi.loc[index,'oskar_accessions']=str(result[index])

In [54]:
result={x:y for x,y in zip(td, result)}

In [52]:
dfi

Unnamed: 0_level_0,species,subs,iq_tree_label,data_type,BUSCO_complete_single_copy,family,order,class,phylum,GC_mech,Germ_band
NCBI accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
GCA_963669975.1,Acanthoscelides obtectus,,Acanthoscelides_obtectus,annotated_genome,81.0,Chrysomelidae,Coleoptera,Insecta,Arthropoda,GP,long
GHUU01.1,Acheta domesticus,,Acheta_domesticus,TSA,72.9,Gryllidae,Orthoptera,Insecta,Arthropoda,IND,intermediate
GCF_005508785.2,Acyrthosiphon pisum,,Acyrthosiphon_pisum,annotated_genome,88.8,Aphididae,Hemiptera,Insecta,Arthropoda,GP,short
GCF_002204515.2,Aedes aegypti,,Aedes_aegypti,annotated_genome,92.6,Culicidae,Diptera,Insecta,Arthropoda,GP,long
GCF_943734735.2,Anopheles gambiae,,Anopheles_gambiae,annotated_genome,97.2,Culicidae,Diptera,Insecta,Arthropoda,GP,long
...,...,...,...,...,...,...,...,...,...,...,...
GDRS01.1,Stenopsychodes sp. AD-2015,{'Stenopsyche griseipennis'},Stenopsychidae,TSA,54.2,Stenopsychidae,Trichoptera,Insecta,Arthropoda,IND,
GAWW02.1,Tenthredo koehleri,{'Nematus ribesii'},Tenthredinidae,TSA,79.9,Tenthredinidae,Hymenoptera,Insecta,Arthropoda,IND,
GCF_040414725.1,Anabrus simplex,{'Conocephalus brevipennis'},Tettigoniidae,annotated_genome,96.3,Tettigoniidae,Orthoptera,Insecta,Arthropoda,IND,
GDPF01.1,Megasternum obscurum,{'Hydrophilus piceus'},Hydrophilidae,TSA,68.4,Hydrophilidae,Coleoptera,Insecta,Arthropoda,IND,short


In [43]:
result['GCF_000648695.1']

'GCF_000648695.1'

In [55]:
result={x:y for x,y in zip(td, result)}
for index, row in dfi.iterrows():
    dfi.loc[index,'oskar_accessions']=str(result[index])
dfi

Unnamed: 0_level_0,species,subs,iq_tree_label,data_type,BUSCO_complete_single_copy,family,order,class,phylum,GC_mech,Germ_band,oskar_accessions
NCBI accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
GCA_963669975.1,Acanthoscelides obtectus,,Acanthoscelides_obtectus,annotated_genome,81.0,Chrysomelidae,Coleoptera,Insecta,Arthropoda,GP,long,{'CAK1628125.1'}
GHUU01.1,Acheta domesticus,,Acheta_domesticus,TSA,72.9,Gryllidae,Orthoptera,Insecta,Arthropoda,IND,intermediate,{'GHUU01.1;GHUU01020574.1.p1'}
GCF_005508785.2,Acyrthosiphon pisum,,Acyrthosiphon_pisum,annotated_genome,88.8,Aphididae,Hemiptera,Insecta,Arthropoda,GP,short,set()
GCF_002204515.2,Aedes aegypti,,Aedes_aegypti,annotated_genome,92.6,Culicidae,Diptera,Insecta,Arthropoda,GP,long,{'XP_001656415.1'}
GCF_943734735.2,Anopheles gambiae,,Anopheles_gambiae,annotated_genome,97.2,Culicidae,Diptera,Insecta,Arthropoda,GP,long,{'XP_313289.4'}
...,...,...,...,...,...,...,...,...,...,...,...,...
GDRS01.1,Stenopsychodes sp. AD-2015,{'Stenopsyche griseipennis'},Stenopsychidae,TSA,54.2,Stenopsychidae,Trichoptera,Insecta,Arthropoda,IND,,{'GDRS01.1;GDRS01002278.1.p1'}
GAWW02.1,Tenthredo koehleri,{'Nematus ribesii'},Tenthredinidae,TSA,79.9,Tenthredinidae,Hymenoptera,Insecta,Arthropoda,IND,,{'GAWW02.1;GAWW02024808.1.p1'}
GCF_040414725.1,Anabrus simplex,{'Conocephalus brevipennis'},Tettigoniidae,annotated_genome,96.3,Tettigoniidae,Orthoptera,Insecta,Arthropoda,IND,,{'XP_068087189.1'}
GDPF01.1,Megasternum obscurum,{'Hydrophilus piceus'},Hydrophilidae,TSA,68.4,Hydrophilidae,Coleoptera,Insecta,Arthropoda,IND,short,set()


In [9]:

for index, row in dfi.iterrows():
    try:
        dfi.loc[index,'oskar']=str(get_oskars(index))
    except:
        print(index,row['species'])
        continue
    

GCA_963669975.1 Acanthoscelides obtectus
GHUU01.1 Acheta domesticus
GCF_005508785.2 Acyrthosiphon pisum
GCF_002204515.2 Aedes aegypti
GCF_943734735.2 Anopheles gambiae
GFCX01.1 Antheraea pernyi
GCF_003254395.2 Apis mellifera
GCF_917208135.1 Athalia rosae
GCF_016617805.1 Bactrocera tryoni
GCA_003018175.1 Blattella germanica
GCF_030269925.1 Bombyx mori
GCF_014529535.1 Bradysia coprophila
GCF_958450345.1 Calliphora vicina
GEUF01.1 Callosobruchus maculatus
GCF_003227725.1 Camponotus floridanus
GCA_022818195.1 Clogmia albipunctata
GCF_000648655.2 Copidosoma floridanum
GAYP02.1 Ctenocephalides felis
GCF_015732765.1 Culex quinquefasciatus
GCF_011750605.1 Drosophila busckii
GCF_003285905.1 Drosophila hydei
GCF_000001215.4 Drosophila melanogaster
GCF_030788295.1 Drosophila virilis
GCF_018902025.1 Drosophila willistoni
GAVW02.1 Epiophlebia superstes
GCA_964304675.1 Epiphyas postvittana
GAYQ02.1 Forficula auricularia
GCA_017312745.1 Gryllus bimaculatus
GCF_022581195.2 Helicoverpa zea
GEJI01.1 Ice

In [55]:
dfb=df_blondel.loc[list(set(dfi.species)&set(df_blondel.index)),:]
hits_by_idx = dfb.groupby(level=0)['filtered_hits'].sum()

In [68]:
for index, row in dfi.iterrows():
    if row.species in hits_by_idx.index:
        dfi.loc[index,'blondel_oskar_count']=hits_by_idx[row.species]
        
    if "{" in str(row.oskar) or row.blondel_oskar_count>0:
        dfi.loc[index,'oskar_present']=1
    elif str(row.oskar)=="set()" and str(row.blondel_oskar_count)=='nan' or row.blondel_oskar_count==0 :
        dfi.loc[index,'oskar_present']=0
        
        

In [70]:
dfi.to_csv("oskar_presence.tsv",sep="\t")

In [50]:
dfi['oskar_present']=[int(str(x)!='set()' and str(x)!='nan') for x in dfi.oskar]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
!tar -xvf "predictions.tar"

augustus/
augustus/augustus.gff
augustus/augustus.aa
augustus/augustus.codingseq
augustus/augustus.gbrowse
augustus/augustus.gtf
augustus/augustus.cdsexons


In [19]:
dfi[dfi.oskar_present==1].BUSCO_complete_single_copy.mean()

88.87812500000001

In [20]:
dfi[dfi.oskar_present==0].BUSCO_complete_single_copy.mean()

78.61190476190475

In [14]:
df[(df['class']=='Insecta')&(df.data_type=='unannotated_genome')].species

NCBI accession
GCA_022818195.1     Clogmia albipunctata
GCA_964304675.1     Epiphyas postvittana
GCA_045786645.1    Macrocentrus cingulum
GCA_000149185.1     Mayetiola destructor
GCA_964258955.1        Nymphalis antiopa
GCA_039877355.1      Pyrrhocoris apterus
GCA_029207805.1       Tribolium confusum
Name: species, dtype: object

In [None]:
dfim=df[(df['class']=='Insecta')&(df.data_type=='unannotated_genome')]

In [32]:
dfb=df_blondel.loc[list(set(dfi.species)&set(df_blondel.index)),:]
hits_by_idx = dfb.groupby(level=0)['filtered_hits'].sum()

In [33]:
hits_by_idx 

species
Clogmia albipunctata     0.0
Epiphyas postvittana     0.0
Macrocentrus cingulum    1.0
Mayetiola destructor     0.0
Pyrrhocoris apterus      0.0
Name: filtered_hits, dtype: float64

In [40]:
df_blondel=pd.read_csv("Table_S5_searches.csv",index_col='species')
len(set(df_blondel.index)&set(dfi.species))

57

In [46]:
!singularity exec /cvmfs/singularity.galaxyproject.org/h/m/hmmer:3.4--hdbdd923_2 hmmsearch --domtblout "hmmsearch_results/""$a""_LOTUS" -E .05 'LOTUS_CONSENSUS.hmm' "$f" 

# hmmsearch :: search profile(s) against a sequence database
# HMMER 3.4 (Aug 2023); http://hmmer.org/
# Copyright (C) 2023 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# query HMM file:                  LOTUS_CONSENSUS.hmm
# target sequence database:        ncbi_dataset/data/GCA_963669975.1/protein_longest.faa
# per-dom hits tabular output:     hmmsearch_results/GCA_964212115.1_LOTUS
# sequence reporting threshold:    E-value <= 0.05
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Query:       oskar_filtered.aligned.LOTUS_domain  [M=101]
Scores for complete sequences (score includes all domains):
   --- full sequence ---   --- best 1 domain ---    -#dom-
    E-value  score  bias    E-value  score  bias    exp  N  Sequence     Description
    ------- ------ -----    ------- ------ -----   ---- --  --------     -----------
    6.1e-23   82.0   

In [1]:
!sh download_genome_only.sh "GCA_965637365.1"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 89.9M    0 89.9M    0     0  29.4M      0 --:--:--  0:00:03 --:--:-- 29.4M
curl: Saved to filename 'GCA_965637365.1.zip'
Archive:  GCA_965637365.1.zip
  inflating: README.md               
  inflating: ncbi_dataset/data/assembly_data_report.jsonl  
  inflating: ncbi_dataset/data/GCA_965637365.1/GCA_965637365.1_idCloAlbi1.1_genomic.fna  
  inflating: ncbi_dataset/data/dataset_catalog.json  
  inflating: md5sum.txt              


In [5]:
%%bash
singularity exec /cvmfs/singularity.galaxyproject.org/b/l/blast:2.9.0--pl526he19e7b1_7 makeblastdb -in  ncbi_dataset/data/GCA_965637365.1/GCA_965637365.1_idCloAlbi1.1_genomic.fna  -dbtype nucl -parse_seqids -out clogmia_blast



Building a new DB, current time: 07/11/2025 18:19:20
New DB name:   /n/netscratch/extavour_lab/Everyone/Rishabh/clogmia_blast
New DB title:  ncbi_dataset/data/GCA_965637365.1/GCA_965637365.1_idCloAlbi1.1_genomic.fna
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 83 sequences in 3.70707 seconds.




In [22]:
%%bash
gunzip GCA_964304675.1_ilEpiPost1.hap1.1_genomic.fna.gz

In [8]:
%%bash
singularity exec /cvmfs/singularity.galaxyproject.org/b/l/blast:2.9.0--pl526he19e7b1_7 tblastn -query "Phlebotomus.fasta"  -db clogmia_blast

TBLASTN 2.9.0+


Reference: Stephen F. Altschul, Thomas L. Madden, Alejandro A.
Schaffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J.
Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of
protein database search programs", Nucleic Acids Res. 25:3389-3402.



Database: ncbi_dataset/data/GCA_965637365.1/GCA_965637365.1_idCloAlbi1.1_genom
ic.fna
           83 sequences; 309,212,404 total letters



Query= XP_055701593.1 maternal effect protein oskar [Phlebotomus papatasi]

Length=342
                                                                      Score        E
Sequences producing significant alignments:                          (Bits)     Value

OZ281279.1 Clogmia albipunctata genome assembly, chromosome: 2        32.7       3.8  


>OZ281279.1 Clogmia albipunctata genome assembly, chromosome: 2
Length=51435656

 Score = 32.7 bits (73),  Expect = 3.8, Method: Compositional matrix adjust.
 Identities = 17/57 (30%), Positives = 31/57 (54%), Gaps = 0/57 (0%)
 Frame



In [15]:
from Bio import SeqIO
##load fasta with chimeras from the previous iteration to prioritize in selection of representative sequences per cluster
a="GCA_029207805.1_icTriConf1.1_genomic.fna"
prot = SeqIO.to_dict(SeqIO.parse(f'ncbi_dataset/data/GCA_029207805.1/GCA_029207805.1_icTriConf1.1_genomic.fna', 'fasta'))


In [None]:
!ls

In [5]:
!ls ncbi_dataset/data/GCA_029207805.1/

GCA_029207805.1_icTriConf1.1_genomic.fna


In [16]:
f=open('split1_GCA_964304675.1_ilEpiPost1.hap1.1_genomic.fna','w')

In [11]:
key=list(prot.keys())

In [16]:
len(list(prot.keys()))

331

In [38]:
f=open('split1_GCA_029207805.1_icTriConf1.1_genomic.fna','w')
key=list(prot.keys())
for i in range(0,17):
    n=key[i]
    f.write(f'>{n}\n')
    s=str(prot[n].seq)
    f.write(s+"\n")
f.close()

In [27]:
f=open('split2_GCA_029207805.1_icTriConf1.1_genomic.fna','w')
for i in range(170,len(prot)):
    n=key[i]
    f.write(f'>{n}\n')
    s=str(prot[n].seq)
    f.write(s+"\n")
f.close()

In [51]:
f=open('split3_GCA_039877355.1_ASM3987735v1_genomic.fna','w')
for i in range(5,8):
    n=key[i]
    f.write(f'>{n}\n')
    s=str(prot[n].seq)
    f.write(s+"\n")
f.close()

In [13]:
f=open('split4_GCA_039877355.1_ASM3987735v1_genomic.fna','w')
for i in range(8,len(prot)):
    n=key[i]
    f.write(f'>{n}\n')
    s=str(prot[n].seq)
    f.write(s+"\n")
f.close()

In [26]:
!gzip split1_GCA_029207805.1_icTriConf1.1_genomic.fna

In [28]:
!gzip split2_GCA_029207805.1_icTriConf1.1_genomic.fna

In [52]:
!gzip split3_GCA_039877355.1_ASM3987735v1_genomic.fna

In [14]:
!gzip split4_GCA_039877355.1_ASM3987735v1_genomic.fna

In [37]:
!rm -r test
!mkdir test

In [None]:
%%bash 
# Singularity container path
container="/cvmfs/singularity.galaxyproject.org/t/r/transdecoder:5.7.1--pl5321hdfd78af_1"
input_file="split1_GCA_029207805.1_icTriConf1.1_genomic.fna"
output_dir='test'
# Run TransDecoder.LongOrfs
singularity exec "$container" TransDecoder.LongOrfs \
  -t "$input_file" \
  --output_dir "$output_dir" \
  --complete_orfs_only 
  

In [1]:
!mkdir ncbi_dataset/data/GCA_965637365.1

mkdir: cannot create directory ‘ncbi_dataset/data/GCA_965637365.1’: File exists


In [2]:
!cp augustus/augustus.aa ncbi_dataset/data/GCA_965637365.1/protein_longest.faa

In [6]:
!rm -r busco_downloads

In [5]:
%%bash
n="GCA_965637365.1"

dir="ncbi_dataset/data/$n/protein_longest.faa"

singularity exec /cvmfs/singularity.galaxyproject.org/b/u/busco:5.8.3--pyhdfd78af_1 busco -i $dir -l arthropoda_odb12 -c 50 -m prot -f -o ./BUSCO_outputs/$n --offline 

2025-07-14 14:47:22 INFO:	***** Start a BUSCO v5.8.3 analysis, current time: 07/14/2025 14:47:22 *****
2025-07-14 14:47:22 INFO:	Configuring BUSCO with local environment
2025-07-14 14:47:22 INFO:	Running proteins mode
2025-07-14 14:47:22 INFO:	'Force' option selected; overwriting previous results directory
2025-07-14 14:47:22 INFO:	Input file is /n/netscratch/extavour_lab/Everyone/Rishabh/ncbi_dataset/data/GCA_965637365.1/protein_longest.faa


  "cannot import name '(?P<module_name>[\w]+)", err.msg
2025-07-14 14:47:22 CRITICAL:	Unhandled exception occurred:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/configparser.py", line 767, in get
    value = d[option]
            ~^^^^^^^^
  File "/usr/local/lib/python3.12/collections/__init__.py", line 1015, in __getitem__
    return self.__missing__(key)            # support subclasses that define __missing__
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/collections/__init__.py", line 1007, in __missing__
    raise KeyError(key)
KeyError: 'domain'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/site-packages/busco/BuscoRunner.py", line 161, in run
    self.runner = AnalysisRunner(self.config)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/busco/BuscoRunner.py", line 441, in __init__
    self.mode = self.

CalledProcessError: Command 'b'n="GCA_965637365.1"\n\ndir="ncbi_dataset/data/$n/protein_longest.faa"\n\nsingularity exec /cvmfs/singularity.galaxyproject.org/b/u/busco:5.8.3--pyhdfd78af_1 busco -i $dir -l arthropoda_odb12 -c 50 -m prot -f -o ./BUSCO_outputs/$n --offline \n'' returned non-zero exit status 1.

In [14]:
!tar -xvzf arthropoda_odb12.2025-04-11.tar.gz

arthropoda_odb12/
arthropoda_odb12/dataset.cfg
arthropoda_odb12/ancestral_variants
arthropoda_odb12/scores_cutoff
arthropoda_odb12/ancestral
arthropoda_odb12/refseq_db.faa.gz
arthropoda_odb12/prfl/
arthropoda_odb12/prfl/505at6656.prfl
arthropoda_odb12/prfl/84987at6656.prfl
arthropoda_odb12/prfl/179647at6656.prfl
arthropoda_odb12/prfl/160411at6656.prfl
arthropoda_odb12/prfl/8627at6656.prfl
arthropoda_odb12/prfl/91669at6656.prfl
arthropoda_odb12/prfl/23295at6656.prfl
arthropoda_odb12/prfl/83657at6656.prfl
arthropoda_odb12/prfl/86541at6656.prfl
arthropoda_odb12/prfl/1920193at6656.prfl
arthropoda_odb12/prfl/1349036at6656.prfl
arthropoda_odb12/prfl/148851at6656.prfl
arthropoda_odb12/prfl/102894at6656.prfl
arthropoda_odb12/prfl/421896at6656.prfl
arthropoda_odb12/prfl/159445at6656.prfl
arthropoda_odb12/prfl/1921765at6656.prfl
arthropoda_odb12/prfl/86557at6656.prfl
arthropoda_odb12/prfl/48299at6656.prfl
arthropoda_odb12/prfl/869at6656.prfl
arthropoda_odb12/prfl/283448at6656.prfl
arthropoda_odb

In [17]:
!mv arthropoda_odb12 busco_downloads/lineages/

In [15]:
!mkdir busco_downloads

In [18]:
!sh run_busco_annotated_genome.sh GCA_965637365.1

  "cannot import name '(?P<module_name>[\w]+)", err.msg
2025-07-14 14:58:15 INFO:	***** Start a BUSCO v5.8.3 analysis, current time: 07/14/2025 14:58:15 *****
2025-07-14 14:58:15 INFO:	Configuring BUSCO with local environment
2025-07-14 14:58:15 INFO:	Running proteins mode
2025-07-14 14:58:15 INFO:	'Force' option selected; overwriting previous results directory
2025-07-14 14:58:15 INFO:	Input file is /n/netscratch/extavour_lab/Everyone/Rishabh/ncbi_dataset/data/GCA_965637365.1/protein_longest.faa
2025-07-14 14:58:18 INFO:	Running BUSCO using lineage dataset arthropoda_odb12 (eukaryota, 2025-04-11)
2025-07-14 14:58:18 INFO:	***** Run HMMER on gene sequences *****
2025-07-14 14:58:18 INFO:	Running 1667 job(s) on hmmsearch, starting at 07/14/2025 14:58:18
  "cannot import name '(?P<module_name>[\w]+)", err.msg
  "cannot import name '(?P<module_name>[\w]+)", err.msg
  "cannot import name '(?P<module_name>[\w]+)", err.msg
  "cannot import name '(?P<module_name>[\w]+)", err.msg
  "cannot imp

In [19]:
!gunzip GCA_045786645.1_ASM4578664v1_genomic.gz

In [20]:
!head GCA_045786645.1_ASM4578664v1_genomic

augustus/                                                                                           0000755 0175000 0000144 00000000000 15034276007 013314  5                                                                                                    ustar   augustus-web                    users                                                                                                                                                                                                                  augustus/augustus.gff                                                                               0000644 0175000 0000144 00364014056 15034275763 015713  0                                                                                                    ustar   augustus-web                    users                                                                                                                                                                                          

In [21]:
!cat split2_GCA_964258955.1.aa split1_GCA_964258955.1.aa > ncbi_dataset/data/GCA_964258955.1/protein_longest.faa

In [22]:
!cat split1_GCA_039877355.1.aa split2_GCA_039877355.1.aa split3_GCA_039877355.1.aa split4_GCA_039877355.1.aa> ncbi_dataset/data/GCA_039877355.1/protein_longest.faa

In [24]:
!cat split1_GCA_964304675.1.aa split2_GCA_964304675.1.aa > ncbi_dataset/data/GCA_964304675.1/protein_longest.faa

In [1]:
!mv GCA_000149185.1.aa ncbi_dataset/data/GCA_000149185.1/protein_longest.faa