In [1]:
import pickle
import pandas as pd
import multiprocessing as mp
import numpy as np
import pickle
import matplotlib.pyplot as plt
from Bio import SeqIO
import os
import subprocess
import ast
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

## Add hmmsearch hits of XP_021699539.1 vs XP_029735553.1
XP_029735553.1 (Aedes albopictus) was a secondary chimera of XP_021699539.1 (Aedes aegypti) in the first pipeline iteration, which was run on A. albopictus annotation release GCF_006496715.1, but it was later marked as a lncRNA in the current A. albopictus annotation release. We confirmed its expression and sequence via RT-PCR and Sanger sequencing after the first iteration, so we manually added XP_029735553.1 back for consideration as a secondary chimera of XP_021699539.1 in this iteration.

In [None]:
## Validate that XP_029735553.1 is an hmmsearch hit to the metazoan and HGT intervals of XP_021699539.1.
## Inputs:
##   XP_021699539.1_hmms                      - concatenation of the HMMs of the XP_021699539.1 intervals found in outputs/hmmbuild
##   outputs/suppressed_aedes_albopictus.fa   - FASTA of XP_029735553.1
## Output:
##   XP_021699539.1_hmms.domtblout            - per-domain hits table (--domtblout)
## Flags: -E 1e-2 sets the sequence reporting threshold to E-value <= 0.01; --noali omits alignments from the text output.
!singularity exec /cvmfs/singularity.galaxyproject.org/h/m/hmmer:3.3.2--he1b5a44_0 hmmsearch -E 1e-2 --cpu 2 --noali --domtblout "XP_021699539.1_hmms.domtblout" "XP_021699539.1_hmms"  "outputs/suppressed_aedes_albopictus.fa"

# hmmsearch :: search profile(s) against a sequence database
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# query HMM file:                  XP_021699539.1_hmms
# target sequence database:        outputs/suppressed_aedes_albopictus.fa
# per-dom hits tabular output:     XP_021699539.1_hmms.domtblout
# show alignments in output:       no
# sequence reporting threshold:    E-value <= 0.01
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Query:       GCF_002204515.2;XP_021699539.1;HGT_(5,150)  [M=129]
Scores for complete sequences (score includes all domains):
   --- full sequence ---   --- best 1 domain ---    -#dom-
    E-value  score  bias    E-value  score  bias    exp  N  Sequence                       Description
    ------- ------ -----    ------- ------ -----   ---- --  ------

In [55]:
## Create the directory that receives the split hmmsearch tables.
## -p makes this cell safe to re-run: plain `mkdir` errors out when the directory already exists.
!mkdir -p missing_aedes_albopictus_hmmsearch
## Split the domtblout into the output directory.
## NOTE(review): presumably writes one .tsv per query HMM (the cells below read *.tsv from this
## directory) - confirm against scripts/split_hmmer_csv.sh.
!sh scripts/split_hmmer_csv.sh "XP_021699539.1_hmms.domtblout" missing_aedes_albopictus_hmmsearch


In [56]:
## NOTE(review): this repeats the split command from the preceding cell verbatim (same input, same
## output directory). Presumably a leftover re-run - confirm it is not needed and remove the cell.
!sh scripts/split_hmmer_csv.sh "XP_021699539.1_hmms.domtblout" missing_aedes_albopictus_hmmsearch

In [149]:
##verify that all intervals of the query chimera GCF_002204515.2;XP_021699539.1 are significant hits to aedes albopictus GCF_006496715.1;XP_029735553.1
## Prints, for every split table, the file name and its highest domain score.
## Requires `cols` (the column names for the trimmed table) to be defined by an earlier cell.
# sorted(): os.listdir returns entries in arbitrary order, which made the printed scores
# impossible to reproduce in the same order between runs
for x in sorted(os.listdir('missing_aedes_albopictus_hmmsearch')):
    # match on the extension; a substring test would also accept names such as "a.tsv.bak"
    if not x.endswith('.tsv'):
        continue
    # original raw files: no header
    arth = pd.read_csv(f'missing_aedes_albopictus_hmmsearch/{x}', sep="\t", header=None)
    # drop unused cols (positions 1 and 4 - presumably the target/query accession
    # fields of the domtblout layout; confirm against the split script's output)
    arth = arth.drop([1,4], axis=1)
    # assign column names
    arth.columns = cols
    # label each score with its table so it can be attributed to an interval
    print(x, arth.domain_score.max())

151.9
104.3
168.1
96.6


In [1]:
## Build `dftax`: a genome table indexed by assembly accession, used to look up
## organism name and taxid from a genome accession.
# Read the GenBank and RefSeq genome listings (tab-separated).
df1, df2 = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        'Data/genbank_genomes_4_22_2025.tsv',
        'Data/refseq_genomes_scaffold_plus_4_19_2025.tsv',
    )
)
# Stack both listings, then key the combined table by accession.
dftax = pd.concat([df1, df2])
dftax = dftax.set_index('Assembly Accession')
# Set the organism name and taxid for GCF_006496715.1 by hand
# (presumably this release is absent from both listings - TODO confirm).
dftax.loc['GCF_006496715.1', ['Organism Name', 'Organism Taxonomic ID']] = ['Aedes albopictus', 7160]

In [99]:
##add hits to aedes albopictus GCF_006496715.1;XP_029735553.1 to main hmmsearch outputs for GCF_002204515.2;XP_021699539.1
## Requires `cols` and `dftax` from earlier cells.
## Each main output file is rewritten in place. The cell is safe to re-run: rows for the
## re-added targets are replaced in the main output instead of being appended a second time.
for x in sorted(os.listdir('missing_aedes_albopictus_hmmsearch')):
    # match on the extension; a substring test would also accept names such as "a.tsv.bak"
    if not x.endswith('.tsv'):
        continue
    # original raw files: no header
    arth = pd.read_csv(f'missing_aedes_albopictus_hmmsearch/{x}', sep="\t", header=None)
    # drop unused cols
    arth = arth.drop([1,4], axis=1)
    # assign column names
    arth.columns = cols
    # annotate hits with species & taxid, looked up by the assembly accession that precedes
    # the first ";" in target_name (raises KeyError if the accession is not in dftax)
    accessions = [name.split(";")[0] for name in arth['target_name']]
    arth['species'] = [dftax.loc[acc, 'Organism Name'] for acc in accessions]
    arth['taxid'] = [dftax.loc[acc, 'Organism Taxonomic ID'] for acc in accessions]

    arth = arth.drop('description of target', axis=1)
    ##drop e-values since database size differs
    arth.loc[:,['E-value','c-Evalue', 'i-Evalue']]=np.nan
    arth = arth.set_index('target_name')

    main_path = f'outputs/hmmsearch_v_arthropod/{x}'
    arthog = pd.read_csv(main_path, sep='\t', index_col=0)
    # Remove rows for these targets left by a previous run of this cell; without this,
    # every re-run appended the same hits to the main output again.
    # (assumes column 0 of the main output is target_name - TODO confirm)
    arthog = arthog.loc[~arthog.index.isin(arth.index)]
    pd.concat([arth, arthog]).sort_values('overall_score', ascending=False).to_csv(main_path, sep='\t')
        