#batch submission of hh-suite scans for the viruses involved (3 + covid)

In [2]:
import pandas as pd
# Load the VMR table; later cells select rows of it through its 'Order' / 'Family' columns.
virus_list=pd.read_csv('_data/VMR_MSL38_v2.csv')

In [3]:
# Reference genome per virus key, written as '<name>||<accession>'
# (the same shape `get_file_stem` produces for a single-accession entry).
# NOTE(review): not read by any other cell visible in this notebook.
ref_genome_dict={
    'zika':'ZIKV||AY632535',
    'ebola':'EBOV||AF086833',
    'measles':'MeV||AB016162',
    'covid':'SARS-CoV-2||MN908947'
}

In [4]:
# Per virus key: (column of `virus_list`, value in that column) selecting the rows to scan.
virus_meta={'covid':('Order','Nidovirales'),
            'zika':('Family','Flaviviridae'),
            'ebola':('Family','Filoviridae'),
            'measles':('Family','Paramyxoviridae'),
            }
# Disabled (`if 0`): when enabled, writes each selected subset to '_data/<key>.csv'.
if 0:
    for k,v in virus_meta.items():
        virus_list[virus_list[v[0]]==v[1]].to_csv(f'_data/{k}.csv',index=False)


In [5]:
from typing import Dict,List
from warnings import warn
def is_multiple_access(access:Dict[str,str])->bool:
    '''
    helper for `get_genbank_id`'s output

    `get_genbank_id` files a lone, unlabeled accession under the key '_'.
    `False`: '_' is a key of `access` (one seg)
    `True`: '_' is absent (every other mapping is treated as multiple segs)
    '''
    return '_' not in access
    
def get_file_stem(virus_series:pd.Series,
    access_col='true_access',name_col='true_name')->List[str]:
    '''
    Build the file stem(s) for one row of a virus table.

    virus_series: row holding a parsed accession dict under `access_col`
                  and a name under `name_col`
    output: ['<name>||<accession>'] when the dict carries the single-accession
            key '_', otherwise one '<name>|<key>|<accession>' per dict item

    The two columns are expected to be prepared like this:
    v_list['true_access']=nido_list['Virus GENBANK accession'].apply(get_genbank_id)
    nido_list['true_name']=nido_list['Virus name abbreviation(s)'].apply(get_correct_name)
    '''
    accessions:Dict[str,str]=virus_series[access_col]
    if is_multiple_access(virus_series[access_col]):
        return [f"{virus_series[name_col]}|{seg}|{acc}" for seg,acc in accessions.items()]
    return [f"{virus_series[name_col]}||{accessions['_']}"]

In [6]:
from pathlib import Path
from typing import List,Dict
def robust_split(s:str,split:str)->List[str]:
    '''
    Cut `s` at every occurrence of `split` and strip the
    leading/trailing whitespace of each resulting piece.
    '''
    return list(map(str.strip,s.split(split)))

def get_genbank_id(accession:str)->Dict[str,str]:
    '''
    accession: entry in VMR

    output: label -> accession
      * no ';' and no ':'  -> {'_': accession} (stored as given, not stripped)
      * 'label: acc'       -> {label: acc}
      * ';'-separated list -> one item per piece; a piece without ':' gets
                              the next counter value ('0', '1', ...) as label
    '''
    # TODO compatible with '/'
    assert isinstance(accession,str),f'accession: {accession} is not a str!'
    parsed={}
    if ';' not in accession:
        if ':' not in accession:
            parsed['_']=accession
        else:
            label,value=robust_split(accession,':')
            parsed[label]=value
        return parsed
    unlabeled=0  # running label for pieces that come without 'label:'
    for piece in robust_split(accession,';'):
        if ':' in piece:
            label,value=robust_split(piece,':')
            parsed[label]=value
        else:
            parsed[str(unlabeled)]=piece
            unlabeled+=1
    return parsed

def get_correct_name(name:str)->str:
    """
    some names block contains multiple name sep by ';'
    only use the first one
    warning: this name is not unique!

    name: content of a name cell; must be a `str`
    output: `name` itself when it holds no ';', otherwise the first
            ';'-separated piece (whitespace-stripped), after emitting a warning
    raises: AssertionError when `name` is not a `str`
    """
    # the message is shown when the check FAILS, so it has to say "invalid"
    assert isinstance(name,str),f'invalid input: {name}'
    if ';' in name:
        warn(f'multiple names: {name},use the first one')
        return robust_split(name,';')[0]
    else:
        return name
    
    
# if 0:
# Collect every existing '<stem>:segs.fasta' under '_data/genome_fasta/' for the
# virus groups listed in `virus_meta`; paths that are missing are printed instead.
used_genome=[]
for k,v in virus_meta.items():
    # v is (column, value): keep the rows of `virus_list` whose `column` equals `value`
    sub_v_list:pd.DataFrame=virus_list[virus_list[v[0]]==v[1]].copy(deep=True)
    # helper columns read by `get_file_stem` through its default column names
    sub_v_list['true_access']=sub_v_list['Virus GENBANK accession'].apply(get_genbank_id)
    sub_v_list['true_name']=sub_v_list['Virus name abbreviation(s)'].apply(get_correct_name)

    for _,s in sub_v_list.iterrows():
        for stem in get_file_stem(s):
            # NOTE(review): a stem containing '/' is interpreted as nested directories
            # by Path -- confirm how such names are stored on disk
            p=Path(f'_data/genome_fasta/{stem}:segs.fasta')
            if p.is_file():
                # print(stem)
                used_genome.append(p)
            else:
                print(p)
    

_data/genome_fasta/BtCoV/020_16/M.dau/FIN/2016||MG923574:segs.fasta


  warn(f'multiple names: {name},use the first one')


In [50]:
# from tempfile import TemporaryDirectory
# dir=
from multiprocessing import Pool
from subprocess import run

def parse_fasta_name(fa_na:str)->Dict[str,str]:
    '''
    fa_na: header line shaped like '> k1=v1,k2=v2' followed by one trailing
           character; the first two and the last character are dropped
    output: {k1: v1, k2: v2, ...} in header order; only the text between the
            first and second '=' of each field is used as the value
    '''
    fields={}
    for pair in fa_na[2:-1].split(','):
        pieces=pair.split('=')
        fields[pieces[0]]=pieces[1]
    return fields

def hhblits(infile:Path,cpu:int=2,db:str='hhs-db/pfam',min_prob:float=50):
    '''
    Run `hhblits` on one query file and collect the rows of its hit table.

    infile: query file, passed to `hhblits -i`
    cpu: passed to `hhblits -cpu`
    db: database prefix, passed to `hhblits -d`
    min_prob: a row is kept only if its Prob column (chars 34-40) is strictly greater

    output: `(infile, rows)`; each row is the list of the 11 fixed-width slices
            the table line is cut into, whitespace untouched

    Only lines after the one starting with ' No Hit' and longer than 90 chars
    are read as table rows. A non-zero exit status of `hhblits` is reported with
    a warning (the return value is unchanged), so a failed run is not mistaken
    for "no hits".
    '''
    from warnings import warn  # local import: keeps this cell runnable on its own
    # infile='tmp/AAbV||GBBW01007738#2s#11629#11886.fasta'
    o=run(['hhblits','-i',infile,'-d',db,'-cpu',str(cpu)],capture_output=True)#,'-o','hahaha.hhr'
    if o.returncode!=0:
        warn(f'hhblits exited with status {o.returncode} for {infile}: '
             f'{o.stderr.decode(errors="replace").strip()}')
    # slice boundaries of the table columns; the last column runs to end of line
    col_edges=(0,4,34,40,48,56,63,69,74,85,94)
    output=[]
    in_table=False
    for line in o.stdout.decode().split('\n'):
        if not in_table:
            # everything up to and including the table header is skipped
            in_table=line.startswith(' No Hit')
            continue
        if len(line)>90 and float(line[34:40].strip())>min_prob:
            output.append([line[a:b] for a,b in zip(col_edges,col_edges[1:]+(None,))])
    return (infile,output)
    
def mpi_scan_hhblits(infile:Path,dir:Path=Path('tmp'),processes=8,cpu:int=2):
    '''
    Split a multi-record FASTA file into one file per record and run
    `hhblits` on the records in a process pool.

    infile: FASTA file whose header lines are understood by `parse_fasta_name`
    dir: directory receiving the per-record files, each named
         '#'.join(<header values>) + '.fasta'
    processes: size of the worker pool
    cpu: forwarded to `hhblits` for EVERY record

    output: list of `hhblits` return values, in record order

    Lines before the first '>' header are ignored.
    '''
    jobs=[]
    record_path=None
    record_file=None  # handle of the per-record file currently being written
    pool1=Pool(processes=processes)
    try:
        with open(infile,'r') as source:
            for line in source:
                if line.startswith('>'):
                    if record_file is not None:
                        # previous record is complete: flush it and submit it
                        record_file.close()
                        jobs.append(pool1.apply_async(hhblits,(record_path,cpu)))
                    name_dict=parse_fasta_name(line)
                    record_path=dir/('#'.join(name_dict.values())+'.fasta')
                    record_file=open(record_path,'w')
                if record_file is not None:
                    record_file.write(line)
        if record_file is not None:
            record_file.close()
            jobs.append(pool1.apply_async(hhblits,(record_path,cpu)))
    finally:
        if record_file is not None:
            record_file.close()  # no-op when already closed
        pool1.close()
        pool1.join()
    return [job.get() for job in jobs]

def scan_hhblits(infile:Path,dir:Path=Path('tmp'),cpu:int=2):
    '''
    Split a multi-record FASTA file into one file per record and run
    `hhblits` on each record, one after another.

    infile: FASTA file whose header lines are understood by `parse_fasta_name`
    dir: directory receiving the per-record files, each named
         '#'.join(<header values>) + '.fasta'
    cpu: forwarded to `hhblits`

    output: list of `hhblits` return values, in record order

    Lines before the first '>' header are ignored.
    '''
    res=[]
    record_path=None
    record_file=None  # handle of the per-record file currently being written
    try:
        with open(infile,'r') as source:
            for line in source:
                if line.startswith('>'):
                    if record_file is not None:
                        # previous record is complete: flush it and scan it
                        record_file.close()
                        res.append(hhblits(record_path,cpu))
                    name_dict=parse_fasta_name(line)
                    record_path=dir/('#'.join(name_dict.values())+'.fasta')
                    record_file=open(record_path,'w')
                if record_file is not None:
                    record_file.write(line)
        if record_file is not None:
            record_file.close()
            res.append(hhblits(record_path,cpu))
    finally:
        if record_file is not None:
            record_file.close()  # no-op when already closed
    return res
from tempfile import TemporaryDirectory
def tmp(i):
    '''
    scan one used genome

    i: segs FASTA file to scan
    output: `(i, hits)` where `hits` is what `scan_hhblits` returns when it
            splits `i` inside a throw-away temporary directory with cpu=8
    '''
    with TemporaryDirectory() as scratch:
        hits=scan_hhblits(i,Path(scratch),8)
    return (i,hits)
# Scan every file in `used_genome` in parallel: each of the 8 pool workers runs `tmp`
# on one file, and `tmp` itself calls `scan_hhblits` with cpu=8.
# `res` collects the AsyncResult handles; their values are fetched in a later cell.
if 1:
    res=[]
    # infile=used_genome[0]
    pool=Pool(processes=8)
    for infile in used_genome:
        r=pool.apply_async(tmp,(infile,))
        res.append(r)
    # no more submissions; wait until all workers are done
    pool.close()
    pool.join() 

# def split_fasta(infile:Path):
# for infile in used_genome:
    

In [52]:
# Resolve every AsyncResult in `res` (re-raises here if a worker failed).
# NOTE(review): `_` is also IPython's "last output" name; a dedicated variable would be safer.
_=[i.get() for i in res]

In [1]:
import pickle as pkl
# Persist the collected scan results; the `with` block closes the file once the pickle is written.
with open('blit_out.pkl','wb') as pkl_out:
    pkl.dump(_,pkl_out)

Hex translation corrected

In [22]:
import pickle as pkl
from typing import List,Tuple,Dict,Union
# Type of a segment header dict: string keys, str/int/float values.
routine_dict=Dict[str,Union[str,int,float]]
from Bio.Seq import Seq
# from Bio.Alphabet import IUPAC
import warnings
from Bio import BiopythonWarning
# Silence every BiopythonWarning from here on.
warnings.simplefilter('ignore', BiopythonWarning)
# Frame labels in the order `hextranslate` emits its translations:
# digit = start offset (0/1/2), 's' = given strand, 'a' = reverse complement.
transannot=['0s','0a','1s','1a','2s','2a']
# frame label -> its position in `transannot`
transannot_indice={j:i for i,j in enumerate(transannot)}

def fetch_seq(f:str)->str:
    '''
    f: path of a pickled efetch result entry (a nested dict)

    output: nt seq in the entry, i.e. ['GBSet']['GBSeq']['GBSeq_sequence']
    raises: KeyError when the entry lacks one of those keys
    '''
    # NOTE(review): unpickling executes code stored in the file -- only load
    # pickles this project wrote itself.
    with open(f,'rb') as handle:  # closed as soon as the entry is loaded
        gb:dict=pkl.load(handle)
    seq:str=gb['GBSet']['GBSeq']['GBSeq_sequence']
    return seq

def hextranslate(g:Union[str,Seq])->List[str]:
    '''
    g: input nt seq, as a plain string or a `Seq`
    output: the six translations as strings, interleaved per offset:
            [offset 0 sense, offset 0 antisense,
             offset 1 sense, offset 1 antisense,
             offset 2 sense, offset 2 antisense]
            (antisense = the reverse complement, read from its own start)
    '''
    genome=Seq(g) if isinstance(g,str) else g
    genome_r=genome.reverse_complement()
    o=[]
    for i in [0,1,2]:
        # str() is the public way to get the residues; the private `_data`
        # attribute is not bytes in every Biopython release
        o.append(str(genome[i:].translate()))
        o.append(str(genome_r[i:].translate()))
    return o

def get_hex(f:str)->List[str]:
    '''
    f: path of a pickled efetch result entry, as accepted by `fetch_seq`
    output: what `hextranslate` returns for the nt seq stored in that entry
    '''
    return hextranslate(fetch_seq(f))

def get_valid_seg(translist:List[str],name:str='xx',thresh:int=100)->List[Tuple[routine_dict,str]]:
    '''
    Cut every translation at its '*' characters and keep the long pieces.

    translist: translations, paired position by position with `transannot`
    name: value stored under 'name' in every header
    thresh: a piece is kept only when it is strictly longer than this

    output: list of (head, piece) with
    head={'name':name,
        'transannot':annot,
        'prob':b,
        'proe':e}
    where b is the 0-based offset of the piece inside its translation
    and e = b + len(piece)
    '''
    kept=[]
    for frame_seq,annot in zip(translist,transannot):
        start=0
        for piece in frame_seq.split('*'):
            stop=start+len(piece)
            if len(piece)>thresh:
                kept.append(({'name':name,
                              'transannot':annot,
                              'prob':start,
                              'proe':stop},piece))
            # step over the '*' that ended this piece
            start=stop+1
    return kept

def seg_to_fasta(seg:List[Tuple[routine_dict,str]])->str:
    '''
    seg: from `get_valid_seg`, i.e. (header dict, sequence) pairs

    output: FASTA text with one '> k1=v1,k2=v2,...' header line followed by
            one sequence line per pair, no trailing newline
    '''
    records=[
        '> ' + ','.join([f'{k}={v}' for k,v in entry[0].items()]) + '\n' + entry[1]
        for entry in seg
    ]
    return '\n'.join(records)
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
# record:SeqRecord = SeqIO.read("_data/genome_fasta/6PavOLV7||MN532594:genome.fasta", "fasta")
def gen_seg_fasta(i:str):
    '''
    Write the '*segs.fasta' companion of one '*genome.fasta' file.

    i: path ending in 'genome.fasta' holding a single FASTA record
    The record is six-frame translated, cut into segments by `get_valid_seg`
    (default arguments) and written next to the input, with 'genome.fasta'
    replaced by 'segs.fasta' in the path.
    raises: AssertionError when `i` does not end with 'genome.fasta'
    '''
    assert i.endswith('genome.fasta'),'only accept *genome.fasta!'
    record:SeqRecord = SeqIO.read(i, "fasta")
    frames=hextranslate(record.seq)
    # NOTE(review): no `name` is passed, so every header carries the default 'xx' -- confirm intended
    segments=get_valid_seg(frames)
    fasta_text=seg_to_fasta(segments)
    out_path=i.replace('genome.fasta','segs.fasta')
    with open(out_path,'w') as handle:
        handle.write(fasta_text)

from glob import glob
# Disabled (`if 0`): when enabled, runs `gen_seg_fasta` on every '*genome.fasta'
# under '_data/genome_fasta/' with 16 worker processes.
# The AsyncResult objects are discarded, so an exception inside a worker is never re-raised here.
if 0:
    pool=Pool(processes=16)
    for i in glob('_data/genome_fasta/*genome.fasta'):
        pool.apply_async(gen_seg_fasta,(i,))
    pool.close()
    pool.join()

In [28]:
# NOTE(review): `i` is whatever an earlier cell left in the kernel; this cell does not define it.
gen_seg_fasta(i)

In [8]:
# Probe cell: same row selection as the used-genome loop, but only for the FIRST
# entry of `virus_meta` (the outer loop always breaks after one pass).
# It leaves `sub_v_list`, `s`, `stem` and `p` in the kernel for inspection.
# `o` is not used in this cell.
o=[]
for k,v in virus_meta.items():
    sub_v_list:pd.DataFrame=virus_list[virus_list[v[0]]==v[1]].copy(deep=True)
    sub_v_list['true_access']=sub_v_list['Virus GENBANK accession'].apply(get_genbank_id)
    sub_v_list['true_name']=sub_v_list['Virus name abbreviation(s)'].apply(get_correct_name)

    for _,s in sub_v_list.iterrows():
        for stem in get_file_stem(s):
            p=Path(f'_data/genome_fasta/{stem}:segs.fasta')
            if p.is_file():
                # only leaves the stem loop; the row loop keeps going
                break
    break

In [9]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
# for i in SimpleFastaParser(open('/home/hugheslab1/zfdeng/pangengraph_2/_data/genome_fasta/6PavOLV7||MN532594:genome.fasta')):
#     break



In [19]:
print()


> name=xx,transannot=1a,prob=171,proe=374
ARRLKYWGPKLDAFPSCDHFHGLTVTHSPKMVLVAQLKLHDTQSQLRLFKYVTLFNWRRVPGMLTLPVDMKQVVLGGEQHVRIFGHRCQNRMALKGPQSHPPLKFRRKTRLQKTNGIACRDTQETRTPLEIPASLLSKEIPRGGILDNERRPILFSIIKGRFRPRPLYEQRRDLLPPRSNFLIQFFDTSKSPAFWDTVLAKYR
> name=xx,transannot=2s,prob=0,proe=719
NLLSSYSLKEVVYPGRHLLMTSIKKSLGAVGCPVMGRLKEFVGAIGEPYGLSLPLPDLQAVPFQDRLSLLKKFCGGFLEKPVCHLWHRPTMHLSRKSRMSIAMSLFLFRKVLPSEEHDVVEYAKKMSEESPDPSPHFLHFIREELPKLFRYGWDRGAYENSSLNSVLPISSCRGSARASGGCRMLGLSKGSDSWNDRESFVEHVLTATSCRKLKPSQLLQVETGGKYRIVSKSDLGMNSLRPLHSAIYNHLSRFSWLLRGDAKASRFSDFVRKPGEVFVSGDYESATDNLNGHVQREILRMILEQTDHVPQGIKDRALESMTSELDFDGVTYQQRRGQLMGNLLSFPLLCIVNYLAFRWVAGPSCPVKINGDDIVFRSSPGIADRWMESVSLAGLTLSRGKTMVDKSYFTLNSRMFISGFSKVHSVPCVRATAFFGLKEGPESLKGRSQSFCEGFSGHRRSLLRSRWLKVNRGAIEYSRRSVTRGLGINFSYGDLIRAGLWDREAWYLSFESEKALPPRLSVLCQNGIPEGWAFRSVEKLNKKVRSWGKKVAPLLVERAWSEASLNDREQDWSSLVVQDTPSWDFFRQQRRRDLKRRSRLLGISARNTVRFLKPRLTTELQRRMGLWSLQRHSVLAPVSEDSDMLFSSEDNLFHIYGQCQHPRHPPPVEEGDVFEKTELRLGIVELKLGDEYHLGRMSNGQTVKVIAGGKGVQF

In [10]:
record

SeqRecord(seq=Seq('aaaaccttctgtcaagctactctctcaaggaagtagtttacccgggtcgtcact...cgc'), id='6PavOLV7||MN532594', name='6PavOLV7||MN532594', description=' 6PavOLV7||MN532594', dbxrefs=[])

In [16]:
hextranslate('aaaaccttctgtcaagctactctctcaaggaagtagtttacccgggtcgtcacttattaatgacttcaattaagaagtcactaggtgccgtgggatgtcctgttatggggcgcttaaaggagtttgtgggtgcgataggggaaccttacggtttgtccctacctcttcccgatctccaggccgttccattccaggatcgtttgtcgcttctcaagaaattctgcggaggatttcttgagaagccggtttgtcatttgtggcatcgtccgacgatgcatctgtctcgcaagagccggatgtccatcgcgatgtcactctttcttttccgtaaggtgctaccttcggaggagcatgacgtggtcgaatatgcaaagaagatgagtgaagagtctccggacccttctcctcattttcttcacttcattcgggaagagttaccgaagctcttccgctacgggtgggatagaggtgcgtacgagaattcatctttgaattccgtattaccgatatcctcttgccgtggctctgctcgtgcctccggtggttgtagaatgcttgggttgtcaaagggctctgactcgtggaatgatcgtgaatcgttcgtcgagcatgttttgacagcgacctcgtgccgcaaactcaagccgtctcaacttttgcaggttgagactggcgggaagtatcgaatagtatctaagtctgacttgggtatgaattctttacgtcctctccattctgctatctacaaccacctgtcccgtttctcttggctattgcgtggagacgcaaaagcttctcgattctcagacttcgttcggaagcctggtgaggtgtttgtttcaggtgactatgaatcagccacggataatctcaatggacatgtccagagagagattcttaggatgatcctcgaacaaacagatcacgtccctcaggggataaaggatcgtgcgctggagtctatgacgtcggaacttgatttcgacggagtgacgtaccagcagagaaggggacagctgatggggaacttactctcttttcctttgctttgcattgtcaattacctagcttttaggtgggtggcagggccgagttgccccgttaagataaacggggacgatatcgtatttcggtcgagtcccggaatagctgatcgttggatggagagtgtttcccttgcgggcctaaccctttctcgtggtaagacgatggtggataagtcttacttcactttgaactcgcgcatgtttatcagtggcttttcaaaagtccactcggtaccatgcgtcagagcaactgctttcttcggtttgaaggaaggcccagagagcctcaagggacgatcgcagtcgttctgcgagggcttctctggtcaccgtcgttctcttttgagaagcaggtggttgaaggtgaatcgcggagctatcgagtattcgagacgctccgtcacgagggggttagggatcaatttttcgtacggagatttgatccgtgccggtctctgggaccgtgaggcttggtatctctcatttgagagcgagaaggcattgcctcctcgtttatctgtactttgccaaaacggtatcccagaaggctgggcttttcgaagtgtcgaaaaattgaataagaaagttcgatcttgggggaagaaggtcgcgcctttgctcgtagagcgggcgtggtcggaagcgtcccttaatgatcgagaacaggattggtcgtctctcgttgtccaagataccccctcgtgggatttctttagacagcagcgacgcagggatctcaagaggcgttctcgtctcttgggtatctcggcacgcaataccgttcgttttttgaagcctcgtcttacgacggaacttcagaggaggatgggattgtggtcccttcagcgccattctgttctggcaccggtgtccgaagattcggacatgttgttctcctccgaggacaacttgtttcatatctacgggcaatgtcagcatc
cccggcaccctccgccagttgaagagggtgacgtatttgaaaagacggagctgagattgggtatcgtggagcttaagctgggcgacgagtaccatcttgggagaatgagtaacggtcaaaccgtgaaagtgatcgcaggagggaaaggcgtccaatttgggcccccaatattttaatctcctcgctcatcaccggggcggtagactccggaccgatgtttcgacgtcggaccgggtgtggtttacccattgtttataacgaacctaggcggttggcttttctgcatcggaaagatgtttacggagactgtaactcaaagagtccttcgggaggtatcttacagtcgggcctagattagttgccgattccctttggtctggaaggacctctgagggtaggtgagggaaggaaggaacaggttgcacgccgtaatgcgtgtttaccgttagccaacccatcgccgtctagcgtagttgaaacaaaaaggtggtgagggcttcattgtgttagcctacaccgcgcggcgtgcgcaacgccatgaactctgtgcgatcttggcacagatgcaaaatattgcgtgtgaacatggcggtagttggtacatttcgacgtcttcccgaaaggaagacgcaaccttgacgggttagtgtcggctgctgtcgtggtctccaaagaacccatagtgttagcctacaccgc')

['KTFCQATLSRK*FTRVVTY**LQLRSH*VPWDVLLWGA*RSLWVR*GNLTVCPYLFPISRPFHSRIVCRFSRNSAEDFLRSRFVICGIVRRCICLARAGCPSRCHSFFSVRCYLRRSMTWSNMQRR*VKSLRTLLLIFFTSFGKSYRSSSATGGIEVRTRIHL*IPYYRYPLAVALLVPPVVVECLGCQRALTRGMIVNRSSSMF*QRPRAANSSRLNFCRLRLAGSIE*YLSLTWV*ILYVLSILLSTTTCPVSLGYCVETQKLLDSQTSFGSLVRCLFQVTMNQPRIISMDMSRERFLG*SSNKQITSLRG*RIVRWSL*RRNLISTE*RTSREGDS*WGTYSLFLCFALSIT*LLGGWQGRVAPLR*TGTISYFGRVPE*LIVGWRVFPLRA*PFLVVRRWWISLTSL*TRACLSVAFQKSTRYHASEQLLSSV*RKAQRASRDDRSRSARASLVTVVLF*EAGG*R*IAELSSIRDAPSRGG*GSIFRTEI*SVPVSGTVRLGISHLRARRHCLLVYLYFAKTVSQKAGLFEVSKN*IRKFDLGGRRSRLCS*SGRGRKRPLMIENRIGRLSLSKIPPRGISLDSSDAGISRGVLVSWVSRHAIPFVF*SLVLRRNFRGGWDCGPFSAILFWHRCPKIRTCCSPPRTTCFISTGNVSIPGTLRQLKRVTYLKRRS*DWVSWSLSWATSTILGE*VTVKP*K*SQEGKASNLGPQYFNLLAHHRGGRLRTDVSTSDRVWFTHCL*RT*AVGFSASERCLRRL*LKESFGRYLTVGPRLVADSLWSGRTSEGR*GKEGTGCTP*CVFTVSQPIAV*RS*NKKVVRASLC*PTPRGVRNAMNSVRSWHRCKILRVNMAVVGTFRRLPERKTQP*RVSVGCCRGLQRTHSVSLHR',
 'AV*ANTMGSLETTTAADTNPSRLRLPFGKTSKCTNYRHVHTQYFASVPRSHRVHGVAHAARCRLTQ*SPHHLFVSTTLDGDGLANGKHALRRATCSFL

In [7]:
# Keep the entries of `res` whose second element is non-empty.
# NOTE(review): presumably `res` held `(path, hits)` tuples when this ran, not the
# AsyncResult handles built by the pool cell -- confirm which `res` this refers to.
[i for i in res if len(i[1])>0]

[]

In [73]:
from subprocess import run
# run(['which','hhblits'])
run(['pwd'])

/home/hugheslab1/zfdeng/pangengraph_2


CompletedProcess(args=['pwd'], returncode=0)

In [74]:
from subprocess import run
# Manual hhblits run on one per-record file (no '-cpu' flag here); stdout/stderr are captured in `o`.
infile='tmp/AAbV||GBBW01007738#2s#11629#11886.fasta'
o=run(['hhblits','-i',infile,'-d','hhs-db/pfam'],capture_output=True)#,'-o','hahaha.hhr'


In [75]:
# Print the hit table found in `o.stdout` as comma-joined fixed-width columns.
# skip_tag stays 1 until the ' No Hit' header line is seen; the header itself is printed too.
skip_tag=1
for i in o.stdout.decode().split('\n'):
    if skip_tag:
        if i.startswith(' No Hit'):
            skip_tag=0
            print(','.join([i[:4],i[4:34],i[34:40],i[40:48],i[48:56],i[56:63],i[63:69],i[69:74],i[74:85],i[85:94],i[94:]]))
    else:
        # rows: longer than 90 chars and Prob column (chars 34-40) above 20
        if len(i)>90 and float(i[34:40].strip())>20:
            print(','.join([i[:4],i[4:34],i[34:40],i[40:48],i[48:56],i[56:63],i[63:69],i[69:74],i[74:85],i[85:94],i[94:]]))

 No ,Hit                           ,  Prob, E-value, P-value,  Score,    SS, Cols, Query HMM , Template, HMM
  1 ,PF15200.9 ; KRTDAP ; Keratinoc,  23.9,     4.4, 0.00084,   26.1,   0.0,   16,   87-102  ,  55-70  ,(77)


In [48]:
o.stderr.decode()


'- 23:13:23.140 INFO: Search results will be written to tmp/AAbV||GBBW01007738#2s#9112#9392.fasta.hhr\n\n- 23:13:23.156 INFO: Searching 19632 column state sequences.\n\n- 23:13:23.217 ERROR: Input file (tmp/AAbV||GBBW01007738#2s#9112#9392.fasta.fasta) could not be opened!\n\n'

In [25]:
print(o.stdout.decode())

Query         name=AAbV||GBBW01007738,transannot=0a,prob=188,proe=342
Match_columns 154
No_of_seqs    1 out of 1
Neff          1
Searched_HMMs 100
Date          Sun Mar 10 22:44:09 2024
Command       hhblits -i tmp/AAbV||GBBW01007738#0a#188#342.fasta -d hhs-db/pfam -o hahaha.hhr 

 No Hit                             Prob E-value P-value  Score    SS Cols Query HMM  Template HMM
  1 PF00424.21 ; REV ; REV protein   5.0      39   0.008   19.4   0.0   36   27-62     50-85  (91)
  2 PF17800.4 ; NPL ; Nucleoplasmi   3.8      49   0.011   20.3   0.0   15   23-37     23-37  (212)
  3 PF06390.15 ; NESP55 ; Neuroend   3.7      65   0.011   22.3   0.0   18   24-41     68-85  (257)
  4 PF10270.12 ; MMgT ; Membrane m   2.7      68   0.017   16.9   0.0   28   40-67     35-62  (120)
  5 PF14233.9 ; DUF4335 ; Domain o   2.6      75   0.018   18.2   0.0   15   42-56      4-18  (191)
  6 PF10880.11 ; DUF2673 ; Protein   2.2 1.3E+02   0.022   17.6   0.0   15   26-40      9-23  (82)
  7 PF19870.2 ; DUF63

In [14]:
infile

PosixPath('_data/genome_fasta/AAbV||GBBW01007738:segs.fasta')

In [13]:
len(open(infile,'r').readlines())

132

In [17]:
from Bio import Entrez
import xmltodict
# Contact address set on Bio.Entrez before the efetch call in `fetch_parse`.
# NOTE(review): a personal e-mail is hard-coded here; consider reading it from the environment before sharing.
Entrez.email='zfevan.deng@mail.utoronto.ca'

def fetch_parse(i:str):
    '''
    i: genbank accession

    output: the 'nuccore' GenBank record of `i`, requested as XML through
            `Entrez.efetch` and converted to a nested dict by `xmltodict.parse`
    '''
    xml_file=Entrez.efetch(db='nuccore',id=i,rettype="gb", retmode="xml")
    try:
        data_dict = xmltodict.parse(xml_file.read())
    finally:
        # release the response handle even when reading/parsing fails
        xml_file.close()
    return data_dict

import pickle as pkl
# Fetch one entry and pickle it; the `with` block closes the file once the pickle is written.
with open('BtCoV#020_16#M.dauFIN#2016||MG923574.pkl','wb') as entry_out:
    pkl.dump(fetch_parse('MG923574'),entry_out)