In [1]:
import pandas as pd
# Load the VMR table from VMR_MSL38_v2.csv; the last expression displays its column names.
vmr = pd.read_csv('VMR_MSL38_v2.csv')
vmr.columns

Index(['Sort', 'Isolate Sort', 'Realm', 'Subrealm', 'Kingdom', 'Subkingdom',
       'Phylum', 'Subphylum', 'Class', 'Subclass', 'Order', 'Suborder',
       'Family', 'Subfamily', 'Genus', 'Subgenus', 'Species',
       'Exemplar or additional isolate', 'Virus name(s)',
       'Virus name abbreviation(s)', 'Virus isolate designation',
       'Virus GENBANK accession', 'Virus REFSEQ accession', 'Genome coverage',
       'Genome composition', 'Host source'],
      dtype='object')

In [None]:
#TODO logging control!

Zika virus: Sort=7210-7213, Genus=Orthoflavivirus, Family=Flaviviridae
Ebola virus: Sort=8655, Genus=Orthoebolavirus, Family=Filoviridae
Measles virus: Sort=8828, Genus=Morbillivirus, Family=Paramyxoviridae


In [2]:
# Number of VMR rows for every family / genus of interest, printed as `<taxon>,<row count>`.
rank_queries = {
    'Family': ['Flaviviridae', 'Filoviridae', 'Paramyxoviridae'],
    'Genus': ['Orthoflavivirus', 'Orthoebolavirus', 'Morbillivirus'],
}
for rank, taxa in rank_queries.items():
    print(f'{rank}:')
    for taxon in taxa:
        n_rows = len(vmr[vmr[rank] == taxon])
        print(f'{taxon},{n_rows}')

Family:
Flaviviridae,148
Filoviridae,16
Paramyxoviridae,87
Genus:
Orthoflavivirus,79
Orthoebolavirus,6
Morbillivirus,7


## S1: 

*-subset: subset of vmr with two additional columns, "true_access" and "true_name";

*-fetch_names.list: (list of scan json)

check: Step1: 2_3.ipynb, do_efetch.py


In [3]:
from typing import List,Dict
from warnings import warn
def robust_split(s:str,split:str)->List[str]:
    "split `s` on the separator `split` and strip the whitespace around every piece"
    pieces=s.split(split)
    return list(map(str.strip,pieces))

def get_genbank_id(accession:str)->Dict[str,str]:
    '''
    Parse an accession string into a `{label: accession}` dict.

    Pieces are separated by ';', an optional label is separated from its
    accession by the first ':' of the piece:

    - 'AB123'          -> {'_': 'AB123'}             (single, unlabelled)
    - 'L: AB1; S: AB2' -> {'L': 'AB1', 'S': 'AB2'}   (labelled pieces)
    - 'AB1; AB2'       -> {'0': 'AB1', '1': 'AB2'}   (unlabelled pieces get a counter key)

    Whitespace around labels and accessions is stripped.
    Empty pieces (e.g. from a trailing ';') are skipped when the string holds
    several pieces, so they never become an empty accession.
    Only the first ':' of a piece separates label and value, so a value that
    itself contains ':' no longer raises on unpacking.

    Raises `AssertionError` when `accession` is not a `str`.
    '''
    assert isinstance(accession,str),f'accession: {accession} is not a str!'
    o:Dict[str,str]={}
    pieces=[p.strip() for p in accession.split(';')]
    has_several=len(pieces)>1   # True exactly when ';' occurs in `accession`
    holder_token=0
    for sub_a in pieces:
        if has_several and not sub_a:
            # empty piece left behind by a trailing or doubled ';'
            continue
        subk,sep,subv=sub_a.partition(':')
        if sep:
            o[subk.strip()]=subv.strip()
        elif has_several:
            # unlabelled piece of a multi-piece string: numbered placeholder key
            o[f'{holder_token}']=sub_a
            holder_token+=1
        else:
            # the '_' key marks the single-accession form
            o['_']=sub_a
    return o

def get_correct_name(name:str)->str:
    '''
    Return a single usable name from a name block.

    Some name blocks contain several names separated by ';'. In that case a
    warning is emitted and the first non-empty name (whitespace stripped) is
    returned; if every piece is empty the (empty) first piece is returned.
    A block without ';' is returned unchanged.

    Raises `AssertionError` when `name` is not a `str`.
    '''
    assert isinstance(name,str),f'invalid input: {name}'
    if ';' not in name:
        return name
    warn(f'multiple names: {name},use the first one')
    candidates=[n.strip() for n in name.split(';')]
    # skip empty leading pieces such as in ';ZIKV'
    return next((n for n in candidates if n),candidates[0])
    
def is_multiple_access(access:Dict[str,str])->bool:
    ''' `False`: single accession (stored under the '_' key); `True`: several segments '''
    return '_' not in access
    
def get_file_stem(virus_series:pd.Series,
    access_col='true_access',name_col='true_name')->List[str]:
    '''
    Build the file stems for one row.

    single accession:  ['<name>||<accession>']
    several segments:  ['<name>|<segment key>|<accession>', ...], one per dict item
    '''
    access:Dict[str,str]=virus_series[access_col]
    name=virus_series[name_col]
    if is_multiple_access(access):
        return [f"{name}|{seg}|{acc}" for seg,acc in access.items()]
    return [f"{name}||{access['_']}"]
    

In [28]:
from pathlib import Path
from subprocess import run

def extract_subset(col:str='Family',kw:str='Flaviviridae',
                   odir='zika_subset',all_scan='scan_result',
                   vmr:pd.DataFrame=vmr):
    '''
    Select the rows of `vmr` where `vmr[col]==kw` and copy their scan results.

    `col`,`kw`: selection condition on `vmr`

    `odir`: output directory (created if needed), recommended suffix of '_subset'

    `all_scan`: directory searched for `<stem>:genome.json` and `<stem>:segs.json`

    `vmr`: table to select from; the default is the `vmr` object that existed
    when this cell was run

    Files written next to `odir` (same stem):

    - `<odir>.csv`:  the selected rows plus the columns `true_access`,`true_name`
    - `<odir>.err`:  one line per expected json that was not found or could not be copied
    - `<odir>.list`: sorted names of every entry present in `odir` after copying
    '''
    # function-scope import: keeps this cell runnable without touching the import lines
    import shutil

    odir,all_scan=Path(odir),Path(all_scan)
    odir.mkdir(mode=0o777,exist_ok=True)   # 0o777 == 511
    sub_list=vmr[vmr[col]==kw].copy(deep=True)
    sub_list['true_access']=sub_list['Virus GENBANK accession'].apply(get_genbank_id)
    sub_list['true_name']=sub_list['Virus name abbreviation(s)'].apply(get_correct_name)

    #save subset.csv
    file_stems=[]
    for _,s in sub_list.iterrows():
        file_stems.extend(get_file_stem(s))
    sub_list.to_csv(odir.with_suffix('.csv'),index=False)

    #write hard copy of subset
    err_file=odir.with_suffix('.err')
    with open(err_file,'w') as f:
        for stem in file_stems:
            for kind in ('genome','segs'):
                src=all_scan/f'{stem}:{kind}.json'
                if not src.is_file():
                    print(f'not found: {src}', file=f)
                    continue
                try:
                    # in-process copy instead of spawning `cp`, whose failures were ignored
                    shutil.copy(src,odir/src.name)
                except OSError as e:
                    # stay best-effort: record the failure and continue with the next file
                    print(f'copy failed: {src} ({e})', file=f)

    #write subset list
    list_file=odir.with_suffix('.list')
    with open(list_file,'w') as f:
        # sorted: directory iteration order is arbitrary
        for name in sorted(p.name for p in odir.iterdir()):
            print(name,file=f)
        
    
    


In [27]:
# short virus label -> family used as selection keyword
family_dict=dict(
    zika='Flaviviridae',
    ebola='Filoviridae',
    measles='Paramyxoviridae',
)
for virus,family in family_dict.items():
    extract_subset(kw=family,odir=f'{virus}_subset')

## S2: 

*domains.csv: every domain in the subset entries

check: Step1:extract_domains.py


*accession-annotation.csv: the unique accession-annotation col in domains.csv

check: Step2:scatter_plot.py

In [40]:
from glob import glob
from typing import Generator,Tuple
import json
import pandas as pd
def iter_match(genome_dict:dict)->Generator[Tuple[dict,dict],None,None]:
    '''
    Lazily yield every `(orf, match)` pair of a scan result json dict (of nt):
    for each entry of `results[0]['openReadingFrames']`, each entry of its
    `['protein']['matches']`.
    '''
    orfs=genome_dict['results'][0]['openReadingFrames']
    for orf in orfs:
        yield from ((orf,match) for match in orf['protein']['matches'])
def get_domains(file_limit='nido_subset/*:genome.json',
                ostem='nido-domains'):
    '''
    Collect every PFAM hit from the files matching the glob `file_limit`.

    Writes two files:

    - `${ostem}.csv`, one row per match location, with the columns
      genome_name,genome_length,domain_accession,strand,
      start,end,hmmStart,hmmEnd,evalue,domain_annotation
    - `${ostem}-acan.csv`, one row per distinct `domain_accession` with the
      columns accession,annotation (annotation of its first occurrence)

    genome_name: file name without the ':genome.json' ending

    domain_accession: the signature accession (interproscan id)

    domain_annotation: '<signature name>:<signature description>'

    start,end: ORF coordinate shifted by 3 * the match location;
    for 'SENSE' ORFs counted from the ORF start, otherwise back from the ORF end.
    NOTE(review): no +-1 correction is applied; confirm the coordinate
    convention of the scan json before relying on exact positions.

    Files are processed in sorted order so the output is reproducible.
    When nothing matches, both csv files are still written (header only).
    '''
    columns=['genome_name','genome_length','domain_accession','strand',
             'start','end','hmmStart','hmmEnd','evalue','domain_annotation']
    o=[]
    for gfile in sorted(glob(file_limit)):
        # file name only, so patterns with deeper directories work too
        genome_name=Path(gfile).name.replace(':genome.json','')
        with open(gfile) as f:
            genome_dict=json.load(f)
        genome_length=len(genome_dict['results'][0]['sequence'])
        for orf,match in iter_match(genome_dict):
            signature=match['signature']
            if signature['signatureLibraryRelease']['library']!='PFAM': #'PROSITE_PROFILES'
                continue
            annotation=f"{signature['name']}:{signature['description']}"
            for loc in match['locations']:
                if orf['strand']=='SENSE':
                    start=orf['start']+loc['start']*3
                    end=orf['start']+loc['end']*3
                else:
                    start=orf['end']-loc['end']*3
                    end=orf['end']-loc['start']*3
                o.append({
                    'genome_name':genome_name,
                    'genome_length':genome_length,
                    'domain_accession':signature['accession'],
                    'strand':orf['strand'],
                    'start':start,
                    'end':end,
                    'hmmStart':loc['hmmStart'],
                    'hmmEnd':loc['hmmEnd'],
                    'evalue':loc['evalue'],
                    'domain_annotation':annotation,
                })

    # explicit columns: an empty hit list still gives a frame with the expected header
    domains=pd.DataFrame(o,columns=columns)
    opath=Path(ostem).with_suffix('.csv')
    domains.to_csv(opath,index=False)

    # first annotation seen for each accession, in order of first appearance
    domain_annotations=(
        domains[['domain_accession','domain_annotation']]
        .drop_duplicates('domain_accession')
        .rename(columns={'domain_accession':'accession',
                         'domain_annotation':'annotation'})
    )
    domain_annotations.to_csv(Path(ostem+'-acan').with_suffix('.csv'),index=False)

In [41]:
# short virus label -> family
family_dict=dict(
    zika='Flaviviridae',
    ebola='Filoviridae',
    measles='Paramyxoviridae',
)

# one domain table per `<label>_subset` directory
for virus in family_dict:
    get_domains(file_limit=f'{virus}_subset/*:genome.json',
                ostem=f'{virus}-domains')

## S3:
*sort.list

check Step1: extract_domains.py


In [44]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from glob import glob
import json
from typing import Generator,Tuple,Optional,Iterable
import pandas as pd
from Bio import Align

def iter_match(genome_dict:dict)->Generator[Tuple[dict,dict],None,None]:
    '''
    same iter_match as `extract_domains.py`,
    put here for convenience:
    yields every (orf, match) pair of the first result in `genome_dict`
    '''
    first_result=genome_dict['results'][0]
    for orf in first_result['openReadingFrames']:
        protein_matches=orf['protein']['matches']
        for match in protein_matches:
            yield orf,match

def get_fasta(accession:str='PF00680',file_limit='nido_subset/*:genome.json'):
    '''
    Extract the protein sub-sequences matched by one signature accession.

    file_limit: glob restricting the files to search for the accession;
    accession: in each file, the signature accession to fetch;

    returns `(o_dict,o)`:
    `o_dict` maps record name -> sequence,
    `o` lists the same records as fasta text ('>' + name, newline, sequence).

    The record name is the file name without ':genome.json'; when a match
    has several locations, '#<index>' is appended per location.
    Files are visited in sorted order so the record order is reproducible.

    NOTE(review): if the same accession matches in several ORFs of one genome
    the record names collide and `o_dict` keeps only the last one (`o` keeps all).
    '''
    o=[]
    o_dict={}
    for gfile in sorted(glob(file_limit)):
        # file name only, so patterns with deeper directories work too
        genome_name=Path(gfile).name.replace(':genome.json','')
        with open(gfile) as f:
            genome_dict=json.load(f)
        for orf,match in iter_match(genome_dict):
            if match['signature']['accession']!=accession:
                continue
            seq=orf['protein']['sequence']
            locations=match['locations']
            for i,loc in enumerate(locations):
                # NOTE(review): slice kept as seq[start:end]; if the json positions
                # are 1-based inclusive this drops the first residue - TODO confirm
                rdrp_seq=seq[loc['start']:loc['end']]
                rdrp_name=genome_name if len(locations)==1 else genome_name+'#'+str(i)
                o.append(f'>{rdrp_name}\n{rdrp_seq}')
                o_dict[rdrp_name]=rdrp_seq
    return o_dict,o

def write_fasta(file_limit:str='nido_subset/*:genome.json',
                odir:str='cov19-hits',suffix:str='',
                used_domain:Iterable[str]=['PF00680'],
                ):
    '''
    Write one fasta file per domain accession.

    `file_limit`: glob deciding which json files are searched
    `odir`: directory receiving the fasta files (created if missing)
    `suffix`: inserted between the accession and '.fasta' in the file name
    `used_domain`: accessions to extract; each gives `<odir>/<accession><suffix>.fasta`
    '''
    hits_by_domain={}
    target_dir=Path(odir)
    target_dir.mkdir(mode=511,exist_ok=True)
    # first collect every domain, then write, so nothing is written if a lookup fails
    for domain in used_domain:
        hits_by_domain[domain]=get_fasta(domain,file_limit)[0]
    for domain,records in hits_by_domain.items():
        entries=[]
        for header,sequence in records.items():
            entries.append(f'>{header}\n{sequence}')
        fasta_path=target_dir/f'{domain}{suffix}.fasta'
        with open(str(fasta_path),'w') as handle:
            handle.write('\n'.join(entries))

In [46]:
# short virus label -> file stem of its reference genome ('<name>||<accession>')
ref_genome_dict = {
    'zika': 'ZIKV||AY632535',
    'ebola': 'EBOV||AF086833',
    'measles': 'MeV||AB016162',
}

In [47]:
for virus,ref_stem in ref_genome_dict.items():
    fasta_dir=f'{virus}_fasta'
    acan_csv=f'{virus}-domains-acan.csv'
    # every genome of the subset -> <accession>.fasta
    write_fasta(file_limit=f'{virus}_subset/*:genome.json',
                odir=fasta_dir,
                used_domain=pd.read_csv(acan_csv)['accession'])
    # the reference genome only -> <accession>-ref.fasta
    write_fasta(file_limit=f'{virus}_subset/{ref_stem}:genome.json',
                odir=fasta_dir,
                suffix='-ref',
                used_domain=pd.read_csv(acan_csv)['accession'])

In [49]:
from subprocess import run,PIPE

def cal_diamond_identity(wdir:str='cov19-hits',
                     ref_suffix:str='-ref',
                     diamond:str='./diamond'):
    '''
    For every `<stem>.fasta` in `wdir` whose stem does not contain `ref_suffix`:

    1. `diamond makedb` builds the database `<stem>-reference` from it;
    2. `diamond blastp` runs `<stem><ref_suffix>.fasta` as query against that
       database and writes `<stem>-match.tsv`.

    `wdir`: directory holding the fasta files; results are written there too
    `ref_suffix`: marks the query fasta that belongs to each stem
    `diamond`: path of the diamond executable

    Problems are reported with a warning instead of being dropped silently:
    a missing query fasta skips that stem, a non-zero diamond exit status is
    reported together with diamond's stderr. Returns `None`.
    '''
    from warnings import warn

    # sorted() takes a snapshot first: diamond adds files to `wdir` while we loop
    for i in sorted(Path(wdir).iterdir()):
        stem=i.stem
        if ref_suffix in stem or i.suffix!='.fasta':
            continue
        query=i.with_name(f'{stem}{ref_suffix}.fasta')
        if not query.is_file():
            warn(f'query fasta not found: {query}, skip {i.name}')
            continue
        database=i.with_name(f'{stem}-reference')
        made=run([diamond,
                    'makedb',
                    '--in',
                    str(i),
                    '-d',
                    str(database)],
                    stdout=PIPE,
                    stderr=PIPE)
        if made.returncode!=0:
            warn(f'diamond makedb failed for {i.name}: '
                 f"{made.stderr.decode(errors='replace').strip()}")
            continue
        hit=run([diamond,'blastp',
                '-d',str(database),
                    '-q',str(query),
                    '-o',str(i.with_name(f'{stem}-match.tsv')),
                    '--id', '0' ,
                    '--max-target-seqs', '300',
                    '--header', 'verbose',
                    '--min-score', '0',
                    '--query-cover', '0',
                    '--subject-cover', '0',
                    '--evalue','1'],
                    stdout=PIPE,
                    stderr=PIPE)
        if hit.returncode!=0:
            warn(f'diamond blastp failed for {i.name}: '
                 f"{hit.stderr.decode(errors='replace').strip()}")

In [50]:
# run diamond in every `<label>_fasta` directory
for virus in ref_genome_dict:
    cal_diamond_identity(f'{virus}_fasta')