In [69]:
import numpy as np
import pandas as pd
from os import path
from datetime import datetime
from Bio import SeqIO
from Bio.Blast.Applications import NcbiblastnCommandline



---
## Build local BLAST database (only need to build once)

In [4]:
# One-time setup: uncomment and run these lines to build the local nucleotide
# BLAST databases (one per barcode gene). They are left commented out so that a
# normal "Run All" does not rebuild them.
# No -out is given, so each database is created next to its input FASTA and is
# addressed by that FASTA path (e.g. db='GenbankRef/gbCOI.fasta').
#!makeblastdb -in GenbankRef/gbCOI.fasta -parse_seqids -title COIref -dbtype nucl
#!makeblastdb -in GenbankRef/gbCOX3.fasta -parse_seqids -title COX3ref -dbtype nucl
#!makeblastdb -in GenbankRef/gbCYTB.fasta -parse_seqids -title CYTBref -dbtype nucl
#!makeblastdb -in GenbankRef/gbND2.fasta -parse_seqids -title ND2ref -dbtype nucl
#!makeblastdb -in GenbankRef/gbND4.fasta -parse_seqids -title ND4ref -dbtype nucl



Building a new DB, current time: 03/18/2021 03:15:39
New DB name:   /home/soonjye/Documents/Snowleopard_Github/GenbankRef/gbCOI.fasta
New DB title:  COIref
Sequence type: Nucleotide
Keep Linkouts: T
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 837845 sequences in 21.7983 seconds.


Building a new DB, current time: 03/18/2021 03:16:02
New DB name:   /home/soonjye/Documents/Snowleopard_Github/GenbankRef/gbCOX3.fasta
New DB title:  COX3ref
Sequence type: Nucleotide
Keep Linkouts: T
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 9470 sequences in 0.249865 seconds.


Building a new DB, current time: 03/18/2021 03:16:02
New DB name:   /home/soonjye/Documents/Snowleopard_Github/GenbankRef/gbCYTB.fasta
New DB title:  CYTBref
Sequence type: Nucleotide
Keep Linkouts: T
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 9977 sequences in 0.307142 seconds.


Building a new DB, current time: 03/18/202

---
## Prepare Query file
The query is a FASTA file of sequencing reads.

In [11]:
# Path to the FASTA file of reads to classify. The same string is also used as
# the prefix of the output directory ('<query_file>_blastResults').
query_file = 'CT-4.fa'

---
## Run Blast against Local Reference Databases

In [None]:
!mkdir {query_file}_blastResults

blastx_cline = NcbiblastnCommandline(query=query_file, db='GenbankRef/gbCOI.fasta', evalue=0.001, outfmt="6 qseqid qlen sseqid stitle pident length mismatch gapopen qstart qend sstart send evalue bitscore", out=query_file+'_blastResults/blastCOI.tsv')
print('\nBlasting '+query_file+' against COI database...', end=" ")
t0 = datetime.now()
stdout, stderr = blastx_cline()
t1 = datetime.now()
print('Completed. Runtime: ', t1 - t0, '\n')
                                     
blastx_cline = NcbiblastnCommandline(query=query_file, db='GenbankRef/gbCOX3.fasta', evalue=0.001, outfmt="6 qseqid qlen sseqid stitle pident length mismatch gapopen qstart qend sstart send evalue bitscore", out=query_file+'_blastResults/blastCOX3.tsv')
print('Blasting '+query_file+' against COX3 database...', end=" ")
t0 = datetime.now()
stdout, stderr = blastx_cline()
t1 = datetime.now()
print('Completed. Runtime: ', t1 - t0, '\n')

blastx_cline = NcbiblastnCommandline(query=query_file, db='GenbankRef/gbCYTB.fasta', evalue=0.001, outfmt="6 qseqid qlen sseqid stitle pident length mismatch gapopen qstart qend sstart send evalue bitscore", out=query_file+'_blastResults/blastCYTB.tsv')
print('Blasting '+query_file+' against CYTB database...', end =" ")
t0 = datetime.now()
stdout, stderr = blastx_cline()
t1 = datetime.now()
print('Completed. Runtime: ', t1 - t0, '\n')

blastx_cline = NcbiblastnCommandline(query=query_file, db='GenbankRef/gbND2.fasta', evalue=0.001, outfmt="6 qseqid qlen sseqid stitle pident length mismatch gapopen qstart qend sstart send evalue bitscore", out=query_file+'_blastResults/blastND2.tsv')
print('Blasting '+query_file+' against ND2 database...', end =" ")
t0 = datetime.now()
stdout, stderr = blastx_cline()
t1 = datetime.now()
print('Completed. Runtime: ', t1 - t0, '\n')

blastx_cline = NcbiblastnCommandline(query=query_file, db='GenbankRef/gbND4.fasta', evalue=0.001, outfmt="6 qseqid qlen sseqid stitle pident length mismatch gapopen qstart qend sstart send evalue bitscore", out=query_file+'_blastResults/blastND4.tsv')
print('Blasting '+query_file+' against ND4 database...', end =" ")
t0 = datetime.now()
stdout, stderr = blastx_cline()
t1 = datetime.now()
print('Completed. Runtime: ', t1 - t0, '\n')


mkdir: cannot create directory ‘CT-4.fa_blastResults’: File exists

Blasting CT-4.fa against COI database... 

---
## Parse BLAST results

In [46]:
barcodes = ['COI', 'COX3', 'CYTB', 'ND2', 'ND4']

# Acceptance thresholds for a hit (both are strict lower bounds).
MIN_ALIGN_LEN = 50   # alignment length in bp
MIN_PIDENT = 98      # percent identity

print('Query File =', query_file)
for barcode in barcodes:
    try:
        blastn = pd.read_csv(query_file+'_blastResults/blast'+barcode+'.tsv', sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # An empty result file means this database produced no hits at all;
        # report it and move on instead of aborting the remaining genes.
        print('Parsing blast ' + barcode + ' result \t Number of reads with hits =', 0)
        continue
    blastn.columns = 'qseqid qlen sseqid stitle pident length mismatch gapopen qstart qend sstart send evalue bitscore'.split(' ')
    reads = set(blastn['qseqid'])
    print('Parsing blast ' + barcode + ' result \t Number of reads with hits =', len(reads))

    ## drop hits whose subject title mentions human, bacteria, or fungi
    # astype(str) keeps a missing title from raising; such a row is simply
    # treated as "neither Panthera nor contaminant".
    stitle = blastn['stitle'].astype(str)
    is_contaminant = stitle.str.contains('Homo_sapiens|Bacteria|Fungi')
    kept = blastn[~is_contaminant]

    ## keep the first remaining hit of every consecutive run of the same read
    first_hit = kept[kept['qseqid'] != kept['qseqid'].shift()]

    ## order the rows: Panthera_uncia hits, then other Panthera, then the rest
    first_title = stitle.loc[first_hit.index]
    is_uncia = first_title.str.contains('Panthera_uncia', regex=False)
    is_panthera = first_title.str.contains('Panthera', regex=False) & ~is_uncia
    group_rank = np.where(is_uncia, 0, np.where(is_panthera, 1, 2))
    # A stable sort keeps the original BLAST row order inside each group.
    subset = first_hit.iloc[np.argsort(group_rank, kind='stable')]

    ## keep only hits with >98% identity and >50 bp alignment length
    subset = subset[(subset['length'] > MIN_ALIGN_LEN) & (subset['pident'] > MIN_PIDENT)]

    if subset.empty:
        print('Parsing blast ' + barcode + ' result \t No hit that belongs to Panthera nor others with identity > 98%')
        continue

    ## retrieve read sequence from query_file
    wanted = set(subset['qseqid'])   # set membership: O(1) per FASTA record
    qseqdict = {
        read.id: str(read.seq)
        for read in SeqIO.parse(query_file, 'fasta')
        if read.id in wanted
    }

    # assign() returns a new frame, so no column is written into a slice of
    # `blastn`. A read missing from the FASTA still raises KeyError.
    subset = subset.assign(qseq=[qseqdict[qseqid] for qseqid in subset['qseqid']])
    subset.to_csv(query_file + '_blastResults/blast' + barcode + '-iden98.tsv', sep='\t')


Query File = CT-4.fa
Parsing blastCOX3results 	 Number of reads with hits = 14
Parsing blastCYTBresults 	 Number of reads with hits = 2439
Parsing blastND2results 	 Number of reads with hits = 189
Parsing blastND4results 	 Number of reads with hits = 193


## Extract Top 1 hit for each read

In [60]:
barcodes = ['COI', 'COX3', 'CYTB', 'ND2', 'ND4']

print('Query File =', query_file)
print('Extract top hit of blast result (>98% identity & >50bp overlap)')
for barcode in barcodes:
    print('\tExtracting gene', barcode, '...')
    iden98_path = query_file + '_blastResults/blast' + barcode + '-iden98.tsv'
    # No filtered file means the parsing step found nothing for this gene.
    if not path.exists(iden98_path):    continue
    ident = pd.read_csv(iden98_path, sep='\t', index_col=0)

    ## the index is the row number from the blastn output, which is arranged
    ## by qseqid, then e-value; the smallest index of a read is its top hit.
    # sort=False keeps reads in the order they first appear in the file.
    best1 = (
        ident.index.to_series()
        .groupby(ident['qseqid'].to_numpy(), sort=False)
        .min()
        .tolist()
    )

    ident.loc[best1, :].to_csv(query_file + '_blastResults/blast' + barcode + '-top1.tsv', sep='\t')

print('\tCompleted')

Query File = CT-4.fa
Extract top hit of blast result (>98% identity & >50bp overlap)
	Extracting gene COI ...
	Extracting gene COX3 ...
	Extracting gene CYTB ...
	Extracting gene ND2 ...
	Extracting gene ND4 ...
	Completed


## Counting Prey Species

In [73]:
barcodes = ['COI', 'COX3', 'CYTB', 'ND2', 'ND4']

# species label -> per-barcode hit counts, in the same order as `barcodes`
prey_tally = dict()

print('Query File =', query_file)
print('Counting species from top1 result...')
for idx, barcode in enumerate(barcodes):
    print('\tCounting of gene', barcode, '...')
    top1_path = query_file + '_blastResults/blast' + barcode + '-top1.tsv'
    # A gene without a top1 file contributes zero to every species.
    if not path.exists(top1_path):    continue
    ident = pd.read_csv(top1_path, sep='\t', index_col=0)

    for stitle in ident['stitle']:
        # The species label is the last ';'-separated field of the subject title.
        prey = stitle.split(';')[-1]
        if prey not in prey_tally:
            prey_tally[prey] = [0]*len(barcodes)
        prey_tally[prey][idx] += 1

# Passing `columns` here (instead of assigning them afterwards) also works when
# no gene produced any top hit: the result is an empty table with the barcode
# columns rather than a length-mismatch ValueError.
allprey = pd.DataFrame.from_dict(prey_tally, orient='index', columns=barcodes)
allprey.to_csv(query_file + '_blastResults/spc_count_.tsv', sep='\t')


print('\tCompleted')

Query File = CT-4.fa
Counting species from top1 result...
	Counting of gene COI ...
	Counting of gene COX3 ...
	Counting of gene CYTB ...
	Counting of gene ND2 ...
	Counting of gene ND4 ...
	Complete
