In [4]:
import os
from datetime import datetime
from os import path

import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Blast.Applications import NcbiblastnCommandline




---
## Build local BLAST database
#### The databases only need to be built once. Skip this section if you have already built them.

In [5]:
## change the name of genes that is going to be built into local database
gene_names = ['COX3', 'CYTB']

for gene_name in gene_names:
    path_fastafile = 'GbRefgene/gb'+gene_name+'.fasta'
    dbtitle = gene_name+'ref'
    !makeblastdb -in {path_fastafile} -parse_seqids -title {dbtitle} -dbtype nucl





Building a new DB, current time: 08/10/2021 17:52:16
New DB name:   /home/soonjye/Documents/Snowleopard_Github/GbRefgene/gbCOX3.fasta
New DB title:  COX3ref
Sequence type: Nucleotide
Keep Linkouts: T
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 9470 sequences in 0.252355 seconds.


Building a new DB, current time: 08/10/2021 17:52:16
New DB name:   /home/soonjye/Documents/Snowleopard_Github/GbRefgene/gbCYTB.fasta
New DB title:  CYTBref
Sequence type: Nucleotide
Keep Linkouts: T
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 9977 sequences in 0.313547 seconds.


---
## Prepare Query file
FASTA file(s) containing the sequencing reads to be used as BLAST queries.

In [6]:
# FASTA files of sequencing reads that will be blasted;
# change these to the filenames of your own query files.
query_files = ["CT-4.fa"]

# Genes whose local reference databases the reads are blasted against;
# change this list to the genes you want to search.
gene_names = ["COX3", "CYTB"]

---
## Run Blast against Local Reference Databases

In [8]:
# Blast every query file against every local reference database.
# One tab-separated result file is written per (query file, gene) pair to
# <query_file>_blastResults/blast<gene>.tsv
for query_file in query_files:

    # Create the per-query results folder. exist_ok=True keeps re-runs quiet
    # (a shell `mkdir` complains when the folder already exists) and the
    # filename is never passed through a shell.
    results_dir = query_file + '_blastResults'
    os.makedirs(results_dir, exist_ok=True)

    for gene_name in gene_names:

        path_fastafile = 'GbRefgene/gb'+gene_name+'.fasta'
        path_blastout  = results_dir+'/blast'+gene_name+'.tsv'

        # The outfmt column list must stay in sync with the column names
        # assigned when the results are parsed later in the notebook.
        blastn_cline = NcbiblastnCommandline(query=query_file, db=path_fastafile, evalue=0.001, outfmt="6 qseqid qlen sseqid stitle pident length mismatch gapopen qstart qend sstart send evalue bitscore", out=path_blastout)
        print('\nBlasting '+query_file+' against', gene_name, 'database...', end=" ")
        t0 = datetime.now()
        stdout, stderr = blastn_cline()
        t1 = datetime.now()
        print('Completed. Runtime: ', t1 - t0)

        # Hits go to `out`, so anything on stderr is a BLAST warning;
        # surface it instead of silently discarding it.
        if stderr:
            print(stderr)

    print('\n')




Blasting CT-4.fa against COX3 database... Completed. Runtime:  0:02:39.165319 


Blasting CT-4.fa against CYTB database... Completed. Runtime:  0:02:47.590053 



---
# Parse Blast results

In [75]:
barcodes = ['COI', 'COX3', 'CYTB', 'ND2', 'ND4']

# Column layout of the BLAST tabular output (must match the outfmt used when blasting)
BLAST_COLUMNS = 'qseqid qlen sseqid stitle pident length mismatch gapopen qstart qend sstart send evalue bitscore'.split(' ')
# Hits whose subject title contains any of these strings are discarded
EXCLUDED_TAXA = ('Homo_sapiens', 'Bacteria', 'Fungi')
# A hit is kept only if alignment length > MIN_ALIGN_LEN and identity > MIN_PIDENT
MIN_ALIGN_LEN = 50
MIN_PIDENT = 98

# Iterate over query_files explicitly instead of relying on the `query_file`
# variable left behind by the BLAST cell, so this cell also works on a fresh
# kernel (without re-running BLAST) and handles every query file.
for query_file in query_files:
    print('Query File =', query_file)
    for barcode in barcodes:
        path_blastout = query_file+'_blastResults/blast'+barcode+'.tsv'

        # A gene that was never blasted, or that produced no hit at all,
        # has no (or an empty) output file: skip it instead of crashing.
        if not path.exists(path_blastout) or path.getsize(path_blastout) == 0:
            print('Parsing blast ' + barcode + ' result \t No blast output found, skipped')
            continue

        # Read ids and titles are forced to str so they compare correctly
        # with the FASTA record ids and the taxon names below.
        blastn = pd.read_csv(path_blastout, sep='\t', header=None, names=BLAST_COLUMNS,
                             dtype={'qseqid': str, 'stitle': str})
        reads = set(blastn['qseqid'])
        print('Parsing blast ' + barcode + ' result \t Number of reads with hits =', len(reads))

        ## drop hits that belong to human, bacteria, or fungi
        excluded = np.zeros(len(blastn), dtype=bool)
        for taxon in EXCLUDED_TAXA:
            excluded |= blastn['stitle'].str.contains(taxon, regex=False, na=False).to_numpy(dtype=bool)
        kept = blastn[~excluded]

        ## keep the first remaining hit of each read
        ## (a row is kept when its read differs from the previous remaining row)
        first_hits = kept[kept['qseqid'] != kept['qseqid'].shift()]

        ## order the hits: Panthera uncia first, then other Panthera, then everything else
        is_puncia   = first_hits['stitle'].str.contains('Panthera_uncia', regex=False, na=False)
        is_panthera = first_hits['stitle'].str.contains('Panthera', regex=False, na=False)
        rank = np.where(is_puncia, 0, np.where(is_panthera, 1, 2))
        subset = first_hits.iloc[np.argsort(rank, kind='stable')]

        ## drop hits with <=98% identity or <=50 bp alignment length
        subset = subset[(subset['length'] > MIN_ALIGN_LEN) & (subset['pident'] > MIN_PIDENT)]

        if len(subset) == 0:
            print('Parsing blast ' + barcode + ' result \t No hit that belongs to Panthera nor others with identity > 98%')
            continue

        ## retrieve read sequence from query_file
        wanted = set(subset['qseqid'])      # set membership keeps the FASTA scan linear
        qseqdict = {read.id: str(read.seq)
                    for read in SeqIO.parse(query_file, 'fasta')
                    if read.id in wanted}

        # A read missing from the FASTA file raises KeyError here, which
        # signals that the BLAST output does not belong to this query file.
        subset = subset.assign(qseq=[qseqdict[qseqid] for qseqid in subset['qseqid']])
        subset.to_csv(query_file + '_blastResults/blast' + barcode + '-iden98.tsv', sep='\t')


Query File = CT-4.fa
Parsing blast COI result 	 Number of reads with hits = 77
Parsing blast COX3 result 	 Number of reads with hits = 14
Parsing blast CYTB result 	 Number of reads with hits = 2439
Parsing blast ND2 result 	 Number of reads with hits = 189
Parsing blast ND4 result 	 Number of reads with hits = 193


## Extract Top 1 hit for each read

In [76]:
barcodes = ['COI', 'COX3', 'CYTB', 'ND2', 'ND4']

# Iterate over query_files explicitly instead of relying on a `query_file`
# variable left over from an earlier cell.
for query_file in query_files:
    print('Query File =', query_file)
    print('Extract top hit of blast result (>98% identity & >50bp overlap)')
    for barcode in barcodes:
        print('\tExtracting gene', barcode, '...')
        path_ident = query_file + '_blastResults/blast' + barcode + '-iden98.tsv'
        if not path.exists(path_ident):    continue
        ident = pd.read_csv(path_ident, sep='\t', index_col=0)

        ## the index is the row number in the blastn output,
        ## which is arranged according to qseqid, then e-value,
        ## so the smallest index of a read is its best hit.
        ## sort=False keeps the reads in the order they appear in the file.
        best1 = (
            ident.index.to_series()
            .groupby(ident['qseqid'].to_numpy(), sort=False)
            .min()
            .to_numpy()
        )

        ident.loc[best1, :].to_csv(query_file + '_blastResults/blast' + barcode + '-top1.tsv', sep='\t')

    print('\tCompleted')

Query File = CT-4.fa
Extract top hit of blast result (>98% identity & >50bp overlap)
	Extracting gene COI ...
	Extracting gene COX3 ...
	Extracting gene CYTB ...
	Extracting gene ND2 ...
	Extracting gene ND4 ...
	Completed


## Counting Prey Species

In [77]:
barcodes = ['COI', 'COX3', 'CYTB', 'ND2', 'ND4']

# Iterate over query_files explicitly instead of relying on a `query_file`
# variable left over from an earlier cell; counts are kept per query file.
for query_file in query_files:
    print('Query File =', query_file)
    print('Counting species from top1 result...')

    # species name -> list of hit counts, one slot per barcode (same order as `barcodes`)
    prey_counts = dict()

    for idx, barcode in enumerate(barcodes):
        print('\tCounting of gene', barcode, '...')
        path_top1 = query_file + '_blastResults/blast' + barcode + '-top1.tsv'
        if not path.exists(path_top1):    continue
        ident = pd.read_csv(path_top1, sep='\t', index_col=0)

        for stitle in ident['stitle']:
            # the species is the last ';'-separated field of the subject title
            prey = stitle.split(';')[-1]
            prey_counts.setdefault(prey, [0]*len(barcodes))[idx] += 1

    # Passing columns= here (rather than assigning them afterwards) also works
    # when no species was counted, producing an empty table with the right header.
    allprey = pd.DataFrame.from_dict(prey_counts, orient='index', columns=barcodes)
    allprey.to_csv(query_file + '_blastResults/spc_count_.tsv', sep='\t')

    print('\tCompleted')

Query File = CT-4.fa
Counting species from top1 result...
	Counting of gene COI ...
	Counting of gene COX3 ...
	Counting of gene CYTB ...
	Counting of gene ND2 ...
	Counting of gene ND4 ...
	Completed
