In [None]:
import math
import os
from datetime import datetime
from os import path

import numpy as np
import pandas as pd
from Bio import Entrez
from Bio import SeqIO
from Bio.Blast.Applications import NcbiblastnCommandline
# NCBI credentials for the Entrez requests made below.
# Register an email address and an API key with NCBI, then export them as the
# environment variables ENTREZ_EMAIL and ENTREZ_API_KEY before starting Jupyter,
# so that the credentials are never saved inside the notebook.
# With the variables unset, the email falls back to the empty placeholder.
Entrez.email = os.environ.get('ENTREZ_EMAIL', '')
# An empty key is turned into None (the library default) so no blank key is sent.
Entrez.api_key = os.environ.get('ENTREZ_API_KEY', '') or None


In [None]:
## Quick look at the accession table before running the retrieval step.
## The table is a tab-separated file downloaded from NCBI Gene, e.g.
## https://www.ncbi.nlm.nih.gov/gene/?term=rbcl
## NOTE(review): the example URL searches for rbcl while the file below is named
## for cytb -- presumably the search term should be the gene of interest; confirm.

acc_file = 'GBgene/cytb_acc.txt'
seq_acc = pd.read_table(acc_file)   # read_table splits on tabs by default
print(seq_acc.shape)
seq_acc.head()


---
# Retrieve Gene seq and taxonomy from GenBank

In [None]:
## Download every gene region listed in the accession table from GenBank and
## write the regions, labelled with their taxonomy, to one FASTA file.
acc_file = 'GBgene/cytb_acc.txt'
gene_name = 'CYTB'
## change accession datafile name and gene name

path_fastafile = 'GbRefgene/gb'+gene_name+'.fasta'

## Row index to start from. Keep 0 for a complete run from a fresh kernel.
## Set it to a later row index ONLY to resume an interrupted run in the same
## kernel session: the accumulators below are then kept instead of being reset.
start_row = 0

accs = pd.read_csv(acc_file, sep='\t')

if start_row == 0:
    t0 = datetime.now()
    seqfasta = []      # sequence records that will be written to the FASTA file
    ncaccs = []        # record ids already assigned in seqfasta
    hsapiens = 0       # number of Homo sapiens records skipped
    unknownpos = []    # accessions without a start or end position
else:
    # Resuming relies on state left over from the interrupted run; fail early
    # with a clear message instead of a NameError in the middle of the loop.
    _missing = [name for name in ('t0', 'seqfasta', 'ncaccs', 'hsapiens', 'unknownpos')
                if name not in globals()]
    if _missing:
        raise RuntimeError('Cannot resume from row ' + str(start_row)
                           + ': missing state ' + ', '.join(_missing)
                           + '. Set start_row = 0 for a full run.')

# Create the output folder before downloading so a long run cannot fail at the
# final write because the folder does not exist.
_outdir = path.dirname(path_fastafile)
if _outdir:
    os.makedirs(_outdir, exist_ok=True)

# Set mirror of ncaccs so the duplicate check does not scan the list on every row.
used_ids = set(ncaccs)

for idx, eachrow in accs.iterrows():
    if idx < start_row:    continue
    ncacc    = eachrow['genomic_nucleotide_accession.version']
    print(idx, 'ncacc =', ncacc)
    # pandas reads an empty accession cell as float NaN; skip anything that is not text
    if not isinstance(ncacc, str):    continue

    startpos = eachrow['start_position_on_the_genomic_accession']
    endpos   = eachrow['end_position_on_the_genomic_accession']
    if pd.isna(startpos) or pd.isna(endpos):
        unknownpos.append(ncacc)
        continue

    # GenBank record (XML) supplies the organism name and the taxonomy lineage.
    handle = Entrez.efetch(db="nucleotide", id=ncacc, rettype="gb", retmode="xml")
    try:
        gbrec = Entrez.read(handle)[0]
    finally:
        handle.close()
    if 'Homo sapiens' in gbrec['GBSeq_organism']:
        hsapiens += 1
        continue

    # FASTA record supplies the sequence of the whole genomic accession.
    handle = Entrez.efetch(db="nucleotide", id=ncacc, rettype="fasta", retmode="text")
    try:
        ftrec = SeqIO.read(handle, 'fasta')
    finally:
        handle.close()

    # Cut the gene region out of the genomic sequence.
    # assumes the table positions are 1-based and inclusive -- TODO confirm
    # NOTE(review): only the start/end columns are read; strand orientation is
    # not taken into account here -- confirm that is acceptable downstream.
    startpos = int(startpos) - 1
    endpos   = int(endpos)
    ftrec.seq = ftrec.seq[startpos:endpos]
    ftrec.description = gbrec['GBSeq_taxonomy'].replace(' ', '') + ';' + gbrec['GBSeq_organism'].replace(' ', '_')

    # The same accession can occur in several rows. Keep record ids unique by
    # bumping the numeric version suffix until the id is unused. The whole
    # version number is incremented (…19 -> …20), not just its last character.
    base, dot, version = ncacc.rpartition('.')
    if not (dot and version.isdigit()):
        # No numeric ".version" suffix: start one instead of altering the accession.
        base, dot, version = ncacc, '.', '0'
    while ncacc in used_ids:
        version = str(int(version) + 1)
        ncacc = base + dot + version
    ftrec.id = ncacc
    ncaccs.append(ncacc)
    used_ids.add(ncacc)
    seqfasta.append(ftrec)

    if (idx+1) % 1000 == 0:
        print('Row', idx, '- Number of seq =', len(seqfasta))


t1 = datetime.now()
print('Total time:', t1-t0)
print('Num of Homo sapiens =', hsapiens)
print('Num of seq with no position =', len(unknownpos))
SeqIO.write(seqfasta, path_fastafile, 'fasta')


---
## Prepare the local gene BLAST database (only needs to be built once)

In [None]:
## change -title
## make sure the gene name is the same as previous
gene_name = 'CYTB'

path_fastafile = 'GbRefgene/gb'+gene_name+'.fasta'
dbtitle = gene_name+'ref'
!makeblastdb -in {path_fastafile} -parse_seqids -title {dbtitle} -dbtype nucl
