In [2]:
import math
import os
from datetime import datetime
from os import path

import numpy as np
import pandas as pd
from Bio import Entrez
from Bio import SeqIO
from Bio.Blast.Applications import NcbiblastnCommandline
# NCBI E-utilities expect every client to identify itself with a registered email;
# an API key additionally raises the allowed request rate.
# Set ENTREZ_EMAIL / ENTREZ_API_KEY in the environment so credentials are never
# saved inside the notebook, or replace the empty-string fallbacks below.
Entrez.email = os.environ.get('ENTREZ_EMAIL', "")
Entrez.api_key = os.environ.get('ENTREZ_API_KEY', '')


In [2]:
## Inspect the accession table.
## The file can be downloaded from https://www.ncbi.nlm.nih.gov/gene/?term=rbcl

acc_file = 'rbcl_acc.txt'
seq_acc = pd.read_csv(acc_file, delimiter='\t')   # tab-separated export

# Show (rows, columns) first, then preview the first records.
print((len(seq_acc.index), len(seq_acc.columns)))
seq_acc.head()


(17597, 18)


Unnamed: 0,tax_id,Org_name,GeneID,CurrentID,Status,Symbol,Aliases,description,other_designations,map_location,chromosome,genomic_nucleotide_accession.version,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,exon_count,OMIM,Unnamed: 17
0,3055,Chlamydomonas reinhardtii,2717040,0,live,rbcL,ChreCp049,RuBisCO large subunit,"RuBisCO large subunit|ribulose-1,5-bisphosphat...",,,NC_005353.1,122490.0,123917.0,minus,0.0,,
1,4577,Zea mays,845212,0,live,rbcL,ZemaCp032,"ribulose-1,5-bisphosphate carboxylase/oxygenas...","ribulose-1,5-bisphosphate carboxylase/oxygenas...",,,NC_001666.2,56874.0,58304.0,plus,0.0,,
2,4097,Nicotiana tabacum,800513,0,live,rbcL,NitaCp031,RuBisCO large subunit,"RuBisCO large subunit|ribulose-1,5-bisphosphat...",,,NC_001879.2,57600.0,59033.0,plus,0.0,,
3,3702,Arabidopsis thaliana,844754,0,live,rbcL,ArthCp030,"ribulose-1,5-bisphosphate carboxylase/oxygenas...","ribulose-1,5-bisphosphate carboxylase/oxygenas...",,,NC_000932.1,54958.0,56397.0,plus,0.0,,
4,39946,Oryza sativa Indica Group,4126887,0,live,rbcL,OrsaiCp23,"ribulose-1,5-bisphosphate carboxylase/oxygenas...","ribulose-1,5-bisphosphate carboxylase/oxygenas...",,,NC_008155.1,54030.0,55484.0,plus,0.0,,


---
# Retrieve gene sequences and taxonomy from GenBank

In [None]:
## Change the accession datafile name and gene name here.
acc_file = 'rbcl_acc.txt'
gene_name = 'RBCL'

path_fastafile = 'GbRefgene/gb'+gene_name+'.fasta'

# Create the output folder before downloading anything: otherwise the final
# SeqIO.write fails on a fresh checkout and the whole download is lost.
_out_dir = path.dirname(path_fastafile)
if _out_dir:
    os.makedirs(_out_dir, exist_ok=True)


def _next_free_id(acc, taken):
    """Return `acc` if unused, else `acc` with its numeric version bumped until unused.

    acc   -- accession string, normally of the form '<accession>.<version>'
    taken -- set of ids already assigned

    An accession without a numeric '.version' suffix gets '.1', '.2', ... appended.
    """
    base, dot, version = acc.rpartition('.')
    if not (dot and version.isdigit()):
        base, version = acc, '0'
    candidate = acc
    bumped = int(version)
    while candidate in taken:
        bumped += 1
        candidate = base + '.' + str(bumped)
    return candidate


accs = pd.read_csv(acc_file, sep='\t')
t0 = datetime.now()
seqfasta = []        # SeqRecord objects to be written to the FASTA file
ncaccs = []          # ids assigned to those records, in order
_seen_ids = set()    # same ids as ncaccs, kept as a set for fast duplicate checks
hsapiens = 0         # number of rows skipped because the record is Homo sapiens
unknownpos = []      # accessions skipped because start/end position is missing

for idx, eachrow in accs.iterrows():
    ncacc = eachrow['genomic_nucleotide_accession.version']
    print(idx, 'ncacc =', ncacc)
    if pd.isna(ncacc):
        # No genomic accession for this gene entry.
        continue

    startpos = eachrow['start_position_on_the_genomic_accession']
    endpos   = eachrow['end_position_on_the_genomic_accession']
    if pd.isna(startpos) or pd.isna(endpos):
        unknownpos.append(ncacc)
        continue
    # Positions are read as floats; they are used as 1-based inclusive coordinates.
    startpos = int(startpos)
    endpos   = int(endpos)
    # efetch strand: 1 = plus, 2 = minus (returns the reverse complement), so
    # minus-strand genes are stored in their own orientation.
    strand = 2 if eachrow.get('orientation') == 'minus' else 1

    # Restrict both requests to the gene region (seq_start/seq_stop) instead of
    # downloading the complete genomic record twice for every gene.
    handle = Entrez.efetch(db="nucleotide", id=ncacc, rettype="gb", retmode="xml",
                           seq_start=startpos, seq_stop=endpos)
    try:
        gbrec = Entrez.read(handle)[0]
    finally:
        handle.close()
    if 'Homo sapiens' in gbrec['GBSeq_organism']:
        hsapiens += 1
        continue

    handle = Entrez.efetch(db="nucleotide", id=ncacc, rettype="fasta", retmode="text",
                           seq_start=startpos, seq_stop=endpos, strand=strand)
    try:
        ftrec = SeqIO.read(handle, 'fasta')
    finally:
        handle.close()

    # Header text: taxonomy lineage without spaces, then the organism name with underscores.
    ftrec.description = gbrec['GBSeq_taxonomy'].replace(' ', '') + ';' + gbrec['GBSeq_organism'].replace(' ', '_')

    # Several rows can share one genomic accession; ids in the FASTA must be
    # unique, so repeated accessions get their version number bumped.
    ncacc = _next_free_id(ncacc, _seen_ids)
    ftrec.id = ncacc
    _seen_ids.add(ncacc)
    ncaccs.append(ncacc)
    seqfasta.append(ftrec)

    if (idx+1) % 1000 == 0:
        print('Row', idx, '- Number of seq =', len(seqfasta))


t1 = datetime.now()
print('Total time:', t1-t0)
print('Num of Homo sapiens =', hsapiens)
print('Num of seq with no position =', len(unknownpos))
SeqIO.write(seqfasta, path_fastafile, 'fasta')


---
## Prepare the local gene BLAST database (only needs to be built once)

In [2]:
## change -title
## make sure the gene name is the same as previous
gene_name = 'RBCL'

path_fastafile = 'GbRefgene/gb'+gene_name+'.fasta'
dbtitle = gene_name+'ref'
!makeblastdb -in {path_fastafile} -parse_seqids -title {dbtitle} -dbtype nucl




Building a new DB, current time: 08/10/2021 17:41:53
New DB name:   /home/soonjye/Documents/Snowleopard_Github/GbRefgene/gbRBCL.fasta
New DB title:  COX3ref
Sequence type: Nucleotide
Keep Linkouts: T
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 13801 sequences in 0.751754 seconds.
