In [1]:
from Bio import Entrez
Entrez.email = 'c.du@biology.leidenuniv.nl'
import os

In [2]:
# State shared with the esearch paging loop in the next cell.
targets = []         # one list of IDs per page of search results
step = 2000          # page size, passed to esearch as retmax
count = c = 0        # total number of hits (0 until known) and current offset
webaccession = None  # WebEnv string of the first query; None until it has run

In [3]:
# Page through the esearch results, `step` IDs at a time, and store each page
# of IDs as one list inside `targets`.
#
# The loop is driven by `webaccession` instead of `count == 0`: the first
# iteration always runs (no WebEnv yet), and afterwards we continue only while
# there are hits left. Testing `count == 0` would spin forever on a query that
# returns no hits at all.
while webaccession is None or c < count:
    print(f'No.{int(c/step+1)}', end = '>')
    handle = Entrez.esearch(db='nuccore', 
                            term='txid85011[Organism] AND refseq[filter]',
                            # txid85011 = Lineage (full): 
                            #     root; cellular organisms; Bacteria;
                            #         Terrabacteria group; Actinobacteria; Actinobacteria
                            usehistory=True,
                            webenv=webaccession, # reuse the first query
                            retstart=c, # Continue from num c
                            retmax=step # Maximum number returned
                           )
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()
    
    if 'WarningList' in record: # Surface anything the server complained about
        print(record['WarningList']['OutputMessage'])
    
    # Put current list of IDs in our targets list
    targets.append(record['IdList'])
    print(f"{len(record['IdList'])}", end = '|')  # the number got from this loop.
    
    if webaccession is None:
        # First response only: remember the WebEnv and the total hit count.
        webaccession = record["WebEnv"]
        print(f'\nWebAccession:\n{webaccession}')
        count = int(record['Count']) # total number of hits get from the first search attempt
        print(f"Total entry {count}.")    

    c += step

# now print and check the first 5 IDs of the first 10 lists of IDs
print(f"\nTotal target sets {len(targets)}\nFirst 10:")
for targ in targets[:10]:
    print(f"{targ[:5]}...")
print('...')

No.1>2000|
WebAccession:
NCID_1_77863634_130.14.18.34_9001_1537132678_1127887493_0MetA0_S_MegaStore
Total entry 278135.
No.2>2000|No.3>2000|No.4>2000|No.5>2000|No.6>2000|No.7>2000|No.8>2000|No.9>2000|No.10>2000|No.11>2000|No.12>2000|No.13>2000|No.14>2000|No.15>2000|No.16>2000|No.17>2000|No.18>2000|No.19>2000|No.20>2000|No.21>2000|No.22>2000|No.23>2000|No.24>2000|No.25>2000|No.26>2000|No.27>2000|No.28>2000|No.29>2000|No.30>2000|No.31>2000|No.32>2000|No.33>2000|No.34>2000|No.35>2000|No.36>2000|No.37>2000|No.38>2000|No.39>2000|No.40>2000|No.41>2000|No.42>2000|No.43>2000|No.44>2000|No.45>2000|No.46>2000|No.47>2000|No.48>2000|No.49>2000|No.50>2000|No.51>2000|No.52>2000|No.53>2000|No.54>2000|No.55>2000|No.56>2000|No.57>2000|No.58>2000|No.59>2000|No.60>2000|No.61>2000|No.62>2000|No.63>2000|No.64>2000|No.65>2000|No.66>2000|No.67>2000|No.68>2000|No.69>2000|No.70>2000|No.71>2000|No.72>2000|No.73>2000|No.74>2000|No.75>2000|No.76>2000|No.77>2000|No.78>2000|No.79>2000|No.80>2000|No.81>2000|No.82>20

# Fetching all records from NCBI (using the IDs stored from a previous search)

In [4]:
import pickle
import lzma
from datetime import date

In [8]:
# Save the search result (`targets`) as an xz-compressed pickle whose file name
# carries today's date, then show the file name as the cell output.
targetsPickle = f'refseq_strep_{date.today().strftime("%Y%m%d")}.pickle.xz'
with lzma.open(targetsPickle, 'wb') as xz_out:
    pickle.dump(targets, xz_out)
targetsPickle

'refseq_strep_20180916.pickle.xz'

In [9]:
# Reload `targets` from the pickle written by an earlier search.
# `dateOfSearch` must match the date embedded in an existing file name; the
# recorded outputs show the dump cell wrote 'refseq_strep_20180916.pickle.xz'
# while this cell looked for 20170902 and raised FileNotFoundError.
# NOTE(review): pickle.load can execute arbitrary code - only load files that
# this notebook produced itself.
dateOfSearch = 20170902
with lzma.open(f'refseq_strep_{dateOfSearch}.pickle.xz', 'rb') as xz_in:
    targets = pickle.load(xz_in)

FileNotFoundError: [Errno 2] No such file or directory: 'refseq_strep_20170902.pickle.xz'

In [6]:
# Flatten `targets` (a list of ID lists) into `nucids`, dropping duplicates.
#
# The order of `nucids` matters: the download cell slices it by group index and
# is meant to be resumed across sessions. list(set(...)) would give a different
# order in every kernel session (string hashing is randomised per process), so
# duplicates are removed while keeping the first-seen order instead.
nucids = []
total_nucid = 0
seen_nucids = set()  # IDs already appended to `nucids`

for i, targ in enumerate(targets): # flatten the targets list of list
    print(f'No.{i+1}', end = '|')
    for nucid in targ:
        if nucid not in seen_nucids:
            seen_nucids.add(nucid)
            nucids.append(nucid)

print(f"\nTotal nucids {len(nucids)}")

No.1|No.2|No.3|No.4|No.5|No.6|No.7|No.8|No.9|No.10|No.11|No.12|No.13|No.14|No.15|No.16|No.17|No.18|No.19|No.20|No.21|No.22|No.23|No.24|No.25|No.26|No.27|No.28|No.29|No.30|No.31|No.32|No.33|No.34|No.35|No.36|No.37|No.38|No.39|No.40|No.41|No.42|No.43|No.44|No.45|No.46|No.47|No.48|No.49|No.50|No.51|No.52|No.53|No.54|No.55|No.56|No.57|No.58|No.59|No.60|No.61|No.62|No.63|No.64|No.65|No.66|No.67|No.68|No.69|No.70|No.71|No.72|No.73|No.74|No.75|No.76|No.77|No.78|No.79|No.80|No.81|No.82|No.83|No.84|No.85|No.86|No.87|No.88|No.89|No.90|No.91|No.92|No.93|No.94|No.95|No.96|No.97|No.98|No.99|No.100|No.101|No.102|No.103|No.104|No.105|No.106|No.107|No.108|No.109|No.110|No.111|No.112|No.113|No.114|No.115|No.116|No.117|
Total nucids 232772


In [23]:
# Number of IDs requested per efetch call; each group is written to its own
# stre_No_XXXX.gb file. Group i covers nucids[i*downloadStep:(i+1)*downloadStep],
# so changing this value between sessions shifts which IDs belong to which file.
downloadStep = 100 # this should not be changed during download
# Index of the first group to fetch. The download cell sets it to `end` when it
# finishes, so re-running this cell restarts the download from group 0.
start = 0 # should start from 0 if nothing has been downloaded

In [25]:
from math import ceil
from time import strftime

# Download the GenBank records for groups [start, end) of `nucids`.
# Each group holds `downloadStep` IDs and is written to stre_No_XXXX.gb inside
# `gbfilePath`; progress is echoed to the screen and appended to fetching.log.
# `start` is advanced after every completed group, so if a request fails the
# cell can simply be re-run and continues with the first group that is missing.

stepEnd = ceil(len(nucids)/downloadStep) # this is the last end number (ceil makes this number exclusive)
print(f'Last group No. {str(stepEnd-1).zfill(4)}')

numToFetch = 2328

# range(start, end) does not include end; never go past the last group.
end = min(start + numToFetch, stepEnd)

print(f'Now fetching groups from {start} to {end-1}')

gbfilePath = '/mnt/d/WORKs/temp/downloadingGenomes/'
log_file = f'{gbfilePath}fetching.log'
timestamp = strftime('%X %d/%m/%Y %Z')

# write log file
with open(log_file,'a') as log_handle:
    log_handle.write(f'\n{timestamp:*^50}\nFrom {start*downloadStep+1} to {end*downloadStep+1-1} (inclusive)\n{"":*^50}\n')

for i in range(start, end):
    # Decide range of ids to fetch (slicing clamps at the end of the list)
    id_start = i*downloadStep
    ids = nucids[id_start:id_start+downloadStep]
    file_Nu = str(i).zfill(4)
    
    # Write note to screen and log file
    logstr = f"Fetching {file_Nu}: {ids[:4]}...({len(ids)})"
    print(logstr)
    with open(log_file,'a') as log_handle:
        log_handle.write(f'{logstr}\n')
    
    output_file = f'{gbfilePath}stre_No_{file_Nu}.gb'
    # Download into a temporary name first: if the request or the write fails
    # half way, no truncated file is left behind under the final .gb name.
    partial_file = f'{output_file}.part'
    # Fetching...
    with Entrez.efetch(db = 'nuccore',
                       id = ids,
                       rettype = 'gbwithparts',
                       retmode = 'text'
                      ) as handle:
        with open(partial_file, 'w') as out_handle:
            out_handle.write(handle.read())
    os.replace(partial_file, output_file)
    
    # Write finishing note to screen and log file
    logstr = f"Finished {file_Nu}: {os.stat(output_file).st_size/1024/1024:.2f} MB {output_file} \n"
    print(logstr)
    with open(log_file,'a') as log_handle:
        log_handle.write(f'{logstr}\n')

    # Group i is safely on disk; a re-run after a failure resumes at i + 1.
    start = i + 1

totalSize = 0 # calculate total amount data got from entrez
for file in os.listdir(gbfilePath):
    if file.endswith('.gb'):
        totalSize += os.stat(os.path.join(gbfilePath,file)).st_size
logstr = f'{totalSize/1024/1024:.2f} MB'
logstr = f"Group finished, already got {logstr} data!"
print(logstr)
with open(log_file,'a') as log_handle:
    log_handle.write(f'{logstr}\n')

start = end # ready for next round of fetching
print(f'Next download will start from group {start} (between 0 - {stepEnd-1}).')
print('Before starting next query, please set how many groups you want to fetch based on your schedule.')

Last group No. 2327
Now fetching groups from 2328 to 2327
Group finished, already got 17839.13 MB data!
Next download will start from group 2328 (between 0 - 2327).
Before starting next query, please set how many groups you want to fetch based on your schedule.


# Convert genbank file to blast database

GenBank (.gb) files need to be converted to FASTA files before a BLAST database can be made from them.

In [None]:
from Bio import SeqIO
from Bio import Seq
from time import strftime
# Convert downloaded GenBank files stre_No_XXXX.gb into FASTA files.
# The index of the next file to convert is kept in a small pickle file
# (`startNumFile`) so the conversion can be done in several rounds.
# NOTE(review): this cell assigns `start` and `end`, the same names the download
# cell uses for its resume position - running this cell overwrites them.
timestamp = strftime('%X %d/%m/%Y %Z')

print('Changing gb to fasta...')

# Decide how many file to convert
numFilesToConvert = 3000
storage_folder_fa = '/mnt/d/WORKs/temp/downloadingGenomes/AllstrepFasta/'
storage_folder_gb = '/mnt/d/WORKs/temp/downloadingGenomes/'

# Setup starting value
startNumFile = os.path.join(storage_folder_fa, 'temp')
if os.path.isfile(startNumFile):
    with open(startNumFile, 'rb') as handle:
        start = pickle.load(handle)
else:
    start = 0
    with open(startNumFile, 'wb') as handle:
        pickle.dump(start, handle)
totalGbs = sum(file.endswith('gb') for file in os.listdir(storage_folder_gb))
end = min(start + numFilesToConvert, totalGbs)

# write log file
log_file = os.path.join(storage_folder_fa,'converting.log')
with open(log_file,'a') as log_handle:
    log_handle.write(f'\n{timestamp:*^50}\nFrom {start} to {end}\n{"":*^50}\n')

for i in range(start,end):
    num = str(i).zfill(4)
    
    input_file = os.path.join(storage_folder_gb, f'stre_No_{num}.gb') # This way I can check if there is a gap in fetched file number
    output_file = os.path.join(storage_folder_fa, f'stre_No_{num}.fa')

    # Write note to screen and log file
    logstr = f"Converting stre_No_{num}.gb..."
    print(logstr)
    with open(log_file,'a') as log_handle:
        log_handle.write(f'{logstr}\n')
 
    # Keep only the records that actually carry a sequence.
    num_empty = 0
    recordWithSeq = []
    for record in SeqIO.parse(input_file, 'genbank'):
        if isinstance(record.seq, Seq.UnknownSeq): # Empty records will load as UnknownSeq
            num_empty += 1
        else:
            recordWithSeq.append(record)

    if len(recordWithSeq) == 0: # if all records are empty, there is no point of writing it to fasta
        # The output file is deliberately not created here, so no empty .fa
        # file is left for the makeblastdb step to pick up.
        logstr = f'There is no sequence in stre_No_{num}.gb, proceed to next file...'
    else:
        with open(output_file, 'w') as fasta_out_handle:
            SeqIO.write(recordWithSeq, fasta_out_handle, 'fasta')
        logstr = f"Finished converting {num}, ignored {num_empty} empty records."
    
    print(logstr)
    with open(log_file,'a') as log_handle:
        log_handle.write(f'{logstr}\n')

# prepare for next round, set start number and dump to temp file        
start = end
# pickle.dump needs an open binary file object, not the path string.
with open(startNumFile, 'wb') as handle:
    pickle.dump(start, handle)

# Make blast database

In [17]:
import subprocess
import os

In [None]:
# Build one nucleotide BLAST database per FASTA file found in `sourceDir`,
# writing the databases to `outputDir`. Stops at the first makeblastdb failure
# and prints whatever diagnostics are available.
sourceDir = '/mnt/d/WORKs/temp/downloadingGenomes/AllstrepFasta/'
logFile = '/mnt/d/WORKs/temp/downloadingGenomes/blastdb/makeblastdb.log'
outputDir = '/mnt/d/WORKs/temp/downloadingGenomes/blastdb/'

# Match on '.fa' (with the dot) so that stripping the last three characters
# below always removes exactly the extension; sorted for a repeatable order.
fastaFiles = sorted(file for file in os.listdir(sourceDir) if file.endswith('.fa'))
totalNum = len(fastaFiles)
print(f'Total number of files to be converted {totalNum}.\nConverted:', end='')

converted = 0
for file in fastaFiles:
    fastaFile = os.path.join(sourceDir, file)
    args = ['makeblastdb',
            '-in', fastaFile,
            '-input_type', 'fasta',
            '-dbtype', 'nucl',
            '-title', f'{file[:-3]}',
            '-out', os.path.join(outputDir, f'{file[:-3]}'),
            '-logfile', logFile,
            '-taxid', '85011'
           ]
    run = subprocess.run(args, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    if run.returncode != 0:
        # Report which input failed, then show the captured stderr and the
        # makeblastdb log (if one was written) before giving up.
        print(f'\nmakeblastdb failed for {file} (exit code {run.returncode})')
        print(run.stderr.decode())
        if os.path.isfile(logFile):
            with open(logFile, 'r') as log:
                for line in log:
                    print(line, end='')
        break
    converted += 1
    print(f'{converted}|', end = '')
print(f'\n\nFinished, databases made: {converted}')

## Merge the BLAST databases into one

In [28]:
# Combine the individual databases in `databaseDir` under a single alias:
# write their paths to `listFile` and hand that file to blastdb_aliastool.
databaseDir = '/mnt/d/WORKs/temp/downloadingGenomes/blastdb/'
listFile = '/mnt/d/WORKs/temp/downloadingGenomes/blastdb/listofdbs'

# For every directory entry ending in 'nsq', the text before the first dot is
# taken as the database name and joined onto the directory path.
databaseList = [
    os.path.join(databaseDir, entry.split('.')[0])
    for entry in os.listdir(databaseDir)
    if entry.endswith('nsq')
]
with open(listFile, 'w') as handle:
    handle.write('\n'.join(databaseList))

args = ['blastdb_aliastool',
        '-dblist_file', listFile,
        '-dbtype', 'nucl',
        '-out', os.path.join(databaseDir, 'allStrepNucl20170902'),
        '-title', 'allStrepNucl20170902',
       ]
run = subprocess.run(args, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
# Stay silent on success; on failure show everything the tool printed.
if run.returncode != 0:
    print(run.stdout.decode())
    print(run.stderr.decode())