In [9]:
from Bio import Entrez
Entrez.email = input("Your email address for EntrezSearch:")
import os
import pickle
import lzma
from datetime import date
from time import strftime, sleep
from urllib.error import HTTPError
from http.client import IncompleteRead
import sys
from Bio import SeqIO
from Bio import Seq
import subprocess


prefix = "txid105422"
searchTerm = f'{prefix}[Organism] AND refseq[filter] NOT "sequencing project"[Title]'

newSearch = True
newFetch = True

if newSearch:
    dateOfSearch = date.today().strftime("%Y%m%d") # For new search
else:
    dateOfSearch = '20180422' # For old search, change the date accrodingly
    
def writeLog(logFile, logStr, end = '\n'):
    print(logStr, end = end)
    with open(logFile,'a') as logHandle:
        logHandle.write(f'{logStr}{end}')

Your email address for EntrezSearch:c.du@biology.leidenuniv.nl


* Taxonomy ID: 228398 (for references in articles please use NCBI:txid228398)  
 Scientific name: Streptacidiphilus Kim et al. 2003
* Taxonomy ID: 2063 (for references in articles please use NCBI:{prefix})  
 Scientific name: Kitasatospora corrig. Omura et al. 1983 emend. Zhang et al. 1997
* Taxonomy ID: 1883 (for references in articles please use NCBI:txid1883)  
 Scientific name: Streptomyces Waksman and Henrici 1943 (Approved Lists 1980) emend. Wellington et al. 199* 
* Taxonomy ID: 1914443 (for references in articles please use NCBI:txid1914443)  
 Scientific name: Allostreptomyces Huang et al. 2017
* Taxonomy ID: 329648 (for references in articles please use NCBI:txid329648)  
 Scientific name: "Parastreptomyces" 1) Nichols et al. 2005
* Taxonomy ID: 65518 (for references in articles please use NCBI:txid65518)  
 Scientific name: "Trichotomospora" 1) Lian et al. 1985
* Taxonomy ID: 234292 (for references in articles please use NCBI:txid234292)  
 Scientific name: unclassified Streptomycetaceae
* Taxonomy ID: 259296 (for references in articles please use NCBI:txid259296)  
 Scientific name: environmental samples

In [10]:
def getRecords(searchTerm, c = 0, step = 2000, webaccession = None):
    targets = []
    count = 0
    while c < count or count == 0:
        print(f'No.{int(c/step+1)}', end = '>')
        handle = Entrez.esearch(db='nuccore', 
                                term=searchTerm,
                                usehistory=True,
                                webenv=webaccession, # reuse teh first query
                                retstart=c, # Continue from num c
                                retmax=step # Maxium number returned
                               )
        record = Entrez.read(handle)
        handle.close()
        if record['Count'] == "0":
            print("\nThere is no genome with this search.")
            break
        if 'WarningList' in record: # If anything wrong
            print(record['WarningList']['OutputMessage'])

        # Put current list of IDs in our targets dict    
        targets.append(record['IdList'])
        print(f"{len(record['IdList'])}", end = '|')  # the number got from this loop.

        if webaccession == None:
            webaccession = record["WebEnv"] # start new search only in the first loop
            print(f'\nWebAccession:\n{webaccession}')
            count = int(record['Count']) # total number of hits get from the first search attempt
            print(f"Total entry {count}.")    
        c += step
    return c, count, webaccession, targets


c, count, webaccession, targets = getRecords(searchTerm = searchTerm)

# now print and check the first 5 IDs of the first 10 lists of IDs
print(f"\nTotal target sets {len(targets)}\nFirst 10:")
for targ in targets[:10]:
    print(f"{targ[:5]}...")
print('...')


No.1>141|
WebAccession:
NCID_1_119459368_130.14.22.215_9001_1529576568_398040618_0MetA0_S_MegaStore
Total entry 141.

Total target sets 1
First 10:
['219857341', '755060028', '755060022', '755060015', '755060009']...
...


# Fetching all data from online (getting IDs from previous stored data)

In [11]:
# Dump targets got from search in pickle file
pickleFiles = "/Users/durand.dc/Desktop/"
targetsPickle = os.path.join(pickleFiles, f'refseq_{prefix}_{dateOfSearch}.pickle.xz')
if newSearch:
    with lzma.open(targetsPickle, 'wb') as pickleOut:
        pickle.dump(targets, pickleOut)
    newSearch = False # switch off newSearch
else:
    with lzma.open(targetsPickle, 'rb') as pickleIn:
        targets = pickle.load(pickleIn)
        
# flatten the targets list of list
# remove redundant if we have some

nucids = sorted(list(set(item for sublist in targets for item in sublist)))

print(f"Total nucids {len(nucids)}\nRemember to switch off <newSearch>")

Total nucids 141
Remember to switch off <newSearch>


In [4]:
def fetch(index, file_Nu, ids, returnType = ['fasta','fasta']):
    '''fetch(index, file_Nu, ids, returnType = ['fasta','fasta'])
    
    other option is:
    returnType = ['gbwithparts','gb']'''

    writeLog(logFile, f"Fetching {file_Nu}: {ids[:4]}...({len(ids)})")
    
    output_file = os.path.join(outputPath, f'{prefix}_No_{file_Nu}.{returnType[1]}')

    # Fetching...
    with Entrez.efetch(db = 'nuccore',
                       id = ids,
                       rettype = returnType[0],
                       retmode = 'text'
                      ) as handle:
        with open(output_file, 'w') as out_handle:
            out_handle.write(handle.read())
    
    # Write finishing note to screen and log file
    logstr = f"Finished {file_Nu}: {os.stat(output_file).st_size/1024/1024:.2f} MB {output_file} \n"
    print(logstr)
    with open(logFile,'a') as log_handle:
        log_handle.write(f'{logstr}\n')
    return True


def finishing(allDone = True):
    totalSize = 0 # calculate total amount data got from entrez
    for file in os.listdir(outputPath):
        if file.endswith('.gb') or file.endswith('.fasta'):
            totalSize += os.stat(os.path.join(outputPath,file)).st_size
    
    writeLog(logFile, f"Group finished, already got {totalSize/1024/1024:.2f} MB data!")
    if allDone:
            writeLog(logFile,'Finished fetching all IDs.')
    else:
        writeLog(logFile, f'Next download will start from group {start} (of 0 - {len(idStacks)-1}).')
        writeLog(logFile, f'{timestamp:*^80}')
        print('Before starting next query, please set how many groups you want to fetch based on your schedule.')        

In [12]:
start = 0 # should start from 0 if nothing have downloaded

In [13]:
downloadStep = 300 # this should not be changed during download

idStacks = []
for i in range(0,len(nucids),downloadStep):
    idStacks.append(nucids[i:i+downloadStep])

print(f'Last group No. {str(len(idStacks)-1).zfill(4)}')

numToFetch = 999 # number of files to fetch (specifing if you can not finish in one go)

end = start + numToFetch # range(start, end) means not including end number!!

if end > len(idStacks):
    end = len(idStacks)

outputPath = '/Users/durand.dc/Desktop/downloadingGenomes/'
logFile = f'{outputPath}fetching.log'
timestamp = strftime('%X %d/%m/%Y %Z')

writeLog(logFile, f'''
{timestamp:*^80}
Now fetching groups from {start} to {end-1}
Each group have {downloadStep} nuclIDs
IDs from {start*downloadStep+1} to {end*downloadStep+1-1} (inclusive)
{"":*^80}
''')
# Mean loop
for i in range(start, len(idStacks)):
    # Break the loop for shorter operation and debugging time
    if i == end and i != len(idStacks)-1:
        start = i
        finishing(allDone = False)
        break
        
    ids = idStacks[i]
    
    succeed = False
    retryTimes = 0
    while succeed == False:
        try:
            succeed = fetch(index = i, file_Nu = str(i).zfill(4), ids = ids)
        except HTTPError:
            retryTimes += 1
            if retryTimes == 3:
                print("Failed 3 times due to HTTPError (NCBI server problem), please try again another time.")
                start = i
                print(f'Start value set for next trial. [{start}]')
                break
            else:
                print('Failed due to HTTPError, pause for 45 seconds...',end = '')
                sleep(45)
                print('Retrying...')
        except IncompleteRead:
            start = i
            print(f'Failed due to connection loss, start value set for next trial. [{start}]') 
            break
        except:
            start = i
            print(f'Failed due to unknown error, start value set for next trial. [{start}]') 
            print(f'{sys.exc_info()}')
            break
    if not succeed:
        break
    elif i == len(idStacks) - 1:
        finishing(allDone = True)

Last group No. 0000

****************************12:23:13 21/06/2018 CEST****************************
Now fetching groups from 0 to 0
Each group have 300 nuclIDs
IDs from 1 to 300 (inclusive)
********************************************************************************

Fetching 0000: ['219857341', '631251409', '755039748', '755040325']...(141)
Finished 0000: 24.62 MB /Users/durand.dc/Desktop/downloadingGenomes/txid105422_No_0000.gb 

Group finished, already got 61.02 MB data!
Finished fetching all IDs.


# Convert genbank file to blast database

gb files needs to be converet to fasta file before making a database  
also the files needed to be combined incase combine blast database don't work, also for easy file transfering

In [None]:
def convertGb2Fasta(storage_folder_gb, storage_folder_fa, logFile, sizeLimit, numFilesToConvert = 9999):
    print('Changing gb to fasta...')
    
    totalGbs = sum(file.endswith('gb') for file in os.listdir(storage_folder_gb))

    # write log file
    writeLog(logFile, f'\n{timestamp:*^50}\n{totalGbs} of gb files in total\n{"":*^50}\n')
    # setup starting values
    recordWithSeq = [] # container for seqs for the function of dumping certain size of seqs together.
    seqTotalLength = 0
    num_empty = 0
    breakPoint = 0
    
    # Main loop
    for i in range(totalGbs):
        num = str(i).zfill(4)
        input_file = os.path.join(storage_folder_gb, f'{prefix}_No_{num}.gb') 
        # If there is a gap in the input file, it will currupt here.

        # Write note to screen and log file
        writeLog(logFile, f"{num}|{seqTotalLength}", end = '\t')
        
        records = SeqIO.parse(input_file, 'genbank')
        for record in records:
            if type(record.seq) == Seq.UnknownSeq: # Empty records will load as UnknownSeq
                num_empty += 1
                pass
            else:
                recordWithSeq.append(record)
                seqTotalLength += len(record)

        if i == end - 1:
            print('\nReach the end, writing it out...')
            output_file = os.path.join(storage_folder_fa, f'{prefix}_No_{breakPoint}to{num}.fasta')
            with open(output_file, 'w') as fasta_out_handle:
                if len(recordWithSeq) == 0: # if all records are empty, there is no point of writing it to fasta
                    writeLog(logFile, f'There is no sequence in this batch, proceed to next file...')
                else:
                    SeqIO.write(recordWithSeq, fasta_out_handle, 'fasta')
                    writeLog(logFile, f"Finished converting {num}, ignored {num_empty} empty records in this batch.")
        elif seqTotalLength >= sizeLimit:
            print('\nReach the size limit of single file, writing it out')
            output_file = os.path.join(storage_folder_fa, f'{prefix}_No_{breakPoint}to{num}.fasta')
            with open(output_file, 'w') as fasta_out_handle:
                if len(recordWithSeq) == 0: # if all records are empty, there is no point of writing it to fasta
                    writeLog(logFile, f'There is no sequence in this batch, proceed to next file...')
                else:
                    SeqIO.write(recordWithSeq, fasta_out_handle, 'fasta')
                    writeLog(logFile, f"Finished converting {num}, ignored {num_empty} empty records in this batch.")
            # Reset batch
            recordWithSeq = list()
            seqTotalLength = 0
            num_empty = 0
            breakPoint = str(i).zfill(4)
            
            
def combineFasta(fetchedFasta,combinedFasta,logFile,sizeLimit):
    if not os.path.isdir(combinedFasta):
        os.mkdir(combinedFasta)

    fileList = [os.path.join(fetchedFasta, file) for file in os.listdir(fetchedFasta) if file.endswith('.fasta')]
    seqTotalLength = 0
    seqsInOneFile = []
    breakPoint = 0
    for i in range(len(fileList)):
        parseSeqs = list(SeqIO.parse(fileList[i],'fasta'))
        for seq in parseSeqs:
            seqTotalLength += len(seq)
            if len(seq) == 0:
                print('There is 0 length sequences')
                break
        seqsInOneFile += parseSeqs
        logStr = f"{i}|{seqTotalLength}"
        writeLog(logFile, logStr, end = '\t')

        if i == len(fileList) - 1:
            outputFile = os.path.join(combinedFasta, f'{prefix}_{breakPoint}_to_{i}.fasta')
            logStr = f'\nFinished combining, write out last batch...\n{outputFile}'
            writeLog(logFile, logStr)
            SeqIO.write(seqsInOneFile, outputFile, 'fasta')
            writeLog(logFile, 'Write succeeded.')
        elif seqTotalLength >= sizeLimit:
            outputFile = os.path.join(combinedFasta, f'{prefix}_{breakPoint}_to_{i}.fasta')
            logStr = f'\nReached file size limit, write out...\n{outputFile}'
            writeLog(logFile, logStr)
            SeqIO.write(seqsInOneFile, outputFile, 'fasta')
            breakPoint = i+1
            seqsInOneFile = []
            seqTotalLength = 0
            writeLog(logFile, 'Write succeeded.')

            
timestamp = strftime('%X %d/%m/%Y %Z')
fastaFolder = '/Users/durand.dc/Desktop/downloadingGenomes/fasta/'
fetched = '/Users/durand.dc/Desktop/downloadingGenomes/'
logFile = os.path.join(fastaFolder,'converting.log')
print('Converting...')
sizeLimit = 1000000000

if sum(file.endswith('gb') for file in os.listdir(fetched)) == 0:
    combineFasta(fetched,fastaFolder,logFile,sizeLimit)
else:
    convertGb2Fasta(fetched,fastaFolder,logFile,sizeLimit)


# Make blast database

In [None]:
sourceDir = '/Users/durand.dc/Desktop/downloadingGenomes/fasta'
outputDir = '/Users/durand.dc/Desktop/downloadingGenomes/blastdb/'
logFile = '/Users/durand.dc/Desktop/downloadingGenomes/blastdb/makeblastdb.log'

if not os.path.isdir(outputDir):
    os.mkdir(outputDir)
    
# Change here accroding to the format fetched from entrez 
totalNum = sum(file.endswith('fasta') for file in os.listdir(sourceDir))
print(f'Total number of files to be converted {totalNum}.\nConverted:', end='')

converted = 0
for file in os.listdir(sourceDir):
    if not file.endswith('fasta'):
        continue
    singleSeqFile = os.path.join(sourceDir, file)
    args = ['makeblastdb',
            '-in', singleSeqFile,
            '-input_type', 'fasta',
            '-dbtype', 'nucl',
            '-title', f'{file[:-6]}',
            '-out', os.path.join(outputDir, f'{file[:-6]}'),
            '-logfile', logFile,
            '-taxid', '1883'
           ]
    run = subprocess.run(args, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    if run.returncode == 0:
        converted += 1
        print(f'{converted}|', end = '')
        pass
    else:
        with open(logFile, 'r') as log:
            for line in log.readlines():
                print(line)
        break
print(f'\n\nFinished, databases made: {converted}')

## Merge blast database into one

In [None]:
databaseDir = '/Users/durand.dc/Desktop/downloadingGenomes/blastdb/'
listFile = '/Users/durand.dc/Desktop/downloadingGenomes/blastdb/listofdbs.txt'
databaseList = []
for file in os.listdir(databaseDir):
    if not file.endswith('nsq'):
        continue
    dbName = file.split('.')[0]
    databaseList.append(os.path.join(databaseDir, dbName))
with open(listFile, 'w') as handle:
    handle.write('\n'.join(databaseList))
    
args = ['blastdb_aliastool',
        '-dblist_file', listFile,
        '-dbtype', 'nucl',
        '-out', os.path.join(databaseDir, f'{prefix}Nucl{dateOfSearch}'),
        '-title', f'{prefix}Nucl{dateOfSearch}',
       ]
run = subprocess.run(args, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
if run.returncode == 0:
    pass
else:
    print(run.stdout.decode())
    print(run.stderr.decode())