In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from Bio import Entrez, SeqIO
import os
from time import sleep
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# Run-wide configuration. The download cells below read these names as globals,
# so this cell has to be executed before them on a fresh kernel.

# Contact e-mail for NCBI E-utilities; Bio.Entrez picks it up from this attribute.
Entrez.email = "rosalia.palomino@alumnos.unican.es"
# Directory that download_gbk() writes "<accession>.gbk" files into.
outdir = "integron_gbk/"
# Records whose esummary "Length" exceeds this value are skipped as "too_large"
# (presumably sequence length in base pairs — TODO confirm).
max_size = 15000

In [3]:
def fetch_accessions(max_offset=12000, page_size=50, timeout=30):
    """Scrape the INTEGRALL list pages and collect one entry per accession.

    The listing is fetched page by page from
    ``http://integrall.bio.ua.pt/?list&s=<offset>&ob=org``; the third
    ``<table>`` of each page is read, skipping its first three rows and its
    last row.

    Parameters
    ----------
    max_offset : int, optional
        Exclusive upper bound for the ``s=`` offset (default 12000, the value
        that was previously hard-coded).
    page_size : int, optional
        Step between consecutive offsets (default 50).
        NOTE(review): presumably matches the number of rows the site returns
        per page — confirm before changing.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up, so a stalled
        connection cannot hang the whole crawl.

    Returns
    -------
    dict
        ``{accession: [col2, col3, col4]}`` with whitespace-stripped cell
        text. The columns were named ``org``/``int``/``cass`` by the author
        (presumably organism, integron and cassettes — not verifiable here).

    Raises
    ------
    requests.HTTPError
        If any page answers with an error status.
    IndexError
        If a page contains fewer than three tables.
    """
    print('Getting accessions from http://integrall.bio.ua.pt/?list')
    d_accessions = {}
    headers = {"user-agent": "Mozilla/5.0"}  # loop-invariant, built once

    for offset in range(0, max_offset, page_size):
        # Percentage of the offset range covered so far; the bar is 50 chars wide.
        percent = offset * 100 // max_offset
        print("[%-50s] %d%%" % ('=' * (percent // 2), percent), end='\r')
        url = f'http://integrall.bio.ua.pt/?list&s={offset}&ob=org'

        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")
        listing = soup.select('table')[2]

        # rows[3:-1]: drop the three leading rows and the trailing row.
        for row in listing.find_all('tr')[3:-1]:
            cells = row.find_all('td')
            if len(cells) < 4:
                # Not a data row (spacer, colspan, ...): nothing to record.
                continue
            acc, org, integron, cass = (cell.text.strip() for cell in cells[:4])
            if not acc:
                # Rows without an accession would only create an empty key.
                continue
            d_accessions[acc] = [org, integron, cass]

    print(f'\n{len(d_accessions)} accessions found in first search.')

    return d_accessions

In [4]:
def get_sequence_size(acc):
    """Return the length NCBI reports for a nucleotide record.

    Queries ``Entrez.esummary`` (db ``nucleotide``) for ``acc`` and reads the
    ``"Length"`` field of the first summary entry.

    Parameters
    ----------
    acc : str
        Identifier passed to esummary as ``id``.

    Returns
    -------
    int
        The reported length, or ``-1`` if the request, the parsing or the
        field lookup fails for any reason. Callers treat ``-1`` as
        "summary failed", so errors are deliberately not propagated.
    """
    try:
        handle = Entrez.esummary(db="nucleotide", id=acc, retmode="xml")
        try:
            summary = Entrez.read(handle)
        finally:
            # Close the connection even when parsing raises.
            handle.close()
        return int(summary[0]["Length"])
    except Exception:
        # Best-effort lookup: every failure maps to the -1 sentinel.
        return -1

In [None]:
def download_gbk(acc, max_size):
    """Download one GenBank record into ``outdir`` unless it is too large.

    Reads the module-level ``outdir`` for the destination directory, which is
    created on demand.

    Parameters
    ----------
    acc : str
        Accession to fetch; also used as the output file stem.
    max_size : int
        Records whose reported length is greater than this are skipped.

    Returns
    -------
    tuple
        ``(acc, status)`` where ``status`` is one of:

        * ``"summary_failed"`` – ``get_sequence_size`` returned ``-1``;
        * ``"too_large"`` – reported length exceeds ``max_size``;
        * ``"success"`` – ``<outdir>/<acc>.gbk`` was written;
        * ``"error"`` – anything else raised (including a missing ``outdir``
          global when the configuration cell was not run).
    """
    try:
        # Pause before each record so a single worker does not hammer NCBI.
        sleep(0.35)
        size = get_sequence_size(acc)
        if size == -1:
            return (acc, "summary_failed")
        if size > max_size:
            return (acc, "too_large")

        handle = Entrez.efetch(db="nucleotide", id=acc, rettype="gb", retmode="text")
        try:
            gbk = handle.read()
        finally:
            # Release the connection even if the read fails midway.
            handle.close()

        # Nothing else creates the directory; without this every write fails
        # on a fresh checkout and is only reported as "error".
        os.makedirs(outdir, exist_ok=True)
        with open(os.path.join(outdir, f"{acc}.gbk"), "w") as out:
            out.write(gbk)
        return (acc, "success")

    except Exception:
        # Best-effort: the caller tallies statuses and must not be interrupted.
        return (acc, "error")

In [None]:
# Scrape the accession list, download every record through a thread pool and
# tally the per-accession status returned by download_gbk().
accessions = fetch_accessions()
# NOTE(review): each worker issues two E-utilities calls per accession; with 8
# workers this may exceed NCBI's request-rate guidance — confirm and lower if
# requests start failing.
n_workers = 8
results = {"success": [], "too_large": [], "summary_failed": [], "error": []}

print("Fetching sequences (parallel)...")
with ThreadPoolExecutor(max_workers=n_workers) as executor:
    futures = {executor.submit(download_gbk, acc, max_size): acc for acc in accessions}
    total = len(futures)
    line_width = 0
    for done, future in enumerate(as_completed(futures), start=1):
        acc, status = future.result()
        # setdefault keeps the tally alive if a new status string ever appears.
        results.setdefault(status, []).append(acc)
        line = f"[{done}/{total}] {acc}: {status}"
        # Pad to the widest line printed so far so a shorter status fully
        # overwrites a longer one instead of leaving trailing characters.
        line_width = max(line_width, len(line))
        print(line.ljust(line_width), end='\r')
# Move past the progress line so the summary does not overwrite it.
print()

for key, val in results.items():
    print(f"{key}: {len(val)} entries")

Getting accessions from http://integrall.bio.ua.pt/?list
11955 accessions found in first search.
Fetching sequences (parallel)...


            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will attempt to contact
            a user at the email address provided before blocking access to the
            E-utilities.


[381/11955] KJ546444: errorss_failed