In [2]:
from concurrent.futures import ThreadPoolExecutor
from Bio import Entrez

Entrez.email = "your_email@example.com"

def download_sequence(accession, filename, retstart=None, retmax=None):
    print(f"Downloading {filename}...")
    if retstart is not None:
        handle = Entrez.efetch(
            db="nucleotide",
            id=accession,
            rettype="fasta",
            retstart=retstart,
            retmax=retmax
        )
    else:
        handle = Entrez.efetch(
            db="nucleotide",
            id=accession,
            rettype="fasta"
        )
    
    with open(filename, "w") as f:
        f.write(handle.read())
    print(f"Completed {filename}")

# Download in parallel
with ThreadPoolExecutor(max_workers=5) as executor:
    executor.submit(download_sequence, "NC_000001.11", "human_chr1_subset.fasta", 0, 12500000)
    executor.submit(download_sequence, "NC_000932.1", "arabidopsis_chloroplast.fasta")
    executor.submit(download_sequence, "NC_005089.1", "mouse_mitochondrial.fasta")
    executor.submit(download_sequence, "NC_001709.16", "fruitfly_mitochondrial.fasta")
    executor.submit(download_sequence, "NC_000913.3", "ecoli_K12.fasta")
    executor.submit(download_sequence, "NC_002655.2", "ecoli_O157.fasta")
    executor.submit(download_sequence, "NC_012892.2", "ecoli_BL21.fasta")
    executor.submit(download_sequence, "NC_004431.1", "ecoli_CFT073.fasta")
    executor.submit(download_sequence, "NC_008253.1", "ecoli_536.fasta")
    executor.submit(download_sequence, "NC_000964.3", "bsubtilis.fasta")

Downloading human_chr1_subset.fasta...
Downloading arabidopsis_chloroplast.fasta...
Downloading mouse_mitochondrial.fasta...
Downloading fruitfly_mitochondrial.fasta...
Downloading ecoli_K12.fasta...
Completed mouse_mitochondrial.fasta
Downloading ecoli_O157.fasta...
Downloading ecoli_BL21.fasta...
Completed arabidopsis_chloroplast.fasta
Downloading ecoli_CFT073.fasta...
Completed ecoli_O157.fasta
Downloading ecoli_536.fasta...
Completed ecoli_K12.fasta
Downloading bsubtilis.fasta...
Completed ecoli_CFT073.fasta
Completed ecoli_BL21.fasta
Completed bsubtilis.fasta
Completed ecoli_536.fasta
Completed human_chr1_subset.fasta


In [1]:
import requests
import os
import time

# Define sequences to download (with zebrafish replacing fruitfly)
sequences = {
    "human_chr1_subset": {
        "accession": "NC_000001.11",
        "retstart": 0,
        "retmax": 12_500_000,
        "filename": "human_chr1_subset.fasta"
    },
    "arabidopsis_chloroplast": {
        "accession": "NC_000932.1",
        "filename": "arabidopsis_chloroplast.fasta"
    },
    "mouse_mitochondrial": {
        "accession": "NC_005089.1",
        "filename": "mouse_mitochondrial.fasta"
    },
    # Replaced fruitfly with zebrafish
    "zebrafish_mitochondrial": {
        "accession": "NC_002333.2",
        "filename": "zebrafish_mitochondrial.fasta"
    },
    "ecoli_K12": {
        "accession": "NC_000913.3",
        "filename": "ecoli_K12.fasta"
    },
    "ecoli_O157": {
        "accession": "NC_002655.2",
        "filename": "ecoli_O157.fasta"
    },
    "ecoli_BL21": {
        "accession": "NC_012892.2",
        "filename": "ecoli_BL21.fasta"
    },
    "ecoli_CFT073": {
        "accession": "NC_004431.1",
        "filename": "ecoli_CFT073.fasta"
    },
    "ecoli_536": {
        "accession": "NC_008253.1",
        "filename": "ecoli_536.fasta"
    },
    "bsubtilis": {
        "accession": "NC_000964.3",
        "filename": "bsubtilis.fasta"
    }
}

# Create directory
os.makedirs("ncbi_dataset", exist_ok=True)

# Download function
def download_sequence(info, max_retries=3):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    
    for attempt in range(max_retries):
        try:
            print(f"Attempting {info['filename']} (attempt {attempt + 1})...")
            
            # Prepare parameters
            params = {
                "db": "nucleotide",
                "id": info["accession"],
                "rettype": "fasta",
                "retmode": "text"
            }
            
            # Add range parameters for human chromosome
            if "retstart" in info:
                params["retstart"] = info["retstart"]
                params["retmax"] = info["retmax"]
            
            # Make request
            response = requests.get(base_url, params=params, timeout=120)
            
            # Check if response is valid
            if response.status_code == 200 and len(response.text) > 100:
                filepath = os.path.join("ncbi_dataset", info["filename"])
                with open(filepath, "w") as f:
                    f.write(response.text)
                print(f"✅ Successfully downloaded {info['filename']}")
                return True
            else:
                raise Exception(f"Invalid response: {response.status_code}")
                
        except Exception as e:
            print(f"❌ Error: {str(e)}")
            if attempt < max_retries - 1:
                wait_time = 10 * (attempt + 1)
                print(f"Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)
            else:
                print(f"❌ Failed to download {info['filename']}")
                return False

# Download with delays
for name, info in sequences.items():
    success = download_sequence(info)
    if not success:
        print(f"Skipping {name}")
    
    time.sleep(5)

print("\nDownload process completed!")

# Create updated metadata CSV
metadata = """filename,species,kingdom,gc_content
human_chr1_subset.fasta,Homo sapiens,Humans,50.2
arabidopsis_chloroplast.fasta,Arabidopsis thaliana,Plants,36.3
mouse_mitochondrial.fasta,Mus musculus,Animals,40.0
zebrafish_mitochondrial.fasta,Danio rerio,Animals,45.6
ecoli_K12.fasta,Escherichia coli,Bacteria,50.8
ecoli_O157.fasta,Escherichia coli,Bacteria,50.5
ecoli_BL21.fasta,Escherichia coli,Bacteria,50.7
ecoli_CFT073.fasta,Escherichia coli,Bacteria,50.6
ecoli_536.fasta,Escherichia coli,Bacteria,50.9
bsubtilis.fasta,Bacillus subtilis,Bacteria,43.5"""

with open("ncbi_dataset/metadata.csv", "w") as f:
    f.write(metadata)

print("✅ Metadata CSV updated with zebrafish")

Attempting human_chr1_subset.fasta (attempt 1)...
✅ Successfully downloaded human_chr1_subset.fasta
Attempting arabidopsis_chloroplast.fasta (attempt 1)...
✅ Successfully downloaded arabidopsis_chloroplast.fasta
Attempting mouse_mitochondrial.fasta (attempt 1)...
✅ Successfully downloaded mouse_mitochondrial.fasta
Attempting zebrafish_mitochondrial.fasta (attempt 1)...
✅ Successfully downloaded zebrafish_mitochondrial.fasta
Attempting ecoli_K12.fasta (attempt 1)...
✅ Successfully downloaded ecoli_K12.fasta
Attempting ecoli_O157.fasta (attempt 1)...
✅ Successfully downloaded ecoli_O157.fasta
Attempting ecoli_BL21.fasta (attempt 1)...
✅ Successfully downloaded ecoli_BL21.fasta
Attempting ecoli_CFT073.fasta (attempt 1)...
✅ Successfully downloaded ecoli_CFT073.fasta
Attempting ecoli_536.fasta (attempt 1)...
✅ Successfully downloaded ecoli_536.fasta
Attempting bsubtilis.fasta (attempt 1)...
✅ Successfully downloaded bsubtilis.fasta

Download process completed!
✅ Metadata CSV updated with z