# Download Gene Information
This notebook downloads the `gene_info` file, which lists genes for organisms, from [NCBI](https://www.ncbi.nlm.nih.gov/) and decompresses it.

To ensure platform independence, this notebook uses Python libraries to download and unzip a compressed file.

In [1]:
import os
import shutil
import requests
import gzip

If the `LOCAL_SCRATCH_DIR` environment variable is set, temporary files are stored in that directory; otherwise, this notebook creates the `../data` directory to store them.

In [2]:
# Directory for temporary files. An unset *or empty* LOCAL_SCRATCH_DIR falls
# back to ../data; an empty value would otherwise reach os.makedirs("") and
# raise FileNotFoundError.
DATA_DIR = os.getenv("LOCAL_SCRATCH_DIR") or "../data"
os.makedirs(DATA_DIR, exist_ok=True)

In [3]:
# Remote location of the gzip-compressed gene_info file.
url = "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz"
# Local path the compressed download is written to.
filename_in = os.path.join(DATA_DIR, "gene_info.gz")
# Local path for the decompressed output.
filename_out = os.path.join(DATA_DIR, "gene_info.tsv")

Download the file as a stream, so that files larger than the available memory can be handled.

In [4]:
def download_http(url, filename, timeout=60):
    """Stream the resource at ``url`` into the local file ``filename``.

    The response body is copied to disk in chunks, so the payload never has
    to fit in memory.

    Args:
        url: HTTP(S) address of the resource to fetch.
        filename: Path of the file to write. It is opened in ``"wb"`` mode,
            so an existing file is overwritten.
        timeout: Seconds to wait for the connection and for each read from
            the server before giving up. This is not a limit on the total
            download time. Pass ``None`` to wait indefinitely.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx or
            5xx status. ``filename`` is not opened in that case.

    Connection failures and timeouts propagate unchanged from
    requests/urllib3.
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail fast on HTTP errors; otherwise the server's error page would
        # be written to disk as if it were the requested file.
        r.raise_for_status()
        with open(filename, "wb") as f:
            shutil.copyfileobj(r.raw, f)

In [5]:
def unzip(filename_in, filename_out):
    """Decompress the gzip file ``filename_in`` into ``filename_out``.

    The decompressed bytes are copied in chunks by ``shutil.copyfileobj``.
    ``filename_out`` is opened in ``"wb"`` mode, so an existing file is
    overwritten.
    """
    with gzip.open(filename_in, "rb") as compressed, open(filename_out, "wb") as plain:
        shutil.copyfileobj(compressed, plain)

In [6]:
# Fetch the compressed archive from `url` and save it as `filename_in`.
print(f"downloading {url} to {filename_in}")
download_http(url, filename_in)

downloading https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz to ../data/gene_info.gz


In [7]:
# Decompress the downloaded archive `filename_in` into `filename_out`.
print(f"unzipping {filename_in} to {filename_out}")
unzip(filename_in, filename_out)

unzipping ../data/gene_info.gz to ../data/gene_info.tsv
