# In [1]:
import os
import gzip
import shutil
import requests
import pandas as pd
from tqdm import tqdm

# Remote directory holding the NCBI Gene bulk files; download_and_extract
# appends a file name to this URL to form the request address.
base_url = "https://ftp.ncbi.nlm.nih.gov/gene/DATA/"

# Remote file names to fetch, in download order.
files_to_download = [
    # Names ending in ".gz" are decompressed after download.
    "gene_info.gz",
    "gene2accession.gz",
    "gene2refseq.gz",
    "gene2pubmed.gz",
    "gene_summary.gz",
    # Names without a ".gz" suffix are kept exactly as downloaded.
    "mim2gene_medgen",
    "go_process.xml",
]

# Local folder that receives every download; an existing folder is reused.
os.makedirs("ncbi_gene_data", exist_ok=True)

def download_and_extract(file_name, *, dest_dir="ncbi_gene_data", timeout=60):
    """Download one file from ``base_url`` and gunzip it if it ends in ``.gz``.

    Args:
        file_name: Name of the remote file; appended verbatim to the
            module-level ``base_url`` and used as the local file name.
        dest_dir: Local directory that receives the download. It is created
            when missing.
        timeout: Value forwarded to ``requests.get(timeout=...)`` so that an
            unresponsive server cannot block the call indefinitely.

    Returns:
        Path of the usable local file: the decompressed copy for ``.gz``
        names, otherwise the downloaded file itself.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    os.makedirs(dest_dir, exist_ok=True)
    gz_path = f"{dest_dir}/{file_name}"
    is_gzipped = file_name.endswith(".gz")
    # Strip only the trailing suffix; str.replace would also delete a ".gz"
    # that happens to occur elsewhere in the path.
    extracted_path = gz_path[:-len(".gz")] if is_gzipped else gz_path

    # Download
    print(f"Downloading: {file_name}")
    # The context manager releases the streamed connection even on failure.
    with requests.get(base_url + file_name, stream=True, timeout=timeout) as response:
        # Fail here on an HTTP error instead of saving the error page to disk
        # and tripping over it later during decompression.
        response.raise_for_status()
        # NOTE(review): response.raw is copied as-is, without Content-Encoding
        # decoding -- confirm the server does not transfer-compress the
        # uncompressed files.
        with open(gz_path, 'wb') as f:
            shutil.copyfileobj(response.raw, f)

    # Extract if .gz
    if is_gzipped:
        print(f"Extracting: {file_name}")
        with gzip.open(gz_path, 'rb') as f_in, open(extracted_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    return extracted_path

# Step 1: Download and extract
# tqdm wraps the name list so the loop shows one progress tick per file.
extracted_files = [download_and_extract(name) for name in tqdm(files_to_download)]

# Step 2: Load gene_info as example
gene_info_path = next(p for p in extracted_files if "gene_info" in p)
columns = [
    "tax_id", "GeneID", "Symbol", "LocusTag", "Synonyms", "dbXrefs",
    "chromosome", "map_location", "description", "type_of_gene",
    "Symbol_from_nomenclature_authority", "Full_name_from_nomenclature_authority",
    "Nomenclature_status", "Other_designations", "Modification_date", "Feature_type"
]

# Count the "#"-prefixed lines at the top of the file so that exactly those
# are skipped. Passing comment="#" to read_csv instead would also cut every
# data line at its first "#", silently corrupting fields that contain one.
header_line_count = 0
with open(gene_info_path, "rb") as raw_file:
    for raw_line in raw_file:
        if not raw_line.startswith(b"#"):
            break
        header_line_count += 1

print(f"\nReading gene_info from: {gene_info_path}")
df_gene_info = pd.read_csv(
    gene_info_path,
    sep="\t",
    skiprows=header_line_count,
    header=None,
    names=columns,
    # 3 == csv.QUOTE_NONE: keep '"' characters as literal text.
    # NOTE(review): assumes the file is plain tab-delimited with no quoting
    # convention -- confirm against the NCBI README.
    quoting=3,
    low_memory=False,
)
print(df_gene_info.head())

# Step 3: Save to CSV
df_gene_info.to_csv("ncbi_gene_data/gene_info_cleaned.csv", index=False)
print("Saved cleaned gene_info to CSV.")

# Repeat similar steps to process other files if needed


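# Output of the cell above:
#   ModuleNotFoundError: No module named 'requests'
# The cell stopped at `import requests`; install the package (e.g. `pip install requests`) and re-run.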