In [1]:
pip install requests pandas tqdm

Note: you may need to restart the kernel to use updated packages.


In [1]:
# find correct dataset name for your species
from pybiomart import Server
server = Server(host='http://www.ensembl.org')
# Download the dataset catalogue once and reuse it for both the mask and the selection
# (each list_datasets() call is a separate request to BioMart).
ensembl_datasets = server.marts['ENSEMBL_MART_ENSEMBL'].list_datasets()
# na=False keeps the boolean mask valid even if a display name is missing.
ensembl_datasets[ensembl_datasets['display_name'].str.contains('Naked', na=False)]



Unnamed: 0,name,display_name
211,hgfemale_gene_ensembl,Naked mole-rat female genes (Naked_mole-rat_ma...


In [2]:
# BioMart dataset identifiers, one per species used in this notebook.
nmr_dataset_name, human_dataset_name, mouse_dataset_name = (
    'hgfemale_gene_ensembl',   # naked mole-rat (HetGla), female assembly
    'hsapiens_gene_ensembl',   # human
    'mmusculus_gene_ensembl',  # mouse
)

In [3]:
from pybiomart import Server
import pandas as pd

# Open the BioMart connection against the main Ensembl site
server = Server(host="http://www.ensembl.org")

# Short species key -> BioMart dataset identifier
datasets = dict(
    nmr='hgfemale_gene_ensembl',    # Heterocephalus glaber, female assembly
    human='hsapiens_gene_ensembl',
    mouse='mmusculus_gene_ensembl',
)

def get_gene_table(dataset_name, species_name):
    """
    Fetch the gene list of one species from Ensembl BioMart.

    Parameters
    ----------
    dataset_name : str
        BioMart dataset identifier, e.g. ``'hsapiens_gene_ensembl'``.
    species_name : str
        Label stored in the ``species`` column of every returned row.

    Returns
    -------
    DataFrame
        One row per BioMart record with the columns ``gene_id``,
        ``gene_name``, ``chromosome``, ``biotype``, ``description`` and
        ``species``.

    Notes
    -----
    Uses the notebook-level ``server`` connection.
    """
    # BioMart attribute name -> short column name used in the rest of the notebook.
    column_names = {
        'ensembl_gene_id': 'gene_id',
        'external_gene_name': 'gene_name',
        'chromosome_name': 'chromosome',
        'gene_biotype': 'biotype',
        'description': 'description',
    }

    dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets[dataset_name]

    # Ask for attribute names as column headers: the human-readable display
    # names are not a stable key, and a rename keyed on them silently does
    # nothing when a label differs.
    df = dataset.query(attributes=list(column_names), use_attr_names=True)
    df = df.rename(columns=column_names)
    df['species'] = species_name
    return df

# --- Download the full gene table of each species ---
nmr_genes = get_gene_table(dataset_name=datasets['nmr'], species_name='naked_mole_rat')
human_genes = get_gene_table(dataset_name=datasets['human'], species_name='human')
mouse_genes = get_gene_table(dataset_name=datasets['mouse'], species_name='mouse')

# --- Persist each table as CSV (no index column) for later use ---
for gene_table, csv_name in (
    (nmr_genes, 'nmr_genes_biomart.csv'),
    (human_genes, 'human_genes_biomart.csv'),
    (mouse_genes, 'mouse_genes_biomart.csv'),
):
    gene_table.to_csv(csv_name, index=False)

print(f"Extracted {len(nmr_genes)} NMR genes, {len(human_genes)} human genes, {len(mouse_genes)} mouse genes")


Extracted 35328 NMR genes, 86369 human genes, 78334 mouse genes


In [6]:
# Naked mole-rat gene symbols as a plain Python list, skipping genes without a name
nmr_gene_list = (
    nmr_genes['gene_name']
    .dropna()
    .tolist()
)

In [5]:
import requests
import pandas as pd
import time
from tqdm import tqdm

# Base URL of the Ensembl REST service; endpoint paths are appended to it.
ENSEMBL_REST = "https://rest.ensembl.org"

def get_orthologs(gene_list, from_species, to_species, output_file, max_retries=5, sleep_time=0.3):
    """
    Look up one-to-one orthologues for a list of gene symbols via the Ensembl REST API.

    Each symbol is sent to ``/homology/symbol/{from_species}/{symbol}`` with
    ``target_species=to_species`` and ``type=orthologues``. Only homologies
    whose type is ``ortholog_one2one`` are kept.

    Parameters
    ----------
    gene_list : iterable
        Gene symbols to query, in the order they should be processed.
    from_species : str
        Ensembl species name the symbols belong to (e.g. ``"mus_musculus"``).
    to_species : str
        Ensembl species name in which orthologues are searched.
    output_file : str
        Path of the CSV file the result table is written to (without index).
    max_retries : int, default 5
        Maximum number of attempts per gene. Rate-limit responses, server
        errors and network errors each consume one attempt.
    sleep_time : float, default 0.3
        Seconds to wait after a failed attempt and between genes.

    Returns
    -------
    pandas.DataFrame
        Columns ``{from_species}_gene``, ``{to_species}_gene``,
        ``orthology_type``, ``perc_id`` and ``dn_ds``; one row per
        one-to-one orthologue found. The same table is written to
        ``output_file``.
    """
    from urllib.parse import quote  # local import: percent-encode symbols in the URL path

    from_col = f"{from_species}_gene"
    to_col = f"{to_species}_gene"
    # Explicit column list so an empty result still yields a CSV with a header.
    # dict.fromkeys drops the duplicate that arises when both species are equal.
    columns = list(dict.fromkeys([from_col, to_col, "orthology_type", "perc_id", "dn_ds"]))
    query = {"target_species": to_species, "type": "orthologues"}
    results = []

    # One Session for the whole run keeps the HTTPS connection alive between
    # requests instead of opening a new one for every gene.
    with requests.Session() as session:
        session.headers.update({"Content-Type": "application/json"})

        for gene in tqdm(gene_list, desc=f"{from_species}->{to_species}"):
            # safe='' also encodes '/', which would otherwise change the URL path.
            url = f"{ENSEMBL_REST}/homology/symbol/{from_species}/{quote(str(gene), safe='')}"
            last_error = None

            for _ in range(max_retries):
                try:
                    r = session.get(url, params=query, timeout=15)

                    if r.status_code == 429:
                        # Too many requests: back off. Retry-After may be a
                        # fractional number of seconds, so parse it as float.
                        try:
                            wait = max(float(r.headers.get("Retry-After", 2)), 0.0)
                        except (TypeError, ValueError):
                            wait = 2.0
                        print(f"Rate limited. Waiting {wait}s…")
                        time.sleep(wait)
                        # Counts as an attempt so persistent throttling cannot loop forever.
                        last_error = "rate limited (HTTP 429)"
                        continue

                    if 400 <= r.status_code < 500:
                        # Client errors (e.g. unknown symbol) will not succeed
                        # on retry; skip this gene straight away.
                        break

                    if not r.ok:
                        last_error = f"HTTP {r.status_code}"
                        time.sleep(sleep_time)
                        continue

                    data = r.json()
                    if data.get("data"):
                        homologies = data["data"][0].get("homologies", [])
                        results.extend(
                            {
                                from_col: gene,
                                to_col: h["target"]["id"],
                                "orthology_type": h["type"],
                                "perc_id": h["target"]["perc_id"],
                                "dn_ds": h.get("dn_ds"),
                            }
                            for h in homologies
                            if h["type"] == "ortholog_one2one"
                        )
                    break  # move to next gene if successful

                except (requests.exceptions.RequestException, ValueError) as e:
                    # ValueError covers a response body that is not valid JSON.
                    last_error = e
                    time.sleep(sleep_time)
            else:
                # Every attempt was used up without a usable response.
                if last_error is not None:
                    print(f"Failed to fetch {gene}: {last_error}")

            time.sleep(sleep_time)  # politeness delay

    df = pd.DataFrame(results, columns=columns)
    df.to_csv(output_file, index=False)
    return df


In [None]:

# Orthologue lookup: naked mole-rat symbols -> human genes
nmr_human_orth = get_orthologs(
    gene_list=nmr_genes['gene_name'].dropna().tolist(),
    from_species="heterocephalus_glaber_female",
    to_species="homo_sapiens",
    output_file="nmr_to_human.csv",
)

print("Saved orthology tables:", " - nmr_to_human.csv", sep="\n")

In [6]:
# === Example usage ===

# Orthologue lookup: mouse symbols -> naked mole-rat genes
mouse_nmr_orth = get_orthologs(
    gene_list=mouse_genes['gene_name'].dropna().tolist(),
    from_species="mus_musculus",
    to_species="heterocephalus_glaber_female",
    output_file="mouse_to_nmr.csv",
)

print("Saved orthology tables:", " - mouse_to_nmr.csv", sep="\n")


mus_musculus->heterocephalus_glaber_female:  82%|████████▏ | 63439/77801 [13:35:44<28:20:12,  7.10s/it]

Failed to fetch Gm52705: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Max retries exceeded with url: /homology/symbol/mus_musculus/Gm52705?target_species=heterocephalus_glaber_female&type=orthologues (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x71c58ae68790>: Failed to establish a new connection: [Errno 113] No route to host'))


mus_musculus->heterocephalus_glaber_female:  82%|████████▏ | 63441/77801 [13:36:04<31:55:29,  8.00s/it]

Failed to fetch Gm69268: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Max retries exceeded with url: /homology/symbol/mus_musculus/Gm69268?target_species=heterocephalus_glaber_female&type=orthologues (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x71c58ae6a230>: Failed to establish a new connection: [Errno 113] No route to host'))


mus_musculus->heterocephalus_glaber_female: 100%|██████████| 77801/77801 [18:36:18<00:00,  1.16it/s]   

Saved orthology tables:
 - mouse_to_nmr.csv





In [7]:
# === Example usage ===

# Orthologue lookup: human symbols -> naked mole-rat genes
human_nmr_orth = get_orthologs(
    gene_list=human_genes['gene_name'].dropna().tolist(),
    from_species="homo_sapiens",
    to_species="heterocephalus_glaber_female",
    output_file="human_to_nmr.csv",
)
print(" - human_to_nmr.csv")

homo_sapiens->heterocephalus_glaber_female: 100%|██████████| 48698/48698 [27:13:47<00:00,  2.01s/it]       

 - human_to_nmr.csv



