In [28]:
%%capture
!pip install mygene


In [8]:
import os
import pandas as pd
from datetime import datetime
from gseapy import enrichr
import json


In [13]:
import gseapy as gp

# Print the gene-set library names reported by gseapy. The printed list
# includes names that are passed as `gene_sets` elsewhere in this notebook
# (for example 'ARCHS4_Tissues').
libs = gp.get_library_name()
print(libs)


['ARCHS4_Cell-lines', 'ARCHS4_IDG_Coexp', 'ARCHS4_Kinases_Coexp', 'ARCHS4_TFs_Coexp', 'ARCHS4_Tissues', 'Achilles_fitness_decrease', 'Achilles_fitness_increase', 'Aging_Perturbations_from_GEO_down', 'Aging_Perturbations_from_GEO_up', 'Allen_Brain_Atlas_10x_scRNA_2021', 'Allen_Brain_Atlas_down', 'Allen_Brain_Atlas_up', 'Azimuth_2023', 'Azimuth_Cell_Types_2021', 'BioCarta_2013', 'BioCarta_2015', 'BioCarta_2016', 'BioPlanet_2019', 'BioPlex_2017', 'CCLE_Proteomics_2020', 'COMPARTMENTS_Curated_2025', 'COMPARTMENTS_Experimental_2025', 'CORUM', 'COVID-19_Related_Gene_Sets', 'COVID-19_Related_Gene_Sets_2021', 'Cancer_Cell_Line_Encyclopedia', 'CellMarker_2024', 'CellMarker_Augmented_2021', 'ChEA_2013', 'ChEA_2015', 'ChEA_2016', 'ChEA_2022', 'Chromosome_Location', 'Chromosome_Location_hg19', 'ClinVar_2019', 'DGIdb_Drug_Targets_2024', 'DSigDB', 'Data_Acquisition_Method_Most_Popular_Genes', 'DepMap_CRISPR_GeneDependency_CellLines_2023', 'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019', 'DepMap_WG_C

In [36]:
def run_gene_enrichment(feature_csv_path, config_path="config.json", top_n=20):
    """Run Enrichr enrichment for the genes listed in a feature CSV.

    Gene identifiers are read from ``feature_csv_path``, converted to gene
    symbols with ``ensembl_to_symbol`` and submitted to ``enrichr`` once per
    library. For every library that returns a non-empty result table, the
    ``top_n`` rows with the smallest "Adjusted P-value" are written to
    ``results/analysis/<experiment_name>_<library>_top_enrichment.csv``.

    Parameters
    ----------
    feature_csv_path : str
        Path of the CSV file holding the gene identifier column.
    config_path : str, default "config.json"
        Path of a JSON file. The keys read are ``experiment_name``
        (default "experiment"), ``species`` (default "mouse") and
        ``voted_feature_column`` (default "Gene").
    top_n : int, default 20
        Number of best-ranked terms saved per library.

    Returns
    -------
    None
        Results are written to disk; progress is printed.

    Raises
    ------
    KeyError
        If the configured gene column is not present in the CSV.
    """
    # ---- Load config ----
    with open(config_path) as f:
        config = json.load(f)

    experiment_name = config.get("experiment_name", "experiment")
    species = config.get("species", "mouse")  # Accepts "human", "mouse", etc.
    gene_column = config.get("voted_feature_column", "Gene")
    # NOTE(review): several libraries below are mouse-specific and
    # ensembl_to_symbol is called without a species, so a non-mouse `species`
    # in the config only affects the `organism` argument — confirm intent.
    enrichr_libraries = [
        "GO_Biological_Process_2023",
        "GO_Cellular_Component_2023",
        "GO_Molecular_Function_2023",
        "Reactome_2022",
        "ARCHS4_Tissues",
        "Rare_Diseases_AutoRIF_Gene_Lists",
        "KEGG_2019_Mouse",
        "WikiPathways_2024_Mouse",
        "MGI_Mammalian_Phenotype_Level_4_2024",
        "Mouse_Gene_Atlas"
    ]
    output_dir = "results/analysis"

    # ---- Load gene list ----
    df = pd.read_csv(feature_csv_path)
    if gene_column not in df.columns:
        raise KeyError(
            f"Column '{gene_column}' not found in {feature_csv_path}; "
            f"available columns: {list(df.columns)}"
        )
    # Drop blank cells so NaN is never forwarded to the ID-mapping step.
    genes = df[gene_column].dropna().tolist()
    print(genes)

    # Identifiers without a symbol are absent from the mapping; dict.fromkeys
    # removes repeated symbols while keeping their first-seen order.
    mapped = ensembl_to_symbol(genes) if genes else {}
    symbols = list(dict.fromkeys(mapped.values()))
    print(f"Mapped {len(mapped)} of {len(genes)} identifiers to gene symbols")
    if not symbols:
        # Nothing to submit: avoid calling Enrichr with an empty gene list.
        print("No gene symbols available; skipping enrichment.")
        return

    # The output folder may not exist yet on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    # ---- Enrichment ----
    for lib in enrichr_libraries:
        print(f"Running enrichment for: {lib}")
        enr = enrichr(
            gene_list=symbols,
            gene_sets=lib,
            organism=species,
            outdir=None,
        )

        if enr.results is not None and not enr.results.empty:
            top_results = enr.results.sort_values("Adjusted P-value").head(top_n)
            output_path = os.path.join(output_dir, f"{experiment_name}_{lib}_top_enrichment.csv")
            top_results.to_csv(output_path, index=False)
            print(f"Saved: {output_path}")
        else:
            print(f"No enrichment found for: {lib}")


In [37]:
# Run the enrichment on the saved feature CSV, keeping the 30 best-ranked
# terms per library.
run_gene_enrichment(
    feature_csv_path="results/analysis/fv_RR1_baseline_ann_nb_ridge_20250618_120513.csv",
    top_n=30,
)

['ENSMUSG00000117874', 'ENSMUSG00000075318', 'ENSMUSG00000111709', 'ENSMUSG00000109564', 'ENSMUSG00000094686', 'ENSMUSG00000078901', 'ENSMUSG00000032323', 'ENSMUSG00000049436', 'ENSMUSG00000041794', 'ENSMUSG00000035373', 'ENSMUSG00000027379', 'ENSMUSG00000027500', 'ENSMUSG00000022613', 'ENSMUSG00000074183', 'ENSMUSG00000034918', 'ENSMUSG00000028587', 'ENSMUSG00000027871', 'ENSMUSG00000022878', 'ENSMUSG00000035352', 'ENSMUSG00000054083', 'ENSMUSG00000027070', 'ENSMUSG00000021697', 'ENSMUSG00000079494', 'ENSMUSG00000026255', 'ENSMUSG00000026609', 'ENSMUSG00000063590', 'ENSMUSG00000029384', 'ENSMUSG00000010476', 'ENSMUSG00000058297', 'ENSMUSG00000032172']
Running enrichment for: MGI_Mammalian_Phenotype_Level_4_2024
Saved: results/analysis/RR1_rf_baseline_MGI_Mammalian_Phenotype_Level_4_2024_top_enrichment.csv
Running enrichment for: Mouse_Gene_Atlas
Saved: results/analysis/RR1_rf_baseline_Mouse_Gene_Atlas_top_enrichment.csv


In [21]:
# Show the terms whose adjusted p-value is below 0.05.
# NOTE(review): in the visible cells `enr` is only assigned inside
# run_gene_enrichment, so this relies on a notebook-level `enr` created
# elsewhere — confirm it exists on a fresh kernel.
enr.results.loc[
    enr.results["Adjusted P-value"] < 0.05,
    ["Term", "Genes", "Adjusted P-value"],
]

Unnamed: 0,Term,Genes,Adjusted P-value


In [29]:
# Load the feature CSV and pull its "Gene" column into a plain Python list.
df = pd.read_csv("results/analysis/fv_RR1_baseline_ann_nb_ridge_20250618_120513.csv")
# `genes` is a notebook-level name; the ID-mapping example cell reads it.
genes = df["Gene"].tolist()
print(genes)

['ENSMUSG00000117874', 'ENSMUSG00000075318', 'ENSMUSG00000111709', 'ENSMUSG00000109564', 'ENSMUSG00000094686', 'ENSMUSG00000078901', 'ENSMUSG00000032323', 'ENSMUSG00000049436', 'ENSMUSG00000041794', 'ENSMUSG00000035373', 'ENSMUSG00000027379', 'ENSMUSG00000027500', 'ENSMUSG00000022613', 'ENSMUSG00000074183', 'ENSMUSG00000034918', 'ENSMUSG00000028587', 'ENSMUSG00000027871', 'ENSMUSG00000022878', 'ENSMUSG00000035352', 'ENSMUSG00000054083', 'ENSMUSG00000027070', 'ENSMUSG00000021697', 'ENSMUSG00000079494', 'ENSMUSG00000026255', 'ENSMUSG00000026609', 'ENSMUSG00000063590', 'ENSMUSG00000029384', 'ENSMUSG00000010476', 'ENSMUSG00000058297', 'ENSMUSG00000032172']


In [31]:
import mygene

def ensembl_to_symbol(ensembl_ids, species='mouse'):
    """Map Ensembl gene IDs to gene symbols through a MyGeneInfo query.

    Parameters
    ----------
    ensembl_ids : iterable of str, str, or None
        Identifiers to look up. ``None`` or an empty input yields an empty
        mapping without querying the service.
    species : str, default 'mouse'
        Forwarded as the ``species`` argument of ``querymany``.

    Returns
    -------
    dict
        ``{queried id: symbol}`` for every returned entry that carries a
        'symbol' and is not flagged 'notfound'. Identifiers without such an
        entry are omitted; if a query appears more than once in the response,
        the last entry wins.
    """
    if ensembl_ids is None:
        return {}
    # Materialise generic iterables (pandas Series and numpy arrays have
    # ambiguous truthiness); strings are passed through unchanged so a string
    # argument reaches querymany exactly as before.
    ids = ensembl_ids if isinstance(ensembl_ids, str) else list(ensembl_ids)
    if len(ids) == 0:
        # Nothing to look up: skip the remote query entirely.
        return {}

    mg = mygene.MyGeneInfo()
    query_res = mg.querymany(ids, scopes='ensembl.gene', fields='symbol', species=species)

    # Create dict mapping ensembl -> symbol (if found)
    return {
        entry['query']: entry['symbol']
        for entry in query_res
        if 'notfound' not in entry and 'symbol' in entry
    }

# Example usage
# NOTE(review): `ensembl_ids` is assigned but never used; the call below maps
# the notebook-level `genes` list instead — confirm which input was intended.
ensembl_ids = ["ENSMUSG00000075318", "ENSMUSG00000111709"]
mapping = ensembl_to_symbol(genes)
print(mapping)


{'ENSMUSG00000117874': 'Gm50318', 'ENSMUSG00000075318': 'Scn2a', 'ENSMUSG00000111709': 'Gsta13', 'ENSMUSG00000109564': 'Muc16', 'ENSMUSG00000094686': 'Ccl21a', 'ENSMUSG00000078901': 'Zfp1009', 'ENSMUSG00000032323': 'Cyp11a1', 'ENSMUSG00000049436': 'Upk1b', 'ENSMUSG00000041794': 'Myrip', 'ENSMUSG00000035373': 'Ccl7', 'ENSMUSG00000027379': 'Bub1', 'ENSMUSG00000027500': 'Stmn2', 'ENSMUSG00000022613': 'Miox', 'ENSMUSG00000074183': 'Gsta1', 'ENSMUSG00000034918': 'Cdhr2', 'ENSMUSG00000028587': 'Orc1', 'ENSMUSG00000027871': 'Hsd3b1', 'ENSMUSG00000022878': 'Adipoq', 'ENSMUSG00000035352': 'Ccl12', 'ENSMUSG00000054083': 'Capn12', 'ENSMUSG00000027070': 'Lrp2', 'ENSMUSG00000021697': 'Depdc1b', 'ENSMUSG00000079494': 'Nat8f5', 'ENSMUSG00000026255': 'Efhd1', 'ENSMUSG00000026609': 'Ush2a', 'ENSMUSG00000063590': 'Slc22a28', 'ENSMUSG00000029384': '2010109A12Rik', 'ENSMUSG00000010476': 'Ebf3', 'ENSMUSG00000058297': 'Spock2', 'ENSMUSG00000032172': 'Olfm2'}
