## 🌱 Create Model‑Organism & Human Gene Nodes with Orthology Links

This notebook processes the GeneLab datasets to build Neo4j node and relationship files for model‑organism genes (MGene), human ortholog genes, and their orthology links. ortholog_mapper packages to extract gene IDs, map to human orthologs, and write CSVs ready for SPOKE ingestion.

Author: Chisom Aniekwensi (sommaniekwensi@gmail.com)

In [1]:
import pandas as pd
import numpy as np
import pickle
import random
import os
import kg_utils
import logging
import scipy.stats as stats  # Make sure to import scipy.stats

Successfully created kg_utils.pkl


In [2]:
# Define your KG version
KG_VERSION = "v0.0.1"  # Replace with your version

In [5]:
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s',
                   handlers=[logging.FileHandler("nasa_kg.log"), logging.StreamHandler()])
logger = logging.getLogger("nasa_kg")

utils = {
    "setup_directories": kg_utils.setup_directories,
    "save_dataframe": kg_utils.save_dataframe,
    "calculate_statistics": kg_utils.calculate_statistics,
    "logger": kg_utils.logger
}

In [7]:
# Extract functions and data
# Extract functions and variables
setup_directories = utils['setup_directories']
save_dataframe = utils['save_dataframe']
calculate_statistics = utils['calculate_statistics']
logger = utils['logger']

# Get the directories
dirs = kg_utils.setup_directories()

Base Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025
Knowledge Graph Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph
Version Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1
Nodes Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes
Relationships Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\rels


In [9]:
# Gene mapping dictionary with proper entrez IDs
GENE_MAPPINGS = {
    "10090": {  # Mouse to Human 
        "14679": {"human_entrez_id": "2773", "human_symbol": "GNAI3", "function": "G protein signaling"},
        "26413": {"human_entrez_id": "6009", "human_symbol": "RHEB", "function": "mTOR signaling"},
        "19645": {"human_entrez_id": "7157", "human_symbol": "TP53", "function": "Tumor suppression"}
    }
}

In [15]:
# Load study data from pickle file
import pickle
with open('kg_study_data.pkl', 'rb') as f:
    data = pickle.load(f)


In [17]:
# Extract studies from data
studies = data['studies']
mgenes = []
for study in studies:
    taxonomy = study["taxonomy"]
    for entrez, symbol in [("14679", "GNAI3"), ("26413", "RHEB"), ("19645", "TP53")]:
        # Generate realistic expression data for control vs experimental
        control_expr = np.random.normal(10, 1, 5)  # 5 control samples
        exp_expr = np.random.normal(12, 2, 5)      # 5 experimental samples
        
        # Calculate statistics - make sure to capture the return values
        p_value = calculate_statistics(exp_expr, control_expr, "t-test")
        fold_change = calculate_statistics(exp_expr, control_expr, "fold_change")
        
        mgenes.append({
            "identifier": entrez,
            "name": symbol,
            "organism": study["organism"],
            "taxonomy": taxonomy,
            "log2fc": round(random.uniform(-3, 3), 2),
            "p_value": p_value,  # Now p_value is properly defined
            "adjusted_p_value": round(p_value * len(studies) * 3, 4)  
        })

In [19]:
# Create human genes and orthology relationships
human_genes = {}
ortholog_rels = []

for gene in mgenes:
    gene_id = gene["identifier"]
    taxonomy = gene["taxonomy"]
    human_mapping = GENE_MAPPINGS.get(taxonomy, {}).get(gene_id, {})
    
    if human_mapping and "human_entrez_id" in human_mapping:
        h_id = human_mapping["human_entrez_id"]
        human_genes[h_id] = {
            "identifier": h_id,
            "name": human_mapping["human_symbol"],
            "function": human_mapping.get("function", "")
        }
        
        ortholog_rels.append({
            "from": gene_id,
            "to": h_id
        })



In [25]:
"""# Save gene nodes and orthology relationships
pd.DataFrame(mgenes).to_csv(os.path.join(dirs["nodes"], "MGene.csv"), index=False)
pd.DataFrame(list(human_genes.values())).to_csv(os.path.join(dirs["nodes"], "Gene.csv"), index=False)
pd.DataFrame(ortholog_rels).to_csv(os.path.join(dirs["relationship"], "MGene-IS_ORTHOLOG_MGiG-Gene.csv"), index=False"""

dirs = kg_utils.setup_directories()
kg_utils.save_dataframe(pd.DataFrame(mgenes), os.path.join(dirs["nodes"], "MGene.csv"))
kg_utils.save_dataframe(pd.DataFrame(list(human_genes.values())), os.path.join(dirs["nodes"], "Gene.csv"))
kg_utils.save_dataframe(pd.DataFrame(ortholog_rels), os.path.join(dirs["rels"], "MGene-IS_ORTHOLOG_MGiG-Gene.csv"))


2025-05-04 18:50:12,375 - INFO - Saved 9 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes\MGene.csv
2025-05-04 18:50:12,381 - INFO - Saved 3 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes\Gene.csv
2025-05-04 18:50:12,387 - INFO - Saved 9 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\rels\MGene-IS_ORTHOLOG_MGiG-Gene.csv


Base Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025
Knowledge Graph Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph
Version Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1
Nodes Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes
Relationships Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\rels
