## 🧪 Create SANS Nodes & Relationships
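This notebook loads the previously generated SANS (Spaceflight Associated Neuro-ocular Syndrome) node tables (clinical findings, biomarkers, environmental factors, and studies), defines the SANS imaging assays, and maps their target tissues to UBERON anatomy terms. It then writes Neo4j-ready CSVs for the Assay, Anatomy, and Measurement nodes and for the relationships linking missions, studies, assays, anatomy, clinical findings, biomarkers, and environmental factors.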

Author: Chisom Aniekwensi (sommaniekwensi@gmail.com)

In [61]:
import pandas as pd
import numpy as np
import random
import pickle
import os
import kg_utils
import logging
import scipy.stats as stats  # Make sure to import scipy.stats

In [63]:
# Configure logging: every record is written to nasa_kg.log and echoed to the notebook output
log_handlers = [logging.FileHandler("nasa_kg.log"), logging.StreamHandler()]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=log_handlers,
)
logger = logging.getLogger("nasa_kg")

# Resolve the output directories; later cells read the "nodes", "rels" and "version" entries
dirs = kg_utils.setup_directories()

Base Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025
Knowledge Graph Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph
Version Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1
Nodes Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes
Relationships Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\rels


In [65]:
# Load SANS data from CSV files.
# All three tables are read before any of them is converted, so a single
# missing file leaves findings, biomarkers and env_factors all empty.
nodes_dir = dirs["nodes"]
try:
    findings_df = pd.read_csv(os.path.join(nodes_dir, "ClinicalFinding.csv"))
    biomarkers_df = pd.read_csv(os.path.join(nodes_dir, "Biomarker.csv"))
    env_factors_df = pd.read_csv(os.path.join(nodes_dir, "EnvironmentalFactor.csv"))
except FileNotFoundError:
    logger.warning("One or more SANS data files not found - creating empty lists")
    findings, biomarkers, env_factors = [], [], []
else:
    # One dict per CSV row, keyed by column name
    findings = findings_df.to_dict('records')
    biomarkers = biomarkers_df.to_dict('records')
    env_factors = env_factors_df.to_dict('records')
    logger.info(f"Loaded {len(findings)} findings, {len(biomarkers)} biomarkers, and {len(env_factors)} environmental factors")

2025-05-04 19:38:09,102 - INFO - Loaded 15 findings, 9 biomarkers, and 9 environmental factors


In [67]:
# Load studies from CSV as a list of per-row dicts.
# There is no fallback here: a missing Study.csv raises and stops the notebook.
study_csv_path = os.path.join(dirs["nodes"], "Study.csv")
studies_df = pd.read_csv(study_csv_path)
studies = studies_df.to_dict(orient='records')
logger.info(f"Loaded {len(studies)} studies")

2025-05-04 19:38:12,585 - INFO - Loaded 3 studies


In [69]:
# SANS imaging assays with detailed metadata.
# "abbrev" is lower-cased to form the suffix of each assay identifier, and
# every entry in "targets" must be a key of ANATOMY.
IMAGING_ASSAYS = [
    {
        "name": "Optical Coherence Tomography",
        "abbrev": "OCT",
        "targets": ["eye", "retina"],
        "description": "Non-invasive imaging technique using light waves",
        "resolution": "5-10 μm",
    },
    {
        "name": "Magnetic Resonance Imaging",
        "abbrev": "MRI",
        "targets": ["brain"],
        "description": "Uses magnetic fields to generate images",
        "resolution": "1-2 mm",
    },
    {
        "name": "Tonometry",
        "abbrev": "IOP",
        "targets": ["eye"],
        "description": "Measures intraocular pressure",
        "resolution": "1 mmHg",
    },
]

In [71]:
# Anatomical structure mappings with ontology IDs.
# Keyed by tissue name; the key is repeated as the record's "name" field.
# Insertion order (eye, brain, retina) is the row order of the saved Anatomy table.
ANATOMY = {
    tissue: {"identifier": uberon_id, "name": tissue, "description": summary}
    for tissue, uberon_id, summary in (
        ("eye", "UBERON:0000970", "Organ of vision"),
        ("brain", "UBERON:0000955", "Central nervous system organ"),
        ("retina", "UBERON:0000966", "Light-sensitive tissue"),
    )
}

In [73]:
# Empty accumulators for the Assay and Measurement node records
assays, measurements = [], []

In [75]:
# Build one Assay node per (study, imaging assay) pair, plus a retinal-thickness
# Measurement node for every OCT assay that targets the retina.
#
# The output lists are rebound here so that re-running this cell replaces the
# records instead of appending duplicates, and a locally seeded generator keeps
# the simulated measurement values identical from one run to the next.
assays = []
measurements = []
measurement_rng = np.random.default_rng(42)  # fixed seed -> reproducible simulated values

for study in studies:
    for assay_info in IMAGING_ASSAYS:
        # Identifier convention: "<study identifier>_<lower-cased assay abbreviation>"
        assay_id = f"{study['identifier']}_{assay_info['abbrev'].lower()}"
        assay = {
            "identifier": assay_id,
            "name": assay_info["name"],
            "description": assay_info["description"],
            "resolution": assay_info["resolution"],
            "type": "non-omics",
            "category": "SANS"
        }

        # Add targets: one "target_<tissue>_id" column per investigated tissue
        for target in assay_info["targets"]:
            target_id = ANATOMY[target]["identifier"]
            assay[f"target_{target}_id"] = target_id

            # Add measurements for specific targets (currently only OCT on the retina)
            if assay_info["abbrev"] == "OCT" and target == "retina":
                # Simulated value: normal draw with mean 250 and sd 20, rounded to 0.1
                thickness = round(float(measurement_rng.normal(250, 20)), 1)
                measurements.append({
                    "identifier": f"{assay_id}_retinal_thickness",
                    "name": "Retinal Thickness",
                    "value": f"{thickness} μm",
                    "reference_range": "200-300 μm",
                    "assay_id": assay_id,
                    "target_id": target_id
                })

        assays.append(assay)

In [77]:
# 1. Mission-Study relationships: one edge from each study's mission to the study itself
mission_study_rels = [
    {"from": study["mission_id"], "to": study["identifier"]}
    for study in studies
]

In [79]:
# 2. Study-Assay relationships
# Assay identifiers have the form "<study identifier>_<assay abbreviation>", so
# only the final "_"-separated piece is stripped to recover the study. Splitting
# on the first "_" would truncate any study identifier that itself contains an
# underscore and point the edge at a study that does not exist.
study_assay_rels = [
    {"from": assay["identifier"].rsplit("_", 1)[0], "to": assay["identifier"]}
    for assay in assays
]

In [81]:
# 3. Assay-Anatomy relationships
# Every "target_<tissue>_id" field on an assay record holds an anatomy
# identifier; each such field becomes one edge from the assay to that anatomy term.
assay_anatomy_rels = [
    {"from": assay["identifier"], "to": anatomy_id}
    for assay in assays
    for field, anatomy_id in assay.items()
    if field.startswith("target_") and field.endswith("_id")
]

In [83]:
# 4. Study-Finding relationships with proper data types
# The study ID is the text before the first "_" in the finding identifier.
# NOTE(review): the .get() defaults only apply when the column is absent; a blank
# CSV cell presumably arrives as NaN and is passed through unchanged - confirm.
study_finding_rels = [
    {
        "from": finding["identifier"].split("_")[0],  # string: GeneLab Data System ID
        "to": finding["identifier"],  # string: Finding identifier
        "effect_size": float(finding.get("effect_size", 0)),  # float: Effect size
        "p_value": float(finding.get("p_value", 0.05)),  # float: Statistical p-value
    }
    for finding in (findings or ())
]

In [85]:
# 5. Study-Biomarker relationships with proper data types
# The study ID is the text before the first "_" in the biomarker identifier.
# NOTE(review): the .get() defaults only apply when the column is absent; a blank
# CSV cell presumably arrives as NaN and is passed through unchanged - confirm.
study_biomarker_rels = [
    {
        "from": biomarker["identifier"].split("_")[0],  # string: GeneLab Data System ID
        "to": biomarker["identifier"],  # string: Biomarker identifier
        "fold_change": float(biomarker.get("fold_change", 1)),  # float: Fold change value
        "p_value": float(biomarker.get("p_value", 0.05)),  # float: Statistical p-value
    }
    for biomarker in (biomarkers or ())
]

In [87]:
# 6. Study-EnvFactor relationships with proper data types
# The study ID is the text before the first "_" in the environmental factor identifier.
# NOTE(review): the .get() default only applies when the column is absent; a blank
# CSV cell presumably arrives as NaN and is passed through unchanged - confirm.
study_envfactor_rels = [
    {
        "from": env_factor["identifier"].split("_")[0],  # string: GeneLab Data System ID
        "to": env_factor["identifier"],  # string: Environmental factor identifier
        "odds_ratio": float(env_factor.get("odds_ratio", 1)),  # float: Odds ratio value
    }
    for env_factor in (env_factors or ())
]

In [89]:
# Save assay, anatomy and measurement nodes (written in this order)
node_tables = (
    ("Assay.csv", assays),
    ("Anatomy.csv", list(ANATOMY.values())),
    ("Measurement.csv", measurements),
)
for node_filename, node_records in node_tables:
    kg_utils.save_dataframe(pd.DataFrame(node_records), os.path.join(dirs["nodes"], node_filename))

2025-05-04 19:40:05,793 - INFO - Saved 9 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes\Assay.csv
2025-05-04 19:40:05,800 - INFO - Saved 3 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes\Anatomy.csv
2025-05-04 19:40:05,806 - INFO - Saved 3 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes\Measurement.csv


In [91]:
# Save the relationship files that are always written, even when a list is empty
core_rel_tables = (
    ("Mission-CONDUCTED_MICS-Study.csv", mission_study_rels),
    ("Study-PERFORMED_SpAS-Assay.csv", study_assay_rels),
    ("Assay-INVESTIGATED_ASiA-Anatomy.csv", assay_anatomy_rels),
)
for rel_filename, rel_records in core_rel_tables:
    kg_utils.save_dataframe(pd.DataFrame(rel_records), os.path.join(dirs["rels"], rel_filename))

2025-05-04 19:40:08,428 - INFO - Saved 3 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\rels\Mission-CONDUCTED_MICS-Study.csv
2025-05-04 19:40:08,436 - INFO - Saved 9 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\rels\Study-PERFORMED_SpAS-Assay.csv
2025-05-04 19:40:08,442 - INFO - Saved 12 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\rels\Assay-INVESTIGATED_ASiA-Anatomy.csv


In [93]:
# Only save relationships if there's data: an empty list writes no file at all
optional_rel_tables = (
    ("Study-EXHIBITED_SeC-ClinicalFinding.csv", study_finding_rels),
    ("Study-EXHIBITED_SeB-Biomarker.csv", study_biomarker_rels),
    ("Study-EXPOSED_TO_SeE-EnvironmentalFactor.csv", study_envfactor_rels),
)
for optional_filename, optional_records in optional_rel_tables:
    if optional_records:
        kg_utils.save_dataframe(pd.DataFrame(optional_records), os.path.join(dirs["rels"], optional_filename))

2025-05-04 19:40:09,893 - INFO - Saved 15 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\rels\Study-EXHIBITED_SeC-ClinicalFinding.csv
2025-05-04 19:40:09,898 - INFO - Saved 9 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\rels\Study-EXHIBITED_SeB-Biomarker.csv
2025-05-04 19:40:09,904 - INFO - Saved 9 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\rels\Study-EXPOSED_TO_SeE-EnvironmentalFactor.csv


In [95]:
# Record completion in the log, then print a short summary for the reader
logger.info("All assay nodes and relationships created successfully")

print("\nKnowledge Graph build complete!")
version_dir = os.path.abspath(dirs['version'])  # absolute path of the versioned output folder
print(f"Files created in: {version_dir}")

2025-05-04 19:40:12,338 - INFO - All assay nodes and relationships created successfully



Knowledge Graph build complete!
Files created in: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1
