## 🧿 Create SANS nodes
This notebook processes the GeneLab datasets to build Neo4j node and relationship files for SANS (Spaceflight Associated Neuro-ocular Syndrome).

Author: Chisom Aniekwensi (sommaniekwensi@gmail.com)

In [1]:
import logging
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as stats  # Make sure to import scipy.stats

import kg_utils

Successfully created kg_utils.pkl


In [3]:
# Logging configuration: INFO and above, timestamped, sent to both the
# "nasa_kg.log" file and the notebook's stream output.
# (logging.basicConfig does nothing if the root logger already has handlers.)
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        logging.FileHandler("nasa_kg.log"),
        logging.StreamHandler(),
    ],
)
# Notebook-level logger registered under the project name.
logger = logging.getLogger("nasa_kg")

# Name -> object lookup for the shared kg_utils helpers, version constant and logger.
utils = dict(
    setup_directories=kg_utils.setup_directories,
    save_dataframe=kg_utils.save_dataframe,
    calculate_statistics=kg_utils.calculate_statistics,
    KG_VERSION=kg_utils.KG_VERSION,
    logger=kg_utils.logger,
)

In [5]:
# SANS vocabulary consumed by the node-building cells:
#   "clinical"      -> finding names grouped by location ("eye", "brain")
#   "biomarkers"    -> biomarker names
#   "environmental" -> environmental factor names
# Names are snake_case; insertion order of keys and list items is preserved.
SANS_DATA = dict(
    clinical=dict(
        eye=[
            "optic_disc_edema",
            "chorioretinal_folds",
            "globe_flattening",
        ],
        brain=[
            "gray_matter_reduction",
            "csf_volume_decrease",
        ],
    ),
    biomarkers=[
        "homocysteine",
        "cystathionine",
        "methylcitric_acid",
    ],
    environmental=[
        "co2",
        "radiation",
        "microgravity",
    ],
)

In [7]:
# Load study data from pickle file
# SECURITY: pickle.load can execute arbitrary code while deserializing.
# Only open a kg_study_data.pkl that was produced by a trusted source.
# (`pickle` is imported in the notebook's import cell.)
with open('kg_study_data.pkl', 'rb') as f:
    data = pickle.load(f)

# Extract studies from data
# Fail with an actionable message instead of a bare KeyError when the
# pickle does not carry the expected top-level entry.
if 'studies' not in data:
    raise KeyError(
        "kg_study_data.pkl has no 'studies' entry; "
        "regenerate the pickle before running this notebook"
    )
studies = data['studies']

In [9]:
# Create clinical findings
# One record per (study, clinical finding) pair from SANS_DATA["clinical"].
#
# NOTE: "effect_size" is a synthetic placeholder drawn uniformly from
# [0.1, 2.0] and rounded to 2 decimals; it is not computed from study data.
# A dedicated seeded generator makes the values identical on every
# Restart & Run All instead of depending on the global `random` state.
# NOTE(review): .title() renders "csf_volume_decrease" as
# "Csf Volume Decrease" -- confirm whether the acronym should stay upper-case.
findings_rng = random.Random("sans_clinical_findings")

findings = []
for study in studies:
    for location, finding_list in SANS_DATA["clinical"].items():
        for finding in finding_list:
            findings.append({
                # Study-scoped identifier keeps node ids unique across studies.
                "identifier": f"{study['identifier']}_{finding}",
                "name": finding.replace("_", " ").title(),
                "category": location.title(),
                "effect_size": round(findings_rng.uniform(0.1, 2.0), 2)
            })

In [11]:
# Create biomarkers
# One record per (study, biomarker) pair from SANS_DATA["biomarkers"].
#
# NOTE: "fold_change" is a synthetic placeholder drawn uniformly from
# [0.5, 3.0] and rounded to 2 decimals; it is not computed from study data.
# A dedicated seeded generator makes the values identical on every
# Restart & Run All instead of depending on the global `random` state.
biomarkers_rng = random.Random("sans_biomarkers")

biomarkers = []
for study in studies:
    for marker in SANS_DATA["biomarkers"]:
        biomarkers.append({
            # Study-scoped identifier keeps node ids unique across studies.
            "identifier": f"{study['identifier']}_{marker}",
            "name": marker.replace("_", " ").title(),
            "category": "Biomarker",
            "fold_change": round(biomarkers_rng.uniform(0.5, 3.0), 2)
        })


In [13]:
# Create environmental factors
# One record per (study, environmental factor) pair from
# SANS_DATA["environmental"].
#
# NOTE: "odds_ratio" is a synthetic placeholder drawn uniformly from
# [1.1, 4.0] and rounded to 2 decimals; it is not computed from study data.
# A dedicated seeded generator makes the values identical on every
# Restart & Run All instead of depending on the global `random` state.
# NOTE(review): .title() renders "co2" as "Co2" -- confirm whether the
# display name should be "CO2".
env_factors_rng = random.Random("sans_environmental_factors")

env_factors = []
for study in studies:
    for factor in SANS_DATA["environmental"]:
        env_factors.append({
            # Study-scoped identifier keeps node ids unique across studies.
            "identifier": f"{study['identifier']}_{factor}",
            "name": factor.replace("_", " ").title(),
            "category": "Environmental",
            "odds_ratio": round(env_factors_rng.uniform(1.1, 4.0), 2)
        })

In [15]:
# Persist the three SANS node tables. Each record list is turned into a
# DataFrame and handed to kg_utils.save_dataframe together with a target
# path under the "nodes" directory returned by kg_utils.setup_directories().
dirs = kg_utils.setup_directories()
for records, csv_name in (
    (findings, "ClinicalFinding.csv"),
    (biomarkers, "Biomarker.csv"),
    (env_factors, "EnvironmentalFactor.csv"),
):
    node_table = pd.DataFrame(records)
    kg_utils.save_dataframe(node_table, os.path.join(dirs["nodes"], csv_name))

2025-05-04 18:54:27,866 - INFO - Saved 15 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes\ClinicalFinding.csv
2025-05-04 18:54:27,872 - INFO - Saved 9 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes\Biomarker.csv
2025-05-04 18:54:27,880 - INFO - Saved 9 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes\EnvironmentalFactor.csv


Base Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025
Knowledge Graph Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph
Version Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1
Nodes Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes
Relationships Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\rels
