## 🧪 Create Assay Nodes & Relationships

This notebook reads the dataset manifest, extracts assay and material metadata, maps tissues/cells to UBERON/Cell Ontology, and writes Neo4j CSVs for assay nodes and their relationships to studies, anatomy, cell types, genes, and methylation regions using ontology_mapper.

Author: Chisom Aniekwensi (sommaniekwensi@gmail.com)

In [22]:
import pandas as pd
import os
import logging
import pickle
import kg_utils
import time
from pathlib import Path
from typing import Dict, List

In [3]:
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s',
                   handlers=[logging.FileHandler("nasa_kg.log"), logging.StreamHandler()])
logger = logging.getLogger("nasa_kg")


In [5]:
# Load the utilities
with open('kg_utils.pkl', 'rb') as f:
    kg_utils = pickle.load(f)

# Set up directories
directories = kg_utils["setup_directories"]()

Base Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025
Knowledge Graph Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph
Version Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1
Nodes Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes
Relationships Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\rels


In [7]:
# function to save dataframe to a CSV file in the specified directory

def save_dataframe(df, filename, directory):
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, filename)
    try:
        df.to_csv(file_path, index=False)
        logger.info(f"Saved {len(df)} rows to {file_path}")
    except PermissionError:
        # Fall back to a directory we definitely have access to
        backup_dir = os.path.join(os.getcwd(), "backup_data")
        os.makedirs(backup_dir, exist_ok=True)
        backup_path = os.path.join(backup_dir, filename)
        df.to_csv(backup_path, index=False)
        logger.warning(f"Permission denied at {file_path}. Saved to {backup_path} instead.")
    return df

In [9]:
# Statistical calculation function
def calculate_statistics(value1, value2, method="t-test"):
    if method == "t-test":
        t_stat, p_value = stats.ttest_ind(value1, value2)
        effect_size = abs(np.mean(value1) - np.mean(value2)) / np.std(np.concatenate([value1, value2]))
        return {"t_statistic": round(t_stat, 3), "p_value": round(p_value, 4), "effect_size": round(effect_size, 2)}
    elif method == "fold_change":
        fold_change = np.mean(value1) / np.mean(value2) if np.mean(value2) != 0 else 1.0
        return {"fold_change": round(fold_change, 2), "log2fc": round(np.log2(fold_change), 2)}
    return {}

In [11]:
# Create Study nodes with proper column headers
studies = [
    {"identifier": "OSD-557", "name": "SANS Eye Structure Study", "organism": "Mus musculus", 
     "description": "Analysis of eye structures in spaceflight", "taxonomy": "10090",
     "data_types": "imaging,clinical", "mission_id": "RR9"},
    {"identifier": "OSD-568", "name": "SANS Retina Analysis", "organism": "Mus musculus", 
     "description": "Retinal thickness measurements", "taxonomy": "10090",
     "data_types": "imaging,clinical", "mission_id": "RR9"},
    {"identifier": "OSD-679", "name": "IOP and OCT Assessments", "organism": "Mus musculus", 
     "description": "Intraocular pressure measurements", "taxonomy": "10090",
     "data_types": "imaging,clinical,biomarker", "mission_id": "HLU"}
]

In [13]:
# Create Mission nodes with proper column headers
missions = [
    {"identifier": "RR9", "name": "Rodent Research 9", 
     "description": "Investigation of spaceflight effects on rodent physiology", 
     "duration_days": 35, "organization": "NASA"},
    {"identifier": "HLU", "name": "Hindlimb Unloading Experiment", 
     "description": "Ground-based analog for spaceflight effects", 
     "duration_days": 28, "organization": "NASA"}
]

In [24]:
# Call functions as module methods
dirs = kg_utils.setup_directories()
kg_utils.save_dataframe(pd.DataFrame(studies), os.path.join(dirs["nodes"], "Study.csv"))
kg_utils.save_dataframe(pd.DataFrame(missions), os.path.join(dirs["nodes"], "Mission.csv"))

2025-05-04 18:41:24,035 - INFO - Saved 3 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes\Study.csv
2025-05-04 18:41:24,043 - INFO - Saved 2 rows to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes\Mission.csv


Base Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025
Knowledge Graph Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph
Version Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1
Nodes Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\nodes
Relationships Directory: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1\rels


In [26]:
# Save studies and missions for other notebooks
with open('kg_study_data.pkl', 'wb') as f:
    pickle.dump({"studies": studies, "missions": missions}, f)

logger.info("Utilities saved and ready for use in other notebooks, Study and Mission nodes created successfully")

2025-05-04 18:42:03,471 - INFO - Utilities saved and ready for use in other notebooks, Study and Mission nodes created successfully
