## 🌐 Import & Inspect NASA SANS Knowledge Graph in Neo4j

This notebook bulk-imports the CSV node and relationship exports from our NASA KG 
pipeline into a Neo4j graph database, then runs exploratory queries to validate schema, 
metadata, counts, and full-text search functionality.

Author: [Chisom Aniekwensi]

In [1]:
!pip install py2neo pandas



In [26]:
import os, pandas as pd, glob
from py2neo import Graph

In [33]:
# Connect to Neo4j
def connect_to_neo4j():
    uri = "bolt://localhost:7687"
    user = input("Neo4j username [neo4j]: ").strip() or "neo4j"
    password = input("Neo4j password: ").strip()
    return Graph(uri, auth=(user, password))

In [34]:
# Find KG directory
def find_kg_dir(path=None):
    base_dir = path or input("Path to knowledge graph directory: ").strip() or os.getcwd()
    print(f"Searching in: {base_dir}")
    
    for nodes_dir in glob.glob(os.path.join(base_dir, "**/nodes"), recursive=True):
        rels_dir = os.path.join(os.path.dirname(nodes_dir), "rels")
        if os.path.exists(rels_dir):
            print(f"Found KG at: {os.path.dirname(nodes_dir)}")
            return {"nodes": nodes_dir, "rels": rels_dir}
    print("KG not found. Check the path.")
    return None

In [35]:
# Import nodes
def import_nodes(graph, nodes_dir):
    count = 0
    for file in [f for f in os.listdir(nodes_dir) if f.endswith('.csv')]:
        node_type = file.replace('.csv', '')
        df = pd.read_csv(os.path.join(nodes_dir, file))
        print(f"Importing {len(df)} {node_type} nodes...")
        
        # Create index
        graph.run(f"CREATE INDEX IF NOT EXISTS FOR (n:{node_type}) ON (n.identifier)")
        
        # Import in batches
        for i in range(0, len(df), 500):
            batch = df.iloc[i:i+500].fillna('').to_dict('records')
            graph.run(f"UNWIND $records AS r CREATE (n:{node_type}) SET n = r", records=batch)
            count += len(batch)
    return count

In [36]:
def import_relationships(graph, rels_dir):
    """Import relationships from CSV files"""
    rel_count = 0
    for file_name in [f for f in os.listdir(rels_dir) if f.endswith('.csv')]:
        parts = file_name.replace('.csv', '').split('-')
        if len(parts) != 3: continue
        source_type, rel_type, target_type = parts
        
        df = pd.read_csv(os.path.join(rels_dir, file_name))
        print(f"🔗 Importing {len(df)} {rel_type} relationships...")
        
        # Create indices for faster lookups
        graph.run(f"CREATE INDEX IF NOT EXISTS FOR (n:{source_type}) ON (n.identifier)")
        graph.run(f"CREATE INDEX IF NOT EXISTS FOR (n:{target_type}) ON (n.identifier)")
        
        for i in range(0, len(df), 500):
            batch = df.iloc[i:i+500].fillna('').to_dict('records')
            query = f"""
            UNWIND $records AS r
            MATCH (a:{source_type} {{identifier: r.from}})
            MATCH (b:{target_type} {{identifier: r.to}})
            CREATE (a)-[rel:{rel_type}]->(b)
            SET rel = r
            REMOVE rel.from, rel.to
            """
            rel_count += graph.run(query, records=batch).stats().relationships_created
    return rel_count


In [37]:
# Import relationships
def import_relationships(graph, rels_dir):
    count = 0
    for file in [f for f in os.listdir(rels_dir) if f.endswith('.csv')]:
        parts = file.replace('.csv', '').split('-')
        if len(parts) != 3: continue
        source, rel_type, target = parts
        
        df = pd.read_csv(os.path.join(rels_dir, file))
        print(f"Importing {len(df)} {rel_type} relationships...")
        
        # Create indices
        graph.run(f"CREATE INDEX IF NOT EXISTS FOR (n:{source}) ON (n.identifier)")
        graph.run(f"CREATE INDEX IF NOT EXISTS FOR (n:{target}) ON (n.identifier)")
        
        # Import in batches
        batch_count = 0
        for i in range(0, len(df), 500):
            batch = df.iloc[i:i+500].fillna('').to_dict('records')
            query = f"""
            UNWIND $records AS r
            MATCH (a:{source} {{identifier: r.from}})
            MATCH (b:{target} {{identifier: r.to}})
            CREATE (a)-[rel:{rel_type}]->(b)
            SET rel = r
            REMOVE rel.from, rel.to
            """
            graph.run(query, records=batch)
            batch_count += len(batch)
            print(f"  Processed {batch_count}/{len(df)} {rel_type} relationships")
        count += batch_count
    return count


In [38]:
# Main function
if __name__ == "__main__":
    print("NASA SANS Knowledge Graph Importer")
    print("==================================")
    
    # Connect to Neo4j
    graph = connect_to_neo4j()
    
    # Find and import KG
    kg_dir = find_kg_dir()
    if kg_dir:
        # Clear DB if requested
        if input("Clear database? (y/n): ").lower() == 'y':
            graph.run("MATCH (n) DETACH DELETE n")
        
        # Import data
        nodes = import_nodes(graph, kg_dir['nodes'])
        rels = import_relationships(graph, kg_dir['rels'])
        
        # Add styling - each in a separate query
        print("Adding styling...")
        graph.run("MATCH (s:Study) SET s.ui_color = '#6baed6'")
        graph.run("MATCH (m:Mission) SET m.ui_color = '#fd8d3c'")
        graph.run("MATCH (g:Gene) SET g.ui_color = '#74c476'")
        graph.run("MATCH (c:ClinicalFinding) SET c.ui_color = '#fb6a4a'")
        
        # Verify
        count = graph.run("MATCH (n) RETURN count(n) AS count").data()[0]["count"]
        print(f"Import complete! Database now has {count} nodes and {rels} relationships")

NASA SANS Knowledge Graph Importer


Neo4j username [neo4j]:  neo4j
Neo4j password:  neo4jdemo
Path to knowledge graph directory:  C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph


Searching in: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph
Found KG at: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1


Clear database? (y/n):  n


Importing 3 Anatomy nodes...
Importing 9 Assay nodes...
Importing 9 Biomarker nodes...
Importing 15 ClinicalFinding nodes...
Importing 9 EnvironmentalFactor nodes...
Importing 3 Gene nodes...
Importing 3 Measurement nodes...
Importing 9 MGene nodes...
Importing 2 Mission nodes...
Importing 3 Study nodes...
Importing 12 INVESTIGATED_ASiA relationships...
  Processed 12/12 INVESTIGATED_ASiA relationships
Importing 9 IS_ORTHOLOG_MGiG relationships...
  Processed 9/9 IS_ORTHOLOG_MGiG relationships
Importing 3 CONDUCTED_MICS relationships...
  Processed 3/3 CONDUCTED_MICS relationships
Importing 9 EXHIBITED_SeB relationships...
  Processed 9/9 EXHIBITED_SeB relationships
Importing 15 EXHIBITED_SeC relationships...
  Processed 15/15 EXHIBITED_SeC relationships
Importing 9 EXPOSED_TO_SeE relationships...
  Processed 9/9 EXPOSED_TO_SeE relationships
Importing 9 PERFORMED_SpAS relationships...
  Processed 9/9 PERFORMED_SpAS relationships
Adding styling...
Import complete! Database now has 195 n

In [20]:
# Example usage
if __name__ == "__main__":
    print("NASA SANS Knowledge Graph Importer")
    print("==================================")
    
    # Ask for KG directory
    kg_path = input("Enter path to knowledge graph directory (or press Enter to search current directory): ")
    
    # Run the import
    graph = import_sans_kg(base_dir=kg_path or None, clear_db=True)
    
    if graph:
        # Verify import
        study_count = graph.run("MATCH (s:Study) RETURN count(s) AS count").data()[0]["count"]
        print(f"Verification: Found {study_count} Study nodes in the database")

NASA SANS Knowledge Graph Importer


Enter path to knowledge graph directory (or press Enter to search current directory):  C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph


Searching for knowledge graph in: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph
Found knowledge graph at: C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\knowledge_graph\v0.0.1


Username [neo4j]:  neo4j
Password:  neo4jdemo


✅ Connected to sans-genelab


Clear database? (y/n):  y


ClientError: [Database.DatabaseNotFound] Database does not exist. Database name: 'sans-genelab'.

In [11]:
# MAIN FUNCTION
def import_sans_kg(base_dir=None, clear_db=False):
    """Main function to import SANS knowledge graph"""
    print("🚀 NASA SANS Knowledge Graph Import Tool")
    
    # Find knowledge graph
    kg_dir = find_kg_dir(base_dir or os.getcwd())
    if not kg_dir:
        print("❌ Knowledge graph directory not found")
        return False
        
    # Connect to Neo4j
    graph = connect_to_neo4j("sans-genelab")
    if not graph:
        return False
    
    # Clear database if requested
    if clear_db and input("Clear existing database? (y/n): ").lower() == 'y':
        graph.run("MATCH (n) DETACH DELETE n")
        print("🧹 Database cleared")
    
    # Import data
    node_count = import_nodes(graph, kg_dir['nodes'])
    rel_count = import_relationships(graph, kg_dir['rels'])
    
    # Configure GeneLab-specific settings
    configure_genelab_database(graph)
    apply_grass_styling(graph)
    
    print(f"✅ Import complete! Added {node_count} nodes and {rel_count} relationships")
    return True


In [12]:
# Example usage
if __name__ == "__main__":
    import_sans_kg(clear_db=True)

🚀 NASA SANS Knowledge Graph Import Tool
❌ Knowledge graph directory not found
