Constructs an undirected similarity graph.
First, we batch-upload all nodes in parallel.

In [None]:
# Cell 1

import asyncio
import os
import logging
import sys
import pickle
import json
import time
from tqdm import tqdm
from pinecone import Pinecone
from neo4j import GraphDatabase
from dotenv import load_dotenv

# Load environment variables (load_dotenv reads a local .env file when present)
load_dotenv()

# Configurations
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT", "us-east1-gcp")
PINECONE_INDEX_NAME = "belief-embeddings"

NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Script Constants
FETCH_BATCH_SIZE = 500  # number of IDs requested from Pinecone per fetch call
CHECKPOINT_DIR = "checkpoints"
FAILED_UPLOADS_FILE = "failed_uploads.json"
# Relative path: resolved against the current working directory at open() time.
VECTOR_IDS_FILE = "../randomly_discovered_ids.txt"

# Toggles
RESET_NEO4J = False  # when True, reset_neo4j() deletes every node and relationship
RESET_LOGS = False   # when True, truncate the log file and drop saved checkpoints

# Create checkpoint directory
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Logging setup
LOG_FILE = "pipeline.log"

if RESET_LOGS:
    # Clear the log file
    with open(LOG_FILE, "w"):
        pass
    # Also remove the checkpoint files, if they exist
    for checkpoint_name in ("processed_ids.pkl", "uploaded_ids.pkl"):
        checkpoint_path = os.path.join(CHECKPOINT_DIR, checkpoint_name)
        if os.path.exists(checkpoint_path):
            os.remove(checkpoint_path)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        # The log messages contain emoji; pin UTF-8 so writing them does not
        # depend on the platform's default locale encoding.
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
    ],
)
logger = logging.getLogger(__name__)
logger.info("✅ Logging initialized.")

In [None]:
# Cell 2

def init_pinecone():
    """
    Create a Pinecone client and open the configured index.

    Logs the index's reported vector count as a connectivity check.

    Returns:
        tuple: ``(index_handle, client)`` -- the index first, then the client.
    """
    client = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
    index_handle = client.Index(PINECONE_INDEX_NAME)

    # describe_index_stats() doubles as the "are we really connected" probe.
    vector_total = index_handle.describe_index_stats().get('total_vector_count', 0)
    logger.info(f"✅ Connected to Pinecone. Total vectors: {vector_total}")

    return index_handle, client

def init_neo4j():
    """
    Initialize a Neo4j driver, retrying with exponential backoff.

    Each attempt creates a driver and runs ``RETURN 1`` as a connectivity
    probe. A driver whose probe fails is closed before the next attempt so
    that failed attempts do not leak connections.

    Returns:
        The connected Neo4j driver.

    Raises:
        ValueError: If ``NEO4J_URI`` is not set.
        Exception: If every attempt fails; the last underlying error is
            attached as ``__cause__``.
    """
    if not NEO4J_URI:
        raise ValueError("⚠️ NEO4J_URI is not set. Check your .env file.")

    max_attempts = 3
    last_error = None
    for attempt in range(max_attempts):
        driver = None
        try:
            driver = GraphDatabase.driver(
                NEO4J_URI,
                auth=(NEO4J_USER, NEO4J_PASSWORD),
                max_connection_lifetime=3600,
            )
            with driver.session() as session:
                # Quick test query
                if session.run("RETURN 1").single():
                    logger.info("✅ Successfully connected to Neo4j")
                    return driver
            # An empty probe result counts as a failed attempt too.
            raise RuntimeError("connectivity test query returned no record")
        except Exception as e:
            last_error = e
            logger.warning(f"⚠️ Connection attempt {attempt + 1} failed: {str(e)}")
            if driver is not None:
                # Release the half-initialized driver; cleanup is best-effort.
                try:
                    driver.close()
                except Exception:
                    logger.debug("Ignoring error while closing failed Neo4j driver", exc_info=True)
            # Back off before retrying, but do not sleep after the final attempt.
            if attempt < max_attempts - 1:
                time.sleep(2 ** attempt)
    raise Exception("❌ Failed to connect to Neo4J after multiple attempts.") from last_error

def reset_neo4j(driver):
    """Wipe every node and relationship from Neo4j when RESET_NEO4J is on."""
    if not RESET_NEO4J:
        # Toggle is off: leave the database untouched.
        return

    with driver.session() as db_session:
        db_session.run("MATCH (n) DETACH DELETE n")
    logger.info("❌ Neo4j database fully reset.")


# Initialize: open the Pinecone index/client and the Neo4j driver once at
# module level; the pipeline cell below passes `index` and `driver` on.
index, pc = init_pinecone()
driver = init_neo4j()

# Reset Neo4j if toggled (reset_neo4j does nothing unless RESET_NEO4J is True)
reset_neo4j(driver)

In [None]:
# Cell 3

class ProcessTracker:
    """
    Track which vector IDs have been processed and uploaded, persisted on disk.

    Two sets are kept in memory (``processed_ids`` and ``uploaded_ids``) and
    pickled into ``CHECKPOINT_DIR`` after every update so an interrupted run
    can resume where it stopped.
    """

    _PROCESSED_FILE = "processed_ids.pkl"
    _UPLOADED_FILE = "uploaded_ids.pkl"

    def __init__(self):
        # Start empty, then overlay whatever checkpoints exist on disk.
        self.processed_ids = set()
        self.uploaded_ids = set()
        self.load_checkpoint()

    @classmethod
    def _checkpoint_paths(cls):
        """Return ``(processed_path, uploaded_path)`` inside CHECKPOINT_DIR."""
        return (
            os.path.join(CHECKPOINT_DIR, cls._PROCESSED_FILE),
            os.path.join(CHECKPOINT_DIR, cls._UPLOADED_FILE),
        )

    @staticmethod
    def _atomic_pickle(obj, path):
        """Pickle *obj* to *path* via a temp file so *path* is never half-written."""
        tmp_path = f"{path}.tmp"
        try:
            with open(tmp_path, "wb") as f:
                pickle.dump(obj, f)
            # os.replace swaps the file in a single step, so a crash leaves
            # either the old checkpoint or the new one, never a truncated file.
            os.replace(tmp_path, path)
        finally:
            # Only still present when the dump or the replace failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    def load_checkpoint(self):
        """
        Load processed and uploaded IDs from disk.

        Each file is loaded independently, so one unreadable checkpoint does
        not prevent the other from loading. Errors are logged, not raised.
        """
        processed_path, uploaded_path = self._checkpoint_paths()
        targets = (
            ("processed_ids", processed_path, "processed"),
            ("uploaded_ids", uploaded_path, "uploaded"),
        )
        for attr, path, label in targets:
            if not os.path.exists(path):
                continue
            try:
                # NOTE: pickle can execute arbitrary code on load; these files
                # are expected to be ones this class wrote itself.
                with open(path, "rb") as f:
                    loaded = set(pickle.load(f))  # coerce so .update() always works
            except Exception as e:
                logger.error(f"⚠️ Error loading checkpoints: {str(e)}")
                continue
            setattr(self, attr, loaded)
            logger.info(f"🔄 Loaded {len(loaded)} {label} IDs.")

    def save_checkpoint(self):
        """Save progress to disk; failures are logged and do not raise."""
        processed_path, uploaded_path = self._checkpoint_paths()
        try:
            self._atomic_pickle(self.processed_ids, processed_path)
            self._atomic_pickle(self.uploaded_ids, uploaded_path)
            logger.info(f"✅ Saved checkpoints: {len(self.processed_ids)} processed, {len(self.uploaded_ids)} uploaded")
        except Exception as e:
            logger.error(f"⚠️ Error saving checkpoints: {str(e)}")

    def mark_processed(self, vec_ids):
        """Record *vec_ids* as processed and persist immediately."""
        self.processed_ids.update(vec_ids)
        self.save_checkpoint()

    def mark_uploaded(self, vec_ids):
        """Record *vec_ids* as uploaded and persist immediately."""
        self.uploaded_ids.update(vec_ids)
        self.save_checkpoint()

    def get_unprocessed(self, all_ids):
        """Return the IDs from *all_ids* not yet processed, in input order."""
        return [vid for vid in all_ids if vid not in self.processed_ids]

    def get_unuploaded(self, all_ids):
        """Return the IDs from *all_ids* not yet uploaded, in input order."""
        return [vid for vid in all_ids if vid not in self.uploaded_ids]


# Shared module-level tracker; constructing it loads any checkpoints already
# on disk (ProcessTracker.__init__ calls load_checkpoint).
tracker = ProcessTracker()


In [None]:
# Cell 4

def fetch_vector_batch(index, batch_ids):
    """
    Fetch metadata from Pinecone for the given 'batch_ids'.

    Args:
        index: Object exposing ``fetch(ids=...)`` whose response has a
            ``vectors`` mapping of id -> vector (each with ``.metadata``).
        batch_ids: IDs to fetch; an empty value short-circuits to ``[]``.

    Returns:
        list[dict]: One ``{"id": ..., "metadata": ...}`` entry per vector in
        the response. ``metadata`` is always a mapping: a vector stored
        without metadata yields ``{}`` rather than ``None``, so consumers can
        call ``.get(...)`` on it safely. Returns ``[]`` if the fetch raised
        (the error is logged).
    """
    if not batch_ids:
        return []
    try:
        response = index.fetch(ids=batch_ids)
        return [
            {"id": vec_id, "metadata": vec.metadata or {}}
            for vec_id, vec in response.vectors.items()
        ]
    except Exception as e:
        logger.error(f"❌ Error fetching vectors: {str(e)}")
        return []

# Node labels in the order they are written; anything that is not a known
# type falls through to the last label.
_NODE_LABEL_ORDER = ("Concept", "Belief", "UnknownType")
_TYPE_TO_LABEL = {"concept": "Concept", "belief": "Belief"}


def _group_by_label(batch):
    """Bucket vectors by node label, classifying each vector exactly once."""
    groups = {label: [] for label in _NODE_LABEL_ORDER}
    for vector in batch:
        metadata = vector.get("metadata") or {}
        raw_type = metadata.get("type", "")
        # A missing or non-string type is treated as unknown instead of raising.
        type_key = raw_type.lower() if isinstance(raw_type, str) else ""
        label = _TYPE_TO_LABEL.get(type_key, "UnknownType")
        groups[label].append({"id": vector["id"], "metadata": metadata})
    return groups


def _merge_nodes(tx, label, items):
    """Transaction function: MERGE one node per item under *label*."""
    # Cypher cannot parameterize labels; *label* only ever comes from the
    # fixed _NODE_LABEL_ORDER constants, never from input data.
    query = (
        "UNWIND $items AS item\n"
        f"MERGE (n:{label} {{ id: item.id }})\n"
        "SET n += item.metadata"
    )
    tx.run(query, items=items).consume()


def _record_failed_uploads(new_ids):
    """Append *new_ids* to FAILED_UPLOADS_FILE, keeping earlier entries."""
    previous = []
    if os.path.exists(FAILED_UPLOADS_FILE):
        try:
            with open(FAILED_UPLOADS_FILE, "r") as f:
                loaded = json.load(f)
            if isinstance(loaded, list):
                previous = loaded
        except (OSError, json.JSONDecodeError) as e:
            logger.warning(f"⚠️ Could not read existing {FAILED_UPLOADS_FILE}: {str(e)}")

    # Order-preserving de-duplication without requiring hashable entries.
    merged = list(previous)
    for vec_id in new_ids:
        if vec_id not in merged:
            merged.append(vec_id)

    with open(FAILED_UPLOADS_FILE, "w") as f:
        json.dump(merged, f, indent=2)


def upload_to_neo4j(driver, vectors):
    """
    Batch upload metadata to Neo4j.

    Vectors already recorded in ``tracker.uploaded_ids`` are skipped. The rest
    are written in chunks of 500; within each chunk, vectors are grouped by
    ``metadata["type"]`` (case-insensitive) into ``Concept``, ``Belief`` or
    ``UnknownType`` nodes, one write transaction per non-empty group.

    A chunk counts as failed if any of its transactions raises. Failed IDs
    are appended to ``FAILED_UPLOADS_FILE`` (earlier failures are kept);
    successful IDs are passed to ``tracker.mark_uploaded``.

    Args:
        driver: Neo4j driver providing ``session()``.
        vectors: Iterable of ``{"id": ..., "metadata": {...}}`` dicts.

    Returns:
        None
    """
    if not vectors:
        return

    # Filter out any vectors already uploaded
    pending = [v for v in vectors if v["id"] not in tracker.uploaded_ids]
    if not pending:
        return

    BATCH_UPLOAD_SIZE = 500
    failed_uploads = []
    success_ids = []

    with driver.session() as session:
        for start in range(0, len(pending), BATCH_UPLOAD_SIZE):
            batch = pending[start : start + BATCH_UPLOAD_SIZE]
            batch_ids = [v["id"] for v in batch]
            try:
                groups = _group_by_label(batch)
                for label in _NODE_LABEL_ORDER:
                    if groups[label]:
                        session.execute_write(_merge_nodes, label, groups[label])
                # Mark success
                success_ids.extend(batch_ids)
            except Exception as e:
                logger.error(f"❌ Error writing batch to Neo4j: {str(e)}")
                failed_uploads.extend(batch_ids)

    if success_ids:
        tracker.mark_uploaded(success_ids)

    if failed_uploads:
        _record_failed_uploads(failed_uploads)
        logger.error(f"❌ Failed uploads logged in {FAILED_UPLOADS_FILE}")


In [None]:
# Cell 5

async def main():
    """
    Run the node-upload pipeline: read IDs, fetch from Pinecone, write to Neo4j.

    A batch is marked as processed only after its upload step has returned,
    and only if the fetch actually produced vectors. A batch whose fetch
    returned nothing (for example because the fetch failed) stays unprocessed
    and is picked up again on the next run.
    """
    # 1) Read all vector IDs from randomly_discovered_ids.txt
    with open(VECTOR_IDS_FILE, "r") as f:
        raw_lines = f.read().splitlines()
    # Drop blank lines and duplicates (order preserved) so no empty or
    # repeated ID is sent to Pinecone.
    all_ids = list(dict.fromkeys(line.strip() for line in raw_lines if line.strip()))
    logger.info(f"📌 Found {len(all_ids)} IDs in '{VECTOR_IDS_FILE}'")

    # 2) Filter out already processed ones
    unprocessed_ids = tracker.get_unprocessed(all_ids)
    logger.info(f"🔎 {len(unprocessed_ids)} unprocessed IDs remain.")

    # 3) Process in batches
    batch_starts = range(0, len(unprocessed_ids), FETCH_BATCH_SIZE)
    for batch_num, start in enumerate(batch_starts, start=1):
        batch_ids = unprocessed_ids[start : start + FETCH_BATCH_SIZE]

        # Fetch from Pinecone
        vectors = fetch_vector_batch(index, batch_ids)
        if not vectors:
            # Do not checkpoint a batch that yielded nothing: marking it
            # processed would make a transient fetch failure permanent.
            logger.warning(
                f"⚠️ Batch {batch_num}: no vectors fetched for {len(batch_ids)} IDs; "
                "leaving them unprocessed for retry."
            )
            continue

        # Upload to Neo4j, then checkpoint. If the upload raises, the batch
        # is not marked and will be retried.
        upload_to_neo4j(driver, vectors)
        tracker.mark_processed(batch_ids)

        logger.info(f"✅ Finished batch {batch_num} of size {len(batch_ids)}")

# Top-level `await` only works in an async-aware REPL such as Jupyter/IPython;
# a plain script would need asyncio.run(main()) instead.
await main()

In [None]:
import pickle
# Sanity check: compare the total number of stored IDs with the number of
# distinct IDs to spot duplicates.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open("vector_ids.pkl", "rb") as ids_file:  # close the handle deterministically
    all_ids = pickle.load(ids_file)
unique_ids = set(all_ids)
print(len(all_ids), len(unique_ids))
