Fixing the ID discrepancies

In [None]:
import os

from dotenv import load_dotenv
from neo4j import GraphDatabase
from pinecone import Pinecone

# Load variables from a local .env file (if one exists) into the process
# environment before reading them. Without this call the imported
# `load_dotenv` is never used, so settings that live only in .env would
# come back as None from os.getenv(). Variables that are already set in
# the environment are left untouched (load_dotenv's default behaviour).
load_dotenv()

# Neo4j Config
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Pinecone setup
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# NOTE(review): PINECONE_ENVIRONMENT is read but not used by anything
# visible in this notebook; kept in case a later cell depends on it.
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT", "us-east1-gcp")
PINECONE_INDEX_NAME = "belief-embeddings"

# Initialize Pinecone: `index` is the handle used for vector lookups below.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)

# Shared Neo4j driver; sessions are opened from it on demand.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))


In [None]:
import os
import pickle
import logging
import time
from collections import defaultdict
from tqdm import tqdm

# Tunables for the backfill. `index` (Pinecone) and `driver` (Neo4j) are not
# created here; they are expected to exist already from the setup cell.
BATCH_SIZE = 100  # number of concepts handled per Neo4j write
CHECKPOINT_FILE = "processed_neo4j_ids.pkl"  # pickled record of finished Neo4j node IDs

# Timestamped INFO-level logging used for progress and error reporting.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Load or initialize processed Neo4j node IDs
def load_checkpoint():
    """Return the checkpointed Neo4j node IDs, or an empty set if none exist.

    Reads CHECKPOINT_FILE and returns whatever object was pickled into it
    (save_checkpoint writes the collection of processed IDs). When the file
    is absent, a fresh empty set is returned so the run starts from scratch.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return set()

    with open(CHECKPOINT_FILE, "rb") as checkpoint:
        # pickle is only appropriate here because the file is produced by
        # this script itself; never point CHECKPOINT_FILE at untrusted data.
        saved_ids = pickle.load(checkpoint)
    return saved_ids

def save_checkpoint(processed_ids):
    """Persist the processed Neo4j node IDs to CHECKPOINT_FILE.

    The data is pickled into a sibling temporary file first and then moved
    over CHECKPOINT_FILE with os.replace. Writing directly to the checkpoint
    would truncate it on open, so a failure or interruption mid-dump would
    destroy the previous checkpoint and make the next load_checkpoint() fail.

    Args:
        processed_ids: Collection of Neo4j node IDs to store (pickled as-is).

    Raises:
        OSError: If the temporary file cannot be written or renamed.
        pickle.PicklingError: If ``processed_ids`` cannot be pickled.
    """
    # Same directory as the target so the final rename stays on one filesystem.
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump(processed_ids, f)
        os.replace(tmp_path, CHECKPOINT_FILE)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up the partial file left behind by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def fetch_unprocessed_overloaded_concepts():
    """
    Collect the Overloaded_Concept nodes in Neo4j whose 'id' property is NULL.

    Returns:
        A list of dicts, one per matching node, each with the keys
        'name' (the node's name property) and 'neo4j_id' (the value of
        Cypher's ID() for the node).
    """
    cypher = """
            MATCH (oc:Overloaded_Concept)
            WHERE oc.id IS NULL
            RETURN oc.name AS name, ID(oc) AS neo4j_id
            """
    with driver.session() as session:
        # Materialise the rows while the session is still open.
        return [
            {"name": row["name"], "neo4j_id": row["neo4j_id"]}
            for row in session.run(cypher)
        ]


def process_overloaded_concepts():
    """
    Lookup Pinecone IDs and attach them to Overloaded_Concept nodes in Neo4j.

    Workflow:
      1. Load the checkpoint and fetch the nodes that still lack an 'id'.
      2. For each node, query Pinecone with a metadata filter on 'concept'
         equal to the node name and take the ID of the first match.
      3. Write the IDs back to Neo4j one batch (BATCH_SIZE nodes) at a time,
         checkpointing after every successful write.

    Concepts without a Pinecone match are logged and skipped; they are not
    checkpointed, so a later run will try them again. If a batch write (or
    the checkpoint save that follows it) raises, the error is logged and the
    function returns early without processing further batches.

    Returns:
        None.
    """
    processed_ids = load_checkpoint()
    concepts_to_process = fetch_unprocessed_overloaded_concepts()

    # Filter out already processed ones
    unprocessed_concepts = [c for c in concepts_to_process if c["neo4j_id"] not in processed_ids]
    total = len(unprocessed_concepts)
    logger.info(f"🔎 {total} Overloaded_Concepts to process.")

    if not unprocessed_concepts:
        logger.info("✅ No unprocessed concepts found. Exiting.")
        return

    # The index dimension does not depend on the concept being looked up, so
    # fetch it once instead of issuing a stats request for every concept.
    vector_dim = index.describe_index_stats()["dimension"]
    # Dummy zero vector of the correct dimension: the metadata filter selects
    # the match, the query vector is only there to satisfy the API.
    zero_vector = [0] * vector_dim

    for start in range(0, total, BATCH_SIZE):
        batch = unprocessed_concepts[start : start + BATCH_SIZE]
        update_queries = []

        for concept in batch:
            concept_name = concept["name"]
            neo4j_id = concept["neo4j_id"]

            response = index.query(
                vector=zero_vector,
                filter={"concept": {"$eq": concept_name}},  # Ensure exact match
                top_k=1,
                include_metadata=True
            )

            if not response.matches:
                logger.warning(f"⚠️ Pinecone ID missing for '{concept_name}', skipping.")
                continue

            # Parameters for the batched Cypher update below.
            update_queries.append(
                {"neo4j_id": neo4j_id, "pinecone_id": response.matches[0].id}
            )

        # Batch update Neo4j (skipped when nothing in this batch matched, to
        # avoid a pointless round trip and checkpoint rewrite).
        if update_queries:
            try:
                with driver.session() as session:
                    session.run(
                        """
                    UNWIND $updates AS update
                    MATCH (oc:Overloaded_Concept)
                    WHERE ID(oc) = update.neo4j_id
                    SET oc.id = update.pinecone_id
                    """,
                        updates=update_queries
                    )

                # Save processed IDs only if Neo4j update succeeds
                batch_processed_ids = [c["neo4j_id"] for c in update_queries]
                processed_ids.update(batch_processed_ids)
                save_checkpoint(processed_ids)

                logger.info(f"✅ Updated {len(batch_processed_ids)} Overloaded_Concepts in Neo4j.")

            except Exception as e:
                logger.error(f"❌ Neo4j update failed: {str(e)}")
                logger.error("🔴 Hard stopping due to write failure.")
                return

        # Log progress after every batch; clamp so the final partial batch
        # does not report more concepts than actually exist.
        done = min(start + BATCH_SIZE, total)
        logger.info(f"📢 Processed {done}/{total} concepts...")

# Run the backfill. It can be re-run: the fetch query only selects nodes whose
# `id` is still NULL, and node IDs recorded in the checkpoint are filtered out.
process_overloaded_concepts()
