Because concepts and beliefs were extracted document by document, the graph contains duplicate Concept nodes: nodes that share the same name but have separate origins and meanings.
We're going to restructure the graph by combining these same-named nodes into a new node class, Overloaded_Concept, kept apart from Belief and Concept.

In [None]:
import os
import logging
import sys
import pickle
import json
import time
from tqdm import tqdm
from neo4j import GraphDatabase
from dotenv import load_dotenv

# Load environment variables (python-dotenv reads them from a .env file)
load_dotenv()

# Neo4j Config
NEO4J_URI = os.getenv("NEO4J_URI")
# The username can be overridden through the environment; when NEO4J_USER is
# unset this falls back to the previously hard-coded "neo4j".
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Script Constants
CHECKPOINT_DIR = "checkpoints"
# When True, reset_overloaded_nodes() deletes all Overloaded_Concept nodes.
RESET_OVERLOADED_NODES = False

# Create checkpoint directory
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Logging setup: mirror every record to stdout and to LOG_FILE.
LOG_FILE = "concept_processing.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        # The log messages contain emoji, so the file must be opened as UTF-8;
        # the platform default encoding (e.g. cp1252 on Windows) cannot encode
        # them and would make the handler fail on every write.
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
    ],
)
logger = logging.getLogger(__name__)
logger.info("✅ Logging initialized.")

# Initialize Neo4j Connection
def init_neo4j():
    """Create a Neo4j driver and verify the connection with a probe query.

    Returns:
        The connected driver. The caller owns it and is responsible for
        closing it.

    Raises:
        ValueError: If NEO4J_URI is not set.
        ConnectionError: If the probe query returns no record.
        Exception: Any error raised by the driver while connecting is
            propagated unchanged, after the driver has been closed.
    """
    if not NEO4J_URI:
        raise ValueError("⚠️ NEO4J_URI is not set. Check your .env file.")

    driver = GraphDatabase.driver(
        NEO4J_URI,
        auth=(NEO4J_USER, NEO4J_PASSWORD),
        max_connection_lifetime=3600,
    )
    try:
        with driver.session() as session:
            if session.run("RETURN 1").single() is None:
                # ConnectionError is an Exception subclass, so callers that
                # catch Exception keep working.
                raise ConnectionError("❌ Failed to connect to Neo4J.")
    except BaseException:
        # Do not leak the driver (and its connection pool) when the probe
        # fails or is interrupted; the caller never receives it to close.
        driver.close()
        raise

    logger.info("✅ Successfully connected to Neo4j")
    return driver

In [None]:
def reset_overloaded_nodes(driver):
    """Delete every Overloaded_Concept node when RESET_OVERLOADED_NODES is set.

    Does nothing when the flag is falsy. Otherwise removes the nodes together
    with their relationships (DETACH DELETE) and logs how many were deleted.

    Args:
        driver: Neo4j driver used to open the session.
    """
    if not RESET_OVERLOADED_NODES:
        return

    purge_query = "MATCH (oc:Overloaded_Concept) DETACH DELETE oc RETURN count(*) as deleted"
    with driver.session() as session:
        summary_row = session.run(purge_query).single()
        deleted = summary_row["deleted"]
        logger.info(f"❌ Deleted {deleted} Overloaded_Concept nodes.")

class ProcessTracker:
    """Accumulates the IDs that have been marked as processed."""

    def __init__(self):
        # Unique IDs recorded so far.
        self.processed_ids = set()

    def mark_processed(self, vec_ids):
        """Record every ID in *vec_ids*; empty or None input is a no-op."""
        if not vec_ids:
            return
        for vec_id in vec_ids:
            self.processed_ids.add(vec_id)

def find_duplicate_concepts(driver):
    """Find Concept nodes that share the same ``concept`` value.

    Args:
        driver: Neo4j driver used to open the session.

    Returns:
        dict mapping each duplicated concept name to a list of maps, one per
        Concept node carrying that name, with the keys ``id``, ``sep_id`` and
        ``document_title`` read from the node's properties.
    """
    with driver.session() as session:
        result = session.run(
            """
            MATCH (c:Concept)
            WHERE c.concept IS NOT NULL
            WITH c.concept as concept_name, collect(c) as concepts
            WHERE size(concepts) > 1
            RETURN concept_name, [c IN concepts | {
                id: c.id, 
                sep_id: c.sep_id, 
                document_title: c.document_title
            }] as concept_instances
            """
        )
        # Nodes without a `concept` property are excluded in the query above:
        # they would all be grouped under a null name, which cannot be used as
        # the `name` of a merged node (MERGE rejects null property values).
        # The result is consumed while the session is still open.
        duplicate_concepts = {
            record["concept_name"]: record["concept_instances"]
            for record in result
        }
    logger.info(f"🔍 Found {len(duplicate_concepts)} concepts with duplicates")
    return duplicate_concepts

def merge_duplicate_concepts(driver, duplicate_concepts, tracker):
    """Replace each group of same-named Concept nodes with one Overloaded_Concept.

    For every concept name, an Overloaded_Concept node is merged on ``name``
    and given the collected ``sep_ids``, ``document_titles`` and
    ``vector_ids`` of the duplicates; the original Concept nodes (matched by
    ``id``) are then removed together with their relationships. Both steps run
    in a single transaction, so a failure part-way leaves that concept
    untouched instead of half-migrated.

    A failure for one concept is logged and does not stop the remaining ones.

    Args:
        driver: Neo4j driver used to open the session.
        duplicate_concepts: Mapping of concept name to a list of instance
            maps with the keys ``id``, ``sep_id`` and ``document_title``.
        tracker: Object whose ``mark_processed`` receives the list of IDs of
            all successfully merged Concept nodes.

    Returns:
        Number of concept names that were merged successfully.
    """
    total_merged = 0
    processed_ids = set()
    with tqdm(total=len(duplicate_concepts), desc="Merging duplicate concepts") as pbar:
        with driver.session() as session:
            for concept_name, instances in duplicate_concepts.items():
                try:
                    ids = [instance["id"] for instance in instances]
                    sep_ids = [instance["sep_id"] for instance in instances]
                    doc_titles = [instance["document_title"] for instance in instances]
                    # One explicit transaction per concept: the merged node is
                    # only persisted if the duplicates were deleted as well.
                    with session.begin_transaction() as tx:
                        tx.run(
                            """
                            MERGE (oc:Overloaded_Concept {name: $name})
                            SET oc.type = "overloaded_concept",
                                oc.sep_ids = $sep_ids,
                                oc.document_titles = $doc_titles,
                                oc.vector_ids = $vector_ids
                            """,
                            name=concept_name,
                            sep_ids=sep_ids,
                            doc_titles=doc_titles,
                            vector_ids=ids
                        )
                        delete_result = tx.run(
                            """
                            MATCH (c:Concept) 
                            WHERE c.id IN $ids
                            DETACH DELETE c
                            RETURN count(*) as deleted
                            """,
                            ids=ids
                        )
                        deleted = delete_result.single()["deleted"]
                        # Commit explicitly so the outcome does not depend on
                        # what the context manager does on exit.
                        tx.commit()
                    logger.info(f"🔄 Merged '{concept_name}' from {len(ids)} concepts, deleted {deleted} nodes")
                    processed_ids.update(ids)
                    total_merged += 1
                except Exception as e:
                    # Same message as before, now with the traceback attached.
                    logger.exception(f"❌ Error processing '{concept_name}': {str(e)}")
                pbar.update(1)
    tracker.mark_processed(list(processed_ids))
    logger.info(f"✅ Successfully merged {total_merged} duplicate concepts")
    return total_merged

In [None]:
def main():
    """Run the duplicate-concept merge end to end.

    Connects to Neo4j, optionally resets existing Overloaded_Concept nodes,
    finds Concept nodes that share a name and merges each group. Errors raised
    after the connection is established are logged rather than propagated, and
    the driver is always closed.

    Raises:
        Whatever ``init_neo4j`` raises when the connection cannot be set up.
    """
    driver = init_neo4j()
    tracker = ProcessTracker()
    try:
        # Inside the try block so the driver is still closed if the reset fails.
        reset_overloaded_nodes(driver)
        logger.info("🔍 Searching for duplicate concepts across the database...")
        duplicate_concepts = find_duplicate_concepts(driver)
        if duplicate_concepts:
            merge_duplicate_concepts(driver, duplicate_concepts, tracker)
        else:
            logger.info("ℹ️ No duplicate concepts found in the database")
    except Exception as e:
        # Same message as before, now with the traceback attached.
        logger.exception(f"❌ Unexpected error: {str(e)}")
    finally:
        driver.close()
        logger.info("✅ Script execution completed")


if __name__ == "__main__":
    main()


We need to do the same deduplication for Pinecone and integrate it with this workflow.